LCOV - code coverage report
Current view: top level - src - submatrix_dissection.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:2fce0f8) Lines: 351 403 87.1 %
Date: 2024-12-21 06:28:57 Functions: 8 10 80.0 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \note
      10             : !> This module implements a modified version of the submatrix method, proposed in
      11             : !>   M. Lass, S. Mohr, H. Wiebeler, T. Kuehne, C. Plessl
      12             : !>   A Massively Parallel Algorithm for the Approximate Calculation of Inverse p-th Roots of Large Sparse Matrices
      13             : !>   Proc. Platform for Advanced Scientific Computing (PASC) Conference, ACM, 2018
      14             : !>
      15             : !> The method is extended to minimize the required data transfers and floating-point operations under the assumption that non-zero
      16             : !> blocks of the matrix are close to its diagonal.
      17             : !>
      18             : !> Submatrices can be constructed not for single columns of the matrix but for a set of w consecutive submatrices. The underlying
      19             : !> assumption is that columns next to each other have relatively similar occupation patterns, i.e., constructing a submatrix from
      20             : !> columns x and x+1 should not lead to a much bigger submatrix than contructing it only from column x.
      21             : !>
      22             : !> The construction of the submatrices requires communication between all ranks. It is crucial to implement this communication as
      23             : !> efficient as possible, i.e., data should only ever be transferred once between two ranks and message sizes need to be
      24             : !> sufficiently large to utilize the communication bandwidth. To achieve this, we communicate the required blocks for all
      25             : !> submatrices at once and copy them to large buffers before transmitting them via MPI.
      26             : !>
      27             : !> Note on multi-threading:
      28             : !> Submatrices can be constructed and processed in parallel by multiple threads. However, generate_submatrix, get_sm_ids_for_rank
      29             : !> and copy_resultcol are the only thread-safe routines in this module. All other routines involve MPI communication or operate on
      30             : !> common, non-protected data and are hence not thread-safe.
      31             : !>
      32             : !> TODO:
      33             : !> * generic types (for now only dp supported)
      34             : !> * optimization of threaded initialization
      35             : !> * sanity checks at the beginning of all methods
      36             : !>
      37             : !> \author Michael Lass
      38             : ! **************************************************************************************************
      39             : 
      40             : MODULE submatrix_dissection
      41             : 
      42             :    USE bibliography,                    ONLY: Lass2018,&
      43             :                                               cite_reference
      44             :    USE cp_dbcsr_api,                    ONLY: &
      45             :         dbcsr_distribution_get, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
      46             :         dbcsr_get_info, dbcsr_get_stored_coordinates, dbcsr_iterator_blocks_left, &
      47             :         dbcsr_iterator_next_block, dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, &
      48             :         dbcsr_put_block, dbcsr_type
      49             :    USE kinds,                           ONLY: dp
      50             :    USE message_passing,                 ONLY: mp_comm_type
      51             :    USE submatrix_types,                 ONLY: buffer_type,&
      52             :                                               bufptr_type,&
      53             :                                               intBuffer_type,&
      54             :                                               set_type,&
      55             :                                               setarray_type
      56             : 
      57             : !$ USE omp_lib, ONLY: omp_get_max_threads, omp_get_thread_num
      58             : 
      59             : #include "./base/base_uses.f90"
      60             : 
      61             :    IMPLICIT NONE
      62             :    PRIVATE
      63             : 
      64             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'submatrix_dissection'
      65             : 
      66             :    TYPE, PUBLIC :: submatrix_dissection_type
      67             :       TYPE(dbcsr_type)                                :: dbcsr_mat
      68             :       TYPE(dbcsr_distribution_type)                   :: dist
      69             :       LOGICAL                                         :: initialized = .FALSE.
      70             :       TYPE(mp_comm_type)                              :: group = mp_comm_type()
      71             :       INTEGER                                         :: numnodes = -1, myrank = -1, nblkcols = -1, &
      72             :                                                          nblkrows = -1, nblks = -1, local_blocks = -1, &
      73             :                                                          cols_per_sm = -1, number_of_submatrices = -1
      74             :       INTEGER, DIMENSION(:), POINTER                  :: row_blk_size => NULL(), col_blk_size => NULL()
      75             :       INTEGER, DIMENSION(:), ALLOCATABLE              :: coo_cols, coo_rows, coo_col_offsets, coo_cols_local, coo_rows_local, &
      76             :                                                          coo_col_offsets_local, submatrix_owners, submatrix_sizes
      77             :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE    :: recvbufs, result_sendbufs ! Indexing starts with 0 to match rank ids!
      78             :       TYPE(set_type), DIMENSION(:), ALLOCATABLE       :: result_blocks_for_rank, result_blocks_from_rank
      79             :       TYPE(bufptr_type), DIMENSION(:), ALLOCATABLE    :: coo_dptr
      80             :       TYPE(intBuffer_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank_buf_offsets
      81             :    CONTAINS
      82             :       PROCEDURE :: init => submatrix_dissection_init
      83             :       PROCEDURE :: final => submatrix_dissection_final
      84             :       PROCEDURE :: get_sm_ids_for_rank => submatrix_get_sm_ids_for_rank
      85             :       PROCEDURE :: generate_submatrix => submatrix_generate_sm
      86             :       PROCEDURE :: copy_resultcol => submatrix_cpy_resultcol
      87             :       PROCEDURE :: communicate_results => submatrix_communicate_results
      88             :       PROCEDURE :: get_relevant_sm_columns => submatrix_get_relevant_sm_columns
      89             :    END TYPE submatrix_dissection_type
      90             : 
      91             : CONTAINS
      92             : 
      93             : ! **************************************************************************************************
      94             : !> \brief determine which columns of the submatrix are relevant for the result matrix
      95             : !> \param this - object of class submatrix_dissection_type
      96             : !> \param sm_id - id of the submatrix
      97             : !> \param first - first column of submatrix that is relevant
      98             : !> \param last - last column of submatrix that is relevant
      99             : ! **************************************************************************************************
     100          36 :    SUBROUTINE submatrix_get_relevant_sm_columns(this, sm_id, first, last)
     101             :       CLASS(submatrix_dissection_type), INTENT(IN)     :: this
     102             :       INTEGER, INTENT(IN)                              :: sm_id
     103             :       INTEGER, INTENT(OUT)                             :: first, last
     104             :       INTEGER                                          :: i, j, blkid
     105        9288 :       TYPE(set_type)                                   :: nonzero_rows
     106             : 
     107             :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     108          72 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     109         108 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     110          72 :             CALL nonzero_rows%insert(this%coo_rows(j))
     111             :          END DO
     112             :       END DO
     113             : 
     114          36 :       first = 1
     115          36 :       DO i = 1, nonzero_rows%elements
     116          36 :          blkid = nonzero_rows%get(i)
     117          36 :          IF (blkid .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
     118             :             ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
     119             :             ! Now add up block sizes to determine the last one as well
     120          36 :             last = first - 1
     121          36 :             DO j = i, nonzero_rows%elements
     122          36 :                blkid = nonzero_rows%get(j)
     123          36 :                last = last + this%col_blk_size(blkid)
     124          36 :                IF (blkid .EQ. (sm_id)*this%cols_per_sm) EXIT
     125             :             END DO
     126             :             EXIT
     127             :          END IF
     128           0 :          first = first + this%col_blk_size(blkid)
     129             :       END DO
     130             : 
     131          36 :       CALL nonzero_rows%reset
     132             : 
     133        9288 :    END SUBROUTINE submatrix_get_relevant_sm_columns
     134             : 
     135             : ! **************************************************************************************************
     136             : !> \brief initialize submatrix dissection and communicate, needs to be called before constructing any submatrices.
     137             : !> \param this - object of class submatrix_dissection_type
     138             : !> \param matrix_p - dbcsr input matrix
     139             : !> \par History
     140             : !>       2020.02 created [Michael Lass]
     141             : !>       2020.05 add time measurements [Michael Lass]
     142             : ! **************************************************************************************************
     143          10 :    SUBROUTINE submatrix_dissection_init(this, matrix_p) ! Should be PURE but the iterator routines are not
     144             :       CLASS(submatrix_dissection_type), INTENT(INOUT)  :: this
     145             :       TYPE(dbcsr_type), INTENT(IN)                     :: matrix_p
     146             : 
     147             :       INTEGER                                          :: cur_row, cur_col, cur_blk, i, j, k, l, m, l_limit_left, l_limit_right, &
     148             :                                                           bufsize, bufsize_next
     149          20 :       INTEGER, DIMENSION(:), ALLOCATABLE               :: blocks_per_rank, coo_dsplcmnts, num_blockids_send, num_blockids_recv
     150             :       TYPE(dbcsr_iterator_type)                        :: iter
     151        2580 :       TYPE(set_type)                                   :: nonzero_rows
     152             :       REAL(KIND=dp)                                    :: flops_total, flops_per_rank, flops_per_sm, flops_threshold, &
     153             :                                                           flops_current, flops_remaining
     154             : 
     155             :       ! Indexing for the following arrays starts with 0 to match rank ids
     156          10 :       TYPE(set_type), DIMENSION(:), ALLOCATABLE        :: blocks_from_rank
     157          10 :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE     :: sendbufs
     158          10 :       TYPE(intBuffer_type), DIMENSION(:), ALLOCATABLE  :: blocks_for_rank
     159             : 
     160             :       ! Additional structures for threaded parts
     161             :       INTEGER                                          :: numthreads, mytid, group_handle
     162             :       ! Indexing starts at 0 to match thread ids
     163          10 :       TYPE(setarray_type), DIMENSION(:), ALLOCATABLE   :: nonzero_rows_t
     164          10 :       TYPE(setarray_type), DIMENSION(:), ALLOCATABLE   :: result_blocks_from_rank_t, result_blocks_for_rank_t, &
     165          10 :                                                           blocks_from_rank_t
     166             : 
     167             :       LOGICAL                                          :: valid
     168          10 :       REAL(KIND=dp), DIMENSION(:), POINTER             :: blockp
     169             : 
     170             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_dissection_init'
     171             :       INTEGER :: handle
     172             : 
     173          10 :       CALL timeset(routineN, handle)
     174          10 :       CALL cite_reference(Lass2018)
     175             : 
     176          10 :       this%dbcsr_mat = matrix_p
     177             :       CALL dbcsr_get_info(matrix=this%dbcsr_mat, nblkcols_total=this%nblkcols, nblkrows_total=this%nblkrows, &
     178          10 :                          row_blk_size=this%row_blk_size, col_blk_size=this%col_blk_size, group=group_handle, distribution=this%dist)
     179          10 :       CALL dbcsr_distribution_get(dist=this%dist, mynode=this%myrank, numnodes=this%numnodes)
     180             : 
     181          10 :       CALL this%group%set_handle(group_handle)
     182             : 
     183          10 :       IF (this%nblkcols .NE. this%nblkrows) THEN
     184           0 :          CPABORT("Number of block rows and cols need to be identical")
     185             :       END IF
     186             : 
     187          20 :       DO i = 1, this%nblkcols
     188          20 :          IF (this%col_blk_size(i) .NE. this%row_blk_size(i)) THEN
     189           0 :             CPABORT("Block row sizes and col sizes need to be identical")
     190             :          END IF
     191             :       END DO
     192             : 
     193             :       ! TODO: We could do even more sanity checks here, e.g., the matrix must not be stored symmetrically
     194             : 
     195             :       ! For the submatrix method, we need global knwoledge about which blocks are actually used. Therefore, we create a COO
     196             :       ! representation of the blocks (without their contents) on all ranks.
     197             : 
     198             :       ! TODO: Right now, the COO contains all blocks. Also we transfer all blocks. We could skip half of them due to the matrix
     199             :       ! being symmetric (however, we need to transpose the blocks). This can increase performance only by a factor of 2 and is
     200             :       ! therefore deferred.
     201             : 
     202             :       ! Determine number of locally stored blocks
     203          10 :       this%local_blocks = 0
     204          10 :       CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.TRUE.)
     205          15 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     206           5 :          CALL dbcsr_iterator_next_block(iter, cur_row, cur_col, cur_blk)
     207           5 :          this%local_blocks = this%local_blocks + 1
     208             :       END DO
     209          10 :       CALL dbcsr_iterator_stop(iter)
     210             : 
     211           0 :       ALLOCATE (this%coo_cols_local(this%local_blocks), this%coo_rows_local(this%local_blocks), blocks_per_rank(this%numnodes), &
     212           0 :                 coo_dsplcmnts(this%numnodes), this%coo_col_offsets_local(this%nblkcols + 1), &
     213             :                 blocks_for_rank(0:this%numnodes - 1), blocks_from_rank(0:this%numnodes - 1), &
     214           0 :                 sendbufs(0:this%numnodes - 1), this%recvbufs(0:this%numnodes - 1), this%result_sendbufs(0:this%numnodes - 1), &
     215           0 :                 this%result_blocks_for_rank(0:this%numnodes - 1), this%result_blocks_from_rank(0:this%numnodes - 1), &
     216       23540 :                 this%result_blocks_for_rank_buf_offsets(0:this%numnodes - 1))
     217             : 
     218          10 :       i = 1
     219          10 :       CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.TRUE.)
     220          15 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     221           5 :          CALL dbcsr_iterator_next_block(iter, cur_row, cur_col, cur_blk)
     222           5 :          this%coo_cols_local(i) = cur_col
     223           5 :          this%coo_rows_local(i) = cur_row
     224           5 :          i = i + 1
     225             :       END DO
     226          10 :       CALL dbcsr_iterator_stop(iter)
     227             : 
     228             :       ! We only know how many blocks we own. What's with the other ranks?
     229          10 :       CALL this%group%allgather(msgout=this%local_blocks, msgin=blocks_per_rank)
     230          10 :       coo_dsplcmnts(1) = 0
     231          20 :       DO i = 1, this%numnodes - 1
     232          20 :          coo_dsplcmnts(i + 1) = coo_dsplcmnts(i) + blocks_per_rank(i)
     233             :       END DO
     234             : 
     235             :       ! Get a global view on the matrix
     236          30 :       this%nblks = SUM(blocks_per_rank)
     237           0 :       ALLOCATE (this%coo_cols(this%nblks), this%coo_rows(this%nblks), this%coo_col_offsets(this%nblkcols + 1), &
     238         100 :                 this%coo_dptr(this%nblks))
     239             :       CALL this%group%allgatherv(msgout=this%coo_rows_local, msgin=this%coo_rows, rcount=blocks_per_rank, &
     240          10 :                                  rdispl=coo_dsplcmnts)
     241             :       CALL this%group%allgatherv(msgout=this%coo_cols_local, msgin=this%coo_cols, rcount=blocks_per_rank, &
     242          10 :                                  rdispl=coo_dsplcmnts)
     243             : 
     244          10 :       DEALLOCATE (blocks_per_rank, coo_dsplcmnts)
     245             : 
     246             :       ! Sort COO arrays according to their columns
     247          10 :       CALL qsort_two(this%coo_cols_local, this%coo_rows_local, 1, this%local_blocks)
     248          10 :       CALL qsort_two(this%coo_cols, this%coo_rows, 1, this%nblks)
     249             : 
     250             :       ! Get COO array offsets for columns to accelerate lookups
     251          10 :       this%coo_col_offsets(this%nblkcols + 1) = this%nblks + 1
     252          10 :       j = 1
     253          20 :       DO i = 1, this%nblkcols
     254          10 :          DO WHILE ((j .LE. this%nblks))
     255          10 :             IF (this%coo_cols(j) .GE. i) EXIT
     256          10 :             j = j + 1
     257             :          END DO
     258          20 :          this%coo_col_offsets(i) = j
     259             :       END DO
     260             : 
     261             :       ! Same for local COO
     262          10 :       this%coo_col_offsets_local(this%nblkcols + 1) = this%local_blocks + 1
     263          10 :       j = 1
     264          20 :       DO i = 1, this%nblkcols
     265          10 :          DO WHILE ((j .LE. this%local_blocks))
     266           5 :             IF (this%coo_cols_local(j) .GE. i) EXIT
     267           5 :             j = j + 1
     268             :          END DO
     269          20 :          this%coo_col_offsets_local(i) = j
     270             :       END DO
     271             : 
     272             :       ! We could combine multiple columns to generate a single submatrix. For now, we have not found a practical use-case for this
     273             :       ! so we only look at single columns for now.
     274          10 :       this%cols_per_sm = 1
     275             : 
     276             :       ! Determine sizes of all submatrices. This is required in order to assess the computational effort that is required to process
     277             :       ! the submatrices.
     278          10 :       this%number_of_submatrices = this%nblkcols/this%cols_per_sm
     279          30 :       ALLOCATE (this%submatrix_sizes(this%number_of_submatrices))
     280          20 :       this%submatrix_sizes = 0
     281          10 :       flops_total = 0.0D0
     282          20 :       DO i = 1, this%number_of_submatrices
     283          10 :          CALL nonzero_rows%reset
     284          20 :          DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm            ! all colums that determine submatrix i
     285          30 :             DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
     286          20 :                CALL nonzero_rows%insert(this%coo_rows(k))
     287             :             END DO
     288             :          END DO
     289          20 :          DO j = 1, nonzero_rows%elements
     290          20 :             this%submatrix_sizes(i) = this%submatrix_sizes(i) + this%col_blk_size(nonzero_rows%get(j))
     291             :          END DO
     292          20 :          flops_total = flops_total + 2.0D0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
     293             :       END DO
     294             : 
     295             :       ! Create mapping from submatrices to ranks. Since submatrices can be of different sizes, we need to perform some load
     296             :       ! balancing here. For that we assume that arithmetic operations performed on the submatrices scale cubically.
     297          30 :       ALLOCATE (this%submatrix_owners(this%number_of_submatrices))
     298          10 :       flops_per_sm = flops_total/this%number_of_submatrices
     299          10 :       flops_per_rank = flops_total/this%numnodes
     300          10 :       flops_current = 0.0D0
     301          10 :       j = 0
     302          20 :       DO i = 1, this%number_of_submatrices
     303          10 :          this%submatrix_owners(i) = j
     304          10 :          flops_current = flops_current + 2.0D0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
     305          10 :          flops_remaining = flops_total - flops_current
     306          10 :          flops_threshold = (this%numnodes - j - 1)*flops_per_rank
     307             :          IF ((j .LT. (this%numnodes - 1)) &
     308          10 :              .AND. ((flops_remaining .LE. flops_threshold &
     309          10 :                      .OR. (this%number_of_submatrices - i) .LT. (this%numnodes - j)))) THEN
     310          10 :             j = j + 1
     311          10 :             flops_total = flops_total - flops_current
     312          10 :             flops_current = 0.0D0
     313             :          END IF
     314             :       END DO
     315             : 
     316             :       ! Prepare data structures for multithreaded loop
     317          10 :       numthreads = 1
     318          10 : !$    numthreads = omp_get_max_threads()
     319             : 
     320           0 :       ALLOCATE (result_blocks_from_rank_t(0:numthreads - 1), &
     321           0 :                 result_blocks_for_rank_t(0:numthreads - 1), &
     322           0 :                 blocks_from_rank_t(0:numthreads - 1), &
     323         100 :                 nonzero_rows_t(0:numthreads - 1))
     324             : 
     325             :       ! Figure out which blocks we need to receive. Blocks are identified here as indices into our COO representation.
     326             :       ! TODO: This currently shows limited parallel efficiency. Investigate further.
     327             : 
     328             :       !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
     329             :       !$OMP          NUM_THREADS(numthreads) &
     330             :       !$OMP          PRIVATE(i,j,k,l,m,l_limit_left,l_limit_right,cur_col,cur_row,mytid) &
     331             :       !$OMP          SHARED(result_blocks_from_rank_t,result_blocks_for_rank_t,blocks_from_rank_t,this,numthreads,nonzero_rows_t)
     332             :       mytid = 0
     333             : !$    mytid = omp_get_thread_num()
     334             : 
     335             :       ALLOCATE (nonzero_rows_t(mytid)%sets(1), &
     336             :                 result_blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1), &
     337             :                 result_blocks_for_rank_t(mytid)%sets(0:this%numnodes - 1), &
     338             :                 blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1))
     339             : 
     340          10 :       !$OMP DO schedule(guided)
     341             :       DO i = 1, this%number_of_submatrices
     342             :          CALL nonzero_rows_t(mytid)%sets(1)%reset
     343             :          DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm            ! all colums that determine submatrix i
     344             :             DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
     345             :                ! This block will be required to assemble the final block matrix as it is within an inducing column for submatrix i.
     346             :                ! Figure out who stores it and insert it into the result_blocks_* sets.
     347             :                CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=j, processor=m)
     348             :                IF (m .EQ. this%myrank) THEN
     349             :                   CALL result_blocks_from_rank_t(mytid)%sets(this%submatrix_owners(i))%insert(k)
     350             :                END IF
     351             :                IF (this%submatrix_owners(i) .EQ. this%myrank) THEN
     352             :                   CALL nonzero_rows_t(mytid)%sets(1)%insert(this%coo_rows(k))
     353             :                   CALL result_blocks_for_rank_t(mytid)%sets(m)%insert(k)
     354             :                END IF
     355             :             END DO
     356             :          END DO
     357             : 
     358             :          IF (this%submatrix_owners(i) .NE. this%myrank) CYCLE
     359             : 
     360             :          ! In the following, we determine all blocks required to build the submatrix. We interpret nonzero_rows_t(mytid)(j) as
     361             :          ! column and nonzero_rows_t(mytid)(k) as row.
     362             :          DO j = 1, nonzero_rows_t(mytid)%sets(1)%elements
     363             :             cur_col = nonzero_rows_t(mytid)%sets(1)%get(j)
     364             :             l_limit_left = this%coo_col_offsets(cur_col)
     365             :             l_limit_right = this%coo_col_offsets(cur_col + 1) - 1
     366             :             DO k = 1, nonzero_rows_t(mytid)%sets(1)%elements
     367             :                cur_row = nonzero_rows_t(mytid)%sets(1)%get(k)
     368             :                l = l_limit_left
     369             :                DO WHILE (l .LE. l_limit_right)
     370             :                   IF (this%coo_rows(l) .GE. cur_row) EXIT
     371             :                   l = l + 1
     372             :                END DO
     373             :                l_limit_left = l
     374             :                IF (l .LE. l_limit_right) THEN
     375             :                   IF (this%coo_rows(l) .EQ. cur_row) THEN
     376             :                      ! We found a valid block. Figure out what to do with it.
     377             :                      CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(l), &
     378             :                                                        column=this%coo_cols(l), processor=m)
     379             :                      CALL blocks_from_rank_t(mytid)%sets(m)%insert(l)
     380             :                   END IF
     381             :                END IF
     382             :             END DO
     383             :          END DO
     384             :       END DO
     385             :       !$OMP END DO
     386             :       !$OMP END PARALLEL
     387             : 
     388             :       ! Merge partial results from threads
     389          30 :       DO i = 0, this%numnodes - 1
     390          50 :          DO j = 0, numthreads - 1
     391          25 :             DO k = 1, result_blocks_from_rank_t(j)%sets(i)%elements
     392          25 :                CALL this%result_blocks_from_rank(i)%insert(result_blocks_from_rank_t(j)%sets(i)%get(k))
     393             :             END DO
     394          20 :             CALL result_blocks_from_rank_t(j)%sets(i)%reset
     395          25 :             DO k = 1, result_blocks_for_rank_t(j)%sets(i)%elements
     396          25 :                CALL this%result_blocks_for_rank(i)%insert(result_blocks_for_rank_t(j)%sets(i)%get(k))
     397             :             END DO
     398          20 :             CALL result_blocks_for_rank_t(j)%sets(i)%reset
     399          25 :             DO k = 1, blocks_from_rank_t(j)%sets(i)%elements
     400          25 :                CALL blocks_from_rank(i)%insert(blocks_from_rank_t(j)%sets(i)%get(k))
     401             :             END DO
     402          40 :             CALL blocks_from_rank_t(j)%sets(i)%reset
     403             :          END DO
     404             :       END DO
     405          20 :       DO i = 0, numthreads - 1
     406          10 :          CALL nonzero_rows_t(i)%sets(1)%reset
     407           0 :          DEALLOCATE (result_blocks_from_rank_t(i)%sets, result_blocks_for_rank_t(i)%sets, blocks_from_rank_t(i)%sets, &
     408       18080 :                      nonzero_rows_t(i)%sets)
     409             :       END DO
     410          50 :       DEALLOCATE (result_blocks_from_rank_t, result_blocks_for_rank_t, blocks_from_rank_t, nonzero_rows_t)
     411             : 
     412             :       ! Make other ranks aware of our needs
     413          40 :       ALLOCATE (num_blockids_send(0:this%numnodes - 1), num_blockids_recv(0:this%numnodes - 1))
     414          30 :       DO i = 0, this%numnodes - 1
     415          30 :          num_blockids_send(i) = blocks_from_rank(i)%elements
     416             :       END DO
     417          10 :       CALL this%group%alltoall(num_blockids_send, num_blockids_recv, 1)
     418          30 :       DO i = 0, this%numnodes - 1
     419          30 :          CALL blocks_for_rank(i)%alloc(num_blockids_recv(i))
     420             :       END DO
     421          10 :       DEALLOCATE (num_blockids_send, num_blockids_recv)
     422             : 
     423          10 :       IF (this%numnodes .GT. 1) THEN
     424          30 :          DO i = 1, this%numnodes
     425          20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     426          30 :             CALL this%group%irecv(msgout=blocks_for_rank(k)%data, source=k, request=blocks_for_rank(k)%mpi_request)
     427             :          END DO
     428          30 :          DO i = 1, this%numnodes
     429          20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     430          30 :             CALL this%group%send(blocks_from_rank(k)%getall(), k, 0)
     431             :          END DO
     432          30 :          DO i = 0, this%numnodes - 1
     433          30 :             CALL blocks_for_rank(i)%mpi_request%wait()
     434             :          END DO
     435             :       ELSE
     436           0 :          blocks_for_rank(0)%data = blocks_from_rank(0)%getall()
     437             :       END IF
     438             : 
     439             :       ! Free memory allocated in nonzero_rows
     440          10 :       CALL nonzero_rows%reset
     441             : 
     442             :       ! Make get calls on this%result_blocks_for_rank(...) threadsafe in other routines by updating the internal sorted list
     443          30 :       DO m = 0, this%numnodes - 1
     444          30 :          IF ((.NOT. this%result_blocks_for_rank(m)%sorted_up_to_date) .AND. (this%result_blocks_for_rank(m)%elements .GT. 0)) THEN
     445           5 :             CALL this%result_blocks_for_rank(m)%update_sorted
     446             :          END IF
     447             :       END DO
     448             : 
     449             :       ! Create and fill send buffers
     450          30 :       DO i = 0, this%numnodes - 1
     451          20 :          bufsize = 0
     452          25 :          DO j = 1, blocks_for_rank(i)%size
     453           5 :             k = blocks_for_rank(i)%data(j)
     454          25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     455             :          END DO
     456          20 :          CALL sendbufs(i)%alloc(bufsize)
     457             : 
     458          20 :          bufsize = 0
     459          20 :          CALL this%result_blocks_for_rank_buf_offsets(i)%alloc(this%result_blocks_for_rank(i)%elements)
     460          25 :          DO j = 1, this%result_blocks_for_rank(i)%elements
     461           5 :             k = this%result_blocks_for_rank(i)%get(j)
     462           5 :             this%result_blocks_for_rank_buf_offsets(i)%data(j) = bufsize
     463          25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     464             :          END DO
     465          20 :          CALL this%result_sendbufs(i)%alloc(bufsize)
     466             : 
     467          20 :          bufsize = 0
     468          35 :          DO j = 1, blocks_for_rank(i)%size
     469           5 :             k = blocks_for_rank(i)%data(j)
     470           5 :             CALL dbcsr_get_block_p(this%dbcsr_mat, row=this%coo_rows(k), col=this%coo_cols(k), block=blockp, found=valid)
     471           5 :             IF (.NOT. valid) THEN
     472           0 :                CPABORT("Block included in our COO and placed on our rank could not be fetched!")
     473             :             END IF
     474           5 :             bufsize_next = bufsize + SIZE(blockp)
     475         185 :             sendbufs(i)%data(bufsize + 1:bufsize_next) = blockp
     476          30 :             bufsize = bufsize_next
     477             :          END DO
     478             :       END DO
     479             : 
     480             :       ! Create receive buffers and mapping from blocks to memory locations
     481          30 :       DO i = 0, this%numnodes - 1
     482          20 :          bufsize = 0
     483          25 :          DO j = 1, blocks_from_rank(i)%elements
     484           5 :             k = blocks_from_rank(i)%get(j)
     485          25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     486             :          END DO
     487          20 :          CALL this%recvbufs(i)%alloc(bufsize)
     488          20 :          bufsize = 0
     489          35 :          DO j = 1, blocks_from_rank(i)%elements
     490           5 :             k = blocks_from_rank(i)%get(j)
     491           5 :             bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     492           5 :             this%coo_dptr(k)%target => this%recvbufs(i)%data(bufsize + 1:bufsize_next)
     493          25 :             bufsize = bufsize_next
     494             :          END DO
     495             :       END DO
     496             : 
     497          30 :       DO i = 0, this%numnodes - 1
     498          20 :          CALL blocks_for_rank(i)%dealloc
     499          30 :          CALL blocks_from_rank(i)%reset
     500             :       END DO
     501        5170 :       DEALLOCATE (blocks_for_rank, blocks_from_rank)
     502             : 
     503          10 :       IF (this%numnodes .GT. 1) THEN
     504             :          ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
     505             :          ! and first trying to receive from our left neighbor.
     506          30 :          DO i = 1, this%numnodes
     507          20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     508          20 :             CALL this%group%irecv(msgout=this%recvbufs(k)%data, source=k, request=this%recvbufs(k)%mpi_request)
     509          20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     510          30 :             CALL this%group%isend(msgin=sendbufs(k)%data, dest=k, request=sendbufs(k)%mpi_request)
     511             :          END DO
     512          30 :          DO i = 0, this%numnodes - 1
     513          20 :             CALL sendbufs(i)%mpi_request%wait()
     514          30 :             CALL this%recvbufs(i)%mpi_request%wait()
     515             :          END DO
     516             :       ELSE
     517           0 :          this%recvbufs(0)%data = sendbufs(0)%data
     518             :       END IF
     519             : 
     520          30 :       DO i = 0, this%numnodes - 1
     521          30 :          CALL sendbufs(i)%dealloc
     522             :       END DO
     523          10 :       DEALLOCATE (sendbufs)
     524             : 
     525          10 :       this%initialized = .TRUE.
     526             : 
     527          10 :       CALL timestop(handle)
     528        2620 :    END SUBROUTINE submatrix_dissection_init
     529             : 
     530             : ! **************************************************************************************************
     531             : !> \brief free all associated memory, afterwards submatrix_dissection_init needs to be called again
     532             : !> \param this - object of class submatrix_dissection_type
     533             : ! **************************************************************************************************
     534          10 :    PURE SUBROUTINE submatrix_dissection_final(this)
     535             :       CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
     536             :       INTEGER                                         :: i
     537             : 
     538          10 :       this%initialized = .FALSE.
     539             : 
     540          10 :       IF (ALLOCATED(this%submatrix_sizes)) DEALLOCATE (this%submatrix_sizes)
     541          10 :       IF (ALLOCATED(this%coo_cols_local)) DEALLOCATE (this%coo_cols_local)
     542          10 :       IF (ALLOCATED(this%coo_rows_local)) DEALLOCATE (this%coo_rows_local)
     543          10 :       IF (ALLOCATED(this%coo_col_offsets_local)) DEALLOCATE (this%coo_col_offsets_local)
     544          10 :       IF (ALLOCATED(this%result_blocks_for_rank_buf_offsets)) THEN
     545          30 :          DO i = 0, this%numnodes - 1
     546          30 :             CALL this%result_blocks_for_rank_buf_offsets(i)%dealloc
     547             :          END DO
     548          10 :          DEALLOCATE (this%result_blocks_for_rank_buf_offsets)
     549             :       END IF
     550          10 :       IF (ALLOCATED(this%recvbufs)) THEN
     551          30 :          DO i = 0, this%numnodes - 1
     552          30 :             CALL this%recvbufs(i)%dealloc
     553             :          END DO
     554          10 :          DEALLOCATE (this%recvbufs)
     555             :       END IF
     556          10 :       IF (ALLOCATED(this%result_sendbufs)) THEN
     557          30 :          DO i = 0, this%numnodes - 1
     558          30 :             CALL this%result_sendbufs(i)%dealloc
     559             :          END DO
     560          10 :          DEALLOCATE (this%result_sendbufs)
     561             :       END IF
     562          10 :       IF (ALLOCATED(this%result_blocks_for_rank)) THEN
     563          30 :          DO i = 0, this%numnodes - 1
     564          30 :             CALL this%result_blocks_for_rank(i)%reset
     565             :          END DO
     566        5170 :          DEALLOCATE (this%result_blocks_for_rank)
     567             :       END IF
     568          10 :       IF (ALLOCATED(this%result_blocks_from_rank)) THEN
     569          30 :          DO i = 0, this%numnodes - 1
     570          30 :             CALL this%result_blocks_from_rank(i)%reset
     571             :          END DO
     572        5170 :          DEALLOCATE (this%result_blocks_from_rank)
     573             :       END IF
     574          10 :       IF (ALLOCATED(this%coo_cols)) DEALLOCATE (this%coo_cols)
     575          10 :       IF (ALLOCATED(this%coo_rows)) DEALLOCATE (this%coo_rows)
     576          10 :       IF (ALLOCATED(this%coo_col_offsets)) DEALLOCATE (this%coo_col_offsets)
     577          10 :       IF (ALLOCATED(this%coo_dptr)) DEALLOCATE (this%coo_dptr)
     578          10 :       IF (ALLOCATED(this%submatrix_owners)) DEALLOCATE (this%submatrix_owners)
     579             : 
     580          10 :    END SUBROUTINE submatrix_dissection_final
     581             : 
     582             : ! **************************************************************************************************
     583             : !> \brief generate a specific submatrix
     584             : !> \param this - object of class submatrix_dissection_type
     585             : !> \param sm_id - id of the submatrix to generate
     586             : !> \param sm - generated submatrix
     587             : ! **************************************************************************************************
     588           6 :    SUBROUTINE submatrix_generate_sm(this, sm_id, sm)
     589             :       CLASS(submatrix_dissection_type), INTENT(IN)             :: this
     590             :       INTEGER, INTENT(IN)                                      :: sm_id
     591             :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(OUT) :: sm
     592             : 
     593             :       INTEGER                                                  :: sm_dim, i, j, k, offset_x1, offset_x2, offset_y1, &
     594             :                                                                   offset_y2, k_limit_left, k_limit_right
     595        1548 :       TYPE(set_type)                                           :: nonzero_rows
     596             : 
     597           6 :       IF (.NOT. this%initialized) THEN
     598           0 :          CPABORT("Submatrix dissection not initialized")
     599             :       END IF
     600             : 
     601           6 :       IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
     602           0 :          CPABORT("This rank is not supposed to construct this submatrix")
     603             :       END IF
     604             : 
     605             :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     606           6 :       CALL nonzero_rows%reset
     607          12 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     608          18 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     609          12 :             CALL nonzero_rows%insert(this%coo_rows(j))
     610             :          END DO
     611             :       END DO
     612           6 :       sm_dim = 0
     613          12 :       DO i = 1, nonzero_rows%elements
     614          12 :          sm_dim = sm_dim + this%col_blk_size(nonzero_rows%get(i))
     615             :       END DO
     616             : 
     617          24 :       ALLOCATE (sm(sm_dim, sm_dim))
     618         258 :       sm = 0
     619             : 
     620           6 :       offset_x1 = 0
     621          12 :       DO j = 1, nonzero_rows%elements
     622           6 :          offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
     623           6 :          offset_y1 = 0
     624           6 :          k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
     625           6 :          k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
     626          12 :          DO i = 1, nonzero_rows%elements
     627           6 :             offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
     628             :             ! Copy block nonzero_rows(i),nonzero_rows(j) to sm(i,j) (if the block actually exists)
     629           6 :             k = k_limit_left
     630           6 :             DO WHILE (k .LE. k_limit_right)
     631           6 :                IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
     632           0 :                k = k + 1
     633             :             END DO
     634           6 :             k_limit_left = k
     635           6 :             IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
     636             :                sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2) = RESHAPE(this%coo_dptr(k)%target, &
     637         270 :                                                                               (/offset_y2 - offset_y1, offset_x2 - offset_x1/))
     638             :             END IF
     639          12 :             offset_y1 = offset_y2
     640             :          END DO
     641          12 :          offset_x1 = offset_x2
     642             :       END DO
     643             : 
     644             :       ! Free memory allocated in nonzero_rows
     645           6 :       CALL nonzero_rows%reset
     646             : 
     647        1548 :    END SUBROUTINE submatrix_generate_sm
     648             : 
     649             : ! **************************************************************************************************
     650             : !> \brief determine submatrix ids that are handled by a specific rank
     651             : !> \param this - object of class submatrix_dissection_type
     652             : !> \param rank - rank id of interest
     653             : !> \param sm_ids - list of submatrix ids handled by that rank
     654             : ! **************************************************************************************************
     655          10 :    SUBROUTINE submatrix_get_sm_ids_for_rank(this, rank, sm_ids)
     656             :       CLASS(submatrix_dissection_type), INTENT(IN)    :: this
     657             :       INTEGER, INTENT(IN)                             :: rank
     658             :       INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(OUT) :: sm_ids
     659             :       INTEGER                                         :: count, i
     660             : 
     661          10 :       IF (.NOT. this%initialized) THEN
     662           0 :          CPABORT("Submatrix dissection not initialized")
     663             :       END IF
     664             : 
     665          10 :       count = 0
     666          20 :       DO i = 1, this%number_of_submatrices
     667          20 :          IF (this%submatrix_owners(i) .EQ. rank) count = count + 1
     668             :       END DO
     669             : 
     670          25 :       ALLOCATE (sm_ids(count))
     671             : 
     672          10 :       count = 0
     673          20 :       DO i = 1, this%number_of_submatrices
     674          20 :          IF (this%submatrix_owners(i) .EQ. rank) THEN
     675           5 :             count = count + 1
     676           5 :             sm_ids(count) = i
     677             :          END IF
     678             :       END DO
     679             : 
     680          10 :    END SUBROUTINE submatrix_get_sm_ids_for_rank
     681             : 
     682             : ! **************************************************************************************************
     683             : !> \brief copy result columns from a submatrix into result buffer
     684             : !> \param this - object of class submatrix_dissection_type
     685             : !> \param sm_id - id of the submatrix
     686             : !> \param sm - result-submatrix
     687             : ! **************************************************************************************************
     688           6 :    SUBROUTINE submatrix_cpy_resultcol(this, sm_id, sm)
     689             :       CLASS(submatrix_dissection_type), INTENT(INOUT)         :: this
     690             :       INTEGER, INTENT(in)                                     :: sm_id
     691             :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(IN) :: sm
     692             : 
     693        1548 :       TYPE(set_type)                                          :: nonzero_rows
     694             :       INTEGER                                                 :: i, j, k, m, sm_col_offset, offset_x1, offset_x2, offset_y1, &
     695             :                                                                  offset_y2, bufsize, bufsize_next, k_limit_left, k_limit_right
     696           6 :       INTEGER, DIMENSION(:), ALLOCATABLE                      :: buf_offset_idxs
     697             : 
     698           6 :       IF (.NOT. this%initialized) THEN
     699           0 :          CPABORT("Submatrix dissection is not initizalized")
     700             :       END IF
     701             : 
     702           6 :       IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
     703           0 :          CPABORT("This rank is not supposed to operate on this submatrix")
     704             :       END IF
     705             : 
     706          18 :       ALLOCATE (buf_offset_idxs(0:this%numnodes - 1))
     707          18 :       buf_offset_idxs = 1
     708             : 
     709             :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     710           6 :       sm_col_offset = 0
     711          12 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     712          18 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     713          12 :             CALL nonzero_rows%insert(this%coo_rows(j))
     714             :          END DO
     715             :       END DO
     716             : 
     717           6 :       sm_col_offset = 0
     718           6 :       DO i = 1, nonzero_rows%elements
     719           6 :          IF (nonzero_rows%get(i) .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
     720             :             ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
     721           6 :             sm_col_offset = i
     722           6 :             EXIT
     723             :          END IF
     724             :       END DO
     725           6 :       IF (sm_col_offset .EQ. 0) THEN
     726           0 :          CPABORT("Could not determine relevant result columns of submatrix")
     727             :       END IF
     728             : 
     729           6 :       offset_x1 = 0
     730          12 :       DO j = 1, nonzero_rows%elements
     731           6 :          offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
     732             :          ! We only want to copy the blocks from the result columns
     733           6 :          IF ((j .GE. sm_col_offset) .AND. (j .LT. sm_col_offset + this%cols_per_sm)) THEN
     734           6 :             offset_y1 = 0
     735           6 :             k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
     736           6 :             k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
     737          12 :             DO i = 1, nonzero_rows%elements
     738           6 :                offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
     739             :                ! Check if sm(i,j), i.e., (nonzero_rows(i),nonzero_rows(j)) exists in the original matrix and if so, copy it into the
     740             :                ! correct send buffer.
     741           6 :                k = k_limit_left
     742           6 :                DO WHILE (k .LE. k_limit_right)
     743           6 :                   IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
     744           0 :                   k = k + 1
     745             :                END DO
     746           6 :                k_limit_left = k
     747           6 :                IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
     748             :                   CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=this%coo_cols(k), &
     749           6 :                                                     processor=m)
     750           6 :                   DO WHILE (buf_offset_idxs(m) .LE. this%result_blocks_for_rank(m)%elements)
     751           6 :                      IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .GE. k) EXIT
     752           0 :                      buf_offset_idxs(m) = buf_offset_idxs(m) + 1
     753             :                   END DO
     754           6 :                   IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .NE. k) THEN
     755           0 :                      CPABORT("Could not determine buffer offset for block")
     756             :                   END IF
     757           6 :                   bufsize = this%result_blocks_for_rank_buf_offsets(m)%data(buf_offset_idxs(m))
     758           6 :                   bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     759             :                   this%result_sendbufs(m)%data(bufsize + 1:bufsize_next) = RESHAPE( &
     760             :                                                                            sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2), &
     761         228 :                                                                            (/bufsize_next - bufsize/))
     762             :                END IF
     763          12 :                offset_y1 = offset_y2
     764             :             END DO
     765             :          END IF
     766          12 :          offset_x1 = offset_x2
     767             :       END DO
     768             : 
     769           6 :       DEALLOCATE (buf_offset_idxs)
     770             : 
     771             :       ! Free memory in set
     772           6 :       CALL nonzero_rows%reset
     773             : 
     774        1548 :    END SUBROUTINE submatrix_cpy_resultcol
     775             : 
     776             : ! **************************************************************************************************
     777             : !> \brief finalize results back into a dbcsr matrix
     778             : !> \param this - object of class submatrix_dissection_type
     779             : !> \param resultmat - result dbcsr matrix
     780             : !> \par History
     781             : !>       2020.02 created [Michael Lass]
     782             : !>       2020.05 add time measurements [Michael Lass]
     783             : ! **************************************************************************************************
     784          10 :    SUBROUTINE submatrix_communicate_results(this, resultmat)
     785             :       CLASS(submatrix_dissection_type), INTENT(INOUT)                 :: this
     786             :       TYPE(dbcsr_type), INTENT(INOUT)                                 :: resultmat
     787             : 
     788             :       INTEGER                                                         :: i, j, k, cur_row, cur_col, cur_sm, cur_buf, last_buf, &
     789             :                                                                          bufsize, bufsize_next
     790          10 :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE                    :: result_recvbufs
     791             : 
     792             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_communicate_results'
     793             :       INTEGER :: handle
     794             : 
     795          10 :       CALL timeset(routineN, handle)
     796             : 
     797          60 :       ALLOCATE (result_recvbufs(0:this%numnodes - 1))
     798          30 :       DO i = 0, this%numnodes - 1
     799          20 :          bufsize = 0
     800          25 :          DO j = 1, this%result_blocks_from_rank(i)%elements
     801           5 :             k = this%result_blocks_from_rank(i)%get(j)
     802          25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     803             :          END DO
     804          30 :          CALL result_recvbufs(i)%alloc(bufsize)
     805             :       END DO
     806             : 
     807             :       ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
     808             :       ! and first trying to receive from our left neighbor.
     809          10 :       IF (this%numnodes .GT. 1) THEN
     810          30 :          DO i = 1, this%numnodes
     811          20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     812          20 :             CALL this%group%irecv(msgout=result_recvbufs(k)%data, source=k, request=result_recvbufs(k)%mpi_request)
     813          20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     814          30 :             CALL this%group%isend(msgin=this%result_sendbufs(k)%data, dest=k, request=this%result_sendbufs(k)%mpi_request)
     815             :          END DO
     816          30 :          DO i = 0, this%numnodes - 1
     817          20 :             CALL this%result_sendbufs(i)%mpi_request%wait()
     818          30 :             CALL result_recvbufs(i)%mpi_request%wait()
     819             :          END DO
     820             :       ELSE
     821           0 :          result_recvbufs(0)%data = this%result_sendbufs(0)%data
     822             :       END IF
     823             : 
     824          10 :       last_buf = -1
     825          10 :       bufsize = 0
     826          15 :       DO i = 1, this%local_blocks
     827           5 :          cur_col = this%coo_cols_local(i)
     828           5 :          cur_row = this%coo_rows_local(i)
     829           5 :          cur_sm = (cur_col - 1)/this%cols_per_sm + 1
     830           5 :          cur_buf = this%submatrix_owners(cur_sm)
     831           5 :          IF (cur_buf .GT. last_buf) bufsize = 0
     832           5 :          bufsize_next = bufsize + this%col_blk_size(cur_row)*this%col_blk_size(cur_col)
     833             :          CALL dbcsr_put_block(matrix=resultmat, row=cur_row, col=cur_col, &
     834           5 :                               block=result_recvbufs(cur_buf)%data(bufsize + 1:bufsize_next))
     835           5 :          bufsize = bufsize_next
     836          15 :          last_buf = cur_buf
     837             :       END DO
     838             : 
     839          30 :       DO i = 0, this%numnodes - 1
     840          30 :          CALL result_recvbufs(i)%dealloc
     841             :       END DO
     842          10 :       DEALLOCATE (result_recvbufs)
     843             : 
     844          10 :       CALL dbcsr_finalize(resultmat)
     845             : 
     846          10 :       CALL timestop(handle)
     847          10 :    END SUBROUTINE submatrix_communicate_results
     848             : 
     849             : ! **************************************************************************************************
     850             : !> \brief sort two integer arrays using quicksort, using the second list as second-level sorting criterion
     851             : !> \param arr_a - first input array
     852             : !> \param arr_b - second input array
     853             : !> \param left - left boundary of region to be sorted
     854             : !> \param right - right boundary of region to be sorted
     855             : ! **************************************************************************************************
     856          20 :    RECURSIVE PURE SUBROUTINE qsort_two(arr_a, arr_b, left, right)
     857             : 
     858             :       INTEGER, DIMENSION(:), INTENT(inout)               :: arr_a, arr_b
     859             :       INTEGER, INTENT(in)                                :: left, right
     860             : 
     861             :       INTEGER                                            :: i, j, pivot_a, pivot_b, tmp
     862             : 
     863          20 :       IF (right - left .LT. 1) RETURN
     864             : 
     865           0 :       i = left
     866           0 :       j = right - 1
     867           0 :       pivot_a = arr_a(right)
     868           0 :       pivot_b = arr_b(right)
     869             : 
     870           0 :       DO
     871           0 :          DO WHILE ((arr_a(i) .LT. pivot_a) .OR. ((arr_a(i) .EQ. pivot_a) .AND. (arr_b(i) .LT. pivot_b)))
     872           0 :             i = i + 1
     873             :          END DO
     874           0 :          DO WHILE ((j .GT. i) .AND. ((arr_a(j) .GT. pivot_a) .OR. ((arr_a(j) .EQ. pivot_a) .AND. (arr_b(j) .GE. pivot_b))))
     875           0 :             j = j - 1
     876             :          END DO
     877           0 :          IF (i .LT. j) THEN
     878           0 :             tmp = arr_a(i)
     879           0 :             arr_a(i) = arr_a(j)
     880           0 :             arr_a(j) = tmp
     881           0 :             tmp = arr_b(i)
     882           0 :             arr_b(i) = arr_b(j)
     883           0 :             arr_b(j) = tmp
     884             :          ELSE
     885             :             EXIT
     886             :          END IF
     887             :       END DO
     888             : 
     889           0 :       IF ((arr_a(i) .GT. pivot_a) .OR. (arr_a(i) .EQ. pivot_a .AND. arr_b(i) .GT. pivot_b)) THEN
     890           0 :          tmp = arr_a(i)
     891           0 :          arr_a(i) = arr_a(right)
     892           0 :          arr_a(right) = tmp
     893           0 :          tmp = arr_b(i)
     894           0 :          arr_b(i) = arr_b(right)
     895           0 :          arr_b(right) = tmp
     896             :       END IF
     897             : 
     898           0 :       IF (i - 1 .GT. left) CALL qsort_two(arr_a, arr_b, left, i - 1)
     899           0 :       IF (i + 1 .LT. right) CALL qsort_two(arr_a, arr_b, i + 1, right)
     900             : 
     901             :    END SUBROUTINE qsort_two
     902             : 
     903           0 : END MODULE submatrix_dissection

Generated by: LCOV version 1.15