LCOV - code coverage report
Current view: top level - src - submatrix_dissection.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:15a58fb) Lines: 354 406 87.2 %
Date: 2025-02-18 08:24:35 Functions: 8 10 80.0 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \note
      10             : !> This module implements a modified version of the submatrix method, proposed in
      11             : !>   M. Lass, S. Mohr, H. Wiebeler, T. Kuehne, C. Plessl
      12             : !>   A Massively Parallel Algorithm for the Approximate Calculation of Inverse p-th Roots of Large Sparse Matrices
      13             : !>   Proc. Platform for Advanced Scientific Computing (PASC) Conference, ACM, 2018
      14             : !>
      15             : !> The method is extended to minimize the required data transfers and floating-point operations under the assumption that non-zero
      16             : !> blocks of the matrix are close to its diagonal.
      17             : !>
      18             : !> Submatrices can be constructed not for single columns of the matrix but for a set of w consecutive submatrices. The underlying
      19             : !> assumption is that columns next to each other have relatively similar occupation patterns, i.e., constructing a submatrix from
      20             : !> columns x and x+1 should not lead to a much bigger submatrix than contructing it only from column x.
      21             : !>
      22             : !> The construction of the submatrices requires communication between all ranks. It is crucial to implement this communication as
      23             : !> efficient as possible, i.e., data should only ever be transferred once between two ranks and message sizes need to be
      24             : !> sufficiently large to utilize the communication bandwidth. To achieve this, we communicate the required blocks for all
      25             : !> submatrices at once and copy them to large buffers before transmitting them via MPI.
      26             : !>
      27             : !> Note on multi-threading:
      28             : !> Submatrices can be constructed and processed in parallel by multiple threads. However, generate_submatrix, get_sm_ids_for_rank
      29             : !> and copy_resultcol are the only thread-safe routines in this module. All other routines involve MPI communication or operate on
      30             : !> common, non-protected data and are hence not thread-safe.
      31             : !>
      32             : !> TODO:
      33             : !> * generic types (for now only dp supported)
      34             : !> * optimization of threaded initialization
      35             : !> * sanity checks at the beginning of all methods
      36             : !>
      37             : !> \author Michael Lass
      38             : ! **************************************************************************************************
      39             : 
      40             : MODULE submatrix_dissection
      41             : 
      42             :    USE bibliography,                    ONLY: Lass2018,&
      43             :                                               cite_reference
      44             :    USE cp_dbcsr_api,                    ONLY: &
      45             :         dbcsr_distribution_get, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
      46             :         dbcsr_get_info, dbcsr_get_stored_coordinates, dbcsr_iterator_blocks_left, &
      47             :         dbcsr_iterator_next_block, dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, &
      48             :         dbcsr_put_block, dbcsr_type
      49             :    USE kinds,                           ONLY: dp
      50             :    USE message_passing,                 ONLY: mp_comm_type
      51             :    USE submatrix_types,                 ONLY: buffer_type,&
      52             :                                               bufptr_type,&
      53             :                                               intBuffer_type,&
      54             :                                               set_type,&
      55             :                                               setarray_type
      56             : 
      57             : !$ USE omp_lib, ONLY: omp_get_max_threads, omp_get_thread_num
      58             : 
      59             : #include "./base/base_uses.f90"
      60             : 
      61             :    IMPLICIT NONE
      62             :    PRIVATE
      63             : 
      64             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'submatrix_dissection'
      65             : 
      66             :    TYPE, PUBLIC :: submatrix_dissection_type
      67             :       TYPE(dbcsr_type)                                :: dbcsr_mat
      68             :       TYPE(dbcsr_distribution_type)                   :: dist
      69             :       LOGICAL                                         :: initialized = .FALSE.
      70             :       TYPE(mp_comm_type)                              :: group = mp_comm_type()
      71             :       INTEGER                                         :: numnodes = -1, myrank = -1, nblkcols = -1, &
      72             :                                                          nblkrows = -1, nblks = -1, local_blocks = -1, &
      73             :                                                          cols_per_sm = -1, number_of_submatrices = -1
      74             :       INTEGER, DIMENSION(:), POINTER                  :: row_blk_size => NULL(), col_blk_size => NULL()
      75             :       INTEGER, DIMENSION(:), ALLOCATABLE              :: coo_cols, coo_rows, coo_col_offsets, coo_cols_local, coo_rows_local, &
      76             :                                                          coo_col_offsets_local, submatrix_owners, submatrix_sizes
      77             :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE    :: recvbufs, result_sendbufs ! Indexing starts with 0 to match rank ids!
      78             :       TYPE(set_type), DIMENSION(:), ALLOCATABLE       :: result_blocks_for_rank, result_blocks_from_rank
      79             :       TYPE(bufptr_type), DIMENSION(:), ALLOCATABLE    :: coo_dptr
      80             :       TYPE(intBuffer_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank_buf_offsets
      81             :    CONTAINS
      82             :       PROCEDURE :: init => submatrix_dissection_init
      83             :       PROCEDURE :: final => submatrix_dissection_final
      84             :       PROCEDURE :: get_sm_ids_for_rank => submatrix_get_sm_ids_for_rank
      85             :       PROCEDURE :: generate_submatrix => submatrix_generate_sm
      86             :       PROCEDURE :: copy_resultcol => submatrix_cpy_resultcol
      87             :       PROCEDURE :: communicate_results => submatrix_communicate_results
      88             :       PROCEDURE :: get_relevant_sm_columns => submatrix_get_relevant_sm_columns
      89             :    END TYPE submatrix_dissection_type
      90             : 
      91             : CONTAINS
      92             : 
      93             : ! **************************************************************************************************
      94             : !> \brief determine which columns of the submatrix are relevant for the result matrix
      95             : !> \param this - object of class submatrix_dissection_type
      96             : !> \param sm_id - id of the submatrix
      97             : !> \param first - first column of submatrix that is relevant
      98             : !> \param last - last column of submatrix that is relevant
      99             : ! **************************************************************************************************
     100          36 :    SUBROUTINE submatrix_get_relevant_sm_columns(this, sm_id, first, last)
     101             :       CLASS(submatrix_dissection_type), INTENT(IN)     :: this
     102             :       INTEGER, INTENT(IN)                              :: sm_id
     103             :       INTEGER, INTENT(OUT)                             :: first, last
     104             :       INTEGER                                          :: i, j, blkid
     105        9288 :       TYPE(set_type)                                   :: nonzero_rows
     106             : 
     107             :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     108          72 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     109         108 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     110          72 :             CALL nonzero_rows%insert(this%coo_rows(j))
     111             :          END DO
     112             :       END DO
     113             : 
     114          36 :       first = 1
     115          36 :       DO i = 1, nonzero_rows%elements
     116          36 :          blkid = nonzero_rows%get(i)
     117          36 :          IF (blkid .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
     118             :             ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
     119             :             ! Now add up block sizes to determine the last one as well
     120          36 :             last = first - 1
     121          36 :             DO j = i, nonzero_rows%elements
     122          36 :                blkid = nonzero_rows%get(j)
     123          36 :                last = last + this%col_blk_size(blkid)
     124          36 :                IF (blkid .EQ. (sm_id)*this%cols_per_sm) EXIT
     125             :             END DO
     126             :             EXIT
     127             :          END IF
     128           0 :          first = first + this%col_blk_size(blkid)
     129             :       END DO
     130             : 
     131          36 :       CALL nonzero_rows%reset
     132             : 
     133        9288 :    END SUBROUTINE submatrix_get_relevant_sm_columns
     134             : 
     135             : ! **************************************************************************************************
     136             : !> \brief initialize submatrix dissection and communicate, needs to be called before constructing any submatrices.
     137             : !> \param this - object of class submatrix_dissection_type
     138             : !> \param matrix_p - dbcsr input matrix
     139             : !> \par History
     140             : !>       2020.02 created [Michael Lass]
     141             : !>       2020.05 add time measurements [Michael Lass]
     142             : ! **************************************************************************************************
     143          10 :    SUBROUTINE submatrix_dissection_init(this, matrix_p) ! Should be PURE but the iterator routines are not
     144             :       CLASS(submatrix_dissection_type), INTENT(INOUT)  :: this
     145             :       TYPE(dbcsr_type), INTENT(IN)                     :: matrix_p
     146             : 
     147             :       INTEGER                                          :: cur_row, cur_col, i, j, k, l, m, l_limit_left, l_limit_right, &
     148             :                                                           bufsize, bufsize_next
     149          20 :       INTEGER, DIMENSION(:), ALLOCATABLE               :: blocks_per_rank, coo_dsplcmnts, num_blockids_send, num_blockids_recv
     150             :       TYPE(dbcsr_iterator_type)                        :: iter
     151        2580 :       TYPE(set_type)                                   :: nonzero_rows
     152             :       REAL(KIND=dp)                                    :: flops_total, flops_per_rank, flops_per_sm, flops_threshold, &
     153             :                                                           flops_current, flops_remaining
     154             : 
     155             :       ! Indexing for the following arrays starts with 0 to match rank ids
     156          10 :       TYPE(set_type), DIMENSION(:), ALLOCATABLE        :: blocks_from_rank
     157          10 :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE     :: sendbufs
     158          10 :       TYPE(intBuffer_type), DIMENSION(:), ALLOCATABLE  :: blocks_for_rank
     159             : 
     160             :       ! Additional structures for threaded parts
     161             :       INTEGER                                          :: numthreads, mytid
     162             :       ! Indexing starts at 0 to match thread ids
     163          10 :       TYPE(setarray_type), DIMENSION(:), ALLOCATABLE   :: nonzero_rows_t
     164          10 :       TYPE(setarray_type), DIMENSION(:), ALLOCATABLE   :: result_blocks_from_rank_t, result_blocks_for_rank_t, &
     165          10 :                                                           blocks_from_rank_t
     166             : 
     167             :       LOGICAL                                          :: valid
     168          10 :       REAL(KIND=dp), DIMENSION(:, :), POINTER           :: blockp
     169             : 
     170             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_dissection_init'
     171             :       INTEGER :: handle
     172             : 
     173          10 :       CALL timeset(routineN, handle)
     174          10 :       CALL cite_reference(Lass2018)
     175             : 
     176          10 :       this%dbcsr_mat = matrix_p
     177             :       CALL dbcsr_get_info(matrix=this%dbcsr_mat, nblkcols_total=this%nblkcols, nblkrows_total=this%nblkrows, &
     178          10 :                           row_blk_size=this%row_blk_size, col_blk_size=this%col_blk_size, group=this%group, distribution=this%dist)
     179          10 :       CALL dbcsr_distribution_get(dist=this%dist, mynode=this%myrank, numnodes=this%numnodes)
     180             : 
     181          10 :       IF (this%nblkcols .NE. this%nblkrows) THEN
     182           0 :          CPABORT("Number of block rows and cols need to be identical")
     183             :       END IF
     184             : 
     185          20 :       DO i = 1, this%nblkcols
     186          20 :          IF (this%col_blk_size(i) .NE. this%row_blk_size(i)) THEN
     187           0 :             CPABORT("Block row sizes and col sizes need to be identical")
     188             :          END IF
     189             :       END DO
     190             : 
     191             :       ! TODO: We could do even more sanity checks here, e.g., the matrix must not be stored symmetrically
     192             : 
     193             :       ! For the submatrix method, we need global knwoledge about which blocks are actually used. Therefore, we create a COO
     194             :       ! representation of the blocks (without their contents) on all ranks.
     195             : 
     196             :       ! TODO: Right now, the COO contains all blocks. Also we transfer all blocks. We could skip half of them due to the matrix
     197             :       ! being symmetric (however, we need to transpose the blocks). This can increase performance only by a factor of 2 and is
     198             :       ! therefore deferred.
     199             : 
     200             :       ! Determine number of locally stored blocks
     201          10 :       this%local_blocks = 0
     202          10 :       CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.TRUE.)
     203          15 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     204           5 :          CALL dbcsr_iterator_next_block(iter, cur_row, cur_col)
     205           5 :          this%local_blocks = this%local_blocks + 1
     206             :       END DO
     207          10 :       CALL dbcsr_iterator_stop(iter)
     208             : 
     209           0 :       ALLOCATE (this%coo_cols_local(this%local_blocks), this%coo_rows_local(this%local_blocks), blocks_per_rank(this%numnodes), &
     210           0 :                 coo_dsplcmnts(this%numnodes), this%coo_col_offsets_local(this%nblkcols + 1), &
     211             :                 blocks_for_rank(0:this%numnodes - 1), blocks_from_rank(0:this%numnodes - 1), &
     212           0 :                 sendbufs(0:this%numnodes - 1), this%recvbufs(0:this%numnodes - 1), this%result_sendbufs(0:this%numnodes - 1), &
     213           0 :                 this%result_blocks_for_rank(0:this%numnodes - 1), this%result_blocks_from_rank(0:this%numnodes - 1), &
     214       23560 :                 this%result_blocks_for_rank_buf_offsets(0:this%numnodes - 1))
     215             : 
     216          10 :       i = 1
     217          10 :       CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.TRUE.)
     218          15 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     219           5 :          CALL dbcsr_iterator_next_block(iter, cur_row, cur_col)
     220           5 :          this%coo_cols_local(i) = cur_col
     221           5 :          this%coo_rows_local(i) = cur_row
     222           5 :          i = i + 1
     223             :       END DO
     224          10 :       CALL dbcsr_iterator_stop(iter)
     225             : 
     226             :       ! We only know how many blocks we own. What's with the other ranks?
     227          10 :       CALL this%group%allgather(msgout=this%local_blocks, msgin=blocks_per_rank)
     228          10 :       coo_dsplcmnts(1) = 0
     229          20 :       DO i = 1, this%numnodes - 1
     230          20 :          coo_dsplcmnts(i + 1) = coo_dsplcmnts(i) + blocks_per_rank(i)
     231             :       END DO
     232             : 
     233             :       ! Get a global view on the matrix
     234          30 :       this%nblks = SUM(blocks_per_rank)
     235           0 :       ALLOCATE (this%coo_cols(this%nblks), this%coo_rows(this%nblks), this%coo_col_offsets(this%nblkcols + 1), &
     236         100 :                 this%coo_dptr(this%nblks))
     237             :       CALL this%group%allgatherv(msgout=this%coo_rows_local, msgin=this%coo_rows, rcount=blocks_per_rank, &
     238          10 :                                  rdispl=coo_dsplcmnts)
     239             :       CALL this%group%allgatherv(msgout=this%coo_cols_local, msgin=this%coo_cols, rcount=blocks_per_rank, &
     240          10 :                                  rdispl=coo_dsplcmnts)
     241             : 
     242          10 :       DEALLOCATE (blocks_per_rank, coo_dsplcmnts)
     243             : 
     244             :       ! Sort COO arrays according to their columns
     245          10 :       CALL qsort_two(this%coo_cols_local, this%coo_rows_local, 1, this%local_blocks)
     246          10 :       CALL qsort_two(this%coo_cols, this%coo_rows, 1, this%nblks)
     247             : 
     248             :       ! Get COO array offsets for columns to accelerate lookups
     249          10 :       this%coo_col_offsets(this%nblkcols + 1) = this%nblks + 1
     250          10 :       j = 1
     251          20 :       DO i = 1, this%nblkcols
     252          10 :          DO WHILE ((j .LE. this%nblks))
     253          10 :             IF (this%coo_cols(j) .GE. i) EXIT
     254          10 :             j = j + 1
     255             :          END DO
     256          20 :          this%coo_col_offsets(i) = j
     257             :       END DO
     258             : 
     259             :       ! Same for local COO
     260          10 :       this%coo_col_offsets_local(this%nblkcols + 1) = this%local_blocks + 1
     261          10 :       j = 1
     262          20 :       DO i = 1, this%nblkcols
     263          10 :          DO WHILE ((j .LE. this%local_blocks))
     264           5 :             IF (this%coo_cols_local(j) .GE. i) EXIT
     265           5 :             j = j + 1
     266             :          END DO
     267          20 :          this%coo_col_offsets_local(i) = j
     268             :       END DO
     269             : 
     270             :       ! We could combine multiple columns to generate a single submatrix. For now, we have not found a practical use-case for this
     271             :       ! so we only look at single columns for now.
     272          10 :       this%cols_per_sm = 1
     273             : 
     274             :       ! Determine sizes of all submatrices. This is required in order to assess the computational effort that is required to process
     275             :       ! the submatrices.
     276          10 :       this%number_of_submatrices = this%nblkcols/this%cols_per_sm
     277          30 :       ALLOCATE (this%submatrix_sizes(this%number_of_submatrices))
     278          20 :       this%submatrix_sizes = 0
     279          10 :       flops_total = 0.0D0
     280          20 :       DO i = 1, this%number_of_submatrices
     281          10 :          CALL nonzero_rows%reset
     282          20 :          DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm            ! all colums that determine submatrix i
     283          30 :             DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
     284          20 :                CALL nonzero_rows%insert(this%coo_rows(k))
     285             :             END DO
     286             :          END DO
     287          20 :          DO j = 1, nonzero_rows%elements
     288          20 :             this%submatrix_sizes(i) = this%submatrix_sizes(i) + this%col_blk_size(nonzero_rows%get(j))
     289             :          END DO
     290          20 :          flops_total = flops_total + 2.0D0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
     291             :       END DO
     292             : 
     293             :       ! Create mapping from submatrices to ranks. Since submatrices can be of different sizes, we need to perform some load
     294             :       ! balancing here. For that we assume that arithmetic operations performed on the submatrices scale cubically.
     295          30 :       ALLOCATE (this%submatrix_owners(this%number_of_submatrices))
     296          10 :       flops_per_sm = flops_total/this%number_of_submatrices
     297          10 :       flops_per_rank = flops_total/this%numnodes
     298          10 :       flops_current = 0.0D0
     299          10 :       j = 0
     300          20 :       DO i = 1, this%number_of_submatrices
     301          10 :          this%submatrix_owners(i) = j
     302          10 :          flops_current = flops_current + 2.0D0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
     303          10 :          flops_remaining = flops_total - flops_current
     304          10 :          flops_threshold = (this%numnodes - j - 1)*flops_per_rank
     305             :          IF ((j .LT. (this%numnodes - 1)) &
     306          10 :              .AND. ((flops_remaining .LE. flops_threshold &
     307          10 :                      .OR. (this%number_of_submatrices - i) .LT. (this%numnodes - j)))) THEN
     308          10 :             j = j + 1
     309          10 :             flops_total = flops_total - flops_current
     310          10 :             flops_current = 0.0D0
     311             :          END IF
     312             :       END DO
     313             : 
     314             :       ! Prepare data structures for multithreaded loop
     315          10 :       numthreads = 1
     316          10 : !$    numthreads = omp_get_max_threads()
     317             : 
     318           0 :       ALLOCATE (result_blocks_from_rank_t(0:numthreads - 1), &
     319           0 :                 result_blocks_for_rank_t(0:numthreads - 1), &
     320           0 :                 blocks_from_rank_t(0:numthreads - 1), &
     321         100 :                 nonzero_rows_t(0:numthreads - 1))
     322             : 
     323             :       ! Figure out which blocks we need to receive. Blocks are identified here as indices into our COO representation.
     324             :       ! TODO: This currently shows limited parallel efficiency. Investigate further.
     325             : 
     326             :       !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
     327             :       !$OMP          NUM_THREADS(numthreads) &
     328             :       !$OMP          PRIVATE(i,j,k,l,m,l_limit_left,l_limit_right,cur_col,cur_row,mytid) &
     329             :       !$OMP          SHARED(result_blocks_from_rank_t,result_blocks_for_rank_t,blocks_from_rank_t,this,numthreads,nonzero_rows_t)
     330             :       mytid = 0
     331             : !$    mytid = omp_get_thread_num()
     332             : 
     333             :       ALLOCATE (nonzero_rows_t(mytid)%sets(1), &
     334             :                 result_blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1), &
     335             :                 result_blocks_for_rank_t(mytid)%sets(0:this%numnodes - 1), &
     336             :                 blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1))
     337             : 
     338          10 :       !$OMP DO schedule(guided)
     339             :       DO i = 1, this%number_of_submatrices
     340             :          CALL nonzero_rows_t(mytid)%sets(1)%reset
     341             :          DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm            ! all colums that determine submatrix i
     342             :             DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
     343             :                ! This block will be required to assemble the final block matrix as it is within an inducing column for submatrix i.
     344             :                ! Figure out who stores it and insert it into the result_blocks_* sets.
     345             :                CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=j, processor=m)
     346             :                IF (m .EQ. this%myrank) THEN
     347             :                   CALL result_blocks_from_rank_t(mytid)%sets(this%submatrix_owners(i))%insert(k)
     348             :                END IF
     349             :                IF (this%submatrix_owners(i) .EQ. this%myrank) THEN
     350             :                   CALL nonzero_rows_t(mytid)%sets(1)%insert(this%coo_rows(k))
     351             :                   CALL result_blocks_for_rank_t(mytid)%sets(m)%insert(k)
     352             :                END IF
     353             :             END DO
     354             :          END DO
     355             : 
     356             :          IF (this%submatrix_owners(i) .NE. this%myrank) CYCLE
     357             : 
     358             :          ! In the following, we determine all blocks required to build the submatrix. We interpret nonzero_rows_t(mytid)(j) as
     359             :          ! column and nonzero_rows_t(mytid)(k) as row.
     360             :          DO j = 1, nonzero_rows_t(mytid)%sets(1)%elements
     361             :             cur_col = nonzero_rows_t(mytid)%sets(1)%get(j)
     362             :             l_limit_left = this%coo_col_offsets(cur_col)
     363             :             l_limit_right = this%coo_col_offsets(cur_col + 1) - 1
     364             :             DO k = 1, nonzero_rows_t(mytid)%sets(1)%elements
     365             :                cur_row = nonzero_rows_t(mytid)%sets(1)%get(k)
     366             :                l = l_limit_left
     367             :                DO WHILE (l .LE. l_limit_right)
     368             :                   IF (this%coo_rows(l) .GE. cur_row) EXIT
     369             :                   l = l + 1
     370             :                END DO
     371             :                l_limit_left = l
     372             :                IF (l .LE. l_limit_right) THEN
     373             :                   IF (this%coo_rows(l) .EQ. cur_row) THEN
     374             :                      ! We found a valid block. Figure out what to do with it.
     375             :                      CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(l), &
     376             :                                                        column=this%coo_cols(l), processor=m)
     377             :                      CALL blocks_from_rank_t(mytid)%sets(m)%insert(l)
     378             :                   END IF
     379             :                END IF
     380             :             END DO
     381             :          END DO
     382             :       END DO
     383             :       !$OMP END DO
     384             :       !$OMP END PARALLEL
     385             : 
     386             :       ! Merge partial results from threads
     387          30 :       DO i = 0, this%numnodes - 1
     388          50 :          DO j = 0, numthreads - 1
     389          25 :             DO k = 1, result_blocks_from_rank_t(j)%sets(i)%elements
     390          25 :                CALL this%result_blocks_from_rank(i)%insert(result_blocks_from_rank_t(j)%sets(i)%get(k))
     391             :             END DO
     392          20 :             CALL result_blocks_from_rank_t(j)%sets(i)%reset
     393          25 :             DO k = 1, result_blocks_for_rank_t(j)%sets(i)%elements
     394          25 :                CALL this%result_blocks_for_rank(i)%insert(result_blocks_for_rank_t(j)%sets(i)%get(k))
     395             :             END DO
     396          20 :             CALL result_blocks_for_rank_t(j)%sets(i)%reset
     397          25 :             DO k = 1, blocks_from_rank_t(j)%sets(i)%elements
     398          25 :                CALL blocks_from_rank(i)%insert(blocks_from_rank_t(j)%sets(i)%get(k))
     399             :             END DO
     400          40 :             CALL blocks_from_rank_t(j)%sets(i)%reset
     401             :          END DO
     402             :       END DO
     403          20 :       DO i = 0, numthreads - 1
     404          10 :          CALL nonzero_rows_t(i)%sets(1)%reset
     405           0 :          DEALLOCATE (result_blocks_from_rank_t(i)%sets, result_blocks_for_rank_t(i)%sets, blocks_from_rank_t(i)%sets, &
     406       18080 :                      nonzero_rows_t(i)%sets)
     407             :       END DO
     408          50 :       DEALLOCATE (result_blocks_from_rank_t, result_blocks_for_rank_t, blocks_from_rank_t, nonzero_rows_t)
     409             : 
     410             :       ! Make other ranks aware of our needs
     411          40 :       ALLOCATE (num_blockids_send(0:this%numnodes - 1), num_blockids_recv(0:this%numnodes - 1))
     412          30 :       DO i = 0, this%numnodes - 1
     413          30 :          num_blockids_send(i) = blocks_from_rank(i)%elements
     414             :       END DO
     415          10 :       CALL this%group%alltoall(num_blockids_send, num_blockids_recv, 1)
     416          30 :       DO i = 0, this%numnodes - 1
     417          30 :          CALL blocks_for_rank(i)%alloc(num_blockids_recv(i))
     418             :       END DO
     419          10 :       DEALLOCATE (num_blockids_send, num_blockids_recv)
     420             : 
     421          10 :       IF (this%numnodes .GT. 1) THEN
     422          30 :          DO i = 1, this%numnodes
     423          20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     424          30 :             CALL this%group%irecv(msgout=blocks_for_rank(k)%data, source=k, request=blocks_for_rank(k)%mpi_request)
     425             :          END DO
     426          30 :          DO i = 1, this%numnodes
     427          20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     428          30 :             CALL this%group%send(blocks_from_rank(k)%getall(), k, 0)
     429             :          END DO
     430          30 :          DO i = 0, this%numnodes - 1
     431          30 :             CALL blocks_for_rank(i)%mpi_request%wait()
     432             :          END DO
     433             :       ELSE
     434           0 :          blocks_for_rank(0)%data = blocks_from_rank(0)%getall()
     435             :       END IF
     436             : 
     437             :       ! Free memory allocated in nonzero_rows
     438          10 :       CALL nonzero_rows%reset
     439             : 
     440             :       ! Make get calls on this%result_blocks_for_rank(...) threadsafe in other routines by updating the internal sorted list
     441          30 :       DO m = 0, this%numnodes - 1
     442          30 :          IF ((.NOT. this%result_blocks_for_rank(m)%sorted_up_to_date) .AND. (this%result_blocks_for_rank(m)%elements .GT. 0)) THEN
     443           5 :             CALL this%result_blocks_for_rank(m)%update_sorted
     444             :          END IF
     445             :       END DO
     446             : 
     447             :       ! Create and fill send buffers
     448          30 :       DO i = 0, this%numnodes - 1
     449          20 :          bufsize = 0
     450          25 :          DO j = 1, blocks_for_rank(i)%size
     451           5 :             k = blocks_for_rank(i)%data(j)
     452          25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     453             :          END DO
     454          20 :          CALL sendbufs(i)%alloc(bufsize)
     455             : 
     456          20 :          bufsize = 0
     457          20 :          CALL this%result_blocks_for_rank_buf_offsets(i)%alloc(this%result_blocks_for_rank(i)%elements)
     458          25 :          DO j = 1, this%result_blocks_for_rank(i)%elements
     459           5 :             k = this%result_blocks_for_rank(i)%get(j)
     460           5 :             this%result_blocks_for_rank_buf_offsets(i)%data(j) = bufsize
     461          25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     462             :          END DO
     463          20 :          CALL this%result_sendbufs(i)%alloc(bufsize)
     464             : 
     465          20 :          bufsize = 0
     466          35 :          DO j = 1, blocks_for_rank(i)%size
     467           5 :             k = blocks_for_rank(i)%data(j)
     468           5 :             CALL dbcsr_get_block_p(this%dbcsr_mat, row=this%coo_rows(k), col=this%coo_cols(k), block=blockp, found=valid)
     469           5 :             IF (.NOT. valid) THEN
     470           0 :                CPABORT("Block included in our COO and placed on our rank could not be fetched!")
     471             :             END IF
     472          15 :             bufsize_next = bufsize + SIZE(blockp)
     473         200 :             sendbufs(i)%data(bufsize + 1:bufsize_next) = RESHAPE(blockp, [SIZE(blockp)])
     474          30 :             bufsize = bufsize_next
     475             :          END DO
     476             :       END DO
     477             : 
     478             :       ! Create receive buffers and mapping from blocks to memory locations
     479          30 :       DO i = 0, this%numnodes - 1
     480          20 :          bufsize = 0
     481          25 :          DO j = 1, blocks_from_rank(i)%elements
     482           5 :             k = blocks_from_rank(i)%get(j)
     483          25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     484             :          END DO
     485          20 :          CALL this%recvbufs(i)%alloc(bufsize)
     486          20 :          bufsize = 0
     487          35 :          DO j = 1, blocks_from_rank(i)%elements
     488           5 :             k = blocks_from_rank(i)%get(j)
     489           5 :             bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     490           5 :             this%coo_dptr(k)%target => this%recvbufs(i)%data(bufsize + 1:bufsize_next)
     491          25 :             bufsize = bufsize_next
     492             :          END DO
     493             :       END DO
     494             : 
     495          30 :       DO i = 0, this%numnodes - 1
     496          20 :          CALL blocks_for_rank(i)%dealloc
     497          30 :          CALL blocks_from_rank(i)%reset
     498             :       END DO
     499        5180 :       DEALLOCATE (blocks_for_rank, blocks_from_rank)
     500             : 
     501          10 :       IF (this%numnodes .GT. 1) THEN
     502             :          ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
     503             :          ! and first trying to receive from our left neighbor.
     504          30 :          DO i = 1, this%numnodes
     505          20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     506          20 :             CALL this%group%irecv(msgout=this%recvbufs(k)%data, source=k, request=this%recvbufs(k)%mpi_request)
     507          20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     508          30 :             CALL this%group%isend(msgin=sendbufs(k)%data, dest=k, request=sendbufs(k)%mpi_request)
     509             :          END DO
     510          30 :          DO i = 0, this%numnodes - 1
     511          20 :             CALL sendbufs(i)%mpi_request%wait()
     512          30 :             CALL this%recvbufs(i)%mpi_request%wait()
     513             :          END DO
     514             :       ELSE
     515           0 :          this%recvbufs(0)%data = sendbufs(0)%data
     516             :       END IF
     517             : 
     518          30 :       DO i = 0, this%numnodes - 1
     519          30 :          CALL sendbufs(i)%dealloc
     520             :       END DO
     521          10 :       DEALLOCATE (sendbufs)
     522             : 
     523          10 :       this%initialized = .TRUE.
     524             : 
     525          10 :       CALL timestop(handle)
     526        2610 :    END SUBROUTINE submatrix_dissection_init
     527             : 
     528             : ! **************************************************************************************************
     529             : !> \brief free all associated memory, afterwards submatrix_dissection_init needs to be called again
     530             : !> \param this - object of class submatrix_dissection_type
     531             : ! **************************************************************************************************
     532          10 :    PURE SUBROUTINE submatrix_dissection_final(this)
     533             :       CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
     534             :       INTEGER                                         :: i
     535             : 
     536          10 :       this%initialized = .FALSE.
     537             : 
     538          10 :       IF (ALLOCATED(this%submatrix_sizes)) DEALLOCATE (this%submatrix_sizes)
     539          10 :       IF (ALLOCATED(this%coo_cols_local)) DEALLOCATE (this%coo_cols_local)
     540          10 :       IF (ALLOCATED(this%coo_rows_local)) DEALLOCATE (this%coo_rows_local)
     541          10 :       IF (ALLOCATED(this%coo_col_offsets_local)) DEALLOCATE (this%coo_col_offsets_local)
     542          10 :       IF (ALLOCATED(this%result_blocks_for_rank_buf_offsets)) THEN
     543          30 :          DO i = 0, this%numnodes - 1
     544          30 :             CALL this%result_blocks_for_rank_buf_offsets(i)%dealloc
     545             :          END DO
     546          10 :          DEALLOCATE (this%result_blocks_for_rank_buf_offsets)
     547             :       END IF
     548          10 :       IF (ALLOCATED(this%recvbufs)) THEN
     549          30 :          DO i = 0, this%numnodes - 1
     550          30 :             CALL this%recvbufs(i)%dealloc
     551             :          END DO
     552          10 :          DEALLOCATE (this%recvbufs)
     553             :       END IF
     554          10 :       IF (ALLOCATED(this%result_sendbufs)) THEN
     555          30 :          DO i = 0, this%numnodes - 1
     556          30 :             CALL this%result_sendbufs(i)%dealloc
     557             :          END DO
     558          10 :          DEALLOCATE (this%result_sendbufs)
     559             :       END IF
     560          10 :       IF (ALLOCATED(this%result_blocks_for_rank)) THEN
     561          30 :          DO i = 0, this%numnodes - 1
     562          30 :             CALL this%result_blocks_for_rank(i)%reset
     563             :          END DO
     564        5170 :          DEALLOCATE (this%result_blocks_for_rank)
     565             :       END IF
     566          10 :       IF (ALLOCATED(this%result_blocks_from_rank)) THEN
     567          30 :          DO i = 0, this%numnodes - 1
     568          30 :             CALL this%result_blocks_from_rank(i)%reset
     569             :          END DO
     570        5170 :          DEALLOCATE (this%result_blocks_from_rank)
     571             :       END IF
     572          10 :       IF (ALLOCATED(this%coo_cols)) DEALLOCATE (this%coo_cols)
     573          10 :       IF (ALLOCATED(this%coo_rows)) DEALLOCATE (this%coo_rows)
     574          10 :       IF (ALLOCATED(this%coo_col_offsets)) DEALLOCATE (this%coo_col_offsets)
     575          10 :       IF (ALLOCATED(this%coo_dptr)) DEALLOCATE (this%coo_dptr)
     576          10 :       IF (ALLOCATED(this%submatrix_owners)) DEALLOCATE (this%submatrix_owners)
     577             : 
     578          10 :    END SUBROUTINE submatrix_dissection_final
     579             : 
     580             : ! **************************************************************************************************
     581             : !> \brief generate a specific submatrix
     582             : !> \param this - object of class submatrix_dissection_type
     583             : !> \param sm_id - id of the submatrix to generate
     584             : !> \param sm - generated submatrix
     585             : ! **************************************************************************************************
     586           6 :    SUBROUTINE submatrix_generate_sm(this, sm_id, sm)
     587             :       CLASS(submatrix_dissection_type), INTENT(IN)             :: this
     588             :       INTEGER, INTENT(IN)                                      :: sm_id
     589             :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(OUT) :: sm
     590             : 
     591             :       INTEGER                                                  :: sm_dim, i, j, k, offset_x1, offset_x2, offset_y1, &
     592             :                                                                   offset_y2, k_limit_left, k_limit_right
     593        1548 :       TYPE(set_type)                                           :: nonzero_rows
     594             : 
     595           6 :       IF (.NOT. this%initialized) THEN
     596           0 :          CPABORT("Submatrix dissection not initialized")
     597             :       END IF
     598             : 
     599           6 :       IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
     600           0 :          CPABORT("This rank is not supposed to construct this submatrix")
     601             :       END IF
     602             : 
     603             :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     604           6 :       CALL nonzero_rows%reset
     605          12 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     606          18 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     607          12 :             CALL nonzero_rows%insert(this%coo_rows(j))
     608             :          END DO
     609             :       END DO
     610           6 :       sm_dim = 0
     611          12 :       DO i = 1, nonzero_rows%elements
     612          12 :          sm_dim = sm_dim + this%col_blk_size(nonzero_rows%get(i))
     613             :       END DO
     614             : 
     615          24 :       ALLOCATE (sm(sm_dim, sm_dim))
     616         258 :       sm = 0
     617             : 
     618           6 :       offset_x1 = 0
     619          12 :       DO j = 1, nonzero_rows%elements
     620           6 :          offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
     621           6 :          offset_y1 = 0
     622           6 :          k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
     623           6 :          k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
     624          12 :          DO i = 1, nonzero_rows%elements
     625           6 :             offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
     626             :             ! Copy block nonzero_rows(i),nonzero_rows(j) to sm(i,j) (if the block actually exists)
     627           6 :             k = k_limit_left
     628           6 :             DO WHILE (k .LE. k_limit_right)
     629           6 :                IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
     630           0 :                k = k + 1
     631             :             END DO
     632           6 :             k_limit_left = k
     633           6 :             IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
     634             :                sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2) = RESHAPE(this%coo_dptr(k)%target, &
     635         270 :                                                                               (/offset_y2 - offset_y1, offset_x2 - offset_x1/))
     636             :             END IF
     637          12 :             offset_y1 = offset_y2
     638             :          END DO
     639          12 :          offset_x1 = offset_x2
     640             :       END DO
     641             : 
     642             :       ! Free memory allocated in nonzero_rows
     643           6 :       CALL nonzero_rows%reset
     644             : 
     645        1548 :    END SUBROUTINE submatrix_generate_sm
     646             : 
     647             : ! **************************************************************************************************
     648             : !> \brief determine submatrix ids that are handled by a specific rank
     649             : !> \param this - object of class submatrix_dissection_type
     650             : !> \param rank - rank id of interest
     651             : !> \param sm_ids - list of submatrix ids handled by that rank
     652             : ! **************************************************************************************************
     653          10 :    SUBROUTINE submatrix_get_sm_ids_for_rank(this, rank, sm_ids)
     654             :       CLASS(submatrix_dissection_type), INTENT(IN)    :: this
     655             :       INTEGER, INTENT(IN)                             :: rank
     656             :       INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(OUT) :: sm_ids
     657             :       INTEGER                                         :: count, i
     658             : 
     659          10 :       IF (.NOT. this%initialized) THEN
     660           0 :          CPABORT("Submatrix dissection not initialized")
     661             :       END IF
     662             : 
     663          10 :       count = 0
     664          20 :       DO i = 1, this%number_of_submatrices
     665          20 :          IF (this%submatrix_owners(i) .EQ. rank) count = count + 1
     666             :       END DO
     667             : 
     668          25 :       ALLOCATE (sm_ids(count))
     669             : 
     670          10 :       count = 0
     671          20 :       DO i = 1, this%number_of_submatrices
     672          20 :          IF (this%submatrix_owners(i) .EQ. rank) THEN
     673           5 :             count = count + 1
     674           5 :             sm_ids(count) = i
     675             :          END IF
     676             :       END DO
     677             : 
     678          10 :    END SUBROUTINE submatrix_get_sm_ids_for_rank
     679             : 
     680             : ! **************************************************************************************************
     681             : !> \brief copy result columns from a submatrix into result buffer
     682             : !> \param this - object of class submatrix_dissection_type
     683             : !> \param sm_id - id of the submatrix
     684             : !> \param sm - result-submatrix
     685             : ! **************************************************************************************************
     686           6 :    SUBROUTINE submatrix_cpy_resultcol(this, sm_id, sm)
     687             :       CLASS(submatrix_dissection_type), INTENT(INOUT)         :: this
     688             :       INTEGER, INTENT(in)                                     :: sm_id
     689             :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(IN) :: sm
     690             : 
     691        1548 :       TYPE(set_type)                                          :: nonzero_rows
     692             :       INTEGER                                                 :: i, j, k, m, sm_col_offset, offset_x1, offset_x2, offset_y1, &
     693             :                                                                  offset_y2, bufsize, bufsize_next, k_limit_left, k_limit_right
     694           6 :       INTEGER, DIMENSION(:), ALLOCATABLE                      :: buf_offset_idxs
     695             : 
     696           6 :       IF (.NOT. this%initialized) THEN
     697           0 :          CPABORT("Submatrix dissection is not initizalized")
     698             :       END IF
     699             : 
     700           6 :       IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
     701           0 :          CPABORT("This rank is not supposed to operate on this submatrix")
     702             :       END IF
     703             : 
     704          18 :       ALLOCATE (buf_offset_idxs(0:this%numnodes - 1))
     705          18 :       buf_offset_idxs = 1
     706             : 
     707             :       ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
     708           6 :       sm_col_offset = 0
     709          12 :       DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm     ! all colums that determine submatrix sm_id
     710          18 :          DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1  ! all blocks that are within this column
     711          12 :             CALL nonzero_rows%insert(this%coo_rows(j))
     712             :          END DO
     713             :       END DO
     714             : 
     715           6 :       sm_col_offset = 0
     716           6 :       DO i = 1, nonzero_rows%elements
     717           6 :          IF (nonzero_rows%get(i) .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
     718             :             ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
     719           6 :             sm_col_offset = i
     720           6 :             EXIT
     721             :          END IF
     722             :       END DO
     723           6 :       IF (sm_col_offset .EQ. 0) THEN
     724           0 :          CPABORT("Could not determine relevant result columns of submatrix")
     725             :       END IF
     726             : 
     727           6 :       offset_x1 = 0
     728          12 :       DO j = 1, nonzero_rows%elements
     729           6 :          offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
     730             :          ! We only want to copy the blocks from the result columns
     731           6 :          IF ((j .GE. sm_col_offset) .AND. (j .LT. sm_col_offset + this%cols_per_sm)) THEN
     732           6 :             offset_y1 = 0
     733           6 :             k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
     734           6 :             k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
     735          12 :             DO i = 1, nonzero_rows%elements
     736           6 :                offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
     737             :                ! Check if sm(i,j), i.e., (nonzero_rows(i),nonzero_rows(j)) exists in the original matrix and if so, copy it into the
     738             :                ! correct send buffer.
     739           6 :                k = k_limit_left
     740           6 :                DO WHILE (k .LE. k_limit_right)
     741           6 :                   IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
     742           0 :                   k = k + 1
     743             :                END DO
     744           6 :                k_limit_left = k
     745           6 :                IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
     746             :                   CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=this%coo_cols(k), &
     747           6 :                                                     processor=m)
     748           6 :                   DO WHILE (buf_offset_idxs(m) .LE. this%result_blocks_for_rank(m)%elements)
     749           6 :                      IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .GE. k) EXIT
     750           0 :                      buf_offset_idxs(m) = buf_offset_idxs(m) + 1
     751             :                   END DO
     752           6 :                   IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .NE. k) THEN
     753           0 :                      CPABORT("Could not determine buffer offset for block")
     754             :                   END IF
     755           6 :                   bufsize = this%result_blocks_for_rank_buf_offsets(m)%data(buf_offset_idxs(m))
     756           6 :                   bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     757             :                   this%result_sendbufs(m)%data(bufsize + 1:bufsize_next) = RESHAPE( &
     758             :                                                                            sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2), &
     759         228 :                                                                            (/bufsize_next - bufsize/))
     760             :                END IF
     761          12 :                offset_y1 = offset_y2
     762             :             END DO
     763             :          END IF
     764          12 :          offset_x1 = offset_x2
     765             :       END DO
     766             : 
     767           6 :       DEALLOCATE (buf_offset_idxs)
     768             : 
     769             :       ! Free memory in set
     770           6 :       CALL nonzero_rows%reset
     771             : 
     772        1548 :    END SUBROUTINE submatrix_cpy_resultcol
     773             : 
     774             : ! **************************************************************************************************
     775             : !> \brief finalize results back into a dbcsr matrix
     776             : !> \param this - object of class submatrix_dissection_type
     777             : !> \param resultmat - result dbcsr matrix
     778             : !> \par History
     779             : !>       2020.02 created [Michael Lass]
     780             : !>       2020.05 add time measurements [Michael Lass]
     781             : ! **************************************************************************************************
     782          10 :    SUBROUTINE submatrix_communicate_results(this, resultmat)
     783             :       CLASS(submatrix_dissection_type), INTENT(INOUT)                 :: this
     784             :       TYPE(dbcsr_type), INTENT(INOUT)                                 :: resultmat
     785             : 
     786             :       INTEGER                                                         :: i, j, k, cur_row, cur_col, cur_sm, cur_buf, last_buf, &
     787             :                                                                          bufsize, bufsize_next, row_size, col_size
     788          10 :       REAL(kind=dp), DIMENSION(:), POINTER                            :: vector
     789          10 :       TYPE(buffer_type), DIMENSION(:), ALLOCATABLE                    :: result_recvbufs
     790             : 
     791             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_communicate_results'
     792             :       INTEGER :: handle
     793             : 
     794          10 :       CALL timeset(routineN, handle)
     795             : 
     796          60 :       ALLOCATE (result_recvbufs(0:this%numnodes - 1))
     797          30 :       DO i = 0, this%numnodes - 1
     798          20 :          bufsize = 0
     799          25 :          DO j = 1, this%result_blocks_from_rank(i)%elements
     800           5 :             k = this%result_blocks_from_rank(i)%get(j)
     801          25 :             bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
     802             :          END DO
     803          30 :          CALL result_recvbufs(i)%alloc(bufsize)
     804             :       END DO
     805             : 
     806             :       ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
     807             :       ! and first trying to receive from our left neighbor.
     808          10 :       IF (this%numnodes .GT. 1) THEN
     809          30 :          DO i = 1, this%numnodes
     810          20 :             k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
     811          20 :             CALL this%group%irecv(msgout=result_recvbufs(k)%data, source=k, request=result_recvbufs(k)%mpi_request)
     812          20 :             k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
     813          30 :             CALL this%group%isend(msgin=this%result_sendbufs(k)%data, dest=k, request=this%result_sendbufs(k)%mpi_request)
     814             :          END DO
     815          30 :          DO i = 0, this%numnodes - 1
     816          20 :             CALL this%result_sendbufs(i)%mpi_request%wait()
     817          30 :             CALL result_recvbufs(i)%mpi_request%wait()
     818             :          END DO
     819             :       ELSE
     820           0 :          result_recvbufs(0)%data = this%result_sendbufs(0)%data
     821             :       END IF
     822             : 
     823          10 :       last_buf = -1
     824          10 :       bufsize = 0
     825          15 :       DO i = 1, this%local_blocks
     826           5 :          cur_col = this%coo_cols_local(i)
     827           5 :          cur_row = this%coo_rows_local(i)
     828           5 :          cur_sm = (cur_col - 1)/this%cols_per_sm + 1
     829           5 :          cur_buf = this%submatrix_owners(cur_sm)
     830           5 :          IF (cur_buf .GT. last_buf) bufsize = 0
     831           5 :          row_size = this%row_blk_size(cur_row)
     832           5 :          col_size = this%col_blk_size(cur_col)
     833           5 :          bufsize_next = bufsize + row_size*col_size
     834           5 :          vector => result_recvbufs(cur_buf)%data(bufsize + 1:bufsize_next)
     835             :          CALL dbcsr_put_block(matrix=resultmat, row=cur_row, col=cur_col, &
     836          15 :                               block=RESHAPE(vector, [row_size, col_size]))
     837           5 :          bufsize = bufsize_next
     838          15 :          last_buf = cur_buf
     839             :       END DO
     840             : 
     841          30 :       DO i = 0, this%numnodes - 1
     842          30 :          CALL result_recvbufs(i)%dealloc
     843             :       END DO
     844          10 :       DEALLOCATE (result_recvbufs)
     845             : 
     846          10 :       CALL dbcsr_finalize(resultmat)
     847             : 
     848          10 :       CALL timestop(handle)
     849          10 :    END SUBROUTINE submatrix_communicate_results
     850             : 
     851             : ! **************************************************************************************************
     852             : !> \brief sort two integer arrays using quicksort, using the second list as second-level sorting criterion
     853             : !> \param arr_a - first input array
     854             : !> \param arr_b - second input array
     855             : !> \param left - left boundary of region to be sorted
     856             : !> \param right - right boundary of region to be sorted
     857             : ! **************************************************************************************************
     858          20 :    RECURSIVE PURE SUBROUTINE qsort_two(arr_a, arr_b, left, right)
     859             : 
     860             :       INTEGER, DIMENSION(:), INTENT(inout)               :: arr_a, arr_b
     861             :       INTEGER, INTENT(in)                                :: left, right
     862             : 
     863             :       INTEGER                                            :: i, j, pivot_a, pivot_b, tmp
     864             : 
     865          20 :       IF (right - left .LT. 1) RETURN
     866             : 
     867           0 :       i = left
     868           0 :       j = right - 1
     869           0 :       pivot_a = arr_a(right)
     870           0 :       pivot_b = arr_b(right)
     871             : 
     872           0 :       DO
     873           0 :          DO WHILE ((arr_a(i) .LT. pivot_a) .OR. ((arr_a(i) .EQ. pivot_a) .AND. (arr_b(i) .LT. pivot_b)))
     874           0 :             i = i + 1
     875             :          END DO
     876           0 :          DO WHILE ((j .GT. i) .AND. ((arr_a(j) .GT. pivot_a) .OR. ((arr_a(j) .EQ. pivot_a) .AND. (arr_b(j) .GE. pivot_b))))
     877           0 :             j = j - 1
     878             :          END DO
     879           0 :          IF (i .LT. j) THEN
     880           0 :             tmp = arr_a(i)
     881           0 :             arr_a(i) = arr_a(j)
     882           0 :             arr_a(j) = tmp
     883           0 :             tmp = arr_b(i)
     884           0 :             arr_b(i) = arr_b(j)
     885           0 :             arr_b(j) = tmp
     886             :          ELSE
     887             :             EXIT
     888             :          END IF
     889             :       END DO
     890             : 
     891           0 :       IF ((arr_a(i) .GT. pivot_a) .OR. (arr_a(i) .EQ. pivot_a .AND. arr_b(i) .GT. pivot_b)) THEN
     892           0 :          tmp = arr_a(i)
     893           0 :          arr_a(i) = arr_a(right)
     894           0 :          arr_a(right) = tmp
     895           0 :          tmp = arr_b(i)
     896           0 :          arr_b(i) = arr_b(right)
     897           0 :          arr_b(right) = tmp
     898             :       END IF
     899             : 
     900           0 :       IF (i - 1 .GT. left) CALL qsort_two(arr_a, arr_b, left, i - 1)
     901           0 :       IF (i + 1 .LT. right) CALL qsort_two(arr_a, arr_b, i + 1, right)
     902             : 
     903             :    END SUBROUTINE qsort_two
     904             : 
     905           0 : END MODULE submatrix_dissection

Generated by: LCOV version 1.15