LCOV - code coverage report
Current view: top level - src - smeagol_matrix_utils.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:4dc10b3) Lines: 523 698 74.9 %
Date: 2024-11-21 06:45:46 Functions: 14 17 82.4 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief Routines to convert sparse matrices between DBCSR (distributed-blocks compressed sparse rows)
      10             : !>        and SIESTA (distributed compressed sparse columns) formats.
      11             : !> \author Sergey Chulkov
      12             : !> \author Christian Ahart
      13             : !> \author Clotilde Cucinotta
      14             : ! **************************************************************************************************
      15             : MODULE smeagol_matrix_utils
      16             :    USE cell_types, ONLY: cell_type, &
      17             :                          real_to_scaled, &
      18             :                          scaled_to_real
      19             :    USE cp_dbcsr_api, ONLY: dbcsr_get_block_p, &
      20             :                            dbcsr_get_info, &
      21             :                            dbcsr_p_type, &
      22             :                            dbcsr_set
      23             :    USE kinds, ONLY: dp, &
      24             :                     dp_size, &
      25             :                     int_8
      26             :    USE message_passing, ONLY: mp_para_env_type, &
      27             :                               mp_request_type, &
      28             :                               mp_waitall
      29             :    USE negf_matrix_utils, ONLY: get_index_by_cell
      30             : #if defined(__SMEAGOL)
      31             :    USE parallel, ONLY: GetNodeOrbs, &
      32             :                        GlobalToLocalOrb, &
      33             :                        LocalToGlobalOrb, &
      34             :                        WhichNodeOrb
      35             : #endif
      36             :    USE particle_types, ONLY: particle_type
      37             :    USE qs_neighbor_list_types, ONLY: get_iterator_info, &
      38             :                                      get_neighbor_list_set_p, &
      39             :                                      neighbor_list_iterate, &
      40             :                                      neighbor_list_iterator_create, &
      41             :                                      neighbor_list_iterator_p_type, &
      42             :                                      neighbor_list_iterator_release, &
      43             :                                      neighbor_list_set_p_type
      44             :    USE qs_subsys_types, ONLY: qs_subsys_get, &
      45             :                               qs_subsys_type
      46             :    USE util, ONLY: sort
      47             : #include "./base/base_uses.f90"
      48             : 
      49             :    IMPLICIT NONE
      50             :    PRIVATE
      51             : 
      52             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'smeagol_matrix_utils'
      53             :    LOGICAL, PARAMETER, PRIVATE          :: debug_this_module = .FALSE.
      54             : 
      55             :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_iatom_index = 1
      56             :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_jatom_index = 2
      57             :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_dbcsr_image_index = 3
      58             :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_siesta_image_index = 4
      59             :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_siesta_transp_image_index = 5
      60             :    INTEGER, PARAMETER, PRIVATE          :: neighbor_list_dim1 = neighbor_list_siesta_transp_image_index
      61             : 
      62             :    PUBLIC :: siesta_distrib_csc_struct_type
      63             :    PUBLIC :: siesta_struct_create, siesta_struct_release
      64             :    PUBLIC :: convert_dbcsr_to_distributed_siesta, convert_distributed_siesta_to_dbcsr
      65             : 
      66             :    PRIVATE :: get_negf_cell_ijk, index_in_canonical_enumeration, number_from_canonical_enumeration, pbc_0_1
      67             :    PRIVATE :: get_number_of_mpi_sendrecv_requests, assign_nonzero_elements_to_requests
      68             : 
      69             :    !> number of DBCSR matrix elements to receive from a given rank
      70             :    INTEGER, PARAMETER, PRIVATE          :: nelements_dbcsr_recv = 1
      71             :    !> number of DBCSR matrix elements to send to a given rank
      72             :    INTEGER, PARAMETER, PRIVATE          :: nelements_dbcsr_send = 2
      73             :    INTEGER, PARAMETER, PRIVATE          :: nelements_dbcsr_dim2 = nelements_dbcsr_send
      74             : 
      75             :    INTEGER(kind=int_8), PARAMETER, PRIVATE :: max_mpi_packet_size_bytes = 134217728 ! 128 MiB (to limit memory usage for matrix redistribution)
      76             :    INTEGER(kind=int_8), PARAMETER, PRIVATE :: max_mpi_packet_size_dp = max_mpi_packet_size_bytes/INT(dp_size, kind=int_8)
      77             : 
      78             :    ! a portable way to determine the upper bound for tag value is to call
      79             :    ! MPI_COMM_GET_ATTR(comm, MPI_TAG_UB, max_mpi_rank, flag, ierror).
      80             :    ! The MPI specification guarantees a value of 32767
      81             :    INTEGER, PARAMETER, PRIVATE          :: max_mpi_rank = 32767
      82             : 
      83             : ! **************************************************************************************************
      84             : !> \brief Sparsity pattern of replicated SIESTA compressed sparse column (CSC) matrices
      85             : ! **************************************************************************************************
      86             :    TYPE siesta_distrib_csc_struct_type
      87             :       !> gather all non-zero matrix elements on the given MPI process.
      88             :       !> Distribute the elements across MPI processes if gather_root < 0.
      89             :       !> The matrix elements should be located on I/O process in case of bulk transport,
      90             :       !> and should be distributed in case of SMEAGOL calculation.
      91             :       INTEGER                                            :: gather_root = 0
      92             :       !> Based of full (.FALSE.) or upper-triangular (.TRUE.) DBCSR matrix.
      93             :       !> It is used in CPASSERTs to allow access to lower-triangular matrix elements of non-symmetric DBCSR matrices
      94             :       !> In case there is no bugs in this module, these CPASSERTs should newer trigger.
      95             :       !> Therefore the 'symmetric' variable alongside with relevant CPASSERT calls are excessive and can be removed.
      96             :       LOGICAL                                            :: symmetric = .TRUE.
      97             :       !> number of neighbour list nodes for each MPI process (0:num_pe-1).
      98             :       !> If do_merge == .TRUE., nodes for different cell images along k cell vector are merged into one node
      99             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: nnodes_per_proc
     100             : 
     101             :       !> replicated neighbour list (1:neighbor_list_dim1, 1:SUM(nnodes_per_proc)).
     102             :       !> Neighbour list nodes are ordered according to their MPI ranks.
     103             :       !> Thus, the first nnodes_per_proc(0) nodes are stored on MPI rank 0,
     104             :       !> the next nnodes_per_proc(1) nodes reside on MPI rank 1, etc
     105             :       !> Nodes for cell images along transport direction are merged into one node.
     106             :       !> The number of non-zero DBCSR matrix blocks and their DBCSR cell image indices
     107             :       !> are stored into 'n_dbcsr_cell_images_to_merge' and 'dbcsr_cell_image_to_merge' arrays
     108             :       INTEGER, ALLOCATABLE, DIMENSION(:, :)               :: nl_repl
     109             : 
     110             :       !> number of DBCSR images for each local merged neighbour list node (1:nnodes_per_proc(para_env%mepos))
     111             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: n_dbcsr_cell_images_to_merge
     112             :       !> list of DBCSR image indices to merge; (1:SUM(n_dbcsr_cell_images_to_merge))
     113             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dbcsr_cell_image_to_merge
     114             : 
     115             :       !> number of DBCSR non-zero matrix elements that should be received/sent from each MPI rank
     116             :       !> (0:num_pe-1, 1:nelements_dbcsr_dim2)
     117             :       INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:, :)   :: nelements_per_proc
     118             : 
     119             :       !> number of non-zero matrix elements local to this MPI rank.
     120             :       INTEGER(kind=int_8)                                :: n_nonzero_elements = 0_int_8
     121             :       INTEGER                                            :: nrows = 0, ncols = 0
     122             :       !> Number of non-zero matrix elements (columns) on each row
     123             :       !> n_nonzero_cols(1:nrows); same as 'numh' in SMEAGOL code.
     124             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: n_nonzero_cols
     125             :       !> offset of the first non-zero matrix elements on each row.
     126             :       !> column_offset(1:nrows); same as 'listhptr' in SMEAGOL code.
     127             :       !> It should be declared as INTEGER(kind=int_8), but SMEAGOL expects it to be INTEGER
     128             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: row_offset
     129             :       !> column index of each non-zero matrix element.
     130             :       !> col_index(1:n_nonzero_elements); same as 'listh' in SMEAGOL code
     131             :       !> col index of the first non-zero matrix elements of irow row is col_index(row_offset(irow)+1)
     132             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: col_index
     133             :       !> index of the non-zero matrix element in a communication buffer for each rank
     134             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: packed_index
     135             :       !> R_atom_row - R_atom_col
     136             :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :)         :: xij
     137             :       !> equivalent atomic orbitals
     138             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: indxuo
     139             :       !> atomic index on which the orbital is centred
     140             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: iaorb
     141             :       !> coordinates of all atoms in the supercell
     142             :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :)         :: xa
     143             :    END TYPE siesta_distrib_csc_struct_type
     144             : 
     145             : CONTAINS
     146             : 
     147             : ! **************************************************************************************************
     148             : !> \brief Map non-zero matrix blocks between sparse matrices in DBCSR and SIESTA formats.
     149             : !> \param siesta_struct      structure that stores metadata (sparsity pattern) of sparse SIESTA matrices
     150             : !> \param matrix_dbcsr_kp    DBCSR matrices for each cell image
     151             : !> \param subsys             QuickStep molecular system
     152             : !> \param cell_to_index      array to convert 3-D cell indices to 1-D DBCSR image indices
     153             : !> \param sab_nl             pair-wise neighbour list
     154             : !> \param para_env           MPI parallel environment
     155             : !> \param max_ij_cell_image  largest index of cell images along i and j cell vectors (e.g. (2,0) in
     156             : !>                           case of 5 cell images (0,0), (1,0), (-1,0), (2,0), and (-2,0))
     157             : !> \param do_merge           merge DBCSR images along transport direction (k cell vector)
     158             : !> \param gather_root        distribute non-zero matrix elements of SIESTA matrices across all
     159             : !>                           parallel processes (-1), or gather them on the given MPI rank (>= 0).
     160             : ! **************************************************************************************************
     161           2 :    SUBROUTINE siesta_struct_create(siesta_struct, matrix_dbcsr_kp, subsys, cell_to_index, &
     162             :                                    sab_nl, para_env, max_ij_cell_image, do_merge, gather_root)
     163             :       TYPE(siesta_distrib_csc_struct_type), &
     164             :          INTENT(inout)                                   :: siesta_struct
     165             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
     166             :       TYPE(qs_subsys_type), POINTER                      :: subsys
     167             :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
     168             :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
     169             :          POINTER                                         :: sab_nl
     170             :       TYPE(mp_para_env_type), POINTER                    :: para_env
     171             :       INTEGER, DIMENSION(2), INTENT(inout)               :: max_ij_cell_image
     172             :       LOGICAL, INTENT(in)                                :: do_merge
     173             :       INTEGER, INTENT(in)                                :: gather_root
     174             : 
     175             :       CHARACTER(len=*), PARAMETER :: routineN = 'siesta_struct_create'
     176             : 
     177             :       CHARACTER(len=20)                                  :: str_nelem, str_nelem_max
     178             :       INTEGER :: handle, iatom, icol, icol_blk, icol_local, image, image_j, image_k, irow, &
     179             :          irow_local, natoms, ncells_siesta_total, ncols_blk, ncols_total, nrows_local, &
     180             :          nrows_total, offset
     181             :       INTEGER(kind=int_8)                                :: n_nonzero_elements_local
     182             :       INTEGER, DIMENSION(3)                              :: max_ijk_cell_image, ncells_siesta
     183           2 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size
     184             :       LOGICAL                                            :: do_distribute, is_root_rank
     185             :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :)        :: particle_coords
     186             :       REAL(kind=dp), DIMENSION(3)                        :: real_cell_shift, scaled_cell_shift
     187             :       TYPE(cell_type), POINTER                           :: cell
     188           2 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     189             : 
     190           2 :       CALL timeset(routineN, handle)
     191           2 :       do_distribute = gather_root < 0
     192           2 :       is_root_rank = gather_root == para_env%mepos
     193             : 
     194             :       ! here row_blk_offset / col_blk_offset are global indices of the first row / column of a given non-zero block.
     195             :       ! They are not offsets (index-1) but the actual indices.
     196             :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
     197             :                           nfullrows_total=nrows_total, nfullcols_total=ncols_total, &
     198           2 :                           nblkcols_total=ncols_blk, col_blk_size=col_blk_size, col_blk_offset=col_blk_offset)
     199             :       IF (debug_this_module) THEN
     200             :          CPASSERT(nrows_total == ncols_total)
     201             :          CPASSERT(gather_root < para_env%num_pe)
     202             :       END IF
     203             : 
     204           2 :       siesta_struct%gather_root = gather_root
     205           2 :       CALL get_neighbor_list_set_p(neighbor_list_sets=sab_nl, symmetric=siesta_struct%symmetric)
     206             : 
     207             :       ! apply periodic boundary conditions to atomic coordinates
     208           2 :       CALL qs_subsys_get(subsys, cell=cell, particle_set=particle_set, nparticle=natoms)
     209           6 :       ALLOCATE (particle_coords(3, natoms))
     210          66 :       DO iatom = 1, natoms
     211          66 :          CALL pbc_0_1(particle_coords(1:3, iatom), particle_set(iatom)%r(1:3), cell)
     212             :       END DO
     213             : 
     214             :       ! Note: in case we would like to limit the number of cell images along transport direction (k cell vector)
     215             :       !       by enabling 'BulkTransvCellSizeZ' keyword, we need to pass max_ijk_cell_image(1:3) vector
     216             :       !       to the subroutine instead of the reduced vector max_ij_cell_image(1:2)
     217           6 :       max_ijk_cell_image(1:2) = max_ij_cell_image(1:2)
     218             : 
     219             :       ! determine the actual number of cell images along k cell vector. Unless the third element is also passed
     220             :       ! via subroutine arguments, an extra MPI_Allreduce operation is needed each time we call
     221             :       ! replicate_neighbour_list() / get_nnodes_local().
     222           2 :       max_ijk_cell_image(3) = -1
     223           2 :       IF (.NOT. do_merge) max_ijk_cell_image(3) = 1 ! bulk-transport calculation expects exactly 3 cell images along transport direction
     224             : 
     225             :       ! replicate pair-wise neighbour list. Identical non-zero matrix blocks from cell image along transport direction
     226             :       ! are grouped together if do_merge == .TRUE.
     227           6 :       ALLOCATE (siesta_struct%nnodes_per_proc(0:para_env%num_pe - 1))
     228             :       CALL replicate_neighbour_list(siesta_struct%nl_repl, &
     229             :                                     siesta_struct%n_dbcsr_cell_images_to_merge, &
     230             :                                     siesta_struct%dbcsr_cell_image_to_merge, &
     231             :                                     siesta_struct%nnodes_per_proc, &
     232           2 :                                     max_ijk_cell_image, sab_nl, para_env, particle_coords, cell, cell_to_index, do_merge)
     233           6 :       max_ij_cell_image(1:2) = max_ijk_cell_image(1:2)
     234             : 
     235             :       ! count number of non-zero matrix elements that need to be send to and received from other parallel processes
     236           8 :       ALLOCATE (siesta_struct%nelements_per_proc(0:para_env%num_pe - 1, nelements_dbcsr_dim2))
     237             :       CALL count_remote_dbcsr_elements(siesta_struct%nelements_per_proc, siesta_struct%nnodes_per_proc, &
     238           2 :                                        siesta_struct%nl_repl, matrix_dbcsr_kp, siesta_struct%symmetric, para_env, gather_root)
     239             : 
     240             :       ! number of SIESTA non-zero matrix elements that are going to be stored on this parallel process
     241           6 :       n_nonzero_elements_local = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
     242           2 :       siesta_struct%n_nonzero_elements = n_nonzero_elements_local
     243             : 
     244             :       ! as SMEAGOL uses 32-bits integers, the number of non-zero matrix elements is limited by 2^31 per MPI rank.
     245             :       ! Abort CP2K if we are about to exceed this limit.
     246           2 :       IF (n_nonzero_elements_local > INT(HUGE(0), kind=int_8)) THEN
     247           0 :          WRITE (str_nelem, '(I0)') n_nonzero_elements_local
     248           0 :          WRITE (str_nelem_max, '(I0)') HUGE(0)
     249             :          CALL cp_abort(__LOCATION__, &
     250             :                        "The number of non-zero matrix elements per MPI process "//TRIM(str_nelem)// &
     251             :                        " cannot exceed "//TRIM(str_nelem_max)// &
     252           0 :                        ". Please increase the number of MPI processes to satisfy this SMEAGOL limitation.")
     253             :       END IF
     254             : 
     255             :       ! in case there is no nonzero matrix element stored on this process, allocate arrays with one element to avoid SEGFAULT
     256           2 :       IF (n_nonzero_elements_local == 0) n_nonzero_elements_local = 1
     257             : 
     258             :       ! number of SIESTA-matrix rows local to the given parallel process
     259           2 :       IF (do_distribute) THEN
     260             : #if defined(__SMEAGOL)
     261             :          CALL GetNodeOrbs(nrows_total, para_env%mepos, para_env%num_pe, nrows_local)
     262             : #else
     263             :          CALL cp_abort(__LOCATION__, &
     264           0 :                        "CP2K was compiled with no SMEAGOL support.")
     265             : #endif
     266             :       ELSE
     267           2 :          IF (is_root_rank) THEN
     268           1 :             nrows_local = nrows_total
     269             :          ELSE
     270             :             nrows_local = 0
     271             :          END IF
     272             :       END IF
     273             : 
     274             :       ! number of cell images along each cell vector. It is 2*m+1, as SIESTA images are ordered as 0, 1, -1, ..., m, -m
     275           8 :       ncells_siesta(1:3) = 2*max_ijk_cell_image(1:3) + 1
     276             :       ! in case of merged cell images along the transport direction, there will be just 1 'merged' cell image along it
     277           2 :       IF (do_merge) ncells_siesta(3) = 1
     278             : 
     279           2 :       ncells_siesta_total = ncells_siesta(1)*ncells_siesta(2)*ncells_siesta(3)
     280             : 
     281             :       ! number of rows local to the given parallel process. Rows are distributed in a block-cyclic manner
     282           2 :       siesta_struct%nrows = nrows_local
     283             : 
     284             :       ! number of columns of the matrix in its dense form. SIESTA uses 1-D (rows) block-cyclic distribution.
     285             :       ! All non-zero matrix elements on a given row are stored on the same parallel process
     286           2 :       siesta_struct%ncols = nrows_total*ncells_siesta_total
     287             : 
     288             :       ! allocate at least one array element to avoid SIGFAULT when passing unallocated arrays to subroutines
     289           2 :       IF (nrows_local == 0) nrows_local = 1
     290             : 
     291           6 :       ALLOCATE (siesta_struct%n_nonzero_cols(nrows_local))
     292           4 :       ALLOCATE (siesta_struct%row_offset(nrows_local))
     293           6 :       ALLOCATE (siesta_struct%col_index(n_nonzero_elements_local))
     294           4 :       ALLOCATE (siesta_struct%packed_index(n_nonzero_elements_local))
     295             : 
     296             :       ! restore the actual number of local rows
     297           2 :       nrows_local = siesta_struct%nrows
     298             : 
     299             :       ! get number of non-zero matrix element on each local row (n_nonzero_cols),
     300             :       ! offset of the first non-zero matrix element for each local row (row_offset),
     301             :       ! global column indices of all local non-zero matrix elements (col_index), and
     302             :       ! the indices of all local non-zero matrix elements in the communication buffer (packed_index)
     303             :       CALL get_nonzero_element_indices(siesta_struct%n_nonzero_cols, siesta_struct%row_offset, &
     304             :                                        siesta_struct%col_index, siesta_struct%packed_index, &
     305             :                                        siesta_struct%nl_repl, matrix_dbcsr_kp, &
     306           2 :                                        siesta_struct%symmetric, para_env, gather_root)
     307             : 
     308             :       ! indices of equivalent atomic orbitals
     309           6 :       ALLOCATE (siesta_struct%indxuo(siesta_struct%ncols))
     310         578 :       DO icol = 1, ncols_total
     311         578 :          siesta_struct%indxuo(icol) = icol
     312             :       END DO
     313         150 :       DO image = 2, ncells_siesta_total
     314       85398 :          siesta_struct%indxuo((image - 1)*ncols_total + 1:image*ncols_total) = siesta_struct%indxuo(1:ncols_total)
     315             :       END DO
     316             : 
     317             :       ! particle index on which the orbital is centred
     318           4 :       ALLOCATE (siesta_struct%iaorb(siesta_struct%ncols))
     319          66 :       DO icol_blk = 1, ncols_blk
     320             :          ! col_blk_offset() is not an offset but the index of the first atomic orbital in the column block
     321         642 :          siesta_struct%iaorb(col_blk_offset(icol_blk):col_blk_offset(icol_blk) + col_blk_size(icol_blk) - 1) = icol_blk
     322             :       END DO
     323         150 :       DO image = 2, ncells_siesta_total
     324       85398 :         siesta_struct%iaorb((image - 1)*ncols_total + 1:image*ncols_total) = siesta_struct%iaorb(1:ncols_total) + (image - 1)*natoms
     325             :       END DO
     326             : 
     327             :       ! coordinates of all particles in each cell images
     328           6 :       ALLOCATE (siesta_struct%xa(3, natoms*ncells_siesta_total))
     329           8 :       DO image_k = 1, ncells_siesta(3)
     330             :          !icell_siesta(3) = image_k
     331           6 :          scaled_cell_shift(3) = REAL(number_from_canonical_enumeration(image_k), kind=dp) ! SIESTA -> actual cell index
     332          38 :          DO image_j = 1, ncells_siesta(2)
     333          30 :             scaled_cell_shift(2) = REAL(number_from_canonical_enumeration(image_j), kind=dp)
     334         186 :             DO image = 1, ncells_siesta(1)
     335         150 :                scaled_cell_shift(1) = REAL(number_from_canonical_enumeration(image), kind=dp)
     336         150 :                CALL scaled_to_real(real_cell_shift, scaled_cell_shift, cell)
     337         150 :                offset = (((image_k - 1)*ncells_siesta(2) + image_j - 1)*ncells_siesta(1) + image - 1)*natoms
     338        4980 :                DO iatom = 1, natoms
     339       19350 :                   siesta_struct%xa(1:3, offset + iatom) = particle_set(iatom)%r(1:3) + real_cell_shift(1:3)
     340             :                END DO
     341             :             END DO
     342             :          END DO
     343             :       END DO
     344             : 
     345             :       ! inter-atomic distance
     346           6 :       ALLOCATE (siesta_struct%xij(3, n_nonzero_elements_local))
     347         290 :       DO irow_local = 1, nrows_local
     348         288 :          IF (do_distribute) THEN
     349             : #if defined(__SMEAGOL)
     350             :             CALL LocalToGlobalOrb(irow_local, para_env%mepos, para_env%num_pe, irow)
     351             : #else
     352             :             CALL cp_abort(__LOCATION__, &
     353           0 :                           "CP2K was compiled with no SMEAGOL support.")
     354             : #endif
     355             :          ELSE
     356             :             irow = irow_local
     357             :             IF (debug_this_module) THEN
     358             :                CPASSERT(is_root_rank)
     359             :             END IF
     360             :          END IF
     361         288 :          offset = siesta_struct%row_offset(irow_local)
     362     1091522 :          DO icol_local = offset + 1, offset + siesta_struct%n_nonzero_cols(irow_local)
     363     1091232 :             icol = siesta_struct%col_index(icol_local)
     364             :             siesta_struct%xij(1:3, icol_local) = siesta_struct%xa(1:3, siesta_struct%iaorb(icol)) - &
     365     4365216 :                                                  siesta_struct%xa(1:3, siesta_struct%iaorb(irow))
     366             :          END DO
     367             :       END DO
     368             : 
     369           2 :       DEALLOCATE (particle_coords)
     370             : 
     371           2 :       CALL timestop(handle)
     372           4 :    END SUBROUTINE siesta_struct_create
     373             : 
     374             : ! **************************************************************************************************
     375             : !> \brief Release a SIESTA matrix structure
     376             : !> \param siesta_struct      structure to release
     377             : ! **************************************************************************************************
     378           2 :    SUBROUTINE siesta_struct_release(siesta_struct)
     379             :       TYPE(siesta_distrib_csc_struct_type), &
     380             :          INTENT(inout)                                   :: siesta_struct
     381             : 
     382             :       CHARACTER(len=*), PARAMETER :: routineN = 'siesta_struct_release'
     383             : 
     384             :       INTEGER                                            :: handle
     385             : 
     386           2 :       CALL timeset(routineN, handle)
     387             : 
     388           2 :       siesta_struct%gather_root = -1
     389             : 
     390           2 :       IF (ALLOCATED(siesta_struct%nnodes_per_proc)) DEALLOCATE (siesta_struct%nnodes_per_proc)
     391           2 :       IF (ALLOCATED(siesta_struct%nl_repl)) DEALLOCATE (siesta_struct%nl_repl)
     392           2 :       IF (ALLOCATED(siesta_struct%n_dbcsr_cell_images_to_merge)) DEALLOCATE (siesta_struct%n_dbcsr_cell_images_to_merge)
     393           2 :       IF (ALLOCATED(siesta_struct%dbcsr_cell_image_to_merge)) DEALLOCATE (siesta_struct%dbcsr_cell_image_to_merge)
     394           2 :       IF (ALLOCATED(siesta_struct%nelements_per_proc)) DEALLOCATE (siesta_struct%nelements_per_proc)
     395             : 
     396           2 :       siesta_struct%n_nonzero_elements = 0
     397           2 :       siesta_struct%nrows = 0
     398           2 :       siesta_struct%ncols = 0
     399             : 
     400           2 :       IF (ALLOCATED(siesta_struct%n_nonzero_cols)) DEALLOCATE (siesta_struct%n_nonzero_cols)
     401           2 :       IF (ALLOCATED(siesta_struct%row_offset)) DEALLOCATE (siesta_struct%row_offset)
     402           2 :       IF (ALLOCATED(siesta_struct%col_index)) DEALLOCATE (siesta_struct%col_index)
     403           2 :       IF (ALLOCATED(siesta_struct%packed_index)) DEALLOCATE (siesta_struct%packed_index)
     404             : 
     405           2 :       IF (ALLOCATED(siesta_struct%xij)) DEALLOCATE (siesta_struct%xij)
     406           2 :       IF (ALLOCATED(siesta_struct%indxuo)) DEALLOCATE (siesta_struct%indxuo)
     407           2 :       IF (ALLOCATED(siesta_struct%iaorb)) DEALLOCATE (siesta_struct%iaorb)
     408           2 :       IF (ALLOCATED(siesta_struct%xa)) DEALLOCATE (siesta_struct%xa)
     409             : 
     410           2 :       CALL timestop(handle)
     411           2 :    END SUBROUTINE siesta_struct_release
     412             : 
     413             : ! **************************************************************************************************
     414             : !> \brief Convert matrix from DBCSR to sparse SIESTA format.
     415             : !> \param matrix_siesta      matrix in SIESTA format [out]
     416             : !> \param matrix_dbcsr_kp    DBCSR matrix [in]
     417             : !> \param siesta_struct      structure to map matrix blocks between formats
     418             : !> \param para_env           MPI parallel environment
     419             : ! **************************************************************************************************
     420           8 :    SUBROUTINE convert_dbcsr_to_distributed_siesta(matrix_siesta, matrix_dbcsr_kp, siesta_struct, para_env)
     421             :       REAL(kind=dp), DIMENSION(:), INTENT(out)           :: matrix_siesta
     422             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
     423             :       TYPE(siesta_distrib_csc_struct_type), INTENT(in)   :: siesta_struct
     424             :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
     425             : 
     426             :       CHARACTER(len=*), PARAMETER :: routineN = 'convert_dbcsr_to_distributed_siesta'
     427             : 
     428             :       INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
     429             :          image_dbcsr, image_ind, image_ind_offset, image_siesta, image_siesta_transp, inode, &
     430             :          inode_proc, iproc, irequest, irow_blk, irow_local, irow_proc, mepos, n_image_ind, &
     431             :          ncols_blk, ncols_local, nnodes_proc, node_offset, nprocs, nrequests_recv, &
     432             :          nrequests_total, nrows_blk, nrows_local
     433             :       INTEGER(kind=int_8)                                :: n_nonzero_elements_dbcsr, &
     434             :                                                             n_nonzero_elements_siesta, &
     435             :                                                             offset_recv_mepos, offset_send_mepos
     436           8 :       INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:)     :: n_packed_elements_per_proc, &
     437           8 :                                                             nelements_per_request, &
     438           8 :                                                             offset_per_proc, offset_per_request
     439           8 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: next_nonzero_element_offset, peer_rank, &
     440           8 :                                                             request_tag
     441           8 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size, &
     442           8 :                                                             row_blk_offset, row_blk_size
     443             :       LOGICAL                                            :: do_distribute, found, is_root_rank, &
     444             :                                                             symmetric
     445           8 :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:)           :: recv_buffer, reorder_recv_buffer, &
     446           8 :                                                             send_buffer
     447           8 :       REAL(kind=dp), DIMENSION(:, :), POINTER            :: sm_block, sm_block_merged
     448           8 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: requests
     449             : 
     450           8 :       CALL timeset(routineN, handle)
     451     4364940 :       matrix_siesta(:) = 0.0_dp
     452             : 
     453           8 :       mepos = para_env%mepos
     454           8 :       nprocs = para_env%num_pe
     455           8 :       do_distribute = siesta_struct%gather_root < 0
     456           8 :       is_root_rank = siesta_struct%gather_root == mepos
     457             : 
     458             :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
     459             :                           nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
     460             :                           row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
     461           8 :                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
     462           8 :       symmetric = siesta_struct%symmetric
     463             : 
     464             :       ! number of locally stored SIESTA non-zero matrix elements
     465          24 :       n_nonzero_elements_siesta = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
     466             :       ! number of locally stored DBCSR non-zero matrix elements
     467          24 :       n_nonzero_elements_dbcsr = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_send))
     468             : 
     469             :       ! number of concurrent MPI isend / irecv operations
     470             :       nrequests_recv = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
     471           8 :                                                            max_mpi_packet_size_dp)
     472             :       nrequests_total = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
     473           8 :                                                             max_mpi_packet_size_dp) + nrequests_recv
     474             : 
     475           8 :       IF (nrequests_total > 0) THEN
     476             :          ! allocate MPI-related arrays. request_tag is not actually needed, as MPI standard guarantees the order of
     477             :          ! peer-to-peer messages with the same tag between same processes
     478          32 :          ALLOCATE (requests(nrequests_total))
     479          24 :          ALLOCATE (peer_rank(nrequests_total), request_tag(nrequests_total))
     480          32 :          ALLOCATE (offset_per_request(nrequests_total), nelements_per_request(nrequests_total))
     481             :          !requests(:) = mp_request_null
     482             : 
     483             :          ! split large messages into a number of smaller messages. It is not really needed
     484             :          ! unless we are going to send > 2^31 matrix elements per MPI request
     485           8 :          IF (nrequests_recv > 0) THEN
     486             :             CALL assign_nonzero_elements_to_requests(offset_per_request(1:nrequests_recv), &
     487             :                                                      nelements_per_request(1:nrequests_recv), &
     488             :                                                      peer_rank(1:nrequests_recv), &
     489             :                                                      request_tag(1:nrequests_recv), &
     490             :                                                      mepos, &
     491             :                                                      siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
     492           4 :                                                      max_mpi_packet_size_dp)
     493             :          END IF
     494           8 :          IF (nrequests_total > nrequests_recv) THEN
     495             :             CALL assign_nonzero_elements_to_requests(offset_per_request(nrequests_recv + 1:nrequests_total), &
     496             :                                                      nelements_per_request(nrequests_recv + 1:nrequests_total), &
     497             :                                                      peer_rank(nrequests_recv + 1:nrequests_total), &
     498             :                                                      request_tag(nrequests_recv + 1:nrequests_total), &
     499             :                                                      mepos, &
     500             :                                                      siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
     501           4 :                                                      max_mpi_packet_size_dp)
     502             :          END IF
     503             :       END IF
     504             : 
     505             :       ! point-to-point recv/send can be replaced with alltoallv, if data to distribute per rank is < 2^31 elements (should be OK due to SMEAGOL limitation)
     506             :       ! in principle, it is possible to overcome this limit by using derived datatypes (which will require additional wrapper functions, indeed).
     507             :       !
     508             :       ! pre-post non-blocking receive operations
     509           8 :       IF (n_nonzero_elements_siesta > 0) THEN
     510          12 :          ALLOCATE (recv_buffer(n_nonzero_elements_siesta))
     511             :       END IF
     512          12 :       DO irequest = 1, nrequests_recv
     513             :          CALL para_env%irecv(recv_buffer(offset_per_request(irequest) + 1: &
     514             :                                          offset_per_request(irequest) + nelements_per_request(irequest)), &
     515          12 :                              peer_rank(irequest), requests(irequest), request_tag(irequest))
     516             :       END DO
     517             : 
     518             :       ! pack local DBCSR non-zero matrix elements ordering by their target parallel process
     519          32 :       ALLOCATE (offset_per_proc(0:nprocs - 1), n_packed_elements_per_proc(0:nprocs - 1))
     520           8 :       offset_per_proc(0) = 0
     521          16 :       DO iproc = 1, nprocs - 1
     522          16 :          offset_per_proc(iproc) = offset_per_proc(iproc - 1) + siesta_struct%nelements_per_proc(iproc - 1, nelements_dbcsr_send)
     523             :       END DO
     524          24 :       n_packed_elements_per_proc(:) = 0
     525             : 
     526             :       ! number of local neighbour-list nodes and offset of the first local neighbour-list node
     527           8 :       nnodes_proc = siesta_struct%nnodes_per_proc(mepos)
     528             :       !node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos)) - siesta_struct%nnodes_per_proc(mepos)
     529          20 :       node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos)) - nnodes_proc
     530             : 
     531             :       ! if do_distribute == .FALSE., send all matrix elements to MPI process with rank gather_root
     532             :       ! in case of do_distribute == .TRUE., iproc is determined by calling WhichNodeOrb()
     533           8 :       iproc = siesta_struct%gather_root
     534             : 
     535           8 :       IF (n_nonzero_elements_dbcsr > 0) THEN
     536          24 :          ALLOCATE (send_buffer(n_nonzero_elements_dbcsr))
     537     4364936 :          send_buffer(:) = 0.0_dp
     538             : 
     539             :          ! iterate over locally-stored DBCSR matrix blocks.
     540             :          ! inode_proc is the target parallel process (where data are going to be sent)
     541             :          image_ind_offset = 0
     542       28168 :          DO inode_proc = 1, nnodes_proc
     543       28160 :             n_image_ind = siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
     544       28160 :             IF (n_image_ind > 0) THEN
     545       28160 :                inode = node_offset + inode_proc
     546             : 
     547       28160 :                irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
     548       28160 :                icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
     549       28160 :                CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
     550       28160 :                image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
     551       28160 :                image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
     552             : 
     553       28160 :                nrows_local = row_blk_size(irow_blk)
     554       28160 :                ncols_local = col_blk_size(icol_blk)
     555       28160 :                first_row_minus_one = row_blk_offset(irow_blk) - 1
     556       28160 :                first_col_minus_one = col_blk_offset(icol_blk) - 1
     557             : 
     558             :                ! merging cell images along transport direction
     559       28160 :                IF (n_image_ind == 1) THEN
     560             :                   ! the most common case. Nothing to merge, so there is no need to allocate memory for a merged block
     561       28160 :                   image_dbcsr = siesta_struct%dbcsr_cell_image_to_merge(image_ind_offset + 1)
     562             :                   CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
     563       28160 :                                          row=irow_blk, col=icol_blk, block=sm_block_merged, found=found)
     564       28160 :                   CPASSERT(found)
     565             :                ELSE ! n_image_ind > 1
     566           0 :                   ALLOCATE (sm_block_merged(nrows_local, ncols_local))
     567             : 
     568           0 :                   DO image_ind = 1, n_image_ind
     569           0 :                      image_dbcsr = siesta_struct%dbcsr_cell_image_to_merge(image_ind + image_ind_offset)
     570             : 
     571             :                      CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
     572           0 :                                             row=irow_blk, col=icol_blk, block=sm_block, found=found)
     573           0 :                      CPASSERT(found)
     574             :                      sm_block_merged(1:nrows_local, 1:ncols_local) = sm_block_merged(1:nrows_local, 1:ncols_local) + &
     575           0 :                                                                      sm_block(1:nrows_local, 1:ncols_local)
     576             :                   END DO
     577             :                END IF
     578             : 
     579             :                ! pack matrix elements for the 'normal' SIESTA matrix block
     580       28160 :                IF (image_siesta > 0) THEN
     581      281600 :                   DO irow_local = 1, nrows_local
     582      253440 :                      IF (do_distribute) THEN
     583             : #if defined(__SMEAGOL)
     584             :                         CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc)
     585             : #else
     586             :                         CALL cp_abort(__LOCATION__, &
     587           0 :                                       "CP2K was compiled with no SMEAGOL support.")
     588             : #endif
     589             :                      END IF
     590             : 
     591             :                      ! CPASSERT
     592             :                      IF (debug_this_module) THEN
     593             :                         CPASSERT(iproc >= 0 .AND. iproc < nprocs)
     594             :                         IF (n_packed_elements_per_proc(iproc) + ncols_local > &
     595             :                             siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
     596             :                            CALL cp__a(__SHORT_FILE__, __LINE__)
     597             :                         END IF
     598             :                      END IF
     599             : 
     600             :                      offset_send_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
     601      253440 :                      send_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = sm_block_merged(irow_local, 1:ncols_local)
     602     2534400 : 
     603             :                      n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + ncols_local
     604      281600 :                   END DO
     605             :                END IF
     606             : 
     607             :                ! pack matrix elements of the transposed SIESTA matrix block
     608             :                IF (image_siesta_transp > 0) THEN
     609       28160 :                   DO icol_local = 1, ncols_local
     610      257280 :                      IF (do_distribute) THEN
     611      231552 : #if defined(__SMEAGOL)
     612             :                         CALL WhichNodeOrb(icol_local + first_col_minus_one, nprocs, iproc) ! iproc_orb
     613             : #else
     614             :                         CALL cp_abort(__LOCATION__, &
     615             :                                       "CP2K was compiled with no SMEAGOL support.")
     616           0 : #endif
     617             :                      END IF
     618             : 
     619             :                      ! CPASSERT
     620             :                      IF (debug_this_module) THEN
     621             :                         CPASSERT(iproc >= 0 .AND. iproc < nprocs)
     622             :                         IF (n_packed_elements_per_proc(iproc) + nrows_local > &
     623             :                             siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
     624             :                            CALL cp__a(__SHORT_FILE__, __LINE__)
     625             :                         END IF
     626             :                      END IF
     627             : 
     628             :                      offset_send_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
     629             :                      send_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = sm_block_merged(1:nrows_local, icol_local)
     630      231552 : 
     631     2315520 :                      n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + nrows_local
     632             :                   END DO
     633      257280 :                END IF
     634             : 
     635             :                IF (n_image_ind > 1) THEN
     636             :                   DEALLOCATE (sm_block_merged)
     637       28160 :                END IF
     638           0 :             END IF
     639             : 
     640             :             image_ind_offset = image_ind_offset + siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
     641             :          END DO
     642       28168 : 
     643             :          IF (debug_this_module) THEN
     644             :             DO iproc = 0, nprocs - 1
     645             :                IF (n_packed_elements_per_proc(iproc) /= siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
     646             :                   CALL cp__a(__SHORT_FILE__, __LINE__)
     647             :                END IF
     648             :             END DO
     649             :          END IF
     650             : 
     651             :          ! send packed data to other parallel processes
     652             :          DO irequest = nrequests_recv + 1, nrequests_total
     653             :             CALL para_env%isend(send_buffer(offset_per_request(irequest) + 1: &
     654          12 :                                             offset_per_request(irequest) + nelements_per_request(irequest)), &
     655             :                                 peer_rank(irequest), requests(irequest), request_tag(irequest))
     656             :          END DO
     657          12 : 
     658             :          ! copy data locally that stay on the same process.
     659             :          IF (mepos > 0) THEN
     660             :             offset_recv_mepos = SUM(siesta_struct%nelements_per_proc(0:mepos - 1, nelements_dbcsr_recv))
     661           8 :          ELSE
     662           8 :             offset_recv_mepos = 0
     663             :          END IF
     664             :          offset_send_mepos = offset_per_proc(mepos)
     665             : 
     666           8 :          IF (debug_this_module) THEN
     667             :             IF (n_packed_elements_per_proc(mepos) /= siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_recv)) THEN
     668             :                CALL cp__a(__SHORT_FILE__, __LINE__)
     669             :             END IF
     670             :          END IF
     671             : 
     672             :          IF (n_packed_elements_per_proc(mepos) > 0) THEN
     673             :             recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + n_packed_elements_per_proc(mepos)) = &
     674           8 :                send_buffer(offset_send_mepos + 1:offset_send_mepos + n_packed_elements_per_proc(mepos))
     675             :          END IF
     676     2182468 :       END IF
     677             : 
     678             :       IF (nrequests_total > 0) THEN
     679             :          ! wait for pending isend/irecv requests
     680           8 :          CALL mp_waitall(requests)
     681             :          DEALLOCATE (nelements_per_request, offset_per_request, peer_rank, requests, request_tag)
     682           8 :       END IF
     683           8 : 
     684             :       ! release send buffers
     685             :       IF (ALLOCATED(send_buffer)) DEALLOCATE (send_buffer)
     686             :       DEALLOCATE (offset_per_proc, n_packed_elements_per_proc)
     687           8 : 
     688           8 :       ! non-zero matrix elements in 'recv_buffer' array are grouped by their source MPI rank, local row index, and column index (in this order).
     689             :       ! Reorder the matrix elements ('reorder_recv_buffer') so they are grouped by their local row index, source MPI rank, and column index.
     690             :       ! (column indices are in the ascending order within each (row index, source MPI rank) block).
     691             :       ! The array 'packed_index' allows mapping matrix element between these intermediate order and SIESTA order : local row index, column index
     692             :       IF (n_nonzero_elements_siesta > 0) THEN
     693             :          ALLOCATE (reorder_recv_buffer(n_nonzero_elements_siesta))
     694           8 :          ALLOCATE (next_nonzero_element_offset(siesta_struct%nrows))
     695          12 :          next_nonzero_element_offset(:) = 0
     696          12 :          offset_recv_mepos = 0
     697        1156 : 
     698           4 :          DO inode = 1, SIZE(siesta_struct%nl_repl, 2)
     699             :             irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
     700       28164 :             icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
     701       28160 :             CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
     702       28160 :             image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
     703       28160 :             image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
     704       28160 : 
     705       28160 :             nrows_local = row_blk_size(irow_blk)
     706             :             ncols_local = col_blk_size(icol_blk)
     707       28160 :             first_row_minus_one = row_blk_offset(irow_blk) - 1
     708       28160 :             first_col_minus_one = col_blk_offset(icol_blk) - 1
     709       28160 : 
     710       28160 :             ! normal block
     711             :             IF (image_siesta > 0) THEN
     712             :                DO irow_local = 1, nrows_local
     713       28160 :                   IF (do_distribute) THEN
     714      281600 : #if defined(__SMEAGOL)
     715      253440 :                      CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
     716             : #else
     717             :                      CALL cp_abort(__LOCATION__, &
     718             :                                    "CP2K was compiled with no SMEAGOL support.")
     719             : #endif
     720           0 :                   ELSE
     721             :                      IF (is_root_rank) THEN
     722             :                         irow_proc = irow_local + first_row_minus_one
     723      253440 :                      ELSE
     724      253440 :                         irow_proc = 0
     725             :                      END IF
     726             :                   END IF
     727             :                   IF (irow_proc > 0) THEN
     728             :                      offset_send_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
     729      281600 :                      reorder_recv_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = &
     730      253440 :                         recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
     731             :                      offset_recv_mepos = offset_recv_mepos + ncols_local
     732     2534400 :                      next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + ncols_local
     733      253440 :                   END IF
     734      253440 :                END DO
     735             :             END IF
     736             : 
     737             :             ! transposed block
     738             :             IF (image_siesta_transp > 0) THEN
     739             :                DO icol_local = 1, ncols_local
     740       28164 :                   IF (do_distribute) THEN
     741      257280 : #if defined(__SMEAGOL)
     742      231552 :                      CALL GlobalToLocalOrb(icol_local + first_col_minus_one, mepos, nprocs, irow_proc)
     743             : #else
     744             :                      CALL cp_abort(__LOCATION__, &
     745             :                                    "CP2K was compiled with no SMEAGOL support.")
     746             : #endif
     747           0 :                   ELSE
     748             :                      IF (is_root_rank) THEN
     749             :                         irow_proc = icol_local + first_col_minus_one
     750      231552 :                      ELSE
     751      231552 :                         irow_proc = 0
     752             :                      END IF
     753             :                   END IF
     754             :                   IF (irow_proc > 0) THEN
     755             :                      offset_send_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
     756      257280 :                      reorder_recv_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = &
     757      231552 :                         recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
     758             :                      offset_recv_mepos = offset_recv_mepos + nrows_local
     759     2315520 :                      next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + nrows_local
     760      231552 :                   END IF
     761      231552 :                END DO
     762             :             END IF
     763             :          END DO
     764             : 
     765             :          IF (debug_this_module) THEN
     766             :             DO irow_local = 1, siesta_struct%nrows
     767             :                IF (siesta_struct%n_nonzero_cols(irow_local) /= next_nonzero_element_offset(irow_local)) THEN
     768             :                   CALL cp__a(__SHORT_FILE__, __LINE__)
     769             :                END IF
     770             :             END DO
     771             :          END IF
     772             : 
     773             :          DEALLOCATE (next_nonzero_element_offset)
     774             :          DEALLOCATE (recv_buffer)
     775           4 : 
     776           4 :          ! Map non-zero matrix element between the intermediate order and SIESTA order
     777             :          DO irow_local = 1, siesta_struct%nrows
     778             :             offset_recv_mepos = siesta_struct%row_offset(irow_local)
     779        1156 :             DO icol_local = 1, siesta_struct%n_nonzero_cols(irow_local)
     780        1152 :                matrix_siesta(offset_recv_mepos + icol_local) = &
     781     4366084 :                   reorder_recv_buffer(offset_recv_mepos + siesta_struct%packed_index(offset_recv_mepos + icol_local))
     782             :             END DO
     783     4366080 :          END DO
     784             :          DEALLOCATE (reorder_recv_buffer)
     785             :       END IF
     786           4 : 
     787             :       CALL timestop(handle)
     788             :    END SUBROUTINE convert_dbcsr_to_distributed_siesta
     789           8 : 
     790          24 : ! **************************************************************************************************
     791             : !> \brief Convert matrix from DBCSR to sparse SIESTA format.
     792             : !> \param matrix_dbcsr_kp    DBCSR matrix [out]. The matrix is declared as INTENT(in) as pointers to
     793             : !>                 dbcsr matrices remain intact. However we have intention to update matrix elements
     794             : !> \param matrix_siesta      matrix in SIESTA format [in]
     795             : !> \param siesta_struct      structure to map matrix blocks between formats
     796             : !> \param para_env           MPI parallel environment
     797             : ! **************************************************************************************************
     798             :    SUBROUTINE convert_distributed_siesta_to_dbcsr(matrix_dbcsr_kp, matrix_siesta, siesta_struct, para_env)
     799             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
     800           0 :       REAL(kind=dp), DIMENSION(:), INTENT(in)            :: matrix_siesta
     801             :       TYPE(siesta_distrib_csc_struct_type), INTENT(in)   :: siesta_struct
     802             :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
     803             : 
     804             :       CHARACTER(len=*), PARAMETER :: routineN = 'convert_distributed_siesta_to_dbcsr'
     805             : 
     806             :       INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
     807             :          image_dbcsr, image_siesta, image_siesta_transp, inode, inode_proc, iproc, irequest, &
     808             :          irow_blk, irow_local, irow_proc, mepos, n_image_ind, ncols_blk, ncols_local, nnodes_proc, &
     809             :          node_offset, nprocs, nrequests_recv, nrequests_total, nrows_blk, nrows_local
     810             :       INTEGER(kind=int_8)                                :: n_nonzero_elements_dbcsr, &
     811             :                                                             n_nonzero_elements_siesta, &
     812             :                                                             offset_recv_mepos, offset_send_mepos
     813             :       INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:)     :: n_packed_elements_per_proc, &
     814             :                                                             nelements_per_request, &
     815           0 :                                                             offset_per_proc, offset_per_request
     816           0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: next_nonzero_element_offset, peer_rank, &
     817           0 :                                                             request_tag
     818           0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size, &
     819           0 :                                                             row_blk_offset, row_blk_size
     820           0 :       LOGICAL                                            :: do_distribute, found, is_root_rank, &
     821           0 :                                                             symmetric
     822             :       REAL(kind=dp), ALLOCATABLE, DIMENSION(:)           :: recv_buffer, reorder_send_buffer, &
     823             :                                                             send_buffer
     824           0 :       REAL(kind=dp), DIMENSION(:, :), POINTER            :: sm_block
     825           0 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: requests
     826           0 : 
     827           0 :       CALL timeset(routineN, handle)
     828             :       DO image_dbcsr = 1, SIZE(matrix_dbcsr_kp)
     829           0 :          CALL dbcsr_set(matrix_dbcsr_kp(image_dbcsr)%matrix, 0.0_dp)
     830           0 :       END DO
     831           0 : 
     832             :       mepos = para_env%mepos
     833             :       nprocs = para_env%num_pe
     834           0 :       do_distribute = siesta_struct%gather_root < 0
     835           0 :       is_root_rank = siesta_struct%gather_root == mepos
     836           0 : 
     837           0 :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
     838             :                           nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
     839             :                           row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
     840             :                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
     841             :       symmetric = siesta_struct%symmetric
     842           0 : 
     843           0 :       n_nonzero_elements_siesta = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
     844             :       n_nonzero_elements_dbcsr = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_send))
     845           0 : 
     846           0 :       nrequests_recv = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
     847             :                                                            max_mpi_packet_size_dp)
     848             :       nrequests_total = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
     849           0 :                                                             max_mpi_packet_size_dp) + nrequests_recv
     850             :       IF (nrequests_total > 0) THEN
     851           0 :          ALLOCATE (requests(nrequests_total))
     852           0 :          ALLOCATE (peer_rank(nrequests_total), request_tag(nrequests_total))
     853           0 :          ALLOCATE (offset_per_request(nrequests_total), nelements_per_request(nrequests_total))
     854           0 :          !requests(:) = mp_request_null
     855           0 :          IF (nrequests_recv > 0) THEN
     856             :             CALL assign_nonzero_elements_to_requests(offset_per_request(1:nrequests_recv), &
     857           0 :                                                      nelements_per_request(1:nrequests_recv), &
     858             :                                                      peer_rank(1:nrequests_recv), &
     859             :                                                      request_tag(1:nrequests_recv), &
     860             :                                                      mepos, &
     861             :                                                      siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
     862             :                                                      max_mpi_packet_size_dp)
     863             :          END IF
     864           0 :          IF (nrequests_total > nrequests_recv) THEN
     865             :             CALL assign_nonzero_elements_to_requests(offset_per_request(nrequests_recv + 1:nrequests_total), &
     866           0 :                                                      nelements_per_request(nrequests_recv + 1:nrequests_total), &
     867             :                                                      peer_rank(nrequests_recv + 1:nrequests_total), &
     868             :                                                      request_tag(nrequests_recv + 1:nrequests_total), &
     869             :                                                      mepos, &
     870             :                                                      siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
     871             :                                                      max_mpi_packet_size_dp)
     872             :          END IF
     873           0 :       END IF
     874             : 
     875             :       IF (n_nonzero_elements_dbcsr > 0) THEN
     876             :          ALLOCATE (recv_buffer(n_nonzero_elements_dbcsr))
     877           0 :       END IF
     878           0 :       DO irequest = 1, nrequests_recv
     879             :          CALL para_env%irecv(recv_buffer(offset_per_request(irequest) + 1: &
     880           0 :                                          offset_per_request(irequest) + nelements_per_request(irequest)), &
     881             :                              peer_rank(irequest), requests(irequest), request_tag(irequest))
     882             :       END DO
     883           0 : 
     884             :       ALLOCATE (offset_per_proc(0:nprocs - 1), n_packed_elements_per_proc(0:nprocs - 1))
     885             :       offset_per_proc(0) = 0
     886           0 :       DO iproc = 1, nprocs - 1
     887           0 :          offset_per_proc(iproc) = offset_per_proc(iproc - 1) + siesta_struct%nelements_per_proc(iproc - 1, nelements_dbcsr_send)
     888           0 :       END DO
     889           0 :       n_packed_elements_per_proc(:) = 0
     890             : 
     891           0 :       IF (mepos > 0) THEN
     892             :          node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos - 1))
     893           0 :       ELSE
     894           0 :          node_offset = 0
     895             :       END IF
     896             :       nnodes_proc = siesta_struct%nnodes_per_proc(mepos)
     897             : 
     898           0 :       IF (n_nonzero_elements_siesta > 0) THEN
     899             :          ALLOCATE (send_buffer(n_nonzero_elements_siesta))
     900           0 : 
     901           0 :          ALLOCATE (reorder_send_buffer(n_nonzero_elements_siesta))
     902             :          DO irow_local = 1, siesta_struct%nrows
     903           0 :             offset_send_mepos = siesta_struct%row_offset(irow_local)
     904           0 :             DO icol_local = 1, siesta_struct%n_nonzero_cols(irow_local)
     905           0 :                reorder_send_buffer(offset_send_mepos + siesta_struct%packed_index(offset_send_mepos + icol_local)) = &
     906           0 :                   matrix_siesta(offset_send_mepos + icol_local)
     907             :             END DO
     908           0 :          END DO
     909             : 
     910             :          ALLOCATE (next_nonzero_element_offset(siesta_struct%nrows))
     911             :          next_nonzero_element_offset(:) = 0
     912           0 :          offset_send_mepos = 0
     913           0 : 
     914           0 :          DO inode = 1, SIZE(siesta_struct%nl_repl, 2)
     915             :             irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
     916           0 :             icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
     917           0 :             CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
     918           0 :             image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
     919           0 :             image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
     920           0 : 
     921           0 :             nrows_local = row_blk_size(irow_blk)
     922             :             ncols_local = col_blk_size(icol_blk)
     923           0 :             first_row_minus_one = row_blk_offset(irow_blk) - 1
     924           0 :             first_col_minus_one = col_blk_offset(icol_blk) - 1
     925           0 : 
     926           0 :             IF (image_siesta > 0) THEN
     927             :                DO irow_local = 1, nrows_local
     928           0 :                   IF (do_distribute) THEN
     929           0 : #if defined(__SMEAGOL)
     930           0 :                      CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
     931             : #else
     932             :                      CALL cp_abort(__LOCATION__, &
     933             :                                    "CP2K was compiled with no SMEAGOL support.")
     934             : #endif
     935           0 :                   ELSE
     936             :                      IF (is_root_rank) THEN
     937             :                         irow_proc = irow_local + first_row_minus_one
     938           0 :                      ELSE
     939           0 :                         irow_proc = 0
     940             :                      END IF
     941             :                   END IF
     942             :                   IF (irow_proc > 0) THEN
     943             :                      offset_recv_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
     944           0 :                      send_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = &
     945           0 :                         reorder_send_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
     946             :                      offset_send_mepos = offset_send_mepos + ncols_local
     947           0 :                      next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + ncols_local
     948           0 :                   END IF
     949           0 :                END DO
     950             :             END IF
     951             : 
     952             :             ! transposed block
     953             :             IF (image_siesta_transp > 0) THEN
     954             :                DO icol_local = 1, ncols_local
     955           0 :                   IF (do_distribute) THEN
     956           0 : #if defined(__SMEAGOL)
     957           0 :                      CALL GlobalToLocalOrb(icol_local + first_col_minus_one, mepos, nprocs, irow_proc)
     958             : #else
     959             :                      CALL cp_abort(__LOCATION__, &
     960             :                                    "CP2K was compiled with no SMEAGOL support.")
     961             : #endif
     962           0 :                   ELSE
     963             :                      IF (is_root_rank) THEN
     964             :                         irow_proc = icol_local + first_col_minus_one
     965           0 :                      ELSE
     966           0 :                         irow_proc = 0
     967             :                      END IF
     968             :                   END IF
     969             :                   IF (irow_proc > 0) THEN
     970             :                      offset_recv_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
     971           0 :                      send_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = &
     972           0 :                         reorder_send_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
     973             :                      offset_send_mepos = offset_send_mepos + nrows_local
     974           0 :                      next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + nrows_local
     975           0 :                   END IF
     976           0 :                END DO
     977             :             END IF
     978             :          END DO
     979             : 
     980             :          IF (debug_this_module) THEN
     981             :             DO irow_local = 1, siesta_struct%nrows
     982             :                IF (siesta_struct%n_nonzero_cols(irow_local) /= next_nonzero_element_offset(irow_local)) THEN
     983             :                   CALL cp__a(__SHORT_FILE__, __LINE__)
     984             :                END IF
     985             :             END DO
     986             :          END IF
     987             : 
     988             :          DEALLOCATE (next_nonzero_element_offset)
     989             :          DEALLOCATE (reorder_send_buffer)
     990           0 : 
     991           0 :          DO irequest = nrequests_recv + 1, nrequests_total
     992             :             CALL para_env%isend(send_buffer(offset_per_request(irequest) + 1: &
     993           0 :                                             offset_per_request(irequest) + nelements_per_request(irequest)), &
     994             :                                 peer_rank(irequest), requests(irequest), request_tag(irequest))
     995             :          END DO
     996           0 : 
     997             :          ! copy data locally that stay on the same process.
     998             :          IF (mepos > 0) THEN
     999             :             offset_send_mepos = SUM(siesta_struct%nelements_per_proc(0:mepos - 1, nelements_dbcsr_recv))
    1000           0 :          ELSE
    1001           0 :             offset_send_mepos = 0
    1002             :          END IF
    1003             :          offset_recv_mepos = offset_per_proc(mepos)
    1004             : 
    1005           0 :          IF (debug_this_module) THEN
    1006             :             IF (siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_recv) /= &
    1007             :                 siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send)) THEN
    1008             :                CALL cp__a(__SHORT_FILE__, __LINE__)
    1009             :             END IF
    1010             :          END IF
    1011             : 
    1012             :          IF (siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send) > 0) THEN
    1013             :             recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send)) = &
    1014           0 :                send_buffer(offset_send_mepos + 1:offset_send_mepos + siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send))
    1015             :          END IF
    1016           0 :       END IF
    1017             : 
    1018             :       IF (nrequests_total > 0) THEN
    1019             :          ! wait for pending isend/irecv requests
    1020           0 :          CALL mp_waitall(requests)
    1021             :          DEALLOCATE (nelements_per_request, offset_per_request, peer_rank, requests, request_tag)
    1022           0 :       END IF
    1023           0 : 
    1024             :       IF (ALLOCATED(send_buffer)) DEALLOCATE (send_buffer)
    1025             : 
    1026           0 :       iproc = siesta_struct%gather_root ! if do_distribute == .FALSE., collect matrix elements from MPI process with rank gather_root
    1027             :       IF (n_nonzero_elements_dbcsr > 0) THEN
    1028           0 :          DO inode_proc = 1, nnodes_proc
    1029           0 :             n_image_ind = siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
    1030           0 :             IF (n_image_ind > 0) THEN
    1031           0 :                inode = node_offset + inode_proc
    1032           0 : 
    1033           0 :                irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
    1034             :                icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
    1035           0 :                image_dbcsr = siesta_struct%nl_repl(neighbor_list_dbcsr_image_index, inode)
    1036           0 :                CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
    1037           0 :                image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
    1038           0 :                image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
    1039           0 : 
    1040           0 :                nrows_local = row_blk_size(irow_blk)
    1041             :                ncols_local = col_blk_size(icol_blk)
    1042           0 :                first_row_minus_one = row_blk_offset(irow_blk) - 1
    1043           0 :                first_col_minus_one = col_blk_offset(icol_blk) - 1
    1044           0 : 
    1045           0 :                CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
    1046             :                                       row=irow_blk, col=icol_blk, block=sm_block, found=found)
    1047             :                CPASSERT(found)
    1048           0 : 
    1049           0 :                IF (image_siesta > 0) THEN
    1050             :                   DO irow_local = 1, nrows_local
    1051           0 :                      IF (do_distribute) THEN
    1052           0 : #if defined(__SMEAGOL)
    1053           0 :                         CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc) ! iproc_orb
    1054             : #else
    1055             :                         CALL cp_abort(__LOCATION__, &
    1056             :                                       "CP2K was compiled with no SMEAGOL support.")
    1057             : #endif
    1058           0 :                      END IF
    1059             :                      ! CPASSERT
    1060             :                      IF (debug_this_module) THEN
    1061             :                         CPASSERT(iproc >= 0 .AND. iproc < nprocs)
    1062             :                         IF (n_packed_elements_per_proc(iproc) + ncols_local > &
    1063             :                             siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
    1064             :                            CALL cp__a(__SHORT_FILE__, __LINE__)
    1065             :                         END IF
    1066             :                      END IF
    1067             : 
    1068             :                      offset_recv_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
    1069             :                      sm_block(irow_local, 1:ncols_local) = recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
    1070             : 
    1071           0 :                      n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + ncols_local
    1072           0 :                   END DO
    1073             :                END IF
    1074           0 : 
    1075             :                ! transposed block
    1076             :                IF (image_siesta_transp > 0) THEN
    1077             :                   DO icol_local = 1, ncols_local
    1078             :                      IF (do_distribute) THEN
    1079           0 : #if defined(__SMEAGOL)
    1080           0 :                         CALL WhichNodeOrb(icol_local + first_col_minus_one, nprocs, iproc) ! iproc_orb
    1081           0 : #else
    1082             :                         CALL cp_abort(__LOCATION__, &
    1083             :                                       "CP2K was compiled with no SMEAGOL support.")
    1084             : #endif
    1085             :                      END IF
    1086           0 :                      ! CPASSERT
    1087             :                      IF (debug_this_module) THEN
    1088             :                         CPASSERT(iproc >= 0 .AND. iproc < nprocs)
    1089             :                         IF (n_packed_elements_per_proc(iproc) + nrows_local > &
    1090             :                             siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
    1091             :                            CALL cp__a(__SHORT_FILE__, __LINE__)
    1092             :                         END IF
    1093             :                      END IF
    1094             : 
    1095             :                      offset_recv_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
    1096             :                      sm_block(1:nrows_local, icol_local) = recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
    1097             : 
    1098             :                      n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + nrows_local
    1099           0 :                   END DO
    1100           0 :                END IF
    1101             :             END IF
    1102           0 :          END DO
    1103             : 
    1104             :          IF (debug_this_module) THEN
    1105             :             DO iproc = 0, nprocs - 1
    1106             :                IF (n_packed_elements_per_proc(iproc) /= siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
    1107             :                   CALL cp__a(__SHORT_FILE__, __LINE__)
    1108             :                END IF
    1109             :             END DO
    1110             :          END IF
    1111             : 
    1112             :          DEALLOCATE (recv_buffer)
    1113             :       END IF
    1114             : 
    1115             :       DEALLOCATE (offset_per_proc, n_packed_elements_per_proc)
    1116           0 : 
    1117             :       CALL timestop(handle)
    1118             :    END SUBROUTINE convert_distributed_siesta_to_dbcsr
    1119           0 : 
    1120             :    ! *** PRIVATE SUBROUTINES ***
    1121           0 : 
    1122           0 : ! **************************************************************************************************
    1123             : !> \brief Computes number of neighbour-list nodes on the current parallel process.
    1124             : !> \param nnodes_local              number of nodes [out]
    1125             : !> \param max_ijk_cell_image_local  largest index of cell images along i, j and k cell vectors
    1126             : !>                                  on this parallel process [out]
    1127             : !> \param max_ijk_cell_image        largest index of cell images along i, j and k cell vectors [inout]
    1128             : !> \param sab_nl                    pair-wise neighbour list [in]
    1129             : !> \param para_env                  MPI parallel environment [in]
    1130             : !> \param particle_coords           list of atomic coordinates subject to periodic boundary conditions [in]
    1131             : !> \param cell                      simulation unit cell [in]
    1132             : ! **************************************************************************************************
    1133             :    SUBROUTINE get_nnodes_local(nnodes_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, para_env, particle_coords, cell)
    1134             :       INTEGER, INTENT(out)                               :: nnodes_local
    1135             :       INTEGER, DIMENSION(3), INTENT(out)                 :: max_ijk_cell_image_local
    1136             :       INTEGER, DIMENSION(3), INTENT(inout)               :: max_ijk_cell_image
    1137           2 :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    1138             :          INTENT(in), POINTER                             :: sab_nl
    1139             :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
    1140             :       REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
    1141             :          INTENT(in)                                      :: particle_coords
    1142             :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1143             : 
    1144             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'get_nnodes_local'
    1145             : 
    1146             :       INTEGER                                            :: handle, iatom, icoord, jatom
    1147             :       INTEGER, DIMENSION(3)                              :: cell_ijk, max_ijk_cell_image_tmp
    1148             :       LOGICAL                                            :: update_ncells
    1149             :       REAL(kind=dp), DIMENSION(3)                        :: r_ij
    1150             :       TYPE(neighbor_list_iterator_p_type), &
    1151             :          DIMENSION(:), POINTER                           :: nl_iterator
    1152             : 
    1153             :       CALL timeset(routineN, handle)
    1154             : 
    1155           2 :       update_ncells = .FALSE.
    1156             :       DO icoord = 1, 3 ! x, y, z
    1157           2 :          IF (max_ijk_cell_image(icoord) >= 0) THEN
    1158             :             max_ijk_cell_image_tmp(icoord) = max_ijk_cell_image(icoord)
    1159           2 :          ELSE
    1160           8 :             max_ijk_cell_image_tmp(icoord) = HUGE(max_ijk_cell_image_tmp(icoord))
    1161           8 :             update_ncells = .TRUE.
    1162           2 :          END IF
    1163             :       END DO
    1164           4 : 
    1165           4 :       nnodes_local = 0
    1166             :       max_ijk_cell_image_local(:) = 0
    1167             : 
    1168             :       CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
    1169           2 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    1170           2 :          CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, r=r_ij)
    1171             :          CALL get_negf_cell_ijk(cell_ijk, r_ij, r_i=particle_coords(1:3, iatom), r_j=particle_coords(1:3, jatom), cell=cell)
    1172           2 :          cell_ijk(1:3) = ABS(cell_ijk(1:3))
    1173        7170 : 
    1174        7168 :          IF (cell_ijk(1) <= max_ijk_cell_image_tmp(1) .AND. cell_ijk(2) <= max_ijk_cell_image_tmp(2) .AND. &
    1175        7168 :              cell_ijk(3) <= max_ijk_cell_image_tmp(3)) THEN
    1176       28672 :             nnodes_local = nnodes_local + 1
    1177             :             max_ijk_cell_image_local(1:3) = MAX(max_ijk_cell_image_local(1:3), cell_ijk(1:3))
    1178        7168 :          END IF
    1179           2 :       END DO
    1180        7040 :       CALL neighbor_list_iterator_release(nl_iterator)
    1181       28160 : 
    1182             :       IF (update_ncells) THEN
    1183             :          max_ijk_cell_image_tmp(1:3) = max_ijk_cell_image_local(1:3)
    1184           2 :          CALL para_env%max(max_ijk_cell_image_tmp)
    1185             :          DO icoord = 1, 3
    1186           2 :             IF (max_ijk_cell_image(icoord) < 0) THEN
    1187           2 :                max_ijk_cell_image(icoord) = max_ijk_cell_image_tmp(icoord)
    1188           2 :             END IF
    1189           8 :          END DO
    1190           8 :       END IF
    1191           4 : 
    1192             :       CALL timestop(handle)
    1193             :    END SUBROUTINE get_nnodes_local
    1194             : 
    1195             : ! **************************************************************************************************
    1196           2 : !> \brief Construct list of neighbour-list's nodes on the current parallel process.
    1197           2 : !> \param nl_local                  non-merged local neighbour-list's nodes [out]
    1198             : !> \param max_ijk_cell_image_local  largest index of cell images along i, j and k cell vectors
    1199             : !>                                  on this parallel process [in]
    1200             : !> \param max_ijk_cell_image        largest index of cell images along i, j and k cell vectors [in]
    1201             : !> \param sab_nl                    pair-wise neighbour list [in]
    1202             : !> \param particle_coords           list of atomic coordinates subject to periodic boundary conditions [in]
    1203             : !> \param cell                      simulation unit cell [in]
    1204             : !> \param cell_to_index             array to convert 3-D cell indices to 1-D DBCSR image indices
    1205             : !> \param do_merge                  merge cell images along transport direction [in]
    1206             : !> \param node_merged_indices       nodes-related indices. Nodes with identical indices will be merged [out]
    1207             : !> \param k_cells                   list of cell image indices along transport direction. Nodes to
    1208             : !>                be merged have the same 'node_merged_indices' but different 'k_cells' indices [out]
    1209             : ! **************************************************************************************************
    1210             :    SUBROUTINE get_nl_nodes_local(nl_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, particle_coords, &
    1211             :                                  cell, cell_to_index, do_merge, node_merged_indices, k_cells)
    1212             :       INTEGER, DIMENSION(:, :), INTENT(out)              :: nl_local
    1213             :       INTEGER, DIMENSION(3), INTENT(in)                  :: max_ijk_cell_image_local, &
    1214           2 :                                                             max_ijk_cell_image
    1215           2 :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    1216             :          INTENT(in), POINTER                             :: sab_nl
    1217             :       REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
    1218             :          INTENT(in)                                      :: particle_coords
    1219             :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1220             :       INTEGER, DIMENSION(:, :, :), INTENT(in), POINTER   :: cell_to_index
    1221             :       LOGICAL, INTENT(in)                                :: do_merge
    1222             :       INTEGER(kind=int_8), DIMENSION(:), INTENT(out)     :: node_merged_indices
    1223             :       INTEGER, DIMENSION(:), INTENT(out)                 :: k_cells
    1224             : 
    1225             :       CHARACTER(len=*), PARAMETER :: routineN = 'get_nl_nodes_local'
    1226             : 
    1227             :       INTEGER                                            :: handle, iatom, icol_blk, image, inode, &
    1228             :                                                             irow_blk, jatom, natoms
    1229             :       INTEGER(kind=8), DIMENSION(2)                      :: ncells_siesta_local
    1230             :       INTEGER, DIMENSION(2)                              :: ncells_siesta
    1231             :       INTEGER, DIMENSION(3)                              :: cell_ijk_abs, cell_ijk_dbcsr, &
    1232             :                                                             cell_ijk_siesta
    1233             :       LOGICAL                                            :: do_symmetric
    1234             :       REAL(kind=dp), DIMENSION(3)                        :: r_ij
    1235             :       TYPE(neighbor_list_iterator_p_type), &
    1236             :          DIMENSION(:), POINTER                           :: nl_iterator
    1237             : 
    1238             :       CALL timeset(routineN, handle)
    1239             :       ! natoms only used to compute a merged 1D index of each DBCSR block
    1240           2 :       natoms = SIZE(particle_coords, 2)
    1241             :       CALL get_neighbor_list_set_p(neighbor_list_sets=sab_nl, symmetric=do_symmetric)
    1242           2 :       ncells_siesta(1:2) = 2*max_ijk_cell_image(1:2) + 1
    1243             : 
    1244           2 :       ncells_siesta_local(1:2) = INT(2*max_ijk_cell_image_local(1:2) + 1, kind=int_8)
    1245           2 : 
    1246           6 :       inode = 0
    1247             :       CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
    1248           6 :       DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
    1249             :          CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, cell=cell_ijk_dbcsr, r=r_ij)
    1250           2 :          CALL get_negf_cell_ijk(cell_ijk_abs, r_ij, r_i=particle_coords(1:3, iatom), r_j=particle_coords(1:3, jatom), cell=cell)
    1251           2 : 
    1252        7170 :          IF (ABS(cell_ijk_abs(1)) <= max_ijk_cell_image(1) .AND. ABS(cell_ijk_abs(2)) <= max_ijk_cell_image(2) .AND. &
    1253        7168 :              ABS(cell_ijk_abs(3)) <= max_ijk_cell_image(3)) THEN
    1254        7168 : 
    1255             :             inode = inode + 1
    1256        7168 : 
    1257           2 :             image = get_index_by_cell(cell_ijk_dbcsr, cell_to_index)
    1258             :             CPASSERT(image > 0)
    1259        7040 :             nl_local(neighbor_list_dbcsr_image_index, inode) = image
    1260             : 
    1261        7040 :             IF (do_symmetric .AND. iatom > jatom) THEN
    1262        7040 :                irow_blk = jatom
    1263        7040 :                icol_blk = iatom
    1264             :                cell_ijk_abs(1:3) = -cell_ijk_abs(1:3)
    1265        7040 :             ELSE
    1266       12672 :                irow_blk = iatom
    1267       12672 :                icol_blk = jatom
    1268       12672 :             END IF
    1269             : 
    1270             :             nl_local(neighbor_list_iatom_index, inode) = irow_blk
    1271             :             nl_local(neighbor_list_jatom_index, inode) = icol_blk
    1272             : 
    1273             :             cell_ijk_siesta(1:3) = index_in_canonical_enumeration(cell_ijk_abs(1:3)) ! absolute -> SIESTA
    1274        7040 : 
    1275        7040 :             IF (do_merge) THEN
    1276             :                node_merged_indices(inode) = (((cell_ijk_siesta(2) - 1)*ncells_siesta_local(1) + cell_ijk_siesta(1) - 1)* &
    1277       28160 :                                      INT(natoms, kind=int_8) + icol_blk - 1)*INT(natoms, kind=int_8) + INT(irow_blk - 1, kind=int_8)
    1278             :                image = cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1)
    1279        7040 :             ELSE
    1280             :                node_merged_indices(inode) = ((((cell_ijk_siesta(3) - 1)*ncells_siesta_local(2) + &
    1281           0 :                                                cell_ijk_siesta(2) - 1)*ncells_siesta_local(1) + cell_ijk_siesta(1) - 1)* &
    1282           0 :                                      INT(natoms, kind=int_8) + icol_blk - 1)*INT(natoms, kind=int_8) + INT(irow_blk - 1, kind=int_8)
    1283             :                image = cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1 + ncells_siesta(2)*(cell_ijk_siesta(3) - 1))
    1284             :             END IF
    1285             :             k_cells(inode) = cell_ijk_siesta(3)
    1286        7040 :             nl_local(neighbor_list_siesta_image_index, inode) = image
    1287        7040 : 
    1288             :             IF (do_symmetric .AND. irow_blk /= icol_blk) THEN
    1289        7040 :                cell_ijk_abs(1:3) = -cell_ijk_abs(1:3)
    1290        7040 :                cell_ijk_siesta(1:3) = index_in_canonical_enumeration(cell_ijk_abs(1:3)) ! absolute -> SIESTA
    1291             :                IF (do_merge) cell_ijk_siesta(3) = 1
    1292        7040 :                nl_local(neighbor_list_siesta_transp_image_index, inode) = &
    1293       25728 :                   cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1 + ncells_siesta(2)*(cell_ijk_siesta(3) - 1))
    1294       25728 :             ELSE
    1295        6432 :                nl_local(neighbor_list_siesta_transp_image_index, inode) = 0
    1296             :             END IF
    1297        6432 :          END IF
    1298             :       END DO
    1299         608 :       CALL neighbor_list_iterator_release(nl_iterator)
    1300             : 
    1301             :       IF (debug_this_module) THEN
    1302             :          CPASSERT(SIZE(nl_local, 2) == inode)
    1303           2 :       END IF
    1304             : 
    1305             :       CALL timestop(handle)
    1306             :    END SUBROUTINE get_nl_nodes_local
    1307             : 
    1308             : ! **************************************************************************************************
    1309           2 : !> \brief Replicate (and optionally merge) pair-wise neighbour list.
    1310           2 : !> \param repl_nl                   replicated neighbour list. It needs to be deallocated elsewhere [allocated]
    1311             : !> \param n_dbcsr_cell_images_to_merge  number of merged blocks per neighbour-list node [allocated]
    1312             : !> \param dbcsr_cell_image_to_merge list of DBCSR image indices to merge [allocated]
    1313             : !> \param nnodes_per_proc           number of merged nodes on each parallel processes [out]
    1314             : !> \param max_ijk_cell_image        largest index of cell images along i, j and k cell vectors [inout]
    1315             : !> \param sab_nl                    pair-wise neighbour list [in]
    1316             : !> \param para_env                  MPI parallel environment [in]
    1317             : !> \param particle_coords           list of atomic coordinates subject to periodic boundary conditions [in]
    1318             : !> \param cell                      simulation unit cell [in]
    1319             : !> \param cell_to_index             array to convert 3-D cell indices to 1-D DBCSR image indices [in]
    1320             : !> \param do_merge                  merge cell images along transport direction [in]
    1321             : ! **************************************************************************************************
    1322             :    SUBROUTINE replicate_neighbour_list(repl_nl, n_dbcsr_cell_images_to_merge, dbcsr_cell_image_to_merge, &
    1323             :                                        nnodes_per_proc, max_ijk_cell_image, sab_nl, para_env, particle_coords, &
    1324             :                                        cell, cell_to_index, do_merge)
    1325             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), &
    1326           2 :          INTENT(inout)                                   :: repl_nl
    1327           2 :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(inout)  :: n_dbcsr_cell_images_to_merge, &
    1328             :                                                             dbcsr_cell_image_to_merge
    1329             :       INTEGER, DIMENSION(0:), INTENT(out)                :: nnodes_per_proc
    1330             :       INTEGER, DIMENSION(3), INTENT(inout)               :: max_ijk_cell_image
    1331             :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
    1332             :          INTENT(in), POINTER                             :: sab_nl
    1333             :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
    1334             :       REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
    1335             :          INTENT(in)                                      :: particle_coords
    1336             :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1337             :       INTEGER, DIMENSION(:, :, :), INTENT(in), POINTER   :: cell_to_index
    1338             :       LOGICAL, INTENT(in)                                :: do_merge
    1339             : 
    1340             :       CHARACTER(len=*), PARAMETER :: routineN = 'replicate_neighbour_list'
    1341             : 
    1342             :       INTEGER                                            :: handle, inode, iproc, kcell_closest, &
    1343             :                                                             nnodes_local, nnodes_merged, &
    1344             :                                                             nnodes_repl, offset_inode
    1345             :       INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:)     :: node_merged_indices
    1346             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: inodes_orig, k_cells
    1347             :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: nl_local
    1348             :       INTEGER, DIMENSION(3)                              :: max_ijk_cell_image_local
    1349           2 : 
    1350           2 :       CALL timeset(routineN, handle)
    1351           2 :       CPASSERT(.NOT. ALLOCATED(repl_nl))
    1352             :       CPASSERT(.NOT. ALLOCATED(n_dbcsr_cell_images_to_merge))
    1353             :       CPASSERT(.NOT. ALLOCATED(dbcsr_cell_image_to_merge))
    1354           2 : 
    1355           2 :       CALL get_nnodes_local(nnodes_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, para_env, particle_coords, cell)
    1356           2 : 
    1357           2 :       nnodes_per_proc(:) = 0
    1358             : 
    1359           2 :       IF (nnodes_local > 0) THEN
    1360             :          ALLOCATE (nl_local(neighbor_list_dim1, nnodes_local))
    1361           6 :          ALLOCATE (node_merged_indices(nnodes_local))
    1362             :          ALLOCATE (k_cells(nnodes_local))
    1363           2 :          CALL get_nl_nodes_local(nl_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, particle_coords, cell, &
    1364           6 :                                  cell_to_index, do_merge, node_merged_indices, k_cells)
    1365           6 : 
    1366           6 :          ALLOCATE (inodes_orig(nnodes_local))
    1367             :          CALL sort(node_merged_indices, nnodes_local, inodes_orig)
    1368           2 : 
    1369             :          nnodes_merged = 1
    1370           4 :          DO inode = 2, nnodes_local
    1371           2 :             IF (node_merged_indices(inode) > node_merged_indices(inode - 1)) nnodes_merged = nnodes_merged + 1
    1372             :          END DO
    1373           2 :       ELSE
    1374        7040 :          nnodes_merged = 0
    1375        7040 :       END IF
    1376             : 
    1377             :       nnodes_per_proc(para_env%mepos) = nnodes_merged
    1378             :       CALL para_env%sum(nnodes_per_proc)
    1379             : 
    1380             :       nnodes_repl = SUM(nnodes_per_proc(:))
    1381           2 :       ALLOCATE (repl_nl(neighbor_list_dim1, nnodes_repl))
    1382          10 : 
    1383             :       IF (nnodes_local > 0) THEN
    1384           6 :          IF (para_env%mepos > 0) THEN
    1385           6 :             offset_inode = SUM(nnodes_per_proc(0:para_env%mepos - 1))
    1386             :          ELSE
    1387           2 :             offset_inode = 0
    1388           2 :          END IF
    1389           2 : 
    1390             :          ALLOCATE (n_dbcsr_cell_images_to_merge(nnodes_merged))
    1391             :          ALLOCATE (dbcsr_cell_image_to_merge(nnodes_local))
    1392             :          n_dbcsr_cell_images_to_merge(:) = 0
    1393             : 
    1394           6 :          nnodes_merged = 1 !offset_inode + 1
    1395           6 :          repl_nl(:, offset_inode + 1) = nl_local(:, inodes_orig(1))
    1396        7042 :          n_dbcsr_cell_images_to_merge(1) = 1
    1397             :          dbcsr_cell_image_to_merge(1) = nl_local(neighbor_list_dbcsr_image_index, inodes_orig(1))
    1398           2 :          kcell_closest = k_cells(inodes_orig(1))
    1399          12 :          DO inode = 2, nnodes_local
    1400           2 :             IF (node_merged_indices(inode) > node_merged_indices(inode - 1)) THEN
    1401           2 :                nnodes_merged = nnodes_merged + 1
    1402           2 :                repl_nl(:, offset_inode + nnodes_merged) = nl_local(:, inodes_orig(inode))
    1403        7040 :                !n_dbcsr_cell_images_to_merge(nnodes_merged) = 1
    1404        7038 :                kcell_closest = k_cells(inodes_orig(inode))
    1405        7038 :             ELSE
    1406       42228 :                IF (ABS(k_cells(inodes_orig(inode))) < ABS(kcell_closest) .OR. &
    1407             :                    (ABS(k_cells(inodes_orig(inode))) == ABS(kcell_closest) .AND. kcell_closest < 0)) THEN
    1408        7038 :                   repl_nl(:, offset_inode + nnodes_merged) = nl_local(:, inodes_orig(inode))
    1409             :                   kcell_closest = k_cells(inodes_orig(inode))
    1410           0 :                END IF
    1411           0 :             END IF
    1412           0 :             dbcsr_cell_image_to_merge(inode) = nl_local(neighbor_list_dbcsr_image_index, inodes_orig(inode))
    1413           0 :             n_dbcsr_cell_images_to_merge(nnodes_merged) = n_dbcsr_cell_images_to_merge(nnodes_merged) + 1
    1414             :          END DO
    1415             : 
    1416        7038 :          IF (debug_this_module) THEN
    1417        7040 :             CPASSERT(SUM(n_dbcsr_cell_images_to_merge) == nnodes_local)
    1418             :          END IF
    1419             : 
    1420             :          DEALLOCATE (inodes_orig)
    1421             :          DEALLOCATE (node_merged_indices, k_cells)
    1422             :          DEALLOCATE (nl_local)
    1423             :       END IF
    1424           2 : 
    1425           2 :       IF (para_env%num_pe > 1) THEN
    1426           2 :          offset_inode = 0
    1427             :          DO iproc = 0, para_env%num_pe - 1
    1428             :             IF (nnodes_per_proc(iproc) > 0) THEN
    1429           2 :                CALL para_env%bcast(repl_nl(:, offset_inode + 1:offset_inode + nnodes_per_proc(iproc)), iproc)
    1430           2 :                offset_inode = offset_inode + nnodes_per_proc(iproc)
    1431           6 :             END IF
    1432           6 :          END DO
    1433           4 :       END IF
    1434           4 : 
    1435             :       CALL timestop(handle)
    1436             :    END SUBROUTINE replicate_neighbour_list
    1437             : 
    1438             : ! **************************************************************************************************
    1439           2 : !> \brief Count number of DBCSR matrix elements that should be received from (*,nelements_dbcsr_recv)
    1440           4 : !>        and send to (*,nelements_dbcsr_send) each parallel process.
    1441             : !> \param nelements_per_proc number of non-zero matrix elements for each MPI process
    1442             : !> \param nnodes_per_proc    number of non-zero DBCSR matrix blocks (neighbour-list nodes)
    1443             : !> \param nl_repl            replicated neighbour list
    1444             : !> \param matrix_dbcsr_kp    DBCSR matrix
    1445             : !> \param symmetric          whether the DBCSR matrix is a symmetric one
    1446             : !> \param para_env           parallel environment
    1447             : !> \param gather_root        if >=0, gather all non-zero matrix element on the MPI process with
    1448             : !>                           gather_root rank (useful for bulk transport calculation).
    1449             : !>                           If <0, distribute non-zero matrix element across all MPI processes
    1450             : ! **************************************************************************************************
    1451             :    SUBROUTINE count_remote_dbcsr_elements(nelements_per_proc, nnodes_per_proc, nl_repl, matrix_dbcsr_kp, &
    1452             :                                           symmetric, para_env, gather_root)
    1453             :       INTEGER(kind=int_8), DIMENSION(0:, :), INTENT(out) :: nelements_per_proc
    1454             :       INTEGER, DIMENSION(0:), INTENT(in)                 :: nnodes_per_proc
    1455           2 :       INTEGER, DIMENSION(:, :), INTENT(in)               :: nl_repl
    1456             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
    1457             :       LOGICAL, INTENT(in)                                :: symmetric
    1458             :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
    1459             :       INTEGER, INTENT(in)                                :: gather_root
    1460             : 
    1461             :       CHARACTER(len=*), PARAMETER :: routineN = 'count_remote_dbcsr_elements'
    1462             : 
    1463             :       INTEGER :: first_row_minus_one, handle, icol_blk, image, image_transp, inode, inode_proc, &
    1464             :          iproc, iproc_orb, irow_blk, irow_local, mepos, ncols_blk, ncols_local, nnodes_proc, &
    1465             :          nprocs, nrows_blk, nrows_local, offset_inode
    1466             :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size, &
    1467             :                                                             row_blk_offset, row_blk_size
    1468             :       LOGICAL                                            :: do_distribute
    1469             : 
    1470           2 :       CALL timeset(routineN, handle)
    1471           2 :       nelements_per_proc(:, :) = 0
    1472             :       mepos = para_env%mepos
    1473             :       nprocs = para_env%num_pe
    1474           2 :       do_distribute = gather_root < 0
    1475          14 :       IF (debug_this_module) THEN
    1476           2 :          CPASSERT(SIZE(nnodes_per_proc) == nprocs)
    1477           2 :       END IF
    1478           2 : 
    1479             :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
    1480             :                           nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
    1481             :                           row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
    1482             :                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
    1483             : 
    1484             :       offset_inode = 0
    1485             :       iproc_orb = gather_root ! if do_distribute == .FALSE., send all matrix elements to MPI process with rank gather_root
    1486           2 :       DO iproc = LBOUND(nnodes_per_proc, 1), UBOUND(nnodes_per_proc, 1)
    1487             :          nnodes_proc = nnodes_per_proc(iproc)
    1488           2 :          DO inode_proc = 1, nnodes_proc
    1489           2 :             inode = inode_proc + offset_inode
    1490           8 : 
    1491           4 :             irow_blk = nl_repl(neighbor_list_iatom_index, inode)
    1492       14084 :             icol_blk = nl_repl(neighbor_list_jatom_index, inode)
    1493       14080 :             CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
    1494             :             image = nl_repl(neighbor_list_siesta_image_index, inode)
    1495       14080 :             image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
    1496       14080 : 
    1497       14080 :             IF (image > 0) THEN
    1498       14080 :                nrows_local = row_blk_size(irow_blk)
    1499       14080 :                first_row_minus_one = row_blk_offset(irow_blk) - 1
    1500             :                ncols_local = col_blk_size(icol_blk)
    1501       14080 :                DO irow_local = 1, nrows_local
    1502       14080 :                   IF (do_distribute) THEN
    1503       14080 : #if defined(__SMEAGOL)
    1504       14080 :                      CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc_orb)
    1505      140800 : #else
    1506      126720 :                      CALL cp_abort(__LOCATION__, &
    1507             :                                    "CP2K was compiled with no SMEAGOL support.")
    1508             : #endif
    1509             :                   END IF
    1510             :                   IF (iproc_orb == mepos) THEN
    1511           0 :                      nelements_per_proc(iproc, nelements_dbcsr_recv) = nelements_per_proc(iproc, nelements_dbcsr_recv) + &
    1512             :                                                                        ncols_local
    1513             :                   END IF
    1514      126720 : 
    1515             :                   IF (iproc == mepos) THEN
    1516       63360 :                      nelements_per_proc(iproc_orb, nelements_dbcsr_send) = nelements_per_proc(iproc_orb, nelements_dbcsr_send) + &
    1517             :                                                                            ncols_local
    1518             :                   END IF
    1519      140800 :                END DO
    1520             :             END IF
    1521       63360 : 
    1522             :             ! transposed block
    1523             :             IF (image_transp > 0) THEN
    1524             :                nrows_local = col_blk_size(icol_blk)
    1525             :                first_row_minus_one = col_blk_offset(icol_blk) - 1
    1526             :                ncols_local = row_blk_size(irow_blk)
    1527       14084 :                DO irow_local = 1, nrows_local
    1528       12864 :                   IF (do_distribute) THEN
    1529       12864 : #if defined(__SMEAGOL)
    1530       12864 :                      CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc_orb)
    1531      128640 : #else
    1532      115776 :                      CALL cp_abort(__LOCATION__, &
    1533             :                                    "CP2K was compiled with no SMEAGOL support.")
    1534             : #endif
    1535             :                   END IF
    1536             :                   IF (iproc_orb == mepos) THEN
    1537           0 :                      nelements_per_proc(iproc, nelements_dbcsr_recv) = nelements_per_proc(iproc, nelements_dbcsr_recv) + &
    1538             :                                                                        ncols_local
    1539             :                   END IF
    1540      115776 : 
    1541             :                   IF (iproc == mepos) THEN
    1542       57888 :                      nelements_per_proc(iproc_orb, nelements_dbcsr_send) = nelements_per_proc(iproc_orb, nelements_dbcsr_send) + &
    1543             :                                                                            ncols_local
    1544             :                   END IF
    1545      128640 :                END DO
    1546             :             END IF
    1547       57888 :          END DO
    1548             :          offset_inode = offset_inode + nnodes_proc
    1549             :       END DO
    1550             :       CALL timestop(handle)
    1551             :    END SUBROUTINE count_remote_dbcsr_elements
    1552           6 : 
    1553             : ! **************************************************************************************************
    1554           2 : !> \brief Construct list of non-zero matrix elements' indices in SIESTA format.
    1555           2 : !> \param n_nonzero_cols  number of non-zero matrix elements on each matrix row local to the current
    1556             : !>                        MPI process
    1557             : !> \param row_offset      offset of the first non-zero matrix elements for each locally-stores row
    1558             : !> \param col_index       sorted list of column indices of non-zero matrix element
    1559             : !> \param packed_index    original order of non-sorted column indices
    1560             : !> \param nl_repl         replicated neighbour list
    1561             : !> \param matrix_dbcsr_kp DBCSR matrix
    1562             : !> \param symmetric       whether the DBCSR matrix is a symmetric one
    1563             : !> \param para_env        parallel environment
    1564             : !> \param gather_root     if >=0, gather all non-zero matrix element on the MPI process with
    1565             : !>                        gather_root rank (useful for bulk transport calculation).
    1566             : !>                        If <0, distribute non-zero matrix element across all MPI processes
    1567             : ! **************************************************************************************************
    1568             :    SUBROUTINE get_nonzero_element_indices(n_nonzero_cols, row_offset, col_index, packed_index, &
    1569             :                                           nl_repl, matrix_dbcsr_kp, symmetric, para_env, gather_root)
    1570             :       INTEGER, DIMENSION(:), INTENT(out)                 :: n_nonzero_cols, row_offset, col_index, &
    1571             :                                                             packed_index
    1572           4 :       INTEGER, DIMENSION(:, :), INTENT(in)               :: nl_repl
    1573           2 :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in)       :: matrix_dbcsr_kp
    1574             :       LOGICAL, INTENT(in)                                :: symmetric
    1575             :       TYPE(mp_para_env_type), INTENT(in), POINTER        :: para_env
    1576             :       INTEGER, INTENT(in)                                :: gather_root
    1577             : 
    1578             :       CHARACTER(len=*), PARAMETER :: routineN = 'get_nonzero_element_indices'
    1579             : 
    1580             :       INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
    1581             :          icol_offset, image, image_transp, inode, irow_blk, irow_local, irow_proc, mepos, &
    1582             :          ncols_blk, ncols_local, ncols_total, nnodes, nprocs, nrows_blk, nrows_local, nrows_total
    1583             :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_offset, col_blk_size, &
    1584             :                                                             row_blk_offset, row_blk_size
    1585             :       LOGICAL                                            :: do_distribute, is_root_rank
    1586             : 
    1587           2 :       CALL timeset(routineN, handle)
    1588           2 :       n_nonzero_cols(:) = 0
    1589             :       mepos = para_env%mepos
    1590             :       nprocs = para_env%num_pe
    1591           2 :       do_distribute = gather_root < 0
    1592         291 :       is_root_rank = gather_root == mepos
    1593           2 : 
    1594           2 :       CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
    1595           2 :                           nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
    1596           2 :                           nfullrows_total=nrows_total, nfullcols_total=ncols_total, &
    1597             :                           row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
    1598             :                           row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
    1599             : 
    1600             :       nnodes = SIZE(nl_repl, 2)
    1601             :       DO inode = 1, nnodes
    1602           2 :          irow_blk = nl_repl(neighbor_list_iatom_index, inode)
    1603             :          icol_blk = nl_repl(neighbor_list_jatom_index, inode)
    1604           2 :          CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
    1605       14082 :          image = nl_repl(neighbor_list_siesta_image_index, inode)
    1606       14080 :          image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
    1607       14080 : 
    1608       14080 :          IF (image > 0) THEN
    1609       14080 :             nrows_local = row_blk_size(irow_blk)
    1610       14080 :             first_row_minus_one = row_blk_offset(irow_blk) - 1
    1611             :             ncols_local = col_blk_size(icol_blk)
    1612       14080 :             DO irow_local = 1, nrows_local
    1613       14080 :                IF (do_distribute) THEN
    1614       14080 : #if defined(__SMEAGOL)
    1615       14080 :                   CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
    1616      140800 : #else
    1617      126720 :                   CALL cp_abort(__LOCATION__, &
    1618             :                                 "CP2K was compiled with no SMEAGOL support.")
    1619             : #endif
    1620             :                ELSE
    1621             :                   IF (is_root_rank) THEN
    1622           0 :                      irow_proc = irow_local + first_row_minus_one
    1623             :                   ELSE
    1624             :                      irow_proc = 0
    1625      126720 :                   END IF
    1626       63360 :                END IF
    1627             :                IF (irow_proc > 0) THEN
    1628             :                   n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
    1629             :                END IF
    1630             :             END DO
    1631       77440 :          END IF
    1632       63360 : 
    1633             :          ! transposed block
    1634             :          IF (image_transp > 0) THEN
    1635             :             nrows_local = col_blk_size(icol_blk)
    1636             :             first_row_minus_one = col_blk_offset(icol_blk) - 1
    1637             :             ncols_local = row_blk_size(irow_blk)
    1638       14082 :             DO irow_local = 1, nrows_local
    1639       12864 :                IF (do_distribute) THEN
    1640       12864 : #if defined(__SMEAGOL)
    1641       12864 :                   CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
    1642      128640 : #else
    1643      115776 :                   CALL cp_abort(__LOCATION__, &
    1644             :                                 "CP2K was compiled with no SMEAGOL support.")
    1645             : #endif
    1646             :                ELSE
    1647             :                   IF (is_root_rank) THEN
    1648           0 :                      irow_proc = irow_local + first_row_minus_one
    1649             :                   ELSE
    1650             :                      irow_proc = 0
    1651      115776 :                   END IF
    1652       57888 :                END IF
    1653             :                IF (irow_proc > 0) THEN
    1654             :                   n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
    1655             :                END IF
    1656             :             END DO
    1657       70752 :          END IF
    1658       57888 :       END DO
    1659             : 
    1660             :       row_offset(1) = 0
    1661             :       DO irow_local = 1, SIZE(n_nonzero_cols) - 1
    1662             :          row_offset(irow_local + 1) = row_offset(irow_local) + n_nonzero_cols(irow_local)
    1663             :       END DO
    1664           2 : 
    1665         289 :       n_nonzero_cols(:) = 0
    1666         289 :       col_index(:) = 0
    1667             :       DO inode = 1, nnodes
    1668             :          irow_blk = nl_repl(neighbor_list_iatom_index, inode)
    1669         291 :          icol_blk = nl_repl(neighbor_list_jatom_index, inode)
    1670     1091235 :          CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
    1671       14082 :          image = nl_repl(neighbor_list_siesta_image_index, inode)
    1672       14080 :          image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
    1673       14080 : 
    1674       14080 :          IF (image > 0) THEN
    1675       14080 :             nrows_local = row_blk_size(irow_blk)
    1676       14080 :             first_row_minus_one = row_blk_offset(irow_blk) - 1
    1677             :             ncols_local = col_blk_size(icol_blk)
    1678       14080 :             first_col_minus_one = col_blk_offset(icol_blk) + (image - 1)*ncols_total - 1
    1679       14080 :             DO irow_local = 1, nrows_local
    1680       14080 :                IF (do_distribute) THEN
    1681       14080 : #if defined(__SMEAGOL)
    1682       14080 :                   CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
    1683      140800 : #else
    1684      126720 :                   CALL cp_abort(__LOCATION__, &
    1685             :                                 "CP2K was compiled with no SMEAGOL support.")
    1686             : #endif
    1687             :                ELSE
    1688             :                   IF (is_root_rank) THEN
    1689           0 :                      irow_proc = irow_local + first_row_minus_one
    1690             :                   ELSE
    1691             :                      irow_proc = 0
    1692      126720 :                   END IF
    1693       63360 :                END IF
    1694             :                IF (irow_proc > 0) THEN
    1695             :                   icol_offset = row_offset(irow_proc) + n_nonzero_cols(irow_proc)
    1696             :                   DO icol_local = 1, ncols_local
    1697             :                      col_index(icol_offset + icol_local) = first_col_minus_one + icol_local
    1698       77440 :                   END DO
    1699       63360 :                   n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
    1700      633600 :                END IF
    1701      633600 :             END DO
    1702             :          END IF
    1703       63360 : 
    1704             :          ! transposed block
    1705             :          IF (image_transp > 0) THEN
    1706             :             nrows_local = col_blk_size(icol_blk)
    1707             :             first_row_minus_one = col_blk_offset(icol_blk) - 1
    1708             :             ncols_local = row_blk_size(irow_blk)
    1709       14082 :             first_col_minus_one = row_blk_offset(irow_blk) + (image_transp - 1)*nrows_total - 1
    1710       12864 :             DO irow_local = 1, nrows_local
    1711       12864 :                IF (do_distribute) THEN
    1712       12864 : #if defined(__SMEAGOL)
    1713       12864 :                   CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
    1714      128640 : #else
    1715      115776 :                   CALL cp_abort(__LOCATION__, &
    1716             :                                 "CP2K was compiled with no SMEAGOL support.")
    1717             : #endif
    1718             :                ELSE
    1719             :                   IF (is_root_rank) THEN
    1720           0 :                      irow_proc = irow_local + first_row_minus_one
    1721             :                   ELSE
    1722             :                      irow_proc = 0
    1723      115776 :                   END IF
    1724       57888 :                END IF
    1725             :                IF (irow_proc > 0) THEN
    1726             :                   icol_offset = row_offset(irow_proc) + n_nonzero_cols(irow_proc)
    1727             :                   DO icol_local = 1, ncols_local
    1728             :                      col_index(icol_offset + icol_local) = first_col_minus_one + icol_local
    1729       70752 :                   END DO
    1730       57888 :                   n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
    1731      578880 :                END IF
    1732      578880 :             END DO
    1733             :          END IF
    1734       57888 :       END DO
    1735             : 
    1736             :       IF (SIZE(n_nonzero_cols) > 0) THEN
    1737             :          DO irow_local = 1, SIZE(n_nonzero_cols)
    1738             :             CALL sort(col_index(row_offset(irow_local) + 1:row_offset(irow_local) + n_nonzero_cols(irow_local)), &
    1739             :                       n_nonzero_cols(irow_local), &
    1740           2 :                       packed_index(row_offset(irow_local) + 1:row_offset(irow_local) + n_nonzero_cols(irow_local)))
    1741         291 :          END DO
    1742             :       END IF
    1743             : 
    1744         291 :       CALL timestop(handle)
    1745             :    END SUBROUTINE get_nonzero_element_indices
    1746             : 
    1747             : ! **************************************************************************************************
    1748           2 : !> \brief Get absolute i, j, and k indices of cell image for the given DBCSR matrix block.
    1749           2 : !>        Effective cell image recorded in the neighbour-list (where the matrix block is actually
    1750             : !>        stored) depends on atomic coordinates can be significantly different.
    1751             : !> \param cell_ijk      array with 3 indices along the cell's vectors
    1752             : !> \param r_ij          actual interatomic distance (vector R_j - r_i), where R_j is the coordinates
    1753             : !>                      of the j-th atom in the supercell
    1754             : !> \param r_i           coordinates of the i-th atom in the primary unit cell
    1755             : !> \param r_j           coordinates of the j-th atom in the primary unit cell
    1756             : !> \param cell          unit cell
    1757             : ! **************************************************************************************************
    1758             :    SUBROUTINE get_negf_cell_ijk(cell_ijk, r_ij, r_i, r_j, cell)
    1759             :       INTEGER, DIMENSION(3), INTENT(out)                 :: cell_ijk
    1760             :       REAL(kind=dp), DIMENSION(3), INTENT(in)            :: r_ij, r_i, r_j
    1761             :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1762       14336 : 
    1763             :       REAL(kind=dp), DIMENSION(3)                        :: coords_scaled, r
    1764             : 
    1765             :       r(:) = r_ij(:) + r_i(:) - r_j(:)
    1766             :       CALL real_to_scaled(coords_scaled, r, cell)
    1767             :       cell_ijk(:) = NINT(coords_scaled(:))
    1768             :    END SUBROUTINE get_negf_cell_ijk
    1769       57344 : 
    1770       14336 : ! **************************************************************************************************
    1771       57344 : !> \brief Return the index of an integer number in the sequence 0, 1, -1, ..., n, -n, ...
    1772       14336 : !>        (canonical enumeration of integers, oeis.org/A001057).
    1773             : !> \param  inum     integer number [in]
    1774             : !> \return index of 'inum' in A001057
    1775             : !> \note Cell images in SMEAGOL / SIESTA are ordered according to A001057. Therefore this
    1776             : !>       function converts the absolute index of a cell replica along some (x/y/z) dimension
    1777             : !>       into its corresponding SIESTA's index.
    1778             : ! **************************************************************************************************
    1779             :    ELEMENTAL FUNCTION index_in_canonical_enumeration(inum) RESULT(ind)
    1780             :       INTEGER, INTENT(in)                                :: inum
    1781             :       INTEGER                                            :: ind
    1782             : 
    1783       40416 :       INTEGER                                            :: inum_abs, is_non_positive
    1784             : 
    1785             :       inum_abs = ABS(inum)
    1786             :       !IF (inum <= 0) THEN; is_non_positive = 1; ELSE; is_non_positive = 0; END IF
    1787             :       is_non_positive = MERGE(1, 0, inum <= 0)
    1788             : 
    1789       40416 :       ! inum =  0 -> inum_abs = 0, is_non_positive = 1 -> ind = 1
    1790             :       ! inum =  1 -> inum_abs = 1, is_non_positive = 0 -> ind = 2
    1791       40416 :       ! inum = -1 -> inum_abs = 1, is_non_positive = 1 -> ind = 3
    1792             :       ind = 2*inum_abs + is_non_positive
    1793             :    END FUNCTION index_in_canonical_enumeration
    1794             : 
    1795             : ! **************************************************************************************************
    1796       40416 : !> \brief Return an integer number according to its index in the sequence 0, 1, -1, ..., n, -n, ...
    1797       40416 : !>        (canonical enumeration of integers, oeis.org/A001057)
    1798             : !> \param  ind       index in A001057 starting from 1
    1799             : !> \return integer number according to its position 'ind' in A001057
    1800             : !> \note Cell images in SMEAGOL / SIESTA are ordered according to A001057. Therefore this
    1801             : !>       function converts SIESTA's index of a cell replica along some (x/y/z) dimension
    1802             : !>       into the corresponding absolute index.
    1803             : ! **************************************************************************************************
    1804             :    ELEMENTAL FUNCTION number_from_canonical_enumeration(ind) RESULT(inum)
    1805             :       INTEGER, INTENT(in)                                :: ind
    1806             :       INTEGER                                            :: inum
    1807             : 
    1808         186 :       ! ind < 1 is invalid
    1809             :       ! ind = 1 -> SIGN(0, -1) =  0
    1810             :       ! ind = 2 -> SIGN(1,  0) =  1
    1811             :       ! ind = 3 -> SIGN(1, -1) = -1
    1812             :       inum = SIGN(ind/2, -MOD(ind, 2))
    1813             :    END FUNCTION number_from_canonical_enumeration
    1814             : 
    1815             : ! **************************************************************************************************
    1816         186 : !> \brief   Apply periodic boundary conditions defined by a simulation cell to a position
    1817         186 : !>          vector r. Similar to pbc1 from cell_types.F but returns unscaled coordinates from
    1818             : !>          the scaled range [0, 1) instead of [-0.5, 0.5)
    1819             : !> \param r_pbc   position vector subject to the periodic boundary conditions [out]
    1820             : !> \param r       initial position vector [in]
    1821             : !> \param cell    simulation unit cell [in]
    1822             : ! **************************************************************************************************
    1823             :    PURE SUBROUTINE pbc_0_1(r_pbc, r, cell)
    1824             :       REAL(KIND=dp), DIMENSION(3), INTENT(out)           :: r_pbc
    1825             :       REAL(KIND=dp), DIMENSION(3), INTENT(in)            :: r
    1826             :       TYPE(cell_type), INTENT(in), POINTER               :: cell
    1827          64 : 
    1828             :       REAL(KIND=dp), DIMENSION(3)                        :: s
    1829             : 
    1830             :       IF (cell%orthorhombic) THEN
    1831             :          r_pbc(1) = r(1) - cell%hmat(1, 1)*cell%perd(1)*REAL(FLOOR(cell%h_inv(1, 1)*r(1)), dp)
    1832             :          r_pbc(2) = r(2) - cell%hmat(2, 2)*cell%perd(2)*REAL(FLOOR(cell%h_inv(2, 2)*r(2)), dp)
    1833             :          r_pbc(3) = r(3) - cell%hmat(3, 3)*cell%perd(3)*REAL(FLOOR(cell%h_inv(3, 3)*r(3)), dp)
    1834          64 :       ELSE
    1835          64 :          s(1) = cell%h_inv(1, 1)*r(1) + cell%h_inv(1, 2)*r(2) + cell%h_inv(1, 3)*r(3)
    1836          64 :          s(2) = cell%h_inv(2, 1)*r(1) + cell%h_inv(2, 2)*r(2) + cell%h_inv(2, 3)*r(3)
    1837          64 :          s(3) = cell%h_inv(3, 1)*r(1) + cell%h_inv(3, 2)*r(2) + cell%h_inv(3, 3)*r(3)
    1838             :          s(1) = s(1) - cell%perd(1)*REAL(FLOOR(s(1)), dp)
    1839           0 :          s(2) = s(2) - cell%perd(2)*REAL(FLOOR(s(2)), dp)
    1840           0 :          s(3) = s(3) - cell%perd(3)*REAL(FLOOR(s(3)), dp)
    1841           0 :          r_pbc(1) = cell%hmat(1, 1)*s(1) + cell%hmat(1, 2)*s(2) + cell%hmat(1, 3)*s(3)
    1842           0 :          r_pbc(2) = cell%hmat(2, 1)*s(1) + cell%hmat(2, 2)*s(2) + cell%hmat(2, 3)*s(3)
    1843           0 :          r_pbc(3) = cell%hmat(3, 1)*s(1) + cell%hmat(3, 2)*s(2) + cell%hmat(3, 3)*s(3)
    1844           0 :       END IF
    1845           0 :    END SUBROUTINE pbc_0_1
    1846           0 : 
    1847           0 : ! **************************************************************************************************
    1848             : !> \brief Computes the number of send requests from this MPI process to all the other processes.
    1849          64 : !>        Alternatively computes the number of recv requests per MPI process that the given process
    1850             : !>        expects.
    1851             : !> \param mepos                    MPI rank of the given process
    1852             : !> \param nelements_per_proc       number of element to send / receive
    1853             : !> \param max_nelements_per_packet maximum number of elements per single MPI request
    1854             : !> \return number of MPI requests
    1855             : ! **************************************************************************************************
    1856             :    PURE FUNCTION get_number_of_mpi_sendrecv_requests(mepos, nelements_per_proc, max_nelements_per_packet) RESULT(nrequests)
    1857             :       INTEGER, INTENT(in)                                :: mepos
    1858             :       INTEGER(kind=int_8), DIMENSION(0:), INTENT(in)     :: nelements_per_proc
    1859             :       INTEGER(kind=int_8), INTENT(in)                    :: max_nelements_per_packet
    1860          16 :       INTEGER                                            :: nrequests
    1861             : 
    1862             :       INTEGER                                            :: iproc
    1863             : 
    1864             :       nrequests = 0
    1865             :       DO iproc = LBOUND(nelements_per_proc, 1), UBOUND(nelements_per_proc, 1)
    1866             :          ! there is no need to send data to the same MPI process
    1867             :          IF (iproc /= mepos) THEN
    1868          16 :             nrequests = nrequests + INT(nelements_per_proc(iproc)/max_nelements_per_packet)
    1869          64 :             IF (MOD(nelements_per_proc(iproc), max_nelements_per_packet) > 0) &
    1870             :                nrequests = nrequests + 1
    1871          48 :          END IF
    1872          16 :       END DO
    1873          16 :    END FUNCTION get_number_of_mpi_sendrecv_requests
    1874           8 : 
    1875             : ! **************************************************************************************************
    1876             : !> \brief Map non-zero matrix elements on to MPI requests.
    1877          16 : !> \param element_offset           offset (index-1) of the first element [out]
    1878             : !> \param nelements_per_request    number of element for each request [out]
    1879             : !> \param peer_rank                rank of a peering MPI process
    1880             : !> \param tag                      MPI tag
    1881             : !> \param mepos                    MPI rank of a given MPI process
    1882             : !> \param nelements_per_proc       number of element to send / receive by the current MPI process
    1883             : !> \param max_nelements_per_packet maximum number of elements per single MPI request
    1884             : ! **************************************************************************************************
    1885             :    SUBROUTINE assign_nonzero_elements_to_requests(element_offset, nelements_per_request, peer_rank, tag, &
    1886             :                                                   mepos, nelements_per_proc, max_nelements_per_packet)
    1887             :       INTEGER(kind=int_8), DIMENSION(:), INTENT(out)     :: element_offset, nelements_per_request
    1888             :       INTEGER, DIMENSION(:), INTENT(out)                 :: peer_rank, tag
    1889           8 :       INTEGER, INTENT(in)                                :: mepos
    1890           8 :       INTEGER(kind=int_8), DIMENSION(0:), INTENT(in)     :: nelements_per_proc
    1891             :       INTEGER(kind=int_8), INTENT(in)                    :: max_nelements_per_packet
    1892             : 
    1893             :       INTEGER                                            :: iproc, irequest, nrequests, &
    1894             :                                                             request_offset
    1895             :       INTEGER(kind=int_8)                                :: element_offset_tmp, nelements
    1896             : 
    1897             :       request_offset = 0
    1898             :       element_offset_tmp = 0
    1899             :       DO iproc = LBOUND(nelements_per_proc, 1), UBOUND(nelements_per_proc, 1)
    1900             :          IF (iproc /= mepos) THEN
    1901           8 :             nrequests = INT(nelements_per_proc(iproc)/max_nelements_per_packet)
    1902           8 :             IF (MOD(nelements_per_proc(iproc), max_nelements_per_packet) > 0) nrequests = nrequests + 1
    1903          32 :             CPASSERT(nrequests <= max_mpi_rank + 1)
    1904          16 :             IF (nrequests > 0) THEN
    1905           8 :                nelements = nelements_per_proc(iproc)/nrequests
    1906           8 :                IF (nelements_per_proc(iproc) - nelements*nrequests > 0) nelements = nelements + 1
    1907           8 :                CPASSERT(nelements <= max_nelements_per_packet)
    1908           8 : 
    1909           8 :                DO irequest = 1, nrequests
    1910           8 :                   element_offset(request_offset + irequest) = (irequest - 1)*nelements + element_offset_tmp
    1911           8 :                   IF (irequest < nrequests) THEN
    1912             :                      nelements_per_request(request_offset + irequest) = nelements
    1913          16 :                   ELSE
    1914           8 :                      nelements_per_request(request_offset + irequest) = nelements_per_proc(iproc) - nelements*(nrequests - 1)
    1915           8 :                   END IF
    1916           0 :                   peer_rank(request_offset + irequest) = iproc
    1917             :                   tag(request_offset + irequest) = irequest - 1
    1918           8 :                END DO
    1919             :             END IF
    1920           8 :             request_offset = request_offset + nrequests
    1921          16 :          END IF
    1922             :          element_offset_tmp = element_offset_tmp + nelements_per_proc(iproc)
    1923             :       END DO
    1924           8 : 
    1925             :       IF (debug_this_module) THEN
    1926          24 :          CPASSERT(SIZE(element_offset) == request_offset)
    1927             :          CPASSERT(SIZE(nelements_per_request) == request_offset)
    1928             :          CPASSERT(SIZE(peer_rank) == request_offset)
    1929             :          CPASSERT(SIZE(tag) == request_offset)
    1930             :       END IF
    1931             :    END SUBROUTINE assign_nonzero_elements_to_requests
    1932             : 
    1933             : END MODULE smeagol_matrix_utils

Generated by: LCOV version 1.15