LCOV - code coverage report
Current view: top level - src - rpa_grad.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:2fce0f8) Lines: 931 1141 81.6 %
Date: 2024-12-21 06:28:57 Functions: 26 32 81.2 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief Routines to calculate RI-RPA and SOS-MP2 gradients
      10             : !> \par History
      11             : !>      10.2021 created [Frederick Stein]
      12             : ! **************************************************************************************************
      13             : MODULE rpa_grad
      14             :    USE cp_array_utils,                  ONLY: cp_1d_r_cp_type,&
      15             :                                               cp_3d_r_cp_type
      16             :    USE cp_blacs_env,                    ONLY: cp_blacs_env_type
      17             :    USE cp_fm_basic_linalg,              ONLY: cp_fm_geadd,&
      18             :                                               cp_fm_scale_and_add,&
      19             :                                               cp_fm_upper_to_full
      20             :    USE cp_fm_cholesky,                  ONLY: cp_fm_cholesky_invert
      21             :    USE cp_fm_struct,                    ONLY: cp_fm_struct_create,&
      22             :                                               cp_fm_struct_get,&
      23             :                                               cp_fm_struct_release,&
      24             :                                               cp_fm_struct_type
      25             :    USE cp_fm_types,                     ONLY: cp_fm_create,&
      26             :                                               cp_fm_get_info,&
      27             :                                               cp_fm_release,&
      28             :                                               cp_fm_set_all,&
      29             :                                               cp_fm_to_fm,&
      30             :                                               cp_fm_to_fm_submat_general,&
      31             :                                               cp_fm_type
      32             :    USE dgemm_counter_types,             ONLY: dgemm_counter_start,&
      33             :                                               dgemm_counter_stop,&
      34             :                                               dgemm_counter_type
      35             :    USE group_dist_types,                ONLY: create_group_dist,&
      36             :                                               get_group_dist,&
      37             :                                               group_dist_d1_type,&
      38             :                                               group_dist_proc,&
      39             :                                               maxsize,&
      40             :                                               release_group_dist
      41             :    USE kahan_sum,                       ONLY: accurate_dot_product,&
      42             :                                               accurate_dot_product_2
      43             :    USE kinds,                           ONLY: dp,&
      44             :                                               int_8
      45             :    USE libint_2c_3c,                    ONLY: compare_potential_types
      46             :    USE local_gemm_api,                  ONLY: LOCAL_GEMM_PU_GPU,&
      47             :                                               local_gemm_ctxt_type
      48             :    USE machine,                         ONLY: m_flush,&
      49             :                                               m_memory
      50             :    USE mathconstants,                   ONLY: pi
      51             :    USE message_passing,                 ONLY: mp_comm_type,&
      52             :                                               mp_para_env_type,&
      53             :                                               mp_request_null,&
      54             :                                               mp_request_type,&
      55             :                                               mp_waitall,&
      56             :                                               mp_waitany
      57             :    USE mp2_laplace,                     ONLY: calc_fm_mat_s_laplace
      58             :    USE mp2_ri_grad_util,                ONLY: array2fm,&
      59             :                                               create_dbcsr_gamma,&
      60             :                                               fm2array,&
      61             :                                               prepare_redistribution
      62             :    USE mp2_types,                       ONLY: mp2_type,&
      63             :                                               one_dim_int_array,&
      64             :                                               two_dim_int_array,&
      65             :                                               two_dim_real_array
      66             :    USE parallel_gemm_api,               ONLY: parallel_gemm
      67             :    USE qs_environment_types,            ONLY: get_qs_env,&
      68             :                                               qs_environment_type
      69             :    USE rpa_util,                        ONLY: calc_fm_mat_S_rpa,&
      70             :                                               remove_scaling_factor_rpa
      71             :    USE util,                            ONLY: get_limit
      72             : #include "./base/base_uses.f90"
      73             : 
      74             :    IMPLICIT NONE
      75             : 
      76             :    PRIVATE
      77             : 
      78             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'rpa_grad'
      79             : 
      80             :    PUBLIC :: rpa_grad_needed_mem, rpa_grad_type, rpa_grad_create, rpa_grad_finalize, rpa_grad_matrix_operations, rpa_grad_copy_Q
      81             : 
      82             :    TYPE sos_mp2_grad_work_type
      83             :       PRIVATE
      84             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: pair_list
      85             :       TYPE(one_dim_int_array), DIMENSION(:), ALLOCATABLE :: index2send, index2recv
      86             :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: P
      87             :    END TYPE
      88             : 
      89             :    TYPE rpa_grad_work_type
      90             :       TYPE(cp_fm_type) :: fm_mat_Q_copy = cp_fm_type()
      91             :       TYPE(one_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2send
      92             :       TYPE(two_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2recv
      93             :       TYPE(group_dist_d1_type), DIMENSION(:), ALLOCATABLE :: gd_homo, gd_virtual
      94             :       INTEGER, DIMENSION(2) :: grid = -1, mepos = -1
      95             :       TYPE(two_dim_real_array), DIMENSION(:), ALLOCATABLE :: P_ij, P_ab
      96             :    END TYPE
      97             : 
      98             :    TYPE rpa_grad_type
      99             :       PRIVATE
     100             :       TYPE(cp_fm_type) :: fm_Gamma_PQ = cp_fm_type()
     101             :       TYPE(cp_fm_type), DIMENSION(:), ALLOCATABLE :: fm_Y
     102             :       TYPE(sos_mp2_grad_work_type), ALLOCATABLE, DIMENSION(:) :: sos_mp2_work_occ, sos_mp2_work_virt
     103             :       TYPE(rpa_grad_work_type) :: rpa_work
     104             :    END TYPE
     105             : 
     106             :    INTEGER, PARAMETER :: spla_threshold = 128*128*128*2
     107             :    INTEGER, PARAMETER :: blksize_threshold = 4
     108             : 
     109             : CONTAINS
     110             : 
     111             : ! **************************************************************************************************
     112             : !> \brief Calculates the necessary minimum memory for the Gradient code ion MiB
     113             : !> \param homo ...
     114             : !> \param virtual ...
     115             : !> \param dimen_RI ...
     116             : !> \param mem_per_rank ...
     117             : !> \param mem_per_repl ...
     118             : !> \param do_ri_sos_laplace_mp2 ...
     119             : !> \return ...
     120             : ! **************************************************************************************************
     121          40 :    PURE SUBROUTINE rpa_grad_needed_mem(homo, virtual, dimen_RI, mem_per_rank, mem_per_repl, do_ri_sos_laplace_mp2)
     122             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     123             :       INTEGER, INTENT(IN)                                :: dimen_RI
     124             :       REAL(KIND=dp), INTENT(INOUT)                       :: mem_per_rank, mem_per_repl
     125             :       LOGICAL, INTENT(IN)                                :: do_ri_sos_laplace_mp2
     126             : 
     127             :       REAL(KIND=dp)                                      :: mem_iaK, mem_KL, mem_pab, mem_pij
     128             : 
     129          88 :       mem_iaK = SUM(REAL(virtual, KIND=dp)*homo)*dimen_RI
     130          88 :       mem_pij = SUM(REAL(homo, KIND=dp)**2)
     131          88 :       mem_pab = SUM(REAL(virtual, KIND=dp)**2)
     132          40 :       mem_KL = REAL(dimen_RI, KIND=dp)*dimen_RI
     133             : 
     134             :       ! Required matrices iaK
     135             :       ! Ytot_iaP = sum_tau Y_iaP(tau)
     136             :       ! Y_iaP(tau) = S_iaP(tau)*Q_PQ(tau) (work array)
     137             :       ! Required matrices density matrices
     138             :       ! Pij (local)
     139             :       ! Pab (local)
     140             :       ! Additionally with SOS-MP2
     141             :       ! Send and receive buffers for degenerate orbital pairs (rough estimate: everything)
     142             :       ! Additionally with RPA
     143             :       ! copy of work matrix
     144             :       ! receive buffer for calculation of density matrix
     145             :       ! copy of matrix Q
     146          40 :       mem_per_rank = mem_per_rank + (mem_pij + mem_pab)*8.0_dp/(1024**2)
     147          40 :       mem_per_repl = mem_per_repl + (mem_iaK + 2.0_dp*mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
     148          40 :       IF (.NOT. do_ri_sos_laplace_mp2) THEN
     149          20 :          mem_per_repl = mem_per_rank + (mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
     150             :       END IF
     151             : 
     152          40 :    END SUBROUTINE rpa_grad_needed_mem
     153             : 
     154             : ! **************************************************************************************************
     155             : !> \brief Creates the arrays of a rpa_grad_type
     156             : !> \param rpa_grad ...
     157             : !> \param fm_mat_Q ...
     158             : !> \param fm_mat_S ...
     159             : !> \param homo ...
     160             : !> \param virtual ...
     161             : !> \param mp2_env ...
     162             : !> \param Eigenval ...
     163             : !> \param unit_nr ...
     164             : !> \param do_ri_sos_laplace_mp2 ...
     165             : ! **************************************************************************************************
     166         280 :    SUBROUTINE rpa_grad_create(rpa_grad, fm_mat_Q, fm_mat_S, &
     167          40 :                               homo, virtual, mp2_env, Eigenval, unit_nr, do_ri_sos_laplace_mp2)
     168             :       TYPE(rpa_grad_type), INTENT(OUT)                   :: rpa_grad
     169             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_Q
     170             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_S
     171             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     172             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
     173             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
     174             :       INTEGER, INTENT(IN)                                :: unit_nr
     175             :       LOGICAL, INTENT(IN)                                :: do_ri_sos_laplace_mp2
     176             : 
     177             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'rpa_grad_create'
     178             : 
     179             :       INTEGER                                            :: handle, ispin, nrow_local, nspins
     180             : 
     181          40 :       CALL timeset(routineN, handle)
     182             : 
     183          40 :       CALL cp_fm_create(rpa_grad%fm_Gamma_PQ, matrix_struct=fm_mat_Q%matrix_struct)
     184          40 :       CALL cp_fm_set_all(rpa_grad%fm_Gamma_PQ, 0.0_dp)
     185             : 
     186          40 :       nspins = SIZE(fm_mat_S)
     187             : 
     188         168 :       ALLOCATE (rpa_grad%fm_Y(nspins))
     189          88 :       DO ispin = 1, nspins
     190          88 :          CALL cp_fm_create(rpa_grad%fm_Y(ispin), fm_mat_S(ispin)%matrix_struct)
     191             :       END DO
     192             : 
     193          40 :       IF (do_ri_sos_laplace_mp2) THEN
     194             :          CALL sos_mp2_work_type_create(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
     195          20 :                                        unit_nr, Eigenval, homo, virtual, mp2_env%ri_grad%eps_canonical, fm_mat_S)
     196             :       ELSE
     197          20 :          CALL rpa_work_type_create(rpa_grad%rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
     198             :       END IF
     199             : 
     200             :       ! Set blocksize
     201          40 :       CALL cp_fm_struct_get(fm_mat_S(1)%matrix_struct, nrow_local=nrow_local)
     202          40 :       IF (mp2_env%ri_grad%dot_blksize < 1) mp2_env%ri_grad%dot_blksize = nrow_local
     203          40 :       mp2_env%ri_grad%dot_blksize = MIN(mp2_env%ri_grad%dot_blksize, nrow_local)
     204          40 :       IF (unit_nr > 0) THEN
     205          20 :          WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Block size for the contraction:', mp2_env%ri_grad%dot_blksize
     206          20 :          CALL m_flush(unit_nr)
     207             :       END IF
     208          40 :       CALL fm_mat_S(1)%matrix_struct%para_env%sync()
     209             : 
     210          40 :       CALL timestop(handle)
     211             : 
     212          80 :    END SUBROUTINE rpa_grad_create
     213             : 
     214             : ! **************************************************************************************************
     215             : !> \brief ...
     216             : !> \param sos_mp2_work_occ ...
     217             : !> \param sos_mp2_work_virt ...
     218             : !> \param unit_nr ...
     219             : !> \param Eigenval ...
     220             : !> \param homo ...
     221             : !> \param virtual ...
     222             : !> \param eps_degenerate ...
     223             : !> \param fm_mat_S ...
     224             : ! **************************************************************************************************
     225          20 :    SUBROUTINE sos_mp2_work_type_create(sos_mp2_work_occ, sos_mp2_work_virt, unit_nr, &
     226          20 :                                        Eigenval, homo, virtual, eps_degenerate, fm_mat_S)
     227             :       TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
     228             :          DIMENSION(:), INTENT(OUT)                       :: sos_mp2_work_occ, sos_mp2_work_virt
     229             :       INTEGER, INTENT(IN)                                :: unit_nr
     230             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
     231             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     232             :       REAL(KIND=dp), INTENT(IN)                          :: eps_degenerate
     233             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_S
     234             : 
     235             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_work_type_create'
     236             : 
     237             :       INTEGER                                            :: handle, ispin, nspins
     238             : 
     239          20 :       CALL timeset(routineN, handle)
     240             : 
     241          20 :       nspins = SIZE(fm_mat_S)
     242         148 :       ALLOCATE (sos_mp2_work_occ(nspins), sos_mp2_work_virt(nspins))
     243          44 :       DO ispin = 1, nspins
     244             : 
     245             :          CALL create_list_nearly_degen_pairs(Eigenval(1:homo(ispin), ispin), &
     246          24 :                                              eps_degenerate, sos_mp2_work_occ(ispin)%pair_list)
     247          24 :          IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
     248          12 :             "MO_INFO| Number of ij pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
     249          72 :          ALLOCATE (sos_mp2_work_occ(ispin)%P(homo(ispin) + SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)))
     250         164 :          sos_mp2_work_occ(ispin)%P = 0.0_dp
     251          24 :          CALL prepare_comm_Pij(sos_mp2_work_occ(ispin), virtual(ispin), fm_mat_S(ispin))
     252             : 
     253             :          CALL create_list_nearly_degen_pairs(Eigenval(homo(ispin) + 1:, ispin), &
     254          24 :                                              eps_degenerate, sos_mp2_work_virt(ispin)%pair_list)
     255          24 :          IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
     256          12 :             "MO_INFO| Number of ab pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
     257          72 :          ALLOCATE (sos_mp2_work_virt(ispin)%P(virtual(ispin) + SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)))
     258        1136 :          sos_mp2_work_virt(ispin)%P = 0.0_dp
     259          44 :          CALL prepare_comm_Pab(sos_mp2_work_virt(ispin), virtual(ispin), fm_mat_S(ispin))
     260             :       END DO
     261             : 
     262          20 :       CALL timestop(handle)
     263             : 
     264          20 :    END SUBROUTINE
     265             : 
     266             : ! **************************************************************************************************
     267             : !> \brief ...
     268             : !> \param rpa_work ...
     269             : !> \param fm_mat_Q ...
     270             : !> \param fm_mat_S ...
     271             : !> \param homo ...
     272             : !> \param virtual ...
     273             : ! **************************************************************************************************
     274         120 :    SUBROUTINE rpa_work_type_create(rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
     275             :       TYPE(rpa_grad_work_type), INTENT(OUT)              :: rpa_work
     276             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_Q
     277             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_S
     278             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     279             : 
     280             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_work_type_create'
     281             : 
     282             :       INTEGER :: avirt, col_global, col_local, handle, iocc, ispin, my_a, my_a_end, my_a_size, &
     283             :          my_a_start, my_i, my_i_end, my_i_size, my_i_start, my_pcol, ncol_local, nspins, &
     284             :          num_pe_col, proc_homo, proc_homo_send, proc_recv, proc_send, proc_virtual, &
     285             :          proc_virtual_send
     286          20 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv, data2send
     287          20 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices
     288             : 
     289          20 :       CALL timeset(routineN, handle)
     290             : 
     291          20 :       CALL cp_fm_create(rpa_work%fm_mat_Q_copy, matrix_struct=fm_mat_Q%matrix_struct)
     292             : 
     293          20 :       CALL fm_mat_S(1)%matrix_struct%context%get(number_of_process_columns=num_pe_col, my_process_column=my_pcol)
     294             : 
     295          20 :       nspins = SIZE(fm_mat_S)
     296             : 
     297           0 :       ALLOCATE (rpa_work%index2send(0:num_pe_col - 1, nspins), &
     298           0 :                 rpa_work%index2recv(0:num_pe_col - 1, nspins), &
     299           0 :                 rpa_work%gd_homo(nspins), rpa_work%gd_virtual(nspins), &
     300             :                 data2send(0:num_pe_col - 1), data2recv(0:num_pe_col - 1), &
     301         512 :                 rpa_work%P_ij(nspins), rpa_work%P_ab(nspins))
     302             : 
     303             :       ! Determine new process grid
     304          20 :       proc_homo = MAX(1, CEILING(SQRT(REAL(num_pe_col, KIND=dp))))
     305          20 :       DO WHILE (MOD(num_pe_col, proc_homo) /= 0)
     306           0 :          proc_homo = proc_homo - 1
     307             :       END DO
     308          20 :       proc_virtual = num_pe_col/proc_homo
     309             : 
     310          20 :       rpa_work%grid(1) = proc_virtual
     311          20 :       rpa_work%grid(2) = proc_homo
     312             : 
     313          20 :       rpa_work%mepos(1) = MOD(my_pcol, proc_virtual)
     314          20 :       rpa_work%mepos(2) = my_pcol/proc_virtual
     315             : 
     316          44 :       DO ispin = 1, nspins
     317             : 
     318             :          ! Determine distributions of the orbitals
     319          24 :          CALL create_group_dist(rpa_work%gd_homo(ispin), proc_homo, homo(ispin))
     320          24 :          CALL create_group_dist(rpa_work%gd_virtual(ispin), proc_virtual, virtual(ispin))
     321             : 
     322          24 :          CALL cp_fm_struct_get(fm_mat_S(ispin)%matrix_struct, ncol_local=ncol_local, col_indices=col_indices)
     323             : 
     324          48 :          data2send = 0
     325             :          ! Count the amount of data2send to each process
     326        1924 :          DO col_local = 1, ncol_local
     327        1900 :             col_global = col_indices(col_local)
     328             : 
     329        1900 :             iocc = (col_global - 1)/virtual(ispin) + 1
     330        1900 :             avirt = col_global - (iocc - 1)*virtual(ispin)
     331             : 
     332        1900 :             proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
     333        1900 :             proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)
     334             : 
     335        1900 :             proc_send = proc_homo_send*proc_virtual + proc_virtual_send
     336             : 
     337        1924 :             data2send(proc_send) = data2send(proc_send) + 1
     338             :          END DO
     339             : 
     340          48 :          DO proc_send = 0, num_pe_col - 1
     341          96 :             ALLOCATE (rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)))
     342             :          END DO
     343             : 
     344             :          ! Prepare the indices
     345          48 :          data2send = 0
     346        1924 :          DO col_local = 1, ncol_local
     347        1900 :             col_global = col_indices(col_local)
     348             : 
     349        1900 :             iocc = (col_global - 1)/virtual(ispin) + 1
     350        1900 :             avirt = col_global - (iocc - 1)*virtual(ispin)
     351             : 
     352        1900 :             proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
     353        1900 :             proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)
     354             : 
     355        1900 :             proc_send = proc_homo_send*proc_virtual + proc_virtual_send
     356             : 
     357        1900 :             data2send(proc_send) = data2send(proc_send) + 1
     358             : 
     359        1924 :             rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)) = col_local
     360             :          END DO
     361             : 
     362             :          ! Count the amount of data2recv from each process
     363          24 :          CALL get_group_dist(rpa_work%gd_homo(ispin), my_pcol/proc_virtual, my_i_start, my_i_end, my_i_size)
     364          24 :          CALL get_group_dist(rpa_work%gd_virtual(ispin), MOD(my_pcol, proc_virtual), my_a_start, my_a_end, my_a_size)
     365             : 
     366          48 :          data2recv = 0
     367         116 :          DO my_i = my_i_start, my_i_end
     368        2016 :          DO my_a = my_a_start, my_a_end
     369        1900 :             proc_recv = fm_mat_S(ispin)%matrix_struct%g2p_col((my_i - 1)*virtual(ispin) + my_a)
     370        1992 :             data2recv(proc_recv) = data2recv(proc_recv) + 1
     371             :          END DO
     372             :          END DO
     373             : 
     374          48 :          DO proc_recv = 0, num_pe_col - 1
     375          96 :             ALLOCATE (rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)))
     376             :          END DO
     377             : 
     378          48 :          data2recv = 0
     379         116 :          DO my_i = my_i_start, my_i_end
     380        2016 :          DO my_a = my_a_start, my_a_end
     381        1900 :             proc_recv = fm_mat_S(ispin)%matrix_struct%g2p_col((my_i - 1)*virtual(ispin) + my_a)
     382        1900 :             data2recv(proc_recv) = data2recv(proc_recv) + 1
     383             : 
     384        1900 :             rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)) = my_i - my_i_start + 1
     385        1992 :             rpa_work%index2recv(proc_recv, ispin)%array(1, data2recv(proc_recv)) = my_a - my_a_start + 1
     386             :          END DO
     387             :          END DO
     388             : 
     389           0 :          ALLOCATE (rpa_work%P_ij(ispin)%array(my_i_size, homo(ispin)), &
     390         168 :                    rpa_work%P_ab(ispin)%array(my_a_size, virtual(ispin)))
     391         472 :          rpa_work%P_ij(ispin)%array(:, :) = 0.0_dp
     392       11148 :          rpa_work%P_ab(ispin)%array(:, :) = 0.0_dp
     393             : 
     394             :       END DO
     395             : 
     396          20 :       DEALLOCATE (data2send, data2recv)
     397             : 
     398          20 :       CALL timestop(handle)
     399             : 
     400          40 :    END SUBROUTINE
     401             : 
     402             : ! **************************************************************************************************
     403             : !> \brief ...
     404             : !> \param Eigenval ...
     405             : !> \param eps_degen ...
     406             : !> \param pair_list ...
     407             : ! **************************************************************************************************
     408          48 :    SUBROUTINE create_list_nearly_degen_pairs(Eigenval, eps_degen, pair_list)
     409             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
     410             :       REAL(KIND=dp), INTENT(IN)                          :: eps_degen
     411             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: pair_list
     412             : 
     413             :       INTEGER                                            :: my_i, my_j, num_orbitals, num_pairs, &
     414             :                                                             pair_counter
     415             : 
     416          48 :       num_orbitals = SIZE(Eigenval)
     417             : 
     418             : ! Determine number of nearly degenerate orbital pairs
     419             : ! Trivial cases: diagonal elements
     420          48 :       num_pairs = 0
     421         640 :       DO my_i = 1, num_orbitals
     422       11576 :       DO my_j = 1, num_orbitals
     423       10936 :          IF (my_i == my_j) CYCLE
     424       10936 :          IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) num_pairs = num_pairs + 1
     425             :       END DO
     426             :       END DO
     427         104 :       ALLOCATE (pair_list(2, num_pairs))
     428             : 
     429             : ! Print the required pairs
     430          48 :       pair_counter = 1
     431         640 :       DO my_i = 1, num_orbitals
     432       11576 :       DO my_j = 1, num_orbitals
     433       10936 :          IF (my_i == my_j) CYCLE
     434       10936 :          IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) THEN
     435         660 :             pair_list(1, pair_counter) = my_i
     436         660 :             pair_list(2, pair_counter) = my_j
     437         660 :             pair_counter = pair_counter + 1
     438             :          END IF
     439             :       END DO
     440             :       END DO
     441             : 
     442          48 :    END SUBROUTINE create_list_nearly_degen_pairs
     443             : 
     444             : ! **************************************************************************************************
     445             : !> \brief ...
     446             : !> \param sos_mp2_work ...
     447             : !> \param virtual ...
     448             : !> \param fm_mat_S ...
     449             : ! **************************************************************************************************
     450          24 :    SUBROUTINE prepare_comm_Pij(sos_mp2_work, virtual, fm_mat_S)
     451             :       TYPE(sos_mp2_grad_work_type), INTENT(INOUT)        :: sos_mp2_work
     452             :       INTEGER, INTENT(IN)                                :: virtual
     453             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_S
     454             : 
     455             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'prepare_comm_Pij'
     456             : 
     457             :       INTEGER :: avirt, col_global, col_local, counter, handle, ij_counter, iocc, my_i, my_j, &
     458             :          my_pcol, my_prow, ncol_local, nrow_local, num_ij_pairs, num_pe_col, pcol, pcol_recv, &
     459             :          pcol_send, proc_shift, tag
     460          24 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv, data2send
     461          24 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, ncol_locals
     462          24 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
     463             :       TYPE(cp_blacs_env_type), POINTER                   :: context
     464             :       TYPE(mp_comm_type)                                 :: comm_exchange
     465             :       TYPE(mp_para_env_type), POINTER                    :: para_env
     466             : 
     467          24 :       CALL timeset(routineN, handle)
     468             : 
     469          24 :       tag = 44
     470             : 
     471          24 :       CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
     472           0 :       ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
     473         168 :                 sos_mp2_work%index2recv(0:num_pe_col - 1))
     474             : 
     475          72 :       ALLOCATE (data2send(0:num_pe_col - 1))
     476          72 :       ALLOCATE (data2recv(0:num_pe_col - 1))
     477             : 
     478             :       CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
     479             :                             ncol_local=ncol_local, col_indices=col_indices, &
     480          24 :                             context=context, nrow_local=nrow_local)
     481             :       CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
     482          24 :                        blacs2mpi=blacs2mpi)
     483             : 
     484          24 :       num_ij_pairs = SIZE(sos_mp2_work%pair_list, 2)
     485             : 
     486          24 :       IF (num_ij_pairs > 0) THEN
     487             : 
     488           4 :          CALL comm_exchange%from_split(para_env, my_prow)
     489             : 
     490           8 :          data2send = 0
     491           8 :          data2recv = 0
     492             : 
     493           8 :          DO proc_shift = 0, num_pe_col - 1
     494           4 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
     495             : 
     496           4 :             counter = 0
     497         308 :             DO col_local = 1, ncol_local
     498         304 :                col_global = col_indices(col_local)
     499             : 
     500         304 :                iocc = MAX(1, col_global - 1)/virtual + 1
     501         304 :                avirt = col_global - (iocc - 1)*virtual
     502             : 
     503         764 :                DO ij_counter = 1, num_ij_pairs
     504             : 
     505         760 :                   my_i = sos_mp2_work%pair_list(1, ij_counter)
     506         760 :                   my_j = sos_mp2_work%pair_list(2, ij_counter)
     507             : 
     508         760 :                   IF (iocc /= my_j) CYCLE
     509         304 :                   pcol = fm_mat_S%matrix_struct%g2p_col((my_i - 1)*virtual + avirt)
     510         304 :                   IF (pcol /= pcol_send) CYCLE
     511             : 
     512         304 :                   counter = counter + 1
     513             : 
     514         760 :                   EXIT
     515             : 
     516             :                END DO
     517             :             END DO
     518           8 :             data2send(pcol_send) = counter
     519             :          END DO
     520             : 
     521           4 :          CALL comm_exchange%alltoall(data2send, data2recv, 1)
     522             : 
     523           8 :          DO proc_shift = 0, num_pe_col - 1
     524           4 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
     525           4 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
     526             : 
     527             :             ! Collect indices and exchange
     528          12 :             ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))
     529             : 
     530           4 :             counter = 0
     531         308 :             DO col_local = 1, ncol_local
     532         304 :                col_global = col_indices(col_local)
     533             : 
     534         304 :                iocc = MAX(1, col_global - 1)/virtual + 1
     535         304 :                avirt = col_global - (iocc - 1)*virtual
     536             : 
     537         764 :                DO ij_counter = 1, num_ij_pairs
     538             : 
     539         760 :                   my_i = sos_mp2_work%pair_list(1, ij_counter)
     540         760 :                   my_j = sos_mp2_work%pair_list(2, ij_counter)
     541             : 
     542         760 :                   IF (iocc /= my_j) CYCLE
     543         304 :                   pcol = fm_mat_S%matrix_struct%g2p_col((my_i - 1)*virtual + avirt)
     544         304 :                   IF (pcol /= pcol_send) CYCLE
     545             : 
     546         304 :                   counter = counter + 1
     547             : 
     548         304 :                   sos_mp2_work%index2send(pcol_send)%array(counter) = col_global
     549             : 
     550         760 :                   EXIT
     551             : 
     552             :                END DO
     553             :             END DO
     554             : 
     555          12 :             ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))
     556             :             !
     557             :             CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
     558           4 :                                    sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)
     559             : 
     560             :             ! Convert to global coordinates to local coordinates as we always work with them
     561         312 :             DO counter = 1, data2send(pcol_send)
     562             :                sos_mp2_work%index2send(pcol_send)%array(counter) = &
     563         308 :                   fm_mat_S%matrix_struct%g2l_col(sos_mp2_work%index2send(pcol_send)%array(counter))
     564             :             END DO
     565             :          END DO
     566             : 
     567           4 :          CALL comm_exchange%free()
     568             :       END IF
     569             : 
     570          24 :       DEALLOCATE (data2send, data2recv)
     571             : 
     572          24 :       CALL timestop(handle)
     573             : 
     574          48 :    END SUBROUTINE
     575             : 
     576             : ! **************************************************************************************************
     577             : !> \brief ...
     578             : !> \param sos_mp2_work ...
     579             : !> \param virtual ...
     580             : !> \param fm_mat_S ...
     581             : ! **************************************************************************************************
     582          24 :    SUBROUTINE prepare_comm_Pab(sos_mp2_work, virtual, fm_mat_S)
     583             :       TYPE(sos_mp2_grad_work_type), INTENT(INOUT)        :: sos_mp2_work
     584             :       INTEGER, INTENT(IN)                                :: virtual
     585             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_S
     586             : 
     587             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'prepare_comm_Pab'
     588             : 
     589             :       INTEGER :: ab_counter, avirt, col_global, col_local, counter, handle, iocc, my_a, my_b, &
     590             :          my_pcol, my_prow, ncol_local, nrow_local, num_ab_pairs, num_pe_col, pcol, pcol_recv, &
     591             :          pcol_send, proc_shift, tag
     592          24 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv, data2send
     593          24 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, ncol_locals
     594          24 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
     595             :       TYPE(cp_blacs_env_type), POINTER                   :: context
     596             :       TYPE(mp_comm_type)                                 :: comm_exchange
     597             :       TYPE(mp_para_env_type), POINTER                    :: para_env
     598             : 
     599          24 :       CALL timeset(routineN, handle)
     600             : 
     601          24 :       tag = 44
     602             : 
     603          24 :       CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
     604           0 :       ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
     605         168 :                 sos_mp2_work%index2recv(0:num_pe_col - 1))
     606             : 
     607          24 :       num_ab_pairs = SIZE(sos_mp2_work%pair_list, 2)
     608          24 :       IF (num_ab_pairs > 0) THEN
     609             : 
     610             :          CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
     611             :                                ncol_local=ncol_local, col_indices=col_indices, &
     612           4 :                                context=context, nrow_local=nrow_local)
     613             :          CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
     614           4 :                           blacs2mpi=blacs2mpi)
     615             : 
     616           4 :          CALL comm_exchange%from_split(para_env, my_prow)
     617             : 
     618          12 :          ALLOCATE (data2send(0:num_pe_col - 1))
     619          12 :          ALLOCATE (data2recv(0:num_pe_col - 1))
     620             : 
     621           8 :          data2send = 0
     622           8 :          data2recv = 0
     623           8 :          DO proc_shift = 0, num_pe_col - 1
     624           4 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
     625           4 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
     626             : 
     627           4 :             counter = 0
     628         308 :             DO col_local = 1, ncol_local
     629         304 :                col_global = col_indices(col_local)
     630             : 
     631         304 :                iocc = MAX(1, col_global - 1)/virtual + 1
     632         304 :                avirt = col_global - (iocc - 1)*virtual
     633             : 
     634       15476 :                DO ab_counter = 1, num_ab_pairs
     635             : 
     636       15472 :                   my_a = sos_mp2_work%pair_list(1, ab_counter)
     637       15472 :                   my_b = sos_mp2_work%pair_list(2, ab_counter)
     638             : 
     639       15472 :                   IF (avirt /= my_b) CYCLE
     640         304 :                   pcol = fm_mat_S%matrix_struct%g2p_col((iocc - 1)*virtual + my_a)
     641         304 :                   IF (pcol /= pcol_send) CYCLE
     642             : 
     643         304 :                   counter = counter + 1
     644             : 
     645       15472 :                   EXIT
     646             : 
     647             :                END DO
     648             :             END DO
     649           8 :             data2send(pcol_send) = counter
     650             :          END DO
     651             : 
     652           4 :          CALL comm_exchange%alltoall(data2send, data2recv, 1)
     653             : 
     654           8 :          DO proc_shift = 0, num_pe_col - 1
     655           4 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
     656           4 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
     657             : 
     658             :             ! Collect indices and exchange
     659          12 :             ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))
     660             : 
     661           4 :             counter = 0
     662         308 :             DO col_local = 1, ncol_local
     663         304 :                col_global = col_indices(col_local)
     664             : 
     665         304 :                iocc = MAX(1, col_global - 1)/virtual + 1
     666         304 :                avirt = col_global - (iocc - 1)*virtual
     667             : 
     668       15476 :                DO ab_counter = 1, num_ab_pairs
     669             : 
     670       15472 :                   my_a = sos_mp2_work%pair_list(1, ab_counter)
     671       15472 :                   my_b = sos_mp2_work%pair_list(2, ab_counter)
     672             : 
     673       15472 :                   IF (avirt /= my_b) CYCLE
     674         304 :                   pcol = fm_mat_S%matrix_struct%g2p_col((iocc - 1)*virtual + my_a)
     675         304 :                   IF (pcol /= pcol_send) CYCLE
     676             : 
     677         304 :                   counter = counter + 1
     678             : 
     679         304 :                   sos_mp2_work%index2send(pcol_send)%array(counter) = col_global
     680             : 
     681       15472 :                   EXIT
     682             : 
     683             :                END DO
     684             :             END DO
     685             : 
     686          12 :             ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))
     687             :             !
     688             :             CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
     689           4 :                                    sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)
     690             : 
     691             :             ! Convert to global coordinates to local coordinates as we always work with them
     692         312 :             DO counter = 1, data2send(pcol_send)
     693             :                sos_mp2_work%index2send(pcol_send)%array(counter) = &
     694         308 :                   fm_mat_S%matrix_struct%g2l_col(sos_mp2_work%index2send(pcol_send)%array(counter))
     695             :             END DO
     696             :          END DO
     697             : 
     698           4 :          CALL comm_exchange%free()
     699           8 :          DEALLOCATE (data2send, data2recv)
     700             : 
     701             :       END IF
     702             : 
     703          24 :       CALL timestop(handle)
     704             : 
     705          48 :    END SUBROUTINE
     706             : 
     707             : ! **************************************************************************************************
     708             : !> \brief ...
     709             : !> \param fm_mat_Q ...
     710             : !> \param rpa_grad ...
     711             : ! **************************************************************************************************
     712          48 :    SUBROUTINE rpa_grad_copy_Q(fm_mat_Q, rpa_grad)
     713             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_Q
     714             :       TYPE(rpa_grad_type), INTENT(INOUT)                 :: rpa_grad
     715             : 
     716          48 :       CALL cp_fm_to_fm(fm_mat_Q, rpa_grad%rpa_work%fm_mat_Q_copy)
     717             : 
     718          48 :    END SUBROUTINE
     719             : 
     720             : ! **************************************************************************************************
     721             : !> \brief ...
     722             : !> \param mp2_env ...
     723             : !> \param rpa_grad ...
     724             : !> \param do_ri_sos_laplace_mp2 ...
     725             : !> \param fm_mat_Q ...
     726             : !> \param fm_mat_Q_gemm ...
     727             : !> \param dgemm_counter ...
     728             : !> \param fm_mat_S ...
     729             : !> \param omega ...
     730             : !> \param homo ...
     731             : !> \param virtual ...
     732             : !> \param Eigenval ...
     733             : !> \param weight ...
     734             : !> \param unit_nr ...
     735             : ! **************************************************************************************************
     736         108 :    SUBROUTINE rpa_grad_matrix_operations(mp2_env, rpa_grad, do_ri_sos_laplace_mp2, fm_mat_Q, fm_mat_Q_gemm, &
     737         108 :                                          dgemm_counter, fm_mat_S, omega, homo, virtual, Eigenval, weight, unit_nr)
     738             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
     739             :       TYPE(rpa_grad_type), INTENT(INOUT)                 :: rpa_grad
     740             :       LOGICAL, INTENT(IN)                                :: do_ri_sos_laplace_mp2
     741             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_Q, fm_mat_Q_gemm
     742             :       TYPE(dgemm_counter_type), INTENT(INOUT)            :: dgemm_counter
     743             :       TYPE(cp_fm_type), DIMENSION(:), INTENT(IN)         :: fm_mat_S
     744             :       REAL(KIND=dp), INTENT(IN)                          :: omega
     745             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
     746             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
     747             :       REAL(KIND=dp), INTENT(IN)                          :: weight
     748             :       INTEGER, INTENT(IN)                                :: unit_nr
     749             : 
     750             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_matrix_operations'
     751             : 
     752             :       INTEGER                                            :: col_global, col_local, dimen_ia, &
     753             :                                                             dimen_RI, handle, handle2, ispin, &
     754             :                                                             jspin, ncol_local, nrow_local, nspins, &
     755             :                                                             row_local
     756         108 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
     757             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     758         108 :          TARGET                                          :: mat_S_3D, mat_work_iaP_3D
     759             :       TYPE(cp_fm_type)                                   :: fm_work_iaP, fm_work_PQ
     760             : 
     761         108 :       CALL timeset(routineN, handle)
     762             : 
     763         108 :       nspins = SIZE(fm_mat_Q)
     764             : 
     765             :       CALL cp_fm_get_info(fm_mat_Q(1), nrow_global=dimen_RI, nrow_local=nrow_local, ncol_local=ncol_local, &
     766         108 :                           col_indices=col_indices, row_indices=row_indices)
     767             : 
     768         108 :       IF (.NOT. do_ri_sos_laplace_mp2) THEN
     769          48 :          CALL cp_fm_create(fm_work_PQ, fm_mat_Q(1)%matrix_struct)
     770             : 
     771             :          ! calculate [1+Q(iw')]^-1
     772          48 :          CALL cp_fm_cholesky_invert(fm_mat_Q(1))
     773             :          ! symmetrize the result, fm_work_PQ is only a work matrix
     774          48 :          CALL cp_fm_upper_to_full(fm_mat_Q(1), fm_work_PQ)
     775             : 
     776          48 :          CALL cp_fm_release(fm_work_PQ)
     777             : 
     778        4144 :          DO col_local = 1, ncol_local
     779        4096 :             col_global = col_indices(col_local)
     780      164048 :             DO row_local = 1, nrow_local
     781      164000 :             IF (col_global == row_indices(row_local)) THEN
     782        3432 :                fm_mat_Q(1)%local_data(row_local, col_local) = fm_mat_Q(1)%local_data(row_local, col_local) - 1.0_dp
     783        3432 :                EXIT
     784             :             END IF
     785             :             END DO
     786             :          END DO
     787             : 
     788          48 :          CALL timeset(routineN//"_PQ", handle2)
     789          48 :          CALL dgemm_counter_start(dgemm_counter)
     790             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
     791             :                             matrix_a=rpa_grad%rpa_work%fm_mat_Q_copy, matrix_b=fm_mat_Q(1), beta=1.0_dp, &
     792          48 :                             matrix_c=rpa_grad%fm_Gamma_PQ)
     793          48 :          CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
     794          48 :          CALL timestop(handle2)
     795             : 
     796             :          CALL cp_fm_to_fm_submat_general(fm_mat_Q(1), fm_mat_Q_gemm(1), dimen_RI, dimen_RI, 1, 1, 1, 1, &
     797          48 :                                          fm_mat_Q_gemm(1)%matrix_struct%context)
     798             :       END IF
     799             : 
     800         232 :       DO ispin = 1, nspins
     801         124 :          IF (do_ri_sos_laplace_mp2) THEN
     802             :             ! The spin of the other Q matrix is always the other spin
     803          68 :             jspin = nspins - ispin + 1
     804             :          ELSE
     805             :             ! or the first matrix in the case of RPA
     806             :             jspin = 1
     807             :          END IF
     808             : 
     809         124 :          IF (do_ri_sos_laplace_mp2) THEN
     810          68 :             CALL timeset(routineN//"_PQ", handle2)
     811          68 :             CALL dgemm_counter_start(dgemm_counter)
     812             :             CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
     813             :                                matrix_a=fm_mat_Q(ispin), matrix_b=fm_mat_Q(jspin), beta=1.0_dp, &
     814          68 :                                matrix_c=rpa_grad%fm_Gamma_PQ)
     815          68 :             CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
     816          68 :             CALL timestop(handle2)
     817             : 
     818             :             CALL cp_fm_to_fm_submat_general(fm_mat_Q(jspin), fm_mat_Q_gemm(jspin), dimen_RI, dimen_RI, 1, 1, 1, 1, &
     819          68 :                                             fm_mat_Q_gemm(jspin)%matrix_struct%context)
     820             :          ELSE
     821             :             CALL calc_fm_mat_S_rpa(fm_mat_S(ispin), .TRUE., virtual(ispin), Eigenval(:, ispin), &
     822          56 :                                    homo(ispin), omega, 0.0_dp)
     823             :          END IF
     824             : 
     825         124 :          CALL timeset(routineN//"_contr_S", handle2)
     826         124 :          CALL cp_fm_create(fm_work_iaP, rpa_grad%fm_Y(ispin)%matrix_struct)
     827             : 
     828         124 :          CALL cp_fm_get_info(fm_mat_S(ispin), ncol_global=dimen_ia)
     829             : 
     830         124 :          CALL dgemm_counter_start(dgemm_counter)
     831             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_ia, k=dimen_RI, alpha=1.0_dp, &
     832             :                             matrix_a=fm_mat_Q_gemm(jspin), matrix_b=fm_mat_S(ispin), beta=0.0_dp, &
     833         124 :                             matrix_c=fm_work_iaP)
     834         124 :          CALL dgemm_counter_stop(dgemm_counter, dimen_ia, dimen_RI, dimen_RI)
     835         124 :          CALL timestop(handle2)
     836             : 
     837         356 :          IF (do_ri_sos_laplace_mp2) THEN
     838             :             CALL calc_P_sos_mp2(homo(ispin), fm_mat_S(ispin), fm_work_iaP, &
     839             :                                 rpa_grad%sos_mp2_work_occ(ispin), rpa_grad%sos_mp2_work_virt(ispin), &
     840          68 :                                 omega, weight, virtual(ispin), Eigenval(:, ispin), mp2_env%ri_grad%dot_blksize)
     841             : 
     842          68 :             CALL calc_fm_mat_S_laplace(fm_work_iaP, homo(ispin), virtual(ispin), Eigenval(:, ispin), omega)
     843             : 
     844          68 :             CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)
     845             : 
     846          68 :             CALL cp_fm_release(fm_work_iaP)
     847             :          ELSE
     848             :             ! To save memory, we add it now
     849          56 :             CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)
     850             : 
     851             :             ! Redistribute both matrices and deallocate fm_work_iaP
     852             :             CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
     853             :                                        fm_work_iaP, mat_work_iaP_3D, &
     854             :                                        rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
     855          56 :                                        rpa_grad%rpa_work%mepos)
     856          56 :             CALL cp_fm_release(fm_work_iaP)
     857             : 
     858             :             CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
     859             :                                        fm_mat_S(ispin), mat_S_3D, &
     860             :                                        rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
     861          56 :                                        rpa_grad%rpa_work%mepos)
     862             : 
     863             :             ! Now collect the density matrix
     864             :             CALL calc_P_rpa(mat_S_3D, mat_work_iaP_3D, rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
     865             :                             rpa_grad%rpa_work%grid, rpa_grad%rpa_work%mepos, &
     866             :                             fm_mat_S(ispin)%matrix_struct, &
     867             :                             rpa_grad%rpa_work%P_ij(ispin)%array, rpa_grad%rpa_work%P_ab(ispin)%array, &
     868          56 :                             weight, omega, Eigenval(:, ispin), homo(ispin), unit_nr, mp2_env)
     869             : 
     870          56 :             DEALLOCATE (mat_work_iaP_3D, mat_S_3D)
     871             : 
     872          56 :             CALL remove_scaling_factor_rpa(fm_mat_S(ispin), virtual(ispin), Eigenval(:, ispin), homo(ispin), omega)
     873             : 
     874             :          END IF
     875             : 
     876             :       END DO
     877             : 
     878         108 :       CALL timestop(handle)
     879             : 
     880         216 :    END SUBROUTINE
     881             : 
     882             : ! **************************************************************************************************
     883             : !> \brief ...
     884             : !> \param homo ...
     885             : !> \param fm_mat_S ...
     886             : !> \param fm_work_iaP ...
     887             : !> \param sos_mp2_work_occ ...
     888             : !> \param sos_mp2_work_virt ...
     889             : !> \param omega ...
     890             : !> \param weight ...
     891             : !> \param virtual ...
     892             : !> \param Eigenval ...
     893             : !> \param dot_blksize ...
     894             : ! **************************************************************************************************
     895         340 :    SUBROUTINE calc_P_sos_mp2(homo, fm_mat_S, fm_work_iaP, sos_mp2_work_occ, sos_mp2_work_virt, &
     896          68 :                              omega, weight, virtual, Eigenval, dot_blksize)
     897             :       INTEGER, INTENT(IN)                                :: homo
     898             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_S, fm_work_iaP
     899             :       TYPE(sos_mp2_grad_work_type), INTENT(INOUT)        :: sos_mp2_work_occ, sos_mp2_work_virt
     900             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
     901             :       INTEGER, INTENT(IN)                                :: virtual
     902             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
     903             :       INTEGER, INTENT(IN)                                :: dot_blksize
     904             : 
     905             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_P_sos_mp2'
     906             : 
     907             :       INTEGER                                            :: avirt, col_global, col_local, handle, &
     908             :                                                             handle2, iocc, my_a, my_i, ncol_local, &
     909             :                                                             nrow_local, num_ab_pairs, num_ij_pairs
     910          68 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices
     911             :       REAL(KIND=dp)                                      :: ddot, trace
     912             : 
     913          68 :       CALL timeset(routineN, handle)
     914             : 
     915          68 :       CALL cp_fm_get_info(fm_mat_S, col_indices=col_indices, ncol_local=ncol_local, nrow_local=nrow_local)
     916             : 
     917          68 :       CALL timeset(routineN//"_Pij_diag", handle2)
     918         332 :       DO my_i = 1, homo
     919             :          ! Collect the contributions of the matrix elements
     920             : 
     921         264 :          trace = 0.0_dp
     922             : 
     923       20944 :          DO col_local = 1, ncol_local
     924       20680 :             col_global = col_indices(col_local)
     925             : 
     926       20680 :             iocc = MAX(1, col_global - 1)/virtual + 1
     927       20680 :             avirt = col_global - (iocc - 1)*virtual
     928             : 
     929       20680 :             IF (iocc == my_i) trace = trace + &
     930        5584 :                                      ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
     931             :          END DO
     932             : 
     933         332 :          sos_mp2_work_occ%P(my_i) = sos_mp2_work_occ%P(my_i) - trace*omega*weight
     934             : 
     935             :       END DO
     936          68 :       CALL timestop(handle2)
     937             : 
     938          68 :       CALL timeset(routineN//"_Pab_diag", handle2)
     939        1448 :       DO my_a = 1, virtual
     940             :          ! Collect the contributions of the matrix elements
     941             : 
     942        1380 :          trace = 0.0_dp
     943             : 
     944      109900 :          DO col_local = 1, ncol_local
     945      108520 :             col_global = col_indices(col_local)
     946             : 
     947      108520 :             iocc = MAX(1, col_global - 1)/virtual + 1
     948      108520 :             avirt = col_global - (iocc - 1)*virtual
     949             : 
     950      108520 :             IF (avirt == my_a) trace = trace + &
     951        6700 :                                      ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
     952             :          END DO
     953             : 
     954        1448 :          sos_mp2_work_virt%P(my_a) = sos_mp2_work_virt%P(my_a) + trace*omega*weight
     955             : 
     956             :       END DO
     957          68 :       CALL timestop(handle2)
     958             : 
     959             :       ! Loop over list and carry out operations
     960          68 :       num_ij_pairs = SIZE(sos_mp2_work_occ%pair_list, 2)
     961          68 :       num_ab_pairs = SIZE(sos_mp2_work_virt%pair_list, 2)
     962          68 :       IF (num_ij_pairs > 0) THEN
     963             :          CALL calc_Pij_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_occ%pair_list, &
     964             :                              virtual, sos_mp2_work_occ%P(homo + 1:), Eigenval(:homo), omega, weight, &
     965           8 :                              sos_mp2_work_occ%index2send, sos_mp2_work_occ%index2recv, dot_blksize)
     966             :       END IF
     967          68 :       IF (num_ab_pairs > 0) THEN
     968             :          CALL calc_Pab_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_virt%pair_list, &
     969             :                              virtual, sos_mp2_work_virt%P(virtual + 1:), Eigenval(homo + 1:), omega, weight, &
     970           8 :                              sos_mp2_work_virt%index2send, sos_mp2_work_virt%index2recv, dot_blksize)
     971             :       END IF
     972             : 
     973          68 :       CALL timestop(handle)
     974             : 
     975          68 :    END SUBROUTINE
     976             : 
     977             : ! **************************************************************************************************
     978             : !> \brief ...
     979             : !> \param mat_S_1D ...
     980             : !> \param mat_work_iaP_3D ...
     981             : !> \param gd_homo ...
     982             : !> \param gd_virtual ...
     983             : !> \param grid ...
     984             : !> \param mepos ...
     985             : !> \param fm_struct_S ...
     986             : !> \param P_ij ...
     987             : !> \param P_ab ...
     988             : !> \param weight ...
     989             : !> \param omega ...
     990             : !> \param Eigenval ...
     991             : !> \param homo ...
     992             : !> \param unit_nr ...
     993             : !> \param mp2_env ...
     994             : ! **************************************************************************************************
     995          56 :    SUBROUTINE calc_P_rpa(mat_S_1D, mat_work_iaP_3D, gd_homo, gd_virtual, grid, mepos, &
     996          56 :                          fm_struct_S, P_ij, P_ab, weight, omega, Eigenval, homo, unit_nr, mp2_env)
     997             :       REAL(KIND=dp), DIMENSION(*), INTENT(INOUT), TARGET :: mat_S_1D
     998             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: mat_work_iaP_3D
     999             :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_homo, gd_virtual
    1000             :       INTEGER, DIMENSION(2), INTENT(IN)                  :: grid, mepos
    1001             :       TYPE(cp_fm_struct_type), INTENT(IN), POINTER       :: fm_struct_S
    1002             :       REAL(KIND=dp), DIMENSION(:, :)                     :: P_ij, P_ab
    1003             :       REAL(KIND=dp), INTENT(IN)                          :: weight, omega
    1004             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    1005             :       INTEGER, INTENT(IN)                                :: homo, unit_nr
    1006             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
    1007             : 
    1008             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_P_rpa'
    1009             : 
    1010             :       INTEGER :: completed, handle, handle2, my_a_end, my_a_size, my_a_start, my_i_end, my_i_size, &
    1011             :          my_i_start, my_P_size, my_prow, number_of_parallel_channels, proc_a_recv, proc_a_send, &
    1012             :          proc_i_recv, proc_i_send, proc_recv, proc_send, proc_shift, recv_a_end, recv_a_size, &
    1013             :          recv_a_start, recv_i_end, recv_i_size, recv_i_start, tag
    1014             :       INTEGER(KIND=int_8)                                :: mem, number_of_elements_per_blk
    1015          56 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: procs_recv
    1016          56 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
    1017             :       REAL(KIND=dp)                                      :: mem_per_block, mem_real
    1018          56 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: buffer_compens_1D
    1019          56 :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: mat_S_3D
    1020          56 :       TYPE(cp_1d_r_cp_type), ALLOCATABLE, DIMENSION(:)   :: buffer_1D
    1021          56 :       TYPE(cp_3d_r_cp_type), ALLOCATABLE, DIMENSION(:)   :: buffer_3D
    1022             :       TYPE(mp_para_env_type), POINTER                    :: para_env
    1023          56 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_requests, send_requests
    1024             : 
    1025          56 :       CALL timeset(routineN, handle)
    1026             : 
    1027             :       ! We allocate it at every step to reduce potential memory conflicts with COSMA
    1028          56 :       IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
    1029          48 :          CALL mp2_env%local_gemm_ctx%create(LOCAL_GEMM_PU_GPU)
    1030          48 :          CALL mp2_env%local_gemm_ctx%set_op_threshold_gpu(spla_threshold)
    1031             :       END IF
    1032             : 
    1033          56 :       tag = 47
    1034             : 
    1035          56 :       my_P_size = SIZE(mat_work_iaP_3D, 1)
    1036             : 
    1037          56 :       CALL cp_fm_struct_get(fm_struct_S, para_env=para_env)
    1038          56 :       CALL fm_struct_S%context%get(my_process_row=my_prow, blacs2mpi=blacs2mpi, para_env=para_env)
    1039             : 
    1040          56 :       CALL get_group_dist(gd_virtual, mepos(1), my_a_start, my_a_end, my_a_size)
    1041          56 :       CALL get_group_dist(gd_homo, mepos(2), my_i_start, my_i_end, my_i_size)
    1042             : 
    1043             :       ! We have to remap the indices because mp_sendrecv requires a 3D array (because of mat_work_iaP_3D)
    1044             :       ! and dgemm requires 2D arrays
    1045             :       ! Fortran 2008 does allow pointer remapping independently of the ranks but GCC 7 does not properly support it
    1046          56 :       mat_S_3D(1:my_P_size, 1:my_a_size, 1:my_i_size) => mat_S_1D(1:INT(my_P_size, int_8)*my_a_size*my_i_size)
    1047             : 
    1048             :       number_of_elements_per_blk = MAX(INT(maxsize(gd_homo), KIND=int_8)*my_a_size, &
    1049          56 :                                        INT(maxsize(gd_virtual), KIND=int_8)*my_i_size)*my_P_size
    1050             : 
    1051             :       ! Determine the available memory and estimate the number of possible parallel communication channels
    1052          56 :       CALL m_memory(mem)
    1053          56 :       mem_real = REAL(mem, KIND=dp)
    1054          56 :       mem_per_block = REAL(number_of_elements_per_blk, KIND=dp)*8.0_dp
    1055         168 :       number_of_parallel_channels = MAX(1, MIN(MAXVAL(grid) - 1, FLOOR(mem_real/mem_per_block)))
    1056          56 :       CALL para_env%min(number_of_parallel_channels)
    1057          56 :       IF (mp2_env%ri_grad%max_parallel_comm > 0) &
    1058          56 :          number_of_parallel_channels = MIN(number_of_parallel_channels, mp2_env%ri_grad%max_parallel_comm)
    1059             : 
    1060          56 :       IF (unit_nr > 0) THEN
    1061          28 :          WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Number of parallel communication channels:', number_of_parallel_channels
    1062          28 :          CALL m_flush(unit_nr)
    1063             :       END IF
    1064          56 :       CALL para_env%sync()
    1065             : 
    1066         224 :       ALLOCATE (buffer_1D(number_of_parallel_channels))
    1067         112 :       DO proc_shift = 1, number_of_parallel_channels
    1068         224 :          ALLOCATE (buffer_1D(proc_shift)%array(number_of_elements_per_blk))
    1069             :       END DO
    1070             : 
    1071         224 :       ALLOCATE (buffer_3D(number_of_parallel_channels))
    1072             : 
    1073             :       ! Allocate buffers for vector version of kahan summation
    1074          56 :       IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
    1075         144 :          ALLOCATE (buffer_compens_1D(2*MAX(my_a_size*maxsize(gd_virtual), my_i_size*maxsize(gd_homo))))
    1076             :       END IF
    1077             : 
    1078          56 :       IF (number_of_parallel_channels > 1) THEN
    1079           0 :          ALLOCATE (procs_recv(number_of_parallel_channels))
    1080           0 :          ALLOCATE (recv_requests(number_of_parallel_channels))
    1081           0 :          ALLOCATE (send_requests(MAXVAL(grid) - 1))
    1082             :       END IF
    1083             : 
    1084          56 :       IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
    1085           0 :          CALL timeset(routineN//"_comm_a", handle2)
    1086           0 :          recv_requests(:) = mp_request_null
    1087           0 :          procs_recv(:) = -1
    1088           0 :          DO proc_shift = 1, MIN(grid(1) - 1, number_of_parallel_channels)
    1089           0 :             proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
    1090           0 :             proc_recv = mepos(2)*grid(1) + proc_a_recv
    1091             : 
    1092           0 :             CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
    1093             : 
    1094             :             buffer_3D(proc_shift)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
    1095           0 :                buffer_1D(proc_shift)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
    1096             : 
    1097             :             CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
    1098           0 :                                 recv_requests(proc_shift), tag)
    1099             : 
    1100           0 :             procs_recv(proc_shift) = proc_a_recv
    1101             :          END DO
    1102             : 
    1103           0 :          send_requests(:) = mp_request_null
    1104           0 :          DO proc_shift = 1, grid(1) - 1
    1105           0 :             proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
    1106           0 :             proc_send = mepos(2)*grid(1) + proc_a_send
    1107             : 
    1108             :             CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
    1109           0 :                                 send_requests(proc_shift), tag)
    1110             :          END DO
    1111           0 :          CALL timestop(handle2)
    1112             :       END IF
    1113             : 
    1114             :       CALL calc_P_rpa_a(P_ab(:, my_a_start:my_a_end), &
    1115             :                         mat_S_3D, mat_work_iaP_3D, &
    1116             :                         mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
    1117             :                         Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
    1118          56 :                         Eigenval(homo + my_a_start:homo + my_a_end), omega, weight)
    1119             : 
    1120          56 :       DO proc_shift = 1, grid(1) - 1
    1121           0 :          CALL timeset(routineN//"_comm_a", handle2)
    1122           0 :          IF (number_of_parallel_channels > 1) THEN
    1123           0 :             CALL mp_waitany(recv_requests, completed)
    1124             : 
    1125           0 :             CALL get_group_dist(gd_virtual, procs_recv(completed), recv_a_start, recv_a_end, recv_a_size)
    1126             :          ELSE
    1127           0 :             proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
    1128           0 :             proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
    1129             : 
    1130           0 :             proc_send = mepos(2)*grid(1) + proc_a_send
    1131           0 :             proc_recv = mepos(2)*grid(1) + proc_a_recv
    1132             : 
    1133           0 :             CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
    1134             : 
    1135             :             buffer_3D(1)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
    1136           0 :                buffer_1D(1)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
    1137             : 
    1138             :             CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
    1139           0 :                                    buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
    1140           0 :             completed = 1
    1141             :          END IF
    1142           0 :          CALL timestop(handle2)
    1143             : 
    1144             :          CALL calc_P_rpa_a(P_ab(:, recv_a_start:recv_a_end), &
    1145             :                            mat_S_3D, buffer_3D(completed)%array, &
    1146             :                            mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
    1147             :                            Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
    1148           0 :                            Eigenval(homo + recv_a_start:homo + recv_a_end), omega, weight)
    1149             : 
    1150          56 :          IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(1)) THEN
    1151           0 :             proc_a_recv = MODULO(mepos(1) - proc_shift - number_of_parallel_channels, grid(1))
    1152           0 :             proc_recv = mepos(2)*grid(1) + proc_a_recv
    1153             : 
    1154           0 :             CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
    1155             : 
    1156             :             buffer_3D(completed)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
    1157           0 :                buffer_1D(completed)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
    1158             : 
    1159             :             CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
    1160           0 :                                 recv_requests(completed), tag)
    1161             : 
    1162           0 :             procs_recv(completed) = proc_a_recv
    1163             :          END IF
    1164             :       END DO
    1165             : 
    1166          56 :       IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
    1167           0 :          CALL mp_waitall(send_requests)
    1168             :       END IF
    1169             : 
    1170          56 :       IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
    1171           0 :          recv_requests(:) = mp_request_null
    1172           0 :          procs_recv(:) = -1
    1173           0 :          DO proc_shift = 1, MIN(grid(2) - 1, number_of_parallel_channels)
    1174           0 :             proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
    1175           0 :             proc_recv = proc_i_recv*grid(1) + mepos(1)
    1176             : 
    1177           0 :             CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)
    1178             : 
    1179             :             buffer_3D(proc_shift)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
    1180           0 :                buffer_1D(proc_shift)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
    1181             : 
    1182             :             CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
    1183           0 :                                 recv_requests(proc_shift), tag)
    1184             : 
    1185           0 :             procs_recv(proc_shift) = proc_i_recv
    1186             :          END DO
    1187             : 
    1188           0 :          send_requests(:) = mp_request_null
    1189           0 :          DO proc_shift = 1, grid(2) - 1
    1190           0 :             proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
    1191           0 :             proc_send = proc_i_send*grid(1) + mepos(1)
    1192             : 
    1193             :             CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
    1194           0 :                                 send_requests(proc_shift), tag)
    1195             :          END DO
    1196             :       END IF
    1197             : 
    1198             :       CALL calc_P_rpa_i(P_ij(:, my_i_start:my_i_end), &
    1199             :                         mat_S_3D, mat_work_iaP_3D, &
    1200             :                         mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
    1201             :                         Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
    1202          56 :                         Eigenval(my_i_start:my_i_end), omega, weight)
    1203             : 
    1204          56 :       DO proc_shift = 1, grid(2) - 1
    1205           0 :          CALL timeset(routineN//"_comm_i", handle2)
    1206           0 :          IF (number_of_parallel_channels > 1) THEN
    1207           0 :             CALL mp_waitany(recv_requests, completed)
    1208             : 
    1209           0 :             CALL get_group_dist(gd_homo, procs_recv(completed), recv_i_start, recv_i_end, recv_i_size)
    1210             :          ELSE
    1211           0 :             proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
    1212           0 :             proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
    1213             : 
    1214           0 :             proc_send = proc_i_send*grid(1) + mepos(1)
    1215           0 :             proc_recv = proc_i_recv*grid(1) + mepos(1)
    1216             : 
    1217           0 :             CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)
    1218             : 
    1219             :             buffer_3D(1)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
    1220           0 :                buffer_1D(1)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
    1221             : 
    1222             :             CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
    1223           0 :                                    buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
    1224           0 :             completed = 1
    1225             :          END IF
    1226           0 :          CALL timestop(handle2)
    1227             : 
    1228             :          CALL calc_P_rpa_i(P_ij(:, recv_i_start:recv_i_end), &
    1229             :                            mat_S_3D, buffer_3D(completed)%array, &
    1230             :                            mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
    1231             :                            Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
    1232           0 :                            Eigenval(recv_i_start:recv_i_end), omega, weight)
    1233             : 
    1234          56 :          IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(2)) THEN
    1235           0 :             proc_i_recv = MODULO(mepos(2) - proc_shift - number_of_parallel_channels, grid(2))
    1236           0 :             proc_recv = proc_i_recv*grid(1) + mepos(1)
    1237             : 
    1238           0 :             CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_a_end, recv_i_size)
    1239             : 
    1240             :             buffer_3D(completed)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
    1241           0 :                buffer_1D(completed)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
    1242             : 
    1243             :             CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
    1244           0 :                                 recv_requests(completed), tag)
    1245             : 
    1246           0 :             procs_recv(completed) = proc_i_recv
    1247             :          END IF
    1248             :       END DO
    1249             : 
    1250          56 :       IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
    1251           0 :          CALL mp_waitall(send_requests)
    1252             :       END IF
    1253             : 
    1254          56 :       IF (number_of_parallel_channels > 1) THEN
    1255           0 :          DEALLOCATE (procs_recv)
    1256           0 :          DEALLOCATE (recv_requests)
    1257           0 :          DEALLOCATE (send_requests)
    1258             :       END IF
    1259             : 
    1260          56 :       IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
    1261             :          ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
    1262          48 :          CALL mp2_env%local_gemm_ctx%destroy()
    1263          48 :          DEALLOCATE (buffer_compens_1D)
    1264             :       END IF
    1265             : 
    1266         112 :       DO proc_shift = 1, number_of_parallel_channels
    1267          56 :          NULLIFY (buffer_3D(proc_shift)%array)
    1268         112 :          DEALLOCATE (buffer_1D(proc_shift)%array)
    1269             :       END DO
    1270          56 :       DEALLOCATE (buffer_3D, buffer_1D)
    1271             : 
    1272          56 :       CALL timestop(handle)
    1273             : 
    1274         168 :    END SUBROUTINE
    1275             : 
    1276             : ! **************************************************************************************************
    1277             : !> \brief ...
    1278             : !> \param P_ab ...
    1279             : !> \param mat_S ...
    1280             : !> \param mat_work ...
    1281             : !> \param dot_blksize ...
    1282             : !> \param buffer_1D ...
    1283             : !> \param local_gemm_ctx ...
    1284             : !> \param my_eval_virt ...
    1285             : !> \param my_eval_occ ...
    1286             : !> \param recv_eval_virt ...
    1287             : !> \param omega ...
    1288             : !> \param weight ...
    1289             : ! **************************************************************************************************
    1290          56 :    SUBROUTINE calc_P_rpa_a(P_ab, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
    1291          56 :                            my_eval_virt, my_eval_occ, recv_eval_virt, omega, weight)
    1292             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: P_ab
    1293             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: mat_S, mat_work
    1294             :       INTEGER, INTENT(IN)                                :: dot_blksize
    1295             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
    1296             :          INTENT(INOUT), TARGET                           :: buffer_1D
    1297             :       TYPE(local_gemm_ctxt_type), INTENT(INOUT)          :: local_gemm_ctx
    1298             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: my_eval_virt, my_eval_occ, recv_eval_virt
    1299             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
    1300             : 
    1301             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_P_rpa_a'
    1302             : 
    1303             :       INTEGER                                            :: handle, my_a, my_a_size, my_i, &
    1304             :                                                             my_i_size, my_P_size, P_end, P_start, &
    1305             :                                                             recv_a_size, stripesize
    1306          56 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: buffer_compens, buffer_unscaled
    1307             : 
    1308          56 :       CALL timeset(routineN, handle)
    1309             : 
    1310          56 :       my_i_size = SIZE(mat_S, 3)
    1311          56 :       recv_a_size = SIZE(mat_work, 2)
    1312          56 :       my_a_size = SIZE(mat_S, 2)
    1313          56 :       my_P_size = SIZE(mat_S, 1)
    1314             : 
    1315          56 :       IF (dot_blksize >= blksize_threshold) THEN
    1316          48 :          buffer_compens(1:my_a_size, 1:recv_a_size) => buffer_1D(1:my_a_size*recv_a_size)
    1317       22208 :          buffer_compens = 0.0_dp
    1318          48 :          buffer_unscaled(1:my_a_size, 1:recv_a_size) => buffer_1D(my_a_size*recv_a_size + 1:2*my_a_size*recv_a_size)
    1319             : 
    1320             :          ! This loop imitates the actual tensor contraction
    1321         232 :          DO my_i = 1, my_i_size
    1322         416 :             DO P_start = 1, my_P_size, dot_blksize
    1323         184 :                stripesize = MIN(dot_blksize, my_P_size - P_start + 1)
    1324         184 :                P_end = P_start + stripesize - 1
    1325             : 
    1326             :                CALL local_gemm_ctx%gemm("T", "N", my_a_size, recv_a_size, stripesize, &
    1327             :                                         -weight, mat_S(P_start:P_end, :, my_i), stripesize, &
    1328             :                                         mat_work(P_start:P_end, :, my_i), stripesize, &
    1329         184 :                                         0.0_dp, buffer_unscaled, my_a_size)
    1330             : 
    1331             :                CALL scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
    1332         184 :                                                       my_eval_virt, recv_eval_virt, my_eval_occ(my_i))
    1333             : 
    1334         368 :                CALL kahan_step(buffer_compens, P_ab)
    1335             :             END DO
    1336             :          END DO
    1337             :       ELSE
    1338             :          BLOCK
    1339             :             INTEGER :: recv_a
    1340             :             REAL(KIND=dp) :: tmp, e_i, e_a, e_b, omega2, my_compens, my_p, s
    1341           8 :             omega2 = -omega**2
    1342             : !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
    1343             : !$OMP SHARED(my_a_size,recv_a_size,my_i_size,mat_S,my_eval_virt,recv_eval_virt,my_eval_occ,omega2,&
    1344             : !$OMP        P_ab,weight,mat_work)&
    1345           8 : !$OMP PRIVATE(tmp,my_a,recv_a,my_i,e_a,e_b,e_i,my_compens,my_p,s)
    1346             :             DO my_a = 1, my_a_size
    1347             :             DO recv_a = 1, recv_a_size
    1348             :                e_a = my_eval_virt(my_a)
    1349             :                e_b = recv_eval_virt(recv_a)
    1350             :                my_p = P_ab(my_a, recv_a)
    1351             :                my_compens = 0.0_dp
    1352             :                DO my_i = 1, my_i_size
    1353             :                   e_i = -my_eval_occ(my_i)
    1354             :                   tmp = -weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, recv_a, my_i)) &
    1355             :                         *(1.0_dp + omega2/((e_a + e_i)*(e_b + e_i))) - my_compens
    1356             :                   s = my_p + tmp
    1357             :                   my_compens = (s - my_p) - tmp
    1358             :                   my_p = s
    1359             :                END DO
    1360             :                P_ab(my_a, recv_a) = my_p
    1361             :             END DO
    1362             :             END DO
    1363             :          END BLOCK
    1364             :       END IF
    1365             : 
    1366          56 :       CALL timestop(handle)
    1367             : 
    1368          56 :    END SUBROUTINE calc_P_rpa_a
    1369             : 
    1370             : ! **************************************************************************************************
    1371             : !> \brief ...
    1372             : !> \param P_ij ...
    1373             : !> \param mat_S ...
    1374             : !> \param mat_work ...
    1375             : !> \param dot_blksize ...
    1376             : !> \param buffer_1D ...
    1377             : !> \param local_gemm_ctx ...
    1378             : !> \param my_eval_virt ...
    1379             : !> \param my_eval_occ ...
    1380             : !> \param recv_eval_occ ...
    1381             : !> \param omega ...
    1382             : !> \param weight ...
    1383             : ! **************************************************************************************************
    1384          56 :    SUBROUTINE calc_P_rpa_i(P_ij, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
    1385          56 :                            my_eval_virt, my_eval_occ, recv_eval_occ, omega, weight)
    1386             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: P_ij
    1387             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: mat_S, mat_work
    1388             :       INTEGER, INTENT(IN)                                :: dot_blksize
    1389             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
    1390             :          INTENT(INOUT), TARGET                           :: buffer_1D
    1391             :       TYPE(local_gemm_ctxt_type), INTENT(INOUT)          :: local_gemm_ctx
    1392             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: my_eval_virt, my_eval_occ, recv_eval_occ
    1393             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
    1394             : 
    1395             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_P_rpa_i'
    1396             : 
    1397             :       INTEGER                                            :: handle, my_a, my_a_size, my_i, &
    1398             :                                                             my_i_size, my_P_size, P_end, P_start, &
    1399             :                                                             recv_i_size, stripesize
    1400          56 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: buffer_compens, buffer_unscaled
    1401             : 
    1402          56 :       CALL timeset(routineN, handle)
    1403             : 
    1404          56 :       my_i_size = SIZE(mat_S, 3)
    1405          56 :       recv_i_size = SIZE(mat_work, 3)
    1406          56 :       my_a_size = SIZE(mat_S, 2)
    1407          56 :       my_P_size = SIZE(mat_S, 1)
    1408             : 
    1409          56 :       IF (dot_blksize >= blksize_threshold) THEN
    1410          48 :          buffer_compens(1:my_i_size, 1:recv_i_size) => buffer_1D(1:my_i_size*recv_i_size)
    1411         944 :          buffer_compens = 0.0_dp
    1412          48 :          buffer_unscaled(1:my_i_size, 1:recv_i_size) => buffer_1D(my_i_size*recv_i_size + 1:2*my_i_size*recv_i_size)
    1413             : 
    1414             :          ! This loop imitates the actual tensor contraction
    1415        1048 :          DO my_a = 1, my_a_size
    1416        2048 :             DO P_start = 1, my_P_size, dot_blksize
    1417        1000 :                stripesize = MIN(dot_blksize, my_P_size - P_start + 1)
    1418        1000 :                P_end = P_start + stripesize - 1
    1419             : 
    1420             :                CALL local_gemm_ctx%gemm("T", "N", my_i_size, recv_i_size, stripesize, &
    1421             :                                         weight, mat_S(P_start:P_end, my_a, :), stripesize, &
    1422             :                                         mat_work(P_start:P_end, my_a, :), stripesize, &
    1423        1000 :                                         0.0_dp, buffer_unscaled, my_i_size)
    1424             : 
    1425             :                CALL scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
    1426        1000 :                                                      my_eval_occ, recv_eval_occ, my_eval_virt(my_a))
    1427             : 
    1428        2000 :                CALL kahan_step(buffer_compens, P_ij)
    1429             :             END DO
    1430             :          END DO
    1431             :       ELSE
    1432             :          BLOCK
    1433             :             REAL(KIND=dp) :: tmp, e_i, e_a, e_j, omega2, my_compens, my_p, s
    1434             :             INTEGER :: recv_i
    1435           8 :             omega2 = -omega**2
    1436             : !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
    1437             : !$OMP SHARED(my_a_size,recv_i_size,my_i_size,mat_S,my_eval_occ,my_eval_virt,omega2,&
    1438             : !$OMP        recv_eval_occ,P_ij,weight,mat_work)&
    1439           8 : !$OMP PRIVATE(tmp,my_a,recv_i,my_i,e_i,e_j,e_a,my_compens,my_p,s)
    1440             :             DO my_i = 1, my_i_size
    1441             :             DO recv_i = 1, recv_i_size
    1442             :                e_i = my_eval_occ(my_i)
    1443             :                e_j = recv_eval_occ(recv_i)
    1444             :                my_p = P_ij(my_i, recv_i)
    1445             :                my_compens = 0.0_dp
    1446             :                DO my_a = 1, my_a_size
    1447             :                   e_a = my_eval_virt(my_a)
    1448             :                   tmp = weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, my_a, recv_i)) &
    1449             :                         *(1.0_dp + omega2/((e_a - e_i)*(e_a - e_j))) - my_compens
    1450             :                   s = my_p + tmp
    1451             :                   my_compens = (s - my_p) - tmp
    1452             :                   my_p = s
    1453             :                END DO
    1454             :                P_ij(my_i, recv_i) = my_p
    1455             :             END DO
    1456             :             END DO
    1457             :          END BLOCK
    1458             :       END IF
    1459             : 
    1460          56 :       CALL timestop(handle)
    1461             : 
    1462          56 :    END SUBROUTINE calc_P_rpa_i
    1463             : 
    1464             : ! **************************************************************************************************
    1465             : !> \brief ...
    1466             : !> \param compens ...
    1467             : !> \param P ...
    1468             : ! **************************************************************************************************
    1469        1184 :    SUBROUTINE kahan_step(compens, P)
    1470             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: compens, P
    1471             : 
    1472             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'kahan_step'
    1473             : 
    1474             :       INTEGER                                            :: handle, i, j
    1475             :       REAL(KIND=dp)                                      :: my_compens, my_p, s
    1476             : 
    1477        1184 :       CALL timeset(routineN, handle)
    1478             : 
    1479        1184 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(P,compens) PRIVATE(i,my_p,my_compens,s, j) COLLAPSE(2)
    1480             :       DO j = 1, SIZE(compens, 2)
    1481             :          DO i = 1, SIZE(compens, 1)
    1482             :             my_p = P(i, j)
    1483             :             my_compens = compens(i, j)
    1484             :             s = my_p + my_compens
    1485             :             compens(i, j) = (s - my_p) - my_compens
    1486             :             P(i, j) = s
    1487             :          END DO
    1488             :       END DO
    1489             : !$OMP END PARALLEL DO
    1490             : 
    1491        1184 :       CALL timestop(handle)
    1492             : 
    1493        1184 :    END SUBROUTINE
    1494             : 
    1495             : ! **************************************************************************************************
    1496             : !> \brief ...
    1497             : !> \param buffer_unscaled ...
    1498             : !> \param buffer_compens ...
    1499             : !> \param omega ...
    1500             : !> \param my_eval_virt ...
    1501             : !> \param recv_eval_virt ...
    1502             : !> \param my_eval_occ ...
    1503             : ! **************************************************************************************************
    1504         184 :    SUBROUTINE scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
    1505         184 :                                                 my_eval_virt, recv_eval_virt, my_eval_occ)
    1506             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: buffer_unscaled
    1507             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: buffer_compens
    1508             :       REAL(KIND=dp), INTENT(IN)                          :: omega
    1509             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: my_eval_virt, recv_eval_virt
    1510             :       REAL(KIND=dp), INTENT(IN)                          :: my_eval_occ
    1511             : 
    1512             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'scale_buffer_and_add_compens_virt'
    1513             : 
    1514             :       INTEGER                                            :: handle, my_a, my_b
    1515             : 
    1516         184 :       CALL timeset(routineN, handle)
    1517             : 
    1518             : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_unscaled,buffer_compens,omega,&
    1519         184 : !$OMP                                  my_eval_virt,recv_eval_virt,my_eval_occ) PRIVATE(my_a,my_b)
    1520             :       DO my_b = 1, SIZE(buffer_compens, 2)
    1521             :          DO my_a = 1, SIZE(buffer_compens, 1)
    1522             :             buffer_compens(my_a, my_b) = buffer_unscaled(my_a, my_b) &
    1523             :                                     *(1.0_dp - omega**2/((my_eval_virt(my_a) - my_eval_occ)*(recv_eval_virt(my_b) - my_eval_occ))) &
    1524             :                                          - buffer_compens(my_a, my_b)
    1525             :          END DO
    1526             :       END DO
    1527             : !$OMP END PARALLEL DO
    1528             : 
    1529         184 :       CALL timestop(handle)
    1530             : 
    1531         184 :    END SUBROUTINE scale_buffer_and_add_compens_virt
    1532             : 
    1533             : ! **************************************************************************************************
    1534             : !> \brief ...
    1535             : !> \param buffer_unscaled ...
    1536             : !> \param buffer_compens ...
    1537             : !> \param omega ...
    1538             : !> \param my_eval_occ ...
    1539             : !> \param recv_eval_occ ...
    1540             : !> \param my_eval_virt ...
    1541             : ! **************************************************************************************************
    1542        1000 :    SUBROUTINE scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
    1543        1000 :                                                my_eval_occ, recv_eval_occ, my_eval_virt)
    1544             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: buffer_unscaled
    1545             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: buffer_compens
    1546             :       REAL(KIND=dp), INTENT(IN)                          :: omega
    1547             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: my_eval_occ, recv_eval_occ
    1548             :       REAL(KIND=dp), INTENT(IN)                          :: my_eval_virt
    1549             : 
    1550             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'scale_buffer_and_add_compens_occ'
    1551             : 
    1552             :       INTEGER                                            :: handle, my_i, my_j
    1553             : 
    1554        1000 :       CALL timeset(routineN, handle)
    1555             : 
    1556             : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_compens,buffer_unscaled,omega,&
    1557        1000 : !$OMP        my_eval_virt,my_eval_occ,recv_eval_occ) PRIVATE(my_i,my_j)
    1558             :       DO my_j = 1, SIZE(buffer_compens, 2)
    1559             :          DO my_i = 1, SIZE(buffer_compens, 1)
    1560             :             buffer_compens(my_i, my_j) = buffer_unscaled(my_i, my_j) &
    1561             :                                     *(1.0_dp - omega**2/((my_eval_virt - my_eval_occ(my_i))*(my_eval_virt - recv_eval_occ(my_j)))) &
    1562             :                                          - buffer_compens(my_i, my_j)
    1563             :          END DO
    1564             :       END DO
    1565             : !$OMP END PARALLEL DO
    1566             : 
    1567        1000 :       CALL timestop(handle)
    1568             : 
    1569        1000 :    END SUBROUTINE scale_buffer_and_add_compens_occ
    1570             : 
    1571             : ! **************************************************************************************************
    1572             : !> \brief ...
    1573             : !> \param x ...
    1574             : !> \return ...
    1575             : ! **************************************************************************************************
    1576        1320 :    ELEMENTAL FUNCTION sinh_over_x(x) RESULT(res)
    1577             :       REAL(KIND=dp), INTENT(IN)                          :: x
    1578             :       REAL(KIND=dp)                                      :: res
    1579             : 
    1580             :       ! Calculate sinh(x)/x
    1581             :       ! Split the intervall to prevent numerical instabilities
    1582        1320 :       IF (ABS(x) > 3.0e-4_dp) THEN
    1583        1318 :          res = SINH(x)/x
    1584             :       ELSE
    1585           2 :          res = 1.0_dp + x**2/6.0_dp
    1586             :       END IF
    1587             : 
    1588        1320 :    END FUNCTION sinh_over_x
    1589             : 
    1590             : ! **************************************************************************************************
    1591             : !> \brief ...
    1592             : !> \param fm_work_iaP ...
    1593             : !> \param fm_mat_S ...
    1594             : !> \param pair_list ...
    1595             : !> \param virtual ...
    1596             : !> \param P_ij ...
    1597             : !> \param Eigenval ...
    1598             : !> \param omega ...
    1599             : !> \param weight ...
    1600             : !> \param index2send ...
    1601             : !> \param index2recv ...
    1602             : !> \param dot_blksize ...
    1603             : ! **************************************************************************************************
    1604           8 :    SUBROUTINE calc_Pij_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ij, Eigenval, &
    1605           8 :                              omega, weight, index2send, index2recv, dot_blksize)
    1606             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_work_iaP, fm_mat_S
    1607             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: pair_list
    1608             :       INTEGER, INTENT(IN)                                :: virtual
    1609             :       REAL(KIND=dp), DIMENSION(:), INTENT(INOUT)         :: P_ij
    1610             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    1611             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
    1612             :       TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
    1613             :       INTEGER, INTENT(IN)                                :: dot_blksize
    1614             : 
    1615             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_Pij_degen'
    1616             : 
    1617             :       INTEGER :: avirt, col_global, col_local, counter, handle, handle2, ij_counter, iocc, &
    1618             :          my_col_local, my_i, my_j, my_pcol, my_prow, ncol_local, nrow_local, num_ij_pairs, &
    1619             :          num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, recv_size, send_size, &
    1620             :          size_recv_buffer, size_send_buffer, tag
    1621           8 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, ncol_locals
    1622           8 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
    1623             :       REAL(KIND=dp)                                      :: trace
    1624           8 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
    1625             :       TYPE(cp_blacs_env_type), POINTER                   :: context
    1626             :       TYPE(mp_para_env_type), POINTER                    :: para_env
    1627             : 
    1628           8 :       CALL timeset(routineN, handle)
    1629             : 
    1630             :       CALL cp_fm_struct_get(fm_work_iaP%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
    1631             :                             ncol_local=ncol_local, col_indices=col_indices, &
    1632           8 :                             context=context, nrow_local=nrow_local)
    1633             :       CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
    1634           8 :                        number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
    1635             : 
    1636           8 :       num_ij_pairs = SIZE(pair_list, 2)
    1637             : 
    1638           8 :       tag = 42
    1639             : 
    1640         104 :       DO ij_counter = 1, num_ij_pairs
    1641             : 
    1642          96 :          my_i = pair_list(1, ij_counter)
    1643          96 :          my_j = pair_list(2, ij_counter)
    1644             : 
    1645          96 :          trace = 0.0_dp
    1646             : 
    1647        7392 :          DO col_local = 1, ncol_local
    1648        7296 :             col_global = col_indices(col_local)
    1649             : 
    1650        7296 :             iocc = MAX(1, col_global - 1)/virtual + 1
    1651        7296 :             avirt = col_global - (iocc - 1)*virtual
    1652             : 
    1653        7296 :             IF (iocc /= my_j) CYCLE
    1654        1824 :             pcol = fm_work_iaP%matrix_struct%g2p_col((my_i - 1)*virtual + avirt)
    1655        1824 :             IF (pcol /= my_pcol) CYCLE
    1656             : 
    1657        1824 :             my_col_local = fm_work_iaP%matrix_struct%g2l_col((my_i - 1)*virtual + avirt)
    1658             : 
    1659             :             trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), fm_work_iaP%local_data(:, col_local), &
    1660        7392 :                                                    dot_blksize)
    1661             :          END DO
    1662             : 
    1663         104 :          P_ij(ij_counter) = P_ij(ij_counter) - trace*sinh_over_x(0.5_dp*(Eigenval(my_i) - Eigenval(my_j))*omega)*omega*weight
    1664             : 
    1665             :       END DO
    1666             : 
    1667           8 :       IF (num_pe_col > 1) THEN
    1668             :          size_send_buffer = 0
    1669             :          size_recv_buffer = 0
    1670           0 :          DO proc_shift = 1, num_pe_col - 1
    1671           0 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1672           0 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1673             : 
    1674           0 :             IF (ALLOCATED(index2send(pcol_send)%array)) &
    1675           0 :                size_send_buffer = MAX(size_send_buffer, SIZE(index2send(pcol_send)%array))
    1676             : 
    1677           0 :             IF (ALLOCATED(index2recv(pcol_recv)%array)) &
    1678           0 :                size_recv_buffer = MAX(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
    1679             :          END DO
    1680             : 
    1681           0 :          ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
    1682             : 
    1683           0 :          DO proc_shift = 1, num_pe_col - 1
    1684           0 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1685           0 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1686             : 
    1687             :             ! Collect data and exchange
    1688           0 :             send_size = 0
    1689           0 :             IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
    1690             : 
    1691           0 :             DO counter = 1, send_size
    1692           0 :                buffer_send(:, counter) = fm_work_iaP%local_data(:, index2send(pcol_send)%array(counter))
    1693             :             END DO
    1694             : 
    1695           0 :             recv_size = 0
    1696           0 :             IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
    1697           0 :             IF (recv_size > 0) THEN
    1698           0 :                CALL timeset(routineN//"_send", handle2)
    1699           0 :                IF (send_size > 0) THEN
    1700             :                   CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
    1701           0 :                                          buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
    1702             :                ELSE
    1703           0 :                   CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
    1704             :                END IF
    1705           0 :                CALL timestop(handle2)
    1706             : 
    1707           0 :                DO ij_counter = 1, num_ij_pairs
    1708             :                   ! Collect the contributions of the matrix elements
    1709             : 
    1710           0 :                   my_i = pair_list(1, ij_counter)
    1711           0 :                   my_j = pair_list(2, ij_counter)
    1712             : 
    1713           0 :                   trace = 0.0_dp
    1714             : 
    1715           0 :                   DO col_local = 1, recv_size
    1716           0 :                      col_global = index2recv(pcol_recv)%array(col_local)
    1717             : 
    1718           0 :                      iocc = MAX(1, col_global - 1)/virtual + 1
    1719           0 :                      IF (iocc /= my_j) CYCLE
    1720           0 :                      avirt = col_global - (iocc - 1)*virtual
    1721           0 :                      pcol = fm_work_iaP%matrix_struct%g2p_col((my_i - 1)*virtual + avirt)
    1722           0 :                      IF (pcol /= my_pcol) CYCLE
    1723             : 
    1724           0 :                      my_col_local = fm_work_iaP%matrix_struct%g2l_col((my_i - 1)*virtual + avirt)
    1725             : 
    1726             :                      trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), buffer_recv(:, col_local), &
    1727           0 :                                                             dot_blksize)
    1728             :                   END DO
    1729             : 
    1730             :                   P_ij(ij_counter) = P_ij(ij_counter) &
    1731           0 :                                      - trace*sinh_over_x(0.5_dp*(Eigenval(my_i) - Eigenval(my_j))*omega)*omega*weight
    1732             :                END DO
    1733           0 :             ELSE IF (send_size > 0) THEN
    1734           0 :                CALL timeset(routineN//"_send", handle2)
    1735           0 :                CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
    1736           0 :                CALL timestop(handle2)
    1737             :             END IF
    1738             :          END DO
    1739           0 :          IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
    1740           0 :          IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
    1741             :       END IF
    1742             : 
    1743           8 :       CALL timestop(handle)
    1744             : 
    1745          16 :    END SUBROUTINE
    1746             : 
    1747             : ! **************************************************************************************************
    1748             : !> \brief ...
    1749             : !> \param fm_work_iaP ...
    1750             : !> \param fm_mat_S ...
    1751             : !> \param pair_list ...
    1752             : !> \param virtual ...
    1753             : !> \param P_ab ...
    1754             : !> \param Eigenval ...
    1755             : !> \param omega ...
    1756             : !> \param weight ...
    1757             : !> \param index2send ...
    1758             : !> \param index2recv ...
    1759             : !> \param dot_blksize ...
    1760             : ! **************************************************************************************************
    1761           8 :    SUBROUTINE calc_Pab_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ab, Eigenval, &
    1762           8 :                              omega, weight, index2send, index2recv, dot_blksize)
    1763             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_work_iaP, fm_mat_S
    1764             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: pair_list
    1765             :       INTEGER, INTENT(IN)                                :: virtual
    1766             :       REAL(KIND=dp), DIMENSION(:), INTENT(INOUT)         :: P_ab
    1767             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    1768             :       REAL(KIND=dp), INTENT(IN)                          :: omega, weight
    1769             :       TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
    1770             :       INTEGER, INTENT(IN)                                :: dot_blksize
    1771             : 
    1772             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'calc_Pab_degen'
    1773             : 
    1774             :       INTEGER :: ab_counter, avirt, col_global, col_local, counter, handle, handle2, iocc, my_a, &
    1775             :          my_b, my_col_local, my_pcol, my_prow, ncol_local, nrow_local, num_ab_pairs, num_pe_col, &
    1776             :          pcol, pcol_recv, pcol_send, proc_shift, recv_size, send_size, size_recv_buffer, &
    1777             :          size_send_buffer, tag
    1778           8 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, ncol_locals
    1779           8 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
    1780             :       REAL(KIND=dp)                                      :: trace
    1781           8 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
    1782             :       TYPE(cp_blacs_env_type), POINTER                   :: context
    1783             :       TYPE(mp_para_env_type), POINTER                    :: para_env
    1784             : 
    1785           8 :       CALL timeset(routineN, handle)
    1786             : 
    1787             :       CALL cp_fm_struct_get(fm_work_iaP%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
    1788             :                             ncol_local=ncol_local, col_indices=col_indices, &
    1789           8 :                             context=context, nrow_local=nrow_local)
    1790             :       CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
    1791           8 :                        number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
    1792             : 
    1793           8 :       num_ab_pairs = SIZE(pair_list, 2)
    1794             : 
    1795           8 :       tag = 43
    1796             : 
    1797        1232 :       DO ab_counter = 1, num_ab_pairs
    1798             : 
    1799        1224 :          my_a = pair_list(1, ab_counter)
    1800        1224 :          my_b = pair_list(2, ab_counter)
    1801             : 
    1802        1224 :          trace = 0.0_dp
    1803             : 
    1804       94248 :          DO col_local = 1, ncol_local
    1805       93024 :             col_global = col_indices(col_local)
    1806             : 
    1807       93024 :             iocc = MAX(1, col_global - 1)/virtual + 1
    1808       93024 :             avirt = col_global - (iocc - 1)*virtual
    1809             : 
    1810       93024 :             IF (avirt /= my_b) CYCLE
    1811        4896 :             pcol = fm_work_iaP%matrix_struct%g2p_col((iocc - 1)*virtual + my_a)
    1812        4896 :             IF (pcol /= my_pcol) CYCLE
    1813        4896 :             my_col_local = fm_work_iaP%matrix_struct%g2l_col((iocc - 1)*virtual + my_a)
    1814             : 
    1815             :             trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), fm_work_iaP%local_data(:, col_local), &
    1816       94248 :                                                    dot_blksize)
    1817             : 
    1818             :          END DO
    1819             : 
    1820             :          P_ab(ab_counter) = P_ab(ab_counter) &
    1821        1232 :                             + trace*sinh_over_x(0.5_dp*(Eigenval(my_a) - Eigenval(my_b))*omega)*omega*weight
    1822             : 
    1823             :       END DO
    1824             : 
    1825           8 :       IF (num_pe_col > 1) THEN
    1826             :          size_send_buffer = 0
    1827             :          size_recv_buffer = 0
    1828           0 :          DO proc_shift = 1, num_pe_col - 1
    1829           0 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1830           0 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1831             : 
    1832           0 :             IF (ALLOCATED(index2send(pcol_send)%array)) &
    1833           0 :                size_send_buffer = MAX(size_send_buffer, SIZE(index2send(pcol_send)%array))
    1834             : 
    1835           0 :             IF (ALLOCATED(index2recv(pcol_recv)%array)) &
    1836           0 :                size_recv_buffer = MAX(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
    1837             :          END DO
    1838             : 
    1839           0 :          ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
    1840             : 
    1841           0 :          DO proc_shift = 1, num_pe_col - 1
    1842           0 :             pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1843           0 :             pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1844             : 
    1845             :             ! Collect data and exchange
    1846           0 :             send_size = 0
    1847           0 :             IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
    1848             : 
    1849           0 :             DO counter = 1, send_size
    1850           0 :                buffer_send(:, counter) = fm_work_iaP%local_data(:, index2send(pcol_send)%array(counter))
    1851             :             END DO
    1852             : 
    1853           0 :             recv_size = 0
    1854           0 :             IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
    1855           0 :             IF (recv_size > 0) THEN
    1856           0 :                CALL timeset(routineN//"_send", handle2)
    1857           0 :                IF (send_size > 0) THEN
    1858             :                   CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
    1859           0 :                                          buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
    1860             :                ELSE
    1861           0 :                   CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
    1862             :                END IF
    1863           0 :                CALL timestop(handle2)
    1864             : 
    1865           0 :                DO ab_counter = 1, num_ab_pairs
    1866             :                   ! Collect the contributions of the matrix elements
    1867             : 
    1868           0 :                   my_a = pair_list(1, ab_counter)
    1869           0 :                   my_b = pair_list(2, ab_counter)
    1870             : 
    1871           0 :                   trace = 0.0_dp
    1872             : 
    1873           0 :                   DO col_local = 1, SIZE(index2recv(pcol_recv)%array)
    1874           0 :                      col_global = index2recv(pcol_recv)%array(col_local)
    1875             : 
    1876           0 :                      iocc = MAX(1, col_global - 1)/virtual + 1
    1877           0 :                      avirt = col_global - (iocc - 1)*virtual
    1878           0 :                      IF (avirt /= my_b) CYCLE
    1879           0 :                      pcol = fm_work_iaP%matrix_struct%g2p_col((iocc - 1)*virtual + my_a)
    1880           0 :                      IF (pcol /= my_pcol) CYCLE
    1881             : 
    1882           0 :                      my_col_local = fm_work_iaP%matrix_struct%g2l_col((iocc - 1)*virtual + my_a)
    1883             : 
    1884             :                      trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), buffer_recv(:, col_local), &
    1885           0 :                                                             dot_blksize)
    1886             :                   END DO
    1887             : 
    1888             :                   P_ab(ab_counter) = P_ab(ab_counter) &
    1889           0 :                                      + trace*sinh_over_x(0.5_dp*(Eigenval(my_a) - Eigenval(my_b))*omega)*omega*weight
    1890             : 
    1891             :                END DO
    1892           0 :             ELSE IF (send_size > 0) THEN
    1893           0 :                CALL timeset(routineN//"_send", handle2)
    1894           0 :                CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
    1895           0 :                CALL timestop(handle2)
    1896             :             END IF
    1897             :          END DO
    1898           0 :          IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
    1899           0 :          IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
    1900             :       END IF
    1901             : 
    1902           8 :       CALL timestop(handle)
    1903             : 
    1904          16 :    END SUBROUTINE
    1905             : 
    1906             : ! **************************************************************************************************
    1907             : !> \brief ...
    1908             : !> \param index2send ...
    1909             : !> \param index2recv ...
    1910             : !> \param fm_mat_S ...
    1911             : !> \param mat_S_3D ...
    1912             : !> \param gd_homo ...
    1913             : !> \param gd_virtual ...
    1914             : !> \param mepos ...
    1915             : ! **************************************************************************************************
    1916         112 :    SUBROUTINE redistribute_fm_mat_S(index2send, index2recv, fm_mat_S, mat_S_3D, gd_homo, gd_virtual, mepos)
    1917             :       TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send
    1918             :       TYPE(two_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2recv
    1919             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm_mat_S
    1920             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
    1921             :          INTENT(OUT)                                     :: mat_S_3D
    1922             :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_homo, gd_virtual
    1923             :       INTEGER, DIMENSION(2), INTENT(IN)                  :: mepos
    1924             : 
    1925             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'redistribute_fm_mat_S'
    1926             : 
    1927             :       INTEGER :: col_local, handle, my_a, my_homo, my_i, my_pcol, my_prow, my_virtual, nrow_local, &
    1928             :          num_pe_col, proc_recv, proc_send, proc_shift, recv_size, send_size, size_recv_buffer, &
    1929             :          size_send_buffer, tag
    1930         112 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
    1931         112 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
    1932             :       TYPE(mp_para_env_type), POINTER                    :: para_env
    1933             : 
    1934         112 :       CALL timeset(routineN, handle)
    1935             : 
    1936         112 :       tag = 46
    1937             : 
    1938             :       CALL fm_mat_S%matrix_struct%context%get(my_process_row=my_prow, my_process_column=my_pcol, &
    1939         112 :                                               number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
    1940             : 
    1941         112 :       CALL cp_fm_struct_get(fm_mat_S%matrix_struct, nrow_local=nrow_local, para_env=para_env)
    1942             : 
    1943         112 :       CALL get_group_dist(gd_homo, mepos(2), sizes=my_homo)
    1944         112 :       CALL get_group_dist(gd_virtual, mepos(1), sizes=my_virtual)
    1945             : 
    1946         560 :       ALLOCATE (mat_S_3D(nrow_local, my_virtual, my_homo))
    1947             : 
    1948         112 :       IF (ALLOCATED(index2send(my_pcol)%array)) THEN
    1949        8928 :          DO col_local = 1, SIZE(index2send(my_pcol)%array)
    1950        8816 :             my_a = index2recv(my_pcol)%array(1, col_local)
    1951        8816 :             my_i = index2recv(my_pcol)%array(2, col_local)
    1952      678032 :             mat_S_3D(:, my_a, my_i) = fm_mat_S%local_data(:, index2send(my_pcol)%array(col_local))
    1953             :          END DO
    1954             :       END IF
    1955             : 
    1956         112 :       IF (num_pe_col > 1) THEN
    1957             :          size_send_buffer = 0
    1958             :          size_recv_buffer = 0
    1959           0 :          DO proc_shift = 1, num_pe_col - 1
    1960           0 :             proc_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1961           0 :             proc_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1962             : 
    1963           0 :             send_size = 0
    1964           0 :             IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
    1965           0 :             size_send_buffer = MAX(size_send_buffer, send_size)
    1966             : 
    1967           0 :             recv_size = 0
    1968           0 :             IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array)
    1969           0 :             size_recv_buffer = MAX(size_recv_buffer, recv_size)
    1970             : 
    1971             :          END DO
    1972             : 
    1973           0 :          ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
    1974             : 
    1975           0 :          DO proc_shift = 1, num_pe_col - 1
    1976           0 :             proc_send = MODULO(my_pcol + proc_shift, num_pe_col)
    1977           0 :             proc_recv = MODULO(my_pcol - proc_shift, num_pe_col)
    1978             : 
    1979           0 :             send_size = 0
    1980           0 :             IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
    1981           0 :             DO col_local = 1, send_size
    1982           0 :                buffer_send(:, col_local) = fm_mat_S%local_data(:, index2send(proc_send)%array(col_local))
    1983             :             END DO
    1984             : 
    1985           0 :             recv_size = 0
    1986           0 :             IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array, 2)
    1987           0 :             IF (recv_size > 0) THEN
    1988           0 :                IF (send_size > 0) THEN
    1989             :                   CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), &
    1990           0 :                                          buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
    1991             :                ELSE
    1992           0 :                   CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
    1993             :                END IF
    1994             : 
    1995           0 :                DO col_local = 1, recv_size
    1996           0 :                   my_a = index2recv(proc_recv)%array(1, col_local)
    1997           0 :                   my_i = index2recv(proc_recv)%array(2, col_local)
    1998           0 :                   mat_S_3D(:, my_a, my_i) = buffer_recv(:, col_local)
    1999             :                END DO
    2000           0 :             ELSE IF (send_size > 0) THEN
    2001           0 :                CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), tag)
    2002             :             END IF
    2003             : 
    2004             :          END DO
    2005             : 
    2006           0 :          IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
    2007           0 :          IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
    2008             :       END IF
    2009             : 
    2010         112 :       CALL timestop(handle)
    2011             : 
    2012         336 :    END SUBROUTINE
    2013             : 
    2014             : ! **************************************************************************************************
    2015             : !> \brief ...
    2016             : !> \param rpa_grad ...
    2017             : !> \param mp2_env ...
    2018             : !> \param para_env_sub ...
    2019             : !> \param para_env ...
    2020             : !> \param qs_env ...
    2021             : !> \param gd_array ...
    2022             : !> \param color_sub ...
    2023             : !> \param do_ri_sos_laplace_mp2 ...
    2024             : !> \param homo ...
    2025             : !> \param virtual ...
    2026             : ! **************************************************************************************************
    2027          40 :    SUBROUTINE rpa_grad_finalize(rpa_grad, mp2_env, para_env_sub, para_env, qs_env, gd_array, &
    2028          40 :                                 color_sub, do_ri_sos_laplace_mp2, homo, virtual)
    2029             :       TYPE(rpa_grad_type), INTENT(INOUT)                 :: rpa_grad
    2030             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
    2031             :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env_sub, para_env
    2032             :       TYPE(qs_environment_type), INTENT(IN), POINTER     :: qs_env
    2033             :       TYPE(group_dist_d1_type)                           :: gd_array
    2034             :       INTEGER, INTENT(IN)                                :: color_sub
    2035             :       LOGICAL, INTENT(IN)                                :: do_ri_sos_laplace_mp2
    2036             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2037             : 
    2038             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'rpa_grad_finalize'
    2039             : 
    2040             :       INTEGER :: dimen_ia, dimen_RI, handle, iiB, ispin, my_group_L_end, my_group_L_size, &
    2041             :          my_group_L_start, my_ia_end, my_ia_size, my_ia_start, my_P_end, my_P_size, my_P_start, &
    2042             :          ngroup, nspins, pos_group, pos_sub, proc
    2043          40 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: pos_info
    2044          40 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: group_grid_2_mepos, mepos_2_grid_group
    2045             :       REAL(KIND=dp)                                      :: my_scale
    2046          40 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: Gamma_2D
    2047             :       TYPE(cp_blacs_env_type), POINTER                   :: blacs_env
    2048             :       TYPE(cp_fm_struct_type), POINTER                   :: fm_struct
    2049             :       TYPE(cp_fm_type)                                   :: fm_G_P_ia, fm_PQ, fm_PQ_2, fm_PQ_half, &
    2050             :                                                             fm_work_PQ, fm_work_PQ_2, fm_Y, &
    2051             :                                                             operator_half
    2052          40 :       TYPE(group_dist_d1_type)                           :: gd_array_new, gd_ia, gd_P, gd_P_new
    2053             : 
    2054          40 :       CALL timeset(routineN, handle)
    2055             : 
    2056             :       ! Release unnecessary matrices to save memory for next steps
    2057             : 
    2058          40 :       nspins = SIZE(rpa_grad%fm_Y)
    2059             : 
    2060             :       ! Scaling factor is required to scale the density matrices and the Gamma matrices later
    2061          40 :       IF (do_ri_sos_laplace_mp2) THEN
    2062          20 :          my_scale = mp2_env%scale_s
    2063             :       ELSE
    2064          20 :          my_scale = -mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
    2065          20 :          IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
    2066             :       END IF
    2067             : 
    2068          40 :       IF (do_ri_sos_laplace_mp2) THEN
    2069             :          CALL sos_mp2_grad_finalize(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
    2070          20 :                                     para_env, para_env_sub, homo, virtual, mp2_env)
    2071             :       ELSE
    2072             :          CALL rpa_grad_work_finalize(rpa_grad%rpa_work, mp2_env, homo, &
    2073          20 :                                      virtual, para_env, para_env_sub)
    2074             :       END IF
    2075             : 
    2076          40 :       CALL get_qs_env(qs_env, blacs_env=blacs_env)
    2077             : 
    2078          40 :       CALL cp_fm_get_info(rpa_grad%fm_Gamma_PQ, ncol_global=dimen_RI)
    2079             : 
    2080          40 :       NULLIFY (fm_struct)
    2081             :       CALL cp_fm_struct_create(fm_struct, context=blacs_env, nrow_global=dimen_RI, &
    2082          40 :                                ncol_global=dimen_RI, para_env=para_env)
    2083          40 :       CALL cp_fm_create(fm_PQ, fm_struct)
    2084          40 :       CALL cp_fm_create(fm_work_PQ, fm_struct)
    2085          40 :       IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
    2086           4 :          CALL cp_fm_create(fm_PQ_2, fm_struct)
    2087             :       END IF
    2088          40 :       CALL cp_fm_struct_release(fm_struct)
    2089          40 :       CALL cp_fm_set_all(fm_PQ, 0.0_dp)
    2090             : 
    2091             :       ! We still have to left- and right multiply it with PQhalf
    2092          40 :       CALL dereplicate_and_sum_fm(rpa_grad%fm_Gamma_PQ, fm_PQ)
    2093             : 
    2094          40 :       ngroup = para_env%num_pe/para_env_sub%num_pe
    2095             : 
    2096             :       CALL prepare_redistribution(para_env, para_env_sub, ngroup, &
    2097             :                                   group_grid_2_mepos, mepos_2_grid_group, pos_info=pos_info)
    2098             : 
    2099             :       ! Create fm_PQ_half
    2100          40 :       CALL create_group_dist(gd_P, para_env_sub%num_pe, dimen_RI)
    2101          40 :       CALL get_group_dist(gd_P, para_env_sub%mepos, my_P_start, my_P_end, my_P_size)
    2102             : 
    2103          40 :       CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
    2104             : 
    2105          40 :       CALL create_group_dist(gd_P_new, para_env%num_pe)
    2106          40 :       CALL create_group_dist(gd_array_new, para_env%num_pe)
    2107             : 
    2108         120 :       DO proc = 0, para_env%num_pe - 1
    2109             :          ! calculate position of the group
    2110          80 :          pos_group = proc/para_env_sub%num_pe
    2111             :          ! calculate position in the subgroup
    2112          80 :          pos_sub = pos_info(proc)
    2113             :          ! 1 -> rows, 2 -> cols
    2114          80 :          CALL get_group_dist(gd_array, pos_group, gd_array_new, proc)
    2115         120 :          CALL get_group_dist(gd_P, pos_sub, gd_P_new, proc)
    2116             :       END DO
    2117             : 
    2118          40 :       DEALLOCATE (pos_info)
    2119          40 :       CALL release_group_dist(gd_P)
    2120             : 
    2121             :       CALL array2fm(mp2_env%ri_grad%PQ_half, fm_PQ%matrix_struct, &
    2122             :                     my_P_start, my_P_end, &
    2123             :                     my_group_L_start, my_group_L_end, &
    2124             :                     gd_P_new, gd_array_new, &
    2125             :                     group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
    2126          40 :                     fm_PQ_half)
    2127             : 
    2128          40 :       IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
    2129             :          CALL array2fm(mp2_env%ri_grad%operator_half, fm_PQ%matrix_struct, my_P_start, my_P_end, &
    2130             :                        my_group_L_start, my_group_L_end, &
    2131             :                        gd_P_new, gd_array_new, &
    2132             :                        group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
    2133           4 :                        operator_half)
    2134             :       END IF
    2135             : 
    2136             :       ! deallocate the info array
    2137          40 :       CALL release_group_dist(gd_P_new)
    2138          40 :       CALL release_group_dist(gd_array_new)
    2139             : 
    2140          40 :       IF (compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
    2141             : ! Finish Gamma_PQ
    2142             :          CALL parallel_gemm(transa="N", transb="T", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=1.0_dp, &
    2143             :                             matrix_a=fm_PQ, matrix_b=fm_PQ_half, beta=0.0_dp, &
    2144          36 :                             matrix_c=fm_work_PQ)
    2145             : 
    2146             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=-my_scale, &
    2147             :                             matrix_a=fm_PQ_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
    2148          36 :                             matrix_c=fm_PQ)
    2149             : 
    2150          36 :          CALL cp_fm_release(fm_work_PQ)
    2151             :       ELSE
    2152             :          CALL parallel_gemm(transa="N", transb="T", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=1.0_dp, &
    2153             :                             matrix_a=fm_PQ, matrix_b=operator_half, beta=0.0_dp, &
    2154           4 :                             matrix_c=fm_work_PQ)
    2155             : 
    2156             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=my_scale, &
    2157             :                             matrix_a=operator_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
    2158           4 :                             matrix_c=fm_PQ)
    2159           4 :          CALL cp_fm_release(operator_half)
    2160             : 
    2161           4 :          CALL cp_fm_create(fm_work_PQ_2, fm_PQ%matrix_struct, name="fm_Gamma_PQ_2")
    2162             :          CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=-my_scale, &
    2163             :                             matrix_a=fm_PQ_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
    2164           4 :                             matrix_c=fm_work_PQ_2)
    2165           4 :          CALL cp_fm_to_fm(fm_work_PQ_2, fm_PQ_2)
    2166           4 :          CALL cp_fm_geadd(1.0_dp, "T", fm_work_PQ_2, 1.0_dp, fm_PQ_2)
    2167           4 :          CALL cp_fm_release(fm_work_PQ_2)
    2168           4 :          CALL cp_fm_release(fm_work_PQ)
    2169             :       END IF
    2170             : 
    2171         160 :       ALLOCATE (mp2_env%ri_grad%Gamma_PQ(my_P_size, my_group_L_size))
    2172             :       CALL fm2array(mp2_env%ri_grad%Gamma_PQ, &
    2173             :                     my_P_size, my_P_start, my_P_end, &
    2174             :                     my_group_L_size, my_group_L_start, my_group_L_end, &
    2175             :                     group_grid_2_mepos, mepos_2_grid_group, &
    2176             :                     para_env_sub%num_pe, ngroup, &
    2177          40 :                     fm_PQ)
    2178             : 
    2179          40 :       IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
    2180          12 :          ALLOCATE (mp2_env%ri_grad%Gamma_PQ_2(my_P_size, my_group_L_size))
    2181             :          CALL fm2array(mp2_env%ri_grad%Gamma_PQ_2, my_P_size, my_P_start, my_P_end, &
    2182             :                        my_group_L_size, my_group_L_start, my_group_L_end, &
    2183             :                        group_grid_2_mepos, mepos_2_grid_group, &
    2184             :                        para_env_sub%num_pe, ngroup, &
    2185           4 :                        fm_PQ_2)
    2186             :       END IF
    2187             : 
    2188             : ! Now, Gamma_Pia
    2189        2644 :       ALLOCATE (mp2_env%ri_grad%G_P_ia(my_group_L_size, nspins))
    2190          88 :       DO ispin = 1, nspins
    2191        2524 :       DO iiB = 1, my_group_L_size
    2192        2484 :          NULLIFY (mp2_env%ri_grad%G_P_ia(iiB, ispin)%matrix)
    2193             :       END DO
    2194             :       END DO
    2195             : 
    2196             :       ! Redistribute the Y matrix
    2197          88 :       DO ispin = 1, nspins
    2198             :          ! Collect all data of columns for the own sub group locally
    2199          48 :          CALL cp_fm_get_info(rpa_grad%fm_Y(ispin), ncol_global=dimen_ia)
    2200             : 
    2201          48 :          CALL get_qs_env(qs_env, blacs_env=blacs_env)
    2202             : 
    2203          48 :          NULLIFY (fm_struct)
    2204          48 :          CALL cp_fm_struct_create(fm_struct, template_fmstruct=fm_PQ_half%matrix_struct, nrow_global=dimen_ia)
    2205          48 :          CALL cp_fm_create(fm_Y, fm_struct)
    2206          48 :          CALL cp_fm_struct_release(fm_struct)
    2207          48 :          CALL cp_fm_set_all(fm_Y, 0.0_dp)
    2208             : 
    2209          48 :          CALL dereplicate_and_sum_fm(rpa_grad%fm_Y(ispin), fm_Y)
    2210             : 
    2211          48 :          CALL cp_fm_create(fm_G_P_ia, fm_Y%matrix_struct)
    2212          48 :          CALL cp_fm_set_all(fm_G_P_ia, 0.0_dp)
    2213             : 
    2214             :          CALL parallel_gemm(transa="N", transb="T", m=dimen_ia, n=dimen_RI, k=dimen_RI, alpha=my_scale, &
    2215             :                             matrix_a=fm_Y, matrix_b=fm_PQ_half, beta=0.0_dp, &
    2216          48 :                             matrix_c=fm_G_P_ia)
    2217             : 
    2218          48 :          CALL cp_fm_release(fm_Y)
    2219             : 
    2220          48 :          CALL create_group_dist(gd_ia, para_env_sub%num_pe, dimen_ia)
    2221          48 :          CALL get_group_dist(gd_ia, para_env_sub%mepos, my_ia_start, my_ia_end, my_ia_size)
    2222             : 
    2223             :          CALL fm2array(Gamma_2D, my_ia_size, my_ia_start, my_ia_end, &
    2224             :                        my_group_L_size, my_group_L_start, my_group_L_end, &
    2225             :                        group_grid_2_mepos, mepos_2_grid_group, &
    2226             :                        para_env_sub%num_pe, ngroup, &
    2227          48 :                        fm_G_P_ia)
    2228             : 
    2229             :          ! create the Gamma_ia_P in DBCSR style
    2230             :          CALL create_dbcsr_gamma(Gamma_2D, homo(ispin), virtual(ispin), dimen_ia, para_env_sub, &
    2231             :                                  my_ia_start, my_ia_end, my_group_L_size, gd_ia, &
    2232          48 :                                  mp2_env%ri_grad%G_P_ia(:, ispin), mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)
    2233             : 
    2234         328 :          CALL release_group_dist(gd_ia)
    2235             : 
    2236             :       END DO
    2237          40 :       DEALLOCATE (rpa_grad%fm_Y)
    2238          40 :       CALL cp_fm_release(fm_PQ_half)
    2239             : 
    2240          40 :       CALL timestop(handle)
    2241             : 
    2242         320 :    END SUBROUTINE rpa_grad_finalize
    2243             : 
    2244             : ! **************************************************************************************************
    2245             : !> \brief ...
    2246             : !> \param sos_mp2_work_occ ...
    2247             : !> \param sos_mp2_work_virt ...
    2248             : !> \param para_env ...
    2249             : !> \param para_env_sub ...
    2250             : !> \param homo ...
    2251             : !> \param virtual ...
    2252             : !> \param mp2_env ...
    2253             : ! **************************************************************************************************
    2254          20 :    SUBROUTINE sos_mp2_grad_finalize(sos_mp2_work_occ, sos_mp2_work_virt, para_env, para_env_sub, homo, virtual, mp2_env)
    2255             :       TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
    2256             :          DIMENSION(:), INTENT(INOUT)                     :: sos_mp2_work_occ, sos_mp2_work_virt
    2257             :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env, para_env_sub
    2258             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2259             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
    2260             : 
    2261             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_grad_finalize'
    2262             : 
    2263             :       INTEGER                                            :: ab_counter, handle, ij_counter, ispin, &
    2264             :                                                             itmp(2), my_a, my_b, my_B_end, &
    2265             :                                                             my_B_size, my_B_start, my_i, my_j, &
    2266             :                                                             nspins, pcol
    2267             :       REAL(KIND=dp)                                      :: my_scale
    2268             : 
    2269          20 :       CALL timeset(routineN, handle)
    2270             : 
    2271          20 :       nspins = SIZE(sos_mp2_work_occ)
    2272          20 :       my_scale = mp2_env%scale_s
    2273             : 
    2274          44 :       DO ispin = 1, nspins
    2275          48 :          DO pcol = 0, SIZE(sos_mp2_work_occ(ispin)%index2send, 1) - 1
    2276          24 :             IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
    2277           4 :                DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
    2278          24 :             IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
    2279           0 :                DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
    2280          24 :             IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
    2281           4 :                DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
    2282          24 :             IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
    2283          24 :                DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
    2284             :          END DO
    2285           0 :          DEALLOCATE (sos_mp2_work_occ(ispin)%index2send, &
    2286           0 :                      sos_mp2_work_occ(ispin)%index2recv, &
    2287           0 :                      sos_mp2_work_virt(ispin)%index2send, &
    2288         140 :                      sos_mp2_work_virt(ispin)%index2recv)
    2289             :       END DO
    2290             : 
    2291             :       ! Sum P_ij and P_ab and redistribute them
    2292          44 :       DO ispin = 1, nspins
    2293          24 :          CALL para_env%sum(sos_mp2_work_occ(ispin)%P)
    2294             : 
    2295          96 :          ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
    2296         472 :          mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
    2297         116 :          DO my_i = 1, homo(ispin)
    2298         116 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_i) = my_scale*sos_mp2_work_occ(ispin)%P(my_i)
    2299             :          END DO
    2300          72 :          DO ij_counter = 1, SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
    2301          48 :             my_i = sos_mp2_work_occ(ispin)%pair_list(1, ij_counter)
    2302          48 :             my_j = sos_mp2_work_occ(ispin)%pair_list(2, ij_counter)
    2303             : 
    2304          72 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = my_scale*sos_mp2_work_occ(ispin)%P(homo(ispin) + ij_counter)
    2305             :          END DO
    2306          24 :          DEALLOCATE (sos_mp2_work_occ(ispin)%P, sos_mp2_work_occ(ispin)%pair_list)
    2307             : 
    2308             :          ! Symmetrize P_ij
    2309             :          mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
    2310         920 :                                                            TRANSPOSE(mp2_env%ri_grad%P_ij(ispin)%array))
    2311             : 
    2312             :          ! The first index of P_ab has to be distributed within the subgroups, so sum it up first and add the required elements later
    2313          24 :          CALL para_env%sum(sos_mp2_work_virt(ispin)%P)
    2314             : 
    2315          24 :          itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
    2316          24 :          my_B_size = itmp(2) - itmp(1) + 1
    2317          24 :          my_B_start = itmp(1)
    2318          24 :          my_B_end = itmp(2)
    2319             : 
    2320          96 :          ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_B_size, virtual(ispin)))
    2321       10382 :          mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
    2322         486 :          DO my_a = itmp(1), itmp(2)
    2323         486 :             mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_a) = my_scale*sos_mp2_work_virt(ispin)%P(my_a)
    2324             :          END DO
    2325         636 :          DO ab_counter = 1, SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
    2326         612 :             my_a = sos_mp2_work_virt(ispin)%pair_list(1, ab_counter)
    2327         612 :             my_b = sos_mp2_work_virt(ispin)%pair_list(2, ab_counter)
    2328             : 
    2329         612 :             IF (my_a >= itmp(1) .AND. my_a <= itmp(2)) mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_b) = &
    2330         636 :                my_scale*sos_mp2_work_virt(ispin)%P(virtual(ispin) + ab_counter)
    2331             :          END DO
    2332             : 
    2333          24 :          DEALLOCATE (sos_mp2_work_virt(ispin)%P, sos_mp2_work_virt(ispin)%pair_list)
    2334             : 
    2335             :          ! Symmetrize P_ab
    2336          44 :          IF (para_env_sub%num_pe > 1) THEN
    2337          12 :             BLOCK
    2338             :                INTEGER :: send_a_start, send_a_end, send_a_size, &
    2339             :                           recv_a_start, recv_a_end, recv_a_size, proc_shift, proc_send, proc_recv
    2340           4 :                REAL(KIND=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1D
    2341           4 :                REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_send
    2342           4 :                REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
    2343           4 :                TYPE(group_dist_d1_type)                           :: gd_virtual_sub
    2344             : 
    2345           4 :                CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
    2346             : 
    2347             :                mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) = &
    2348             :                   0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) &
    2349         804 :                           + TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end)))
    2350             : 
    2351          12 :                ALLOCATE (buffer_send_1D(my_B_size*maxsize(gd_virtual_sub)))
    2352          16 :                ALLOCATE (buffer_recv(my_B_size, maxsize(gd_virtual_sub)))
    2353             : 
    2354           8 :                DO proc_shift = 1, para_env_sub%num_pe - 1
    2355             : 
    2356           4 :                   proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2357           4 :                   proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2358             : 
    2359           4 :                   CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
    2360           4 :                   CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
    2361             : 
    2362           4 :                   buffer_send(1:send_a_size, 1:my_B_size) => buffer_send_1D(1:my_B_size*send_a_size)
    2363             : 
    2364         402 :                   buffer_send(:send_a_size, :) = TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
    2365             :                   CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
    2366         402 :                                              buffer_recv(:, :recv_a_size), proc_recv)
    2367             : 
    2368             :                   mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
    2369         410 :                      0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
    2370             : 
    2371             :                END DO
    2372             : 
    2373           4 :                DEALLOCATE (buffer_send_1D, buffer_recv)
    2374             : 
    2375          16 :                CALL release_group_dist(gd_virtual_sub)
    2376             :             END BLOCK
    2377             :          ELSE
    2378             :             mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
    2379       19140 :                                                               TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array))
    2380             :          END IF
    2381             : 
    2382             :       END DO
    2383          68 :       DEALLOCATE (sos_mp2_work_occ, sos_mp2_work_virt)
    2384          20 :       IF (nspins == 1) THEN
    2385         336 :          mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
    2386        5374 :          mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
    2387             :       END IF
    2388             : 
    2389          20 :       CALL timestop(handle)
    2390             : 
    2391          20 :    END SUBROUTINE
    2392             : 
    2393             : ! **************************************************************************************************
    2394             : !> \brief ...
    2395             : !> \param rpa_work ...
    2396             : !> \param mp2_env ...
    2397             : !> \param homo ...
    2398             : !> \param virtual ...
    2399             : !> \param para_env ...
    2400             : !> \param para_env_sub ...
    2401             : ! **************************************************************************************************
    2402          20 :    SUBROUTINE rpa_grad_work_finalize(rpa_work, mp2_env, homo, virtual, para_env, para_env_sub)
    2403             :       TYPE(rpa_grad_work_type), INTENT(INOUT)            :: rpa_work
    2404             :       TYPE(mp2_type), INTENT(INOUT)                      :: mp2_env
    2405             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2406             :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env, para_env_sub
    2407             : 
    2408             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_work_finalize'
    2409             : 
    2410             :       INTEGER :: handle, ispin, itmp(2), my_a_end, my_a_size, my_a_start, my_B_end, my_B_size, &
    2411             :          my_B_start, my_i_end, my_i_size, my_i_start, nspins, proc, proc_recv, proc_send, &
    2412             :          proc_shift, recv_a_end, recv_a_size, recv_a_start, recv_end, recv_start, send_a_end, &
    2413             :          send_a_size, send_a_start, send_end, send_start, size_recv_buffer, size_send_buffer
    2414             :       REAL(KIND=dp)                                      :: my_scale
    2415          20 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: buffer_recv, buffer_send
    2416          20 :       TYPE(group_dist_d1_type)                           :: gd_a_sub, gd_virtual_sub
    2417             : 
    2418          20 :       CALL timeset(routineN, handle)
    2419             : 
    2420          20 :       nspins = SIZE(homo)
    2421          20 :       my_scale = mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
    2422          20 :       IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
    2423             : 
    2424          20 :       CALL cp_fm_release(rpa_work%fm_mat_Q_copy)
    2425             : 
    2426          44 :       DO ispin = 1, nspins
    2427          68 :       DO proc = 0, SIZE(rpa_work%index2send, 1) - 1
    2428          24 :          IF (ALLOCATED(rpa_work%index2send(proc, ispin)%array)) DEALLOCATE (rpa_work%index2send(proc, ispin)%array)
    2429          48 :          IF (ALLOCATED(rpa_work%index2recv(proc, ispin)%array)) DEALLOCATE (rpa_work%index2recv(proc, ispin)%array)
    2430             :       END DO
    2431             :       END DO
    2432          68 :       DEALLOCATE (rpa_work%index2send, rpa_work%index2recv)
    2433             : 
    2434          44 :       DO ispin = 1, nspins
    2435          24 :          CALL get_group_dist(rpa_work%gd_homo(ispin), rpa_work%mepos(2), my_i_start, my_i_end, my_i_size)
    2436          24 :          CALL release_group_dist(rpa_work%gd_homo(ispin))
    2437             : 
    2438          96 :          ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
    2439         472 :          mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
    2440         472 :          mp2_env%ri_grad%P_ij(ispin)%array(my_i_start:my_i_end, :) = my_scale*rpa_work%P_ij(ispin)%array
    2441          24 :          DEALLOCATE (rpa_work%P_ij(ispin)%array)
    2442          24 :          CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
    2443             : 
    2444             :          ! Symmetrize P_ij
    2445             :          mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
    2446         920 :                                                            TRANSPOSE(mp2_env%ri_grad%P_ij(ispin)%array))
    2447             : 
    2448          24 :          itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
    2449          24 :          my_B_start = itmp(1)
    2450          24 :          my_B_end = itmp(2)
    2451          24 :          my_B_size = my_B_end - my_B_start + 1
    2452             : 
    2453          96 :          ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_B_size, virtual(ispin)))
    2454       10382 :          mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
    2455             : 
    2456          24 :          CALL get_group_dist(rpa_work%gd_virtual(ispin), rpa_work%mepos(1), my_a_start, my_a_end, my_a_size)
    2457          24 :          CALL release_group_dist(rpa_work%gd_virtual(ispin))
    2458             :          ! This group dist contains the info which parts of Pab a process currently owns
    2459          24 :          CALL create_group_dist(gd_a_sub, my_a_start, my_a_end, my_a_size, para_env_sub)
    2460             :          ! This group dist contains the info which parts of Pab a process is supposed to own later
    2461          24 :          CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
    2462             : 
    2463             :          ! Calculate local indices of the common range of own matrix and send process
    2464          24 :          send_start = MAX(1, my_B_start - my_a_start + 1)
    2465          24 :          send_end = MIN(my_a_size, my_B_end - my_a_start + 1)
    2466             : 
    2467             :          ! Same for recv process but with reverse positions
    2468          24 :          recv_start = MAX(1, my_a_start - my_B_start + 1)
    2469          24 :          recv_end = MIN(my_B_size, my_a_end - my_B_start + 1)
    2470             : 
    2471             :          mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
    2472       10382 :             my_scale*rpa_work%P_ab(ispin)%array(send_start:send_end, :)
    2473             : 
    2474          24 :          IF (para_env_sub%num_pe > 1) THEN
    2475             :             size_send_buffer = 0
    2476             :             size_recv_buffer = 0
    2477           8 :             DO proc_shift = 1, para_env_sub%num_pe - 1
    2478           4 :                proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2479           4 :                proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2480             : 
    2481           4 :                CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
    2482           4 :                CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)
    2483             : 
    2484             :                ! Calculate local indices of the common range of own matrix and send process
    2485           4 :                send_start = MAX(1, send_a_start - my_a_start + 1)
    2486           4 :                send_end = MIN(my_a_size, send_a_end - my_a_start + 1)
    2487             : 
    2488           4 :                size_send_buffer = MAX(size_send_buffer, MAX(send_end - send_start + 1, 0))
    2489             : 
    2490             :                ! Same for recv process but with reverse positions
    2491           4 :                recv_start = MAX(1, recv_a_start - my_B_start + 1)
    2492           4 :                recv_end = MIN(my_B_size, recv_a_end - my_B_start + 1)
    2493             : 
    2494           8 :                size_recv_buffer = MAX(size_recv_buffer, MAX(recv_end - recv_start + 1, 0))
    2495             :             END DO
    2496          28 :             ALLOCATE (buffer_send(size_send_buffer, virtual(ispin)), buffer_recv(size_recv_buffer, virtual(ispin)))
    2497             : 
    2498           8 :             DO proc_shift = 1, para_env_sub%num_pe - 1
    2499           4 :                proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2500           4 :                proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2501             : 
    2502           4 :                CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
    2503           4 :                CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)
    2504             : 
    2505             :                ! Calculate local indices of the common range of own matrix and send process
    2506           4 :                send_start = MAX(1, send_a_start - my_a_start + 1)
    2507           4 :                send_end = MIN(my_a_size, send_a_end - my_a_start + 1)
    2508         802 :                buffer_send(1:MAX(send_end - send_start + 1, 0), :) = rpa_work%P_ab(ispin)%array(send_start:send_end, :)
    2509             : 
    2510             :                ! Same for recv process but with reverse positions
    2511           4 :                recv_start = MAX(1, recv_a_start - my_B_start + 1)
    2512           4 :                recv_end = MIN(my_B_size, recv_a_end - my_B_start + 1)
    2513             : 
    2514             :                CALL para_env_sub%sendrecv(buffer_send(1:MAX(send_end - send_start + 1, 0), :), proc_send, &
    2515        1600 :                                           buffer_recv(1:MAX(recv_end - recv_start + 1, 0), :), proc_recv)
    2516             : 
    2517             :                mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
    2518             :                   mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) + &
    2519         810 :                   my_scale*buffer_recv(1:MAX(recv_end - recv_start + 1, 0), :)
    2520             : 
    2521             :             END DO
    2522             : 
    2523           4 :             IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
    2524           4 :             IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
    2525             :          END IF
    2526          24 :          DEALLOCATE (rpa_work%P_ab(ispin)%array)
    2527             : 
    2528          24 :          CALL release_group_dist(gd_a_sub)
    2529             : 
    2530             :          BLOCK
    2531             :             TYPE(mp_comm_type) :: comm_exchange
    2532          24 :             CALL comm_exchange%from_split(para_env, para_env_sub%mepos)
    2533          24 :             CALL comm_exchange%sum(mp2_env%ri_grad%P_ab(ispin)%array)
    2534          48 :             CALL comm_exchange%free()
    2535             :          END BLOCK
    2536             : 
    2537             :          ! Symmetrize P_ab
    2538          24 :          IF (para_env_sub%num_pe > 1) THEN
    2539             :             BLOCK
    2540           4 :                REAL(KIND=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1D
    2541           4 :                REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_send
    2542           4 :                REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
    2543             : 
    2544             :                mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) = &
    2545             :                   0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) &
    2546         804 :                           + TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end)))
    2547             : 
    2548          12 :                ALLOCATE (buffer_send_1D(my_B_size*maxsize(gd_virtual_sub)))
    2549          16 :                ALLOCATE (buffer_recv(my_B_size, maxsize(gd_virtual_sub)))
    2550             : 
    2551           8 :                DO proc_shift = 1, para_env_sub%num_pe - 1
    2552             : 
    2553           4 :                   proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2554           4 :                   proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2555             : 
    2556           4 :                   CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
    2557           4 :                   CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
    2558             : 
    2559           4 :                   buffer_send(1:send_a_size, 1:my_B_size) => buffer_send_1D(1:my_B_size*send_a_size)
    2560             : 
    2561         402 :                   buffer_send(:send_a_size, :) = TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
    2562             :                   CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
    2563         402 :                                              buffer_recv(:, :recv_a_size), proc_recv)
    2564             : 
    2565             :                   mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
    2566         410 :                      0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
    2567             : 
    2568             :                END DO
    2569             : 
    2570           8 :                DEALLOCATE (buffer_send_1D, buffer_recv)
    2571             :             END BLOCK
    2572             :          ELSE
    2573             :             mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
    2574       19140 :                                                               TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array))
    2575             :          END IF
    2576             : 
    2577          92 :          CALL release_group_dist(gd_virtual_sub)
    2578             : 
    2579             :       END DO
    2580         116 :       DEALLOCATE (rpa_work%gd_homo, rpa_work%gd_virtual, rpa_work%P_ij, rpa_work%P_ab)
    2581          20 :       IF (nspins == 1) THEN
    2582         336 :          mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
    2583        5374 :          mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
    2584             :       END IF
    2585             : 
    2586          20 :       CALL timestop(handle)
    2587          20 :    END SUBROUTINE
    2588             : 
    2589             : ! **************************************************************************************************
    2590             : !> \brief Dereplicate data from fm_sub and collect in fm_global, overlapping data will be added
    2591             : !> \param fm_sub replicated matrix, all subgroups have the same size, will be release on output
    2592             : !> \param fm_global global matrix, on output it will contain the sum of the replicated matrices redistributed
    2593             : ! **************************************************************************************************
    2594          88 :    SUBROUTINE dereplicate_and_sum_fm(fm_sub, fm_global)
    2595             :       TYPE(cp_fm_type), INTENT(INOUT)                    :: fm_sub, fm_global
    2596             : 
    2597             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dereplicate_and_sum_fm'
    2598             : 
    2599             :       INTEGER :: col_local, elements2recv_col, elements2recv_row, elements2send_col, &
    2600             :          elements2send_row, handle, handle2, mypcol_global, myprow_global, ncol_local_global, &
    2601             :          ncol_local_sub, npcol_global, npcol_sub, nprow_global, nprow_sub, nrow_local_global, &
    2602             :          nrow_local_sub, pcol_recv, pcol_send, proc_recv, proc_send, proc_send_global, proc_shift, &
    2603             :          prow_recv, prow_send, row_local, tag
    2604             :       INTEGER(int_8)                                     :: size_recv_buffer, size_send_buffer
    2605          88 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: data2recv_col, data2recv_row, &
    2606          88 :                                                             data2send_col, data2send_row, &
    2607          88 :                                                             subgroup2mepos
    2608          88 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices_global, col_indices_sub, &
    2609          88 :                                                             row_indices_global, row_indices_sub
    2610          88 :       INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi_global, blacs2mpi_sub, &
    2611          88 :                                                             mpi2blacs_global, mpi2blacs_sub
    2612          88 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: recv_buffer_1D, send_buffer_1D
    2613          88 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: recv_buffer, send_buffer
    2614             :       TYPE(mp_para_env_type), POINTER                    :: para_env, para_env_sub
    2615          88 :       TYPE(one_dim_int_array), ALLOCATABLE, DIMENSION(:) :: index2recv_col, index2recv_row, &
    2616          88 :                                                             index2send_col, index2send_row
    2617             : 
    2618          88 :       CALL timeset(routineN, handle)
    2619             : 
    2620          88 :       tag = 1
    2621             : 
    2622          88 :       nprow_sub = fm_sub%matrix_struct%context%num_pe(1)
    2623          88 :       npcol_sub = fm_sub%matrix_struct%context%num_pe(2)
    2624             : 
    2625          88 :       myprow_global = fm_global%matrix_struct%context%mepos(1)
    2626          88 :       mypcol_global = fm_global%matrix_struct%context%mepos(2)
    2627          88 :       nprow_global = fm_global%matrix_struct%context%num_pe(1)
    2628          88 :       npcol_global = fm_global%matrix_struct%context%num_pe(2)
    2629             : 
    2630             :       CALL cp_fm_get_info(fm_sub, col_indices=col_indices_sub, row_indices=row_indices_sub, &
    2631          88 :                           nrow_local=nrow_local_sub, ncol_local=ncol_local_sub)
    2632          88 :       CALL cp_fm_struct_get(fm_sub%matrix_struct, para_env=para_env_sub)
    2633             :       CALL cp_fm_struct_get(fm_global%matrix_struct, para_env=para_env, &
    2634             :                             col_indices=col_indices_global, row_indices=row_indices_global, &
    2635          88 :                             nrow_local=nrow_local_global, ncol_local=ncol_local_global)
    2636          88 :       CALL fm_sub%matrix_struct%context%get(blacs2mpi=blacs2mpi_sub, mpi2blacs=mpi2blacs_sub)
    2637          88 :       CALL fm_global%matrix_struct%context%get(blacs2mpi=blacs2mpi_global, mpi2blacs=mpi2blacs_global)
    2638             : 
    2639          88 :       IF (para_env%num_pe /= para_env_sub%num_pe) THEN
    2640             :          BLOCK
    2641             :             TYPE(mp_comm_type) :: comm_exchange
    2642          64 :             comm_exchange = fm_sub%matrix_struct%context%interconnect(para_env)
    2643          64 :             CALL comm_exchange%sum(fm_sub%local_data)
    2644         128 :             CALL comm_exchange%free()
    2645             :          END BLOCK
    2646             :       END IF
    2647             : 
    2648         264 :       ALLOCATE (subgroup2mepos(0:para_env_sub%num_pe - 1))
    2649          88 :       CALL para_env_sub%allgather(para_env%mepos, subgroup2mepos)
    2650             : 
    2651          88 :       CALL timeset(routineN//"_data2", handle2)
    2652             :       ! Create a map how much data has to be sent to what process coordinate, interchange rows and columns to transpose the matrices
    2653          88 :       CALL get_elements2send_col(data2send_col, fm_global%matrix_struct, row_indices_sub, index2send_col)
    2654          88 :       CALL get_elements2send_row(data2send_row, fm_global%matrix_struct, col_indices_sub, index2send_row)
    2655             : 
    2656             :       ! Create a map how much data has to be sent to what process coordinate, interchange rows and columns to transpose the matrices
    2657             :       ! Do the reverse for the recieve processes
    2658          88 :       CALL get_elements2send_col(data2recv_col, fm_sub%matrix_struct, row_indices_global, index2recv_col)
    2659          88 :       CALL get_elements2send_row(data2recv_row, fm_sub%matrix_struct, col_indices_global, index2recv_row)
    2660          88 :       CALL timestop(handle2)
    2661             : 
    2662          88 :       CALL timeset(routineN//"_local", handle2)
    2663             :       ! Loop over local data and transpose
    2664          88 :       prow_send = mpi2blacs_global(1, para_env%mepos)
    2665          88 :       pcol_send = mpi2blacs_global(2, para_env%mepos)
    2666          88 :       prow_recv = mpi2blacs_sub(1, para_env_sub%mepos)
    2667          88 :       pcol_recv = mpi2blacs_sub(2, para_env_sub%mepos)
    2668          88 :       elements2recv_col = data2recv_col(pcol_recv)
    2669          88 :       elements2recv_row = data2recv_row(prow_recv)
    2670             : 
    2671             : !$OMP    PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
    2672             : !$OMP                SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
    2673             : !$OMP                       index2recv_col,index2recv_row,pcol_recv,prow_recv, &
    2674          88 : !$OMP                       fm_sub,index2send_col,index2send_row,pcol_send,prow_send)
    2675             :       DO col_local = 1, elements2recv_col
    2676             :          DO row_local = 1, elements2recv_row
    2677             :             fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
    2678             :                                  index2recv_row(prow_recv)%array(row_local)) &
    2679             :                = fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
    2680             :                                    index2send_row(prow_send)%array(col_local))
    2681             :          END DO
    2682             :       END DO
    2683             : !$OMP    END PARALLEL DO
    2684          88 :       CALL timestop(handle2)
    2685             : 
    2686          88 :       IF (para_env_sub%num_pe > 1) THEN
    2687             :          size_send_buffer = 0_int_8
    2688             :          size_recv_buffer = 0_int_8
    2689             :          ! Loop over all processes in para_env_sub
    2690          48 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    2691          24 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2692          24 :             proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2693             : 
    2694          24 :             proc_send_global = subgroup2mepos(proc_send)
    2695          24 :             prow_send = mpi2blacs_global(1, proc_send_global)
    2696          24 :             pcol_send = mpi2blacs_global(2, proc_send_global)
    2697          24 :             elements2send_col = data2send_col(pcol_send)
    2698          24 :             elements2send_row = data2send_row(prow_send)
    2699             : 
    2700          24 :             size_send_buffer = MAX(size_send_buffer, INT(elements2send_col, int_8)*elements2send_row)
    2701             : 
    2702          24 :             prow_recv = mpi2blacs_sub(1, proc_recv)
    2703          24 :             pcol_recv = mpi2blacs_sub(2, proc_recv)
    2704          24 :             elements2recv_col = data2recv_col(pcol_recv)
    2705          24 :             elements2recv_row = data2recv_row(prow_recv)
    2706             : 
    2707          48 :             size_recv_buffer = MAX(size_recv_buffer, INT(elements2recv_col, int_8)*elements2recv_row)
    2708             :          END DO
    2709         120 :          ALLOCATE (send_buffer_1D(size_send_buffer), recv_buffer_1D(size_recv_buffer))
    2710             : 
    2711             :          ! Loop over all processes in para_env_sub
    2712          48 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    2713          24 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2714          24 :             proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2715             : 
    2716          24 :             proc_send_global = subgroup2mepos(proc_send)
    2717          24 :             prow_send = mpi2blacs_global(1, proc_send_global)
    2718          24 :             pcol_send = mpi2blacs_global(2, proc_send_global)
    2719          24 :             elements2send_col = data2send_col(pcol_send)
    2720          24 :             elements2send_row = data2send_row(prow_send)
    2721             : 
    2722          24 :             CALL timeset(routineN//"_pack", handle2)
    2723             :             ! Loop over local data and pack the buffer
    2724             :             ! Transpose the matrix already
    2725          24 :           send_buffer(1:elements2send_row, 1:elements2send_col) => send_buffer_1D(1:INT(elements2send_row, int_8)*elements2send_col)
    2726             : !$OMP    PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
    2727             : !$OMP                SHARED(elements2send_col,elements2send_row,send_buffer,fm_sub,&
    2728          24 : !$OMP                       index2send_col,index2send_row,pcol_send,prow_send)
    2729             :             DO row_local = 1, elements2send_col
    2730             :                DO col_local = 1, elements2send_row
    2731             :                   send_buffer(col_local, row_local) = &
    2732             :                      fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
    2733             :                                        index2send_row(prow_send)%array(col_local))
    2734             :                END DO
    2735             :             END DO
    2736             : !$OMP    END PARALLEL DO
    2737          24 :             CALL timestop(handle2)
    2738             : 
    2739          24 :             prow_recv = mpi2blacs_sub(1, proc_recv)
    2740          24 :             pcol_recv = mpi2blacs_sub(2, proc_recv)
    2741          24 :             elements2recv_col = data2recv_col(pcol_recv)
    2742          24 :             elements2recv_row = data2recv_row(prow_recv)
    2743             : 
    2744             :             ! Send data
    2745          24 :           recv_buffer(1:elements2recv_col, 1:elements2recv_row) => recv_buffer_1D(1:INT(elements2recv_row, int_8)*elements2recv_col)
    2746         120 :             IF (SIZE(recv_buffer) > 0_int_8) THEN
    2747          72 :             IF (SIZE(send_buffer) > 0_int_8) THEN
    2748       81072 :                CALL para_env_sub%sendrecv(send_buffer, proc_send, recv_buffer, proc_recv, tag)
    2749             :             ELSE
    2750           0 :                CALL para_env_sub%recv(recv_buffer, proc_recv, tag)
    2751             :             END IF
    2752             : 
    2753          24 :             CALL timeset(routineN//"_unpack", handle2)
    2754             : !$OMP    PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
    2755             : !$OMP                SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
    2756          24 : !$OMP                       index2recv_col,index2recv_row,pcol_recv,prow_recv)
    2757             :             DO col_local = 1, elements2recv_col
    2758             :                DO row_local = 1, elements2recv_row
    2759             :                   fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
    2760             :                                        index2recv_row(prow_recv)%array(row_local)) &
    2761             :                      = recv_buffer(col_local, row_local)
    2762             :                END DO
    2763             :             END DO
    2764             : !$OMP    END PARALLEL DO
    2765          24 :             CALL timestop(handle2)
    2766           0 :             ELSE IF (SIZE(send_buffer) > 0_int_8) THEN
    2767           0 :             CALL para_env_sub%send(send_buffer, proc_send, tag)
    2768             :             END IF
    2769             :          END DO
    2770             :       END IF
    2771             : 
    2772          88 :       DEALLOCATE (data2send_col, data2send_row, data2recv_col, data2recv_row)
    2773         176 :       DO proc_shift = 0, npcol_global - 1
    2774         176 :          DEALLOCATE (index2send_col(proc_shift)%array)
    2775             :       END DO
    2776         176 :       DO proc_shift = 0, npcol_sub - 1
    2777         176 :          DEALLOCATE (index2recv_col(proc_shift)%array)
    2778             :       END DO
    2779         264 :       DO proc_shift = 0, nprow_global - 1
    2780         264 :          DEALLOCATE (index2send_row(proc_shift)%array)
    2781             :       END DO
    2782         200 :       DO proc_shift = 0, nprow_sub - 1
    2783         200 :          DEALLOCATE (index2recv_row(proc_shift)%array)
    2784             :       END DO
    2785         664 :       DEALLOCATE (index2send_col, index2recv_col, index2send_row, index2recv_row)
    2786             : 
    2787          88 :       CALL cp_fm_release(fm_sub)
    2788             : 
    2789          88 :       CALL timestop(handle)
    2790             : 
    2791         440 :    END SUBROUTINE dereplicate_and_sum_fm
    2792             : 
    2793             : ! **************************************************************************************************
    2794             : !> \brief ...
    2795             : !> \param data2send ...
    2796             : !> \param struct_global ...
    2797             : !> \param indices_sub ...
    2798             : !> \param index2send ...
    2799             : ! **************************************************************************************************
    2800         176 :    SUBROUTINE get_elements2send_col(data2send, struct_global, indices_sub, index2send)
    2801             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: data2send
    2802             :       TYPE(cp_fm_struct_type), INTENT(INOUT)             :: struct_global
    2803             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: indices_sub
    2804             :       TYPE(one_dim_int_array), ALLOCATABLE, &
    2805             :          DIMENSION(:), INTENT(OUT)                       :: index2send
    2806             : 
    2807             :       INTEGER                                            :: i_global, i_local, np_global, proc
    2808             : 
    2809         176 :       CALL struct_global%context%get(number_of_process_rows=np_global)
    2810             : 
    2811         528 :       ALLOCATE (data2send(0:np_global - 1))
    2812         464 :       data2send = 0
    2813       10436 :       DO i_local = 1, SIZE(indices_sub)
    2814       10260 :          i_global = indices_sub(i_local)
    2815       10260 :          proc = struct_global%g2p_col(i_global)
    2816       10436 :          data2send(proc) = data2send(proc) + 1
    2817             :       END DO
    2818             : 
    2819         816 :       ALLOCATE (index2send(0:np_global - 1))
    2820         464 :       DO proc = 0, np_global - 1
    2821         752 :          ALLOCATE (index2send(proc)%array(data2send(proc)))
    2822             :          ! We want to crash if there is an error
    2823       10724 :          index2send(proc)%array = -1
    2824             :       END DO
    2825             : 
    2826         464 :       data2send = 0
    2827       10436 :       DO i_local = 1, SIZE(indices_sub)
    2828       10260 :          i_global = indices_sub(i_local)
    2829       10260 :          proc = struct_global%g2p_col(i_global)
    2830       10260 :          data2send(proc) = data2send(proc) + 1
    2831       10436 :          index2send(proc)%array(data2send(proc)) = i_local
    2832             :       END DO
    2833             : 
    2834         176 :    END SUBROUTINE
    2835             : 
    2836             : ! **************************************************************************************************
    2837             : !> \brief ...
    2838             : !> \param data2send ...
    2839             : !> \param struct_global ...
    2840             : !> \param indices_sub ...
    2841             : !> \param index2send ...
    2842             : ! **************************************************************************************************
    2843         176 :    SUBROUTINE get_elements2send_row(data2send, struct_global, indices_sub, index2send)
    2844             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: data2send
    2845             :       TYPE(cp_fm_struct_type), INTENT(INOUT)             :: struct_global
    2846             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: indices_sub
    2847             :       TYPE(one_dim_int_array), ALLOCATABLE, &
    2848             :          DIMENSION(:), INTENT(OUT)                       :: index2send
    2849             : 
    2850             :       INTEGER                                            :: i_global, i_local, np_global, proc
    2851             : 
    2852         176 :       CALL struct_global%context%get(number_of_process_rows=np_global)
    2853             : 
    2854         528 :       ALLOCATE (data2send(0:np_global - 1))
    2855         464 :       data2send = 0
    2856       15048 :       DO i_local = 1, SIZE(indices_sub)
    2857       14872 :          i_global = indices_sub(i_local)
    2858       14872 :          proc = struct_global%g2p_row(i_global)
    2859       15048 :          data2send(proc) = data2send(proc) + 1
    2860             :       END DO
    2861             : 
    2862         816 :       ALLOCATE (index2send(0:np_global - 1))
    2863         464 :       DO proc = 0, np_global - 1
    2864         864 :          ALLOCATE (index2send(proc)%array(data2send(proc)))
    2865             :          ! We want to crash if there is an error
    2866       15336 :          index2send(proc)%array = -1
    2867             :       END DO
    2868             : 
    2869         464 :       data2send = 0
    2870       15048 :       DO i_local = 1, SIZE(indices_sub)
    2871       14872 :          i_global = indices_sub(i_local)
    2872       14872 :          proc = struct_global%g2p_row(i_global)
    2873       14872 :          data2send(proc) = data2send(proc) + 1
    2874       15048 :          index2send(proc)%array(data2send(proc)) = i_local
    2875             :       END DO
    2876             : 
    2877         176 :    END SUBROUTINE
    2878             : 
    2879           0 : END MODULE rpa_grad

Generated by: LCOV version 1.15