LCOV - code coverage report
Current view: top level - src - mp2_ri_gpw.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:b8e0b09) Lines: 1053 1078 97.7 %
Date: 2024-08-31 06:31:37 Functions: 14 14 100.0 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief Routines to calculate RI-GPW-MP2 energy using pw
      10             : !> \par History
      11             : !>      06.2012 created [Mauro Del Ben]
      12             : !>      03.2019 Refactored from mp2_ri_gpw [Frederick Stein]
      13             : ! **************************************************************************************************
      14             : MODULE mp2_ri_gpw
      15             :    USE cp_log_handling,                 ONLY: cp_to_string
      16             :    USE dgemm_counter_types,             ONLY: dgemm_counter_init,&
      17             :                                               dgemm_counter_start,&
      18             :                                               dgemm_counter_stop,&
      19             :                                               dgemm_counter_type,&
      20             :                                               dgemm_counter_write
      21             :    USE group_dist_types,                ONLY: get_group_dist,&
      22             :                                               group_dist_d1_type,&
      23             :                                               maxsize,&
      24             :                                               release_group_dist
      25             :    USE kinds,                           ONLY: dp,&
      26             :                                               int_8
      27             :    USE libint_2c_3c,                    ONLY: compare_potential_types
      28             :    USE local_gemm_api,                  ONLY: LOCAL_GEMM_PU_GPU
      29             :    USE machine,                         ONLY: m_flush,&
      30             :                                               m_memory,&
      31             :                                               m_walltime
      32             :    USE message_passing,                 ONLY: mp_comm_type,&
      33             :                                               mp_para_env_type
      34             :    USE mp2_ri_grad_util,                ONLY: complete_gamma
      35             :    USE mp2_types,                       ONLY: mp2_type,&
      36             :                                               three_dim_real_array
      37             : 
      38             : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num
      39             : #include "./base/base_uses.f90"
      40             : 
      41             :    IMPLICIT NONE
      42             : 
      43             :    PRIVATE
      44             : 
      45             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_gpw'
      46             : 
      47             :    PUBLIC :: mp2_ri_gpw_compute_en
      48             : 
      49             : CONTAINS
      50             : 
      51             : ! **************************************************************************************************
      52             : !> \brief ...
      53             : !> \param Emp2_Cou ...
      54             : !> \param Emp2_EX ...
      55             : !> \param Emp2_S ...
      56             : !> \param Emp2_T ...
      57             : !> \param BIb_C ...
      58             : !> \param mp2_env ...
      59             : !> \param para_env ...
      60             : !> \param para_env_sub ...
      61             : !> \param color_sub ...
      62             : !> \param gd_array ...
      63             : !> \param gd_B_virtual ...
      64             : !> \param Eigenval ...
      65             : !> \param nmo ...
      66             : !> \param homo ...
      67             : !> \param dimen_RI ...
      68             : !> \param unit_nr ...
      69             : !> \param calc_forces ...
      70             : !> \param calc_ex ...
      71             : ! **************************************************************************************************
      72        1038 :    SUBROUTINE mp2_ri_gpw_compute_en(Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
      73         346 :                                     gd_array, gd_B_virtual, &
      74         346 :                                     Eigenval, nmo, homo, dimen_RI, unit_nr, calc_forces, calc_ex)
      75             :       REAL(KIND=dp), INTENT(INOUT)                       :: Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T
      76             :       TYPE(three_dim_real_array), DIMENSION(:), &
      77             :          INTENT(INOUT)                                   :: BIb_C
      78             :       TYPE(mp2_type)                                     :: mp2_env
      79             :       TYPE(mp_para_env_type), INTENT(IN), POINTER        :: para_env, para_env_sub
      80             :       INTEGER, INTENT(IN)                                :: color_sub
      81             :       TYPE(group_dist_d1_type), INTENT(INOUT)            :: gd_array
      82             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
      83             :       INTEGER, INTENT(IN)                                :: nmo
      84             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
      85             :       TYPE(group_dist_d1_type), DIMENSION(SIZE(homo)), &
      86             :          INTENT(INOUT)                                   :: gd_B_virtual
      87             :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
      88             :       LOGICAL, INTENT(IN)                                :: calc_forces, calc_ex
      89             : 
      90             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_gpw_compute_en'
      91             : 
      92             :       INTEGER :: a, a_global, b, b_global, block_size, decil, end_point, handle, handle2, handle3, &
      93             :          iiB, ij_counter, ij_counter_send, ij_index, integ_group_size, ispin, jjB, jspin, &
      94             :          max_ij_pairs, my_block_size, my_group_L_end, my_group_L_size, my_group_L_size_orig, &
      95             :          my_group_L_start, my_i, my_ij_pairs, my_j, my_new_group_L_size, ngroup, nspins, &
      96             :          num_integ_group, proc_receive, proc_send, proc_shift, rec_B_size, rec_B_virtual_end, &
      97             :          rec_B_virtual_start, rec_L_size, send_B_size, send_B_virtual_end, send_B_virtual_start, &
      98             :          send_i, send_ij_index, send_j, start_point, tag, total_ij_pairs
      99         346 :       INTEGER, ALLOCATABLE, DIMENSION(:) :: integ_group_pos2color_sub, my_B_size, &
     100         346 :          my_B_virtual_end, my_B_virtual_start, num_ij_pairs, sizes_array_orig, virtual
     101         346 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: ij_map
     102         346 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: ranges_info_array
     103             :       LOGICAL                                            :: my_alpha_beta_case, my_beta_beta_case, &
     104             :                                                             my_open_shell_SS
     105             :       REAL(KIND=dp)                                      :: amp_fac, my_Emp2_Cou, my_Emp2_EX, &
     106             :                                                             sym_fac, t_new, t_start
     107         346 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET   :: buffer_1D
     108             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     109         346 :          TARGET                                          :: local_ab, local_ba, t_ab
     110             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     111         346 :          TARGET                                          :: local_i_aL, local_j_aL, Y_i_aP, Y_j_aP
     112             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
     113         346 :          POINTER                                         :: external_ab, external_i_aL
     114             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
     115         346 :          POINTER                                         :: BI_C_rec
     116             :       TYPE(dgemm_counter_type)                           :: dgemm_counter
     117             :       TYPE(mp_comm_type)                                 :: comm_exchange, comm_rep
     118             :       TYPE(three_dim_real_array), ALLOCATABLE, &
     119         346 :          DIMENSION(:)                                    :: B_ia_Q
     120             : 
     121         346 :       CALL timeset(routineN, handle)
     122             : 
     123         346 :       nspins = SIZE(homo)
     124             : 
     125        1038 :       ALLOCATE (virtual(nspins))
     126         778 :       virtual(:) = nmo - homo(:)
     127             : 
     128        1384 :       ALLOCATE (my_B_size(nspins), my_B_virtual_start(nspins), my_B_virtual_end(nspins))
     129         778 :       DO ispin = 1, nspins
     130             :          CALL get_group_dist(gd_B_virtual(ispin), para_env_sub%mepos, &
     131         778 :                              my_B_virtual_start(ispin), my_B_virtual_end(ispin), my_B_size(ispin))
     132             :       END DO
     133             : 
     134         346 :       CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
     135             : 
     136         346 :       CALL dgemm_counter_init(dgemm_counter, unit_nr, mp2_env%ri_mp2%print_dgemm_info)
     137             : 
     138             :       ! local_gemm_ctx has a very large footprint the first time this routine is
     139             :       ! called.
     140         346 :       CALL mp2_env%local_gemm_ctx%create(LOCAL_GEMM_PU_GPU)
     141         346 :       CALL mp2_env%local_gemm_ctx%set_op_threshold_gpu(128*128*128*2)
     142             : 
     143             :       CALL mp2_ri_get_integ_group_size( &
     144             :          mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
     145             :          homo, dimen_RI, unit_nr, &
     146             :          integ_group_size, ngroup, &
     147         346 :          num_integ_group, virtual, calc_forces)
     148             : 
     149             :       ! now create a group that contains all the proc that have the same virtual starting point
     150             :       ! in the integ group
     151             :       CALL mp2_ri_create_group( &
     152             :          para_env, para_env_sub, color_sub, &
     153             :          gd_array%sizes, calc_forces, &
     154             :          integ_group_size, my_group_L_end, &
     155             :          my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
     156             :          integ_group_pos2color_sub, sizes_array_orig, &
     157         346 :          ranges_info_array, comm_exchange, comm_rep, num_integ_group)
     158             : 
     159             :       ! We cannot fix the tag because of the recv routine
     160         346 :       tag = 42
     161             : 
     162         778 :       DO jspin = 1, nspins
     163             : 
     164             :          CALL replicate_iaK_2intgroup(BIb_C(jspin)%array, comm_exchange, comm_rep, &
     165             :                                       homo(jspin), gd_array%sizes, my_B_size(jspin), &
     166         432 :                                       my_group_L_size, ranges_info_array)
     167             : 
     168        1296 :          DO ispin = 1, jspin
     169             : 
     170         518 :             IF (unit_nr > 0) THEN
     171         259 :                IF (nspins == 1) THEN
     172         130 :                   WRITE (unit_nr, *) "Start loop run"
     173         129 :                ELSE IF (ispin == 1 .AND. jspin == 1) THEN
     174          43 :                   WRITE (unit_nr, *) "Start loop run alpha-alpha"
     175          86 :                ELSE IF (ispin == 1 .AND. jspin == 2) THEN
     176          43 :                   WRITE (unit_nr, *) "Start loop run alpha-beta"
     177          43 :                ELSE IF (ispin == 2 .AND. jspin == 2) THEN
     178          43 :                   WRITE (unit_nr, *) "Start loop run beta-beta"
     179             :                END IF
     180         259 :                CALL m_flush(unit_nr)
     181             :             END IF
     182             : 
     183         518 :             my_open_shell_SS = (nspins == 2) .AND. (ispin == jspin)
     184             : 
     185             :             ! t_ab = amp_fac*(:,a|:,b)-(:,b|:,a)
     186             :             ! If we calculate the gradient we need to distinguish
     187             :             ! between alpha-alpha and beta-beta cases for UMP2
     188             : 
     189         518 :             my_beta_beta_case = .FALSE.
     190         518 :             my_alpha_beta_case = .FALSE.
     191         518 :             IF (ispin /= jspin) THEN
     192          86 :                my_alpha_beta_case = .TRUE.
     193         432 :             ELSE IF (my_open_shell_SS) THEN
     194         172 :                IF (ispin == 2) my_beta_beta_case = .TRUE.
     195             :             END IF
     196             : 
     197         518 :             amp_fac = mp2_env%scale_S + mp2_env%scale_T
     198         518 :             IF (my_alpha_beta_case .OR. my_open_shell_SS) amp_fac = mp2_env%scale_T
     199             : 
     200             :             CALL mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
     201         518 :                                         my_group_L_size, calc_forces, ispin, jspin, local_ba)
     202             : 
     203             :             CALL mp2_ri_get_block_size( &
     204             :                mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual(ispin:jspin), &
     205             :                homo(ispin:jspin), virtual(ispin:jspin), dimen_RI, unit_nr, block_size, &
     206         518 :                ngroup, num_integ_group, my_open_shell_ss, calc_forces, buffer_1D)
     207             : 
     208             :             ! *****************************************************************
     209             :             ! **********  REPLICATION-BLOCKED COMMUNICATION SCHEME  ***********
     210             :             ! *****************************************************************
     211             :             ! introduce block size, the number of occupied orbitals has to be a
     212             :             ! multiple of the block size
     213             : 
     214             :             ! Calculate the maximum number of ij pairs that have to be computed
     215             :             ! among groups
     216             :             CALL mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo(ispin), homo(jspin), &
     217         518 :                                       block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
     218             : 
     219        1554 :             ALLOCATE (num_ij_pairs(0:comm_exchange%num_pe - 1))
     220         518 :             CALL comm_exchange%allgather(my_ij_pairs, num_ij_pairs)
     221             : 
     222        1146 :             max_ij_pairs = MAXVAL(num_ij_pairs)
     223             : 
     224             :             ! start real stuff
     225             :             CALL mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, local_i_aL, &
     226         518 :                                      local_j_aL, calc_forces, Y_i_aP, Y_j_aP, ispin, jspin)
     227             : 
     228         518 :             CALL timeset(routineN//"_RI_loop", handle2)
     229         518 :             my_Emp2_Cou = 0.0_dp
     230         518 :             my_Emp2_EX = 0.0_dp
     231         518 :             t_start = m_walltime()
     232        2509 :             DO ij_index = 1, max_ij_pairs
     233             : 
     234             :                ! Prediction is unreliable if we are in the first step of the loop
     235        1991 :                IF (unit_nr > 0 .AND. ij_index > 1) THEN
     236         723 :                   decil = ij_index*10/max_ij_pairs
     237         723 :                   IF (decil /= (ij_index - 1)*10/max_ij_pairs) THEN
     238         682 :                      t_new = m_walltime()
     239         682 :                      t_new = (t_new - t_start)/60.0_dp*(max_ij_pairs - ij_index + 1)/(ij_index - 1)
     240             :                      WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
     241         682 :                         cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
     242         682 :                      CALL m_flush(unit_nr)
     243             :                   END IF
     244             :                END IF
     245             : 
     246        1991 :                IF (calc_forces) THEN
     247     2791144 :                   Y_i_aP = 0.0_dp
     248     2891166 :                   Y_j_aP = 0.0_dp
     249             :                END IF
     250             : 
     251        1991 :                IF (ij_index <= my_ij_pairs) THEN
     252             :                   ! We have work to do
     253        1942 :                   ij_counter = (ij_index - MIN(1, color_sub))*ngroup + color_sub
     254        1942 :                   my_i = ij_map(1, ij_counter)
     255        1942 :                   my_j = ij_map(2, ij_counter)
     256        1942 :                   my_block_size = ij_map(3, ij_counter)
     257             : 
     258     3277655 :                   local_i_aL = 0.0_dp
     259             :                   CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
     260        1942 :                                        BIb_C(ispin)%array(:, :, my_i:my_i + my_block_size - 1))
     261             : 
     262     3391073 :                   local_j_aL = 0.0_dp
     263             :                   CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
     264        1942 :                                        BIb_C(jspin)%array(:, :, my_j:my_j + my_block_size - 1))
     265             : 
     266             :                   ! collect data from other proc
     267        1942 :                   CALL timeset(routineN//"_comm", handle3)
     268        2051 :                   DO proc_shift = 1, comm_exchange%num_pe - 1
     269         109 :                      proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
     270         109 :                      proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
     271             : 
     272         109 :                      send_ij_index = num_ij_pairs(proc_send)
     273             : 
     274         109 :                      CALL get_group_dist(gd_array, proc_receive, sizes=rec_L_size)
     275             : 
     276        2051 :                      IF (ij_index <= send_ij_index) THEN
     277             :                         ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
     278          60 :                                           integ_group_pos2color_sub(proc_send)
     279          60 :                         send_i = ij_map(1, ij_counter_send)
     280          60 :                         send_j = ij_map(2, ij_counter_send)
     281             : 
     282             :                         ! occupied i
     283             :                         BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
     284          60 :                            buffer_1D(1:rec_L_size*my_B_size(ispin)*my_block_size)
     285       43353 :                         BI_C_rec = 0.0_dp
     286             :                         CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
     287          60 :                                                     proc_send, BI_C_rec, proc_receive, tag)
     288             : 
     289             :                         CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     290          60 :                                              BI_C_rec(:, 1:my_B_size(ispin), :))
     291             : 
     292             :                         ! occupied j
     293             :                         BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
     294          60 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
     295       44373 :                         BI_C_rec = 0.0_dp
     296             :                         CALL comm_exchange%sendrecv(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
     297          60 :                                                     proc_send, BI_C_rec, proc_receive, tag)
     298             : 
     299             :                         CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     300          60 :                                              BI_C_rec(:, 1:my_B_size(jspin), :))
     301             : 
     302             :                      ELSE
     303             :                         ! we send nothing while we know that we have to receive something
     304             : 
     305             :                         ! occupied i
     306             :                         BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
     307          49 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(ispin)*my_block_size)
     308        9124 :                         BI_C_rec = 0.0_dp
     309          49 :                         CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
     310             : 
     311             :                         CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     312          49 :                                              BI_C_rec(:, 1:my_B_size(ispin), 1:my_block_size))
     313             : 
     314             :                         ! occupied j
     315             :                         BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
     316          49 :                            buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
     317        9124 :                         BI_C_rec = 0.0_dp
     318          49 :                         CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
     319             : 
     320             :                         CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
     321          49 :                                              BI_C_rec(:, 1:my_B_size(jspin), 1:my_block_size))
     322             : 
     323             :                      END IF
     324             : 
     325             :                   END DO
     326             : 
     327        1942 :                   CALL timestop(handle3)
     328             : 
     329             :                   ! loop over the block elements
     330        3888 :                   DO iiB = 1, my_block_size
     331        5842 :                      DO jjB = 1, my_block_size
     332        1954 :                         CALL timeset(routineN//"_expansion", handle3)
     333        3900 :                         ASSOCIATE (my_local_i_aL => local_i_aL(:, :, iiB), my_local_j_aL => local_j_aL(:, :, jjB))
     334             : 
     335             :                            ! calculate the integrals (ia|jb) starting from my local data ...
     336      578764 :                            local_ab = 0.0_dp
     337        1954 :                            IF ((my_alpha_beta_case) .AND. (calc_forces)) THEN
     338      139874 :                               local_ba = 0.0_dp
     339             :                            END IF
     340        1954 :                            CALL dgemm_counter_start(dgemm_counter)
     341             :                            CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(jspin), dimen_RI, 1.0_dp, &
     342             :                                                             my_local_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
     343             :                                                            0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), &
     344        1954 :                                                             my_B_size(ispin))
     345             :                            ! Additional integrals only for alpha_beta case and forces
     346        1954 :                            IF (my_alpha_beta_case .AND. calc_forces) THEN
     347             :                               local_ba(my_B_virtual_start(jspin):my_B_virtual_end(jspin), :) = &
     348      132909 :                                  TRANSPOSE(local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :))
     349             :                            END IF
     350             :                            ! ... and from the other processes of my subgroup
     351        2228 :                            DO proc_shift = 1, para_env_sub%num_pe - 1
     352         274 :                               proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
     353         274 :                               proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
     354             : 
     355             :                               CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, &
     356         274 :                                                   rec_B_virtual_end, rec_B_size)
     357             : 
     358         274 :                               external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
     359      267944 :                               external_i_aL = 0.0_dp
     360             : 
     361             :                               CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
     362         274 :                                                          external_i_aL, proc_receive, tag)
     363             : 
     364             :                               CALL mp2_env%local_gemm_ctx%gemm( &
     365             :                                  'T', 'N', rec_B_size, my_B_size(jspin), dimen_RI, 1.0_dp, &
     366             :                                  external_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
     367         274 :                                  0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(jspin)), rec_B_size)
     368             : 
     369             :                               ! Additional integrals only for alpha_beta case and forces
     370        2502 :                               IF (my_alpha_beta_case .AND. calc_forces) THEN
     371             : 
     372             :                                  CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, &
     373          70 :                                                      rec_B_virtual_end, rec_B_size)
     374             : 
     375          70 :                                  external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
     376       81655 :                                  external_i_aL = 0.0_dp
     377             : 
     378             :                                  CALL para_env_sub%sendrecv(my_local_j_aL, proc_send, &
     379          70 :                                                             external_i_aL, proc_receive, tag)
     380             : 
     381             :                                  CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, my_B_size(ispin), dimen_RI, 1.0_dp, &
     382             :                                                                   external_i_aL, dimen_RI, my_local_i_aL, dimen_RI, &
     383          70 :                                             0.0_dp, local_ba(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(ispin)), rec_B_size)
     384             :                               END IF
     385             :                            END DO
     386        1954 :                            IF (my_alpha_beta_case .AND. calc_forces) THEN
     387             :                               ! This is only an approximation because the call does not allow the exact value, which ought to be (virtual_i*B_size_j+virtual_j*B_size_i)*dimen_RI
     388         490 :                               CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(ispin) + my_B_size(jspin), dimen_RI)
     389             :                            ELSE
     390        1464 :                               CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(jspin), dimen_RI)
     391             :                            END IF
     392        1954 :                            CALL timestop(handle3)
     393             : 
     394             :                            !sample peak memory
     395        1954 :                            CALL m_memory()
     396             : 
     397        1954 :                            CALL timeset(routineN//"_ener", handle3)
     398             :                            ! calculate coulomb only MP2
     399        1954 :                            sym_fac = 2.0_dp
     400        1954 :                            IF (my_i == my_j) sym_fac = 1.0_dp
     401        1954 :                            IF (my_alpha_beta_case) sym_fac = 0.5_dp
     402       30429 :                            DO b = 1, my_B_size(jspin)
     403       28475 :                               b_global = b + my_B_virtual_start(jspin) - 1
     404      578764 :                               DO a = 1, virtual(ispin)
     405             :                                  my_Emp2_Cou = my_Emp2_Cou - sym_fac*2.0_dp*local_ab(a, b)**2/ &
     406             :                                                (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
     407      576810 :                                                 Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
     408             :                               END DO
     409             :                            END DO
     410             : 
     411        1954 :                            IF (calc_ex) THEN
     412             :                               ! contract integrals with orbital energies for exchange MP2 energy
     413             :                               ! starting with local ...
     414      323406 :                               IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) t_ab = 0.0_dp
     415       29841 :                               DO b = 1, my_B_size(ispin)
     416       27887 :                                  b_global = b + my_B_virtual_start(ispin) - 1
     417      541500 :                                  DO a = 1, my_B_size(ispin)
     418      511659 :                                     a_global = a + my_B_virtual_start(ispin) - 1
     419             :                                     my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*local_ab(b_global, a)/ &
     420             :                                               (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
     421      511659 :                                                   Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     422      539546 :                                     IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) THEN
     423             :                                      t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
     424             :                                                            (Eigenval(homo(ispin) + a_global, ispin) + &
     425             :                                                             Eigenval(homo(ispin) + b_global, ispin) - &
     426      295758 :                                                             Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     427             :                                     END IF
     428             :                                  END DO
     429             :                               END DO
     430             :                               ! ... and then with external data
     431        2228 :                               DO proc_shift = 1, para_env_sub%num_pe - 1
     432         274 :                                  proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
     433         274 :                                  proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
     434             : 
     435             :                                  CALL get_group_dist(gd_B_virtual(ispin), proc_receive, &
     436         274 :                                                      rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
     437             :                                  CALL get_group_dist(gd_B_virtual(ispin), proc_send, &
     438         274 :                                                      send_B_virtual_start, send_B_virtual_end, send_B_size)
     439             : 
     440             :                                  external_ab(1:my_B_size(ispin), 1:rec_B_size) => &
     441         274 :                                     buffer_1D(1:INT(rec_B_size, int_8)*my_B_size(ispin))
     442       30405 :                                  external_ab = 0.0_dp
     443             : 
     444             :                       CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:my_B_size(ispin)), proc_send, &
     445       60536 :                                                             external_ab(1:my_B_size(ispin), 1:rec_B_size), proc_receive, tag)
     446             : 
     447        5233 :                                  DO b = 1, my_B_size(ispin)
     448        2731 :                                     b_global = b + my_B_virtual_start(ispin) - 1
     449       30405 :                                     DO a = 1, rec_B_size
     450       27400 :                                        a_global = a + rec_B_virtual_start - 1
     451             :                                        my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*external_ab(b, a)/ &
     452             :                                               (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
     453       27400 :                                                      Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     454       27400 :                                        IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) &
     455             :                                          t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
     456             :                                                               (Eigenval(homo(ispin) + a_global, ispin) + &
     457             :                                                                Eigenval(homo(ispin) + b_global, ispin) - &
     458       12311 :                                                                Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
     459             :                                     END DO
     460             :                                  END DO
     461             :                               END DO
     462             :                            END IF
     463        1954 :                            CALL timestop(handle3)
     464             : 
     465        3908 :                            IF (calc_forces) THEN
     466             :                               ! update P_ab, Gamma_P_ia
     467             :                               CALL mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
     468             :                                                       Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
     469             :                                                       my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, &
     470             :                                                       local_ab, t_ab, my_local_i_aL, my_local_j_aL, &
     471             :                                                       my_open_shell_ss, Y_i_aP(:, :, iiB), Y_j_aP(:, :, jjB), local_ba, &
     472        1569 :                                                       ispin, jspin, dgemm_counter, buffer_1D)
     473             : 
     474             :                            END IF
     475             : 
     476             :                         END ASSOCIATE
     477             : 
     478             :                      END DO ! jjB
     479             :                   END DO ! iiB
     480             : 
     481             :                ELSE
     482             :                   ! We need it later in case of gradients
     483          49 :                   my_block_size = 1
     484             : 
     485          49 :                   CALL timeset(routineN//"_comm", handle3)
     486             :                   ! No work to do and we know that we have to receive nothing, but send something
     487             :                   ! send data to other proc
     488          98 :                   DO proc_shift = 1, comm_exchange%num_pe - 1
     489          49 :                      proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
     490          49 :                      proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
     491             : 
     492          49 :                      send_ij_index = num_ij_pairs(proc_send)
     493             : 
     494          98 :                      IF (ij_index <= send_ij_index) THEN
     495             :                         ! something to send
     496             :                         ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
     497          49 :                                           integ_group_pos2color_sub(proc_send)
     498          49 :                         send_i = ij_map(1, ij_counter_send)
     499          49 :                         send_j = ij_map(2, ij_counter_send)
     500             : 
     501             :                         ! occupied i
     502             :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
     503          49 :                                                 proc_send, tag)
     504             :                         ! occupied j
     505             :                         CALL comm_exchange%send(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
     506          49 :                                                 proc_send, tag)
     507             :                      END IF
     508             :                   END DO
     509          49 :                   CALL timestop(handle3)
     510             :                END IF
     511             : 
     512             :                ! redistribute gamma
     513        2509 :                IF (calc_forces) THEN
     514             :                   CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(ispin)%array, ij_index, my_B_size(ispin), &
     515             :                                               my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
     516             :                                               num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
     517             :                                               ij_map, ranges_info_array, Y_i_aP(:, :, 1:my_block_size), comm_exchange, &
     518        1566 :                                               gd_array%sizes, 1, buffer_1D)
     519             :                   CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(jspin)%array, ij_index, my_B_size(jspin), &
     520             :                                               my_block_size, my_group_L_size, my_j, my_ij_pairs, ngroup, &
     521             :                                               num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
     522             :                                               ij_map, ranges_info_array, Y_j_aP(:, :, 1:my_block_size), comm_exchange, &
     523        1566 :                                               gd_array%sizes, 2, buffer_1D)
     524             :                END IF
     525             : 
     526             :             END DO
     527         518 :             CALL timestop(handle2)
     528             : 
     529         518 :             DEALLOCATE (local_i_aL)
     530         518 :             DEALLOCATE (local_j_aL)
     531         518 :             DEALLOCATE (ij_map)
     532         518 :             DEALLOCATE (num_ij_pairs)
     533         518 :             DEALLOCATE (local_ab)
     534             : 
     535         518 :             IF (calc_forces) THEN
     536         372 :                DEALLOCATE (Y_i_aP)
     537         372 :                DEALLOCATE (Y_j_aP)
     538         372 :                IF (ALLOCATED(t_ab)) THEN
     539         296 :                   DEALLOCATE (t_ab)
     540             :                END IF
     541         372 :                DEALLOCATE (local_ba)
     542             : 
     543             :                ! here we check if there are almost degenerate ij
     544             :                ! pairs and we update P_ij with these contributions.
     545             :                ! If all pairs are degenerate with each other this step will scale O(N^6),
     546             :                ! if the number of degenerate pairs scales linearly with the system size
     547             :                ! this step will scale O(N^5).
     548             :                ! Start counting the number of almost degenerate ij pairs according
     549             :                ! to eps_canonical
     550             :                CALL quasi_degenerate_P_ij( &
     551             :                   mp2_env, Eigenval(:, ispin:jspin), homo(ispin:jspin), virtual(ispin:jspin), my_open_shell_ss, &
     552             :                   my_beta_beta_case, Bib_C(ispin:jspin), unit_nr, dimen_RI, &
     553             :                   my_B_size(ispin:jspin), ngroup, my_group_L_size, &
     554             :                   color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
     555             :                   my_B_virtual_start(ispin:jspin), my_B_virtual_end(ispin:jspin), gd_array%sizes, gd_B_virtual(ispin:jspin), &
     556         372 :                   integ_group_pos2color_sub, dgemm_counter, buffer_1D)
     557             : 
     558             :             END IF
     559             : 
     560         518 :             DEALLOCATE (buffer_1D)
     561             : 
     562             :             ! Dereplicate BIb_C and Gamma_P_ia to save memory
     563             :             ! These matrices will not be needed in that fashion anymore
     564             :             ! B_ia_Q will be needed later
     565         518 :             IF (calc_forces .AND. jspin == nspins) THEN
     566        1032 :                IF (.NOT. ALLOCATED(B_ia_Q)) ALLOCATE (B_ia_Q(nspins))
     567        1480 :                ALLOCATE (B_ia_Q(ispin)%array(homo(ispin), my_B_size(ispin), my_group_L_size_orig))
     568      888463 :                B_ia_Q(ispin)%array = 0.0_dp
     569        1360 :                DO jjB = 1, homo(ispin)
     570       16940 :                   DO iiB = 1, my_B_size(ispin)
     571             :                      B_ia_Q(ispin)%array(jjB, iiB, 1:my_group_L_size_orig) = &
     572      709332 :                         BIb_C(ispin)%array(1:my_group_L_size_orig, iiB, jjB)
     573             :                   END DO
     574             :                END DO
     575         296 :                DEALLOCATE (BIb_C(ispin)%array)
     576             : 
     577             :                ! sum Gamma and dereplicate
     578        1480 :                ALLOCATE (BIb_C(ispin)%array(my_B_size(ispin), homo(ispin), my_group_L_size_orig))
     579         562 :                DO proc_shift = 1, comm_rep%num_pe - 1
     580             :                   ! invert order
     581         266 :                   proc_send = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
     582         266 :                   proc_receive = MODULO(comm_rep%mepos + proc_shift, comm_rep%num_pe)
     583             : 
     584         266 :                   start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
     585         266 :                   end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
     586             : 
     587             :                   CALL comm_rep%sendrecv(mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, start_point:end_point), &
     588         266 :                                          proc_send, BIb_C(ispin)%array, proc_receive, tag)
     589             : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     590         562 : !$OMP          SHARED(mp2_env,BIb_C,ispin,homo,my_B_size,my_group_L_size_orig)
     591             :                   mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) = &
     592             :                      mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) &
     593             :                      + BIb_C(ispin)%array(:, :, :)
     594             : !$OMP END PARALLEL WORKSHARE
     595             :                END DO
     596             : 
     597      753612 :                BIb_C(ispin)%array(:, :, :) = mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig)
     598         296 :                DEALLOCATE (mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
     599         296 :                CALL MOVE_ALLOC(BIb_C(ispin)%array, mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
     600         222 :             ELSE IF (jspin == nspins) THEN
     601         136 :                DEALLOCATE (BIb_C(ispin)%array)
     602             :             END IF
     603             : 
     604         518 :             CALL para_env%sum(my_Emp2_Cou)
     605         518 :             CALL para_env%sum(my_Emp2_Ex)
     606             : 
     607        1986 :             IF (my_open_shell_SS .OR. my_alpha_beta_case) THEN
     608         258 :                IF (my_alpha_beta_case) THEN
     609          86 :                   Emp2_S = Emp2_S + my_Emp2_Cou
     610          86 :                   Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     611             :                ELSE
     612         172 :                   my_Emp2_Cou = my_Emp2_Cou*0.25_dp
     613         172 :                   my_Emp2_EX = my_Emp2_EX*0.5_dp
     614         172 :                   Emp2_T = Emp2_T + my_Emp2_Cou + my_Emp2_EX
     615         172 :                   Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     616         172 :                   Emp2_EX = Emp2_EX + my_Emp2_EX
     617             :                END IF
     618             :             ELSE
     619         260 :                Emp2_Cou = Emp2_Cou + my_Emp2_Cou
     620         260 :                Emp2_EX = Emp2_EX + my_Emp2_EX
     621             :             END IF
     622             :          END DO
     623             : 
     624             :       END DO
     625             : 
     626         346 :       DEALLOCATE (integ_group_pos2color_sub)
     627         346 :       DEALLOCATE (ranges_info_array)
     628             : 
     629         346 :       CALL comm_exchange%free()
     630         346 :       CALL comm_rep%free()
     631             : 
     632         346 :       IF (calc_forces) THEN
     633             :          ! recover original information (before replication)
     634         220 :          DEALLOCATE (gd_array%sizes)
     635         220 :          iiB = SIZE(sizes_array_orig)
     636         660 :          ALLOCATE (gd_array%sizes(0:iiB - 1))
     637         654 :          gd_array%sizes(:) = sizes_array_orig
     638         220 :          DEALLOCATE (sizes_array_orig)
     639             : 
     640             :          ! Remove replication from BIb_C and reorder the matrix
     641         220 :          my_group_L_size = my_group_L_size_orig
     642             : 
     643             :          ! B_ia_Q(ispin)%array will be deallocated inside of complete_gamma
     644         516 :          DO ispin = 1, nspins
     645             :             CALL complete_gamma(mp2_env, B_ia_Q(ispin)%array, dimen_RI, homo(ispin), &
     646             :                                 virtual(ispin), para_env, para_env_sub, ngroup, &
     647             :                                 my_group_L_size, my_group_L_start, my_group_L_end, &
     648             :                                 my_B_size(ispin), my_B_virtual_start(ispin), &
     649             :                                 gd_array, gd_B_virtual(ispin), &
     650         516 :                                 ispin)
     651             :          END DO
     652         516 :          DEALLOCATE (B_ia_Q)
     653             : 
     654       43758 :          IF (nspins == 1) mp2_env%ri_grad%P_ab(1)%array(:, :) = mp2_env%ri_grad%P_ab(1)%array(:, :)*2.0_dp
     655             :          BLOCK
     656             :             TYPE(mp_comm_type) :: comm
     657         220 :             CALL comm%from_split(para_env, para_env_sub%mepos)
     658         516 :             DO ispin = 1, nspins
     659             :                ! P_ab is only replicated over all subgroups
     660         296 :                CALL comm%sum(mp2_env%ri_grad%P_ab(ispin)%array)
     661             :                ! P_ij is replicated over all processes
     662         516 :                CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
     663             :             END DO
     664         440 :             CALL comm%free()
     665             :          END BLOCK
     666             :       END IF
     667             : 
     668         346 :       CALL release_group_dist(gd_array)
     669         778 :       DO ispin = 1, nspins
     670         432 :          IF (ALLOCATED(BIb_C(ispin)%array)) DEALLOCATE (BIb_C(ispin)%array)
     671         778 :          CALL release_group_dist(gd_B_virtual(ispin))
     672             :       END DO
     673             : 
     674             :       ! We do not need this matrix later, so deallocate it here to save memory
     675         346 :       IF (calc_forces) DEALLOCATE (mp2_env%ri_grad%PQ_half)
     676         346 :       IF (calc_forces .AND. .NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) &
     677           8 :          DEALLOCATE (mp2_env%ri_grad%operator_half)
     678             : 
     679         346 :       CALL dgemm_counter_write(dgemm_counter, para_env)
     680             : 
     681             :       ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
     682         346 :       CALL mp2_env%local_gemm_ctx%destroy()
     683         346 :       CALL timestop(handle)
     684             : 
     685        1384 :    END SUBROUTINE mp2_ri_gpw_compute_en
     686             : 
     687             : ! **************************************************************************************************
     688             : !> \brief Copy slabs along dimension 1 from BIb_C_rec to local_i_aL as listed in ranges_info_array
     689             : !> \param local_i_aL destination; rows Lstart_pos:Lend_pos (dimension 1) are overwritten per range
     690             : !> \param ranges_info_array column irep holds (Lstart_pos, Lend_pos, start_point, end_point) in rows 1-4
     691             : !> \param BIb_C_rec source; rows start_point:end_point (dimension 1) are read per range
     692             : ! **************************************************************************************************
     693        4248 :    SUBROUTINE fill_local_i_aL(local_i_aL, ranges_info_array, BIb_C_rec)
     694             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: local_i_aL
     695             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: ranges_info_array
     696             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: BIb_C_rec
     697             : 
     698             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'fill_local_i_aL'
     699             : 
     700             :       INTEGER                                            :: end_point, handle, irep, Lend_pos, &
     701             :                                                             Lstart_pos, start_point
     702             : 
     703        4248 :       CALL timeset(routineN, handle)
     704             : 
     705       11764 :       DO irep = 1, SIZE(ranges_info_array, 2)
     706        7516 :          Lstart_pos = ranges_info_array(1, irep)
     707        7516 :          Lend_pos = ranges_info_array(2, irep)
     708        7516 :          start_point = ranges_info_array(3, irep)
     709        7516 :          end_point = ranges_info_array(4, irep)
     710             : 
     711             : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     712       11764 : !$OMP          SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
     713             :          local_i_aL(Lstart_pos:Lend_pos, :, :) = BIb_C_rec(start_point:end_point, :, :)
     714             : !$OMP END PARALLEL WORKSHARE
     715             :       END DO
     716             : 
     717        4248 :       CALL timestop(handle)
     718             : 
     719        4248 :    END SUBROUTINE fill_local_i_aL
     720             : 
     721             : ! **************************************************************************************************
     722             : !> \brief Copy row ranges from BIb_C_rec to local_i_aL as listed in ranges_info_array (rank-2 variant)
     723             : !> \param local_i_aL destination; rows Lstart_pos:Lend_pos (dimension 1) are overwritten per range
     724             : !> \param ranges_info_array column irep holds (Lstart_pos, Lend_pos, start_point, end_point) in rows 1-4
     725             : !> \param BIb_C_rec source; rows start_point:end_point (dimension 1) are read per range
     726             : ! **************************************************************************************************
     727         250 :    SUBROUTINE fill_local_i_aL_2D(local_i_aL, ranges_info_array, BIb_C_rec)
     728             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT)      :: local_i_aL
     729             :       INTEGER, DIMENSION(:, :), INTENT(IN)               :: ranges_info_array
     730             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: BIb_C_rec
     731             : 
     732             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL_2D'
     733             : 
     734             :       INTEGER                                            :: end_point, handle, irep, Lend_pos, &
     735             :                                                             Lstart_pos, start_point
     736             : 
     737         250 :       CALL timeset(routineN, handle)
     738             : 
     739         718 :       DO irep = 1, SIZE(ranges_info_array, 2)
     740         468 :          Lstart_pos = ranges_info_array(1, irep)
     741         468 :          Lend_pos = ranges_info_array(2, irep)
     742         468 :          start_point = ranges_info_array(3, irep)
     743         468 :          end_point = ranges_info_array(4, irep)
     744             : 
     745             : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
     746         718 : !$OMP          SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
     747             :          local_i_aL(Lstart_pos:Lend_pos, :) = BIb_C_rec(start_point:end_point, :)
     748             : !$OMP END PARALLEL WORKSHARE
     749             :       END DO
     750             : 
     751         250 :       CALL timestop(handle)
     752             : 
     753         250 :    END SUBROUTINE fill_local_i_aL_2D
     754             : 
     755             : ! **************************************************************************************************
     756             : !> \brief Replace BIb_C by the allgather over comm_rep of all ranks' BIb_C, reordered along dimension 1
     757             : !> \param BIb_C on entry the local array; on exit reallocated to (my_group_L_size, my_B_size, homo)
     758             : !> \param comm_exchange only its mepos is used here, as third index into ranges_info_array
     759             : !> \param comm_rep communicator over which the allgather is performed
     760             : !> \param homo extent of the third dimension of BIb_C
     761             : !> \param sizes_array its maximum value sets the padded first-dimension extent of the gather buffers
     762             : !> \param my_B_size extent of the second dimension of BIb_C
     763             : !> \param my_group_L_size extent of the first dimension of BIb_C on exit
     764             : !> \param ranges_info_array rows 3 and 4 of (:, proc_shift, comm_exchange%mepos) give the target rows in BIb_C
     765             : ! **************************************************************************************************
     766         432 :    SUBROUTINE replicate_iaK_2intgroup(BIb_C, comm_exchange, comm_rep, homo, sizes_array, my_B_size, &
     767         432 :                                       my_group_L_size, ranges_info_array)
     768             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     769             :          INTENT(INOUT)                                   :: BIb_C
     770             :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange, comm_rep
     771             :       INTEGER, INTENT(IN)                                :: homo
     772             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: sizes_array
     773             :       INTEGER, INTENT(IN)                                :: my_B_size, my_group_L_size
     774             :       INTEGER, DIMENSION(:, 0:, 0:), INTENT(IN)          :: ranges_info_array
     775             : 
     776             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'replicate_iaK_2intgroup'
     777             : 
     778             :       INTEGER                                            :: end_point, handle, max_L_size, &
     779             :                                                             proc_receive, proc_shift, start_point
     780             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: BIb_C_copy
     781         432 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :)  :: BIb_C_gather
     782             : 
     783         432 :       CALL timeset(routineN, handle)
     784             : 
     785             :       ! replication scheme using an allgather over comm_rep
     786             :       ! get the max L size over sizes_array; the send buffer is zero-padded to this common extent
     787         968 :       max_L_size = MAXVAL(sizes_array)
     788             : 
     789        2160 :       ALLOCATE (BIb_C_copy(max_L_size, my_B_size, homo))
     790     1626068 :       BIb_C_copy = 0.0_dp
     791      883470 :       BIb_C_copy(1:SIZE(BIb_C, 1), 1:my_B_size, 1:homo) = BIb_C
     792             : 
     793         432 :       DEALLOCATE (BIb_C)
     794             : 
     795        2592 :       ALLOCATE (BIb_C_gather(max_L_size, my_B_size, homo, 0:comm_rep%num_pe - 1))
     796     3129602 :       BIb_C_gather = 0.0_dp
     797             : 
     798         432 :       CALL comm_rep%allgather(BIb_C_copy, BIb_C_gather)
     799             : 
     800         432 :       DEALLOCATE (BIb_C_copy)
     801             : 
     802        2160 :       ALLOCATE (BIb_C(my_group_L_size, my_B_size, homo))
     803     1625721 :       BIb_C = 0.0_dp
     804             : 
     805             :       ! reorder data: the block gathered from rank proc_receive goes to rows start_point:end_point of BIb_C
     806        1174 :       DO proc_shift = 0, comm_rep%num_pe - 1
     807         742 :          proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
     808             : 
     809         742 :          start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
     810         742 :          end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
     811             : 
     812             :          BIb_C(start_point:end_point, 1:my_B_size, 1:homo) = &
     813     1644753 :             BIb_C_gather(1:end_point - start_point + 1, 1:my_B_size, 1:homo, proc_receive)
     814             : 
     815             :       END DO
     816             : 
     817         432 :       DEALLOCATE (BIb_C_gather)
     818             : 
     819         432 :       CALL timestop(handle)
     820             : 
     821         432 :    END SUBROUTINE replicate_iaK_2intgroup
     822             : 
     823             : ! **************************************************************************************************
     824             : !> \brief ...
     825             : !> \param local_ab ...
     826             : !> \param t_ab ...
     827             : !> \param mp2_env ...
     828             : !> \param homo ...
     829             : !> \param virtual ...
     830             : !> \param my_B_size ...
     831             : !> \param my_group_L_size ...
     832             : !> \param calc_forces ...
     833             : !> \param ispin ...
     834             : !> \param jspin ...
     835             : !> \param local_ba ...
     836             : ! **************************************************************************************************
     837         518 :    SUBROUTINE mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
     838             :                                      my_group_L_size, calc_forces, ispin, jspin, local_ba)
     839             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     840             :          INTENT(OUT)                                     :: local_ab, t_ab
     841             :       TYPE(mp2_type)                                     :: mp2_env
     842             :       INTEGER, INTENT(IN)                                :: homo(2), virtual(2), my_B_size(2), &
     843             :                                                             my_group_L_size
     844             :       LOGICAL, INTENT(IN)                                :: calc_forces
     845             :       INTEGER, INTENT(IN)                                :: ispin, jspin
     846             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
     847             :          INTENT(OUT)                                     :: local_ba
     848             : 
     849             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_no_blk'
     850             : 
     851             :       INTEGER                                            :: handle
     852             : 
     853         518 :       CALL timeset(routineN, handle)
     854             : 
     855        2072 :       ALLOCATE (local_ab(virtual(ispin), my_B_size(jspin)))
     856      137025 :       local_ab = 0.0_dp
     857             : 
     858         518 :       IF (calc_forces) THEN
     859         372 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ij(jspin)%array)) THEN
     860        1184 :             ALLOCATE (mp2_env%ri_grad%P_ij(jspin)%array(homo(ispin), homo(ispin)))
     861        6068 :             mp2_env%ri_grad%P_ij(jspin)%array = 0.0_dp
     862             :          END IF
     863         372 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ab(jspin)%array)) THEN
     864        1184 :             ALLOCATE (mp2_env%ri_grad%P_ab(jspin)%array(my_B_size(jspin), virtual(jspin)))
     865       83688 :             mp2_env%ri_grad%P_ab(jspin)%array = 0.0_dp
     866             :          END IF
     867         372 :          IF (.NOT. ALLOCATED(mp2_env%ri_grad%Gamma_P_ia(jspin)%array)) THEN
     868        1480 :             ALLOCATE (mp2_env%ri_grad%Gamma_P_ia(jspin)%array(my_B_size(jspin), homo(jspin), my_group_L_size))
     869     1442032 :             mp2_env%ri_grad%Gamma_P_ia(jspin)%array = 0.0_dp
     870             :          END IF
     871             : 
     872         372 :          IF (ispin == jspin) THEN
     873             :             ! For non-alpha-beta case we need amplitudes
     874        1184 :             ALLOCATE (t_ab(virtual(ispin), my_B_size(jspin)))
     875             : 
     876             :             ! That is just a dummy. In that way, we can pass it as array to other routines w/o requirement for allocatable array
     877         296 :             ALLOCATE (local_ba(1, 1))
     878             :          ELSE
     879             :             ! We need more integrals
     880         304 :             ALLOCATE (local_ba(virtual(jspin), my_B_size(ispin)))
     881             :          END IF
     882             :       END IF
     883             :       !
     884             : 
     885         518 :       CALL timestop(handle)
     886             : 
     887         518 :    END SUBROUTINE mp2_ri_allocate_no_blk
     888             : 
     889             : ! **************************************************************************************************
     890             : !> \brief ...
     891             : !> \param dimen_RI ...
     892             : !> \param my_B_size ...
     893             : !> \param block_size ...
     894             : !> \param local_i_aL ...
     895             : !> \param local_j_aL ...
     896             : !> \param calc_forces ...
     897             : !> \param Y_i_aP ...
     898             : !> \param Y_j_aP ...
     899             : !> \param ispin ...
     900             : !> \param jspin ...
     901             : ! **************************************************************************************************
     902         518 :    SUBROUTINE mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, &
     903             :                                   local_i_aL, local_j_aL, calc_forces, &
     904             :                                   Y_i_aP, Y_j_aP, ispin, jspin)
     905             :       INTEGER, INTENT(IN)                                :: dimen_RI, my_B_size(2), block_size
     906             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     907             :          INTENT(OUT)                                     :: local_i_aL, local_j_aL
     908             :       LOGICAL, INTENT(IN)                                :: calc_forces
     909             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
     910             :          INTENT(OUT)                                     :: Y_i_aP, Y_j_aP
     911             :       INTEGER, INTENT(IN)                                :: ispin, jspin
     912             : 
     913             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_blk'
     914             : 
     915             :       INTEGER                                            :: handle
     916             : 
     917         518 :       CALL timeset(routineN, handle)
     918             : 
     919        2590 :       ALLOCATE (local_i_aL(dimen_RI, my_B_size(ispin), block_size))
     920      648457 :       local_i_aL = 0.0_dp
     921        2590 :       ALLOCATE (local_j_aL(dimen_RI, my_B_size(jspin), block_size))
     922      660909 :       local_j_aL = 0.0_dp
     923             : 
     924         518 :       IF (calc_forces) THEN
     925        1860 :          ALLOCATE (Y_i_aP(my_B_size(ispin), dimen_RI, block_size))
     926      531690 :          Y_i_aP = 0.0_dp
     927             :          ! For  closed-shell, alpha-alpha and beta-beta my_B_size_beta=my_b_size
     928             :          ! Not for alpha-beta case: Y_j_aP_beta is sent and received as Y_j_aP
     929        1860 :          ALLOCATE (Y_j_aP(my_B_size(jspin), dimen_RI, block_size))
     930      542254 :          Y_j_aP = 0.0_dp
     931             :       END IF
     932             :       !
     933             : 
     934         518 :       CALL timestop(handle)
     935             : 
     936         518 :    END SUBROUTINE mp2_ri_allocate_blk
     937             : 
     938             : ! **************************************************************************************************
     939             : !> \brief ...
     940             : !> \param my_alpha_beta_case ...
     941             : !> \param total_ij_pairs ...
     942             : !> \param homo ...
     943             : !> \param homo_beta ...
     944             : !> \param block_size ...
     945             : !> \param ngroup ...
     946             : !> \param ij_map ...
     947             : !> \param color_sub ...
     948             : !> \param my_ij_pairs ...
     949             : !> \param my_open_shell_SS ...
     950             : !> \param unit_nr ...
     951             : ! **************************************************************************************************
     952         518 :    SUBROUTINE mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo, homo_beta, &
     953             :                                    block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
     954             :       LOGICAL, INTENT(IN)                                :: my_alpha_beta_case
     955             :       INTEGER, INTENT(OUT)                               :: total_ij_pairs
     956             :       INTEGER, INTENT(IN)                                :: homo, homo_beta, block_size, ngroup
     957             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ij_map
     958             :       INTEGER, INTENT(IN)                                :: color_sub
     959             :       INTEGER, INTENT(OUT)                               :: my_ij_pairs
     960             :       LOGICAL, INTENT(IN)                                :: my_open_shell_SS
     961             :       INTEGER, INTENT(IN)                                :: unit_nr
     962             : 
     963             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_communication'
     964             : 
     965             :       INTEGER :: assigned_blocks, first_I_block, first_J_block, handle, iiB, ij_block_counter, &
     966             :          ij_counter, jjB, last_i_block, last_J_block, num_block_per_group, num_IJ_blocks, &
     967             :          num_IJ_blocks_beta, total_ij_block, total_ij_pairs_blocks
     968         518 :       LOGICAL, ALLOCATABLE, DIMENSION(:, :)              :: ij_marker
     969             : 
     970             : ! Calculate the maximum number of ij pairs that have to be computed
     971             : ! among groups
     972             : 
     973         518 :       CALL timeset(routineN, handle)
     974             : 
     975         518 :       IF (.NOT. my_open_shell_ss .AND. .NOT. my_alpha_beta_case) THEN
     976         260 :          total_ij_pairs = homo*(1 + homo)/2
     977         260 :          num_IJ_blocks = homo/block_size - 1
     978             : 
     979         260 :          first_I_block = 1
     980         260 :          last_i_block = block_size*(num_IJ_blocks - 1)
     981             : 
     982         260 :          first_J_block = block_size + 1
     983         260 :          last_J_block = block_size*(num_IJ_blocks + 1)
     984             : 
     985         260 :          ij_block_counter = 0
     986         584 :          DO iiB = first_I_block, last_i_block, block_size
     987         584 :             DO jjB = iiB + block_size, last_J_block, block_size
     988         810 :                ij_block_counter = ij_block_counter + 1
     989             :             END DO
     990             :          END DO
     991             : 
     992         260 :          total_ij_block = ij_block_counter
     993         260 :          num_block_per_group = total_ij_block/ngroup
     994         260 :          assigned_blocks = num_block_per_group*ngroup
     995             : 
     996         260 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
     997             : 
     998        1040 :          ALLOCATE (ij_marker(homo, homo))
     999        3892 :          ij_marker = .TRUE.
    1000         780 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1001        7524 :          ij_map = 0
    1002         260 :          ij_counter = 0
    1003         260 :          my_ij_pairs = 0
    1004         584 :          DO iiB = first_I_block, last_i_block, block_size
    1005        1236 :             DO jjB = iiB + block_size, last_J_block, block_size
    1006         810 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1007         652 :                ij_counter = ij_counter + 1
    1008        1956 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1009         652 :                ij_map(1, ij_counter) = iiB
    1010         652 :                ij_map(2, ij_counter) = jjB
    1011         652 :                ij_map(3, ij_counter) = block_size
    1012         976 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1013             :             END DO
    1014             :          END DO
    1015        1040 :          DO iiB = 1, homo
    1016        2856 :             DO jjB = iiB, homo
    1017        2596 :                IF (ij_marker(iiB, jjB)) THEN
    1018        1164 :                   ij_counter = ij_counter + 1
    1019        1164 :                   ij_map(1, ij_counter) = iiB
    1020        1164 :                   ij_map(2, ij_counter) = jjB
    1021        1164 :                   ij_map(3, ij_counter) = 1
    1022        1164 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1023             :                END IF
    1024             :             END DO
    1025             :          END DO
    1026         260 :          DEALLOCATE (ij_marker)
    1027             : 
    1028         258 :       ELSE IF (.NOT. my_alpha_beta_case) THEN
    1029             :          ! THese are the cases alpha/alpha and beta/beta
    1030             :          ! We do not have to consider the diagonal elements
    1031         172 :          total_ij_pairs = homo*(homo - 1)/2
    1032         172 :          num_IJ_blocks = (homo - 1)/block_size - 1
    1033             : 
    1034         172 :          first_I_block = 1
    1035         172 :          last_i_block = block_size*(num_IJ_blocks - 1)
    1036             : 
    1037             :          ! We shift the blocks to prevent the calculation of the diagonal elements which always give zero
    1038         172 :          first_J_block = block_size + 2
    1039         172 :          last_J_block = block_size*(num_IJ_blocks + 1) + 1
    1040             : 
    1041         172 :          ij_block_counter = 0
    1042         256 :          DO iiB = first_I_block, last_i_block, block_size
    1043         256 :             DO jjB = iiB + block_size + 1, last_J_block, block_size
    1044         196 :                ij_block_counter = ij_block_counter + 1
    1045             :             END DO
    1046             :          END DO
    1047             : 
    1048         172 :          total_ij_block = ij_block_counter
    1049         172 :          num_block_per_group = total_ij_block/ngroup
    1050         172 :          assigned_blocks = num_block_per_group*ngroup
    1051             : 
    1052         172 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
    1053             : 
    1054         688 :          ALLOCATE (ij_marker(homo, homo))
    1055        2916 :          ij_marker = .TRUE.
    1056         516 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1057        3276 :          ij_map = 0
    1058         172 :          ij_counter = 0
    1059         172 :          my_ij_pairs = 0
    1060         256 :          DO iiB = first_I_block, last_i_block, block_size
    1061         448 :             DO jjB = iiB + block_size + 1, last_J_block, block_size
    1062         196 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1063         192 :                ij_counter = ij_counter + 1
    1064         592 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1065         192 :                ij_map(1, ij_counter) = iiB
    1066         192 :                ij_map(2, ij_counter) = jjB
    1067         192 :                ij_map(3, ij_counter) = block_size
    1068         276 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1069             :             END DO
    1070             :          END DO
    1071         756 :          DO iiB = 1, homo
    1072        1544 :             DO jjB = iiB + 1, homo
    1073        1372 :                IF (ij_marker(iiB, jjB)) THEN
    1074         584 :                   ij_counter = ij_counter + 1
    1075         584 :                   ij_map(1, ij_counter) = iiB
    1076         584 :                   ij_map(2, ij_counter) = jjB
    1077         584 :                   ij_map(3, ij_counter) = 1
    1078         584 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1079             :                END IF
    1080             :             END DO
    1081             :          END DO
    1082         172 :          DEALLOCATE (ij_marker)
    1083             : 
    1084             :       ELSE
    1085          86 :          total_ij_pairs = homo*homo_beta
    1086          86 :          num_IJ_blocks = homo/block_size
    1087          86 :          num_IJ_blocks_beta = homo_beta/block_size
    1088             : 
    1089          86 :          first_I_block = 1
    1090          86 :          last_i_block = block_size*(num_IJ_blocks - 1)
    1091             : 
    1092          86 :          first_J_block = 1
    1093          86 :          last_J_block = block_size*(num_IJ_blocks_beta - 1)
    1094             : 
    1095          86 :          ij_block_counter = 0
    1096         242 :          DO iiB = first_I_block, last_i_block, block_size
    1097         242 :             DO jjB = first_J_block, last_J_block, block_size
    1098         282 :                ij_block_counter = ij_block_counter + 1
    1099             :             END DO
    1100             :          END DO
    1101             : 
    1102          86 :          total_ij_block = ij_block_counter
    1103          86 :          num_block_per_group = total_ij_block/ngroup
    1104          86 :          assigned_blocks = num_block_per_group*ngroup
    1105             : 
    1106          86 :          total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
    1107             : 
    1108         344 :          ALLOCATE (ij_marker(homo, homo_beta))
    1109        1364 :          ij_marker = .TRUE.
    1110         258 :          ALLOCATE (ij_map(3, total_ij_pairs_blocks))
    1111        4206 :          ij_map = 0
    1112          86 :          ij_counter = 0
    1113          86 :          my_ij_pairs = 0
    1114         242 :          DO iiB = first_I_block, last_i_block, block_size
    1115         482 :             DO jjB = first_J_block, last_J_block, block_size
    1116         244 :                IF (ij_counter + 1 > assigned_blocks) EXIT
    1117         240 :                ij_counter = ij_counter + 1
    1118         720 :                ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
    1119         240 :                ij_map(1, ij_counter) = iiB
    1120         240 :                ij_map(2, ij_counter) = jjB
    1121         240 :                ij_map(3, ij_counter) = block_size
    1122         396 :                IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1123             :             END DO
    1124             :          END DO
    1125         422 :          DO iiB = 1, homo
    1126        1452 :             DO jjB = 1, homo_beta
    1127        1366 :                IF (ij_marker(iiB, jjB)) THEN
    1128         790 :                   ij_counter = ij_counter + 1
    1129         790 :                   ij_map(1, ij_counter) = iiB
    1130         790 :                   ij_map(2, ij_counter) = jjB
    1131         790 :                   ij_map(3, ij_counter) = 1
    1132         790 :                   IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
    1133             :                END IF
    1134             :             END DO
    1135             :          END DO
    1136          86 :          DEALLOCATE (ij_marker)
    1137             :       END IF
    1138             : 
    1139         518 :       IF (unit_nr > 0) THEN
    1140         259 :          IF (block_size == 1) THEN
    1141             :             WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
    1142         231 :                "RI_INFO| Percentage of ij pairs communicated with block size 1:", 100.0_dp
    1143             :          ELSE
    1144             :             WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
    1145          28 :                "RI_INFO| Percentage of ij pairs communicated with block size 1:", &
    1146          56 :                100.0_dp*REAL((total_ij_pairs - assigned_blocks*(block_size**2)), KIND=dp)/REAL(total_ij_pairs, KIND=dp)
    1147             :          END IF
    1148         259 :          CALL m_flush(unit_nr)
    1149             :       END IF
    1150             : 
    1151         518 :       CALL timestop(handle)
    1152             : 
    1153         518 :    END SUBROUTINE mp2_ri_communication
    1154             : 
    1155             : ! **************************************************************************************************
    1156             : !> \brief ...
    1157             : !> \param para_env ...
    1158             : !> \param para_env_sub ...
    1159             : !> \param color_sub ...
    1160             : !> \param sizes_array ...
    1161             : !> \param calc_forces ...
    1162             : !> \param integ_group_size ...
    1163             : !> \param my_group_L_end ...
    1164             : !> \param my_group_L_size ...
    1165             : !> \param my_group_L_size_orig ...
    1166             : !> \param my_group_L_start ...
    1167             : !> \param my_new_group_L_size ...
    1168             : !> \param integ_group_pos2color_sub ...
    1169             : !> \param sizes_array_orig ...
    1170             : !> \param ranges_info_array ...
    1171             : !> \param comm_exchange ...
    1172             : !> \param comm_rep ...
    1173             : !> \param num_integ_group ...
    1174             : ! **************************************************************************************************
    1175         346 :    SUBROUTINE mp2_ri_create_group(para_env, para_env_sub, color_sub, &
    1176             :                                   sizes_array, calc_forces, &
    1177             :                                   integ_group_size, my_group_L_end, &
    1178             :                                   my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
    1179             :                                   integ_group_pos2color_sub, &
    1180             :                                   sizes_array_orig, ranges_info_array, comm_exchange, comm_rep, num_integ_group)
    1181             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1182             :       INTEGER, INTENT(IN)                                :: color_sub
    1183             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(INOUT)  :: sizes_array
    1184             :       LOGICAL, INTENT(IN)                                :: calc_forces
    1185             :       INTEGER, INTENT(IN)                                :: integ_group_size, my_group_L_end
    1186             :       INTEGER, INTENT(INOUT)                             :: my_group_L_size
    1187             :       INTEGER, INTENT(OUT)                               :: my_group_L_size_orig
    1188             :       INTEGER, INTENT(IN)                                :: my_group_L_start
    1189             :       INTEGER, INTENT(INOUT)                             :: my_new_group_L_size
    1190             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: integ_group_pos2color_sub, &
    1191             :                                                             sizes_array_orig
    1192             :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    1193             :          INTENT(OUT)                                     :: ranges_info_array
    1194             :       TYPE(mp_comm_type), INTENT(OUT)                    :: comm_exchange, comm_rep
    1195             :       INTEGER, INTENT(IN)                                :: num_integ_group
    1196             : 
    1197             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_create_group'
    1198             : 
    1199             :       INTEGER                                            :: handle, iiB, proc_receive, proc_shift, &
    1200             :                                                             sub_sub_color
    1201         346 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: new_sizes_array, rep_ends_array, &
    1202             :                                                             rep_sizes_array, rep_starts_array
    1203             :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: my_info
    1204             : 
    1205         346 :       CALL timeset(routineN, handle)
    1206             :       !
    1207         346 :       sub_sub_color = para_env_sub%mepos*num_integ_group + color_sub/integ_group_size
    1208         346 :       CALL comm_exchange%from_split(para_env, sub_sub_color)
    1209             : 
    1210             :       ! create the replication group
    1211         346 :       sub_sub_color = para_env_sub%mepos*comm_exchange%num_pe + comm_exchange%mepos
    1212         346 :       CALL comm_rep%from_split(para_env, sub_sub_color)
    1213             : 
    1214             :       ! create the new limits for K according to the size
    1215             :       ! of the integral group
    1216             : 
    1217             :       ! info array for replication
    1218        1038 :       ALLOCATE (rep_ends_array(0:comm_rep%num_pe - 1))
    1219        1038 :       ALLOCATE (rep_starts_array(0:comm_rep%num_pe - 1))
    1220        1038 :       ALLOCATE (rep_sizes_array(0:comm_rep%num_pe - 1))
    1221             : 
    1222         346 :       CALL comm_rep%allgather(my_group_L_size, rep_sizes_array)
    1223         346 :       CALL comm_rep%allgather(my_group_L_start, rep_starts_array)
    1224         346 :       CALL comm_rep%allgather(my_group_L_end, rep_ends_array)
    1225             : 
    1226             :       ! calculate my_new_group_L_size according to sizes_array
    1227         346 :       my_new_group_L_size = my_group_L_size
    1228             : 
    1229             :       ! Info of this process
    1230        1038 :       ALLOCATE (my_info(4, 0:comm_rep%num_pe - 1))
    1231         346 :       my_info(1, 0) = my_group_L_start
    1232         346 :       my_info(2, 0) = my_group_L_end
    1233         346 :       my_info(3, 0) = 1
    1234         346 :       my_info(4, 0) = my_group_L_size
    1235             : 
    1236         580 :       DO proc_shift = 1, comm_rep%num_pe - 1
    1237         234 :          proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
    1238             : 
    1239         234 :          my_new_group_L_size = my_new_group_L_size + rep_sizes_array(proc_receive)
    1240             : 
    1241         234 :          my_info(1, proc_shift) = rep_starts_array(proc_receive)
    1242         234 :          my_info(2, proc_shift) = rep_ends_array(proc_receive)
    1243         234 :          my_info(3, proc_shift) = my_info(4, proc_shift - 1) + 1
    1244         580 :          my_info(4, proc_shift) = my_new_group_L_size
    1245             : 
    1246             :       END DO
    1247             : 
    1248        1038 :       ALLOCATE (new_sizes_array(0:comm_exchange%num_pe - 1))
    1249        1384 :       ALLOCATE (ranges_info_array(4, 0:comm_rep%num_pe - 1, 0:comm_exchange%num_pe - 1))
    1250         346 :       CALL comm_exchange%allgather(my_new_group_L_size, new_sizes_array)
    1251         346 :       CALL comm_exchange%allgather(my_info, ranges_info_array)
    1252             : 
    1253         346 :       DEALLOCATE (rep_sizes_array)
    1254         346 :       DEALLOCATE (rep_starts_array)
    1255         346 :       DEALLOCATE (rep_ends_array)
    1256             : 
    1257        1038 :       ALLOCATE (integ_group_pos2color_sub(0:comm_exchange%num_pe - 1))
    1258         346 :       CALL comm_exchange%allgather(color_sub, integ_group_pos2color_sub)
    1259             : 
    1260         346 :       IF (calc_forces) THEN
    1261         220 :          iiB = SIZE(sizes_array)
    1262         660 :          ALLOCATE (sizes_array_orig(0:iiB - 1))
    1263         654 :          sizes_array_orig(:) = sizes_array
    1264             :       END IF
    1265             : 
    1266         346 :       my_group_L_size_orig = my_group_L_size
    1267         346 :       my_group_L_size = my_new_group_L_size
    1268         346 :       DEALLOCATE (sizes_array)
    1269             : 
    1270        1038 :       ALLOCATE (sizes_array(0:integ_group_size - 1))
    1271         790 :       sizes_array(:) = new_sizes_array
    1272             : 
    1273         346 :       DEALLOCATE (new_sizes_array)
    1274             :       !
    1275         346 :       CALL timestop(handle)
    1276             : 
    1277         692 :    END SUBROUTINE mp2_ri_create_group
    1278             : 
    1279             : ! **************************************************************************************************
    1280             : !> \brief ...
    1281             : !> \param mp2_env ...
    1282             : !> \param para_env ...
    1283             : !> \param para_env_sub ...
    1284             : !> \param gd_array ...
    1285             : !> \param gd_B_virtual ...
    1286             : !> \param homo ...
    1287             : !> \param dimen_RI ...
    1288             : !> \param unit_nr ...
    1289             : !> \param integ_group_size ...
    1290             : !> \param ngroup ...
    1291             : !> \param num_integ_group ...
    1292             : !> \param virtual ...
    1293             : !> \param calc_forces ...
    1294             : ! **************************************************************************************************
    1295        1384 :    SUBROUTINE mp2_ri_get_integ_group_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
    1296         346 :                                           homo, dimen_RI, unit_nr, &
    1297             :                                           integ_group_size, &
    1298             :                                           ngroup, num_integ_group, &
    1299         346 :                                           virtual, calc_forces)
    1300             :       TYPE(mp2_type)                                     :: mp2_env
    1301             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1302             :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_array
    1303             :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1304             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
    1305             :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
    1306             :       INTEGER, INTENT(OUT)                               :: integ_group_size, ngroup, num_integ_group
    1307             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: virtual
    1308             :       LOGICAL, INTENT(IN)                                :: calc_forces
    1309             : 
    1310             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_integ_group_size'
    1311             : 
    1312             :       INTEGER                                            :: block_size, handle, iiB, &
    1313             :                                                             max_repl_group_size, &
    1314             :                                                             min_integ_group_size
    1315             :       INTEGER(KIND=int_8)                                :: mem
    1316             :       LOGICAL                                            :: calc_group_size
    1317             :       REAL(KIND=dp)                                      :: factor, mem_base, mem_min, mem_per_blk, &
    1318             :                                                             mem_per_repl, mem_per_repl_blk, &
    1319             :                                                             mem_real
    1320             : 
    1321         346 :       CALL timeset(routineN, handle)
    1322             : 
    1323         346 :       ngroup = para_env%num_pe/para_env_sub%num_pe
    1324             : 
    1325         346 :       calc_group_size = mp2_env%ri_mp2%number_integration_groups <= 0
    1326         346 :       IF (.NOT. calc_group_size) THEN
    1327          10 :          IF (MOD(ngroup, mp2_env%ri_mp2%number_integration_groups) /= 0) calc_group_size = .TRUE.
    1328             :       END IF
    1329             : 
    1330         346 :       IF (calc_group_size) THEN
    1331         336 :          CALL m_memory(mem)
    1332         336 :          mem_real = (mem + 1024*1024 - 1)/(1024*1024)
    1333         336 :          CALL para_env%min(mem_real)
    1334             : 
    1335         336 :          mem_base = 0.0_dp
    1336         336 :          mem_per_blk = 0.0_dp
    1337         336 :          mem_per_repl = 0.0_dp
    1338         336 :          mem_per_repl_blk = 0.0_dp
    1339             : 
    1340             :          ! BIB_C_copy
    1341             :          mem_per_repl = mem_per_repl + MAXVAL(MAX(REAL(homo, KIND=dp)*maxsize(gd_array), REAL(dimen_RI, KIND=dp))* &
    1342        1088 :                                               maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1343             :          ! BIB_C
    1344         752 :          mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_B_virtual))*maxsize(gd_array)*8.0_dp/(1024**2)
    1345             :          ! BIB_C_rec
    1346         752 :          mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
    1347             :          ! local_i_aL+local_j_aL
    1348         752 :          mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
    1349             :          ! local_ab
    1350        1088 :          mem_base = mem_base + MAXVAL(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1351             :          ! external_ab/external_i_aL
    1352        1168 :          mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1353             : 
    1354         336 :          IF (calc_forces) THEN
    1355             :             ! Gamma_P_ia
    1356             :             mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_array)* &
    1357         496 :                                               maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1358             :             ! Y_i_aP+Y_j_aP
    1359         496 :             mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
    1360             :             ! local_ba/t_ab
    1361         780 :             mem_base = mem_base + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1362             :             ! P_ij
    1363         496 :             mem_base = mem_base + SUM(REAL(homo, KIND=dp)*homo)*8.0_dp/(1024**2)
    1364             :             ! P_ab
    1365         496 :             mem_base = mem_base + SUM(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1366             :             ! send_ab/send_i_aL
    1367         780 :             mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
    1368             :          END IF
    1369             : 
    1370             :          ! This is a first guess based on the assumption of optimal block sizes
    1371         752 :          block_size = MAX(1, MIN(FLOOR(SQRT(REAL(MINVAL(homo), KIND=dp))), FLOOR(MINVAL(homo)/SQRT(2.0_dp*ngroup))))
    1372         336 :          IF (mp2_env%ri_mp2%block_size > 0) block_size = mp2_env%ri_mp2%block_size
    1373             : 
    1374         336 :          mem_min = mem_base + mem_per_repl + (mem_per_blk + mem_per_repl_blk)*block_size
    1375             : 
    1376         504 :          IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum available memory per MPI process:', &
    1377         336 :             mem_real, ' MiB'
    1378         504 :          IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum required memory per MPI process:', &
    1379         336 :             mem_min, ' MiB'
    1380             : 
    1381             :          ! We use the following communication model
    1382             :          ! Comm(replication)+Comm(collection of data for ij pair)+Comm(contraction)
    1383             :          ! One can show that the costs of the contraction step are independent of the block size and the replication group size
    1384             :          ! With gradients, the other two steps are carried out twice (Y_i_aP -> Gamma_i_aP, and dereplication)
    1385             :          ! NL ... number of RI basis functions
    1386             :          ! NR ... replication group size
    1387             :          ! NG ... number of sub groups
    1388             :          ! NB ... Block size
    1389             :          ! o  ... number of occupied orbitals
    1390             :          ! Then, we have the communication costs (in multiples of the original BIb_C matrix)
    1391             :          ! (NR/NG)+(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
    1392             :          ! and with gradients
    1393             :          ! 2*(NR/NG)+2*(1-(NR/NG))*(o/NB+NB-2)/NG = 2*((NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG)
    1394             :          ! We are looking for the minimum of the communication volume,
    1395             :          ! thus, if the prefactor of (NR/NG) is smaller than zero, use the largest possible replication group size.
    1396             :          ! If the factor is larger than zero, set the replication group size to 1. (For small systems and a large number of subgroups)
    1397             :          ! Replication group size = 1 implies that the integration group size equals the number of subgroups
    1398             : 
    1399         336 :          integ_group_size = ngroup
    1400             : 
    1401             :          ! Multiply everything by homo*virtual to consider differences between spin channels in case of open-shell calculations
    1402             :          factor = REAL(SUM(homo*virtual), KIND=dp) &
    1403        1584 :                   - SUM((REAL(MAXVAL(homo), KIND=dp)/block_size + block_size - 2.0_dp)*homo*virtual)/ngroup
    1404         656 :          IF (SIZE(homo) == 2) factor = factor - 2.0_dp*PRODUCT(homo)/block_size/ngroup*SUM(homo*virtual)
    1405             : 
    1406         672 :          IF (factor <= 0.0_dp) THEN
    1407             :             ! Remove the fixed memory and divide by the memory per replication group size
    1408             :             max_repl_group_size = MIN(MAX(FLOOR((mem_real - mem_base - mem_per_blk*block_size)/ &
    1409         248 :                                                 (mem_per_repl + mem_per_repl_blk*block_size)), 1), ngroup)
    1410             :             ! Convert to an integration group size
    1411         248 :             min_integ_group_size = ngroup/max_repl_group_size
    1412             : 
    1413             :             ! Ensure that the integration group size is a divisor of the number of sub groups
    1414         248 :             DO iiB = MAX(MIN(min_integ_group_size, ngroup), 1), ngroup
    1415             :                ! check that the ngroup is a multiple of  integ_group_size
    1416         248 :                IF (MOD(ngroup, iiB) == 0) THEN
    1417         248 :                   integ_group_size = iiB
    1418         248 :                   EXIT
    1419             :                END IF
    1420           0 :                integ_group_size = integ_group_size + 1
    1421             :             END DO
    1422             :          END IF
    1423             :       ELSE ! We take the user provided group size
    1424          10 :          integ_group_size = ngroup/mp2_env%ri_mp2%number_integration_groups
    1425             :       END IF
    1426             : 
    1427         346 :       IF (unit_nr > 0) THEN
    1428             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    1429         173 :             "RI_INFO| Group size for integral replication:", integ_group_size*para_env_sub%num_pe
    1430         173 :          CALL m_flush(unit_nr)
    1431             :       END IF
    1432             : 
    1433         346 :       num_integ_group = ngroup/integ_group_size
    1434             : 
    1435         346 :       CALL timestop(handle)
    1436             : 
    1437         346 :    END SUBROUTINE mp2_ri_get_integ_group_size
    1438             : 
    1439             : ! **************************************************************************************************
    1440             : !> \brief Picks the block size (user-provided or estimated from free memory) and allocates the 1D buffer sized for it
    1441             : !> \param mp2_env only ri_mp2%block_size is read here; a value > 0 is taken as the block size without further checks
    1442             : !> \param para_env its num_pe enters ngroup; its %min is called on the memory figure obtained from m_memory
    1443             : !> \param para_env_sub its num_pe is the divisor in ngroup = para_env%num_pe/para_env_sub%num_pe
    1444             : !> \param gd_array distribution whose maxsize() enters the memory estimate and the buffer size
    1445             : !> \param gd_B_virtual array of distributions; the largest maxsize() enters the memory estimate and the buffer size
    1446             : !> \param homo one entry or several (presumably one per spin channel - confirm); used to count blocks per candidate size
    1447             : !> \param virtual only MAXVAL(virtual) is used, in the memory estimate and the buffer size
    1448             : !> \param dimen_RI enters the memory estimate and the buffer size (presumably the RI basis dimension - confirm)
    1449             : !> \param unit_nr output unit; the chosen block size is printed only if unit_nr > 0
    1450             : !> \param block_size chosen block size, always >= 1
    1451             : !> \param ngroup para_env%num_pe/para_env_sub%num_pe (integer division)
    1452             : !> \param num_integ_group divides ngroup in the per-block memory estimate; must be non-zero
    1453             : !> \param my_open_shell_ss if .TRUE. and SIZE(homo) == 1, homo(1) - 1 replaces homo(1) in the block count and the sqrt cap
    1454             : !> \param calc_forces if .TRUE., extra terms are added to the memory estimate and the buffer size is doubled
    1455             : !> \param buffer_1D allocated here with buffer_size elements
    1456             : ! **************************************************************************************************
    1457         518 :    SUBROUTINE mp2_ri_get_block_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
    1458         518 :                                     homo, virtual, dimen_RI, unit_nr, &
    1459             :                                     block_size, ngroup, num_integ_group, &
    1460             :                                     my_open_shell_ss, calc_forces, buffer_1D)
    1461             :       TYPE(mp2_type)                                     :: mp2_env
    1462             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env, para_env_sub
    1463             :       TYPE(group_dist_d1_type), INTENT(IN)               :: gd_array
    1464             :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1465             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    1466             :       INTEGER, INTENT(IN)                                :: dimen_RI, unit_nr
    1467             :       INTEGER, INTENT(OUT)                               :: block_size, ngroup
    1468             :       INTEGER, INTENT(IN)                                :: num_integ_group
    1469             :       LOGICAL, INTENT(IN)                                :: my_open_shell_ss, calc_forces
    1470             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
    1471             :          INTENT(OUT)                                     :: buffer_1D
    1472             : 
    1473             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_block_size'
    1474             : 
    1475             :       INTEGER                                            :: best_block_size, handle, num_IJ_blocks
    1476             :       INTEGER(KIND=int_8)                                :: buffer_size, mem
    1477             :       REAL(KIND=dp)                                      :: mem_base, mem_per_blk, mem_per_repl_blk, &
    1478             :                                                             mem_real
    1479             : 
    1480         518 :       CALL timeset(routineN, handle)
    1481             : 
    1482         518 :       ngroup = para_env%num_pe/para_env_sub%num_pe
    1483             : 
    1484         518 :       CALL m_memory(mem)
    1485         518 :       mem_real = (mem + 1024*1024 - 1)/(1024*1024)
    1486         518 :       CALL para_env%min(mem_real)
    1487             : 
    1488         518 :       mem_base = 0.0_dp
    1489         518 :       mem_per_blk = 0.0_dp
    1490         518 :       mem_per_repl_blk = 0.0_dp
    1491             : 
    1492             :       ! external_ab
    1493        1726 :       mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1494             :       ! BIB_C_rec
    1495        1122 :       mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
    1496             :       ! local_i_aL+local_j_aL
    1497        1122 :       mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
    1498             :       ! Copy to keep arrays contiguous
    1499        1726 :       mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1500             : 
    1501         518 :       IF (calc_forces) THEN
    1502             :          ! Y_i_aP+Y_j_aP+BIb_C_send
    1503         820 :          mem_per_blk = mem_per_blk + 3.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
    1504             :          ! send_ab
    1505        1268 :          mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
    1506             :       END IF
    1507             : 
    1508         518 :       best_block_size = 1
    1509             : 
    1510             :       ! Here we split the memory half for the communication, half for replication
    1511         518 :       IF (mp2_env%ri_mp2%block_size > 0) THEN
    1512             :          best_block_size = mp2_env%ri_mp2%block_size
    1513             :       ELSE
    1514         286 :          best_block_size = MAX(FLOOR((mem_real - mem_base)/(mem_per_blk + mem_per_repl_blk*ngroup/num_integ_group)), 1)
    1515             :          ! NOTE(review): sizes above MAXVAL(homo) give zero blocks; clamping the start there would skip dead iterations - confirm
    1516     4315476 :          DO
    1517     4315762 :             IF (SIZE(homo) == 1) THEN
    1518     3466062 :             IF (.NOT. my_open_shell_ss) THEN
    1519     1582084 :                num_IJ_blocks = (homo(1)/best_block_size)
    1520     1582084 :                num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
    1521             :             ELSE
    1522     1883978 :                num_IJ_blocks = ((homo(1) - 1)/best_block_size)
    1523     1883978 :                num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
    1524             :             END IF
    1525             :             ELSE
    1526     2549100 :             num_ij_blocks = PRODUCT(homo/best_block_size)
    1527             :             END IF
    1528             :             ! Accept the size once it yields at least ngroup (and more than zero) blocks, or when it has reached 1
    1529     4315762 :             IF ((num_IJ_blocks >= ngroup .AND. num_IJ_blocks > 0) .OR. best_block_size == 1) THEN
    1530             :                EXIT
    1531             :             ELSE
    1532     4315476 :                best_block_size = best_block_size - 1
    1533             :             END IF
    1534             :          END DO
    1535             : 
    1536         286 :          IF (SIZE(homo) == 1) THEN
    1537         226 :          IF (my_open_shell_ss) THEN
    1538             :             ! check that best_block_size is not bigger than sqrt(homo-1)
    1539             :             ! Diagonal elements do not have to be considered
    1540         120 :             best_block_size = MIN(FLOOR(SQRT(REAL(homo(1) - 1, KIND=dp))), best_block_size)
    1541             :          ELSE
    1542             :             ! check that best_block_size is not bigger than sqrt(homo)
    1543         106 :             best_block_size = MIN(FLOOR(SQRT(REAL(homo(1), KIND=dp))), best_block_size)
    1544             :          END IF
    1545             :          END IF
    1546             :       END IF
    1547         518 :       block_size = MAX(1, best_block_size)
    1548             : 
    1549         518 :       IF (unit_nr > 0) THEN
    1550             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    1551         259 :             "RI_INFO| Block size:", block_size
    1552         259 :          CALL m_flush(unit_nr)
    1553             :       END IF
    1554             : 
    1555             :       ! Determine recv buffer size (BI_C_recv, external_i_aL, external_ab)
    1556             :       buffer_size = MAX(INT(maxsize(gd_array), KIND=int_8)*block_size, INT(MAX(dimen_RI, MAXVAL(virtual)), KIND=int_8)) &
    1557        1726 :                     *MAXVAL(maxsize(gd_B_virtual))
    1558             :       ! The send buffer has the same size as the recv buffer
    1559         518 :       IF (calc_forces) buffer_size = buffer_size*2
    1560        1554 :       ALLOCATE (buffer_1D(buffer_size))
    1561             : 
    1562         518 :       CALL timestop(handle)
    1563             : 
    1564         518 :    END SUBROUTINE mp2_ri_get_block_size
    1565             : 
    1566             : ! **************************************************************************************************
    1567             : !> \brief ...
    1568             : !> \param mp2_env ...
    1569             : !> \param para_env_sub ...
    1570             : !> \param gd_B_virtual ...
    1571             : !> \param Eigenval ...
    1572             : !> \param homo ...
    1573             : !> \param dimen_RI ...
    1574             : !> \param iiB ...
    1575             : !> \param jjB ...
    1576             : !> \param my_B_size ...
    1577             : !> \param my_B_virtual_end ...
    1578             : !> \param my_B_virtual_start ...
    1579             : !> \param my_i ...
    1580             : !> \param my_j ...
    1581             : !> \param virtual ...
    1582             : !> \param local_ab ...
    1583             : !> \param t_ab ...
    1584             : !> \param my_local_i_aL ...
    1585             : !> \param my_local_j_aL ...
    1586             : !> \param open_ss ...
    1587             : !> \param Y_i_aP ...
    1588             : !> \param Y_j_aP ...
    1589             : !> \param local_ba ...
    1590             : !> \param ispin ...
    1591             : !> \param jspin ...
    1592             : !> \param dgemm_counter ...
    1593             : !> \param buffer_1D ...
    1594             : ! **************************************************************************************************
    1595        6276 :    SUBROUTINE mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
    1596        3138 :                                  Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
    1597        3138 :                                  my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, local_ab, &
    1598        3138 :                                  t_ab, my_local_i_aL, my_local_j_aL, open_ss, Y_i_aP, Y_j_aP, &
    1599        1569 :                                  local_ba, ispin, jspin, dgemm_counter, buffer_1D)
    1600             :       TYPE(mp2_type)                                     :: mp2_env
    1601             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env_sub
    1602             :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    1603             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
    1604             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo
    1605             :       INTEGER, INTENT(IN)                                :: dimen_RI, iiB, jjB
    1606             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_size, my_B_virtual_end, &
    1607             :                                                             my_B_virtual_start
    1608             :       INTEGER, INTENT(IN)                                :: my_i, my_j
    1609             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: virtual
    1610             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1611             :          INTENT(INOUT), TARGET                           :: local_ab
    1612             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1613             :          INTENT(IN), TARGET                              :: t_ab, my_local_i_aL, my_local_j_aL
    1614             :       LOGICAL, INTENT(IN)                                :: open_ss
    1615             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1616             :          INTENT(INOUT), TARGET                           :: Y_i_aP, Y_j_aP, local_ba
    1617             :       INTEGER, INTENT(IN)                                :: ispin, jspin
    1618             :       TYPE(dgemm_counter_type), INTENT(INOUT)            :: dgemm_counter
    1619             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    1620             : 
    1621             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_update_P_gamma'
    1622             : 
    1623             :       INTEGER :: a, b, b_global, handle, proc_receive, proc_send, proc_shift, rec_B_size, &
    1624             :          rec_B_virtual_end, rec_B_virtual_start, send_B_size, send_B_virtual_end, &
    1625             :          send_B_virtual_start
    1626             :       INTEGER(KIND=int_8)                                :: offset
    1627             :       LOGICAL                                            :: alpha_beta
    1628             :       REAL(KIND=dp)                                      :: factor, P_ij_diag
    1629             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1630        1569 :          POINTER                                         :: external_ab, send_ab
    1631             : 
    1632        1569 :       CALL timeset(routineN//"_Pia", handle)
    1633             : 
    1634        1569 :       alpha_beta = .NOT. (ispin == jspin)
    1635        1569 :       IF (open_ss) THEN
    1636             :          factor = 1.0_dp
    1637             :       ELSE
    1638        1189 :          factor = 2.0_dp
    1639             :       END IF
    1640             :       ! divide the (ia|jb) integrals by Delta_ij^ab
    1641       24660 :       DO b = 1, my_B_size(jspin)
    1642       23091 :          b_global = b + my_B_virtual_start(jspin) - 1
    1643      462933 :          DO a = 1, virtual(ispin)
    1644             :             local_ab(a, b) = -local_ab(a, b)/ &
    1645             :                              (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
    1646      461364 :                               Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
    1647             :          END DO
    1648             :       END DO
    1649        1569 :       IF (.NOT. (alpha_beta)) THEN
    1650      322531 :          P_ij_diag = -SUM(local_ab*t_ab)*factor
    1651             :       ELSE
    1652             :          ! update diagonal part of P_ij
    1653      140402 :          P_ij_diag = -SUM(local_ab*local_ab)*mp2_env%scale_S
    1654             :          ! More integrals needed only for alpha-beta case: local_ba
    1655        6939 :          DO b = 1, my_B_size(ispin)
    1656        6449 :             b_global = b + my_B_virtual_start(ispin) - 1
    1657      139874 :             DO a = 1, virtual(jspin)
    1658             :                local_ba(a, b) = -local_ba(a, b)/ &
    1659             :                                 (Eigenval(homo(jspin) + a, jspin) + Eigenval(homo(ispin) + b_global, ispin) - &
    1660      139384 :                                  Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
    1661             :             END DO
    1662             :          END DO
    1663             :       END IF
    1664             : 
    1665             :       ! P_ab and add diagonal part of P_ij
    1666             : 
    1667        1569 :       CALL dgemm_counter_start(dgemm_counter)
    1668        1569 :       IF (.NOT. (alpha_beta)) THEN
    1669             :          CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(ispin), 1.0_dp, &
    1670             :                                           t_ab, virtual(ispin), local_ab, virtual(ispin), &
    1671             :                                           1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, &
    1672        1079 :                                                                my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin))
    1673             :          mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
    1674        1079 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
    1675             :       ELSE
    1676             :          CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(jspin), mp2_env%scale_S, &
    1677             :                                           local_ba, virtual(jspin), local_ba, virtual(jspin), 1.0_dp, &
    1678         490 :                           mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin))
    1679             : 
    1680             :          mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
    1681         490 :             mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
    1682             : 
    1683             :          CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(jspin), my_B_size(jspin), virtual(ispin), mp2_env%scale_S, &
    1684             :                                           local_ab, virtual(ispin), local_ab, virtual(ispin), 1.0_dp, &
    1685         490 :                           mp2_env%ri_grad%P_ab(jspin)%array(:, my_B_virtual_start(jspin):my_B_virtual_end(jspin)), my_B_size(jspin))
    1686             : 
    1687             :          mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
    1688         490 :             mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
    1689             :       END IF
    1690             :       ! The summation is over unique pairs. In alpha-beta case, all pairs are unique: subroutine is called for
    1691             :       ! both i^alpha,j^beta and i^beta,j^alpha. Formally, my_i can be equal to my_j, but they are different
    1692             :       ! due to spin in alpha-beta case.
    1693        1569 :       IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1694             : 
    1695             :          CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), virtual(ispin), my_B_size(ispin), 1.0_dp, &
    1696             :                                           t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1697             :                                           local_ab, virtual(ispin), &
    1698         796 :                                           1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array, my_B_size(ispin))
    1699             : 
    1700             :          mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
    1701         796 :             mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
    1702             :       END IF
    1703        1741 :       DO proc_shift = 1, para_env_sub%num_pe - 1
    1704         172 :          proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1705         172 :          proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1706             : 
    1707         172 :          CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1708         172 :          CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1709             : 
    1710         172 :          external_ab(1:virtual(ispin), 1:rec_B_size) => buffer_1D(1:INT(virtual(ispin), int_8)*rec_B_size)
    1711       35072 :          external_ab = 0.0_dp
    1712             : 
    1713             :          CALL para_env_sub%sendrecv(local_ab, proc_send, &
    1714         172 :                                     external_ab, proc_receive)
    1715             : 
    1716         172 :          IF (.NOT. (alpha_beta)) THEN
    1717             :             CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(ispin), 1.0_dp, &
    1718             :                                              t_ab, virtual(ispin), external_ab, virtual(ispin), &
    1719         102 :                               1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin))
    1720             :          ELSE
    1721             :             CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(jspin), rec_B_size, virtual(ispin), mp2_env%scale_S, &
    1722             :                                              local_ab, virtual(ispin), external_ab, virtual(ispin), &
    1723             :                                              1.0_dp, mp2_env%ri_grad%P_ab(jspin)%array(:, rec_B_virtual_start:rec_B_virtual_end), &
    1724          70 :                                              my_B_size(jspin))
    1725             : 
    1726             :             ! For alpha-beta part of alpha-density we need a new parallel code
    1727             :             ! And new external_ab (of a different size)
    1728          70 :             CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1729          70 :             CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1730          70 :             external_ab(1:virtual(jspin), 1:rec_B_size) => buffer_1D(1:INT(virtual(jspin), int_8)*rec_B_size)
    1731       14700 :             external_ab = 0.0_dp
    1732             :             CALL para_env_sub%sendrecv(local_ba, proc_send, &
    1733          70 :                                        external_ab, proc_receive)
    1734             :             CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(jspin), mp2_env%scale_S, &
    1735             :                                              local_ba, virtual(jspin), external_ab, virtual(jspin), &
    1736          70 :                               1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin))
    1737             :          END IF
    1738             : 
    1739        1913 :          IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1740             :             external_ab(1:my_B_size(ispin), 1:virtual(ispin)) => &
    1741          86 :                buffer_1D(1:INT(virtual(ispin), int_8)*my_B_size(ispin))
    1742       18083 :             external_ab = 0.0_dp
    1743             : 
    1744          86 :             offset = INT(virtual(ispin), int_8)*my_B_size(ispin)
    1745             : 
    1746          86 :             send_ab(1:send_B_size, 1:virtual(ispin)) => buffer_1D(offset + 1:offset + INT(send_B_size, int_8)*virtual(ispin))
    1747       18083 :             send_ab = 0.0_dp
    1748             : 
    1749             :             CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, virtual(ispin), my_B_size(ispin), 1.0_dp, &
    1750             :                                              t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1751          86 :                                              local_ab, virtual(ispin), 0.0_dp, send_ab, send_B_size)
    1752             :             CALL para_env_sub%sendrecv(send_ab, proc_send, &
    1753          86 :                                        external_ab, proc_receive)
    1754             : 
    1755       18083 :             mp2_env%ri_grad%P_ab(ispin)%array(:, :) = mp2_env%ri_grad%P_ab(ispin)%array + external_ab
    1756             :          END IF
    1757             : 
    1758             :       END DO
    1759        1569 :       IF (.NOT. alpha_beta) THEN
    1760        1079 :          IF (my_i /= my_j) THEN
    1761         796 :             CALL dgemm_counter_stop(dgemm_counter, 2*my_B_size(ispin), virtual(ispin), virtual(ispin))
    1762             :          ELSE
    1763         283 :             CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), virtual(ispin), virtual(ispin))
    1764             :          END IF
    1765             :       ELSE
    1766        1470 :          CALL dgemm_counter_stop(dgemm_counter, SUM(my_B_size), virtual(ispin), virtual(jspin))
    1767             :       END IF
    1768        1569 :       CALL timestop(handle)
    1769             : 
    1770             :       ! Now, Gamma_P_ia (made of Y_ia_P)
    1771             : 
    1772        1569 :       CALL timeset(routineN//"_Gamma", handle)
    1773        1569 :       CALL dgemm_counter_start(dgemm_counter)
    1774        1569 :       IF (.NOT. alpha_beta) THEN
    1775             :          ! Alpha-alpha, beta-beta and closed shell
    1776             :          CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
    1777             :                                           t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1778        1079 :                                           my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin))
    1779             :       ELSE ! Alpha-beta
    1780             :          CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
    1781             :                                           local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1782         490 :                                           my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin))
    1783             :          CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(jspin), dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
    1784             :                                           local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1785         490 :                                           my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(jspin))
    1786             :       END IF
    1787             : 
    1788        1569 :       IF (para_env_sub%num_pe > 1) THEN
    1789         172 :          external_ab(1:my_B_size(ispin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(ispin), int_8)*dimen_RI)
    1790      189692 :          external_ab = 0.0_dp
    1791             : 
    1792         172 :          offset = INT(my_B_size(ispin), int_8)*dimen_RI
    1793             :       END IF
    1794             :       !
    1795        1741 :       DO proc_shift = 1, para_env_sub%num_pe - 1
    1796         172 :          proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1797         172 :          proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1798             : 
    1799         172 :          CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1800         172 :          CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1801             : 
    1802         172 :          send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
    1803      189692 :          send_ab = 0.0_dp
    1804        1913 :          IF (.NOT. alpha_beta) THEN
    1805             :             CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), 1.0_dp, &
    1806             :                                              t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1807         102 :                                              my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
    1808         102 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1809             : 
    1810      217544 :             Y_i_aP(:, :) = Y_i_aP + external_ab
    1811             : 
    1812             :          ELSE ! Alpha-beta case
    1813             :             ! Alpha-alpha part
    1814             :             CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
    1815             :                                              local_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1816          70 :                                              my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
    1817          70 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1818      161840 :             Y_i_aP(:, :) = Y_i_aP + external_ab
    1819             :          END IF
    1820             :       END DO
    1821             : 
    1822        1569 :       IF (alpha_beta) THEN
    1823             :          ! For beta-beta part (in alpha-beta case) we need a new parallel code
    1824         490 :          IF (para_env_sub%num_pe > 1) THEN
    1825          70 :             external_ab(1:my_B_size(jspin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(jspin), int_8)*dimen_RI)
    1826       88620 :             external_ab = 0.0_dp
    1827             : 
    1828          70 :             offset = INT(my_B_size(jspin), int_8)*dimen_RI
    1829             :          END IF
    1830         560 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    1831          70 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1832          70 :             proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1833             : 
    1834          70 :             CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    1835          70 :             send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
    1836       88620 :             send_ab = 0.0_dp
    1837             :             CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
    1838             :                                              local_ba(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
    1839          70 :                                              my_local_i_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
    1840          70 :             CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
    1841      177800 :             Y_j_aP(:, :) = Y_j_aP + external_ab
    1842             : 
    1843             :          END DO
    1844             : 
    1845             :          ! Here, we just use approximate bounds. For large systems virtual(ispin) is approx virtual(jspin), same for B_size
    1846         490 :          CALL dgemm_counter_stop(dgemm_counter, 3*virtual(ispin), dimen_RI, my_B_size(jspin))
    1847             :       ELSE
    1848        1079 :          CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), dimen_RI, my_B_size(ispin))
    1849             :       END IF
    1850             : 
    1851        1569 :       IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
    1852             :          ! Alpha-alpha, beta-beta and closed shell
    1853         796 :          CALL dgemm_counter_start(dgemm_counter)
    1854             :          CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
    1855             :                                           t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
    1856         796 :                                           my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin))
    1857         882 :          DO proc_shift = 1, para_env_sub%num_pe - 1
    1858          86 :             proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    1859          86 :             proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    1860             : 
    1861          86 :             CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    1862             : 
    1863          86 :             external_ab(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
    1864       86837 :             external_ab = 0.0_dp
    1865             : 
    1866             :             CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
    1867          86 :                                        external_ab, proc_receive)
    1868             : 
    1869             :             ! Alpha-alpha, beta-beta and closed shell
    1870             :             CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(ispin), dimen_RI, rec_B_size, 1.0_dp, &
    1871             :                                              t_ab(rec_B_virtual_start:rec_B_virtual_end, :), rec_B_size, &
    1872         968 :                                              external_ab, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin))
    1873             :          END DO
    1874             : 
    1875         796 :          CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), dimen_RI, virtual(ispin))
    1876             :       END IF
    1877             : 
    1878        1569 :       CALL timestop(handle)
    1879        1569 :    END SUBROUTINE mp2_update_P_gamma
    1880             : 
    1881             : ! **************************************************************************************************
    1882             : !> \brief ...
    1883             : !> \param Gamma_P_ia ...
    1884             : !> \param ij_index ...
    1885             : !> \param my_B_size ...
    1886             : !> \param my_block_size ...
    1887             : !> \param my_group_L_size ...
    1888             : !> \param my_i ...
    1889             : !> \param my_ij_pairs ...
    1890             : !> \param ngroup ...
    1891             : !> \param num_integ_group ...
    1892             : !> \param integ_group_pos2color_sub ...
    1893             : !> \param num_ij_pairs ...
    1894             : !> \param ij_map ...
    1895             : !> \param ranges_info_array ...
    1896             : !> \param Y_i_aP ...
    1897             : !> \param comm_exchange ...
    1898             : !> \param sizes_array ...
    1899             : !> \param spin ...
    1900             : !> \param buffer_1D ...
    1901             : ! **************************************************************************************************
    1902        3132 :    SUBROUTINE mp2_redistribute_gamma(Gamma_P_ia, ij_index, my_B_size, &
    1903             :                                      my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
    1904             :                                      num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
    1905        3132 :                                      ij_map, ranges_info_array, Y_i_aP, comm_exchange, &
    1906        3132 :                                      sizes_array, spin, buffer_1D)
    1907             : 
    1908             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT)   :: Gamma_P_ia
    1909             :       INTEGER, INTENT(IN)                                :: ij_index, my_B_size, my_block_size, &
    1910             :                                                             my_group_L_size, my_i, my_ij_pairs, &
    1911             :                                                             ngroup, num_integ_group
    1912             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: integ_group_pos2color_sub, num_ij_pairs
    1913             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(IN)  :: ij_map
    1914             :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    1915             :          INTENT(IN)                                      :: ranges_info_array
    1916             :       REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN)      :: Y_i_aP
    1917             :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    1918             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: sizes_array
    1919             :       INTEGER, INTENT(IN)                                :: spin
    1920             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    1921             : 
    1922             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_redistribute_gamma'
    1923             : 
    1924             :       INTEGER :: end_point, handle, handle2, iiB, ij_counter_rec, irep, kkk, lll, Lstart_pos, &
    1925             :          proc_receive, proc_send, proc_shift, rec_i, rec_ij_index, send_L_size, start_point, tag
    1926             :       INTEGER(KIND=int_8)                                :: offset
    1927             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
    1928        3132 :          POINTER                                         :: BI_C_rec, BI_C_send
    1929             : 
    1930             : ! In alpha-beta case Y_i_aP_beta is sent as Y_j_aP
    1931             : 
    1932        3132 :       CALL timeset(routineN//"_comm2", handle)
    1933             : 
    1934        3132 :       tag = 43
    1935             : 
    1936        3132 :       IF (ij_index <= my_ij_pairs) THEN
    1937             :          ! somethig to send
    1938             :          ! start with myself
    1939        3114 :          CALL timeset(routineN//"_comm2_w", handle2)
    1940        8924 :          DO irep = 0, num_integ_group - 1
    1941        5810 :             Lstart_pos = ranges_info_array(1, irep, comm_exchange%mepos)
    1942        5810 :             start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    1943        5810 :             end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    1944             : !$OMP PARALLEL DO DEFAULT(NONE) &
    1945             : !$OMP             PRIVATE(kkk,lll,iiB) &
    1946             : !$OMP             SHARED(start_point,end_point,Lstart_pos,my_block_size,&
    1947        8924 : !$OMP                    Gamma_P_ia,my_i,my_B_size,Y_i_aP)
    1948             :             DO kkk = start_point, end_point
    1949             :                lll = kkk - start_point + Lstart_pos
    1950             :                DO iiB = 1, my_block_size
    1951             :                   Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) = &
    1952             :                      Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) + &
    1953             :                      Y_i_aP(1:my_B_size, lll, iiB)
    1954             :                END DO
    1955             :             END DO
    1956             : !$OMP END PARALLEL DO
    1957             :          END DO
    1958        3114 :          CALL timestop(handle2)
    1959             : 
    1960             :          ! Y_i_aP(my_B_size,dimen_RI,block_size)
    1961             : 
    1962        3212 :          DO proc_shift = 1, comm_exchange%num_pe - 1
    1963          98 :             proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    1964          98 :             proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    1965             : 
    1966          98 :             send_L_size = sizes_array(proc_send)
    1967             :             BI_C_send(1:my_B_size, 1:my_block_size, 1:send_L_size) => &
    1968          98 :                buffer_1D(1:INT(my_B_size, int_8)*my_block_size*send_L_size)
    1969             : 
    1970          98 :             offset = INT(my_B_size, int_8)*my_block_size*send_L_size
    1971             : 
    1972          98 :             CALL timeset(routineN//"_comm2_w", handle2)
    1973       48692 :             BI_C_send = 0.0_dp
    1974         196 :             DO irep = 0, num_integ_group - 1
    1975          98 :                Lstart_pos = ranges_info_array(1, irep, proc_send)
    1976          98 :                start_point = ranges_info_array(3, irep, proc_send)
    1977          98 :                end_point = ranges_info_array(4, irep, proc_send)
    1978             : !$OMP PARALLEL DO DEFAULT(NONE) &
    1979             : !$OMP             PRIVATE(kkk,lll,iiB) &
    1980             : !$OMP             SHARED(start_point,end_point,Lstart_pos,my_block_size,&
    1981         196 : !$OMP                    BI_C_send,my_B_size,Y_i_aP)
    1982             :                DO kkk = start_point, end_point
    1983             :                   lll = kkk - start_point + Lstart_pos
    1984             :                   DO iiB = 1, my_block_size
    1985             :                      BI_C_send(1:my_B_size, iiB, kkk) = Y_i_aP(1:my_B_size, lll, iiB)
    1986             :                   END DO
    1987             :                END DO
    1988             : !$OMP END PARALLEL DO
    1989             :             END DO
    1990          98 :             CALL timestop(handle2)
    1991             : 
    1992          98 :             rec_ij_index = num_ij_pairs(proc_receive)
    1993             : 
    1994        3310 :             IF (ij_index <= rec_ij_index) THEN
    1995             :                ! we know that proc_receive has something to send for us, let's see what
    1996             :                ij_counter_rec = &
    1997          80 :                   (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
    1998             : 
    1999          80 :                rec_i = ij_map(spin, ij_counter_rec)
    2000             : 
    2001             :                BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
    2002          80 :                   buffer_1D(offset + 1:offset + INT(my_B_size, int_8)*my_block_size*my_group_L_size)
    2003       44250 :                BI_C_rec = 0.0_dp
    2004             : 
    2005             :                CALL comm_exchange%sendrecv(BI_C_send, proc_send, &
    2006          80 :                                            BI_C_rec, proc_receive, tag)
    2007             : 
    2008          80 :                CALL timeset(routineN//"_comm2_w", handle2)
    2009         160 :                DO irep = 0, num_integ_group - 1
    2010          80 :                   start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    2011          80 :                   end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    2012             : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
    2013             : !$OMP                    SHARED(start_point,end_point,my_block_size,&
    2014         160 : !$OMP                           Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
    2015             :                   Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
    2016             :                      Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
    2017             :                      BI_C_rec(1:my_B_size, :, start_point:end_point)
    2018             : !$OMP END PARALLEL WORKSHARE
    2019             :                END DO
    2020          80 :                CALL timestop(handle2)
    2021             : 
    2022             :             ELSE
    2023             :                ! we have something to send but nothing to receive
    2024          18 :                CALL comm_exchange%send(BI_C_send, proc_send, tag)
    2025             : 
    2026             :             END IF
    2027             : 
    2028             :          END DO
    2029             : 
    2030             :       ELSE
    2031             :          ! noting to send check if we have to receive
    2032          36 :          DO proc_shift = 1, comm_exchange%num_pe - 1
    2033          18 :             proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2034          18 :             proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2035          18 :             rec_ij_index = num_ij_pairs(proc_receive)
    2036             : 
    2037          36 :             IF (ij_index <= rec_ij_index) THEN
    2038             :                ! we know that proc_receive has something to send for us, let's see what
    2039             :                ij_counter_rec = &
    2040          18 :                   (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
    2041             : 
    2042          18 :                rec_i = ij_map(spin, ij_counter_rec)
    2043             : 
    2044             :                BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
    2045          18 :                   buffer_1D(1:INT(my_B_size, int_8)*my_block_size*my_group_L_size)
    2046             : 
    2047        4442 :                BI_C_rec = 0.0_dp
    2048             : 
    2049          18 :                CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2050             : 
    2051          18 :                CALL timeset(routineN//"_comm2_w", handle2)
    2052          36 :                DO irep = 0, num_integ_group - 1
    2053          18 :                   start_point = ranges_info_array(3, irep, comm_exchange%mepos)
    2054          18 :                   end_point = ranges_info_array(4, irep, comm_exchange%mepos)
    2055             : #if defined(__INTEL_LLVM_COMPILER)
    2056             :                   Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
    2057             :                      Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
    2058             :                      BI_C_rec(1:my_B_size, :, start_point:end_point)
    2059             : #else
    2060             : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
    2061             : !$OMP                    SHARED(start_point,end_point,my_block_size,&
    2062          36 : !$OMP                           Gamma_P_ia,rec_i,my_B_size,BI_C_rec)
    2063             :                   Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
    2064             :                      Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
    2065             :                      BI_C_rec(1:my_B_size, :, start_point:end_point)
    2066             : !$OMP END PARALLEL WORKSHARE
    2067             : #endif
    2068             :                END DO
    2069          18 :                CALL timestop(handle2)
    2070             : 
    2071             :             END IF
    2072             :          END DO
    2073             : 
    2074             :       END IF
    2075        3132 :       CALL timestop(handle)
    2076             : 
    2077        3132 :    END SUBROUTINE mp2_redistribute_gamma
    2078             : 
    2079             : ! **************************************************************************************************
    2080             : !> \brief ...
    2081             : !> \param mp2_env ...
    2082             : !> \param Eigenval ...
    2083             : !> \param homo ...
    2084             : !> \param virtual ...
    2085             : !> \param open_shell ...
    2086             : !> \param beta_beta ...
    2087             : !> \param Bib_C ...
    2088             : !> \param unit_nr ...
    2089             : !> \param dimen_RI ...
    2090             : !> \param my_B_size ...
    2091             : !> \param ngroup ...
    2092             : !> \param my_group_L_size ...
    2093             : !> \param color_sub ...
    2094             : !> \param ranges_info_array ...
    2095             : !> \param comm_exchange ...
    2096             : !> \param para_env_sub ...
    2097             : !> \param para_env ...
    2098             : !> \param my_B_virtual_start ...
    2099             : !> \param my_B_virtual_end ...
    2100             : !> \param sizes_array ...
    2101             : !> \param gd_B_virtual ...
    2102             : !> \param integ_group_pos2color_sub ...
    2103             : !> \param dgemm_counter ...
    2104             : !> \param buffer_1D ...
    2105             : ! **************************************************************************************************
    2106         372 :    SUBROUTINE quasi_degenerate_P_ij(mp2_env, Eigenval, homo, virtual, open_shell, &
    2107         372 :                                     beta_beta, Bib_C, unit_nr, dimen_RI, &
    2108         372 :                                     my_B_size, ngroup, my_group_L_size, &
    2109             :                                     color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
    2110         372 :                                     my_B_virtual_start, my_B_virtual_end, sizes_array, gd_B_virtual, &
    2111         372 :                                     integ_group_pos2color_sub, dgemm_counter, buffer_1D)
    2112             :       TYPE(mp2_type)                                     :: mp2_env
    2113             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(IN)         :: Eigenval
    2114             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: homo, virtual
    2115             :       LOGICAL, INTENT(IN)                                :: open_shell, beta_beta
    2116             :       TYPE(three_dim_real_array), DIMENSION(:), &
    2117             :          INTENT(IN)                                      :: BIb_C
    2118             :       INTEGER, INTENT(IN)                                :: unit_nr, dimen_RI
    2119             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_size
    2120             :       INTEGER, INTENT(IN)                                :: ngroup, my_group_L_size, color_sub
    2121             :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
    2122             :          INTENT(IN)                                      :: ranges_info_array
    2123             :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    2124             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env_sub, para_env
    2125             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: my_B_virtual_start, my_B_virtual_end
    2126             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: sizes_array
    2127             :       TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
    2128             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN)     :: integ_group_pos2color_sub
    2129             :       TYPE(dgemm_counter_type), INTENT(INOUT)            :: dgemm_counter
    2130             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET    :: buffer_1D
    2131             : 
    2132             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'quasi_degenerate_P_ij'
    2133             : 
    2134             :       INTEGER :: a, a_global, b, b_global, block_size, decil, handle, handle2, ijk_counter, &
    2135             :          ijk_counter_send, ijk_index, ispin, kkB, kspin, max_block_size, max_ijk, my_i, my_ijk, &
    2136             :          my_j, my_k, my_last_k(2), my_virtual, nspins, proc_receive, proc_send, proc_shift, &
    2137             :          rec_B_size, rec_B_virtual_end, rec_B_virtual_start, rec_L_size, send_B_size, &
    2138             :          send_B_virtual_end, send_B_virtual_start, send_i, send_ijk_index, send_j, send_k, &
    2139             :          size_B_i, size_B_k, tag, tag2
    2140         372 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: num_ijk
    2141         372 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: ijk_map, send_last_k
    2142             :       LOGICAL                                            :: alpha_beta, do_recv_i, do_recv_j, &
    2143             :                                                             do_recv_k, do_send_i, do_send_j, &
    2144             :                                                             do_send_k
    2145             :       REAL(KIND=dp)                                      :: amp_fac, P_ij_elem, t_new, t_start
    2146             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
    2147         372 :          TARGET                                          :: local_ab, local_aL_i, local_aL_j, t_ab
    2148         372 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: local_aL_k
    2149         372 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: BI_C_rec, external_ab, external_aL
    2150         372 :       REAL(KIND=dp), DIMENSION(:, :, :), POINTER         :: BI_C_rec_3D
    2151             : 
    2152         372 :       CALL timeset(routineN//"_ij_sing", handle)
    2153             : 
    2154         372 :       tag = 44
    2155         372 :       tag2 = 45
    2156             : 
    2157         372 :       nspins = SIZE(BIb_C)
    2158         372 :       alpha_beta = (nspins == 2)
    2159             : 
    2160             :       ! Set amplitude factor
    2161         372 :       amp_fac = mp2_env%scale_S + mp2_env%scale_T
    2162         372 :       IF (open_shell) amp_fac = mp2_env%scale_T
    2163             : 
    2164         770 :       ALLOCATE (send_last_k(2, comm_exchange%num_pe - 1))
    2165             : 
    2166             :       ! Loop(s) over orbital triplets
    2167         820 :       DO ispin = 1, nspins
    2168         448 :          size_B_i = my_B_size(ispin)
    2169         448 :          IF (ispin == 1 .AND. alpha_beta) THEN
    2170             :             kspin = 2
    2171             :          ELSE
    2172         372 :             kspin = 1
    2173             :          END IF
    2174         448 :          size_B_k = my_B_size(kspin)
    2175             : 
    2176             :          ! Find the number of quasi-degenerate orbitals and orbital triplets
    2177             : 
    2178             :          CALL Find_quasi_degenerate_ij(my_ijk, homo(ispin), homo(kspin), Eigenval(:, ispin), mp2_env, ijk_map, unit_nr, ngroup, &
    2179             :                                        .NOT. beta_beta .AND. ispin /= 2, comm_exchange, num_ijk, max_ijk, color_sub, &
    2180         600 :                                        SIZE(buffer_1D), my_group_L_size, size_B_k, para_env, virtual(ispin), size_B_i)
    2181             : 
    2182         448 :          my_virtual = virtual(ispin)
    2183         448 :          IF (SIZE(ijk_map, 2) > 0) THEN
    2184          90 :             max_block_size = ijk_map(4, 1)
    2185             :          ELSE
    2186             :             max_block_size = 1
    2187             :          END IF
    2188             : 
    2189        1792 :          ALLOCATE (local_aL_i(dimen_RI, size_B_i))
    2190        1344 :          ALLOCATE (local_aL_j(dimen_RI, size_B_i))
    2191        2240 :          ALLOCATE (local_aL_k(dimen_RI, size_B_k, max_block_size))
    2192        1792 :          ALLOCATE (t_ab(my_virtual, size_B_k))
    2193             : 
    2194        1344 :          my_last_k = -1
    2195         538 :          send_last_k = -1
    2196             : 
    2197         448 :          t_start = m_walltime()
    2198         594 :          DO ijk_index = 1, max_ijk
    2199             : 
    2200             :             ! Prediction is unreliable if we are in the first step of the loop
    2201         146 :             IF (unit_nr > 0 .AND. ijk_index > 1) THEN
    2202          18 :                decil = ijk_index*10/max_ijk
    2203          18 :                IF (decil /= (ijk_index - 1)*10/max_ijk) THEN
    2204          18 :                   t_new = m_walltime()
    2205          18 :                   t_new = (t_new - t_start)/60.0_dp*(max_ijk - ijk_index + 1)/(ijk_index - 1)
    2206             :                   WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
    2207          18 :                      cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
    2208          18 :                   CALL m_flush(unit_nr)
    2209             :                END IF
    2210             :             END IF
    2211             : 
    2212         594 :             IF (ijk_index <= my_ijk) THEN
    2213             :                ! work to be done
    2214         144 :                ijk_counter = (ijk_index - MIN(1, color_sub))*ngroup + color_sub
    2215         144 :                my_i = ijk_map(1, ijk_counter)
    2216         144 :                my_j = ijk_map(2, ijk_counter)
    2217         144 :                my_k = ijk_map(3, ijk_counter)
    2218         144 :                block_size = ijk_map(4, ijk_counter)
    2219             : 
    2220         144 :                do_recv_i = (ispin /= kspin) .OR. my_i < my_k .OR. my_i > my_k + block_size - 1
    2221         144 :                do_recv_j = (ispin /= kspin) .OR. my_j < my_k .OR. my_j > my_k + block_size - 1
    2222         144 :                do_recv_k = my_k /= my_last_k(1) .OR. my_k + block_size - 1 /= my_last_k(2)
    2223         144 :                my_last_k(1) = my_k
    2224         144 :                my_last_k(2) = my_k + block_size - 1
    2225             : 
    2226      127614 :                local_aL_i = 0.0_dp
    2227         144 :                IF (do_recv_i) THEN
    2228             :                   CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, comm_exchange%mepos), &
    2229         117 :                                           BIb_C(ispin)%array(:, :, my_i))
    2230             :                END IF
    2231             : 
    2232      127614 :                local_aL_j = 0.0_dp
    2233         144 :                IF (do_recv_j) THEN
    2234             :                   CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, comm_exchange%mepos), &
    2235         117 :                                           BIb_C(ispin)%array(:, :, my_j))
    2236             :                END IF
    2237             : 
    2238         144 :                IF (do_recv_k) THEN
    2239      219805 :                   local_aL_k = 0.0_dp
    2240             :                   CALL fill_local_i_aL(local_aL_k(:, :, 1:block_size), ranges_info_array(:, :, comm_exchange%mepos), &
    2241         136 :                                        BIb_C(kspin)%array(:, :, my_k:my_k + block_size - 1))
    2242             :                END IF
    2243             : 
    2244         144 :                CALL timeset(routineN//"_comm", handle2)
    2245         154 :                DO proc_shift = 1, comm_exchange%num_pe - 1
    2246          10 :                   proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2247          10 :                   proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2248             : 
    2249          10 :                   send_ijk_index = num_ijk(proc_send)
    2250             : 
    2251          10 :                   rec_L_size = sizes_array(proc_receive)
    2252          10 :                   BI_C_rec(1:rec_L_size, 1:size_B_i) => buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_i)
    2253             : 
    2254          10 :                   do_send_i = .FALSE.
    2255          10 :                   do_send_j = .FALSE.
    2256          10 :                   do_send_k = .FALSE.
    2257          10 :                   IF (ijk_index <= send_ijk_index) THEN
    2258             :                      ! something to send
    2259             :                      ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))* &
    2260           8 :                                         ngroup + integ_group_pos2color_sub(proc_send)
    2261           8 :                      send_i = ijk_map(1, ijk_counter_send)
    2262           8 :                      send_j = ijk_map(2, ijk_counter_send)
    2263           8 :                      send_k = ijk_map(3, ijk_counter_send)
    2264             : 
    2265           8 :                      do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
    2266           8 :                      do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
    2267           8 :                      do_send_k = send_k /= send_last_k(1, proc_shift) .OR. send_k + block_size - 1 /= send_last_k(2, proc_shift)
    2268           8 :                      send_last_k(1, proc_shift) = send_k
    2269           8 :                      send_last_k(2, proc_shift) = send_k + block_size - 1
    2270             :                   END IF
    2271             : 
    2272             :                   ! occupied i
    2273         722 :                   BI_C_rec = 0.0_dp
    2274          10 :                   IF (do_send_i) THEN
    2275           6 :                   IF (do_recv_i) THEN
    2276             :                      CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i), proc_send, &
    2277         288 :                                                  BI_C_rec, proc_receive, tag)
    2278             :                   ELSE
    2279           2 :                      CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
    2280             :                   END IF
    2281           4 :                   ELSE IF (do_recv_i) THEN
    2282         580 :                   CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2283             :                   END IF
    2284          10 :                   IF (do_recv_i) THEN
    2285           8 :                      CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, proc_receive), BI_C_rec)
    2286             :                   END IF
    2287             : 
    2288             :                   ! occupied j
    2289         722 :                   BI_C_rec = 0.0_dp
    2290          10 :                   IF (do_send_j) THEN
    2291           8 :                   IF (do_recv_j) THEN
    2292             :                      CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_j), proc_send, &
    2293         576 :                                                  BI_C_rec, proc_receive, tag)
    2294             :                   ELSE
    2295           0 :                      CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
    2296             :                   END IF
    2297           2 :                   ELSE IF (do_recv_j) THEN
    2298           0 :                   CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
    2299             :                   END IF
    2300           8 :                   IF (do_recv_j) THEN
    2301           8 :                      CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, proc_receive), BI_C_rec)
    2302             :                   END IF
    2303             : 
    2304             :                   ! occupied k
    2305             :                   BI_C_rec_3D(1:rec_L_size, 1:size_B_k, 1:block_size) => &
    2306          10 :                      buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_k*block_size)
    2307          10 :                   IF (do_send_k) THEN
    2308           8 :                   IF (do_recv_k) THEN
    2309             :                      CALL comm_exchange%sendrecv(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, &
    2310         726 :                                                  BI_C_rec_3D, proc_receive, tag)
    2311             :                   ELSE
    2312           0 :                      CALL comm_exchange%send(BI_C_rec, proc_receive, tag)
    2313             :                   END IF
    2314           2 :                   ELSE IF (do_recv_k) THEN
    2315         294 :                   CALL comm_exchange%recv(BI_C_rec_3D, proc_receive, tag)
    2316             :                   END IF
    2317         154 :                   IF (do_recv_k) THEN
    2318          10 :                      CALL fill_local_i_aL(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, proc_receive), BI_C_rec_3D)
    2319             :                   END IF
    2320             :                END DO
    2321             : 
    2322       20172 :                IF (.NOT. do_recv_i) local_aL_i(:, :) = local_aL_k(:, :, my_i - my_k + 1)
    2323       20172 :                IF (.NOT. do_recv_j) local_aL_j(:, :) = local_aL_k(:, :, my_j - my_k + 1)
    2324         144 :                CALL timestop(handle2)
    2325             : 
    2326             :                ! expand integrals
    2327         352 :                DO kkB = 1, block_size
    2328         208 :                   CALL timeset(routineN//"_exp_ik", handle2)
    2329         208 :                   CALL dgemm_counter_start(dgemm_counter)
    2330         624 :                   ALLOCATE (local_ab(my_virtual, size_B_k))
    2331       32408 :                   local_ab = 0.0_dp
    2332             :                   CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
    2333             :                                                    local_aL_i, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2334         208 :                                           0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i)
    2335         208 :                   DO proc_shift = 1, para_env_sub%num_pe - 1
    2336           0 :                      proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2337           0 :                      proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2338             : 
    2339           0 :                      CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2340             : 
    2341           0 :                      external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
    2342             : 
    2343             :                      CALL comm_exchange%sendrecv(local_aL_i, proc_send, &
    2344           0 :                                                  external_aL, proc_receive, tag)
    2345             : 
    2346             :                      CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
    2347             :                                                       external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2348         208 :                                                     0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size)
    2349             :                   END DO
    2350         208 :                   CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
    2351         208 :                   CALL timestop(handle2)
    2352             : 
    2353             :                   ! Amplitudes
    2354         208 :                   CALL timeset(routineN//"_tab", handle2)
    2355       32408 :                   t_ab = 0.0_dp
    2356             :                   ! Alpha-alpha, beta-beta and closed shell
    2357         208 :                   IF (.NOT. alpha_beta) THEN
    2358        1125 :                      DO b = 1, size_B_k
    2359        1014 :                         b_global = b + my_B_virtual_start(1) - 1
    2360       17135 :                         DO a = 1, my_B_size(1)
    2361       16010 :                            a_global = a + my_B_virtual_start(1) - 1
    2362             :                            t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
    2363             :                                                (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
    2364       17024 :                                                 - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
    2365             :                         END DO
    2366             :                      END DO
    2367             :                   ELSE
    2368         999 :                      DO b = 1, size_B_k
    2369         902 :                         b_global = b + my_B_virtual_start(kspin) - 1
    2370       15273 :                         DO a = 1, my_B_size(ispin)
    2371       14274 :                            a_global = a + my_B_virtual_start(ispin) - 1
    2372             :                            t_ab(a_global, b) = mp2_env%scale_S*local_ab(a_global, b)/ &
    2373             :                                                (Eigenval(my_i, ispin) + Eigenval(my_k + kkB - 1, kspin) &
    2374       15176 :                                                 - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
    2375             :                         END DO
    2376             :                      END DO
    2377             :                   END IF
    2378             : 
    2379         208 :                   IF (.NOT. alpha_beta) THEN
    2380         111 :                      DO proc_shift = 1, para_env_sub%num_pe - 1
    2381           0 :                         proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2382           0 :                         proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2383           0 :                         CALL get_group_dist(gd_B_virtual(1), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2384           0 :                         CALL get_group_dist(gd_B_virtual(1), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
    2385             : 
    2386           0 :                         external_ab(1:size_B_i, 1:rec_B_size) => buffer_1D(1:INT(size_B_i, KIND=int_8)*rec_B_size)
    2387             :                         CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:size_B_k), proc_send, &
    2388           0 :                                                    external_ab(1:size_B_i, 1:rec_B_size), proc_receive, tag)
    2389             : 
    2390         111 :                         DO b = 1, my_B_size(1)
    2391           0 :                            b_global = b + my_B_virtual_start(1) - 1
    2392           0 :                            DO a = 1, rec_B_size
    2393           0 :                               a_global = a + rec_B_virtual_start - 1
    2394             :                               t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
    2395             :                                                   (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
    2396           0 :                                                    - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
    2397             :                            END DO
    2398             :                         END DO
    2399             :                      END DO
    2400             :                   END IF
    2401         208 :                   CALL timestop(handle2)
    2402             : 
    2403             :                   ! Expand the second set of integrals
    2404         208 :                   CALL timeset(routineN//"_exp_jk", handle2)
    2405       32408 :                   local_ab = 0.0_dp
    2406         208 :                   CALL dgemm_counter_start(dgemm_counter)
    2407             :                   CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
    2408             :                                                    local_aL_j, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2409         208 :                                           0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i)
    2410         208 :                   DO proc_shift = 1, para_env_sub%num_pe - 1
    2411           0 :                      proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
    2412           0 :                      proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
    2413             : 
    2414           0 :                      CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
    2415             : 
    2416           0 :                      external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
    2417             : 
    2418             :                      CALL comm_exchange%sendrecv(local_aL_j, proc_send, &
    2419           0 :                                                  external_aL, proc_receive, tag)
    2420             :                      CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
    2421             :                                                       external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
    2422         208 :                                                     0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size)
    2423             :                   END DO
    2424         208 :                   CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
    2425         208 :                   CALL timestop(handle2)
    2426             : 
    2427         208 :                   CALL timeset(routineN//"_Pij", handle2)
    2428        2124 :                   DO b = 1, size_B_k
    2429        1916 :                      b_global = b + my_B_virtual_start(kspin) - 1
    2430       32408 :                      DO a = 1, my_B_size(ispin)
    2431       30284 :                         a_global = a + my_B_virtual_start(ispin) - 1
    2432             :                         local_ab(a_global, b) = &
    2433             :                            local_ab(a_global, b)/(Eigenval(my_j, ispin) + Eigenval(my_k + kkB - 1, kspin) &
    2434       32200 :                                                 - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
    2435             :                      END DO
    2436             :                   END DO
    2437             :                   !
    2438       32408 :                   P_ij_elem = SUM(local_ab*t_ab)
    2439         208 :                   DEALLOCATE (local_ab)
    2440         208 :                   IF ((.NOT. open_shell) .AND. (.NOT. alpha_beta)) THEN
    2441           4 :                      P_ij_elem = P_ij_elem*2.0_dp
    2442             :                   END IF
    2443         208 :                   IF (beta_beta) THEN
    2444          31 :                      mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) - P_ij_elem
    2445          31 :                      mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) - P_ij_elem
    2446             :                   ELSE
    2447         177 :                      mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) - P_ij_elem
    2448         177 :                      mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) - P_ij_elem
    2449             :                   END IF
    2450        1184 :                   CALL timestop(handle2)
    2451             :                END DO
    2452             :             ELSE
    2453           2 :                CALL timeset(routineN//"_comm", handle2)
    2454             :                ! no work to be done, possible messages to be exchanged
    2455           4 :                DO proc_shift = 1, comm_exchange%num_pe - 1
    2456           2 :                   proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
    2457           2 :                   proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
    2458             : 
    2459           2 :                   send_ijk_index = num_ijk(proc_send)
    2460             : 
    2461           4 :                   IF (ijk_index <= send_ijk_index) THEN
    2462             :                      ! something to send
    2463             :                      ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
    2464           2 :                                         integ_group_pos2color_sub(proc_send)
    2465           2 :                      send_i = ijk_map(1, ijk_counter_send)
    2466           2 :                      send_j = ijk_map(2, ijk_counter_send)
    2467           2 :                      send_k = ijk_map(3, ijk_counter_send)
    2468           2 :                      block_size = ijk_map(4, ijk_counter_send)
    2469             : 
    2470           2 :                      do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
    2471           2 :                      do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
    2472             :                      ! occupied i
    2473           2 :                      IF (do_send_i) THEN
    2474           2 :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
    2475             :                      END IF
    2476             :                      ! occupied j
    2477           2 :                      IF (do_send_j) THEN
    2478           0 :                         CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
    2479             :                      END IF
    2480             :                      ! occupied k
    2481           2 :                      CALL comm_exchange%send(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, tag)
    2482             :                   END IF
    2483             : 
    2484             :                END DO ! proc loop
    2485           2 :                CALL timestop(handle2)
    2486             :             END IF
    2487             :          END DO ! ijk_index loop
    2488         448 :          DEALLOCATE (local_aL_i)
    2489         448 :          DEALLOCATE (local_aL_j)
    2490         448 :          DEALLOCATE (local_aL_k)
    2491         448 :          DEALLOCATE (t_ab)
    2492         820 :          DEALLOCATE (ijk_map)
    2493             :       END DO ! over number of loops (ispin)
    2494         372 :       CALL timestop(handle)
    2495             : 
    2496         744 :    END SUBROUTINE Quasi_degenerate_P_ij
    2497             : 
    2498             : ! **************************************************************************************************
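    2499             : !> \brief Finds the quasi-degenerate occupied pairs (i<j), picks a block size for k and builds the map of ijk work items
    2500             : !> \param my_ijk counts the entries of ijk_map whose counter satisfies MOD(counter, ngroup) == color_sub
    2501             : !> \param homo upper bound of the loops over the pair indices i and j
    2502             : !> \param homo_beta upper bound of the loop over the third index k
    2503             : !> \param Eigenval eigenvalues compared pairwise against mp2_env%ri_grad%eps_canonical
    2504             : !> \param mp2_env provides the threshold ri_grad%eps_canonical
    2505             : !> \param ijk_map for every work item: i (row 1), j (row 2), first k (row 3) and number of k in the block (row 4)
    2506             : !> \param unit_nr output unit, information is printed only if unit_nr > 0
    2507             : !> \param ngroup modulus used to distribute the work items (presumably the number of groups)
    2508             : !> \param do_print_alpha if true the plain message is printed, otherwise the "(spin beta)" variant
    2509             : !> \param comm_exchange communicator whose size (num_pe) determines the extent of num_ijk
    2510             : !> \param num_ijk allocated with bounds 0:comm_exchange%num_pe-1 (presumably the number of work items per process)
    2511             : !> \param max_ijk ...
    2512             : !> \param color_sub remainder identifying the work items counted in my_ijk
    2513             : !> \param buffer_size gives the first upper limit of the block size: buffer_size/(my_group_L_size*B_size_k)
    2514             : !> \param my_group_L_size enters the buffer and memory estimates of the block size
    2515             : !> \param B_size_k enters the buffer and memory estimates of the block size
    2516             : !> \param para_env used to reduce the maximum block size with para_env%min
    2517             : !> \param virtual enters the memory estimate (size of local_ab)
    2518             : !> \param B_size_i enters the memory estimate (size of local_aL_i)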
    2519             : ! **************************************************************************************************
    2520         448 :    SUBROUTINE Find_quasi_degenerate_ij(my_ijk, homo, homo_beta, Eigenval, mp2_env, ijk_map, unit_nr, ngroup, &
    2521             :                                        do_print_alpha, comm_exchange, num_ijk, max_ijk, color_sub, &
    2522             :                                        buffer_size, my_group_L_size, B_size_k, para_env, virtual, B_size_i)
    2523             : 
    2524             :       INTEGER, INTENT(OUT)                               :: my_ijk
    2525             :       INTEGER, INTENT(IN)                                :: homo, homo_beta
    2526             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: Eigenval
    2527             :       TYPE(mp2_type), INTENT(IN)                         :: mp2_env
    2528             :       INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ijk_map
    2529             :       INTEGER, INTENT(IN)                                :: unit_nr, ngroup
    2530             :       LOGICAL, INTENT(IN)                                :: do_print_alpha
    2531             :       TYPE(mp_comm_type), INTENT(IN)                     :: comm_exchange
    2532             :       INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT)    :: num_ijk
    2533             :       INTEGER, INTENT(OUT)                               :: max_ijk
    2534             :       INTEGER, INTENT(IN)                                :: color_sub, buffer_size, my_group_L_size, &
    2535             :                                                             B_size_k
    2536             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    2537             :       INTEGER, INTENT(IN)                                :: virtual, B_size_i
    2538             : 
    2539             :       INTEGER :: block_size, communication_steps, communication_volume, iib, ij_counter, &
    2540             :          ijk_counter, jjb, kkb, max_block_size, max_num_k_blocks, min_communication_volume, &
    2541             :          my_steps, num_k_blocks, num_sing_ij, total_ijk
    2542             :       INTEGER(KIND=int_8)                                :: mem
    2543         448 :       LOGICAL, ALLOCATABLE, DIMENSION(:, :)              :: ijk_marker
    2544             : 
    2545        1344 :       ALLOCATE (num_ijk(0:comm_exchange%num_pe - 1))
    2546             : 
    2547         448 :       num_sing_ij = 0
    2548        2026 :       DO iiB = 1, homo
    2549             :          ! diagonal elements already updated
    2550        4230 :          DO jjB = iiB + 1, homo
    2551        2204 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) < mp2_env%ri_grad%eps_canonical) &
    2552        1684 :                num_sing_ij = num_sing_ij + 1
    2553             :          END DO
    2554             :       END DO
    2555             : 
    2556         448 :       IF (unit_nr > 0) THEN
    2557         224 :       IF (do_print_alpha) THEN
    2558             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2559         148 :             "MO_INFO| Number of ij pairs below EPS_CANONICAL:", num_sing_ij
    2560             :       ELSE
    2561             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2562          76 :             "MO_INFO| Number of ij pairs (spin beta) below EPS_CANONICAL:", num_sing_ij
    2563             :       END IF
    2564             :       END IF
    2565             : 
    2566             :       ! Determine the block size, first guess: use available buffer
    2567         448 :       max_block_size = buffer_size/(my_group_L_size*B_size_k)
    2568             : 
    2569             :       ! Second limit: memory
    2570         448 :       CALL m_memory(mem)
    2571             :       ! Convert to number of doubles
    2572         448 :       mem = mem/8
    2573             :       ! Remove local_ab (2x) and local_aL_i (2x)
    2574         448 :       mem = mem - 2_int_8*(virtual*B_size_k + B_size_i*my_group_L_size)
    2575         448 :       max_block_size = MIN(max_block_size, MAX(1, INT(mem/(my_group_L_size*B_size_k), KIND(max_block_size))))
    2576             : 
    2577             :       ! Exchange the limit
    2578         448 :       CALL para_env%min(max_block_size)
    2579             : 
    2580             :       ! Find now the block size which minimizes the communication volume and then the number of communication steps
    2581         448 :       block_size = 1
    2582         448 :       min_communication_volume = 3*homo_beta*num_sing_ij
    2583         448 :       communication_steps = 3*homo_beta*num_sing_ij
    2584        1136 :       DO iiB = max_block_size, 2, -1
    2585         688 :          max_num_k_blocks = homo_beta/iiB*num_sing_ij
    2586         688 :          num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
    2587         688 :          communication_volume = num_k_blocks*(2 + iiB) + 3*(homo_beta*num_sing_ij - iiB*num_k_blocks)
    2588         688 :          my_steps = num_k_blocks + homo_beta*num_sing_ij - iiB*num_k_blocks
    2589        1136 :          IF (communication_volume < min_communication_volume) THEN
    2590          48 :             block_size = iiB
    2591          48 :             min_communication_volume = communication_volume
    2592          48 :             communication_steps = my_steps
    2593         640 :          ELSE IF (communication_volume == min_communication_volume .AND. my_steps < communication_steps) THEN
    2594          52 :             block_size = iiB
    2595          52 :             communication_steps = my_steps
    2596             :          END IF
    2597             :       END DO
    2598             : 
    2599         448 :       IF (unit_nr > 0) THEN
    2600             :          WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
    2601         224 :             "MO_INFO| Block size:", block_size
    2602         224 :          CALL m_flush(unit_nr)
    2603             :       END IF
    2604             : 
    2605             :       ! Calculate number of large blocks
    2606         448 :       max_num_k_blocks = homo_beta/block_size*num_sing_ij
    2607         448 :       num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
    2608             : 
    2609         448 :       total_ijk = num_k_blocks + homo_beta*num_sing_ij - num_k_blocks*block_size
    2610         986 :       ALLOCATE (ijk_map(4, total_ijk))
    2611        1888 :       ijk_map = 0
    2612        1434 :       ALLOCATE (ijk_marker(homo_beta, num_sing_ij))
    2613         970 :       ijk_marker = .TRUE.
    2614             : 
    2615         448 :       my_ijk = 0
    2616         448 :       ijk_counter = 0
    2617         448 :       ij_counter = 0
    2618        2026 :       DO iiB = 1, homo
    2619             :          ! diagonal elements already updated
    2620        4230 :          DO jjB = iiB + 1, homo
    2621        2204 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
    2622         106 :             ij_counter = ij_counter + 1
    2623        1812 :             DO kkB = 1, homo_beta - MOD(homo_beta, block_size), block_size
    2624         162 :                IF (ijk_counter + 1 > num_k_blocks) EXIT
    2625         128 :                ijk_counter = ijk_counter + 1
    2626         384 :                ijk_marker(kkB:kkB + block_size - 1, ij_counter) = .FALSE.
    2627         128 :                ijk_map(1, ijk_counter) = iiB
    2628         128 :                ijk_map(2, ijk_counter) = jjB
    2629         128 :                ijk_map(3, ijk_counter) = kkB
    2630         128 :                ijk_map(4, ijk_counter) = block_size
    2631        2332 :                IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
    2632             :             END DO
    2633             :          END DO
    2634             :       END DO
    2635             :       ij_counter = 0
    2636        2026 :       DO iiB = 1, homo
    2637             :          ! diagonal elements already updated
    2638        4230 :          DO jjB = iiB + 1, homo
    2639        2204 :             IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
    2640         106 :             ij_counter = ij_counter + 1
    2641        2100 :             DO kkB = 1, homo_beta
    2642        2620 :                IF (ijk_marker(kkB, ij_counter)) THEN
    2643         160 :                   ijk_counter = ijk_counter + 1
    2644         160 :                   ijk_map(1, ijk_counter) = iiB
    2645         160 :                   ijk_map(2, ijk_counter) = jjB
    2646         160 :                   ijk_map(3, ijk_counter) = kkB
    2647         160 :                   ijk_map(4, ijk_counter) = 1
    2648         160 :                   IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
    2649             :                END IF
    2650             :             END DO
    2651             :          END DO
    2652             :       END DO
    2653             : 
    2654         448 :       DEALLOCATE (ijk_marker)
    2655             : 
    2656         448 :       CALL comm_exchange%allgather(my_ijk, num_ijk)
    2657         926 :       max_ijk = MAXVAL(num_ijk)
    2658             : 
    2659         448 :    END SUBROUTINE Find_quasi_degenerate_ij
    2660             : 
    2661             : END MODULE mp2_ri_gpw

Generated by: LCOV version 1.15