Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Routines to calculate RI-GPW-MP2 energy using pw
10 : !> \par History
11 : !> 06.2012 created [Mauro Del Ben]
12 : !> 03.2019 Refactored from mp2_ri_gpw [Frederick Stein]
13 : ! **************************************************************************************************
14 : MODULE mp2_ri_gpw
15 : USE cp_log_handling, ONLY: cp_to_string
16 : USE dgemm_counter_types, ONLY: dgemm_counter_init,&
17 : dgemm_counter_start,&
18 : dgemm_counter_stop,&
19 : dgemm_counter_type,&
20 : dgemm_counter_write
21 : USE group_dist_types, ONLY: get_group_dist,&
22 : group_dist_d1_type,&
23 : maxsize,&
24 : release_group_dist
25 : USE kinds, ONLY: dp,&
26 : int_8
27 : USE libint_2c_3c, ONLY: compare_potential_types
28 : USE local_gemm_api, ONLY: LOCAL_GEMM_PU_GPU
29 : USE machine, ONLY: m_flush,&
30 : m_memory,&
31 : m_walltime
32 : USE message_passing, ONLY: mp_comm_type,&
33 : mp_para_env_type
34 : USE mp2_ri_grad_util, ONLY: complete_gamma
35 : USE mp2_types, ONLY: mp2_type,&
36 : three_dim_real_array
37 :
38 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num
39 : #include "./base/base_uses.f90"
40 :
41 : IMPLICIT NONE
42 :
43 : PRIVATE
44 :
45 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'mp2_ri_gpw'
46 :
47 : PUBLIC :: mp2_ri_gpw_compute_en
48 :
49 : CONTAINS
50 :
51 : ! **************************************************************************************************
52 : !> \brief ...
53 : !> \param Emp2_Cou ...
54 : !> \param Emp2_EX ...
55 : !> \param Emp2_S ...
56 : !> \param Emp2_T ...
57 : !> \param BIb_C ...
58 : !> \param mp2_env ...
59 : !> \param para_env ...
60 : !> \param para_env_sub ...
61 : !> \param color_sub ...
62 : !> \param gd_array ...
63 : !> \param gd_B_virtual ...
64 : !> \param Eigenval ...
65 : !> \param nmo ...
66 : !> \param homo ...
67 : !> \param dimen_RI ...
68 : !> \param unit_nr ...
69 : !> \param calc_forces ...
70 : !> \param calc_ex ...
71 : ! **************************************************************************************************
72 1050 : SUBROUTINE mp2_ri_gpw_compute_en(Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T, BIb_C, mp2_env, para_env, para_env_sub, color_sub, &
73 350 : gd_array, gd_B_virtual, &
74 350 : Eigenval, nmo, homo, dimen_RI, unit_nr, calc_forces, calc_ex)
75 : REAL(KIND=dp), INTENT(INOUT) :: Emp2_Cou, Emp2_EX, Emp2_S, Emp2_T
76 : TYPE(three_dim_real_array), DIMENSION(:), &
77 : INTENT(INOUT) :: BIb_C
78 : TYPE(mp2_type) :: mp2_env
79 : TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
80 : INTEGER, INTENT(IN) :: color_sub
81 : TYPE(group_dist_d1_type), INTENT(INOUT) :: gd_array
82 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
83 : INTEGER, INTENT(IN) :: nmo
84 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
85 : TYPE(group_dist_d1_type), DIMENSION(SIZE(homo)), &
86 : INTENT(INOUT) :: gd_B_virtual
87 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
88 : LOGICAL, INTENT(IN) :: calc_forces, calc_ex
89 :
90 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_gpw_compute_en'
91 :
92 : INTEGER :: a, a_global, b, b_global, block_size, decil, end_point, handle, handle2, handle3, &
93 : iiB, ij_counter, ij_counter_send, ij_index, integ_group_size, ispin, jjB, jspin, &
94 : max_ij_pairs, my_block_size, my_group_L_end, my_group_L_size, my_group_L_size_orig, &
95 : my_group_L_start, my_i, my_ij_pairs, my_j, my_new_group_L_size, ngroup, nspins, &
96 : num_integ_group, proc_receive, proc_send, proc_shift, rec_B_size, rec_B_virtual_end, &
97 : rec_B_virtual_start, rec_L_size, send_B_size, send_B_virtual_end, send_B_virtual_start, &
98 : send_i, send_ij_index, send_j, start_point, tag, total_ij_pairs
99 350 : INTEGER, ALLOCATABLE, DIMENSION(:) :: integ_group_pos2color_sub, my_B_size, &
100 350 : my_B_virtual_end, my_B_virtual_start, num_ij_pairs, sizes_array_orig, virtual
101 350 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ij_map
102 350 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: ranges_info_array
103 : LOGICAL :: my_alpha_beta_case, my_beta_beta_case, &
104 : my_open_shell_SS
105 : REAL(KIND=dp) :: amp_fac, my_Emp2_Cou, my_Emp2_EX, &
106 : sym_fac, t_new, t_start
107 350 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: buffer_1D
108 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
109 350 : TARGET :: local_ab, local_ba, t_ab
110 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
111 350 : TARGET :: local_i_aL, local_j_aL, Y_i_aP, Y_j_aP
112 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
113 350 : POINTER :: external_ab, external_i_aL
114 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
115 350 : POINTER :: BI_C_rec
116 : TYPE(dgemm_counter_type) :: dgemm_counter
117 : TYPE(mp_comm_type) :: comm_exchange, comm_rep
118 : TYPE(three_dim_real_array), ALLOCATABLE, &
119 350 : DIMENSION(:) :: B_ia_Q
120 :
121 350 : CALL timeset(routineN, handle)
122 :
123 350 : nspins = SIZE(homo)
124 :
125 1050 : ALLOCATE (virtual(nspins))
126 788 : virtual(:) = nmo - homo(:)
127 :
128 1400 : ALLOCATE (my_B_size(nspins), my_B_virtual_start(nspins), my_B_virtual_end(nspins))
129 788 : DO ispin = 1, nspins
130 : CALL get_group_dist(gd_B_virtual(ispin), para_env_sub%mepos, &
131 788 : my_B_virtual_start(ispin), my_B_virtual_end(ispin), my_B_size(ispin))
132 : END DO
133 :
134 350 : CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
135 :
136 350 : CALL dgemm_counter_init(dgemm_counter, unit_nr, mp2_env%ri_mp2%print_dgemm_info)
137 :
138 : ! local_gemm_ctx has a very footprint the first time this routine is
139 : ! called.
140 350 : CALL mp2_env%local_gemm_ctx%create(LOCAL_GEMM_PU_GPU)
141 350 : CALL mp2_env%local_gemm_ctx%set_op_threshold_gpu(128*128*128*2)
142 :
143 : CALL mp2_ri_get_integ_group_size( &
144 : mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
145 : homo, dimen_RI, unit_nr, &
146 : integ_group_size, ngroup, &
147 350 : num_integ_group, virtual, calc_forces)
148 :
149 : ! now create a group that contains all the proc that have the same virtual starting point
150 : ! in the integ group
151 : CALL mp2_ri_create_group( &
152 : para_env, para_env_sub, color_sub, &
153 : gd_array%sizes, calc_forces, &
154 : integ_group_size, my_group_L_end, &
155 : my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
156 : integ_group_pos2color_sub, sizes_array_orig, &
157 350 : ranges_info_array, comm_exchange, comm_rep, num_integ_group)
158 :
159 : ! We cannot fix the tag because of the recv routine
160 350 : tag = 42
161 :
162 788 : DO jspin = 1, nspins
163 :
164 : CALL replicate_iaK_2intgroup(BIb_C(jspin)%array, comm_exchange, comm_rep, &
165 : homo(jspin), gd_array%sizes, my_B_size(jspin), &
166 438 : my_group_L_size, ranges_info_array)
167 :
168 1314 : DO ispin = 1, jspin
169 :
170 526 : IF (unit_nr > 0) THEN
171 263 : IF (nspins == 1) THEN
172 131 : WRITE (unit_nr, *) "Start loop run"
173 132 : ELSE IF (ispin == 1 .AND. jspin == 1) THEN
174 44 : WRITE (unit_nr, *) "Start loop run alpha-alpha"
175 88 : ELSE IF (ispin == 1 .AND. jspin == 2) THEN
176 44 : WRITE (unit_nr, *) "Start loop run alpha-beta"
177 44 : ELSE IF (ispin == 2 .AND. jspin == 2) THEN
178 44 : WRITE (unit_nr, *) "Start loop run beta-beta"
179 : END IF
180 263 : CALL m_flush(unit_nr)
181 : END IF
182 :
183 526 : my_open_shell_SS = (nspins == 2) .AND. (ispin == jspin)
184 :
185 : ! t_ab = amp_fac*(:,a|:,b)-(:,b|:,a)
186 : ! If we calculate the gradient we need to distinguish
187 : ! between alpha-alpha and beta-beta cases for UMP2
188 :
189 526 : my_beta_beta_case = .FALSE.
190 526 : my_alpha_beta_case = .FALSE.
191 526 : IF (ispin /= jspin) THEN
192 88 : my_alpha_beta_case = .TRUE.
193 438 : ELSE IF (my_open_shell_SS) THEN
194 176 : IF (ispin == 2) my_beta_beta_case = .TRUE.
195 : END IF
196 :
197 526 : amp_fac = mp2_env%scale_S + mp2_env%scale_T
198 526 : IF (my_alpha_beta_case .OR. my_open_shell_SS) amp_fac = mp2_env%scale_T
199 :
200 : CALL mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
201 526 : my_group_L_size, calc_forces, ispin, jspin, local_ba)
202 :
203 : CALL mp2_ri_get_block_size( &
204 : mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual(ispin:jspin), &
205 : homo(ispin:jspin), virtual(ispin:jspin), dimen_RI, unit_nr, block_size, &
206 526 : ngroup, num_integ_group, my_open_shell_ss, calc_forces, buffer_1D)
207 :
208 : ! *****************************************************************
209 : ! ********** REPLICATION-BLOCKED COMMUNICATION SCHEME ***********
210 : ! *****************************************************************
211 : ! introduce block size, the number of occupied orbitals has to be a
212 : ! multiple of the block size
213 :
214 : ! Calculate the maximum number of ij pairs that have to be computed
215 : ! among groups
216 : CALL mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo(ispin), homo(jspin), &
217 526 : block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
218 :
219 1578 : ALLOCATE (num_ij_pairs(0:comm_exchange%num_pe - 1))
220 526 : CALL comm_exchange%allgather(my_ij_pairs, num_ij_pairs)
221 :
222 1162 : max_ij_pairs = MAXVAL(num_ij_pairs)
223 :
224 : ! start real stuff
225 : CALL mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, local_i_aL, &
226 526 : local_j_aL, calc_forces, Y_i_aP, Y_j_aP, ispin, jspin)
227 :
228 526 : CALL timeset(routineN//"_RI_loop", handle2)
229 526 : my_Emp2_Cou = 0.0_dp
230 526 : my_Emp2_EX = 0.0_dp
231 526 : t_start = m_walltime()
232 2548 : DO ij_index = 1, max_ij_pairs
233 :
234 : ! Prediction is unreliable if we are in the first step of the loop
235 2022 : IF (unit_nr > 0 .AND. ij_index > 1) THEN
236 734 : decil = ij_index*10/max_ij_pairs
237 734 : IF (decil /= (ij_index - 1)*10/max_ij_pairs) THEN
238 693 : t_new = m_walltime()
239 693 : t_new = (t_new - t_start)/60.0_dp*(max_ij_pairs - ij_index + 1)/(ij_index - 1)
240 : WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
241 693 : cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
242 693 : CALL m_flush(unit_nr)
243 : END IF
244 : END IF
245 :
246 2022 : IF (calc_forces) THEN
247 2806803 : Y_i_aP = 0.0_dp
248 2909153 : Y_j_aP = 0.0_dp
249 : END IF
250 :
251 2022 : IF (ij_index <= my_ij_pairs) THEN
252 : ! We have work to do
253 1973 : ij_counter = (ij_index - MIN(1, color_sub))*ngroup + color_sub
254 1973 : my_i = ij_map(1, ij_counter)
255 1973 : my_j = ij_map(2, ij_counter)
256 1973 : my_block_size = ij_map(3, ij_counter)
257 :
258 3289405 : local_i_aL = 0.0_dp
259 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
260 1973 : BIb_C(ispin)%array(:, :, my_i:my_i + my_block_size - 1))
261 :
262 3405175 : local_j_aL = 0.0_dp
263 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, comm_exchange%mepos), &
264 1973 : BIb_C(jspin)%array(:, :, my_j:my_j + my_block_size - 1))
265 :
266 : ! collect data from other proc
267 1973 : CALL timeset(routineN//"_comm", handle3)
268 2082 : DO proc_shift = 1, comm_exchange%num_pe - 1
269 109 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
270 109 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
271 :
272 109 : send_ij_index = num_ij_pairs(proc_send)
273 :
274 109 : CALL get_group_dist(gd_array, proc_receive, sizes=rec_L_size)
275 :
276 2082 : IF (ij_index <= send_ij_index) THEN
277 : ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
278 60 : integ_group_pos2color_sub(proc_send)
279 60 : send_i = ij_map(1, ij_counter_send)
280 60 : send_j = ij_map(2, ij_counter_send)
281 :
282 : ! occupied i
283 : BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
284 60 : buffer_1D(1:rec_L_size*my_B_size(ispin)*my_block_size)
285 43353 : BI_C_rec = 0.0_dp
286 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
287 60 : proc_send, BI_C_rec, proc_receive, tag)
288 :
289 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
290 60 : BI_C_rec(:, 1:my_B_size(ispin), :))
291 :
292 : ! occupied j
293 : BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
294 60 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
295 44373 : BI_C_rec = 0.0_dp
296 : CALL comm_exchange%sendrecv(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
297 60 : proc_send, BI_C_rec, proc_receive, tag)
298 :
299 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
300 60 : BI_C_rec(:, 1:my_B_size(jspin), :))
301 :
302 : ELSE
303 : ! we send nothing while we know that we have to receive something
304 :
305 : ! occupied i
306 : BI_C_rec(1:rec_L_size, 1:my_B_size(ispin), 1:my_block_size) => &
307 49 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(ispin)*my_block_size)
308 9124 : BI_C_rec = 0.0_dp
309 49 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
310 :
311 : CALL fill_local_i_aL(local_i_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
312 49 : BI_C_rec(:, 1:my_B_size(ispin), 1:my_block_size))
313 :
314 : ! occupied j
315 : BI_C_rec(1:rec_L_size, 1:my_B_size(jspin), 1:my_block_size) => &
316 49 : buffer_1D(1:INT(rec_L_size, int_8)*my_B_size(jspin)*my_block_size)
317 9124 : BI_C_rec = 0.0_dp
318 49 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
319 :
320 : CALL fill_local_i_aL(local_j_aL(:, :, 1:my_block_size), ranges_info_array(:, :, proc_receive), &
321 49 : BI_C_rec(:, 1:my_B_size(jspin), 1:my_block_size))
322 :
323 : END IF
324 :
325 : END DO
326 :
327 1973 : CALL timestop(handle3)
328 :
329 : ! loop over the block elements
330 3950 : DO iiB = 1, my_block_size
331 5935 : DO jjB = 1, my_block_size
332 1985 : CALL timeset(routineN//"_expansion", handle3)
333 3962 : ASSOCIATE (my_local_i_aL => local_i_aL(:, :, iiB), my_local_j_aL => local_j_aL(:, :, jjB))
334 :
335 : ! calculate the integrals (ia|jb) strating from my local data ...
336 579179 : local_ab = 0.0_dp
337 1985 : IF ((my_alpha_beta_case) .AND. (calc_forces)) THEN
338 140066 : local_ba = 0.0_dp
339 : END IF
340 1985 : CALL dgemm_counter_start(dgemm_counter)
341 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(jspin), dimen_RI, 1.0_dp, &
342 : my_local_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
343 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), &
344 1985 : my_B_size(ispin))
345 : ! Additional integrals only for alpha_beta case and forces
346 1985 : IF (my_alpha_beta_case .AND. calc_forces) THEN
347 : local_ba(my_B_virtual_start(jspin):my_B_virtual_end(jspin), :) = &
348 133101 : TRANSPOSE(local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :))
349 : END IF
350 : ! ... and from the other of my subgroup
351 2259 : DO proc_shift = 1, para_env_sub%num_pe - 1
352 274 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
353 274 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
354 :
355 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, &
356 274 : rec_B_virtual_end, rec_B_size)
357 :
358 274 : external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
359 267944 : external_i_aL = 0.0_dp
360 :
361 : CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
362 274 : external_i_aL, proc_receive, tag)
363 :
364 : CALL mp2_env%local_gemm_ctx%gemm( &
365 : 'T', 'N', rec_B_size, my_B_size(jspin), dimen_RI, 1.0_dp, &
366 : external_i_aL, dimen_RI, my_local_j_aL, dimen_RI, &
367 274 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(jspin)), rec_B_size)
368 :
369 : ! Additional integrals only for alpha_beta case and forces
370 2533 : IF (my_alpha_beta_case .AND. calc_forces) THEN
371 :
372 : CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, &
373 70 : rec_B_virtual_end, rec_B_size)
374 :
375 70 : external_i_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
376 81655 : external_i_aL = 0.0_dp
377 :
378 : CALL para_env_sub%sendrecv(my_local_j_aL, proc_send, &
379 70 : external_i_aL, proc_receive, tag)
380 :
381 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, my_B_size(ispin), dimen_RI, 1.0_dp, &
382 : external_i_aL, dimen_RI, my_local_i_aL, dimen_RI, &
383 70 : 0.0_dp, local_ba(rec_B_virtual_start:rec_B_virtual_end, 1:my_B_size(ispin)), rec_B_size)
384 : END IF
385 : END DO
386 1985 : IF (my_alpha_beta_case .AND. calc_forces) THEN
387 : ! Is just an approximation, but the call does not allow it, it ought to be (virtual_i*B_size_j+virtual_j*B_size_i)*dimen_RI
388 502 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(ispin) + my_B_size(jspin), dimen_RI)
389 : ELSE
390 1483 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), my_B_size(jspin), dimen_RI)
391 : END IF
392 1985 : CALL timestop(handle3)
393 :
394 : !sample peak memory
395 1985 : CALL m_memory()
396 :
397 1985 : CALL timeset(routineN//"_ener", handle3)
398 : ! calculate coulomb only MP2
399 1985 : sym_fac = 2.0_dp
400 1985 : IF (my_i == my_j) sym_fac = 1.0_dp
401 1985 : IF (my_alpha_beta_case) sym_fac = 0.5_dp
402 30558 : DO b = 1, my_B_size(jspin)
403 28573 : b_global = b + my_B_virtual_start(jspin) - 1
404 579179 : DO a = 1, virtual(ispin)
405 : my_Emp2_Cou = my_Emp2_Cou - sym_fac*2.0_dp*local_ab(a, b)**2/ &
406 : (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
407 577194 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
408 : END DO
409 : END DO
410 :
411 1985 : IF (calc_ex) THEN
412 : ! contract integrals with orbital energies for exchange MP2 energy
413 : ! starting with local ...
414 323629 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) t_ab = 0.0_dp
415 29958 : DO b = 1, my_B_size(ispin)
416 27973 : b_global = b + my_B_virtual_start(ispin) - 1
417 541867 : DO a = 1, my_B_size(ispin)
418 511909 : a_global = a + my_B_virtual_start(ispin) - 1
419 : my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*local_ab(b_global, a)/ &
420 : (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
421 511909 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
422 539882 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) THEN
423 : t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
424 : (Eigenval(homo(ispin) + a_global, ispin) + &
425 : Eigenval(homo(ispin) + b_global, ispin) - &
426 295900 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
427 : END IF
428 : END DO
429 : END DO
430 : ! ... and then with external data
431 2259 : DO proc_shift = 1, para_env_sub%num_pe - 1
432 274 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
433 274 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
434 :
435 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, &
436 274 : rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
437 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, &
438 274 : send_B_virtual_start, send_B_virtual_end, send_B_size)
439 :
440 : external_ab(1:my_B_size(ispin), 1:rec_B_size) => &
441 274 : buffer_1D(1:INT(rec_B_size, int_8)*my_B_size(ispin))
442 30405 : external_ab = 0.0_dp
443 :
444 : CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:my_B_size(ispin)), proc_send, &
445 60536 : external_ab(1:my_B_size(ispin), 1:rec_B_size), proc_receive, tag)
446 :
447 5264 : DO b = 1, my_B_size(ispin)
448 2731 : b_global = b + my_B_virtual_start(ispin) - 1
449 30405 : DO a = 1, rec_B_size
450 27400 : a_global = a + rec_B_virtual_start - 1
451 : my_Emp2_Ex = my_Emp2_Ex + sym_fac*local_ab(a_global, b)*external_ab(b, a)/ &
452 : (Eigenval(homo(ispin) + a_global, ispin) + Eigenval(homo(ispin) + b_global, ispin) - &
453 27400 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
454 27400 : IF (calc_forces .AND. (.NOT. my_alpha_beta_case)) &
455 : t_ab(a_global, b) = -(amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
456 : (Eigenval(homo(ispin) + a_global, ispin) + &
457 : Eigenval(homo(ispin) + b_global, ispin) - &
458 12311 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, ispin))
459 : END DO
460 : END DO
461 : END DO
462 : END IF
463 1985 : CALL timestop(handle3)
464 :
465 3970 : IF (calc_forces) THEN
466 : ! update P_ab, Gamma_P_ia
467 : CALL mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
468 : Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
469 : my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, &
470 : local_ab, t_ab, my_local_i_aL, my_local_j_aL, &
471 : my_open_shell_ss, Y_i_aP(:, :, iiB), Y_j_aP(:, :, jjB), local_ba, &
472 1600 : ispin, jspin, dgemm_counter, buffer_1D)
473 :
474 : END IF
475 :
476 : END ASSOCIATE
477 :
478 : END DO ! jjB
479 : END DO ! iiB
480 :
481 : ELSE
482 : ! We need it later in case of gradients
483 49 : my_block_size = 1
484 :
485 49 : CALL timeset(routineN//"_comm", handle3)
486 : ! No work to do and we know that we have to receive nothing, but send something
487 : ! send data to other proc
488 98 : DO proc_shift = 1, comm_exchange%num_pe - 1
489 49 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
490 49 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
491 :
492 49 : send_ij_index = num_ij_pairs(proc_send)
493 :
494 98 : IF (ij_index <= send_ij_index) THEN
495 : ! something to send
496 : ij_counter_send = (ij_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
497 49 : integ_group_pos2color_sub(proc_send)
498 49 : send_i = ij_map(1, ij_counter_send)
499 49 : send_j = ij_map(2, ij_counter_send)
500 :
501 : ! occupied i
502 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i:send_i + my_block_size - 1), &
503 49 : proc_send, tag)
504 : ! occupied j
505 : CALL comm_exchange%send(BIb_C(jspin)%array(:, :, send_j:send_j + my_block_size - 1), &
506 49 : proc_send, tag)
507 : END IF
508 : END DO
509 49 : CALL timestop(handle3)
510 : END IF
511 :
512 : ! redistribute gamma
513 2548 : IF (calc_forces) THEN
514 : CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(ispin)%array, ij_index, my_B_size(ispin), &
515 : my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
516 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
517 : ij_map, ranges_info_array, Y_i_aP(:, :, 1:my_block_size), comm_exchange, &
518 1597 : gd_array%sizes, 1, buffer_1D)
519 : CALL mp2_redistribute_gamma(mp2_env%ri_grad%Gamma_P_ia(jspin)%array, ij_index, my_B_size(jspin), &
520 : my_block_size, my_group_L_size, my_j, my_ij_pairs, ngroup, &
521 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
522 : ij_map, ranges_info_array, Y_j_aP(:, :, 1:my_block_size), comm_exchange, &
523 1597 : gd_array%sizes, 2, buffer_1D)
524 : END IF
525 :
526 : END DO
527 526 : CALL timestop(handle2)
528 :
529 526 : DEALLOCATE (local_i_aL)
530 526 : DEALLOCATE (local_j_aL)
531 526 : DEALLOCATE (ij_map)
532 526 : DEALLOCATE (num_ij_pairs)
533 526 : DEALLOCATE (local_ab)
534 :
535 526 : IF (calc_forces) THEN
536 380 : DEALLOCATE (Y_i_aP)
537 380 : DEALLOCATE (Y_j_aP)
538 380 : IF (ALLOCATED(t_ab)) THEN
539 302 : DEALLOCATE (t_ab)
540 : END IF
541 380 : DEALLOCATE (local_ba)
542 :
543 : ! here we check if there are almost degenerate ij
544 : ! pairs and we update P_ij with these contribution.
545 : ! If all pairs are degenerate with each other this step will scale O(N^6),
546 : ! if the number of degenerate pairs scales linearly with the system size
547 : ! this step will scale O(N^5).
548 : ! Start counting the number of almost degenerate ij pairs according
549 : ! to eps_canonical
550 : CALL quasi_degenerate_P_ij( &
551 : mp2_env, Eigenval(:, ispin:jspin), homo(ispin:jspin), virtual(ispin:jspin), my_open_shell_ss, &
552 : my_beta_beta_case, Bib_C(ispin:jspin), unit_nr, dimen_RI, &
553 : my_B_size(ispin:jspin), ngroup, my_group_L_size, &
554 : color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
555 : my_B_virtual_start(ispin:jspin), my_B_virtual_end(ispin:jspin), gd_array%sizes, gd_B_virtual(ispin:jspin), &
556 380 : integ_group_pos2color_sub, dgemm_counter, buffer_1D)
557 :
558 : END IF
559 :
560 526 : DEALLOCATE (buffer_1D)
561 :
562 : ! Dereplicate BIb_C and Gamma_P_ia to save memory
563 : ! These matrices will not be needed in that fashion anymore
564 : ! B_ia_Q will needed later
565 526 : IF (calc_forces .AND. jspin == nspins) THEN
566 1052 : IF (.NOT. ALLOCATED(B_ia_Q)) ALLOCATE (B_ia_Q(nspins))
567 1510 : ALLOCATE (B_ia_Q(ispin)%array(homo(ispin), my_B_size(ispin), my_group_L_size_orig))
568 892583 : B_ia_Q(ispin)%array = 0.0_dp
569 1388 : DO jjB = 1, homo(ispin)
570 17032 : DO iiB = 1, my_B_size(ispin)
571 : B_ia_Q(ispin)%array(jjB, iiB, 1:my_group_L_size_orig) = &
572 712410 : BIb_C(ispin)%array(1:my_group_L_size_orig, iiB, jjB)
573 : END DO
574 : END DO
575 302 : DEALLOCATE (BIb_C(ispin)%array)
576 :
577 : ! sum Gamma and dereplicate
578 1510 : ALLOCATE (BIb_C(ispin)%array(my_B_size(ispin), homo(ispin), my_group_L_size_orig))
579 574 : DO proc_shift = 1, comm_rep%num_pe - 1
580 : ! invert order
581 272 : proc_send = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
582 272 : proc_receive = MODULO(comm_rep%mepos + proc_shift, comm_rep%num_pe)
583 :
584 272 : start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
585 272 : end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
586 :
587 : CALL comm_rep%sendrecv(mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, start_point:end_point), &
588 272 : proc_send, BIb_C(ispin)%array, proc_receive, tag)
589 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
590 574 : !$OMP SHARED(mp2_env,BIb_C,ispin,homo,my_B_size,my_group_L_size_orig)
591 : mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) = &
592 : mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig) &
593 : + BIb_C(ispin)%array(:, :, :)
594 : !$OMP END PARALLEL WORKSHARE
595 : END DO
596 :
597 757898 : BIb_C(ispin)%array(:, :, :) = mp2_env%ri_grad%Gamma_P_ia(ispin)%array(:, :, 1:my_group_L_size_orig)
598 302 : DEALLOCATE (mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
599 302 : CALL MOVE_ALLOC(BIb_C(ispin)%array, mp2_env%ri_grad%Gamma_P_ia(ispin)%array)
600 224 : ELSE IF (jspin == nspins) THEN
601 136 : DEALLOCATE (BIb_C(ispin)%array)
602 : END IF
603 :
604 526 : CALL para_env%sum(my_Emp2_Cou)
605 526 : CALL para_env%sum(my_Emp2_Ex)
606 :
607 2016 : IF (my_open_shell_SS .OR. my_alpha_beta_case) THEN
608 264 : IF (my_alpha_beta_case) THEN
609 88 : Emp2_S = Emp2_S + my_Emp2_Cou
610 88 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
611 : ELSE
612 176 : my_Emp2_Cou = my_Emp2_Cou*0.25_dp
613 176 : my_Emp2_EX = my_Emp2_EX*0.5_dp
614 176 : Emp2_T = Emp2_T + my_Emp2_Cou + my_Emp2_EX
615 176 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
616 176 : Emp2_EX = Emp2_EX + my_Emp2_EX
617 : END IF
618 : ELSE
619 262 : Emp2_Cou = Emp2_Cou + my_Emp2_Cou
620 262 : Emp2_EX = Emp2_EX + my_Emp2_EX
621 : END IF
622 : END DO
623 :
624 : END DO
625 :
626 350 : DEALLOCATE (integ_group_pos2color_sub)
627 350 : DEALLOCATE (ranges_info_array)
628 :
629 350 : CALL comm_exchange%free()
630 350 : CALL comm_rep%free()
631 :
632 350 : IF (calc_forces) THEN
633 : ! recover original information (before replication)
634 224 : DEALLOCATE (gd_array%sizes)
635 224 : iiB = SIZE(sizes_array_orig)
636 672 : ALLOCATE (gd_array%sizes(0:iiB - 1))
637 666 : gd_array%sizes(:) = sizes_array_orig
638 224 : DEALLOCATE (sizes_array_orig)
639 :
640 : ! Remove replication from BIb_C and reorder the matrix
641 224 : my_group_L_size = my_group_L_size_orig
642 :
643 : ! B_ia_Q(ispin)%array will be deallocated inside of complete_gamma
644 526 : DO ispin = 1, nspins
645 : CALL complete_gamma(mp2_env, B_ia_Q(ispin)%array, dimen_RI, homo(ispin), &
646 : virtual(ispin), para_env, para_env_sub, ngroup, &
647 : my_group_L_size, my_group_L_start, my_group_L_end, &
648 : my_B_size(ispin), my_B_virtual_start(ispin), &
649 : gd_array, gd_B_virtual(ispin), &
650 526 : ispin)
651 : END DO
652 526 : DEALLOCATE (B_ia_Q)
653 :
654 43774 : IF (nspins == 1) mp2_env%ri_grad%P_ab(1)%array(:, :) = mp2_env%ri_grad%P_ab(1)%array(:, :)*2.0_dp
655 : BLOCK
656 : TYPE(mp_comm_type) :: comm
657 224 : CALL comm%from_split(para_env, para_env_sub%mepos)
658 526 : DO ispin = 1, nspins
659 : ! P_ab is only replicated over all subgroups
660 302 : CALL comm%sum(mp2_env%ri_grad%P_ab(ispin)%array)
661 : ! P_ij is replicated over all processes
662 526 : CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
663 : END DO
664 448 : CALL comm%free()
665 : END BLOCK
666 : END IF
667 :
668 350 : CALL release_group_dist(gd_array)
669 788 : DO ispin = 1, nspins
670 438 : IF (ALLOCATED(BIb_C(ispin)%array)) DEALLOCATE (BIb_C(ispin)%array)
671 788 : CALL release_group_dist(gd_B_virtual(ispin))
672 : END DO
673 :
674 : ! We do not need this matrix later, so deallocate it here to safe memory
675 350 : IF (calc_forces) DEALLOCATE (mp2_env%ri_grad%PQ_half)
676 350 : IF (calc_forces .AND. .NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) &
677 8 : DEALLOCATE (mp2_env%ri_grad%operator_half)
678 :
679 350 : CALL dgemm_counter_write(dgemm_counter, para_env)
680 :
681 : ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
682 350 : CALL mp2_env%local_gemm_ctx%destroy()
683 350 : CALL timestop(handle)
684 :
685 1400 : END SUBROUTINE mp2_ri_gpw_compute_en
686 :
687 : ! **************************************************************************************************
688 : !> \brief ...
689 : !> \param local_i_aL ...
690 : !> \param ranges_info_array ...
691 : !> \param BIb_C_rec ...
692 : ! **************************************************************************************************
693 4320 : SUBROUTINE fill_local_i_aL(local_i_aL, ranges_info_array, BIb_C_rec)
694 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: local_i_aL
695 : INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
696 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: BIb_C_rec
697 :
698 : CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL'
699 :
700 : INTEGER :: end_point, handle, irep, Lend_pos, &
701 : Lstart_pos, start_point
702 :
703 4320 : CALL timeset(routineN, handle)
704 :
705 11980 : DO irep = 1, SIZE(ranges_info_array, 2)
706 7660 : Lstart_pos = ranges_info_array(1, irep)
707 7660 : Lend_pos = ranges_info_array(2, irep)
708 7660 : start_point = ranges_info_array(3, irep)
709 7660 : end_point = ranges_info_array(4, irep)
710 :
711 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
712 11980 : !$OMP SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
713 : local_i_aL(Lstart_pos:Lend_pos, :, :) = BIb_C_rec(start_point:end_point, :, :)
714 : !$OMP END PARALLEL WORKSHARE
715 : END DO
716 :
717 4320 : CALL timestop(handle)
718 :
719 4320 : END SUBROUTINE fill_local_i_aL
720 :
721 : ! **************************************************************************************************
722 : !> \brief ...
723 : !> \param local_i_aL ...
724 : !> \param ranges_info_array ...
725 : !> \param BIb_C_rec ...
726 : ! **************************************************************************************************
727 266 : SUBROUTINE fill_local_i_aL_2D(local_i_aL, ranges_info_array, BIb_C_rec)
728 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: local_i_aL
729 : INTEGER, DIMENSION(:, :), INTENT(IN) :: ranges_info_array
730 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: BIb_C_rec
731 :
732 : CHARACTER(LEN=*), PARAMETER :: routineN = 'fill_local_i_aL_2D'
733 :
734 : INTEGER :: end_point, handle, irep, Lend_pos, &
735 : Lstart_pos, start_point
736 :
737 266 : CALL timeset(routineN, handle)
738 :
739 766 : DO irep = 1, SIZE(ranges_info_array, 2)
740 500 : Lstart_pos = ranges_info_array(1, irep)
741 500 : Lend_pos = ranges_info_array(2, irep)
742 500 : start_point = ranges_info_array(3, irep)
743 500 : end_point = ranges_info_array(4, irep)
744 :
745 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
746 766 : !$OMP SHARED(BIb_C_rec,local_i_aL,Lstart_pos,Lend_pos,start_point,end_point)
747 : local_i_aL(Lstart_pos:Lend_pos, :) = BIb_C_rec(start_point:end_point, :)
748 : !$OMP END PARALLEL WORKSHARE
749 : END DO
750 :
751 266 : CALL timestop(handle)
752 :
753 266 : END SUBROUTINE fill_local_i_aL_2D
754 :
755 : ! **************************************************************************************************
756 : !> \brief ...
757 : !> \param BIb_C ...
758 : !> \param comm_exchange ...
759 : !> \param comm_rep ...
760 : !> \param homo ...
761 : !> \param sizes_array ...
762 : !> \param my_B_size ...
763 : !> \param my_group_L_size ...
764 : !> \param ranges_info_array ...
765 : ! **************************************************************************************************
766 438 : SUBROUTINE replicate_iaK_2intgroup(BIb_C, comm_exchange, comm_rep, homo, sizes_array, my_B_size, &
767 438 : my_group_L_size, ranges_info_array)
768 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
769 : INTENT(INOUT) :: BIb_C
770 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange, comm_rep
771 : INTEGER, INTENT(IN) :: homo
772 : INTEGER, DIMENSION(:), INTENT(IN) :: sizes_array
773 : INTEGER, INTENT(IN) :: my_B_size, my_group_L_size
774 : INTEGER, DIMENSION(:, 0:, 0:), INTENT(IN) :: ranges_info_array
775 :
776 : CHARACTER(LEN=*), PARAMETER :: routineN = 'replicate_iaK_2intgroup'
777 :
778 : INTEGER :: end_point, handle, max_L_size, &
779 : proc_receive, proc_shift, start_point
780 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: BIb_C_copy
781 438 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: BIb_C_gather
782 :
783 438 : CALL timeset(routineN, handle)
784 :
785 : ! replication scheme using mpi_allgather
786 : ! get the max L size of the
787 980 : max_L_size = MAXVAL(sizes_array)
788 :
789 2190 : ALLOCATE (BIb_C_copy(max_L_size, my_B_size, homo))
790 1632144 : BIb_C_copy = 0.0_dp
791 886554 : BIb_C_copy(1:SIZE(BIb_C, 1), 1:my_B_size, 1:homo) = BIb_C
792 :
793 438 : DEALLOCATE (BIb_C)
794 :
795 2628 : ALLOCATE (BIb_C_gather(max_L_size, my_B_size, homo, 0:comm_rep%num_pe - 1))
796 3141760 : BIb_C_gather = 0.0_dp
797 :
798 438 : CALL comm_rep%allgather(BIb_C_copy, BIb_C_gather)
799 :
800 438 : DEALLOCATE (BIb_C_copy)
801 :
802 2190 : ALLOCATE (BIb_C(my_group_L_size, my_B_size, homo))
803 1631797 : BIb_C = 0.0_dp
804 :
805 : ! reorder data
806 1192 : DO proc_shift = 0, comm_rep%num_pe - 1
807 754 : proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
808 :
809 754 : start_point = ranges_info_array(3, proc_shift, comm_exchange%mepos)
810 754 : end_point = ranges_info_array(4, proc_shift, comm_exchange%mepos)
811 :
812 : BIb_C(start_point:end_point, 1:my_B_size, 1:homo) = &
813 1650927 : BIb_C_gather(1:end_point - start_point + 1, 1:my_B_size, 1:homo, proc_receive)
814 :
815 : END DO
816 :
817 438 : DEALLOCATE (BIb_C_gather)
818 :
819 438 : CALL timestop(handle)
820 :
821 438 : END SUBROUTINE replicate_iaK_2intgroup
822 :
823 : ! **************************************************************************************************
824 : !> \brief ...
825 : !> \param local_ab ...
826 : !> \param t_ab ...
827 : !> \param mp2_env ...
828 : !> \param homo ...
829 : !> \param virtual ...
830 : !> \param my_B_size ...
831 : !> \param my_group_L_size ...
832 : !> \param calc_forces ...
833 : !> \param ispin ...
834 : !> \param jspin ...
835 : !> \param local_ba ...
836 : ! **************************************************************************************************
837 526 : SUBROUTINE mp2_ri_allocate_no_blk(local_ab, t_ab, mp2_env, homo, virtual, my_B_size, &
838 : my_group_L_size, calc_forces, ispin, jspin, local_ba)
839 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
840 : INTENT(OUT) :: local_ab, t_ab
841 : TYPE(mp2_type) :: mp2_env
842 : INTEGER, INTENT(IN) :: homo(2), virtual(2), my_B_size(2), &
843 : my_group_L_size
844 : LOGICAL, INTENT(IN) :: calc_forces
845 : INTEGER, INTENT(IN) :: ispin, jspin
846 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
847 : INTENT(OUT) :: local_ba
848 :
849 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_no_blk'
850 :
851 : INTEGER :: handle
852 :
853 526 : CALL timeset(routineN, handle)
854 :
855 2104 : ALLOCATE (local_ab(virtual(ispin), my_B_size(jspin)))
856 137141 : local_ab = 0.0_dp
857 :
858 526 : IF (calc_forces) THEN
859 380 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ij(jspin)%array)) THEN
860 1208 : ALLOCATE (mp2_env%ri_grad%P_ij(jspin)%array(homo(ispin), homo(ispin)))
861 6194 : mp2_env%ri_grad%P_ij(jspin)%array = 0.0_dp
862 : END IF
863 380 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%P_ab(jspin)%array)) THEN
864 1208 : ALLOCATE (mp2_env%ri_grad%P_ab(jspin)%array(my_B_size(jspin), virtual(jspin)))
865 83770 : mp2_env%ri_grad%P_ab(jspin)%array = 0.0_dp
866 : END IF
867 380 : IF (.NOT. ALLOCATED(mp2_env%ri_grad%Gamma_P_ia(jspin)%array)) THEN
868 1510 : ALLOCATE (mp2_env%ri_grad%Gamma_P_ia(jspin)%array(my_B_size(jspin), homo(jspin), my_group_L_size))
869 1450598 : mp2_env%ri_grad%Gamma_P_ia(jspin)%array = 0.0_dp
870 : END IF
871 :
872 380 : IF (ispin == jspin) THEN
873 : ! For non-alpha-beta case we need amplitudes
874 1208 : ALLOCATE (t_ab(virtual(ispin), my_B_size(jspin)))
875 :
876 : ! That is just a dummy. In that way, we can pass it as array to other routines w/o requirement for allocatable array
877 302 : ALLOCATE (local_ba(1, 1))
878 : ELSE
879 : ! We need more integrals
880 312 : ALLOCATE (local_ba(virtual(jspin), my_B_size(ispin)))
881 : END IF
882 : END IF
883 : !
884 :
885 526 : CALL timestop(handle)
886 :
887 526 : END SUBROUTINE mp2_ri_allocate_no_blk
888 :
889 : ! **************************************************************************************************
890 : !> \brief ...
891 : !> \param dimen_RI ...
892 : !> \param my_B_size ...
893 : !> \param block_size ...
894 : !> \param local_i_aL ...
895 : !> \param local_j_aL ...
896 : !> \param calc_forces ...
897 : !> \param Y_i_aP ...
898 : !> \param Y_j_aP ...
899 : !> \param ispin ...
900 : !> \param jspin ...
901 : ! **************************************************************************************************
902 526 : SUBROUTINE mp2_ri_allocate_blk(dimen_RI, my_B_size, block_size, &
903 : local_i_aL, local_j_aL, calc_forces, &
904 : Y_i_aP, Y_j_aP, ispin, jspin)
905 : INTEGER, INTENT(IN) :: dimen_RI, my_B_size(2), block_size
906 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
907 : INTENT(OUT) :: local_i_aL, local_j_aL
908 : LOGICAL, INTENT(IN) :: calc_forces
909 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
910 : INTENT(OUT) :: Y_i_aP, Y_j_aP
911 : INTEGER, INTENT(IN) :: ispin, jspin
912 :
913 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_allocate_blk'
914 :
915 : INTEGER :: handle
916 :
917 526 : CALL timeset(routineN, handle)
918 :
919 2630 : ALLOCATE (local_i_aL(dimen_RI, my_B_size(ispin), block_size))
920 651359 : local_i_aL = 0.0_dp
921 2630 : ALLOCATE (local_j_aL(dimen_RI, my_B_size(jspin), block_size))
922 664203 : local_j_aL = 0.0_dp
923 :
924 526 : IF (calc_forces) THEN
925 1900 : ALLOCATE (Y_i_aP(my_B_size(ispin), dimen_RI, block_size))
926 535504 : Y_i_aP = 0.0_dp
927 : ! For closed-shell, alpha-alpha and beta-beta my_B_size_beta=my_b_size
928 : ! Not for alpha-beta case: Y_j_aP_beta is sent and received as Y_j_aP
929 1900 : ALLOCATE (Y_j_aP(my_B_size(jspin), dimen_RI, block_size))
930 546456 : Y_j_aP = 0.0_dp
931 : END IF
932 : !
933 :
934 526 : CALL timestop(handle)
935 :
936 526 : END SUBROUTINE mp2_ri_allocate_blk
937 :
938 : ! **************************************************************************************************
939 : !> \brief ...
940 : !> \param my_alpha_beta_case ...
941 : !> \param total_ij_pairs ...
942 : !> \param homo ...
943 : !> \param homo_beta ...
944 : !> \param block_size ...
945 : !> \param ngroup ...
946 : !> \param ij_map ...
947 : !> \param color_sub ...
948 : !> \param my_ij_pairs ...
949 : !> \param my_open_shell_SS ...
950 : !> \param unit_nr ...
951 : ! **************************************************************************************************
952 526 : SUBROUTINE mp2_ri_communication(my_alpha_beta_case, total_ij_pairs, homo, homo_beta, &
953 : block_size, ngroup, ij_map, color_sub, my_ij_pairs, my_open_shell_SS, unit_nr)
954 : LOGICAL, INTENT(IN) :: my_alpha_beta_case
955 : INTEGER, INTENT(OUT) :: total_ij_pairs
956 : INTEGER, INTENT(IN) :: homo, homo_beta, block_size, ngroup
957 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ij_map
958 : INTEGER, INTENT(IN) :: color_sub
959 : INTEGER, INTENT(OUT) :: my_ij_pairs
960 : LOGICAL, INTENT(IN) :: my_open_shell_SS
961 : INTEGER, INTENT(IN) :: unit_nr
962 :
963 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_communication'
964 :
965 : INTEGER :: assigned_blocks, first_I_block, first_J_block, handle, iiB, ij_block_counter, &
966 : ij_counter, jjB, last_i_block, last_J_block, num_block_per_group, num_IJ_blocks, &
967 : num_IJ_blocks_beta, total_ij_block, total_ij_pairs_blocks
968 526 : LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ij_marker
969 :
970 : ! Calculate the maximum number of ij pairs that have to be computed
971 : ! among groups
972 :
973 526 : CALL timeset(routineN, handle)
974 :
975 526 : IF (.NOT. my_open_shell_ss .AND. .NOT. my_alpha_beta_case) THEN
976 262 : total_ij_pairs = homo*(1 + homo)/2
977 262 : num_IJ_blocks = homo/block_size - 1
978 :
979 262 : first_I_block = 1
980 262 : last_i_block = block_size*(num_IJ_blocks - 1)
981 :
982 262 : first_J_block = block_size + 1
983 262 : last_J_block = block_size*(num_IJ_blocks + 1)
984 :
985 262 : ij_block_counter = 0
986 590 : DO iiB = first_I_block, last_i_block, block_size
987 590 : DO jjB = iiB + block_size, last_J_block, block_size
988 820 : ij_block_counter = ij_block_counter + 1
989 : END DO
990 : END DO
991 :
992 262 : total_ij_block = ij_block_counter
993 262 : num_block_per_group = total_ij_block/ngroup
994 262 : assigned_blocks = num_block_per_group*ngroup
995 :
996 262 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
997 :
998 1048 : ALLOCATE (ij_marker(homo, homo))
999 3934 : ij_marker = .TRUE.
1000 786 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1001 7606 : ij_map = 0
1002 262 : ij_counter = 0
1003 262 : my_ij_pairs = 0
1004 590 : DO iiB = first_I_block, last_i_block, block_size
1005 1250 : DO jjB = iiB + block_size, last_J_block, block_size
1006 820 : IF (ij_counter + 1 > assigned_blocks) EXIT
1007 660 : ij_counter = ij_counter + 1
1008 1980 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1009 660 : ij_map(1, ij_counter) = iiB
1010 660 : ij_map(2, ij_counter) = jjB
1011 660 : ij_map(3, ij_counter) = block_size
1012 988 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1013 : END DO
1014 : END DO
1015 1050 : DO iiB = 1, homo
1016 2886 : DO jjB = iiB, homo
1017 2624 : IF (ij_marker(iiB, jjB)) THEN
1018 1176 : ij_counter = ij_counter + 1
1019 1176 : ij_map(1, ij_counter) = iiB
1020 1176 : ij_map(2, ij_counter) = jjB
1021 1176 : ij_map(3, ij_counter) = 1
1022 1176 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1023 : END IF
1024 : END DO
1025 : END DO
1026 262 : DEALLOCATE (ij_marker)
1027 :
1028 264 : ELSE IF (.NOT. my_alpha_beta_case) THEN
1029 : ! THese are the cases alpha/alpha and beta/beta
1030 : ! We do not have to consider the diagonal elements
1031 176 : total_ij_pairs = homo*(homo - 1)/2
1032 176 : num_IJ_blocks = (homo - 1)/block_size - 1
1033 :
1034 176 : first_I_block = 1
1035 176 : last_i_block = block_size*(num_IJ_blocks - 1)
1036 :
1037 : ! We shift the blocks to prevent the calculation of the diagonal elements which always give zero
1038 176 : first_J_block = block_size + 2
1039 176 : last_J_block = block_size*(num_IJ_blocks + 1) + 1
1040 :
1041 176 : ij_block_counter = 0
1042 262 : DO iiB = first_I_block, last_i_block, block_size
1043 262 : DO jjB = iiB + block_size + 1, last_J_block, block_size
1044 200 : ij_block_counter = ij_block_counter + 1
1045 : END DO
1046 : END DO
1047 :
1048 176 : total_ij_block = ij_block_counter
1049 176 : num_block_per_group = total_ij_block/ngroup
1050 176 : assigned_blocks = num_block_per_group*ngroup
1051 :
1052 176 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
1053 :
1054 704 : ALLOCATE (ij_marker(homo, homo))
1055 2984 : ij_marker = .TRUE.
1056 528 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1057 3352 : ij_map = 0
1058 176 : ij_counter = 0
1059 176 : my_ij_pairs = 0
1060 262 : DO iiB = first_I_block, last_i_block, block_size
1061 458 : DO jjB = iiB + block_size + 1, last_J_block, block_size
1062 200 : IF (ij_counter + 1 > assigned_blocks) EXIT
1063 196 : ij_counter = ij_counter + 1
1064 604 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1065 196 : ij_map(1, ij_counter) = iiB
1066 196 : ij_map(2, ij_counter) = jjB
1067 196 : ij_map(3, ij_counter) = block_size
1068 282 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1069 : END DO
1070 : END DO
1071 774 : DO iiB = 1, homo
1072 1580 : DO jjB = iiB + 1, homo
1073 1404 : IF (ij_marker(iiB, jjB)) THEN
1074 598 : ij_counter = ij_counter + 1
1075 598 : ij_map(1, ij_counter) = iiB
1076 598 : ij_map(2, ij_counter) = jjB
1077 598 : ij_map(3, ij_counter) = 1
1078 598 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1079 : END IF
1080 : END DO
1081 : END DO
1082 176 : DEALLOCATE (ij_marker)
1083 :
1084 : ELSE
1085 88 : total_ij_pairs = homo*homo_beta
1086 88 : num_IJ_blocks = homo/block_size
1087 88 : num_IJ_blocks_beta = homo_beta/block_size
1088 :
1089 88 : first_I_block = 1
1090 88 : last_i_block = block_size*(num_IJ_blocks - 1)
1091 :
1092 88 : first_J_block = 1
1093 88 : last_J_block = block_size*(num_IJ_blocks_beta - 1)
1094 :
1095 88 : ij_block_counter = 0
1096 246 : DO iiB = first_I_block, last_i_block, block_size
1097 246 : DO jjB = first_J_block, last_J_block, block_size
1098 284 : ij_block_counter = ij_block_counter + 1
1099 : END DO
1100 : END DO
1101 :
1102 88 : total_ij_block = ij_block_counter
1103 88 : num_block_per_group = total_ij_block/ngroup
1104 88 : assigned_blocks = num_block_per_group*ngroup
1105 :
1106 88 : total_ij_pairs_blocks = assigned_blocks + (total_ij_pairs - assigned_blocks*(block_size**2))
1107 :
1108 352 : ALLOCATE (ij_marker(homo, homo_beta))
1109 1396 : ij_marker = .TRUE.
1110 264 : ALLOCATE (ij_map(3, total_ij_pairs_blocks))
1111 4304 : ij_map = 0
1112 88 : ij_counter = 0
1113 88 : my_ij_pairs = 0
1114 246 : DO iiB = first_I_block, last_i_block, block_size
1115 486 : DO jjB = first_J_block, last_J_block, block_size
1116 244 : IF (ij_counter + 1 > assigned_blocks) EXIT
1117 240 : ij_counter = ij_counter + 1
1118 720 : ij_marker(iiB:iiB + block_size - 1, jjB:jjB + block_size - 1) = .FALSE.
1119 240 : ij_map(1, ij_counter) = iiB
1120 240 : ij_map(2, ij_counter) = jjB
1121 240 : ij_map(3, ij_counter) = block_size
1122 398 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1123 : END DO
1124 : END DO
1125 432 : DO iiB = 1, homo
1126 1486 : DO jjB = 1, homo_beta
1127 1398 : IF (ij_marker(iiB, jjB)) THEN
1128 814 : ij_counter = ij_counter + 1
1129 814 : ij_map(1, ij_counter) = iiB
1130 814 : ij_map(2, ij_counter) = jjB
1131 814 : ij_map(3, ij_counter) = 1
1132 814 : IF (MOD(ij_counter, ngroup) == color_sub) my_ij_pairs = my_ij_pairs + 1
1133 : END IF
1134 : END DO
1135 : END DO
1136 88 : DEALLOCATE (ij_marker)
1137 : END IF
1138 :
1139 526 : IF (unit_nr > 0) THEN
1140 263 : IF (block_size == 1) THEN
1141 : WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
1142 234 : "RI_INFO| Percentage of ij pairs communicated with block size 1:", 100.0_dp
1143 : ELSE
1144 : WRITE (UNIT=unit_nr, FMT="(T3,A,T66,F15.1)") &
1145 29 : "RI_INFO| Percentage of ij pairs communicated with block size 1:", &
1146 58 : 100.0_dp*REAL((total_ij_pairs - assigned_blocks*(block_size**2)), KIND=dp)/REAL(total_ij_pairs, KIND=dp)
1147 : END IF
1148 263 : CALL m_flush(unit_nr)
1149 : END IF
1150 :
1151 526 : CALL timestop(handle)
1152 :
1153 526 : END SUBROUTINE mp2_ri_communication
1154 :
1155 : ! **************************************************************************************************
1156 : !> \brief ...
1157 : !> \param para_env ...
1158 : !> \param para_env_sub ...
1159 : !> \param color_sub ...
1160 : !> \param sizes_array ...
1161 : !> \param calc_forces ...
1162 : !> \param integ_group_size ...
1163 : !> \param my_group_L_end ...
1164 : !> \param my_group_L_size ...
1165 : !> \param my_group_L_size_orig ...
1166 : !> \param my_group_L_start ...
1167 : !> \param my_new_group_L_size ...
1168 : !> \param integ_group_pos2color_sub ...
1169 : !> \param sizes_array_orig ...
1170 : !> \param ranges_info_array ...
1171 : !> \param comm_exchange ...
1172 : !> \param comm_rep ...
1173 : !> \param num_integ_group ...
1174 : ! **************************************************************************************************
1175 350 : SUBROUTINE mp2_ri_create_group(para_env, para_env_sub, color_sub, &
1176 : sizes_array, calc_forces, &
1177 : integ_group_size, my_group_L_end, &
1178 : my_group_L_size, my_group_L_size_orig, my_group_L_start, my_new_group_L_size, &
1179 : integ_group_pos2color_sub, &
1180 : sizes_array_orig, ranges_info_array, comm_exchange, comm_rep, num_integ_group)
1181 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1182 : INTEGER, INTENT(IN) :: color_sub
1183 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(INOUT) :: sizes_array
1184 : LOGICAL, INTENT(IN) :: calc_forces
1185 : INTEGER, INTENT(IN) :: integ_group_size, my_group_L_end
1186 : INTEGER, INTENT(INOUT) :: my_group_L_size
1187 : INTEGER, INTENT(OUT) :: my_group_L_size_orig
1188 : INTEGER, INTENT(IN) :: my_group_L_start
1189 : INTEGER, INTENT(INOUT) :: my_new_group_L_size
1190 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: integ_group_pos2color_sub, &
1191 : sizes_array_orig
1192 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
1193 : INTENT(OUT) :: ranges_info_array
1194 : TYPE(mp_comm_type), INTENT(OUT) :: comm_exchange, comm_rep
1195 : INTEGER, INTENT(IN) :: num_integ_group
1196 :
1197 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_create_group'
1198 :
1199 : INTEGER :: handle, iiB, proc_receive, proc_shift, &
1200 : sub_sub_color
1201 350 : INTEGER, ALLOCATABLE, DIMENSION(:) :: new_sizes_array, rep_ends_array, &
1202 : rep_sizes_array, rep_starts_array
1203 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: my_info
1204 :
1205 350 : CALL timeset(routineN, handle)
1206 : !
1207 350 : sub_sub_color = para_env_sub%mepos*num_integ_group + color_sub/integ_group_size
1208 350 : CALL comm_exchange%from_split(para_env, sub_sub_color)
1209 :
1210 : ! create the replication group
1211 350 : sub_sub_color = para_env_sub%mepos*comm_exchange%num_pe + comm_exchange%mepos
1212 350 : CALL comm_rep%from_split(para_env, sub_sub_color)
1213 :
1214 : ! create the new limits for K according to the size
1215 : ! of the integral group
1216 :
1217 : ! info array for replication
1218 1050 : ALLOCATE (rep_ends_array(0:comm_rep%num_pe - 1))
1219 1050 : ALLOCATE (rep_starts_array(0:comm_rep%num_pe - 1))
1220 1050 : ALLOCATE (rep_sizes_array(0:comm_rep%num_pe - 1))
1221 :
1222 350 : CALL comm_rep%allgather(my_group_L_size, rep_sizes_array)
1223 350 : CALL comm_rep%allgather(my_group_L_start, rep_starts_array)
1224 350 : CALL comm_rep%allgather(my_group_L_end, rep_ends_array)
1225 :
1226 : ! calculate my_new_group_L_size according to sizes_array
1227 350 : my_new_group_L_size = my_group_L_size
1228 :
1229 : ! Info of this process
1230 1050 : ALLOCATE (my_info(4, 0:comm_rep%num_pe - 1))
1231 350 : my_info(1, 0) = my_group_L_start
1232 350 : my_info(2, 0) = my_group_L_end
1233 350 : my_info(3, 0) = 1
1234 350 : my_info(4, 0) = my_group_L_size
1235 :
1236 588 : DO proc_shift = 1, comm_rep%num_pe - 1
1237 238 : proc_receive = MODULO(comm_rep%mepos - proc_shift, comm_rep%num_pe)
1238 :
1239 238 : my_new_group_L_size = my_new_group_L_size + rep_sizes_array(proc_receive)
1240 :
1241 238 : my_info(1, proc_shift) = rep_starts_array(proc_receive)
1242 238 : my_info(2, proc_shift) = rep_ends_array(proc_receive)
1243 238 : my_info(3, proc_shift) = my_info(4, proc_shift - 1) + 1
1244 588 : my_info(4, proc_shift) = my_new_group_L_size
1245 :
1246 : END DO
1247 :
1248 1050 : ALLOCATE (new_sizes_array(0:comm_exchange%num_pe - 1))
1249 1400 : ALLOCATE (ranges_info_array(4, 0:comm_rep%num_pe - 1, 0:comm_exchange%num_pe - 1))
1250 350 : CALL comm_exchange%allgather(my_new_group_L_size, new_sizes_array)
1251 350 : CALL comm_exchange%allgather(my_info, ranges_info_array)
1252 :
1253 350 : DEALLOCATE (rep_sizes_array)
1254 350 : DEALLOCATE (rep_starts_array)
1255 350 : DEALLOCATE (rep_ends_array)
1256 :
1257 1050 : ALLOCATE (integ_group_pos2color_sub(0:comm_exchange%num_pe - 1))
1258 350 : CALL comm_exchange%allgather(color_sub, integ_group_pos2color_sub)
1259 :
1260 350 : IF (calc_forces) THEN
1261 224 : iiB = SIZE(sizes_array)
1262 672 : ALLOCATE (sizes_array_orig(0:iiB - 1))
1263 666 : sizes_array_orig(:) = sizes_array
1264 : END IF
1265 :
1266 350 : my_group_L_size_orig = my_group_L_size
1267 350 : my_group_L_size = my_new_group_L_size
1268 350 : DEALLOCATE (sizes_array)
1269 :
1270 1050 : ALLOCATE (sizes_array(0:integ_group_size - 1))
1271 798 : sizes_array(:) = new_sizes_array
1272 :
1273 350 : DEALLOCATE (new_sizes_array)
1274 : !
1275 350 : CALL timestop(handle)
1276 :
1277 700 : END SUBROUTINE mp2_ri_create_group
1278 :
1279 : ! **************************************************************************************************
1280 : !> \brief ...
1281 : !> \param mp2_env ...
1282 : !> \param para_env ...
1283 : !> \param para_env_sub ...
1284 : !> \param gd_array ...
1285 : !> \param gd_B_virtual ...
1286 : !> \param homo ...
1287 : !> \param dimen_RI ...
1288 : !> \param unit_nr ...
1289 : !> \param integ_group_size ...
1290 : !> \param ngroup ...
1291 : !> \param num_integ_group ...
1292 : !> \param virtual ...
1293 : !> \param calc_forces ...
1294 : ! **************************************************************************************************
1295 1400 : SUBROUTINE mp2_ri_get_integ_group_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
1296 350 : homo, dimen_RI, unit_nr, &
1297 : integ_group_size, &
1298 : ngroup, num_integ_group, &
1299 350 : virtual, calc_forces)
1300 : TYPE(mp2_type) :: mp2_env
1301 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1302 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
1303 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1304 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
1305 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
1306 : INTEGER, INTENT(OUT) :: integ_group_size, ngroup, num_integ_group
1307 : INTEGER, DIMENSION(:), INTENT(IN) :: virtual
1308 : LOGICAL, INTENT(IN) :: calc_forces
1309 :
1310 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_integ_group_size'
1311 :
1312 : INTEGER :: block_size, handle, iiB, &
1313 : max_repl_group_size, &
1314 : min_integ_group_size
1315 : INTEGER(KIND=int_8) :: mem
1316 : LOGICAL :: calc_group_size
1317 : REAL(KIND=dp) :: factor, mem_base, mem_min, mem_per_blk, &
1318 : mem_per_repl, mem_per_repl_blk, &
1319 : mem_real
1320 :
1321 350 : CALL timeset(routineN, handle)
1322 :
1323 350 : ngroup = para_env%num_pe/para_env_sub%num_pe
1324 :
1325 350 : calc_group_size = mp2_env%ri_mp2%number_integration_groups <= 0
1326 350 : IF (.NOT. calc_group_size) THEN
1327 10 : IF (MOD(ngroup, mp2_env%ri_mp2%number_integration_groups) /= 0) calc_group_size = .TRUE.
1328 : END IF
1329 :
1330 350 : IF (calc_group_size) THEN
1331 340 : CALL m_memory(mem)
1332 340 : mem_real = (mem + 1024*1024 - 1)/(1024*1024)
1333 340 : CALL para_env%min(mem_real)
1334 :
1335 340 : mem_base = 0.0_dp
1336 340 : mem_per_blk = 0.0_dp
1337 340 : mem_per_repl = 0.0_dp
1338 340 : mem_per_repl_blk = 0.0_dp
1339 :
1340 : ! BIB_C_copy
1341 : mem_per_repl = mem_per_repl + MAXVAL(MAX(REAL(homo, KIND=dp)*maxsize(gd_array), REAL(dimen_RI, KIND=dp))* &
1342 1102 : maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1343 : ! BIB_C
1344 762 : mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_B_virtual))*maxsize(gd_array)*8.0_dp/(1024**2)
1345 : ! BIB_C_rec
1346 762 : mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
1347 : ! local_i_aL+local_j_aL
1348 762 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
1349 : ! local_ab
1350 1102 : mem_base = mem_base + MAXVAL(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1351 : ! external_ab/external_i_aL
1352 1184 : mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1353 :
1354 340 : IF (calc_forces) THEN
1355 : ! Gamma_P_ia
1356 : mem_per_repl = mem_per_repl + SUM(REAL(homo, KIND=dp)*maxsize(gd_array)* &
1357 506 : maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1358 : ! Y_i_aP+Y_j_aP
1359 506 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
1360 : ! local_ba/t_ab
1361 796 : mem_base = mem_base + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1362 : ! P_ij
1363 506 : mem_base = mem_base + SUM(REAL(homo, KIND=dp)*homo)*8.0_dp/(1024**2)
1364 : ! P_ab
1365 506 : mem_base = mem_base + SUM(REAL(virtual, KIND=dp)*maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1366 : ! send_ab/send_i_aL
1367 796 : mem_base = mem_base + REAL(MAX(dimen_RI, MAXVAL(virtual)), KIND=dp)*MAXVAL(maxsize(gd_B_virtual))*8.0_dp/(1024**2)
1368 : END IF
1369 :
1370 : ! This a first guess based on the assumption of optimal block sizes
1371 762 : block_size = MAX(1, MIN(FLOOR(SQRT(REAL(MINVAL(homo), KIND=dp))), FLOOR(MINVAL(homo)/SQRT(2.0_dp*ngroup))))
1372 340 : IF (mp2_env%ri_mp2%block_size > 0) block_size = mp2_env%ri_mp2%block_size
1373 :
1374 340 : mem_min = mem_base + mem_per_repl + (mem_per_blk + mem_per_repl_blk)*block_size
1375 :
1376 510 : IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum available memory per MPI process:', &
1377 340 : mem_real, ' MiB'
1378 510 : IF (unit_nr > 0) WRITE (unit_nr, '(T3,A,T68,F9.2,A4)') 'RI_INFO| Minimum required memory per MPI process:', &
1379 340 : mem_min, ' MiB'
1380 :
1381 : ! We use the following communication model
1382 : ! Comm(replication)+Comm(collection of data for ij pair)+Comm(contraction)
1383 : ! One can show that the costs of the contraction step are independent of the block size and the replication group size
1384 : ! With gradients, the other two steps are carried out twice (Y_i_aP -> Gamma_i_aP, and dereplication)
1385 : ! NL ... number of RI basis functions
1386 : ! NR ... replication group size
1387 : ! NG ... number of sub groups
1388 : ! NB ... Block size
1389 : ! o ... number of occupied orbitals
1390 : ! Then, we have the communication costs (in multiples of the original BIb_C matrix)
1391 : ! (NR/NG)+(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
1392 : ! and with gradients
1393 : ! 2*(NR/NG)+2*(1-(NR/NG))*(o/NB+NB-2)/NG = (NR/NG)*(1-(o/NB+NB-2)/NG)+(o/NB+NB-2)/NG
1394 : ! We are looking for the minimum of the communication volume,
1395 : ! thus, if the prefactor of (NR/NG) is smaller than zero, use the largest possible replication group size.
1396 : ! If the factor is larger than zero, set the replication group size to 1. (For small systems and a large number of subgroups)
1397 : ! Replication group size = 1 implies that the integration group size equals the number of subgroups
1398 :
1399 340 : integ_group_size = ngroup
1400 :
1401 : ! Multiply everything by homo*virtual to consider differences between spin channels in case of open-shell calculations
1402 : factor = REAL(SUM(homo*virtual), KIND=dp) &
1403 1606 : - SUM((REAL(MAXVAL(homo), KIND=dp)/block_size + block_size - 2.0_dp)*homo*virtual)/ngroup
1404 668 : IF (SIZE(homo) == 2) factor = factor - 2.0_dp*PRODUCT(homo)/block_size/ngroup*SUM(homo*virtual)
1405 :
1406 680 : IF (factor <= 0.0_dp) THEN
1407 : ! Remove the fixed memory and divide by the memory per replication group size
1408 : max_repl_group_size = MIN(MAX(FLOOR((mem_real - mem_base - mem_per_blk*block_size)/ &
1409 252 : (mem_per_repl + mem_per_repl_blk*block_size)), 1), ngroup)
1410 : ! Convert to an integration group size
1411 252 : min_integ_group_size = ngroup/max_repl_group_size
1412 :
1413 : ! Ensure that the integration group size is a divisor of the number of sub groups
1414 252 : DO iiB = MAX(MIN(min_integ_group_size, ngroup), 1), ngroup
1415 : ! check that the ngroup is a multiple of integ_group_size
1416 252 : IF (MOD(ngroup, iiB) == 0) THEN
1417 252 : integ_group_size = iiB
1418 252 : EXIT
1419 : END IF
1420 0 : integ_group_size = integ_group_size + 1
1421 : END DO
1422 : END IF
1423 : ELSE ! We take the user provided group size
1424 10 : integ_group_size = ngroup/mp2_env%ri_mp2%number_integration_groups
1425 : END IF
1426 :
1427 350 : IF (unit_nr > 0) THEN
1428 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
1429 175 : "RI_INFO| Group size for integral replication:", integ_group_size*para_env_sub%num_pe
1430 175 : CALL m_flush(unit_nr)
1431 : END IF
1432 :
1433 350 : num_integ_group = ngroup/integ_group_size
1434 :
1435 350 : CALL timestop(handle)
1436 :
1437 350 : END SUBROUTINE mp2_ri_get_integ_group_size
1438 :
1439 : ! **************************************************************************************************
1440 : !> \brief ...
1441 : !> \param mp2_env ...
1442 : !> \param para_env ...
1443 : !> \param para_env_sub ...
1444 : !> \param gd_array ...
1445 : !> \param gd_B_virtual ...
1446 : !> \param homo ...
1447 : !> \param virtual ...
1448 : !> \param dimen_RI ...
1449 : !> \param unit_nr ...
1450 : !> \param block_size ...
1451 : !> \param ngroup ...
1452 : !> \param num_integ_group ...
1453 : !> \param my_open_shell_ss ...
1454 : !> \param calc_forces ...
1455 : !> \param buffer_1D ...
1456 : ! **************************************************************************************************
1457 526 : SUBROUTINE mp2_ri_get_block_size(mp2_env, para_env, para_env_sub, gd_array, gd_B_virtual, &
1458 526 : homo, virtual, dimen_RI, unit_nr, &
1459 : block_size, ngroup, num_integ_group, &
1460 : my_open_shell_ss, calc_forces, buffer_1D)
1461 : TYPE(mp2_type) :: mp2_env
1462 : TYPE(mp_para_env_type), INTENT(IN) :: para_env, para_env_sub
1463 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_array
1464 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1465 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
1466 : INTEGER, INTENT(IN) :: dimen_RI, unit_nr
1467 : INTEGER, INTENT(OUT) :: block_size, ngroup
1468 : INTEGER, INTENT(IN) :: num_integ_group
1469 : LOGICAL, INTENT(IN) :: my_open_shell_ss, calc_forces
1470 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
1471 : INTENT(OUT) :: buffer_1D
1472 :
1473 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_ri_get_block_size'
1474 :
1475 : INTEGER :: best_block_size, handle, num_IJ_blocks
1476 : INTEGER(KIND=int_8) :: buffer_size, mem
1477 : REAL(KIND=dp) :: mem_base, mem_per_blk, mem_per_repl_blk, &
1478 : mem_real
1479 :
1480 526 : CALL timeset(routineN, handle)
1481 :
1482 526 : ngroup = para_env%num_pe/para_env_sub%num_pe
1483 :
1484 526 : CALL m_memory(mem)
1485 526 : mem_real = (mem + 1024*1024 - 1)/(1024*1024)
1486 526 : CALL para_env%min(mem_real)
1487 :
1488 526 : mem_base = 0.0_dp
1489 526 : mem_per_blk = 0.0_dp
1490 526 : mem_per_repl_blk = 0.0_dp
1491 :
1492 : ! external_ab
1493 1754 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1494 : ! BIB_C_rec
1495 1140 : mem_per_repl_blk = mem_per_repl_blk + REAL(MAXVAL(maxsize(gd_B_virtual)), KIND=dp)*maxsize(gd_array)*8.0_dp/(1024**2)
1496 : ! local_i_aL+local_j_aL
1497 1140 : mem_per_blk = mem_per_blk + 2.0_dp*MAXVAL(maxsize(gd_B_virtual))*REAL(dimen_RI, KIND=dp)*8.0_dp/(1024**2)
1498 : ! Copy to keep arrays contiguous
1499 1754 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1500 :
1501 526 : IF (calc_forces) THEN
1502 : ! Y_i_aP+Y_j_aP+BIb_C_send
1503 838 : mem_per_blk = mem_per_blk + 3.0_dp*MAXVAL(maxsize(gd_B_virtual))*dimen_RI*8.0_dp/(1024**2)
1504 : ! send_ab
1505 1296 : mem_base = mem_base + MAXVAL(maxsize(gd_B_virtual))*MAX(dimen_RI, MAXVAL(virtual))*8.0_dp/(1024**2)
1506 : END IF
1507 :
1508 526 : best_block_size = 1
1509 :
1510 : ! Here we split the memory half for the communication, half for replication
1511 526 : IF (mp2_env%ri_mp2%block_size > 0) THEN
1512 : best_block_size = mp2_env%ri_mp2%block_size
1513 : ELSE
1514 294 : best_block_size = MAX(FLOOR((mem_real - mem_base)/(mem_per_blk + mem_per_repl_blk*ngroup/num_integ_group)), 1)
1515 :
1516 4630188 : DO
1517 4630482 : IF (SIZE(homo) == 1) THEN
1518 3724018 : IF (.NOT. my_open_shell_ss) THEN
1519 1707990 : num_IJ_blocks = (homo(1)/best_block_size)
1520 1707990 : num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
1521 : ELSE
1522 2016028 : num_IJ_blocks = ((homo(1) - 1)/best_block_size)
1523 2016028 : num_IJ_blocks = (num_IJ_blocks*num_IJ_blocks - num_IJ_blocks)/2
1524 : END IF
1525 : ELSE
1526 2719392 : num_ij_blocks = PRODUCT(homo/best_block_size)
1527 : END IF
1528 : ! Enforce at least one large block for each subgroup
1529 4630482 : IF ((num_IJ_blocks >= ngroup .AND. num_IJ_blocks > 0) .OR. best_block_size == 1) THEN
1530 : EXIT
1531 : ELSE
1532 4630188 : best_block_size = best_block_size - 1
1533 : END IF
1534 : END DO
1535 :
1536 294 : IF (SIZE(homo) == 1) THEN
1537 232 : IF (my_open_shell_ss) THEN
1538 : ! check that best_block_size is not bigger than sqrt(homo-1)
1539 : ! Diagonal elements do not have to be considered
1540 124 : best_block_size = MIN(FLOOR(SQRT(REAL(homo(1) - 1, KIND=dp))), best_block_size)
1541 : ELSE
1542 : ! check that best_block_size is not bigger than sqrt(homo)
1543 108 : best_block_size = MIN(FLOOR(SQRT(REAL(homo(1), KIND=dp))), best_block_size)
1544 : END IF
1545 : END IF
1546 : END IF
1547 526 : block_size = MAX(1, best_block_size)
1548 :
1549 526 : IF (unit_nr > 0) THEN
1550 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
1551 263 : "RI_INFO| Block size:", block_size
1552 263 : CALL m_flush(unit_nr)
1553 : END IF
1554 :
1555 : ! Determine recv buffer size (BI_C_recv, external_i_aL, external_ab)
1556 : buffer_size = MAX(INT(maxsize(gd_array), KIND=int_8)*block_size, INT(MAX(dimen_RI, MAXVAL(virtual)), KIND=int_8)) &
1557 1754 : *MAXVAL(maxsize(gd_B_virtual))
1558 : ! The send buffer has the same size as the recv buffer
1559 526 : IF (calc_forces) buffer_size = buffer_size*2
1560 1578 : ALLOCATE (buffer_1D(buffer_size))
1561 :
1562 526 : CALL timestop(handle)
1563 :
1564 526 : END SUBROUTINE mp2_ri_get_block_size
1565 :
1566 : ! **************************************************************************************************
1567 : !> \brief ...
1568 : !> \param mp2_env ...
1569 : !> \param para_env_sub ...
1570 : !> \param gd_B_virtual ...
1571 : !> \param Eigenval ...
1572 : !> \param homo ...
1573 : !> \param dimen_RI ...
1574 : !> \param iiB ...
1575 : !> \param jjB ...
1576 : !> \param my_B_size ...
1577 : !> \param my_B_virtual_end ...
1578 : !> \param my_B_virtual_start ...
1579 : !> \param my_i ...
1580 : !> \param my_j ...
1581 : !> \param virtual ...
1582 : !> \param local_ab ...
1583 : !> \param t_ab ...
1584 : !> \param my_local_i_aL ...
1585 : !> \param my_local_j_aL ...
1586 : !> \param open_ss ...
1587 : !> \param Y_i_aP ...
1588 : !> \param Y_j_aP ...
1589 : !> \param local_ba ...
1590 : !> \param ispin ...
1591 : !> \param jspin ...
1592 : !> \param dgemm_counter ...
1593 : !> \param buffer_1D ...
1594 : ! **************************************************************************************************
1595 6400 : SUBROUTINE mp2_update_P_gamma(mp2_env, para_env_sub, gd_B_virtual, &
1596 3200 : Eigenval, homo, dimen_RI, iiB, jjB, my_B_size, &
1597 3200 : my_B_virtual_end, my_B_virtual_start, my_i, my_j, virtual, local_ab, &
1598 3200 : t_ab, my_local_i_aL, my_local_j_aL, open_ss, Y_i_aP, Y_j_aP, &
1599 1600 : local_ba, ispin, jspin, dgemm_counter, buffer_1D)
1600 : TYPE(mp2_type) :: mp2_env
1601 : TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub
1602 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
1603 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
1604 : INTEGER, DIMENSION(:), INTENT(IN) :: homo
1605 : INTEGER, INTENT(IN) :: dimen_RI, iiB, jjB
1606 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_size, my_B_virtual_end, &
1607 : my_B_virtual_start
1608 : INTEGER, INTENT(IN) :: my_i, my_j
1609 : INTEGER, DIMENSION(:), INTENT(IN) :: virtual
1610 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1611 : INTENT(INOUT), TARGET :: local_ab
1612 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1613 : INTENT(IN), TARGET :: t_ab, my_local_i_aL, my_local_j_aL
1614 : LOGICAL, INTENT(IN) :: open_ss
1615 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1616 : INTENT(INOUT), TARGET :: Y_i_aP, Y_j_aP, local_ba
1617 : INTEGER, INTENT(IN) :: ispin, jspin
1618 : TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
1619 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
1620 :
1621 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_update_P_gamma'
1622 :
1623 : INTEGER :: a, b, b_global, handle, proc_receive, proc_send, proc_shift, rec_B_size, &
1624 : rec_B_virtual_end, rec_B_virtual_start, send_B_size, send_B_virtual_end, &
1625 : send_B_virtual_start
1626 : INTEGER(KIND=int_8) :: offset
1627 : LOGICAL :: alpha_beta
1628 : REAL(KIND=dp) :: factor, P_ij_diag
1629 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
1630 1600 : POINTER :: external_ab, send_ab
1631 :
1632 1600 : CALL timeset(routineN//"_Pia", handle)
1633 :
1634 1600 : alpha_beta = .NOT. (ispin == jspin)
1635 1600 : IF (open_ss) THEN
1636 : factor = 1.0_dp
1637 : ELSE
1638 1211 : factor = 2.0_dp
1639 : END IF
1640 : ! divide the (ia|jb) integrals by Delta_ij^ab
1641 24789 : DO b = 1, my_B_size(jspin)
1642 23189 : b_global = b + my_B_virtual_start(jspin) - 1
1643 463348 : DO a = 1, virtual(ispin)
1644 : local_ab(a, b) = -local_ab(a, b)/ &
1645 : (Eigenval(homo(ispin) + a, ispin) + Eigenval(homo(jspin) + b_global, jspin) - &
1646 461748 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
1647 : END DO
1648 : END DO
1649 1600 : IF (.NOT. (alpha_beta)) THEN
1650 322742 : P_ij_diag = -SUM(local_ab*t_ab)*factor
1651 : ELSE
1652 : ! update diagonal part of P_ij
1653 140606 : P_ij_diag = -SUM(local_ab*local_ab)*mp2_env%scale_S
1654 : ! More integrals needed only for alpha-beta case: local_ba
1655 6987 : DO b = 1, my_B_size(ispin)
1656 6485 : b_global = b + my_B_virtual_start(ispin) - 1
1657 140066 : DO a = 1, virtual(jspin)
1658 : local_ba(a, b) = -local_ba(a, b)/ &
1659 : (Eigenval(homo(jspin) + a, jspin) + Eigenval(homo(ispin) + b_global, ispin) - &
1660 139564 : Eigenval(my_i + iiB - 1, ispin) - Eigenval(my_j + jjB - 1, jspin))
1661 : END DO
1662 : END DO
1663 : END IF
1664 :
1665 : ! P_ab and add diagonal part of P_ij
1666 :
1667 1600 : CALL dgemm_counter_start(dgemm_counter)
1668 1600 : IF (.NOT. (alpha_beta)) THEN
1669 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(ispin), 1.0_dp, &
1670 : t_ab, virtual(ispin), local_ab, virtual(ispin), &
1671 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, &
1672 1098 : my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin))
1673 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
1674 1098 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
1675 : ELSE
1676 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), my_B_size(ispin), virtual(jspin), mp2_env%scale_S, &
1677 : local_ba, virtual(jspin), local_ba, virtual(jspin), 1.0_dp, &
1678 502 : mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_virtual_start(ispin):my_B_virtual_end(ispin)), my_B_size(ispin))
1679 :
1680 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) = &
1681 502 : mp2_env%ri_grad%P_ij(ispin)%array(my_i + iiB - 1, my_i + iiB - 1) + P_ij_diag
1682 :
1683 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(jspin), my_B_size(jspin), virtual(ispin), mp2_env%scale_S, &
1684 : local_ab, virtual(ispin), local_ab, virtual(ispin), 1.0_dp, &
1685 502 : mp2_env%ri_grad%P_ab(jspin)%array(:, my_B_virtual_start(jspin):my_B_virtual_end(jspin)), my_B_size(jspin))
1686 :
1687 : mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
1688 502 : mp2_env%ri_grad%P_ij(jspin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
1689 : END IF
1690 : ! The summation is over unique pairs. In alpha-beta case, all pairs are unique: subroutine is called for
1691 : ! both i^alpha,j^beta and i^beta,j^alpha. Formally, my_i can be equal to my_j, but they are different
1692 : ! due to spin in alpha-beta case.
1693 1600 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1694 :
1695 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), virtual(ispin), my_B_size(ispin), 1.0_dp, &
1696 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1697 : local_ab, virtual(ispin), &
1698 811 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array, my_B_size(ispin))
1699 :
1700 : mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) = &
1701 811 : mp2_env%ri_grad%P_ij(ispin)%array(my_j + jjB - 1, my_j + jjB - 1) + P_ij_diag
1702 : END IF
1703 1772 : DO proc_shift = 1, para_env_sub%num_pe - 1
1704 172 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1705 172 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1706 :
1707 172 : CALL get_group_dist(gd_B_virtual(jspin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1708 172 : CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1709 :
1710 172 : external_ab(1:virtual(ispin), 1:rec_B_size) => buffer_1D(1:INT(virtual(ispin), int_8)*rec_B_size)
1711 35072 : external_ab = 0.0_dp
1712 :
1713 : CALL para_env_sub%sendrecv(local_ab, proc_send, &
1714 172 : external_ab, proc_receive)
1715 :
1716 172 : IF (.NOT. (alpha_beta)) THEN
1717 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(ispin), 1.0_dp, &
1718 : t_ab, virtual(ispin), external_ab, virtual(ispin), &
1719 102 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin))
1720 : ELSE
1721 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(jspin), rec_B_size, virtual(ispin), mp2_env%scale_S, &
1722 : local_ab, virtual(ispin), external_ab, virtual(ispin), &
1723 : 1.0_dp, mp2_env%ri_grad%P_ab(jspin)%array(:, rec_B_virtual_start:rec_B_virtual_end), &
1724 70 : my_B_size(jspin))
1725 :
1726 : ! For alpha-beta part of alpha-density we need a new parallel code
1727 : ! And new external_ab (of a different size)
1728 70 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1729 70 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1730 70 : external_ab(1:virtual(jspin), 1:rec_B_size) => buffer_1D(1:INT(virtual(jspin), int_8)*rec_B_size)
1731 14700 : external_ab = 0.0_dp
1732 : CALL para_env_sub%sendrecv(local_ba, proc_send, &
1733 70 : external_ab, proc_receive)
1734 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', my_B_size(ispin), rec_B_size, virtual(jspin), mp2_env%scale_S, &
1735 : local_ba, virtual(jspin), external_ab, virtual(jspin), &
1736 70 : 1.0_dp, mp2_env%ri_grad%P_ab(ispin)%array(:, rec_B_virtual_start:rec_B_virtual_end), my_B_size(ispin))
1737 : END IF
1738 :
1739 1944 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1740 : external_ab(1:my_B_size(ispin), 1:virtual(ispin)) => &
1741 86 : buffer_1D(1:INT(virtual(ispin), int_8)*my_B_size(ispin))
1742 18083 : external_ab = 0.0_dp
1743 :
1744 86 : offset = INT(virtual(ispin), int_8)*my_B_size(ispin)
1745 :
1746 86 : send_ab(1:send_B_size, 1:virtual(ispin)) => buffer_1D(offset + 1:offset + INT(send_B_size, int_8)*virtual(ispin))
1747 18083 : send_ab = 0.0_dp
1748 :
1749 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, virtual(ispin), my_B_size(ispin), 1.0_dp, &
1750 : t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1751 86 : local_ab, virtual(ispin), 0.0_dp, send_ab, send_B_size)
1752 : CALL para_env_sub%sendrecv(send_ab, proc_send, &
1753 86 : external_ab, proc_receive)
1754 :
1755 18083 : mp2_env%ri_grad%P_ab(ispin)%array(:, :) = mp2_env%ri_grad%P_ab(ispin)%array + external_ab
1756 : END IF
1757 :
1758 : END DO
1759 1600 : IF (.NOT. alpha_beta) THEN
1760 1098 : IF (my_i /= my_j) THEN
1761 811 : CALL dgemm_counter_stop(dgemm_counter, 2*my_B_size(ispin), virtual(ispin), virtual(ispin))
1762 : ELSE
1763 287 : CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), virtual(ispin), virtual(ispin))
1764 : END IF
1765 : ELSE
1766 1506 : CALL dgemm_counter_stop(dgemm_counter, SUM(my_B_size), virtual(ispin), virtual(jspin))
1767 : END IF
1768 1600 : CALL timestop(handle)
1769 :
1770 : ! Now, Gamma_P_ia (made of Y_ia_P)
1771 :
1772 1600 : CALL timeset(routineN//"_Gamma", handle)
1773 1600 : CALL dgemm_counter_start(dgemm_counter)
1774 1600 : IF (.NOT. alpha_beta) THEN
1775 : ! Alpha-alpha, beta-beta and closed shell
1776 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
1777 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1778 1098 : my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin))
1779 : ELSE ! Alpha-beta
1780 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', my_B_size(ispin), dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
1781 : local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1782 502 : my_local_j_aL, dimen_RI, 1.0_dp, Y_i_aP, my_B_size(ispin))
1783 : CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(jspin), dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
1784 : local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1785 502 : my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(jspin))
1786 : END IF
1787 :
1788 1600 : IF (para_env_sub%num_pe > 1) THEN
1789 172 : external_ab(1:my_B_size(ispin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(ispin), int_8)*dimen_RI)
1790 189692 : external_ab = 0.0_dp
1791 :
1792 172 : offset = INT(my_B_size(ispin), int_8)*dimen_RI
1793 : END IF
1794 : !
1795 1772 : DO proc_shift = 1, para_env_sub%num_pe - 1
1796 172 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1797 172 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1798 :
1799 172 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1800 172 : CALL get_group_dist(gd_B_virtual(ispin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1801 :
1802 172 : send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
1803 189692 : send_ab = 0.0_dp
1804 1944 : IF (.NOT. alpha_beta) THEN
1805 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), 1.0_dp, &
1806 : t_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1807 102 : my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
1808 102 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1809 :
1810 217544 : Y_i_aP(:, :) = Y_i_aP + external_ab
1811 :
1812 : ELSE ! Alpha-beta case
1813 : ! Alpha-alpha part
1814 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(jspin), mp2_env%scale_S, &
1815 : local_ab(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1816 70 : my_local_j_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
1817 70 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1818 161840 : Y_i_aP(:, :) = Y_i_aP + external_ab
1819 : END IF
1820 : END DO
1821 :
1822 1600 : IF (alpha_beta) THEN
1823 : ! For beta-beta part (in alpha-beta case) we need a new parallel code
1824 502 : IF (para_env_sub%num_pe > 1) THEN
1825 70 : external_ab(1:my_B_size(jspin), 1:dimen_RI) => buffer_1D(1:INT(my_B_size(jspin), int_8)*dimen_RI)
1826 88620 : external_ab = 0.0_dp
1827 :
1828 70 : offset = INT(my_B_size(jspin), int_8)*dimen_RI
1829 : END IF
1830 572 : DO proc_shift = 1, para_env_sub%num_pe - 1
1831 70 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1832 70 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1833 :
1834 70 : CALL get_group_dist(gd_B_virtual(jspin), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
1835 70 : send_ab(1:send_B_size, 1:dimen_RI) => buffer_1D(offset + 1:offset + INT(dimen_RI, int_8)*send_B_size)
1836 88620 : send_ab = 0.0_dp
1837 : CALL mp2_env%local_gemm_ctx%gemm('N', 'T', send_B_size, dimen_RI, my_B_size(ispin), mp2_env%scale_S, &
1838 : local_ba(send_B_virtual_start:send_B_virtual_end, :), send_B_size, &
1839 70 : my_local_i_aL, dimen_RI, 0.0_dp, send_ab, send_B_size)
1840 70 : CALL para_env_sub%sendrecv(send_ab, proc_send, external_ab, proc_receive)
1841 177812 : Y_j_aP(:, :) = Y_j_aP + external_ab
1842 :
1843 : END DO
1844 :
1845 : ! Here, we just use approximate bounds. For large systems virtual(ispin) is approx virtual(jspin), same for B_size
1846 502 : CALL dgemm_counter_stop(dgemm_counter, 3*virtual(ispin), dimen_RI, my_B_size(jspin))
1847 : ELSE
1848 1098 : CALL dgemm_counter_stop(dgemm_counter, virtual(ispin), dimen_RI, my_B_size(ispin))
1849 : END IF
1850 :
1851 1600 : IF ((my_i /= my_j) .AND. (.NOT. alpha_beta)) THEN
1852 : ! Alpha-alpha, beta-beta and closed shell
1853 811 : CALL dgemm_counter_start(dgemm_counter)
1854 : CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(ispin), dimen_RI, my_B_size(ispin), 1.0_dp, &
1855 : t_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), :), my_B_size(ispin), &
1856 811 : my_local_i_aL, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin))
1857 897 : DO proc_shift = 1, para_env_sub%num_pe - 1
1858 86 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
1859 86 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
1860 :
1861 86 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
1862 :
1863 86 : external_ab(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, int_8)*rec_B_size)
1864 86837 : external_ab = 0.0_dp
1865 :
1866 : CALL para_env_sub%sendrecv(my_local_i_aL, proc_send, &
1867 86 : external_ab, proc_receive)
1868 :
1869 : ! Alpha-alpha, beta-beta and closed shell
1870 : CALL mp2_env%local_gemm_ctx%gemm('T', 'T', my_B_size(ispin), dimen_RI, rec_B_size, 1.0_dp, &
1871 : t_ab(rec_B_virtual_start:rec_B_virtual_end, :), rec_B_size, &
1872 983 : external_ab, dimen_RI, 1.0_dp, Y_j_aP, my_B_size(ispin))
1873 : END DO
1874 :
1875 811 : CALL dgemm_counter_stop(dgemm_counter, my_B_size(ispin), dimen_RI, virtual(ispin))
1876 : END IF
1877 :
1878 1600 : CALL timestop(handle)
1879 1600 : END SUBROUTINE mp2_update_P_gamma
1880 :
1881 : ! **************************************************************************************************
1882 : !> \brief ...
1883 : !> \param Gamma_P_ia ...
1884 : !> \param ij_index ...
1885 : !> \param my_B_size ...
1886 : !> \param my_block_size ...
1887 : !> \param my_group_L_size ...
1888 : !> \param my_i ...
1889 : !> \param my_ij_pairs ...
1890 : !> \param ngroup ...
1891 : !> \param num_integ_group ...
1892 : !> \param integ_group_pos2color_sub ...
1893 : !> \param num_ij_pairs ...
1894 : !> \param ij_map ...
1895 : !> \param ranges_info_array ...
1896 : !> \param Y_i_aP ...
1897 : !> \param comm_exchange ...
1898 : !> \param sizes_array ...
1899 : !> \param spin ...
1900 : !> \param buffer_1D ...
1901 : ! **************************************************************************************************
1902 3194 : SUBROUTINE mp2_redistribute_gamma(Gamma_P_ia, ij_index, my_B_size, &
1903 : my_block_size, my_group_L_size, my_i, my_ij_pairs, ngroup, &
1904 : num_integ_group, integ_group_pos2color_sub, num_ij_pairs, &
1905 3194 : ij_map, ranges_info_array, Y_i_aP, comm_exchange, &
1906 3194 : sizes_array, spin, buffer_1D)
1907 :
1908 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: Gamma_P_ia
1909 : INTEGER, INTENT(IN) :: ij_index, my_B_size, my_block_size, &
1910 : my_group_L_size, my_i, my_ij_pairs, &
1911 : ngroup, num_integ_group
1912 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub, num_ij_pairs
1913 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(IN) :: ij_map
1914 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
1915 : INTENT(IN) :: ranges_info_array
1916 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: Y_i_aP
1917 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
1918 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
1919 : INTEGER, INTENT(IN) :: spin
1920 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
1921 :
1922 : CHARACTER(LEN=*), PARAMETER :: routineN = 'mp2_redistribute_gamma'
1923 :
1924 : INTEGER :: end_point, handle, handle2, iiB, ij_counter_rec, irep, kkk, lll, Lstart_pos, &
1925 : proc_receive, proc_send, proc_shift, rec_i, rec_ij_index, send_L_size, start_point, tag
1926 : INTEGER(KIND=int_8) :: offset
1927 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :, :), &
1928 3194 : POINTER :: BI_C_rec, BI_C_send
1929 :
1930 : ! In alpha-beta case Y_i_aP_beta is sent as Y_j_aP
1931 :
1932 3194 : CALL timeset(routineN//"_comm2", handle)
1933 :
1934 3194 : tag = 43
1935 :
1936 3194 : IF (ij_index <= my_ij_pairs) THEN
1937 : ! somethig to send
1938 : ! start with myself
1939 3176 : CALL timeset(routineN//"_comm2_w", handle2)
1940 9110 : DO irep = 0, num_integ_group - 1
1941 5934 : Lstart_pos = ranges_info_array(1, irep, comm_exchange%mepos)
1942 5934 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
1943 5934 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
1944 : !$OMP PARALLEL DO DEFAULT(NONE) &
1945 : !$OMP PRIVATE(kkk,lll,iiB) &
1946 : !$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
1947 9110 : !$OMP Gamma_P_ia,my_i,my_B_size,Y_i_aP)
1948 : DO kkk = start_point, end_point
1949 : lll = kkk - start_point + Lstart_pos
1950 : DO iiB = 1, my_block_size
1951 : Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) = &
1952 : Gamma_P_ia(1:my_B_size, my_i + iiB - 1, kkk) + &
1953 : Y_i_aP(1:my_B_size, lll, iiB)
1954 : END DO
1955 : END DO
1956 : !$OMP END PARALLEL DO
1957 : END DO
1958 3176 : CALL timestop(handle2)
1959 :
1960 : ! Y_i_aP(my_B_size,dimen_RI,block_size)
1961 :
1962 3274 : DO proc_shift = 1, comm_exchange%num_pe - 1
1963 98 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
1964 98 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
1965 :
1966 98 : send_L_size = sizes_array(proc_send)
1967 : BI_C_send(1:my_B_size, 1:my_block_size, 1:send_L_size) => &
1968 98 : buffer_1D(1:INT(my_B_size, int_8)*my_block_size*send_L_size)
1969 :
1970 98 : offset = INT(my_B_size, int_8)*my_block_size*send_L_size
1971 :
1972 98 : CALL timeset(routineN//"_comm2_w", handle2)
1973 48692 : BI_C_send = 0.0_dp
1974 196 : DO irep = 0, num_integ_group - 1
1975 98 : Lstart_pos = ranges_info_array(1, irep, proc_send)
1976 98 : start_point = ranges_info_array(3, irep, proc_send)
1977 98 : end_point = ranges_info_array(4, irep, proc_send)
1978 : !$OMP PARALLEL DO DEFAULT(NONE) &
1979 : !$OMP PRIVATE(kkk,lll,iiB) &
1980 : !$OMP SHARED(start_point,end_point,Lstart_pos,my_block_size,&
1981 196 : !$OMP BI_C_send,my_B_size,Y_i_aP)
1982 : DO kkk = start_point, end_point
1983 : lll = kkk - start_point + Lstart_pos
1984 : DO iiB = 1, my_block_size
1985 : BI_C_send(1:my_B_size, iiB, kkk) = Y_i_aP(1:my_B_size, lll, iiB)
1986 : END DO
1987 : END DO
1988 : !$OMP END PARALLEL DO
1989 : END DO
1990 98 : CALL timestop(handle2)
1991 :
1992 98 : rec_ij_index = num_ij_pairs(proc_receive)
1993 :
1994 3372 : IF (ij_index <= rec_ij_index) THEN
1995 : ! we know that proc_receive has something to send for us, let's see what
1996 : ij_counter_rec = &
1997 80 : (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
1998 :
1999 80 : rec_i = ij_map(spin, ij_counter_rec)
2000 :
2001 : BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
2002 80 : buffer_1D(offset + 1:offset + INT(my_B_size, int_8)*my_block_size*my_group_L_size)
2003 44250 : BI_C_rec = 0.0_dp
2004 :
2005 : CALL comm_exchange%sendrecv(BI_C_send, proc_send, &
2006 80 : BI_C_rec, proc_receive, tag)
2007 :
2008 80 : CALL timeset(routineN//"_comm2_w", handle2)
2009 160 : DO irep = 0, num_integ_group - 1
2010 80 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
2011 80 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
2012 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
2013 : !$OMP SHARED(start_point,end_point,my_block_size,&
2014 160 : !$OMP Gamma_P_ia,rec_i,iiB,my_B_size,BI_C_rec)
2015 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
2016 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
2017 : BI_C_rec(1:my_B_size, :, start_point:end_point)
2018 : !$OMP END PARALLEL WORKSHARE
2019 : END DO
2020 80 : CALL timestop(handle2)
2021 :
2022 : ELSE
2023 : ! we have something to send but nothing to receive
2024 18 : CALL comm_exchange%send(BI_C_send, proc_send, tag)
2025 :
2026 : END IF
2027 :
2028 : END DO
2029 :
2030 : ELSE
2031 : ! noting to send check if we have to receive
2032 36 : DO proc_shift = 1, comm_exchange%num_pe - 1
2033 18 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2034 18 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2035 18 : rec_ij_index = num_ij_pairs(proc_receive)
2036 :
2037 36 : IF (ij_index <= rec_ij_index) THEN
2038 : ! we know that proc_receive has something to send for us, let's see what
2039 : ij_counter_rec = &
2040 18 : (ij_index - MIN(1, integ_group_pos2color_sub(proc_receive)))*ngroup + integ_group_pos2color_sub(proc_receive)
2041 :
2042 18 : rec_i = ij_map(spin, ij_counter_rec)
2043 :
2044 : BI_C_rec(1:my_B_size, 1:my_block_size, 1:my_group_L_size) => &
2045 18 : buffer_1D(1:INT(my_B_size, int_8)*my_block_size*my_group_L_size)
2046 :
2047 4442 : BI_C_rec = 0.0_dp
2048 :
2049 18 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2050 :
2051 18 : CALL timeset(routineN//"_comm2_w", handle2)
2052 36 : DO irep = 0, num_integ_group - 1
2053 18 : start_point = ranges_info_array(3, irep, comm_exchange%mepos)
2054 18 : end_point = ranges_info_array(4, irep, comm_exchange%mepos)
2055 : #if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
2056 : !$OMP PARALLEL WORKSHARE DEFAULT(NONE) &
2057 : !$OMP SHARED(start_point,end_point,my_block_size,&
2058 36 : !$OMP Gamma_P_ia,rec_i,my_B_size,BI_C_rec)
2059 : #endif
2060 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) = &
2061 : Gamma_P_ia(:, rec_i:rec_i + my_block_size - 1, start_point:end_point) + &
2062 : BI_C_rec(1:my_B_size, :, start_point:end_point)
2063 : #if !defined(__INTEL_LLVM_COMPILER) || (20250000 <= __INTEL_LLVM_COMPILER)
2064 : !$OMP END PARALLEL WORKSHARE
2065 : #endif
2066 : END DO
2067 18 : CALL timestop(handle2)
2068 :
2069 : END IF
2070 : END DO
2071 :
2072 : END IF
2073 3194 : CALL timestop(handle)
2074 :
2075 3194 : END SUBROUTINE mp2_redistribute_gamma
2076 :
2077 : ! **************************************************************************************************
2078 : !> \brief ...
2079 : !> \param mp2_env ...
2080 : !> \param Eigenval ...
2081 : !> \param homo ...
2082 : !> \param virtual ...
2083 : !> \param open_shell ...
2084 : !> \param beta_beta ...
2085 : !> \param Bib_C ...
2086 : !> \param unit_nr ...
2087 : !> \param dimen_RI ...
2088 : !> \param my_B_size ...
2089 : !> \param ngroup ...
2090 : !> \param my_group_L_size ...
2091 : !> \param color_sub ...
2092 : !> \param ranges_info_array ...
2093 : !> \param comm_exchange ...
2094 : !> \param para_env_sub ...
2095 : !> \param para_env ...
2096 : !> \param my_B_virtual_start ...
2097 : !> \param my_B_virtual_end ...
2098 : !> \param sizes_array ...
2099 : !> \param gd_B_virtual ...
2100 : !> \param integ_group_pos2color_sub ...
2101 : !> \param dgemm_counter ...
2102 : !> \param buffer_1D ...
2103 : ! **************************************************************************************************
2104 380 : SUBROUTINE quasi_degenerate_P_ij(mp2_env, Eigenval, homo, virtual, open_shell, &
2105 380 : beta_beta, Bib_C, unit_nr, dimen_RI, &
2106 380 : my_B_size, ngroup, my_group_L_size, &
2107 : color_sub, ranges_info_array, comm_exchange, para_env_sub, para_env, &
2108 380 : my_B_virtual_start, my_B_virtual_end, sizes_array, gd_B_virtual, &
2109 380 : integ_group_pos2color_sub, dgemm_counter, buffer_1D)
2110 : TYPE(mp2_type) :: mp2_env
2111 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
2112 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2113 : LOGICAL, INTENT(IN) :: open_shell, beta_beta
2114 : TYPE(three_dim_real_array), DIMENSION(:), &
2115 : INTENT(IN) :: BIb_C
2116 : INTEGER, INTENT(IN) :: unit_nr, dimen_RI
2117 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_size
2118 : INTEGER, INTENT(IN) :: ngroup, my_group_L_size, color_sub
2119 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :), &
2120 : INTENT(IN) :: ranges_info_array
2121 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2122 : TYPE(mp_para_env_type), INTENT(IN) :: para_env_sub, para_env
2123 : INTEGER, DIMENSION(:), INTENT(IN) :: my_B_virtual_start, my_B_virtual_end
2124 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: sizes_array
2125 : TYPE(group_dist_d1_type), DIMENSION(:), INTENT(IN) :: gd_B_virtual
2126 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(IN) :: integ_group_pos2color_sub
2127 : TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
2128 : REAL(KIND=dp), CONTIGUOUS, DIMENSION(:), TARGET :: buffer_1D
2129 :
2130 : CHARACTER(LEN=*), PARAMETER :: routineN = 'quasi_degenerate_P_ij'
2131 :
2132 : INTEGER :: a, a_global, b, b_global, block_size, decil, handle, handle2, ijk_counter, &
2133 : ijk_counter_send, ijk_index, ispin, kkB, kspin, max_block_size, max_ijk, my_i, my_ijk, &
2134 : my_j, my_k, my_last_k(2), my_virtual, nspins, proc_receive, proc_send, proc_shift, &
2135 : rec_B_size, rec_B_virtual_end, rec_B_virtual_start, rec_L_size, send_B_size, &
2136 : send_B_virtual_end, send_B_virtual_start, send_i, send_ijk_index, send_j, send_k, &
2137 : size_B_i, size_B_k, tag, tag2
2138 380 : INTEGER, ALLOCATABLE, DIMENSION(:) :: num_ijk
2139 380 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: ijk_map, send_last_k
2140 : LOGICAL :: alpha_beta, do_recv_i, do_recv_j, &
2141 : do_recv_k, do_send_i, do_send_j, &
2142 : do_send_k
2143 : REAL(KIND=dp) :: amp_fac, P_ij_elem, t_new, t_start
2144 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :), &
2145 380 : TARGET :: local_ab, local_aL_i, local_aL_j, t_ab
2146 380 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: local_aL_k
2147 380 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: BI_C_rec, external_ab, external_aL
2148 380 : REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: BI_C_rec_3D
2149 :
2150 380 : CALL timeset(routineN//"_ij_sing", handle)
2151 :
2152 380 : tag = 44
2153 380 : tag2 = 45
2154 :
2155 380 : nspins = SIZE(BIb_C)
2156 380 : alpha_beta = (nspins == 2)
2157 :
2158 : ! Set amplitude factor
2159 380 : amp_fac = mp2_env%scale_S + mp2_env%scale_T
2160 380 : IF (open_shell) amp_fac = mp2_env%scale_T
2161 :
2162 786 : ALLOCATE (send_last_k(2, comm_exchange%num_pe - 1))
2163 :
2164 : ! Loop(s) over orbital triplets
2165 838 : DO ispin = 1, nspins
2166 458 : size_B_i = my_B_size(ispin)
2167 458 : IF (ispin == 1 .AND. alpha_beta) THEN
2168 : kspin = 2
2169 : ELSE
2170 380 : kspin = 1
2171 : END IF
2172 458 : size_B_k = my_B_size(kspin)
2173 :
2174 : ! Find the number of quasi-degenerate orbitals and orbital triplets
2175 :
2176 : CALL Find_quasi_degenerate_ij(my_ijk, homo(ispin), homo(kspin), Eigenval(:, ispin), mp2_env, ijk_map, unit_nr, ngroup, &
2177 : .NOT. beta_beta .AND. ispin /= 2, comm_exchange, num_ijk, max_ijk, color_sub, &
2178 614 : SIZE(buffer_1D), my_group_L_size, size_B_k, para_env, virtual(ispin), size_B_i)
2179 :
2180 458 : my_virtual = virtual(ispin)
2181 458 : IF (SIZE(ijk_map, 2) > 0) THEN
2182 98 : max_block_size = ijk_map(4, 1)
2183 : ELSE
2184 : max_block_size = 1
2185 : END IF
2186 :
2187 1832 : ALLOCATE (local_aL_i(dimen_RI, size_B_i))
2188 1374 : ALLOCATE (local_aL_j(dimen_RI, size_B_i))
2189 2290 : ALLOCATE (local_aL_k(dimen_RI, size_B_k, max_block_size))
2190 1832 : ALLOCATE (t_ab(my_virtual, size_B_k))
2191 :
2192 1374 : my_last_k = -1
2193 548 : send_last_k = -1
2194 :
2195 458 : t_start = m_walltime()
2196 614 : DO ijk_index = 1, max_ijk
2197 :
2198 : ! Prediction is unreliable if we are in the first step of the loop
2199 156 : IF (unit_nr > 0 .AND. ijk_index > 1) THEN
2200 18 : decil = ijk_index*10/max_ijk
2201 18 : IF (decil /= (ijk_index - 1)*10/max_ijk) THEN
2202 18 : t_new = m_walltime()
2203 18 : t_new = (t_new - t_start)/60.0_dp*(max_ijk - ijk_index + 1)/(ijk_index - 1)
2204 : WRITE (unit_nr, FMT="(T3,A)") "Percentage of finished loop: "// &
2205 18 : cp_to_string(decil*10)//". Minutes left: "//cp_to_string(t_new)
2206 18 : CALL m_flush(unit_nr)
2207 : END IF
2208 : END IF
2209 :
2210 614 : IF (ijk_index <= my_ijk) THEN
2211 : ! work to be done
2212 154 : ijk_counter = (ijk_index - MIN(1, color_sub))*ngroup + color_sub
2213 154 : my_i = ijk_map(1, ijk_counter)
2214 154 : my_j = ijk_map(2, ijk_counter)
2215 154 : my_k = ijk_map(3, ijk_counter)
2216 154 : block_size = ijk_map(4, ijk_counter)
2217 :
2218 154 : do_recv_i = (ispin /= kspin) .OR. my_i < my_k .OR. my_i > my_k + block_size - 1
2219 154 : do_recv_j = (ispin /= kspin) .OR. my_j < my_k .OR. my_j > my_k + block_size - 1
2220 154 : do_recv_k = my_k /= my_last_k(1) .OR. my_k + block_size - 1 /= my_last_k(2)
2221 154 : my_last_k(1) = my_k
2222 154 : my_last_k(2) = my_k + block_size - 1
2223 :
2224 131054 : local_aL_i = 0.0_dp
2225 154 : IF (do_recv_i) THEN
2226 : CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, comm_exchange%mepos), &
2227 125 : BIb_C(ispin)%array(:, :, my_i))
2228 : END IF
2229 :
2230 131054 : local_aL_j = 0.0_dp
2231 154 : IF (do_recv_j) THEN
2232 : CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, comm_exchange%mepos), &
2233 125 : BIb_C(ispin)%array(:, :, my_j))
2234 : END IF
2235 :
2236 154 : IF (do_recv_k) THEN
2237 224533 : local_aL_k = 0.0_dp
2238 : CALL fill_local_i_aL(local_aL_k(:, :, 1:block_size), ranges_info_array(:, :, comm_exchange%mepos), &
2239 146 : BIb_C(kspin)%array(:, :, my_k:my_k + block_size - 1))
2240 : END IF
2241 :
2242 154 : CALL timeset(routineN//"_comm", handle2)
2243 164 : DO proc_shift = 1, comm_exchange%num_pe - 1
2244 10 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2245 10 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2246 :
2247 10 : send_ijk_index = num_ijk(proc_send)
2248 :
2249 10 : rec_L_size = sizes_array(proc_receive)
2250 10 : BI_C_rec(1:rec_L_size, 1:size_B_i) => buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_i)
2251 :
2252 10 : do_send_i = .FALSE.
2253 10 : do_send_j = .FALSE.
2254 10 : do_send_k = .FALSE.
2255 10 : IF (ijk_index <= send_ijk_index) THEN
2256 : ! something to send
2257 : ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))* &
2258 8 : ngroup + integ_group_pos2color_sub(proc_send)
2259 8 : send_i = ijk_map(1, ijk_counter_send)
2260 8 : send_j = ijk_map(2, ijk_counter_send)
2261 8 : send_k = ijk_map(3, ijk_counter_send)
2262 :
2263 8 : do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2264 8 : do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2265 8 : do_send_k = send_k /= send_last_k(1, proc_shift) .OR. send_k + block_size - 1 /= send_last_k(2, proc_shift)
2266 8 : send_last_k(1, proc_shift) = send_k
2267 8 : send_last_k(2, proc_shift) = send_k + block_size - 1
2268 : END IF
2269 :
2270 : ! occupied i
2271 722 : BI_C_rec = 0.0_dp
2272 10 : IF (do_send_i) THEN
2273 6 : IF (do_recv_i) THEN
2274 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_i), proc_send, &
2275 288 : BI_C_rec, proc_receive, tag)
2276 : ELSE
2277 2 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
2278 : END IF
2279 4 : ELSE IF (do_recv_i) THEN
2280 580 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2281 : END IF
2282 10 : IF (do_recv_i) THEN
2283 8 : CALL fill_local_i_aL_2D(local_al_i, ranges_info_array(:, :, proc_receive), BI_C_rec)
2284 : END IF
2285 :
2286 : ! occupied j
2287 722 : BI_C_rec = 0.0_dp
2288 10 : IF (do_send_j) THEN
2289 8 : IF (do_recv_j) THEN
2290 : CALL comm_exchange%sendrecv(BIb_C(ispin)%array(:, :, send_j), proc_send, &
2291 576 : BI_C_rec, proc_receive, tag)
2292 : ELSE
2293 0 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
2294 : END IF
2295 2 : ELSE IF (do_recv_j) THEN
2296 0 : CALL comm_exchange%recv(BI_C_rec, proc_receive, tag)
2297 : END IF
2298 8 : IF (do_recv_j) THEN
2299 8 : CALL fill_local_i_aL_2D(local_al_j, ranges_info_array(:, :, proc_receive), BI_C_rec)
2300 : END IF
2301 :
2302 : ! occupied k
2303 : BI_C_rec_3D(1:rec_L_size, 1:size_B_k, 1:block_size) => &
2304 10 : buffer_1D(1:INT(rec_L_size, KIND=int_8)*size_B_k*block_size)
2305 10 : IF (do_send_k) THEN
2306 8 : IF (do_recv_k) THEN
2307 : CALL comm_exchange%sendrecv(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, &
2308 726 : BI_C_rec_3D, proc_receive, tag)
2309 : ELSE
2310 0 : CALL comm_exchange%send(BI_C_rec, proc_receive, tag)
2311 : END IF
2312 2 : ELSE IF (do_recv_k) THEN
2313 294 : CALL comm_exchange%recv(BI_C_rec_3D, proc_receive, tag)
2314 : END IF
2315 164 : IF (do_recv_k) THEN
2316 10 : CALL fill_local_i_aL(local_al_k(:, :, 1:block_size), ranges_info_array(:, :, proc_receive), BI_C_rec_3D)
2317 : END IF
2318 : END DO
2319 :
2320 20868 : IF (.NOT. do_recv_i) local_aL_i(:, :) = local_aL_k(:, :, my_i - my_k + 1)
2321 20868 : IF (.NOT. do_recv_j) local_aL_j(:, :) = local_aL_k(:, :, my_j - my_k + 1)
2322 154 : CALL timestop(handle2)
2323 :
2324 : ! expand integrals
2325 376 : DO kkB = 1, block_size
2326 222 : CALL timeset(routineN//"_exp_ik", handle2)
2327 222 : CALL dgemm_counter_start(dgemm_counter)
2328 666 : ALLOCATE (local_ab(my_virtual, size_B_k))
2329 32638 : local_ab = 0.0_dp
2330 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
2331 : local_aL_i, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2332 222 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i)
2333 222 : DO proc_shift = 1, para_env_sub%num_pe - 1
2334 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2335 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2336 :
2337 0 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2338 :
2339 0 : external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
2340 :
2341 : CALL comm_exchange%sendrecv(local_aL_i, proc_send, &
2342 0 : external_aL, proc_receive, tag)
2343 :
2344 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
2345 : external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2346 222 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size)
2347 : END DO
2348 222 : CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
2349 222 : CALL timestop(handle2)
2350 :
2351 : ! Amplitudes
2352 222 : CALL timeset(routineN//"_tab", handle2)
2353 32638 : t_ab = 0.0_dp
2354 : ! Alpha-alpha, beta-beta and closed shell
2355 222 : IF (.NOT. alpha_beta) THEN
2356 1156 : DO b = 1, size_B_k
2357 1038 : b_global = b + my_B_virtual_start(1) - 1
2358 17250 : DO a = 1, my_B_size(1)
2359 16094 : a_global = a + my_B_virtual_start(1) - 1
2360 : t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*local_ab(b_global, a))/ &
2361 : (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
2362 17132 : - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
2363 : END DO
2364 : END DO
2365 : ELSE
2366 1030 : DO b = 1, size_B_k
2367 926 : b_global = b + my_B_virtual_start(kspin) - 1
2368 15388 : DO a = 1, my_B_size(ispin)
2369 14358 : a_global = a + my_B_virtual_start(ispin) - 1
2370 : t_ab(a_global, b) = mp2_env%scale_S*local_ab(a_global, b)/ &
2371 : (Eigenval(my_i, ispin) + Eigenval(my_k + kkB - 1, kspin) &
2372 15284 : - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
2373 : END DO
2374 : END DO
2375 : END IF
2376 :
2377 222 : IF (.NOT. alpha_beta) THEN
2378 118 : DO proc_shift = 1, para_env_sub%num_pe - 1
2379 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2380 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2381 0 : CALL get_group_dist(gd_B_virtual(1), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2382 0 : CALL get_group_dist(gd_B_virtual(1), proc_send, send_B_virtual_start, send_B_virtual_end, send_B_size)
2383 :
2384 0 : external_ab(1:size_B_i, 1:rec_B_size) => buffer_1D(1:INT(size_B_i, KIND=int_8)*rec_B_size)
2385 : CALL para_env_sub%sendrecv(local_ab(send_B_virtual_start:send_B_virtual_end, 1:size_B_k), proc_send, &
2386 0 : external_ab(1:size_B_i, 1:rec_B_size), proc_receive, tag)
2387 :
2388 118 : DO b = 1, my_B_size(1)
2389 0 : b_global = b + my_B_virtual_start(1) - 1
2390 0 : DO a = 1, rec_B_size
2391 0 : a_global = a + rec_B_virtual_start - 1
2392 : t_ab(a_global, b) = (amp_fac*local_ab(a_global, b) - mp2_env%scale_T*external_ab(b, a))/ &
2393 : (Eigenval(my_i, 1) + Eigenval(my_k + kkB - 1, 1) &
2394 0 : - Eigenval(homo(1) + a_global, 1) - Eigenval(homo(1) + b_global, 1))
2395 : END DO
2396 : END DO
2397 : END DO
2398 : END IF
2399 222 : CALL timestop(handle2)
2400 :
2401 : ! Expand the second set of integrals
2402 222 : CALL timeset(routineN//"_exp_jk", handle2)
2403 32638 : local_ab = 0.0_dp
2404 222 : CALL dgemm_counter_start(dgemm_counter)
2405 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', size_B_i, size_B_k, dimen_RI, 1.0_dp, &
2406 : local_aL_j, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2407 222 : 0.0_dp, local_ab(my_B_virtual_start(ispin):my_B_virtual_end(ispin), 1:size_B_k), size_B_i)
2408 222 : DO proc_shift = 1, para_env_sub%num_pe - 1
2409 0 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2410 0 : proc_receive = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2411 :
2412 0 : CALL get_group_dist(gd_B_virtual(ispin), proc_receive, rec_B_virtual_start, rec_B_virtual_end, rec_B_size)
2413 :
2414 0 : external_aL(1:dimen_RI, 1:rec_B_size) => buffer_1D(1:INT(dimen_RI, KIND=int_8)*rec_B_size)
2415 :
2416 : CALL comm_exchange%sendrecv(local_aL_j, proc_send, &
2417 0 : external_aL, proc_receive, tag)
2418 : CALL mp2_env%local_gemm_ctx%gemm('T', 'N', rec_B_size, size_B_k, dimen_RI, 1.0_dp, &
2419 : external_aL, dimen_RI, local_aL_k(:, :, kkB), dimen_RI, &
2420 222 : 0.0_dp, local_ab(rec_B_virtual_start:rec_B_virtual_end, 1:size_B_k), rec_B_size)
2421 : END DO
2422 222 : CALL dgemm_counter_stop(dgemm_counter, my_virtual, size_B_k, dimen_RI)
2423 222 : CALL timestop(handle2)
2424 :
2425 222 : CALL timeset(routineN//"_Pij", handle2)
2426 2186 : DO b = 1, size_B_k
2427 1964 : b_global = b + my_B_virtual_start(kspin) - 1
2428 32638 : DO a = 1, my_B_size(ispin)
2429 30452 : a_global = a + my_B_virtual_start(ispin) - 1
2430 : local_ab(a_global, b) = &
2431 : local_ab(a_global, b)/(Eigenval(my_j, ispin) + Eigenval(my_k + kkB - 1, kspin) &
2432 32416 : - Eigenval(homo(ispin) + a_global, ispin) - Eigenval(homo(kspin) + b_global, kspin))
2433 : END DO
2434 : END DO
2435 : !
2436 32638 : P_ij_elem = SUM(local_ab*t_ab)
2437 222 : DEALLOCATE (local_ab)
2438 222 : IF ((.NOT. open_shell) .AND. (.NOT. alpha_beta)) THEN
2439 4 : P_ij_elem = P_ij_elem*2.0_dp
2440 : END IF
2441 222 : IF (beta_beta) THEN
2442 34 : mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(2)%array(my_i, my_j) - P_ij_elem
2443 34 : mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(2)%array(my_j, my_i) - P_ij_elem
2444 : ELSE
2445 188 : mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) - P_ij_elem
2446 188 : mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) = mp2_env%ri_grad%P_ij(ispin)%array(my_j, my_i) - P_ij_elem
2447 : END IF
2448 1264 : CALL timestop(handle2)
2449 : END DO
2450 : ELSE
2451 2 : CALL timeset(routineN//"_comm", handle2)
2452 : ! no work to be done, possible messeges to be exchanged
2453 4 : DO proc_shift = 1, comm_exchange%num_pe - 1
2454 2 : proc_send = MODULO(comm_exchange%mepos + proc_shift, comm_exchange%num_pe)
2455 2 : proc_receive = MODULO(comm_exchange%mepos - proc_shift, comm_exchange%num_pe)
2456 :
2457 2 : send_ijk_index = num_ijk(proc_send)
2458 :
2459 4 : IF (ijk_index <= send_ijk_index) THEN
2460 : ! somethig to send
2461 : ijk_counter_send = (ijk_index - MIN(1, integ_group_pos2color_sub(proc_send)))*ngroup + &
2462 2 : integ_group_pos2color_sub(proc_send)
2463 2 : send_i = ijk_map(1, ijk_counter_send)
2464 2 : send_j = ijk_map(2, ijk_counter_send)
2465 2 : send_k = ijk_map(3, ijk_counter_send)
2466 2 : block_size = ijk_map(4, ijk_counter_send)
2467 :
2468 2 : do_send_i = (ispin /= kspin) .OR. send_i < send_k .OR. send_i > send_k + block_size - 1
2469 2 : do_send_j = (ispin /= kspin) .OR. send_j < send_k .OR. send_j > send_k + block_size - 1
2470 : ! occupied i
2471 2 : IF (do_send_i) THEN
2472 2 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_i), proc_send, tag)
2473 : END IF
2474 : ! occupied j
2475 2 : IF (do_send_j) THEN
2476 0 : CALL comm_exchange%send(BIb_C(ispin)%array(:, :, send_j), proc_send, tag)
2477 : END IF
2478 : ! occupied k
2479 2 : CALL comm_exchange%send(BIb_C(kspin)%array(:, :, send_k:send_k + block_size - 1), proc_send, tag)
2480 : END IF
2481 :
2482 : END DO ! proc loop
2483 2 : CALL timestop(handle2)
2484 : END IF
2485 : END DO ! ijk_index loop
2486 458 : DEALLOCATE (local_aL_i)
2487 458 : DEALLOCATE (local_aL_j)
2488 458 : DEALLOCATE (local_aL_k)
2489 458 : DEALLOCATE (t_ab)
2490 838 : DEALLOCATE (ijk_map)
2491 : END DO ! over number of loops (ispin)
2492 380 : CALL timestop(handle)
2493 :
2494 760 : END SUBROUTINE Quasi_degenerate_P_ij
2495 :
2496 : ! **************************************************************************************************
2497 : !> \brief ...
2498 : !> \param my_ijk ...
2499 : !> \param homo ...
2500 : !> \param homo_beta ...
2501 : !> \param Eigenval ...
2502 : !> \param mp2_env ...
2503 : !> \param ijk_map ...
2504 : !> \param unit_nr ...
2505 : !> \param ngroup ...
2506 : !> \param do_print_alpha ...
2507 : !> \param comm_exchange ...
2508 : !> \param num_ijk ...
2509 : !> \param max_ijk ...
2510 : !> \param color_sub ...
2511 : !> \param buffer_size ...
2512 : !> \param my_group_L_size ...
2513 : !> \param B_size_k ...
2514 : !> \param para_env ...
2515 : !> \param virtual ...
2516 : !> \param B_size_i ...
2517 : ! **************************************************************************************************
2518 458 : SUBROUTINE Find_quasi_degenerate_ij(my_ijk, homo, homo_beta, Eigenval, mp2_env, ijk_map, unit_nr, ngroup, &
2519 : do_print_alpha, comm_exchange, num_ijk, max_ijk, color_sub, &
2520 : buffer_size, my_group_L_size, B_size_k, para_env, virtual, B_size_i)
2521 :
2522 : INTEGER, INTENT(OUT) :: my_ijk
2523 : INTEGER, INTENT(IN) :: homo, homo_beta
2524 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
2525 : TYPE(mp2_type), INTENT(IN) :: mp2_env
2526 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: ijk_map
2527 : INTEGER, INTENT(IN) :: unit_nr, ngroup
2528 : LOGICAL, INTENT(IN) :: do_print_alpha
2529 : TYPE(mp_comm_type), INTENT(IN) :: comm_exchange
2530 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: num_ijk
2531 : INTEGER, INTENT(OUT) :: max_ijk
2532 : INTEGER, INTENT(IN) :: color_sub, buffer_size, my_group_L_size, &
2533 : B_size_k
2534 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
2535 : INTEGER, INTENT(IN) :: virtual, B_size_i
2536 :
2537 : INTEGER :: block_size, communication_steps, communication_volume, iib, ij_counter, &
2538 : ijk_counter, jjb, kkb, max_block_size, max_num_k_blocks, min_communication_volume, &
2539 : my_steps, num_k_blocks, num_sing_ij, total_ijk
2540 : INTEGER(KIND=int_8) :: mem
2541 458 : LOGICAL, ALLOCATABLE, DIMENSION(:, :) :: ijk_marker
2542 :
2543 1374 : ALLOCATE (num_ijk(0:comm_exchange%num_pe - 1))
2544 :
2545 458 : num_sing_ij = 0
2546 2072 : DO iiB = 1, homo
2547 : ! diagonal elements already updated
2548 4324 : DO jjB = iiB + 1, homo
2549 2252 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) < mp2_env%ri_grad%eps_canonical) &
2550 1728 : num_sing_ij = num_sing_ij + 1
2551 : END DO
2552 : END DO
2553 :
2554 458 : IF (unit_nr > 0) THEN
2555 229 : IF (do_print_alpha) THEN
2556 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2557 151 : "MO_INFO| Number of ij pairs below EPS_CANONICAL:", num_sing_ij
2558 : ELSE
2559 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2560 78 : "MO_INFO| Number of ij pairs (spin beta) below EPS_CANONICAL:", num_sing_ij
2561 : END IF
2562 : END IF
2563 :
2564 : ! Determine the block size, first guess: use available buffer
2565 458 : max_block_size = buffer_size/(my_group_L_size*B_size_k)
2566 :
2567 : ! Second limit: memory
2568 458 : CALL m_memory(mem)
2569 : ! Convert to number of doubles
2570 458 : mem = mem/8
2571 : ! Remove local_ab (2x) and local_aL_i (2x)
2572 458 : mem = mem - 2_int_8*(virtual*B_size_k + B_size_i*my_group_L_size)
2573 458 : max_block_size = MIN(max_block_size, MAX(1, INT(mem/(my_group_L_size*B_size_k), KIND(max_block_size))))
2574 :
2575 : ! Exchange the limit
2576 458 : CALL para_env%min(max_block_size)
2577 :
2578 : ! Find now the block size which minimizes the communication volume and then the number of communication steps
2579 458 : block_size = 1
2580 458 : min_communication_volume = 3*homo_beta*num_sing_ij
2581 458 : communication_steps = 3*homo_beta*num_sing_ij
2582 1166 : DO iiB = max_block_size, 2, -1
2583 708 : max_num_k_blocks = homo_beta/iiB*num_sing_ij
2584 708 : num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
2585 708 : communication_volume = num_k_blocks*(2 + iiB) + 3*(homo_beta*num_sing_ij - iiB*num_k_blocks)
2586 708 : my_steps = num_k_blocks + homo_beta*num_sing_ij - iiB*num_k_blocks
2587 1166 : IF (communication_volume < min_communication_volume) THEN
2588 52 : block_size = iiB
2589 52 : min_communication_volume = communication_volume
2590 52 : communication_steps = my_steps
2591 656 : ELSE IF (communication_volume == min_communication_volume .AND. my_steps < communication_steps) THEN
2592 58 : block_size = iiB
2593 58 : communication_steps = my_steps
2594 : END IF
2595 : END DO
2596 :
2597 458 : IF (unit_nr > 0) THEN
2598 : WRITE (UNIT=unit_nr, FMT="(T3,A,T75,i6)") &
2599 229 : "MO_INFO| Block size:", block_size
2600 229 : CALL m_flush(unit_nr)
2601 : END IF
2602 :
2603 : ! Calculate number of large blocks
2604 458 : max_num_k_blocks = homo_beta/block_size*num_sing_ij
2605 458 : num_k_blocks = max_num_k_blocks - MOD(max_num_k_blocks, ngroup)
2606 :
2607 458 : total_ijk = num_k_blocks + homo_beta*num_sing_ij - num_k_blocks*block_size
2608 1014 : ALLOCATE (ijk_map(4, total_ijk))
2609 1998 : ijk_map = 0
2610 1472 : ALLOCATE (ijk_marker(homo_beta, num_sing_ij))
2611 1016 : ijk_marker = .TRUE.
2612 :
2613 458 : my_ijk = 0
2614 458 : ijk_counter = 0
2615 458 : ij_counter = 0
2616 2072 : DO iiB = 1, homo
2617 : ! diagonal elements already updated
2618 4324 : DO jjB = iiB + 1, homo
2619 2252 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
2620 114 : ij_counter = ij_counter + 1
2621 1864 : DO kkB = 1, homo_beta - MOD(homo_beta, block_size), block_size
2622 172 : IF (ijk_counter + 1 > num_k_blocks) EXIT
2623 136 : ijk_counter = ijk_counter + 1
2624 408 : ijk_marker(kkB:kkB + block_size - 1, ij_counter) = .FALSE.
2625 136 : ijk_map(1, ijk_counter) = iiB
2626 136 : ijk_map(2, ijk_counter) = jjB
2627 136 : ijk_map(3, ijk_counter) = kkB
2628 136 : ijk_map(4, ijk_counter) = block_size
2629 2388 : IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2630 : END DO
2631 : END DO
2632 : END DO
2633 : ij_counter = 0
2634 2072 : DO iiB = 1, homo
2635 : ! diagonal elements already updated
2636 4324 : DO jjB = iiB + 1, homo
2637 2252 : IF (ABS(Eigenval(jjB) - Eigenval(iiB)) >= mp2_env%ri_grad%eps_canonical) CYCLE
2638 114 : ij_counter = ij_counter + 1
2639 2172 : DO kkB = 1, homo_beta
2640 2696 : IF (ijk_marker(kkB, ij_counter)) THEN
2641 172 : ijk_counter = ijk_counter + 1
2642 172 : ijk_map(1, ijk_counter) = iiB
2643 172 : ijk_map(2, ijk_counter) = jjB
2644 172 : ijk_map(3, ijk_counter) = kkB
2645 172 : ijk_map(4, ijk_counter) = 1
2646 172 : IF (MOD(ijk_counter, ngroup) == color_sub) my_ijk = my_ijk + 1
2647 : END IF
2648 : END DO
2649 : END DO
2650 : END DO
2651 :
2652 458 : DEALLOCATE (ijk_marker)
2653 :
2654 458 : CALL comm_exchange%allgather(my_ijk, num_ijk)
2655 946 : max_ijk = MAXVAL(num_ijk)
2656 :
2657 458 : END SUBROUTINE Find_quasi_degenerate_ij
2658 :
2659 : END MODULE mp2_ri_gpw
|