Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Routines to calculate RI-RPA and SOS-MP2 gradients
10 : !> \par History
11 : !> 10.2021 created [Frederick Stein]
12 : ! **************************************************************************************************
13 : MODULE rpa_grad
14 : USE cp_array_utils, ONLY: cp_1d_r_cp_type,&
15 : cp_3d_r_cp_type
16 : USE cp_blacs_env, ONLY: cp_blacs_env_type
17 : USE cp_fm_basic_linalg, ONLY: cp_fm_geadd,&
18 : cp_fm_scale_and_add,&
19 : cp_fm_upper_to_full
20 : USE cp_fm_cholesky, ONLY: cp_fm_cholesky_invert
21 : USE cp_fm_struct, ONLY: cp_fm_struct_create,&
22 : cp_fm_struct_get,&
23 : cp_fm_struct_release,&
24 : cp_fm_struct_type
25 : USE cp_fm_types, ONLY: cp_fm_create,&
26 : cp_fm_get_info,&
27 : cp_fm_release,&
28 : cp_fm_set_all,&
29 : cp_fm_to_fm,&
30 : cp_fm_to_fm_submat_general,&
31 : cp_fm_type
32 : USE dgemm_counter_types, ONLY: dgemm_counter_start,&
33 : dgemm_counter_stop,&
34 : dgemm_counter_type
35 : USE group_dist_types, ONLY: create_group_dist,&
36 : get_group_dist,&
37 : group_dist_d1_type,&
38 : group_dist_proc,&
39 : maxsize,&
40 : release_group_dist
41 : USE kahan_sum, ONLY: accurate_dot_product,&
42 : accurate_dot_product_2
43 : USE kinds, ONLY: dp,&
44 : int_8
45 : USE libint_2c_3c, ONLY: compare_potential_types
46 : USE local_gemm_api, ONLY: LOCAL_GEMM_PU_GPU,&
47 : local_gemm_ctxt_type
48 : USE machine, ONLY: m_flush,&
49 : m_memory
50 : USE mathconstants, ONLY: pi
51 : USE message_passing, ONLY: mp_comm_type,&
52 : mp_para_env_type,&
53 : mp_request_null,&
54 : mp_request_type,&
55 : mp_waitall,&
56 : mp_waitany
57 : USE mp2_laplace, ONLY: calc_fm_mat_s_laplace
58 : USE mp2_ri_grad_util, ONLY: array2fm,&
59 : create_dbcsr_gamma,&
60 : fm2array,&
61 : prepare_redistribution
62 : USE mp2_types, ONLY: mp2_type,&
63 : one_dim_int_array,&
64 : two_dim_int_array,&
65 : two_dim_real_array
66 : USE parallel_gemm_api, ONLY: parallel_gemm
67 : USE qs_environment_types, ONLY: get_qs_env,&
68 : qs_environment_type
69 : USE rpa_util, ONLY: calc_fm_mat_S_rpa,&
70 : remove_scaling_factor_rpa
71 : USE util, ONLY: get_limit
72 : #include "./base/base_uses.f90"
73 :
74 : IMPLICIT NONE
75 :
76 : PRIVATE
77 :
78 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'rpa_grad'
79 :
80 : PUBLIC :: rpa_grad_needed_mem, rpa_grad_type, rpa_grad_create, rpa_grad_finalize, rpa_grad_matrix_operations, rpa_grad_copy_Q
81 :
82 : TYPE sos_mp2_grad_work_type
83 : PRIVATE
84 : INTEGER, DIMENSION(:, :), ALLOCATABLE :: pair_list
85 : TYPE(one_dim_int_array), DIMENSION(:), ALLOCATABLE :: index2send, index2recv
86 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: P
87 : END TYPE
88 :
89 : TYPE rpa_grad_work_type
90 : TYPE(cp_fm_type) :: fm_mat_Q_copy = cp_fm_type()
91 : TYPE(one_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2send
92 : TYPE(two_dim_int_array), DIMENSION(:, :), ALLOCATABLE :: index2recv
93 : TYPE(group_dist_d1_type), DIMENSION(:), ALLOCATABLE :: gd_homo, gd_virtual
94 : INTEGER, DIMENSION(2) :: grid = -1, mepos = -1
95 : TYPE(two_dim_real_array), DIMENSION(:), ALLOCATABLE :: P_ij, P_ab
96 : END TYPE
97 :
98 : TYPE rpa_grad_type
99 : PRIVATE
100 : TYPE(cp_fm_type) :: fm_Gamma_PQ = cp_fm_type()
101 : TYPE(cp_fm_type), DIMENSION(:), ALLOCATABLE :: fm_Y
102 : TYPE(sos_mp2_grad_work_type), ALLOCATABLE, DIMENSION(:) :: sos_mp2_work_occ, sos_mp2_work_virt
103 : TYPE(rpa_grad_work_type) :: rpa_work
104 : END TYPE
105 :
106 : INTEGER, PARAMETER :: spla_threshold = 128*128*128*2
107 : INTEGER, PARAMETER :: blksize_threshold = 4
108 :
109 : CONTAINS
110 :
111 : ! **************************************************************************************************
112 : !> \brief Calculates the necessary minimum memory for the Gradient code ion MiB
113 : !> \param homo ...
114 : !> \param virtual ...
115 : !> \param dimen_RI ...
116 : !> \param mem_per_rank ...
117 : !> \param mem_per_repl ...
118 : !> \param do_ri_sos_laplace_mp2 ...
119 : !> \return ...
120 : ! **************************************************************************************************
121 40 : PURE SUBROUTINE rpa_grad_needed_mem(homo, virtual, dimen_RI, mem_per_rank, mem_per_repl, do_ri_sos_laplace_mp2)
122 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
123 : INTEGER, INTENT(IN) :: dimen_RI
124 : REAL(KIND=dp), INTENT(INOUT) :: mem_per_rank, mem_per_repl
125 : LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2
126 :
127 : REAL(KIND=dp) :: mem_iaK, mem_KL, mem_pab, mem_pij
128 :
129 88 : mem_iaK = SUM(REAL(virtual, KIND=dp)*homo)*dimen_RI
130 88 : mem_pij = SUM(REAL(homo, KIND=dp)**2)
131 88 : mem_pab = SUM(REAL(virtual, KIND=dp)**2)
132 40 : mem_KL = REAL(dimen_RI, KIND=dp)*dimen_RI
133 :
134 : ! Required matrices iaK
135 : ! Ytot_iaP = sum_tau Y_iaP(tau)
136 : ! Y_iaP(tau) = S_iaP(tau)*Q_PQ(tau) (work array)
137 : ! Required matrices density matrices
138 : ! Pij (local)
139 : ! Pab (local)
140 : ! Additionally with SOS-MP2
141 : ! Send and receive buffers for degenerate orbital pairs (rough estimate: everything)
142 : ! Additionally with RPA
143 : ! copy of work matrix
144 : ! receive buffer for calculation of density matrix
145 : ! copy of matrix Q
146 40 : mem_per_rank = mem_per_rank + (mem_pij + mem_pab)*8.0_dp/(1024**2)
147 40 : mem_per_repl = mem_per_repl + (mem_iaK + 2.0_dp*mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
148 40 : IF (.NOT. do_ri_sos_laplace_mp2) THEN
149 20 : mem_per_repl = mem_per_rank + (mem_iaK/SIZE(homo) + mem_KL)*8.0_dp/(1024**2)
150 : END IF
151 :
152 40 : END SUBROUTINE rpa_grad_needed_mem
153 :
154 : ! **************************************************************************************************
155 : !> \brief Creates the arrays of a rpa_grad_type
156 : !> \param rpa_grad ...
157 : !> \param fm_mat_Q ...
158 : !> \param fm_mat_S ...
159 : !> \param homo ...
160 : !> \param virtual ...
161 : !> \param mp2_env ...
162 : !> \param Eigenval ...
163 : !> \param unit_nr ...
164 : !> \param do_ri_sos_laplace_mp2 ...
165 : ! **************************************************************************************************
166 280 : SUBROUTINE rpa_grad_create(rpa_grad, fm_mat_Q, fm_mat_S, &
167 40 : homo, virtual, mp2_env, Eigenval, unit_nr, do_ri_sos_laplace_mp2)
168 : TYPE(rpa_grad_type), INTENT(OUT) :: rpa_grad
169 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
170 : TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
171 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
172 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
173 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
174 : INTEGER, INTENT(IN) :: unit_nr
175 : LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2
176 :
177 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_create'
178 :
179 : INTEGER :: handle, ispin, nrow_local, nspins
180 :
181 40 : CALL timeset(routineN, handle)
182 :
183 40 : CALL cp_fm_create(rpa_grad%fm_Gamma_PQ, matrix_struct=fm_mat_Q%matrix_struct)
184 40 : CALL cp_fm_set_all(rpa_grad%fm_Gamma_PQ, 0.0_dp)
185 :
186 40 : nspins = SIZE(fm_mat_S)
187 :
188 168 : ALLOCATE (rpa_grad%fm_Y(nspins))
189 88 : DO ispin = 1, nspins
190 88 : CALL cp_fm_create(rpa_grad%fm_Y(ispin), fm_mat_S(ispin)%matrix_struct)
191 : END DO
192 :
193 40 : IF (do_ri_sos_laplace_mp2) THEN
194 : CALL sos_mp2_work_type_create(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
195 20 : unit_nr, Eigenval, homo, virtual, mp2_env%ri_grad%eps_canonical, fm_mat_S)
196 : ELSE
197 20 : CALL rpa_work_type_create(rpa_grad%rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
198 : END IF
199 :
200 : ! Set blocksize
201 40 : CALL cp_fm_struct_get(fm_mat_S(1)%matrix_struct, nrow_local=nrow_local)
202 40 : IF (mp2_env%ri_grad%dot_blksize < 1) mp2_env%ri_grad%dot_blksize = nrow_local
203 40 : mp2_env%ri_grad%dot_blksize = MIN(mp2_env%ri_grad%dot_blksize, nrow_local)
204 40 : IF (unit_nr > 0) THEN
205 20 : WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Block size for the contraction:', mp2_env%ri_grad%dot_blksize
206 20 : CALL m_flush(unit_nr)
207 : END IF
208 40 : CALL fm_mat_S(1)%matrix_struct%para_env%sync()
209 :
210 40 : CALL timestop(handle)
211 :
212 80 : END SUBROUTINE rpa_grad_create
213 :
214 : ! **************************************************************************************************
215 : !> \brief ...
216 : !> \param sos_mp2_work_occ ...
217 : !> \param sos_mp2_work_virt ...
218 : !> \param unit_nr ...
219 : !> \param Eigenval ...
220 : !> \param homo ...
221 : !> \param virtual ...
222 : !> \param eps_degenerate ...
223 : !> \param fm_mat_S ...
224 : ! **************************************************************************************************
225 20 : SUBROUTINE sos_mp2_work_type_create(sos_mp2_work_occ, sos_mp2_work_virt, unit_nr, &
226 20 : Eigenval, homo, virtual, eps_degenerate, fm_mat_S)
227 : TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
228 : DIMENSION(:), INTENT(OUT) :: sos_mp2_work_occ, sos_mp2_work_virt
229 : INTEGER, INTENT(IN) :: unit_nr
230 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
231 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
232 : REAL(KIND=dp), INTENT(IN) :: eps_degenerate
233 : TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
234 :
235 : CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_work_type_create'
236 :
237 : INTEGER :: handle, ispin, nspins
238 :
239 20 : CALL timeset(routineN, handle)
240 :
241 20 : nspins = SIZE(fm_mat_S)
242 148 : ALLOCATE (sos_mp2_work_occ(nspins), sos_mp2_work_virt(nspins))
243 44 : DO ispin = 1, nspins
244 :
245 : CALL create_list_nearly_degen_pairs(Eigenval(1:homo(ispin), ispin), &
246 24 : eps_degenerate, sos_mp2_work_occ(ispin)%pair_list)
247 24 : IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
248 12 : "MO_INFO| Number of ij pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
249 72 : ALLOCATE (sos_mp2_work_occ(ispin)%P(homo(ispin) + SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)))
250 164 : sos_mp2_work_occ(ispin)%P = 0.0_dp
251 24 : CALL prepare_comm_Pij(sos_mp2_work_occ(ispin), virtual(ispin), fm_mat_S(ispin))
252 :
253 : CALL create_list_nearly_degen_pairs(Eigenval(homo(ispin) + 1:, ispin), &
254 24 : eps_degenerate, sos_mp2_work_virt(ispin)%pair_list)
255 24 : IF (unit_nr > 0) WRITE (unit_nr, "(T3,A,T75,i6)") &
256 12 : "MO_INFO| Number of ab pairs below EPS_CANONICAL:", SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
257 72 : ALLOCATE (sos_mp2_work_virt(ispin)%P(virtual(ispin) + SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)))
258 1136 : sos_mp2_work_virt(ispin)%P = 0.0_dp
259 44 : CALL prepare_comm_Pab(sos_mp2_work_virt(ispin), virtual(ispin), fm_mat_S(ispin))
260 : END DO
261 :
262 20 : CALL timestop(handle)
263 :
264 20 : END SUBROUTINE
265 :
266 : ! **************************************************************************************************
267 : !> \brief ...
268 : !> \param rpa_work ...
269 : !> \param fm_mat_Q ...
270 : !> \param fm_mat_S ...
271 : !> \param homo ...
272 : !> \param virtual ...
273 : ! **************************************************************************************************
274 120 : SUBROUTINE rpa_work_type_create(rpa_work, fm_mat_Q, fm_mat_S, homo, virtual)
275 : TYPE(rpa_grad_work_type), INTENT(OUT) :: rpa_work
276 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
277 : TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
278 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
279 :
280 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_work_type_create'
281 :
282 : INTEGER :: avirt, col_global, col_local, handle, iocc, ispin, my_a, my_a_end, my_a_size, &
283 : my_a_start, my_i, my_i_end, my_i_size, my_i_start, my_pcol, ncol_local, nspins, &
284 : num_pe_col, proc_homo, proc_homo_send, proc_recv, proc_send, proc_virtual, &
285 : proc_virtual_send
286 20 : INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
287 20 : INTEGER, DIMENSION(:), POINTER :: col_indices
288 :
289 20 : CALL timeset(routineN, handle)
290 :
291 20 : CALL cp_fm_create(rpa_work%fm_mat_Q_copy, matrix_struct=fm_mat_Q%matrix_struct)
292 :
293 20 : CALL fm_mat_S(1)%matrix_struct%context%get(number_of_process_columns=num_pe_col, my_process_column=my_pcol)
294 :
295 20 : nspins = SIZE(fm_mat_S)
296 :
297 0 : ALLOCATE (rpa_work%index2send(0:num_pe_col - 1, nspins), &
298 0 : rpa_work%index2recv(0:num_pe_col - 1, nspins), &
299 0 : rpa_work%gd_homo(nspins), rpa_work%gd_virtual(nspins), &
300 : data2send(0:num_pe_col - 1), data2recv(0:num_pe_col - 1), &
301 512 : rpa_work%P_ij(nspins), rpa_work%P_ab(nspins))
302 :
303 : ! Determine new process grid
304 20 : proc_homo = MAX(1, CEILING(SQRT(REAL(num_pe_col, KIND=dp))))
305 20 : DO WHILE (MOD(num_pe_col, proc_homo) /= 0)
306 0 : proc_homo = proc_homo - 1
307 : END DO
308 20 : proc_virtual = num_pe_col/proc_homo
309 :
310 20 : rpa_work%grid(1) = proc_virtual
311 20 : rpa_work%grid(2) = proc_homo
312 :
313 20 : rpa_work%mepos(1) = MOD(my_pcol, proc_virtual)
314 20 : rpa_work%mepos(2) = my_pcol/proc_virtual
315 :
316 44 : DO ispin = 1, nspins
317 :
318 : ! Determine distributions of the orbitals
319 24 : CALL create_group_dist(rpa_work%gd_homo(ispin), proc_homo, homo(ispin))
320 24 : CALL create_group_dist(rpa_work%gd_virtual(ispin), proc_virtual, virtual(ispin))
321 :
322 24 : CALL cp_fm_struct_get(fm_mat_S(ispin)%matrix_struct, ncol_local=ncol_local, col_indices=col_indices)
323 :
324 48 : data2send = 0
325 : ! Count the amount of data2send to each process
326 1924 : DO col_local = 1, ncol_local
327 1900 : col_global = col_indices(col_local)
328 :
329 1900 : iocc = (col_global - 1)/virtual(ispin) + 1
330 1900 : avirt = col_global - (iocc - 1)*virtual(ispin)
331 :
332 1900 : proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
333 1900 : proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)
334 :
335 1900 : proc_send = proc_homo_send*proc_virtual + proc_virtual_send
336 :
337 1924 : data2send(proc_send) = data2send(proc_send) + 1
338 : END DO
339 :
340 48 : DO proc_send = 0, num_pe_col - 1
341 96 : ALLOCATE (rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)))
342 : END DO
343 :
344 : ! Prepare the indices
345 48 : data2send = 0
346 1924 : DO col_local = 1, ncol_local
347 1900 : col_global = col_indices(col_local)
348 :
349 1900 : iocc = (col_global - 1)/virtual(ispin) + 1
350 1900 : avirt = col_global - (iocc - 1)*virtual(ispin)
351 :
352 1900 : proc_homo_send = group_dist_proc(rpa_work%gd_homo(ispin), iocc)
353 1900 : proc_virtual_send = group_dist_proc(rpa_work%gd_virtual(ispin), avirt)
354 :
355 1900 : proc_send = proc_homo_send*proc_virtual + proc_virtual_send
356 :
357 1900 : data2send(proc_send) = data2send(proc_send) + 1
358 :
359 1924 : rpa_work%index2send(proc_send, ispin)%array(data2send(proc_send)) = col_local
360 : END DO
361 :
362 : ! Count the amount of data2recv from each process
363 24 : CALL get_group_dist(rpa_work%gd_homo(ispin), my_pcol/proc_virtual, my_i_start, my_i_end, my_i_size)
364 24 : CALL get_group_dist(rpa_work%gd_virtual(ispin), MOD(my_pcol, proc_virtual), my_a_start, my_a_end, my_a_size)
365 :
366 48 : data2recv = 0
367 116 : DO my_i = my_i_start, my_i_end
368 2016 : DO my_a = my_a_start, my_a_end
369 1900 : proc_recv = fm_mat_S(ispin)%matrix_struct%g2p_col((my_i - 1)*virtual(ispin) + my_a)
370 1992 : data2recv(proc_recv) = data2recv(proc_recv) + 1
371 : END DO
372 : END DO
373 :
374 48 : DO proc_recv = 0, num_pe_col - 1
375 96 : ALLOCATE (rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)))
376 : END DO
377 :
378 48 : data2recv = 0
379 116 : DO my_i = my_i_start, my_i_end
380 2016 : DO my_a = my_a_start, my_a_end
381 1900 : proc_recv = fm_mat_S(ispin)%matrix_struct%g2p_col((my_i - 1)*virtual(ispin) + my_a)
382 1900 : data2recv(proc_recv) = data2recv(proc_recv) + 1
383 :
384 1900 : rpa_work%index2recv(proc_recv, ispin)%array(2, data2recv(proc_recv)) = my_i - my_i_start + 1
385 1992 : rpa_work%index2recv(proc_recv, ispin)%array(1, data2recv(proc_recv)) = my_a - my_a_start + 1
386 : END DO
387 : END DO
388 :
389 0 : ALLOCATE (rpa_work%P_ij(ispin)%array(my_i_size, homo(ispin)), &
390 168 : rpa_work%P_ab(ispin)%array(my_a_size, virtual(ispin)))
391 472 : rpa_work%P_ij(ispin)%array(:, :) = 0.0_dp
392 11148 : rpa_work%P_ab(ispin)%array(:, :) = 0.0_dp
393 :
394 : END DO
395 :
396 20 : DEALLOCATE (data2send, data2recv)
397 :
398 20 : CALL timestop(handle)
399 :
400 40 : END SUBROUTINE
401 :
402 : ! **************************************************************************************************
403 : !> \brief ...
404 : !> \param Eigenval ...
405 : !> \param eps_degen ...
406 : !> \param pair_list ...
407 : ! **************************************************************************************************
408 48 : SUBROUTINE create_list_nearly_degen_pairs(Eigenval, eps_degen, pair_list)
409 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
410 : REAL(KIND=dp), INTENT(IN) :: eps_degen
411 : INTEGER, ALLOCATABLE, DIMENSION(:, :), INTENT(OUT) :: pair_list
412 :
413 : INTEGER :: my_i, my_j, num_orbitals, num_pairs, &
414 : pair_counter
415 :
416 48 : num_orbitals = SIZE(Eigenval)
417 :
418 : ! Determine number of nearly degenerate orbital pairs
419 : ! Trivial cases: diagonal elements
420 48 : num_pairs = 0
421 640 : DO my_i = 1, num_orbitals
422 11576 : DO my_j = 1, num_orbitals
423 10936 : IF (my_i == my_j) CYCLE
424 10936 : IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) num_pairs = num_pairs + 1
425 : END DO
426 : END DO
427 104 : ALLOCATE (pair_list(2, num_pairs))
428 :
429 : ! Print the required pairs
430 48 : pair_counter = 1
431 640 : DO my_i = 1, num_orbitals
432 11576 : DO my_j = 1, num_orbitals
433 10936 : IF (my_i == my_j) CYCLE
434 10936 : IF (ABS(Eigenval(my_i) - Eigenval(my_j)) < eps_degen) THEN
435 660 : pair_list(1, pair_counter) = my_i
436 660 : pair_list(2, pair_counter) = my_j
437 660 : pair_counter = pair_counter + 1
438 : END IF
439 : END DO
440 : END DO
441 :
442 48 : END SUBROUTINE create_list_nearly_degen_pairs
443 :
444 : ! **************************************************************************************************
445 : !> \brief ...
446 : !> \param sos_mp2_work ...
447 : !> \param virtual ...
448 : !> \param fm_mat_S ...
449 : ! **************************************************************************************************
450 24 : SUBROUTINE prepare_comm_Pij(sos_mp2_work, virtual, fm_mat_S)
451 : TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work
452 : INTEGER, INTENT(IN) :: virtual
453 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S
454 :
455 : CHARACTER(LEN=*), PARAMETER :: routineN = 'prepare_comm_Pij'
456 :
457 : INTEGER :: avirt, col_global, col_local, counter, handle, ij_counter, iocc, my_i, my_j, &
458 : my_pcol, my_prow, ncol_local, nrow_local, num_ij_pairs, num_pe_col, pcol, pcol_recv, &
459 : pcol_send, proc_shift, tag
460 24 : INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
461 24 : INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
462 24 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
463 : TYPE(cp_blacs_env_type), POINTER :: context
464 : TYPE(mp_comm_type) :: comm_exchange
465 : TYPE(mp_para_env_type), POINTER :: para_env
466 :
467 24 : CALL timeset(routineN, handle)
468 :
469 24 : tag = 44
470 :
471 24 : CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
472 0 : ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
473 168 : sos_mp2_work%index2recv(0:num_pe_col - 1))
474 :
475 72 : ALLOCATE (data2send(0:num_pe_col - 1))
476 72 : ALLOCATE (data2recv(0:num_pe_col - 1))
477 :
478 : CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
479 : ncol_local=ncol_local, col_indices=col_indices, &
480 24 : context=context, nrow_local=nrow_local)
481 : CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
482 24 : blacs2mpi=blacs2mpi)
483 :
484 24 : num_ij_pairs = SIZE(sos_mp2_work%pair_list, 2)
485 :
486 24 : IF (num_ij_pairs > 0) THEN
487 :
488 4 : CALL comm_exchange%from_split(para_env, my_prow)
489 :
490 8 : data2send = 0
491 8 : data2recv = 0
492 :
493 8 : DO proc_shift = 0, num_pe_col - 1
494 4 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
495 :
496 4 : counter = 0
497 308 : DO col_local = 1, ncol_local
498 304 : col_global = col_indices(col_local)
499 :
500 304 : iocc = MAX(1, col_global - 1)/virtual + 1
501 304 : avirt = col_global - (iocc - 1)*virtual
502 :
503 764 : DO ij_counter = 1, num_ij_pairs
504 :
505 760 : my_i = sos_mp2_work%pair_list(1, ij_counter)
506 760 : my_j = sos_mp2_work%pair_list(2, ij_counter)
507 :
508 760 : IF (iocc /= my_j) CYCLE
509 304 : pcol = fm_mat_S%matrix_struct%g2p_col((my_i - 1)*virtual + avirt)
510 304 : IF (pcol /= pcol_send) CYCLE
511 :
512 304 : counter = counter + 1
513 :
514 760 : EXIT
515 :
516 : END DO
517 : END DO
518 8 : data2send(pcol_send) = counter
519 : END DO
520 :
521 4 : CALL comm_exchange%alltoall(data2send, data2recv, 1)
522 :
523 8 : DO proc_shift = 0, num_pe_col - 1
524 4 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
525 4 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
526 :
527 : ! Collect indices and exchange
528 12 : ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))
529 :
530 4 : counter = 0
531 308 : DO col_local = 1, ncol_local
532 304 : col_global = col_indices(col_local)
533 :
534 304 : iocc = MAX(1, col_global - 1)/virtual + 1
535 304 : avirt = col_global - (iocc - 1)*virtual
536 :
537 764 : DO ij_counter = 1, num_ij_pairs
538 :
539 760 : my_i = sos_mp2_work%pair_list(1, ij_counter)
540 760 : my_j = sos_mp2_work%pair_list(2, ij_counter)
541 :
542 760 : IF (iocc /= my_j) CYCLE
543 304 : pcol = fm_mat_S%matrix_struct%g2p_col((my_i - 1)*virtual + avirt)
544 304 : IF (pcol /= pcol_send) CYCLE
545 :
546 304 : counter = counter + 1
547 :
548 304 : sos_mp2_work%index2send(pcol_send)%array(counter) = col_global
549 :
550 760 : EXIT
551 :
552 : END DO
553 : END DO
554 :
555 12 : ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))
556 : !
557 : CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
558 4 : sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)
559 :
560 : ! Convert to global coordinates to local coordinates as we always work with them
561 312 : DO counter = 1, data2send(pcol_send)
562 : sos_mp2_work%index2send(pcol_send)%array(counter) = &
563 308 : fm_mat_S%matrix_struct%g2l_col(sos_mp2_work%index2send(pcol_send)%array(counter))
564 : END DO
565 : END DO
566 :
567 4 : CALL comm_exchange%free()
568 : END IF
569 :
570 24 : DEALLOCATE (data2send, data2recv)
571 :
572 24 : CALL timestop(handle)
573 :
574 48 : END SUBROUTINE
575 :
576 : ! **************************************************************************************************
577 : !> \brief ...
578 : !> \param sos_mp2_work ...
579 : !> \param virtual ...
580 : !> \param fm_mat_S ...
581 : ! **************************************************************************************************
582 24 : SUBROUTINE prepare_comm_Pab(sos_mp2_work, virtual, fm_mat_S)
583 : TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work
584 : INTEGER, INTENT(IN) :: virtual
585 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S
586 :
587 : CHARACTER(LEN=*), PARAMETER :: routineN = 'prepare_comm_Pab'
588 :
589 : INTEGER :: ab_counter, avirt, col_global, col_local, counter, handle, iocc, my_a, my_b, &
590 : my_pcol, my_prow, ncol_local, nrow_local, num_ab_pairs, num_pe_col, pcol, pcol_recv, &
591 : pcol_send, proc_shift, tag
592 24 : INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv, data2send
593 24 : INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
594 24 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
595 : TYPE(cp_blacs_env_type), POINTER :: context
596 : TYPE(mp_comm_type) :: comm_exchange
597 : TYPE(mp_para_env_type), POINTER :: para_env
598 :
599 24 : CALL timeset(routineN, handle)
600 :
601 24 : tag = 44
602 :
603 24 : CALL fm_mat_S%matrix_struct%context%get(number_of_process_columns=num_pe_col)
604 0 : ALLOCATE (sos_mp2_work%index2send(0:num_pe_col - 1), &
605 168 : sos_mp2_work%index2recv(0:num_pe_col - 1))
606 :
607 24 : num_ab_pairs = SIZE(sos_mp2_work%pair_list, 2)
608 24 : IF (num_ab_pairs > 0) THEN
609 :
610 : CALL cp_fm_struct_get(fm_mat_S%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
611 : ncol_local=ncol_local, col_indices=col_indices, &
612 4 : context=context, nrow_local=nrow_local)
613 : CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
614 4 : blacs2mpi=blacs2mpi)
615 :
616 4 : CALL comm_exchange%from_split(para_env, my_prow)
617 :
618 12 : ALLOCATE (data2send(0:num_pe_col - 1))
619 12 : ALLOCATE (data2recv(0:num_pe_col - 1))
620 :
621 8 : data2send = 0
622 8 : data2recv = 0
623 8 : DO proc_shift = 0, num_pe_col - 1
624 4 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
625 4 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
626 :
627 4 : counter = 0
628 308 : DO col_local = 1, ncol_local
629 304 : col_global = col_indices(col_local)
630 :
631 304 : iocc = MAX(1, col_global - 1)/virtual + 1
632 304 : avirt = col_global - (iocc - 1)*virtual
633 :
634 15476 : DO ab_counter = 1, num_ab_pairs
635 :
636 15472 : my_a = sos_mp2_work%pair_list(1, ab_counter)
637 15472 : my_b = sos_mp2_work%pair_list(2, ab_counter)
638 :
639 15472 : IF (avirt /= my_b) CYCLE
640 304 : pcol = fm_mat_S%matrix_struct%g2p_col((iocc - 1)*virtual + my_a)
641 304 : IF (pcol /= pcol_send) CYCLE
642 :
643 304 : counter = counter + 1
644 :
645 15472 : EXIT
646 :
647 : END DO
648 : END DO
649 8 : data2send(pcol_send) = counter
650 : END DO
651 :
652 4 : CALL comm_exchange%alltoall(data2send, data2recv, 1)
653 :
654 8 : DO proc_shift = 0, num_pe_col - 1
655 4 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
656 4 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
657 :
658 : ! Collect indices and exchange
659 12 : ALLOCATE (sos_mp2_work%index2send(pcol_send)%array(data2send(pcol_send)))
660 :
661 4 : counter = 0
662 308 : DO col_local = 1, ncol_local
663 304 : col_global = col_indices(col_local)
664 :
665 304 : iocc = MAX(1, col_global - 1)/virtual + 1
666 304 : avirt = col_global - (iocc - 1)*virtual
667 :
668 15476 : DO ab_counter = 1, num_ab_pairs
669 :
670 15472 : my_a = sos_mp2_work%pair_list(1, ab_counter)
671 15472 : my_b = sos_mp2_work%pair_list(2, ab_counter)
672 :
673 15472 : IF (avirt /= my_b) CYCLE
674 304 : pcol = fm_mat_S%matrix_struct%g2p_col((iocc - 1)*virtual + my_a)
675 304 : IF (pcol /= pcol_send) CYCLE
676 :
677 304 : counter = counter + 1
678 :
679 304 : sos_mp2_work%index2send(pcol_send)%array(counter) = col_global
680 :
681 15472 : EXIT
682 :
683 : END DO
684 : END DO
685 :
686 12 : ALLOCATE (sos_mp2_work%index2recv(pcol_recv)%array(data2recv(pcol_recv)))
687 : !
688 : CALL para_env%sendrecv(sos_mp2_work%index2send(pcol_send)%array, blacs2mpi(my_prow, pcol_send), &
689 4 : sos_mp2_work%index2recv(pcol_recv)%array, blacs2mpi(my_prow, pcol_recv), tag)
690 :
691 : ! Convert to global coordinates to local coordinates as we always work with them
692 312 : DO counter = 1, data2send(pcol_send)
693 : sos_mp2_work%index2send(pcol_send)%array(counter) = &
694 308 : fm_mat_S%matrix_struct%g2l_col(sos_mp2_work%index2send(pcol_send)%array(counter))
695 : END DO
696 : END DO
697 :
698 4 : CALL comm_exchange%free()
699 8 : DEALLOCATE (data2send, data2recv)
700 :
701 : END IF
702 :
703 24 : CALL timestop(handle)
704 :
705 48 : END SUBROUTINE
706 :
707 : ! **************************************************************************************************
708 : !> \brief ...
709 : !> \param fm_mat_Q ...
710 : !> \param rpa_grad ...
711 : ! **************************************************************************************************
712 48 : SUBROUTINE rpa_grad_copy_Q(fm_mat_Q, rpa_grad)
713 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_Q
714 : TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad
715 :
716 48 : CALL cp_fm_to_fm(fm_mat_Q, rpa_grad%rpa_work%fm_mat_Q_copy)
717 :
718 48 : END SUBROUTINE
719 :
720 : ! **************************************************************************************************
721 : !> \brief ...
722 : !> \param mp2_env ...
723 : !> \param rpa_grad ...
724 : !> \param do_ri_sos_laplace_mp2 ...
725 : !> \param fm_mat_Q ...
726 : !> \param fm_mat_Q_gemm ...
727 : !> \param dgemm_counter ...
728 : !> \param fm_mat_S ...
729 : !> \param omega ...
730 : !> \param homo ...
731 : !> \param virtual ...
732 : !> \param Eigenval ...
733 : !> \param weight ...
734 : !> \param unit_nr ...
735 : ! **************************************************************************************************
736 108 : SUBROUTINE rpa_grad_matrix_operations(mp2_env, rpa_grad, do_ri_sos_laplace_mp2, fm_mat_Q, fm_mat_Q_gemm, &
737 108 : dgemm_counter, fm_mat_S, omega, homo, virtual, Eigenval, weight, unit_nr)
738 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
739 : TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad
740 : LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2
741 : TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_Q, fm_mat_Q_gemm
742 : TYPE(dgemm_counter_type), INTENT(INOUT) :: dgemm_counter
743 : TYPE(cp_fm_type), DIMENSION(:), INTENT(IN) :: fm_mat_S
744 : REAL(KIND=dp), INTENT(IN) :: omega
745 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
746 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: Eigenval
747 : REAL(KIND=dp), INTENT(IN) :: weight
748 : INTEGER, INTENT(IN) :: unit_nr
749 :
750 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_matrix_operations'
751 :
752 : INTEGER :: col_global, col_local, dimen_ia, &
753 : dimen_RI, handle, handle2, ispin, &
754 : jspin, ncol_local, nrow_local, nspins, &
755 : row_local
756 108 : INTEGER, DIMENSION(:), POINTER :: col_indices, row_indices
757 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
758 108 : TARGET :: mat_S_3D, mat_work_iaP_3D
759 : TYPE(cp_fm_type) :: fm_work_iaP, fm_work_PQ
760 :
761 108 : CALL timeset(routineN, handle)
762 :
763 108 : nspins = SIZE(fm_mat_Q)
764 :
765 : CALL cp_fm_get_info(fm_mat_Q(1), nrow_global=dimen_RI, nrow_local=nrow_local, ncol_local=ncol_local, &
766 108 : col_indices=col_indices, row_indices=row_indices)
767 :
768 108 : IF (.NOT. do_ri_sos_laplace_mp2) THEN
769 48 : CALL cp_fm_create(fm_work_PQ, fm_mat_Q(1)%matrix_struct)
770 :
771 : ! calculate [1+Q(iw')]^-1
772 48 : CALL cp_fm_cholesky_invert(fm_mat_Q(1))
773 : ! symmetrize the result, fm_work_PQ is only a work matrix
774 48 : CALL cp_fm_upper_to_full(fm_mat_Q(1), fm_work_PQ)
775 :
776 48 : CALL cp_fm_release(fm_work_PQ)
777 :
778 4144 : DO col_local = 1, ncol_local
779 4096 : col_global = col_indices(col_local)
780 164048 : DO row_local = 1, nrow_local
781 164000 : IF (col_global == row_indices(row_local)) THEN
782 3432 : fm_mat_Q(1)%local_data(row_local, col_local) = fm_mat_Q(1)%local_data(row_local, col_local) - 1.0_dp
783 3432 : EXIT
784 : END IF
785 : END DO
786 : END DO
787 :
788 48 : CALL timeset(routineN//"_PQ", handle2)
789 48 : CALL dgemm_counter_start(dgemm_counter)
790 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
791 : matrix_a=rpa_grad%rpa_work%fm_mat_Q_copy, matrix_b=fm_mat_Q(1), beta=1.0_dp, &
792 48 : matrix_c=rpa_grad%fm_Gamma_PQ)
793 48 : CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
794 48 : CALL timestop(handle2)
795 :
796 : CALL cp_fm_to_fm_submat_general(fm_mat_Q(1), fm_mat_Q_gemm(1), dimen_RI, dimen_RI, 1, 1, 1, 1, &
797 48 : fm_mat_Q_gemm(1)%matrix_struct%context)
798 : END IF
799 :
800 232 : DO ispin = 1, nspins
801 124 : IF (do_ri_sos_laplace_mp2) THEN
802 : ! The spin of the other Q matrix is always the other spin
803 68 : jspin = nspins - ispin + 1
804 : ELSE
805 : ! or the first matrix in the case of RPA
806 : jspin = 1
807 : END IF
808 :
809 124 : IF (do_ri_sos_laplace_mp2) THEN
810 68 : CALL timeset(routineN//"_PQ", handle2)
811 68 : CALL dgemm_counter_start(dgemm_counter)
812 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=weight, &
813 : matrix_a=fm_mat_Q(ispin), matrix_b=fm_mat_Q(jspin), beta=1.0_dp, &
814 68 : matrix_c=rpa_grad%fm_Gamma_PQ)
815 68 : CALL dgemm_counter_stop(dgemm_counter, dimen_RI, dimen_RI, dimen_RI)
816 68 : CALL timestop(handle2)
817 :
818 : CALL cp_fm_to_fm_submat_general(fm_mat_Q(jspin), fm_mat_Q_gemm(jspin), dimen_RI, dimen_RI, 1, 1, 1, 1, &
819 68 : fm_mat_Q_gemm(jspin)%matrix_struct%context)
820 : ELSE
821 : CALL calc_fm_mat_S_rpa(fm_mat_S(ispin), .TRUE., virtual(ispin), Eigenval(:, ispin), &
822 56 : homo(ispin), omega, 0.0_dp)
823 : END IF
824 :
825 124 : CALL timeset(routineN//"_contr_S", handle2)
826 124 : CALL cp_fm_create(fm_work_iaP, rpa_grad%fm_Y(ispin)%matrix_struct)
827 :
828 124 : CALL cp_fm_get_info(fm_mat_S(ispin), ncol_global=dimen_ia)
829 :
830 124 : CALL dgemm_counter_start(dgemm_counter)
831 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_ia, k=dimen_RI, alpha=1.0_dp, &
832 : matrix_a=fm_mat_Q_gemm(jspin), matrix_b=fm_mat_S(ispin), beta=0.0_dp, &
833 124 : matrix_c=fm_work_iaP)
834 124 : CALL dgemm_counter_stop(dgemm_counter, dimen_ia, dimen_RI, dimen_RI)
835 124 : CALL timestop(handle2)
836 :
837 356 : IF (do_ri_sos_laplace_mp2) THEN
838 : CALL calc_P_sos_mp2(homo(ispin), fm_mat_S(ispin), fm_work_iaP, &
839 : rpa_grad%sos_mp2_work_occ(ispin), rpa_grad%sos_mp2_work_virt(ispin), &
840 68 : omega, weight, virtual(ispin), Eigenval(:, ispin), mp2_env%ri_grad%dot_blksize)
841 :
842 68 : CALL calc_fm_mat_S_laplace(fm_work_iaP, homo(ispin), virtual(ispin), Eigenval(:, ispin), omega)
843 :
844 68 : CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)
845 :
846 68 : CALL cp_fm_release(fm_work_iaP)
847 : ELSE
848 : ! To save memory, we add it now
849 56 : CALL cp_fm_scale_and_add(1.0_dp, rpa_grad%fm_Y(ispin), -weight, fm_work_iaP)
850 :
851 : ! Redistribute both matrices and deallocate fm_work_iaP
852 : CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
853 : fm_work_iaP, mat_work_iaP_3D, &
854 : rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
855 56 : rpa_grad%rpa_work%mepos)
856 56 : CALL cp_fm_release(fm_work_iaP)
857 :
858 : CALL redistribute_fm_mat_S(rpa_grad%rpa_work%index2send(:, ispin), rpa_grad%rpa_work%index2recv(:, ispin), &
859 : fm_mat_S(ispin), mat_S_3D, &
860 : rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
861 56 : rpa_grad%rpa_work%mepos)
862 :
863 : ! Now collect the density matrix
864 : CALL calc_P_rpa(mat_S_3D, mat_work_iaP_3D, rpa_grad%rpa_work%gd_homo(ispin), rpa_grad%rpa_work%gd_virtual(ispin), &
865 : rpa_grad%rpa_work%grid, rpa_grad%rpa_work%mepos, &
866 : fm_mat_S(ispin)%matrix_struct, &
867 : rpa_grad%rpa_work%P_ij(ispin)%array, rpa_grad%rpa_work%P_ab(ispin)%array, &
868 56 : weight, omega, Eigenval(:, ispin), homo(ispin), unit_nr, mp2_env)
869 :
870 56 : DEALLOCATE (mat_work_iaP_3D, mat_S_3D)
871 :
872 56 : CALL remove_scaling_factor_rpa(fm_mat_S(ispin), virtual(ispin), Eigenval(:, ispin), homo(ispin), omega)
873 :
874 : END IF
875 :
876 : END DO
877 :
878 108 : CALL timestop(handle)
879 :
880 216 : END SUBROUTINE
881 :
882 : ! **************************************************************************************************
883 : !> \brief ...
884 : !> \param homo ...
885 : !> \param fm_mat_S ...
886 : !> \param fm_work_iaP ...
887 : !> \param sos_mp2_work_occ ...
888 : !> \param sos_mp2_work_virt ...
889 : !> \param omega ...
890 : !> \param weight ...
891 : !> \param virtual ...
892 : !> \param Eigenval ...
893 : !> \param dot_blksize ...
894 : ! **************************************************************************************************
895 340 : SUBROUTINE calc_P_sos_mp2(homo, fm_mat_S, fm_work_iaP, sos_mp2_work_occ, sos_mp2_work_virt, &
896 68 : omega, weight, virtual, Eigenval, dot_blksize)
897 : INTEGER, INTENT(IN) :: homo
898 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S, fm_work_iaP
899 : TYPE(sos_mp2_grad_work_type), INTENT(INOUT) :: sos_mp2_work_occ, sos_mp2_work_virt
900 : REAL(KIND=dp), INTENT(IN) :: omega, weight
901 : INTEGER, INTENT(IN) :: virtual
902 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
903 : INTEGER, INTENT(IN) :: dot_blksize
904 :
905 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_sos_mp2'
906 :
907 : INTEGER :: avirt, col_global, col_local, handle, &
908 : handle2, iocc, my_a, my_i, ncol_local, &
909 : nrow_local, num_ab_pairs, num_ij_pairs
910 68 : INTEGER, DIMENSION(:), POINTER :: col_indices
911 : REAL(KIND=dp) :: ddot, trace
912 :
913 68 : CALL timeset(routineN, handle)
914 :
915 68 : CALL cp_fm_get_info(fm_mat_S, col_indices=col_indices, ncol_local=ncol_local, nrow_local=nrow_local)
916 :
917 68 : CALL timeset(routineN//"_Pij_diag", handle2)
918 332 : DO my_i = 1, homo
919 : ! Collect the contributions of the matrix elements
920 :
921 264 : trace = 0.0_dp
922 :
923 20944 : DO col_local = 1, ncol_local
924 20680 : col_global = col_indices(col_local)
925 :
926 20680 : iocc = MAX(1, col_global - 1)/virtual + 1
927 20680 : avirt = col_global - (iocc - 1)*virtual
928 :
929 20680 : IF (iocc == my_i) trace = trace + &
930 5584 : ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
931 : END DO
932 :
933 332 : sos_mp2_work_occ%P(my_i) = sos_mp2_work_occ%P(my_i) - trace*omega*weight
934 :
935 : END DO
936 68 : CALL timestop(handle2)
937 :
938 68 : CALL timeset(routineN//"_Pab_diag", handle2)
939 1448 : DO my_a = 1, virtual
940 : ! Collect the contributions of the matrix elements
941 :
942 1380 : trace = 0.0_dp
943 :
944 109900 : DO col_local = 1, ncol_local
945 108520 : col_global = col_indices(col_local)
946 :
947 108520 : iocc = MAX(1, col_global - 1)/virtual + 1
948 108520 : avirt = col_global - (iocc - 1)*virtual
949 :
950 108520 : IF (avirt == my_a) trace = trace + &
951 6700 : ddot(nrow_local, fm_mat_S%local_data(:, col_local), 1, fm_work_iaP%local_data(:, col_local), 1)
952 : END DO
953 :
954 1448 : sos_mp2_work_virt%P(my_a) = sos_mp2_work_virt%P(my_a) + trace*omega*weight
955 :
956 : END DO
957 68 : CALL timestop(handle2)
958 :
959 : ! Loop over list and carry out operations
960 68 : num_ij_pairs = SIZE(sos_mp2_work_occ%pair_list, 2)
961 68 : num_ab_pairs = SIZE(sos_mp2_work_virt%pair_list, 2)
962 68 : IF (num_ij_pairs > 0) THEN
963 : CALL calc_Pij_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_occ%pair_list, &
964 : virtual, sos_mp2_work_occ%P(homo + 1:), Eigenval(:homo), omega, weight, &
965 8 : sos_mp2_work_occ%index2send, sos_mp2_work_occ%index2recv, dot_blksize)
966 : END IF
967 68 : IF (num_ab_pairs > 0) THEN
968 : CALL calc_Pab_degen(fm_work_iaP, fm_mat_S, sos_mp2_work_virt%pair_list, &
969 : virtual, sos_mp2_work_virt%P(virtual + 1:), Eigenval(homo + 1:), omega, weight, &
970 8 : sos_mp2_work_virt%index2send, sos_mp2_work_virt%index2recv, dot_blksize)
971 : END IF
972 :
973 68 : CALL timestop(handle)
974 :
975 68 : END SUBROUTINE
976 :
977 : ! **************************************************************************************************
978 : !> \brief ...
979 : !> \param mat_S_1D ...
980 : !> \param mat_work_iaP_3D ...
981 : !> \param gd_homo ...
982 : !> \param gd_virtual ...
983 : !> \param grid ...
984 : !> \param mepos ...
985 : !> \param fm_struct_S ...
986 : !> \param P_ij ...
987 : !> \param P_ab ...
988 : !> \param weight ...
989 : !> \param omega ...
990 : !> \param Eigenval ...
991 : !> \param homo ...
992 : !> \param unit_nr ...
993 : !> \param mp2_env ...
994 : ! **************************************************************************************************
995 56 : SUBROUTINE calc_P_rpa(mat_S_1D, mat_work_iaP_3D, gd_homo, gd_virtual, grid, mepos, &
996 56 : fm_struct_S, P_ij, P_ab, weight, omega, Eigenval, homo, unit_nr, mp2_env)
997 : REAL(KIND=dp), DIMENSION(*), INTENT(INOUT), TARGET :: mat_S_1D
998 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: mat_work_iaP_3D
999 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_homo, gd_virtual
1000 : INTEGER, DIMENSION(2), INTENT(IN) :: grid, mepos
1001 : TYPE(cp_fm_struct_type), INTENT(IN), POINTER :: fm_struct_S
1002 : REAL(KIND=dp), DIMENSION(:, :) :: P_ij, P_ab
1003 : REAL(KIND=dp), INTENT(IN) :: weight, omega
1004 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
1005 : INTEGER, INTENT(IN) :: homo, unit_nr
1006 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
1007 :
1008 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa'
1009 :
1010 : INTEGER :: completed, handle, handle2, my_a_end, my_a_size, my_a_start, my_i_end, my_i_size, &
1011 : my_i_start, my_P_size, my_prow, number_of_parallel_channels, proc_a_recv, proc_a_send, &
1012 : proc_i_recv, proc_i_send, proc_recv, proc_send, proc_shift, recv_a_end, recv_a_size, &
1013 : recv_a_start, recv_i_end, recv_i_size, recv_i_start, tag
1014 : INTEGER(KIND=int_8) :: mem, number_of_elements_per_blk
1015 56 : INTEGER, ALLOCATABLE, DIMENSION(:) :: procs_recv
1016 56 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1017 : REAL(KIND=dp) :: mem_per_block, mem_real
1018 56 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: buffer_compens_1D
1019 56 : REAL(KIND=dp), DIMENSION(:, :, :), POINTER :: mat_S_3D
1020 56 : TYPE(cp_1d_r_cp_type), ALLOCATABLE, DIMENSION(:) :: buffer_1D
1021 56 : TYPE(cp_3d_r_cp_type), ALLOCATABLE, DIMENSION(:) :: buffer_3D
1022 : TYPE(mp_para_env_type), POINTER :: para_env
1023 56 : TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_requests, send_requests
1024 :
1025 56 : CALL timeset(routineN, handle)
1026 :
1027 : ! We allocate it at every step to reduce potential memory conflicts with COSMA
1028 56 : IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
1029 48 : CALL mp2_env%local_gemm_ctx%create(LOCAL_GEMM_PU_GPU)
1030 48 : CALL mp2_env%local_gemm_ctx%set_op_threshold_gpu(spla_threshold)
1031 : END IF
1032 :
1033 56 : tag = 47
1034 :
1035 56 : my_P_size = SIZE(mat_work_iaP_3D, 1)
1036 :
1037 56 : CALL cp_fm_struct_get(fm_struct_S, para_env=para_env)
1038 56 : CALL fm_struct_S%context%get(my_process_row=my_prow, blacs2mpi=blacs2mpi, para_env=para_env)
1039 :
1040 56 : CALL get_group_dist(gd_virtual, mepos(1), my_a_start, my_a_end, my_a_size)
1041 56 : CALL get_group_dist(gd_homo, mepos(2), my_i_start, my_i_end, my_i_size)
1042 :
1043 : ! We have to remap the indices because mp_sendrecv requires a 3D array (because of mat_work_iaP_3D)
1044 : ! and dgemm requires 2D arrays
1045 : ! Fortran 2008 does allow pointer remapping independently of the ranks but GCC 7 does not properly support it
1046 56 : mat_S_3D(1:my_P_size, 1:my_a_size, 1:my_i_size) => mat_S_1D(1:INT(my_P_size, int_8)*my_a_size*my_i_size)
1047 :
1048 : number_of_elements_per_blk = MAX(INT(maxsize(gd_homo), KIND=int_8)*my_a_size, &
1049 56 : INT(maxsize(gd_virtual), KIND=int_8)*my_i_size)*my_P_size
1050 :
1051 : ! Determine the available memory and estimate the number of possible parallel communication channels
1052 56 : CALL m_memory(mem)
1053 56 : mem_real = REAL(mem, KIND=dp)
1054 56 : mem_per_block = REAL(number_of_elements_per_blk, KIND=dp)*8.0_dp
1055 168 : number_of_parallel_channels = MAX(1, MIN(MAXVAL(grid) - 1, FLOOR(mem_real/mem_per_block)))
1056 56 : CALL para_env%min(number_of_parallel_channels)
1057 56 : IF (mp2_env%ri_grad%max_parallel_comm > 0) &
1058 56 : number_of_parallel_channels = MIN(number_of_parallel_channels, mp2_env%ri_grad%max_parallel_comm)
1059 :
1060 56 : IF (unit_nr > 0) THEN
1061 28 : WRITE (unit_nr, '(T3,A,T75,I6)') 'GRAD_INFO| Number of parallel communication channels:', number_of_parallel_channels
1062 28 : CALL m_flush(unit_nr)
1063 : END IF
1064 56 : CALL para_env%sync()
1065 :
1066 224 : ALLOCATE (buffer_1D(number_of_parallel_channels))
1067 112 : DO proc_shift = 1, number_of_parallel_channels
1068 224 : ALLOCATE (buffer_1D(proc_shift)%array(number_of_elements_per_blk))
1069 : END DO
1070 :
1071 224 : ALLOCATE (buffer_3D(number_of_parallel_channels))
1072 :
1073 : ! Allocate buffers for vector version of kahan summation
1074 56 : IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
1075 144 : ALLOCATE (buffer_compens_1D(2*MAX(my_a_size*maxsize(gd_virtual), my_i_size*maxsize(gd_homo))))
1076 : END IF
1077 :
1078 56 : IF (number_of_parallel_channels > 1) THEN
1079 0 : ALLOCATE (procs_recv(number_of_parallel_channels))
1080 0 : ALLOCATE (recv_requests(number_of_parallel_channels))
1081 0 : ALLOCATE (send_requests(MAXVAL(grid) - 1))
1082 : END IF
1083 :
1084 56 : IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
1085 0 : CALL timeset(routineN//"_comm_a", handle2)
1086 0 : recv_requests(:) = mp_request_null
1087 0 : procs_recv(:) = -1
1088 0 : DO proc_shift = 1, MIN(grid(1) - 1, number_of_parallel_channels)
1089 0 : proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
1090 0 : proc_recv = mepos(2)*grid(1) + proc_a_recv
1091 :
1092 0 : CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
1093 :
1094 : buffer_3D(proc_shift)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
1095 0 : buffer_1D(proc_shift)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
1096 :
1097 : CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
1098 0 : recv_requests(proc_shift), tag)
1099 :
1100 0 : procs_recv(proc_shift) = proc_a_recv
1101 : END DO
1102 :
1103 0 : send_requests(:) = mp_request_null
1104 0 : DO proc_shift = 1, grid(1) - 1
1105 0 : proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
1106 0 : proc_send = mepos(2)*grid(1) + proc_a_send
1107 :
1108 : CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
1109 0 : send_requests(proc_shift), tag)
1110 : END DO
1111 0 : CALL timestop(handle2)
1112 : END IF
1113 :
1114 : CALL calc_P_rpa_a(P_ab(:, my_a_start:my_a_end), &
1115 : mat_S_3D, mat_work_iaP_3D, &
1116 : mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
1117 : Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
1118 56 : Eigenval(homo + my_a_start:homo + my_a_end), omega, weight)
1119 :
1120 56 : DO proc_shift = 1, grid(1) - 1
1121 0 : CALL timeset(routineN//"_comm_a", handle2)
1122 0 : IF (number_of_parallel_channels > 1) THEN
1123 0 : CALL mp_waitany(recv_requests, completed)
1124 :
1125 0 : CALL get_group_dist(gd_virtual, procs_recv(completed), recv_a_start, recv_a_end, recv_a_size)
1126 : ELSE
1127 0 : proc_a_send = MODULO(mepos(1) + proc_shift, grid(1))
1128 0 : proc_a_recv = MODULO(mepos(1) - proc_shift, grid(1))
1129 :
1130 0 : proc_send = mepos(2)*grid(1) + proc_a_send
1131 0 : proc_recv = mepos(2)*grid(1) + proc_a_recv
1132 :
1133 0 : CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
1134 :
1135 : buffer_3D(1)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
1136 0 : buffer_1D(1)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
1137 :
1138 : CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
1139 0 : buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
1140 0 : completed = 1
1141 : END IF
1142 0 : CALL timestop(handle2)
1143 :
1144 : CALL calc_P_rpa_a(P_ab(:, recv_a_start:recv_a_end), &
1145 : mat_S_3D, buffer_3D(completed)%array, &
1146 : mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
1147 : Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
1148 0 : Eigenval(homo + recv_a_start:homo + recv_a_end), omega, weight)
1149 :
1150 56 : IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(1)) THEN
1151 0 : proc_a_recv = MODULO(mepos(1) - proc_shift - number_of_parallel_channels, grid(1))
1152 0 : proc_recv = mepos(2)*grid(1) + proc_a_recv
1153 :
1154 0 : CALL get_group_dist(gd_virtual, proc_a_recv, recv_a_start, recv_a_end, recv_a_size)
1155 :
1156 : buffer_3D(completed)%array(1:my_P_size, 1:recv_a_size, 1:my_i_size) => &
1157 0 : buffer_1D(completed)%array(1:INT(my_P_size, KIND=int_8)*recv_a_size*my_i_size)
1158 :
1159 : CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
1160 0 : recv_requests(completed), tag)
1161 :
1162 0 : procs_recv(completed) = proc_a_recv
1163 : END IF
1164 : END DO
1165 :
1166 56 : IF (number_of_parallel_channels > 1 .AND. grid(1) > 1) THEN
1167 0 : CALL mp_waitall(send_requests)
1168 : END IF
1169 :
1170 56 : IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
1171 0 : recv_requests(:) = mp_request_null
1172 0 : procs_recv(:) = -1
1173 0 : DO proc_shift = 1, MIN(grid(2) - 1, number_of_parallel_channels)
1174 0 : proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
1175 0 : proc_recv = proc_i_recv*grid(1) + mepos(1)
1176 :
1177 0 : CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)
1178 :
1179 : buffer_3D(proc_shift)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
1180 0 : buffer_1D(proc_shift)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
1181 :
1182 : CALL para_env%irecv(buffer_3D(proc_shift)%array, blacs2mpi(my_prow, proc_recv), &
1183 0 : recv_requests(proc_shift), tag)
1184 :
1185 0 : procs_recv(proc_shift) = proc_i_recv
1186 : END DO
1187 :
1188 0 : send_requests(:) = mp_request_null
1189 0 : DO proc_shift = 1, grid(2) - 1
1190 0 : proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
1191 0 : proc_send = proc_i_send*grid(1) + mepos(1)
1192 :
1193 : CALL para_env%isend(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
1194 0 : send_requests(proc_shift), tag)
1195 : END DO
1196 : END IF
1197 :
1198 : CALL calc_P_rpa_i(P_ij(:, my_i_start:my_i_end), &
1199 : mat_S_3D, mat_work_iaP_3D, &
1200 : mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
1201 : Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
1202 56 : Eigenval(my_i_start:my_i_end), omega, weight)
1203 :
1204 56 : DO proc_shift = 1, grid(2) - 1
1205 0 : CALL timeset(routineN//"_comm_i", handle2)
1206 0 : IF (number_of_parallel_channels > 1) THEN
1207 0 : CALL mp_waitany(recv_requests, completed)
1208 :
1209 0 : CALL get_group_dist(gd_homo, procs_recv(completed), recv_i_start, recv_i_end, recv_i_size)
1210 : ELSE
1211 0 : proc_i_send = MODULO(mepos(2) + proc_shift, grid(2))
1212 0 : proc_i_recv = MODULO(mepos(2) - proc_shift, grid(2))
1213 :
1214 0 : proc_send = proc_i_send*grid(1) + mepos(1)
1215 0 : proc_recv = proc_i_recv*grid(1) + mepos(1)
1216 :
1217 0 : CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_i_end, recv_i_size)
1218 :
1219 : buffer_3D(1)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
1220 0 : buffer_1D(1)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
1221 :
1222 : CALL para_env%sendrecv(mat_work_iaP_3D, blacs2mpi(my_prow, proc_send), &
1223 0 : buffer_3D(1)%array, blacs2mpi(my_prow, proc_recv), tag)
1224 0 : completed = 1
1225 : END IF
1226 0 : CALL timestop(handle2)
1227 :
1228 : CALL calc_P_rpa_i(P_ij(:, recv_i_start:recv_i_end), &
1229 : mat_S_3D, buffer_3D(completed)%array, &
1230 : mp2_env%ri_grad%dot_blksize, buffer_compens_1D, mp2_env%local_gemm_ctx, &
1231 : Eigenval(homo + my_a_start:homo + my_a_end), Eigenval(my_i_start:my_i_end), &
1232 0 : Eigenval(recv_i_start:recv_i_end), omega, weight)
1233 :
1234 56 : IF (number_of_parallel_channels > 1 .AND. number_of_parallel_channels + proc_shift < grid(2)) THEN
1235 0 : proc_i_recv = MODULO(mepos(2) - proc_shift - number_of_parallel_channels, grid(2))
1236 0 : proc_recv = proc_i_recv*grid(1) + mepos(1)
1237 :
1238 0 : CALL get_group_dist(gd_homo, proc_i_recv, recv_i_start, recv_a_end, recv_i_size)
1239 :
1240 : buffer_3D(completed)%array(1:my_P_size, 1:my_a_size, 1:recv_i_size) => &
1241 0 : buffer_1D(completed)%array(1:INT(my_P_size, KIND=int_8)*my_a_size*recv_i_size)
1242 :
1243 : CALL para_env%irecv(buffer_3D(completed)%array, blacs2mpi(my_prow, proc_recv), &
1244 0 : recv_requests(completed), tag)
1245 :
1246 0 : procs_recv(completed) = proc_i_recv
1247 : END IF
1248 : END DO
1249 :
1250 56 : IF (number_of_parallel_channels > 1 .AND. grid(2) > 1) THEN
1251 0 : CALL mp_waitall(send_requests)
1252 : END IF
1253 :
1254 56 : IF (number_of_parallel_channels > 1) THEN
1255 0 : DEALLOCATE (procs_recv)
1256 0 : DEALLOCATE (recv_requests)
1257 0 : DEALLOCATE (send_requests)
1258 : END IF
1259 :
1260 56 : IF (mp2_env%ri_grad%dot_blksize >= blksize_threshold) THEN
1261 : ! release memory allocated by local_gemm when run on GPU. local_gemm_ctx is null on cpu only runs
1262 48 : CALL mp2_env%local_gemm_ctx%destroy()
1263 48 : DEALLOCATE (buffer_compens_1D)
1264 : END IF
1265 :
1266 112 : DO proc_shift = 1, number_of_parallel_channels
1267 56 : NULLIFY (buffer_3D(proc_shift)%array)
1268 112 : DEALLOCATE (buffer_1D(proc_shift)%array)
1269 : END DO
1270 56 : DEALLOCATE (buffer_3D, buffer_1D)
1271 :
1272 56 : CALL timestop(handle)
1273 :
1274 168 : END SUBROUTINE
1275 :
1276 : ! **************************************************************************************************
1277 : !> \brief ...
1278 : !> \param P_ab ...
1279 : !> \param mat_S ...
1280 : !> \param mat_work ...
1281 : !> \param dot_blksize ...
1282 : !> \param buffer_1D ...
1283 : !> \param local_gemm_ctx ...
1284 : !> \param my_eval_virt ...
1285 : !> \param my_eval_occ ...
1286 : !> \param recv_eval_virt ...
1287 : !> \param omega ...
1288 : !> \param weight ...
1289 : ! **************************************************************************************************
1290 56 : SUBROUTINE calc_P_rpa_a(P_ab, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
1291 56 : my_eval_virt, my_eval_occ, recv_eval_virt, omega, weight)
1292 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: P_ab
1293 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(IN) :: mat_S, mat_work
1294 : INTEGER, INTENT(IN) :: dot_blksize
1295 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
1296 : INTENT(INOUT), TARGET :: buffer_1D
1297 : TYPE(local_gemm_ctxt_type), INTENT(INOUT) :: local_gemm_ctx
1298 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, my_eval_occ, recv_eval_virt
1299 : REAL(KIND=dp), INTENT(IN) :: omega, weight
1300 :
1301 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa_a'
1302 :
1303 : INTEGER :: handle, my_a, my_a_size, my_i, &
1304 : my_i_size, my_P_size, P_end, P_start, &
1305 : recv_a_size, stripesize
1306 56 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_compens, buffer_unscaled
1307 :
1308 56 : CALL timeset(routineN, handle)
1309 :
1310 56 : my_i_size = SIZE(mat_S, 3)
1311 56 : recv_a_size = SIZE(mat_work, 2)
1312 56 : my_a_size = SIZE(mat_S, 2)
1313 56 : my_P_size = SIZE(mat_S, 1)
1314 :
1315 56 : IF (dot_blksize >= blksize_threshold) THEN
1316 48 : buffer_compens(1:my_a_size, 1:recv_a_size) => buffer_1D(1:my_a_size*recv_a_size)
1317 22208 : buffer_compens = 0.0_dp
1318 48 : buffer_unscaled(1:my_a_size, 1:recv_a_size) => buffer_1D(my_a_size*recv_a_size + 1:2*my_a_size*recv_a_size)
1319 :
1320 : ! This loop imitates the actual tensor contraction
1321 232 : DO my_i = 1, my_i_size
1322 416 : DO P_start = 1, my_P_size, dot_blksize
1323 184 : stripesize = MIN(dot_blksize, my_P_size - P_start + 1)
1324 184 : P_end = P_start + stripesize - 1
1325 :
1326 : CALL local_gemm_ctx%gemm("T", "N", my_a_size, recv_a_size, stripesize, &
1327 : -weight, mat_S(P_start:P_end, :, my_i), stripesize, &
1328 : mat_work(P_start:P_end, :, my_i), stripesize, &
1329 184 : 0.0_dp, buffer_unscaled, my_a_size)
1330 :
1331 : CALL scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
1332 184 : my_eval_virt, recv_eval_virt, my_eval_occ(my_i))
1333 :
1334 368 : CALL kahan_step(buffer_compens, P_ab)
1335 : END DO
1336 : END DO
1337 : ELSE
1338 : BLOCK
1339 : INTEGER :: recv_a
1340 : REAL(KIND=dp) :: tmp, e_i, e_a, e_b, omega2, my_compens, my_p, s
1341 8 : omega2 = -omega**2
1342 : !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
1343 : !$OMP SHARED(my_a_size,recv_a_size,my_i_size,mat_S,my_eval_virt,recv_eval_virt,my_eval_occ,omega2,&
1344 : !$OMP P_ab,weight,mat_work)&
1345 8 : !$OMP PRIVATE(tmp,my_a,recv_a,my_i,e_a,e_b,e_i,my_compens,my_p,s)
1346 : DO my_a = 1, my_a_size
1347 : DO recv_a = 1, recv_a_size
1348 : e_a = my_eval_virt(my_a)
1349 : e_b = recv_eval_virt(recv_a)
1350 : my_p = P_ab(my_a, recv_a)
1351 : my_compens = 0.0_dp
1352 : DO my_i = 1, my_i_size
1353 : e_i = -my_eval_occ(my_i)
1354 : tmp = -weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, recv_a, my_i)) &
1355 : *(1.0_dp + omega2/((e_a + e_i)*(e_b + e_i))) - my_compens
1356 : s = my_p + tmp
1357 : my_compens = (s - my_p) - tmp
1358 : my_p = s
1359 : END DO
1360 : P_ab(my_a, recv_a) = my_p
1361 : END DO
1362 : END DO
1363 : END BLOCK
1364 : END IF
1365 :
1366 56 : CALL timestop(handle)
1367 :
1368 56 : END SUBROUTINE calc_P_rpa_a
1369 :
1370 : ! **************************************************************************************************
1371 : !> \brief ...
1372 : !> \param P_ij ...
1373 : !> \param mat_S ...
1374 : !> \param mat_work ...
1375 : !> \param dot_blksize ...
1376 : !> \param buffer_1D ...
1377 : !> \param local_gemm_ctx ...
1378 : !> \param my_eval_virt ...
1379 : !> \param my_eval_occ ...
1380 : !> \param recv_eval_occ ...
1381 : !> \param omega ...
1382 : !> \param weight ...
1383 : ! **************************************************************************************************
1384 56 : SUBROUTINE calc_P_rpa_i(P_ij, mat_S, mat_work, dot_blksize, buffer_1D, local_gemm_ctx, &
1385 56 : my_eval_virt, my_eval_occ, recv_eval_occ, omega, weight)
1386 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: P_ij
1387 : REAL(KIND=dp), DIMENSION(:, :, :), INTENT(INOUT) :: mat_S, mat_work
1388 : INTEGER, INTENT(IN) :: dot_blksize
1389 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), &
1390 : INTENT(INOUT), TARGET :: buffer_1D
1391 : TYPE(local_gemm_ctxt_type), INTENT(INOUT) :: local_gemm_ctx
1392 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, my_eval_occ, recv_eval_occ
1393 : REAL(KIND=dp), INTENT(IN) :: omega, weight
1394 :
1395 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_P_rpa_i'
1396 :
1397 : INTEGER :: handle, my_a, my_a_size, my_i, &
1398 : my_i_size, my_P_size, P_end, P_start, &
1399 : recv_i_size, stripesize
1400 56 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_compens, buffer_unscaled
1401 :
1402 56 : CALL timeset(routineN, handle)
1403 :
1404 56 : my_i_size = SIZE(mat_S, 3)
1405 56 : recv_i_size = SIZE(mat_work, 3)
1406 56 : my_a_size = SIZE(mat_S, 2)
1407 56 : my_P_size = SIZE(mat_S, 1)
1408 :
1409 56 : IF (dot_blksize >= blksize_threshold) THEN
1410 48 : buffer_compens(1:my_i_size, 1:recv_i_size) => buffer_1D(1:my_i_size*recv_i_size)
1411 944 : buffer_compens = 0.0_dp
1412 48 : buffer_unscaled(1:my_i_size, 1:recv_i_size) => buffer_1D(my_i_size*recv_i_size + 1:2*my_i_size*recv_i_size)
1413 :
1414 : ! This loop imitates the actual tensor contraction
1415 1048 : DO my_a = 1, my_a_size
1416 2048 : DO P_start = 1, my_P_size, dot_blksize
1417 1000 : stripesize = MIN(dot_blksize, my_P_size - P_start + 1)
1418 1000 : P_end = P_start + stripesize - 1
1419 :
1420 : CALL local_gemm_ctx%gemm("T", "N", my_i_size, recv_i_size, stripesize, &
1421 : weight, mat_S(P_start:P_end, my_a, :), stripesize, &
1422 : mat_work(P_start:P_end, my_a, :), stripesize, &
1423 1000 : 0.0_dp, buffer_unscaled, my_i_size)
1424 :
1425 : CALL scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
1426 1000 : my_eval_occ, recv_eval_occ, my_eval_virt(my_a))
1427 :
1428 2000 : CALL kahan_step(buffer_compens, P_ij)
1429 : END DO
1430 : END DO
1431 : ELSE
1432 : BLOCK
1433 : REAL(KIND=dp) :: tmp, e_i, e_a, e_j, omega2, my_compens, my_p, s
1434 : INTEGER :: recv_i
1435 8 : omega2 = -omega**2
1436 : !$OMP PARALLEL DO COLLAPSE(2) DEFAULT(NONE)&
1437 : !$OMP SHARED(my_a_size,recv_i_size,my_i_size,mat_S,my_eval_occ,my_eval_virt,omega2,&
1438 : !$OMP recv_eval_occ,P_ij,weight,mat_work)&
1439 8 : !$OMP PRIVATE(tmp,my_a,recv_i,my_i,e_i,e_j,e_a,my_compens,my_p,s)
1440 : DO my_i = 1, my_i_size
1441 : DO recv_i = 1, recv_i_size
1442 : e_i = my_eval_occ(my_i)
1443 : e_j = recv_eval_occ(recv_i)
1444 : my_p = P_ij(my_i, recv_i)
1445 : my_compens = 0.0_dp
1446 : DO my_a = 1, my_a_size
1447 : e_a = my_eval_virt(my_a)
1448 : tmp = weight*accurate_dot_product(mat_S(:, my_a, my_i), mat_work(:, my_a, recv_i)) &
1449 : *(1.0_dp + omega2/((e_a - e_i)*(e_a - e_j))) - my_compens
1450 : s = my_p + tmp
1451 : my_compens = (s - my_p) - tmp
1452 : my_p = s
1453 : END DO
1454 : P_ij(my_i, recv_i) = my_p
1455 : END DO
1456 : END DO
1457 : END BLOCK
1458 : END IF
1459 :
1460 56 : CALL timestop(handle)
1461 :
1462 56 : END SUBROUTINE calc_P_rpa_i
1463 :
1464 : ! **************************************************************************************************
1465 : !> \brief ...
1466 : !> \param compens ...
1467 : !> \param P ...
1468 : ! **************************************************************************************************
1469 1184 : SUBROUTINE kahan_step(compens, P)
1470 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: compens, P
1471 :
1472 : CHARACTER(LEN=*), PARAMETER :: routineN = 'kahan_step'
1473 :
1474 : INTEGER :: handle, i, j
1475 : REAL(KIND=dp) :: my_compens, my_p, s
1476 :
1477 1184 : CALL timeset(routineN, handle)
1478 :
1479 1184 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(P,compens) PRIVATE(i,my_p,my_compens,s, j) COLLAPSE(2)
1480 : DO j = 1, SIZE(compens, 2)
1481 : DO i = 1, SIZE(compens, 1)
1482 : my_p = P(i, j)
1483 : my_compens = compens(i, j)
1484 : s = my_p + my_compens
1485 : compens(i, j) = (s - my_p) - my_compens
1486 : P(i, j) = s
1487 : END DO
1488 : END DO
1489 : !$OMP END PARALLEL DO
1490 :
1491 1184 : CALL timestop(handle)
1492 :
1493 1184 : END SUBROUTINE
1494 :
1495 : ! **************************************************************************************************
1496 : !> \brief ...
1497 : !> \param buffer_unscaled ...
1498 : !> \param buffer_compens ...
1499 : !> \param omega ...
1500 : !> \param my_eval_virt ...
1501 : !> \param recv_eval_virt ...
1502 : !> \param my_eval_occ ...
1503 : ! **************************************************************************************************
1504 184 : SUBROUTINE scale_buffer_and_add_compens_virt(buffer_unscaled, buffer_compens, omega, &
1505 184 : my_eval_virt, recv_eval_virt, my_eval_occ)
1506 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: buffer_unscaled
1507 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: buffer_compens
1508 : REAL(KIND=dp), INTENT(IN) :: omega
1509 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_virt, recv_eval_virt
1510 : REAL(KIND=dp), INTENT(IN) :: my_eval_occ
1511 :
1512 : CHARACTER(LEN=*), PARAMETER :: routineN = 'scale_buffer_and_add_compens_virt'
1513 :
1514 : INTEGER :: handle, my_a, my_b
1515 :
1516 184 : CALL timeset(routineN, handle)
1517 :
1518 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_unscaled,buffer_compens,omega,&
1519 184 : !$OMP my_eval_virt,recv_eval_virt,my_eval_occ) PRIVATE(my_a,my_b)
1520 : DO my_b = 1, SIZE(buffer_compens, 2)
1521 : DO my_a = 1, SIZE(buffer_compens, 1)
1522 : buffer_compens(my_a, my_b) = buffer_unscaled(my_a, my_b) &
1523 : *(1.0_dp - omega**2/((my_eval_virt(my_a) - my_eval_occ)*(recv_eval_virt(my_b) - my_eval_occ))) &
1524 : - buffer_compens(my_a, my_b)
1525 : END DO
1526 : END DO
1527 : !$OMP END PARALLEL DO
1528 :
1529 184 : CALL timestop(handle)
1530 :
1531 184 : END SUBROUTINE scale_buffer_and_add_compens_virt
1532 :
1533 : ! **************************************************************************************************
1534 : !> \brief ...
1535 : !> \param buffer_unscaled ...
1536 : !> \param buffer_compens ...
1537 : !> \param omega ...
1538 : !> \param my_eval_occ ...
1539 : !> \param recv_eval_occ ...
1540 : !> \param my_eval_virt ...
1541 : ! **************************************************************************************************
1542 1000 : SUBROUTINE scale_buffer_and_add_compens_occ(buffer_unscaled, buffer_compens, omega, &
1543 1000 : my_eval_occ, recv_eval_occ, my_eval_virt)
1544 : REAL(KIND=dp), DIMENSION(:, :), INTENT(IN) :: buffer_unscaled
1545 : REAL(KIND=dp), DIMENSION(:, :), INTENT(INOUT) :: buffer_compens
1546 : REAL(KIND=dp), INTENT(IN) :: omega
1547 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: my_eval_occ, recv_eval_occ
1548 : REAL(KIND=dp), INTENT(IN) :: my_eval_virt
1549 :
1550 : CHARACTER(LEN=*), PARAMETER :: routineN = 'scale_buffer_and_add_compens_occ'
1551 :
1552 : INTEGER :: handle, my_i, my_j
1553 :
1554 1000 : CALL timeset(routineN, handle)
1555 :
1556 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(buffer_compens,buffer_unscaled,omega,&
1557 1000 : !$OMP my_eval_virt,my_eval_occ,recv_eval_occ) PRIVATE(my_i,my_j)
1558 : DO my_j = 1, SIZE(buffer_compens, 2)
1559 : DO my_i = 1, SIZE(buffer_compens, 1)
1560 : buffer_compens(my_i, my_j) = buffer_unscaled(my_i, my_j) &
1561 : *(1.0_dp - omega**2/((my_eval_virt - my_eval_occ(my_i))*(my_eval_virt - recv_eval_occ(my_j)))) &
1562 : - buffer_compens(my_i, my_j)
1563 : END DO
1564 : END DO
1565 : !$OMP END PARALLEL DO
1566 :
1567 1000 : CALL timestop(handle)
1568 :
1569 1000 : END SUBROUTINE scale_buffer_and_add_compens_occ
1570 :
1571 : ! **************************************************************************************************
1572 : !> \brief ...
1573 : !> \param x ...
1574 : !> \return ...
1575 : ! **************************************************************************************************
1576 1320 : ELEMENTAL FUNCTION sinh_over_x(x) RESULT(res)
1577 : REAL(KIND=dp), INTENT(IN) :: x
1578 : REAL(KIND=dp) :: res
1579 :
1580 : ! Calculate sinh(x)/x
1581 : ! Split the intervall to prevent numerical instabilities
1582 1320 : IF (ABS(x) > 3.0e-4_dp) THEN
1583 1318 : res = SINH(x)/x
1584 : ELSE
1585 2 : res = 1.0_dp + x**2/6.0_dp
1586 : END IF
1587 :
1588 1320 : END FUNCTION sinh_over_x
1589 :
1590 : ! **************************************************************************************************
1591 : !> \brief ...
1592 : !> \param fm_work_iaP ...
1593 : !> \param fm_mat_S ...
1594 : !> \param pair_list ...
1595 : !> \param virtual ...
1596 : !> \param P_ij ...
1597 : !> \param Eigenval ...
1598 : !> \param omega ...
1599 : !> \param weight ...
1600 : !> \param index2send ...
1601 : !> \param index2recv ...
1602 : !> \param dot_blksize ...
1603 : ! **************************************************************************************************
1604 8 : SUBROUTINE calc_Pij_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ij, Eigenval, &
1605 8 : omega, weight, index2send, index2recv, dot_blksize)
1606 : TYPE(cp_fm_type), INTENT(IN) :: fm_work_iaP, fm_mat_S
1607 : INTEGER, DIMENSION(:, :), INTENT(IN) :: pair_list
1608 : INTEGER, INTENT(IN) :: virtual
1609 : REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: P_ij
1610 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
1611 : REAL(KIND=dp), INTENT(IN) :: omega, weight
1612 : TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
1613 : INTEGER, INTENT(IN) :: dot_blksize
1614 :
1615 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_Pij_degen'
1616 :
1617 : INTEGER :: avirt, col_global, col_local, counter, handle, handle2, ij_counter, iocc, &
1618 : my_col_local, my_i, my_j, my_pcol, my_prow, ncol_local, nrow_local, num_ij_pairs, &
1619 : num_pe_col, pcol, pcol_recv, pcol_send, proc_shift, recv_size, send_size, &
1620 : size_recv_buffer, size_send_buffer, tag
1621 8 : INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
1622 8 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1623 : REAL(KIND=dp) :: trace
1624 8 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1625 : TYPE(cp_blacs_env_type), POINTER :: context
1626 : TYPE(mp_para_env_type), POINTER :: para_env
1627 :
1628 8 : CALL timeset(routineN, handle)
1629 :
1630 : CALL cp_fm_struct_get(fm_work_iaP%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
1631 : ncol_local=ncol_local, col_indices=col_indices, &
1632 8 : context=context, nrow_local=nrow_local)
1633 : CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1634 8 : number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1635 :
1636 8 : num_ij_pairs = SIZE(pair_list, 2)
1637 :
1638 8 : tag = 42
1639 :
1640 104 : DO ij_counter = 1, num_ij_pairs
1641 :
1642 96 : my_i = pair_list(1, ij_counter)
1643 96 : my_j = pair_list(2, ij_counter)
1644 :
1645 96 : trace = 0.0_dp
1646 :
1647 7392 : DO col_local = 1, ncol_local
1648 7296 : col_global = col_indices(col_local)
1649 :
1650 7296 : iocc = MAX(1, col_global - 1)/virtual + 1
1651 7296 : avirt = col_global - (iocc - 1)*virtual
1652 :
1653 7296 : IF (iocc /= my_j) CYCLE
1654 1824 : pcol = fm_work_iaP%matrix_struct%g2p_col((my_i - 1)*virtual + avirt)
1655 1824 : IF (pcol /= my_pcol) CYCLE
1656 :
1657 1824 : my_col_local = fm_work_iaP%matrix_struct%g2l_col((my_i - 1)*virtual + avirt)
1658 :
1659 : trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), fm_work_iaP%local_data(:, col_local), &
1660 7392 : dot_blksize)
1661 : END DO
1662 :
1663 104 : P_ij(ij_counter) = P_ij(ij_counter) - trace*sinh_over_x(0.5_dp*(Eigenval(my_i) - Eigenval(my_j))*omega)*omega*weight
1664 :
1665 : END DO
1666 :
1667 8 : IF (num_pe_col > 1) THEN
1668 : size_send_buffer = 0
1669 : size_recv_buffer = 0
1670 0 : DO proc_shift = 1, num_pe_col - 1
1671 0 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
1672 0 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1673 :
1674 0 : IF (ALLOCATED(index2send(pcol_send)%array)) &
1675 0 : size_send_buffer = MAX(size_send_buffer, SIZE(index2send(pcol_send)%array))
1676 :
1677 0 : IF (ALLOCATED(index2recv(pcol_recv)%array)) &
1678 0 : size_recv_buffer = MAX(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
1679 : END DO
1680 :
1681 0 : ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1682 :
1683 0 : DO proc_shift = 1, num_pe_col - 1
1684 0 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
1685 0 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1686 :
1687 : ! Collect data and exchange
1688 0 : send_size = 0
1689 0 : IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
1690 :
1691 0 : DO counter = 1, send_size
1692 0 : buffer_send(:, counter) = fm_work_iaP%local_data(:, index2send(pcol_send)%array(counter))
1693 : END DO
1694 :
1695 0 : recv_size = 0
1696 0 : IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
1697 0 : IF (recv_size > 0) THEN
1698 0 : CALL timeset(routineN//"_send", handle2)
1699 0 : IF (send_size > 0) THEN
1700 : CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
1701 0 : buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1702 : ELSE
1703 0 : CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1704 : END IF
1705 0 : CALL timestop(handle2)
1706 :
1707 0 : DO ij_counter = 1, num_ij_pairs
1708 : ! Collect the contributions of the matrix elements
1709 :
1710 0 : my_i = pair_list(1, ij_counter)
1711 0 : my_j = pair_list(2, ij_counter)
1712 :
1713 0 : trace = 0.0_dp
1714 :
1715 0 : DO col_local = 1, recv_size
1716 0 : col_global = index2recv(pcol_recv)%array(col_local)
1717 :
1718 0 : iocc = MAX(1, col_global - 1)/virtual + 1
1719 0 : IF (iocc /= my_j) CYCLE
1720 0 : avirt = col_global - (iocc - 1)*virtual
1721 0 : pcol = fm_work_iaP%matrix_struct%g2p_col((my_i - 1)*virtual + avirt)
1722 0 : IF (pcol /= my_pcol) CYCLE
1723 :
1724 0 : my_col_local = fm_work_iaP%matrix_struct%g2l_col((my_i - 1)*virtual + avirt)
1725 :
1726 : trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), buffer_recv(:, col_local), &
1727 0 : dot_blksize)
1728 : END DO
1729 :
1730 : P_ij(ij_counter) = P_ij(ij_counter) &
1731 0 : - trace*sinh_over_x(0.5_dp*(Eigenval(my_i) - Eigenval(my_j))*omega)*omega*weight
1732 : END DO
1733 0 : ELSE IF (send_size > 0) THEN
1734 0 : CALL timeset(routineN//"_send", handle2)
1735 0 : CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
1736 0 : CALL timestop(handle2)
1737 : END IF
1738 : END DO
1739 0 : IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
1740 0 : IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
1741 : END IF
1742 :
1743 8 : CALL timestop(handle)
1744 :
1745 16 : END SUBROUTINE
1746 :
1747 : ! **************************************************************************************************
1748 : !> \brief ...
1749 : !> \param fm_work_iaP ...
1750 : !> \param fm_mat_S ...
1751 : !> \param pair_list ...
1752 : !> \param virtual ...
1753 : !> \param P_ab ...
1754 : !> \param Eigenval ...
1755 : !> \param omega ...
1756 : !> \param weight ...
1757 : !> \param index2send ...
1758 : !> \param index2recv ...
1759 : !> \param dot_blksize ...
1760 : ! **************************************************************************************************
1761 8 : SUBROUTINE calc_Pab_degen(fm_work_iaP, fm_mat_S, pair_list, virtual, P_ab, Eigenval, &
1762 8 : omega, weight, index2send, index2recv, dot_blksize)
1763 : TYPE(cp_fm_type), INTENT(IN) :: fm_work_iaP, fm_mat_S
1764 : INTEGER, DIMENSION(:, :), INTENT(IN) :: pair_list
1765 : INTEGER, INTENT(IN) :: virtual
1766 : REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: P_ab
1767 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: Eigenval
1768 : REAL(KIND=dp), INTENT(IN) :: omega, weight
1769 : TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send, index2recv
1770 : INTEGER, INTENT(IN) :: dot_blksize
1771 :
1772 : CHARACTER(LEN=*), PARAMETER :: routineN = 'calc_Pab_degen'
1773 :
1774 : INTEGER :: ab_counter, avirt, col_global, col_local, counter, handle, handle2, iocc, my_a, &
1775 : my_b, my_col_local, my_pcol, my_prow, ncol_local, nrow_local, num_ab_pairs, num_pe_col, &
1776 : pcol, pcol_recv, pcol_send, proc_shift, recv_size, send_size, size_recv_buffer, &
1777 : size_send_buffer, tag
1778 8 : INTEGER, DIMENSION(:), POINTER :: col_indices, ncol_locals
1779 8 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1780 : REAL(KIND=dp) :: trace
1781 8 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1782 : TYPE(cp_blacs_env_type), POINTER :: context
1783 : TYPE(mp_para_env_type), POINTER :: para_env
1784 :
1785 8 : CALL timeset(routineN, handle)
1786 :
1787 : CALL cp_fm_struct_get(fm_work_iaP%matrix_struct, para_env=para_env, ncol_locals=ncol_locals, &
1788 : ncol_local=ncol_local, col_indices=col_indices, &
1789 8 : context=context, nrow_local=nrow_local)
1790 : CALL context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1791 8 : number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1792 :
1793 8 : num_ab_pairs = SIZE(pair_list, 2)
1794 :
1795 8 : tag = 43
1796 :
1797 1232 : DO ab_counter = 1, num_ab_pairs
1798 :
1799 1224 : my_a = pair_list(1, ab_counter)
1800 1224 : my_b = pair_list(2, ab_counter)
1801 :
1802 1224 : trace = 0.0_dp
1803 :
1804 94248 : DO col_local = 1, ncol_local
1805 93024 : col_global = col_indices(col_local)
1806 :
1807 93024 : iocc = MAX(1, col_global - 1)/virtual + 1
1808 93024 : avirt = col_global - (iocc - 1)*virtual
1809 :
1810 93024 : IF (avirt /= my_b) CYCLE
1811 4896 : pcol = fm_work_iaP%matrix_struct%g2p_col((iocc - 1)*virtual + my_a)
1812 4896 : IF (pcol /= my_pcol) CYCLE
1813 4896 : my_col_local = fm_work_iaP%matrix_struct%g2l_col((iocc - 1)*virtual + my_a)
1814 :
1815 : trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), fm_work_iaP%local_data(:, col_local), &
1816 94248 : dot_blksize)
1817 :
1818 : END DO
1819 :
1820 : P_ab(ab_counter) = P_ab(ab_counter) &
1821 1232 : + trace*sinh_over_x(0.5_dp*(Eigenval(my_a) - Eigenval(my_b))*omega)*omega*weight
1822 :
1823 : END DO
1824 :
1825 8 : IF (num_pe_col > 1) THEN
1826 : size_send_buffer = 0
1827 : size_recv_buffer = 0
1828 0 : DO proc_shift = 1, num_pe_col - 1
1829 0 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
1830 0 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1831 :
1832 0 : IF (ALLOCATED(index2send(pcol_send)%array)) &
1833 0 : size_send_buffer = MAX(size_send_buffer, SIZE(index2send(pcol_send)%array))
1834 :
1835 0 : IF (ALLOCATED(index2recv(pcol_recv)%array)) &
1836 0 : size_recv_buffer = MAX(size_recv_buffer, SIZE(index2recv(pcol_recv)%array))
1837 : END DO
1838 :
1839 0 : ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1840 :
1841 0 : DO proc_shift = 1, num_pe_col - 1
1842 0 : pcol_send = MODULO(my_pcol + proc_shift, num_pe_col)
1843 0 : pcol_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1844 :
1845 : ! Collect data and exchange
1846 0 : send_size = 0
1847 0 : IF (ALLOCATED(index2send(pcol_send)%array)) send_size = SIZE(index2send(pcol_send)%array)
1848 :
1849 0 : DO counter = 1, send_size
1850 0 : buffer_send(:, counter) = fm_work_iaP%local_data(:, index2send(pcol_send)%array(counter))
1851 : END DO
1852 :
1853 0 : recv_size = 0
1854 0 : IF (ALLOCATED(index2recv(pcol_recv)%array)) recv_size = SIZE(index2recv(pcol_recv)%array)
1855 0 : IF (recv_size > 0) THEN
1856 0 : CALL timeset(routineN//"_send", handle2)
1857 0 : IF (send_size > 0) THEN
1858 : CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), &
1859 0 : buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1860 : ELSE
1861 0 : CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, pcol_recv), tag)
1862 : END IF
1863 0 : CALL timestop(handle2)
1864 :
1865 0 : DO ab_counter = 1, num_ab_pairs
1866 : ! Collect the contributions of the matrix elements
1867 :
1868 0 : my_a = pair_list(1, ab_counter)
1869 0 : my_b = pair_list(2, ab_counter)
1870 :
1871 0 : trace = 0.0_dp
1872 :
1873 0 : DO col_local = 1, SIZE(index2recv(pcol_recv)%array)
1874 0 : col_global = index2recv(pcol_recv)%array(col_local)
1875 :
1876 0 : iocc = MAX(1, col_global - 1)/virtual + 1
1877 0 : avirt = col_global - (iocc - 1)*virtual
1878 0 : IF (avirt /= my_b) CYCLE
1879 0 : pcol = fm_work_iaP%matrix_struct%g2p_col((iocc - 1)*virtual + my_a)
1880 0 : IF (pcol /= my_pcol) CYCLE
1881 :
1882 0 : my_col_local = fm_work_iaP%matrix_struct%g2l_col((iocc - 1)*virtual + my_a)
1883 :
1884 : trace = trace + accurate_dot_product_2(fm_mat_S%local_data(:, my_col_local), buffer_recv(:, col_local), &
1885 0 : dot_blksize)
1886 : END DO
1887 :
1888 : P_ab(ab_counter) = P_ab(ab_counter) &
1889 0 : + trace*sinh_over_x(0.5_dp*(Eigenval(my_a) - Eigenval(my_b))*omega)*omega*weight
1890 :
1891 : END DO
1892 0 : ELSE IF (send_size > 0) THEN
1893 0 : CALL timeset(routineN//"_send", handle2)
1894 0 : CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, pcol_send), tag)
1895 0 : CALL timestop(handle2)
1896 : END IF
1897 : END DO
1898 0 : IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
1899 0 : IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
1900 : END IF
1901 :
1902 8 : CALL timestop(handle)
1903 :
1904 16 : END SUBROUTINE
1905 :
1906 : ! **************************************************************************************************
1907 : !> \brief ...
1908 : !> \param index2send ...
1909 : !> \param index2recv ...
1910 : !> \param fm_mat_S ...
1911 : !> \param mat_S_3D ...
1912 : !> \param gd_homo ...
1913 : !> \param gd_virtual ...
1914 : !> \param mepos ...
1915 : ! **************************************************************************************************
1916 112 : SUBROUTINE redistribute_fm_mat_S(index2send, index2recv, fm_mat_S, mat_S_3D, gd_homo, gd_virtual, mepos)
1917 : TYPE(one_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2send
1918 : TYPE(two_dim_int_array), DIMENSION(0:), INTENT(IN) :: index2recv
1919 : TYPE(cp_fm_type), INTENT(IN) :: fm_mat_S
1920 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :), &
1921 : INTENT(OUT) :: mat_S_3D
1922 : TYPE(group_dist_d1_type), INTENT(IN) :: gd_homo, gd_virtual
1923 : INTEGER, DIMENSION(2), INTENT(IN) :: mepos
1924 :
1925 : CHARACTER(LEN=*), PARAMETER :: routineN = 'redistribute_fm_mat_S'
1926 :
1927 : INTEGER :: col_local, handle, my_a, my_homo, my_i, my_pcol, my_prow, my_virtual, nrow_local, &
1928 : num_pe_col, proc_recv, proc_send, proc_shift, recv_size, send_size, size_recv_buffer, &
1929 : size_send_buffer, tag
1930 112 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi
1931 112 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
1932 : TYPE(mp_para_env_type), POINTER :: para_env
1933 :
1934 112 : CALL timeset(routineN, handle)
1935 :
1936 112 : tag = 46
1937 :
1938 : CALL fm_mat_S%matrix_struct%context%get(my_process_row=my_prow, my_process_column=my_pcol, &
1939 112 : number_of_process_columns=num_pe_col, blacs2mpi=blacs2mpi)
1940 :
1941 112 : CALL cp_fm_struct_get(fm_mat_S%matrix_struct, nrow_local=nrow_local, para_env=para_env)
1942 :
1943 112 : CALL get_group_dist(gd_homo, mepos(2), sizes=my_homo)
1944 112 : CALL get_group_dist(gd_virtual, mepos(1), sizes=my_virtual)
1945 :
1946 560 : ALLOCATE (mat_S_3D(nrow_local, my_virtual, my_homo))
1947 :
1948 112 : IF (ALLOCATED(index2send(my_pcol)%array)) THEN
1949 8928 : DO col_local = 1, SIZE(index2send(my_pcol)%array)
1950 8816 : my_a = index2recv(my_pcol)%array(1, col_local)
1951 8816 : my_i = index2recv(my_pcol)%array(2, col_local)
1952 678032 : mat_S_3D(:, my_a, my_i) = fm_mat_S%local_data(:, index2send(my_pcol)%array(col_local))
1953 : END DO
1954 : END IF
1955 :
1956 112 : IF (num_pe_col > 1) THEN
1957 : size_send_buffer = 0
1958 : size_recv_buffer = 0
1959 0 : DO proc_shift = 1, num_pe_col - 1
1960 0 : proc_send = MODULO(my_pcol + proc_shift, num_pe_col)
1961 0 : proc_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1962 :
1963 0 : send_size = 0
1964 0 : IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
1965 0 : size_send_buffer = MAX(size_send_buffer, send_size)
1966 :
1967 0 : recv_size = 0
1968 0 : IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array)
1969 0 : size_recv_buffer = MAX(size_recv_buffer, recv_size)
1970 :
1971 : END DO
1972 :
1973 0 : ALLOCATE (buffer_send(nrow_local, size_send_buffer), buffer_recv(nrow_local, size_recv_buffer))
1974 :
1975 0 : DO proc_shift = 1, num_pe_col - 1
1976 0 : proc_send = MODULO(my_pcol + proc_shift, num_pe_col)
1977 0 : proc_recv = MODULO(my_pcol - proc_shift, num_pe_col)
1978 :
1979 0 : send_size = 0
1980 0 : IF (ALLOCATED(index2send(proc_send)%array)) send_size = SIZE(index2send(proc_send)%array)
1981 0 : DO col_local = 1, send_size
1982 0 : buffer_send(:, col_local) = fm_mat_S%local_data(:, index2send(proc_send)%array(col_local))
1983 : END DO
1984 :
1985 0 : recv_size = 0
1986 0 : IF (ALLOCATED(index2recv(proc_recv)%array)) recv_size = SIZE(index2recv(proc_recv)%array, 2)
1987 0 : IF (recv_size > 0) THEN
1988 0 : IF (send_size > 0) THEN
1989 : CALL para_env%sendrecv(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), &
1990 0 : buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
1991 : ELSE
1992 0 : CALL para_env%recv(buffer_recv(:, :recv_size), blacs2mpi(my_prow, proc_recv), tag)
1993 : END IF
1994 :
1995 0 : DO col_local = 1, recv_size
1996 0 : my_a = index2recv(proc_recv)%array(1, col_local)
1997 0 : my_i = index2recv(proc_recv)%array(2, col_local)
1998 0 : mat_S_3D(:, my_a, my_i) = buffer_recv(:, col_local)
1999 : END DO
2000 0 : ELSE IF (send_size > 0) THEN
2001 0 : CALL para_env%send(buffer_send(:, :send_size), blacs2mpi(my_prow, proc_send), tag)
2002 : END IF
2003 :
2004 : END DO
2005 :
2006 0 : IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
2007 0 : IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
2008 : END IF
2009 :
2010 112 : CALL timestop(handle)
2011 :
2012 336 : END SUBROUTINE
2013 :
2014 : ! **************************************************************************************************
2015 : !> \brief ...
2016 : !> \param rpa_grad ...
2017 : !> \param mp2_env ...
2018 : !> \param para_env_sub ...
2019 : !> \param para_env ...
2020 : !> \param qs_env ...
2021 : !> \param gd_array ...
2022 : !> \param color_sub ...
2023 : !> \param do_ri_sos_laplace_mp2 ...
2024 : !> \param homo ...
2025 : !> \param virtual ...
2026 : ! **************************************************************************************************
2027 40 : SUBROUTINE rpa_grad_finalize(rpa_grad, mp2_env, para_env_sub, para_env, qs_env, gd_array, &
2028 40 : color_sub, do_ri_sos_laplace_mp2, homo, virtual)
2029 : TYPE(rpa_grad_type), INTENT(INOUT) :: rpa_grad
2030 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
2031 : TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env_sub, para_env
2032 : TYPE(qs_environment_type), INTENT(IN), POINTER :: qs_env
2033 : TYPE(group_dist_d1_type) :: gd_array
2034 : INTEGER, INTENT(IN) :: color_sub
2035 : LOGICAL, INTENT(IN) :: do_ri_sos_laplace_mp2
2036 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2037 :
2038 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_finalize'
2039 :
2040 : INTEGER :: dimen_ia, dimen_RI, handle, iiB, ispin, my_group_L_end, my_group_L_size, &
2041 : my_group_L_start, my_ia_end, my_ia_size, my_ia_start, my_P_end, my_P_size, my_P_start, &
2042 : ngroup, nspins, pos_group, pos_sub, proc
2043 40 : INTEGER, ALLOCATABLE, DIMENSION(:) :: pos_info
2044 40 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: group_grid_2_mepos, mepos_2_grid_group
2045 : REAL(KIND=dp) :: my_scale
2046 40 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: Gamma_2D
2047 : TYPE(cp_blacs_env_type), POINTER :: blacs_env
2048 : TYPE(cp_fm_struct_type), POINTER :: fm_struct
2049 : TYPE(cp_fm_type) :: fm_G_P_ia, fm_PQ, fm_PQ_2, fm_PQ_half, &
2050 : fm_work_PQ, fm_work_PQ_2, fm_Y, &
2051 : operator_half
2052 40 : TYPE(group_dist_d1_type) :: gd_array_new, gd_ia, gd_P, gd_P_new
2053 :
2054 40 : CALL timeset(routineN, handle)
2055 :
2056 : ! Release unnecessary matrices to save memory for next steps
2057 :
2058 40 : nspins = SIZE(rpa_grad%fm_Y)
2059 :
2060 : ! Scaling factor is required to scale the density matrices and the Gamma matrices later
2061 40 : IF (do_ri_sos_laplace_mp2) THEN
2062 20 : my_scale = mp2_env%scale_s
2063 : ELSE
2064 20 : my_scale = -mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
2065 20 : IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
2066 : END IF
2067 :
2068 40 : IF (do_ri_sos_laplace_mp2) THEN
2069 : CALL sos_mp2_grad_finalize(rpa_grad%sos_mp2_work_occ, rpa_grad%sos_mp2_work_virt, &
2070 20 : para_env, para_env_sub, homo, virtual, mp2_env)
2071 : ELSE
2072 : CALL rpa_grad_work_finalize(rpa_grad%rpa_work, mp2_env, homo, &
2073 20 : virtual, para_env, para_env_sub)
2074 : END IF
2075 :
2076 40 : CALL get_qs_env(qs_env, blacs_env=blacs_env)
2077 :
2078 40 : CALL cp_fm_get_info(rpa_grad%fm_Gamma_PQ, ncol_global=dimen_RI)
2079 :
2080 40 : NULLIFY (fm_struct)
2081 : CALL cp_fm_struct_create(fm_struct, context=blacs_env, nrow_global=dimen_RI, &
2082 40 : ncol_global=dimen_RI, para_env=para_env)
2083 40 : CALL cp_fm_create(fm_PQ, fm_struct)
2084 40 : CALL cp_fm_create(fm_work_PQ, fm_struct)
2085 40 : IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2086 4 : CALL cp_fm_create(fm_PQ_2, fm_struct)
2087 : END IF
2088 40 : CALL cp_fm_struct_release(fm_struct)
2089 40 : CALL cp_fm_set_all(fm_PQ, 0.0_dp)
2090 :
2091 : ! We still have to left- and right multiply it with PQhalf
2092 40 : CALL dereplicate_and_sum_fm(rpa_grad%fm_Gamma_PQ, fm_PQ)
2093 :
2094 40 : ngroup = para_env%num_pe/para_env_sub%num_pe
2095 :
2096 : CALL prepare_redistribution(para_env, para_env_sub, ngroup, &
2097 : group_grid_2_mepos, mepos_2_grid_group, pos_info=pos_info)
2098 :
2099 : ! Create fm_PQ_half
2100 40 : CALL create_group_dist(gd_P, para_env_sub%num_pe, dimen_RI)
2101 40 : CALL get_group_dist(gd_P, para_env_sub%mepos, my_P_start, my_P_end, my_P_size)
2102 :
2103 40 : CALL get_group_dist(gd_array, color_sub, my_group_L_start, my_group_L_end, my_group_L_size)
2104 :
2105 40 : CALL create_group_dist(gd_P_new, para_env%num_pe)
2106 40 : CALL create_group_dist(gd_array_new, para_env%num_pe)
2107 :
2108 120 : DO proc = 0, para_env%num_pe - 1
2109 : ! calculate position of the group
2110 80 : pos_group = proc/para_env_sub%num_pe
2111 : ! calculate position in the subgroup
2112 80 : pos_sub = pos_info(proc)
2113 : ! 1 -> rows, 2 -> cols
2114 80 : CALL get_group_dist(gd_array, pos_group, gd_array_new, proc)
2115 120 : CALL get_group_dist(gd_P, pos_sub, gd_P_new, proc)
2116 : END DO
2117 :
2118 40 : DEALLOCATE (pos_info)
2119 40 : CALL release_group_dist(gd_P)
2120 :
2121 : CALL array2fm(mp2_env%ri_grad%PQ_half, fm_PQ%matrix_struct, &
2122 : my_P_start, my_P_end, &
2123 : my_group_L_start, my_group_L_end, &
2124 : gd_P_new, gd_array_new, &
2125 : group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
2126 40 : fm_PQ_half)
2127 :
2128 40 : IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2129 : CALL array2fm(mp2_env%ri_grad%operator_half, fm_PQ%matrix_struct, my_P_start, my_P_end, &
2130 : my_group_L_start, my_group_L_end, &
2131 : gd_P_new, gd_array_new, &
2132 : group_grid_2_mepos, para_env_sub%num_pe, ngroup, &
2133 4 : operator_half)
2134 : END IF
2135 :
2136 : ! deallocate the info array
2137 40 : CALL release_group_dist(gd_P_new)
2138 40 : CALL release_group_dist(gd_array_new)
2139 :
2140 40 : IF (compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2141 : ! Finish Gamma_PQ
2142 : CALL parallel_gemm(transa="N", transb="T", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=1.0_dp, &
2143 : matrix_a=fm_PQ, matrix_b=fm_PQ_half, beta=0.0_dp, &
2144 36 : matrix_c=fm_work_PQ)
2145 :
2146 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=-my_scale, &
2147 : matrix_a=fm_PQ_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
2148 36 : matrix_c=fm_PQ)
2149 :
2150 36 : CALL cp_fm_release(fm_work_PQ)
2151 : ELSE
2152 : CALL parallel_gemm(transa="N", transb="T", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=1.0_dp, &
2153 : matrix_a=fm_PQ, matrix_b=operator_half, beta=0.0_dp, &
2154 4 : matrix_c=fm_work_PQ)
2155 :
2156 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=my_scale, &
2157 : matrix_a=operator_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
2158 4 : matrix_c=fm_PQ)
2159 4 : CALL cp_fm_release(operator_half)
2160 :
2161 4 : CALL cp_fm_create(fm_work_PQ_2, fm_PQ%matrix_struct, name="fm_Gamma_PQ_2")
2162 : CALL parallel_gemm(transa="N", transb="N", m=dimen_RI, n=dimen_RI, k=dimen_RI, alpha=-my_scale, &
2163 : matrix_a=fm_PQ_half, matrix_b=fm_work_PQ, beta=0.0_dp, &
2164 4 : matrix_c=fm_work_PQ_2)
2165 4 : CALL cp_fm_to_fm(fm_work_PQ_2, fm_PQ_2)
2166 4 : CALL cp_fm_geadd(1.0_dp, "T", fm_work_PQ_2, 1.0_dp, fm_PQ_2)
2167 4 : CALL cp_fm_release(fm_work_PQ_2)
2168 4 : CALL cp_fm_release(fm_work_PQ)
2169 : END IF
2170 :
2171 160 : ALLOCATE (mp2_env%ri_grad%Gamma_PQ(my_P_size, my_group_L_size))
2172 : CALL fm2array(mp2_env%ri_grad%Gamma_PQ, &
2173 : my_P_size, my_P_start, my_P_end, &
2174 : my_group_L_size, my_group_L_start, my_group_L_end, &
2175 : group_grid_2_mepos, mepos_2_grid_group, &
2176 : para_env_sub%num_pe, ngroup, &
2177 40 : fm_PQ)
2178 :
2179 40 : IF (.NOT. compare_potential_types(mp2_env%ri_metric, mp2_env%potential_parameter)) THEN
2180 12 : ALLOCATE (mp2_env%ri_grad%Gamma_PQ_2(my_P_size, my_group_L_size))
2181 : CALL fm2array(mp2_env%ri_grad%Gamma_PQ_2, my_P_size, my_P_start, my_P_end, &
2182 : my_group_L_size, my_group_L_start, my_group_L_end, &
2183 : group_grid_2_mepos, mepos_2_grid_group, &
2184 : para_env_sub%num_pe, ngroup, &
2185 4 : fm_PQ_2)
2186 : END IF
2187 :
2188 : ! Now, Gamma_Pia
2189 2644 : ALLOCATE (mp2_env%ri_grad%G_P_ia(my_group_L_size, nspins))
2190 88 : DO ispin = 1, nspins
2191 2524 : DO iiB = 1, my_group_L_size
2192 2484 : NULLIFY (mp2_env%ri_grad%G_P_ia(iiB, ispin)%matrix)
2193 : END DO
2194 : END DO
2195 :
2196 : ! Redistribute the Y matrix
2197 88 : DO ispin = 1, nspins
2198 : ! Collect all data of columns for the own sub group locally
2199 48 : CALL cp_fm_get_info(rpa_grad%fm_Y(ispin), ncol_global=dimen_ia)
2200 :
2201 48 : CALL get_qs_env(qs_env, blacs_env=blacs_env)
2202 :
2203 48 : NULLIFY (fm_struct)
2204 48 : CALL cp_fm_struct_create(fm_struct, template_fmstruct=fm_PQ_half%matrix_struct, nrow_global=dimen_ia)
2205 48 : CALL cp_fm_create(fm_Y, fm_struct)
2206 48 : CALL cp_fm_struct_release(fm_struct)
2207 48 : CALL cp_fm_set_all(fm_Y, 0.0_dp)
2208 :
2209 48 : CALL dereplicate_and_sum_fm(rpa_grad%fm_Y(ispin), fm_Y)
2210 :
2211 48 : CALL cp_fm_create(fm_G_P_ia, fm_Y%matrix_struct)
2212 48 : CALL cp_fm_set_all(fm_G_P_ia, 0.0_dp)
2213 :
2214 : CALL parallel_gemm(transa="N", transb="T", m=dimen_ia, n=dimen_RI, k=dimen_RI, alpha=my_scale, &
2215 : matrix_a=fm_Y, matrix_b=fm_PQ_half, beta=0.0_dp, &
2216 48 : matrix_c=fm_G_P_ia)
2217 :
2218 48 : CALL cp_fm_release(fm_Y)
2219 :
2220 48 : CALL create_group_dist(gd_ia, para_env_sub%num_pe, dimen_ia)
2221 48 : CALL get_group_dist(gd_ia, para_env_sub%mepos, my_ia_start, my_ia_end, my_ia_size)
2222 :
2223 : CALL fm2array(Gamma_2D, my_ia_size, my_ia_start, my_ia_end, &
2224 : my_group_L_size, my_group_L_start, my_group_L_end, &
2225 : group_grid_2_mepos, mepos_2_grid_group, &
2226 : para_env_sub%num_pe, ngroup, &
2227 48 : fm_G_P_ia)
2228 :
2229 : ! create the Gamma_ia_P in DBCSR style
2230 : CALL create_dbcsr_gamma(Gamma_2D, homo(ispin), virtual(ispin), dimen_ia, para_env_sub, &
2231 : my_ia_start, my_ia_end, my_group_L_size, gd_ia, &
2232 48 : mp2_env%ri_grad%G_P_ia(:, ispin), mp2_env%ri_grad%mo_coeff_o(ispin)%matrix)
2233 :
2234 328 : CALL release_group_dist(gd_ia)
2235 :
2236 : END DO
2237 40 : DEALLOCATE (rpa_grad%fm_Y)
2238 40 : CALL cp_fm_release(fm_PQ_half)
2239 :
2240 40 : CALL timestop(handle)
2241 :
2242 320 : END SUBROUTINE rpa_grad_finalize
2243 :
2244 : ! **************************************************************************************************
2245 : !> \brief ...
2246 : !> \param sos_mp2_work_occ ...
2247 : !> \param sos_mp2_work_virt ...
2248 : !> \param para_env ...
2249 : !> \param para_env_sub ...
2250 : !> \param homo ...
2251 : !> \param virtual ...
2252 : !> \param mp2_env ...
2253 : ! **************************************************************************************************
2254 20 : SUBROUTINE sos_mp2_grad_finalize(sos_mp2_work_occ, sos_mp2_work_virt, para_env, para_env_sub, homo, virtual, mp2_env)
2255 : TYPE(sos_mp2_grad_work_type), ALLOCATABLE, &
2256 : DIMENSION(:), INTENT(INOUT) :: sos_mp2_work_occ, sos_mp2_work_virt
2257 : TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
2258 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2259 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
2260 :
2261 : CHARACTER(LEN=*), PARAMETER :: routineN = 'sos_mp2_grad_finalize'
2262 :
2263 : INTEGER :: ab_counter, handle, ij_counter, ispin, &
2264 : itmp(2), my_a, my_b, my_B_end, &
2265 : my_B_size, my_B_start, my_i, my_j, &
2266 : nspins, pcol
2267 : REAL(KIND=dp) :: my_scale
2268 :
2269 20 : CALL timeset(routineN, handle)
2270 :
2271 20 : nspins = SIZE(sos_mp2_work_occ)
2272 20 : my_scale = mp2_env%scale_s
2273 :
2274 44 : DO ispin = 1, nspins
2275 48 : DO pcol = 0, SIZE(sos_mp2_work_occ(ispin)%index2send, 1) - 1
2276 24 : IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
2277 4 : DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
2278 24 : IF (ALLOCATED(sos_mp2_work_occ(ispin)%index2send(pcol)%array)) &
2279 0 : DEALLOCATE (sos_mp2_work_occ(ispin)%index2send(pcol)%array)
2280 24 : IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
2281 4 : DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
2282 24 : IF (ALLOCATED(sos_mp2_work_virt(ispin)%index2recv(pcol)%array)) &
2283 24 : DEALLOCATE (sos_mp2_work_virt(ispin)%index2recv(pcol)%array)
2284 : END DO
2285 0 : DEALLOCATE (sos_mp2_work_occ(ispin)%index2send, &
2286 0 : sos_mp2_work_occ(ispin)%index2recv, &
2287 0 : sos_mp2_work_virt(ispin)%index2send, &
2288 140 : sos_mp2_work_virt(ispin)%index2recv)
2289 : END DO
2290 :
2291 : ! Sum P_ij and P_ab and redistribute them
2292 44 : DO ispin = 1, nspins
2293 24 : CALL para_env%sum(sos_mp2_work_occ(ispin)%P)
2294 :
2295 96 : ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
2296 472 : mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
2297 116 : DO my_i = 1, homo(ispin)
2298 116 : mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_i) = my_scale*sos_mp2_work_occ(ispin)%P(my_i)
2299 : END DO
2300 72 : DO ij_counter = 1, SIZE(sos_mp2_work_occ(ispin)%pair_list, 2)
2301 48 : my_i = sos_mp2_work_occ(ispin)%pair_list(1, ij_counter)
2302 48 : my_j = sos_mp2_work_occ(ispin)%pair_list(2, ij_counter)
2303 :
2304 72 : mp2_env%ri_grad%P_ij(ispin)%array(my_i, my_j) = my_scale*sos_mp2_work_occ(ispin)%P(homo(ispin) + ij_counter)
2305 : END DO
2306 24 : DEALLOCATE (sos_mp2_work_occ(ispin)%P, sos_mp2_work_occ(ispin)%pair_list)
2307 :
2308 : ! Symmetrize P_ij
2309 : mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
2310 920 : TRANSPOSE(mp2_env%ri_grad%P_ij(ispin)%array))
2311 :
2312 : ! The first index of P_ab has to be distributed within the subgroups, so sum it up first and add the required elements later
2313 24 : CALL para_env%sum(sos_mp2_work_virt(ispin)%P)
2314 :
2315 24 : itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
2316 24 : my_B_size = itmp(2) - itmp(1) + 1
2317 24 : my_B_start = itmp(1)
2318 24 : my_B_end = itmp(2)
2319 :
2320 96 : ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_B_size, virtual(ispin)))
2321 10382 : mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
2322 486 : DO my_a = itmp(1), itmp(2)
2323 486 : mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_a) = my_scale*sos_mp2_work_virt(ispin)%P(my_a)
2324 : END DO
2325 636 : DO ab_counter = 1, SIZE(sos_mp2_work_virt(ispin)%pair_list, 2)
2326 612 : my_a = sos_mp2_work_virt(ispin)%pair_list(1, ab_counter)
2327 612 : my_b = sos_mp2_work_virt(ispin)%pair_list(2, ab_counter)
2328 :
2329 612 : IF (my_a >= itmp(1) .AND. my_a <= itmp(2)) mp2_env%ri_grad%P_ab(ispin)%array(my_a - itmp(1) + 1, my_b) = &
2330 636 : my_scale*sos_mp2_work_virt(ispin)%P(virtual(ispin) + ab_counter)
2331 : END DO
2332 :
2333 24 : DEALLOCATE (sos_mp2_work_virt(ispin)%P, sos_mp2_work_virt(ispin)%pair_list)
2334 :
2335 : ! Symmetrize P_ab
2336 44 : IF (para_env_sub%num_pe > 1) THEN
2337 12 : BLOCK
2338 : INTEGER :: send_a_start, send_a_end, send_a_size, &
2339 : recv_a_start, recv_a_end, recv_a_size, proc_shift, proc_send, proc_recv
2340 4 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1D
2341 4 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_send
2342 4 : REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
2343 4 : TYPE(group_dist_d1_type) :: gd_virtual_sub
2344 :
2345 4 : CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
2346 :
2347 : mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) = &
2348 : 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) &
2349 804 : + TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end)))
2350 :
2351 12 : ALLOCATE (buffer_send_1D(my_B_size*maxsize(gd_virtual_sub)))
2352 16 : ALLOCATE (buffer_recv(my_B_size, maxsize(gd_virtual_sub)))
2353 :
2354 8 : DO proc_shift = 1, para_env_sub%num_pe - 1
2355 :
2356 4 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2357 4 : proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2358 :
2359 4 : CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
2360 4 : CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
2361 :
2362 4 : buffer_send(1:send_a_size, 1:my_B_size) => buffer_send_1D(1:my_B_size*send_a_size)
2363 :
2364 402 : buffer_send(:send_a_size, :) = TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
2365 : CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
2366 402 : buffer_recv(:, :recv_a_size), proc_recv)
2367 :
2368 : mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
2369 410 : 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
2370 :
2371 : END DO
2372 :
2373 4 : DEALLOCATE (buffer_send_1D, buffer_recv)
2374 :
2375 16 : CALL release_group_dist(gd_virtual_sub)
2376 : END BLOCK
2377 : ELSE
2378 : mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
2379 19140 : TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array))
2380 : END IF
2381 :
2382 : END DO
2383 68 : DEALLOCATE (sos_mp2_work_occ, sos_mp2_work_virt)
2384 20 : IF (nspins == 1) THEN
2385 336 : mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
2386 5374 : mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
2387 : END IF
2388 :
2389 20 : CALL timestop(handle)
2390 :
2391 20 : END SUBROUTINE
2392 :
2393 : ! **************************************************************************************************
2394 : !> \brief ...
2395 : !> \param rpa_work ...
2396 : !> \param mp2_env ...
2397 : !> \param homo ...
2398 : !> \param virtual ...
2399 : !> \param para_env ...
2400 : !> \param para_env_sub ...
2401 : ! **************************************************************************************************
2402 20 : SUBROUTINE rpa_grad_work_finalize(rpa_work, mp2_env, homo, virtual, para_env, para_env_sub)
2403 : TYPE(rpa_grad_work_type), INTENT(INOUT) :: rpa_work
2404 : TYPE(mp2_type), INTENT(INOUT) :: mp2_env
2405 : INTEGER, DIMENSION(:), INTENT(IN) :: homo, virtual
2406 : TYPE(mp_para_env_type), INTENT(IN), POINTER :: para_env, para_env_sub
2407 :
2408 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rpa_grad_work_finalize'
2409 :
2410 : INTEGER :: handle, ispin, itmp(2), my_a_end, my_a_size, my_a_start, my_B_end, my_B_size, &
2411 : my_B_start, my_i_end, my_i_size, my_i_start, nspins, proc, proc_recv, proc_send, &
2412 : proc_shift, recv_a_end, recv_a_size, recv_a_start, recv_end, recv_start, send_a_end, &
2413 : send_a_size, send_a_start, send_end, send_start, size_recv_buffer, size_send_buffer
2414 : REAL(KIND=dp) :: my_scale
2415 20 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: buffer_recv, buffer_send
2416 20 : TYPE(group_dist_d1_type) :: gd_a_sub, gd_virtual_sub
2417 :
2418 20 : CALL timeset(routineN, handle)
2419 :
2420 20 : nspins = SIZE(homo)
2421 20 : my_scale = mp2_env%ri_rpa%scale_rpa/(2.0_dp*pi)
2422 20 : IF (mp2_env%ri_rpa%minimax_quad) my_scale = my_scale/2.0_dp
2423 :
2424 20 : CALL cp_fm_release(rpa_work%fm_mat_Q_copy)
2425 :
2426 44 : DO ispin = 1, nspins
2427 68 : DO proc = 0, SIZE(rpa_work%index2send, 1) - 1
2428 24 : IF (ALLOCATED(rpa_work%index2send(proc, ispin)%array)) DEALLOCATE (rpa_work%index2send(proc, ispin)%array)
2429 48 : IF (ALLOCATED(rpa_work%index2recv(proc, ispin)%array)) DEALLOCATE (rpa_work%index2recv(proc, ispin)%array)
2430 : END DO
2431 : END DO
2432 68 : DEALLOCATE (rpa_work%index2send, rpa_work%index2recv)
2433 :
2434 44 : DO ispin = 1, nspins
2435 24 : CALL get_group_dist(rpa_work%gd_homo(ispin), rpa_work%mepos(2), my_i_start, my_i_end, my_i_size)
2436 24 : CALL release_group_dist(rpa_work%gd_homo(ispin))
2437 :
2438 96 : ALLOCATE (mp2_env%ri_grad%P_ij(ispin)%array(homo(ispin), homo(ispin)))
2439 472 : mp2_env%ri_grad%P_ij(ispin)%array = 0.0_dp
2440 472 : mp2_env%ri_grad%P_ij(ispin)%array(my_i_start:my_i_end, :) = my_scale*rpa_work%P_ij(ispin)%array
2441 24 : DEALLOCATE (rpa_work%P_ij(ispin)%array)
2442 24 : CALL para_env%sum(mp2_env%ri_grad%P_ij(ispin)%array)
2443 :
2444 : ! Symmetrize P_ij
2445 : mp2_env%ri_grad%P_ij(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ij(ispin)%array + &
2446 920 : TRANSPOSE(mp2_env%ri_grad%P_ij(ispin)%array))
2447 :
2448 24 : itmp = get_limit(virtual(ispin), para_env_sub%num_pe, para_env_sub%mepos)
2449 24 : my_B_start = itmp(1)
2450 24 : my_B_end = itmp(2)
2451 24 : my_B_size = my_B_end - my_B_start + 1
2452 :
2453 96 : ALLOCATE (mp2_env%ri_grad%P_ab(ispin)%array(my_B_size, virtual(ispin)))
2454 10382 : mp2_env%ri_grad%P_ab(ispin)%array = 0.0_dp
2455 :
2456 24 : CALL get_group_dist(rpa_work%gd_virtual(ispin), rpa_work%mepos(1), my_a_start, my_a_end, my_a_size)
2457 24 : CALL release_group_dist(rpa_work%gd_virtual(ispin))
2458 : ! This group dist contains the info which parts of Pab a process currently owns
2459 24 : CALL create_group_dist(gd_a_sub, my_a_start, my_a_end, my_a_size, para_env_sub)
2460 : ! This group dist contains the info which parts of Pab a process is supposed to own later
2461 24 : CALL create_group_dist(gd_virtual_sub, para_env_sub%num_pe, virtual(ispin))
2462 :
2463 : ! Calculate local indices of the common range of own matrix and send process
2464 24 : send_start = MAX(1, my_B_start - my_a_start + 1)
2465 24 : send_end = MIN(my_a_size, my_B_end - my_a_start + 1)
2466 :
2467 : ! Same for recv process but with reverse positions
2468 24 : recv_start = MAX(1, my_a_start - my_B_start + 1)
2469 24 : recv_end = MIN(my_B_size, my_a_end - my_B_start + 1)
2470 :
2471 : mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
2472 10382 : my_scale*rpa_work%P_ab(ispin)%array(send_start:send_end, :)
2473 :
2474 24 : IF (para_env_sub%num_pe > 1) THEN
2475 : size_send_buffer = 0
2476 : size_recv_buffer = 0
2477 8 : DO proc_shift = 1, para_env_sub%num_pe - 1
2478 4 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2479 4 : proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2480 :
2481 4 : CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
2482 4 : CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)
2483 :
2484 : ! Calculate local indices of the common range of own matrix and send process
2485 4 : send_start = MAX(1, send_a_start - my_a_start + 1)
2486 4 : send_end = MIN(my_a_size, send_a_end - my_a_start + 1)
2487 :
2488 4 : size_send_buffer = MAX(size_send_buffer, MAX(send_end - send_start + 1, 0))
2489 :
2490 : ! Same for recv process but with reverse positions
2491 4 : recv_start = MAX(1, recv_a_start - my_B_start + 1)
2492 4 : recv_end = MIN(my_B_size, recv_a_end - my_B_start + 1)
2493 :
2494 8 : size_recv_buffer = MAX(size_recv_buffer, MAX(recv_end - recv_start + 1, 0))
2495 : END DO
2496 28 : ALLOCATE (buffer_send(size_send_buffer, virtual(ispin)), buffer_recv(size_recv_buffer, virtual(ispin)))
2497 :
2498 8 : DO proc_shift = 1, para_env_sub%num_pe - 1
2499 4 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2500 4 : proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2501 :
2502 4 : CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end)
2503 4 : CALL get_group_dist(gd_a_sub, proc_recv, recv_a_start, recv_a_end)
2504 :
2505 : ! Calculate local indices of the common range of own matrix and send process
2506 4 : send_start = MAX(1, send_a_start - my_a_start + 1)
2507 4 : send_end = MIN(my_a_size, send_a_end - my_a_start + 1)
2508 802 : buffer_send(1:MAX(send_end - send_start + 1, 0), :) = rpa_work%P_ab(ispin)%array(send_start:send_end, :)
2509 :
2510 : ! Same for recv process but with reverse positions
2511 4 : recv_start = MAX(1, recv_a_start - my_B_start + 1)
2512 4 : recv_end = MIN(my_B_size, recv_a_end - my_B_start + 1)
2513 :
2514 : CALL para_env_sub%sendrecv(buffer_send(1:MAX(send_end - send_start + 1, 0), :), proc_send, &
2515 1600 : buffer_recv(1:MAX(recv_end - recv_start + 1, 0), :), proc_recv)
2516 :
2517 : mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) = &
2518 : mp2_env%ri_grad%P_ab(ispin)%array(recv_start:recv_end, :) + &
2519 810 : my_scale*buffer_recv(1:MAX(recv_end - recv_start + 1, 0), :)
2520 :
2521 : END DO
2522 :
2523 4 : IF (ALLOCATED(buffer_send)) DEALLOCATE (buffer_send)
2524 4 : IF (ALLOCATED(buffer_recv)) DEALLOCATE (buffer_recv)
2525 : END IF
2526 24 : DEALLOCATE (rpa_work%P_ab(ispin)%array)
2527 :
2528 24 : CALL release_group_dist(gd_a_sub)
2529 :
2530 : BLOCK
2531 : TYPE(mp_comm_type) :: comm_exchange
2532 24 : CALL comm_exchange%from_split(para_env, para_env_sub%mepos)
2533 24 : CALL comm_exchange%sum(mp2_env%ri_grad%P_ab(ispin)%array)
2534 48 : CALL comm_exchange%free()
2535 : END BLOCK
2536 :
2537 : ! Symmetrize P_ab
2538 24 : IF (para_env_sub%num_pe > 1) THEN
2539 : BLOCK
2540 4 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE, TARGET :: buffer_send_1D
2541 4 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: buffer_send
2542 4 : REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: buffer_recv
2543 :
2544 : mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) = &
2545 : 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end) &
2546 804 : + TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, my_B_start:my_B_end)))
2547 :
2548 12 : ALLOCATE (buffer_send_1D(my_B_size*maxsize(gd_virtual_sub)))
2549 16 : ALLOCATE (buffer_recv(my_B_size, maxsize(gd_virtual_sub)))
2550 :
2551 8 : DO proc_shift = 1, para_env_sub%num_pe - 1
2552 :
2553 4 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2554 4 : proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2555 :
2556 4 : CALL get_group_dist(gd_virtual_sub, proc_send, send_a_start, send_a_end, send_a_size)
2557 4 : CALL get_group_dist(gd_virtual_sub, proc_recv, recv_a_start, recv_a_end, recv_a_size)
2558 :
2559 4 : buffer_send(1:send_a_size, 1:my_B_size) => buffer_send_1D(1:my_B_size*send_a_size)
2560 :
2561 402 : buffer_send(:send_a_size, :) = TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array(:, send_a_start:send_a_end))
2562 : CALL para_env_sub%sendrecv(buffer_send(:send_a_size, :), proc_send, &
2563 402 : buffer_recv(:, :recv_a_size), proc_recv)
2564 :
2565 : mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) = &
2566 410 : 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array(:, recv_a_start:recv_a_end) + buffer_recv(:, 1:recv_a_size))
2567 :
2568 : END DO
2569 :
2570 8 : DEALLOCATE (buffer_send_1D, buffer_recv)
2571 : END BLOCK
2572 : ELSE
2573 : mp2_env%ri_grad%P_ab(ispin)%array(:, :) = 0.5_dp*(mp2_env%ri_grad%P_ab(ispin)%array + &
2574 19140 : TRANSPOSE(mp2_env%ri_grad%P_ab(ispin)%array))
2575 : END IF
2576 :
2577 92 : CALL release_group_dist(gd_virtual_sub)
2578 :
2579 : END DO
2580 116 : DEALLOCATE (rpa_work%gd_homo, rpa_work%gd_virtual, rpa_work%P_ij, rpa_work%P_ab)
2581 20 : IF (nspins == 1) THEN
2582 336 : mp2_env%ri_grad%P_ij(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ij(1)%array
2583 5374 : mp2_env%ri_grad%P_ab(1)%array(:, :) = 2.0_dp*mp2_env%ri_grad%P_ab(1)%array
2584 : END IF
2585 :
2586 20 : CALL timestop(handle)
2587 20 : END SUBROUTINE
2588 :
2589 : ! **************************************************************************************************
2590 : !> \brief Dereplicate data from fm_sub and collect in fm_global, overlapping data will be added
2591 : !> \param fm_sub replicated matrix, all subgroups have the same size, will be release on output
2592 : !> \param fm_global global matrix, on output it will contain the sum of the replicated matrices redistributed
2593 : ! **************************************************************************************************
2594 88 : SUBROUTINE dereplicate_and_sum_fm(fm_sub, fm_global)
2595 : TYPE(cp_fm_type), INTENT(INOUT) :: fm_sub, fm_global
2596 :
2597 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dereplicate_and_sum_fm'
2598 :
2599 : INTEGER :: col_local, elements2recv_col, elements2recv_row, elements2send_col, &
2600 : elements2send_row, handle, handle2, mypcol_global, myprow_global, ncol_local_global, &
2601 : ncol_local_sub, npcol_global, npcol_sub, nprow_global, nprow_sub, nrow_local_global, &
2602 : nrow_local_sub, pcol_recv, pcol_send, proc_recv, proc_send, proc_send_global, proc_shift, &
2603 : prow_recv, prow_send, row_local, tag
2604 : INTEGER(int_8) :: size_recv_buffer, size_send_buffer
2605 88 : INTEGER, ALLOCATABLE, DIMENSION(:) :: data2recv_col, data2recv_row, &
2606 88 : data2send_col, data2send_row, &
2607 88 : subgroup2mepos
2608 88 : INTEGER, DIMENSION(:), POINTER :: col_indices_global, col_indices_sub, &
2609 88 : row_indices_global, row_indices_sub
2610 88 : INTEGER, DIMENSION(:, :), POINTER :: blacs2mpi_global, blacs2mpi_sub, &
2611 88 : mpi2blacs_global, mpi2blacs_sub
2612 88 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:), TARGET :: recv_buffer_1D, send_buffer_1D
2613 88 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: recv_buffer, send_buffer
2614 : TYPE(mp_para_env_type), POINTER :: para_env, para_env_sub
2615 88 : TYPE(one_dim_int_array), ALLOCATABLE, DIMENSION(:) :: index2recv_col, index2recv_row, &
2616 88 : index2send_col, index2send_row
2617 :
2618 88 : CALL timeset(routineN, handle)
2619 :
2620 88 : tag = 1
2621 :
2622 88 : nprow_sub = fm_sub%matrix_struct%context%num_pe(1)
2623 88 : npcol_sub = fm_sub%matrix_struct%context%num_pe(2)
2624 :
2625 88 : myprow_global = fm_global%matrix_struct%context%mepos(1)
2626 88 : mypcol_global = fm_global%matrix_struct%context%mepos(2)
2627 88 : nprow_global = fm_global%matrix_struct%context%num_pe(1)
2628 88 : npcol_global = fm_global%matrix_struct%context%num_pe(2)
2629 :
2630 : CALL cp_fm_get_info(fm_sub, col_indices=col_indices_sub, row_indices=row_indices_sub, &
2631 88 : nrow_local=nrow_local_sub, ncol_local=ncol_local_sub)
2632 88 : CALL cp_fm_struct_get(fm_sub%matrix_struct, para_env=para_env_sub)
2633 : CALL cp_fm_struct_get(fm_global%matrix_struct, para_env=para_env, &
2634 : col_indices=col_indices_global, row_indices=row_indices_global, &
2635 88 : nrow_local=nrow_local_global, ncol_local=ncol_local_global)
2636 88 : CALL fm_sub%matrix_struct%context%get(blacs2mpi=blacs2mpi_sub, mpi2blacs=mpi2blacs_sub)
2637 88 : CALL fm_global%matrix_struct%context%get(blacs2mpi=blacs2mpi_global, mpi2blacs=mpi2blacs_global)
2638 :
2639 88 : IF (para_env%num_pe /= para_env_sub%num_pe) THEN
2640 : BLOCK
2641 : TYPE(mp_comm_type) :: comm_exchange
2642 64 : comm_exchange = fm_sub%matrix_struct%context%interconnect(para_env)
2643 64 : CALL comm_exchange%sum(fm_sub%local_data)
2644 128 : CALL comm_exchange%free()
2645 : END BLOCK
2646 : END IF
2647 :
2648 264 : ALLOCATE (subgroup2mepos(0:para_env_sub%num_pe - 1))
2649 88 : CALL para_env_sub%allgather(para_env%mepos, subgroup2mepos)
2650 :
2651 88 : CALL timeset(routineN//"_data2", handle2)
2652 : ! Create a map how much data has to be sent to what process coordinate, interchange rows and columns to transpose the matrices
2653 88 : CALL get_elements2send_col(data2send_col, fm_global%matrix_struct, row_indices_sub, index2send_col)
2654 88 : CALL get_elements2send_row(data2send_row, fm_global%matrix_struct, col_indices_sub, index2send_row)
2655 :
2656 : ! Create a map how much data has to be sent to what process coordinate, interchange rows and columns to transpose the matrices
2657 : ! Do the reverse for the recieve processes
2658 88 : CALL get_elements2send_col(data2recv_col, fm_sub%matrix_struct, row_indices_global, index2recv_col)
2659 88 : CALL get_elements2send_row(data2recv_row, fm_sub%matrix_struct, col_indices_global, index2recv_row)
2660 88 : CALL timestop(handle2)
2661 :
2662 88 : CALL timeset(routineN//"_local", handle2)
2663 : ! Loop over local data and transpose
2664 88 : prow_send = mpi2blacs_global(1, para_env%mepos)
2665 88 : pcol_send = mpi2blacs_global(2, para_env%mepos)
2666 88 : prow_recv = mpi2blacs_sub(1, para_env_sub%mepos)
2667 88 : pcol_recv = mpi2blacs_sub(2, para_env_sub%mepos)
2668 88 : elements2recv_col = data2recv_col(pcol_recv)
2669 88 : elements2recv_row = data2recv_row(prow_recv)
2670 :
2671 : !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
2672 : !$OMP SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
2673 : !$OMP index2recv_col,index2recv_row,pcol_recv,prow_recv, &
2674 88 : !$OMP fm_sub,index2send_col,index2send_row,pcol_send,prow_send)
2675 : DO col_local = 1, elements2recv_col
2676 : DO row_local = 1, elements2recv_row
2677 : fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
2678 : index2recv_row(prow_recv)%array(row_local)) &
2679 : = fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
2680 : index2send_row(prow_send)%array(col_local))
2681 : END DO
2682 : END DO
2683 : !$OMP END PARALLEL DO
2684 88 : CALL timestop(handle2)
2685 :
2686 88 : IF (para_env_sub%num_pe > 1) THEN
2687 : size_send_buffer = 0_int_8
2688 : size_recv_buffer = 0_int_8
2689 : ! Loop over all processes in para_env_sub
2690 48 : DO proc_shift = 1, para_env_sub%num_pe - 1
2691 24 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2692 24 : proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2693 :
2694 24 : proc_send_global = subgroup2mepos(proc_send)
2695 24 : prow_send = mpi2blacs_global(1, proc_send_global)
2696 24 : pcol_send = mpi2blacs_global(2, proc_send_global)
2697 24 : elements2send_col = data2send_col(pcol_send)
2698 24 : elements2send_row = data2send_row(prow_send)
2699 :
2700 24 : size_send_buffer = MAX(size_send_buffer, INT(elements2send_col, int_8)*elements2send_row)
2701 :
2702 24 : prow_recv = mpi2blacs_sub(1, proc_recv)
2703 24 : pcol_recv = mpi2blacs_sub(2, proc_recv)
2704 24 : elements2recv_col = data2recv_col(pcol_recv)
2705 24 : elements2recv_row = data2recv_row(prow_recv)
2706 :
2707 48 : size_recv_buffer = MAX(size_recv_buffer, INT(elements2recv_col, int_8)*elements2recv_row)
2708 : END DO
2709 120 : ALLOCATE (send_buffer_1D(size_send_buffer), recv_buffer_1D(size_recv_buffer))
2710 :
2711 : ! Loop over all processes in para_env_sub
2712 48 : DO proc_shift = 1, para_env_sub%num_pe - 1
2713 24 : proc_send = MODULO(para_env_sub%mepos + proc_shift, para_env_sub%num_pe)
2714 24 : proc_recv = MODULO(para_env_sub%mepos - proc_shift, para_env_sub%num_pe)
2715 :
2716 24 : proc_send_global = subgroup2mepos(proc_send)
2717 24 : prow_send = mpi2blacs_global(1, proc_send_global)
2718 24 : pcol_send = mpi2blacs_global(2, proc_send_global)
2719 24 : elements2send_col = data2send_col(pcol_send)
2720 24 : elements2send_row = data2send_row(prow_send)
2721 :
2722 24 : CALL timeset(routineN//"_pack", handle2)
2723 : ! Loop over local data and pack the buffer
2724 : ! Transpose the matrix already
2725 24 : send_buffer(1:elements2send_row, 1:elements2send_col) => send_buffer_1D(1:INT(elements2send_row, int_8)*elements2send_col)
2726 : !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
2727 : !$OMP SHARED(elements2send_col,elements2send_row,send_buffer,fm_sub,&
2728 24 : !$OMP index2send_col,index2send_row,pcol_send,prow_send)
2729 : DO row_local = 1, elements2send_col
2730 : DO col_local = 1, elements2send_row
2731 : send_buffer(col_local, row_local) = &
2732 : fm_sub%local_data(index2send_col(pcol_send)%array(row_local), &
2733 : index2send_row(prow_send)%array(col_local))
2734 : END DO
2735 : END DO
2736 : !$OMP END PARALLEL DO
2737 24 : CALL timestop(handle2)
2738 :
2739 24 : prow_recv = mpi2blacs_sub(1, proc_recv)
2740 24 : pcol_recv = mpi2blacs_sub(2, proc_recv)
2741 24 : elements2recv_col = data2recv_col(pcol_recv)
2742 24 : elements2recv_row = data2recv_row(prow_recv)
2743 :
2744 : ! Send data
2745 24 : recv_buffer(1:elements2recv_col, 1:elements2recv_row) => recv_buffer_1D(1:INT(elements2recv_row, int_8)*elements2recv_col)
2746 120 : IF (SIZE(recv_buffer) > 0_int_8) THEN
2747 72 : IF (SIZE(send_buffer) > 0_int_8) THEN
2748 81072 : CALL para_env_sub%sendrecv(send_buffer, proc_send, recv_buffer, proc_recv, tag)
2749 : ELSE
2750 0 : CALL para_env_sub%recv(recv_buffer, proc_recv, tag)
2751 : END IF
2752 :
2753 24 : CALL timeset(routineN//"_unpack", handle2)
2754 : !$OMP PARALLEL DO DEFAULT(NONE) PRIVATE(row_local,col_local) &
2755 : !$OMP SHARED(elements2recv_col,elements2recv_row,recv_buffer,fm_global,&
2756 24 : !$OMP index2recv_col,index2recv_row,pcol_recv,prow_recv)
2757 : DO col_local = 1, elements2recv_col
2758 : DO row_local = 1, elements2recv_row
2759 : fm_global%local_data(index2recv_col(pcol_recv)%array(col_local), &
2760 : index2recv_row(prow_recv)%array(row_local)) &
2761 : = recv_buffer(col_local, row_local)
2762 : END DO
2763 : END DO
2764 : !$OMP END PARALLEL DO
2765 24 : CALL timestop(handle2)
2766 0 : ELSE IF (SIZE(send_buffer) > 0_int_8) THEN
2767 0 : CALL para_env_sub%send(send_buffer, proc_send, tag)
2768 : END IF
2769 : END DO
2770 : END IF
2771 :
2772 88 : DEALLOCATE (data2send_col, data2send_row, data2recv_col, data2recv_row)
2773 176 : DO proc_shift = 0, npcol_global - 1
2774 176 : DEALLOCATE (index2send_col(proc_shift)%array)
2775 : END DO
2776 176 : DO proc_shift = 0, npcol_sub - 1
2777 176 : DEALLOCATE (index2recv_col(proc_shift)%array)
2778 : END DO
2779 264 : DO proc_shift = 0, nprow_global - 1
2780 264 : DEALLOCATE (index2send_row(proc_shift)%array)
2781 : END DO
2782 200 : DO proc_shift = 0, nprow_sub - 1
2783 200 : DEALLOCATE (index2recv_row(proc_shift)%array)
2784 : END DO
2785 664 : DEALLOCATE (index2send_col, index2recv_col, index2send_row, index2recv_row)
2786 :
2787 88 : CALL cp_fm_release(fm_sub)
2788 :
2789 88 : CALL timestop(handle)
2790 :
2791 440 : END SUBROUTINE dereplicate_and_sum_fm
2792 :
2793 : ! **************************************************************************************************
2794 : !> \brief ...
2795 : !> \param data2send ...
2796 : !> \param struct_global ...
2797 : !> \param indices_sub ...
2798 : !> \param index2send ...
2799 : ! **************************************************************************************************
2800 176 : SUBROUTINE get_elements2send_col(data2send, struct_global, indices_sub, index2send)
2801 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: data2send
2802 : TYPE(cp_fm_struct_type), INTENT(INOUT) :: struct_global
2803 : INTEGER, DIMENSION(:), INTENT(IN) :: indices_sub
2804 : TYPE(one_dim_int_array), ALLOCATABLE, &
2805 : DIMENSION(:), INTENT(OUT) :: index2send
2806 :
2807 : INTEGER :: i_global, i_local, np_global, proc
2808 :
2809 176 : CALL struct_global%context%get(number_of_process_rows=np_global)
2810 :
2811 528 : ALLOCATE (data2send(0:np_global - 1))
2812 464 : data2send = 0
2813 10436 : DO i_local = 1, SIZE(indices_sub)
2814 10260 : i_global = indices_sub(i_local)
2815 10260 : proc = struct_global%g2p_col(i_global)
2816 10436 : data2send(proc) = data2send(proc) + 1
2817 : END DO
2818 :
2819 816 : ALLOCATE (index2send(0:np_global - 1))
2820 464 : DO proc = 0, np_global - 1
2821 752 : ALLOCATE (index2send(proc)%array(data2send(proc)))
2822 : ! We want to crash if there is an error
2823 10724 : index2send(proc)%array = -1
2824 : END DO
2825 :
2826 464 : data2send = 0
2827 10436 : DO i_local = 1, SIZE(indices_sub)
2828 10260 : i_global = indices_sub(i_local)
2829 10260 : proc = struct_global%g2p_col(i_global)
2830 10260 : data2send(proc) = data2send(proc) + 1
2831 10436 : index2send(proc)%array(data2send(proc)) = i_local
2832 : END DO
2833 :
2834 176 : END SUBROUTINE
2835 :
2836 : ! **************************************************************************************************
2837 : !> \brief ...
2838 : !> \param data2send ...
2839 : !> \param struct_global ...
2840 : !> \param indices_sub ...
2841 : !> \param index2send ...
2842 : ! **************************************************************************************************
2843 176 : SUBROUTINE get_elements2send_row(data2send, struct_global, indices_sub, index2send)
2844 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(OUT) :: data2send
2845 : TYPE(cp_fm_struct_type), INTENT(INOUT) :: struct_global
2846 : INTEGER, DIMENSION(:), INTENT(IN) :: indices_sub
2847 : TYPE(one_dim_int_array), ALLOCATABLE, &
2848 : DIMENSION(:), INTENT(OUT) :: index2send
2849 :
2850 : INTEGER :: i_global, i_local, np_global, proc
2851 :
2852 176 : CALL struct_global%context%get(number_of_process_rows=np_global)
2853 :
2854 528 : ALLOCATE (data2send(0:np_global - 1))
2855 464 : data2send = 0
2856 15048 : DO i_local = 1, SIZE(indices_sub)
2857 14872 : i_global = indices_sub(i_local)
2858 14872 : proc = struct_global%g2p_row(i_global)
2859 15048 : data2send(proc) = data2send(proc) + 1
2860 : END DO
2861 :
2862 816 : ALLOCATE (index2send(0:np_global - 1))
2863 464 : DO proc = 0, np_global - 1
2864 864 : ALLOCATE (index2send(proc)%array(data2send(proc)))
2865 : ! We want to crash if there is an error
2866 15336 : index2send(proc)%array = -1
2867 : END DO
2868 :
2869 464 : data2send = 0
2870 15048 : DO i_local = 1, SIZE(indices_sub)
2871 14872 : i_global = indices_sub(i_local)
2872 14872 : proc = struct_global%g2p_row(i_global)
2873 14872 : data2send(proc) = data2send(proc) + 1
2874 15048 : index2send(proc)%array(data2send(proc)) = i_local
2875 : END DO
2876 :
2877 176 : END SUBROUTINE
2878 :
2879 0 : END MODULE rpa_grad
|