Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \note
10 : !> Basic type for real space grid methods
11 : !> \par History
12 : !> JGH (22-May-2002) : New routine rs_grid_zero
13 : !> JGH (12-Jun-2002) : Bug fix for mpi groups
14 : !> JGH (19-Jun-2003) : Added routine for task distribution
15 : !> JGH (23-Nov-2003) : Added routine for task loop separation
16 : !> \author JGH (18-Mar-2001)
17 : ! **************************************************************************************************
18 : MODULE realspace_grid_types
19 : USE cp_array_utils, ONLY: cp_1d_r_p_type
20 : USE cp_log_handling, ONLY: cp_to_string
21 : USE kahan_sum, ONLY: accurate_sum
22 : USE kinds, ONLY: dp,&
23 : int_8
24 : USE machine, ONLY: m_memory
25 : USE mathlib, ONLY: det_3x3
26 : USE message_passing, ONLY: mp_comm_null,&
27 : mp_comm_type,&
28 : mp_request_null,&
29 : mp_request_type,&
30 : mp_waitall,&
31 : mp_waitany
32 : USE offload_api, ONLY: offload_buffer_type,&
33 : offload_create_buffer,&
34 : offload_free_buffer
35 : USE pw_grid_types, ONLY: PW_MODE_LOCAL,&
36 : pw_grid_type
37 : USE pw_grids, ONLY: pw_grid_release,&
38 : pw_grid_retain
39 : USE pw_methods, ONLY: pw_integrate_function
40 : USE pw_types, ONLY: pw_r3d_rs_type
41 : USE util, ONLY: get_limit
42 :
43 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
44 :
45 : #include "../base/base_uses.f90"
46 :
47 : IMPLICIT NONE
48 :
49 : PRIVATE
50 : PUBLIC :: realspace_grid_type, &
51 : realspace_grid_desc_type, &
52 : realspace_grid_p_type, &
53 : realspace_grid_desc_p_type, &
54 : realspace_grid_input_type
55 :
56 : PUBLIC :: transfer_rs2pw, &
57 : transfer_pw2rs, &
58 : rs_grid_zero, &
59 : rs_grid_set_box, &
60 : rs_grid_create, &
61 : rs_grid_create_descriptor, &
62 : rs_grid_retain_descriptor, &
63 : rs_grid_release, &
64 : rs_grid_release_descriptor, &
65 : rs_grid_reorder_ranks, &
66 : rs_grid_print, &
67 : rs_grid_locate_rank, &
68 : rs_grid_max_ngpts, &
69 : rs_grid_mult_and_add, &
70 : map_gaussian_here
71 :
72 : INTEGER, PARAMETER, PUBLIC :: rsgrid_distributed = 0, &
73 : rsgrid_replicated = 1, &
74 : rsgrid_automatic = 2
75 :
76 : LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
77 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'realspace_grid_types'
78 :
79 : ! **************************************************************************************************
80 : TYPE realspace_grid_input_type
81 : INTEGER :: distribution_type = rsgrid_replicated
82 : INTEGER :: distribution_layout(3) = -1
83 : REAL(KIND=dp) :: memory_factor = 0.0_dp
84 : LOGICAL :: lock_distribution = .FALSE.
85 : INTEGER :: nsmax = -1
86 : REAL(KIND=dp) :: halo_reduction_factor = 1.0_dp
87 : END TYPE realspace_grid_input_type
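! Editorial usage sketch (not part of the coverage run): the input type above steers how
! rs_grid_create_descriptor lays out the grid. For example, to request a distributed grid
! that is cut along z only, one could fill it as
!   TYPE(realspace_grid_input_type) :: settings
!   settings%distribution_type = rsgrid_distributed
!   settings%distribution_layout = (/1, 1, -1/)   ! -1 leaves that direction to the heuristic
! With rsgrid_automatic the descriptor may instead fall back to a replicated grid when the
! estimated memory saving (controlled by memory_factor) is too small.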
88 :
89 : ! **************************************************************************************************
90 : TYPE realspace_grid_desc_type
91 : TYPE(pw_grid_type), POINTER :: pw => NULL() ! the pw grid
92 :
93 : INTEGER :: ref_count = 0 ! reference count
94 :
95 : INTEGER(int_8) :: ngpts = 0_int_8 ! # grid points
96 : INTEGER, DIMENSION(3) :: npts = 0 ! # grid points per dimension
97 : INTEGER, DIMENSION(3) :: lb = 0 ! lower bounds
98 : INTEGER, DIMENSION(3) :: ub = 0 ! upper bounds
99 :
100 : INTEGER :: border = 0 ! border points
101 :
102 : INTEGER, DIMENSION(3) :: perd = -1 ! periodicity enforced
103 : REAL(KIND=dp), DIMENSION(3, 3) :: dh = 0.0_dp ! incremental grid matrix
104 : REAL(KIND=dp), DIMENSION(3, 3) :: dh_inv = 0.0_dp ! inverse incremental grid matrix
105 : LOGICAL :: orthorhombic = .TRUE. ! grid symmetry
106 :
107 : LOGICAL :: parallel = .TRUE. ! whether the corresponding pw grid is distributed
108 : LOGICAL :: distributed = .TRUE. ! whether the rs grid is distributed
109 : ! these MPI-related quantities are only meaningful for certain grid layouts;
110 : ! they are most useful for fully distributed grids, where they reflect the topology of the grid
111 : TYPE(mp_comm_type) :: group = mp_comm_null
112 : INTEGER :: my_pos = -1
113 : INTEGER :: group_size = 0
114 : INTEGER, DIMENSION(3) :: group_dim = -1
115 : INTEGER, DIMENSION(3) :: group_coor = -1
116 : INTEGER, DIMENSION(3) :: neighbours = -1
117 : ! only meaningful on distributed grids
118 : ! a list of bounds for each CPU
119 : INTEGER, DIMENSION(:, :), ALLOCATABLE :: lb_global
120 : INTEGER, DIMENSION(:, :), ALLOCATABLE :: ub_global
121 : ! a mapping from linear rank to 3d coord
122 : INTEGER, DIMENSION(:, :), ALLOCATABLE :: rank2coord
123 : INTEGER, DIMENSION(:, :, :), ALLOCATABLE :: coord2rank
124 : ! a mapping from grid index to process coordinate (makes it easy to figure out on which rank a given grid point resides)
125 : INTEGER, DIMENSION(:), ALLOCATABLE :: x2coord
126 : INTEGER, DIMENSION(:), ALLOCATABLE :: y2coord
127 : INTEGER, DIMENSION(:), ALLOCATABLE :: z2coord
128 :
129 : INTEGER :: my_virtual_pos = -1
130 : INTEGER, DIMENSION(3) :: virtual_group_coor = -1
131 :
132 : INTEGER, DIMENSION(:), ALLOCATABLE :: virtual2real, real2virtual
133 :
134 : END TYPE realspace_grid_desc_type
135 :
136 : TYPE realspace_grid_type
137 :
138 : TYPE(realspace_grid_desc_type), POINTER :: desc => NULL()
139 :
140 : INTEGER :: ngpts_local = -1 ! local dimensions
141 : INTEGER, DIMENSION(3) :: npts_local = -1
142 : INTEGER, DIMENSION(3) :: lb_local = -1
143 : INTEGER, DIMENSION(3) :: ub_local = -1
144 : INTEGER, DIMENSION(3) :: lb_real = -1 ! lower bounds of the real local data
145 : INTEGER, DIMENSION(3) :: ub_real = -1 ! upper bounds of the real local data
146 :
147 : INTEGER, DIMENSION(:), ALLOCATABLE :: px, py, pz ! index translators
148 : TYPE(offload_buffer_type) :: buffer = offload_buffer_type() ! owner of the grid's memory
149 : REAL(KIND=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r => NULL() ! the grid (pointer to buffer%host_buffer)
150 :
151 : END TYPE realspace_grid_type
152 :
153 : ! **************************************************************************************************
154 : TYPE realspace_grid_p_type
155 : TYPE(realspace_grid_type), POINTER :: rs_grid => NULL()
156 : END TYPE realspace_grid_p_type
157 :
158 : TYPE realspace_grid_desc_p_type
159 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc => NULL()
160 : END TYPE realspace_grid_desc_p_type
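! Editorial note: these single-component wrapper types exist because Fortran has no direct
! arrays of pointers; an array of realspace_grid_p_type provides one grid pointer per element.
! Hypothetical sketch (ngrid_levels, igrid and my_rs_grid are illustrative names only):
!   TYPE(realspace_grid_p_type), DIMENSION(:), ALLOCATABLE :: rs_grids
!   ALLOCATE (rs_grids(ngrid_levels))
!   rs_grids(igrid)%rs_grid => my_rs_grid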
161 :
162 : CONTAINS
163 :
164 : ! **************************************************************************************************
165 : !> \brief returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in
166 : !> only possible if rs_grid is a distributed grid
167 : !> \param rs_desc ...
168 : !> \param rank_in ...
169 : !> \param shift ...
170 : !> \return ...
171 : ! **************************************************************************************************
172 2346 : PURE FUNCTION rs_grid_locate_rank(rs_desc, rank_in, shift) RESULT(rank_out)
173 : TYPE(realspace_grid_desc_type), INTENT(IN) :: rs_desc
174 : INTEGER, INTENT(IN) :: rank_in
175 : INTEGER, DIMENSION(3), INTENT(IN) :: shift
176 : INTEGER :: rank_out
177 :
178 : INTEGER :: coord(3)
179 :
180 9384 : coord = MODULO(rs_desc%rank2coord(:, rank_in) + shift, rs_desc%group_dim)
181 2346 : rank_out = rs_desc%coord2rank(coord(1), coord(2), coord(3))
182 2346 : END FUNCTION rs_grid_locate_rank
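! Editorial usage sketch (assumes a fully distributed descriptor, as required above):
! the rank of the neighbour one step along +z, with periodic wrap-around, would be
!   rank_up = rs_grid_locate_rank(rs_desc, rs_desc%my_pos, (/0, 0, 1/))
! the MODULO keeps the shifted coordinate inside group_dim, so shifts larger than the
! process grid wrap around rather than running out of bounds.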
183 :
184 : ! **************************************************************************************************
185 : !> \brief Determine the setup of real space grids - this is divided up into the
186 : !> creation of a descriptor and the actual grid itself (see rs_grid_create)
187 : !> \param desc ...
188 : !> \param pw_grid ...
189 : !> \param input_settings ...
190 : !> \param border_points ...
191 : !> \par History
192 : !> JGH (08-Jun-2003) : nsmax <= 0 indicates fully replicated grid
193 : !> Iain Bethune (05-Sep-2008) : modified cut heuristic
194 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2008 on behalf of the HECToR project
195 : !> - Create a descriptor for realspace grids with a number of border
196 : !> points as exactly given by the optional argument border_points.
197 : !> These grids are always distributed.
198 : !> (27.11.2013, Matthias Krack)
199 : !> \author JGH (18-Mar-2001)
200 : ! **************************************************************************************************
201 31718 : SUBROUTINE rs_grid_create_descriptor(desc, pw_grid, input_settings, border_points)
202 : TYPE(realspace_grid_desc_type), POINTER :: desc
203 : TYPE(pw_grid_type), INTENT(INOUT), TARGET :: pw_grid
204 : TYPE(realspace_grid_input_type), INTENT(IN) :: input_settings
205 : INTEGER, INTENT(IN), OPTIONAL :: border_points
206 :
207 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create_descriptor'
208 :
209 : INTEGER :: border_size, dir, handle, i, j, k, l, &
210 : lb(2), min_npts_real, n_slices(3), &
211 : n_slices_tmp(3), nmin
212 : LOGICAL :: overlap
213 : REAL(KIND=dp) :: ratio, ratio_best, volume, volume_dist
214 :
215 31718 : CALL timeset(routineN, handle)
216 :
217 31718 : IF (PRESENT(border_points)) THEN
218 128 : border_size = border_points
219 : ELSE
220 : border_size = 0
221 : END IF
222 :
223 1554182 : ALLOCATE (desc)
224 :
225 31718 : CALL pw_grid%para%group%sync()
226 :
227 31718 : desc%pw => pw_grid
228 31718 : CALL pw_grid_retain(desc%pw)
229 :
230 412334 : desc%dh = pw_grid%dh
231 412334 : desc%dh_inv = pw_grid%dh_inv
232 31718 : desc%orthorhombic = pw_grid%orthorhombic
233 31718 : desc%ref_count = 1
234 :
235 31718 : IF (pw_grid%para%mode == PW_MODE_LOCAL) THEN
236 : ! The corresponding group has dimension 1
237 : ! All operations will be done locally
238 16328 : desc%npts = pw_grid%npts
239 16328 : desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
240 16328 : desc%lb = pw_grid%bounds(1, :)
241 16328 : desc%ub = pw_grid%bounds(2, :)
242 4082 : desc%border = border_size
243 4082 : IF (border_size == 0) THEN
244 16328 : desc%perd = 1
245 : ELSE
246 0 : desc%perd = 0
247 : END IF
248 4082 : desc%parallel = .FALSE.
249 4082 : desc%distributed = .FALSE.
250 4082 : desc%group = mp_comm_null
251 4082 : desc%group_size = 1
252 16328 : desc%group_dim = 1
253 16328 : desc%group_coor = 0
254 4082 : desc%my_pos = 0
255 : ELSE
256 : ! group size of desc grid
257 : ! global grid dimensions are still the same
258 27636 : desc%group_size = pw_grid%para%group%num_pe
259 110544 : desc%npts = pw_grid%npts
260 110544 : desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
261 110544 : desc%lb = pw_grid%bounds(1, :)
262 110544 : desc%ub = pw_grid%bounds(2, :)
263 :
264 : ! this is the eventual border size
265 27636 : IF (border_size == 0) THEN
266 27508 : nmin = (input_settings%nsmax + 1)/2
267 27508 : nmin = MAX(0, NINT(nmin*input_settings%halo_reduction_factor))
268 : ELSE
269 : ! Set explicitly the requested border size
270 : nmin = border_size
271 : END IF
272 :
273 27636 : IF (input_settings%distribution_type == rsgrid_replicated) THEN
274 :
275 44280 : n_slices = 1
276 11070 : IF (border_size > 0) THEN
277 : CALL cp_abort(__LOCATION__, &
278 : "An explicit border size > 0 is not yet working for "// &
279 : "replicated realspace grids. Request DISTRIBUTION_TYPE "// &
280 0 : "distributed for RS_GRID explicitly.")
281 : END IF
282 :
283 : ELSE
284 :
285 66264 : n_slices = 1
286 16566 : ratio_best = -HUGE(ratio_best)
287 :
288 : ! don't allow distributions with more processors than real grid points
289 49698 : DO k = 1, MIN(desc%npts(3), desc%group_size)
290 115962 : DO j = 1, MIN(desc%npts(2), desc%group_size)
291 66264 : i = MIN(desc%npts(1), desc%group_size/(j*k))
292 265056 : n_slices_tmp = (/i, j, k/)
293 :
294 : ! skip factorizations that do not match the actual number of CPUs
295 265056 : IF (PRODUCT(n_slices_tmp) .NE. desc%group_size) CYCLE
296 :
297 : ! check whether an input constraint has been given,
298 : ! i.e. if the layout is not -1 we need to fulfil it
299 397842 : IF (.NOT. ALL(PACK(n_slices_tmp == input_settings%distribution_layout, &
300 : (/-1, -1, -1/) /= input_settings%distribution_layout) &
301 49698 : )) CYCLE
302 :
303 : ! We can not work with a grid that has more local than global grid points.
304 : ! This can happen when a halo region wraps around and overlaps with the other halo.
305 49472 : overlap = .FALSE.
306 197888 : DO dir = 1, 3
307 197888 : IF (n_slices_tmp(dir) > 1) THEN
308 148416 : DO l = 0, n_slices_tmp(dir) - 1
309 98944 : lb = get_limit(desc%npts(dir), n_slices_tmp(dir), l)
310 148416 : IF (lb(2) - lb(1) + 1 + 2*nmin > desc%npts(dir)) overlap = .TRUE.
311 : END DO
312 : END IF
313 : END DO
314 49472 : IF (overlap) CYCLE
315 :
316 : ! a heuristic optimisation to reduce the memory usage
317 : ! we go for the smallest local to real volume
318 : ! volume of the box without the wings / volume of the box with the wings
319 : ! with prefactors to promote fewer cuts in the Z dimension
320 : ratio = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp)/ &
321 : PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp + &
322 58716 : MERGE((/0.0, 0.0, 0.0/), 2*(/1.06*nmin, 1.05*nmin, 1.03*nmin/), n_slices_tmp == (/1, 1, 1/)))
323 41520 : IF (ratio > ratio_best) THEN
324 8372 : ratio_best = ratio
325 8372 : n_slices = n_slices_tmp
326 : END IF
327 :
328 : END DO
329 : END DO
330 :
331 : ! if automatic we can still decide this is a replicated grid
332 : ! if the memory gain (or the gain in messages) is too small.
333 16566 : IF (input_settings%distribution_type == rsgrid_automatic) THEN
334 64064 : volume = PRODUCT(REAL(desc%npts, KIND=dp))
335 : volume_dist = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices + &
336 64064 : MERGE((/0, 0, 0/), 2*(/nmin, nmin, nmin/), n_slices == (/1, 1, 1/)))
337 16016 : IF (volume < volume_dist*input_settings%memory_factor) THEN
338 64064 : n_slices = 1
339 : END IF
340 : END IF
341 :
342 : END IF
343 :
344 110544 : desc%group_dim(:) = n_slices(:)
345 27636 : CALL desc%group%from_dup(pw_grid%para%group)
346 27636 : desc%group_size = desc%group%num_pe
347 27636 : desc%my_pos = desc%group%mepos
348 :
349 110366 : IF (ALL(n_slices == 1)) THEN
350 : ! CASE 1 : only one slice: we do not need overlapping regions and special
351 : ! recombination of the total density
352 27478 : desc%border = border_size
353 27478 : IF (border_size == 0) THEN
354 109912 : desc%perd = 1
355 : ELSE
356 0 : desc%perd = 0
357 : END IF
358 27478 : desc%distributed = .FALSE.
359 27478 : desc%parallel = .TRUE.
360 109912 : desc%group_coor(:) = 0
361 27478 : desc%my_virtual_pos = 0
362 :
363 82434 : ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
364 82434 : ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
365 : ! Start with no reordering
366 82434 : DO i = 0, desc%group_size - 1
367 54956 : desc%virtual2real(i) = i
368 82434 : desc%real2virtual(i) = i
369 : END DO
370 : ELSE
371 : ! CASE 2 : general case
372 : ! periodicity is no longer enforced in arbitrary directions
373 158 : IF (border_size == 0) THEN
374 120 : desc%perd = 1
375 120 : DO dir = 1, 3
376 120 : IF (n_slices(dir) > 1) desc%perd(dir) = 0
377 : END DO
378 : ELSE
379 512 : desc%perd(:) = 0
380 : END IF
381 : ! we keep a border of nmin points
382 158 : desc%border = nmin
383 : ! we are going parallel on the real space grid
384 158 : desc%parallel = .TRUE.
385 158 : desc%distributed = .TRUE.
386 :
387 : ! set up global info about the distribution
388 474 : ALLOCATE (desc%rank2coord(3, 0:desc%group_size - 1))
389 790 : ALLOCATE (desc%coord2rank(0:desc%group_dim(1) - 1, 0:desc%group_dim(2) - 1, 0:desc%group_dim(3) - 1))
390 474 : ALLOCATE (desc%lb_global(3, 0:desc%group_size - 1))
391 474 : ALLOCATE (desc%ub_global(3, 0:desc%group_size - 1))
392 474 : ALLOCATE (desc%x2coord(desc%lb(1):desc%ub(1)))
393 474 : ALLOCATE (desc%y2coord(desc%lb(2):desc%ub(2)))
394 474 : ALLOCATE (desc%z2coord(desc%lb(3):desc%ub(3)))
395 :
396 474 : DO i = 0, desc%group_size - 1
397 : ! Calculate coordinates in a row-major order (to be SMP-friendly)
398 316 : desc%rank2coord(1, i) = i/(desc%group_dim(2)*desc%group_dim(3))
399 : desc%rank2coord(2, i) = MODULO(i, desc%group_dim(2)*desc%group_dim(3)) &
400 316 : /desc%group_dim(3)
401 316 : desc%rank2coord(3, i) = MODULO(i, desc%group_dim(3))
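! Editorial worked example of this row-major mapping: with group_dim = (/2, 3, 4/),
! rank i = 17 decomposes as 17/(3*4) = 1, MODULO(17,12)/4 = 1, MODULO(17,4) = 1,
! i.e. coordinate (1,1,1); the last dimension varies fastest.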
402 :
403 316 : IF (i == desc%my_pos) THEN
404 632 : desc%group_coor = desc%rank2coord(:, i)
405 : END IF
406 :
407 316 : desc%coord2rank(desc%rank2coord(1, i), desc%rank2coord(2, i), desc%rank2coord(3, i)) = i
408 : ! the lb_global and ub_global correspond to lb_real and ub_real of each task
409 1264 : desc%lb_global(:, i) = desc%lb
410 1264 : desc%ub_global(:, i) = desc%ub
411 1422 : DO dir = 1, 3
412 1264 : IF (desc%group_dim(dir) .GT. 1) THEN
413 316 : lb = get_limit(desc%npts(dir), desc%group_dim(dir), desc%rank2coord(dir, i))
414 316 : desc%lb_global(dir, i) = lb(1) + desc%lb(dir) - 1
415 316 : desc%ub_global(dir, i) = lb(2) + desc%lb(dir) - 1
416 : END IF
417 : END DO
418 : END DO
419 :
420 : ! map a grid point to a CPU coord
421 632 : DO dir = 1, 3
422 1264 : DO l = 0, desc%group_dim(dir) - 1
423 632 : IF (desc%group_dim(dir) .GT. 1) THEN
424 316 : lb = get_limit(desc%npts(dir), desc%group_dim(dir), l)
425 948 : lb = lb + desc%lb(dir) - 1
426 : ELSE
427 316 : lb(1) = desc%lb(dir)
428 316 : lb(2) = desc%ub(dir)
429 : END IF
430 474 : SELECT CASE (dir)
431 : CASE (1)
432 11696 : desc%x2coord(lb(1):lb(2)) = l
433 : CASE (2)
434 12104 : desc%y2coord(lb(1):lb(2)) = l
435 : CASE (3)
436 12200 : desc%z2coord(lb(1):lb(2)) = l
437 : END SELECT
438 : END DO
439 : END DO
440 :
441 : ! an upper bound for the number of neighbours the border is overlapping with
442 632 : DO dir = 1, 3
443 474 : desc%neighbours(dir) = 0
444 632 : IF ((n_slices(dir) > 1) .OR. (border_size > 0)) THEN
445 414 : min_npts_real = HUGE(0)
446 986 : DO l = 0, n_slices(dir) - 1
447 572 : lb = get_limit(desc%npts(dir), n_slices(dir), l)
448 986 : min_npts_real = MIN(lb(2) - lb(1) + 1, min_npts_real)
449 : END DO
450 414 : desc%neighbours(dir) = (desc%border + min_npts_real - 1)/min_npts_real
451 : END IF
452 : END DO
453 :
454 474 : ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
455 474 : ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
456 : ! Start with no reordering
457 474 : DO i = 0, desc%group_size - 1
458 316 : desc%virtual2real(i) = i
459 474 : desc%real2virtual(i) = i
460 : END DO
461 :
462 158 : desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
463 632 : desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
464 :
465 : END IF
466 : END IF
467 :
468 31718 : CALL timestop(handle)
469 :
470 31718 : END SUBROUTINE rs_grid_create_descriptor
471 :
472 : ! **************************************************************************************************
473 : !> \brief ...
474 : !> \param rs ...
475 : !> \param desc ...
476 : ! **************************************************************************************************
477 4926054 : SUBROUTINE rs_grid_create(rs, desc)
478 : TYPE(realspace_grid_type), INTENT(OUT) :: rs
479 : TYPE(realspace_grid_desc_type), INTENT(INOUT), &
480 : TARGET :: desc
481 :
482 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create'
483 :
484 : INTEGER :: handle
485 :
486 234574 : CALL timeset(routineN, handle)
487 :
488 234574 : rs%desc => desc
489 234574 : CALL rs_grid_retain_descriptor(rs%desc)
490 :
491 234574 : IF (desc%pw%para%mode == PW_MODE_LOCAL) THEN
492 : ! The corresponding group has dimension 1
493 : ! All operations will be done locally
494 63928 : rs%lb_real = desc%lb
495 63928 : rs%ub_real = desc%ub
496 63928 : rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
497 63928 : rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
498 63928 : rs%npts_local = rs%ub_local - rs%lb_local + 1
499 63928 : rs%ngpts_local = PRODUCT(rs%npts_local)
500 : END IF
501 :
502 935924 : IF (ALL(rs%desc%group_dim == 1)) THEN
503 : ! CASE 1 : only one slice: we do not need overlapping regions and special
504 : ! recombination of the total density
505 931288 : rs%lb_real = desc%lb
506 931288 : rs%ub_real = desc%ub
507 931288 : rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
508 931288 : rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
509 931288 : rs%npts_local = rs%ub_local - rs%lb_local + 1
510 931288 : rs%ngpts_local = PRODUCT(rs%npts_local)
511 : ELSE
512 : ! CASE 2 : general case
513 : ! extract some more derived quantities about the local grid
514 7008 : rs%lb_real = desc%lb_global(:, desc%my_virtual_pos)
515 7008 : rs%ub_real = desc%ub_global(:, desc%my_virtual_pos)
516 7008 : rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
517 7008 : rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
518 7008 : rs%npts_local = rs%ub_local - rs%lb_local + 1
519 7008 : rs%ngpts_local = PRODUCT(rs%npts_local)
520 : END IF
521 :
522 234574 : CALL offload_create_buffer(rs%ngpts_local, rs%buffer)
523 : rs%r(rs%lb_local(1):rs%ub_local(1), &
524 : rs%lb_local(2):rs%ub_local(2), &
525 234574 : rs%lb_local(3):rs%ub_local(3)) => rs%buffer%host_buffer
526 :
527 703722 : ALLOCATE (rs%px(desc%npts(1)))
528 703722 : ALLOCATE (rs%py(desc%npts(2)))
529 703722 : ALLOCATE (rs%pz(desc%npts(3)))
530 :
531 234574 : CALL timestop(handle)
532 :
533 234574 : END SUBROUTINE rs_grid_create
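! Editorial sketch of the typical life cycle, using only the public routines of this module;
! pw_grid, input_settings and rho_pw (a pw_r3d_rs_type) are assumed to exist already:
!   TYPE(realspace_grid_desc_type), POINTER :: desc
!   TYPE(realspace_grid_type) :: rs
!   CALL rs_grid_create_descriptor(desc, pw_grid, input_settings)
!   CALL rs_grid_create(rs, desc)
!   CALL rs_grid_zero(rs)
!   ! ... collocate data on rs%r, then move it to the plane-wave grid ...
!   CALL transfer_rs2pw(rs, rho_pw)
!   CALL rs_grid_release(rs)
!   CALL rs_grid_release_descriptor(desc)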
534 :
535 : ! **************************************************************************************************
536 : !> \brief Defines a new ordering of ranks on this realspace grid, recalculating
537 : !> the data bounds and reallocating the grid. As a result, each MPI process
538 : !> now has a real rank (i.e., its rank in the MPI communicator from the pw grid)
539 : !> and a virtual rank (the rank of the process where the data now owned by this
540 : !> process would reside in an ordinary cartesian distribution).
541 : !> NB. Since the grid size required may change, the caller should be sure to release
542 : !> and recreate the corresponding rs_grids
543 : !> The desc%real2virtual and desc%virtual2real arrays can be used to map
544 : !> a physical rank to the 'rank' of data owned by that process and vice versa
545 : !> \param desc ...
546 : !> \param real2virtual ...
547 : !> \par History
548 : !> 04-2009 created [Iain Bethune]
549 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
550 : ! **************************************************************************************************
551 6 : PURE SUBROUTINE rs_grid_reorder_ranks(desc, real2virtual)
552 :
553 : TYPE(realspace_grid_desc_type), INTENT(INOUT) :: desc
554 : INTEGER, DIMENSION(:), INTENT(IN) :: real2virtual
555 :
556 : INTEGER :: i
557 :
558 18 : desc%real2virtual(:) = real2virtual
559 :
560 18 : DO i = 0, desc%group_size - 1
561 18 : desc%virtual2real(desc%real2virtual(i)) = i
562 : END DO
563 :
564 6 : desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
565 :
566 12 : IF (.NOT. ALL(desc%group_dim == 1)) THEN
567 24 : desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
568 : END IF
569 :
570 6 : END SUBROUTINE
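! Editorial illustration: real2virtual and virtual2real are kept as inverse permutations.
! With four ranks, asking physical rank 0 to hold the data of virtual rank 2 would look like
!   CALL rs_grid_reorder_ranks(desc, (/2, 1, 0, 3/))
! after which desc%real2virtual(0) == 2 and desc%virtual2real(2) == 0; as noted above, the
! caller must release and recreate any grids built on this descriptor afterwards.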
571 :
572 : ! **************************************************************************************************
573 : !> \brief Print information on grids to output
574 : !> \param rs ...
575 : !> \param iounit ...
576 : !> \author JGH (17-May-2007)
577 : ! **************************************************************************************************
578 13782 : SUBROUTINE rs_grid_print(rs, iounit)
579 : TYPE(realspace_grid_type), INTENT(IN) :: rs
580 : INTEGER, INTENT(in) :: iounit
581 :
582 : INTEGER :: dir, i, nn
583 : REAL(KIND=dp) :: pp(3)
584 :
585 13782 : IF (rs%desc%parallel) THEN
586 13500 : IF (iounit > 0) THEN
587 : WRITE (iounit, '(/,A,T71,I10)') &
588 6018 : " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
589 24072 : DO i = 1, 3
590 18054 : WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
591 42126 : i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
592 : END DO
593 6018 : IF (.NOT. rs%desc%distributed) THEN
594 6003 : WRITE (iounit, '(A)') " RS_GRID| Real space fully replicated"
595 : WRITE (iounit, '(A,T71,I10)') &
596 6003 : " RS_GRID| Group size ", rs%desc%group_dim(2)
597 : ELSE
598 60 : DO dir = 1, 3
599 60 : IF (rs%desc%perd(dir) /= 1) THEN
600 : WRITE (iounit, '(A,T71,I3,A)') &
601 15 : " RS_GRID| Real space distribution over ", rs%desc%group_dim(dir), " groups"
602 : WRITE (iounit, '(A,T71,I10)') &
603 15 : " RS_GRID| Real space distribution along direction ", dir
604 : WRITE (iounit, '(A,T71,I10)') &
605 15 : " RS_GRID| Border size ", rs%desc%border
606 : END IF
607 : END DO
608 : END IF
609 : END IF
610 13500 : IF (rs%desc%distributed) THEN
611 120 : DO dir = 1, 3
612 120 : IF (rs%desc%perd(dir) /= 1) THEN
613 30 : nn = rs%npts_local(dir)
614 30 : CALL rs%desc%group%sum(nn)
615 120 : pp(1) = REAL(nn, KIND=dp)/REAL(PRODUCT(rs%desc%group_dim), KIND=dp)
616 30 : nn = rs%npts_local(dir)
617 30 : CALL rs%desc%group%max(nn)
618 30 : pp(2) = REAL(nn, KIND=dp)
619 30 : nn = rs%npts_local(dir)
620 30 : CALL rs%desc%group%min(nn)
621 30 : pp(3) = REAL(nn, KIND=dp)
622 30 : IF (iounit > 0) THEN
623 15 : WRITE (iounit, '(A,T48,A)') " RS_GRID| Distribution", &
624 30 : " Average Max Min"
625 15 : WRITE (iounit, '(A,T45,F12.1,2I12)') " RS_GRID| Planes ", &
626 30 : pp(1), NINT(pp(2)), NINT(pp(3))
627 : END IF
628 : END IF
629 : END DO
630 : ! WRITE ( iounit, '(/)' )
631 : END IF
632 : ELSE
633 282 : IF (iounit > 0) THEN
634 : WRITE (iounit, '(/,A,T71,I10)') &
635 172 : " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
636 688 : DO i = 1, 3
637 516 : WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID| Bounds ", &
638 1204 : i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
639 : END DO
640 : ! WRITE ( iounit, '(/)' )
641 : END IF
642 : END IF
643 :
644 13782 : END SUBROUTINE rs_grid_print
645 :
646 : ! **************************************************************************************************
647 : !> \brief ...
648 : !> \param rs ...
649 : !> \param pw ...
650 : ! **************************************************************************************************
651 1059308 : SUBROUTINE transfer_rs2pw(rs, pw)
652 : TYPE(realspace_grid_type), INTENT(IN) :: rs
653 : TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
654 :
655 : CHARACTER(len=*), PARAMETER :: routineN = 'transfer_rs2pw'
656 :
657 : INTEGER :: handle, handle2, i
658 :
659 1059308 : CALL timeset(routineN, handle2)
660 1059308 : CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
661 :
662 1059308 : IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
663 0 : CPABORT("Different rs and pw identifiers")
664 :
665 1059308 : IF (rs%desc%distributed) THEN
666 1828 : CALL transfer_rs2pw_distributed(rs, pw)
667 1057480 : ELSE IF (rs%desc%parallel) THEN
668 848130 : CALL transfer_rs2pw_replicated(rs, pw)
669 : ELSE ! treat simple serial case locally
670 209350 : IF (rs%desc%border == 0) THEN
671 837400 : CALL dcopy(SIZE(rs%r), rs%r, 1, pw%array, 1)
672 : ELSE
673 0 : CPASSERT(LBOUND(pw%array, 3) .EQ. rs%lb_real(3))
674 0 : !$OMP PARALLEL DO DEFAULT(NONE) SHARED(pw,rs)
675 : DO i = rs%lb_real(3), rs%ub_real(3)
676 : pw%array(:, :, i) = rs%r(rs%lb_real(1):rs%ub_real(1), &
677 : rs%lb_real(2):rs%ub_real(2), i)
678 : END DO
679 : !$OMP END PARALLEL DO
680 : END IF
681 : END IF
682 :
683 1059308 : CALL timestop(handle)
684 1059308 : CALL timestop(handle2)
685 :
686 1059308 : END SUBROUTINE transfer_rs2pw
687 :
688 : ! **************************************************************************************************
689 : !> \brief ...
690 : !> \param rs ...
691 : !> \param pw ...
692 : ! **************************************************************************************************
693 1030469 : SUBROUTINE transfer_pw2rs(rs, pw)
694 :
695 : TYPE(realspace_grid_type), INTENT(IN) :: rs
696 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
697 :
698 : CHARACTER(len=*), PARAMETER :: routineN = 'transfer_pw2rs'
699 :
700 : INTEGER :: handle, handle2, i, im, j, jm, k, km
701 :
702 1030469 : CALL timeset(routineN, handle2)
703 1030469 : CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
704 :
705 1030469 : IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
706 0 : CPABORT("Different rs and pw identifiers")
707 :
708 1030469 : IF (rs%desc%distributed) THEN
709 864 : CALL transfer_pw2rs_distributed(rs, pw)
710 1029605 : ELSE IF (rs%desc%parallel) THEN
711 785312 : CALL transfer_pw2rs_replicated(rs, pw)
712 : ELSE ! treat simple serial case locally
713 244293 : IF (rs%desc%border == 0) THEN
714 977172 : CALL dcopy(SIZE(rs%r), pw%array, 1, rs%r, 1)
715 : ELSE
716 : !$OMP PARALLEL DO DEFAULT(NONE) &
717 : !$OMP PRIVATE(i,im,j,jm,k,km) &
718 0 : !$OMP SHARED(pw,rs)
719 : DO k = rs%lb_local(3), rs%ub_local(3)
720 : IF (k < rs%lb_real(3)) THEN
721 : km = k + rs%desc%npts(3)
722 : ELSE IF (k > rs%ub_real(3)) THEN
723 : km = k - rs%desc%npts(3)
724 : ELSE
725 : km = k
726 : END IF
727 : DO j = rs%lb_local(2), rs%ub_local(2)
728 : IF (j < rs%lb_real(2)) THEN
729 : jm = j + rs%desc%npts(2)
730 : ELSE IF (j > rs%ub_real(2)) THEN
731 : jm = j - rs%desc%npts(2)
732 : ELSE
733 : jm = j
734 : END IF
735 : DO i = rs%lb_local(1), rs%ub_local(1)
736 : IF (i < rs%lb_real(1)) THEN
737 : im = i + rs%desc%npts(1)
738 : ELSE IF (i > rs%ub_real(1)) THEN
739 : im = i - rs%desc%npts(1)
740 : ELSE
741 : im = i
742 : END IF
743 : rs%r(i, j, k) = pw%array(im, jm, km)
744 : END DO
745 : END DO
746 : END DO
747 : !$OMP END PARALLEL DO
748 : END IF
749 : END IF
750 :
751 1030469 : CALL timestop(handle)
752 1030469 : CALL timestop(handle2)
753 :
754 1030469 : END SUBROUTINE transfer_pw2rs
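! Editorial usage sketch: the two transfer routines are symmetric and require that the pw grid
! is the one referenced by the descriptor (checked via ASSOCIATED above). With illustrative
! pw_r3d_rs_type variables rho_pw and pot_pw one would write
!   CALL transfer_rs2pw(rs, rho_pw)   ! halo-padded real-space grid -> plane-wave grid
!   CALL transfer_pw2rs(rs, pot_pw)   ! plane-wave grid -> real-space grid, halos refilled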
755 :
756 : ! **************************************************************************************************
757 : !> \brief transfer from a realspace grid to a planewave grid
758 : !> \param rs ...
759 : !> \param pw ...
760 : ! **************************************************************************************************
761 848130 : SUBROUTINE transfer_rs2pw_replicated(rs, pw)
762 : TYPE(realspace_grid_type), INTENT(IN) :: rs
763 : TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw
764 :
765 : INTEGER :: dest, ii, ip, ix, iy, iz, nma, nn, s(3), &
766 : source
767 848130 : INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
768 : INTEGER, DIMENSION(3) :: lb, ub
769 848130 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
770 :
771 : ASSOCIATE (np => pw%pw_grid%para%group%num_pe, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group%num_pe - 1, 1), &
772 : pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%group, mepos => pw%pw_grid%para%group%mepos, &
773 : grid => rs%r)
774 2544390 : ALLOCATE (rcount(0:np - 1))
775 2544390 : DO ip = 1, np
776 7633170 : rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
777 : END DO
778 2544390 : nma = MAXVAL(rcount(0:np - 1))
779 3392520 : ALLOCATE (sendbuf(nma), recvbuf(nma))
780 31846714640 : sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
781 :
782 : !sample peak memory
783 848130 : CALL m_memory()
784 :
785 848130 : dest = MODULO(mepos + 1, np)
786 848130 : source = MODULO(mepos - 1, np)
787 15923357320 : sendbuf = 0.0_dp
788 :
789 1696260 : DO ip = 1, np
790 :
791 6785040 : lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
792 6785040 : ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
793 : ! this loop takes about the same time as the message passing call
794 : ! notice that the range of ix is only a small fraction of the first index of grid
795 : ! therefore it seems faster to have the second index as the innermost loop
796 : ! if this runs on many cpus
797 : ! tested on itanium, pentium4, opteron, ultrasparc...
798 6785040 : s = ub - lb + 1
799 42436392 : DO iz = lb(3), ub(3)
800 750299854 : DO ix = lb(1), ub(1)
801 707863462 : ii = (iz - lb(3))*s(1)*s(2) + (ix - lb(1)) + 1
802 32374406210 : DO iy = lb(2), ub(2)
803 31625802616 : sendbuf(ii) = sendbuf(ii) + grid(ix, iy, iz)
804 32333666078 : ii = ii + s(1)
805 : END DO
806 : END DO
807 : END DO
808 1696260 : IF (ip .EQ. np) EXIT
809 848130 : CALL group%sendrecv(sendbuf, dest, recvbuf, source, 13)
810 848130 : CALL MOVE_ALLOC(sendbuf, swaparray)
811 848130 : CALL MOVE_ALLOC(recvbuf, sendbuf)
812 1696260 : CALL MOVE_ALLOC(swaparray, recvbuf)
813 : END DO
814 848130 : nn = rcount(mepos)
815 : END ASSOCIATE
816 :
817 848130 : CALL dcopy(nn, sendbuf, 1, pw%array, 1)
818 :
819 848130 : DEALLOCATE (rcount)
820 848130 : DEALLOCATE (sendbuf)
821 848130 : DEALLOCATE (recvbuf)
822 :
823 848130 : END SUBROUTINE transfer_rs2pw_replicated
824 :
825 : ! **************************************************************************************************
826 : !> \brief transfer from a planewave grid to a realspace grid
827 : !> \param rs ...
828 : !> \param pw ...
829 : ! **************************************************************************************************
830 785312 : SUBROUTINE transfer_pw2rs_replicated(rs, pw)
831 : TYPE(realspace_grid_type), INTENT(IN) :: rs
832 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
833 :
834 : INTEGER :: dest, i, ii, im, ip, ix, iy, iz, j, jm, &
835 : k, km, nma, nn, source
836 785312 : INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount
837 : INTEGER, DIMENSION(3) :: lb, ub
838 785312 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recvbuf, sendbuf, swaparray
839 2355936 : TYPE(mp_request_type), DIMENSION(2) :: req
840 :
841 : ASSOCIATE (np => pw%pw_grid%para%group%num_pe, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group%num_pe - 1, 1), &
842 : pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%group, mepos => pw%pw_grid%para%group%mepos, &
843 : grid => rs%r)
844 2355936 : ALLOCATE (rcount(0:np - 1))
845 2355936 : DO ip = 1, np
846 7067808 : rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
847 : END DO
848 2355936 : nma = MAXVAL(rcount(0:np - 1))
849 3141248 : ALLOCATE (sendbuf(nma), recvbuf(nma))
850 30849837776 : sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
851 :
852 : !sample peak memory
853 785312 : CALL m_memory()
854 :
855 785312 : nn = rcount(mepos)
856 785312 : CALL dcopy(nn, pw%array, 1, sendbuf, 1)
857 :
858 785312 : dest = MODULO(mepos + 1, np)
859 785312 : source = MODULO(mepos - 1, np)
860 :
861 2355936 : DO ip = 0, np - 1
862 : ! we must shift the buffer only np-1 times around
863 1570624 : IF (ip .NE. np - 1) THEN
864 : CALL group%isendrecv(sendbuf, dest, recvbuf, source, &
865 785312 : req(1), req(2), 13)
866 : END IF
867 6282496 : lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
868 6282496 : ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
869 1570624 : ii = 0
870 : ! this loop takes about the same time as the message passing call
871 : ! copy the buffer contents into the corresponding slab of the real-space grid
872 40255192 : DO iz = lb(3), ub(3)
873 1390460520 : DO iy = lb(2), ub(2)
874 32029421078 : DO ix = lb(1), ub(1)
875 30640531182 : ii = ii + 1
876 31990736510 : grid(ix, iy, iz) = sendbuf(ii)
877 : END DO
878 : END DO
879 : END DO
880 1570624 : IF (ip .NE. np - 1) THEN
881 785312 : CALL mp_waitall(req)
882 : END IF
883 1570624 : CALL MOVE_ALLOC(sendbuf, swaparray)
884 1570624 : CALL MOVE_ALLOC(recvbuf, sendbuf)
885 2355936 : CALL MOVE_ALLOC(swaparray, recvbuf)
886 : END DO
887 1570624 : IF (rs%desc%border > 0) THEN
888 : !$OMP PARALLEL DO DEFAULT(NONE) &
889 : !$OMP PRIVATE(i,im,j,jm,k,km) &
890 0 : !$OMP SHARED(rs)
891 : DO k = rs%lb_local(3), rs%ub_local(3)
892 : IF (k < rs%lb_real(3)) THEN
893 : km = k + rs%desc%npts(3)
894 : ELSE IF (k > rs%ub_real(3)) THEN
895 : km = k - rs%desc%npts(3)
896 : ELSE
897 : km = k
898 : END IF
899 : DO j = rs%lb_local(2), rs%ub_local(2)
900 : IF (j < rs%lb_real(2)) THEN
901 : jm = j + rs%desc%npts(2)
902 : ELSE IF (j > rs%ub_real(2)) THEN
903 : jm = j - rs%desc%npts(2)
904 : ELSE
905 : jm = j
906 : END IF
907 : DO i = rs%lb_local(1), rs%ub_local(1)
908 : IF (i < rs%lb_real(1)) THEN
909 : im = i + rs%desc%npts(1)
910 : ELSE IF (i > rs%ub_real(1)) THEN
911 : im = i - rs%desc%npts(1)
912 : ELSE
913 : im = i
914 : END IF
915 : rs%r(i, j, k) = rs%r(im, jm, km)
916 : END DO
917 : END DO
918 : END DO
919 : !$OMP END PARALLEL DO
920 : END IF
921 : END ASSOCIATE
922 :
923 785312 : DEALLOCATE (rcount)
924 785312 : DEALLOCATE (sendbuf)
925 785312 : DEALLOCATE (recvbuf)
926 :
927 785312 : END SUBROUTINE transfer_pw2rs_replicated
928 :
929 : ! **************************************************************************************************
930 : !> \brief does the rs2pw transfer in the case where the rs grid is
931 : !> distributed (3D domain decomposition)
932 : !> \param rs ...
933 : !> \param pw ...
934 : !> \par History
935 : !> 12.2007 created [Matt Watkins]
936 : !> 9.2008 reduced amount of halo data sent [Iain Bethune]
937 : !> 10.2008 added non-blocking communication [Iain Bethune]
938 : !> 4.2009 added support for rank-reordering on the grid [Iain Bethune]
939 : !> 12.2009 added OMP and sparse alltoall [Iain Bethune]
940 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
941 : !> \note
942 : !> the transfer is a two-step procedure. For example, for the rs2pw transfer:
943 : !>
944 : !> 1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
945 : !> 2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
946 : !>
947 : !> the halo exchange is most expensive on a large number of CPUs. A peculiarity of this halo
948 : !> exchange is that the border region is rather large (e.g. 20 points) and may overlap
949 : !> with the central domain of several CPUs (i.e. next-nearest neighbours)
950 : ! **************************************************************************************************
951 1828 : SUBROUTINE transfer_rs2pw_distributed(rs, pw)
952 : TYPE(realspace_grid_type), INTENT(IN) :: rs
953 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
954 :
955 : CHARACTER(LEN=200) :: error_string
956 : INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
957 : n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
958 1828 : INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
959 1828 : send_disps, send_sizes, ushifts
960 3656 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
961 : INTEGER, DIMENSION(2) :: neighbours, pos
962 : INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
963 : lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
964 : LOGICAL, DIMENSION(3) :: halo_swapped
965 : REAL(KIND=dp) :: pw_sum, rs_sum
966 1828 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
967 1828 : send_buf_3d_down, send_buf_3d_up
968 3656 : TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
969 1828 : TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
970 9140 : TYPE(mp_request_type), DIMENSION(4) :: req
971 :
972 1828 : num_threads = 1
973 1828 : my_id = 0
974 :
975 : ! safety check, to be removed once we're absolutely sure the routine is correct
976 : IF (debug_this_module) THEN
977 : rs_sum = accurate_sum(rs%r)*ABS(det_3x3(rs%desc%dh))
978 : CALL rs%desc%group%sum(rs_sum)
979 : END IF
980 :
981 1828 : halo_swapped = .FALSE.
982 : ! We don't need to send the 'edges' of the halos that have already been sent
983 : ! Halos are contiguous in memory in z-direction only, so swap these first,
984 : ! and send less data in the y and x directions which are more expensive
985 :
986 7312 : DO idir = 3, 1, -1
987 :
988 5484 : IF (rs%desc%perd(idir) .NE. 1) THEN
989 :
990 14196 : ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
991 9464 : ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
992 :
993 14196 : ushifts = 0
994 14196 : dshifts = 0
995 :
996 : ! check that we don't try to send data to ourselves
997 6560 : DO n_shifts = 1, MIN(rs%desc%neighbours(idir), rs%desc%group_dim(idir) - 1)
998 :
999 : ! need to take into account the possible varying widths of neighbouring cells
1000 : ! ushifts and dshifts hold the accumulated real sizes of the neighbouring cells
1001 1828 : position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1002 1828 : neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1003 1828 : dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1004 :
1005 1828 : position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1006 1828 : neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1007 1828 : ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1008 :
1009 : ! The border data has to be send/received from the neighbours
1010 : ! First we calculate the source and destination processes for the shift
1011 : ! We do both shifts at once to allow for more overlap of communication and buffer packing/unpacking
1012 :
1013 1828 : CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1014 :
1015 7312 : lb_send_down(:) = rs%lb_local(:)
1016 7312 : lb_recv_down(:) = rs%lb_local(:)
1017 7312 : ub_recv_down(:) = rs%ub_local(:)
1018 7312 : ub_send_down(:) = rs%ub_local(:)
1019 :
1020 1828 : IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1021 1828 : ub_send_down(idir) = lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1022 : lb_send_down(idir) = MAX(lb_send_down(idir), &
1023 1828 : lb_send_down(idir) + rs%desc%border - dshifts(n_shifts))
1024 :
1025 1828 : ub_recv_down(idir) = ub_recv_down(idir) - rs%desc%border
1026 : lb_recv_down(idir) = MAX(lb_recv_down(idir) + rs%desc%border, &
1027 1828 : ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1028 : ELSE
1029 0 : lb_send_down(idir) = 0
1030 0 : ub_send_down(idir) = -1
1031 0 : lb_recv_down(idir) = 0
1032 0 : ub_recv_down(idir) = -1
1033 : END IF
1034 :
1035 7312 : DO i = 1, 3
1036 7312 : IF (halo_swapped(i)) THEN
1037 554 : lb_send_down(i) = rs%lb_real(i)
1038 554 : ub_send_down(i) = rs%ub_real(i)
1039 554 : lb_recv_down(i) = rs%lb_real(i)
1040 554 : ub_recv_down(i) = rs%ub_real(i)
1041 : END IF
1042 : END DO
1043 :
1044 : ! post the receive
1045 0 : ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1046 9140 : lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1047 1828 : CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1048 :
1049 : ! now allocate, pack and send the send buffer
1050 7312 : nn = PRODUCT(ub_send_down - lb_send_down + 1)
1051 0 : ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1052 9140 : lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1053 :
1054 : !$OMP PARALLEL DEFAULT(NONE), &
1055 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1056 1828 : !$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1057 : !$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1058 : !$ my_id = omp_get_thread_num()
1059 : IF (my_id < num_threads) THEN
1060 : lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1061 : ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
1062 :
1063 : send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1064 : lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1065 : lb_send_down(2):ub_send_down(2), lb:ub)
1066 : END IF
1067 : !$OMP END PARALLEL
1068 :
1069 1828 : CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1070 :
1071 : ! Now for the other direction
1072 1828 : CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1073 :
1074 7312 : lb_send_up(:) = rs%lb_local(:)
1075 7312 : lb_recv_up(:) = rs%lb_local(:)
1076 7312 : ub_recv_up(:) = rs%ub_local(:)
1077 7312 : ub_send_up(:) = rs%ub_local(:)
1078 :
1079 1828 : IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1080 :
1081 1828 : lb_send_up(idir) = ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1082 : ub_send_up(idir) = MIN(ub_send_up(idir), &
1083 1828 : ub_send_up(idir) - rs%desc%border + ushifts(n_shifts))
1084 :
1085 1828 : lb_recv_up(idir) = lb_recv_up(idir) + rs%desc%border
1086 : ub_recv_up(idir) = MIN(ub_recv_up(idir) - rs%desc%border, &
1087 1828 : lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1088 : ELSE
1089 0 : lb_send_up(idir) = 0
1090 0 : ub_send_up(idir) = -1
1091 0 : lb_recv_up(idir) = 0
1092 0 : ub_recv_up(idir) = -1
1093 : END IF
1094 :
1095 7312 : DO i = 1, 3
1096 7312 : IF (halo_swapped(i)) THEN
1097 554 : lb_send_up(i) = rs%lb_real(i)
1098 554 : ub_send_up(i) = rs%ub_real(i)
1099 554 : lb_recv_up(i) = rs%lb_real(i)
1100 554 : ub_recv_up(i) = rs%ub_real(i)
1101 : END IF
1102 : END DO
1103 :
1104 : ! post the receive
1105 0 : ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1106 9140 : lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1107 1828 : CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1108 :
1109 : ! now allocate, pack and send the send buffer
1110 7312 : nn = PRODUCT(ub_send_up - lb_send_up + 1)
1111 0 : ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1112 9140 : lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1113 :
1114 : !$OMP PARALLEL DEFAULT(NONE), &
1115 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1116 1828 : !$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1117 : !$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1118 : !$ my_id = omp_get_thread_num()
1119 : IF (my_id < num_threads) THEN
1120 : lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1121 : ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1122 :
1123 : send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1124 : lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1125 : lb_send_up(2):ub_send_up(2), lb:ub)
1126 : END IF
1127 : !$OMP END PARALLEL
1128 :
1129 1828 : CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1130 :
1131 : ! wait for a recv to complete, then we can unpack
1132 :
1133 5484 : DO i = 1, 2
1134 :
1135 3656 : CALL mp_waitany(req(1:2), completed)
1136 :
1137 5484 : IF (completed .EQ. 1) THEN
1138 :
1139 : ! only some procs may need later shifts
1140 1828 : IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1141 : ! Sum the data in the RS Grid
1142 : !$OMP PARALLEL DEFAULT(NONE), &
1143 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1144 1828 : !$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1145 : !$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1146 : !$ my_id = omp_get_thread_num()
1147 : IF (my_id < num_threads) THEN
1148 : lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1149 : ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1150 :
1151 : rs%r(lb_recv_down(1):ub_recv_down(1), &
1152 : lb_recv_down(2):ub_recv_down(2), lb:ub) = &
1153 : rs%r(lb_recv_down(1):ub_recv_down(1), &
1154 : lb_recv_down(2):ub_recv_down(2), lb:ub) + &
1155 : recv_buf_3d_down(:, :, lb:ub)
1156 : END IF
1157 : !$OMP END PARALLEL
1158 : END IF
1159 1828 : DEALLOCATE (recv_buf_3d_down)
1160 : ELSE
1161 :
1162 : ! only some procs may need later shifts
1163 1828 : IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1164 : ! Sum the data in the RS Grid
1165 : !$OMP PARALLEL DEFAULT(NONE), &
1166 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1167 1828 : !$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1168 : !$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1169 : !$ my_id = omp_get_thread_num()
1170 : IF (my_id < num_threads) THEN
1171 : lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1172 : ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1173 :
1174 : rs%r(lb_recv_up(1):ub_recv_up(1), &
1175 : lb_recv_up(2):ub_recv_up(2), lb:ub) = &
1176 : rs%r(lb_recv_up(1):ub_recv_up(1), &
1177 : lb_recv_up(2):ub_recv_up(2), lb:ub) + &
1178 : recv_buf_3d_up(:, :, lb:ub)
1179 : END IF
1180 : !$OMP END PARALLEL
1181 : END IF
1182 1828 : DEALLOCATE (recv_buf_3d_up)
1183 : END IF
1184 :
1185 : END DO
1186 :
1187 : ! make sure the sends have completed before we deallocate
1188 :
1189 1828 : CALL mp_waitall(req(3:4))
1190 :
1191 1828 : DEALLOCATE (send_buf_3d_down)
1192 8388 : DEALLOCATE (send_buf_3d_up)
1193 : END DO
1194 :
1195 4732 : DEALLOCATE (dshifts)
1196 4732 : DEALLOCATE (ushifts)
1197 :
1198 : END IF
1199 :
1200 7312 : halo_swapped(idir) = .TRUE.
1201 :
1202 : END DO
1203 :
1204 : ! This is the real redistribution
1205 7312 : ALLOCATE (bounds(0:pw%pw_grid%para%group%num_pe - 1, 1:4))
1206 :
1207 : ! work out the pw grid points each proc holds
1208 5484 : DO i = 0, pw%pw_grid%para%group%num_pe - 1
1209 10968 : bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
1210 10968 : bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
1211 10968 : bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
1212 12796 : bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
1213 : END DO
1214 :
1215 7312 : ALLOCATE (send_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
1216 5484 : ALLOCATE (send_sizes(0:pw%pw_grid%para%group%num_pe - 1))
1217 3656 : ALLOCATE (send_disps(0:pw%pw_grid%para%group%num_pe - 1))
1218 3656 : ALLOCATE (recv_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
1219 3656 : ALLOCATE (recv_sizes(0:pw%pw_grid%para%group%num_pe - 1))
1220 3656 : ALLOCATE (recv_disps(0:pw%pw_grid%para%group%num_pe - 1))
1221 5484 : send_tasks(:, 1) = 1
1222 5484 : send_tasks(:, 2) = 0
1223 5484 : send_tasks(:, 3) = 1
1224 5484 : send_tasks(:, 4) = 0
1225 5484 : send_tasks(:, 5) = 1
1226 5484 : send_tasks(:, 6) = 0
1227 5484 : send_sizes = 0
1228 5484 : recv_sizes = 0
1229 :
1230 1828 : my_rs_rank = rs%desc%my_pos
1231 1828 : my_pw_rank = pw%pw_grid%para%group%mepos
1232 :
1233 : ! find the processors that should hold our data
1234 : ! should be part of the rs grid type
1235 : ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
1236 : ! do the recv and send tasks in two separate loops which will
1237 : ! load balance better for OpenMP with large numbers of MPI tasks
1238 :
1239 : !$OMP PARALLEL DO DEFAULT(NONE), &
1240 : !$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1241 1828 : !$OMP SHARED(rs,bounds,my_rs_rank,recv_tasks,recv_sizes)
1242 : DO i = 0, rs%desc%group_size - 1
1243 :
1244 : coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1245 : !calculate the rs grid points on each processor
1246 : !coords is the part of the grid that rank i actually holds
1247 : DO idir = 1, 3
1248 : pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1249 : pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1250 : lb_send(idir) = pos(1)
1251 : ub_send(idir) = pos(2)
1252 : END DO
1253 :
1254 : IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
1255 : IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
1256 : IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
1257 : IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
1258 :
1259 : recv_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
1260 : recv_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
1261 : recv_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
1262 : recv_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
1263 : recv_tasks(i, 5) = lb_send(3)
1264 : recv_tasks(i, 6) = ub_send(3)
1265 : recv_sizes(i) = (recv_tasks(i, 2) - recv_tasks(i, 1) + 1)* &
1266 : (recv_tasks(i, 4) - recv_tasks(i, 3) + 1)*(recv_tasks(i, 6) - recv_tasks(i, 5) + 1)
1267 :
1268 : END DO
1269 : !$OMP END PARALLEL DO
1270 :
1271 7312 : coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1272 7312 : DO idir = 1, 3
1273 5484 : pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1274 16452 : pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1275 5484 : lb_send(idir) = pos(1)
1276 7312 : ub_send(idir) = pos(2)
1277 : END DO
1278 :
1279 1828 : lb_recv(:) = lb_send(:)
1280 1828 : ub_recv(:) = ub_send(:)
1281 : !$OMP PARALLEL DO DEFAULT(NONE), &
1282 1828 : !$OMP SHARED(pw,lb_send,ub_send,bounds,send_tasks,send_sizes)
1283 : DO j = 0, pw%pw_grid%para%group%num_pe - 1
1284 :
1285 : IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
1286 : IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
1287 : IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
1288 : IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
1289 :
1290 : send_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
1291 : send_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
1292 : send_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
1293 : send_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
1294 : send_tasks(j, 5) = lb_send(3)
1295 : send_tasks(j, 6) = ub_send(3)
1296 : send_sizes(j) = (send_tasks(j, 2) - send_tasks(j, 1) + 1)* &
1297 : (send_tasks(j, 4) - send_tasks(j, 3) + 1)*(send_tasks(j, 6) - send_tasks(j, 5) + 1)
1298 :
1299 : END DO
1300 : !$OMP END PARALLEL DO
1301 :
1302 1828 : send_disps(0) = 0
1303 1828 : recv_disps(0) = 0
1304 3656 : DO i = 1, pw%pw_grid%para%group%num_pe - 1
1305 1828 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1306 3656 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1307 : END DO
1308 :
1309 12796 : CPASSERT(SUM(send_sizes) == PRODUCT(ub_recv - lb_recv + 1))
1310 :
1311 9140 : ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1312 10968 : ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1313 :
1314 5484 : DO i = 0, rs%desc%group_size - 1
1315 3656 : IF (send_sizes(i) .NE. 0) THEN
1316 10230 : ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1317 : ELSE
1318 246 : NULLIFY (send_bufs(i)%array)
1319 : END IF
1320 5484 : IF (recv_sizes(i) .NE. 0) THEN
1321 10230 : ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1322 : ELSE
1323 246 : NULLIFY (recv_bufs(i)%array)
1324 : END IF
1325 : END DO
1326 :
1327 9140 : ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1328 5484 : recv_reqs = mp_request_null
1329 :
1330 5484 : DO i = 0, rs%desc%group_size - 1
1331 5484 : IF (recv_sizes(i) .NE. 0) THEN
1332 3410 : CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1333 : END IF
1334 : END DO
1335 :
1336 : ! do packing
1337 : !$OMP PARALLEL DO DEFAULT(NONE), &
1338 : !$OMP PRIVATE(k,z,y,x), &
1339 1828 : !$OMP SHARED(rs,send_tasks,send_bufs,send_disps)
1340 : DO i = 0, rs%desc%group_size - 1
1341 : k = 0
1342 : DO z = send_tasks(i, 5), send_tasks(i, 6)
1343 : DO y = send_tasks(i, 3), send_tasks(i, 4)
1344 : DO x = send_tasks(i, 1), send_tasks(i, 2)
1345 : k = k + 1
1346 : send_bufs(i)%array(k) = rs%r(x, y, z)
1347 : END DO
1348 : END DO
1349 : END DO
1350 : END DO
1351 : !$OMP END PARALLEL DO
1352 :
1353 9140 : ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1354 5484 : send_reqs = mp_request_null
1355 :
1356 5484 : DO i = 0, rs%desc%group_size - 1
1357 5484 : IF (send_sizes(i) .NE. 0) THEN
1358 3410 : CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1359 : END IF
1360 : END DO
1361 :
1362 : ! do unpacking
1363 : ! no OMP here so we can unpack each message as it arrives
1364 5484 : DO i = 0, rs%desc%group_size - 1
1365 3656 : IF (recv_sizes(i) .EQ. 0) CYCLE
1366 :
1367 3410 : CALL mp_waitany(recv_reqs, completed)
1368 3410 : k = 0
1369 141980 : DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1370 10628098 : DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1371 447639902 : DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1372 437015460 : k = k + 1
1373 447503160 : pw%array(x, y, z) = recv_bufs(completed - 1)%array(k)
1374 : END DO
1375 : END DO
1376 : END DO
1377 : END DO
1378 :
1379 1828 : CALL mp_waitall(send_reqs)
1380 :
1381 1828 : DEALLOCATE (recv_reqs)
1382 1828 : DEALLOCATE (send_reqs)
1383 :
1384 5484 : DO i = 0, rs%desc%group_size - 1
1385 3656 : IF (ASSOCIATED(send_bufs(i)%array)) THEN
1386 3410 : DEALLOCATE (send_bufs(i)%array)
1387 : END IF
1388 5484 : IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1389 3410 : DEALLOCATE (recv_bufs(i)%array)
1390 : END IF
1391 : END DO
1392 :
1393 1828 : DEALLOCATE (send_bufs)
1394 1828 : DEALLOCATE (recv_bufs)
1395 1828 : DEALLOCATE (send_tasks)
1396 1828 : DEALLOCATE (send_sizes)
1397 1828 : DEALLOCATE (send_disps)
1398 1828 : DEALLOCATE (recv_tasks)
1399 1828 : DEALLOCATE (recv_sizes)
1400 1828 : DEALLOCATE (recv_disps)
1401 :
1402 : IF (debug_this_module) THEN
1403 : ! safety check, to be removed once we're absolutely sure the routine is correct
1404 : pw_sum = pw_integrate_function(pw)
1405 : IF (ABS(pw_sum - rs_sum)/MAX(1.0_dp, ABS(pw_sum), ABS(rs_sum)) > EPSILON(rs_sum)*1000) THEN
1406 : WRITE (error_string, '(A,6(1X,I4.4),3F25.16)') "rs_pw_transfer_distributed", &
1407 : rs%desc%npts, rs%desc%group_dim, pw_sum, rs_sum, ABS(pw_sum - rs_sum)
1408 : CALL cp_abort(__LOCATION__, &
1409 : error_string//" Please report this bug ... quick workaround: use "// &
1410 : "DISTRIBUTION_TYPE REPLICATED")
1411 : END IF
1412 : END IF
1413 :
1414 1828 : END SUBROUTINE transfer_rs2pw_distributed
1415 :
1416 : ! **************************************************************************************************
1417 : !> \brief does the pw2rs transfer in the case where the rs grid is
1418 : !> distributed (3D domain decomposition)
1419 : !> \param rs ...
1420 : !> \param pw ...
1421 : !> \par History
1422 : !> 12.2007 created [Matt Watkins]
1423 : !> 9.2008 reduced amount of halo data sent [Iain Bethune]
1424 : !> 10.2008 added non-blocking communication [Iain Bethune]
1425 : !> 4.2009 added support for rank-reordering on the grid [Iain Bethune]
1426 : !> 12.2009 added OMP and sparse alltoall [Iain Bethune]
1427 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
1428 : !> \note
1429 : !>      the transfer is a two-step procedure. For example, for the rs2pw transfer:
1430 : !>      1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
1431 : !>      2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
1432 : !>      (for the pw2rs transfer implemented below, the two steps are carried out in reverse order)
1433 : !>
1434 : !>      the halo exchange is the most expensive part on a large number of CPUs. A peculiarity of this
1435 : !>      exchange is that the border region is rather large (e.g. 20 points) and may overlap with the
1436 : !>      central domains of several CPUs (i.e. next-nearest neighbours)
1437 : ! **************************************************************************************************
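            : ! Illustrative usage sketch (hypothetical variables; presumably reached via the public
            : ! transfer_pw2rs wrapper rather than called directly):
            : !   TYPE(realspace_grid_type) :: rs   ! distributed realspace grid
            : !   TYPE(pw_r3d_rs_type)      :: pw   ! matching real-space pw grid
            : !   CALL transfer_pw2rs_distributed(rs, pw)   ! fills rs%r (including halos) from pw%array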
1438 864 : SUBROUTINE transfer_pw2rs_distributed(rs, pw)
1439 : TYPE(realspace_grid_type), INTENT(IN) :: rs
1440 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw
1441 :
1442 : INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
1443 : n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
1444 864 : INTEGER, ALLOCATABLE, DIMENSION(:) :: dshifts, recv_disps, recv_sizes, &
1445 864 : send_disps, send_sizes, ushifts
1446 1728 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: bounds, recv_tasks, send_tasks
1447 : INTEGER, DIMENSION(2) :: neighbours, pos
1448 : INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
1449 : lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
1450 : LOGICAL, DIMENSION(3) :: halo_swapped
1451 864 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: recv_buf_3d_down, recv_buf_3d_up, &
1452 864 : send_buf_3d_down, send_buf_3d_up
1453 1728 : TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:) :: recv_bufs, send_bufs
1454 864 : TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: recv_reqs, send_reqs
1455 4320 : TYPE(mp_request_type), DIMENSION(4) :: req
1456 :
1457 864 : num_threads = 1
1458 864 : my_id = 0
1459 :
1460 864 : CALL rs_grid_zero(rs)
1461 :
1462 : ! this is the actual redistribution: scatter the local pw slabs into the local parts of the rs grid
1463 :
1464 3456 : ALLOCATE (bounds(0:pw%pw_grid%para%group%num_pe - 1, 1:4))
1465 :
1466 2592 : DO i = 0, pw%pw_grid%para%group%num_pe - 1
1467 5184 : bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
1468 5184 : bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
1469 5184 : bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
1470 6048 : bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
1471 : END DO
1472 :
1473 3456 : ALLOCATE (send_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
1474 2592 : ALLOCATE (send_sizes(0:pw%pw_grid%para%group%num_pe - 1))
1475 1728 : ALLOCATE (send_disps(0:pw%pw_grid%para%group%num_pe - 1))
1476 1728 : ALLOCATE (recv_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
1477 1728 : ALLOCATE (recv_sizes(0:pw%pw_grid%para%group%num_pe - 1))
1478 1728 : ALLOCATE (recv_disps(0:pw%pw_grid%para%group%num_pe - 1))
1479 :
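            : ! the task bounds are initialised to an empty range (lower bound 1, upper bound 0) so that
            : ! ranks whose regions do not overlap simply give zero-trip pack/unpack loops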
1480 16416 : send_tasks = 0
1481 2592 : send_tasks(:, 1) = 1
1482 2592 : send_tasks(:, 2) = 0
1483 2592 : send_tasks(:, 3) = 1
1484 2592 : send_tasks(:, 4) = 0
1485 2592 : send_tasks(:, 5) = 1
1486 2592 : send_tasks(:, 6) = 0
1487 2592 : send_sizes = 0
1488 :
1489 16416 : recv_tasks = 0
1490 2592 : recv_tasks(:, 1) = 1
1491 2592 : recv_tasks(:, 2) = 0
1492 2592 : recv_tasks(:, 3) = 1
1493 2592 : recv_tasks(:, 4) = 0
1494 2592 : recv_tasks(:, 5) = 1
1495 2592 : recv_tasks(:, 6) = 0
1496 2592 : recv_sizes = 0
1497 :
1498 864 : my_rs_rank = rs%desc%my_pos
1499 864 : my_pw_rank = pw%pw_grid%para%group%mepos
1500 :
1501 : ! find the processors that should hold our data
1502 : ! should be part of the rs grid type
1503 : ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
1504 : ! do the recv and send tasks in two separate loops which will
1505 : ! load balance better for OpenMP with large numbers of MPI tasks
1506 :
1507 : ! this is the reverse of rs2pw: what were the sends are now the recvs
1508 :
1509 : !$OMP PARALLEL DO DEFAULT(NONE), &
1510 : !$OMP PRIVATE(coords,idir,pos,lb_send,ub_send), &
1511 864 : !$OMP SHARED(rs,bounds,my_rs_rank,send_tasks,send_sizes,pw)
1512 : DO i = 0, pw%pw_grid%para%group%num_pe - 1
1513 :
1514 : coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
1515 : !calculate the real rs grid points on each processor
1516 : !coords is the part of the grid that rank i actually holds
1517 : DO idir = 1, 3
1518 : pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1519 : pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1520 : lb_send(idir) = pos(1)
1521 : ub_send(idir) = pos(2)
1522 : END DO
1523 :
1524 : IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
1525 : IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
1526 : IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
1527 : IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
1528 :
1529 : send_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
1530 : send_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
1531 : send_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
1532 : send_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
1533 : send_tasks(i, 5) = lb_send(3)
1534 : send_tasks(i, 6) = ub_send(3)
1535 : send_sizes(i) = (send_tasks(i, 2) - send_tasks(i, 1) + 1)* &
1536 : (send_tasks(i, 4) - send_tasks(i, 3) + 1)*(send_tasks(i, 6) - send_tasks(i, 5) + 1)
1537 :
1538 : END DO
1539 : !$OMP END PARALLEL DO
1540 :
1541 3456 : coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
1542 3456 : DO idir = 1, 3
1543 2592 : pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
1544 7776 : pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
1545 2592 : lb_send(idir) = pos(1)
1546 3456 : ub_send(idir) = pos(2)
1547 : END DO
1548 :
1549 864 : lb_recv(:) = lb_send(:)
1550 864 : ub_recv(:) = ub_send(:)
1551 :
1552 : !$OMP PARALLEL DO DEFAULT(NONE), &
1553 864 : !$OMP SHARED(pw,lb_send,ub_send,bounds,recv_tasks,recv_sizes)
1554 : DO j = 0, pw%pw_grid%para%group%num_pe - 1
1555 :
1556 : IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
1557 : IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
1558 : IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
1559 : IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
1560 :
1561 : recv_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
1562 : recv_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
1563 : recv_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
1564 : recv_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
1565 : recv_tasks(j, 5) = lb_send(3)
1566 : recv_tasks(j, 6) = ub_send(3)
1567 : recv_sizes(j) = (recv_tasks(j, 2) - recv_tasks(j, 1) + 1)* &
1568 : (recv_tasks(j, 4) - recv_tasks(j, 3) + 1)*(recv_tasks(j, 6) - recv_tasks(j, 5) + 1)
1569 :
1570 : END DO
1571 : !$OMP END PARALLEL DO
1572 :
1573 864 : send_disps(0) = 0
1574 864 : recv_disps(0) = 0
1575 1728 : DO i = 1, pw%pw_grid%para%group%num_pe - 1
1576 864 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1577 1728 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1578 : END DO
1579 :
1580 6048 : CPASSERT(SUM(recv_sizes) == PRODUCT(ub_recv - lb_recv + 1))
1581 :
1582 4320 : ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
1583 5184 : ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
1584 :
1585 2592 : DO i = 0, rs%desc%group_size - 1
1586 1728 : IF (send_sizes(i) .NE. 0) THEN
1587 4938 : ALLOCATE (send_bufs(i)%array(send_sizes(i)))
1588 : ELSE
1589 82 : NULLIFY (send_bufs(i)%array)
1590 : END IF
1591 2592 : IF (recv_sizes(i) .NE. 0) THEN
1592 4938 : ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
1593 : ELSE
1594 82 : NULLIFY (recv_bufs(i)%array)
1595 : END IF
1596 : END DO
1597 :
1598 4320 : ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
1599 2592 : recv_reqs = mp_request_null
1600 :
1601 2592 : DO i = 0, rs%desc%group_size - 1
1602 2592 : IF (recv_sizes(i) .NE. 0) THEN
1603 1646 : CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
1604 : END IF
1605 : END DO
1606 :
1607 : ! do packing
1608 : !$OMP PARALLEL DO DEFAULT(NONE), &
1609 : !$OMP PRIVATE(k,z,y,x), &
1610 864 : !$OMP SHARED(pw,rs,send_tasks,send_bufs,send_disps)
1611 : DO i = 0, rs%desc%group_size - 1
1612 : k = 0
1613 : DO z = send_tasks(i, 5), send_tasks(i, 6)
1614 : DO y = send_tasks(i, 3), send_tasks(i, 4)
1615 : DO x = send_tasks(i, 1), send_tasks(i, 2)
1616 : k = k + 1
1617 : send_bufs(i)%array(k) = pw%array(x, y, z)
1618 : END DO
1619 : END DO
1620 : END DO
1621 : END DO
1622 : !$OMP END PARALLEL DO
1623 :
1624 4320 : ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
1625 2592 : send_reqs = mp_request_null
1626 :
1627 2592 : DO i = 0, rs%desc%group_size - 1
1628 2592 : IF (send_sizes(i) .NE. 0) THEN
1629 1646 : CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
1630 : END IF
1631 : END DO
1632 :
1633 : ! do unpacking
1634 : ! no OMP here so we can unpack each message as it arrives
1635 :
1636 2592 : DO i = 0, rs%desc%group_size - 1
1637 1728 : IF (recv_sizes(i) .EQ. 0) CYCLE
1638 :
1639 1646 : CALL mp_waitany(recv_reqs, completed)
1640 1646 : k = 0
1641 65956 : DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
1642 4721236 : DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
1643 193190191 : DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
1644 188470683 : k = k + 1
1645 193126745 : rs%r(x, y, z) = recv_bufs(completed - 1)%array(k)
1646 : END DO
1647 : END DO
1648 : END DO
1649 : END DO
1650 :
1651 864 : CALL mp_waitall(send_reqs)
1652 :
1653 864 : DEALLOCATE (recv_reqs)
1654 864 : DEALLOCATE (send_reqs)
1655 :
1656 2592 : DO i = 0, rs%desc%group_size - 1
1657 1728 : IF (ASSOCIATED(send_bufs(i)%array)) THEN
1658 1646 : DEALLOCATE (send_bufs(i)%array)
1659 : END IF
1660 2592 : IF (ASSOCIATED(recv_bufs(i)%array)) THEN
1661 1646 : DEALLOCATE (recv_bufs(i)%array)
1662 : END IF
1663 : END DO
1664 :
1665 864 : DEALLOCATE (send_bufs)
1666 864 : DEALLOCATE (recv_bufs)
1667 864 : DEALLOCATE (send_tasks)
1668 864 : DEALLOCATE (send_sizes)
1669 864 : DEALLOCATE (send_disps)
1670 864 : DEALLOCATE (recv_tasks)
1671 864 : DEALLOCATE (recv_sizes)
1672 864 : DEALLOCATE (recv_disps)
1673 :
1674 : ! now pass the wings (halo/border regions) around to the neighbouring processes
1675 864 : halo_swapped = .FALSE.
1676 :
1677 3456 : DO idir = 1, 3
1678 :
1679 2592 : IF (rs%desc%perd(idir) /= 1) THEN
1680 :
1681 5496 : ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
1682 3664 : ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
1683 5496 : ushifts = 0
1684 5496 : dshifts = 0
1685 :
1686 3664 : DO n_shifts = 1, rs%desc%neighbours(idir)
1687 :
1688 : ! need to take into account the possibly varying widths of neighbouring cells
1689 : ! ushifts and dshifts hold the real size of the neighbouring cells
1690 :
1691 1832 : position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
1692 1832 : neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1693 1832 : dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
1694 :
1695 1832 : position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
1696 1832 : neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
1697 1832 : ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
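            : ! dshifts(n)/ushifts(n) thus hold the cumulative widths of the n nearest cells below/above
            : ! the local cell, which matters when neighbouring cells have different sizes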
1698 :
1699 : ! The border data has to be sent to/received from the neighbors
1700 : ! First we calculate the source and destination processes for the shift
1701 : ! The first shift is "downwards"
1702 :
1703 1832 : CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
1704 :
1705 7328 : lb_send_down(:) = rs%lb_local(:)
1706 7328 : ub_send_down(:) = rs%ub_local(:)
1707 7328 : lb_recv_down(:) = rs%lb_local(:)
1708 7328 : ub_recv_down(:) = rs%ub_local(:)
1709 :
1710 1832 : IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
1711 1832 : lb_send_down(idir) = lb_send_down(idir) + rs%desc%border
1712 : ub_send_down(idir) = MIN(ub_send_down(idir) - rs%desc%border, &
1713 1832 : lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
1714 :
1715 1832 : lb_recv_down(idir) = ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
1716 : ub_recv_down(idir) = MIN(ub_recv_down(idir), &
1717 1832 : ub_recv_down(idir) - rs%desc%border + ushifts(n_shifts))
1718 : ELSE
1719 0 : lb_send_down(idir) = 0
1720 0 : ub_send_down(idir) = -1
1721 0 : lb_recv_down(idir) = 0
1722 0 : ub_recv_down(idir) = -1
1723 : END IF
1724 :
1725 7328 : DO i = 1, 3
1726 7328 : IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1727 1536 : lb_send_down(i) = rs%lb_real(i)
1728 1536 : ub_send_down(i) = rs%ub_real(i)
1729 1536 : lb_recv_down(i) = rs%lb_real(i)
1730 1536 : ub_recv_down(i) = rs%ub_real(i)
1731 : END IF
1732 : END DO
1733 :
1734 : ! allocate the recv buffer
1735 7328 : nn = PRODUCT(ub_recv_down - lb_recv_down + 1)
1736 0 : ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
1737 9160 : lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
1738 :
1739 : ! recv buffer is now ready, so post the receive
1740 1832 : CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
1741 :
1742 : ! now allocate, pack and send the send buffer
1743 7328 : nn = PRODUCT(ub_send_down - lb_send_down + 1)
1744 0 : ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
1745 9160 : lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
1746 :
1747 : !$OMP PARALLEL DEFAULT(NONE), &
1748 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1749 1832 : !$OMP SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
1750 : !$ num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
1751 : !$ my_id = omp_get_thread_num()
1752 : IF (my_id < num_threads) THEN
1753 : lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
1754 : ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
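            : ! e.g. 10 z-planes (lb=1, ub=10) split over 3 threads: thread 0 packs planes 1-3,
            : ! thread 1 packs 4-6, thread 2 packs 7-10; the same chunking is used in the other
            : ! pack/unpack regions below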
1755 :
1756 : send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
1757 : lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
1758 : lb_send_down(2):ub_send_down(2), lb:ub)
1759 : END IF
1760 : !$OMP END PARALLEL
1761 :
1762 1832 : CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
1763 :
1764 : ! Now for the other direction
1765 :
1766 1832 : CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
1767 :
1768 7328 : lb_send_up(:) = rs%lb_local(:)
1769 7328 : ub_send_up(:) = rs%ub_local(:)
1770 7328 : lb_recv_up(:) = rs%lb_local(:)
1771 7328 : ub_recv_up(:) = rs%ub_local(:)
1772 :
1773 1832 : IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
1774 1832 : ub_send_up(idir) = ub_send_up(idir) - rs%desc%border
1775 : lb_send_up(idir) = MAX(lb_send_up(idir) + rs%desc%border, &
1776 1832 : ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
1777 :
1778 1832 : ub_recv_up(idir) = lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
1779 : lb_recv_up(idir) = MAX(lb_recv_up(idir), &
1780 1832 : lb_recv_up(idir) + rs%desc%border - dshifts(n_shifts))
1781 : ELSE
1782 0 : lb_send_up(idir) = 0
1783 0 : ub_send_up(idir) = -1
1784 0 : lb_recv_up(idir) = 0
1785 0 : ub_recv_up(idir) = -1
1786 : END IF
1787 :
1788 7328 : DO i = 1, 3
1789 7328 : IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
1790 1536 : lb_send_up(i) = rs%lb_real(i)
1791 1536 : ub_send_up(i) = rs%ub_real(i)
1792 1536 : lb_recv_up(i) = rs%lb_real(i)
1793 1536 : ub_recv_up(i) = rs%ub_real(i)
1794 : END IF
1795 : END DO
1796 :
1797 : ! allocate the recv buffer
1798 7328 : nn = PRODUCT(ub_recv_up - lb_recv_up + 1)
1799 0 : ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
1800 9160 : lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
1801 :
1802 : ! recv buffer is now ready, so post the receive
1803 :
1804 1832 : CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
1805 :
1806 : ! now allocate, pack and send the send buffer
1807 7328 : nn = PRODUCT(ub_send_up - lb_send_up + 1)
1808 0 : ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
1809 9160 : lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
1810 :
1811 : !$OMP PARALLEL DEFAULT(NONE), &
1812 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1813 1832 : !$OMP SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
1814 : !$ num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
1815 : !$ my_id = omp_get_thread_num()
1816 : IF (my_id < num_threads) THEN
1817 : lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
1818 : ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
1819 :
1820 : send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
1821 : lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
1822 : lb_send_up(2):ub_send_up(2), lb:ub)
1823 : END IF
1824 : !$OMP END PARALLEL
1825 :
1826 1832 : CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
1827 :
1828 : ! wait for a recv to complete, then we can unpack
1829 :
1830 5496 : DO i = 1, 2
1831 :
1832 3664 : CALL mp_waitany(req(1:2), completed)
1833 :
1834 5496 : IF (completed .EQ. 1) THEN
1835 :
1836 : ! only some procs may need later shifts
1837 1832 : IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
1838 :
1839 : ! Add the data to the RS Grid
1840 : !$OMP PARALLEL DEFAULT(NONE), &
1841 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1842 1832 : !$OMP SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
1843 : !$ num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
1844 : !$ my_id = omp_get_thread_num()
1845 : IF (my_id < num_threads) THEN
1846 : lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
1847 : ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
1848 :
1849 : rs%r(lb_recv_down(1):ub_recv_down(1), lb_recv_down(2):ub_recv_down(2), &
1850 : lb:ub) = recv_buf_3d_down(:, :, lb:ub)
1851 : END IF
1852 : !$OMP END PARALLEL
1853 : END IF
1854 :
1855 1832 : DEALLOCATE (recv_buf_3d_down)
1856 : ELSE
1857 :
1858 : ! only some procs may need later shifts
1859 1832 : IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
1860 :
1861 : ! Add the data to the RS Grid
1862 : !$OMP PARALLEL DEFAULT(NONE), &
1863 : !$OMP PRIVATE(lb,ub,my_id,NUM_THREADS), &
1864 1832 : !$OMP SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
1865 : !$ num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
1866 : !$ my_id = omp_get_thread_num()
1867 : IF (my_id < num_threads) THEN
1868 : lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
1869 : ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
1870 :
1871 : rs%r(lb_recv_up(1):ub_recv_up(1), lb_recv_up(2):ub_recv_up(2), &
1872 : lb:ub) = recv_buf_3d_up(:, :, lb:ub)
1873 : END IF
1874 : !$OMP END PARALLEL
1875 : END IF
1876 :
1877 1832 : DEALLOCATE (recv_buf_3d_up)
1878 : END IF
1879 : END DO
1880 :
1881 1832 : CALL mp_waitall(req(3:4))
1882 :
1883 1832 : DEALLOCATE (send_buf_3d_down)
1884 5496 : DEALLOCATE (send_buf_3d_up)
1885 : END DO
1886 :
1887 1832 : DEALLOCATE (ushifts)
1888 1832 : DEALLOCATE (dshifts)
1889 : END IF
1890 :
1891 3456 : halo_swapped(idir) = .TRUE.
1892 :
1893 : END DO
1894 :
1895 864 : END SUBROUTINE transfer_pw2rs_distributed
1896 :
1897 : ! **************************************************************************************************
1898 : !> \brief Initialize grid to zero
1899 : !> \param rs ...
1900 : !> \par History
1901 : !> none
1902 : !> \author JGH (23-Mar-2002)
1903 : ! **************************************************************************************************
1904 343264 : SUBROUTINE rs_grid_zero(rs)
1905 :
1906 : TYPE(realspace_grid_type), INTENT(IN) :: rs
1907 :
1908 : CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_zero'
1909 :
1910 : INTEGER :: handle, i, j, k, l(3), u(3)
1911 :
1912 343264 : CALL timeset(routineN, handle)
1913 1029792 : l(1) = LBOUND(rs%r, 1); l(2) = LBOUND(rs%r, 2); l(3) = LBOUND(rs%r, 3)
1914 1029792 : u(1) = UBOUND(rs%r, 1); u(2) = UBOUND(rs%r, 2); u(3) = UBOUND(rs%r, 3)
1915 : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1916 : !$OMP PRIVATE(i,j,k) &
1917 343264 : !$OMP SHARED(rs,l,u)
1918 : DO k = l(3), u(3)
1919 : DO j = l(2), u(2)
1920 : DO i = l(1), u(1)
1921 : rs%r(i, j, k) = 0.0_dp
1922 : END DO
1923 : END DO
1924 : END DO
1925 : !$OMP END PARALLEL DO
1926 343264 : CALL timestop(handle)
1927 :
1928 343264 : END SUBROUTINE rs_grid_zero
1929 :
1930 : ! **************************************************************************************************
1931 : !> \brief rs1(i) = rs1(i) + scalar*rs2(i)*rs3(i)
1932 : !> \param rs1 ...
1933 : !> \param rs2 ...
1934 : !> \param rs3 ...
1935 : !> \param scalar ...
1936 : !> \par History
1937 : !> none
1938 : !> \author
1939 : ! **************************************************************************************************
1940 1404 : SUBROUTINE rs_grid_mult_and_add(rs1, rs2, rs3, scalar)
1941 :
1942 : TYPE(realspace_grid_type), INTENT(IN) :: rs1, rs2, rs3
1943 : REAL(dp), INTENT(IN) :: scalar
1944 :
1945 : CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_mult_and_add'
1946 :
1947 : INTEGER :: handle, i, j, k, l(3), u(3)
1948 :
1949 : !-----------------------------------------------------------------------------!
1950 :
1951 1404 : CALL timeset(routineN, handle)
1952 1404 : IF (scalar .NE. 0.0_dp) THEN
1953 4212 : l(1) = LBOUND(rs1%r, 1); l(2) = LBOUND(rs1%r, 2); l(3) = LBOUND(rs1%r, 3)
1954 4212 : u(1) = UBOUND(rs1%r, 1); u(2) = UBOUND(rs1%r, 2); u(3) = UBOUND(rs1%r, 3)
1955 : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
1956 : !$OMP PRIVATE(i,j,k) &
1957 1404 : !$OMP SHARED(rs1,rs2,rs3,scalar,l,u)
1958 : DO k = l(3), u(3)
1959 : DO j = l(2), u(2)
1960 : DO i = l(1), u(1)
1961 : rs1%r(i, j, k) = rs1%r(i, j, k) + scalar*rs2%r(i, j, k)*rs3%r(i, j, k)
1962 : END DO
1963 : END DO
1964 : END DO
1965 : !$OMP END PARALLEL DO
1966 : END IF
1967 1404 : CALL timestop(handle)
1968 1404 : END SUBROUTINE rs_grid_mult_and_add
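            : ! Illustrative call (hypothetical grids sharing the same bounds): accumulate a weighted
            : ! product of two grids into a third,
            : !   CALL rs_grid_mult_and_add(rs_acc, rs_a, rs_b, 0.5_dp)   ! rs_acc = rs_acc + 0.5*rs_a*rs_b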
1969 :
1970 : ! **************************************************************************************************
1971 : !> \brief Set box matrix info for real space grid
1972 : !> This is needed for variable cell simulations
1973 : !> \param pw_grid ...
1974 : !> \param rs ...
1975 : !> \par History
1976 : !> none
1977 : !> \author JGH (15-May-2007)
1978 : ! **************************************************************************************************
1979 176154 : SUBROUTINE rs_grid_set_box(pw_grid, rs)
1980 :
1981 : TYPE(pw_grid_type), INTENT(IN), TARGET :: pw_grid
1982 : TYPE(realspace_grid_type), INTENT(IN) :: rs
1983 :
1984 176154 : CPASSERT(ASSOCIATED(rs%desc%pw, pw_grid))
1985 2290002 : rs%desc%dh = pw_grid%dh
1986 2290002 : rs%desc%dh_inv = pw_grid%dh_inv
1987 :
1988 176154 : END SUBROUTINE rs_grid_set_box
1989 :
1990 : ! **************************************************************************************************
1991 : !> \brief retains the given rs grid descriptor (see doc/ReferenceCounting.html)
1992 : !> \param rs_desc the grid descriptor to retain
1993 : !> \par History
1994 : !> 04.2009 created [Iain Bethune]
1995 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
1996 : ! **************************************************************************************************
1997 236948 : SUBROUTINE rs_grid_retain_descriptor(rs_desc)
1998 : TYPE(realspace_grid_desc_type), INTENT(INOUT) :: rs_desc
1999 :
2000 236948 : CPASSERT(rs_desc%ref_count > 0)
2001 236948 : rs_desc%ref_count = rs_desc%ref_count + 1
2002 236948 : END SUBROUTINE rs_grid_retain_descriptor
2003 :
2004 : ! **************************************************************************************************
2005 : !> \brief releases the given rs grid (see doc/ReferenceCounting.html)
2006 : !> \param rs_grid the rs grid to release
2007 : !> \par History
2008 : !> 03.2003 created [fawzi]
2009 : !> \author fawzi
2010 : ! **************************************************************************************************
2011 236326 : SUBROUTINE rs_grid_release(rs_grid)
2012 : TYPE(realspace_grid_type), INTENT(INOUT) :: rs_grid
2013 :
2014 236326 : CALL rs_grid_release_descriptor(rs_grid%desc)
2015 :
2016 236326 : CALL offload_free_buffer(rs_grid%buffer)
2017 236326 : NULLIFY (rs_grid%r)
2018 :
2019 236326 : IF (ALLOCATED(rs_grid%px)) DEALLOCATE (rs_grid%px)
2020 236326 : IF (ALLOCATED(rs_grid%py)) DEALLOCATE (rs_grid%py)
2021 236326 : IF (ALLOCATED(rs_grid%pz)) DEALLOCATE (rs_grid%pz)
2022 236326 : END SUBROUTINE rs_grid_release
2023 :
2024 : ! **************************************************************************************************
2025 : !> \brief releases the given rs grid descriptor (see doc/ReferenceCounting.html)
2026 : !> \param rs_desc the rs grid descriptor to release
2027 : !> \par History
2028 : !> 04.2009 created [Iain Bethune]
2029 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2030 : ! **************************************************************************************************
2031 272285 : SUBROUTINE rs_grid_release_descriptor(rs_desc)
2032 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2033 :
2034 272285 : IF (ASSOCIATED(rs_desc)) THEN
2035 268666 : CPASSERT(rs_desc%ref_count > 0)
2036 268666 : rs_desc%ref_count = rs_desc%ref_count - 1
2037 268666 : IF (rs_desc%ref_count == 0) THEN
2038 :
2039 31718 : CALL pw_grid_release(rs_desc%pw)
2040 :
2041 31718 : IF (rs_desc%parallel) THEN
2042 : ! release the group communicator
2043 27636 : CALL rs_desc%group%free()
2044 :
2045 27636 : DEALLOCATE (rs_desc%virtual2real)
2046 27636 : DEALLOCATE (rs_desc%real2virtual)
2047 : END IF
2048 :
2049 31718 : IF (rs_desc%distributed) THEN
2050 158 : DEALLOCATE (rs_desc%rank2coord)
2051 158 : DEALLOCATE (rs_desc%coord2rank)
2052 158 : DEALLOCATE (rs_desc%lb_global)
2053 158 : DEALLOCATE (rs_desc%ub_global)
2054 158 : DEALLOCATE (rs_desc%x2coord)
2055 158 : DEALLOCATE (rs_desc%y2coord)
2056 158 : DEALLOCATE (rs_desc%z2coord)
2057 : END IF
2058 :
2059 31718 : DEALLOCATE (rs_desc)
2060 : END IF
2061 : END IF
2062 272285 : NULLIFY (rs_desc)
2063 272285 : END SUBROUTINE rs_grid_release_descriptor
2064 :
2065 : ! **************************************************************************************************
2066 : !> \brief emulates the function of an MPI_cart_shift operation, but the shift is
2067 : !> done in virtual coordinates, and the corresponding real ranks are returned
2068 : !> \param rs_grid ...
2069 : !> \param dir ...
2070 : !> \param disp ...
2071 : !> \param source ...
2072 : !> \param dest ...
2073 : !> \par History
2074 : !> 04.2009 created [Iain Bethune]
2075 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2076 : ! **************************************************************************************************
2077 7320 : PURE SUBROUTINE cart_shift(rs_grid, dir, disp, source, dest)
2078 :
2079 : TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2080 : INTEGER, INTENT(IN) :: dir, disp
2081 : INTEGER, INTENT(OUT) :: source, dest
2082 :
2083 : INTEGER, DIMENSION(3) :: shift_coords
2084 :
2085 29280 : shift_coords = rs_grid%desc%virtual_group_coor
2086 7320 : shift_coords(dir) = MODULO(shift_coords(dir) + disp, rs_grid%desc%group_dim(dir))
2087 7320 : dest = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
2088 29280 : shift_coords = rs_grid%desc%virtual_group_coor
2089 7320 : shift_coords(dir) = MODULO(shift_coords(dir) - disp, rs_grid%desc%group_dim(dir))
2090 7320 : source = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
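            : ! worked example: with group_dim(dir) = 4 and virtual coordinate 0 in dir, disp = +1 gives
            : ! virtual destination coordinate 1 and virtual source coordinate 3 (wrap-around); both are
            : ! then translated back to real ranks via virtual2real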
2091 :
2092 7320 : END SUBROUTINE
2093 :
2094 : ! **************************************************************************************************
2095 : !> \brief returns the maximum number of points in the local grid of any process
2096 : !> to account for the case where the grid may later be reordered
2097 : !> \param desc ...
2098 : !> \return ...
2099 : !> \par History
2100 : !> 10.2011 created [Iain Bethune]
2101 : ! **************************************************************************************************
2102 0 : FUNCTION rs_grid_max_ngpts(desc) RESULT(max_ngpts)
2103 : TYPE(realspace_grid_desc_type), INTENT(IN) :: desc
2104 : INTEGER :: max_ngpts
2105 :
2106 : CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_max_ngpts'
2107 :
2108 : INTEGER :: handle, i
2109 : INTEGER, DIMENSION(3) :: lb, ub
2110 :
2111 0 : CALL timeset(routineN, handle)
2112 :
2113 0 : max_ngpts = 0
2114 0 : IF ((desc%pw%para%mode == PW_MODE_LOCAL) .OR. &
2115 : (ALL(desc%group_dim == 1))) THEN
2116 0 : CPASSERT(PRODUCT(INT(desc%npts, KIND=int_8)) < HUGE(1))
2117 0 : max_ngpts = PRODUCT(desc%npts)
2118 : ELSE
2119 0 : DO i = 0, desc%group_size - 1
2120 0 : lb = desc%lb_global(:, i)
2121 0 : ub = desc%ub_global(:, i)
2122 0 : lb = lb - desc%border*(1 - desc%perd)
2123 0 : ub = ub + desc%border*(1 - desc%perd)
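            : ! the border (halo) only contributes in directions that are actually decomposed, i.e. where desc%perd /= 1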
2124 0 : CPASSERT(PRODUCT(INT(ub - lb + 1, KIND=int_8)) < HUGE(1))
2125 0 : max_ngpts = MAX(max_ngpts, PRODUCT(ub - lb + 1))
2126 : END DO
2127 : END IF
2128 :
2129 0 : CALL timestop(handle)
2130 :
2131 0 : END FUNCTION rs_grid_max_ngpts
2132 :
2133 : ! **************************************************************************************************
2134 : !> \brief decides whether the Gaussian centred at ra should be mapped by this process
2135 : !> \param rs_grid ...
2136 : !> \param h_inv ...
2137 : !> \param ra ...
2138 : !> \param offset ...
2139 : !> \param group_size ...
2140 : !> \param my_pos ...
2141 : !> \return ...
2142 : ! **************************************************************************************************
2143 1555487 : PURE LOGICAL FUNCTION map_gaussian_here(rs_grid, h_inv, ra, offset, group_size, my_pos) RESULT(res)
2144 : TYPE(realspace_grid_type), INTENT(IN) :: rs_grid
2145 : REAL(KIND=dp), DIMENSION(3, 3), INTENT(IN) :: h_inv
2146 : REAL(KIND=dp), DIMENSION(3), INTENT(IN) :: ra
2147 : INTEGER, INTENT(IN), OPTIONAL :: offset, group_size, my_pos
2148 :
2149 : INTEGER :: dir, lb(3), location(3), tp(3), ub(3)
2150 :
2151 1555487 : res = .FALSE.
2152 :
2153 6221940 : IF (.NOT. ALL(rs_grid%desc%perd == 1)) THEN
2154 32 : DO dir = 1, 3
2155 : ! bounds of the local grid excluding the halo ('wings'), which only exists in decomposed (perd /= 1) directions
2156 96 : tp(dir) = FLOOR(DOT_PRODUCT(h_inv(dir, :), ra)*rs_grid%desc%npts(dir))
2157 24 : tp(dir) = MODULO(tp(dir), rs_grid%desc%npts(dir))
2158 24 : IF (rs_grid%desc%perd(dir) .NE. 1) THEN
2159 8 : lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border
2160 8 : ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border
2161 : ELSE
2162 16 : lb(dir) = rs_grid%lb_local(dir)
2163 16 : ub(dir) = rs_grid%ub_local(dir)
2164 : END IF
2165 : ! distributed grid, only map if it is local to the grid
2166 32 : location(dir) = tp(dir) + rs_grid%desc%lb(dir)
2167 : END DO
2168 60 : IF (ALL(lb(:) <= location(:)) .AND. ALL(location(:) <= ub(:))) THEN
2169 4 : res = .TRUE.
2170 : END IF
2171 : ELSE
2172 1555479 : IF (PRESENT(offset) .AND. PRESENT(group_size) .AND. PRESENT(my_pos)) THEN
2173 : ! not distributed, just a round-robin distribution over the full set of CPUs
2174 1555479 : IF (MODULO(offset, group_size) == my_pos) res = .TRUE.
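            : ! e.g. with group_size = 4 and my_pos = 2, offsets 2, 6, 10, ... are mapped by this process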
2175 : END IF
2176 : END IF
2177 :
2178 1555487 : END FUNCTION map_gaussian_here
2179 :
2180 0 : END MODULE realspace_grid_types