Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief generate the tasks lists used by collocate and integrate routines
10 : !> \par History
11 : !> 01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate
12 : !> \author Joost VandeVondele
13 : ! **************************************************************************************************
14 : MODULE task_list_methods
15 : USE offload_api, ONLY: offload_create_buffer, offload_buffer_type
16 : USE grid_api, ONLY: grid_create_basis_set, grid_create_task_list
17 : USE ao_util, ONLY: exp_radius_very_extended
18 : USE basis_set_types, ONLY: get_gto_basis_set, &
19 : gto_basis_set_p_type, &
20 : gto_basis_set_type
21 : USE cell_types, ONLY: cell_type, &
22 : pbc
23 : USE cp_control_types, ONLY: dft_control_type
24 : USE cube_utils, ONLY: compute_cube_center, &
25 : cube_info_type, &
26 : return_cube, &
27 : return_cube_nonortho
28 : USE cp_dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
29 : dbcsr_finalize, &
30 : dbcsr_get_block_p, &
31 : dbcsr_get_info, &
32 : dbcsr_p_type, &
33 : dbcsr_put_block, &
34 : dbcsr_type, &
35 : dbcsr_work_create
36 : USE gaussian_gridlevels, ONLY: gaussian_gridlevel, &
37 : gridlevel_info_type
38 : USE kinds, ONLY: default_string_length, &
39 : dp, &
40 : int_8
41 : USE kpoint_types, ONLY: get_kpoint_info, &
42 : kpoint_type
43 : USE memory_utilities, ONLY: reallocate
44 : USE message_passing, ONLY: &
45 : mp_comm_type
46 : USE particle_types, ONLY: particle_type
47 : USE particle_methods, ONLY: get_particle_set
48 : USE pw_env_types, ONLY: pw_env_get, &
49 : pw_env_type
50 : USE qs_kind_types, ONLY: get_qs_kind, &
51 : qs_kind_type
52 : USE qs_ks_types, ONLY: get_ks_env, &
53 : qs_ks_env_type
54 : USE qs_neighbor_list_types, ONLY: neighbor_list_set_p_type
55 : USE realspace_grid_types, ONLY: realspace_grid_desc_p_type, &
56 : realspace_grid_desc_type, &
57 : rs_grid_create, &
58 : rs_grid_locate_rank, &
59 : rs_grid_release, &
60 : rs_grid_reorder_ranks, realspace_grid_type
61 : USE task_list_types, ONLY: deserialize_task, &
62 : reallocate_tasks, &
63 : serialize_task, &
64 : task_list_type, &
65 : atom_pair_type, &
66 : task_size_in_int8, &
67 : task_type
68 : USE util, ONLY: sort
69 :
70 : !$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
71 : !$ omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
72 : #include "./base/base_uses.f90"
73 :
74 : #:include './common/array_sort.fypp'
75 :
76 : IMPLICIT NONE
77 :
78 : LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
79 :
80 : PRIVATE
81 :
82 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
83 :
84 : PUBLIC :: generate_qs_task_list, &
85 : task_list_inner_loop
86 : PUBLIC :: distribute_tasks, &
87 : rs_distribute_matrix, &
88 : rs_scatter_matrices, &
89 : rs_gather_matrices, &
90 : rs_copy_to_buffer, &
91 : rs_copy_to_matrices
92 :
93 : CONTAINS
94 :
95 : ! **************************************************************************************************
96 : !> \brief ...
97 : !> \param ks_env ...
98 : !> \param task_list ...
99 : !> \param reorder_rs_grid_ranks Flag that indicates if this routine should
100 : !> or should not overwrite the rs descriptor (see comment below)
101 : !> \param skip_load_balance_distributed ...
102 : !> \param soft_valid ...
103 : !> \param basis_type ...
104 : !> \param pw_env_external ...
105 : !> \param sab_orb_external ...
106 : !> \par History
107 : !> 01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
108 : !> 04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
109 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
110 : !> 06.2015 adjusted to be used with multiple images (k-points) [JGH]
111 : !> \note If this routine is called several times with different task lists,
112 : !> the default behaviour is to re-optimize the grid ranks and overwrite
113 : !> the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
114 : !> of performing a new optimization by leaving the rank order in
115 : !> its current state.
116 : ! **************************************************************************************************
117 13424 : SUBROUTINE generate_qs_task_list(ks_env, task_list, &
118 : reorder_rs_grid_ranks, skip_load_balance_distributed, &
119 : soft_valid, basis_type, pw_env_external, sab_orb_external)
120 :
121 : TYPE(qs_ks_env_type), POINTER :: ks_env
122 : TYPE(task_list_type), POINTER :: task_list
123 : LOGICAL, INTENT(IN) :: reorder_rs_grid_ranks, &
124 : skip_load_balance_distributed
125 : LOGICAL, INTENT(IN), OPTIONAL :: soft_valid
126 : CHARACTER(LEN=*), INTENT(IN), OPTIONAL :: basis_type
127 : TYPE(pw_env_type), OPTIONAL, POINTER :: pw_env_external
128 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
129 : OPTIONAL, POINTER :: sab_orb_external
130 :
131 : CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list'
132 : INTEGER, PARAMETER :: max_tasks = 2000
133 :
134 : CHARACTER(LEN=default_string_length) :: my_basis_type
135 : INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
136 : ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
137 : jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
138 13424 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: blocks
139 : INTEGER, DIMENSION(3) :: cellind
140 13424 : INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
141 13424 : npgfb, nsgf
142 13424 : INTEGER, DIMENSION(:, :, :), POINTER :: cell_to_index
143 : LOGICAL :: dokp, my_soft
144 : REAL(KIND=dp) :: kind_radius_a, kind_radius_b
145 : REAL(KIND=dp), DIMENSION(3) :: ra, rab
146 13424 : REAL(KIND=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
147 13424 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
148 : TYPE(cell_type), POINTER :: cell
149 13424 : TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
150 : TYPE(dft_control_type), POINTER :: dft_control
151 : TYPE(gridlevel_info_type), POINTER :: gridlevel_info
152 13424 : TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER :: basis_set_list
153 : TYPE(gto_basis_set_type), POINTER :: basis_set_a, basis_set_b, orb_basis_set
154 : TYPE(kpoint_type), POINTER :: kpoints
155 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
156 13424 : POINTER :: sab_orb
157 13424 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
158 : TYPE(pw_env_type), POINTER :: pw_env
159 13424 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
160 : TYPE(qs_kind_type), POINTER :: qs_kind
161 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
162 13424 : POINTER :: rs_descs
163 13424 : TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
164 13424 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
165 :
166 13424 : CALL timeset(routineN, handle)
167 :
168 : CALL get_ks_env(ks_env, &
169 : qs_kind_set=qs_kind_set, &
170 : cell=cell, &
171 : particle_set=particle_set, &
172 13424 : dft_control=dft_control)
173 :
174 : ! OPTION 1) basis is set through input
175 : ! OPTION 2) soft orb basis is requested
176 : ! OPTION 3) by default, the full orb basis density is calculated
177 13424 : my_soft = .FALSE.
178 13424 : IF (PRESENT(soft_valid)) my_soft = soft_valid
179 13424 : IF (PRESENT(basis_type)) THEN
180 1576 : CPASSERT(.NOT. my_soft)
181 1576 : my_basis_type = basis_type
182 11848 : ELSEIF (my_soft) THEN
183 1598 : my_basis_type = "ORB_SOFT"
184 : ELSE
185 10250 : my_basis_type = "ORB"
186 : END IF
187 :
188 13424 : CALL get_ks_env(ks_env, sab_orb=sab_orb)
189 13424 : IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
190 :
191 13424 : CALL get_ks_env(ks_env, pw_env=pw_env)
192 13424 : IF (PRESENT(pw_env_external)) pw_env => pw_env_external
193 13424 : CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
194 :
195 : ! *** assign from pw_env
196 13424 : gridlevel_info => pw_env%gridlevel_info
197 13424 : cube_info => pw_env%cube_info
198 :
199 : ! find maximum numbers
200 13424 : nkind = SIZE(qs_kind_set)
201 13424 : natoms = SIZE(particle_set)
202 13424 : maxset = 0
203 13424 : maxpgf = 0
204 37511 : DO ikind = 1, nkind
205 24087 : qs_kind => qs_kind_set(ikind)
206 : CALL get_qs_kind(qs_kind=qs_kind, &
207 24087 : basis_set=orb_basis_set, basis_type=my_basis_type)
208 :
209 24087 : IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE
210 24087 : CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
211 :
212 24087 : maxset = MAX(nseta, maxset)
213 87850 : maxpgf = MAX(MAXVAL(npgfa), maxpgf)
214 : END DO
215 :
216 : ! kpoint related
217 13424 : nimages = dft_control%nimages
218 13424 : IF (nimages > 1) THEN
219 260 : dokp = .TRUE.
220 260 : NULLIFY (kpoints)
221 260 : CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
222 260 : CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
223 : ELSE
224 13164 : dokp = .FALSE.
225 13164 : NULLIFY (cell_to_index)
226 : END IF
227 :
228 : ! free the atom_pair lists if allocated
229 13424 : IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
230 13424 : IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
231 :
232 : ! construct a list of all tasks
233 13424 : IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
234 8038 : CALL reallocate_tasks(task_list%tasks, max_tasks)
235 : END IF
236 13424 : task_list%ntasks = 0
237 13424 : curr_tasks = SIZE(task_list%tasks)
238 :
239 64359 : ALLOCATE (basis_set_list(nkind))
240 37511 : DO ikind = 1, nkind
241 24087 : qs_kind => qs_kind_set(ikind)
242 : CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
243 24087 : basis_type=my_basis_type)
244 37511 : IF (ASSOCIATED(basis_set_a)) THEN
245 24087 : basis_set_list(ikind)%gto_basis_set => basis_set_a
246 : ELSE
247 0 : NULLIFY (basis_set_list(ikind)%gto_basis_set)
248 : END IF
249 : END DO
250 : !!$OMP PARALLEL DEFAULT(NONE) &
251 : !!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info, &
252 : !!$OMP curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
253 : !!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
254 : !!$OMP la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
255 : !!$OMP lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
256 : !!$OMP cindex, slot)
257 : ! Loop over neighbor list
258 : !!$OMP DO SCHEDULE(GUIDED)
259 1356250 : DO slot = 1, sab_orb(1)%nl_size
260 1342826 : ikind = sab_orb(1)%nlist_task(slot)%ikind
261 1342826 : jkind = sab_orb(1)%nlist_task(slot)%jkind
262 1342826 : iatom = sab_orb(1)%nlist_task(slot)%iatom
263 1342826 : jatom = sab_orb(1)%nlist_task(slot)%jatom
264 5371304 : rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
265 5371304 : cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)
266 :
267 1342826 : basis_set_a => basis_set_list(ikind)%gto_basis_set
268 1342826 : IF (.NOT. ASSOCIATED(basis_set_a)) CYCLE
269 1342826 : basis_set_b => basis_set_list(jkind)%gto_basis_set
270 1342826 : IF (.NOT. ASSOCIATED(basis_set_b)) CYCLE
271 1342826 : ra(:) = pbc(particle_set(iatom)%r, cell)
272 : ! basis ikind
273 1342826 : la_max => basis_set_a%lmax
274 1342826 : la_min => basis_set_a%lmin
275 1342826 : npgfa => basis_set_a%npgf
276 1342826 : nseta = basis_set_a%nset
277 1342826 : rpgfa => basis_set_a%pgf_radius
278 1342826 : set_radius_a => basis_set_a%set_radius
279 1342826 : kind_radius_a = basis_set_a%kind_radius
280 1342826 : zeta => basis_set_a%zet
281 : ! basis jkind
282 1342826 : lb_max => basis_set_b%lmax
283 1342826 : lb_min => basis_set_b%lmin
284 1342826 : npgfb => basis_set_b%npgf
285 1342826 : nsetb = basis_set_b%nset
286 1342826 : rpgfb => basis_set_b%pgf_radius
287 1342826 : set_radius_b => basis_set_b%set_radius
288 1342826 : kind_radius_b = basis_set_b%kind_radius
289 1342826 : zetb => basis_set_b%zet
290 :
291 1342826 : IF (dokp) THEN
292 146456 : cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
293 : ELSE
294 1196370 : cindex = 1
295 : END IF
296 :
297 : CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
298 : rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
299 : iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
300 : set_radius_a, set_radius_b, ra, rab, &
301 1356250 : la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
302 :
303 : END DO
304 : !!$OMP END PARALLEL
305 :
306 : ! redistribute the task list so that all tasks map on the local rs grids
307 : CALL distribute_tasks( &
308 : rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
309 : tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
310 : atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., &
311 : reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
312 13424 : skip_load_balance_distributed=skip_load_balance_distributed)
313 :
314 : ! compute offsets for rs_scatter_matrix / rs_copy_matrix
315 40272 : ALLOCATE (nsgf(natoms))
316 13424 : CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
317 13424 : IF (ASSOCIATED(task_list%atom_pair_send)) THEN
318 : ! only needed when there is a distributed grid
319 : CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
320 : nsgf=nsgf, &
321 : group_size=rs_descs(1)%rs_desc%group_size, &
322 : pair_offsets=task_list%pair_offsets_send, &
323 : rank_offsets=task_list%rank_offsets_send, &
324 : rank_sizes=task_list%rank_sizes_send, &
325 48 : buffer_size=task_list%buffer_size_send)
326 : END IF
327 : CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
328 : nsgf=nsgf, &
329 : group_size=rs_descs(1)%rs_desc%group_size, &
330 : pair_offsets=task_list%pair_offsets_recv, &
331 : rank_offsets=task_list%rank_offsets_recv, &
332 : rank_sizes=task_list%rank_sizes_recv, &
333 13424 : buffer_size=task_list%buffer_size_recv)
334 13424 : DEALLOCATE (basis_set_list, nsgf)
335 :
336 : ! If the rank order has changed, reallocate any of the distributed rs_grids
337 13424 : IF (reorder_rs_grid_ranks) THEN
338 57976 : DO i = 1, gridlevel_info%ngrid_levels
339 57976 : IF (rs_descs(i)%rs_desc%distributed) THEN
340 54 : CALL rs_grid_release(rs_grids(i))
341 54 : CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
342 : END IF
343 : END DO
344 : END IF
345 :
346 : CALL create_grid_task_list(task_list=task_list, &
347 : qs_kind_set=qs_kind_set, &
348 : particle_set=particle_set, &
349 : cell=cell, &
350 : basis_type=my_basis_type, &
351 13424 : rs_grids=rs_grids)
352 :
353 : ! Now we have the final list of tasks, setup the task_list with the
354 : ! data needed for the loops in integrate_v/calculate_rho
355 :
356 13424 : IF (ASSOCIATED(task_list%taskstart)) THEN
357 5386 : DEALLOCATE (task_list%taskstart)
358 : END IF
359 13424 : IF (ASSOCIATED(task_list%taskstop)) THEN
360 5386 : DEALLOCATE (task_list%taskstop)
361 : END IF
362 13424 : IF (ASSOCIATED(task_list%npairs)) THEN
363 5386 : DEALLOCATE (task_list%npairs)
364 : END IF
365 :
366 : ! First, count the number of unique atom pairs per grid level
367 :
368 40272 : ALLOCATE (task_list%npairs(SIZE(rs_descs)))
369 :
370 13424 : iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
371 13424 : ipair = 0
372 66668 : task_list%npairs = 0
373 :
374 7699229 : DO i = 1, task_list%ntasks
375 7685805 : igrid_level = task_list%tasks(i)%grid_level
376 7685805 : img = task_list%tasks(i)%image
377 7685805 : iatom = task_list%tasks(i)%iatom
378 7685805 : jatom = task_list%tasks(i)%jatom
379 7685805 : iset = task_list%tasks(i)%iset
380 7685805 : jset = task_list%tasks(i)%jset
381 7685805 : ipgf = task_list%tasks(i)%ipgf
382 7685805 : jpgf = task_list%tasks(i)%jpgf
383 7699229 : IF (igrid_level .NE. igrid_level_old) THEN
384 36390 : IF (igrid_level_old .NE. -1) THEN
385 23195 : task_list%npairs(igrid_level_old) = ipair
386 : END IF
387 : ipair = 1
388 : igrid_level_old = igrid_level
389 : iatom_old = iatom
390 : jatom_old = jatom
391 : img_old = img
392 7649415 : ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN
393 495135 : ipair = ipair + 1
394 495135 : iatom_old = iatom
395 495135 : jatom_old = jatom
396 495135 : img_old = img
397 : END IF
398 : END DO
399 : ! Take care of the last iteration
400 13424 : IF (task_list%ntasks /= 0) THEN
401 13195 : task_list%npairs(igrid_level) = ipair
402 : END IF
403 :
404 : ! Second, for each atom pair, find the indices in the task list
405 : ! of the first and last task
406 :
407 : ! Array sized for worst case
408 106711 : ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs), SIZE(rs_descs)))
409 106711 : ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs), SIZE(rs_descs)))
410 :
411 13424 : iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
412 13424 : ipair = 0
413 1116993 : task_list%taskstart = 0
414 1116993 : task_list%taskstop = 0
415 :
416 7699229 : DO i = 1, task_list%ntasks
417 7685805 : igrid_level = task_list%tasks(i)%grid_level
418 7685805 : img = task_list%tasks(i)%image
419 7685805 : iatom = task_list%tasks(i)%iatom
420 7685805 : jatom = task_list%tasks(i)%jatom
421 7685805 : iset = task_list%tasks(i)%iset
422 7685805 : jset = task_list%tasks(i)%jset
423 7685805 : ipgf = task_list%tasks(i)%ipgf
424 7685805 : jpgf = task_list%tasks(i)%jpgf
425 7699229 : IF (igrid_level .NE. igrid_level_old) THEN
426 36390 : IF (igrid_level_old .NE. -1) THEN
427 23195 : task_list%taskstop(ipair, igrid_level_old) = i - 1
428 : END IF
429 36390 : ipair = 1
430 36390 : task_list%taskstart(ipair, igrid_level) = i
431 36390 : igrid_level_old = igrid_level
432 36390 : iatom_old = iatom
433 36390 : jatom_old = jatom
434 36390 : img_old = img
435 7649415 : ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN
436 495135 : ipair = ipair + 1
437 495135 : task_list%taskstart(ipair, igrid_level) = i
438 495135 : task_list%taskstop(ipair - 1, igrid_level) = i - 1
439 495135 : iatom_old = iatom
440 495135 : jatom_old = jatom
441 495135 : img_old = img
442 : END IF
443 : END DO
444 : ! Take care of the last iteration
445 13424 : IF (task_list%ntasks /= 0) THEN
446 13195 : task_list%taskstop(ipair, igrid_level) = task_list%ntasks
447 : END IF
448 :
449 : ! Debug task destribution
450 : IF (debug_this_module) THEN
451 : tasks => task_list%tasks
452 : WRITE (6, *)
453 : WRITE (6, *) "Total number of tasks ", task_list%ntasks
454 : DO igrid_level = 1, gridlevel_info%ngrid_levels
455 : WRITE (6, *) "Total number of pairs(grid_level) ", &
456 : igrid_level, task_list%npairs(igrid_level)
457 : END DO
458 : WRITE (6, *)
459 :
460 : DO igrid_level = 1, gridlevel_info%ngrid_levels
461 :
462 : ALLOCATE (blocks(natoms, natoms, nimages))
463 : blocks = -1
464 : DO ipair = 1, task_list%npairs(igrid_level)
465 : itask = task_list%taskstart(ipair, igrid_level)
466 : ilevel = task_list%tasks(itask)%grid_level
467 : img = task_list%tasks(itask)%image
468 : iatom = task_list%tasks(itask)%iatom
469 : jatom = task_list%tasks(itask)%jatom
470 : iset = task_list%tasks(itask)%iset
471 : jset = task_list%tasks(itask)%jset
472 : ipgf = task_list%tasks(itask)%ipgf
473 : jpgf = task_list%tasks(itask)%jpgf
474 : IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
475 : blocks(iatom, jatom, img) = 1
476 : blocks(jatom, iatom, img) = 1
477 : ELSE
478 : WRITE (6, *) "TASK LIST CONFLICT IN PAIR ", ipair
479 : WRITE (6, *) "Reuse of iatom, jatom, image ", iatom, jatom, img
480 : END IF
481 :
482 : iatom_old = iatom
483 : jatom_old = jatom
484 : img_old = img
485 : DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
486 : ilevel = task_list%tasks(itask)%grid_level
487 : img = task_list%tasks(itask)%image
488 : iatom = task_list%tasks(itask)%iatom
489 : jatom = task_list%tasks(itask)%jatom
490 : iset = task_list%tasks(itask)%iset
491 : jset = task_list%tasks(itask)%jset
492 : ipgf = task_list%tasks(itask)%ipgf
493 : jpgf = task_list%tasks(itask)%jpgf
494 : IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
495 : WRITE (6, *) "TASK LIST CONFLICT IN TASK ", itask
496 : WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
497 : WRITE (6, *) "Should be iatom, jatom, image ", iatom_old, jatom_old, img_old
498 : END IF
499 :
500 : END DO
501 : END DO
502 : DEALLOCATE (blocks)
503 :
504 : END DO
505 :
506 : END IF
507 :
508 13424 : CALL timestop(handle)
509 :
510 13424 : END SUBROUTINE generate_qs_task_list
511 :
512 : ! **************************************************************************************************
513 : !> \brief Sends the task list data to the grid API.
514 : !> \author Ole Schuett
515 : ! **************************************************************************************************
516 13424 : SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
517 : TYPE(task_list_type), POINTER :: task_list
518 : TYPE(qs_kind_type), DIMENSION(:), POINTER :: qs_kind_set
519 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
520 : TYPE(cell_type), POINTER :: cell
521 : CHARACTER(LEN=default_string_length) :: basis_type
522 : TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
523 :
524 : TYPE(gto_basis_set_type), POINTER :: orb_basis_set
525 : INTEGER :: nset, natoms, nkinds, ntasks, &
526 : ikind, iatom, itask, nsgf
527 13424 : INTEGER, DIMENSION(:), ALLOCATABLE :: atom_kinds, level_list, iatom_list, jatom_list, &
528 13424 : iset_list, jset_list, ipgf_list, jpgf_list, &
529 13424 : border_mask_list, block_num_list
530 13424 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: radius_list
531 13424 : REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE :: rab_list, atom_positions
532 13424 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
533 13424 : INTEGER, DIMENSION(:, :), POINTER :: first_sgf
534 13424 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: sphi, zet
535 13424 : INTEGER, DIMENSION(:), POINTER :: lmax, lmin, npgf, nsgf_set
536 :
537 13424 : nkinds = SIZE(qs_kind_set)
538 13424 : natoms = SIZE(particle_set)
539 13424 : ntasks = task_list%ntasks
540 13424 : tasks => task_list%tasks
541 :
542 13424 : IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
543 : ! Basis sets do not change during simulation - only need to create them once.
544 38487 : ALLOCATE (task_list%grid_basis_sets(nkinds))
545 22411 : DO ikind = 1, nkinds
546 14373 : CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
547 : CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
548 : nset=nset, &
549 : nsgf=nsgf, &
550 : nsgf_set=nsgf_set, &
551 : npgf=npgf, &
552 : first_sgf=first_sgf, &
553 : lmax=lmax, &
554 : lmin=lmin, &
555 : sphi=sphi, &
556 14373 : zet=zet)
557 : CALL grid_create_basis_set(nset=nset, &
558 : nsgf=nsgf, &
559 : maxco=SIZE(sphi, 1), &
560 : maxpgf=SIZE(zet, 1), &
561 : lmin=lmin, &
562 : lmax=lmax, &
563 : npgf=npgf, &
564 : nsgf_set=nsgf_set, &
565 : first_sgf=first_sgf, &
566 : sphi=sphi, &
567 : zet=zet, &
568 22411 : basis_set=task_list%grid_basis_sets(ikind))
569 : END DO
570 : END IF
571 :
572 : ! Pack task list infos
573 67120 : ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
574 63165 : DO iatom = 1, natoms
575 49741 : atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
576 63165 : atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
577 : END DO
578 :
579 66433 : ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
580 66204 : ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
581 39814 : ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
582 66662 : ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))
583 :
584 7699229 : DO itask = 1, ntasks
585 7685805 : level_list(itask) = tasks(itask)%grid_level
586 7685805 : iatom_list(itask) = tasks(itask)%iatom
587 7685805 : jatom_list(itask) = tasks(itask)%jatom
588 7685805 : iset_list(itask) = tasks(itask)%iset
589 7685805 : jset_list(itask) = tasks(itask)%jset
590 7685805 : ipgf_list(itask) = tasks(itask)%ipgf
591 7685805 : jpgf_list(itask) = tasks(itask)%jpgf
592 7685805 : IF (tasks(itask)%dist_type == 2) THEN
593 0 : border_mask_list(itask) = IAND(63, NOT(tasks(itask)%subpatch_pattern)) ! invert last 6 bits
594 : ELSE
595 7685805 : border_mask_list(itask) = 0 ! no masking
596 : END IF
597 7685805 : block_num_list(itask) = tasks(itask)%pair_index ! change of nomenclature pair_index -> block_num
598 7685805 : radius_list(itask) = tasks(itask)%radius
599 30756644 : rab_list(:, itask) = tasks(itask)%rab(:)
600 : END DO
601 :
602 : CALL grid_create_task_list(ntasks=ntasks, &
603 : natoms=natoms, &
604 : nkinds=nkinds, &
605 : nblocks=SIZE(task_list%pair_offsets_recv), &
606 : block_offsets=task_list%pair_offsets_recv, &
607 : atom_positions=atom_positions, &
608 : atom_kinds=atom_kinds, &
609 : basis_sets=task_list%grid_basis_sets, &
610 : level_list=level_list, &
611 : iatom_list=iatom_list, &
612 : jatom_list=jatom_list, &
613 : iset_list=iset_list, &
614 : jset_list=jset_list, &
615 : ipgf_list=ipgf_list, &
616 : jpgf_list=jpgf_list, &
617 : border_mask_list=border_mask_list, &
618 : block_num_list=block_num_list, &
619 : radius_list=radius_list, &
620 : rab_list=rab_list, &
621 : rs_grids=rs_grids, &
622 13424 : task_list=task_list%grid_task_list)
623 :
624 13424 : CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
625 13424 : CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)
626 :
627 26848 : END SUBROUTINE create_grid_task_list
628 :
629 : ! **************************************************************************************************
630 : !> \brief ...
631 : !> \param tasks ...
632 : !> \param ntasks ...
633 : !> \param curr_tasks ...
634 : !> \param rs_descs ...
635 : !> \param dft_control ...
636 : !> \param cube_info ...
637 : !> \param gridlevel_info ...
638 : !> \param cindex ...
639 : !> \param iatom ...
640 : !> \param jatom ...
641 : !> \param rpgfa ...
642 : !> \param rpgfb ...
643 : !> \param zeta ...
644 : !> \param zetb ...
645 : !> \param kind_radius_b ...
646 : !> \param set_radius_a ...
647 : !> \param set_radius_b ...
648 : !> \param ra ...
649 : !> \param rab ...
650 : !> \param la_max ...
651 : !> \param la_min ...
652 : !> \param lb_max ...
653 : !> \param lb_min ...
654 : !> \param npgfa ...
655 : !> \param npgfb ...
656 : !> \param nseta ...
657 : !> \param nsetb ...
658 : !> \par History
659 : !> Joost VandeVondele: 10.2008 refactored
660 : ! **************************************************************************************************
661 1354316 : SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
662 : cube_info, gridlevel_info, cindex, &
663 : iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
664 : la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
665 :
666 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
667 : INTEGER :: ntasks, curr_tasks
668 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
669 : POINTER :: rs_descs
670 : TYPE(dft_control_type), POINTER :: dft_control
671 : TYPE(cube_info_type), DIMENSION(:), POINTER :: cube_info
672 : TYPE(gridlevel_info_type), POINTER :: gridlevel_info
673 : INTEGER :: cindex, iatom, jatom
674 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: rpgfa, rpgfb, zeta, zetb
675 : REAL(KIND=dp) :: kind_radius_b
676 : REAL(KIND=dp), DIMENSION(:), POINTER :: set_radius_a, set_radius_b
677 : REAL(KIND=dp), DIMENSION(3) :: ra, rab
678 : INTEGER, DIMENSION(:), POINTER :: la_max, la_min, lb_max, lb_min, npgfa, &
679 : npgfb
680 : INTEGER :: nseta, nsetb
681 :
682 : INTEGER :: cube_center(3), igrid_level, ipgf, iset, &
683 : jpgf, jset, lb_cube(3), ub_cube(3)
684 : REAL(KIND=dp) :: dab, rab2, radius, zetp
685 :
686 1354316 : rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
687 1354316 : dab = SQRT(rab2)
688 :
689 4349748 : loop_iset: DO iset = 1, nseta
690 :
691 2995432 : IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE
692 :
693 9035258 : loop_jset: DO jset = 1, nsetb
694 :
695 5603958 : IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE
696 :
697 14617706 : loop_ipgf: DO ipgf = 1, npgfa(iset)
698 :
699 8189149 : IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) CYCLE
700 :
701 24737561 : loop_jpgf: DO jpgf = 1, npgfb(jset)
702 :
703 14081616 : IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) CYCLE
704 :
705 7850268 : zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
706 7850268 : igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
707 :
708 : CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
709 : rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
710 : la_max(iset), zeta(ipgf, iset), la_min(iset), &
711 : lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
712 7850268 : ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
713 :
714 : CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
715 : rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
716 : la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
717 : igrid_level, gridlevel_info%ngrid_levels, cube_center, &
718 22270765 : lb_cube, ub_cube, radius)
719 :
720 : END DO loop_jpgf
721 :
722 : END DO loop_ipgf
723 :
724 : END DO loop_jset
725 :
726 : END DO loop_iset
727 :
728 1354316 : END SUBROUTINE task_list_inner_loop
729 :
730 : ! **************************************************************************************************
731 : !> \brief combines the calculation of several basic properties of a given pgf:
732 : !> its center, the bounding cube, the radius, the cost,
733 : !> tries to predict the time needed for processing this task
734 : !> in this way an improved load balance might be obtained
735 : !> \param cube_center ...
736 : !> \param lb_cube ...
737 : !> \param ub_cube ...
738 : !> \param radius ...
739 : !> \param rs_desc ...
740 : !> \param cube_info ...
741 : !> \param la_max ...
742 : !> \param zeta ...
743 : !> \param la_min ...
744 : !> \param lb_max ...
745 : !> \param zetb ...
746 : !> \param lb_min ...
747 : !> \param ra ...
748 : !> \param rab ...
749 : !> \param rab2 ...
750 : !> \param eps ...
751 : !> \par History
752 : !> 10.2008 refactored [Joost VandeVondele]
753 : !> \note
754 : !> -) this requires the radius to be computed in the same way as
755 : !> collocate_pgf_product, we should factor that part into a subroutine
756 : !> -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
757 : !> this is more or less true for map_consistent
758 : !> -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
759 : !> the same, this could lead to a small speedup
760 : !> -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
761 : !> fitting the measured data on an opteron/g95 using the expression
762 : !> a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
763 : ! **************************************************************************************************
764 7850268 : SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
765 : rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
766 :
767 : INTEGER, DIMENSION(3), INTENT(OUT) :: cube_center, lb_cube, ub_cube
768 : REAL(KIND=dp), INTENT(OUT) :: radius
769 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
770 : TYPE(cube_info_type), INTENT(IN) :: cube_info
771 : INTEGER, INTENT(IN) :: la_max
772 : REAL(KIND=dp), INTENT(IN) :: zeta
773 : INTEGER, INTENT(IN) :: la_min, lb_max
774 : REAL(KIND=dp), INTENT(IN) :: zetb
775 : INTEGER, INTENT(IN) :: lb_min
776 : REAL(KIND=dp), INTENT(IN) :: ra(3), rab(3), rab2, eps
777 :
778 : INTEGER :: extent(3)
779 7850268 : INTEGER, DIMENSION(:), POINTER :: sphere_bounds
780 : REAL(KIND=dp) :: cutoff, f, prefactor, rb(3), zetp
781 : REAL(KIND=dp), DIMENSION(3) :: rp
782 :
783 : ! the radius for this task
784 :
785 7850268 : zetp = zeta + zetb
786 31401072 : rp(:) = ra(:) + zetb/zetp*rab(:)
787 31401072 : rb(:) = ra(:) + rab(:)
788 7850268 : cutoff = 1.0_dp
789 7850268 : f = zetb/zetp
790 7850268 : prefactor = EXP(-zeta*f*rab2)
791 : radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
792 7850268 : zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
793 :
794 7850268 : CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
795 : ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
796 31401072 : cube_center(:) = MODULO(cube_center(:), rs_desc%npts(:))
797 31401072 : cube_center(:) = cube_center(:) + rs_desc%lb(:)
798 :
799 7850268 : IF (rs_desc%orthorhombic) THEN
800 7411887 : CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
801 : ELSE
802 438381 : CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
803 : !? unclear if extent is computed correctly.
804 1753524 : extent(:) = ub_cube(:) - lb_cube(:)
805 1753524 : lb_cube(:) = -extent(:)/2 - 1
806 1753524 : ub_cube(:) = extent(:)/2
807 : END IF
808 :
809 7850268 : END SUBROUTINE compute_pgf_properties
810 : ! **************************************************************************************************
811 : !> \brief predicts the cost of a task in kcycles for a given task
812 : !> the model is based on a fit of actual data, and might need updating
813 : !> as collocate_pgf_product changes (or CPUs/compilers change)
814 : !> maybe some dynamic approach, improving the cost model on the fly could
815 : !> work as well
816 : !> the cost model does not yet take into account the fraction of space
817 : !> that is mapped locally for a given cube and rs_grid (generalised tasks)
818 : !> \param lb_cube ...
819 : !> \param ub_cube ...
820 : !> \param fraction ...
821 : !> \param lmax ...
822 : !> \param is_ortho ...
823 : !> \return ...
824 : ! **************************************************************************************************
825 7850268 : INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
826 : INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
827 : REAL(KIND=dp), INTENT(IN) :: fraction
828 : INTEGER :: lmax
829 : LOGICAL :: is_ortho
830 :
831 : INTEGER :: cmax
832 : REAL(KIND=dp) :: v1, v2, v3, v4, v5
833 :
834 31401072 : cmax = MAXVAL(((ub_cube - lb_cube) + 1)/2)
835 :
836 7850268 : IF (is_ortho) THEN
837 : v1 = 1.504760E+00_dp
838 : v2 = 3.126770E+00_dp
839 : v3 = 5.074106E+00_dp
840 : v4 = 1.091568E+00_dp
841 : v5 = 1.070187E+00_dp
842 : ELSE
843 438381 : v1 = 7.831105E+00_dp
844 438381 : v2 = 2.675174E+00_dp
845 438381 : v3 = 7.546553E+00_dp
846 438381 : v4 = 6.122446E-01_dp
847 438381 : v5 = 3.886382E+00_dp
848 : END IF
849 7850268 : cost_model = CEILING(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
850 :
851 7850268 : END FUNCTION cost_model
852 : ! **************************************************************************************************
853 : !> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
854 : !> this determines by which CPUs a given pgf gets collocated
855 : !> the format of the task array is as follows
856 : !> tasks(1,i) := destination
857 : !> tasks(2,i) := source
858 : !> tasks(3,i) := compressed type (iatom, jatom, ....)
859 : !> tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
860 : !> tasks(5,i) := cost
861 : !> tasks(6,i) := alternate destination code (0 if none available)
862 : !>
863 : !> \param tasks ...
864 : !> \param ntasks ...
865 : !> \param curr_tasks ...
866 : !> \param rab ...
867 : !> \param cindex ...
868 : !> \param iatom ...
869 : !> \param jatom ...
870 : !> \param iset ...
871 : !> \param jset ...
872 : !> \param ipgf ...
873 : !> \param jpgf ...
874 : !> \param la_max ...
875 : !> \param lb_max ...
876 : !> \param rs_desc ...
877 : !> \param igrid_level ...
878 : !> \param n_levels ...
879 : !> \param cube_center ...
880 : !> \param lb_cube ...
881 : !> \param ub_cube ...
882 : !> \par History
883 : !> 10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
884 : ! **************************************************************************************************
885 7850268 : SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
886 : rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
887 : la_max, lb_max, rs_desc, igrid_level, n_levels, &
888 : cube_center, lb_cube, ub_cube, radius)
889 :
890 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
891 : INTEGER, INTENT(INOUT) :: ntasks, curr_tasks
892 : REAL(KIND=dp), DIMENSION(3), INTENT(IN) :: rab
893 : INTEGER, INTENT(IN) :: cindex, iatom, jatom, iset, jset, ipgf, &
894 : jpgf, la_max, lb_max
895 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
896 : INTEGER, INTENT(IN) :: igrid_level, n_levels
897 : INTEGER, DIMENSION(3), INTENT(IN) :: cube_center, lb_cube, ub_cube
898 : REAL(KIND=dp), INTENT(IN) :: radius
899 :
900 : INTEGER, PARAMETER :: add_tasks = 1000
901 : REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
902 :
903 : INTEGER :: added_tasks, cost, j, lmax
904 : LOGICAL :: is_ortho
905 : REAL(KIND=dp) :: tfraction
906 :
907 : !$OMP SINGLE
908 7850268 : ntasks = ntasks + 1
909 7850268 : IF (ntasks > curr_tasks) THEN
910 983 : curr_tasks = INT((curr_tasks + add_tasks)*mult_tasks)
911 983 : CALL reallocate_tasks(tasks, curr_tasks)
912 : END IF
913 : !$OMP END SINGLE
914 :
915 7850268 : IF (rs_desc%distributed) THEN
916 :
917 : ! finds the node(s) that need to process this task
918 : ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
919 : CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
920 1380 : ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
921 :
922 : ELSE
923 7848888 : tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
924 7848888 : tasks(ntasks)%dist_type = 0
925 7848888 : tasks(ntasks)%subpatch_pattern = 0
926 7848888 : added_tasks = 1
927 : END IF
928 :
929 7850268 : lmax = la_max + lb_max
930 7850268 : is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
931 : ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
932 : ! this could be refined in the future
933 7850268 : tfraction = 1.0_dp/added_tasks
934 :
935 7850268 : cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
936 :
937 15700536 : DO j = 1, added_tasks
938 7850268 : tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
939 7850268 : tasks(ntasks - added_tasks + j)%cost = cost
940 7850268 : tasks(ntasks - added_tasks + j)%grid_level = igrid_level
941 7850268 : tasks(ntasks - added_tasks + j)%image = cindex
942 7850268 : tasks(ntasks - added_tasks + j)%iatom = iatom
943 7850268 : tasks(ntasks - added_tasks + j)%jatom = jatom
944 7850268 : tasks(ntasks - added_tasks + j)%iset = iset
945 7850268 : tasks(ntasks - added_tasks + j)%jset = jset
946 7850268 : tasks(ntasks - added_tasks + j)%ipgf = ipgf
947 7850268 : tasks(ntasks - added_tasks + j)%jpgf = jpgf
948 31401072 : tasks(ntasks - added_tasks + j)%rab = rab
949 15700536 : tasks(ntasks - added_tasks + j)%radius = radius
950 : END DO
951 :
952 7850268 : END SUBROUTINE pgf_to_tasks
953 :
954 : ! **************************************************************************************************
955 : !> \brief performs load balancing of the tasks on the distributed grids
956 : !> \param tasks ...
957 : !> \param ntasks ...
958 : !> \param rs_descs ...
959 : !> \param grid_level ...
960 : !> \param natoms ...
961 : !> \par History
962 : !> created 2008-10-03 [Joost VandeVondele]
963 : ! **************************************************************************************************
964 54 : SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
965 :
966 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
967 : INTEGER :: ntasks
968 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
969 : POINTER :: rs_descs
970 : INTEGER :: grid_level, natoms
971 :
972 : CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed'
973 :
974 : INTEGER :: handle
975 54 : INTEGER, DIMENSION(:, :, :), POINTER :: list
976 :
977 54 : CALL timeset(routineN, handle)
978 :
979 54 : NULLIFY (list)
980 : ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
981 : ! if a destination would not be in this list, it is a bug
982 54 : CALL create_destination_list(list, rs_descs, grid_level)
983 :
984 : ! now, walk over the tasks, filling in the loads of each destination
985 54 : CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.TRUE.)
986 :
987 : ! optimize loads & fluxes
988 54 : CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
989 :
990 : ! now, walk over the tasks, using the list to set the destinations
991 54 : CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.FALSE.)
992 :
993 54 : DEALLOCATE (list)
994 :
995 54 : CALL timestop(handle)
996 :
997 54 : END SUBROUTINE load_balance_distributed
998 :
999 : ! **************************************************************************************************
1000 : !> \brief this serial routine adjusts the fluxes in the global list
1001 : !>
1002 : !> \param list_global ...
1003 : !> \par History
1004 : !> created 2008-10-06 [Joost VandeVondele]
1005 : ! **************************************************************************************************
1006 27 : SUBROUTINE balance_global_list(list_global)
1007 : INTEGER, DIMENSION(:, :, 0:) :: list_global
1008 :
1009 : CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list'
1010 : INTEGER, PARAMETER :: Max_Iter = 100
1011 : REAL(KIND=dp), PARAMETER :: Tolerance_factor = 0.005_dp
1012 :
1013 : INTEGER :: dest, handle, icpu, idest, iflux, &
1014 : ilocal, k, maxdest, Ncpu, Nflux
1015 27 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: flux_connections
1016 : LOGICAL :: solution_optimal
1017 : REAL(KIND=dp) :: average, load_shift, max_load_shift, &
1018 : tolerance
1019 27 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: load, optimized_flux, optimized_load
1020 27 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: flux_limits
1021 :
1022 27 : CALL timeset(routineN, handle)
1023 :
1024 27 : Ncpu = SIZE(list_global, 3)
1025 27 : maxdest = SIZE(list_global, 2)
1026 81 : ALLOCATE (load(0:Ncpu - 1))
1027 81 : load = 0.0_dp
1028 54 : ALLOCATE (optimized_load(0:Ncpu - 1))
1029 :
1030 : ! figure out the number of fluxes
1031 : ! we assume that the global_list is symmetric
1032 81 : Nflux = 0
1033 81 : DO icpu = 0, ncpu - 1
1034 189 : DO idest = 1, maxdest
1035 108 : dest = list_global(1, idest, icpu)
1036 162 : IF (dest < ncpu .AND. dest > icpu) Nflux = Nflux + 1
1037 : END DO
1038 : END DO
1039 81 : ALLOCATE (optimized_flux(Nflux))
1040 81 : ALLOCATE (flux_limits(2, Nflux))
1041 81 : ALLOCATE (flux_connections(2, Nflux))
1042 :
1043 : ! reorder data
1044 108 : flux_limits = 0
1045 : Nflux = 0
1046 81 : DO icpu = 0, ncpu - 1
1047 162 : load(icpu) = SUM(list_global(2, :, icpu))
1048 189 : DO idest = 1, maxdest
1049 108 : dest = list_global(1, idest, icpu)
1050 162 : IF (dest < ncpu) THEN
1051 108 : IF (dest .NE. icpu) THEN
1052 54 : IF (dest > icpu) THEN
1053 27 : Nflux = Nflux + 1
1054 27 : flux_limits(2, Nflux) = list_global(2, idest, icpu)
1055 27 : flux_connections(1, Nflux) = icpu
1056 27 : flux_connections(2, Nflux) = dest
1057 : ELSE
1058 27 : DO iflux = 1, Nflux
1059 27 : IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1060 27 : flux_limits(1, iflux) = -list_global(2, idest, icpu)
1061 27 : EXIT
1062 : END IF
1063 : END DO
1064 : END IF
1065 : END IF
1066 : END IF
1067 : END DO
1068 : END DO
1069 :
1070 : solution_optimal = .FALSE.
1071 54 : optimized_flux = 0.0_dp
1072 :
1073 : ! an iterative solver, if iterated till convergence the maximum load is minimal
1074 : ! we terminate before things are fully converged, since this does show up in the timings
1075 : ! once the largest shift becomes less than a small fraction of the average load, we're done
1076 : ! we're perfectly happy if the load balance is within 1 percent or so
1077 : ! the maximum load normally converges even faster
1078 81 : average = SUM(load)/SIZE(load)
1079 27 : tolerance = Tolerance_factor*average
1080 :
1081 81 : optimized_load(:) = load
1082 332 : DO k = 1, Max_iter
1083 : max_load_shift = 0.0_dp
1084 658 : DO iflux = 1, Nflux
1085 329 : load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
1086 329 : load_shift = MAX(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
1087 329 : load_shift = MIN(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
1088 329 : max_load_shift = MAX(ABS(load_shift), max_load_shift)
1089 329 : optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
1090 329 : optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
1091 658 : optimized_flux(iflux) = optimized_flux(iflux) + load_shift
1092 : END DO
1093 332 : IF (max_load_shift < tolerance) THEN
1094 : solution_optimal = .TRUE.
1095 : EXIT
1096 : END IF
1097 : END DO
1098 :
1099 : ! now adjust the load list to reflect the optimized fluxes
1100 : ! reorder data
1101 : Nflux = 0
1102 81 : DO icpu = 0, ncpu - 1
1103 162 : DO idest = 1, maxdest
1104 162 : IF (list_global(1, idest, icpu) == icpu) ilocal = idest
1105 : END DO
1106 189 : DO idest = 1, maxdest
1107 108 : dest = list_global(1, idest, icpu)
1108 162 : IF (dest < ncpu) THEN
1109 108 : IF (dest .NE. icpu) THEN
1110 54 : IF (dest > icpu) THEN
1111 27 : Nflux = Nflux + 1
1112 27 : IF (optimized_flux(Nflux) > 0) THEN
1113 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1114 0 : list_global(2, idest, icpu) - NINT(optimized_flux(Nflux))
1115 0 : list_global(2, idest, icpu) = NINT(optimized_flux(Nflux))
1116 : ELSE
1117 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1118 27 : list_global(2, idest, icpu)
1119 27 : list_global(2, idest, icpu) = 0
1120 : END IF
1121 : ELSE
1122 27 : DO iflux = 1, Nflux
1123 27 : IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
1124 27 : IF (optimized_flux(iflux) > 0) THEN
1125 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1126 0 : list_global(2, idest, icpu)
1127 0 : list_global(2, idest, icpu) = 0
1128 : ELSE
1129 : list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
1130 27 : list_global(2, idest, icpu) + NINT(optimized_flux(iflux))
1131 27 : list_global(2, idest, icpu) = -NINT(optimized_flux(iflux))
1132 : END IF
1133 : EXIT
1134 : END IF
1135 : END DO
1136 : END IF
1137 : END IF
1138 : END IF
1139 : END DO
1140 : END DO
1141 :
1142 27 : CALL timestop(handle)
1143 :
1144 54 : END SUBROUTINE balance_global_list
1145 :
1146 : ! **************************************************************************************************
1147 : !> \brief this routine gets back optimized loads for all destinations
1148 : !>
1149 : !> \param list ...
1150 : !> \param group ...
1151 : !> \param my_pos ...
1152 : !> \par History
1153 : !> created 2008-10-06 [Joost VandeVondele]
1154 : !> Modified 2016-01 [EPCC] Reduce memory requirements on P processes
1155 : !> from O(P^2) to O(P)
1156 : ! **************************************************************************************************
1157 54 : SUBROUTINE optimize_load_list(list, group, my_pos)
1158 : INTEGER, DIMENSION(:, :, 0:) :: list
1159 : TYPE(mp_comm_type), INTENT(IN) :: group
1160 : INTEGER, INTENT(IN) :: my_pos
1161 :
1162 : CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list'
1163 : INTEGER, PARAMETER :: rank_of_root = 0
1164 :
1165 : INTEGER :: handle, icpu, idest, maxdest, ncpu
1166 : INTEGER, ALLOCATABLE, DIMENSION(:) :: load_all
1167 54 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: load_partial
1168 : INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: list_global
1169 :
1170 54 : CALL timeset(routineN, handle)
1171 :
1172 54 : ncpu = SIZE(list, 3)
1173 54 : maxdest = SIZE(list, 2)
1174 :
1175 : !find total workload ...
1176 162 : ALLOCATE (load_all(maxdest*ncpu))
1177 108 : load_all(:) = RESHAPE(list(2, :, :), (/maxdest*ncpu/))
1178 54 : CALL group%sum(load_all(:), rank_of_root)
1179 :
1180 : ! ... and optimise the work per process
1181 216 : ALLOCATE (list_global(2, maxdest, ncpu))
1182 54 : IF (rank_of_root .EQ. my_pos) THEN
1183 189 : list_global(1, :, :) = list(1, :, :)
1184 243 : list_global(2, :, :) = RESHAPE(load_all, (/maxdest, ncpu/))
1185 27 : CALL balance_global_list(list_global)
1186 : END IF
1187 54 : CALL group%bcast(list_global, rank_of_root)
1188 :
1189 : !figure out how much can be sent to other processes
1190 216 : ALLOCATE (load_partial(maxdest, ncpu))
1191 : ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
1192 162 : CALL group%sum_partial(RESHAPE(load_all, (/maxdest, ncpu/)), load_partial(:, :))
1193 :
1194 162 : DO icpu = 1, ncpu
1195 378 : DO idest = 1, maxdest
1196 :
1197 : !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
1198 324 : IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
1199 37 : IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
1200 : list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
1201 10 : - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
1202 : ELSE
1203 27 : list(2, idest, icpu - 1) = 0
1204 : END IF
1205 : END IF
1206 :
1207 : END DO
1208 : END DO
1209 :
1210 : !clean up before leaving
1211 54 : DEALLOCATE (load_all)
1212 54 : DEALLOCATE (list_global)
1213 54 : DEALLOCATE (load_partial)
1214 :
1215 54 : CALL timestop(handle)
1216 54 : END SUBROUTINE optimize_load_list
1217 :
1218 : ! **************************************************************************************************
1219 : !> \brief fill the load list with values derived from the tasks array
1220 : !> from the alternate locations, we select the alternate location that
1221 : !> can be used without increasing the number of matrix blocks needed to
1222 : !> distribute.
1223 : !> Replicated tasks are not yet considered
1224 : !>
1225 : !> \param list ...
1226 : !> \param rs_descs ...
1227 : !> \param grid_level ...
1228 : !> \param tasks ...
1229 : !> \param ntasks ...
1230 : !> \param natoms ...
1231 : !> \param create_list ...
1232 : !> \par History
1233 : !> created 2008-10-06 [Joost VandeVondele]
1234 : ! **************************************************************************************************
1235 108 : SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
1236 : INTEGER, DIMENSION(:, :, 0:) :: list
1237 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1238 : POINTER :: rs_descs
1239 : INTEGER :: grid_level
1240 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1241 : INTEGER :: ntasks, natoms
1242 : LOGICAL :: create_list
1243 :
1244 : CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list'
1245 :
1246 : INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
1247 : itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
1248 : rank
1249 : INTEGER(KIND=int_8) :: bit_pattern, ipair, ipair_old, natom8
1250 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: loads
1251 108 : INTEGER, ALLOCATABLE, DIMENSION(:) :: all_dests, index
1252 : INTEGER, DIMENSION(6) :: options
1253 :
1254 108 : CALL timeset(routineN, handle)
1255 :
1256 324 : ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
1257 108 : CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.FALSE.)
1258 :
1259 108 : maxdest = SIZE(list, 2)
1260 108 : ncpu = SIZE(list, 3)
1261 108 : natom8 = natoms
1262 :
1263 : ! first find the tasks that deal with the same atom pair
1264 108 : itask_stop = 0
1265 108 : ipair_old = HUGE(ipair_old)
1266 108 : img_old = -1
1267 108 : ALLOCATE (all_dests(0))
1268 108 : ALLOCATE (INDEX(0))
1269 :
1270 : DO
1271 :
1272 : ! first find the range of tasks that deal with the same atom pair
1273 290 : itask_start = itask_stop + 1
1274 290 : itask_stop = itask_start
1275 290 : IF (itask_stop > ntasks) EXIT
1276 182 : ilevel = tasks(itask_stop)%grid_level
1277 182 : img_old = tasks(itask_stop)%image
1278 182 : iatom = tasks(itask_stop)%iatom
1279 182 : jatom = tasks(itask_stop)%jatom
1280 182 : iset = tasks(itask_stop)%iset
1281 182 : jset = tasks(itask_stop)%jset
1282 182 : ipgf = tasks(itask_stop)%ipgf
1283 182 : jpgf = tasks(itask_stop)%jpgf
1284 :
1285 182 : ipair_old = (iatom - 1)*natom8 + (jatom - 1)
1286 : DO
1287 7386 : IF (itask_stop + 1 > ntasks) EXIT
1288 7298 : ilevel = tasks(itask_stop + 1)%grid_level
1289 7298 : img = tasks(itask_stop + 1)%image
1290 7298 : iatom = tasks(itask_stop + 1)%iatom
1291 7298 : jatom = tasks(itask_stop + 1)%jatom
1292 7298 : iset = tasks(itask_stop + 1)%iset
1293 7298 : jset = tasks(itask_stop + 1)%jset
1294 7298 : ipgf = tasks(itask_stop + 1)%ipgf
1295 7298 : jpgf = tasks(itask_stop + 1)%jpgf
1296 :
1297 7298 : ipair = (iatom - 1)*natom8 + (jatom - 1)
1298 7386 : IF (ipair == ipair_old .AND. img == img_old) THEN
1299 : itask_stop = itask_stop + 1
1300 : ELSE
1301 : EXIT
1302 : END IF
1303 : END DO
1304 182 : ipair = ipair_old
1305 182 : nshort = itask_stop - itask_start + 1
1306 :
1307 : ! find the unique list of destinations on this grid level only
1308 182 : DEALLOCATE (all_dests)
1309 546 : ALLOCATE (all_dests(nshort))
1310 182 : DEALLOCATE (index)
1311 364 : ALLOCATE (INDEX(nshort))
1312 7568 : DO i = 1, nshort
1313 7386 : ilevel = tasks(itask_start + i - 1)%grid_level
1314 7386 : img = tasks(itask_start + i - 1)%image
1315 7386 : iatom = tasks(itask_start + i - 1)%iatom
1316 7386 : jatom = tasks(itask_start + i - 1)%jatom
1317 7386 : iset = tasks(itask_start + i - 1)%iset
1318 7386 : jset = tasks(itask_start + i - 1)%jset
1319 7386 : ipgf = tasks(itask_start + i - 1)%ipgf
1320 7386 : jpgf = tasks(itask_start + i - 1)%jpgf
1321 :
1322 7568 : IF (ilevel .EQ. grid_level) THEN
1323 2760 : all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
1324 : ELSE
1325 4626 : all_dests(i) = HUGE(all_dests(i))
1326 : END IF
1327 : END DO
1328 182 : CALL sort(all_dests, nshort, index)
1329 182 : ndest_pair = 1
1330 7386 : DO i = 2, nshort
1331 7386 : IF ((all_dests(ndest_pair) .NE. all_dests(i)) .AND. (all_dests(i) .NE. HUGE(all_dests(i)))) THEN
1332 10 : ndest_pair = ndest_pair + 1
1333 10 : all_dests(ndest_pair) = all_dests(i)
1334 : END IF
1335 : END DO
1336 :
1337 7676 : DO itask = itask_start, itask_stop
1338 :
1339 7386 : dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
1340 7386 : ilevel = tasks(itask)%grid_level
1341 7386 : img = tasks(itask)%image
1342 7386 : iatom = tasks(itask)%iatom
1343 7386 : jatom = tasks(itask)%jatom
1344 7386 : iset = tasks(itask)%iset
1345 7386 : jset = tasks(itask)%jset
1346 7386 : ipgf = tasks(itask)%ipgf
1347 7386 : jpgf = tasks(itask)%jpgf
1348 :
1349 : ! Only proceed with tasks which are on this grid level
1350 7386 : IF (ilevel .NE. grid_level) CYCLE
1351 2760 : ipair = (iatom - 1)*natom8 + (jatom - 1)
1352 2760 : cost = INT(tasks(itask)%cost)
1353 :
1354 182 : SELECT CASE (tasks(itask)%dist_type)
1355 : CASE (1)
1356 2760 : bit_pattern = tasks(itask)%subpatch_pattern
1357 2760 : nopt = 0
1358 2760 : IF (BTEST(bit_pattern, 0)) THEN
1359 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/-1, 0, 0/))
1360 0 : IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1361 0 : nopt = nopt + 1
1362 0 : options(nopt) = rank
1363 : END IF
1364 : END IF
1365 2760 : IF (BTEST(bit_pattern, 1)) THEN
1366 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/+1, 0, 0/))
1367 0 : IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1368 0 : nopt = nopt + 1
1369 0 : options(nopt) = rank
1370 : END IF
1371 : END IF
1372 2760 : IF (BTEST(bit_pattern, 2)) THEN
1373 48 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, -1, 0/))
1374 96 : IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1375 0 : nopt = nopt + 1
1376 0 : options(nopt) = rank
1377 : END IF
1378 : END IF
1379 2760 : IF (BTEST(bit_pattern, 3)) THEN
1380 0 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, +1, 0/))
1381 0 : IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1382 0 : nopt = nopt + 1
1383 0 : options(nopt) = rank
1384 : END IF
1385 : END IF
1386 2760 : IF (BTEST(bit_pattern, 4)) THEN
1387 1150 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, -1/))
1388 2200 : IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1389 100 : nopt = nopt + 1
1390 100 : options(nopt) = rank
1391 : END IF
1392 : END IF
1393 2760 : IF (BTEST(bit_pattern, 5)) THEN
1394 500 : rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, +1/))
1395 1000 : IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
1396 80 : nopt = nopt + 1
1397 80 : options(nopt) = rank
1398 : END IF
1399 : END IF
1400 2760 : IF (nopt > 0) THEN
1401 : ! set it to the rank with the lowest load
1402 180 : rank = options(1)
1403 180 : DO iopt = 2, nopt
1404 180 : IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
1405 : END DO
1406 : ELSE
1407 2580 : rank = dest
1408 : END IF
1409 2760 : li = list_index(list, rank, dest)
1410 2760 : IF (create_list) THEN
1411 1380 : list(2, li, dest) = list(2, li, dest) + cost
1412 : ELSE
1413 1380 : IF (list(1, li, dest) == dest) THEN
1414 1290 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1415 : ELSE
1416 90 : IF (list(2, li, dest) >= cost) THEN
1417 10 : list(2, li, dest) = list(2, li, dest) - cost
1418 10 : tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1419 : ELSE
1420 80 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1421 : END IF
1422 : END IF
1423 : END IF
1424 : CASE (2) ! generalised
1425 0 : li = list_index(list, dest, dest)
1426 0 : IF (create_list) THEN
1427 0 : list(2, li, dest) = list(2, li, dest) + cost
1428 : ELSE
1429 0 : IF (list(1, li, dest) == dest) THEN
1430 0 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1431 : ELSE
1432 0 : IF (list(2, li, dest) >= cost) THEN
1433 0 : list(2, li, dest) = list(2, li, dest) - cost
1434 0 : tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
1435 : ELSE
1436 0 : tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
1437 : END IF
1438 : END IF
1439 : END IF
1440 : CASE DEFAULT
1441 7386 : CPABORT("")
1442 : END SELECT
1443 :
1444 : END DO
1445 :
1446 : END DO
1447 :
1448 108 : CALL timestop(handle)
1449 :
1450 216 : END SUBROUTINE compute_load_list
1451 : ! **************************************************************************************************
1452 : !> \brief small helper function to return the proper index in the list array
1453 : !>
1454 : !> \param list ...
1455 : !> \param rank ...
1456 : !> \param dest ...
1457 : !> \return ...
1458 : !> \par History
1459 : !> created 2008-10-06 [Joost VandeVondele]
1460 : ! **************************************************************************************************
1461 2760 : INTEGER FUNCTION list_index(list, rank, dest)
1462 : INTEGER, DIMENSION(:, :, 0:), INTENT(IN) :: list
1463 : INTEGER, INTENT(IN) :: rank, dest
1464 :
1465 2760 : list_index = 1
1466 1424 : DO
1467 4184 : IF (list(1, list_index, dest) == rank) EXIT
1468 1424 : list_index = list_index + 1
1469 : END DO
1470 2760 : END FUNCTION list_index
1471 : ! **************************************************************************************************
1472 : !> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
1473 : !> note that we allocate it with an additional field to store the load of this destination
1474 : !>
1475 : !> \param list ...
1476 : !> \param rs_descs ...
1477 : !> \param grid_level ...
1478 : !> \par History
1479 : !> created 2008-10-06 [Joost VandeVondele]
1480 : ! **************************************************************************************************
1481 54 : SUBROUTINE create_destination_list(list, rs_descs, grid_level)
1482 : INTEGER, DIMENSION(:, :, :), POINTER :: list
1483 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1484 : POINTER :: rs_descs
1485 : INTEGER, INTENT(IN) :: grid_level
1486 :
1487 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list'
1488 :
1489 : INTEGER :: handle, i, icpu, j, maxcount, ncpu, &
1490 : ultimate_max
1491 54 : INTEGER, ALLOCATABLE, DIMENSION(:) :: index, sublist
1492 :
1493 54 : CALL timeset(routineN, handle)
1494 :
1495 54 : CPASSERT(.NOT. ASSOCIATED(list))
1496 54 : ncpu = rs_descs(grid_level)%rs_desc%group_size
1497 54 : ultimate_max = 7
1498 :
1499 162 : ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
1500 :
1501 54 : ALLOCATE (INDEX(ultimate_max))
1502 54 : ALLOCATE (sublist(ultimate_max))
1503 432 : sublist = HUGE(sublist)
1504 :
1505 54 : maxcount = 1
1506 162 : DO icpu = 0, ncpu - 1
1507 108 : sublist(1) = icpu
1508 108 : sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/-1, 0, 0/))
1509 108 : sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/+1, 0, 0/))
1510 108 : sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, -1, 0/))
1511 108 : sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, +1, 0/))
1512 108 : sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, -1/))
1513 108 : sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, +1/))
1514 : ! only retain unique values of the destination
1515 108 : CALL sort(sublist, ultimate_max, index)
1516 108 : j = 1
1517 756 : DO i = 2, 7
1518 756 : IF (sublist(i) .NE. sublist(j)) THEN
1519 108 : j = j + 1
1520 108 : sublist(j) = sublist(i)
1521 : END IF
1522 : END DO
1523 108 : maxcount = MAX(maxcount, j)
1524 648 : sublist(j + 1:ultimate_max) = HUGE(sublist)
1525 864 : list(1, :, icpu) = sublist
1526 918 : list(2, :, icpu) = 0
1527 : END DO
1528 :
1529 54 : CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
1530 :
1531 54 : CALL timestop(handle)
1532 :
1533 108 : END SUBROUTINE create_destination_list
1534 :
1535 : ! **************************************************************************************************
1536 : !> \brief given a task list, compute the load of each process everywhere
1537 : !> giving this function the ability to loop over a (sub)set of rs_grids,
1538 : !> and do all the communication in one shot, would speed it up
1539 : !> \param loads ...
1540 : !> \param rs_descs ...
1541 : !> \param grid_level ...
1542 : !> \param ntasks ...
1543 : !> \param tasks ...
1544 : !> \param use_reordered_ranks ...
1545 : !> \par History
1546 : !> none
1547 : !> \author MattW 21/11/2007
1548 : ! **************************************************************************************************
1549 480 : SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
1550 : INTEGER(KIND=int_8), DIMENSION(:) :: loads
1551 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1552 : POINTER :: rs_descs
1553 : INTEGER :: grid_level, ntasks
1554 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1555 : LOGICAL, INTENT(IN) :: use_reordered_ranks
1556 :
1557 : CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads'
1558 :
1559 : INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1560 : iset, jatom, jpgf, jset
1561 : INTEGER(KIND=int_8) :: total_cost_local
1562 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf_i, send_buf_i
1563 : TYPE(realspace_grid_desc_type), POINTER :: desc
1564 :
1565 480 : CALL timeset(routineN, handle)
1566 :
1567 480 : desc => rs_descs(grid_level)%rs_desc
1568 :
1569 : ! allocate local arrays
1570 1440 : ALLOCATE (send_buf_i(desc%group_size))
1571 960 : ALLOCATE (recv_buf_i(desc%group_size))
1572 :
1573 : ! communication step 1 : compute the total local cost of the tasks
1574 : ! each proc needs to know the amount of work he will receive
1575 :
1576 : ! send buffer now contains for each target the cost of the tasks it will receive
1577 1440 : send_buf_i = 0
1578 36792 : DO i = 1, ntasks
1579 36312 : ilevel = tasks(i)%grid_level
1580 36312 : img = tasks(i)%image
1581 36312 : iatom = tasks(i)%iatom
1582 36312 : jatom = tasks(i)%jatom
1583 36312 : iset = tasks(i)%iset
1584 36312 : jset = tasks(i)%jset
1585 36312 : ipgf = tasks(i)%ipgf
1586 36312 : jpgf = tasks(i)%jpgf
1587 36312 : IF (ilevel .NE. grid_level) CYCLE
1588 10476 : IF (use_reordered_ranks) THEN
1589 : send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) = &
1590 : send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) &
1591 3618 : + tasks(i)%cost
1592 : ELSE
1593 : send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) = &
1594 : send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) &
1595 6378 : + tasks(i)%cost
1596 : END IF
1597 : END DO
1598 480 : CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)
1599 :
1600 : ! communication step 2 : compute the global cost of the tasks
1601 1440 : total_cost_local = SUM(recv_buf_i)
1602 :
1603 : ! after this step, the recv buffer contains the local cost for each CPU
1604 1440 : CALL desc%group%allgather(total_cost_local, loads)
1605 :
1606 480 : CALL timestop(handle)
1607 :
1608 960 : END SUBROUTINE get_current_loads
1609 : ! **************************************************************************************************
1610 : !> \brief performs load balancing shifting tasks on the replicated grids
1611 : !> this modifies the destination of some of the tasks on replicated
1612 : !> grids, and in this way balances the load
1613 : !> \param rs_descs ...
1614 : !> \param ntasks ...
1615 : !> \param tasks ...
1616 : !> \par History
1617 : !> none
1618 : !> \author MattW 21/11/2007
1619 : ! **************************************************************************************************
1620 48 : SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)
1621 :
1622 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1623 : POINTER :: rs_descs
1624 : INTEGER :: ntasks
1625 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1626 :
1627 : CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated'
1628 :
1629 : INTEGER :: handle, i, iatom, ilevel, img, ipgf, &
1630 : iset, j, jatom, jpgf, jset, &
1631 : no_overloaded, no_underloaded, &
1632 : proc_receiving
1633 : INTEGER(KIND=int_8) :: average_cost, cost_task_rep, count, &
1634 : offset, total_cost_global
1635 48 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: load_imbalance, loads, recv_buf_i
1636 48 : INTEGER, ALLOCATABLE, DIMENSION(:) :: index
1637 : TYPE(realspace_grid_desc_type), POINTER :: desc
1638 :
1639 48 : CALL timeset(routineN, handle)
1640 :
1641 48 : desc => rs_descs(1)%rs_desc
1642 :
1643 : ! allocate local arrays
1644 144 : ALLOCATE (recv_buf_i(desc%group_size))
1645 96 : ALLOCATE (loads(desc%group_size))
1646 :
1647 144 : recv_buf_i = 0
1648 234 : DO i = 1, SIZE(rs_descs)
1649 186 : CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.TRUE.)
1650 606 : recv_buf_i(:) = recv_buf_i + loads
1651 : END DO
1652 :
1653 144 : total_cost_global = SUM(recv_buf_i)
1654 48 : average_cost = total_cost_global/desc%group_size
1655 :
1656 : !
1657 : ! compute how to redistribute the replicated tasks so that the average cost is reached
1658 : !
1659 :
1660 : ! load imbalance measures the load of a given CPU relative
1661 : ! to the optimal load distribution (load=average)
1662 144 : ALLOCATE (load_imbalance(desc%group_size))
1663 144 : ALLOCATE (INDEX(desc%group_size))
1664 :
1665 144 : load_imbalance(:) = recv_buf_i - average_cost
1666 48 : no_overloaded = 0
1667 48 : no_underloaded = 0
1668 :
1669 144 : DO i = 1, desc%group_size
1670 96 : IF (load_imbalance(i) .GT. 0) no_overloaded = no_overloaded + 1
1671 144 : IF (load_imbalance(i) .LT. 0) no_underloaded = no_underloaded + 1
1672 : END DO
1673 :
1674 : ! sort the recv_buffer on number of tasks, gives us index which provides a
1675 : ! mapping between processor ranks and how overloaded the processor
1676 48 : CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
1677 :
1678 : ! find out the number of replicated tasks each proc has
1679 : ! but only those tasks which have not yet been assigned
1680 48 : cost_task_rep = 0
1681 3666 : DO i = 1, ntasks
1682 : IF (tasks(i)%dist_type .EQ. 0 &
1683 3666 : .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
1684 2238 : cost_task_rep = cost_task_rep + tasks(i)%cost
1685 : END IF
1686 : END DO
1687 :
1688 : ! now, correct the load imbalance for the overloaded CPUs
1689 : ! they will send away not more than the total load of replicated tasks
1690 48 : CALL desc%group%allgather(cost_task_rep, recv_buf_i)
1691 :
1692 144 : DO i = 1, desc%group_size
1693 : ! At the moment we can only offload replicated tasks
1694 96 : IF (load_imbalance(i) .GT. 0) &
1695 96 : load_imbalance(i) = MIN(load_imbalance(i), recv_buf_i(i))
1696 : END DO
1697 :
1698 : ! simplest algorithm I can think of of is that the processor with the most
1699 : ! excess tasks fills up the process needing most, then moves on to next most.
1700 : ! At the moment if we've got less replicated tasks than we're overloaded then
1701 : ! task balancing will be incomplete
1702 :
1703 : ! only need to do anything if I've excess tasks
1704 48 : IF (load_imbalance(desc%my_pos + 1) .GT. 0) THEN
1705 :
1706 22 : count = 0 ! weighted amount of tasks offloaded
1707 22 : offset = 0 ! no of underloaded processes already filled by other more overloaded procs
1708 :
1709 : ! calculate offset
1710 22 : DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
1711 22 : IF (INDEX(i) .EQ. desc%my_pos + 1) THEN
1712 : EXIT
1713 : ELSE
1714 0 : offset = offset + load_imbalance(INDEX(i))
1715 : END IF
1716 : END DO
1717 :
1718 : ! find my starting processor to send to
1719 22 : proc_receiving = HUGE(proc_receiving)
1720 22 : DO i = 1, no_underloaded
1721 22 : offset = offset + load_imbalance(INDEX(i))
1722 22 : IF (offset .LE. 0) THEN
1723 : proc_receiving = i
1724 : EXIT
1725 : END IF
1726 : END DO
1727 :
1728 : ! offset now contains minus the number of tasks proc_receiving requires
1729 : ! we fill this up by adjusting the destination of tasks on the replicated grid,
1730 : ! then move to next most underloaded proc
1731 1380 : DO j = 1, ntasks
1732 : IF (tasks(j)%dist_type .EQ. 0 &
1733 1380 : .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
1734 : ! just avoid sending to non existing procs due to integer truncation
1735 : ! in the computation of the average
1736 808 : IF (proc_receiving .GT. no_underloaded) EXIT
1737 : ! set new destination
1738 808 : ilevel = tasks(j)%grid_level
1739 808 : img = tasks(j)%image
1740 808 : iatom = tasks(j)%iatom
1741 808 : jatom = tasks(j)%jatom
1742 808 : iset = tasks(j)%iset
1743 808 : jset = tasks(j)%jset
1744 808 : ipgf = tasks(j)%ipgf
1745 808 : jpgf = tasks(j)%jpgf
1746 808 : tasks(j)%destination = encode_rank(INDEX(proc_receiving) - 1, ilevel, SIZE(rs_descs))
1747 808 : offset = offset + tasks(j)%cost
1748 808 : count = count + tasks(j)%cost
1749 808 : IF (count .GE. load_imbalance(desc%my_pos + 1)) EXIT
1750 786 : IF (offset .GT. 0) THEN
1751 0 : proc_receiving = proc_receiving + 1
1752 : ! just avoid sending to non existing procs due to integer truncation
1753 : ! in the computation of the average
1754 0 : IF (proc_receiving .GT. no_underloaded) EXIT
1755 0 : offset = load_imbalance(INDEX(proc_receiving))
1756 : END IF
1757 : END IF
1758 : END DO
1759 : END IF
1760 :
1761 48 : DEALLOCATE (index)
1762 48 : DEALLOCATE (load_imbalance)
1763 :
1764 48 : CALL timestop(handle)
1765 :
1766 96 : END SUBROUTINE load_balance_replicated
1767 :
1768 : ! **************************************************************************************************
1769 : !> \brief given an input task list, redistribute so that all tasks can be processed locally,
1770 : !> i.e. dest equals rank
1771 : !> \param rs_descs ...
1772 : !> \param ntasks ...
1773 : !> \param tasks ...
1774 : !> \param ntasks_recv ...
1775 : !> \param tasks_recv ...
1776 : !> \par History
1777 : !> none
1778 : !> \author MattW 21/11/2007
1779 : ! **************************************************************************************************
1780 48 : SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
1781 :
1782 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1783 : POINTER :: rs_descs
1784 : INTEGER :: ntasks
1785 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1786 : INTEGER :: ntasks_recv
1787 : TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1788 :
1789 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks'
1790 :
1791 : INTEGER :: handle, i, j, k, l, rank
1792 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: recv_buf, send_buf
1793 : INTEGER, ALLOCATABLE, DIMENSION(:) :: recv_disps, recv_sizes, send_disps, &
1794 : send_sizes
1795 : TYPE(realspace_grid_desc_type), POINTER :: desc
1796 :
1797 48 : CALL timeset(routineN, handle)
1798 :
1799 48 : desc => rs_descs(1)%rs_desc
1800 :
1801 : ! allocate local arrays
1802 144 : ALLOCATE (send_sizes(desc%group_size))
1803 96 : ALLOCATE (recv_sizes(desc%group_size))
1804 96 : ALLOCATE (send_disps(desc%group_size))
1805 96 : ALLOCATE (recv_disps(desc%group_size))
1806 144 : ALLOCATE (send_buf(desc%group_size))
1807 96 : ALLOCATE (recv_buf(desc%group_size))
1808 :
1809 : ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
1810 144 : send_buf = 0
1811 3666 : DO i = 1, ntasks
1812 : rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
1813 3618 : %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
1814 3666 : send_buf(rank + 1) = send_buf(rank + 1) + 1
1815 : END DO
1816 :
1817 48 : CALL desc%group%alltoall(send_buf, recv_buf, 1)
1818 :
1819 : ! pack the tasks, and send them around
1820 :
1821 144 : send_sizes = 0
1822 144 : send_disps = 0
1823 144 : recv_sizes = 0
1824 144 : recv_disps = 0
1825 :
1826 48 : send_sizes(1) = INT(send_buf(1)*task_size_in_int8)
1827 48 : recv_sizes(1) = INT(recv_buf(1)*task_size_in_int8)
1828 96 : DO i = 2, desc%group_size
1829 48 : send_sizes(i) = INT(send_buf(i)*task_size_in_int8)
1830 48 : recv_sizes(i) = INT(recv_buf(i)*task_size_in_int8)
1831 48 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
1832 96 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
1833 : END DO
1834 :
1835 : ! deallocate old send/recv buffers
1836 48 : DEALLOCATE (send_buf)
1837 48 : DEALLOCATE (recv_buf)
1838 :
1839 : ! allocate them with new sizes
1840 233 : ALLOCATE (send_buf(SUM(send_sizes)))
1841 238 : ALLOCATE (recv_buf(SUM(recv_sizes)))
1842 :
1843 : ! do packing
1844 61554 : send_buf = 0
1845 144 : send_sizes = 0
1846 3666 : DO j = 1, ntasks
1847 : i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
1848 3618 : %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
1849 3618 : l = send_disps(i) + send_sizes(i)
1850 3618 : CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
1851 3666 : send_sizes(i) = send_sizes(i) + task_size_in_int8
1852 : END DO
1853 :
1854 : ! do communication
1855 48 : CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
1856 :
1857 48 : DEALLOCATE (send_buf)
1858 :
1859 144 : ntasks_recv = SUM(recv_sizes)/task_size_in_int8
1860 3904 : ALLOCATE (tasks_recv(ntasks_recv))
1861 :
1862 : ! do unpacking
1863 48 : l = 0
1864 144 : DO i = 1, desc%group_size
1865 3762 : DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
1866 3618 : l = l + 1
1867 3618 : k = recv_disps(i) + j*task_size_in_int8
1868 3714 : CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
1869 : END DO
1870 : END DO
1871 :
1872 48 : DEALLOCATE (recv_buf)
1873 48 : DEALLOCATE (send_sizes)
1874 48 : DEALLOCATE (recv_sizes)
1875 48 : DEALLOCATE (send_disps)
1876 48 : DEALLOCATE (recv_disps)
1877 :
1878 48 : CALL timestop(handle)
1879 :
1880 48 : END SUBROUTINE create_local_tasks
1881 :
1882 : ! **************************************************************************************************
1883 : !> \brief Assembles tasks to be performed on local grid
1884 : !> \param rs_descs the grids
1885 : !> \param ntasks Number of tasks for local processing
1886 : !> \param natoms ...
1887 : !> \param nimages ...
1888 : !> \param tasks the task set generated on this processor
1889 : !> \param rval ...
1890 : !> \param atom_pair_send ...
1891 : !> \param atom_pair_recv ...
1892 : !> \param symmetric ...
1893 : !> \param reorder_rs_grid_ranks ...
1894 : !> \param skip_load_balance_distributed ...
1895 : !> \par History
1896 : !> none
1897 : !> \author MattW 21/11/2007
1898 : ! **************************************************************************************************
1899 15818 : SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
1900 : tasks, atom_pair_send, atom_pair_recv, &
1901 : symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
1902 :
1903 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
1904 : POINTER :: rs_descs
1905 : INTEGER :: ntasks, natoms
1906 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
1907 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
1908 : LOGICAL, INTENT(IN) :: symmetric, reorder_rs_grid_ranks, &
1909 : skip_load_balance_distributed
1910 :
1911 : CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks'
1912 :
1913 : INTEGER :: handle, igrid_level, irank, ntasks_recv
1914 : INTEGER(KIND=int_8) :: load_gap, max_load, replicated_load
1915 15818 : INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:) :: total_loads, total_loads_tmp, trial_loads
1916 15818 : INTEGER(KIND=int_8), DIMENSION(:, :), POINTER :: loads
1917 15818 : INTEGER, ALLOCATABLE, DIMENSION(:) :: indices, real2virtual, total_index
1918 : LOGICAL :: distributed_grids, fixed_first_grid
1919 : TYPE(realspace_grid_desc_type), POINTER :: desc
1920 15818 : TYPE(task_type), DIMENSION(:), POINTER :: tasks_recv
1921 :
1922 15818 : CALL timeset(routineN, handle)
1923 :
1924 15818 : CPASSERT(ASSOCIATED(tasks))
1925 :
1926 : ! *** figure out if we have distributed grids
1927 15818 : distributed_grids = .FALSE.
1928 78602 : DO igrid_level = 1, SIZE(rs_descs)
1929 78602 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1930 54 : distributed_grids = .TRUE.
1931 : END IF
1932 : END DO
1933 15818 : desc => rs_descs(1)%rs_desc
1934 :
1935 15818 : IF (distributed_grids) THEN
1936 :
1937 192 : ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
1938 144 : ALLOCATE (total_loads(0:desc%group_size - 1))
1939 :
1940 144 : total_loads = 0
1941 :
1942 : ! First round of balancing on the distributed grids
1943 : ! we just balance each of the distributed grids independently
1944 234 : DO igrid_level = 1, SIZE(rs_descs)
1945 234 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1946 :
1947 54 : IF (.NOT. skip_load_balance_distributed) &
1948 54 : CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
1949 :
1950 : CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1951 54 : tasks, use_reordered_ranks=.FALSE.)
1952 :
1953 162 : total_loads(:) = total_loads + loads(:, igrid_level)
1954 :
1955 : END IF
1956 : END DO
1957 :
1958 : ! calculate the total load of replicated tasks, so we can decide if it is worth
1959 : ! reordering the distributed grid levels
1960 :
1961 48 : replicated_load = 0
1962 234 : DO igrid_level = 1, SIZE(rs_descs)
1963 234 : IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
1964 : CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
1965 132 : tasks, use_reordered_ranks=.FALSE.)
1966 396 : replicated_load = replicated_load + SUM(loads(:, igrid_level))
1967 : END IF
1968 : END DO
1969 :
1970 : !IF (desc%my_pos==0) THEN
1971 : ! WRITE(*,*) "Total replicated load is ",replicated_load
1972 : !END IF
1973 :
1974 : ! Now we adjust the rank ordering based on the current loads
1975 : ! we leave the first distributed level and all the replicated levels in the default order
1976 48 : IF (reorder_rs_grid_ranks) THEN
1977 48 : fixed_first_grid = .FALSE.
1978 234 : DO igrid_level = 1, SIZE(rs_descs)
1979 234 : IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
1980 54 : IF (fixed_first_grid .EQV. .FALSE.) THEN
1981 144 : total_loads(:) = loads(:, igrid_level)
1982 : fixed_first_grid = .TRUE.
1983 : ELSE
1984 18 : ALLOCATE (trial_loads(0:desc%group_size - 1))
1985 :
1986 18 : trial_loads(:) = total_loads + loads(:, igrid_level)
1987 18 : max_load = MAXVAL(trial_loads)
1988 : load_gap = 0
1989 18 : DO irank = 0, desc%group_size - 1
1990 18 : load_gap = load_gap + max_load - trial_loads(irank)
1991 : END DO
1992 :
1993 : ! If there is not enough replicated load to load balance well enough
1994 : ! then we will reorder this grid level
1995 6 : IF (load_gap > replicated_load*1.05_dp) THEN
1996 :
1997 18 : ALLOCATE (indices(0:desc%group_size - 1))
1998 12 : ALLOCATE (total_index(0:desc%group_size - 1))
1999 12 : ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
2000 12 : ALLOCATE (real2virtual(0:desc%group_size - 1))
2001 :
2002 18 : total_loads_tmp(:) = total_loads
2003 6 : CALL sort(total_loads_tmp, desc%group_size, total_index)
2004 6 : CALL sort(loads(:, igrid_level), desc%group_size, indices)
2005 :
2006 : ! Reorder so that the rank with smallest load on this grid level is paired with
2007 : ! the highest load in total
2008 18 : DO irank = 0, desc%group_size - 1
2009 : total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
2010 12 : loads(desc%group_size - irank - 1, igrid_level)
2011 18 : real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
2012 : END DO
2013 :
2014 6 : CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
2015 :
2016 6 : DEALLOCATE (indices)
2017 6 : DEALLOCATE (total_index)
2018 6 : DEALLOCATE (total_loads_tmp)
2019 6 : DEALLOCATE (real2virtual)
2020 : ELSE
2021 0 : total_loads(:) = trial_loads
2022 : END IF
2023 :
2024 6 : DEALLOCATE (trial_loads)
2025 :
2026 : END IF
2027 : END IF
2028 : END DO
2029 : END IF
2030 :
2031 : ! Now we use the replicated tasks to balance out the rest of the load
2032 48 : CALL load_balance_replicated(rs_descs, ntasks, tasks)
2033 :
2034 : !total_loads = 0
2035 : !DO igrid_level=1,SIZE(rs_descs)
2036 : ! CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
2037 : ! tasks, use_reordered_ranks=.TRUE.)
2038 : ! total_loads = total_loads + loads(:, igrid_level)
2039 : !END DO
2040 :
2041 : !IF (desc%my_pos==0) THEN
2042 : ! WRITE(*,*) ""
2043 : ! WRITE(*,*) "At the end of the load balancing procedure"
2044 : ! WRITE(*,*) "Maximum load:",MAXVAL(total_loads)
2045 : ! WRITE(*,*) "Average load:",SUM(total_loads)/SIZE(total_loads)
2046 : ! WRITE(*,*) "Minimum load:",MINVAL(total_loads)
2047 : !ENDIF
2048 :
2049 : ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
2050 48 : CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
2051 :
2052 : !
2053 : ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
2054 : ! we will be sending. These lists are needed for redistribute_matrix.
2055 : !
2056 48 : CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.TRUE., symmetric=symmetric, rs_descs=rs_descs)
2057 :
2058 : ! natom_send=SIZE(atom_pair_send)
2059 : ! CALL desc%group%sum(natom_send)
2060 : ! IF (desc%my_pos==0) THEN
2061 : ! WRITE(*,*) ""
2062 : ! WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
2063 : ! ENDIF
2064 :
2065 48 : CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
2066 :
2067 : ! cleanup, at this point we don't need the original tasks anymore
2068 48 : DEALLOCATE (tasks)
2069 48 : DEALLOCATE (loads)
2070 48 : DEALLOCATE (total_loads)
2071 :
2072 : ELSE
2073 15770 : tasks_recv => tasks
2074 15770 : ntasks_recv = ntasks
2075 15770 : CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
2076 : ! not distributed, hence atom_pair_send not needed
2077 : END IF
2078 :
2079 : ! here we sort the task list we will process locally.
2080 47174 : ALLOCATE (indices(ntasks_recv))
2081 15818 : CALL tasks_sort(tasks_recv, ntasks_recv, indices)
2082 15818 : DEALLOCATE (indices)
2083 :
2084 : !
2085 : ! final lists are ready
2086 : !
2087 :
2088 15818 : tasks => tasks_recv
2089 15818 : ntasks = ntasks_recv
2090 :
2091 15818 : CALL timestop(handle)
2092 :
2093 31636 : END SUBROUTINE distribute_tasks
2094 :
2095 : ! **************************************************************************************************
2096 : !> \brief ...
2097 : !> \param atom_pair ...
2098 : !> \param my_tasks ...
2099 : !> \param send ...
2100 : !> \param symmetric ...
2101 : !> \param natoms ...
2102 : !> \param nimages ...
2103 : !> \param rs_descs ...
2104 : ! **************************************************************************************************
2105 15866 : SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
2106 :
2107 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair
2108 : TYPE(task_type), DIMENSION(:), INTENT(INOUT) :: tasks
2109 : INTEGER, INTENT(IN) :: ntasks
2110 : LOGICAL, INTENT(IN) :: send, symmetric
2111 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
2112 :
2113 : INTEGER :: i, ilevel, iatom, jatom, npairs, virt_rank
2114 15866 : INTEGER, DIMENSION(:), ALLOCATABLE :: indices
2115 15866 : TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE :: atom_pair_tmp
2116 :
2117 15866 : CPASSERT(.NOT. ASSOCIATED(atom_pair))
2118 15866 : IF (ntasks == 0) THEN
2119 287 : ALLOCATE (atom_pair(0))
2120 : RETURN
2121 : END IF
2122 :
2123 : ! calculate list of atom pairs
2124 : ! fill pair list taking into account symmetry
2125 7900623 : ALLOCATE (atom_pair_tmp(ntasks))
2126 7869465 : DO i = 1, ntasks
2127 7853886 : atom_pair_tmp(i)%image = tasks(i)%image
2128 7853886 : iatom = tasks(i)%iatom
2129 7853886 : jatom = tasks(i)%jatom
2130 7853886 : IF (symmetric .AND. iatom > jatom) THEN
2131 : ! iatom / jatom swapped
2132 2961086 : atom_pair_tmp(i)%row = jatom
2133 2961086 : atom_pair_tmp(i)%col = iatom
2134 : ELSE
2135 4892800 : atom_pair_tmp(i)%row = iatom
2136 4892800 : atom_pair_tmp(i)%col = jatom
2137 : END IF
2138 :
2139 7869465 : IF (send) THEN
2140 : ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
2141 : ! actually has the correct part of the rs_grid to do the mapping
2142 3618 : ilevel = tasks(i)%grid_level
2143 3618 : virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
2144 3618 : atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
2145 : ELSE
2146 : ! If we are receiving, then no conversion is needed as the rank is that of the process with the
2147 : ! required matrix block, and the ordering of the rs grid is irrelevant
2148 7850268 : atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
2149 : END IF
2150 : END DO
2151 :
2152 : ! find unique atom pairs that I'm sending/receiving
2153 46737 : ALLOCATE (indices(ntasks))
2154 15579 : CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
2155 15579 : npairs = 1
2156 15579 : tasks(indices(1))%pair_index = 1
2157 7853886 : DO i = 2, ntasks
2158 7838307 : IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
2159 258397 : npairs = npairs + 1
2160 258397 : atom_pair_tmp(npairs) = atom_pair_tmp(i)
2161 : END IF
2162 7853886 : tasks(indices(i))%pair_index = npairs
2163 : END DO
2164 15579 : DEALLOCATE (indices)
2165 :
2166 : ! Copy unique pairs to final location.
2167 320713 : ALLOCATE (atom_pair(npairs))
2168 289555 : atom_pair(:) = atom_pair_tmp(:npairs)
2169 15579 : DEALLOCATE (atom_pair_tmp)
2170 :
2171 : END SUBROUTINE get_atom_pair
2172 :
2173 : ! **************************************************************************************************
2174 : !> \brief redistributes the matrix so that it can be used in realspace operations
2175 : !> i.e. according to the task lists for collocate and integrate.
2176 : !> This routine can become a bottleneck in large calculations.
2177 : !> \param rs_descs ...
2178 : !> \param pmats ...
2179 : !> \param atom_pair_send ...
2180 : !> \param atom_pair_recv ...
2181 : !> \param natoms ...
2182 : !> \param nimages ...
2183 : !> \param scatter ...
2184 : !> \param hmats ...
2185 : ! **************************************************************************************************
2186 0 : SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
2187 : nimages, scatter, hmats)
2188 :
2189 : TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
2190 : POINTER :: rs_descs
2191 : TYPE(dbcsr_p_type), DIMENSION(:), POINTER :: pmats
2192 : TYPE(atom_pair_type), DIMENSION(:), POINTER :: atom_pair_send, atom_pair_recv
2193 : INTEGER :: nimages
2194 : LOGICAL :: scatter
2195 : TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
2196 : POINTER :: hmats
2197 :
2198 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix'
2199 :
2200 : INTEGER :: acol, arow, handle, i, img, j, k, l, me, &
2201 : nblkcols_total, nblkrows_total, ncol, &
2202 : nrow, nthread, nthread_left
2203 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
2204 0 : recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
2205 0 : send_pair_disps, send_sizes
2206 0 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, row_blk_size
2207 : LOGICAL :: found
2208 0 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: recv_buf_r, send_buf_r
2209 0 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: h_block, p_block
2210 : TYPE(dbcsr_type), POINTER :: hmat, pmat
2211 : TYPE(realspace_grid_desc_type), POINTER :: desc
2212 :
2213 0 : !$ INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2214 :
2215 0 : CALL timeset(routineN, handle)
2216 :
2217 0 : IF (.NOT. scatter) THEN
2218 0 : CPASSERT(PRESENT(hmats))
2219 : END IF
2220 :
2221 0 : desc => rs_descs(1)%rs_desc
2222 0 : me = desc%my_pos + 1
2223 :
2224 : ! allocate local arrays
2225 0 : ALLOCATE (send_sizes(desc%group_size))
2226 0 : ALLOCATE (recv_sizes(desc%group_size))
2227 0 : ALLOCATE (send_disps(desc%group_size))
2228 0 : ALLOCATE (recv_disps(desc%group_size))
2229 0 : ALLOCATE (send_pair_count(desc%group_size))
2230 0 : ALLOCATE (recv_pair_count(desc%group_size))
2231 0 : ALLOCATE (send_pair_disps(desc%group_size))
2232 0 : ALLOCATE (recv_pair_disps(desc%group_size))
2233 :
2234 0 : pmat => pmats(1)%matrix
2235 : CALL dbcsr_get_info(pmat, &
2236 : row_blk_size=row_blk_size, &
2237 : col_blk_size=col_blk_size, &
2238 : nblkrows_total=nblkrows_total, &
2239 0 : nblkcols_total=nblkcols_total)
2240 0 : ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
2241 0 : first_col(nblkcols_total), last_col(nblkcols_total))
2242 0 : CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
2243 0 : CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
2244 :
2245 : ! set up send buffer sizes
2246 0 : send_sizes = 0
2247 0 : send_pair_count = 0
2248 0 : DO i = 1, SIZE(atom_pair_send)
2249 0 : k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
2250 0 : arow = atom_pair_send(i)%row
2251 0 : acol = atom_pair_send(i)%col
2252 0 : nrow = last_row(arow) - first_row(arow) + 1
2253 0 : ncol = last_col(acol) - first_col(acol) + 1
2254 0 : send_sizes(k) = send_sizes(k) + nrow*ncol
2255 0 : send_pair_count(k) = send_pair_count(k) + 1
2256 : END DO
2257 :
2258 0 : send_disps = 0
2259 0 : send_pair_disps = 0
2260 0 : DO i = 2, desc%group_size
2261 0 : send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
2262 0 : send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
2263 : END DO
2264 :
2265 0 : ALLOCATE (send_buf_r(SUM(send_sizes)))
2266 :
2267 : ! set up recv buffer
2268 :
2269 0 : recv_sizes = 0
2270 0 : recv_pair_count = 0
2271 0 : DO i = 1, SIZE(atom_pair_recv)
2272 0 : k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
2273 0 : arow = atom_pair_recv(i)%row
2274 0 : acol = atom_pair_recv(i)%col
2275 0 : nrow = last_row(arow) - first_row(arow) + 1
2276 0 : ncol = last_col(acol) - first_col(acol) + 1
2277 0 : recv_sizes(k) = recv_sizes(k) + nrow*ncol
2278 0 : recv_pair_count(k) = recv_pair_count(k) + 1
2279 : END DO
2280 :
2281 0 : recv_disps = 0
2282 0 : recv_pair_disps = 0
2283 0 : DO i = 2, desc%group_size
2284 0 : recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
2285 0 : recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
2286 : END DO
2287 0 : ALLOCATE (recv_buf_r(SUM(recv_sizes)))
2288 :
2289 : !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
2290 : !$OMP SHARED(desc,send_pair_count,send_pair_disps,nimages),&
2291 : !$OMP SHARED(last_row,first_row,last_col,first_col),&
2292 : !$OMP SHARED(pmats,send_buf_r,send_disps,send_sizes),&
2293 : !$OMP SHARED(atom_pair_send,me,hmats,nblkrows_total),&
2294 : !$OMP SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
2295 : !$OMP SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
2296 : !$OMP PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
2297 0 : !$OMP PRIVATE(nthread,h_block,nthread_left,hmat,pmat)
2298 :
2299 : nthread = 1
2300 : !$ nthread = omp_get_num_threads()
2301 : nthread_left = 1
2302 : !$ nthread_left = MAX(1, nthread - 1)
2303 :
2304 : ! do packing
2305 : !$OMP DO schedule(guided)
2306 : DO l = 1, desc%group_size
2307 : IF (l .EQ. me) CYCLE
2308 : send_sizes(l) = 0
2309 : DO i = 1, send_pair_count(l)
2310 : arow = atom_pair_send(send_pair_disps(l) + i)%row
2311 : acol = atom_pair_send(send_pair_disps(l) + i)%col
2312 : img = atom_pair_send(send_pair_disps(l) + i)%image
2313 : nrow = last_row(arow) - first_row(arow) + 1
2314 : ncol = last_col(acol) - first_col(acol) + 1
2315 : pmat => pmats(img)%matrix
2316 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
2317 : CPASSERT(found)
2318 :
2319 : DO k = 1, ncol
2320 : DO j = 1, nrow
2321 : send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
2322 : END DO
2323 : END DO
2324 : send_sizes(l) = send_sizes(l) + nrow*ncol
2325 : END DO
2326 : END DO
2327 : !$OMP END DO
2328 :
2329 : IF (.NOT. scatter) THEN
2330 : ! We need locks to protect concurrent summation into H
2331 : !$OMP SINGLE
2332 : !$ ALLOCATE (locks(nthread*10))
2333 : !$OMP END SINGLE
2334 :
2335 : !$OMP DO
2336 : !$ do i = 1, nthread*10
2337 : !$ call omp_init_lock(locks(i))
2338 : !$ end do
2339 : !$OMP END DO
2340 : END IF
2341 :
2342 : !$OMP MASTER
2343 : ! do communication
2344 : CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
2345 : recv_buf_r, recv_sizes, recv_disps)
2346 : !$OMP END MASTER
2347 :
2348 : ! If this is a scatter, then no need to copy local blocks,
2349 : ! If not, we sum them directly into H (bypassing the alltoall)
2350 : IF (.NOT. scatter) THEN
2351 :
2352 : ! Distribute work over remaining threads assuming one is still in the alltoall
2353 : !$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
2354 : DO i = 1, send_pair_count(me)
2355 : arow = atom_pair_send(send_pair_disps(me) + i)%row
2356 : acol = atom_pair_send(send_pair_disps(me) + i)%col
2357 : img = atom_pair_send(send_pair_disps(me) + i)%image
2358 : nrow = last_row(arow) - first_row(arow) + 1
2359 : ncol = last_col(acol) - first_col(acol) + 1
2360 : hmat => hmats(img)%matrix
2361 : pmat => pmats(img)%matrix
2362 : CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
2363 : CPASSERT(found)
2364 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
2365 : CPASSERT(found)
2366 :
2367 : !$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2368 : DO k = 1, ncol
2369 : DO j = 1, nrow
2370 : h_block(j, k) = h_block(j, k) + p_block(j, k)
2371 : END DO
2372 : END DO
2373 : !$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2374 : END DO
2375 : !$OMP END DO
2376 : ELSE
2377 : ! We will insert new blocks into P, so create mutable work matrices
2378 : DO img = 1, nimages
2379 : pmat => pmats(img)%matrix
2380 : CALL dbcsr_work_create(pmat, work_mutable=.TRUE., &
2381 : nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
2382 : n=nthread)
2383 : END DO
2384 : END IF
2385 :
2386 : ! wait for comm and setup to finish
2387 : !$OMP BARRIER
2388 :
2389 : !do unpacking
2390 : !$OMP DO schedule(guided)
2391 : DO l = 1, desc%group_size
2392 : IF (l .EQ. me) CYCLE
2393 : recv_sizes(l) = 0
2394 : DO i = 1, recv_pair_count(l)
2395 : arow = atom_pair_recv(recv_pair_disps(l) + i)%row
2396 : acol = atom_pair_recv(recv_pair_disps(l) + i)%col
2397 : img = atom_pair_recv(recv_pair_disps(l) + i)%image
2398 : nrow = last_row(arow) - first_row(arow) + 1
2399 : ncol = last_col(acol) - first_col(acol) + 1
2400 : pmat => pmats(img)%matrix
2401 : NULLIFY (p_block)
2402 : CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
2403 :
2404 : IF (PRESENT(hmats)) THEN
2405 : hmat => hmats(img)%matrix
2406 : CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
2407 : CPASSERT(found)
2408 : END IF
2409 :
2410 : IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
2411 : CALL dbcsr_put_block(pmat, arow, acol, &
2412 : block=recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol))
2413 : END IF
2414 : IF (.NOT. scatter) THEN
2415 : !$ call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2416 : DO k = 1, ncol
2417 : DO j = 1, nrow
2418 : h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
2419 : END DO
2420 : END DO
2421 : !$ call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
2422 : END IF
2423 : recv_sizes(l) = recv_sizes(l) + nrow*ncol
2424 : END DO
2425 : END DO
2426 : !$OMP END DO
2427 :
2428 : !$ IF (.not. scatter) THEN
2429 : !$OMP DO
2430 : !$ do i = 1, nthread*10
2431 : !$ call omp_destroy_lock(locks(i))
2432 : !$ end do
2433 : !$OMP END DO
2434 : !$ END IF
2435 :
2436 : !$OMP SINGLE
2437 : !$ IF (.not. scatter) THEN
2438 : !$ DEALLOCATE (locks)
2439 : !$ END IF
2440 : !$OMP END SINGLE NOWAIT
2441 :
2442 : IF (scatter) THEN
2443 : ! Blocks were added to P
2444 : DO img = 1, nimages
2445 : pmat => pmats(img)%matrix
2446 : CALL dbcsr_finalize(pmat)
2447 : END DO
2448 : END IF
2449 : !$OMP END PARALLEL
2450 :
2451 0 : DEALLOCATE (send_buf_r)
2452 0 : DEALLOCATE (recv_buf_r)
2453 :
2454 0 : DEALLOCATE (send_sizes)
2455 0 : DEALLOCATE (recv_sizes)
2456 0 : DEALLOCATE (send_disps)
2457 0 : DEALLOCATE (recv_disps)
2458 0 : DEALLOCATE (send_pair_count)
2459 0 : DEALLOCATE (recv_pair_count)
2460 0 : DEALLOCATE (send_pair_disps)
2461 0 : DEALLOCATE (recv_pair_disps)
2462 :
2463 0 : DEALLOCATE (first_row, last_row, first_col, last_col)
2464 :
2465 0 : CALL timestop(handle)
2466 :
2467 0 : END SUBROUTINE rs_distribute_matrix
2468 :
2469 : ! **************************************************************************************************
2470 : !> \brief Calculates offsets and sizes for rs_scatter_matrix and rs_copy_matrix.
2471 : !> \author Ole Schuett
2472 : ! **************************************************************************************************
2473 13472 : SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
2474 : pair_offsets, rank_offsets, rank_sizes, buffer_size)
2475 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: pairs
2476 : INTEGER, DIMENSION(:), INTENT(IN) :: nsgf
2477 : INTEGER, INTENT(IN) :: group_size
2478 : INTEGER, DIMENSION(:), POINTER :: pair_offsets, rank_offsets, rank_sizes
2479 : INTEGER, INTENT(INOUT) :: buffer_size
2480 :
2481 : INTEGER :: acol, arow, i, block_size, total_size, k, prev_k
2482 :
2483 13472 : IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
2484 13472 : IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
2485 13472 : IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
2486 :
2487 : ! calculate buffer_size and pair_offsets
2488 40180 : ALLOCATE (pair_offsets(SIZE(pairs)))
2489 13472 : total_size = 0
2490 276003 : DO i = 1, SIZE(pairs)
2491 262531 : pair_offsets(i) = total_size
2492 262531 : arow = pairs(i)%row
2493 262531 : acol = pairs(i)%col
2494 262531 : block_size = nsgf(arow)*nsgf(acol)
2495 276003 : total_size = total_size + block_size
2496 : END DO
2497 13472 : buffer_size = total_size
2498 :
2499 : ! calculate rank_offsets and rank_sizes
2500 40416 : ALLOCATE (rank_offsets(group_size))
2501 26944 : ALLOCATE (rank_sizes(group_size))
2502 39224 : rank_offsets = 0
2503 39224 : rank_sizes = 0
2504 13472 : IF (SIZE(pairs) > 0) THEN
2505 13236 : prev_k = pairs(1)%rank + 1
2506 275767 : DO i = 1, SIZE(pairs)
2507 262531 : k = pairs(i)%rank + 1
2508 262531 : CPASSERT(k >= prev_k) ! expecting the pairs to be ordered by rank
2509 275767 : IF (k > prev_k) THEN
2510 61 : rank_offsets(k) = pair_offsets(i)
2511 61 : rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
2512 61 : prev_k = k
2513 : END IF
2514 : END DO
2515 13236 : rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
2516 : END IF
2517 :
2518 13472 : END SUBROUTINE rs_calc_offsets
2519 :
2520 : ! **************************************************************************************************
2521 : !> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
2522 : !> \author Ole Schuett
2523 : ! **************************************************************************************************
2524 266 : SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
2525 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2526 : TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2527 : TYPE(task_list_type), INTENT(IN) :: task_list
2528 : TYPE(mp_comm_type), INTENT(IN) :: group
2529 :
2530 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_scatter_matrices'
2531 :
2532 : INTEGER :: handle
2533 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2534 :
2535 266 : CALL timeset(routineN, handle)
2536 777 : ALLOCATE (buffer_send(task_list%buffer_size_send))
2537 :
2538 : ! pack dbcsr blocks into send buffer
2539 266 : CPASSERT(ASSOCIATED(task_list%atom_pair_send))
2540 : CALL rs_pack_buffer(src_matrices=src_matrices, &
2541 : dest_buffer=buffer_send, &
2542 : atom_pair=task_list%atom_pair_send, &
2543 266 : pair_offsets=task_list%pair_offsets_send)
2544 :
2545 : ! mpi all-to-all communication, receiving directly into blocks_recv%buffer.
2546 : CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
2547 : dest_buffer%host_buffer, &
2548 2394 : task_list%rank_sizes_recv, task_list%rank_offsets_recv)
2549 :
2550 266 : DEALLOCATE (buffer_send)
2551 266 : CALL timestop(handle)
2552 :
2553 266 : END SUBROUTINE rs_scatter_matrices
2554 :
2555 : ! **************************************************************************************************
2556 : !> \brief Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
2557 : !> \author Ole Schuett
2558 : ! **************************************************************************************************
2559 218 : SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
2560 : TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2561 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2562 : TYPE(task_list_type), INTENT(IN) :: task_list
2563 : TYPE(mp_comm_type), INTENT(IN) :: group
2564 :
2565 : CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_gather_matrices'
2566 :
2567 : INTEGER :: handle
2568 : REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: buffer_send
2569 :
2570 218 : CALL timeset(routineN, handle)
2571 :
2572 : ! Caution: The meaning of send and recv are reversed in this routine.
2573 640 : ALLOCATE (buffer_send(task_list%buffer_size_send)) ! e.g. this is actually used for receiving
2574 :
2575 : ! mpi all-to-all communication
2576 : CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
2577 1962 : buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
2578 :
2579 : ! unpack dbcsr blocks from send buffer
2580 218 : CPASSERT(ASSOCIATED(task_list%atom_pair_send))
2581 : CALL rs_unpack_buffer(src_buffer=buffer_send, &
2582 : dest_matrices=dest_matrices, &
2583 : atom_pair=task_list%atom_pair_send, &
2584 218 : pair_offsets=task_list%pair_offsets_send)
2585 :
2586 218 : DEALLOCATE (buffer_send)
2587 218 : CALL timestop(handle)
2588 :
2589 218 : END SUBROUTINE rs_gather_matrices
2590 :
2591 : ! **************************************************************************************************
2592 : !> \brief Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
2593 : !> \author Ole Schuett
2594 : ! **************************************************************************************************
2595 215250 : SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
2596 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2597 : TYPE(offload_buffer_type), INTENT(INOUT) :: dest_buffer
2598 : TYPE(task_list_type), INTENT(IN) :: task_list
2599 :
2600 : CALL rs_pack_buffer(src_matrices=src_matrices, &
2601 : dest_buffer=dest_buffer%host_buffer, &
2602 : atom_pair=task_list%atom_pair_recv, &
2603 215250 : pair_offsets=task_list%pair_offsets_recv)
2604 :
2605 215250 : END SUBROUTINE rs_copy_to_buffer
2606 :
2607 : ! **************************************************************************************************
2608 : !> \brief Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
2609 : !> \author Ole Schuett
2610 : ! **************************************************************************************************
2611 180595 : SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
2612 : TYPE(offload_buffer_type), INTENT(IN) :: src_buffer
2613 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2614 : TYPE(task_list_type), INTENT(IN) :: task_list
2615 :
2616 : CALL rs_unpack_buffer(src_buffer=src_buffer%host_buffer, &
2617 : dest_matrices=dest_matrices, &
2618 : atom_pair=task_list%atom_pair_recv, &
2619 180595 : pair_offsets=task_list%pair_offsets_recv)
2620 :
2621 180595 : END SUBROUTINE rs_copy_to_matrices
2622 :
2623 : ! **************************************************************************************************
2624 : !> \brief Helper routine for rs_scatter_matrix and rs_copy_to_buffer.
2625 : !> \author Ole Schuett
2626 : ! **************************************************************************************************
2627 215516 : SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
2628 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN) :: src_matrices
2629 : REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: dest_buffer
2630 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2631 : INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2632 :
2633 : INTEGER :: acol, arow, img, i, offset, block_size
2634 : LOGICAL :: found
2635 215516 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: block
2636 :
2637 : !$OMP PARALLEL DEFAULT(NONE), &
2638 : !$OMP SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
2639 215516 : !$OMP PRIVATE(acol,arow,img,i,offset,block_size,found,block)
2640 : !$OMP DO schedule(guided)
2641 : DO i = 1, SIZE(atom_pair)
2642 : arow = atom_pair(i)%row
2643 : acol = atom_pair(i)%col
2644 : img = atom_pair(i)%image
2645 : CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=arow, col=acol, &
2646 : block=block, found=found)
2647 : CPASSERT(found)
2648 : block_size = SIZE(block)
2649 : offset = pair_offsets(i)
2650 : dest_buffer(offset + 1:offset + block_size) = RESHAPE(block, shape=(/block_size/))
2651 : END DO
2652 : !$OMP END DO
2653 : !$OMP END PARALLEL
2654 :
2655 215516 : END SUBROUTINE rs_pack_buffer
2656 :
2657 : ! **************************************************************************************************
2658 : !> \brief Helper routine for rs_gather_matrix and rs_copy_to_matrices.
2659 : !> \author Ole Schuett
2660 : ! **************************************************************************************************
2661 180813 : SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
2662 : REAL(KIND=dp), DIMENSION(:), INTENT(IN) :: src_buffer
2663 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT) :: dest_matrices
2664 : TYPE(atom_pair_type), DIMENSION(:), INTENT(IN) :: atom_pair
2665 : INTEGER, DIMENSION(:), INTENT(IN) :: pair_offsets
2666 :
2667 : INTEGER :: acol, arow, img, i, offset, &
2668 : nrows, ncols, lock_num
2669 : LOGICAL :: found
2670 180813 : REAL(KIND=dp), DIMENSION(:, :), POINTER :: block
2671 180813 : INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
2672 :
2673 : ! initialize locks
2674 542439 : ALLOCATE (locks(10*omp_get_max_threads()))
2675 1988943 : DO i = 1, SIZE(locks)
2676 1988943 : CALL omp_init_lock(locks(i))
2677 : END DO
2678 :
2679 : !$OMP PARALLEL DEFAULT(NONE), &
2680 : !$OMP SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
2681 180813 : !$OMP PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
2682 : !$OMP DO schedule(guided)
2683 : DO i = 1, SIZE(atom_pair)
2684 : arow = atom_pair(i)%row
2685 : acol = atom_pair(i)%col
2686 : img = atom_pair(i)%image
2687 : CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
2688 : block=block, found=found)
2689 : CPASSERT(found)
2690 : nrows = SIZE(block, 1)
2691 : ncols = SIZE(block, 2)
2692 : offset = pair_offsets(i)
2693 : lock_num = MODULO(arow, SIZE(locks)) + 1 ! map matrix rows round-robin to available locks
2694 :
2695 : CALL omp_set_lock(locks(lock_num))
2696 : block = block + RESHAPE(src_buffer(offset + 1:offset + nrows*ncols), shape=(/nrows, ncols/))
2697 : CALL omp_unset_lock(locks(lock_num))
2698 : END DO
2699 : !$OMP END DO
2700 : !$OMP END PARALLEL
2701 :
2702 : ! destroy locks
2703 1988943 : DO i = 1, SIZE(locks)
2704 1988943 : CALL omp_destroy_lock(locks(i))
2705 : END DO
2706 180813 : DEALLOCATE (locks)
2707 :
2708 180813 : END SUBROUTINE rs_unpack_buffer
2709 :
2710 : ! **************************************************************************************************
2711 : !> \brief determines the rank of the processor who's real rs grid contains point
2712 : !> \param rs_desc ...
2713 : !> \param igrid_level ...
2714 : !> \param n_levels ...
2715 : !> \param cube_center ...
2716 : !> \param ntasks ...
2717 : !> \param tasks ...
2718 : !> \param lb_cube ...
2719 : !> \param ub_cube ...
2720 : !> \param added_tasks ...
2721 : !> \par History
2722 : !> 11.2007 created [MattW]
2723 : !> 10.2008 rewritten [Joost VandeVondele]
2724 : !> \author MattW
2725 : ! **************************************************************************************************
2726 1380 : SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
2727 : lb_cube, ub_cube, added_tasks)
2728 :
2729 : TYPE(realspace_grid_desc_type), POINTER :: rs_desc
2730 : INTEGER, INTENT(IN) :: igrid_level, n_levels
2731 : INTEGER, DIMENSION(3), INTENT(IN) :: cube_center
2732 : INTEGER, INTENT(INOUT) :: ntasks
2733 : TYPE(task_type), DIMENSION(:), POINTER :: tasks
2734 : INTEGER, DIMENSION(3), INTENT(IN) :: lb_cube, ub_cube
2735 : INTEGER, INTENT(OUT) :: added_tasks
2736 :
2737 : INTEGER, PARAMETER :: add_tasks = 1000
2738 : REAL(kind=dp), PARAMETER :: mult_tasks = 2.0_dp
2739 :
2740 : INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
2741 : lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
2742 : INTEGER :: bit_pattern
2743 : LOGICAL :: dir_periodic(3)
2744 :
2745 1380 : coord(1) = rs_desc%x2coord(cube_center(1))
2746 1380 : coord(2) = rs_desc%y2coord(cube_center(2))
2747 1380 : coord(3) = rs_desc%z2coord(cube_center(3))
2748 1380 : dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
2749 :
2750 : ! the real cube coordinates
2751 5520 : lbc = lb_cube + cube_center
2752 5520 : ubc = ub_cube + cube_center
2753 :
2754 9660 : IF (ALL((rs_desc%lb_global(:, dest) - rs_desc%border) .LE. lbc) .AND. &
2755 : ALL((rs_desc%ub_global(:, dest) + rs_desc%border) .GE. ubc)) THEN
2756 : !standard distributed collocation/integration
2757 1380 : tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
2758 1380 : tasks(ntasks)%dist_type = 1
2759 1380 : tasks(ntasks)%subpatch_pattern = 0
2760 1380 : added_tasks = 1
2761 :
2762 : ! here we figure out if there is an alternate location for this task
2763 : ! i.e. even though the cube_center is not in the real local domain,
2764 : ! it might fully fit in the halo of the neighbor
2765 : ! if its radius is smaller than the maximum radius
2766 : ! the list of possible neighbors is stored here as a bit pattern
2767 : ! to reconstruct what the rank of the neigbor is,
2768 : ! we can use (note this requires the correct rs_grid)
2769 : ! IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/-1,0,0/))
2770 : ! IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/+1,0,0/))
2771 : ! IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,-1,0/))
2772 : ! IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,+1,0/))
2773 : ! IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,0,-1/))
2774 : ! IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,0,+1/))
2775 1380 : bit_index = 0
2776 1380 : bit_pattern = 0
2777 5520 : DO i = 1, 3
2778 5520 : IF (rs_desc%perd(i) == 1) THEN
2779 2760 : bit_pattern = IBCLR(bit_pattern, bit_index)
2780 2760 : bit_index = bit_index + 1
2781 2760 : bit_pattern = IBCLR(bit_pattern, bit_index)
2782 2760 : bit_index = bit_index + 1
2783 : ELSE
2784 : ! fits the left neighbor ?
2785 1380 : IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
2786 599 : bit_pattern = IBSET(bit_pattern, bit_index)
2787 599 : bit_index = bit_index + 1
2788 : ELSE
2789 781 : bit_pattern = IBCLR(bit_pattern, bit_index)
2790 781 : bit_index = bit_index + 1
2791 : END IF
2792 : ! fits the right neighbor ?
2793 1380 : IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
2794 250 : bit_pattern = IBSET(bit_pattern, bit_index)
2795 250 : bit_index = bit_index + 1
2796 : ELSE
2797 1130 : bit_pattern = IBCLR(bit_pattern, bit_index)
2798 1130 : bit_index = bit_index + 1
2799 : END IF
2800 : END IF
2801 : END DO
2802 1380 : tasks(ntasks)%subpatch_pattern = bit_pattern
2803 :
2804 : ELSE
2805 : ! generalised collocation/integration needed
2806 : ! first we figure out how many neighbors we have to add to include the lbc/ubc
2807 : ! in the available domains (inclusive of halo points)
2808 : ! first we 'ignore' periodic boundary conditions
2809 : ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
2810 0 : lb_coord = coord
2811 0 : ub_coord = coord
2812 0 : lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
2813 0 : ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
2814 0 : DO i = 1, 3
2815 : ! only if the grid is not periodic in this direction we need to take care of adding neighbors
2816 0 : IF (rs_desc%perd(i) == 0) THEN
2817 : ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
2818 0 : DO
2819 0 : IF (lb_domain(i) > lbc(i)) THEN
2820 0 : lb_coord(i) = lb_coord(i) - 1
2821 0 : icoord = MODULO(lb_coord, rs_desc%group_dim)
2822 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2823 0 : lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2824 : ELSE
2825 : EXIT
2826 : END IF
2827 : END DO
2828 : ! same for the upper bound
2829 0 : DO
2830 0 : IF (ub_domain(i) < ubc(i)) THEN
2831 0 : ub_coord(i) = ub_coord(i) + 1
2832 0 : icoord = MODULO(ub_coord, rs_desc%group_dim)
2833 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2834 0 : ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
2835 : ELSE
2836 : EXIT
2837 : END IF
2838 : END DO
2839 : END IF
2840 : END DO
2841 :
2842 : ! some care is needed for the periodic boundaries
2843 0 : DO i = 1, 3
2844 0 : IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
2845 0 : dir_periodic(i) = .TRUE.
2846 0 : lb_coord(i) = 0
2847 0 : ub_coord(i) = rs_desc%group_dim(i) - 1
2848 : ELSE
2849 0 : dir_periodic(i) = .FALSE.
2850 : END IF
2851 : END DO
2852 :
2853 0 : added_tasks = PRODUCT(ub_coord - lb_coord + 1)
2854 0 : itask = ntasks
2855 0 : ntasks = ntasks + added_tasks - 1
2856 0 : IF (ntasks > SIZE(tasks)) THEN
2857 0 : curr_tasks = INT((SIZE(tasks) + add_tasks)*mult_tasks)
2858 0 : CALL reallocate_tasks(tasks, curr_tasks)
2859 : END IF
2860 0 : DO iz = lb_coord(3), ub_coord(3)
2861 0 : DO iy = lb_coord(2), ub_coord(2)
2862 0 : DO ix = lb_coord(1), ub_coord(1)
2863 0 : icoord = MODULO((/ix, iy, iz/), rs_desc%group_dim)
2864 0 : idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
2865 0 : tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
2866 0 : tasks(itask)%dist_type = 2
2867 0 : tasks(itask)%subpatch_pattern = 0
2868 : ! encode the domain size for this task
2869 : ! if the bit is set, we need to add the border in that direction
2870 0 : IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
2871 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 0)
2872 0 : IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
2873 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 1)
2874 0 : IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
2875 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 2)
2876 0 : IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
2877 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 3)
2878 0 : IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
2879 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 4)
2880 0 : IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
2881 0 : tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 5)
2882 0 : itask = itask + 1
2883 : END DO
2884 : END DO
2885 : END DO
2886 : END IF
2887 :
2888 1380 : END SUBROUTINE rs_find_node
2889 :
2890 : ! **************************************************************************************************
2891 : !> \brief utility functions for encoding the grid level with a rank, allowing
2892 : !> different grid levels to maintain a different rank ordering without
2893 : !> losing information. These encoded_ints are stored in the tasks(1:2,:) array
2894 : !> \param rank ...
2895 : !> \param grid_level ...
2896 : !> \param n_levels ...
2897 : !> \return ...
2898 : !> \par History
2899 : !> 4.2009 created [Iain Bethune]
2900 : !> (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
2901 : ! **************************************************************************************************
2902 15702724 : FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
2903 :
2904 : INTEGER, INTENT(IN) :: rank, grid_level, n_levels
2905 : INTEGER :: encoded_int
2906 :
2907 : ! ordered so can still sort by rank
2908 :
2909 15702724 : encoded_int = rank*n_levels + grid_level - 1
2910 :
2911 15702724 : END FUNCTION
2912 :
2913 : ! **************************************************************************************************
2914 : !> \brief ...
2915 : !> \param encoded_int ...
2916 : !> \param n_levels ...
2917 : !> \return ...
2918 : ! **************************************************************************************************
2919 7897352 : FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
2920 :
2921 : INTEGER, INTENT(IN) :: encoded_int
2922 : INTEGER, INTENT(IN) :: n_levels
2923 : INTEGER :: rank
2924 :
2925 7897352 : rank = INT(encoded_int/n_levels)
2926 :
2927 7897352 : END FUNCTION
2928 :
2929 : ! **************************************************************************************************
2930 : !> \brief ...
2931 : !> \param encoded_int ...
2932 : !> \param n_levels ...
2933 : !> \return ...
2934 : ! **************************************************************************************************
2935 7236 : FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
2936 :
2937 : INTEGER, INTENT(IN) :: encoded_int
2938 : INTEGER, INTENT(IN) :: n_levels
2939 : INTEGER :: grid_level
2940 :
2941 7236 : grid_level = INT(MODULO(encoded_int, n_levels)) + 1
2942 :
2943 7236 : END FUNCTION decode_level
2944 :
2945 : ! **************************************************************************************************
2946 : !> \brief Sort pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) for which all atom pairs are
2947 : !> grouped, and for each atom pair all set pairs are grouped and for each set pair,
2948 : !> all pgfs are grouped.
2949 : !> This yields the right order of the tasks for collocation after the sort
2950 : !> in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
2951 : !> The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
2952 : !> that a given density matrix block will be decontracted several times,
2953 : !> but cache effects on the grid make up for this.
2954 : !> \param a ...
2955 : !> \param b ...
2956 : !> \return ...
2957 : !> \author Ole Schuett
2958 : ! **************************************************************************************************
2959 74208817 : PURE FUNCTION tasks_less_than(a, b) RESULT(res)
2960 : TYPE(task_type), INTENT(IN) :: a, b
2961 : LOGICAL :: res
2962 :
2963 74208817 : IF (a%grid_level /= b%grid_level) THEN
2964 18995994 : res = a%grid_level < b%grid_level
2965 :
2966 55212823 : ELSE IF (a%image /= b%image) THEN
2967 5395236 : res = a%image < b%image
2968 :
2969 49817587 : ELSE IF (a%iatom /= b%iatom) THEN
2970 6412539 : res = a%iatom < b%iatom
2971 :
2972 43405048 : ELSE IF (a%jatom /= b%jatom) THEN
2973 6914707 : res = a%jatom < b%jatom
2974 :
2975 36490341 : ELSE IF (a%iset /= b%iset) THEN
2976 5824192 : res = a%iset < b%iset
2977 :
2978 30666149 : ELSE IF (a%jset /= b%jset) THEN
2979 6234464 : res = a%jset < b%jset
2980 :
2981 24431685 : ELSE IF (a%ipgf /= b%ipgf) THEN
2982 6229140 : res = a%ipgf < b%ipgf
2983 :
2984 : ELSE
2985 18202545 : res = a%jpgf < b%jpgf
2986 :
2987 : END IF
2988 74208817 : END FUNCTION tasks_less_than
2989 :
2990 176070911 : #:call array_sort(prefix='tasks', type='TYPE(task_type)')
2991 : #:endcall
2992 :
2993 : ! **************************************************************************************************
2994 : !> \brief Order atom pairs to find duplicates.
2995 : !> \param a ...
2996 : !> \param b ...
2997 : !> \return ...
2998 : !> \author Ole Schuett
2999 : ! **************************************************************************************************
3000 31681101 : PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
3001 : TYPE(atom_pair_type), INTENT(IN) :: a, b
3002 : LOGICAL :: res
3003 :
3004 31681101 : IF (a%rank /= b%rank) THEN
3005 4243 : res = a%rank < b%rank
3006 :
3007 31676858 : ELSE IF (a%row /= b%row) THEN
3008 5173148 : res = a%row < b%row
3009 :
3010 26503710 : ELSE IF (a%col /= b%col) THEN
3011 5191450 : res = a%col < b%col
3012 :
3013 : ELSE
3014 21312260 : res = a%image < b%image
3015 :
3016 : END IF
3017 31681101 : END FUNCTION atom_pair_less_than
3018 :
3019 67142645 : #:call array_sort(prefix='atom_pair', type='TYPE(atom_pair_type)')
3020 : #:endcall
3021 :
3022 : END MODULE task_list_methods
|