Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Routines for optimizing load balance between processes in HFX calculations
10 : !> \par History
11 : !> 04.2008 created [Manuel Guidon]
12 : !> \author Manuel Guidon
13 : ! **************************************************************************************************
14 : MODULE hfx_load_balance_methods
15 : USE cell_types, ONLY: cell_type
16 : USE cp_files, ONLY: close_file, &
17 : open_file
18 : USE message_passing, ONLY: mp_para_env_type
19 : USE hfx_pair_list_methods, ONLY: build_atomic_pair_list, &
20 : build_pair_list
21 : USE hfx_types, ONLY: &
22 : hfx_basis_type, hfx_block_range_type, hfx_distribution, hfx_load_balance_type, hfx_p_kind, &
23 : hfx_screen_coeff_type, hfx_set_distr_energy, hfx_set_distr_forces, hfx_type, &
24 : pair_list_type, pair_set_list_type
25 : USE input_constants, ONLY: hfx_do_eval_energy, &
26 : hfx_do_eval_forces
27 : USE kinds, ONLY: dp, &
28 : int_8
29 : USE message_passing, ONLY: mp_waitall, mp_request_type
30 : USE parallel_rng_types, ONLY: UNIFORM, &
31 : rng_stream_type
32 : USE particle_types, ONLY: particle_type
33 : USE util, ONLY: sort
34 : #include "./base/base_uses.f90"
35 :
36 : IMPLICIT NONE
37 : PRIVATE
38 :
39 : PUBLIC :: hfx_load_balance, &
40 : hfx_update_load_balance, &
41 : collect_load_balance_info, cost_model, p1_energy, p2_energy, p3_energy
42 :
43 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'hfx_load_balance_methods'
44 :
45 : REAL(KIND=dp), PARAMETER :: p1_energy(12) = (/2.9461408209700424_dp, 1.0624718662999657_dp, &
46 : -1.91570128356921242E-002_dp, -1.6668495454436603_dp, &
47 : 1.7512639006523709_dp, -9.76074323945336081E-002_dp, &
48 : 2.6230786127311889_dp, -0.31870737623014189_dp, &
49 : 7.9588203912690973_dp, 1.8331423413134813_dp, &
50 : -0.15427618665346299_dp, 0.19749436090711650_dp/)
51 : REAL(KIND=dp), PARAMETER :: p2_energy(12) = (/2.3104682960662593_dp, 1.8744052737304417_dp, &
52 : -9.36564055598656797E-002_dp, 0.64284973765086939_dp, &
53 : 1.0137565430060556_dp, -6.80088178288954567E-003_dp, &
54 : 1.1692629207374552_dp, -2.6314710080507573_dp, &
55 : 19.237814781880786_dp, 1.0505934173661349_dp, &
56 : 0.80382371955699250_dp, 0.49903401991818103_dp/)
57 : REAL(KIND=dp), PARAMETER :: p3_energy(2) = (/7.82336287670072350E-002_dp, 0.38073304105744837_dp/)
58 : REAL(KIND=dp), PARAMETER :: p1_forces(12) = (/2.5746279948798874_dp, 1.3420575378609276_dp, &
59 : -9.41673106447732111E-002_dp, 0.94568006899317825_dp, &
60 : -1.4511897117448544_dp, 0.59178934677316952_dp, &
61 : 2.7291149361757236_dp, -0.50555512044800210_dp, &
62 : 8.3508180969609871_dp, 1.6829982496141809_dp, &
63 : -0.74895370472152600_dp, 0.43801726744197500_dp/)
64 : REAL(KIND=dp), PARAMETER :: p2_forces(12) = (/2.6398568961569020_dp, 2.3024918834564101_dp, &
65 : 5.33216585432061581E-003_dp, 0.45572145697283628_dp, &
66 : 1.8119743851500618_dp, -0.12533918548421166_dp, &
67 : -1.4040312084552751_dp, -4.5331650463917859_dp, &
68 : 12.593431549069477_dp, 1.1311978374487595_dp, &
69 : 1.4245996087624646_dp, 1.1425350529853495_dp/)
70 : REAL(KIND=dp), PARAMETER :: p3_forces(2) = (/0.12051930516830946_dp, 1.3828051586144336_dp/)
71 :
72 : !***
73 :
74 : CONTAINS
75 :
76 : ! **************************************************************************************************
77 : !> \brief Distributes the computation of eri's to all available processes.
78 : !> \param x_data Object that stores the indices array
79 : !> \param eps_schwarz screening parameter
80 : !> \param particle_set , atomic_kind_set, para_env ...
81 : !> \param max_set Maximum number of set to be considered
82 : !> \param para_env para_env
83 : !> \param coeffs_set screening functions
84 : !> \param coeffs_kind screening functions
85 : !> \param is_assoc_atomic_block_global KS-matrix sparsity
86 : !> \param do_periodic flag for periodicity
87 : !> \param load_balance_parameter Parameters for Monte-Carlo routines
88 : !> \param kind_of helper array for mapping
89 : !> \param basis_parameter Basis set parameters
90 : !> \param pmax_set Initial screening matrix
91 : !> \param pmax_atom ...
92 : !> \param i_thread Process ID of current Thread
93 : !> \param n_threads Total Number of Threads
94 : !> \param cell cell
95 : !> \param do_p_screening Flag for initial p screening
96 : !> \param map_atom_to_kind_atom ...
97 : !> \param nkind ...
98 : !> \param eval_type ...
99 : !> \param pmax_block ...
100 : !> \param use_virial ...
101 : !> \par History
102 : !> 06.2007 created [Manuel Guidon]
103 : !> 08.2007 new parallel scheme [Manuel Guidon]
104 : !> 09.2007 new 'modulo' parellel scheme and Monte Carlo step [Manuel Guidon]
105 : !> 11.2007 parallelize load balance on box_idx1 [Manuel Guidon]
106 : !> 02.2009 completely refactored [Manuel Guidon]
107 : !> \author Manuel Guidon
108 : !> \note
109 : !> The optimization is done via a binning procedure followed by simple
110 : !> Monte Carlo procedure:
111 : !> In a first step the total amount of integrals in the system is calculated,
112 : !> taking into account the sparsity of the KS-matrix , the screening based
113 : !> on near/farfield approximations and if desired the screening on an initial
114 : !> density matrix.
115 : !> In a second step, bins are generate that contain approximately the same number
116 : !> of integrals, and a cost for these bins is estimated (currently the number of integrals)
117 : !> In a third step, a Monte Carlo procedure optimizes the assignment
118 : !> of the different loads to each process
119 : !> At the end each process owns an unique array of *atomic* indices-ranges
120 : !> that are used to decide whether a process has to calculate a certain
121 : !> bunch of integrals or not
122 : ! **************************************************************************************************
123 1746 : SUBROUTINE hfx_load_balance(x_data, eps_schwarz, particle_set, max_set, para_env, &
124 : coeffs_set, coeffs_kind, &
125 1746 : is_assoc_atomic_block_global, do_periodic, &
126 : load_balance_parameter, kind_of, basis_parameter, pmax_set, &
127 : pmax_atom, i_thread, n_threads, cell, &
128 : do_p_screening, map_atom_to_kind_atom, nkind, eval_type, &
129 : pmax_block, use_virial)
130 : TYPE(hfx_type), POINTER :: x_data
131 : REAL(dp), INTENT(IN) :: eps_schwarz
132 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
133 : INTEGER, INTENT(IN) :: max_set
134 : TYPE(mp_para_env_type), POINTER :: para_env
135 : TYPE(hfx_screen_coeff_type), &
136 : DIMENSION(:, :, :, :), POINTER :: coeffs_set
137 : TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
138 : POINTER :: coeffs_kind
139 : INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
140 : LOGICAL :: do_periodic
141 : TYPE(hfx_load_balance_type), POINTER :: load_balance_parameter
142 : INTEGER :: kind_of(*)
143 : TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
144 : TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
145 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
146 : INTEGER, INTENT(IN) :: i_thread, n_threads
147 : TYPE(cell_type), POINTER :: cell
148 : LOGICAL, INTENT(IN) :: do_p_screening
149 : INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
150 : INTEGER, INTENT(IN) :: nkind, eval_type
151 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
152 : LOGICAL, INTENT(IN) :: use_virial
153 :
154 : CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_load_balance'
155 :
156 : CHARACTER(LEN=512) :: error_msg
157 : INTEGER :: block_size, current_block_id, data_from, dest, handle, handle_inner, &
158 : handle_range, i, iatom_block, iatom_end, iatom_start, ibin, icpu, j, jatom_block, &
159 : jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, &
160 : latom_start, mepos, my_process_id, n_processes, natom, nbins, nblocks, ncpu, &
161 : new_iatom_end, new_iatom_start, new_jatom_end, new_jatom_start, non_empty_blocks, &
162 : objective_block_size, objective_nblocks, source, total_blocks
163 5238 : TYPE(mp_request_type), DIMENSION(2) :: req
164 : INTEGER(int_8) :: atom_block, cost_per_bin, cost_per_core, current_cost, &
165 : distribution_counter_end, distribution_counter_start, global_quartet_counter, &
166 : local_quartet_counter, self_cost_per_block, tmp_block, total_block_self_cost
167 1746 : INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer_in, buffer_out
168 1746 : INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
169 1746 : sendbuffer, swapbuffer
170 : INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
171 : INTEGER(int_8), SAVE :: shm_global_quartet_counter, &
172 : shm_local_quartet_counter
173 3492 : INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, tmp_index, tmp_pos, &
174 1746 : to_be_sorted
175 : INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
176 : INTEGER, SAVE :: shm_nblocks
177 : LOGICAL :: changed, last_bin_needs_to_be_filled, &
178 : optimized
179 : LOGICAL, DIMENSION(:, :), POINTER, SAVE :: atomic_pair_list
180 : REAL(dp) :: coeffs_kind_max0, log10_eps_schwarz, &
181 : log_2, pmax_blocks
182 3492 : TYPE(hfx_block_range_type), DIMENSION(:), POINTER :: blocks_guess, tmp_blocks, tmp_blocks2
183 : TYPE(hfx_block_range_type), DIMENSION(:), &
184 : POINTER, SAVE :: shm_blocks
185 5238 : TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
186 : TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
187 : SAVE :: full_dist
188 : TYPE(pair_list_type) :: list_ij, list_kl
189 : TYPE(pair_set_list_type), ALLOCATABLE, &
190 1746 : DIMENSION(:) :: set_list_ij, set_list_kl
191 :
192 1746 : !$OMP BARRIER
193 3492 : !$OMP MASTER
194 1746 : CALL timeset(routineN, handle)
195 : !$OMP END MASTER
196 1746 : !$OMP BARRIER
197 :
198 1746 : log10_eps_schwarz = LOG10(eps_schwarz)
199 1746 : log_2 = LOG10(2.0_dp)
200 12034 : coeffs_kind_max0 = MAXVAL(coeffs_kind(:, :)%x(2))
201 1746 : ncpu = para_env%num_pe
202 1746 : n_processes = ncpu*n_threads
203 1746 : natom = SIZE(particle_set)
204 :
205 1746 : block_size = load_balance_parameter%block_size
206 33376 : ALLOCATE (set_list_ij((max_set*block_size)**2))
207 31630 : ALLOCATE (set_list_kl((max_set*block_size)**2))
208 :
209 1746 : IF (.NOT. load_balance_parameter%blocks_initialized) THEN
210 1218 : !$OMP BARRIER
211 1218 : !$OMP MASTER
212 1218 : CALL timeset(routineN//"_range", handle_range)
213 :
214 1218 : nblocks = MAX((natom + block_size - 1)/block_size, 1)
215 7492 : ALLOCATE (blocks_guess(nblocks))
216 7492 : ALLOCATE (tmp_blocks(natom))
217 6274 : ALLOCATE (tmp_blocks2(natom))
218 :
219 1218 : pmax_blocks = 0.0_dp
220 2436 : SELECT CASE (eval_type)
221 : CASE (hfx_do_eval_energy)
222 1218 : atomic_pair_list => x_data%atomic_pair_list
223 : CASE (hfx_do_eval_forces)
224 1218 : atomic_pair_list => x_data%atomic_pair_list_forces
225 : END SELECT
226 20758 : atomic_pair_list = .TRUE.
227 : CALL init_blocks(nkind, para_env, natom, block_size, nblocks, blocks_guess, &
228 : list_ij, list_kl, set_list_ij, set_list_kl, &
229 : particle_set, &
230 : coeffs_set, coeffs_kind, &
231 : is_assoc_atomic_block_global, do_periodic, &
232 : kind_of, basis_parameter, pmax_set, pmax_atom, &
233 : pmax_blocks, cell, &
234 : do_p_screening, map_atom_to_kind_atom, eval_type, &
235 1218 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
236 :
237 1218 : total_block_self_cost = 0
238 :
239 5056 : DO i = 1, nblocks
240 5056 : total_block_self_cost = total_block_self_cost + blocks_guess(i)%cost
241 : END DO
242 :
243 1218 : CALL para_env%sum(total_block_self_cost)
244 :
245 1218 : objective_block_size = load_balance_parameter%block_size
246 1218 : objective_nblocks = MAX((natom + objective_block_size - 1)/objective_block_size, 1)
247 :
248 1218 : self_cost_per_block = (total_block_self_cost + objective_nblocks - 1)/(objective_nblocks)
249 :
250 5056 : DO i = 1, nblocks
251 5056 : tmp_blocks2(i) = blocks_guess(i)
252 : END DO
253 :
254 : optimized = .FALSE.
255 : i = 0
256 2436 : DO WHILE (.NOT. optimized)
257 1218 : i = i + 1
258 1218 : current_block_id = 0
259 1218 : changed = .FALSE.
260 5056 : DO atom_block = 1, nblocks
261 3838 : current_block_id = current_block_id + 1
262 3838 : iatom_start = tmp_blocks2(atom_block)%istart
263 3838 : iatom_end = tmp_blocks2(atom_block)%iend
264 5056 : IF (tmp_blocks2(atom_block)%cost > 1.5_dp*self_cost_per_block .AND. iatom_end - iatom_start > 0) THEN
265 0 : changed = .TRUE.
266 0 : new_iatom_start = iatom_start
267 0 : new_iatom_end = (iatom_end - iatom_start + 1)/2 + iatom_start - 1
268 0 : new_jatom_start = new_iatom_end + 1
269 0 : new_jatom_end = iatom_end
270 0 : tmp_blocks(current_block_id)%istart = new_iatom_start
271 0 : tmp_blocks(current_block_id)%iend = new_iatom_end
272 : tmp_blocks(current_block_id)%cost = estimate_block_cost( &
273 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
274 : new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
275 : new_iatom_start, new_iatom_end, new_iatom_start, new_iatom_end, &
276 : particle_set, &
277 : coeffs_set, coeffs_kind, &
278 : is_assoc_atomic_block_global, do_periodic, &
279 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
280 : cell, &
281 : do_p_screening, map_atom_to_kind_atom, eval_type, &
282 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
283 0 : current_block_id = current_block_id + 1
284 0 : tmp_blocks(current_block_id)%istart = new_jatom_start
285 0 : tmp_blocks(current_block_id)%iend = new_jatom_end
286 : tmp_blocks(current_block_id)%cost = estimate_block_cost( &
287 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
288 : new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
289 : new_jatom_start, new_jatom_end, new_jatom_start, new_jatom_end, &
290 : particle_set, &
291 : coeffs_set, coeffs_kind, &
292 : is_assoc_atomic_block_global, do_periodic, &
293 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
294 : cell, &
295 : do_p_screening, map_atom_to_kind_atom, eval_type, &
296 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
297 : ELSE
298 3838 : tmp_blocks(current_block_id)%istart = iatom_start
299 3838 : tmp_blocks(current_block_id)%iend = iatom_end
300 3838 : tmp_blocks(current_block_id)%cost = tmp_blocks2(atom_block)%cost
301 : END IF
302 : END DO
303 1218 : IF (.NOT. changed) optimized = .TRUE.
304 1218 : IF (i > 20) optimized = .TRUE.
305 1218 : nblocks = current_block_id
306 6274 : DO atom_block = 1, nblocks
307 5056 : tmp_blocks2(atom_block) = tmp_blocks(atom_block)
308 : END DO
309 : END DO
310 :
311 1218 : DEALLOCATE (tmp_blocks2)
312 :
313 : ! ** count number of non empty blocks on each node
314 1218 : non_empty_blocks = 0
315 5056 : DO atom_block = 1, nblocks
316 3838 : IF (tmp_blocks(atom_block)%istart == 0) CYCLE
317 5056 : non_empty_blocks = non_empty_blocks + 1
318 : END DO
319 :
320 3654 : ALLOCATE (rcount(ncpu))
321 3652 : rcount = 0
322 1218 : rcount(para_env%mepos + 1) = non_empty_blocks
323 1218 : CALL para_env%sum(rcount)
324 :
325 : ! ** sum all non_empty_blocks
326 1218 : total_blocks = 0
327 3652 : DO i = 1, ncpu
328 3652 : total_blocks = total_blocks + rcount(i)
329 : END DO
330 :
331 : ! ** calculate offsets
332 2436 : ALLOCATE (rdispl(ncpu))
333 3652 : rcount(:) = rcount(:)*3
334 1218 : rdispl(1) = 0
335 2434 : DO i = 2, ncpu
336 2434 : rdispl(i) = rdispl(i - 1) + rcount(i - 1)
337 : END DO
338 :
339 3626 : ALLOCATE (buffer_in(3*non_empty_blocks))
340 :
341 1218 : non_empty_blocks = 0
342 5056 : DO atom_block = 1, nblocks
343 3838 : IF (tmp_blocks(atom_block)%istart == 0) CYCLE
344 1922 : buffer_in(non_empty_blocks*3 + 1) = tmp_blocks(atom_block)%istart
345 1922 : buffer_in(non_empty_blocks*3 + 2) = tmp_blocks(atom_block)%iend
346 1922 : buffer_in(non_empty_blocks*3 + 3) = tmp_blocks(atom_block)%cost
347 5056 : non_empty_blocks = non_empty_blocks + 1
348 : END DO
349 :
350 1218 : nblocks = total_blocks
351 :
352 7492 : ALLOCATE (tmp_blocks2(nblocks))
353 :
354 3654 : ALLOCATE (buffer_out(3*nblocks))
355 :
356 : ! ** Gather all three arrays
357 1218 : CALL para_env%allgatherv(buffer_in, buffer_out, rcount, rdispl)
358 :
359 5056 : DO i = 1, nblocks
360 3838 : tmp_blocks2(i)%istart = INT(buffer_out((i - 1)*3 + 1))
361 3838 : tmp_blocks2(i)%iend = INT(buffer_out((i - 1)*3 + 2))
362 5056 : tmp_blocks2(i)%cost = buffer_out((i - 1)*3 + 3)
363 : END DO
364 :
365 : ! ** Now we sort the blocks
366 3654 : ALLOCATE (to_be_sorted(nblocks))
367 2436 : ALLOCATE (tmp_index(nblocks))
368 :
369 5056 : DO atom_block = 1, nblocks
370 5056 : to_be_sorted(atom_block) = tmp_blocks2(atom_block)%istart
371 : END DO
372 :
373 1218 : CALL sort(to_be_sorted, nblocks, tmp_index)
374 :
375 6274 : ALLOCATE (x_data%blocks(nblocks))
376 :
377 5056 : DO atom_block = 1, nblocks
378 5056 : x_data%blocks(atom_block) = tmp_blocks2(tmp_index(atom_block))
379 : END DO
380 :
381 1218 : shm_blocks => x_data%blocks
382 1218 : shm_nblocks = nblocks
383 :
384 : ! ** Set nblocks in structure
385 1218 : load_balance_parameter%nblocks = nblocks
386 :
387 1218 : DEALLOCATE (blocks_guess, tmp_blocks, tmp_blocks2)
388 :
389 1218 : DEALLOCATE (rcount, rdispl, buffer_in, buffer_out, to_be_sorted, tmp_index)
390 :
391 1218 : load_balance_parameter%blocks_initialized = .TRUE.
392 :
393 8894 : x_data%blocks = shm_blocks
394 1218 : load_balance_parameter%nblocks = shm_nblocks
395 1218 : load_balance_parameter%blocks_initialized = .TRUE.
396 :
397 4872 : ALLOCATE (x_data%pmax_block(shm_nblocks, shm_nblocks))
398 20758 : x_data%pmax_block = 0.0_dp
399 1218 : pmax_block => x_data%pmax_block
400 2436 : CALL timestop(handle_range)
401 : !$OMP END MASTER
402 1218 : !$OMP BARRIER
403 :
404 1218 : IF (.NOT. load_balance_parameter%blocks_initialized) THEN
405 0 : ALLOCATE (x_data%blocks(shm_nblocks))
406 0 : x_data%blocks = shm_blocks
407 0 : load_balance_parameter%nblocks = shm_nblocks
408 0 : load_balance_parameter%blocks_initialized = .TRUE.
409 : END IF
410 : !! ** precalculate maximum density matrix elements in blocks
411 1218 : !$OMP BARRIER
412 : END IF
413 :
414 1746 : !$OMP BARRIER
415 1746 : !$OMP MASTER
416 1746 : pmax_block => x_data%pmax_block
417 28174 : pmax_block = 0.0_dp
418 1746 : IF (do_p_screening) THEN
419 1430 : DO iatom_block = 1, shm_nblocks
420 1100 : iatom_start = x_data%blocks(iatom_block)%istart
421 1100 : iatom_end = x_data%blocks(iatom_block)%iend
422 5570 : DO jatom_block = 1, shm_nblocks
423 4140 : jatom_start = x_data%blocks(jatom_block)%istart
424 4140 : jatom_end = x_data%blocks(jatom_block)%iend
425 13520 : pmax_block(iatom_block, jatom_block) = MAXVAL(pmax_atom(iatom_start:iatom_end, jatom_start:jatom_end))
426 : END DO
427 : END DO
428 : END IF
429 :
430 2964 : SELECT CASE (eval_type)
431 : CASE (hfx_do_eval_energy)
432 1218 : atomic_pair_list => x_data%atomic_pair_list
433 : CASE (hfx_do_eval_forces)
434 1746 : atomic_pair_list => x_data%atomic_pair_list_forces
435 : END SELECT
436 : CALL build_atomic_pair_list(natom, atomic_pair_list, kind_of, basis_parameter, particle_set, &
437 : do_periodic, coeffs_kind, coeffs_kind_max0, log10_eps_schwarz, cell, &
438 1746 : x_data%blocks)
439 :
440 : !$OMP END MASTER
441 1746 : !$OMP BARRIER
442 :
443 : !! If there is only 1 cpu skip the binning
444 1746 : IF (n_processes == 1) THEN
445 4 : ALLOCATE (tmp_dist(1))
446 2 : tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
447 2 : tmp_dist(1)%istart = 0_int_8
448 2 : ptr_to_tmp_dist => tmp_dist(:)
449 4 : SELECT CASE (eval_type)
450 : CASE (hfx_do_eval_energy)
451 2 : CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
452 : CASE (hfx_do_eval_forces)
453 2 : CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
454 : END SELECT
455 2 : DEALLOCATE (tmp_dist)
456 : ELSE
457 : !! Calculate total numbers of integrals that have to be calculated (wrt screening and symmetry)
458 1744 : !$OMP BARRIER
459 1744 : !$OMP MASTER
460 1744 : CALL timeset(routineN//"_count", handle_inner)
461 : !$OMP END MASTER
462 1744 : !$OMP BARRIER
463 :
464 1744 : cost_per_core = 0_int_8
465 1744 : my_process_id = para_env%mepos*n_threads + i_thread
466 1744 : nblocks = load_balance_parameter%nblocks
467 :
468 481780 : DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
469 :
470 480036 : latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
471 480036 : tmp_block = atom_block/nblocks
472 480036 : katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
473 480036 : IF (latom_block < katom_block) CYCLE
474 268894 : tmp_block = tmp_block/nblocks
475 268894 : jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
476 268894 : tmp_block = tmp_block/nblocks
477 268894 : iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
478 268894 : IF (jatom_block < iatom_block) CYCLE
479 :
480 151507 : iatom_start = x_data%blocks(iatom_block)%istart
481 151507 : iatom_end = x_data%blocks(iatom_block)%iend
482 151507 : jatom_start = x_data%blocks(jatom_block)%istart
483 151507 : jatom_end = x_data%blocks(jatom_block)%iend
484 151507 : katom_start = x_data%blocks(katom_block)%istart
485 151507 : katom_end = x_data%blocks(katom_block)%iend
486 151507 : latom_start = x_data%blocks(latom_block)%istart
487 151507 : latom_end = x_data%blocks(latom_block)%iend
488 :
489 286814 : SELECT CASE (eval_type)
490 : CASE (hfx_do_eval_energy)
491 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
492 : pmax_block(latom_block, jatom_block), &
493 : pmax_block(latom_block, iatom_block), &
494 135307 : pmax_block(katom_block, jatom_block))
495 : CASE (hfx_do_eval_forces)
496 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
497 : pmax_block(latom_block, jatom_block), &
498 : pmax_block(latom_block, iatom_block) + &
499 151507 : pmax_block(katom_block, jatom_block))
500 : END SELECT
501 :
502 151507 : IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
503 :
504 : cost_per_core = cost_per_core &
505 : + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
506 : iatom_start, iatom_end, jatom_start, jatom_end, &
507 : katom_start, katom_end, latom_start, latom_end, &
508 : particle_set, &
509 : coeffs_set, coeffs_kind, &
510 : is_assoc_atomic_block_global, do_periodic, &
511 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
512 : cell, &
513 : do_p_screening, map_atom_to_kind_atom, eval_type, &
514 481780 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
515 :
516 : END DO ! atom_block
517 :
518 1744 : nbins = load_balance_parameter%nbins
519 1744 : cost_per_bin = (cost_per_core + nbins - 1)/(nbins)
520 :
521 1744 : !$OMP BARRIER
522 1744 : !$OMP MASTER
523 1744 : CALL timestop(handle_inner)
524 : !$OMP END MASTER
525 1744 : !$OMP BARRIER
526 :
527 : ! new load balancing test
528 : IF (.FALSE.) THEN
529 : CALL hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
530 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
531 : particle_set, &
532 : coeffs_set, coeffs_kind, &
533 : is_assoc_atomic_block_global, do_periodic, &
534 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
535 : cell, x_data, para_env, pmax_block, &
536 : do_p_screening, map_atom_to_kind_atom, eval_type, &
537 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
538 : END IF
539 :
540 1744 : !$OMP BARRIER
541 1744 : !$OMP MASTER
542 1744 : CALL timeset(routineN//"_bin", handle_inner)
543 : !$OMP END MASTER
544 1744 : !$OMP BARRIER
545 :
546 116848 : ALLOCATE (binned_dist(nbins))
547 113360 : binned_dist(:)%istart = -1_int_8
548 113360 : binned_dist(:)%number_of_atom_quartets = 0_int_8
549 113360 : binned_dist(:)%cost = 0_int_8
550 113360 : binned_dist(:)%time_first_scf = 0.0_dp
551 113360 : binned_dist(:)%time_other_scf = 0.0_dp
552 113360 : binned_dist(:)%time_forces = 0.0_dp
553 :
554 1744 : current_cost = 0
555 1744 : mepos = 1
556 1744 : distribution_counter_start = 1
557 1744 : distribution_counter_end = 0
558 1744 : ibin = 1
559 :
560 1744 : global_quartet_counter = 0
561 1744 : local_quartet_counter = 0
562 1744 : last_bin_needs_to_be_filled = .FALSE.
563 481780 : DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
564 480036 : latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
565 480036 : tmp_block = atom_block/nblocks
566 480036 : katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
567 480036 : IF (latom_block < katom_block) CYCLE
568 268894 : tmp_block = tmp_block/nblocks
569 268894 : jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
570 268894 : tmp_block = tmp_block/nblocks
571 268894 : iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
572 268894 : IF (jatom_block < iatom_block) CYCLE
573 :
574 151507 : distribution_counter_end = distribution_counter_end + 1
575 151507 : global_quartet_counter = global_quartet_counter + 1
576 151507 : last_bin_needs_to_be_filled = .TRUE.
577 :
578 151507 : IF (binned_dist(ibin)%istart == -1_int_8) binned_dist(ibin)%istart = atom_block
579 :
580 151507 : iatom_start = x_data%blocks(iatom_block)%istart
581 151507 : iatom_end = x_data%blocks(iatom_block)%iend
582 151507 : jatom_start = x_data%blocks(jatom_block)%istart
583 151507 : jatom_end = x_data%blocks(jatom_block)%iend
584 151507 : katom_start = x_data%blocks(katom_block)%istart
585 151507 : katom_end = x_data%blocks(katom_block)%iend
586 151507 : latom_start = x_data%blocks(latom_block)%istart
587 151507 : latom_end = x_data%blocks(latom_block)%iend
588 :
589 286814 : SELECT CASE (eval_type)
590 : CASE (hfx_do_eval_energy)
591 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
592 : pmax_block(latom_block, jatom_block), &
593 : pmax_block(latom_block, iatom_block), &
594 135307 : pmax_block(katom_block, jatom_block))
595 : CASE (hfx_do_eval_forces)
596 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
597 : pmax_block(latom_block, jatom_block), &
598 : pmax_block(latom_block, iatom_block) + &
599 151507 : pmax_block(katom_block, jatom_block))
600 : END SELECT
601 :
602 151507 : IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
603 :
604 : current_cost = current_cost &
605 : + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
606 : iatom_start, iatom_end, jatom_start, jatom_end, &
607 : katom_start, katom_end, latom_start, latom_end, &
608 : particle_set, &
609 : coeffs_set, coeffs_kind, &
610 : is_assoc_atomic_block_global, do_periodic, &
611 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
612 : cell, &
613 : do_p_screening, map_atom_to_kind_atom, eval_type, &
614 150881 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
615 :
616 152625 : IF (current_cost >= cost_per_bin) THEN
617 18862 : IF (ibin == nbins) THEN
618 : binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
619 0 : distribution_counter_end - distribution_counter_start + 1
620 : ELSE
621 18862 : binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
622 : END IF
623 18862 : binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
624 18862 : ibin = MIN(ibin + 1, nbins)
625 18862 : distribution_counter_start = distribution_counter_end + 1
626 18862 : current_cost = 0
627 18862 : last_bin_needs_to_be_filled = .FALSE.
628 : END IF
629 : END DO
630 :
631 1744 : !$OMP BARRIER
632 1744 : !$OMP MASTER
633 1744 : CALL timestop(handle_inner)
634 1744 : CALL timeset(routineN//"_dist", handle_inner)
635 : !$OMP END MASTER
636 1744 : !$OMP BARRIER
637 : !! Fill the last bin if necessary
638 1744 : IF (last_bin_needs_to_be_filled) THEN
639 511 : binned_dist(ibin)%cost = binned_dist(ibin)%cost + current_cost
640 511 : IF (ibin == nbins) THEN
641 : binned_dist(ibin)%number_of_atom_quartets = binned_dist(ibin)%number_of_atom_quartets + &
642 0 : distribution_counter_end - distribution_counter_start + 1
643 : ELSE
644 511 : binned_dist(ibin)%number_of_atom_quartets = distribution_counter_end - distribution_counter_start + 1
645 : END IF
646 : END IF
647 :
648 : !! Sanity-Check
649 113360 : DO ibin = 1, nbins
650 113360 : local_quartet_counter = local_quartet_counter + binned_dist(ibin)%number_of_atom_quartets
651 : END DO
652 1744 : !$OMP BARRIER
653 1744 : !$OMP MASTER
654 1744 : shm_local_quartet_counter = 0
655 1744 : shm_global_quartet_counter = 0
656 : !$OMP END MASTER
657 1744 : !$OMP BARRIER
658 1744 : !$OMP ATOMIC
659 : shm_local_quartet_counter = shm_local_quartet_counter + local_quartet_counter
660 1744 : !$OMP ATOMIC
661 : shm_global_quartet_counter = shm_global_quartet_counter + global_quartet_counter
662 :
663 1744 : !$OMP BARRIER
664 1744 : !$OMP MASTER
665 1744 : CALL para_env%sum(shm_local_quartet_counter)
666 1744 : CALL para_env%sum(shm_global_quartet_counter)
667 1744 : IF (para_env%is_source()) THEN
668 872 : IF (shm_local_quartet_counter /= shm_global_quartet_counter) THEN
669 : WRITE (error_msg, '(A,I0,A,I0,A)') "HFX Sanity check for parallel distribution failed. "// &
670 0 : "Number of local quartets (", shm_local_quartet_counter, &
671 0 : ") and number of global quartets (", shm_global_quartet_counter, &
672 0 : ") are different. Please send in a bug report."
673 0 : CPABORT(error_msg)
674 : END IF
675 : END IF
676 : !$OMP END MASTER
677 :
678 1744 : !$OMP BARRIER
679 1744 : !$OMP MASTER
680 5232 : ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
681 224976 : cost_matrix = 0
682 : !$OMP END MASTER
683 1744 : !$OMP BARRIER
684 1744 : icpu = para_env%mepos + 1
685 113360 : DO i = 1, nbins
686 113360 : cost_matrix((icpu - 1)*nbins*n_threads + i_thread*nbins + i) = binned_dist(i)%cost
687 : END DO
688 1744 : mepos = para_env%mepos
689 1744 : !$OMP BARRIER
690 :
691 1744 : !$OMP MASTER
692 : ! sync before/after ring of isendrecv
693 1744 : CALL para_env%sync()
694 :
695 5232 : ALLOCATE (sendbuffer(nbins*n_threads))
696 3488 : ALLOCATE (recbuffer(nbins*n_threads))
697 :
698 224976 : sendbuffer = cost_matrix(mepos*nbins*n_threads + 1:mepos*nbins*n_threads + nbins*n_threads)
699 :
700 1744 : dest = MODULO(mepos + 1, ncpu)
701 1744 : source = MODULO(mepos - 1, ncpu)
702 5232 : DO icpu = 0, ncpu - 1
703 3488 : IF (icpu .NE. ncpu - 1) THEN
704 : CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
705 1744 : req(1), req(2), 13)
706 : END IF
707 3488 : data_from = MODULO(mepos - icpu, ncpu)
708 449952 : cost_matrix(data_from*nbins*n_threads + 1:data_from*nbins*n_threads + nbins*n_threads) = sendbuffer
709 3488 : IF (icpu .NE. ncpu - 1) THEN
710 1744 : CALL mp_waitall(req)
711 : END IF
712 3488 : swapbuffer => sendbuffer
713 3488 : sendbuffer => recbuffer
714 5232 : recbuffer => swapbuffer
715 : END DO
716 1744 : DEALLOCATE (recbuffer, sendbuffer)
717 : !$OMP END MASTER
718 1744 : !$OMP BARRIER
719 :
720 1744 : !$OMP BARRIER
721 1744 : !$OMP MASTER
722 1744 : CALL timestop(handle_inner)
723 1744 : CALL timeset(routineN//"_opt", handle_inner)
724 : !$OMP END MASTER
725 1744 : !$OMP BARRIER
726 :
727 : !! Find an optimal distribution i.e. assign each element of the cost matrix to a certain process
728 1744 : !$OMP BARRIER
729 5232 : ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
730 448208 : local_cost_matrix = cost_matrix
731 1744 : !$OMP MASTER
732 5232 : ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
733 :
734 : CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
735 1744 : shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
736 :
737 1744 : CALL timestop(handle_inner)
738 1744 : CALL timeset(routineN//"_redist", handle_inner)
739 : !! Collect local data to global array
740 341824 : ALLOCATE (full_dist(ncpu*n_threads, nbins))
741 :
742 336592 : full_dist(:, :)%istart = 0_int_8
743 336592 : full_dist(:, :)%number_of_atom_quartets = 0_int_8
744 336592 : full_dist(:, :)%cost = 0_int_8
745 336592 : full_dist(:, :)%time_first_scf = 0.0_dp
746 336592 : full_dist(:, :)%time_other_scf = 0.0_dp
747 338336 : full_dist(:, :)%time_forces = 0.0_dp
748 : !$OMP END MASTER
749 1744 : !$OMP BARRIER
750 1744 : mepos = para_env%mepos + 1
751 224976 : full_dist((mepos - 1)*n_threads + i_thread + 1, :) = binned_dist(:)
752 :
753 1744 : !$OMP BARRIER
754 1744 : !$OMP MASTER
755 5232 : ALLOCATE (sendbuffer(3*nbins*n_threads))
756 3488 : ALLOCATE (recbuffer(3*nbins*n_threads))
757 1744 : mepos = para_env%mepos
758 3488 : DO j = 1, n_threads
759 115104 : DO i = 1, nbins
760 111616 : sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
761 111616 : sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
762 113360 : sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
763 : END DO
764 : END DO
765 :
766 : ! sync before/after ring of isendrecv
767 1744 : CALL para_env%sync()
768 1744 : dest = MODULO(mepos + 1, ncpu)
769 1744 : source = MODULO(mepos - 1, ncpu)
770 5232 : DO icpu = 0, ncpu - 1
771 3488 : IF (icpu .NE. ncpu - 1) THEN
772 : CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
773 1744 : req(1), req(2), 13)
774 : END IF
775 3488 : data_from = MODULO(mepos - icpu, ncpu)
776 6976 : DO j = 1, n_threads
777 230208 : DO i = 1, nbins
778 223232 : full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 1)
779 223232 : full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 2)
780 226720 : full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*nbins + (i - 1)*3 + 3)
781 : END DO
782 : END DO
783 :
784 3488 : IF (icpu .NE. ncpu - 1) THEN
785 1744 : CALL mp_waitall(req)
786 : END IF
787 3488 : swapbuffer => sendbuffer
788 3488 : sendbuffer => recbuffer
789 5232 : recbuffer => swapbuffer
790 : END DO
791 1744 : DEALLOCATE (recbuffer, sendbuffer)
792 :
793 : ! sync before/after ring of isendrecv
794 1744 : CALL para_env%sync()
795 : !$OMP END MASTER
796 1744 : !$OMP BARRIER
797 : !! reorder the distribution according to the distribution vector
798 5232 : ALLOCATE (tmp_pos(ncpu*n_threads))
799 5232 : tmp_pos = 1
800 228464 : ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
801 :
802 224976 : tmp_dist(:)%istart = 0_int_8
803 224976 : tmp_dist(:)%number_of_atom_quartets = 0_int_8
804 224976 : tmp_dist(:)%cost = 0_int_8
805 224976 : tmp_dist(:)%time_first_scf = 0.0_dp
806 224976 : tmp_dist(:)%time_other_scf = 0.0_dp
807 224976 : tmp_dist(:)%time_forces = 0.0_dp
808 :
809 5232 : DO icpu = 1, n_processes
810 228464 : DO i = 1, nbins
811 223232 : mepos = my_process_id + 1
812 226720 : IF (shm_distribution_vector((icpu - 1)*nbins + i) == mepos) THEN
813 111616 : tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
814 111616 : tmp_pos(mepos) = tmp_pos(mepos) + 1
815 : END IF
816 : END DO
817 : END DO
818 :
819 : !! Assign the load to each process
820 : NULLIFY (ptr_to_tmp_dist)
821 1744 : mepos = my_process_id + 1
822 1744 : ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
823 2960 : SELECT CASE (eval_type)
824 : CASE (hfx_do_eval_energy)
825 1216 : CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
826 : CASE (hfx_do_eval_forces)
827 1744 : CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
828 : END SELECT
829 :
830 1744 : !$OMP BARRIER
831 1744 : !$OMP MASTER
832 1744 : DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
833 : !$OMP END MASTER
834 1744 : !$OMP BARRIER
835 1744 : DEALLOCATE (tmp_dist, tmp_pos)
836 1744 : DEALLOCATE (binned_dist, local_cost_matrix)
837 1744 : DEALLOCATE (set_list_ij, set_list_kl)
838 :
839 1744 : !$OMP BARRIER
840 1744 : !$OMP MASTER
841 1744 : CALL timestop(handle_inner)
842 : !$OMP END MASTER
843 1744 : !$OMP BARRIER
844 : END IF
845 1746 : !$OMP BARRIER
846 1746 : !$OMP MASTER
847 1746 : CALL timestop(handle)
848 : !$OMP END MASTER
849 1746 : !$OMP BARRIER
850 3622950 : END SUBROUTINE hfx_load_balance
851 :
852 : ! **************************************************************************************************
853 : !> \brief Reference implementation of new recursive load balancing routine
854 : !> Computes a local list of atom_blocks (p_atom_blocks,q_atom_blocks) for
855 : !> each process in a P-Q grid such that every process has more or less the
856 : !> same amount of work. Has no output at the moment (not used) but writes
857 : !> its computed load balance values into a file. Possible output is ready
858 : !> to use in the two arrays p_atom_blocks & q_atom_blocks
859 : !> \param n_processes ...
860 : !> \param my_process_id ...
861 : !> \param nblocks ...
862 : !> \param natom ...
863 : !> \param nkind ...
864 : !> \param list_ij ...
865 : !> \param list_kl ...
866 : !> \param set_list_ij ...
867 : !> \param set_list_kl ...
868 : !> \param particle_set ...
869 : !> \param coeffs_set ...
870 : !> \param coeffs_kind ...
871 : !> \param is_assoc_atomic_block_global ...
872 : !> \param do_periodic ...
873 : !> \param kind_of ...
874 : !> \param basis_parameter ...
875 : !> \param pmax_set ...
876 : !> \param pmax_atom ...
877 : !> \param pmax_blocks ...
878 : !> \param cell ...
879 : !> \param x_data ...
880 : !> \param para_env ...
881 : !> \param pmax_block ...
882 : !> \param do_p_screening ...
883 : !> \param map_atom_to_kind_atom ...
884 : !> \param eval_type ...
885 : !> \param log10_eps_schwarz ...
886 : !> \param log_2 ...
887 : !> \param coeffs_kind_max0 ...
888 : !> \param use_virial ...
889 : !> \param atomic_pair_list ...
890 : !> \par History
891 : !> 03.2011 created [Michael Steinlechner]
892 : !> \author Michael Steinlechner
893 : ! **************************************************************************************************
894 :
895 0 : SUBROUTINE hfx_recursive_load_balance(n_processes, my_process_id, nblocks, &
896 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
897 : particle_set, &
898 : coeffs_set, coeffs_kind, &
899 0 : is_assoc_atomic_block_global, do_periodic, &
900 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
901 : cell, x_data, para_env, pmax_block, &
902 : do_p_screening, map_atom_to_kind_atom, eval_type, &
903 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
904 :
905 : ! input variables:
906 : INTEGER, INTENT(IN) :: n_processes, my_process_id, nblocks, &
907 : natom, nkind
908 : TYPE(pair_list_type), INTENT(IN) :: list_ij, list_kl
909 : TYPE(pair_set_list_type), ALLOCATABLE, &
910 : DIMENSION(:), INTENT(IN) :: set_list_ij, set_list_kl
911 : TYPE(particle_type), DIMENSION(:), INTENT(IN), &
912 : POINTER :: particle_set
913 : TYPE(hfx_screen_coeff_type), &
914 : DIMENSION(:, :, :, :), INTENT(IN), POINTER :: coeffs_set
915 : TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
916 : INTENT(IN), POINTER :: coeffs_kind
917 : INTEGER, DIMENSION(:, :), INTENT(IN) :: is_assoc_atomic_block_global
918 : LOGICAL, INTENT(IN) :: do_periodic
919 : INTEGER, INTENT(IN) :: kind_of(*)
920 : TYPE(hfx_basis_type), DIMENSION(:), INTENT(IN), &
921 : POINTER :: basis_parameter
922 : TYPE(hfx_p_kind), DIMENSION(:), INTENT(IN), &
923 : POINTER :: pmax_set
924 : REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_atom
925 : REAL(dp) :: pmax_blocks
926 : TYPE(cell_type), INTENT(IN), POINTER :: cell
927 : TYPE(hfx_type), INTENT(IN), POINTER :: x_data
928 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
929 : REAL(dp), DIMENSION(:, :), INTENT(IN), POINTER :: pmax_block
930 : LOGICAL, INTENT(IN) :: do_p_screening
931 : INTEGER, DIMENSION(:), INTENT(IN), POINTER :: map_atom_to_kind_atom
932 : INTEGER, INTENT(IN) :: eval_type
933 : REAL(dp), INTENT(IN) :: log10_eps_schwarz, log_2, &
934 : coeffs_kind_max0
935 : LOGICAL, INTENT(IN) :: use_virial
936 : LOGICAL, DIMENSION(:, :), INTENT(IN), POINTER :: atomic_pair_list
937 :
938 : CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_recursive_load_balance'
939 :
940 : INTEGER :: handle, i, iatom_block, iatom_end, iatom_start, j, jatom_block, jatom_end, &
941 : jatom_start, katom_block, katom_end, katom_start, latom_block, latom_end, latom_start, &
942 : nP, nQ, numBins, p, q, sizeP, sizeQ, unit_nr
943 : INTEGER(int_8) :: local_cost, pidx, qidx, sumP, sumQ
944 0 : INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: local_cost_vector
945 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: blocksize, p_atom_blocks, permute, &
946 0 : q_atom_blocks
947 : REAL(dp) :: maximum, mean
948 :
949 : ! internal variables:
950 :
951 0 : !$OMP BARRIER
952 0 : !$OMP MASTER
953 0 : CALL timeset(routineN, handle)
954 : !$OMP END MASTER
955 0 : !$OMP BARRIER
956 :
957 : ! calculate best p/q distribution grid for the n_processes
958 0 : CALL hfx_calculate_PQ(p, q, numBins, n_processes)
959 :
960 0 : ALLOCATE (blocksize(numBins))
961 0 : ALLOCATE (permute(nblocks**2))
962 0 : DO i = 1, nblocks**2
963 0 : permute(i) = i
964 : END DO
965 :
966 : ! call the main recursive permutation routine.
967 : ! Output:
968 : ! blocksize :: vector (size numBins) with the sizes for each column/row block
969 : ! permute :: permutation vector
970 : CALL hfx_recursive_permute(blocksize, 1, nblocks**2, numBins, &
971 : permute, 1, &
972 : my_process_id, n_processes, nblocks, &
973 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
974 : particle_set, &
975 : coeffs_set, coeffs_kind, &
976 : is_assoc_atomic_block_global, do_periodic, &
977 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
978 : cell, x_data, para_env, pmax_block, &
979 : do_p_screening, map_atom_to_kind_atom, eval_type, &
980 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
981 :
982 : ! number of blocks per processor in p-direction (vertical)
983 0 : nP = numBins/p
984 : ! number of blocks per processor in q-direction (horizontal)
985 0 : nQ = numBins/q
986 :
987 : ! calc own position in P-Q-processor grid (PQ-grid is column-major)
988 0 : pidx = MODULO(INT(my_process_id), INT(p)) + 1
989 0 : qidx = my_process_id/p + 1
990 :
991 0 : sizeP = SUM(blocksize((nP*(pidx - 1) + 1):(nP*pidx)))
992 0 : sizeQ = SUM(blocksize((nQ*(qidx - 1) + 1):(nQ*qidx)))
993 :
994 0 : sumP = SUM(blocksize(1:(nP*(pidx - 1))))
995 0 : sumQ = SUM(blocksize(1:(nQ*(qidx - 1))))
996 :
997 0 : ALLOCATE (p_atom_blocks(sizeP))
998 0 : ALLOCATE (q_atom_blocks(sizeQ))
999 :
1000 0 : p_atom_blocks(:) = permute((sumP + 1):(sumP + sizeP))
1001 0 : q_atom_blocks(:) = permute((sumQ + 1):(sumQ + sizeQ))
1002 :
1003 : ! from here on, we are actually finished, each process has been
1004 : ! assigned a (p_atom_blocks,q_atom_blocks) pair list.
1005 : ! what follows is just a small routine to calculate the local cost
1006 : ! for each processor which is then written to a file.
1007 :
1008 : ! calculate local cost for each processor!
1009 : ! ****************************************
1010 : local_cost = 0
1011 0 : DO i = 1, sizeP
1012 0 : DO j = 1, sizeQ
1013 :
1014 : ! get corresponding 4D block indices out of our own P-Q-block
1015 0 : latom_block = MODULO(q_atom_blocks(j), nblocks)
1016 0 : iatom_block = q_atom_blocks(j)/nblocks + 1
1017 0 : jatom_block = MODULO(p_atom_blocks(i), nblocks)
1018 0 : katom_block = p_atom_blocks(i)/nblocks + 1
1019 :
1020 : ! symmetry checks.
1021 0 : IF (latom_block < katom_block) CYCLE
1022 0 : IF (jatom_block < iatom_block) CYCLE
1023 :
1024 0 : iatom_start = x_data%blocks(iatom_block)%istart
1025 0 : iatom_end = x_data%blocks(iatom_block)%iend
1026 0 : jatom_start = x_data%blocks(jatom_block)%istart
1027 0 : jatom_end = x_data%blocks(jatom_block)%iend
1028 0 : katom_start = x_data%blocks(katom_block)%istart
1029 0 : katom_end = x_data%blocks(katom_block)%iend
1030 0 : latom_start = x_data%blocks(latom_block)%istart
1031 0 : latom_end = x_data%blocks(latom_block)%iend
1032 :
1033 : ! whatever.
1034 0 : SELECT CASE (eval_type)
1035 : CASE (hfx_do_eval_energy)
1036 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
1037 : pmax_block(latom_block, jatom_block), &
1038 : pmax_block(latom_block, iatom_block), &
1039 0 : pmax_block(katom_block, jatom_block))
1040 : CASE (hfx_do_eval_forces)
1041 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
1042 : pmax_block(latom_block, jatom_block), &
1043 : pmax_block(latom_block, iatom_block) + &
1044 0 : pmax_block(katom_block, jatom_block))
1045 : END SELECT
1046 :
1047 : ! screening.
1048 0 : IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
1049 :
1050 : ! estimate the cost of this atom_block.
1051 : local_cost = local_cost + estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1052 : set_list_kl, &
1053 : iatom_start, iatom_end, jatom_start, jatom_end, &
1054 : katom_start, katom_end, latom_start, latom_end, &
1055 : particle_set, &
1056 : coeffs_set, coeffs_kind, &
1057 : is_assoc_atomic_block_global, do_periodic, &
1058 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1059 : cell, &
1060 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1061 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1062 : END DO
1063 : END DO
1064 :
1065 0 : ALLOCATE (local_cost_vector(n_processes))
1066 0 : local_cost_vector = 0
1067 0 : local_cost_vector(my_process_id + 1) = local_cost
1068 0 : CALL para_env%sum(local_cost_vector)
1069 :
1070 0 : mean = SUM(local_cost_vector)/n_processes
1071 0 : maximum = MAXVAL(local_cost_vector)
1072 :
1073 0 : !$OMP BARRIER
1074 0 : !$OMP MASTER
1075 : ! only output once
1076 0 : IF (my_process_id == 0) THEN
1077 0 : CALL open_file(unit_number=unit_nr, file_name="loads.dat")
1078 0 : WRITE (unit_nr, *) 'maximum cost:', maximum
1079 0 : WRITE (unit_nr, *) 'mean cost:', mean
1080 0 : WRITE (unit_nr, *) 'load balance ratio max/mean: ', maximum/mean
1081 0 : WRITE (unit_nr, *) '-------- detailed per-process costs ---------'
1082 0 : DO i = 1, n_processes
1083 0 : WRITE (unit_nr, *) local_cost_vector(i)
1084 : END DO
1085 0 : CALL close_file(unit_nr)
1086 : END IF
1087 : !$OMP END MASTER
1088 0 : !$OMP BARRIER
1089 :
1090 0 : DEALLOCATE (local_cost_vector)
1091 0 : DEALLOCATE (p_atom_blocks, q_atom_blocks)
1092 0 : DEALLOCATE (blocksize, permute)
1093 :
1094 0 : !$OMP BARRIER
1095 0 : !$OMP MASTER
1096 0 : CALL timestop(handle)
1097 : !$OMP END MASTER
1098 0 : !$OMP BARRIER
1099 :
1100 0 : END SUBROUTINE hfx_recursive_load_balance
1101 :
1102 : ! **************************************************************************************************
1103 : !> \brief Small routine to calculate the optimal P-Q-processor grid distribution
1104 : !> for a given number of processors N
1105 : !> and the corresponding number of Bins for the load balancing routine
1106 : !> \param p number of rows on P-Q process grid (output)
1107 : !> \param q number of columns on P-Q process grid (output)
1108 : !> \param nBins number of Bins (output)
1109 : !> \param N number of processes (input)
1110 : !> \par History
1111 : !> 03.2011 created [Michael Steinlechner]
1112 : !> \author Michael Steinlechner
1113 : ! **************************************************************************************************
1114 0 : SUBROUTINE hfx_calculate_PQ(p, q, nBins, N)
1115 :
1116 : INTEGER, INTENT(OUT) :: p, q, nBins
1117 : INTEGER, INTENT(IN) :: N
1118 :
1119 : INTEGER :: a, b, k
1120 : REAL(dp) :: sqN
1121 :
1122 0 : k = 2
1123 0 : sqN = SQRT(REAL(N, KIND=dp))
1124 0 : p = 1
1125 :
1126 0 : DO WHILE (REAL(k, KIND=dp) <= sqN)
1127 0 : IF (MODULO(N, k) == 0) THEN
1128 0 : p = k
1129 : END IF
1130 0 : k = k + 1
1131 : END DO
1132 0 : q = N/p
1133 :
1134 : ! now compute the least common multiple of p & q to get the number of necessary bins
1135 : ! compute using the relation LCM(p,q) = abs(p*q) / GCD(p,q)
1136 : ! and use euclid's algorithm for GCD computation.
1137 0 : a = p
1138 0 : b = q
1139 :
1140 0 : DO WHILE (b .NE. 0)
1141 0 : IF (a > b) THEN
1142 0 : a = a - b
1143 : ELSE
1144 0 : b = b - a
1145 : END IF
1146 : END DO
1147 : ! gcd(p,q) is now saved in a
1148 :
1149 0 : nBins = p*q/a
1150 :
1151 0 : END SUBROUTINE
1152 :
1153 : ! **************************************************************************************************
1154 : !> \brief Recursive permutation routine for the load balancing of the integral
1155 : !> computation
1156 : !> \param blocksize vector of blocksizes, size(nProc), which contains for
1157 : !> each process the local blocksize (OUTPUT)
1158 : !> \param blockstart starting row/column idx of the block which is to be examined
1159 : !> at this point (INPUT)
1160 : !> \param blockend ending row/column idx of the block which is to be examined
1161 : !> (INPUT)
1162 : !> \param nProc_in number of bins into which the current block has to be divided
1163 : !> (INPUT)
1164 : !> \param permute permutation vector which balances column/row cost
1165 : !> size(nblocks^2). (OUTPUT)
1166 : !> \param step ...
1167 : !> \param my_process_id ...
1168 : !> \param n_processes ...
1169 : !> \param nblocks ...
1170 : !> \param natom ...
1171 : !> \param nkind ...
1172 : !> \param list_ij ...
1173 : !> \param list_kl ...
1174 : !> \param set_list_ij ...
1175 : !> \param set_list_kl ...
1176 : !> \param particle_set ...
1177 : !> \param coeffs_set ...
1178 : !> \param coeffs_kind ...
1179 : !> \param is_assoc_atomic_block_global ...
1180 : !> \param do_periodic ...
1181 : !> \param kind_of ...
1182 : !> \param basis_parameter ...
1183 : !> \param pmax_set ...
1184 : !> \param pmax_atom ...
1185 : !> \param pmax_blocks ...
1186 : !> \param cell ...
1187 : !> \param x_data ...
1188 : !> \param para_env ...
1189 : !> \param pmax_block ...
1190 : !> \param do_p_screening ...
1191 : !> \param map_atom_to_kind_atom ...
1192 : !> \param eval_type ...
1193 : !> \param log10_eps_schwarz ...
1194 : !> \param log_2 ...
1195 : !> \param coeffs_kind_max0 ...
1196 : !> \param use_virial ...
1197 : !> \param atomic_pair_list ...
1198 : !> \par History
1199 : !> 03.2011 created [Michael Steinlechner]
1200 : !> \author Michael Steinlechner
1201 : ! **************************************************************************************************
1202 0 : RECURSIVE SUBROUTINE hfx_recursive_permute(blocksize, blockstart, blockend, nProc_in, &
1203 0 : permute, step, &
1204 : my_process_id, n_processes, nblocks, &
1205 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1206 : particle_set, &
1207 : coeffs_set, coeffs_kind, &
1208 0 : is_assoc_atomic_block_global, do_periodic, &
1209 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1210 : cell, x_data, para_env, pmax_block, &
1211 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1212 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1213 :
1214 : INTEGER :: nProc_in, blockend, blockstart
1215 : INTEGER, DIMENSION(nProc_in) :: blocksize
1216 : INTEGER :: nblocks, n_processes, my_process_id
1217 : INTEGER, INTENT(IN) :: step
1218 : INTEGER, DIMENSION(nblocks*nblocks) :: permute
1219 : INTEGER :: natom
1220 : INTEGER, INTENT(IN) :: nkind
1221 : TYPE(pair_list_type) :: list_ij, list_kl
1222 : TYPE(pair_set_list_type), ALLOCATABLE, &
1223 : DIMENSION(:) :: set_list_ij, set_list_kl
1224 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1225 : TYPE(hfx_screen_coeff_type), &
1226 : DIMENSION(:, :, :, :), POINTER :: coeffs_set
1227 : TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
1228 : POINTER :: coeffs_kind
1229 : INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1230 : LOGICAL :: do_periodic
1231 : INTEGER :: kind_of(*)
1232 : TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1233 : TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1234 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1235 : REAL(dp) :: pmax_blocks
1236 : TYPE(cell_type), POINTER :: cell
1237 : TYPE(hfx_type), POINTER :: x_data
1238 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
1239 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_block
1240 : LOGICAL, INTENT(IN) :: do_p_screening
1241 : INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1242 : INTEGER, INTENT(IN) :: eval_type
1243 : REAL(dp) :: log10_eps_schwarz, log_2, &
1244 : coeffs_kind_max0
1245 : LOGICAL, INTENT(IN) :: use_virial
1246 : LOGICAL, DIMENSION(:, :), POINTER :: atomic_pair_list
1247 :
1248 : INTEGER :: col, endoffset, i, iatom_block, iatom_end, iatom_start, idx, inv_perm, &
1249 : jatom_block, jatom_end, jatom_start, katom_block, katom_end, katom_start, latom_block, &
1250 : latom_end, latom_start, nbins, nProc, row, startoffset
1251 : INTEGER(int_8) :: atom_block, tmp_block
1252 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: ithblocksize, localblocksize
1253 0 : INTEGER, DIMENSION(blockend - blockstart + 1) :: bin_perm, tmp_perm
1254 : REAL(dp) :: partialcost
1255 0 : REAL(dp), DIMENSION(nblocks*nblocks) :: cost_vector
1256 :
1257 0 : nProc = nProc_in
1258 0 : cost_vector = 0.0_dp
1259 :
1260 : ! loop over local atom_blocks.
1261 0 : DO atom_block = my_process_id, INT(nblocks, KIND=int_8)**4 - 1, n_processes
1262 :
1263 : ! get corresponding 4D block indices
1264 0 : latom_block = INT(MODULO(atom_block, INT(nblocks, KIND=int_8))) + 1
1265 0 : tmp_block = atom_block/nblocks
1266 0 : katom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1267 0 : IF (latom_block < katom_block) CYCLE
1268 0 : tmp_block = tmp_block/nblocks
1269 0 : jatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1270 0 : tmp_block = tmp_block/nblocks
1271 0 : iatom_block = INT(MODULO(tmp_block, INT(nblocks, KIND=int_8))) + 1
1272 0 : IF (jatom_block < iatom_block) CYCLE
1273 :
1274 : ! get 2D indices of this atom_block (with permutation applied)
1275 : ! for this, we need to invert the permutation, this means
1276 : ! find position in permutation vector where value==idx
1277 :
1278 0 : row = (katom_block - 1)*nblocks + jatom_block
1279 0 : inv_perm = 1
1280 0 : DO WHILE (permute(inv_perm) .NE. row)
1281 0 : inv_perm = inv_perm + 1
1282 : END DO
1283 0 : row = inv_perm
1284 :
1285 0 : col = (iatom_block - 1)*nblocks + latom_block
1286 0 : inv_perm = 1
1287 0 : DO WHILE (permute(inv_perm) .NE. col)
1288 0 : inv_perm = inv_perm + 1
1289 : END DO
1290 0 : col = inv_perm
1291 :
1292 : ! if row/col outside our current diagonal block, skip calculation.
1293 0 : IF (col < blockstart .OR. col > blockend) CYCLE
1294 0 : IF (row < blockstart .OR. row > blockend) CYCLE
1295 :
1296 0 : iatom_start = x_data%blocks(iatom_block)%istart
1297 0 : iatom_end = x_data%blocks(iatom_block)%iend
1298 0 : jatom_start = x_data%blocks(jatom_block)%istart
1299 0 : jatom_end = x_data%blocks(jatom_block)%iend
1300 0 : katom_start = x_data%blocks(katom_block)%istart
1301 0 : katom_end = x_data%blocks(katom_block)%iend
1302 0 : latom_start = x_data%blocks(latom_block)%istart
1303 0 : latom_end = x_data%blocks(latom_block)%iend
1304 :
1305 : ! whatever.
1306 0 : SELECT CASE (eval_type)
1307 : CASE (hfx_do_eval_energy)
1308 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block), &
1309 : pmax_block(latom_block, jatom_block), &
1310 : pmax_block(latom_block, iatom_block), &
1311 0 : pmax_block(katom_block, jatom_block))
1312 : CASE (hfx_do_eval_forces)
1313 : pmax_blocks = MAX(pmax_block(katom_block, iatom_block) + &
1314 : pmax_block(latom_block, jatom_block), &
1315 : pmax_block(latom_block, iatom_block) + &
1316 0 : pmax_block(katom_block, jatom_block))
1317 : END SELECT
1318 :
1319 : ! screening.
1320 0 : IF (2.0_dp*coeffs_kind_max0 + pmax_blocks < log10_eps_schwarz) CYCLE
1321 :
1322 : ! every second recursion step, compute row sum instead of column sum
1323 :
1324 0 : IF (MODULO(step, 2) .EQ. 0) THEN
1325 : idx = row
1326 : ELSE
1327 0 : idx = col
1328 : END IF
1329 :
1330 : ! estimate the cost of this atom_block.
1331 : partialcost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, &
1332 : set_list_kl, &
1333 : iatom_start, iatom_end, jatom_start, jatom_end, &
1334 : katom_start, katom_end, latom_start, latom_end, &
1335 : particle_set, &
1336 : coeffs_set, coeffs_kind, &
1337 : is_assoc_atomic_block_global, do_periodic, &
1338 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1339 : cell, &
1340 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1341 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1342 :
1343 0 : cost_vector(idx) = cost_vector(idx) + partialcost
1344 : END DO ! atom_block
1345 :
1346 : ! sum costvector over all processes
1347 0 : CALL para_env%sum(cost_vector)
1348 :
1349 : ! calculate next prime factor of nProc
1350 0 : nBins = 2
1351 0 : DO WHILE (MODULO(INT(nProc), INT(nBins)) .NE. 0)
1352 0 : nBins = nBins + 1
1353 : END DO
1354 :
1355 0 : nProc = nProc/nBins
1356 :
1357 : ! ... do the binning...
1358 :
1359 0 : ALLOCATE (localblocksize(nBins))
1360 0 : CALL hfx_permute_binning(nBins, cost_vector(blockstart:blockend), blockend - blockstart + 1, bin_perm, localblocksize)
1361 :
1362 : !... and update the permutation vector
1363 :
1364 0 : tmp_perm = permute(blockstart:blockend)
1365 0 : permute(blockstart:blockend) = tmp_perm(bin_perm)
1366 :
1367 : ! split recursion into the nBins Bins
1368 0 : IF (nProc > 1) THEN
1369 0 : ALLOCATE (ithblocksize(nProc))
1370 0 : DO i = 1, nBins
1371 0 : startoffset = SUM(localblocksize(1:(i - 1)))
1372 0 : endoffset = SUM(localblocksize(1:i)) - 1
1373 :
1374 : CALL hfx_recursive_permute(ithblocksize, blockstart + startoffset, blockstart + endoffset, nProc, &
1375 : permute, step + 1, &
1376 : my_process_id, n_processes, nblocks, &
1377 : natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1378 : particle_set, &
1379 : coeffs_set, coeffs_kind, &
1380 : is_assoc_atomic_block_global, do_periodic, &
1381 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1382 : cell, x_data, para_env, pmax_block, &
1383 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1384 0 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
1385 0 : blocksize(((i - 1)*nProc + 1):(i*nProc)) = ithblocksize
1386 : END DO
1387 0 : DEALLOCATE (ithblocksize)
1388 : ELSE
1389 0 : DO i = 1, nBins
1390 0 : blocksize(i) = localblocksize(i)
1391 : END DO
1392 : END IF
1393 :
1394 0 : DEALLOCATE (localblocksize)
1395 :
1396 0 : END SUBROUTINE hfx_recursive_permute
1397 :
1398 : ! **************************************************************************************************
1399 : !> \brief small binning routine for the recursive load balancing
1400 : !>
1401 : !> \param nBins number of Bins (INPUT)
1402 : !> \param costvector vector of current row/column costs which have to be binned (INPUT)
1403 : !> \param maxbinsize upper bound for bin size (INPUT)
1404 : !> \param perm resulting permutation due to be binning routine (OUTPUT)
1405 : !> \param block_count vector of size(nbins) which contains the size of each bin (OUTPUT)
1406 : !> \par History
1407 : !> 03.2011 created [Michael Steinlechner]
1408 : !> \author Michael Steinlechner
1409 : ! **************************************************************************************************
1410 0 : SUBROUTINE hfx_permute_binning(nBins, costvector, maxbinsize, perm, block_count)
1411 :
1412 : INTEGER, INTENT(IN) :: nBins, maxbinsize
1413 : REAL(dp), DIMENSION(maxbinsize), INTENT(IN) :: costvector
1414 : INTEGER, DIMENSION(maxbinsize), INTENT(OUT) :: perm
1415 : INTEGER, DIMENSION(nBins), INTENT(OUT) :: block_count
1416 :
1417 : INTEGER :: i, j, mod_idx, offset
1418 0 : INTEGER, DIMENSION(nBins, maxbinsize) :: bin
1419 0 : INTEGER, DIMENSION(nBins) :: bin_idx
1420 0 : INTEGER, DIMENSION(maxbinsize) :: idx
1421 0 : REAL(dp), DIMENSION(maxbinsize) :: vec
1422 0 : REAL(dp), DIMENSION(nBins) :: bincosts
1423 :
1424 : ! be careful not to change costvector (copy it!)
1425 :
1426 0 : vec = costvector
1427 0 : block_count = 0
1428 0 : bincosts = 0
1429 :
1430 : !sort the array (ascending)
1431 0 : CALL sort(vec, maxbinsize, idx)
1432 :
1433 : ! count the loop down to distribute the largest cols/rows first
1434 0 : DO i = maxbinsize, 1, -1
1435 0 : IF (vec(i) == 0) THEN
1436 : ! spread zero-cost col/rows evenly among procs
1437 0 : mod_idx = MODULO(i, nBins) + 1 !(note the fortran offset by one!)
1438 0 : block_count(mod_idx) = block_count(mod_idx) + 1
1439 0 : bin(mod_idx, block_count(mod_idx)) = idx(i)
1440 : ELSE
1441 : ! sort the bins so that the one with the lowest cost is at the
1442 : ! first place, where we then assign the current col/row
1443 0 : CALL sort(bincosts, nBins, bin_idx)
1444 0 : block_count = block_count(bin_idx)
1445 0 : bin = bin(bin_idx, :)
1446 :
1447 0 : bincosts(1) = bincosts(1) + vec(i)
1448 0 : block_count(1) = block_count(1) + 1
1449 0 : bin(1, block_count(1)) = idx(i)
1450 : END IF
1451 : END DO
1452 :
1453 : ! construct permutation vector from the binning
1454 : offset = 0
1455 0 : DO i = 1, nBins
1456 0 : DO j = 1, block_count(i)
1457 0 : perm(offset + j) = bin(i, j)
1458 : END DO
1459 0 : offset = offset + block_count(i)
1460 : END DO
1461 :
1462 0 : END SUBROUTINE hfx_permute_binning
1463 :
1464 : ! **************************************************************************************************
1465 : !> \brief Cheap way of redistributing the eri's
1466 : !> \param x_data Object that stores the indices array
1467 : !> \param para_env para_env
1468 : !> \param load_balance_parameter contains parmameter for Monte-Carlo routines
1469 : !> \param i_thread current thread ID
1470 : !> \param n_threads Total Number of threads
1471 : !> \param eval_type ...
1472 : !> \par History
1473 : !> 12.2007 created [Manuel Guidon]
1474 : !> 02.2009 optimize Memory Usage [Manuel Guidon]
1475 : !> \author Manuel Guidon
1476 : !> \note
1477 : !> The cost matrix is given by the walltime for each bin that is measured
1478 : !> during the calculation
1479 : ! **************************************************************************************************
1480 1682 : SUBROUTINE hfx_update_load_balance(x_data, para_env, &
1481 : load_balance_parameter, &
1482 : i_thread, n_threads, eval_type)
1483 :
1484 : TYPE(hfx_type), POINTER :: x_data
1485 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
1486 : TYPE(hfx_load_balance_type) :: load_balance_parameter
1487 : INTEGER, INTENT(IN) :: i_thread, n_threads, eval_type
1488 :
1489 : CHARACTER(LEN=*), PARAMETER :: routineN = 'hfx_update_load_balance'
1490 :
1491 : INTEGER :: data_from, dest, end_idx, handle, i, ibin, icpu, iprocess, j, mepos, my_bin_size, &
1492 : my_global_start_idx, my_process_id, n_processes, nbins, ncpu, source, start_idx
1493 5046 : TYPE(mp_request_type), DIMENSION(2) :: req
1494 1682 : INTEGER(int_8), DIMENSION(:), POINTER :: local_cost_matrix, recbuffer, &
1495 1682 : sendbuffer, swapbuffer
1496 : INTEGER(int_8), DIMENSION(:), POINTER, SAVE :: cost_matrix
1497 1682 : INTEGER, ALLOCATABLE, DIMENSION(:) :: tmp_pos
1498 : INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: bins_per_rank
1499 : INTEGER, ALLOCATABLE, DIMENSION(:, :), SAVE :: bin_histogram
1500 : INTEGER, DIMENSION(:), POINTER, SAVE :: shm_distribution_vector
1501 : INTEGER, SAVE :: max_bin_size
1502 3364 : TYPE(hfx_distribution), DIMENSION(:), POINTER :: binned_dist, ptr_to_tmp_dist, tmp_dist
1503 : TYPE(hfx_distribution), DIMENSION(:, :), POINTER, &
1504 : SAVE :: full_dist
1505 :
1506 1682 : !$OMP BARRIER
1507 3364 : !$OMP MASTER
1508 1682 : CALL timeset(routineN, handle)
1509 : !$OMP END MASTER
1510 1682 : !$OMP BARRIER
1511 :
1512 1682 : ncpu = para_env%num_pe
1513 1682 : n_processes = ncpu*n_threads
1514 : !! If there is only 1 cpu skip the binning
1515 1682 : IF (n_processes == 1) THEN
1516 0 : ALLOCATE (tmp_dist(1))
1517 0 : tmp_dist(1)%number_of_atom_quartets = HUGE(tmp_dist(1)%number_of_atom_quartets)
1518 0 : tmp_dist(1)%istart = 0_int_8
1519 0 : ptr_to_tmp_dist => tmp_dist(:)
1520 0 : SELECT CASE (eval_type)
1521 : CASE (hfx_do_eval_energy)
1522 0 : CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1523 : CASE (hfx_do_eval_forces)
1524 0 : CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1525 : END SELECT
1526 0 : DEALLOCATE (tmp_dist)
1527 : ELSE
1528 1682 : mepos = para_env%mepos
1529 1682 : my_process_id = para_env%mepos*n_threads + i_thread
1530 1682 : nbins = load_balance_parameter%nbins
1531 1682 : !$OMP MASTER
1532 5046 : ALLOCATE (bin_histogram(n_processes, 2))
1533 11774 : bin_histogram = 0
1534 : !$OMP END MASTER
1535 1682 : !$OMP BARRIER
1536 2562 : SELECT CASE (eval_type)
1537 : CASE (hfx_do_eval_energy)
1538 880 : my_bin_size = SIZE(x_data%distribution_energy)
1539 : CASE (hfx_do_eval_forces)
1540 1682 : my_bin_size = SIZE(x_data%distribution_forces)
1541 : END SELECT
1542 1682 : bin_histogram(my_process_id + 1, 1) = my_bin_size
1543 1682 : !$OMP BARRIER
1544 1682 : !$OMP MASTER
1545 1682 : CALL para_env%sum(bin_histogram(:, 1))
1546 1682 : bin_histogram(1, 2) = bin_histogram(1, 1)
1547 3364 : DO iprocess = 2, n_processes
1548 3364 : bin_histogram(iprocess, 2) = bin_histogram(iprocess - 1, 2) + bin_histogram(iprocess, 1)
1549 : END DO
1550 :
1551 3364 : max_bin_size = MAXVAL(bin_histogram(para_env%mepos*n_threads + 1:para_env%mepos*n_threads + n_threads, 1))
1552 1682 : CALL para_env%max(max_bin_size)
1553 : !$OMP END MASTER
1554 1682 : !$OMP BARRIER
1555 112688 : ALLOCATE (binned_dist(my_bin_size))
1556 : !! Use old binned_dist, but with timings cost
1557 880 : SELECT CASE (eval_type)
1558 : CASE (hfx_do_eval_energy)
1559 113520 : binned_dist = x_data%distribution_energy
1560 : CASE (hfx_do_eval_forces)
1561 104338 : binned_dist = x_data%distribution_forces
1562 : END SELECT
1563 :
1564 109330 : DO ibin = 1, my_bin_size
1565 109330 : IF (binned_dist(ibin)%number_of_atom_quartets == 0) THEN
1566 92841 : binned_dist(ibin)%cost = 0
1567 : ELSE
1568 8431 : SELECT CASE (eval_type)
1569 : CASE (hfx_do_eval_energy)
1570 8431 : IF (.NOT. load_balance_parameter%rtp_redistribute) THEN
1571 : binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_first_scf + &
1572 8431 : binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1573 : ELSE
1574 0 : binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_other_scf)*10000.0_dp, int_8)
1575 : END IF
1576 : CASE (hfx_do_eval_forces)
1577 14807 : binned_dist(ibin)%cost = INT((binned_dist(ibin)%time_forces)*10000.0_dp, int_8)
1578 : END SELECT
1579 : END IF
1580 : END DO
1581 1682 : !$OMP BARRIER
1582 1682 : !$OMP MASTER
1583 : !! store all local results in a big cost matrix
1584 5046 : ALLOCATE (cost_matrix(ncpu*nbins*n_threads))
1585 216978 : cost_matrix = 0
1586 5046 : ALLOCATE (sendbuffer(max_bin_size*n_threads))
1587 3364 : ALLOCATE (recbuffer(max_bin_size*n_threads))
1588 : !$OMP END MASTER
1589 1682 : !$OMP BARRIER
1590 1682 : my_global_start_idx = bin_histogram(my_process_id + 1, 2) - my_bin_size
1591 1682 : icpu = para_env%mepos + 1
1592 109330 : DO i = 1, my_bin_size
1593 109330 : cost_matrix(my_global_start_idx + i) = binned_dist(i)%cost
1594 : END DO
1595 :
1596 1682 : mepos = para_env%mepos
1597 1682 : !$OMP BARRIER
1598 1682 : !$OMP MASTER
1599 5046 : ALLOCATE (bins_per_rank(ncpu))
1600 5046 : bins_per_rank = 0
1601 5046 : DO icpu = 1, ncpu
1602 8410 : bins_per_rank(icpu) = SUM(bin_histogram((icpu - 1)*n_threads + 1:(icpu - 1)*n_threads + n_threads, 1))
1603 : END DO
1604 : sendbuffer(1:bins_per_rank(para_env%mepos + 1)) = &
1605 216978 : cost_matrix(my_global_start_idx + 1:my_global_start_idx + bins_per_rank(para_env%mepos + 1))
1606 :
1607 1682 : dest = MODULO(mepos + 1, ncpu)
1608 1682 : source = MODULO(mepos - 1, ncpu)
1609 : ! sync before/after ring of isendrecv
1610 1682 : CALL para_env%sync()
1611 5046 : DO icpu = 0, ncpu - 1
1612 3364 : IF (icpu .NE. ncpu - 1) THEN
1613 : CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1614 1682 : req(1), req(2), 13)
1615 : END IF
1616 3364 : data_from = MODULO(mepos - icpu, ncpu)
1617 8410 : start_idx = SUM(bins_per_rank(1:data_from + 1)) - bins_per_rank(data_from + 1) + 1
1618 3364 : end_idx = start_idx + bins_per_rank(data_from + 1) - 1
1619 433956 : cost_matrix(start_idx:end_idx) = sendbuffer(1:end_idx - start_idx + 1)
1620 :
1621 3364 : IF (icpu .NE. ncpu - 1) THEN
1622 1682 : CALL mp_waitall(req)
1623 : END IF
1624 3364 : swapbuffer => sendbuffer
1625 3364 : sendbuffer => recbuffer
1626 5046 : recbuffer => swapbuffer
1627 : END DO
1628 1682 : DEALLOCATE (recbuffer, sendbuffer)
1629 : ! sync before/after ring of isendrecv
1630 1682 : CALL para_env%sync()
1631 : !$OMP END MASTER
1632 1682 : !$OMP BARRIER
1633 5046 : ALLOCATE (local_cost_matrix(SIZE(cost_matrix, 1)))
1634 432274 : local_cost_matrix = cost_matrix
1635 1682 : !$OMP MASTER
1636 5046 : ALLOCATE (shm_distribution_vector(ncpu*nbins*n_threads))
1637 : CALL optimize_distribution(ncpu*nbins*n_threads, ncpu*n_threads, local_cost_matrix, &
1638 1682 : shm_distribution_vector, x_data%load_balance_parameter%do_randomize)
1639 :
1640 609740 : ALLOCATE (full_dist(ncpu*n_threads, max_bin_size))
1641 :
1642 604694 : full_dist(:, :)%istart = 0_int_8
1643 604694 : full_dist(:, :)%number_of_atom_quartets = 0_int_8
1644 604694 : full_dist(:, :)%cost = 0_int_8
1645 604694 : full_dist(:, :)%time_first_scf = 0.0_dp
1646 604694 : full_dist(:, :)%time_other_scf = 0.0_dp
1647 604694 : full_dist(:, :)%time_forces = 0.0_dp
1648 : !$OMP END MASTER
1649 :
1650 1682 : !$OMP BARRIER
1651 1682 : mepos = para_env%mepos + 1
1652 216978 : full_dist((mepos - 1)*n_threads + i_thread + 1, 1:my_bin_size) = binned_dist(1:my_bin_size)
1653 1682 : !$OMP BARRIER
1654 1682 : !$OMP MASTER
1655 5046 : ALLOCATE (sendbuffer(3*max_bin_size*n_threads))
1656 3364 : ALLOCATE (recbuffer(3*max_bin_size*n_threads))
1657 1682 : mepos = para_env%mepos
1658 3364 : DO j = 1, n_threads
1659 204368 : DO i = 1, max_bin_size
1660 201004 : sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1) = full_dist(mepos*n_threads + j, i)%istart
1661 201004 : sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2) = full_dist(mepos*n_threads + j, i)%number_of_atom_quartets
1662 202686 : sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3) = full_dist(mepos*n_threads + j, i)%cost
1663 : END DO
1664 : END DO
1665 1682 : dest = MODULO(mepos + 1, ncpu)
1666 1682 : source = MODULO(mepos - 1, ncpu)
1667 : ! sync before/after ring of isendrecv
1668 1682 : CALL para_env%sync()
1669 5046 : DO icpu = 0, ncpu - 1
1670 3364 : IF (icpu .NE. ncpu - 1) THEN
1671 : CALL para_env%isendrecv(sendbuffer, dest, recbuffer, source, &
1672 1682 : req(1), req(2), 13)
1673 : END IF
1674 3364 : data_from = MODULO(mepos - icpu, ncpu)
1675 6728 : DO j = 1, n_threads
1676 408736 : DO i = 1, max_bin_size
1677 402008 : full_dist(data_from*n_threads + j, i)%istart = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 1)
1678 402008 : full_dist(data_from*n_threads + j, i)%number_of_atom_quartets = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 2)
1679 405372 : full_dist(data_from*n_threads + j, i)%cost = sendbuffer((j - 1)*3*max_bin_size + (i - 1)*3 + 3)
1680 : END DO
1681 : END DO
1682 :
1683 3364 : IF (icpu .NE. ncpu - 1) THEN
1684 1682 : CALL mp_waitall(req)
1685 : END IF
1686 3364 : swapbuffer => sendbuffer
1687 3364 : sendbuffer => recbuffer
1688 5046 : recbuffer => swapbuffer
1689 : END DO
1690 : ! sync before/after ring of isendrecv
1691 1682 : DEALLOCATE (recbuffer, sendbuffer)
1692 1682 : CALL para_env%sync()
1693 : !$OMP END MASTER
1694 1682 : !$OMP BARRIER
1695 : !! reorder the distribution according to the distribution vector
1696 5046 : ALLOCATE (tmp_pos(ncpu*n_threads))
1697 5046 : tmp_pos = 1
1698 220342 : ALLOCATE (tmp_dist(nbins*ncpu*n_threads))
1699 :
1700 216978 : tmp_dist(:)%istart = 0_int_8
1701 216978 : tmp_dist(:)%number_of_atom_quartets = 0_int_8
1702 216978 : tmp_dist(:)%cost = 0_int_8
1703 216978 : tmp_dist(:)%time_first_scf = 0.0_dp
1704 216978 : tmp_dist(:)%time_other_scf = 0.0_dp
1705 216978 : tmp_dist(:)%time_forces = 0.0_dp
1706 :
1707 5046 : mepos = my_process_id + 1
1708 5046 : DO icpu = 1, n_processes
1709 220342 : DO i = 1, bin_histogram(icpu, 1)
1710 218660 : IF (shm_distribution_vector(bin_histogram(icpu, 2) - bin_histogram(icpu, 1) + i) == mepos) THEN
1711 107648 : tmp_dist(tmp_pos(mepos)) = full_dist(icpu, i)
1712 107648 : tmp_pos(mepos) = tmp_pos(mepos) + 1
1713 : END IF
1714 : END DO
1715 : END DO
1716 :
1717 : !! Assign the load to each process
1718 : NULLIFY (ptr_to_tmp_dist)
1719 1682 : mepos = my_process_id + 1
1720 1682 : ptr_to_tmp_dist => tmp_dist(1:tmp_pos(mepos) - 1)
1721 880 : SELECT CASE (eval_type)
1722 : CASE (hfx_do_eval_energy)
1723 880 : CALL hfx_set_distr_energy(ptr_to_tmp_dist, x_data)
1724 : CASE (hfx_do_eval_forces)
1725 1682 : CALL hfx_set_distr_forces(ptr_to_tmp_dist, x_data)
1726 : END SELECT
1727 :
1728 1682 : !$OMP BARRIER
1729 1682 : !$OMP MASTER
1730 1682 : DEALLOCATE (full_dist, cost_matrix, shm_distribution_vector)
1731 1682 : DEALLOCATE (bins_per_rank, bin_histogram)
1732 : !$OMP END MASTER
1733 1682 : !$OMP BARRIER
1734 1682 : DEALLOCATE (tmp_dist, tmp_pos)
1735 1682 : DEALLOCATE (binned_dist, local_cost_matrix)
1736 : END IF
1737 1682 : !$OMP BARRIER
1738 1682 : !$OMP MASTER
1739 1682 : CALL timestop(handle)
1740 : !$OMP END MASTER
1741 1682 : !$OMP BARRIER
1742 :
1743 3364 : END SUBROUTINE hfx_update_load_balance
1744 :
1745 : ! **************************************************************************************************
1746 : !> \brief estimates the cost of a set quartet with info available at load balance time
1747 : !> i.e. without much info on the primitives primitives
1748 : !> \param nsa ...
1749 : !> \param nsb ...
1750 : !> \param nsc ...
1751 : !> \param nsd ...
1752 : !> \param npgfa ...
1753 : !> \param npgfb ...
1754 : !> \param npgfc ...
1755 : !> \param npgfd ...
1756 : !> \param ratio ...
1757 : !> \param p1 ...
1758 : !> \param p2 ...
1759 : !> \param p3 ...
1760 : !> \return ...
1761 : !> \par History
1762 : !> 08.2009 created Joost VandeVondele
1763 : !> \author Joost VandeVondele
1764 : ! **************************************************************************************************
1765 8859903 : FUNCTION cost_model(nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd, ratio, p1, p2, p3) RESULT(res)
1766 : IMPLICIT NONE
1767 : REAL(KIND=dp) :: estimate1, estimate2, estimate, ratio, switch, mu, sigma
1768 : INTEGER(KIND=int_8) :: res
1769 : REAL(KIND=dp), INTENT(IN) :: p1(12), p2(12), p3(2)
1770 :
1771 : INTEGER :: nsa, nsb, nsc, nsd, npgfa, npgfb, npgfc, npgfd
1772 :
1773 8859903 : estimate1 = estimate_basic(p1)
1774 8859903 : estimate2 = estimate_basic(p2)
1775 8859903 : mu = LOG(ABS(1.0E6_dp*p3(1)) + 1)
1776 8859903 : sigma = p3(2)*0.1_dp*mu
1777 8859903 : switch = 1.0_dp/(1.0_dp + EXP((LOG(estimate1) - mu)/sigma))
1778 8859903 : estimate = estimate1*(1.0_dp - switch) + estimate2*switch
1779 8859903 : res = INT(estimate*0.001_dp, KIND=int_8) + 1
1780 :
1781 : CONTAINS
1782 :
1783 : ! **************************************************************************************************
1784 : !> \brief ...
1785 : !> \param p ...
1786 : !> \return ...
1787 : ! **************************************************************************************************
1788 17719806 : REAL(KIND=dp) FUNCTION estimate_basic(p) RESULT(res)
1789 : REAL(KIND=dp) :: p(12)
1790 :
1791 : REAL(KIND=dp) :: p1, p10, p11, p12, p2, p3, p4, p5, p6, &
1792 : p7, p8, p9
1793 :
1794 17719806 : p1 = p(1); p2 = p(2); p3 = p(3); p4 = p(4)
1795 17719806 : p5 = p(5); p6 = p(6); p7 = p(7); p8 = p(8)
1796 17719806 : p9 = p(9); p10 = p(10); p11 = p(11); p12 = p(12)
1797 : res = poly2(nsa, p1, p2, p3)*poly2(nsb, p1, p2, p3)*poly2(nsc, p1, p2, p3)*poly2(nsd, p1, p2, p3)* &
1798 : poly2(npgfa, p4, p5, p6)*poly2(npgfb, p4, p5, p6)*poly2(npgfc, p4, p5, p6)* &
1799 : poly2(npgfd, p4, p5, p6)*EXP(-p7*ratio + p8*ratio**2) + &
1800 17719806 : 1000.0_dp*p9 + poly2(nsa, p10, p11, p12)*poly2(nsb, p10, p11, p12)*poly2(nsc, p10, p11, p12)*poly2(nsd, p10, p11, p12)
1801 17719806 : res = 1 + ABS(res)
1802 17719806 : END FUNCTION estimate_basic
1803 :
1804 : ! **************************************************************************************************
1805 : !> \brief ...
1806 : !> \param x ...
1807 : !> \param a0 ...
1808 : !> \param a1 ...
1809 : !> \param a2 ...
1810 : !> \return ...
1811 : ! **************************************************************************************************
1812 212637672 : REAL(KIND=dp) FUNCTION poly2(x, a0, a1, a2)
1813 : INTEGER, INTENT(IN) :: x
1814 : REAL(KIND=dp), INTENT(IN) :: a0, a1, a2
1815 : REAL(KIND=dp) :: r
1816 :
1817 212637672 : r = REAL(x, KIND=dp)
1818 212637672 : poly2 = a0 + (a1 + a2*r)*r
1819 212637672 : END FUNCTION poly2
1820 :
1821 : END FUNCTION cost_model
1822 : ! **************************************************************************************************
1823 : !> \brief Minimizes the maximum cost per cpu by shuffling around all bins
1824 : !> \param total_number_of_bins ...
1825 : !> \param number_of_processes ...
1826 : !> \param bin_costs costs per bin
1827 : !> \param distribution_vector will contain the final distribution
1828 : !> \param do_randomize ...
1829 : !> \par History
1830 : !> 03.2009 created from a hack by Joost [Manuel Guidon]
1831 : !> \author Manuel Guidon
1832 : ! **************************************************************************************************
1833 3426 : SUBROUTINE optimize_distribution(total_number_of_bins, number_of_processes, bin_costs, &
1834 : distribution_vector, do_randomize)
1835 : INTEGER :: total_number_of_bins, number_of_processes
1836 : INTEGER(int_8), DIMENSION(:), POINTER :: bin_costs
1837 : INTEGER, DIMENSION(:), POINTER :: distribution_vector
1838 : LOGICAL, INTENT(IN) :: do_randomize
1839 :
1840 : INTEGER :: i, itmp, j, nstep
1841 3426 : INTEGER(int_8), DIMENSION(:), POINTER :: my_cost_cpu, tmp_cost, tmp_cpu_cost
1842 : INTEGER, DIMENSION(:), POINTER :: tmp_cpu_index, tmp_index
1843 3426 : TYPE(rng_stream_type), ALLOCATABLE :: rng_stream
1844 :
1845 3426 : nstep = MAX(1, INT(number_of_processes)/2)
1846 :
1847 10278 : ALLOCATE (tmp_cost(total_number_of_bins))
1848 10278 : ALLOCATE (tmp_index(total_number_of_bins))
1849 10278 : ALLOCATE (tmp_cpu_cost(number_of_processes))
1850 10278 : ALLOCATE (tmp_cpu_index(number_of_processes))
1851 6852 : ALLOCATE (my_cost_cpu(number_of_processes))
1852 883908 : tmp_cost = bin_costs
1853 :
1854 3426 : CALL sort(tmp_cost, total_number_of_bins, tmp_index)
1855 10278 : my_cost_cpu = 0
1856 : !
1857 : ! assign the largest remaining bin to the CPU with the smallest load
1858 : ! gives near perfect distributions for a sufficient number of bins ...
1859 : ! doing this in chunks of nstep (where nstep ~ number_of_processes) makes this n log n and gives
1860 : ! each cpu a similar number of tasks.
1861 : ! it also avoids degenerate cases where thousands of zero sized tasks
1862 : ! are assigned to the same (least loaded) cpu
1863 : !
1864 3426 : IF (do_randomize) &
1865 : rng_stream = rng_stream_type(name="uniform_rng", &
1866 0 : distribution_type=UNIFORM)
1867 :
1868 441954 : DO i = total_number_of_bins, 1, -nstep
1869 2631168 : tmp_cpu_cost = my_cost_cpu
1870 438528 : CALL sort(tmp_cpu_cost, INT(number_of_processes), tmp_cpu_index)
1871 438528 : IF (do_randomize) THEN
1872 0 : CALL rng_stream%shuffle(tmp_cpu_index(1:MIN(i, nstep)))
1873 : END IF
1874 880482 : DO j = 1, MIN(i, nstep)
1875 438528 : itmp = tmp_cpu_index(j)
1876 438528 : distribution_vector(tmp_index(i - j + 1)) = itmp
1877 877056 : my_cost_cpu(itmp) = my_cost_cpu(itmp) + bin_costs(tmp_index(i - j + 1))
1878 : END DO
1879 : END DO
1880 :
1881 3426 : DEALLOCATE (tmp_cost, tmp_index, tmp_cpu_cost)
1882 3426 : DEALLOCATE (tmp_cpu_index, my_cost_cpu)
1883 6852 : END SUBROUTINE optimize_distribution
1884 :
1885 : ! **************************************************************************************************
1886 : !> \brief Given a 2d index pair, this function returns a 1d index pair for
1887 : !> a symmetric upper triangle NxN matrix
1888 : !> The compiler should inline this function, therefore it appears in
1889 : !> several modules
1890 : !> \param i 2d index
1891 : !> \param j 2d index
1892 : !> \param N matrix size
1893 : !> \return ...
1894 : !> \par History
1895 : !> 03.2009 created [Manuel Guidon]
1896 : !> \author Manuel Guidon
1897 : ! **************************************************************************************************
1898 64236 : PURE FUNCTION get_1D_idx(i, j, N)
1899 : INTEGER, INTENT(IN) :: i, j
1900 : INTEGER(int_8), INTENT(IN) :: N
1901 : INTEGER(int_8) :: get_1D_idx
1902 :
1903 : INTEGER(int_8) :: min_ij
1904 :
1905 64236 : min_ij = MIN(i, j)
1906 64236 : get_1D_idx = min_ij*N + MAX(i, j) - (min_ij - 1)*min_ij/2 - N
1907 :
1908 64236 : END FUNCTION get_1D_idx
1909 :
1910 : ! **************************************************************************************************
1911 : !> \brief ...
1912 : !> \param natom ...
1913 : !> \param nkind ...
1914 : !> \param list_ij ...
1915 : !> \param list_kl ...
1916 : !> \param set_list_ij ...
1917 : !> \param set_list_kl ...
1918 : !> \param iatom_start ...
1919 : !> \param iatom_end ...
1920 : !> \param jatom_start ...
1921 : !> \param jatom_end ...
1922 : !> \param katom_start ...
1923 : !> \param katom_end ...
1924 : !> \param latom_start ...
1925 : !> \param latom_end ...
1926 : !> \param particle_set ...
1927 : !> \param coeffs_set ...
1928 : !> \param coeffs_kind ...
1929 : !> \param is_assoc_atomic_block_global ...
1930 : !> \param do_periodic ...
1931 : !> \param kind_of ...
1932 : !> \param basis_parameter ...
1933 : !> \param pmax_set ...
1934 : !> \param pmax_atom ...
1935 : !> \param pmax_blocks ...
1936 : !> \param cell ...
1937 : !> \param do_p_screening ...
1938 : !> \param map_atom_to_kind_atom ...
1939 : !> \param eval_type ...
1940 : !> \param log10_eps_schwarz ...
1941 : !> \param log_2 ...
1942 : !> \param coeffs_kind_max0 ...
1943 : !> \param use_virial ...
1944 : !> \param atomic_pair_list ...
1945 : !> \return ...
1946 : ! **************************************************************************************************
1947 303684 : FUNCTION estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
1948 : iatom_start, iatom_end, jatom_start, jatom_end, &
1949 : katom_start, katom_end, latom_start, latom_end, &
1950 : particle_set, &
1951 303684 : coeffs_set, coeffs_kind, &
1952 303684 : is_assoc_atomic_block_global, do_periodic, &
1953 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
1954 : cell, &
1955 : do_p_screening, map_atom_to_kind_atom, eval_type, &
1956 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
1957 303684 : atomic_pair_list)
1958 :
1959 : INTEGER, INTENT(IN) :: natom, nkind
1960 : TYPE(pair_list_type) :: list_ij, list_kl
1961 : TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
1962 : INTEGER, INTENT(IN) :: iatom_start, iatom_end, jatom_start, &
1963 : jatom_end, katom_start, katom_end, &
1964 : latom_start, latom_end
1965 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
1966 : TYPE(hfx_screen_coeff_type), &
1967 : DIMENSION(:, :, :, :), POINTER :: coeffs_set
1968 : TYPE(hfx_screen_coeff_type), &
1969 : DIMENSION(nkind, nkind) :: coeffs_kind
1970 : INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
1971 : LOGICAL :: do_periodic
1972 : INTEGER :: kind_of(*)
1973 : TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
1974 : TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
1975 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
1976 : REAL(dp) :: pmax_blocks
1977 : TYPE(cell_type), POINTER :: cell
1978 : LOGICAL, INTENT(IN) :: do_p_screening
1979 : INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
1980 : INTEGER, INTENT(IN) :: eval_type
1981 : REAL(dp) :: log10_eps_schwarz, log_2, &
1982 : coeffs_kind_max0
1983 : LOGICAL, INTENT(IN) :: use_virial
1984 : LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
1985 : INTEGER(int_8) :: estimate_block_cost
1986 :
1987 : INTEGER :: i_list_ij, i_list_kl, i_set_list_ij, i_set_list_ij_start, i_set_list_ij_stop, &
1988 : i_set_list_kl, i_set_list_kl_start, i_set_list_kl_stop, iatom, ikind, iset, jatom, jkind, &
1989 : jset, katom, kind_kind_idx, kkind, kset, latom, lkind, lset, swap_id
1990 303684 : INTEGER, DIMENSION(:), POINTER :: npgfa, npgfb, npgfc, npgfd, nsgfa, &
1991 303684 : nsgfb, nsgfc, nsgfd
1992 : REAL(dp) :: actual_pmax_atom, cost_tmp, max_val1, &
1993 : max_val2, pmax_entry, rab2, rcd2, &
1994 : screen_kind_ij, screen_kind_kl
1995 303684 : REAL(dp), DIMENSION(:, :), POINTER :: ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4
1996 :
1997 303684 : estimate_block_cost = 0_int_8
1998 :
1999 : CALL build_pair_list(natom, list_ij, set_list_ij, iatom_start, iatom_end, jatom_start, jatom_end, &
2000 : kind_of, basis_parameter, particle_set, &
2001 : do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2002 303684 : log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2003 :
2004 : CALL build_pair_list(natom, list_kl, set_list_kl, katom_start, katom_end, latom_start, latom_end, &
2005 : kind_of, basis_parameter, particle_set, &
2006 : do_periodic, coeffs_set, coeffs_kind, coeffs_kind_max0, &
2007 303684 : log10_eps_schwarz, cell, pmax_blocks, atomic_pair_list)
2008 :
2009 577082 : DO i_list_ij = 1, list_ij%n_element
2010 273398 : iatom = list_ij%elements(i_list_ij)%pair(1)
2011 273398 : jatom = list_ij%elements(i_list_ij)%pair(2)
2012 273398 : i_set_list_ij_start = list_ij%elements(i_list_ij)%set_bounds(1)
2013 273398 : i_set_list_ij_stop = list_ij%elements(i_list_ij)%set_bounds(2)
2014 273398 : ikind = list_ij%elements(i_list_ij)%kind_pair(1)
2015 273398 : jkind = list_ij%elements(i_list_ij)%kind_pair(2)
2016 273398 : rab2 = list_ij%elements(i_list_ij)%dist2
2017 :
2018 273398 : nsgfa => basis_parameter(ikind)%nsgf
2019 273398 : nsgfb => basis_parameter(jkind)%nsgf
2020 273398 : npgfa => basis_parameter(ikind)%npgf
2021 273398 : npgfb => basis_parameter(jkind)%npgf
2022 :
2023 832166 : DO i_list_kl = 1, list_kl%n_element
2024 :
2025 255084 : katom = list_kl%elements(i_list_kl)%pair(1)
2026 255084 : latom = list_kl%elements(i_list_kl)%pair(2)
2027 :
2028 255084 : IF (.NOT. (katom + latom <= iatom + jatom)) CYCLE
2029 140554 : IF (((iatom + jatom) .EQ. (katom + latom)) .AND. (katom < iatom)) CYCLE
2030 :
2031 134920 : IF (eval_type == hfx_do_eval_forces) THEN
2032 14548 : IF (.NOT. use_virial) THEN
2033 13166 : IF ((iatom == jatom .AND. iatom == katom .AND. iatom == latom)) CYCLE
2034 : END IF
2035 : END IF
2036 :
2037 133606 : i_set_list_kl_start = list_kl%elements(i_list_kl)%set_bounds(1)
2038 133606 : i_set_list_kl_stop = list_kl%elements(i_list_kl)%set_bounds(2)
2039 133606 : kkind = list_kl%elements(i_list_kl)%kind_pair(1)
2040 133606 : lkind = list_kl%elements(i_list_kl)%kind_pair(2)
2041 133606 : rcd2 = list_kl%elements(i_list_kl)%dist2
2042 :
2043 133606 : nsgfc => basis_parameter(kkind)%nsgf
2044 133606 : nsgfd => basis_parameter(lkind)%nsgf
2045 133606 : npgfc => basis_parameter(kkind)%npgf
2046 133606 : npgfd => basis_parameter(lkind)%npgf
2047 :
2048 133606 : IF (do_p_screening) THEN
2049 : actual_pmax_atom = MAX(pmax_atom(katom, iatom), &
2050 : pmax_atom(latom, jatom), &
2051 : pmax_atom(latom, iatom), &
2052 16419 : pmax_atom(katom, jatom))
2053 : ELSE
2054 : actual_pmax_atom = 0.0_dp
2055 : END IF
2056 :
2057 : screen_kind_ij = coeffs_kind(jkind, ikind)%x(1)*rab2 + &
2058 133606 : coeffs_kind(jkind, ikind)%x(2)
2059 : screen_kind_kl = coeffs_kind(lkind, kkind)%x(1)*rcd2 + &
2060 133606 : coeffs_kind(lkind, kkind)%x(2)
2061 133606 : IF (screen_kind_ij + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
2062 :
2063 126428 : IF (.NOT. (is_assoc_atomic_block_global(latom, iatom) >= 1 .AND. &
2064 : is_assoc_atomic_block_global(katom, iatom) >= 1 .AND. &
2065 : is_assoc_atomic_block_global(katom, jatom) >= 1 .AND. &
2066 : is_assoc_atomic_block_global(latom, jatom) >= 1)) CYCLE
2067 :
2068 123244 : IF (do_p_screening) THEN
2069 6243 : SELECT CASE (eval_type)
2070 : CASE (hfx_do_eval_energy)
2071 6243 : swap_id = 0
2072 6243 : kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
2073 6243 : IF (ikind >= kkind) THEN
2074 : ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2075 : map_atom_to_kind_atom(katom), &
2076 5629 : map_atom_to_kind_atom(iatom))
2077 : ELSE
2078 : ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2079 : map_atom_to_kind_atom(iatom), &
2080 614 : map_atom_to_kind_atom(katom))
2081 614 : swap_id = swap_id + 1
2082 : END IF
2083 6243 : kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
2084 6243 : IF (jkind >= lkind) THEN
2085 : ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2086 : map_atom_to_kind_atom(latom), &
2087 5879 : map_atom_to_kind_atom(jatom))
2088 : ELSE
2089 : ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2090 : map_atom_to_kind_atom(jatom), &
2091 364 : map_atom_to_kind_atom(latom))
2092 364 : swap_id = swap_id + 2
2093 : END IF
2094 6243 : kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
2095 6243 : IF (ikind >= lkind) THEN
2096 : ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2097 : map_atom_to_kind_atom(latom), &
2098 3489 : map_atom_to_kind_atom(iatom))
2099 : ELSE
2100 : ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2101 : map_atom_to_kind_atom(iatom), &
2102 2754 : map_atom_to_kind_atom(latom))
2103 2754 : swap_id = swap_id + 4
2104 : END IF
2105 6243 : kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
2106 6243 : IF (jkind >= kkind) THEN
2107 : ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2108 : map_atom_to_kind_atom(katom), &
2109 6243 : map_atom_to_kind_atom(jatom))
2110 : ELSE
2111 : ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2112 : map_atom_to_kind_atom(jatom), &
2113 0 : map_atom_to_kind_atom(katom))
2114 0 : swap_id = swap_id + 8
2115 : END IF
2116 : CASE (hfx_do_eval_forces)
2117 9816 : swap_id = 16
2118 9816 : kind_kind_idx = INT(get_1D_idx(kkind, ikind, INT(nkind, int_8)))
2119 9816 : IF (ikind >= kkind) THEN
2120 : ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2121 : map_atom_to_kind_atom(katom), &
2122 9282 : map_atom_to_kind_atom(iatom))
2123 : ELSE
2124 : ptr_p_1 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2125 : map_atom_to_kind_atom(iatom), &
2126 534 : map_atom_to_kind_atom(katom))
2127 534 : swap_id = swap_id + 1
2128 : END IF
2129 9816 : kind_kind_idx = INT(get_1D_idx(lkind, jkind, INT(nkind, int_8)))
2130 9816 : IF (jkind >= lkind) THEN
2131 : ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2132 : map_atom_to_kind_atom(latom), &
2133 9816 : map_atom_to_kind_atom(jatom))
2134 : ELSE
2135 : ptr_p_2 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2136 : map_atom_to_kind_atom(jatom), &
2137 0 : map_atom_to_kind_atom(latom))
2138 0 : swap_id = swap_id + 2
2139 : END IF
2140 9816 : kind_kind_idx = INT(get_1D_idx(lkind, ikind, INT(nkind, int_8)))
2141 9816 : IF (ikind >= lkind) THEN
2142 : ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2143 : map_atom_to_kind_atom(latom), &
2144 8066 : map_atom_to_kind_atom(iatom))
2145 : ELSE
2146 : ptr_p_3 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2147 : map_atom_to_kind_atom(iatom), &
2148 1750 : map_atom_to_kind_atom(latom))
2149 1750 : swap_id = swap_id + 4
2150 : END IF
2151 9816 : kind_kind_idx = INT(get_1D_idx(kkind, jkind, INT(nkind, int_8)))
2152 25875 : IF (jkind >= kkind) THEN
2153 : ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2154 : map_atom_to_kind_atom(katom), &
2155 9816 : map_atom_to_kind_atom(jatom))
2156 : ELSE
2157 : ptr_p_4 => pmax_set(kind_kind_idx)%p_kind(:, :, &
2158 : map_atom_to_kind_atom(jatom), &
2159 0 : map_atom_to_kind_atom(katom))
2160 0 : swap_id = swap_id + 8
2161 : END IF
2162 : END SELECT
2163 : END IF
2164 :
2165 1084490 : DO i_set_list_ij = i_set_list_ij_start, i_set_list_ij_stop
2166 687848 : iset = set_list_ij(i_set_list_ij)%pair(1)
2167 687848 : jset = set_list_ij(i_set_list_ij)%pair(2)
2168 :
2169 : max_val1 = coeffs_set(jset, iset, jkind, ikind)%x(1)*rab2 + &
2170 687848 : coeffs_set(jset, iset, jkind, ikind)%x(2)
2171 :
2172 687848 : IF (max_val1 + screen_kind_kl + actual_pmax_atom < log10_eps_schwarz) CYCLE
2173 9961066 : DO i_set_list_kl = i_set_list_kl_start, i_set_list_kl_stop
2174 9034494 : kset = set_list_kl(i_set_list_kl)%pair(1)
2175 9034494 : lset = set_list_kl(i_set_list_kl)%pair(2)
2176 :
2177 : max_val2 = max_val1 + (coeffs_set(lset, kset, lkind, kkind)%x(1)*rcd2 + &
2178 9034494 : coeffs_set(lset, kset, lkind, kkind)%x(2))
2179 :
2180 9034494 : IF (max_val2 + actual_pmax_atom < log10_eps_schwarz) CYCLE
2181 8029702 : IF (do_p_screening) THEN
2182 : CALL get_pmax_val(ptr_p_1, ptr_p_2, ptr_p_3, ptr_p_4, &
2183 : iset, jset, kset, lset, &
2184 986464 : pmax_entry, swap_id)
2185 986464 : IF (eval_type == hfx_do_eval_forces) THEN
2186 577972 : pmax_entry = log_2 + pmax_entry
2187 : END IF
2188 : ELSE
2189 7043238 : pmax_entry = 0.0_dp
2190 : END IF
2191 8029702 : max_val2 = max_val2 + pmax_entry
2192 8029702 : IF (max_val2 < log10_eps_schwarz) CYCLE
2193 687848 : SELECT CASE (eval_type)
2194 : CASE (hfx_do_eval_energy)
2195 : cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2196 : npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2197 : max_val2/log10_eps_schwarz, &
2198 6933772 : p1_energy, p2_energy, p3_energy)
2199 6933772 : estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
2200 : CASE (hfx_do_eval_forces)
2201 : cost_tmp = cost_model(nsgfa(iset), nsgfb(jset), nsgfc(kset), nsgfd(lset), &
2202 : npgfa(iset), npgfb(jset), npgfc(kset), npgfd(lset), &
2203 : max_val2/log10_eps_schwarz, &
2204 914362 : p1_forces, p2_forces, p3_forces)
2205 8762496 : estimate_block_cost = estimate_block_cost + INT(cost_tmp, KIND=int_8)
2206 : END SELECT
2207 : END DO ! i_set_list_kl
2208 : END DO ! i_set_list_ij
2209 : END DO ! i_list_kl
2210 : END DO ! i_list_ij
2211 :
2212 303684 : END FUNCTION estimate_block_cost
2213 :
2214 : ! **************************************************************************************************
2215 : !> \brief ...
2216 : !> \param nkind ...
2217 : !> \param para_env ...
2218 : !> \param natom ...
2219 : !> \param block_size ...
2220 : !> \param nblock ...
2221 : !> \param blocks ...
2222 : !> \param list_ij ...
2223 : !> \param list_kl ...
2224 : !> \param set_list_ij ...
2225 : !> \param set_list_kl ...
2226 : !> \param particle_set ...
2227 : !> \param coeffs_set ...
2228 : !> \param coeffs_kind ...
2229 : !> \param is_assoc_atomic_block_global ...
2230 : !> \param do_periodic ...
2231 : !> \param kind_of ...
2232 : !> \param basis_parameter ...
2233 : !> \param pmax_set ...
2234 : !> \param pmax_atom ...
2235 : !> \param pmax_blocks ...
2236 : !> \param cell ...
2237 : !> \param do_p_screening ...
2238 : !> \param map_atom_to_kind_atom ...
2239 : !> \param eval_type ...
2240 : !> \param log10_eps_schwarz ...
2241 : !> \param log_2 ...
2242 : !> \param coeffs_kind_max0 ...
2243 : !> \param use_virial ...
2244 : !> \param atomic_pair_list ...
2245 : ! **************************************************************************************************
2246 1218 : SUBROUTINE init_blocks(nkind, para_env, natom, block_size, nblock, blocks, &
2247 1218 : list_ij, list_kl, set_list_ij, set_list_kl, &
2248 : particle_set, &
2249 : coeffs_set, coeffs_kind, &
2250 1218 : is_assoc_atomic_block_global, do_periodic, &
2251 : kind_of, basis_parameter, pmax_set, pmax_atom, &
2252 : pmax_blocks, cell, &
2253 : do_p_screening, map_atom_to_kind_atom, eval_type, &
2254 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, &
2255 1218 : atomic_pair_list)
2256 :
2257 : INTEGER, INTENT(IN) :: nkind
2258 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
2259 : INTEGER :: natom, block_size, nblock
2260 : TYPE(hfx_block_range_type), DIMENSION(1:nblock) :: blocks
2261 : TYPE(pair_list_type) :: list_ij, list_kl
2262 : TYPE(pair_set_list_type), DIMENSION(:) :: set_list_ij, set_list_kl
2263 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
2264 : TYPE(hfx_screen_coeff_type), &
2265 : DIMENSION(:, :, :, :), POINTER :: coeffs_set
2266 : TYPE(hfx_screen_coeff_type), DIMENSION(:, :), &
2267 : POINTER :: coeffs_kind
2268 : INTEGER, DIMENSION(:, :) :: is_assoc_atomic_block_global
2269 : LOGICAL :: do_periodic
2270 : INTEGER :: kind_of(*)
2271 : TYPE(hfx_basis_type), DIMENSION(:), POINTER :: basis_parameter
2272 : TYPE(hfx_p_kind), DIMENSION(:), POINTER :: pmax_set
2273 : REAL(dp), DIMENSION(:, :), POINTER :: pmax_atom
2274 : REAL(dp) :: pmax_blocks
2275 : TYPE(cell_type), POINTER :: cell
2276 : LOGICAL, INTENT(IN) :: do_p_screening
2277 : INTEGER, DIMENSION(:), POINTER :: map_atom_to_kind_atom
2278 : INTEGER, INTENT(IN) :: eval_type
2279 : REAL(dp) :: log10_eps_schwarz, log_2, &
2280 : coeffs_kind_max0
2281 : LOGICAL, INTENT(IN) :: use_virial
2282 : LOGICAL, DIMENSION(natom, natom) :: atomic_pair_list
2283 :
2284 : INTEGER :: atom_block, i, iatom_block, iatom_end, &
2285 : iatom_start, my_cpu_rank, ncpus
2286 :
2287 5056 : DO atom_block = 0, nblock - 1
2288 3838 : iatom_block = MODULO(atom_block, nblock) + 1
2289 3838 : iatom_start = (iatom_block - 1)*block_size + 1
2290 3838 : iatom_end = MIN(iatom_block*block_size, natom)
2291 3838 : blocks(atom_block + 1)%istart = iatom_start
2292 3838 : blocks(atom_block + 1)%iend = iatom_end
2293 5056 : blocks(atom_block + 1)%cost = 0_int_8
2294 : END DO
2295 :
2296 1218 : ncpus = para_env%num_pe
2297 1218 : my_cpu_rank = para_env%mepos
2298 5056 : DO i = 1, nblock
2299 3838 : IF (MODULO(i, ncpus) /= my_cpu_rank) THEN
2300 1916 : blocks(i)%istart = 0
2301 1916 : blocks(i)%iend = 0
2302 1916 : CYCLE
2303 : END IF
2304 1922 : iatom_start = blocks(i)%istart
2305 1922 : iatom_end = blocks(i)%iend
2306 : blocks(i)%cost = estimate_block_cost(natom, nkind, list_ij, list_kl, set_list_ij, set_list_kl, &
2307 : iatom_start, iatom_end, iatom_start, iatom_end, &
2308 : iatom_start, iatom_end, iatom_start, iatom_end, &
2309 : particle_set, &
2310 : coeffs_set, coeffs_kind, &
2311 : is_assoc_atomic_block_global, do_periodic, &
2312 : kind_of, basis_parameter, pmax_set, pmax_atom, pmax_blocks, &
2313 : cell, &
2314 : do_p_screening, map_atom_to_kind_atom, eval_type, &
2315 3140 : log10_eps_schwarz, log_2, coeffs_kind_max0, use_virial, atomic_pair_list)
2316 :
2317 : END DO
2318 1218 : END SUBROUTINE init_blocks
2319 :
2320 : ! **************************************************************************************************
2321 : !> \brief ...
2322 : !> \param para_env ...
2323 : !> \param x_data ...
2324 : !> \param iw ...
2325 : !> \param n_threads ...
2326 : !> \param i_thread ...
2327 : !> \param eval_type ...
2328 : ! **************************************************************************************************
2329 0 : SUBROUTINE collect_load_balance_info(para_env, x_data, iw, n_threads, i_thread, &
2330 : eval_type)
2331 :
2332 : TYPE(mp_para_env_type), INTENT(IN) :: para_env
2333 : TYPE(hfx_type), POINTER :: x_data
2334 : INTEGER, INTENT(IN) :: iw, n_threads, i_thread, eval_type
2335 :
2336 : INTEGER :: i, j, k, my_rank, nbins, nranks, &
2337 : total_bins
2338 : INTEGER(int_8) :: avg_bin, avg_rank, max_bin, max_rank, &
2339 : min_bin, min_rank, sum_bin, sum_rank
2340 0 : INTEGER(int_8), ALLOCATABLE, DIMENSION(:) :: buffer, buffer_in, buffer_out, summary
2341 : INTEGER(int_8), ALLOCATABLE, DIMENSION(:), SAVE :: shm_cost_vector
2342 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: bins_per_rank, rdispl, sort_idx
2343 : INTEGER, ALLOCATABLE, DIMENSION(:), SAVE :: shm_bins_per_rank, shm_displ
2344 :
2345 0 : SELECT CASE (eval_type)
2346 : CASE (hfx_do_eval_energy)
2347 0 : nbins = SIZE(x_data%distribution_energy)
2348 : CASE (hfx_do_eval_forces)
2349 0 : nbins = SIZE(x_data%distribution_forces)
2350 : END SELECT
2351 :
2352 0 : !$OMP MASTER
2353 0 : ALLOCATE (shm_bins_per_rank(n_threads))
2354 0 : ALLOCATE (shm_displ(n_threads + 1))
2355 : !$OMP END MASTER
2356 0 : !$OMP BARRIER
2357 :
2358 0 : shm_bins_per_rank(i_thread + 1) = nbins
2359 0 : !$OMP BARRIER
2360 0 : nbins = 0
2361 0 : DO i = 1, n_threads
2362 0 : nbins = nbins + shm_bins_per_rank(i)
2363 : END DO
2364 0 : my_rank = para_env%mepos
2365 0 : nranks = para_env%num_pe
2366 :
2367 0 : !$OMP BARRIER
2368 0 : !$OMP MASTER
2369 0 : ALLOCATE (bins_per_rank(nranks))
2370 0 : bins_per_rank = 0
2371 :
2372 0 : bins_per_rank(my_rank + 1) = nbins
2373 :
2374 0 : CALL para_env%sum(bins_per_rank)
2375 :
2376 0 : total_bins = 0
2377 0 : DO i = 1, nranks
2378 0 : total_bins = total_bins + bins_per_rank(i)
2379 : END DO
2380 :
2381 0 : ALLOCATE (shm_cost_vector(2*total_bins))
2382 0 : shm_cost_vector = -1_int_8
2383 0 : shm_displ(1) = 1
2384 0 : DO i = 2, n_threads
2385 0 : shm_displ(i) = shm_displ(i - 1) + shm_bins_per_rank(i - 1)
2386 : END DO
2387 0 : shm_displ(n_threads + 1) = nbins + 1
2388 : !$OMP END MASTER
2389 0 : !$OMP BARRIER
2390 0 : j = 0
2391 0 : SELECT CASE (eval_type)
2392 : CASE (hfx_do_eval_energy)
2393 0 : DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2394 0 : j = j + 1
2395 0 : shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_energy(j)%cost
2396 0 : shm_cost_vector(2*i) = INT(x_data%distribution_energy(j)%time_first_scf*10000.0_dp, KIND=int_8)
2397 : END DO
2398 : CASE (hfx_do_eval_forces)
2399 0 : DO i = shm_displ(i_thread + 1), shm_displ(i_thread + 2) - 1
2400 0 : j = j + 1
2401 0 : shm_cost_vector(2*(i - 1) + 1) = x_data%distribution_forces(j)%cost
2402 0 : shm_cost_vector(2*i) = INT(x_data%distribution_forces(j)%time_forces*10000.0_dp, KIND=int_8)
2403 : END DO
2404 : END SELECT
2405 0 : !$OMP BARRIER
2406 0 : !$OMP MASTER
2407 : ! ** calculate offsets
2408 0 : ALLOCATE (rdispl(nranks))
2409 0 : bins_per_rank(:) = bins_per_rank(:)*2
2410 0 : rdispl(1) = 0
2411 0 : DO i = 2, nranks
2412 0 : rdispl(i) = rdispl(i - 1) + bins_per_rank(i - 1)
2413 : END DO
2414 :
2415 0 : ALLOCATE (buffer_in(2*nbins))
2416 0 : ALLOCATE (buffer_out(2*total_bins))
2417 :
2418 0 : DO i = 1, nbins
2419 0 : buffer_in(2*(i - 1) + 1) = shm_cost_vector(2*(i - 1) + 1)
2420 0 : buffer_in(2*i) = shm_cost_vector(2*i)
2421 : END DO
2422 :
2423 0 : CALL para_env%gatherv(buffer_in, buffer_out, bins_per_rank, rdispl)
2424 :
2425 0 : IF (iw > 0) THEN
2426 :
2427 0 : ALLOCATE (summary(2*nranks))
2428 0 : summary = 0_int_8
2429 :
2430 0 : WRITE (iw, '( /, 1X, 79("-") )')
2431 0 : WRITE (iw, '( " -", 77X, "-" )')
2432 0 : SELECT CASE (eval_type)
2433 : CASE (hfx_do_eval_energy)
2434 0 : WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - ENERGY '
2435 : CASE (hfx_do_eval_forces)
2436 0 : WRITE (iw, '( " -", 20X, A, 19X, "-" )') ' HFX LOAD BALANCE INFORMATION - FORCES '
2437 : END SELECT
2438 0 : WRITE (iw, '( " -", 77X, "-" )')
2439 0 : WRITE (iw, '( 1X, 79("-") )')
2440 :
2441 0 : WRITE (iw, FMT="(T3,A,T15,A,T35,A,T55,A)") "MPI RANK", "BIN #", "EST cost", "Processing time [s]"
2442 0 : WRITE (iw, '( 1X, 79("-"), / )')
2443 0 : k = 0
2444 0 : DO i = 1, nranks
2445 0 : DO j = 1, bins_per_rank(i)/2
2446 0 : k = k + 1
2447 : WRITE (iw, FMT="(T6,I5,T15,I5,T27,I16,T55,F19.8)") &
2448 0 : i - 1, j, buffer_out(2*(k - 1) + 1), REAL(buffer_out(2*k), dp)/10000.0_dp
2449 0 : summary(2*(i - 1) + 1) = summary(2*(i - 1) + 1) + buffer_out(2*(k - 1) + 1)
2450 0 : summary(2*i) = summary(2*i) + buffer_out(2*k)
2451 : END DO
2452 : END DO
2453 :
2454 : !** Summary
2455 : max_bin = 0_int_8
2456 : min_bin = HUGE(min_bin)
2457 : sum_bin = 0_int_8
2458 0 : DO i = 1, total_bins
2459 0 : sum_bin = sum_bin + buffer_out(2*i)
2460 0 : max_bin = MAX(max_bin, buffer_out(2*i))
2461 0 : min_bin = MIN(min_bin, buffer_out(2*i))
2462 : END DO
2463 0 : avg_bin = sum_bin/total_bins
2464 :
2465 0 : max_rank = 0_int_8
2466 0 : min_rank = HUGE(min_rank)
2467 0 : sum_rank = 0_int_8
2468 0 : DO i = 1, nranks
2469 0 : sum_rank = sum_rank + summary(2*i)
2470 0 : max_rank = MAX(max_rank, summary(2*i))
2471 0 : min_rank = MIN(min_rank, summary(2*i))
2472 : END DO
2473 0 : avg_rank = sum_rank/nranks
2474 :
2475 0 : WRITE (iw, FMT='(/,T3,A,/)') "SUMMARY:"
2476 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max bin", REAL(max_bin, dp)/10000.0_dp
2477 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min bin", REAL(min_bin, dp)/10000.0_dp
2478 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum bin", REAL(sum_bin, dp)/10000.0_dp
2479 0 : WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg bin", REAL(avg_bin, dp)/10000.0_dp
2480 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Max rank", REAL(max_rank, dp)/10000.0_dp
2481 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Min rank", REAL(min_rank, dp)/10000.0_dp
2482 0 : WRITE (iw, FMT="(T3,A,T35,F19.8)") "Sum rank", REAL(sum_rank, dp)/10000.0_dp
2483 0 : WRITE (iw, FMT="(T3,A,T35,F19.8,/)") "Avg rank", REAL(avg_rank, dp)/10000.0_dp
2484 :
2485 0 : ALLOCATE (buffer(nranks))
2486 0 : ALLOCATE (sort_idx(nranks))
2487 :
2488 0 : DO i = 1, nranks
2489 0 : buffer(i) = summary(2*i)
2490 : END DO
2491 :
2492 0 : CALL sort(buffer, nranks, sort_idx)
2493 :
2494 0 : WRITE (iw, FMT="(T3,A,T35,A,T55,A,/)") "MPI RANK", "EST cost", "Processing time [s]"
2495 0 : DO i = nranks, 1, -1
2496 0 : WRITE (iw, FMT="(T6,I5,T27,I16,T55,F19.8)") sort_idx(i) - 1, summary(2*(sort_idx(i) - 1) + 1), REAL(buffer(i), dp)/10000.0_dp
2497 : END DO
2498 :
2499 0 : DEALLOCATE (summary, buffer, sort_idx)
2500 :
2501 : END IF
2502 :
2503 0 : DEALLOCATE (buffer_in, buffer_out, rdispl)
2504 :
2505 0 : CALL para_env%sync()
2506 :
2507 0 : DEALLOCATE (shm_bins_per_rank, shm_displ, shm_cost_vector)
2508 : !$OMP END MASTER
2509 0 : !$OMP BARRIER
2510 :
2511 0 : END SUBROUTINE collect_load_balance_info
2512 :
2513 : #:include 'hfx_get_pmax_val.fypp'
2514 :
2515 : END MODULE hfx_load_balance_methods
|