Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Auxiliary tools to redistribute cp_fm_type matrices before and after diagonalization.
10 : !> Heuristics are used to determine the optimal number of CPUs for diagonalization and the
11 : !> input matrices are redistributed if necessary
12 : !> \par History
13 : !> - [01.2018] moved redistribution related code from cp_fm_syevd here
14 : !> \author Nico Holmberg [01.2018]
15 : ! **************************************************************************************************
16 : MODULE cp_fm_diag_utils
17 : USE cp_blacs_env, ONLY: cp_blacs_env_create,&
18 : cp_blacs_env_release,&
19 : cp_blacs_env_type
20 : USE cp_fm_struct, ONLY: cp_fm_struct_create,&
21 : cp_fm_struct_release,&
22 : cp_fm_struct_type
23 : USE cp_fm_types, ONLY: cp_fm_create,&
24 : cp_fm_get_info,&
25 : cp_fm_release,&
26 : cp_fm_type
27 : USE cp_log_handling, ONLY: cp_get_default_logger,&
28 : cp_logger_get_default_io_unit,&
29 : cp_logger_type
30 : USE kinds, ONLY: dp
31 : USE mathlib, ONLY: gcd
32 : USE message_passing, ONLY: mp_para_env_type
33 : #include "../base/base_uses.f90"
34 :
35 : IMPLICIT NONE
36 :
37 : PRIVATE
38 :
39 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'cp_fm_diag_utils'
40 :
41 : ! Information on redistribution
42 : TYPE, PUBLIC :: cp_fm_redistribute_info
43 : INTEGER :: matrix_order = -1
44 : INTEGER :: num_pe_old = -1 ! number of processes before a potential redistribute
45 : INTEGER :: num_pe_new = -1 ! number of processes after a potential redistribute
46 : INTEGER :: num_pe_opt = -1 ! optimal number of processes for the given matrix
47 : INTEGER :: num_pe_max_nz_col = -1 ! the maximal number of processes s.t. no column has zero width, may be < 0 if ignored
48 : LOGICAL :: redistribute = .FALSE. ! whether or not the matrix was actually redistributed
49 : CONTAINS
50 : PROCEDURE, PASS(self) :: write => cp_fm_redistribute_info_write
51 : END TYPE
52 :
53 : ! Container for redistribution settings and temporary work structs
54 : TYPE cp_fm_redistribute_type
55 : ! Settings
56 : INTEGER :: a = -1, x = -1
57 : LOGICAL :: should_print = .FALSE.
58 : LOGICAL :: elpa_force_redistribute = .FALSE.
59 : ! Temporaries
60 : INTEGER, DIMENSION(:), POINTER :: group_distribution => NULL(), &
61 : group_partition => NULL()
62 : TYPE(cp_blacs_env_type), POINTER :: blacs_env_new => NULL()
63 : TYPE(mp_para_env_type), POINTER :: para_env_new => NULL()
64 : END TYPE cp_fm_redistribute_type
65 :
66 : ! Permanent instance of the redistribute type
67 : TYPE(cp_fm_redistribute_type), PRIVATE, &
68 : SAVE :: work_redistribute
69 :
70 : ! Public subroutines
71 :
72 : PUBLIC :: cp_fm_redistribute_start, &
73 : cp_fm_redistribute_end, &
74 : cp_fm_redistribute_init
75 :
76 : CONTAINS
77 :
78 : ! **************************************************************************************************
79 : !> \brief Write the redistribute info nicely formatted to the given I/O unit
80 : !> \param self reference to the cp_fm_redistribute_info instance
81 : !> \param io_unit I/O unit to use for writing
82 : ! **************************************************************************************************
83 0 : SUBROUTINE cp_fm_redistribute_info_write(self, io_unit)
84 : CLASS(cp_fm_redistribute_info), INTENT(IN) :: self
85 : INTEGER, INTENT(IN) :: io_unit
86 :
87 0 : WRITE (UNIT=io_unit, FMT="(A)") ""
88 : WRITE (UNIT=io_unit, FMT="(T2,A,T71,I10)") &
89 0 : "CP_FM_DIAG| Number of processes over which the matrix is distributed ", self%num_pe_old, &
90 0 : "CP_FM_DIAG| Matrix order ", self%matrix_order
91 : WRITE (UNIT=io_unit, FMT="(T2,A,T71,I10)") &
92 0 : "CP_FM_DIAG| Optimal number of CPUs ", self%num_pe_opt
93 0 : IF (self%num_pe_max_nz_col < 0) THEN
94 : WRITE (UNIT=io_unit, FMT="(T2,A,T71,A10)") &
95 0 : "CP_FM_DIAG| Maximum number of CPUs (with non-zero columns) ", "<N/A>"
96 : ELSE
97 : WRITE (UNIT=io_unit, FMT="(T2,A,T71,I10)") &
98 0 : "CP_FM_DIAG| Maximum number of CPUs (with non-zero columns): ", self%num_pe_max_nz_col
99 : END IF
100 0 : IF (self%redistribute) THEN
101 : WRITE (UNIT=io_unit, FMT="(T2,A,T71,I10)") &
102 0 : "CP_FM_DIAG| Number of processes for the redistribution ", self%num_pe_new
103 : ELSE
104 : WRITE (UNIT=io_unit, FMT="(T2,A)") &
105 0 : "CP_FM_DIAG| The matrix will NOT be redistributed"
106 : END IF
107 0 : WRITE (UNIT=io_unit, FMT="(A)") ""
108 :
109 0 : END SUBROUTINE cp_fm_redistribute_info_write
110 :
111 : ! **************************************************************************************************
112 : !> \brief Releases the temporary storage needed when redistributing arrays
113 : !> \param has_redistributed flag that determines if the processors holds a part of the
114 : !> redistributed array
115 : !> \author Nico Holmberg [01.2018]
116 : ! **************************************************************************************************
117 157471 : SUBROUTINE cp_fm_redistribute_work_finalize(has_redistributed)
118 : LOGICAL, INTENT(IN) :: has_redistributed
119 :
120 157471 : IF (ASSOCIATED(work_redistribute%group_distribution)) THEN
121 157471 : IF (has_redistributed) THEN
122 80665 : CALL cp_blacs_env_release(work_redistribute%blacs_env_new)
123 : END IF
124 157471 : CALL work_redistribute%para_env_new%free()
125 157471 : DEALLOCATE (work_redistribute%para_env_new)
126 157471 : DEALLOCATE (work_redistribute%group_distribution)
127 157471 : DEALLOCATE (work_redistribute%group_partition)
128 : END IF
129 : ! Return work to its initial state
130 157471 : work_redistribute = cp_fm_redistribute_type()
131 :
132 157471 : END SUBROUTINE cp_fm_redistribute_work_finalize
133 :
134 : ! **************************************************************************************************
135 : !> \brief Initializes the parameters that determine how to calculate the optimal number of CPUs
136 : !> for diagonalizing a matrix. The parameters are read from the GLOBAL input section.
137 : !> \param a integer parameter used to define the rule for determining the optimal
138 : !> number of CPUs for diagonalization
139 : !> \param x integer parameter used to define the rule for determining the optimal
140 : !> number of CPUs for diagonalization
141 : !> \param should_print flag that determines if information about the redistribution process
142 : !> should be printed
143 : !> \param elpa_force_redistribute flag that if redistribution should always be performed when
144 : !> the ELPA diagonalization library is in use
145 : !> \author Nico Holmberg [01.2018]
146 : ! **************************************************************************************************
147 9127 : SUBROUTINE cp_fm_redistribute_init(a, x, should_print, elpa_force_redistribute)
148 : INTEGER, INTENT(IN) :: a, x
149 : LOGICAL, INTENT(IN) :: should_print, elpa_force_redistribute
150 :
151 : work_redistribute%a = a
152 : work_redistribute%x = x
153 : work_redistribute%should_print = should_print
154 : work_redistribute%elpa_force_redistribute = elpa_force_redistribute
155 : ! Init work
156 9127 : work_redistribute = cp_fm_redistribute_type()
157 :
158 9127 : END SUBROUTINE cp_fm_redistribute_init
159 :
160 : ! **************************************************************************************************
161 : !> \brief Calculates the optimal number of CPUs for diagonalizing a matrix.
162 : !> \param size the size of the diagonalized matrix
163 : !> \return the optimal number of CPUs
164 : !> \author Nico Holmberg [01.2018]
165 : ! **************************************************************************************************
166 165319 : PURE FUNCTION cp_fm_diag_get_optimal_ncpu(size) RESULT(ncpu)
167 : INTEGER, INTENT(IN) :: size
168 : INTEGER :: ncpu
169 :
170 : ncpu = ((size + work_redistribute%a*work_redistribute%x - 1)/ &
171 165319 : (work_redistribute%a*work_redistribute%x))*work_redistribute%a
172 :
173 165319 : END FUNCTION cp_fm_diag_get_optimal_ncpu
174 :
175 : #if defined(__parallel)
176 : ! **************************************************************************************************
177 : !> \brief Determines the largest number of CPUs a matrix can be distributed on without any of the
178 : !> processors getting a zero-width column (currently only needed for ELPA).
179 : !> \param matrix the matrix that will be diagonalized
180 : !> \return the maximum number of CPUs for ELPA
181 : !> \author Nico Holmberg [01.2018]
182 : ! **************************************************************************************************
183 59878 : FUNCTION cp_fm_max_ncpu_non_zero_column(matrix) RESULT(ncpu)
184 : TYPE(cp_fm_type), INTENT(IN) :: matrix
185 : INTEGER :: ncpu
186 :
187 : INTEGER :: gcd_max, ipe, jpe, ncol_block, &
188 : ncol_global, npcol, nrow_block, &
189 : nrow_global, num_pe_old, nzero
190 59878 : INTEGER, DIMENSION(:), POINTER :: ncol_locals
191 : INTEGER, EXTERNAL :: numroc
192 :
193 59878 : NULLIFY (ncol_locals)
194 : ! First check if there are any zero width columns in current layout
195 : CALL cp_fm_get_info(matrix, ncol_locals=ncol_locals, &
196 : nrow_global=nrow_global, ncol_global=ncol_global, &
197 59878 : nrow_block=nrow_block, ncol_block=ncol_block)
198 119756 : nzero = COUNT(ncol_locals == 0)
199 59878 : num_pe_old = matrix%matrix_struct%para_env%num_pe
200 59878 : ncpu = num_pe_old - nzero
201 :
202 : ! Avoid layouts with odd number of CPUs (blacs grid layout will be square)
203 59878 : IF (ncpu > 2) &
204 0 : ncpu = ncpu - MODULO(ncpu, 2)
205 :
206 : ! if there are no zero-width columns and the number of processors was even, leave it at that
207 59878 : IF (ncpu == num_pe_old) &
208 : RETURN
209 :
210 : ! Iteratively search for the maximum number of CPUs for ELPA
211 : ! On each step, we test whether the blacs grid created with ncpu processes
212 : ! contains any columns with zero width
213 0 : DO WHILE (ncpu > 1)
214 : ! Determine layout of new blacs grid with ncpu CPUs
215 : ! (snippet copied from cp_blacs_env.F:cp_blacs_env_create)
216 0 : gcd_max = -1
217 0 : DO ipe = 1, CEILING(SQRT(REAL(ncpu, dp)))
218 0 : jpe = ncpu/ipe
219 0 : IF (ipe*jpe .NE. ncpu) &
220 : CYCLE
221 0 : IF (gcd(ipe, jpe) >= gcd_max) THEN
222 0 : npcol = jpe
223 0 : gcd_max = gcd(ipe, jpe)
224 : END IF
225 : END DO
226 :
227 : ! Count the number of processors without any columns
228 : ! (snippet copied from cp_fm_struct.F:cp_fm_struct_create)
229 0 : nzero = 0
230 0 : DO ipe = 0, npcol - 1
231 0 : IF (numroc(ncol_global, ncol_block, ipe, 0, npcol) == 0) &
232 0 : nzero = nzero + 1
233 : END DO
234 :
235 0 : IF (nzero == 0) &
236 : EXIT
237 :
238 0 : ncpu = ncpu - nzero
239 :
240 0 : IF (ncpu > 2) &
241 0 : ncpu = ncpu - MODULO(ncpu, 2)
242 : END DO
243 :
244 59878 : END FUNCTION cp_fm_max_ncpu_non_zero_column
245 : #endif
246 :
247 : ! **************************************************************************************************
248 : !> \brief Determines the optimal number of CPUs for matrix diagonalization and redistributes
249 : !> the input matrices if necessary
250 : !> \param matrix the input cp_fm_type matrix to be diagonalized
251 : !> \param eigenvectors the cp_fm_type matrix that will hold the eigenvectors of the input matrix
252 : !> \param matrix_new the redistributed input matrix which will subsequently be diagonalized,
253 : !> or a pointer to the original matrix if no redistribution is required
254 : !> \param eigenvectors_new the redistributed eigenvectors matrix, or a pointer to the original
255 : !> matrix if no redistribution is required
256 : !> \param caller_is_elpa flag that determines if ELPA is used for diagonalization
257 : !> \param redist_info get info about the redistribution
258 : !> \par History
259 : !> - [01.2018] created by moving redistribution related code from cp_fm_syevd here
260 : !> \author Nico Holmberg [01.2018]
261 : ! **************************************************************************************************
262 59878 : SUBROUTINE cp_fm_redistribute_start(matrix, eigenvectors, matrix_new, eigenvectors_new, &
263 : caller_is_elpa, redist_info)
264 :
265 : TYPE(cp_fm_type), INTENT(IN) :: matrix, eigenvectors
266 : TYPE(cp_fm_type), INTENT(OUT) :: matrix_new, eigenvectors_new
267 : LOGICAL, OPTIONAL, INTENT(IN) :: caller_is_elpa
268 :
269 : CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_redistribute_start'
270 :
271 : INTEGER :: handle
272 : LOGICAL :: is_elpa
273 : TYPE(cp_fm_redistribute_info), OPTIONAL, INTENT(OUT) :: redist_info
274 :
275 : #if defined(__parallel)
276 : REAL(KIND=dp) :: fake_local_data(1, 1)
277 : INTEGER :: fake_descriptor(9), mepos_old, &
278 : io_unit, ngroups, ncol_block, blksize, nrow_block
279 : TYPE(cp_fm_struct_type), POINTER :: fm_struct_new
280 : TYPE(mp_para_env_type), POINTER :: para_env
281 : TYPE(cp_logger_type), POINTER :: logger
282 : TYPE(cp_fm_redistribute_info) :: rdinfo
283 : #endif
284 :
285 165319 : CALL timeset(routineN, handle)
286 165319 : is_elpa = .FALSE.
287 165319 : IF (PRESENT(caller_is_elpa)) THEN
288 : #if defined(__ELPA)
289 59878 : is_elpa = caller_is_elpa
290 : #else
291 : CPABORT("CP2K compiled without the ELPA library.")
292 : #endif
293 : END IF
294 :
295 : #if defined(__parallel)
296 :
297 165319 : logger => cp_get_default_logger()
298 165319 : io_unit = cp_logger_get_default_io_unit(logger)
299 :
300 : ! first figure out the optimal number of cpus
301 : ! this is pure heuristics, the defaults are based on rosa timings
302 : ! that demonstrate that timings go up sharply if too many tasks are used
303 : ! we take a multiple of 4, and approximately n/60
304 165319 : para_env => matrix%matrix_struct%para_env
305 165319 : mepos_old = para_env%mepos
306 165319 : ncol_block = -1 ! normally we also want to adjust the block size according to the optimal # of CPUs
307 165319 : nrow_block = -1
308 165319 : blksize = -1
309 :
310 165319 : rdinfo%matrix_order = matrix%matrix_struct%nrow_global
311 165319 : rdinfo%num_pe_old = para_env%num_pe
312 165319 : rdinfo%num_pe_opt = cp_fm_diag_get_optimal_ncpu(rdinfo%matrix_order)
313 165319 : rdinfo%num_pe_new = rdinfo%num_pe_opt
314 : rdinfo%num_pe_max_nz_col = -1
315 : rdinfo%redistribute = .FALSE.
316 :
317 165319 : IF (is_elpa) THEN
318 : ! with ELPA we don't have to redistribute if not necessary (scales, unlike ScaLAPACK)
319 59878 : rdinfo%num_pe_new = rdinfo%num_pe_old
320 :
321 : ! BUT: Diagonalization with ELPA fails when a processor column has zero width
322 : ! Determine the maximum number of CPUs the matrix can be distributed without zero-width columns
323 : ! for the current block size.
324 59878 : rdinfo%num_pe_max_nz_col = cp_fm_max_ncpu_non_zero_column(matrix)
325 :
326 : ! if the user wants to redistribute to the ScaLAPACK optimal number of CPUs anyway, let him if it's safe.
327 59878 : IF (work_redistribute%elpa_force_redistribute .AND. rdinfo%num_pe_opt < rdinfo%num_pe_max_nz_col) THEN
328 : ! Use heuristics to determine the need for redistribution (when num_pe_opt is smaller than the safe maximum)
329 : ! in this case we can also take the block size used for ScaLAPACK
330 0 : rdinfo%num_pe_new = rdinfo%num_pe_opt
331 59878 : ELSE IF (rdinfo%num_pe_old > rdinfo%num_pe_max_nz_col) THEN
332 : ! Otherwise, only redistribute if we have to
333 0 : rdinfo%num_pe_new = rdinfo%num_pe_max_nz_col
334 : ! do NOT let cp_fm_struct_create automatically adjust the block size because the
335 : ! calculated number of processors such that no block has 0 columns wouldn't match (see #578):
336 : ! if the automatically chosen block size is larger than the present one we would still end
337 : ! up with empty processors
338 : END IF
339 :
340 59878 : CALL cp_fm_get_info(matrix, ncol_block=ncol_block, nrow_block=nrow_block)
341 :
342 : ! On GPUs, ELPA requires the block size to be a power of 2
343 59878 : blksize = 1
344 274403 : DO WHILE (2*blksize <= MIN(nrow_block, ncol_block))
345 59878 : blksize = blksize*2
346 : END DO
347 59878 : nrow_block = blksize
348 59878 : ncol_block = blksize
349 : END IF
350 :
351 : ! finally, only redistribute if we're going to use less CPUs than before or changed the block size
352 : rdinfo%redistribute = (rdinfo%num_pe_old > rdinfo%num_pe_new) .OR. (blksize >= 0 .AND. &
353 165319 : ((blksize /= matrix%matrix_struct%ncol_block) .OR. (blksize /= matrix%matrix_struct%nrow_block)))
354 :
355 165319 : IF (work_redistribute%should_print .AND. io_unit > 0) THEN
356 0 : IF (is_elpa) THEN
357 0 : IF (work_redistribute%elpa_force_redistribute) THEN
358 : WRITE (UNIT=io_unit, FMT="(T2,A,T78,A3)") &
359 0 : "CP_FM_DIAG| Force redistribute (ELPA):", "YES"
360 : ELSE
361 : WRITE (UNIT=io_unit, FMT="(T2,A,T79,A2)") &
362 0 : "CP_FM_DIAG| Force redistribute (ELPA):", "NO"
363 : END IF
364 : END IF
365 0 : CALL rdinfo%write(io_unit)
366 : END IF
367 165319 : CALL para_env%sync()
368 :
369 : ! if the optimal is smaller than num_pe, we will redistribute the input matrix
370 165319 : IF (rdinfo%redistribute) THEN
371 : ! split comm, the first num_pe_new tasks will do the work
372 472413 : ALLOCATE (work_redistribute%group_distribution(0:rdinfo%num_pe_old - 1))
373 157471 : ALLOCATE (work_redistribute%group_partition(0:1))
374 472413 : work_redistribute%group_partition = (/rdinfo%num_pe_new, rdinfo%num_pe_old - rdinfo%num_pe_new/)
375 157471 : ALLOCATE (work_redistribute%para_env_new)
376 : CALL work_redistribute%para_env_new%from_split( &
377 : comm=para_env, ngroups=ngroups, group_distribution=work_redistribute%group_distribution, &
378 157471 : n_subgroups=2, group_partition=work_redistribute%group_partition)
379 :
380 157471 : IF (work_redistribute%group_distribution(mepos_old) == 0) THEN
381 :
382 : ! create blacs, should inherit the preferences for the layout and so on, from the higher level
383 80665 : NULLIFY (work_redistribute%blacs_env_new)
384 80665 : CALL cp_blacs_env_create(blacs_env=work_redistribute%blacs_env_new, para_env=work_redistribute%para_env_new)
385 :
386 : ! create new matrix
387 80665 : NULLIFY (fm_struct_new)
388 80665 : IF (nrow_block == -1 .OR. ncol_block == -1) THEN
389 : CALL cp_fm_struct_create(fmstruct=fm_struct_new, &
390 : para_env=work_redistribute%para_env_new, &
391 : context=work_redistribute%blacs_env_new, &
392 : nrow_global=rdinfo%matrix_order, ncol_global=rdinfo%matrix_order, &
393 53384 : ncol_block=ncol_block, nrow_block=nrow_block)
394 : ELSE
395 : CALL cp_fm_struct_create(fmstruct=fm_struct_new, &
396 : para_env=work_redistribute%para_env_new, &
397 : context=work_redistribute%blacs_env_new, &
398 : nrow_global=rdinfo%matrix_order, ncol_global=rdinfo%matrix_order, &
399 27281 : ncol_block=ncol_block, nrow_block=nrow_block, force_block=.TRUE.)
400 : END IF
401 80665 : CALL cp_fm_create(matrix_new, matrix_struct=fm_struct_new, name="yevd_new_mat")
402 80665 : CALL cp_fm_create(eigenvectors_new, matrix_struct=fm_struct_new, name="yevd_new_vec")
403 80665 : CALL cp_fm_struct_release(fm_struct_new)
404 :
405 : ! redistribute old
406 : CALL pdgemr2d(rdinfo%matrix_order, rdinfo%matrix_order, matrix%local_data(1, 1), 1, 1, &
407 : matrix%matrix_struct%descriptor, &
408 : matrix_new%local_data(1, 1), 1, 1, matrix_new%matrix_struct%descriptor, &
409 80665 : matrix%matrix_struct%context)
410 : ELSE
411 : ! these tasks must help redistribute (they own part of the data),
412 : ! but need fake 'new' data, and their descriptor must indicate this with -1
413 : ! see also scalapack comments on pdgemr2d
414 768060 : fake_descriptor = -1
415 : CALL pdgemr2d(rdinfo%matrix_order, rdinfo%matrix_order, matrix%local_data(1, 1), 1, 1, &
416 : matrix%matrix_struct%descriptor, &
417 : fake_local_data(1, 1), 1, 1, fake_descriptor, &
418 76806 : matrix%matrix_struct%context)
419 : END IF
420 : ELSE
421 : ! No need to redistribute, just return pointers to the original arrays
422 7848 : matrix_new = matrix
423 7848 : eigenvectors_new = eigenvectors
424 : END IF
425 :
426 165319 : IF (PRESENT(redist_info)) &
427 59878 : redist_info = rdinfo
428 : #else
429 :
430 : MARK_USED(matrix)
431 : MARK_USED(eigenvectors)
432 : MARK_USED(matrix_new)
433 : MARK_USED(eigenvectors_new)
434 : MARK_USED(redist_info)
435 : CPABORT("Routine called in non-parallel case.")
436 : #endif
437 :
438 165319 : CALL timestop(handle)
439 :
440 165319 : END SUBROUTINE cp_fm_redistribute_start
441 :
442 : ! **************************************************************************************************
443 : !> \brief Redistributes eigenvectors and eigenvalues back to the original communicator group
444 : !> \param matrix the input cp_fm_type matrix to be diagonalized
445 : !> \param eigenvectors the cp_fm_type matrix that will hold the eigenvectors of the input matrix
446 : !> \param eig global array holding the eigenvalues of the input matrixmatrix
447 : !> \param matrix_new the redistributed input matrix which will subsequently be diagonalized,
448 : !> or a pointer to the original matrix if no redistribution is required
449 : !> \param eigenvectors_new the redistributed eigenvectors matrix, or a pointer to the original
450 : !> matrix if no redistribution is required
451 : !> \par History
452 : !> - [01.2018] created by moving redistribution related code from cp_fm_syevd here
453 : !> \author Nico Holmberg [01.2018]
454 : ! **************************************************************************************************
455 165319 : SUBROUTINE cp_fm_redistribute_end(matrix, eigenvectors, eig, matrix_new, eigenvectors_new)
456 :
457 : TYPE(cp_fm_type), INTENT(IN) :: matrix, eigenvectors
458 : REAL(KIND=dp), DIMENSION(:), INTENT(INOUT) :: eig
459 : TYPE(cp_fm_type), INTENT(INOUT) :: matrix_new, eigenvectors_new
460 :
461 : CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_redistribute_end'
462 :
463 : INTEGER :: handle
464 : #if defined(__parallel)
465 : REAL(KIND=dp) :: fake_local_data(1, 1)
466 : INTEGER :: fake_descriptor(9), mepos_old, n
467 : TYPE(mp_para_env_type), POINTER :: para_env
468 : #endif
469 :
470 165319 : CALL timeset(routineN, handle)
471 :
472 : #if defined(__parallel)
473 :
474 : ! Check if matrix was redistributed
475 165319 : IF (ASSOCIATED(work_redistribute%group_distribution)) THEN
476 157471 : n = matrix%matrix_struct%nrow_global
477 157471 : para_env => matrix%matrix_struct%para_env
478 157471 : mepos_old = para_env%mepos
479 :
480 157471 : IF (work_redistribute%group_distribution(mepos_old) == 0) THEN
481 : ! redistribute results on CPUs that hold the redistributed matrix
482 : CALL pdgemr2d(n, n, eigenvectors_new%local_data(1, 1), 1, 1, eigenvectors_new%matrix_struct%descriptor, &
483 : eigenvectors%local_data(1, 1), 1, 1, eigenvectors%matrix_struct%descriptor, &
484 80665 : eigenvectors%matrix_struct%context)
485 80665 : CALL cp_fm_release(matrix_new)
486 80665 : CALL cp_fm_release(eigenvectors_new)
487 : ELSE
488 : ! these tasks must help redistribute (they own part of the data),
489 : ! but need fake 'new' data, and their descriptor must indicate this with -1
490 : ! see also scalapack comments on pdgemr2d
491 768060 : fake_descriptor = -1
492 : CALL pdgemr2d(n, n, fake_local_data(1, 1), 1, 1, fake_descriptor, &
493 : eigenvectors%local_data(1, 1), 1, 1, eigenvectors%matrix_struct%descriptor, &
494 76806 : eigenvectors%matrix_struct%context)
495 : END IF
496 : ! free work
497 157471 : CALL cp_fm_redistribute_work_finalize(work_redistribute%group_distribution(mepos_old) == 0)
498 :
499 : ! finally, also the eigenvalues need to end up on the non-group member tasks
500 3622691 : CALL para_env%bcast(eig, 0)
501 : END IF
502 :
503 : #else
504 :
505 : MARK_USED(matrix)
506 : MARK_USED(eigenvectors)
507 : MARK_USED(eig)
508 : MARK_USED(matrix_new)
509 : MARK_USED(eigenvectors_new)
510 : CPABORT("Routine called in non-parallel case.")
511 : #endif
512 :
513 165319 : CALL timestop(handle)
514 :
515 165319 : END SUBROUTINE cp_fm_redistribute_end
516 :
517 0 : END MODULE cp_fm_diag_utils
|