Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Routines to convert sparse matrices between the DBCSR (distributed block compressed sparse row)
10 : !>        and SIESTA (distributed compressed sparse column) formats.
11 : !> \author Sergey Chulkov
12 : !> \author Christian Ahart
13 : !> \author Clotilde Cucinotta
14 : ! **************************************************************************************************
15 : MODULE smeagol_matrix_utils
16 : USE cell_types, ONLY: cell_type, &
17 : real_to_scaled, &
18 : scaled_to_real
19 : USE cp_dbcsr_api, ONLY: dbcsr_get_block_p, &
20 : dbcsr_get_info, &
21 : dbcsr_p_type, &
22 : dbcsr_set
23 : USE kinds, ONLY: dp, &
24 : dp_size, &
25 : int_8
26 : USE message_passing, ONLY: mp_para_env_type, &
27 : mp_request_type, &
28 : mp_waitall
29 : USE negf_matrix_utils, ONLY: get_index_by_cell
30 : #if defined(__SMEAGOL)
31 : USE parallel, ONLY: GetNodeOrbs, &
32 : GlobalToLocalOrb, &
33 : LocalToGlobalOrb, &
34 : WhichNodeOrb
35 : #endif
36 : USE particle_types, ONLY: particle_type
37 : USE qs_neighbor_list_types, ONLY: get_iterator_info, &
38 : get_neighbor_list_set_p, &
39 : neighbor_list_iterate, &
40 : neighbor_list_iterator_create, &
41 : neighbor_list_iterator_p_type, &
42 : neighbor_list_iterator_release, &
43 : neighbor_list_set_p_type
44 : USE qs_subsys_types, ONLY: qs_subsys_get, &
45 : qs_subsys_type
46 : USE util, ONLY: sort
47 : #include "./base/base_uses.f90"
48 :
49 : IMPLICIT NONE
50 : PRIVATE
51 :
52 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'smeagol_matrix_utils'
53 : LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .FALSE.
54 :
55 : INTEGER, PARAMETER, PRIVATE :: neighbor_list_iatom_index = 1
56 : INTEGER, PARAMETER, PRIVATE :: neighbor_list_jatom_index = 2
57 : INTEGER, PARAMETER, PRIVATE :: neighbor_list_dbcsr_image_index = 3
58 : INTEGER, PARAMETER, PRIVATE :: neighbor_list_siesta_image_index = 4
59 : INTEGER, PARAMETER, PRIVATE :: neighbor_list_siesta_transp_image_index = 5
60 : INTEGER, PARAMETER, PRIVATE :: neighbor_list_dim1 = neighbor_list_siesta_transp_image_index
61 :
62 : PUBLIC :: siesta_distrib_csc_struct_type
63 : PUBLIC :: siesta_struct_create, siesta_struct_release
64 : PUBLIC :: convert_dbcsr_to_distributed_siesta, convert_distributed_siesta_to_dbcsr
65 :
66 : PRIVATE :: get_negf_cell_ijk, index_in_canonical_enumeration, number_from_canonical_enumeration, pbc_0_1
67 : PRIVATE :: get_number_of_mpi_sendrecv_requests, assign_nonzero_elements_to_requests
68 :
69 : !> number of DBCSR matrix elements to receive from a given rank
70 : INTEGER, PARAMETER, PRIVATE :: nelements_dbcsr_recv = 1
71 : !> number of DBCSR matrix elements to send to a given rank
72 : INTEGER, PARAMETER, PRIVATE :: nelements_dbcsr_send = 2
73 : INTEGER, PARAMETER, PRIVATE :: nelements_dbcsr_dim2 = nelements_dbcsr_send
74 :
75 : INTEGER(kind=int_8), PARAMETER, PRIVATE :: max_mpi_packet_size_bytes = 134217728 ! 128 MiB (to limit memory usage for matrix redistribution)
76 : INTEGER(kind=int_8), PARAMETER, PRIVATE :: max_mpi_packet_size_dp = max_mpi_packet_size_bytes/INT(dp_size, kind=int_8)
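     :    ! With dp_size = 8 bytes per REAL(kind=dp) value, max_mpi_packet_size_dp corresponds to
     :    ! 134217728/8 = 16777216 (2**24) matrix elements per MPI message.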
77 :
78 :    ! a portable way to determine the upper bound for the tag value is to call
79 :    ! MPI_COMM_GET_ATTR(comm, MPI_TAG_UB, max_mpi_rank, flag, ierror).
80 :    ! The MPI specification guarantees that this upper bound is at least 32767.
81 : INTEGER, PARAMETER, PRIVATE :: max_mpi_rank = 32767
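     :    ! For illustration only (not used by this module), such a query could look like the sketch below;
     :    ! 'comm', 'tag_ub', 'flag' and 'ierror' are placeholder names:
     :    !
     :    !    USE mpi_f08, ONLY: MPI_Comm, MPI_Comm_get_attr, MPI_TAG_UB, MPI_ADDRESS_KIND
     :    !    TYPE(MPI_Comm)                 :: comm
     :    !    INTEGER(kind=MPI_ADDRESS_KIND) :: tag_ub
     :    !    LOGICAL                        :: flag
     :    !    INTEGER                        :: ierror
     :    !    CALL MPI_Comm_get_attr(comm, MPI_TAG_UB, tag_ub, flag, ierror)
     :    !    IF (.NOT. flag) tag_ub = 32767_MPI_ADDRESS_KIND ! fall back to the guaranteed minimum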
82 :
83 : ! **************************************************************************************************
84 : !> \brief Sparsity pattern of replicated SIESTA compressed sparse column (CSC) matrices
85 : ! **************************************************************************************************
86 : TYPE siesta_distrib_csc_struct_type
87 : !> gather all non-zero matrix elements on the given MPI process.
88 : !> Distribute the elements across MPI processes if gather_root < 0.
89 :       !> The matrix elements should be located on the I/O process in case of bulk transport,
90 :       !> and should be distributed in case of a SMEAGOL calculation.
91 : INTEGER :: gather_root = 0
92 :       !> Based on a full (.FALSE.) or upper-triangular (.TRUE.) DBCSR matrix.
93 :       !> It is used in CPASSERTs to allow access to lower-triangular matrix elements of non-symmetric DBCSR matrices.
94 :       !> If there are no bugs in this module, these CPASSERTs should never trigger.
95 :       !> Therefore, the 'symmetric' variable, along with the relevant CPASSERT calls, is redundant and can be removed.
96 : LOGICAL :: symmetric = .TRUE.
97 :       !> number of neighbour-list nodes for each MPI process (0:num_pe-1).
98 :       !> If do_merge == .TRUE., nodes for different cell images along the k cell vector are merged into one node.
99 : INTEGER, ALLOCATABLE, DIMENSION(:) :: nnodes_per_proc
100 :
101 : !> replicated neighbour list (1:neighbor_list_dim1, 1:SUM(nnodes_per_proc)).
102 :       !> Neighbour-list nodes are ordered according to their MPI ranks:
103 :       !> the first nnodes_per_proc(0) nodes are stored on MPI rank 0,
104 :       !> the next nnodes_per_proc(1) nodes reside on MPI rank 1, etc.
105 :       !> Nodes for cell images along the transport direction are merged into one node.
106 :       !> The number of non-zero DBCSR matrix blocks and their DBCSR cell image indices
107 :       !> are stored in the 'n_dbcsr_cell_images_to_merge' and 'dbcsr_cell_image_to_merge' arrays.
108 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: nl_repl
109 :
110 : !> number of DBCSR images for each local merged neighbour list node (1:nnodes_per_proc(para_env%mepos))
111 : INTEGER, ALLOCATABLE, DIMENSION(:) :: n_dbcsr_cell_images_to_merge
112 : !> list of DBCSR image indices to merge; (1:SUM(n_dbcsr_cell_images_to_merge))
113 : INTEGER, ALLOCATABLE, DIMENSION(:) :: dbcsr_cell_image_to_merge
114 :
115 :       !> number of DBCSR non-zero matrix elements that should be received from / sent to each MPI rank
116 : !> (0:num_pe-1, 1:nelements_dbcsr_dim2)
117 : INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:, :) :: nelements_per_proc
118 :
119 : !> number of non-zero matrix elements local to this MPI rank.
120 : INTEGER(kind=int_8) :: n_nonzero_elements = 0_int_8
121 : INTEGER :: nrows = 0, ncols = 0
122 : !> Number of non-zero matrix elements (columns) on each row
123 : !> n_nonzero_cols(1:nrows); same as 'numh' in SMEAGOL code.
124 : INTEGER, ALLOCATABLE, DIMENSION(:) :: n_nonzero_cols
125 :       !> offset of the first non-zero matrix element on each row.
126 :       !> row_offset(1:nrows); same as 'listhptr' in SMEAGOL code.
127 : !> It should be declared as INTEGER(kind=int_8), but SMEAGOL expects it to be INTEGER
128 : INTEGER, ALLOCATABLE, DIMENSION(:) :: row_offset
129 : !> column index of each non-zero matrix element.
130 :       !> col_index(1:n_nonzero_elements); same as 'listh' in SMEAGOL code.
131 :       !> The column index of the first non-zero matrix element of row irow is col_index(row_offset(irow)+1).
132 : INTEGER, ALLOCATABLE, DIMENSION(:) :: col_index
133 : !> index of the non-zero matrix element in a communication buffer for each rank
134 : INTEGER, ALLOCATABLE, DIMENSION(:) :: packed_index
135 : !> R_atom_row - R_atom_col
136 : REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: xij
137 : !> equivalent atomic orbitals
138 : INTEGER, ALLOCATABLE, DIMENSION(:) :: indxuo
139 : !> atomic index on which the orbital is centred
140 : INTEGER, ALLOCATABLE, DIMENSION(:) :: iaorb
141 : !> coordinates of all atoms in the supercell
142 : REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: xa
143 : END TYPE siesta_distrib_csc_struct_type
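     :    ! For illustration only: with an initialised structure 'struct' of this type and a matching value
     :    ! array 'values(:)' (e.g. as produced by convert_dbcsr_to_distributed_siesta), the CSC arrays above
     :    ! are traversed as in the following sketch ('irow', 'offset', 'ind' and 'icol' are placeholder names):
     :    !
     :    !    DO irow = 1, struct%nrows                      ! locally stored rows
     :    !       offset = struct%row_offset(irow)            ! 'listhptr' in SMEAGOL
     :    !       DO ind = 1, struct%n_nonzero_cols(irow)     ! 'numh' in SMEAGOL
     :    !          icol = struct%col_index(offset + ind)    ! global column index, 'listh' in SMEAGOL
     :    !          ! values(offset + ind) holds the non-zero matrix element (irow, icol)
     :    !       END DO
     :    !    END DO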
144 :
145 : CONTAINS
146 :
147 : ! **************************************************************************************************
148 : !> \brief Map non-zero matrix blocks between sparse matrices in DBCSR and SIESTA formats.
149 : !> \param siesta_struct structure that stores metadata (sparsity pattern) of sparse SIESTA matrices
150 : !> \param matrix_dbcsr_kp DBCSR matrices for each cell image
151 : !> \param subsys QuickStep molecular system
152 : !> \param cell_to_index array to convert 3-D cell indices to 1-D DBCSR image indices
153 : !> \param sab_nl pair-wise neighbour list
154 : !> \param para_env MPI parallel environment
155 : !> \param max_ij_cell_image largest index of cell images along i and j cell vectors (e.g. (2,0) in
156 : !> case of 5 cell images (0,0), (1,0), (-1,0), (2,0), and (-2,0))
157 : !> \param do_merge merge DBCSR images along transport direction (k cell vector)
158 : !> \param gather_root distribute non-zero matrix elements of SIESTA matrices across all
159 : !> parallel processes (-1), or gather them on the given MPI rank (>= 0).
160 : ! **************************************************************************************************
161 2 : SUBROUTINE siesta_struct_create(siesta_struct, matrix_dbcsr_kp, subsys, cell_to_index, &
162 : sab_nl, para_env, max_ij_cell_image, do_merge, gather_root)
163 : TYPE(siesta_distrib_csc_struct_type), &
164 : INTENT(inout) :: siesta_struct
165 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
166 : TYPE(qs_subsys_type), POINTER :: subsys
167 : INTEGER, DIMENSION(:, :, :), POINTER :: cell_to_index
168 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
169 : POINTER :: sab_nl
170 : TYPE(mp_para_env_type), POINTER :: para_env
171 : INTEGER, DIMENSION(2), INTENT(inout) :: max_ij_cell_image
172 : LOGICAL, INTENT(in) :: do_merge
173 : INTEGER, INTENT(in) :: gather_root
174 :
175 : CHARACTER(len=*), PARAMETER :: routineN = 'siesta_struct_create'
176 :
177 : CHARACTER(len=20) :: str_nelem, str_nelem_max
178 : INTEGER :: handle, iatom, icol, icol_blk, icol_local, image, image_j, image_k, irow, &
179 : irow_local, natoms, ncells_siesta_total, ncols_blk, ncols_total, nrows_local, &
180 : nrows_total, offset
181 : INTEGER(kind=int_8) :: n_nonzero_elements_local
182 : INTEGER, DIMENSION(3) :: max_ijk_cell_image, ncells_siesta
183 2 : INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size
184 : LOGICAL :: do_distribute, is_root_rank
185 : REAL(kind=dp), ALLOCATABLE, DIMENSION(:, :) :: particle_coords
186 : REAL(kind=dp), DIMENSION(3) :: real_cell_shift, scaled_cell_shift
187 : TYPE(cell_type), POINTER :: cell
188 2 : TYPE(particle_type), DIMENSION(:), POINTER :: particle_set
189 :
190 2 : CALL timeset(routineN, handle)
191 2 : do_distribute = gather_root < 0
192 2 : is_root_rank = gather_root == para_env%mepos
193 :
194 : ! here row_blk_offset / col_blk_offset are global indices of the first row / column of a given non-zero block.
195 : ! They are not offsets (index-1) but the actual indices.
196 : CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
197 : nfullrows_total=nrows_total, nfullcols_total=ncols_total, &
198 2 : nblkcols_total=ncols_blk, col_blk_size=col_blk_size, col_blk_offset=col_blk_offset)
199 : IF (debug_this_module) THEN
200 : CPASSERT(nrows_total == ncols_total)
201 : CPASSERT(gather_root < para_env%num_pe)
202 : END IF
203 :
204 2 : siesta_struct%gather_root = gather_root
205 2 : CALL get_neighbor_list_set_p(neighbor_list_sets=sab_nl, symmetric=siesta_struct%symmetric)
206 :
207 : ! apply periodic boundary conditions to atomic coordinates
208 2 : CALL qs_subsys_get(subsys, cell=cell, particle_set=particle_set, nparticle=natoms)
209 6 : ALLOCATE (particle_coords(3, natoms))
210 66 : DO iatom = 1, natoms
211 66 : CALL pbc_0_1(particle_coords(1:3, iatom), particle_set(iatom)%r(1:3), cell)
212 : END DO
213 :
214 :       ! Note: if we want to limit the number of cell images along the transport direction (k cell vector)
215 :       ! by enabling the 'BulkTransvCellSizeZ' keyword, we need to pass the full max_ijk_cell_image(1:3) vector
216 :       ! to this subroutine instead of the reduced vector max_ij_cell_image(1:2)
217 6 : max_ijk_cell_image(1:2) = max_ij_cell_image(1:2)
218 :
219 : ! determine the actual number of cell images along k cell vector. Unless the third element is also passed
220 : ! via subroutine arguments, an extra MPI_Allreduce operation is needed each time we call
221 : ! replicate_neighbour_list() / get_nnodes_local().
222 2 : max_ijk_cell_image(3) = -1
223 2 : IF (.NOT. do_merge) max_ijk_cell_image(3) = 1 ! bulk-transport calculation expects exactly 3 cell images along transport direction
224 :
225 :       ! replicate the pair-wise neighbour list. Identical non-zero matrix blocks from cell images along the transport direction
226 : ! are grouped together if do_merge == .TRUE.
227 6 : ALLOCATE (siesta_struct%nnodes_per_proc(0:para_env%num_pe - 1))
228 : CALL replicate_neighbour_list(siesta_struct%nl_repl, &
229 : siesta_struct%n_dbcsr_cell_images_to_merge, &
230 : siesta_struct%dbcsr_cell_image_to_merge, &
231 : siesta_struct%nnodes_per_proc, &
232 2 : max_ijk_cell_image, sab_nl, para_env, particle_coords, cell, cell_to_index, do_merge)
233 6 : max_ij_cell_image(1:2) = max_ijk_cell_image(1:2)
234 :
235 :       ! count the number of non-zero matrix elements that need to be sent to and received from other parallel processes
236 8 : ALLOCATE (siesta_struct%nelements_per_proc(0:para_env%num_pe - 1, nelements_dbcsr_dim2))
237 : CALL count_remote_dbcsr_elements(siesta_struct%nelements_per_proc, siesta_struct%nnodes_per_proc, &
238 2 : siesta_struct%nl_repl, matrix_dbcsr_kp, siesta_struct%symmetric, para_env, gather_root)
239 :
240 : ! number of SIESTA non-zero matrix elements that are going to be stored on this parallel process
241 6 : n_nonzero_elements_local = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
242 2 : siesta_struct%n_nonzero_elements = n_nonzero_elements_local
243 :
244 :       ! as SMEAGOL uses 32-bit integers, the number of non-zero matrix elements is limited to 2**31 - 1 per MPI rank.
245 : ! Abort CP2K if we are about to exceed this limit.
246 2 : IF (n_nonzero_elements_local > INT(HUGE(0), kind=int_8)) THEN
247 0 : WRITE (str_nelem, '(I0)') n_nonzero_elements_local
248 0 : WRITE (str_nelem_max, '(I0)') HUGE(0)
249 : CALL cp_abort(__LOCATION__, &
250 : "The number of non-zero matrix elements per MPI process "//TRIM(str_nelem)// &
251 : " cannot exceed "//TRIM(str_nelem_max)// &
252 0 : ". Please increase the number of MPI processes to satisfy this SMEAGOL limitation.")
253 : END IF
254 :
255 :       ! in case there are no non-zero matrix elements stored on this process, allocate arrays with one element to avoid a SEGFAULT
256 2 : IF (n_nonzero_elements_local == 0) n_nonzero_elements_local = 1
257 :
258 : ! number of SIESTA-matrix rows local to the given parallel process
259 2 : IF (do_distribute) THEN
260 : #if defined(__SMEAGOL)
261 : CALL GetNodeOrbs(nrows_total, para_env%mepos, para_env%num_pe, nrows_local)
262 : #else
263 : CALL cp_abort(__LOCATION__, &
264 0 : "CP2K was compiled with no SMEAGOL support.")
265 : #endif
266 : ELSE
267 2 : IF (is_root_rank) THEN
268 1 : nrows_local = nrows_total
269 : ELSE
270 : nrows_local = 0
271 : END IF
272 : END IF
273 :
274 : ! number of cell images along each cell vector. It is 2*m+1, as SIESTA images are ordered as 0, 1, -1, ..., m, -m
275 8 : ncells_siesta(1:3) = 2*max_ijk_cell_image(1:3) + 1
276 : ! in case of merged cell images along the transport direction, there will be just 1 'merged' cell image along it
277 2 : IF (do_merge) ncells_siesta(3) = 1
278 :
279 2 : ncells_siesta_total = ncells_siesta(1)*ncells_siesta(2)*ncells_siesta(3)
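     :       ! e.g. for max_ijk_cell_image(1) = 2 the five SIESTA cell images along the first cell vector
     :       ! are enumerated as 1, 2, 3, 4, 5 and correspond to the actual cell indices 0, 1, -1, 2, -2;
     :       ! number_from_canonical_enumeration() converts the former into the latter.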
280 :
281 : ! number of rows local to the given parallel process. Rows are distributed in a block-cyclic manner
282 2 : siesta_struct%nrows = nrows_local
283 :
284 : ! number of columns of the matrix in its dense form. SIESTA uses 1-D (rows) block-cyclic distribution.
285 : ! All non-zero matrix elements on a given row are stored on the same parallel process
286 2 : siesta_struct%ncols = nrows_total*ncells_siesta_total
287 :
288 :       ! allocate at least one array element to avoid a SEGFAULT when passing unallocated arrays to subroutines
289 2 : IF (nrows_local == 0) nrows_local = 1
290 :
291 6 : ALLOCATE (siesta_struct%n_nonzero_cols(nrows_local))
292 4 : ALLOCATE (siesta_struct%row_offset(nrows_local))
293 6 : ALLOCATE (siesta_struct%col_index(n_nonzero_elements_local))
294 4 : ALLOCATE (siesta_struct%packed_index(n_nonzero_elements_local))
295 :
296 : ! restore the actual number of local rows
297 2 : nrows_local = siesta_struct%nrows
298 :
299 :       ! get the number of non-zero matrix elements on each local row (n_nonzero_cols),
300 : ! offset of the first non-zero matrix element for each local row (row_offset),
301 : ! global column indices of all local non-zero matrix elements (col_index), and
302 : ! the indices of all local non-zero matrix elements in the communication buffer (packed_index)
303 : CALL get_nonzero_element_indices(siesta_struct%n_nonzero_cols, siesta_struct%row_offset, &
304 : siesta_struct%col_index, siesta_struct%packed_index, &
305 : siesta_struct%nl_repl, matrix_dbcsr_kp, &
306 2 : siesta_struct%symmetric, para_env, gather_root)
307 :
308 : ! indices of equivalent atomic orbitals
309 6 : ALLOCATE (siesta_struct%indxuo(siesta_struct%ncols))
310 578 : DO icol = 1, ncols_total
311 578 : siesta_struct%indxuo(icol) = icol
312 : END DO
313 150 : DO image = 2, ncells_siesta_total
314 85398 : siesta_struct%indxuo((image - 1)*ncols_total + 1:image*ncols_total) = siesta_struct%indxuo(1:ncols_total)
315 : END DO
316 :
317 : ! particle index on which the orbital is centred
318 4 : ALLOCATE (siesta_struct%iaorb(siesta_struct%ncols))
319 66 : DO icol_blk = 1, ncols_blk
320 : ! col_blk_offset() is not an offset but the index of the first atomic orbital in the column block
321 642 : siesta_struct%iaorb(col_blk_offset(icol_blk):col_blk_offset(icol_blk) + col_blk_size(icol_blk) - 1) = icol_blk
322 : END DO
323 150 : DO image = 2, ncells_siesta_total
324 85398 : siesta_struct%iaorb((image - 1)*ncols_total + 1:image*ncols_total) = siesta_struct%iaorb(1:ncols_total) + (image - 1)*natoms
325 : END DO
326 :
327 :       ! coordinates of all particles in each cell image
328 6 : ALLOCATE (siesta_struct%xa(3, natoms*ncells_siesta_total))
329 8 : DO image_k = 1, ncells_siesta(3)
330 : !icell_siesta(3) = image_k
331 6 : scaled_cell_shift(3) = REAL(number_from_canonical_enumeration(image_k), kind=dp) ! SIESTA -> actual cell index
332 38 : DO image_j = 1, ncells_siesta(2)
333 30 : scaled_cell_shift(2) = REAL(number_from_canonical_enumeration(image_j), kind=dp)
334 186 : DO image = 1, ncells_siesta(1)
335 150 : scaled_cell_shift(1) = REAL(number_from_canonical_enumeration(image), kind=dp)
336 150 : CALL scaled_to_real(real_cell_shift, scaled_cell_shift, cell)
337 150 : offset = (((image_k - 1)*ncells_siesta(2) + image_j - 1)*ncells_siesta(1) + image - 1)*natoms
338 4980 : DO iatom = 1, natoms
339 19350 : siesta_struct%xa(1:3, offset + iatom) = particle_set(iatom)%r(1:3) + real_cell_shift(1:3)
340 : END DO
341 : END DO
342 : END DO
343 : END DO
344 :
345 :       ! inter-atomic distance vectors
346 6 : ALLOCATE (siesta_struct%xij(3, n_nonzero_elements_local))
347 290 : DO irow_local = 1, nrows_local
348 288 : IF (do_distribute) THEN
349 : #if defined(__SMEAGOL)
350 : CALL LocalToGlobalOrb(irow_local, para_env%mepos, para_env%num_pe, irow)
351 : #else
352 : CALL cp_abort(__LOCATION__, &
353 0 : "CP2K was compiled with no SMEAGOL support.")
354 : #endif
355 : ELSE
356 : irow = irow_local
357 : IF (debug_this_module) THEN
358 : CPASSERT(is_root_rank)
359 : END IF
360 : END IF
361 288 : offset = siesta_struct%row_offset(irow_local)
362 1091522 : DO icol_local = offset + 1, offset + siesta_struct%n_nonzero_cols(irow_local)
363 1091232 : icol = siesta_struct%col_index(icol_local)
364 : siesta_struct%xij(1:3, icol_local) = siesta_struct%xa(1:3, siesta_struct%iaorb(icol)) - &
365 4365216 : siesta_struct%xa(1:3, siesta_struct%iaorb(irow))
366 : END DO
367 : END DO
368 :
369 2 : DEALLOCATE (particle_coords)
370 :
371 2 : CALL timestop(handle)
372 4 : END SUBROUTINE siesta_struct_create
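     :    ! Sketch of the intended call sequence for the public routines of this module (illustration only; the
     :    ! DBCSR matrix set 'matrix_s_kp', the neighbour list 'sab_orb' and the remaining arguments are placeholders):
     :    !
     :    !    TYPE(siesta_distrib_csc_struct_type)     :: siesta_struct
     :    !    REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: matrix_siesta
     :    !
     :    !    CALL siesta_struct_create(siesta_struct, matrix_s_kp, subsys, cell_to_index, sab_orb, &
     :    !                              para_env, max_ij_cell_image, do_merge, gather_root)
     :    !    ALLOCATE (matrix_siesta(MAX(siesta_struct%n_nonzero_elements, 1_int_8)))
     :    !    CALL convert_dbcsr_to_distributed_siesta(matrix_siesta, matrix_s_kp, siesta_struct, para_env)
     :    !    ! ... pass the SIESTA-format arrays to SMEAGOL, possibly update the matrix elements ...
     :    !    CALL convert_distributed_siesta_to_dbcsr(matrix_s_kp, matrix_siesta, siesta_struct, para_env)
     :    !    CALL siesta_struct_release(siesta_struct)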
373 :
374 : ! **************************************************************************************************
375 : !> \brief Release a SIESTA matrix structure
376 : !> \param siesta_struct structure to release
377 : ! **************************************************************************************************
378 2 : SUBROUTINE siesta_struct_release(siesta_struct)
379 : TYPE(siesta_distrib_csc_struct_type), &
380 : INTENT(inout) :: siesta_struct
381 :
382 : CHARACTER(len=*), PARAMETER :: routineN = 'siesta_struct_release'
383 :
384 : INTEGER :: handle
385 :
386 2 : CALL timeset(routineN, handle)
387 :
388 2 : siesta_struct%gather_root = -1
389 :
390 2 : IF (ALLOCATED(siesta_struct%nnodes_per_proc)) DEALLOCATE (siesta_struct%nnodes_per_proc)
391 2 : IF (ALLOCATED(siesta_struct%nl_repl)) DEALLOCATE (siesta_struct%nl_repl)
392 2 : IF (ALLOCATED(siesta_struct%n_dbcsr_cell_images_to_merge)) DEALLOCATE (siesta_struct%n_dbcsr_cell_images_to_merge)
393 2 : IF (ALLOCATED(siesta_struct%dbcsr_cell_image_to_merge)) DEALLOCATE (siesta_struct%dbcsr_cell_image_to_merge)
394 2 : IF (ALLOCATED(siesta_struct%nelements_per_proc)) DEALLOCATE (siesta_struct%nelements_per_proc)
395 :
396 2 : siesta_struct%n_nonzero_elements = 0
397 2 : siesta_struct%nrows = 0
398 2 : siesta_struct%ncols = 0
399 :
400 2 : IF (ALLOCATED(siesta_struct%n_nonzero_cols)) DEALLOCATE (siesta_struct%n_nonzero_cols)
401 2 : IF (ALLOCATED(siesta_struct%row_offset)) DEALLOCATE (siesta_struct%row_offset)
402 2 : IF (ALLOCATED(siesta_struct%col_index)) DEALLOCATE (siesta_struct%col_index)
403 2 : IF (ALLOCATED(siesta_struct%packed_index)) DEALLOCATE (siesta_struct%packed_index)
404 :
405 2 : IF (ALLOCATED(siesta_struct%xij)) DEALLOCATE (siesta_struct%xij)
406 2 : IF (ALLOCATED(siesta_struct%indxuo)) DEALLOCATE (siesta_struct%indxuo)
407 2 : IF (ALLOCATED(siesta_struct%iaorb)) DEALLOCATE (siesta_struct%iaorb)
408 2 : IF (ALLOCATED(siesta_struct%xa)) DEALLOCATE (siesta_struct%xa)
409 :
410 2 : CALL timestop(handle)
411 2 : END SUBROUTINE siesta_struct_release
412 :
413 : ! **************************************************************************************************
414 : !> \brief Convert matrix from DBCSR to sparse SIESTA format.
415 : !> \param matrix_siesta matrix in SIESTA format [out]
416 : !> \param matrix_dbcsr_kp DBCSR matrix [in]
417 : !> \param siesta_struct structure to map matrix blocks between formats
418 : !> \param para_env MPI parallel environment
419 : ! **************************************************************************************************
420 8 : SUBROUTINE convert_dbcsr_to_distributed_siesta(matrix_siesta, matrix_dbcsr_kp, siesta_struct, para_env)
421 : REAL(kind=dp), DIMENSION(:), INTENT(out) :: matrix_siesta
422 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
423 : TYPE(siesta_distrib_csc_struct_type), INTENT(in) :: siesta_struct
424 : TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
425 :
426 : CHARACTER(len=*), PARAMETER :: routineN = 'convert_dbcsr_to_distributed_siesta'
427 :
428 : INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
429 : image_dbcsr, image_ind, image_ind_offset, image_siesta, image_siesta_transp, inode, &
430 : inode_proc, iproc, irequest, irow_blk, irow_local, irow_proc, mepos, n_image_ind, &
431 : ncols_blk, ncols_local, nnodes_proc, node_offset, nprocs, nrequests_recv, &
432 : nrequests_total, nrows_blk, nrows_local
433 : INTEGER(kind=int_8) :: n_nonzero_elements_dbcsr, &
434 : n_nonzero_elements_siesta, &
435 : offset_recv_mepos, offset_send_mepos
436 8 : INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:) :: n_packed_elements_per_proc, &
437 8 : nelements_per_request, &
438 8 : offset_per_proc, offset_per_request
439 8 : INTEGER, ALLOCATABLE, DIMENSION(:) :: next_nonzero_element_offset, peer_rank, &
440 8 : request_tag
441 8 : INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size, &
442 8 : row_blk_offset, row_blk_size
443 : LOGICAL :: do_distribute, found, is_root_rank, &
444 : symmetric
445 8 : REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recv_buffer, reorder_recv_buffer, &
446 8 : send_buffer
447 8 : REAL(kind=dp), DIMENSION(:, :), POINTER :: sm_block, sm_block_merged
448 8 : TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: requests
449 :
450 8 : CALL timeset(routineN, handle)
451 4364940 : matrix_siesta(:) = 0.0_dp
452 :
453 8 : mepos = para_env%mepos
454 8 : nprocs = para_env%num_pe
455 8 : do_distribute = siesta_struct%gather_root < 0
456 8 : is_root_rank = siesta_struct%gather_root == mepos
457 :
458 : CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
459 : nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
460 : row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
461 8 : row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
462 8 : symmetric = siesta_struct%symmetric
463 :
464 : ! number of locally stored SIESTA non-zero matrix elements
465 24 : n_nonzero_elements_siesta = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
466 : ! number of locally stored DBCSR non-zero matrix elements
467 24 : n_nonzero_elements_dbcsr = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_send))
468 :
469 : ! number of concurrent MPI isend / irecv operations
470 : nrequests_recv = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
471 8 : max_mpi_packet_size_dp)
472 : nrequests_total = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
473 8 : max_mpi_packet_size_dp) + nrequests_recv
474 :
475 8 : IF (nrequests_total > 0) THEN
476 :          ! allocate MPI-related arrays. request_tag is not actually needed, as the MPI standard guarantees the ordering of
477 :          ! point-to-point messages with the same tag between the same pair of processes
478 32 : ALLOCATE (requests(nrequests_total))
479 24 : ALLOCATE (peer_rank(nrequests_total), request_tag(nrequests_total))
480 32 : ALLOCATE (offset_per_request(nrequests_total), nelements_per_request(nrequests_total))
481 : !requests(:) = mp_request_null
482 :
483 : ! split large messages into a number of smaller messages. It is not really needed
484 : ! unless we are going to send > 2^31 matrix elements per MPI request
485 8 : IF (nrequests_recv > 0) THEN
486 : CALL assign_nonzero_elements_to_requests(offset_per_request(1:nrequests_recv), &
487 : nelements_per_request(1:nrequests_recv), &
488 : peer_rank(1:nrequests_recv), &
489 : request_tag(1:nrequests_recv), &
490 : mepos, &
491 : siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
492 4 : max_mpi_packet_size_dp)
493 : END IF
494 8 : IF (nrequests_total > nrequests_recv) THEN
495 : CALL assign_nonzero_elements_to_requests(offset_per_request(nrequests_recv + 1:nrequests_total), &
496 : nelements_per_request(nrequests_recv + 1:nrequests_total), &
497 : peer_rank(nrequests_recv + 1:nrequests_total), &
498 : request_tag(nrequests_recv + 1:nrequests_total), &
499 : mepos, &
500 : siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
501 4 : max_mpi_packet_size_dp)
502 : END IF
503 : END IF
504 :
505 :       ! point-to-point recv/send could be replaced with alltoallv if the data to distribute per rank is < 2**31 elements (which should hold due to the SMEAGOL limitation above).
506 :       ! In principle, it is possible to overcome this limit by using derived datatypes (which would require additional wrapper functions, though).
507 : !
508 : ! pre-post non-blocking receive operations
509 8 : IF (n_nonzero_elements_siesta > 0) THEN
510 12 : ALLOCATE (recv_buffer(n_nonzero_elements_siesta))
511 : END IF
512 12 : DO irequest = 1, nrequests_recv
513 : CALL para_env%irecv(recv_buffer(offset_per_request(irequest) + 1: &
514 : offset_per_request(irequest) + nelements_per_request(irequest)), &
515 12 : peer_rank(irequest), requests(irequest), request_tag(irequest))
516 : END DO
517 :
518 :       ! pack local DBCSR non-zero matrix elements ordered by their target parallel process
519 32 : ALLOCATE (offset_per_proc(0:nprocs - 1), n_packed_elements_per_proc(0:nprocs - 1))
520 8 : offset_per_proc(0) = 0
521 16 : DO iproc = 1, nprocs - 1
522 16 : offset_per_proc(iproc) = offset_per_proc(iproc - 1) + siesta_struct%nelements_per_proc(iproc - 1, nelements_dbcsr_send)
523 : END DO
524 24 : n_packed_elements_per_proc(:) = 0
525 :
526 : ! number of local neighbour-list nodes and offset of the first local neighbour-list node
527 8 : nnodes_proc = siesta_struct%nnodes_per_proc(mepos)
528 : !node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos)) - siesta_struct%nnodes_per_proc(mepos)
529 20 : node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos)) - nnodes_proc
530 :
531 :       ! if do_distribute == .FALSE., send all matrix elements to the MPI process with rank gather_root;
532 :       ! if do_distribute == .TRUE., iproc is determined by calling WhichNodeOrb()
533 8 : iproc = siesta_struct%gather_root
534 :
535 8 : IF (n_nonzero_elements_dbcsr > 0) THEN
536 24 : ALLOCATE (send_buffer(n_nonzero_elements_dbcsr))
537 4364936 : send_buffer(:) = 0.0_dp
538 :
539 :          ! iterate over locally-stored DBCSR matrix blocks; inode_proc enumerates the neighbour-list nodes local to this process.
540 :          ! The target parallel process (where the data are going to be sent) is determined per row / column via WhichNodeOrb() below.
541 : image_ind_offset = 0
542 28168 : DO inode_proc = 1, nnodes_proc
543 28160 : n_image_ind = siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
544 28160 : IF (n_image_ind > 0) THEN
545 28160 : inode = node_offset + inode_proc
546 :
547 28160 : irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
548 28160 : icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
549 28160 : CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
550 28160 : image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
551 28160 : image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
552 :
553 28160 : nrows_local = row_blk_size(irow_blk)
554 28160 : ncols_local = col_blk_size(icol_blk)
555 28160 : first_row_minus_one = row_blk_offset(irow_blk) - 1
556 28160 : first_col_minus_one = col_blk_offset(icol_blk) - 1
557 :
558 : ! merging cell images along transport direction
559 28160 : IF (n_image_ind == 1) THEN
560 : ! the most common case. Nothing to merge, so there is no need to allocate memory for a merged block
561 28160 : image_dbcsr = siesta_struct%dbcsr_cell_image_to_merge(image_ind_offset + 1)
562 : CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
563 28160 : row=irow_blk, col=icol_blk, block=sm_block_merged, found=found)
564 28160 : CPASSERT(found)
565 : ELSE ! n_image_ind > 1
566 0 : ALLOCATE (sm_block_merged(nrows_local, ncols_local))
567 :
568 0 : DO image_ind = 1, n_image_ind
569 0 : image_dbcsr = siesta_struct%dbcsr_cell_image_to_merge(image_ind + image_ind_offset)
570 :
571 : CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
572 0 : row=irow_blk, col=icol_blk, block=sm_block, found=found)
573 0 : CPASSERT(found)
574 : sm_block_merged(1:nrows_local, 1:ncols_local) = sm_block_merged(1:nrows_local, 1:ncols_local) + &
575 0 : sm_block(1:nrows_local, 1:ncols_local)
576 : END DO
577 : END IF
578 :
579 : ! pack matrix elements for the 'normal' SIESTA matrix block
580 28160 : IF (image_siesta > 0) THEN
581 281600 : DO irow_local = 1, nrows_local
582 253440 : IF (do_distribute) THEN
583 : #if defined(__SMEAGOL)
584 : CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc)
585 : #else
586 : CALL cp_abort(__LOCATION__, &
587 0 : "CP2K was compiled with no SMEAGOL support.")
588 : #endif
589 : END IF
590 :
591 : ! CPASSERT
592 : IF (debug_this_module) THEN
593 : CPASSERT(iproc >= 0 .AND. iproc < nprocs)
594 : IF (n_packed_elements_per_proc(iproc) + ncols_local > &
595 : siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
596 : CALL cp__a(__SHORT_FILE__, __LINE__)
597 : END IF
598 : END IF
599 :
600 : offset_send_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
601 253440 : send_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = sm_block_merged(irow_local, 1:ncols_local)
602 2534400 :
603 : n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + ncols_local
604 281600 : END DO
605 : END IF
606 :
607 : ! pack matrix elements of the transposed SIESTA matrix block
608 : IF (image_siesta_transp > 0) THEN
609 28160 : DO icol_local = 1, ncols_local
610 257280 : IF (do_distribute) THEN
611 231552 : #if defined(__SMEAGOL)
612 : CALL WhichNodeOrb(icol_local + first_col_minus_one, nprocs, iproc) ! iproc_orb
613 : #else
614 : CALL cp_abort(__LOCATION__, &
615 : "CP2K was compiled with no SMEAGOL support.")
616 0 : #endif
617 : END IF
618 :
619 : ! CPASSERT
620 : IF (debug_this_module) THEN
621 : CPASSERT(iproc >= 0 .AND. iproc < nprocs)
622 : IF (n_packed_elements_per_proc(iproc) + nrows_local > &
623 : siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
624 : CALL cp__a(__SHORT_FILE__, __LINE__)
625 : END IF
626 : END IF
627 :
628 : offset_send_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
629 : send_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = sm_block_merged(1:nrows_local, icol_local)
630 231552 :
631 2315520 : n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + nrows_local
632 : END DO
633 257280 : END IF
634 :
635 : IF (n_image_ind > 1) THEN
636 : DEALLOCATE (sm_block_merged)
637 28160 : END IF
638 0 : END IF
639 :
640 : image_ind_offset = image_ind_offset + siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
641 : END DO
642 28168 :
643 : IF (debug_this_module) THEN
644 : DO iproc = 0, nprocs - 1
645 : IF (n_packed_elements_per_proc(iproc) /= siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
646 : CALL cp__a(__SHORT_FILE__, __LINE__)
647 : END IF
648 : END DO
649 : END IF
650 :
651 : ! send packed data to other parallel processes
652 : DO irequest = nrequests_recv + 1, nrequests_total
653 : CALL para_env%isend(send_buffer(offset_per_request(irequest) + 1: &
654 12 : offset_per_request(irequest) + nelements_per_request(irequest)), &
655 : peer_rank(irequest), requests(irequest), request_tag(irequest))
656 : END DO
657 12 :
658 : ! copy data locally that stay on the same process.
659 : IF (mepos > 0) THEN
660 : offset_recv_mepos = SUM(siesta_struct%nelements_per_proc(0:mepos - 1, nelements_dbcsr_recv))
661 8 : ELSE
662 8 : offset_recv_mepos = 0
663 : END IF
664 : offset_send_mepos = offset_per_proc(mepos)
665 :
666 8 : IF (debug_this_module) THEN
667 : IF (n_packed_elements_per_proc(mepos) /= siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_recv)) THEN
668 : CALL cp__a(__SHORT_FILE__, __LINE__)
669 : END IF
670 : END IF
671 :
672 : IF (n_packed_elements_per_proc(mepos) > 0) THEN
673 : recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + n_packed_elements_per_proc(mepos)) = &
674 8 : send_buffer(offset_send_mepos + 1:offset_send_mepos + n_packed_elements_per_proc(mepos))
675 : END IF
676 2182468 : END IF
677 :
678 : IF (nrequests_total > 0) THEN
679 : ! wait for pending isend/irecv requests
680 8 : CALL mp_waitall(requests)
681 : DEALLOCATE (nelements_per_request, offset_per_request, peer_rank, requests, request_tag)
682 8 : END IF
683 8 :
684 : ! release send buffers
685 : IF (ALLOCATED(send_buffer)) DEALLOCATE (send_buffer)
686 : DEALLOCATE (offset_per_proc, n_packed_elements_per_proc)
687 8 :
688 8 : ! non-zero matrix elements in 'recv_buffer' array are grouped by their source MPI rank, local row index, and column index (in this order).
689 : ! Reorder the matrix elements ('reorder_recv_buffer') so they are grouped by their local row index, source MPI rank, and column index.
690 : ! (column indices are in the ascending order within each (row index, source MPI rank) block).
691 : ! The array 'packed_index' allows mapping matrix element between these intermediate order and SIESTA order : local row index, column index
692 : IF (n_nonzero_elements_siesta > 0) THEN
693 : ALLOCATE (reorder_recv_buffer(n_nonzero_elements_siesta))
694 8 : ALLOCATE (next_nonzero_element_offset(siesta_struct%nrows))
695 12 : next_nonzero_element_offset(:) = 0
696 12 : offset_recv_mepos = 0
697 1156 :
698 4 : DO inode = 1, SIZE(siesta_struct%nl_repl, 2)
699 : irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
700 28164 : icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
701 28160 : CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
702 28160 : image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
703 28160 : image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
704 28160 :
705 28160 : nrows_local = row_blk_size(irow_blk)
706 : ncols_local = col_blk_size(icol_blk)
707 28160 : first_row_minus_one = row_blk_offset(irow_blk) - 1
708 28160 : first_col_minus_one = col_blk_offset(icol_blk) - 1
709 28160 :
710 28160 : ! normal block
711 : IF (image_siesta > 0) THEN
712 : DO irow_local = 1, nrows_local
713 28160 : IF (do_distribute) THEN
714 281600 : #if defined(__SMEAGOL)
715 253440 : CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
716 : #else
717 : CALL cp_abort(__LOCATION__, &
718 : "CP2K was compiled with no SMEAGOL support.")
719 : #endif
720 0 : ELSE
721 : IF (is_root_rank) THEN
722 : irow_proc = irow_local + first_row_minus_one
723 253440 : ELSE
724 253440 : irow_proc = 0
725 : END IF
726 : END IF
727 : IF (irow_proc > 0) THEN
728 : offset_send_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
729 281600 : reorder_recv_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = &
730 253440 : recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
731 : offset_recv_mepos = offset_recv_mepos + ncols_local
732 2534400 : next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + ncols_local
733 253440 : END IF
734 253440 : END DO
735 : END IF
736 :
737 : ! transposed block
738 : IF (image_siesta_transp > 0) THEN
739 : DO icol_local = 1, ncols_local
740 28164 : IF (do_distribute) THEN
741 257280 : #if defined(__SMEAGOL)
742 231552 : CALL GlobalToLocalOrb(icol_local + first_col_minus_one, mepos, nprocs, irow_proc)
743 : #else
744 : CALL cp_abort(__LOCATION__, &
745 : "CP2K was compiled with no SMEAGOL support.")
746 : #endif
747 0 : ELSE
748 : IF (is_root_rank) THEN
749 : irow_proc = icol_local + first_col_minus_one
750 231552 : ELSE
751 231552 : irow_proc = 0
752 : END IF
753 : END IF
754 : IF (irow_proc > 0) THEN
755 : offset_send_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
756 257280 : reorder_recv_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = &
757 231552 : recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
758 : offset_recv_mepos = offset_recv_mepos + nrows_local
759 2315520 : next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + nrows_local
760 231552 : END IF
761 231552 : END DO
762 : END IF
763 : END DO
764 :
765 : IF (debug_this_module) THEN
766 : DO irow_local = 1, siesta_struct%nrows
767 : IF (siesta_struct%n_nonzero_cols(irow_local) /= next_nonzero_element_offset(irow_local)) THEN
768 : CALL cp__a(__SHORT_FILE__, __LINE__)
769 : END IF
770 : END DO
771 : END IF
772 :
773 : DEALLOCATE (next_nonzero_element_offset)
774 : DEALLOCATE (recv_buffer)
775 4 :
776 4 : ! Map non-zero matrix element between the intermediate order and SIESTA order
777 : DO irow_local = 1, siesta_struct%nrows
778 : offset_recv_mepos = siesta_struct%row_offset(irow_local)
779 1156 : DO icol_local = 1, siesta_struct%n_nonzero_cols(irow_local)
780 1152 : matrix_siesta(offset_recv_mepos + icol_local) = &
781 4366084 : reorder_recv_buffer(offset_recv_mepos + siesta_struct%packed_index(offset_recv_mepos + icol_local))
782 : END DO
783 4366080 : END DO
784 : DEALLOCATE (reorder_recv_buffer)
785 : END IF
786 4 :
787 : CALL timestop(handle)
788 : END SUBROUTINE convert_dbcsr_to_distributed_siesta
789 8 :
790 24 : ! **************************************************************************************************
791 : !> \brief Convert a matrix from sparse SIESTA format to DBCSR format.
792 : !> \param matrix_dbcsr_kp DBCSR matrix [out]. It is declared as INTENT(in) because the pointers to the
793 : !>        DBCSR matrices remain intact; however, the matrix elements themselves are updated.
794 : !> \param matrix_siesta matrix in SIESTA format [in]
795 : !> \param siesta_struct structure to map matrix blocks between formats
796 : !> \param para_env MPI parallel environment
797 : ! **************************************************************************************************
798 : SUBROUTINE convert_distributed_siesta_to_dbcsr(matrix_dbcsr_kp, matrix_siesta, siesta_struct, para_env)
799 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
800 0 : REAL(kind=dp), DIMENSION(:), INTENT(in) :: matrix_siesta
801 : TYPE(siesta_distrib_csc_struct_type), INTENT(in) :: siesta_struct
802 : TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
803 :
804 : CHARACTER(len=*), PARAMETER :: routineN = 'convert_distributed_siesta_to_dbcsr'
805 :
806 : INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
807 : image_dbcsr, image_siesta, image_siesta_transp, inode, inode_proc, iproc, irequest, &
808 : irow_blk, irow_local, irow_proc, mepos, n_image_ind, ncols_blk, ncols_local, nnodes_proc, &
809 : node_offset, nprocs, nrequests_recv, nrequests_total, nrows_blk, nrows_local
810 : INTEGER(kind=int_8) :: n_nonzero_elements_dbcsr, &
811 : n_nonzero_elements_siesta, &
812 : offset_recv_mepos, offset_send_mepos
813 : INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:) :: n_packed_elements_per_proc, &
814 : nelements_per_request, &
815 0 : offset_per_proc, offset_per_request
816 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: next_nonzero_element_offset, peer_rank, &
817 0 : request_tag
818 0 : INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size, &
819 0 : row_blk_offset, row_blk_size
820 0 : LOGICAL :: do_distribute, found, is_root_rank, &
821 0 : symmetric
822 : REAL(kind=dp), ALLOCATABLE, DIMENSION(:) :: recv_buffer, reorder_send_buffer, &
823 : send_buffer
824 0 : REAL(kind=dp), DIMENSION(:, :), POINTER :: sm_block
825 0 : TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:) :: requests
826 0 :
827 0 : CALL timeset(routineN, handle)
828 : DO image_dbcsr = 1, SIZE(matrix_dbcsr_kp)
829 0 : CALL dbcsr_set(matrix_dbcsr_kp(image_dbcsr)%matrix, 0.0_dp)
830 0 : END DO
831 0 :
832 : mepos = para_env%mepos
833 : nprocs = para_env%num_pe
834 0 : do_distribute = siesta_struct%gather_root < 0
835 0 : is_root_rank = siesta_struct%gather_root == mepos
836 0 :
837 0 : CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
838 : nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
839 : row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
840 : row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
841 : symmetric = siesta_struct%symmetric
842 0 :
843 0 : n_nonzero_elements_siesta = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv))
844 : n_nonzero_elements_dbcsr = SUM(siesta_struct%nelements_per_proc(:, nelements_dbcsr_send))
845 0 :
846 0 : nrequests_recv = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
847 : max_mpi_packet_size_dp)
848 : nrequests_total = get_number_of_mpi_sendrecv_requests(mepos, siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
849 0 : max_mpi_packet_size_dp) + nrequests_recv
850 : IF (nrequests_total > 0) THEN
851 0 : ALLOCATE (requests(nrequests_total))
852 0 : ALLOCATE (peer_rank(nrequests_total), request_tag(nrequests_total))
853 0 : ALLOCATE (offset_per_request(nrequests_total), nelements_per_request(nrequests_total))
854 0 : !requests(:) = mp_request_null
855 0 : IF (nrequests_recv > 0) THEN
856 : CALL assign_nonzero_elements_to_requests(offset_per_request(1:nrequests_recv), &
857 0 : nelements_per_request(1:nrequests_recv), &
858 : peer_rank(1:nrequests_recv), &
859 : request_tag(1:nrequests_recv), &
860 : mepos, &
861 : siesta_struct%nelements_per_proc(:, nelements_dbcsr_send), &
862 : max_mpi_packet_size_dp)
863 : END IF
864 0 : IF (nrequests_total > nrequests_recv) THEN
865 : CALL assign_nonzero_elements_to_requests(offset_per_request(nrequests_recv + 1:nrequests_total), &
866 0 : nelements_per_request(nrequests_recv + 1:nrequests_total), &
867 : peer_rank(nrequests_recv + 1:nrequests_total), &
868 : request_tag(nrequests_recv + 1:nrequests_total), &
869 : mepos, &
870 : siesta_struct%nelements_per_proc(:, nelements_dbcsr_recv), &
871 : max_mpi_packet_size_dp)
872 : END IF
873 0 : END IF
874 :
875 : IF (n_nonzero_elements_dbcsr > 0) THEN
876 : ALLOCATE (recv_buffer(n_nonzero_elements_dbcsr))
877 0 : END IF
878 0 : DO irequest = 1, nrequests_recv
879 : CALL para_env%irecv(recv_buffer(offset_per_request(irequest) + 1: &
880 0 : offset_per_request(irequest) + nelements_per_request(irequest)), &
881 : peer_rank(irequest), requests(irequest), request_tag(irequest))
882 : END DO
883 0 :
884 : ALLOCATE (offset_per_proc(0:nprocs - 1), n_packed_elements_per_proc(0:nprocs - 1))
885 : offset_per_proc(0) = 0
886 0 : DO iproc = 1, nprocs - 1
887 0 : offset_per_proc(iproc) = offset_per_proc(iproc - 1) + siesta_struct%nelements_per_proc(iproc - 1, nelements_dbcsr_send)
888 0 : END DO
889 0 : n_packed_elements_per_proc(:) = 0
890 :
891 0 : IF (mepos > 0) THEN
892 : node_offset = SUM(siesta_struct%nnodes_per_proc(0:mepos - 1))
893 0 : ELSE
894 0 : node_offset = 0
895 : END IF
896 : nnodes_proc = siesta_struct%nnodes_per_proc(mepos)
897 :
898 0 : IF (n_nonzero_elements_siesta > 0) THEN
899 : ALLOCATE (send_buffer(n_nonzero_elements_siesta))
900 0 :
901 0 : ALLOCATE (reorder_send_buffer(n_nonzero_elements_siesta))
902 : DO irow_local = 1, siesta_struct%nrows
903 0 : offset_send_mepos = siesta_struct%row_offset(irow_local)
904 0 : DO icol_local = 1, siesta_struct%n_nonzero_cols(irow_local)
905 0 : reorder_send_buffer(offset_send_mepos + siesta_struct%packed_index(offset_send_mepos + icol_local)) = &
906 0 : matrix_siesta(offset_send_mepos + icol_local)
907 : END DO
908 0 : END DO
909 :
910 : ALLOCATE (next_nonzero_element_offset(siesta_struct%nrows))
911 : next_nonzero_element_offset(:) = 0
912 0 : offset_send_mepos = 0
913 0 :
914 0 : DO inode = 1, SIZE(siesta_struct%nl_repl, 2)
915 : irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
916 0 : icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
917 0 : CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
918 0 : image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
919 0 : image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
920 0 :
921 0 : nrows_local = row_blk_size(irow_blk)
922 : ncols_local = col_blk_size(icol_blk)
923 0 : first_row_minus_one = row_blk_offset(irow_blk) - 1
924 0 : first_col_minus_one = col_blk_offset(icol_blk) - 1
925 0 :
926 0 : IF (image_siesta > 0) THEN
927 : DO irow_local = 1, nrows_local
928 0 : IF (do_distribute) THEN
929 0 : #if defined(__SMEAGOL)
930 0 : CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
931 : #else
932 : CALL cp_abort(__LOCATION__, &
933 : "CP2K was compiled with no SMEAGOL support.")
934 : #endif
935 0 : ELSE
936 : IF (is_root_rank) THEN
937 : irow_proc = irow_local + first_row_minus_one
938 0 : ELSE
939 0 : irow_proc = 0
940 : END IF
941 : END IF
942 : IF (irow_proc > 0) THEN
943 : offset_recv_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
944 0 : send_buffer(offset_send_mepos + 1:offset_send_mepos + ncols_local) = &
945 0 : reorder_send_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
946 : offset_send_mepos = offset_send_mepos + ncols_local
947 0 : next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + ncols_local
948 0 : END IF
949 0 : END DO
950 : END IF
951 :
952 : ! transposed block
953 : IF (image_siesta_transp > 0) THEN
954 : DO icol_local = 1, ncols_local
955 0 : IF (do_distribute) THEN
956 0 : #if defined(__SMEAGOL)
957 0 : CALL GlobalToLocalOrb(icol_local + first_col_minus_one, mepos, nprocs, irow_proc)
958 : #else
959 : CALL cp_abort(__LOCATION__, &
960 : "CP2K was compiled with no SMEAGOL support.")
961 : #endif
962 0 : ELSE
963 : IF (is_root_rank) THEN
964 : irow_proc = icol_local + first_col_minus_one
965 0 : ELSE
966 0 : irow_proc = 0
967 : END IF
968 : END IF
969 : IF (irow_proc > 0) THEN
970 : offset_recv_mepos = siesta_struct%row_offset(irow_proc) + next_nonzero_element_offset(irow_proc)
971 0 : send_buffer(offset_send_mepos + 1:offset_send_mepos + nrows_local) = &
972 0 : reorder_send_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
973 : offset_send_mepos = offset_send_mepos + nrows_local
974 0 : next_nonzero_element_offset(irow_proc) = next_nonzero_element_offset(irow_proc) + nrows_local
975 0 : END IF
976 0 : END DO
977 : END IF
978 : END DO
979 :
980 : IF (debug_this_module) THEN
981 : DO irow_local = 1, siesta_struct%nrows
982 : IF (siesta_struct%n_nonzero_cols(irow_local) /= next_nonzero_element_offset(irow_local)) THEN
983 : CALL cp__a(__SHORT_FILE__, __LINE__)
984 : END IF
985 : END DO
986 : END IF
987 :
988 : DEALLOCATE (next_nonzero_element_offset)
989 : DEALLOCATE (reorder_send_buffer)
990 0 :
991 0 : DO irequest = nrequests_recv + 1, nrequests_total
992 : CALL para_env%isend(send_buffer(offset_per_request(irequest) + 1: &
993 0 : offset_per_request(irequest) + nelements_per_request(irequest)), &
994 : peer_rank(irequest), requests(irequest), request_tag(irequest))
995 : END DO
996 0 :
997 : ! copy data locally that stay on the same process.
998 : IF (mepos > 0) THEN
999 : offset_send_mepos = SUM(siesta_struct%nelements_per_proc(0:mepos - 1, nelements_dbcsr_recv))
1000 0 : ELSE
1001 0 : offset_send_mepos = 0
1002 : END IF
1003 : offset_recv_mepos = offset_per_proc(mepos)
1004 :
1005 0 : IF (debug_this_module) THEN
1006 : IF (siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_recv) /= &
1007 : siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send)) THEN
1008 : CALL cp__a(__SHORT_FILE__, __LINE__)
1009 : END IF
1010 : END IF
1011 :
1012 : IF (siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send) > 0) THEN
1013 : recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send)) = &
1014 0 : send_buffer(offset_send_mepos + 1:offset_send_mepos + siesta_struct%nelements_per_proc(mepos, nelements_dbcsr_send))
1015 : END IF
1016 0 : END IF
1017 :
1018 : IF (nrequests_total > 0) THEN
1019 : ! wait for pending isend/irecv requests
1020 0 : CALL mp_waitall(requests)
1021 : DEALLOCATE (nelements_per_request, offset_per_request, peer_rank, requests, request_tag)
1022 0 : END IF
1023 0 :
1024 : IF (ALLOCATED(send_buffer)) DEALLOCATE (send_buffer)
1025 :
1026 0 : iproc = siesta_struct%gather_root ! if do_distribute == .FALSE., collect matrix elements from MPI process with rank gather_root
1027 : IF (n_nonzero_elements_dbcsr > 0) THEN
1028 0 : DO inode_proc = 1, nnodes_proc
1029 0 : n_image_ind = siesta_struct%n_dbcsr_cell_images_to_merge(inode_proc)
1030 0 : IF (n_image_ind > 0) THEN
1031 0 : inode = node_offset + inode_proc
1032 0 :
1033 0 : irow_blk = siesta_struct%nl_repl(neighbor_list_iatom_index, inode)
1034 : icol_blk = siesta_struct%nl_repl(neighbor_list_jatom_index, inode)
1035 0 : image_dbcsr = siesta_struct%nl_repl(neighbor_list_dbcsr_image_index, inode)
1036 0 : CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
1037 0 : image_siesta = siesta_struct%nl_repl(neighbor_list_siesta_image_index, inode)
1038 0 : image_siesta_transp = siesta_struct%nl_repl(neighbor_list_siesta_transp_image_index, inode)
1039 0 :
1040 0 : nrows_local = row_blk_size(irow_blk)
1041 : ncols_local = col_blk_size(icol_blk)
1042 0 : first_row_minus_one = row_blk_offset(irow_blk) - 1
1043 0 : first_col_minus_one = col_blk_offset(icol_blk) - 1
1044 0 :
1045 0 : CALL dbcsr_get_block_p(matrix=matrix_dbcsr_kp(image_dbcsr)%matrix, &
1046 : row=irow_blk, col=icol_blk, block=sm_block, found=found)
1047 : CPASSERT(found)
1048 0 :
1049 0 : IF (image_siesta > 0) THEN
1050 : DO irow_local = 1, nrows_local
1051 0 : IF (do_distribute) THEN
1052 0 : #if defined(__SMEAGOL)
1053 0 : CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc) ! iproc_orb
1054 : #else
1055 : CALL cp_abort(__LOCATION__, &
1056 : "CP2K was compiled with no SMEAGOL support.")
1057 : #endif
1058 0 : END IF
1059 : ! CPASSERT
1060 : IF (debug_this_module) THEN
1061 : CPASSERT(iproc >= 0 .AND. iproc < nprocs)
1062 : IF (n_packed_elements_per_proc(iproc) + ncols_local > &
1063 : siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
1064 : CALL cp__a(__SHORT_FILE__, __LINE__)
1065 : END IF
1066 : END IF
1067 :
1068 : offset_recv_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
1069 : sm_block(irow_local, 1:ncols_local) = recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + ncols_local)
1070 :
1071 0 : n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + ncols_local
1072 0 : END DO
1073 : END IF
1074 0 :
1075 : ! transposed block
1076 : IF (image_siesta_transp > 0) THEN
1077 : DO icol_local = 1, ncols_local
1078 : IF (do_distribute) THEN
1079 0 : #if defined(__SMEAGOL)
1080 0 : CALL WhichNodeOrb(icol_local + first_col_minus_one, nprocs, iproc) ! iproc_orb
1081 0 : #else
1082 : CALL cp_abort(__LOCATION__, &
1083 : "CP2K was compiled with no SMEAGOL support.")
1084 : #endif
1085 : END IF
1086 0 : ! CPASSERT
1087 : IF (debug_this_module) THEN
1088 : CPASSERT(iproc >= 0 .AND. iproc < nprocs)
1089 : IF (n_packed_elements_per_proc(iproc) + nrows_local > &
1090 : siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
1091 : CALL cp__a(__SHORT_FILE__, __LINE__)
1092 : END IF
1093 : END IF
1094 :
1095 : offset_recv_mepos = offset_per_proc(iproc) + n_packed_elements_per_proc(iproc)
1096 : sm_block(1:nrows_local, icol_local) = recv_buffer(offset_recv_mepos + 1:offset_recv_mepos + nrows_local)
1097 :
1098 : n_packed_elements_per_proc(iproc) = n_packed_elements_per_proc(iproc) + nrows_local
1099 0 : END DO
1100 0 : END IF
1101 : END IF
1102 0 : END DO
1103 :
1104 : IF (debug_this_module) THEN
1105 : DO iproc = 0, nprocs - 1
1106 : IF (n_packed_elements_per_proc(iproc) /= siesta_struct%nelements_per_proc(iproc, nelements_dbcsr_send)) THEN
1107 : CALL cp__a(__SHORT_FILE__, __LINE__)
1108 : END IF
1109 : END DO
1110 : END IF
1111 :
1112 : DEALLOCATE (recv_buffer)
1113 : END IF
1114 :
1115 : DEALLOCATE (offset_per_proc, n_packed_elements_per_proc)
1116 0 :
1117 : CALL timestop(handle)
1118 : END SUBROUTINE convert_distributed_siesta_to_dbcsr
1119 0 :
1120 : ! *** PRIVATE SUBROUTINES ***
1121 0 :
1122 0 : ! **************************************************************************************************
1123 : !> \brief Computes number of neighbour-list nodes on the current parallel process.
1124 : !> \param nnodes_local number of nodes [out]
1125 : !> \param max_ijk_cell_image_local largest index of cell images along i, j and k cell vectors
1126 : !> on this parallel process [out]
1127 : !> \param max_ijk_cell_image largest index of cell images along i, j and k cell vectors [inout]
1128 : !> \param sab_nl pair-wise neighbour list [in]
1129 : !> \param para_env MPI parallel environment [in]
1130 : !> \param particle_coords list of atomic coordinates subject to periodic boundary conditions [in]
1131 : !> \param cell simulation unit cell [in]
1132 : ! **************************************************************************************************
1133 : SUBROUTINE get_nnodes_local(nnodes_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, para_env, particle_coords, cell)
1134 : INTEGER, INTENT(out) :: nnodes_local
1135 : INTEGER, DIMENSION(3), INTENT(out) :: max_ijk_cell_image_local
1136 : INTEGER, DIMENSION(3), INTENT(inout) :: max_ijk_cell_image
1137 2 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
1138 : INTENT(in), POINTER :: sab_nl
1139 : TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
1140 : REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
1141 : INTENT(in) :: particle_coords
1142 : TYPE(cell_type), INTENT(in), POINTER :: cell
1143 :
1144 : CHARACTER(len=*), PARAMETER :: routineN = 'get_nnodes_local'
1145 :
1146 : INTEGER :: handle, iatom, icoord, jatom
1147 : INTEGER, DIMENSION(3) :: cell_ijk, max_ijk_cell_image_tmp
1148 : LOGICAL :: update_ncells
1149 : REAL(kind=dp), DIMENSION(3) :: r_ij
1150 : TYPE(neighbor_list_iterator_p_type), &
1151 : DIMENSION(:), POINTER :: nl_iterator
1152 :
1153 : CALL timeset(routineN, handle)
1154 :
1155 2 : update_ncells = .FALSE.
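             :       ! a negative bound along a direction means "no limit known yet": use a huge temporary bound here
             :       ! and replace it below with the largest cell-image index actually found (maximised over all processes)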
1156 : DO icoord = 1, 3 ! x, y, z
1157 2 : IF (max_ijk_cell_image(icoord) >= 0) THEN
1158 : max_ijk_cell_image_tmp(icoord) = max_ijk_cell_image(icoord)
1159 2 : ELSE
1160 8 : max_ijk_cell_image_tmp(icoord) = HUGE(max_ijk_cell_image_tmp(icoord))
1161 8 : update_ncells = .TRUE.
1162 2 : END IF
1163 : END DO
1164 4 :
1165 4 : nnodes_local = 0
1166 : max_ijk_cell_image_local(:) = 0
1167 :
1168 : CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
1169 2 : DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
1170 2 : CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, r=r_ij)
1171 : CALL get_negf_cell_ijk(cell_ijk, r_ij, r_i=particle_coords(1:3, iatom), r_j=particle_coords(1:3, jatom), cell=cell)
1172 2 : cell_ijk(1:3) = ABS(cell_ijk(1:3))
1173 7170 :
1174 7168 : IF (cell_ijk(1) <= max_ijk_cell_image_tmp(1) .AND. cell_ijk(2) <= max_ijk_cell_image_tmp(2) .AND. &
1175 7168 : cell_ijk(3) <= max_ijk_cell_image_tmp(3)) THEN
1176 28672 : nnodes_local = nnodes_local + 1
1177 : max_ijk_cell_image_local(1:3) = MAX(max_ijk_cell_image_local(1:3), cell_ijk(1:3))
1178 7168 : END IF
1179 2 : END DO
1180 7040 : CALL neighbor_list_iterator_release(nl_iterator)
1181 28160 :
1182 : IF (update_ncells) THEN
1183 : max_ijk_cell_image_tmp(1:3) = max_ijk_cell_image_local(1:3)
1184 2 : CALL para_env%max(max_ijk_cell_image_tmp)
1185 : DO icoord = 1, 3
1186 2 : IF (max_ijk_cell_image(icoord) < 0) THEN
1187 2 : max_ijk_cell_image(icoord) = max_ijk_cell_image_tmp(icoord)
1188 2 : END IF
1189 8 : END DO
1190 8 : END IF
1191 4 :
1192 : CALL timestop(handle)
1193 : END SUBROUTINE get_nnodes_local
1194 :
1195 : ! **************************************************************************************************
1196 2 : !> \brief Construct list of neighbour-list's nodes on the current parallel process.
1197 2 : !> \brief Construct the list of neighbour-list nodes on the current parallel process.
1198 : !> \param max_ijk_cell_image_local largest index of cell images along i, j and k cell vectors
1199 : !> on this parallel process [in]
1200 : !> \param max_ijk_cell_image largest index of cell images along i, j and k cell vectors [in]
1201 : !> \param sab_nl pair-wise neighbour list [in]
1202 : !> \param particle_coords list of atomic coordinates subject to periodic boundary conditions [in]
1203 : !> \param cell simulation unit cell [in]
1204 : !> \param cell_to_index array to convert 3-D cell indices to 1-D DBCSR image indices
1205 : !> \param cell_to_index array to convert 3-D cell indices to 1-D DBCSR image indices [in]
1206 : !> \param node_merged_indices nodes-related indices. Nodes with identical indices will be merged [out]
1207 : !> \param node_merged_indices merge indices of the nodes. Nodes with identical indices will be merged [out]
1208 : !> be merged have the same 'node_merged_indices' but different 'k_cells' indices [out]
1209 : ! **************************************************************************************************
1210 : SUBROUTINE get_nl_nodes_local(nl_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, particle_coords, &
1211 : cell, cell_to_index, do_merge, node_merged_indices, k_cells)
1212 : INTEGER, DIMENSION(:, :), INTENT(out) :: nl_local
1213 : INTEGER, DIMENSION(3), INTENT(in) :: max_ijk_cell_image_local, &
1214 2 : max_ijk_cell_image
1215 2 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
1216 : INTENT(in), POINTER :: sab_nl
1217 : REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
1218 : INTENT(in) :: particle_coords
1219 : TYPE(cell_type), INTENT(in), POINTER :: cell
1220 : INTEGER, DIMENSION(:, :, :), INTENT(in), POINTER :: cell_to_index
1221 : LOGICAL, INTENT(in) :: do_merge
1222 : INTEGER(kind=int_8), DIMENSION(:), INTENT(out) :: node_merged_indices
1223 : INTEGER, DIMENSION(:), INTENT(out) :: k_cells
1224 :
1225 : CHARACTER(len=*), PARAMETER :: routineN = 'get_nl_nodes_local'
1226 :
1227 : INTEGER :: handle, iatom, icol_blk, image, inode, &
1228 : irow_blk, jatom, natoms
1229 : INTEGER(kind=int_8), DIMENSION(2) :: ncells_siesta_local
1230 : INTEGER, DIMENSION(2) :: ncells_siesta
1231 : INTEGER, DIMENSION(3) :: cell_ijk_abs, cell_ijk_dbcsr, &
1232 : cell_ijk_siesta
1233 : LOGICAL :: do_symmetric
1234 : REAL(kind=dp), DIMENSION(3) :: r_ij
1235 : TYPE(neighbor_list_iterator_p_type), &
1236 : DIMENSION(:), POINTER :: nl_iterator
1237 :
1238 : CALL timeset(routineN, handle)
1239 : ! natoms only used to compute a merged 1D index of each DBCSR block
1240 2 : natoms = SIZE(particle_coords, 2)
1241 : CALL get_neighbor_list_set_p(neighbor_list_sets=sab_nl, symmetric=do_symmetric)
1242 2 : ncells_siesta(1:2) = 2*max_ijk_cell_image(1:2) + 1
1243 :
1244 2 : ncells_siesta_local(1:2) = INT(2*max_ijk_cell_image_local(1:2) + 1, kind=int_8)
1245 2 :
1246 6 : inode = 0
1247 : CALL neighbor_list_iterator_create(nl_iterator, sab_nl)
1248 6 : DO WHILE (neighbor_list_iterate(nl_iterator) == 0)
1249 : CALL get_iterator_info(nl_iterator, iatom=iatom, jatom=jatom, cell=cell_ijk_dbcsr, r=r_ij)
1250 2 : CALL get_negf_cell_ijk(cell_ijk_abs, r_ij, r_i=particle_coords(1:3, iatom), r_j=particle_coords(1:3, jatom), cell=cell)
1251 2 :
1252 7170 : IF (ABS(cell_ijk_abs(1)) <= max_ijk_cell_image(1) .AND. ABS(cell_ijk_abs(2)) <= max_ijk_cell_image(2) .AND. &
1253 7168 : ABS(cell_ijk_abs(3)) <= max_ijk_cell_image(3)) THEN
1254 7168 :
1255 : inode = inode + 1
1256 7168 :
1257 2 : image = get_index_by_cell(cell_ijk_dbcsr, cell_to_index)
1258 : CPASSERT(image > 0)
1259 7040 : nl_local(neighbor_list_dbcsr_image_index, inode) = image
1260 :
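             :             ! for symmetric neighbour lists only pairs with iatom <= jatom correspond to stored DBCSR blocks;
             :             ! swap the atomic indices and invert the cell image to address the stored (upper-triangular) block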
1261 7040 : IF (do_symmetric .AND. iatom > jatom) THEN
1262 7040 : irow_blk = jatom
1263 7040 : icol_blk = iatom
1264 : cell_ijk_abs(1:3) = -cell_ijk_abs(1:3)
1265 7040 : ELSE
1266 12672 : irow_blk = iatom
1267 12672 : icol_blk = jatom
1268 12672 : END IF
1269 :
1270 : nl_local(neighbor_list_iatom_index, inode) = irow_blk
1271 : nl_local(neighbor_list_jatom_index, inode) = icol_blk
1272 :
1273 : cell_ijk_siesta(1:3) = index_in_canonical_enumeration(cell_ijk_abs(1:3)) ! absolute -> SIESTA
1274 7040 :
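             :             ! node_merged_indices packs (cell image, column block, row block) into a single 64-bit merge key;
             :             ! when do_merge is set the key omits the image index along the transport (third) direction, so
             :             ! that all images along that direction obtain the same key and collapse into one merged node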
1275 7040 : IF (do_merge) THEN
1276 : node_merged_indices(inode) = (((cell_ijk_siesta(2) - 1)*ncells_siesta_local(1) + cell_ijk_siesta(1) - 1)* &
1277 28160 : INT(natoms, kind=int_8) + icol_blk - 1)*INT(natoms, kind=int_8) + INT(irow_blk - 1, kind=int_8)
1278 : image = cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1)
1279 7040 : ELSE
1280 : node_merged_indices(inode) = ((((cell_ijk_siesta(3) - 1)*ncells_siesta_local(2) + &
1281 0 : cell_ijk_siesta(2) - 1)*ncells_siesta_local(1) + cell_ijk_siesta(1) - 1)* &
1282 0 : INT(natoms, kind=int_8) + icol_blk - 1)*INT(natoms, kind=int_8) + INT(irow_blk - 1, kind=int_8)
1283 : image = cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1 + ncells_siesta(2)*(cell_ijk_siesta(3) - 1))
1284 : END IF
1285 : k_cells(inode) = cell_ijk_siesta(3)
1286 7040 : nl_local(neighbor_list_siesta_image_index, inode) = image
1287 7040 :
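             :             ! for symmetric matrices an off-diagonal block also contributes its transpose; store the SIESTA
             :             ! image index of the transposed block (0 means that no transposed copy is needed)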
1288 : IF (do_symmetric .AND. irow_blk /= icol_blk) THEN
1289 7040 : cell_ijk_abs(1:3) = -cell_ijk_abs(1:3)
1290 7040 : cell_ijk_siesta(1:3) = index_in_canonical_enumeration(cell_ijk_abs(1:3)) ! absolute -> SIESTA
1291 : IF (do_merge) cell_ijk_siesta(3) = 1
1292 7040 : nl_local(neighbor_list_siesta_transp_image_index, inode) = &
1293 25728 : cell_ijk_siesta(1) + ncells_siesta(1)*(cell_ijk_siesta(2) - 1 + ncells_siesta(2)*(cell_ijk_siesta(3) - 1))
1294 25728 : ELSE
1295 6432 : nl_local(neighbor_list_siesta_transp_image_index, inode) = 0
1296 : END IF
1297 6432 : END IF
1298 : END DO
1299 608 : CALL neighbor_list_iterator_release(nl_iterator)
1300 :
1301 : IF (debug_this_module) THEN
1302 : CPASSERT(SIZE(nl_local, 2) == inode)
1303 2 : END IF
1304 :
1305 : CALL timestop(handle)
1306 : END SUBROUTINE get_nl_nodes_local
1307 :
1308 : ! **************************************************************************************************
1309 2 : !> \brief Replicate (and optionally merge) pair-wise neighbour list.
1310 2 : !> \param repl_nl replicated neighbour list. It needs to be deallocated elsewhere [allocated]
1311 : !> \param n_dbcsr_cell_images_to_merge number of merged blocks per neighbour-list node [allocated]
1312 : !> \param dbcsr_cell_image_to_merge list of DBCSR image indices to merge [allocated]
1313 : !> \param nnodes_per_proc number of merged nodes on each parallel process [out]
1314 : !> \param max_ijk_cell_image largest index of cell images along i, j and k cell vectors [inout]
1315 : !> \param sab_nl pair-wise neighbour list [in]
1316 : !> \param para_env MPI parallel environment [in]
1317 : !> \param particle_coords list of atomic coordinates subject to periodic boundary conditions [in]
1318 : !> \param cell simulation unit cell [in]
1319 : !> \param cell_to_index array to convert 3-D cell indices to 1-D DBCSR image indices [in]
1320 : !> \param do_merge merge cell images along transport direction [in]
1321 : ! **************************************************************************************************
1322 : SUBROUTINE replicate_neighbour_list(repl_nl, n_dbcsr_cell_images_to_merge, dbcsr_cell_image_to_merge, &
1323 : nnodes_per_proc, max_ijk_cell_image, sab_nl, para_env, particle_coords, &
1324 : cell, cell_to_index, do_merge)
1325 : INTEGER, ALLOCATABLE, DIMENSION(:, :), &
1326 2 : INTENT(inout) :: repl_nl
1327 2 : INTEGER, ALLOCATABLE, DIMENSION(:), INTENT(inout) :: n_dbcsr_cell_images_to_merge, &
1328 : dbcsr_cell_image_to_merge
1329 : INTEGER, DIMENSION(0:), INTENT(out) :: nnodes_per_proc
1330 : INTEGER, DIMENSION(3), INTENT(inout) :: max_ijk_cell_image
1331 : TYPE(neighbor_list_set_p_type), DIMENSION(:), &
1332 : INTENT(in), POINTER :: sab_nl
1333 : TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
1334 : REAL(kind=dp), CONTIGUOUS, DIMENSION(:, :), &
1335 : INTENT(in) :: particle_coords
1336 : TYPE(cell_type), INTENT(in), POINTER :: cell
1337 : INTEGER, DIMENSION(:, :, :), INTENT(in), POINTER :: cell_to_index
1338 : LOGICAL, INTENT(in) :: do_merge
1339 :
1340 : CHARACTER(len=*), PARAMETER :: routineN = 'replicate_neighbour_list'
1341 :
1342 : INTEGER :: handle, inode, iproc, kcell_closest, &
1343 : nnodes_local, nnodes_merged, &
1344 : nnodes_repl, offset_inode
1345 : INTEGER(kind=int_8), ALLOCATABLE, DIMENSION(:) :: node_merged_indices
1346 : INTEGER, ALLOCATABLE, DIMENSION(:) :: inodes_orig, k_cells
1347 : INTEGER, ALLOCATABLE, DIMENSION(:, :) :: nl_local
1348 : INTEGER, DIMENSION(3) :: max_ijk_cell_image_local
1349 2 :
1350 2 : CALL timeset(routineN, handle)
1351 2 : CPASSERT(.NOT. ALLOCATED(repl_nl))
1352 : CPASSERT(.NOT. ALLOCATED(n_dbcsr_cell_images_to_merge))
1353 : CPASSERT(.NOT. ALLOCATED(dbcsr_cell_image_to_merge))
1354 2 :
1355 2 : CALL get_nnodes_local(nnodes_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, para_env, particle_coords, cell)
1356 2 :
1357 2 : nnodes_per_proc(:) = 0
1358 :
1359 2 : IF (nnodes_local > 0) THEN
1360 : ALLOCATE (nl_local(neighbor_list_dim1, nnodes_local))
1361 6 : ALLOCATE (node_merged_indices(nnodes_local))
1362 : ALLOCATE (k_cells(nnodes_local))
1363 2 : CALL get_nl_nodes_local(nl_local, max_ijk_cell_image_local, max_ijk_cell_image, sab_nl, particle_coords, cell, &
1364 6 : cell_to_index, do_merge, node_merged_indices, k_cells)
1365 6 :
1366 6 : ALLOCATE (inodes_orig(nnodes_local))
1367 : CALL sort(node_merged_indices, nnodes_local, inodes_orig)
1368 2 :
1369 : nnodes_merged = 1
1370 4 : DO inode = 2, nnodes_local
1371 2 : IF (node_merged_indices(inode) > node_merged_indices(inode - 1)) nnodes_merged = nnodes_merged + 1
1372 : END DO
1373 2 : ELSE
1374 7040 : nnodes_merged = 0
1375 7040 : END IF
1376 :
1377 : nnodes_per_proc(para_env%mepos) = nnodes_merged
1378 : CALL para_env%sum(nnodes_per_proc)
1379 :
1380 : nnodes_repl = SUM(nnodes_per_proc(:))
1381 2 : ALLOCATE (repl_nl(neighbor_list_dim1, nnodes_repl))
1382 10 :
1383 : IF (nnodes_local > 0) THEN
1384 6 : IF (para_env%mepos > 0) THEN
1385 6 : offset_inode = SUM(nnodes_per_proc(0:para_env%mepos - 1))
1386 : ELSE
1387 2 : offset_inode = 0
1388 2 : END IF
1389 2 :
1390 : ALLOCATE (n_dbcsr_cell_images_to_merge(nnodes_merged))
1391 : ALLOCATE (dbcsr_cell_image_to_merge(nnodes_local))
1392 : n_dbcsr_cell_images_to_merge(:) = 0
1393 :
1394 6 : nnodes_merged = 1 !offset_inode + 1
1395 6 : repl_nl(:, offset_inode + 1) = nl_local(:, inodes_orig(1))
1396 7042 : n_dbcsr_cell_images_to_merge(1) = 1
1397 : dbcsr_cell_image_to_merge(1) = nl_local(neighbor_list_dbcsr_image_index, inodes_orig(1))
1398 2 : kcell_closest = k_cells(inodes_orig(1))
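             :          ! nodes are sorted by their merge key; consecutive nodes sharing the same key are collapsed into
             :          ! one merged node, and the copy kept in repl_nl is the one whose cell image along the transport
             :          ! direction is closest to the primary unit cell (the positive image is preferred on ties)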
1399 12 : DO inode = 2, nnodes_local
1400 2 : IF (node_merged_indices(inode) > node_merged_indices(inode - 1)) THEN
1401 2 : nnodes_merged = nnodes_merged + 1
1402 2 : repl_nl(:, offset_inode + nnodes_merged) = nl_local(:, inodes_orig(inode))
1403 7040 : !n_dbcsr_cell_images_to_merge(nnodes_merged) = 1
1404 7038 : kcell_closest = k_cells(inodes_orig(inode))
1405 7038 : ELSE
1406 42228 : IF (ABS(k_cells(inodes_orig(inode))) < ABS(kcell_closest) .OR. &
1407 : (ABS(k_cells(inodes_orig(inode))) == ABS(kcell_closest) .AND. kcell_closest < 0)) THEN
1408 7038 : repl_nl(:, offset_inode + nnodes_merged) = nl_local(:, inodes_orig(inode))
1409 : kcell_closest = k_cells(inodes_orig(inode))
1410 0 : END IF
1411 0 : END IF
1412 0 : dbcsr_cell_image_to_merge(inode) = nl_local(neighbor_list_dbcsr_image_index, inodes_orig(inode))
1413 0 : n_dbcsr_cell_images_to_merge(nnodes_merged) = n_dbcsr_cell_images_to_merge(nnodes_merged) + 1
1414 : END DO
1415 :
1416 7038 : IF (debug_this_module) THEN
1417 7040 : CPASSERT(SUM(n_dbcsr_cell_images_to_merge) == nnodes_local)
1418 : END IF
1419 :
1420 : DEALLOCATE (inodes_orig)
1421 : DEALLOCATE (node_merged_indices, k_cells)
1422 : DEALLOCATE (nl_local)
1423 : END IF
1424 2 :
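             :       ! make the merged neighbour list identical on all processes: every process broadcasts the slice of
             :       ! repl_nl that it has just assembled locally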
1425 2 : IF (para_env%num_pe > 1) THEN
1426 2 : offset_inode = 0
1427 : DO iproc = 0, para_env%num_pe - 1
1428 : IF (nnodes_per_proc(iproc) > 0) THEN
1429 2 : CALL para_env%bcast(repl_nl(:, offset_inode + 1:offset_inode + nnodes_per_proc(iproc)), iproc)
1430 2 : offset_inode = offset_inode + nnodes_per_proc(iproc)
1431 6 : END IF
1432 6 : END DO
1433 4 : END IF
1434 4 :
1435 : CALL timestop(handle)
1436 : END SUBROUTINE replicate_neighbour_list
1437 :
1438 : ! **************************************************************************************************
1439 2 : !> \brief Count number of DBCSR matrix elements that should be received from (*,nelements_dbcsr_recv)
1440 4 : !> and sent to (*,nelements_dbcsr_send) each parallel process.
1441 : !> \param nelements_per_proc number of non-zero matrix elements for each MPI process
1442 : !> \param nnodes_per_proc number of non-zero DBCSR matrix blocks (neighbour-list nodes)
1443 : !> \param nl_repl replicated neighbour list
1444 : !> \param matrix_dbcsr_kp DBCSR matrix
1445 : !> \param symmetric whether the DBCSR matrix is a symmetric one
1446 : !> \param para_env parallel environment
1447 : !> \param gather_root if >=0, gather all non-zero matrix element on the MPI process with
1448 : !> \param gather_root if >=0, gather all non-zero matrix elements on the MPI process with
1449 : !> If <0, distribute non-zero matrix element across all MPI processes
1450 : !> If <0, distribute non-zero matrix elements across all MPI processes
1451 : SUBROUTINE count_remote_dbcsr_elements(nelements_per_proc, nnodes_per_proc, nl_repl, matrix_dbcsr_kp, &
1452 : symmetric, para_env, gather_root)
1453 : INTEGER(kind=int_8), DIMENSION(0:, :), INTENT(out) :: nelements_per_proc
1454 : INTEGER, DIMENSION(0:), INTENT(in) :: nnodes_per_proc
1455 2 : INTEGER, DIMENSION(:, :), INTENT(in) :: nl_repl
1456 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
1457 : LOGICAL, INTENT(in) :: symmetric
1458 : TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
1459 : INTEGER, INTENT(in) :: gather_root
1460 :
1461 : CHARACTER(len=*), PARAMETER :: routineN = 'count_remote_dbcsr_elements'
1462 :
1463 : INTEGER :: first_row_minus_one, handle, icol_blk, image, image_transp, inode, inode_proc, &
1464 : iproc, iproc_orb, irow_blk, irow_local, mepos, ncols_blk, ncols_local, nnodes_proc, &
1465 : nprocs, nrows_blk, nrows_local, offset_inode
1466 : INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size, &
1467 : row_blk_offset, row_blk_size
1468 : LOGICAL :: do_distribute
1469 :
1470 2 : CALL timeset(routineN, handle)
1471 2 : nelements_per_proc(:, :) = 0
1472 : mepos = para_env%mepos
1473 : nprocs = para_env%num_pe
1474 2 : do_distribute = gather_root < 0
1475 14 : IF (debug_this_module) THEN
1476 2 : CPASSERT(SIZE(nnodes_per_proc) == nprocs)
1477 2 : END IF
1478 2 :
1479 : CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
1480 : nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
1481 : row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
1482 : row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
1483 :
1484 : offset_inode = 0
1485 : iproc_orb = gather_root ! if do_distribute == .FALSE., send all matrix elements to MPI process with rank gather_root
1486 2 : DO iproc = LBOUND(nnodes_per_proc, 1), UBOUND(nnodes_per_proc, 1)
1487 : nnodes_proc = nnodes_per_proc(iproc)
1488 2 : DO inode_proc = 1, nnodes_proc
1489 2 : inode = inode_proc + offset_inode
1490 8 :
1491 4 : irow_blk = nl_repl(neighbor_list_iatom_index, inode)
1492 14084 : icol_blk = nl_repl(neighbor_list_jatom_index, inode)
1493 14080 : CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
1494 : image = nl_repl(neighbor_list_siesta_image_index, inode)
1495 14080 : image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
1496 14080 :
1497 14080 : IF (image > 0) THEN
1498 14080 : nrows_local = row_blk_size(irow_blk)
1499 14080 : first_row_minus_one = row_blk_offset(irow_blk) - 1
1500 : ncols_local = col_blk_size(icol_blk)
1501 14080 : DO irow_local = 1, nrows_local
1502 14080 : IF (do_distribute) THEN
1503 14080 : #if defined(__SMEAGOL)
1504 14080 : CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc_orb)
1505 140800 : #else
1506 126720 : CALL cp_abort(__LOCATION__, &
1507 : "CP2K was compiled with no SMEAGOL support.")
1508 : #endif
1509 : END IF
1510 : IF (iproc_orb == mepos) THEN
1511 0 : nelements_per_proc(iproc, nelements_dbcsr_recv) = nelements_per_proc(iproc, nelements_dbcsr_recv) + &
1512 : ncols_local
1513 : END IF
1514 126720 :
1515 : IF (iproc == mepos) THEN
1516 63360 : nelements_per_proc(iproc_orb, nelements_dbcsr_send) = nelements_per_proc(iproc_orb, nelements_dbcsr_send) + &
1517 : ncols_local
1518 : END IF
1519 140800 : END DO
1520 : END IF
1521 63360 :
1522 : ! transposed block
1523 : IF (image_transp > 0) THEN
1524 : nrows_local = col_blk_size(icol_blk)
1525 : first_row_minus_one = col_blk_offset(icol_blk) - 1
1526 : ncols_local = row_blk_size(irow_blk)
1527 14084 : DO irow_local = 1, nrows_local
1528 12864 : IF (do_distribute) THEN
1529 12864 : #if defined(__SMEAGOL)
1530 12864 : CALL WhichNodeOrb(irow_local + first_row_minus_one, nprocs, iproc_orb)
1531 128640 : #else
1532 115776 : CALL cp_abort(__LOCATION__, &
1533 : "CP2K was compiled with no SMEAGOL support.")
1534 : #endif
1535 : END IF
1536 : IF (iproc_orb == mepos) THEN
1537 0 : nelements_per_proc(iproc, nelements_dbcsr_recv) = nelements_per_proc(iproc, nelements_dbcsr_recv) + &
1538 : ncols_local
1539 : END IF
1540 115776 :
1541 : IF (iproc == mepos) THEN
1542 57888 : nelements_per_proc(iproc_orb, nelements_dbcsr_send) = nelements_per_proc(iproc_orb, nelements_dbcsr_send) + &
1543 : ncols_local
1544 : END IF
1545 128640 : END DO
1546 : END IF
1547 57888 : END DO
1548 : offset_inode = offset_inode + nnodes_proc
1549 : END DO
1550 : CALL timestop(handle)
1551 : END SUBROUTINE count_remote_dbcsr_elements
1552 6 :
1553 : ! **************************************************************************************************
1554 2 : !> \brief Construct list of non-zero matrix elements' indices in SIESTA format.
1555 2 : !> \param n_nonzero_cols number of non-zero matrix elements on each matrix row local to the current
1556 : !> MPI process
1557 : !> \param row_offset offset of the first non-zero matrix elements for each locally-stores row
1558 : !> \param row_offset offset of the first non-zero matrix element for each locally-stored row
1559 : !> \param col_index sorted list of column indices of non-zero matrix elements
1560 : !> \param nl_repl replicated neighbour list
1561 : !> \param matrix_dbcsr_kp DBCSR matrix
1562 : !> \param symmetric whether the DBCSR matrix is a symmetric one
1563 : !> \param para_env parallel environment
1564 : !> \param gather_root if >=0, gather all non-zero matrix element on the MPI process with
1565 : !> \param gather_root if >=0, gather all non-zero matrix elements on the MPI process with
1566 : !> If <0, distribute non-zero matrix element across all MPI processes
1567 : !> If <0, distribute non-zero matrix elements across all MPI processes
1568 : SUBROUTINE get_nonzero_element_indices(n_nonzero_cols, row_offset, col_index, packed_index, &
1569 : nl_repl, matrix_dbcsr_kp, symmetric, para_env, gather_root)
1570 : INTEGER, DIMENSION(:), INTENT(out) :: n_nonzero_cols, row_offset, col_index, &
1571 : packed_index
1572 4 : INTEGER, DIMENSION(:, :), INTENT(in) :: nl_repl
1573 2 : TYPE(dbcsr_p_type), DIMENSION(:), INTENT(in) :: matrix_dbcsr_kp
1574 : LOGICAL, INTENT(in) :: symmetric
1575 : TYPE(mp_para_env_type), INTENT(in), POINTER :: para_env
1576 : INTEGER, INTENT(in) :: gather_root
1577 :
1578 : CHARACTER(len=*), PARAMETER :: routineN = 'get_nonzero_element_indices'
1579 :
1580 : INTEGER :: first_col_minus_one, first_row_minus_one, handle, icol_blk, icol_local, &
1581 : icol_offset, image, image_transp, inode, irow_blk, irow_local, irow_proc, mepos, &
1582 : ncols_blk, ncols_local, ncols_total, nnodes, nprocs, nrows_blk, nrows_local, nrows_total
1583 : INTEGER, DIMENSION(:), POINTER :: col_blk_offset, col_blk_size, &
1584 : row_blk_offset, row_blk_size
1585 : LOGICAL :: do_distribute, is_root_rank
1586 :
1587 2 : CALL timeset(routineN, handle)
1588 2 : n_nonzero_cols(:) = 0
1589 : mepos = para_env%mepos
1590 : nprocs = para_env%num_pe
1591 2 : do_distribute = gather_root < 0
1592 291 : is_root_rank = gather_root == mepos
1593 2 :
1594 2 : CALL dbcsr_get_info(matrix=matrix_dbcsr_kp(1)%matrix, &
1595 2 : nblkrows_total=nrows_blk, nblkcols_total=ncols_blk, &
1596 2 : nfullrows_total=nrows_total, nfullcols_total=ncols_total, &
1597 : row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
1598 : row_blk_offset=row_blk_offset, col_blk_offset=col_blk_offset)
1599 :
1600 : nnodes = SIZE(nl_repl, 2)
1601 : DO inode = 1, nnodes
1602 2 : irow_blk = nl_repl(neighbor_list_iatom_index, inode)
1603 : icol_blk = nl_repl(neighbor_list_jatom_index, inode)
1604 2 : CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
1605 14082 : image = nl_repl(neighbor_list_siesta_image_index, inode)
1606 14080 : image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
1607 14080 :
1608 14080 : IF (image > 0) THEN
1609 14080 : nrows_local = row_blk_size(irow_blk)
1610 14080 : first_row_minus_one = row_blk_offset(irow_blk) - 1
1611 : ncols_local = col_blk_size(icol_blk)
1612 14080 : DO irow_local = 1, nrows_local
1613 14080 : IF (do_distribute) THEN
1614 14080 : #if defined(__SMEAGOL)
1615 14080 : CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
1616 140800 : #else
1617 126720 : CALL cp_abort(__LOCATION__, &
1618 : "CP2K was compiled with no SMEAGOL support.")
1619 : #endif
1620 : ELSE
1621 : IF (is_root_rank) THEN
1622 0 : irow_proc = irow_local + first_row_minus_one
1623 : ELSE
1624 : irow_proc = 0
1625 126720 : END IF
1626 63360 : END IF
1627 : IF (irow_proc > 0) THEN
1628 : n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
1629 : END IF
1630 : END DO
1631 77440 : END IF
1632 63360 :
1633 : ! transposed block
1634 : IF (image_transp > 0) THEN
1635 : nrows_local = col_blk_size(icol_blk)
1636 : first_row_minus_one = col_blk_offset(icol_blk) - 1
1637 : ncols_local = row_blk_size(irow_blk)
1638 14082 : DO irow_local = 1, nrows_local
1639 12864 : IF (do_distribute) THEN
1640 12864 : #if defined(__SMEAGOL)
1641 12864 : CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
1642 128640 : #else
1643 115776 : CALL cp_abort(__LOCATION__, &
1644 : "CP2K was compiled with no SMEAGOL support.")
1645 : #endif
1646 : ELSE
1647 : IF (is_root_rank) THEN
1648 0 : irow_proc = irow_local + first_row_minus_one
1649 : ELSE
1650 : irow_proc = 0
1651 115776 : END IF
1652 57888 : END IF
1653 : IF (irow_proc > 0) THEN
1654 : n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
1655 : END IF
1656 : END DO
1657 70752 : END IF
1658 57888 : END DO
1659 :
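             :       ! row_offset holds CSR-style row pointers: an exclusive prefix sum of the per-row counts, so that
             :       ! row_offset(i) is the number of non-zero elements in all locally-stored rows preceding row i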
1660 : row_offset(1) = 0
1661 : DO irow_local = 1, SIZE(n_nonzero_cols) - 1
1662 : row_offset(irow_local + 1) = row_offset(irow_local) + n_nonzero_cols(irow_local)
1663 : END DO
1664 2 :
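             :       ! second pass: reuse n_nonzero_cols as a per-row fill counter and record the global SIESTA column
             :       ! index of every non-zero element; each row of col_index is sorted afterwards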
1665 289 : n_nonzero_cols(:) = 0
1666 289 : col_index(:) = 0
1667 : DO inode = 1, nnodes
1668 : irow_blk = nl_repl(neighbor_list_iatom_index, inode)
1669 291 : icol_blk = nl_repl(neighbor_list_jatom_index, inode)
1670 1091235 : CPASSERT(irow_blk <= icol_blk .OR. (.NOT. symmetric))
1671 14082 : image = nl_repl(neighbor_list_siesta_image_index, inode)
1672 14080 : image_transp = nl_repl(neighbor_list_siesta_transp_image_index, inode)
1673 14080 :
1674 14080 : IF (image > 0) THEN
1675 14080 : nrows_local = row_blk_size(irow_blk)
1676 14080 : first_row_minus_one = row_blk_offset(irow_blk) - 1
1677 : ncols_local = col_blk_size(icol_blk)
1678 14080 : first_col_minus_one = col_blk_offset(icol_blk) + (image - 1)*ncols_total - 1
1679 14080 : DO irow_local = 1, nrows_local
1680 14080 : IF (do_distribute) THEN
1681 14080 : #if defined(__SMEAGOL)
1682 14080 : CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
1683 140800 : #else
1684 126720 : CALL cp_abort(__LOCATION__, &
1685 : "CP2K was compiled with no SMEAGOL support.")
1686 : #endif
1687 : ELSE
1688 : IF (is_root_rank) THEN
1689 0 : irow_proc = irow_local + first_row_minus_one
1690 : ELSE
1691 : irow_proc = 0
1692 126720 : END IF
1693 63360 : END IF
1694 : IF (irow_proc > 0) THEN
1695 : icol_offset = row_offset(irow_proc) + n_nonzero_cols(irow_proc)
1696 : DO icol_local = 1, ncols_local
1697 : col_index(icol_offset + icol_local) = first_col_minus_one + icol_local
1698 77440 : END DO
1699 63360 : n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
1700 633600 : END IF
1701 633600 : END DO
1702 : END IF
1703 63360 :
1704 : ! transposed block
1705 : IF (image_transp > 0) THEN
1706 : nrows_local = col_blk_size(icol_blk)
1707 : first_row_minus_one = col_blk_offset(icol_blk) - 1
1708 : ncols_local = row_blk_size(irow_blk)
1709 14082 : first_col_minus_one = row_blk_offset(irow_blk) + (image_transp - 1)*nrows_total - 1
1710 12864 : DO irow_local = 1, nrows_local
1711 12864 : IF (do_distribute) THEN
1712 12864 : #if defined(__SMEAGOL)
1713 12864 : CALL GlobalToLocalOrb(irow_local + first_row_minus_one, mepos, nprocs, irow_proc)
1714 128640 : #else
1715 115776 : CALL cp_abort(__LOCATION__, &
1716 : "CP2K was compiled with no SMEAGOL support.")
1717 : #endif
1718 : ELSE
1719 : IF (is_root_rank) THEN
1720 0 : irow_proc = irow_local + first_row_minus_one
1721 : ELSE
1722 : irow_proc = 0
1723 115776 : END IF
1724 57888 : END IF
1725 : IF (irow_proc > 0) THEN
1726 : icol_offset = row_offset(irow_proc) + n_nonzero_cols(irow_proc)
1727 : DO icol_local = 1, ncols_local
1728 : col_index(icol_offset + icol_local) = first_col_minus_one + icol_local
1729 70752 : END DO
1730 57888 : n_nonzero_cols(irow_proc) = n_nonzero_cols(irow_proc) + ncols_local
1731 578880 : END IF
1732 578880 : END DO
1733 : END IF
1734 57888 : END DO
1735 :
1736 : IF (SIZE(n_nonzero_cols) > 0) THEN
1737 : DO irow_local = 1, SIZE(n_nonzero_cols)
1738 : CALL sort(col_index(row_offset(irow_local) + 1:row_offset(irow_local) + n_nonzero_cols(irow_local)), &
1739 : n_nonzero_cols(irow_local), &
1740 2 : packed_index(row_offset(irow_local) + 1:row_offset(irow_local) + n_nonzero_cols(irow_local)))
1741 291 : END DO
1742 : END IF
1743 :
1744 291 : CALL timestop(handle)
1745 : END SUBROUTINE get_nonzero_element_indices
1746 :
1747 : ! **************************************************************************************************
1748 2 : !> \brief Get absolute i, j, and k indices of cell image for the given DBCSR matrix block.
1749 2 : !> Effective cell image recorded in the neighbour-list (where the matrix block is actually
1750 : !> stored) depends on atomic coordinates can be significantly different.
1751 : !> stored) depends on the atomic coordinates and can be significantly different.
1752 : !> \param r_ij actual interatomic distance vector (R_j - r_i), where R_j are the coordinates
1753 : !> of the j-th atom in the supercell
1754 : !> \param r_i coordinates of the i-th atom in the primary unit cell
1755 : !> \param r_j coordinates of the j-th atom in the primary unit cell
1756 : !> \param cell unit cell
1757 : ! **************************************************************************************************
1758 : SUBROUTINE get_negf_cell_ijk(cell_ijk, r_ij, r_i, r_j, cell)
1759 : INTEGER, DIMENSION(3), INTENT(out) :: cell_ijk
1760 : REAL(kind=dp), DIMENSION(3), INTENT(in) :: r_ij, r_i, r_j
1761 : TYPE(cell_type), INTENT(in), POINTER :: cell
1762 14336 :
1763 : REAL(kind=dp), DIMENSION(3) :: coords_scaled, r
1764 :
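             :       ! r_ij is the actual interatomic vector; adding r_i - r_j (both wrapped into the primary unit cell)
             :       ! leaves a pure lattice translation whose scaled coordinates are (nearly) integer cell indices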
1765 : r(:) = r_ij(:) + r_i(:) - r_j(:)
1766 : CALL real_to_scaled(coords_scaled, r, cell)
1767 : cell_ijk(:) = NINT(coords_scaled(:))
1768 : END SUBROUTINE get_negf_cell_ijk
1769 57344 :
1770 14336 : ! **************************************************************************************************
1771 57344 : !> \brief Return the index of an integer number in the sequence 0, 1, -1, ..., n, -n, ...
1772 14336 : !> (canonical enumeration of integers, oeis.org/A001057).
1773 : !> \param inum integer number [in]
1774 : !> \return index of 'inum' in A001057
1775 : !> \note Cell images in SMEAGOL / SIESTA are ordered according to A001057. Therefore this
1776 : !> function converts the absolute index of a cell replica along some (x/y/z) dimension
1777 : !> into its corresponding SIESTA's index.
1778 : ! **************************************************************************************************
1779 : ELEMENTAL FUNCTION index_in_canonical_enumeration(inum) RESULT(ind)
1780 : INTEGER, INTENT(in) :: inum
1781 : INTEGER :: ind
1782 :
1783 40416 : INTEGER :: inum_abs, is_non_positive
1784 :
1785 : inum_abs = ABS(inum)
1786 : !IF (inum <= 0) THEN; is_non_positive = 1; ELSE; is_non_positive = 0; END IF
1787 : is_non_positive = MERGE(1, 0, inum <= 0)
1788 :
1789 40416 : ! inum = 0 -> inum_abs = 0, is_non_positive = 1 -> ind = 1
1790 : ! inum = 1 -> inum_abs = 1, is_non_positive = 0 -> ind = 2
1791 40416 : ! inum = -1 -> inum_abs = 1, is_non_positive = 1 -> ind = 3
1792 : ind = 2*inum_abs + is_non_positive
1793 : END FUNCTION index_in_canonical_enumeration
1794 :
1795 : ! **************************************************************************************************
1796 40416 : !> \brief Return an integer number according to its index in the sequence 0, 1, -1, ..., n, -n, ...
1797 40416 : !> (canonical enumeration of integers, oeis.org/A001057)
1798 : !> \param ind index in A001057 starting from 1
1799 : !> \return integer number according to its position 'ind' in A001057
1800 : !> \note Cell images in SMEAGOL / SIESTA are ordered according to A001057. Therefore this
1801 : !> function converts SIESTA's index of a cell replica along some (x/y/z) dimension
1802 : !> into the corresponding absolute index.
1803 : ! **************************************************************************************************
1804 : ELEMENTAL FUNCTION number_from_canonical_enumeration(ind) RESULT(inum)
1805 : INTEGER, INTENT(in) :: ind
1806 : INTEGER :: inum
1807 :
1808 186 : ! ind < 1 is invalid
1809 : ! ind = 1 -> SIGN(0, -1) = 0
1810 : ! ind = 2 -> SIGN(1, 0) = 1
1811 : ! ind = 3 -> SIGN(1, -1) = -1
1812 : inum = SIGN(ind/2, -MOD(ind, 2))
1813 : END FUNCTION number_from_canonical_enumeration
1814 :
1815 : ! **************************************************************************************************
1816 186 : !> \brief Apply periodic boundary conditions defined by a simulation cell to a position
1817 186 : !> vector r. Similar to pbc1 from cell_types.F but returns unscaled coordinates from
1818 : !> the scaled range [0, 1) instead of [-0.5, 0.5)
1819 : !> \param r_pbc position vector subject to the periodic boundary conditions [out]
1820 : !> \param r initial position vector [in]
1821 : !> \param cell simulation unit cell [in]
1822 : ! **************************************************************************************************
1823 : PURE SUBROUTINE pbc_0_1(r_pbc, r, cell)
1824 : REAL(KIND=dp), DIMENSION(3), INTENT(out) :: r_pbc
1825 : REAL(KIND=dp), DIMENSION(3), INTENT(in) :: r
1826 : TYPE(cell_type), INTENT(in), POINTER :: cell
1827 64 :
1828 : REAL(KIND=dp), DIMENSION(3) :: s
1829 :
1830 : IF (cell%orthorhombic) THEN
1831 : r_pbc(1) = r(1) - cell%hmat(1, 1)*cell%perd(1)*REAL(FLOOR(cell%h_inv(1, 1)*r(1)), dp)
1832 : r_pbc(2) = r(2) - cell%hmat(2, 2)*cell%perd(2)*REAL(FLOOR(cell%h_inv(2, 2)*r(2)), dp)
1833 : r_pbc(3) = r(3) - cell%hmat(3, 3)*cell%perd(3)*REAL(FLOOR(cell%h_inv(3, 3)*r(3)), dp)
1834 64 : ELSE
1835 64 : s(1) = cell%h_inv(1, 1)*r(1) + cell%h_inv(1, 2)*r(2) + cell%h_inv(1, 3)*r(3)
1836 64 : s(2) = cell%h_inv(2, 1)*r(1) + cell%h_inv(2, 2)*r(2) + cell%h_inv(2, 3)*r(3)
1837 64 : s(3) = cell%h_inv(3, 1)*r(1) + cell%h_inv(3, 2)*r(2) + cell%h_inv(3, 3)*r(3)
1838 : s(1) = s(1) - cell%perd(1)*REAL(FLOOR(s(1)), dp)
1839 0 : s(2) = s(2) - cell%perd(2)*REAL(FLOOR(s(2)), dp)
1840 0 : s(3) = s(3) - cell%perd(3)*REAL(FLOOR(s(3)), dp)
1841 0 : r_pbc(1) = cell%hmat(1, 1)*s(1) + cell%hmat(1, 2)*s(2) + cell%hmat(1, 3)*s(3)
1842 0 : r_pbc(2) = cell%hmat(2, 1)*s(1) + cell%hmat(2, 2)*s(2) + cell%hmat(2, 3)*s(3)
1843 0 : r_pbc(3) = cell%hmat(3, 1)*s(1) + cell%hmat(3, 2)*s(2) + cell%hmat(3, 3)*s(3)
1844 0 : END IF
1845 0 : END SUBROUTINE pbc_0_1
1846 0 :
1847 0 : ! **************************************************************************************************
1848 : !> \brief Computes the number of send requests from this MPI process to all the other processes.
1849 64 : !> Alternatively computes the number of recv requests per MPI process that the given process
1850 : !> expects.
1851 : !> \param mepos MPI rank of the given process
1852 : !> \param nelements_per_proc number of element to send / receive
1853 : !> \param nelements_per_proc number of elements to send / receive
1854 : !> \return number of MPI requests
1855 : ! **************************************************************************************************
1856 : PURE FUNCTION get_number_of_mpi_sendrecv_requests(mepos, nelements_per_proc, max_nelements_per_packet) RESULT(nrequests)
1857 : INTEGER, INTENT(in) :: mepos
1858 : INTEGER(kind=int_8), DIMENSION(0:), INTENT(in) :: nelements_per_proc
1859 : INTEGER(kind=int_8), INTENT(in) :: max_nelements_per_packet
1860 16 : INTEGER :: nrequests
1861 :
1862 : INTEGER :: iproc
1863 :
1864 : nrequests = 0
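             :       ! each peer contributes CEILING(nelements / max_nelements_per_packet) requests, i.e. a large
             :       ! transfer is split into packets of at most max_nelements_per_packet elements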
1865 : DO iproc = LBOUND(nelements_per_proc, 1), UBOUND(nelements_per_proc, 1)
1866 : ! there is no need to send data to the same MPI process
1867 : IF (iproc /= mepos) THEN
1868 16 : nrequests = nrequests + INT(nelements_per_proc(iproc)/max_nelements_per_packet)
1869 64 : IF (MOD(nelements_per_proc(iproc), max_nelements_per_packet) > 0) &
1870 : nrequests = nrequests + 1
1871 48 : END IF
1872 16 : END DO
1873 16 : END FUNCTION get_number_of_mpi_sendrecv_requests
1874 8 :
1875 : ! **************************************************************************************************
1876 : !> \brief Map non-zero matrix elements on to MPI requests.
1877 16 : !> \param element_offset offset (index-1) of the first element [out]
1878 : !> \param nelements_per_request number of element for each request [out]
1879 : !> \param peer_rank rank of a peering MPI process
1880 : !> \param tag MPI tag
1881 : !> \param mepos MPI rank of a given MPI process
1882 : !> \param nelements_per_proc number of element to send / receive by the current MPI process
1883 : !> \param max_nelements_per_packet maximum number of elements per single MPI request
1884 : ! **************************************************************************************************
1885 : SUBROUTINE assign_nonzero_elements_to_requests(element_offset, nelements_per_request, peer_rank, tag, &
1886 : mepos, nelements_per_proc, max_nelements_per_packet)
1887 : INTEGER(kind=int_8), DIMENSION(:), INTENT(out) :: element_offset, nelements_per_request
1888 : INTEGER, DIMENSION(:), INTENT(out) :: peer_rank, tag
1889 8 : INTEGER, INTENT(in) :: mepos
1890 8 : INTEGER(kind=int_8), DIMENSION(0:), INTENT(in) :: nelements_per_proc
1891 : INTEGER(kind=int_8), INTENT(in) :: max_nelements_per_packet
1892 :
1893 : INTEGER :: iproc, irequest, nrequests, &
1894 : request_offset
1895 : INTEGER(kind=int_8) :: element_offset_tmp, nelements
1896 :
1897 : request_offset = 0
1898 : element_offset_tmp = 0
1899 : DO iproc = LBOUND(nelements_per_proc, 1), UBOUND(nelements_per_proc, 1)
1900 : IF (iproc /= mepos) THEN
1901 8 : nrequests = INT(nelements_per_proc(iproc)/max_nelements_per_packet)
1902 8 : IF (MOD(nelements_per_proc(iproc), max_nelements_per_packet) > 0) nrequests = nrequests + 1
1903 32 : CPASSERT(nrequests <= max_mpi_rank + 1)
1904 16 : IF (nrequests > 0) THEN
1905 8 : nelements = nelements_per_proc(iproc)/nrequests
1906 8 : IF (nelements_per_proc(iproc) - nelements*nrequests > 0) nelements = nelements + 1
1907 8 : CPASSERT(nelements <= max_nelements_per_packet)
1908 8 :
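             :             ! split this peer's elements into nrequests nearly equal chunks: every request except the last
             :             ! one carries 'nelements' elements, the last one carries the remainder; the MPI tag is the
             :             ! zero-based request index, so both communication sides derive matching tags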
1909 8 : DO irequest = 1, nrequests
1910 8 : element_offset(request_offset + irequest) = (irequest - 1)*nelements + element_offset_tmp
1911 8 : IF (irequest < nrequests) THEN
1912 : nelements_per_request(request_offset + irequest) = nelements
1913 16 : ELSE
1914 8 : nelements_per_request(request_offset + irequest) = nelements_per_proc(iproc) - nelements*(nrequests - 1)
1915 8 : END IF
1916 0 : peer_rank(request_offset + irequest) = iproc
1917 : tag(request_offset + irequest) = irequest - 1
1918 8 : END DO
1919 : END IF
1920 8 : request_offset = request_offset + nrequests
1921 16 : END IF
1922 : element_offset_tmp = element_offset_tmp + nelements_per_proc(iproc)
1923 : END DO
1924 8 :
1925 : IF (debug_this_module) THEN
1926 24 : CPASSERT(SIZE(element_offset) == request_offset)
1927 : CPASSERT(SIZE(nelements_per_request) == request_offset)
1928 : CPASSERT(SIZE(peer_rank) == request_offset)
1929 : CPASSERT(SIZE(tag) == request_offset)
1930 : END IF
1931 : END SUBROUTINE assign_nonzero_elements_to_requests
1932 :
1933 : END MODULE smeagol_matrix_utils
|