Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
10 : !> \par History
11 : !> 2014.10 created [Florian Schiffmann]
12 : !> 2023.12 Removed support for single-precision [Ole Schuett]
13 : !> 2024.12 Removed support for complex input matrices [Ole Schuett]
14 : !> \author Florian Schiffmann
15 : ! **************************************************************************************************
16 : MODULE arnoldi_vector
17 : USE cp_dbcsr_api, ONLY: &
18 : dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &
19 : dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_data_p, dbcsr_get_info, &
20 : dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, dbcsr_iterator_start, &
21 : dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_release, dbcsr_set, dbcsr_type, &
22 : dbcsr_type_antisymmetric, dbcsr_type_no_symmetry, dbcsr_type_symmetric
23 : USE cp_dbcsr_contrib, ONLY: dbcsr_reserve_all_blocks
24 : USE kinds, ONLY: dp,&
25 : real_8
26 : USE message_passing, ONLY: mp_comm_type
27 : #include "../base/base_uses.f90"
28 :
29 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
30 :
31 : IMPLICIT NONE
32 :
33 : PRIVATE
34 :
35 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'arnoldi_vector'
36 :
37 : ! **************************************************************************************************
38 : !> \brief Types needed for the hashtable.
39 : ! **************************************************************************************************
40 : TYPE ele_type
41 : INTEGER :: c = 0
42 : INTEGER :: p = 0
43 : END TYPE ele_type
44 :
45 : TYPE hash_table_type
46 : TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
47 : INTEGER :: nele = 0
48 : INTEGER :: nmax = 0
49 : INTEGER :: prime = 0
50 : END TYPE hash_table_type
51 :
52 : ! **************************************************************************************************
53 : !> \brief Types needed for fast access to distributed dbcsr vectors.
54 : ! **************************************************************************************************
55 : TYPE block_ptr
56 : REAL(real_8), DIMENSION(:, :), POINTER :: ptr => NULL()
57 : INTEGER :: assigned_thread = -1
58 : END TYPE block_ptr
59 :
60 : TYPE fast_vec_access_type
61 : TYPE(hash_table_type) :: hash_table = hash_table_type()
62 : TYPE(block_ptr), DIMENSION(:), ALLOCATABLE :: blk_map
63 : END TYPE
64 :
65 : PUBLIC :: dbcsr_matrix_colvec_multiply, &
66 : create_col_vec_from_matrix, &
67 : create_row_vec_from_matrix, &
68 : create_replicated_col_vec_from_matrix, &
69 : create_replicated_row_vec_from_matrix
70 :
71 : CONTAINS
72 :
73 : ! **************************************************************************************************
74 : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
75 : !> and has the same row dist as the template matrix
76 : !> the returned matrix is fully allocated and all blocks are set to 0
77 : !> this is not a sparse object (and must never be)
78 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
79 : !> \param matrix a dbcsr matrix used as template
80 : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
81 : ! **************************************************************************************************
82 262972 : SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
83 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
84 : INTEGER :: ncol
85 :
86 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
87 :
88 : INTEGER :: handle, npcols
89 262972 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
90 262972 : row_dist
91 : TYPE(dbcsr_distribution_type) :: dist, dist_col_vec
92 :
93 262972 : CALL timeset(routineN, handle)
94 :
95 262972 : CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
96 262972 : CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
97 :
98 262972 : ALLOCATE (col_dist(1), col_blk_size(1))
99 262972 : col_dist(1) = 0
100 262972 : col_blk_size(1) = ncol
101 262972 : CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
102 :
103 : CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
104 262972 : row_blk_size=row_blk_size, col_blk_size=col_blk_size)
105 262972 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
106 :
107 262972 : CALL dbcsr_distribution_release(dist_col_vec)
108 262972 : DEALLOCATE (col_dist, col_blk_size)
109 262972 : CALL timestop(handle)
110 :
111 525944 : END SUBROUTINE create_col_vec_from_matrix
112 :
113 : ! **************************************************************************************************
114 : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
115 : !> and has the same row dist as the template matrix
116 : !> the returned matrix is fully allocated and all blocks are set to 0
117 : !> this is not a sparse object (and must never be)
118 : !> \param dbcsr_vec ...
119 : !> \param matrix a dbcsr matrix used as template
120 : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
121 : ! **************************************************************************************************
122 0 : SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
123 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
124 : INTEGER :: nrow
125 :
126 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
127 :
128 : INTEGER :: handle, nprows
129 0 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
130 0 : row_dist
131 : TYPE(dbcsr_distribution_type) :: dist, dist_row_vec
132 :
133 0 : CALL timeset(routineN, handle)
134 :
135 0 : CALL dbcsr_get_info(matrix, col_blk_size=col_blk_size, distribution=dist)
136 0 : CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
137 :
138 0 : ALLOCATE (row_dist(1), row_blk_size(1))
139 0 : row_dist(1) = 0
140 0 : row_blk_size(1) = nrow
141 0 : CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
142 :
143 : CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, matrix_type=dbcsr_type_no_symmetry, &
144 0 : row_blk_size=row_blk_size, col_blk_size=col_blk_size)
145 0 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
146 :
147 0 : CALL dbcsr_distribution_release(dist_row_vec)
148 0 : DEALLOCATE (row_dist, row_blk_size)
149 0 : CALL timestop(handle)
150 :
151 0 : END SUBROUTINE create_row_vec_from_matrix
152 :
153 : ! **************************************************************************************************
154 : !> \brief creates a col vector like object whose blocks can be replicated
155 : !> along the processor row and has the same row dist as the template matrix
156 : !> the returned matrix is fully allocated and all blocks are set to 0
157 : !> this is not a sparse object (and must never be)
158 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
159 : !> \param matrix a dbcsr matrix used as template
160 : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
161 : ! **************************************************************************************************
162 129213 : SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
163 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
164 : INTEGER :: ncol
165 :
166 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
167 :
168 : INTEGER :: handle, i, npcols
169 129213 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
170 129213 : row_dist
171 : TYPE(dbcsr_distribution_type) :: dist, dist_col_vec
172 :
173 129213 : CALL timeset(routineN, handle)
174 :
175 129213 : CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
176 129213 : CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
177 :
178 646065 : ALLOCATE (col_dist(npcols), col_blk_size(npcols))
179 258426 : col_blk_size(:) = ncol
180 258426 : DO i = 0, npcols - 1
181 258426 : col_dist(i + 1) = i
182 : END DO
183 129213 : CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
184 :
185 : CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
186 129213 : row_blk_size=row_blk_size, col_blk_size=col_blk_size)
187 129213 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
188 :
189 129213 : CALL dbcsr_distribution_release(dist_col_vec)
190 129213 : DEALLOCATE (col_dist, col_blk_size)
191 129213 : CALL timestop(handle)
192 :
193 258426 : END SUBROUTINE create_replicated_col_vec_from_matrix
194 :
195 : ! **************************************************************************************************
196 : !> \brief creates a row vector like object whose blocks can be replicated
197 : !> along the processor col and has the same col dist as the template matrix
198 : !> the returned matrix is fully allocated and all blocks are set to 0
199 : !> this is not a sparse object (and must never be)
200 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
201 : !> \param matrix a dbcsr matrix used as template
202 : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
203 : ! **************************************************************************************************
204 129213 : SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
205 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
206 : INTEGER :: nrow
207 :
208 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
209 :
210 : INTEGER :: handle, i, nprows
211 129213 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
212 129213 : row_dist
213 : TYPE(dbcsr_distribution_type) :: dist, dist_row_vec
214 :
215 129213 : CALL timeset(routineN, handle)
216 :
217 129213 : CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size)
218 129213 : CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
219 :
220 646065 : ALLOCATE (row_dist(nprows), row_blk_size(nprows))
221 385284 : row_blk_size(:) = nrow
222 385284 : DO i = 0, nprows - 1
223 385284 : row_dist(i + 1) = i
224 : END DO
225 129213 : CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
226 :
227 : CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
228 129213 : row_blk_size=row_blk_size, col_blk_size=col_blk_size)
229 129213 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
230 :
231 129213 : CALL dbcsr_distribution_release(dist_row_vec)
232 129213 : DEALLOCATE (row_dist, row_blk_Size)
233 129213 : CALL timestop(handle)
234 :
235 258426 : END SUBROUTINE create_replicated_row_vec_from_matrix
236 :
237 : ! **************************************************************************************************
238 : !> \brief given a column vector, prepare the fast_vec_access container
239 : !> \param vec ...
240 : !> \param fast_vec_access ...
241 : ! **************************************************************************************************
242 1040703 : SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
243 : TYPE(dbcsr_type) :: vec
244 : TYPE(fast_vec_access_type) :: fast_vec_access
245 :
246 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
247 :
248 : INTEGER :: col, handle, iblock, nblk_local, &
249 : nthreads, row
250 1040703 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
251 : TYPE(dbcsr_iterator_type) :: iter
252 :
253 1040703 : CALL timeset(routineN, handle)
254 :
255 : ! figure out the number of threads
256 1040703 : nthreads = 1
257 1040703 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
258 : !$OMP MASTER
259 : !$ nthreads = OMP_GET_NUM_THREADS()
260 : !$OMP END MASTER
261 : !$OMP END PARALLEL
262 :
263 1040703 : CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
264 : ! 4 times makes sure the table is big enough to limit collisions.
265 1040703 : CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
266 : ! include zero for effective dealing with values not in the hash table (will return 0)
267 6837125 : ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
268 :
269 1040703 : CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
270 1040703 : IF (col .GT. 1) CPABORT("BUG")
271 :
272 : ! go through the blocks of the vector
273 1040703 : iblock = 0
274 1040703 : CALL dbcsr_iterator_start(iter, vec)
275 3715016 : DO WHILE (dbcsr_iterator_blocks_left(iter))
276 2674313 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
277 2674313 : iblock = iblock + 1
278 2674313 : CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
279 2674313 : fast_vec_access%blk_map(iblock)%ptr => vec_bl
280 2674313 : fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
281 : END DO
282 1040703 : CALL dbcsr_iterator_stop(iter)
283 1040703 : CALL timestop(handle)
284 :
285 3122109 : END SUBROUTINE create_fast_col_vec_access
286 :
287 : ! **************************************************************************************************
288 : !> \brief given a row vector, prepare the fast_vec_access_container
289 : !> \param vec ...
290 : !> \param fast_vec_access ...
291 : ! **************************************************************************************************
292 1040703 : SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
293 : TYPE(dbcsr_type) :: vec
294 : TYPE(fast_vec_access_type) :: fast_vec_access
295 :
296 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
297 :
298 : INTEGER :: col, handle, iblock, nblk_local, &
299 : nthreads, row
300 1040703 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
301 : TYPE(dbcsr_iterator_type) :: iter
302 :
303 1040703 : CALL timeset(routineN, handle)
304 :
305 : ! figure out the number of threads
306 1040703 : nthreads = 1
307 1040703 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
308 : !$OMP MASTER
309 : !$ nthreads = OMP_GET_NUM_THREADS()
310 : !$OMP END MASTER
311 : !$OMP END PARALLEL
312 :
313 1040703 : CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
314 : ! 4 times makes sure the table is big enough to limit collisions.
315 1040703 : CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
316 : ! include zero for effective dealing with values not in the hash table (will return 0)
317 9486383 : ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
318 :
319 : ! sanity check
320 1040703 : CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
321 1040703 : IF (row .GT. 1) CPABORT("BUG")
322 :
323 : ! go through the blocks of the vector
324 1040703 : iblock = 0
325 1040703 : CALL dbcsr_iterator_start(iter, vec)
326 6364274 : DO WHILE (dbcsr_iterator_blocks_left(iter))
327 5323571 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
328 5323571 : iblock = iblock + 1
329 5323571 : CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
330 5323571 : fast_vec_access%blk_map(iblock)%ptr => vec_bl
331 5323571 : fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
332 : END DO
333 1040703 : CALL dbcsr_iterator_stop(iter)
334 :
335 1040703 : CALL timestop(handle)
336 :
337 3122109 : END SUBROUTINE create_fast_row_vec_access
338 :
339 : ! **************************************************************************************************
340 : !> \brief release all memory associated with the fast_vec_access type
341 : !> \param fast_vec_access ...
342 : ! **************************************************************************************************
343 2081406 : SUBROUTINE release_fast_vec_access(fast_vec_access)
344 : TYPE(fast_vec_access_type) :: fast_vec_access
345 :
346 : CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
347 :
348 : INTEGER :: handle
349 :
350 2081406 : CALL timeset(routineN, handle)
351 :
352 2081406 : CALL hash_table_release(fast_vec_access%hash_table)
353 2081406 : DEALLOCATE (fast_vec_access%blk_map)
354 :
355 2081406 : CALL timestop(handle)
356 :
357 2081406 : END SUBROUTINE release_fast_vec_access
358 :
359 : ! --------------------------------------------------------------------------------------------------
360 : ! Beginning of hashtable.
361 : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
362 : ! part of the module to guarantee inlining
363 : ! hashes (c,p) pairs, where p is assumed to be >0
364 : ! on return (0 is used as a flag for not present)
365 : !
366 : !
367 : ! **************************************************************************************************
368 : !> \brief finds a prime equal or larger than i, needed at creation
369 : !> \param i ...
370 : !> \return ...
371 : ! **************************************************************************************************
372 2081406 : FUNCTION matching_prime(i) RESULT(res)
373 : INTEGER, INTENT(IN) :: i
374 : INTEGER :: res
375 :
376 : INTEGER :: j
377 :
378 2081406 : res = i
379 2081406 : j = 0
380 6416202 : DO WHILE (j < res)
381 54438519 : DO j = 2, res - 1
382 52357113 : IF (MOD(res, j) == 0) THEN
383 2253390 : res = res + 1
384 2253390 : EXIT
385 : END IF
386 : END DO
387 : END DO
388 2081406 : END FUNCTION
389 :
390 : ! **************************************************************************************************
391 : !> \brief create a hash_table of given initial size.
392 : !> the hash table will expand as needed (but this requires rehashing)
393 : !> \param hash_table ...
394 : !> \param table_size ...
395 : ! **************************************************************************************************
396 2081406 : SUBROUTINE hash_table_create(hash_table, table_size)
397 : TYPE(hash_table_type) :: hash_table
398 : INTEGER, INTENT(IN) :: table_size
399 :
400 : INTEGER :: j
401 :
402 : ! guarantee a minimal hash table size (8), so that expansion works
403 :
404 2081406 : j = 3
405 4057574 : DO WHILE (2**j - 1 < table_size)
406 1976168 : j = j + 1
407 : END DO
408 2081406 : hash_table%nmax = 2**j - 1
409 2081406 : hash_table%prime = matching_prime(hash_table%nmax)
410 2081406 : hash_table%nele = 0
411 56759394 : ALLOCATE (hash_table%table(0:hash_table%nmax))
412 2081406 : END SUBROUTINE hash_table_create
413 :
414 : ! **************************************************************************************************
415 : !> \brief ...
416 : !> \param hash_table ...
417 : ! **************************************************************************************************
418 2081406 : SUBROUTINE hash_table_release(hash_table)
419 : TYPE(hash_table_type) :: hash_table
420 :
421 2081406 : hash_table%nmax = 0
422 2081406 : hash_table%nele = 0
423 2081406 : DEALLOCATE (hash_table%table)
424 :
425 2081406 : END SUBROUTINE hash_table_release
426 :
427 : ! **************************************************************************************************
428 : !> \brief add a pair (c,p) to the hash table
429 : !> \param hash_table ...
430 : !> \param c this value is being hashed
431 : !> \param p this is being stored
432 : ! **************************************************************************************************
433 7997884 : RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
434 : TYPE(hash_table_type), INTENT(INOUT) :: hash_table
435 : INTEGER, INTENT(IN) :: c, p
436 :
437 : REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
438 : inv_hash_table_fill = 2.5_real_8
439 :
440 : INTEGER :: i, j
441 7997884 : TYPE(ele_type), ALLOCATABLE, DIMENSION(:) :: tmp_hash
442 :
443 : ! if too small, make a copy and rehash in a larger table
444 :
445 7997884 : IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
446 0 : ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
447 0 : tmp_hash(:) = hash_table%table
448 0 : CALL hash_table_release(hash_table)
449 0 : CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
450 0 : DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
451 0 : IF (tmp_hash(i)%c .NE. 0) THEN
452 0 : CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
453 : END IF
454 : END DO
455 0 : DEALLOCATE (tmp_hash)
456 : END IF
457 :
458 7997884 : hash_table%nele = hash_table%nele + 1
459 7997884 : i = IAND(c*hash_table%prime, hash_table%nmax)
460 :
461 7997884 : DO j = i, hash_table%nmax
462 7997884 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
463 7997884 : hash_table%table(j)%c = c
464 7997884 : hash_table%table(j)%p = p
465 7997884 : RETURN
466 : END IF
467 : END DO
468 0 : DO j = 0, i - 1
469 0 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
470 0 : hash_table%table(j)%c = c
471 0 : hash_table%table(j)%p = p
472 0 : RETURN
473 : END IF
474 : END DO
475 :
476 : END SUBROUTINE hash_table_add
477 :
478 : ! **************************************************************************************************
479 : !> \brief ...
480 : !> \param hash_table ...
481 : !> \param c ...
482 : !> \return ...
483 : ! **************************************************************************************************
484 120165378 : PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
485 : TYPE(hash_table_type), INTENT(IN) :: hash_table
486 : INTEGER, INTENT(IN) :: c
487 : INTEGER :: p
488 :
489 : INTEGER :: i, j
490 :
491 120165378 : i = IAND(c*hash_table%prime, hash_table%nmax)
492 :
493 : ! catch the likely case first
494 120165378 : IF (hash_table%table(i)%c == c) THEN
495 115869330 : p = hash_table%table(i)%p
496 115869330 : RETURN
497 : END IF
498 :
499 4296048 : DO j = i, hash_table%nmax
500 4296048 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
501 4296048 : p = hash_table%table(j)%p
502 4296048 : RETURN
503 : END IF
504 : END DO
505 0 : DO j = 0, i - 1
506 0 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
507 0 : p = hash_table%table(j)%p
508 0 : RETURN
509 : END IF
510 : END DO
511 :
512 : ! we should never reach this point.
513 120165378 : p = HUGE(p)
514 :
515 : END FUNCTION hash_table_get
516 :
517 : ! End of hashtable
518 : ! --------------------------------------------------------------------------------------------------
519 :
520 : ! **************************************************************************************************
521 : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
522 : !> \param matrix ...
523 : !> \param vec_in ...
524 : !> \param vec_out ...
525 : !> \param alpha ...
526 : !> \param beta ...
527 : !> \param work_row ...
528 : !> \param work_col ...
529 : ! **************************************************************************************************
530 770182 : SUBROUTINE dbcsr_matrix_colvec_multiply(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
531 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
532 : REAL(kind=real_8) :: alpha, beta
533 : TYPE(dbcsr_type) :: work_row, work_col
534 :
535 : CHARACTER :: matrix_type
536 :
537 770182 : CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
538 :
539 499661 : SELECT CASE (matrix_type)
540 : CASE (dbcsr_type_no_symmetry)
541 499661 : CALL dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
542 : CASE (dbcsr_type_symmetric)
543 270521 : CALL dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
544 : CASE (dbcsr_type_antisymmetric)
545 : ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
546 0 : CPABORT("NYI, antisymmetric matrix not permitted")
547 : CASE DEFAULT
548 770182 : CPABORT("Unknown matrix type, ...")
549 : END SELECT
550 :
551 770182 : END SUBROUTINE dbcsr_matrix_colvec_multiply
552 :
553 : ! **************************************************************************************************
554 : !> \brief low level routines for matrix vector multiplies
555 : !> \param matrix ...
556 : !> \param vec_in ...
557 : !> \param vec_out ...
558 : !> \param alpha ...
559 : !> \param beta ...
560 : !> \param work_row ...
561 : !> \param work_col ...
562 : ! **************************************************************************************************
563 499661 : SUBROUTINE dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
564 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
565 : REAL(kind=real_8) :: alpha, beta
566 : TYPE(dbcsr_type) :: work_row, work_col
567 :
568 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
569 :
570 : INTEGER :: col, handle, handle1, ithread, mypcol, &
571 : myprow, ncols, nrows, pcol, prow, &
572 : prow_handle, row
573 499661 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
574 499661 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
575 : TYPE(dbcsr_distribution_type) :: dist
576 : TYPE(dbcsr_iterator_type) :: iter
577 499661 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row
578 : TYPE(mp_comm_type) :: prow_group
579 :
580 499661 : CALL timeset(routineN, handle)
581 499661 : ithread = 0
582 :
583 : ! Collect some data about the parallel environment. We will use them later to move the vector around
584 499661 : CALL dbcsr_get_info(matrix, distribution=dist)
585 499661 : CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
586 :
587 499661 : CALL prow_group%set_handle(prow_handle)
588 :
589 499661 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
590 499661 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
591 :
592 : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
593 499661 : CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
594 :
595 : ! Set the work vector for the results to 0
596 499661 : CALL dbcsr_set(work_col, 0.0_real_8)
597 :
598 : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
599 : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
600 499661 : CALL timeset(routineN//"_local_mm", handle1)
601 :
602 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
603 499661 : !$OMP SHARED(matrix,fast_vec_col,fast_vec_row)
604 : !$ ithread = omp_get_thread_num()
605 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
606 : DO WHILE (dbcsr_iterator_blocks_left(iter))
607 : CALL dbcsr_iterator_next_block(iter, row, col, data_d)
608 : prow = hash_table_get(fast_vec_col%hash_table, row)
609 : IF (fast_vec_col%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
610 : pcol = hash_table_get(fast_vec_row%hash_table, col)
611 : IF (SIZE(fast_vec_col%blk_map(prow)%ptr, 1) .EQ. 0 .OR. &
612 : SIZE(fast_vec_col%blk_map(prow)%ptr, 2) .EQ. 0 .OR. &
613 : SIZE(data_d, 2) .EQ. 0) CYCLE
614 : CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
615 : SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
616 : SIZE(data_d, 2), &
617 : 1.0_dp, &
618 : data_d, &
619 : SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
620 : fast_vec_row%blk_map(pcol)%ptr, &
621 : SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
622 : 1.0_dp, &
623 : fast_vec_col%blk_map(prow)%ptr, &
624 : SIZE(fast_vec_col%blk_map(prow)%ptr, 1))
625 : END DO
626 : CALL dbcsr_iterator_stop(iter)
627 : !$OMP END PARALLEL
628 :
629 499661 : CALL timestop(handle1)
630 :
631 : ! sum all the data onto the first processor col where the original vector is stored
632 499661 : data_vec => dbcsr_get_data_p(work_col)
633 499661 : CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
634 10571431 : CALL prow_group%sum(data_vec(1:nrows*ncols))
635 :
636 : ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
637 : ! from the replicated to the original vector. Let's play it safe and use the iterator
638 499661 : CALL dbcsr_iterator_start(iter, vec_out)
639 1069624 : DO WHILE (dbcsr_iterator_blocks_left(iter))
640 569963 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
641 569963 : prow = hash_table_get(fast_vec_col%hash_table, row)
642 1069624 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
643 6175811 : vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
644 : ELSE
645 0 : vec_res(:, :) = beta*vec_res(:, :)
646 : END IF
647 : END DO
648 499661 : CALL dbcsr_iterator_stop(iter)
649 :
650 499661 : CALL release_fast_vec_access(fast_vec_row)
651 499661 : CALL release_fast_vec_access(fast_vec_col)
652 :
653 499661 : CALL timestop(handle)
654 :
655 999322 : END SUBROUTINE dbcsr_matrix_vector_mult
656 :
657 : ! **************************************************************************************************
658 : !> \brief ...
659 : !> \param matrix ...
660 : !> \param vec_in ...
661 : !> \param vec_out ...
662 : !> \param alpha ...
663 : !> \param beta ...
664 : !> \param work_row ...
665 : !> \param work_col ...
666 : !> \param skip_diag ...
667 : ! **************************************************************************************************
668 0 : SUBROUTINE dbcsr_matrixT_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
669 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
670 : REAL(kind=real_8) :: alpha, beta
671 : TYPE(dbcsr_type) :: work_row, work_col
672 : LOGICAL :: skip_diag
673 :
674 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
675 :
676 : INTEGER :: col, col_size, handle, handle1, ithread, &
677 : mypcol, myprow, ncols, nrows, pcol, &
678 : pcol_handle, prow, prow_handle, row, &
679 : row_size
680 0 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
681 0 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
682 : TYPE(dbcsr_distribution_type) :: dist
683 : TYPE(dbcsr_iterator_type) :: iter
684 0 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row
685 : TYPE(mp_comm_type) :: pcol_group, prow_group
686 :
687 0 : CALL timeset(routineN, handle)
688 0 : ithread = 0
689 :
690 : ! Collect some data about the parallel environment. We will use them later to move the vector around
691 0 : CALL dbcsr_get_info(matrix, distribution=dist)
692 0 : CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
693 :
694 0 : CALL prow_group%set_handle(prow_handle)
695 0 : CALL pcol_group%set_handle(pcol_handle)
696 :
697 0 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
698 0 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
699 :
700 : ! Set the work vector for the results to 0
701 0 : CALL dbcsr_set(work_row, 0.0_real_8)
702 :
703 : ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
704 0 : CALL dbcsr_iterator_start(iter, vec_in)
705 0 : DO WHILE (dbcsr_iterator_blocks_left(iter))
706 0 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
707 0 : prow = hash_table_get(fast_vec_col%hash_table, row)
708 0 : fast_vec_col%blk_map(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
709 : END DO
710 0 : CALL dbcsr_iterator_stop(iter)
711 : ! Replicate the data on all processore in the row
712 0 : data_vec => dbcsr_get_data_p(work_col)
713 0 : CALL prow_group%bcast(data_vec, 0)
714 :
715 : ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
716 0 : CALL timeset(routineN//"local_mm", handle1)
717 0 : CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
718 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
719 0 : !$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
720 : !$ ithread = omp_get_thread_num()
721 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
722 : DO WHILE (dbcsr_iterator_blocks_left(iter))
723 : CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
724 : IF (skip_diag .AND. col == row) CYCLE
725 : prow = hash_table_get(fast_vec_col%hash_table, row)
726 : pcol = hash_table_get(fast_vec_row%hash_table, col)
727 : IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
728 : ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
729 : IF (fast_vec_row%blk_map(pcol)%assigned_thread .NE. ithread) CYCLE
730 : fast_vec_row%blk_map(pcol)%ptr = fast_vec_row%blk_map(pcol)%ptr + &
731 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
732 : ELSE
733 : prow = hash_table_get(fast_vec_row%hash_table, row)
734 : pcol = hash_table_get(fast_vec_col%hash_table, col)
735 : IF (fast_vec_row%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
736 : fast_vec_row%blk_map(prow)%ptr = fast_vec_row%blk_map(prow)%ptr + &
737 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(pcol)%ptr), TRANSPOSE(data_d))
738 : END IF
739 : END DO
740 : CALL dbcsr_iterator_stop(iter)
741 : !$OMP END PARALLEL
742 :
743 0 : CALL timestop(handle1)
744 :
745 : ! sum all the data within a processor column to obtain the replicated result
746 0 : data_vec => dbcsr_get_data_p(work_row)
747 : ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
748 0 : CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
749 0 : CALL pcol_group%sum(data_vec(1:nrows*ncols))
750 :
751 : ! Convert the result to a column wise distribution
752 0 : CALL dbcsr_rep_row_to_rep_col_vec(work_col, work_row, fast_vec_row)
753 :
754 : ! Create_the final vector by summing it to the result vector which lives on proc_col 0
755 0 : CALL dbcsr_iterator_start(iter, vec_out)
756 0 : DO WHILE (dbcsr_iterator_blocks_left(iter))
757 0 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
758 0 : prow = hash_table_get(fast_vec_col%hash_table, row)
759 0 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
760 0 : vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
761 : ELSE
762 0 : vec_res(:, :) = beta*vec_res(:, :)
763 : END IF
764 : END DO
765 0 : CALL dbcsr_iterator_stop(iter)
766 :
767 0 : CALL timestop(handle)
768 :
769 0 : END SUBROUTINE dbcsr_matrixT_vector_mult
770 :
771 : ! **************************************************************************************************
772 : !> \brief ...
773 : !> \param vec_in ...
774 : !> \param rep_col_vec ...
775 : !> \param rep_row_vec ...
776 : !> \param fast_vec_col ...
777 : ! **************************************************************************************************
778 6161456 : SUBROUTINE dbcsr_col_vec_to_rep_row(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
779 : TYPE(dbcsr_type) :: vec_in, rep_col_vec, rep_row_vec
780 : TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col
781 :
782 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
783 :
784 : INTEGER :: col, handle, mypcol, myprow, ncols, &
785 : nrows, pcol_handle, prow_handle, row
786 770182 : INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
787 770182 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec, data_vec_rep
788 770182 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_row
789 : TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
790 : TYPE(dbcsr_iterator_type) :: iter
791 : TYPE(mp_comm_type) :: pcol_group, prow_group
792 :
793 770182 : CALL timeset(routineN, handle)
794 :
795 : ! get information about the parallel environment
796 770182 : CALL dbcsr_get_info(vec_in, distribution=dist_in)
797 : CALL dbcsr_distribution_get(dist_in, &
798 : prow_group=prow_handle, &
799 : pcol_group=pcol_handle, &
800 : myprow=myprow, &
801 770182 : mypcol=mypcol)
802 :
803 770182 : CALL prow_group%set_handle(prow_handle)
804 770182 : CALL pcol_group%set_handle(pcol_handle)
805 :
806 : ! Get the vector which tells us which blocks are local to which processor row in the col vec
807 770182 : CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
808 770182 : CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
809 :
810 : ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
811 770182 : CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
812 770182 : data_vec_rep => dbcsr_get_data_p(rep_col_vec)
813 770182 : data_vec => dbcsr_get_data_p(vec_in)
814 23637774 : IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
815 : ! Replicate the data along the row
816 23637774 : CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
817 :
818 : ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
819 : ! The rep_col_vec contains all blocks local to the row distribution of the vector.
820 : ! The rep_row_vec only needs the fraction which is local to the col distribution.
821 : ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
822 : ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
823 : ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
824 : ! Hope this clarifies the idea
825 770182 : CALL dbcsr_set(rep_row_vec, 0.0_real_8)
826 770182 : CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
827 770182 : CALL dbcsr_iterator_start(iter, rep_row_vec)
828 4000258 : DO WHILE (dbcsr_iterator_blocks_left(iter))
829 3230076 : CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
830 4000258 : IF (row_dist(col) == myprow) THEN
831 24489730 : vec_row = TRANSPOSE(fast_vec_col%blk_map(hash_table_get(fast_vec_col%hash_table, col))%ptr)
832 : END IF
833 : END DO
834 770182 : CALL dbcsr_iterator_stop(iter)
835 770182 : CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
836 770182 : data_vec_rep => dbcsr_get_data_p(rep_row_vec)
837 46368282 : CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
838 :
839 770182 : CALL timestop(handle)
840 :
841 770182 : END SUBROUTINE dbcsr_col_vec_to_rep_row
842 :
843 : ! **************************************************************************************************
844 : !> \brief ...
845 : !> \param rep_col_vec ...
846 : !> \param rep_row_vec ...
847 : !> \param fast_vec_row ...
848 : !> \param fast_vec_col_add ...
849 : ! **************************************************************************************************
850 1623126 : SUBROUTINE dbcsr_rep_row_to_rep_col_vec(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
851 : TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
852 : TYPE(fast_vec_access_type) :: fast_vec_row
853 : TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add
854 :
855 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
856 :
857 : INTEGER :: col, handle, mypcol, myprow, ncols, &
858 : nrows, prow_handle, row
859 270521 : INTEGER, DIMENSION(:), POINTER :: col_dist
860 270521 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec_rep
861 270521 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_col
862 : TYPE(dbcsr_distribution_type) :: dist_rep_col, dist_rep_row
863 : TYPE(dbcsr_iterator_type) :: iter
864 : TYPE(mp_comm_type) :: prow_group
865 :
866 270521 : CALL timeset(routineN, handle)
867 :
868 : ! get information about the parallel environment
869 270521 : CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
870 : CALL dbcsr_distribution_get(dist_rep_col, &
871 : prow_group=prow_handle, &
872 : myprow=myprow, &
873 270521 : mypcol=mypcol)
874 :
875 270521 : CALL prow_group%set_handle(prow_handle)
876 :
877 : ! Get the vector which tells us which blocks are local to which processor col in the row vec
878 270521 : CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
879 270521 : CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
880 :
881 : ! The same trick as described above with opposite direction
882 270521 : CALL dbcsr_set(rep_col_vec, 0.0_real_8)
883 270521 : CALL dbcsr_iterator_start(iter, rep_col_vec)
884 1322696 : DO WHILE (dbcsr_iterator_blocks_left(iter))
885 1052175 : CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
886 1052175 : IF (col_dist(row) == mypcol) THEN
887 8502261 : vec_col = TRANSPOSE(fast_vec_row%blk_map(hash_table_get(fast_vec_row%hash_table, row))%ptr)
888 : END IF
889 : ! this one is special and allows to add the elements of a not yet summed replicated
890 : ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
891 1322696 : IF (PRESENT(fast_vec_col_add)) THEN
892 8502261 : vec_col = vec_col + fast_vec_col_add%blk_map(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
893 : END IF
894 : END DO
895 270521 : CALL dbcsr_iterator_stop(iter)
896 270521 : CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
897 270521 : data_vec_rep => dbcsr_get_data_p(rep_col_vec)
898 13066343 : CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
899 :
900 270521 : CALL timestop(handle)
901 :
902 270521 : END SUBROUTINE dbcsr_rep_row_to_rep_col_vec
903 :
904 : ! **************************************************************************************************
905 : !> \brief ...
906 : !> \param matrix ...
907 : !> \param vec_in ...
908 : !> \param vec_out ...
909 : !> \param alpha ...
910 : !> \param beta ...
911 : !> \param work_row ...
912 : !> \param work_col ...
913 : ! **************************************************************************************************
914 270521 : SUBROUTINE dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
915 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
916 : REAL(kind=real_8) :: alpha, beta
917 : TYPE(dbcsr_type) :: work_row, work_col
918 :
919 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_matrix_vector_mult'
920 :
921 : INTEGER :: col, handle, handle1, ithread, mypcol, &
922 : myprow, ncols, nrows, pcol, &
923 : pcol_handle, prow, row, rpcol, rprow, &
924 : vec_dim
925 270521 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
926 270521 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
927 : TYPE(dbcsr_distribution_type) :: dist
928 : TYPE(dbcsr_iterator_type) :: iter
929 : TYPE(dbcsr_type) :: result_col, result_row
930 270521 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row, &
931 270521 : res_fast_vec_col, res_fast_vec_row
932 : TYPE(mp_comm_type) :: pcol_group
933 :
934 270521 : CALL timeset(routineN, handle)
935 270521 : ithread = 0
936 : ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
937 270521 : CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
938 : ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
939 270521 : CALL dbcsr_set(work_col, 0.0_real_8)
940 270521 : CALL dbcsr_copy(result_col, work_col)
941 270521 : CALL dbcsr_set(work_row, 0.0_real_8)
942 270521 : CALL dbcsr_copy(result_row, work_row)
943 :
944 : ! Collect some data about the parallel environment. We will use them later to move the vector around
945 270521 : CALL dbcsr_get_info(matrix=matrix, distribution=dist)
946 270521 : CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
947 :
948 270521 : CALL pcol_group%set_handle(pcol_handle)
949 :
950 270521 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
951 270521 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
952 270521 : CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
953 270521 : CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
954 :
955 : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
956 270521 : CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
957 :
958 : ! Probably I should rename the routine above as it delivers both the replicated row and column vector
959 :
960 : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
961 : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
962 270521 : CALL timeset(routineN//"_local_mm", handle1)
963 :
964 : !------ perform the multiplication, we have to take car to take the correct blocks ----------
965 :
966 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
967 270521 : !$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
968 : !$ ithread = omp_get_thread_num()
969 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
970 : DO WHILE (dbcsr_iterator_blocks_left(iter))
971 : CALL dbcsr_iterator_next_block(iter, row, col, data_d)
972 : pcol = hash_table_get(fast_vec_row%hash_table, col)
973 : rprow = hash_table_get(res_fast_vec_col%hash_table, row)
974 : IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
975 : ASSOCIATED(res_fast_vec_col%blk_map(rprow)%ptr)) THEN
976 : IF (res_fast_vec_col%blk_map(rprow)%assigned_thread .EQ. ithread) THEN
977 : res_fast_vec_col%blk_map(rprow)%ptr = res_fast_vec_col%blk_map(rprow)%ptr + &
978 : MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map(pcol)%ptr))
979 : END IF
980 : prow = hash_table_get(fast_vec_col%hash_table, row)
981 : rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
982 : IF (res_fast_vec_row%blk_map(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
983 : res_fast_vec_row%blk_map(rpcol)%ptr = res_fast_vec_row%blk_map(rpcol)%ptr + &
984 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
985 : END IF
986 : ELSE
987 : rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
988 : prow = hash_table_get(fast_vec_row%hash_table, row)
989 : IF (res_fast_vec_col%blk_map(rpcol)%assigned_thread .EQ. ithread) THEN
990 : res_fast_vec_col%blk_map(rpcol)%ptr = res_fast_vec_col%blk_map(rpcol)%ptr + &
991 : TRANSPOSE(MATMUL(fast_vec_row%blk_map(prow)%ptr, data_d))
992 : END IF
993 : rprow = hash_table_get(res_fast_vec_row%hash_table, row)
994 : pcol = hash_table_get(fast_vec_col%hash_table, col)
995 : IF (res_fast_vec_row%blk_map(rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
996 : res_fast_vec_row%blk_map(rprow)%ptr = res_fast_vec_row%blk_map(rprow)%ptr + &
997 : TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map(pcol)%ptr))
998 : END IF
999 : END IF
1000 : END DO
1001 : CALL dbcsr_iterator_stop(iter)
1002 : !$OMP END PARALLEL
1003 :
1004 270521 : CALL timestop(handle1)
1005 :
1006 : ! sum all the data within a processor column to obtain the replicated result from lower
1007 270521 : data_vec => dbcsr_get_data_p(result_row)
1008 270521 : CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
1009 :
1010 25752247 : CALL pcol_group%sum(data_vec(1:nrows*ncols))
1011 :
1012 : ! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
1013 : ! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
1014 : ! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
1015 : ! parallel bits are locally added. The sum magically creates the correct vector
1016 270521 : CALL dbcsr_rep_row_to_rep_col_vec(work_col, result_row, res_fast_vec_row, res_fast_vec_col)
1017 :
1018 : ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
1019 270521 : CALL dbcsr_iterator_start(iter, vec_out)
1020 1322696 : DO WHILE (dbcsr_iterator_blocks_left(iter))
1021 1052175 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
1022 1052175 : prow = hash_table_get(fast_vec_col%hash_table, row)
1023 1322696 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
1024 8502261 : vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map(prow)%ptr(:, :))
1025 : ELSE
1026 0 : vec_res(:, :) = beta*vec_res(:, :)
1027 : END IF
1028 : END DO
1029 270521 : CALL dbcsr_iterator_stop(iter)
1030 :
1031 270521 : CALL release_fast_vec_access(fast_vec_row)
1032 270521 : CALL release_fast_vec_access(fast_vec_col)
1033 270521 : CALL release_fast_vec_access(res_fast_vec_row)
1034 270521 : CALL release_fast_vec_access(res_fast_vec_col)
1035 :
1036 270521 : CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
1037 :
1038 270521 : CALL timestop(handle)
1039 :
1040 541042 : END SUBROUTINE dbcsr_sym_matrix_vector_mult
1041 :
1042 0 : END MODULE arnoldi_vector
|