Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
10 : !> \par History
11 : !> 2014.10 created [Florian Schiffmann]
12 : !> \author Florian Schiffmann
13 : ! **************************************************************************************************
14 :
15 : MODULE dbcsr_vector
16 : USE cp_dbcsr_api, ONLY: dbcsr_copy, &
17 : dbcsr_create, &
18 : dbcsr_distribution_get, &
19 : dbcsr_distribution_new, &
20 : dbcsr_distribution_release, &
21 : dbcsr_distribution_type, &
22 : dbcsr_get_info, &
23 : dbcsr_iterator_blocks_left, &
24 : dbcsr_iterator_next_block, &
25 : dbcsr_iterator_start, &
26 : dbcsr_iterator_stop, &
27 : dbcsr_iterator_type, &
28 : dbcsr_release, &
29 : dbcsr_reserve_all_blocks, &
30 : dbcsr_set, dbcsr_get_data_p, &
31 : dbcsr_type, &
32 : dbcsr_type_antisymmetric, &
33 : dbcsr_type_complex_8, &
34 : dbcsr_type_complex_8, &
35 : dbcsr_type_no_symmetry, &
36 : dbcsr_type_real_8, &
37 : dbcsr_type_real_8, &
38 : dbcsr_type_symmetric
39 : USE kinds, ONLY: dp, &
40 : real_8
41 : USE message_passing, ONLY: mp_comm_type
42 :
43 : #include "../base/base_uses.f90"
44 :
45 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
46 :
47 : IMPLICIT NONE
48 :
49 : PRIVATE
50 :
51 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_vector_operations'
52 :
53 : ! **************************************************************************************************
54 : !> \brief Types needed for the hashtable.
55 : ! **************************************************************************************************
56 : TYPE ele_type
57 : INTEGER :: c = 0
58 : INTEGER :: p = 0
59 : END TYPE ele_type
60 :
61 : TYPE hash_table_type
62 : TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
63 : INTEGER :: nele = 0
64 : INTEGER :: nmax = 0
65 : INTEGER :: prime = 0
66 : END TYPE hash_table_type
67 :
68 : ! **************************************************************************************************
69 : !> \brief Types needed for fast access to distributed dbcsr vectors.
70 : ! **************************************************************************************************
71 : TYPE block_ptr_d
72 : REAL(real_8), DIMENSION(:, :), POINTER :: ptr => NULL()
73 : INTEGER :: assigned_thread = -1
74 : END TYPE
75 : TYPE block_ptr_z
76 : COMPLEX(real_8), DIMENSION(:, :), POINTER :: ptr => NULL()
77 : INTEGER :: assigned_thread = -1
78 : END TYPE
79 :
80 : TYPE fast_vec_access_type
81 : TYPE(hash_table_type) :: hash_table = hash_table_type()
82 : TYPE(block_ptr_d), DIMENSION(:), ALLOCATABLE :: blk_map_d
83 : TYPE(block_ptr_z), DIMENSION(:), ALLOCATABLE :: blk_map_z
84 : END TYPE
85 :
86 : PUBLIC :: dbcsr_matrix_colvec_multiply, &
87 : create_col_vec_from_matrix, &
88 : create_row_vec_from_matrix, &
89 : create_replicated_col_vec_from_matrix, &
90 : create_replicated_row_vec_from_matrix
91 :
92 : INTERFACE dbcsr_matrix_colvec_multiply
93 : MODULE PROCEDURE dbcsr_matrix_colvec_multiply_d
94 : MODULE PROCEDURE dbcsr_matrix_colvec_multiply_z
95 : END INTERFACE
96 :
97 : CONTAINS
98 :
99 : ! **************************************************************************************************
100 : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
101 : !> and has the same row dist as the template matrix
102 : !> the returned matrix is fully allocated and all blocks are set to 0
103 : !> this is not a sparse object (and must never be)
104 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
105 : !> \param matrix a dbcsr matrix used as template
106 : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
107 : ! **************************************************************************************************
108 262388 : SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
109 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
110 : INTEGER :: ncol
111 :
112 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
113 :
114 : INTEGER :: handle, npcols, data_type
115 262388 : INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
116 : TYPE(dbcsr_distribution_type) :: dist_col_vec, dist
117 :
118 262388 : CALL timeset(routineN, handle)
119 :
120 262388 : CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
121 262388 : CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
122 :
123 262388 : ALLOCATE (col_dist(1), col_blk_size(1))
124 262388 : col_dist(1) = 0
125 262388 : col_blk_size(1) = ncol
126 262388 : CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
127 :
128 : CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
129 : matrix_type=dbcsr_type_no_symmetry, &
130 : row_blk_size=row_blk_size, &
131 : col_blk_size=col_blk_size, &
132 262388 : nze=0, data_type=data_type)
133 262388 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
134 :
135 262388 : CALL dbcsr_distribution_release(dist_col_vec)
136 262388 : DEALLOCATE (col_dist, col_blk_size)
137 262388 : CALL timestop(handle)
138 :
139 524776 : END SUBROUTINE create_col_vec_from_matrix
140 :
141 : ! **************************************************************************************************
142 : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
143 : !> and has the same row dist as the template matrix
144 : !> the returned matrix is fully allocated and all blocks are set to 0
145 : !> this is not a sparse object (and must never be)
146 : !> \param dbcsr_vec ...
147 : !> \param matrix a dbcsr matrix used as template
148 : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
149 : ! **************************************************************************************************
150 0 : SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
151 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
152 : INTEGER :: nrow
153 :
154 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
155 :
156 : INTEGER :: handle, nprows, data_type
157 0 : INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
158 : TYPE(dbcsr_distribution_type) :: dist_row_vec, dist
159 :
160 0 : CALL timeset(routineN, handle)
161 :
162 0 : CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist)
163 0 : CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
164 :
165 0 : ALLOCATE (row_dist(1), row_blk_size(1))
166 0 : row_dist(1) = 0
167 0 : row_blk_size(1) = nrow
168 0 : CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
169 :
170 : CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, &
171 : matrix_type=dbcsr_type_no_symmetry, &
172 : row_blk_size=row_blk_size, &
173 : col_blk_size=col_blk_size, &
174 0 : nze=0, data_type=data_type)
175 0 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
176 :
177 0 : CALL dbcsr_distribution_release(dist_row_vec)
178 0 : DEALLOCATE (row_dist, row_blk_size)
179 0 : CALL timestop(handle)
180 :
181 0 : END SUBROUTINE create_row_vec_from_matrix
182 :
183 : ! **************************************************************************************************
184 : !> \brief creates a col vector like object whose blocks can be replicated
185 : !> along the processor row and has the same row dist as the template matrix
186 : !> the returned matrix is fully allocated and all blocks are set to 0
187 : !> this is not a sparse object (and must never be)
188 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
189 : !> \param matrix a dbcsr matrix used as template
190 : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
191 : ! **************************************************************************************************
192 128923 : SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
193 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
194 : INTEGER :: ncol
195 :
196 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
197 :
198 : INTEGER :: handle, npcols, data_type, i
199 128923 : INTEGER, DIMENSION(:), POINTER :: row_blk_size, col_blk_size, row_dist, col_dist
200 : TYPE(dbcsr_distribution_type) :: dist_col_vec, dist
201 128923 : CALL timeset(routineN, handle)
202 :
203 128923 : CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
204 128923 : CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
205 :
206 644615 : ALLOCATE (col_dist(npcols), col_blk_size(npcols))
207 257846 : col_blk_size(:) = ncol
208 257846 : DO i = 0, npcols - 1
209 257846 : col_dist(i + 1) = i
210 : END DO
211 128923 : CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
212 :
213 : CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
214 : matrix_type=dbcsr_type_no_symmetry, &
215 : row_blk_size=row_blk_size, &
216 : col_blk_size=col_blk_size, &
217 128923 : nze=0, data_type=data_type)
218 128923 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
219 :
220 128923 : CALL dbcsr_distribution_release(dist_col_vec)
221 128923 : DEALLOCATE (col_dist, col_blk_size)
222 128923 : CALL timestop(handle)
223 :
224 257846 : END SUBROUTINE create_replicated_col_vec_from_matrix
225 :
226 : ! **************************************************************************************************
227 : !> \brief creates a row vector like object whose blocks can be replicated
228 : !> along the processor col and has the same col dist as the template matrix
229 : !> the returned matrix is fully allocated and all blocks are set to 0
230 : !> this is not a sparse object (and must never be)
231 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
232 : !> \param matrix a dbcsr matrix used as template
233 : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
234 : ! **************************************************************************************************
235 128923 : SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
236 : TYPE(dbcsr_type) :: dbcsr_vec
237 : TYPE(dbcsr_type) :: matrix
238 : INTEGER :: nrow
239 :
240 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
241 :
242 : INTEGER :: handle, i, nprows, data_type
243 128923 : INTEGER, DIMENSION(:), POINTER :: row_dist, col_dist, row_blk_size, col_blk_size
244 : TYPE(dbcsr_distribution_type) :: dist_row_vec, dist
245 :
246 128923 : CALL timeset(routineN, handle)
247 :
248 128923 : CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size, data_type=data_type)
249 128923 : CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
250 :
251 644615 : ALLOCATE (row_dist(nprows), row_blk_size(nprows))
252 384414 : row_blk_size(:) = nrow
253 384414 : DO i = 0, nprows - 1
254 384414 : row_dist(i + 1) = i
255 : END DO
256 128923 : CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
257 :
258 : CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
259 : row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
260 128923 : nze=0, data_type=data_type)
261 128923 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
262 :
263 128923 : CALL dbcsr_distribution_release(dist_row_vec)
264 128923 : DEALLOCATE (row_dist, row_blk_Size)
265 128923 : CALL timestop(handle)
266 :
267 257846 : END SUBROUTINE create_replicated_row_vec_from_matrix
268 :
269 : ! **************************************************************************************************
270 : !> \brief given a column vector, prepare the fast_vec_access container
271 : !> \param vec ...
272 : !> \param fast_vec_access ...
273 : ! **************************************************************************************************
274 2099726 : SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
275 : TYPE(dbcsr_type) :: vec
276 : TYPE(fast_vec_access_type) :: fast_vec_access
277 :
278 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
279 :
280 : INTEGER :: handle, data_type
281 :
282 1049863 : CALL timeset(routineN, handle)
283 :
284 1049863 : CALL dbcsr_get_info(vec, data_type=data_type)
285 :
286 1049863 : SELECT CASE (data_type)
287 : CASE (dbcsr_type_real_8)
288 1049863 : CALL create_fast_col_vec_access_d(vec, fast_vec_access)
289 : CASE (dbcsr_type_complex_8)
290 1049863 : CALL create_fast_col_vec_access_z(vec, fast_vec_access)
291 : END SELECT
292 :
293 1049863 : CALL timestop(handle)
294 :
295 1049863 : END SUBROUTINE create_fast_col_vec_access
296 :
297 : ! **************************************************************************************************
298 : !> \brief given a row vector, prepare the fast_vec_access_container
299 : !> \param vec ...
300 : !> \param fast_vec_access ...
301 : ! **************************************************************************************************
302 2099726 : SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
303 : TYPE(dbcsr_type) :: vec
304 : TYPE(fast_vec_access_type) :: fast_vec_access
305 :
306 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
307 :
308 : INTEGER :: handle, data_type
309 :
310 1049863 : CALL timeset(routineN, handle)
311 :
312 1049863 : CALL dbcsr_get_info(vec, data_type=data_type)
313 :
314 1049863 : SELECT CASE (data_type)
315 : CASE (dbcsr_type_real_8)
316 1049863 : CALL create_fast_row_vec_access_d(vec, fast_vec_access)
317 : CASE (dbcsr_type_complex_8)
318 1049863 : CALL create_fast_row_vec_access_z(vec, fast_vec_access)
319 : END SELECT
320 :
321 1049863 : CALL timestop(handle)
322 :
323 1049863 : END SUBROUTINE create_fast_row_vec_access
324 :
325 : ! **************************************************************************************************
326 : !> \brief release all memory associated with the fast_vec_access type
327 : !> \param fast_vec_access ...
328 : ! **************************************************************************************************
329 2099726 : SUBROUTINE release_fast_vec_access(fast_vec_access)
330 : TYPE(fast_vec_access_type) :: fast_vec_access
331 :
332 : CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
333 :
334 : INTEGER :: handle
335 :
336 2099726 : CALL timeset(routineN, handle)
337 :
338 2099726 : CALL hash_table_release(fast_vec_access%hash_table)
339 2099726 : IF (ALLOCATED(fast_vec_access%blk_map_d)) DEALLOCATE (fast_vec_access%blk_map_d)
340 2099726 : IF (ALLOCATED(fast_vec_access%blk_map_z)) DEALLOCATE (fast_vec_access%blk_map_z)
341 :
342 2099726 : CALL timestop(handle)
343 :
344 2099726 : END SUBROUTINE release_fast_vec_access
345 :
346 : ! --------------------------------------------------------------------------------------------------
347 : ! Beginning of hashtable.
348 : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
349 : ! part of the module to guarantee inlining
350 : ! hashes (c,p) pairs, where p is assumed to be >0
351 : ! on return (0 is used as a flag for not present)
352 : !
353 : !
354 : ! **************************************************************************************************
355 : !> \brief finds a prime equal or larger than i, needed at creation
356 : !> \param i ...
357 : !> \return ...
358 : ! **************************************************************************************************
359 2099726 : FUNCTION matching_prime(i) RESULT(res)
360 : INTEGER, INTENT(IN) :: i
361 : INTEGER :: res
362 :
363 : INTEGER :: j
364 :
365 2099726 : res = i
366 2099726 : j = 0
367 6491280 : DO WHILE (j < res)
368 55440606 : DO j = 2, res - 1
369 53340880 : IF (MOD(res, j) == 0) THEN
370 2291828 : res = res + 1
371 2291828 : EXIT
372 : END IF
373 : END DO
374 : END DO
375 2099726 : END FUNCTION
376 :
377 : ! **************************************************************************************************
378 : !> \brief create a hash_table of given initial size.
379 : !> the hash table will expand as needed (but this requires rehashing)
380 : !> \param hash_table ...
381 : !> \param table_size ...
382 : ! **************************************************************************************************
383 2099726 : SUBROUTINE hash_table_create(hash_table, table_size)
384 : TYPE(hash_table_type) :: hash_table
385 : INTEGER, INTENT(IN) :: table_size
386 :
387 : INTEGER :: j
388 :
389 : ! guarantee a minimal hash table size (8), so that expansion works
390 :
391 2099726 : j = 3
392 4118429 : DO WHILE (2**j - 1 < table_size)
393 2018703 : j = j + 1
394 : END DO
395 2099726 : hash_table%nmax = 2**j - 1
396 2099726 : hash_table%prime = matching_prime(hash_table%nmax)
397 2099726 : hash_table%nele = 0
398 57723514 : ALLOCATE (hash_table%table(0:hash_table%nmax))
399 2099726 : END SUBROUTINE hash_table_create
400 :
401 : ! **************************************************************************************************
402 : !> \brief ...
403 : !> \param hash_table ...
404 : ! **************************************************************************************************
405 2099726 : SUBROUTINE hash_table_release(hash_table)
406 : TYPE(hash_table_type) :: hash_table
407 :
408 2099726 : hash_table%nmax = 0
409 2099726 : hash_table%nele = 0
410 2099726 : DEALLOCATE (hash_table%table)
411 :
412 2099726 : END SUBROUTINE hash_table_release
413 :
414 : ! **************************************************************************************************
415 : !> \brief add a pair (c,p) to the hash table
416 : !> \param hash_table ...
417 : !> \param c this value is being hashed
418 : !> \param p this is being stored
419 : ! **************************************************************************************************
420 8115670 : RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
421 : TYPE(hash_table_type), INTENT(INOUT) :: hash_table
422 : INTEGER, INTENT(IN) :: c, p
423 :
424 : REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
425 : inv_hash_table_fill = 2.5_real_8
426 :
427 : INTEGER :: i, j
428 : TYPE(ele_type), ALLOCATABLE, &
429 8115670 : DIMENSION(:) :: tmp_hash
430 :
431 : ! if too small, make a copy and rehash in a larger table
432 :
433 8115670 : IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
434 0 : ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
435 0 : tmp_hash(:) = hash_table%table
436 0 : CALL hash_table_release(hash_table)
437 0 : CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
438 0 : DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
439 0 : IF (tmp_hash(i)%c .NE. 0) THEN
440 0 : CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
441 : END IF
442 : END DO
443 0 : DEALLOCATE (tmp_hash)
444 : END IF
445 :
446 8115670 : hash_table%nele = hash_table%nele + 1
447 8115670 : i = IAND(c*hash_table%prime, hash_table%nmax)
448 :
449 8115670 : DO j = i, hash_table%nmax
450 8115670 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
451 8115670 : hash_table%table(j)%c = c
452 8115670 : hash_table%table(j)%p = p
453 8115670 : RETURN
454 : END IF
455 : END DO
456 0 : DO j = 0, i - 1
457 0 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
458 0 : hash_table%table(j)%c = c
459 0 : hash_table%table(j)%p = p
460 0 : RETURN
461 : END IF
462 : END DO
463 :
464 : END SUBROUTINE hash_table_add
465 :
466 : ! **************************************************************************************************
467 : !> \brief ...
468 : !> \param hash_table ...
469 : !> \param c ...
470 : !> \return ...
471 : ! **************************************************************************************************
472 121700452 : PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
473 : TYPE(hash_table_type), INTENT(IN) :: hash_table
474 : INTEGER, INTENT(IN) :: c
475 : INTEGER :: p
476 :
477 : INTEGER :: i, j
478 :
479 121700452 : i = IAND(c*hash_table%prime, hash_table%nmax)
480 :
481 : ! catch the likely case first
482 121700452 : IF (hash_table%table(i)%c == c) THEN
483 117374476 : p = hash_table%table(i)%p
484 117374476 : RETURN
485 : END IF
486 :
487 4325976 : DO j = i, hash_table%nmax
488 4325976 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
489 4325976 : p = hash_table%table(j)%p
490 4325976 : RETURN
491 : END IF
492 : END DO
493 0 : DO j = 0, i - 1
494 0 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
495 0 : p = hash_table%table(j)%p
496 0 : RETURN
497 : END IF
498 : END DO
499 :
500 : ! we should never reach this point.
501 121700452 : p = HUGE(p)
502 :
503 : END FUNCTION hash_table_get
504 :
505 : ! End of hashtable
506 : ! --------------------------------------------------------------------------------------------------
507 :
508 : #:set instances = [ &
509 : ('d', 'REAL(kind=real_8)', '0.0_real_8'), &
510 : ('z', 'COMPLEX(kind=real_8)', 'CMPLX(0.0, 0.0, real_8)') ]
511 :
512 : #:for nametype, type, zero in instances
513 :
514 : ! **************************************************************************************************
515 : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
516 : !> \param matrix ...
517 : !> \param vec_in ...
518 : !> \param vec_out ...
519 : !> \param alpha ...
520 : !> \param beta ...
521 : !> \param work_row ...
522 : !> \param work_col ...
523 : ! **************************************************************************************************
524 773708 : SUBROUTINE dbcsr_matrix_colvec_multiply_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
525 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
526 : ${type}$ :: alpha, beta
527 : TYPE(dbcsr_type) :: work_row, work_col
528 :
529 : CHARACTER :: matrix_type
530 :
531 773708 : CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
532 :
533 497553 : SELECT CASE (matrix_type)
534 : CASE (dbcsr_type_no_symmetry)
535 497553 : CALL dbcsr_matrix_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
536 : CASE (dbcsr_type_symmetric)
537 276155 : CALL dbcsr_sym_matrix_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
538 : CASE (dbcsr_type_antisymmetric)
539 : ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
540 0 : CPABORT("NYI, antisymmetric matrix not permitted")
541 : CASE DEFAULT
542 773708 : CPABORT("Unknown matrix type, ...")
543 : END SELECT
544 :
545 773708 : END SUBROUTINE dbcsr_matrix_colvec_multiply_${nametype}$
546 :
547 : ! **************************************************************************************************
548 : !> \brief low level routines for matrix vector multiplies
549 : !> \param matrix ...
550 : !> \param vec_in ...
551 : !> \param vec_out ...
552 : !> \param alpha ...
553 : !> \param beta ...
554 : !> \param work_row ...
555 : !> \param work_col ...
556 : ! **************************************************************************************************
557 497553 : SUBROUTINE dbcsr_matrix_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
558 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
559 : ${type}$ :: alpha, beta
560 : TYPE(dbcsr_type) :: work_row, work_col
561 :
562 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
563 :
564 : INTEGER :: col, mypcol, &
565 : myprow, prow_handle, &
566 : ncols, nrows, &
567 : row, &
568 : handle, handle1, ithread
569 : TYPE(mp_comm_type) :: prow_group
570 497553 : ${type}$, DIMENSION(:), POINTER :: data_vec
571 497553 : ${type}$, DIMENSION(:, :), POINTER :: data_d, vec_res
572 : TYPE(dbcsr_distribution_type) :: dist
573 : TYPE(dbcsr_iterator_type) :: iter
574 497553 : TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
575 : INTEGER :: prow, pcol
576 :
577 497553 : CALL timeset(routineN, handle)
578 497553 : ithread = 0
579 :
580 : ! Collect some data about the parallel environment. We will use them later to move the vector around
581 497553 : CALL dbcsr_get_info(matrix, distribution=dist)
582 497553 : CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
583 :
584 497553 : CALL prow_group%set_handle(prow_handle)
585 :
586 497553 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
587 497553 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
588 :
589 : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
590 497553 : CALL dbcsr_col_vec_to_rep_row_${nametype}$ (vec_in, work_col, work_row, fast_vec_col)
591 :
592 : ! Set the work vector for the results to 0
593 497553 : CALL dbcsr_set(work_col, ${zero}$)
594 :
595 : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
596 : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
597 497553 : CALL timeset(routineN//"_local_mm", handle1)
598 :
599 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
600 497553 : !$OMP SHARED(matrix,fast_vec_col,fast_vec_row)
601 : !$ ithread = omp_get_thread_num()
602 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
603 : DO WHILE (dbcsr_iterator_blocks_left(iter))
604 : CALL dbcsr_iterator_next_block(iter, row, col, data_d)
605 : prow = hash_table_get(fast_vec_col%hash_table, row)
606 : IF (fast_vec_col%blk_map_${nametype}$ (prow)%assigned_thread .NE. ithread) CYCLE
607 : pcol = hash_table_get(fast_vec_row%hash_table, col)
608 : #:if nametype=='d'
609 : IF (SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 1) .EQ. 0 .OR. &
610 : SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 2) .EQ. 0 .OR. &
611 : SIZE(data_d, 2) .EQ. 0) CYCLE
612 : CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 1), &
613 : SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 2), &
614 : SIZE(data_d, 2), &
615 : 1.0_dp, &
616 : data_d, &
617 : SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 1), &
618 : fast_vec_row%blk_map_${nametype}$ (pcol)%ptr, &
619 : SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 2), &
620 : 1.0_dp, &
621 : fast_vec_col%blk_map_${nametype}$ (prow)%ptr, &
622 : SIZE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr, 1))
623 : #:else
624 : fast_vec_col%blk_map_${nametype}$ (prow)%ptr = fast_vec_col%blk_map_${nametype}$ (prow)%ptr + &
625 : MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map_${nametype}$ (pcol)%ptr))
626 : #:endif
627 : END DO
628 : CALL dbcsr_iterator_stop(iter)
629 : !$OMP END PARALLEL
630 :
631 497553 : CALL timestop(handle1)
632 :
633 : ! sum all the data onto the first processor col where the original vector is stored
634 497553 : data_vec => dbcsr_get_data_p(work_col, select_data_type=${zero}$)
635 497553 : CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
636 10530667 : CALL prow_group%sum(data_vec(1:nrows*ncols))
637 :
638 : ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
639 : ! from the replicated to the original vector. Let's play it safe and use the iterator
640 497553 : CALL dbcsr_iterator_start(iter, vec_out)
641 1062734 : DO WHILE (dbcsr_iterator_blocks_left(iter))
642 565181 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
643 565181 : prow = hash_table_get(fast_vec_col%hash_table, row)
644 1062734 : IF (ASSOCIATED(fast_vec_col%blk_map_${nametype}$ (prow)%ptr)) THEN
645 6146919 : vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_${nametype}$ (prow)%ptr(:, :)
646 : ELSE
647 0 : vec_res(:, :) = beta*vec_res(:, :)
648 : END IF
649 : END DO
650 497553 : CALL dbcsr_iterator_stop(iter)
651 :
652 497553 : CALL release_fast_vec_access(fast_vec_row)
653 497553 : CALL release_fast_vec_access(fast_vec_col)
654 :
655 497553 : CALL timestop(handle)
656 :
657 995106 : END SUBROUTINE dbcsr_matrix_vector_mult_${nametype}$
658 :
659 : ! **************************************************************************************************
660 : !> \brief ...
661 : !> \param matrix ...
662 : !> \param vec_in ...
663 : !> \param vec_out ...
664 : !> \param alpha ...
665 : !> \param beta ...
666 : !> \param work_row ...
667 : !> \param work_col ...
668 : !> \param skip_diag ...
669 : ! **************************************************************************************************
670 0 : SUBROUTINE dbcsr_matrixT_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
671 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
672 : ${type}$ :: alpha, beta
673 : TYPE(dbcsr_type) :: work_row, work_col
674 : LOGICAL :: skip_diag
675 :
676 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
677 :
678 : INTEGER :: col, col_size, mypcol, &
679 : myprow, pcol_handle, prow_handle, &
680 : ncols, nrows, &
681 : row, row_size, &
682 : handle, handle1, ithread
683 : TYPE(mp_comm_type) :: pcol_group, prow_group
684 0 : ${type}$, DIMENSION(:), POINTER :: data_vec
685 0 : ${type}$, DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
686 : TYPE(dbcsr_distribution_type) :: dist
687 : TYPE(dbcsr_iterator_type) :: iter
688 :
689 0 : TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col
690 : INTEGER :: prow, pcol
691 :
692 0 : CALL timeset(routineN, handle)
693 0 : ithread = 0
694 :
695 : ! Collect some data about the parallel environment. We will use them later to move the vector around
696 0 : CALL dbcsr_get_info(matrix, distribution=dist)
697 0 : CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
698 :
699 0 : CALL prow_group%set_handle(prow_handle)
700 0 : CALL pcol_group%set_handle(pcol_handle)
701 :
702 0 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
703 0 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
704 :
705 : ! Set the work vector for the results to 0
706 0 : CALL dbcsr_set(work_row, ${zero}$)
707 :
708 : ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
709 0 : CALL dbcsr_iterator_start(iter, vec_in)
710 0 : DO WHILE (dbcsr_iterator_blocks_left(iter))
711 0 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
712 0 : prow = hash_table_get(fast_vec_col%hash_table, row)
713 0 : fast_vec_col%blk_map_${nametype}$ (prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
714 : END DO
715 0 : CALL dbcsr_iterator_stop(iter)
716 : ! Replicate the data on all processore in the row
717 0 : data_vec => dbcsr_get_data_p(work_col, select_data_type=${zero}$)
718 0 : CALL prow_group%bcast(data_vec, 0)
719 :
720 : ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
721 0 : CALL timeset(routineN//"local_mm", handle1)
722 0 : CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
723 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
724 0 : !$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
725 : !$ ithread = omp_get_thread_num()
726 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
727 : DO WHILE (dbcsr_iterator_blocks_left(iter))
728 : CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
729 : IF (skip_diag .AND. col == row) CYCLE
730 : prow = hash_table_get(fast_vec_col%hash_table, row)
731 : pcol = hash_table_get(fast_vec_row%hash_table, col)
732 : IF (ASSOCIATED(fast_vec_row%blk_map_${nametype}$ (pcol)%ptr) .AND. &
733 : ASSOCIATED(fast_vec_col%blk_map_${nametype}$ (prow)%ptr)) THEN
734 : IF (fast_vec_row%blk_map_${nametype}$ (pcol)%assigned_thread .NE. ithread) CYCLE
735 : fast_vec_row%blk_map_${nametype}$ (pcol)%ptr = fast_vec_row%blk_map_${nametype}$ (pcol)%ptr + &
736 : MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr), data_d)
737 : ELSE
738 : prow = hash_table_get(fast_vec_row%hash_table, row)
739 : pcol = hash_table_get(fast_vec_col%hash_table, col)
740 : IF (fast_vec_row%blk_map_${nametype}$ (prow)%assigned_thread .NE. ithread) CYCLE
741 : fast_vec_row%blk_map_${nametype}$ (prow)%ptr = fast_vec_row%blk_map_${nametype}$ (prow)%ptr + &
742 : MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$ (pcol)%ptr), TRANSPOSE(data_d))
743 : END IF
744 : END DO
745 : CALL dbcsr_iterator_stop(iter)
746 : !$OMP END PARALLEL
747 :
748 0 : CALL timestop(handle1)
749 :
750 : ! sum all the data within a processor column to obtain the replicated result
751 0 : data_vec => dbcsr_get_data_p(work_row, select_data_type=${zero}$)
752 : ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
753 0 : CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
754 0 : CALL pcol_group%sum(data_vec(1:nrows*ncols))
755 :
756 : ! Convert the result to a column wise distribution
757 0 : CALL dbcsr_rep_row_to_rep_col_vec_${nametype}$ (work_col, work_row, fast_vec_row)
758 :
759 : ! Create_the final vector by summing it to the result vector which lives on proc_col 0
760 0 : CALL dbcsr_iterator_start(iter, vec_out)
761 0 : DO WHILE (dbcsr_iterator_blocks_left(iter))
762 0 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
763 0 : prow = hash_table_get(fast_vec_col%hash_table, row)
764 0 : IF (ASSOCIATED(fast_vec_col%blk_map_${nametype}$ (prow)%ptr)) THEN
765 0 : vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map_${nametype}$ (prow)%ptr(:, :)
766 : ELSE
767 0 : vec_res(:, :) = beta*vec_res(:, :)
768 : END IF
769 : END DO
770 0 : CALL dbcsr_iterator_stop(iter)
771 :
772 0 : CALL timestop(handle)
773 :
774 0 : END SUBROUTINE dbcsr_matrixT_vector_mult_${nametype}$
775 :
776 : ! **************************************************************************************************
777 : !> \brief ...
778 : !> \param vec_in ...
779 : !> \param rep_col_vec ...
780 : !> \param rep_row_vec ...
781 : !> \param fast_vec_col ...
782 : ! **************************************************************************************************
783 6189664 : SUBROUTINE dbcsr_col_vec_to_rep_row_${nametype}$ (vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
784 : TYPE(dbcsr_type) :: vec_in, rep_col_vec, &
785 : rep_row_vec
786 : TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col
787 :
788 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
789 :
790 : INTEGER :: col, mypcol, myprow, ncols, &
791 : nrows, pcol_handle, prow_handle, &
792 : row, handle
793 : TYPE(mp_comm_type) :: pcol_group, prow_group
794 773708 : INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
795 773708 : ${type}$, DIMENSION(:), POINTER :: data_vec, data_vec_rep
796 773708 : ${type}$, DIMENSION(:, :), POINTER :: vec_row
797 : TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
798 : TYPE(dbcsr_iterator_type) :: iter
799 :
800 773708 : CALL timeset(routineN, handle)
801 :
802 : ! get information about the parallel environment
803 773708 : CALL dbcsr_get_info(vec_in, distribution=dist_in)
804 : CALL dbcsr_distribution_get(dist_in, &
805 : prow_group=prow_handle, &
806 : pcol_group=pcol_handle, &
807 : myprow=myprow, &
808 773708 : mypcol=mypcol)
809 :
810 773708 : CALL prow_group%set_handle(prow_handle)
811 773708 : CALL pcol_group%set_handle(pcol_handle)
812 :
813 : ! Get the vector which tells us which blocks are local to which processor row in the col vec
814 773708 : CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
815 773708 : CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
816 :
817 : ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
818 773708 : CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
819 773708 : data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=${zero}$)
820 773708 : data_vec => dbcsr_get_data_p(vec_in, select_data_type=${zero}$)
821 24067890 : IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
822 : ! Replicate the data along the row
823 24067890 : CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
824 :
825 : ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
826 : ! The rep_col_vec contains all blocks local to the row distribution of the vector.
827 : ! The rep_row_vec only needs the fraction which is local to the col distribution.
828 : ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
829 : ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
830 : ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
831 : ! Hope this clarifies the idea
832 773708 : CALL dbcsr_set(rep_row_vec, ${zero}$)
833 773708 : CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
834 773708 : CALL dbcsr_iterator_start(iter, rep_row_vec)
835 4038264 : DO WHILE (dbcsr_iterator_blocks_left(iter))
836 3264556 : CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
837 4038264 : IF (row_dist(col) == myprow) THEN
838 24933560 : vec_row = TRANSPOSE(fast_vec_col%blk_map_${nametype}$ (hash_table_get(fast_vec_col%hash_table, col))%ptr)
839 : END IF
840 : END DO
841 773708 : CALL dbcsr_iterator_stop(iter)
842 773708 : CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
843 773708 : data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=${zero}$)
844 47224988 : CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
845 :
846 773708 : CALL timestop(handle)
847 :
848 773708 : END SUBROUTINE dbcsr_col_vec_to_rep_row_${nametype}$
849 :
850 : ! **************************************************************************************************
851 : !> \brief ...
852 : !> \param rep_col_vec ...
853 : !> \param rep_row_vec ...
854 : !> \param fast_vec_row ...
855 : !> \param fast_vec_col_add ...
856 : ! **************************************************************************************************
857 1656930 : SUBROUTINE dbcsr_rep_row_to_rep_col_vec_${nametype}$ (rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
858 : TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
859 : TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add
860 : TYPE(fast_vec_access_type) :: fast_vec_row
861 :
862 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
863 :
864 : INTEGER :: col, mypcol, myprow, ncols, &
865 : nrows, prow_handle, &
866 : row, handle
867 276155 : INTEGER, DIMENSION(:), POINTER :: col_dist
868 : TYPE(mp_comm_type) :: prow_group
869 276155 : ${type}$, DIMENSION(:), POINTER :: data_vec_rep
870 276155 : ${type}$, DIMENSION(:, :), POINTER :: vec_col
871 : TYPE(dbcsr_distribution_type) :: dist_rep_row, dist_rep_col
872 : TYPE(dbcsr_iterator_type) :: iter
873 :
874 276155 : CALL timeset(routineN, handle)
875 :
876 : ! get information about the parallel environment
877 276155 : CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
878 : CALL dbcsr_distribution_get(dist_rep_col, &
879 : prow_group=prow_handle, &
880 : myprow=myprow, &
881 276155 : mypcol=mypcol)
882 :
883 276155 : CALL prow_group%set_handle(prow_handle)
884 :
885 : ! Get the vector which tells us which blocks are local to which processor col in the row vec
886 276155 : CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
887 276155 : CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
888 :
889 : ! The same trick as described above with opposite direction
890 276155 : CALL dbcsr_set(rep_col_vec, ${zero}$)
891 276155 : CALL dbcsr_iterator_start(iter, rep_col_vec)
892 1350352 : DO WHILE (dbcsr_iterator_blocks_left(iter))
893 1074197 : CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
894 1074197 : IF (col_dist(row) == mypcol) THEN
895 8778928 : vec_col = TRANSPOSE(fast_vec_row%blk_map_${nametype}$ (hash_table_get(fast_vec_row%hash_table, row))%ptr)
896 : END IF
897 : ! this one is special and allows to add the elements of a not yet summed replicated
898 : ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
899 1074197 : IF (PRESENT(fast_vec_col_add)) vec_col = vec_col + &
900 9055083 : fast_vec_col_add%blk_map_${nametype}$ (hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
901 : END DO
902 276155 : CALL dbcsr_iterator_stop(iter)
903 276155 : CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
904 276155 : data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=${zero}$)
905 13537223 : CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
906 :
907 276155 : CALL timestop(handle)
908 :
909 276155 : END SUBROUTINE dbcsr_rep_row_to_rep_col_vec_${nametype}$
910 :
911 : ! **************************************************************************************************
912 : !> \brief given a column vector, prepare the fast_vec_access container
913 : !> \param vec ...
914 : !> \param fast_vec_access ...
915 : ! **************************************************************************************************
916 1049863 : SUBROUTINE create_fast_col_vec_access_${nametype}$ (vec, fast_vec_access)
917 : TYPE(dbcsr_type) :: vec
918 : TYPE(fast_vec_access_type) :: fast_vec_access
919 :
920 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access_${nametype}$'
921 :
922 : INTEGER :: handle, nblk_local
923 : INTEGER :: col, row, iblock, nthreads
924 1049863 : ${type}$, DIMENSION(:, :), POINTER :: vec_bl
925 : TYPE(dbcsr_iterator_type) :: iter
926 :
927 1049863 : CALL timeset(routineN, handle)
928 :
929 : ! figure out the number of threads
930 1049863 : nthreads = 1
931 1049863 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
932 : !$OMP MASTER
933 : !$ nthreads = OMP_GET_NUM_THREADS()
934 : !$OMP END MASTER
935 : !$OMP END PARALLEL
936 :
937 1049863 : CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
938 : ! 4 times makes sure the table is big enough to limit collisions.
939 1049863 : CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
940 : ! include zero for effective dealing with values not in the hash table (will return 0)
941 6913027 : ALLOCATE (fast_vec_access%blk_map_${nametype}$ (0:nblk_local))
942 :
943 1049863 : CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
944 1049863 : IF (col .GT. 1) CPABORT("BUG")
945 :
946 : ! go through the blocks of the vector
947 1049863 : iblock = 0
948 1049863 : CALL dbcsr_iterator_start(iter, vec)
949 3763438 : DO WHILE (dbcsr_iterator_blocks_left(iter))
950 2713575 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
951 2713575 : iblock = iblock + 1
952 2713575 : CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
953 2713575 : fast_vec_access%blk_map_${nametype}$ (iblock)%ptr => vec_bl
954 2713575 : fast_vec_access%blk_map_${nametype}$ (iblock)%assigned_thread = MOD(iblock, nthreads)
955 : END DO
956 1049863 : CALL dbcsr_iterator_stop(iter)
957 :
958 1049863 : CALL timestop(handle)
959 :
960 3149589 : END SUBROUTINE create_fast_col_vec_access_${nametype}$
961 :
962 : ! **************************************************************************************************
963 : !> \brief given a row vector, prepare the fast_vec_access_container
964 : !> \param vec ...
965 : !> \param fast_vec_access ...
966 : ! **************************************************************************************************
967 1049863 : SUBROUTINE create_fast_row_vec_access_${nametype}$ (vec, fast_vec_access)
968 : TYPE(dbcsr_type) :: vec
969 : TYPE(fast_vec_access_type) :: fast_vec_access
970 :
971 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access_${nametype}$'
972 :
973 : INTEGER :: handle, nblk_local
974 : INTEGER :: col, row, iblock, nthreads
975 1049863 : ${type}$, DIMENSION(:, :), POINTER :: vec_bl
976 : TYPE(dbcsr_iterator_type) :: iter
977 :
978 1049863 : CALL timeset(routineN, handle)
979 :
980 : ! figure out the number of threads
981 1049863 : nthreads = 1
982 1049863 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
983 : !$OMP MASTER
984 : !$ nthreads = OMP_GET_NUM_THREADS()
985 : !$OMP END MASTER
986 : !$OMP END PARALLEL
987 :
988 1049863 : CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
989 : ! 4 times makes sure the table is big enough to limit collisions.
990 1049863 : CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
991 : ! include zero for effective dealing with values not in the hash table (will return 0)
992 9601547 : ALLOCATE (fast_vec_access%blk_map_${nametype}$ (0:nblk_local))
993 :
994 : ! sanity check
995 1049863 : CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
996 1049863 : IF (row .GT. 1) CPABORT("BUG")
997 :
998 : ! go through the blocks of the vector
999 1049863 : iblock = 0
1000 1049863 : CALL dbcsr_iterator_start(iter, vec)
1001 6451958 : DO WHILE (dbcsr_iterator_blocks_left(iter))
1002 5402095 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
1003 5402095 : iblock = iblock + 1
1004 5402095 : CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
1005 5402095 : fast_vec_access%blk_map_${nametype}$ (iblock)%ptr => vec_bl
1006 5402095 : fast_vec_access%blk_map_${nametype}$ (iblock)%assigned_thread = MOD(iblock, nthreads)
1007 : END DO
1008 1049863 : CALL dbcsr_iterator_stop(iter)
1009 :
1010 1049863 : CALL timestop(handle)
1011 :
1012 3149589 : END SUBROUTINE create_fast_row_vec_access_${nametype}$
1013 :
1014 : ! **************************************************************************************************
1015 : !> \brief ...
1016 : !> \param matrix ...
1017 : !> \param vec_in ...
1018 : !> \param vec_out ...
1019 : !> \param alpha ...
1020 : !> \param beta ...
1021 : !> \param work_row ...
1022 : !> \param work_col ...
1023 : ! **************************************************************************************************
1024 276155 : SUBROUTINE dbcsr_sym_matrix_vector_mult_${nametype}$ (matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
1025 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
1026 : ${type}$ :: alpha, beta
1027 : TYPE(dbcsr_type) :: work_row, work_col
1028 :
1029 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_m_v_mult'
1030 :
1031 : INTEGER :: col, mypcol, &
1032 : myprow, &
1033 : nrows, ncols, &
1034 : row, pcol_handle, &
1035 : handle, handle1, ithread, vec_dim
1036 276155 : ${type}$, DIMENSION(:), POINTER :: data_vec
1037 276155 : ${type}$, DIMENSION(:, :), POINTER :: data_d, vec_res
1038 : TYPE(dbcsr_distribution_type) :: dist
1039 : TYPE(dbcsr_iterator_type) :: iter
1040 : TYPE(dbcsr_type) :: result_row, result_col
1041 : TYPE(mp_comm_type) :: pcol_group
1042 :
1043 276155 : TYPE(fast_vec_access_type) :: fast_vec_row, fast_vec_col, res_fast_vec_row, res_fast_vec_col
1044 : INTEGER :: prow, pcol, rprow, rpcol
1045 :
1046 276155 : CALL timeset(routineN, handle)
1047 276155 : ithread = 0
1048 : ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
1049 276155 : CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
1050 : ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
1051 276155 : CALL dbcsr_set(work_col, ${zero}$)
1052 276155 : CALL dbcsr_copy(result_col, work_col)
1053 276155 : CALL dbcsr_set(work_row, ${zero}$)
1054 276155 : CALL dbcsr_copy(result_row, work_row)
1055 :
1056 : ! Collect some data about the parallel environment. We will use them later to move the vector around
1057 276155 : CALL dbcsr_get_info(matrix=matrix, distribution=dist)
1058 276155 : CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
1059 :
1060 276155 : CALL pcol_group%set_handle(pcol_handle)
1061 :
1062 276155 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
1063 276155 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
1064 276155 : CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
1065 276155 : CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
1066 :
1067 : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
1068 276155 : CALL dbcsr_col_vec_to_rep_row_${nametype}$ (vec_in, work_col, work_row, fast_vec_col)
1069 :
1070 : ! Probably I should rename the routine above as it delivers both the replicated row and column vector
1071 :
1072 : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
1073 : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
1074 276155 : CALL timeset(routineN//"_local_mm", handle1)
1075 :
1076 : !------ perform the multiplication, we have to take car to take the correct blocks ----------
1077 :
1078 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
1079 276155 : !$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
1080 : !$ ithread = omp_get_thread_num()
1081 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
1082 : DO WHILE (dbcsr_iterator_blocks_left(iter))
1083 : CALL dbcsr_iterator_next_block(iter, row, col, data_d)
1084 : pcol = hash_table_get(fast_vec_row%hash_table, col)
1085 : rprow = hash_table_get(res_fast_vec_col%hash_table, row)
1086 : IF (ASSOCIATED(fast_vec_row%blk_map_${nametype}$ (pcol)%ptr) .AND. &
1087 : ASSOCIATED(res_fast_vec_col%blk_map_${nametype}$ (rprow)%ptr)) THEN
1088 : IF (res_fast_vec_col%blk_map_${nametype}$ (rprow)%assigned_thread .EQ. ithread) THEN
1089 : res_fast_vec_col%blk_map_${nametype}$ (rprow)%ptr = res_fast_vec_col%blk_map_${nametype}$ (rprow)%ptr + &
1090 : MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map_${nametype}$ (pcol)%ptr))
1091 : END IF
1092 : prow = hash_table_get(fast_vec_col%hash_table, row)
1093 : rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
1094 : IF (res_fast_vec_row%blk_map_${nametype}$ (rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1095 : res_fast_vec_row%blk_map_${nametype}$ (rpcol)%ptr = res_fast_vec_row%blk_map_${nametype}$ (rpcol)%ptr + &
1096 : MATMUL(TRANSPOSE(fast_vec_col%blk_map_${nametype}$ (prow)%ptr), data_d)
1097 : END IF
1098 : ELSE
1099 : rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
1100 : prow = hash_table_get(fast_vec_row%hash_table, row)
1101 : IF (res_fast_vec_col%blk_map_${nametype}$ (rpcol)%assigned_thread .EQ. ithread) THEN
1102 : res_fast_vec_col%blk_map_${nametype}$ (rpcol)%ptr = res_fast_vec_col%blk_map_${nametype}$ (rpcol)%ptr + &
1103 : TRANSPOSE(MATMUL(fast_vec_row%blk_map_${nametype}$ (prow)%ptr, data_d))
1104 : END IF
1105 : rprow = hash_table_get(res_fast_vec_row%hash_table, row)
1106 : pcol = hash_table_get(fast_vec_col%hash_table, col)
1107 : IF (res_fast_vec_row%blk_map_${nametype}$ (rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1108 : res_fast_vec_row%blk_map_${nametype}$ (rprow)%ptr = res_fast_vec_row%blk_map_${nametype}$ (rprow)%ptr + &
1109 : TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map_${nametype}$ (pcol)%ptr))
1110 : END IF
1111 : END IF
1112 : END DO
1113 : CALL dbcsr_iterator_stop(iter)
1114 : !$OMP END PARALLEL
1115 :
1116 276155 : CALL timestop(handle1)
1117 :
1118 : ! sum all the data within a processor column to obtain the replicated result from lower
1119 276155 : data_vec => dbcsr_get_data_p(result_row, select_data_type=${zero}$)
1120 276155 : CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
1121 :
1122 26688373 : CALL pcol_group%sum(data_vec(1:nrows*ncols))
1123 : !
1124 : !! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
1125 : !! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
1126 : !! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
1127 : !! parallel bits are locally added. The sum magically creates the correct vector
1128 276155 : CALL dbcsr_rep_row_to_rep_col_vec_${nametype}$ (work_col, result_row, res_fast_vec_row, res_fast_vec_col)
1129 :
1130 : ! ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
1131 276155 : CALL dbcsr_iterator_start(iter, vec_out)
1132 1350352 : DO WHILE (dbcsr_iterator_blocks_left(iter))
1133 1074197 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
1134 1074197 : prow = hash_table_get(fast_vec_col%hash_table, row)
1135 1350352 : IF (ASSOCIATED(fast_vec_col%blk_map_${nametype}$ (prow)%ptr)) THEN
1136 8778928 : vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map_${nametype}$ (prow)%ptr(:, :))
1137 : ELSE
1138 0 : vec_res(:, :) = beta*vec_res(:, :)
1139 : END IF
1140 : END DO
1141 276155 : CALL dbcsr_iterator_stop(iter)
1142 :
1143 276155 : CALL release_fast_vec_access(fast_vec_row)
1144 276155 : CALL release_fast_vec_access(fast_vec_col)
1145 276155 : CALL release_fast_vec_access(res_fast_vec_row)
1146 276155 : CALL release_fast_vec_access(res_fast_vec_col)
1147 :
1148 276155 : CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
1149 :
1150 276155 : CALL timestop(handle)
1151 :
1152 552310 : END SUBROUTINE dbcsr_sym_matrix_vector_mult_${nametype}$
1153 :
1154 : #:endfor
1155 :
1156 0 : END MODULE dbcsr_vector
|