Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2025 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
10 : !> \par History
11 : !> 2014.10 created [Florian Schiffmann]
12 : !> 2023.12 Removed support for single-precision [Ole Schuett]
13 : !> 2024.12 Removed support for complex input matrices [Ole Schuett]
14 : !> \author Florian Schiffmann
15 : ! **************************************************************************************************
16 : MODULE arnoldi_vector
17 : USE cp_dbcsr_api, ONLY: &
18 : dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &
19 : dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_data_p, dbcsr_get_info, &
20 : dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, dbcsr_iterator_start, &
21 : dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_release, dbcsr_reserve_all_blocks, &
22 : dbcsr_set, dbcsr_type, dbcsr_type_antisymmetric, dbcsr_type_no_symmetry, &
23 : dbcsr_type_real_8, dbcsr_type_symmetric
24 : USE kinds, ONLY: dp,&
25 : real_8
26 : USE message_passing, ONLY: mp_comm_type
27 : #include "../base/base_uses.f90"
28 :
29 : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
30 :
31 : IMPLICIT NONE
32 :
33 : PRIVATE
34 :
35 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'arnoldi_vector'
36 :
37 : ! **************************************************************************************************
38 : !> \brief Types needed for the hashtable.
39 : ! **************************************************************************************************
40 : TYPE ele_type
41 : INTEGER :: c = 0
42 : INTEGER :: p = 0
43 : END TYPE ele_type
44 :
45 : TYPE hash_table_type
46 : TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
47 : INTEGER :: nele = 0
48 : INTEGER :: nmax = 0
49 : INTEGER :: prime = 0
50 : END TYPE hash_table_type
51 :
52 : ! **************************************************************************************************
53 : !> \brief Types needed for fast access to distributed dbcsr vectors.
54 : ! **************************************************************************************************
55 : TYPE block_ptr
56 : REAL(real_8), DIMENSION(:, :), POINTER :: ptr => NULL()
57 : INTEGER :: assigned_thread = -1
58 : END TYPE block_ptr
59 :
60 : TYPE fast_vec_access_type
61 : TYPE(hash_table_type) :: hash_table = hash_table_type()
62 : TYPE(block_ptr), DIMENSION(:), ALLOCATABLE :: blk_map
63 : END TYPE
64 :
65 : PUBLIC :: dbcsr_matrix_colvec_multiply, &
66 : create_col_vec_from_matrix, &
67 : create_row_vec_from_matrix, &
68 : create_replicated_col_vec_from_matrix, &
69 : create_replicated_row_vec_from_matrix
70 :
71 : CONTAINS
72 :
73 : ! **************************************************************************************************
74 : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
75 : !> and has the same row dist as the template matrix
76 : !> the returned matrix is fully allocated and all blocks are set to 0
77 : !> this is not a sparse object (and must never be)
78 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
79 : !> \param matrix a dbcsr matrix used as template
80 : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
81 : ! **************************************************************************************************
82 262218 : SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
83 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
84 : INTEGER :: ncol
85 :
86 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
87 :
88 : INTEGER :: data_type, handle, npcols
89 262218 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
90 262218 : row_dist
91 : TYPE(dbcsr_distribution_type) :: dist, dist_col_vec
92 :
93 262218 : CALL timeset(routineN, handle)
94 :
95 262218 : CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
96 262218 : CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
97 262218 : CPASSERT(data_type == dbcsr_type_real_8)
98 :
99 262218 : ALLOCATE (col_dist(1), col_blk_size(1))
100 262218 : col_dist(1) = 0
101 262218 : col_blk_size(1) = ncol
102 262218 : CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
103 :
104 : CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
105 : matrix_type=dbcsr_type_no_symmetry, &
106 : row_blk_size=row_blk_size, &
107 : col_blk_size=col_blk_size, &
108 262218 : nze=0, data_type=data_type)
109 262218 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
110 :
111 262218 : CALL dbcsr_distribution_release(dist_col_vec)
112 262218 : DEALLOCATE (col_dist, col_blk_size)
113 262218 : CALL timestop(handle)
114 :
115 524436 : END SUBROUTINE create_col_vec_from_matrix
116 :
117 : ! **************************************************************************************************
118 : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
119 : !> and has the same row dist as the template matrix
120 : !> the returned matrix is fully allocated and all blocks are set to 0
121 : !> this is not a sparse object (and must never be)
122 : !> \param dbcsr_vec ...
123 : !> \param matrix a dbcsr matrix used as template
124 : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
125 : ! **************************************************************************************************
126 0 : SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
127 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
128 : INTEGER :: nrow
129 :
130 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
131 :
132 : INTEGER :: data_type, handle, nprows
133 0 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
134 0 : row_dist
135 : TYPE(dbcsr_distribution_type) :: dist, dist_row_vec
136 :
137 0 : CALL timeset(routineN, handle)
138 :
139 0 : CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist)
140 0 : CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
141 0 : CPASSERT(data_type == dbcsr_type_real_8)
142 :
143 0 : ALLOCATE (row_dist(1), row_blk_size(1))
144 0 : row_dist(1) = 0
145 0 : row_blk_size(1) = nrow
146 0 : CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
147 :
148 : CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, &
149 : matrix_type=dbcsr_type_no_symmetry, &
150 : row_blk_size=row_blk_size, &
151 : col_blk_size=col_blk_size, &
152 0 : nze=0, data_type=data_type)
153 0 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
154 :
155 0 : CALL dbcsr_distribution_release(dist_row_vec)
156 0 : DEALLOCATE (row_dist, row_blk_size)
157 0 : CALL timestop(handle)
158 :
159 0 : END SUBROUTINE create_row_vec_from_matrix
160 :
161 : ! **************************************************************************************************
162 : !> \brief creates a col vector like object whose blocks can be replicated
163 : !> along the processor row and has the same row dist as the template matrix
164 : !> the returned matrix is fully allocated and all blocks are set to 0
165 : !> this is not a sparse object (and must never be)
166 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
167 : !> \param matrix a dbcsr matrix used as template
168 : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
169 : ! **************************************************************************************************
170 128837 : SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
171 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
172 : INTEGER :: ncol
173 :
174 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
175 :
176 : INTEGER :: data_type, handle, i, npcols
177 128837 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
178 128837 : row_dist
179 : TYPE(dbcsr_distribution_type) :: dist, dist_col_vec
180 :
181 128837 : CALL timeset(routineN, handle)
182 :
183 128837 : CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
184 128837 : CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
185 128837 : CPASSERT(data_type == dbcsr_type_real_8)
186 :
187 644185 : ALLOCATE (col_dist(npcols), col_blk_size(npcols))
188 257674 : col_blk_size(:) = ncol
189 257674 : DO i = 0, npcols - 1
190 257674 : col_dist(i + 1) = i
191 : END DO
192 128837 : CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
193 :
194 : CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
195 : matrix_type=dbcsr_type_no_symmetry, &
196 : row_blk_size=row_blk_size, &
197 : col_blk_size=col_blk_size, &
198 128837 : nze=0, data_type=data_type)
199 128837 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
200 :
201 128837 : CALL dbcsr_distribution_release(dist_col_vec)
202 128837 : DEALLOCATE (col_dist, col_blk_size)
203 128837 : CALL timestop(handle)
204 :
205 257674 : END SUBROUTINE create_replicated_col_vec_from_matrix
206 :
207 : ! **************************************************************************************************
208 : !> \brief creates a row vector like object whose blocks can be replicated
209 : !> along the processor col and has the same col dist as the template matrix
210 : !> the returned matrix is fully allocated and all blocks are set to 0
211 : !> this is not a sparse object (and must never be)
212 : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
213 : !> \param matrix a dbcsr matrix used as template
214 : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
215 : ! **************************************************************************************************
216 128837 : SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
217 : TYPE(dbcsr_type) :: dbcsr_vec, matrix
218 : INTEGER :: nrow
219 :
220 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
221 :
222 : INTEGER :: data_type, handle, i, nprows
223 128837 : INTEGER, DIMENSION(:), POINTER :: col_blk_size, col_dist, row_blk_size, &
224 128837 : row_dist
225 : TYPE(dbcsr_distribution_type) :: dist, dist_row_vec
226 :
227 128837 : CALL timeset(routineN, handle)
228 :
229 128837 : CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size, data_type=data_type)
230 128837 : CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
231 128837 : CPASSERT(data_type == dbcsr_type_real_8)
232 :
233 644185 : ALLOCATE (row_dist(nprows), row_blk_size(nprows))
234 384156 : row_blk_size(:) = nrow
235 384156 : DO i = 0, nprows - 1
236 384156 : row_dist(i + 1) = i
237 : END DO
238 128837 : CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
239 :
240 : CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
241 : row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
242 128837 : nze=0, data_type=data_type)
243 128837 : CALL dbcsr_reserve_all_blocks(dbcsr_vec)
244 :
245 128837 : CALL dbcsr_distribution_release(dist_row_vec)
246 128837 : DEALLOCATE (row_dist, row_blk_Size)
247 128837 : CALL timestop(handle)
248 :
249 257674 : END SUBROUTINE create_replicated_row_vec_from_matrix
250 :
251 : ! **************************************************************************************************
252 : !> \brief given a column vector, prepare the fast_vec_access container
253 : !> \param vec ...
254 : !> \param fast_vec_access ...
255 : ! **************************************************************************************************
256 1039529 : SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
257 : TYPE(dbcsr_type) :: vec
258 : TYPE(fast_vec_access_type) :: fast_vec_access
259 :
260 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
261 :
262 : INTEGER :: col, handle, iblock, nblk_local, &
263 : nthreads, row
264 1039529 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
265 : TYPE(dbcsr_iterator_type) :: iter
266 :
267 1039529 : CALL timeset(routineN, handle)
268 :
269 : ! figure out the number of threads
270 1039529 : nthreads = 1
271 1039529 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
272 : !$OMP MASTER
273 : !$ nthreads = OMP_GET_NUM_THREADS()
274 : !$OMP END MASTER
275 : !$OMP END PARALLEL
276 :
277 1039529 : CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
278 : ! 4 times makes sure the table is big enough to limit collisions.
279 1039529 : CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
280 : ! include zero for effective dealing with values not in the hash table (will return 0)
281 6831408 : ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
282 :
283 1039529 : CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
284 1039529 : IF (col .GT. 1) CPABORT("BUG")
285 :
286 : ! go through the blocks of the vector
287 1039529 : iblock = 0
288 1039529 : CALL dbcsr_iterator_start(iter, vec)
289 3712821 : DO WHILE (dbcsr_iterator_blocks_left(iter))
290 2673292 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
291 2673292 : iblock = iblock + 1
292 2673292 : CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
293 2673292 : fast_vec_access%blk_map(iblock)%ptr => vec_bl
294 2673292 : fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
295 : END DO
296 1039529 : CALL dbcsr_iterator_stop(iter)
297 1039529 : CALL timestop(handle)
298 :
299 3118587 : END SUBROUTINE create_fast_col_vec_access
300 :
301 : ! **************************************************************************************************
302 : !> \brief given a row vector, prepare the fast_vec_access_container
303 : !> \param vec ...
304 : !> \param fast_vec_access ...
305 : ! **************************************************************************************************
306 1039529 : SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
307 : TYPE(dbcsr_type) :: vec
308 : TYPE(fast_vec_access_type) :: fast_vec_access
309 :
310 : CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
311 :
312 : INTEGER :: col, handle, iblock, nblk_local, &
313 : nthreads, row
314 1039529 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_bl
315 : TYPE(dbcsr_iterator_type) :: iter
316 :
317 1039529 : CALL timeset(routineN, handle)
318 :
319 : ! figure out the number of threads
320 1039529 : nthreads = 1
321 1039529 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
322 : !$OMP MASTER
323 : !$ nthreads = OMP_GET_NUM_THREADS()
324 : !$OMP END MASTER
325 : !$OMP END PARALLEL
326 :
327 1039529 : CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
328 : ! 4 times makes sure the table is big enough to limit collisions.
329 1039529 : CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
330 : ! include zero for effective dealing with values not in the hash table (will return 0)
331 9479645 : ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
332 :
333 : ! sanity check
334 1039529 : CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
335 1039529 : IF (row .GT. 1) CPABORT("BUG")
336 :
337 : ! go through the blocks of the vector
338 1039529 : iblock = 0
339 1039529 : CALL dbcsr_iterator_start(iter, vec)
340 6361058 : DO WHILE (dbcsr_iterator_blocks_left(iter))
341 5321529 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
342 5321529 : iblock = iblock + 1
343 5321529 : CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
344 5321529 : fast_vec_access%blk_map(iblock)%ptr => vec_bl
345 5321529 : fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
346 : END DO
347 1039529 : CALL dbcsr_iterator_stop(iter)
348 :
349 1039529 : CALL timestop(handle)
350 :
351 3118587 : END SUBROUTINE create_fast_row_vec_access
352 :
353 : ! **************************************************************************************************
354 : !> \brief release all memory associated with the fast_vec_access type
355 : !> \param fast_vec_access ...
356 : ! **************************************************************************************************
357 2079058 : SUBROUTINE release_fast_vec_access(fast_vec_access)
358 : TYPE(fast_vec_access_type) :: fast_vec_access
359 :
360 : CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
361 :
362 : INTEGER :: handle
363 :
364 2079058 : CALL timeset(routineN, handle)
365 :
366 2079058 : CALL hash_table_release(fast_vec_access%hash_table)
367 2079058 : DEALLOCATE (fast_vec_access%blk_map)
368 :
369 2079058 : CALL timestop(handle)
370 :
371 2079058 : END SUBROUTINE release_fast_vec_access
372 :
373 : ! --------------------------------------------------------------------------------------------------
374 : ! Beginning of hashtable.
375 : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
376 : ! part of the module to guarantee inlining
377 : ! hashes (c,p) pairs, where p is assumed to be >0
378 : ! on return (0 is used as a flag for not present)
379 : !
380 : !
381 : ! **************************************************************************************************
382 : !> \brief finds a prime equal or larger than i, needed at creation
383 : !> \param i ...
384 : !> \return ...
385 : ! **************************************************************************************************
386 2079058 : FUNCTION matching_prime(i) RESULT(res)
387 : INTEGER, INTENT(IN) :: i
388 : INTEGER :: res
389 :
390 : INTEGER :: j
391 :
392 2079058 : res = i
393 2079058 : j = 0
394 6409986 : DO WHILE (j < res)
395 54416043 : DO j = 2, res - 1
396 52336985 : IF (MOD(res, j) == 0) THEN
397 2251870 : res = res + 1
398 2251870 : EXIT
399 : END IF
400 : END DO
401 : END DO
402 2079058 : END FUNCTION
403 :
404 : ! **************************************************************************************************
405 : !> \brief create a hash_table of given initial size.
406 : !> the hash table will expand as needed (but this requires rehashing)
407 : !> \param hash_table ...
408 : !> \param table_size ...
409 : ! **************************************************************************************************
410 2079058 : SUBROUTINE hash_table_create(hash_table, table_size)
411 : TYPE(hash_table_type) :: hash_table
412 : INTEGER, INTENT(IN) :: table_size
413 :
414 : INTEGER :: j
415 :
416 : ! guarantee a minimal hash table size (8), so that expansion works
417 :
418 2079058 : j = 3
419 4054786 : DO WHILE (2**j - 1 < table_size)
420 1975728 : j = j + 1
421 : END DO
422 2079058 : hash_table%nmax = 2**j - 1
423 2079058 : hash_table%prime = matching_prime(hash_table%nmax)
424 2079058 : hash_table%nele = 0
425 56731326 : ALLOCATE (hash_table%table(0:hash_table%nmax))
426 2079058 : END SUBROUTINE hash_table_create
427 :
428 : ! **************************************************************************************************
429 : !> \brief ...
430 : !> \param hash_table ...
431 : ! **************************************************************************************************
432 2079058 : SUBROUTINE hash_table_release(hash_table)
433 : TYPE(hash_table_type) :: hash_table
434 :
435 2079058 : hash_table%nmax = 0
436 2079058 : hash_table%nele = 0
437 2079058 : DEALLOCATE (hash_table%table)
438 :
439 2079058 : END SUBROUTINE hash_table_release
440 :
441 : ! **************************************************************************************************
442 : !> \brief add a pair (c,p) to the hash table
443 : !> \param hash_table ...
444 : !> \param c this value is being hashed
445 : !> \param p this is being stored
446 : ! **************************************************************************************************
447 7994821 : RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
448 : TYPE(hash_table_type), INTENT(INOUT) :: hash_table
449 : INTEGER, INTENT(IN) :: c, p
450 :
451 : REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
452 : inv_hash_table_fill = 2.5_real_8
453 :
454 : INTEGER :: i, j
455 7994821 : TYPE(ele_type), ALLOCATABLE, DIMENSION(:) :: tmp_hash
456 :
457 : ! if too small, make a copy and rehash in a larger table
458 :
459 7994821 : IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
460 0 : ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
461 0 : tmp_hash(:) = hash_table%table
462 0 : CALL hash_table_release(hash_table)
463 0 : CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
464 0 : DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
465 0 : IF (tmp_hash(i)%c .NE. 0) THEN
466 0 : CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
467 : END IF
468 : END DO
469 0 : DEALLOCATE (tmp_hash)
470 : END IF
471 :
472 7994821 : hash_table%nele = hash_table%nele + 1
473 7994821 : i = IAND(c*hash_table%prime, hash_table%nmax)
474 :
475 7994821 : DO j = i, hash_table%nmax
476 7994821 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
477 7994821 : hash_table%table(j)%c = c
478 7994821 : hash_table%table(j)%p = p
479 7994821 : RETURN
480 : END IF
481 : END DO
482 0 : DO j = 0, i - 1
483 0 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
484 0 : hash_table%table(j)%c = c
485 0 : hash_table%table(j)%p = p
486 0 : RETURN
487 : END IF
488 : END DO
489 :
490 : END SUBROUTINE hash_table_add
491 :
492 : ! **************************************************************************************************
493 : !> \brief ...
494 : !> \param hash_table ...
495 : !> \param c ...
496 : !> \return ...
497 : ! **************************************************************************************************
498 120147260 : PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
499 : TYPE(hash_table_type), INTENT(IN) :: hash_table
500 : INTEGER, INTENT(IN) :: c
501 : INTEGER :: p
502 :
503 : INTEGER :: i, j
504 :
505 120147260 : i = IAND(c*hash_table%prime, hash_table%nmax)
506 :
507 : ! catch the likely case first
508 120147260 : IF (hash_table%table(i)%c == c) THEN
509 115851280 : p = hash_table%table(i)%p
510 115851280 : RETURN
511 : END IF
512 :
513 4295980 : DO j = i, hash_table%nmax
514 4295980 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
515 4295980 : p = hash_table%table(j)%p
516 4295980 : RETURN
517 : END IF
518 : END DO
519 0 : DO j = 0, i - 1
520 0 : IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
521 0 : p = hash_table%table(j)%p
522 0 : RETURN
523 : END IF
524 : END DO
525 :
526 : ! we should never reach this point.
527 120147260 : p = HUGE(p)
528 :
529 : END FUNCTION hash_table_get
530 :
531 : ! End of hashtable
532 : ! --------------------------------------------------------------------------------------------------
533 :
534 : ! **************************************************************************************************
535 : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
536 : !> \param matrix ...
537 : !> \param vec_in ...
538 : !> \param vec_out ...
539 : !> \param alpha ...
540 : !> \param beta ...
541 : !> \param work_row ...
542 : !> \param work_col ...
543 : ! **************************************************************************************************
544 769232 : SUBROUTINE dbcsr_matrix_colvec_multiply(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
545 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
546 : REAL(kind=real_8) :: alpha, beta
547 : TYPE(dbcsr_type) :: work_row, work_col
548 :
549 : CHARACTER :: matrix_type
550 : INTEGER :: data_type
551 :
552 769232 : CALL dbcsr_get_info(matrix, data_type=data_type, matrix_type=matrix_type)
553 769232 : CPASSERT(data_type == dbcsr_type_real_8)
554 :
555 498935 : SELECT CASE (matrix_type)
556 : CASE (dbcsr_type_no_symmetry)
557 498935 : CALL dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
558 : CASE (dbcsr_type_symmetric)
559 270297 : CALL dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
560 : CASE (dbcsr_type_antisymmetric)
561 : ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
562 0 : CPABORT("NYI, antisymmetric matrix not permitted")
563 : CASE DEFAULT
564 769232 : CPABORT("Unknown matrix type, ...")
565 : END SELECT
566 :
567 769232 : END SUBROUTINE dbcsr_matrix_colvec_multiply
568 :
569 : ! **************************************************************************************************
570 : !> \brief low level routines for matrix vector multiplies
571 : !> \param matrix ...
572 : !> \param vec_in ...
573 : !> \param vec_out ...
574 : !> \param alpha ...
575 : !> \param beta ...
576 : !> \param work_row ...
577 : !> \param work_col ...
578 : ! **************************************************************************************************
579 498935 : SUBROUTINE dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
580 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
581 : REAL(kind=real_8) :: alpha, beta
582 : TYPE(dbcsr_type) :: work_row, work_col
583 :
584 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
585 :
586 : INTEGER :: col, data_type, handle, handle1, &
587 : ithread, mypcol, myprow, ncols, nrows, &
588 : pcol, prow, prow_handle, row
589 498935 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
590 498935 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
591 : TYPE(dbcsr_distribution_type) :: dist
592 : TYPE(dbcsr_iterator_type) :: iter
593 498935 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row
594 : TYPE(mp_comm_type) :: prow_group
595 :
596 498935 : CALL timeset(routineN, handle)
597 498935 : ithread = 0
598 :
599 : ! Collect some data about the parallel environment. We will use them later to move the vector around
600 498935 : CALL dbcsr_get_info(matrix, distribution=dist, data_type=data_type)
601 498935 : CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
602 498935 : CPASSERT(data_type == dbcsr_type_real_8)
603 :
604 498935 : CALL prow_group%set_handle(prow_handle)
605 :
606 498935 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
607 498935 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
608 :
609 : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
610 498935 : CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
611 :
612 : ! Set the work vector for the results to 0
613 498935 : CALL dbcsr_set(work_col, 0.0_real_8)
614 :
615 : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
616 : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
617 498935 : CALL timeset(routineN//"_local_mm", handle1)
618 :
619 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
620 498935 : !$OMP SHARED(matrix,fast_vec_col,fast_vec_row)
621 : !$ ithread = omp_get_thread_num()
622 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
623 : DO WHILE (dbcsr_iterator_blocks_left(iter))
624 : CALL dbcsr_iterator_next_block(iter, row, col, data_d)
625 : prow = hash_table_get(fast_vec_col%hash_table, row)
626 : IF (fast_vec_col%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
627 : pcol = hash_table_get(fast_vec_row%hash_table, col)
628 : IF (SIZE(fast_vec_col%blk_map(prow)%ptr, 1) .EQ. 0 .OR. &
629 : SIZE(fast_vec_col%blk_map(prow)%ptr, 2) .EQ. 0 .OR. &
630 : SIZE(data_d, 2) .EQ. 0) CYCLE
631 : CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
632 : SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
633 : SIZE(data_d, 2), &
634 : 1.0_dp, &
635 : data_d, &
636 : SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
637 : fast_vec_row%blk_map(pcol)%ptr, &
638 : SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
639 : 1.0_dp, &
640 : fast_vec_col%blk_map(prow)%ptr, &
641 : SIZE(fast_vec_col%blk_map(prow)%ptr, 1))
642 : END DO
643 : CALL dbcsr_iterator_stop(iter)
644 : !$OMP END PARALLEL
645 :
646 498935 : CALL timestop(handle1)
647 :
648 : ! sum all the data onto the first processor col where the original vector is stored
649 498935 : data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
650 498935 : CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
651 10567301 : CALL prow_group%sum(data_vec(1:nrows*ncols))
652 :
653 : ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
654 : ! from the replicated to the original vector. Let's play it safe and use the iterator
655 498935 : CALL dbcsr_iterator_start(iter, vec_out)
656 1068485 : DO WHILE (dbcsr_iterator_blocks_left(iter))
657 569550 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
658 569550 : prow = hash_table_get(fast_vec_col%hash_table, row)
659 1068485 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
660 6173283 : vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
661 : ELSE
662 0 : vec_res(:, :) = beta*vec_res(:, :)
663 : END IF
664 : END DO
665 498935 : CALL dbcsr_iterator_stop(iter)
666 :
667 498935 : CALL release_fast_vec_access(fast_vec_row)
668 498935 : CALL release_fast_vec_access(fast_vec_col)
669 :
670 498935 : CALL timestop(handle)
671 :
672 997870 : END SUBROUTINE dbcsr_matrix_vector_mult
673 :
674 : ! **************************************************************************************************
675 : !> \brief ...
676 : !> \param matrix ...
677 : !> \param vec_in ...
678 : !> \param vec_out ...
679 : !> \param alpha ...
680 : !> \param beta ...
681 : !> \param work_row ...
682 : !> \param work_col ...
683 : !> \param skip_diag ...
684 : ! **************************************************************************************************
685 0 : SUBROUTINE dbcsr_matrixT_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
686 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
687 : REAL(kind=real_8) :: alpha, beta
688 : TYPE(dbcsr_type) :: work_row, work_col
689 : LOGICAL :: skip_diag
690 :
691 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
692 :
693 : INTEGER :: col, col_size, data_type, handle, handle1, ithread, mypcol, myprow, ncols, nrows, &
694 : pcol, pcol_handle, prow, prow_handle, row, row_size
695 0 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
696 0 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_bl, vec_res
697 : TYPE(dbcsr_distribution_type) :: dist
698 : TYPE(dbcsr_iterator_type) :: iter
699 0 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row
700 : TYPE(mp_comm_type) :: pcol_group, prow_group
701 :
702 0 : CALL timeset(routineN, handle)
703 0 : ithread = 0
704 :
705 : ! Collect some data about the parallel environment. We will use them later to move the vector around
706 0 : CALL dbcsr_get_info(matrix, distribution=dist, data_type=data_type)
707 0 : CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
708 0 : CPASSERT(data_type == dbcsr_type_real_8)
709 :
710 0 : CALL prow_group%set_handle(prow_handle)
711 0 : CALL pcol_group%set_handle(pcol_handle)
712 :
713 0 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
714 0 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
715 :
716 : ! Set the work vector for the results to 0
717 0 : CALL dbcsr_set(work_row, 0.0_real_8)
718 :
719 : ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
720 0 : CALL dbcsr_iterator_start(iter, vec_in)
721 0 : DO WHILE (dbcsr_iterator_blocks_left(iter))
722 0 : CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
723 0 : prow = hash_table_get(fast_vec_col%hash_table, row)
724 0 : fast_vec_col%blk_map(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
725 : END DO
726 0 : CALL dbcsr_iterator_stop(iter)
727 : ! Replicate the data on all processore in the row
728 0 : data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
729 0 : CALL prow_group%bcast(data_vec, 0)
730 :
731 : ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
732 0 : CALL timeset(routineN//"local_mm", handle1)
733 0 : CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
734 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
735 0 : !$OMP SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
736 : !$ ithread = omp_get_thread_num()
737 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
738 : DO WHILE (dbcsr_iterator_blocks_left(iter))
739 : CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
740 : IF (skip_diag .AND. col == row) CYCLE
741 : prow = hash_table_get(fast_vec_col%hash_table, row)
742 : pcol = hash_table_get(fast_vec_row%hash_table, col)
743 : IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
744 : ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
745 : IF (fast_vec_row%blk_map(pcol)%assigned_thread .NE. ithread) CYCLE
746 : fast_vec_row%blk_map(pcol)%ptr = fast_vec_row%blk_map(pcol)%ptr + &
747 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
748 : ELSE
749 : prow = hash_table_get(fast_vec_row%hash_table, row)
750 : pcol = hash_table_get(fast_vec_col%hash_table, col)
751 : IF (fast_vec_row%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
752 : fast_vec_row%blk_map(prow)%ptr = fast_vec_row%blk_map(prow)%ptr + &
753 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(pcol)%ptr), TRANSPOSE(data_d))
754 : END IF
755 : END DO
756 : CALL dbcsr_iterator_stop(iter)
757 : !$OMP END PARALLEL
758 :
759 0 : CALL timestop(handle1)
760 :
761 : ! sum all the data within a processor column to obtain the replicated result
762 0 : data_vec => dbcsr_get_data_p(work_row, select_data_type=0.0_real_8)
763 : ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
764 0 : CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
765 0 : CALL pcol_group%sum(data_vec(1:nrows*ncols))
766 :
767 : ! Convert the result to a column wise distribution
768 0 : CALL dbcsr_rep_row_to_rep_col_vec(work_col, work_row, fast_vec_row)
769 :
770 : ! Create_the final vector by summing it to the result vector which lives on proc_col 0
771 0 : CALL dbcsr_iterator_start(iter, vec_out)
772 0 : DO WHILE (dbcsr_iterator_blocks_left(iter))
773 0 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
774 0 : prow = hash_table_get(fast_vec_col%hash_table, row)
775 0 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
776 0 : vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
777 : ELSE
778 0 : vec_res(:, :) = beta*vec_res(:, :)
779 : END IF
780 : END DO
781 0 : CALL dbcsr_iterator_stop(iter)
782 :
783 0 : CALL timestop(handle)
784 :
785 0 : END SUBROUTINE dbcsr_matrixT_vector_mult
786 :
787 : ! **************************************************************************************************
788 : !> \brief ...
789 : !> \param vec_in ...
790 : !> \param rep_col_vec ...
791 : !> \param rep_row_vec ...
792 : !> \param fast_vec_col ...
793 : ! **************************************************************************************************
794 6153856 : SUBROUTINE dbcsr_col_vec_to_rep_row(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
795 : TYPE(dbcsr_type) :: vec_in, rep_col_vec, rep_row_vec
796 : TYPE(fast_vec_access_type), INTENT(IN) :: fast_vec_col
797 :
798 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
799 :
800 : INTEGER :: col, handle, mypcol, myprow, ncols, &
801 : nrows, pcol_handle, prow_handle, row
802 769232 : INTEGER, DIMENSION(:), POINTER :: local_cols, row_dist
803 769232 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec, data_vec_rep
804 769232 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_row
805 : TYPE(dbcsr_distribution_type) :: dist_in, dist_rep_col
806 : TYPE(dbcsr_iterator_type) :: iter
807 : TYPE(mp_comm_type) :: pcol_group, prow_group
808 :
809 769232 : CALL timeset(routineN, handle)
810 :
811 : ! get information about the parallel environment
812 769232 : CALL dbcsr_get_info(vec_in, distribution=dist_in)
813 : CALL dbcsr_distribution_get(dist_in, &
814 : prow_group=prow_handle, &
815 : pcol_group=pcol_handle, &
816 : myprow=myprow, &
817 769232 : mypcol=mypcol)
818 :
819 769232 : CALL prow_group%set_handle(prow_handle)
820 769232 : CALL pcol_group%set_handle(pcol_handle)
821 :
822 : ! Get the vector which tells us which blocks are local to which processor row in the col vec
823 769232 : CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
824 769232 : CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
825 :
826 : ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
827 769232 : CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
828 769232 : data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
829 769232 : data_vec => dbcsr_get_data_p(vec_in, select_data_type=0.0_real_8)
830 23628916 : IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
831 : ! Replicate the data along the row
832 23628916 : CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
833 :
834 : ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
835 : ! The rep_col_vec contains all blocks local to the row distribution of the vector.
836 : ! The rep_row_vec only needs the fraction which is local to the col distribution.
837 : ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
838 : ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
839 : ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
840 : ! Hope this clarifies the idea
841 769232 : CALL dbcsr_set(rep_row_vec, 0.0_real_8)
842 769232 : CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
843 769232 : CALL dbcsr_iterator_start(iter, rep_row_vec)
844 3997874 : DO WHILE (dbcsr_iterator_blocks_left(iter))
845 3228642 : CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
846 3997874 : IF (row_dist(col) == myprow) THEN
847 24481105 : vec_row = TRANSPOSE(fast_vec_col%blk_map(hash_table_get(fast_vec_col%hash_table, col))%ptr)
848 : END IF
849 : END DO
850 769232 : CALL dbcsr_iterator_stop(iter)
851 769232 : CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
852 769232 : data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=0.0_real_8)
853 46351516 : CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
854 :
855 769232 : CALL timestop(handle)
856 :
857 769232 : END SUBROUTINE dbcsr_col_vec_to_rep_row
858 :
859 : ! **************************************************************************************************
860 : !> \brief ...
861 : !> \param rep_col_vec ...
862 : !> \param rep_row_vec ...
863 : !> \param fast_vec_row ...
864 : !> \param fast_vec_col_add ...
865 : ! **************************************************************************************************
866 1621782 : SUBROUTINE dbcsr_rep_row_to_rep_col_vec(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
867 : TYPE(dbcsr_type) :: rep_col_vec, rep_row_vec
868 : TYPE(fast_vec_access_type) :: fast_vec_row
869 : TYPE(fast_vec_access_type), OPTIONAL :: fast_vec_col_add
870 :
871 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
872 :
873 : INTEGER :: col, handle, mypcol, myprow, ncols, &
874 : nrows, prow_handle, row
875 270297 : INTEGER, DIMENSION(:), POINTER :: col_dist
876 270297 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec_rep
877 270297 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: vec_col
878 : TYPE(dbcsr_distribution_type) :: dist_rep_col, dist_rep_row
879 : TYPE(dbcsr_iterator_type) :: iter
880 : TYPE(mp_comm_type) :: prow_group
881 :
882 270297 : CALL timeset(routineN, handle)
883 :
884 : ! get information about the parallel environment
885 270297 : CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
886 : CALL dbcsr_distribution_get(dist_rep_col, &
887 : prow_group=prow_handle, &
888 : myprow=myprow, &
889 270297 : mypcol=mypcol)
890 :
891 270297 : CALL prow_group%set_handle(prow_handle)
892 :
893 : ! Get the vector which tells us which blocks are local to which processor col in the row vec
894 270297 : CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
895 270297 : CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
896 :
897 : ! The same trick as described above with opposite direction
898 270297 : CALL dbcsr_set(rep_col_vec, 0.0_real_8)
899 270297 : CALL dbcsr_iterator_start(iter, rep_col_vec)
900 1322168 : DO WHILE (dbcsr_iterator_blocks_left(iter))
901 1051871 : CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
902 1051871 : IF (col_dist(row) == mypcol) THEN
903 8499401 : vec_col = TRANSPOSE(fast_vec_row%blk_map(hash_table_get(fast_vec_row%hash_table, row))%ptr)
904 : END IF
905 : ! this one is special and allows to add the elements of a not yet summed replicated
906 : ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
907 1322168 : IF (PRESENT(fast_vec_col_add)) THEN
908 8499401 : vec_col = vec_col + fast_vec_col_add%blk_map(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
909 : END IF
910 : END DO
911 270297 : CALL dbcsr_iterator_stop(iter)
912 270297 : CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
913 270297 : data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
914 13061615 : CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
915 :
916 270297 : CALL timestop(handle)
917 :
918 270297 : END SUBROUTINE dbcsr_rep_row_to_rep_col_vec
919 :
920 : ! **************************************************************************************************
921 : !> \brief ...
922 : !> \param matrix ...
923 : !> \param vec_in ...
924 : !> \param vec_out ...
925 : !> \param alpha ...
926 : !> \param beta ...
927 : !> \param work_row ...
928 : !> \param work_col ...
929 : ! **************************************************************************************************
930 270297 : SUBROUTINE dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
931 : TYPE(dbcsr_type) :: matrix, vec_in, vec_out
932 : REAL(kind=real_8) :: alpha, beta
933 : TYPE(dbcsr_type) :: work_row, work_col
934 :
935 : CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_matrix_vector_mult'
936 :
937 : INTEGER :: col, handle, handle1, ithread, mypcol, &
938 : myprow, ncols, nrows, pcol, &
939 : pcol_handle, prow, row, rpcol, rprow, &
940 : vec_dim
941 270297 : REAL(kind=real_8), DIMENSION(:), POINTER :: data_vec
942 270297 : REAL(kind=real_8), DIMENSION(:, :), POINTER :: data_d, vec_res
943 : TYPE(dbcsr_distribution_type) :: dist
944 : TYPE(dbcsr_iterator_type) :: iter
945 : TYPE(dbcsr_type) :: result_col, result_row
946 270297 : TYPE(fast_vec_access_type) :: fast_vec_col, fast_vec_row, &
947 270297 : res_fast_vec_col, res_fast_vec_row
948 : TYPE(mp_comm_type) :: pcol_group
949 :
950 270297 : CALL timeset(routineN, handle)
951 270297 : ithread = 0
952 : ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
953 270297 : CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
954 : ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
955 270297 : CALL dbcsr_set(work_col, 0.0_real_8)
956 270297 : CALL dbcsr_copy(result_col, work_col)
957 270297 : CALL dbcsr_set(work_row, 0.0_real_8)
958 270297 : CALL dbcsr_copy(result_row, work_row)
959 :
960 : ! Collect some data about the parallel environment. We will use them later to move the vector around
961 270297 : CALL dbcsr_get_info(matrix=matrix, distribution=dist)
962 270297 : CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
963 :
964 270297 : CALL pcol_group%set_handle(pcol_handle)
965 :
966 270297 : CALL create_fast_row_vec_access(work_row, fast_vec_row)
967 270297 : CALL create_fast_col_vec_access(work_col, fast_vec_col)
968 270297 : CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
969 270297 : CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
970 :
971 : ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
972 270297 : CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
973 :
974 : ! Probably I should rename the routine above as it delivers both the replicated row and column vector
975 :
976 : ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
977 : ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
978 270297 : CALL timeset(routineN//"_local_mm", handle1)
979 :
980 : !------ perform the multiplication, we have to take car to take the correct blocks ----------
981 :
982 : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
983 270297 : !$OMP SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
984 : !$ ithread = omp_get_thread_num()
985 : CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
986 : DO WHILE (dbcsr_iterator_blocks_left(iter))
987 : CALL dbcsr_iterator_next_block(iter, row, col, data_d)
988 : pcol = hash_table_get(fast_vec_row%hash_table, col)
989 : rprow = hash_table_get(res_fast_vec_col%hash_table, row)
990 : IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
991 : ASSOCIATED(res_fast_vec_col%blk_map(rprow)%ptr)) THEN
992 : IF (res_fast_vec_col%blk_map(rprow)%assigned_thread .EQ. ithread) THEN
993 : res_fast_vec_col%blk_map(rprow)%ptr = res_fast_vec_col%blk_map(rprow)%ptr + &
994 : MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map(pcol)%ptr))
995 : END IF
996 : prow = hash_table_get(fast_vec_col%hash_table, row)
997 : rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
998 : IF (res_fast_vec_row%blk_map(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
999 : res_fast_vec_row%blk_map(rpcol)%ptr = res_fast_vec_row%blk_map(rpcol)%ptr + &
1000 : MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
1001 : END IF
1002 : ELSE
1003 : rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
1004 : prow = hash_table_get(fast_vec_row%hash_table, row)
1005 : IF (res_fast_vec_col%blk_map(rpcol)%assigned_thread .EQ. ithread) THEN
1006 : res_fast_vec_col%blk_map(rpcol)%ptr = res_fast_vec_col%blk_map(rpcol)%ptr + &
1007 : TRANSPOSE(MATMUL(fast_vec_row%blk_map(prow)%ptr, data_d))
1008 : END IF
1009 : rprow = hash_table_get(res_fast_vec_row%hash_table, row)
1010 : pcol = hash_table_get(fast_vec_col%hash_table, col)
1011 : IF (res_fast_vec_row%blk_map(rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
1012 : res_fast_vec_row%blk_map(rprow)%ptr = res_fast_vec_row%blk_map(rprow)%ptr + &
1013 : TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map(pcol)%ptr))
1014 : END IF
1015 : END IF
1016 : END DO
1017 : CALL dbcsr_iterator_stop(iter)
1018 : !$OMP END PARALLEL
1019 :
1020 270297 : CALL timestop(handle1)
1021 :
1022 : ! sum all the data within a processor column to obtain the replicated result from lower
1023 270297 : data_vec => dbcsr_get_data_p(result_row, select_data_type=0.0_real_8)
1024 270297 : CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
1025 :
1026 25743015 : CALL pcol_group%sum(data_vec(1:nrows*ncols))
1027 :
1028 : ! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
1029 : ! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
1030 : ! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
1031 : ! parallel bits are locally added. The sum magically creates the correct vector
1032 270297 : CALL dbcsr_rep_row_to_rep_col_vec(work_col, result_row, res_fast_vec_row, res_fast_vec_col)
1033 :
1034 : ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
1035 270297 : CALL dbcsr_iterator_start(iter, vec_out)
1036 1322168 : DO WHILE (dbcsr_iterator_blocks_left(iter))
1037 1051871 : CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
1038 1051871 : prow = hash_table_get(fast_vec_col%hash_table, row)
1039 1322168 : IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
1040 8499401 : vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map(prow)%ptr(:, :))
1041 : ELSE
1042 0 : vec_res(:, :) = beta*vec_res(:, :)
1043 : END IF
1044 : END DO
1045 270297 : CALL dbcsr_iterator_stop(iter)
1046 :
1047 270297 : CALL release_fast_vec_access(fast_vec_row)
1048 270297 : CALL release_fast_vec_access(fast_vec_col)
1049 270297 : CALL release_fast_vec_access(res_fast_vec_row)
1050 270297 : CALL release_fast_vec_access(res_fast_vec_col)
1051 :
1052 270297 : CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
1053 :
1054 270297 : CALL timestop(handle)
1055 :
1056 540594 : END SUBROUTINE dbcsr_sym_matrix_vector_mult
1057 :
1058 0 : END MODULE arnoldi_vector
|