LCOV - code coverage report
Current view: top level - src/arnoldi - arnoldi_vector.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:4c33f95) Lines: 277 355 78.0 %
Date: 2025-01-30 06:53:08 Functions: 16 23 69.6 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
      10             : !> \par History
      11             : !>       2014.10 created [Florian Schiffmann]
      12             : !>       2023.12 Removed support for single-precision [Ole Schuett]
      13             : !>       2024.12 Removed support for complex input matrices [Ole Schuett]
      14             : !> \author Florian Schiffmann
      15             : ! **************************************************************************************************
      16             : MODULE arnoldi_vector
      17             :    USE cp_dbcsr_api,                    ONLY: &
      18             :         dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &
      19             :         dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_data_p, dbcsr_get_info, &
      20             :         dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, dbcsr_iterator_start, &
      21             :         dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_release, dbcsr_reserve_all_blocks, &
      22             :         dbcsr_set, dbcsr_type, dbcsr_type_antisymmetric, dbcsr_type_no_symmetry, &
      23             :         dbcsr_type_real_8, dbcsr_type_symmetric
      24             :    USE kinds,                           ONLY: dp,&
      25             :                                               real_8
      26             :    USE message_passing,                 ONLY: mp_comm_type
      27             : #include "../base/base_uses.f90"
      28             : 
      29             : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
      30             : 
      31             :    IMPLICIT NONE
      32             : 
      33             :    PRIVATE
      34             : 
      35             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'arnoldi_vector'
      36             : 
      37             : ! **************************************************************************************************
      38             : !> \brief Types needed for the hashtable.
      39             : ! **************************************************************************************************
      40             :    TYPE ele_type
      41             :       INTEGER :: c = 0
      42             :       INTEGER :: p = 0
      43             :    END TYPE ele_type
      44             : 
      45             :    TYPE hash_table_type
      46             :       TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
      47             :       INTEGER :: nele = 0
      48             :       INTEGER :: nmax = 0
      49             :       INTEGER :: prime = 0
      50             :    END TYPE hash_table_type
      51             : 
      52             : ! **************************************************************************************************
      53             : !> \brief Types needed for fast access to distributed dbcsr vectors.
      54             : ! **************************************************************************************************
      55             :    TYPE block_ptr
      56             :       REAL(real_8), DIMENSION(:, :), POINTER          :: ptr => NULL()
      57             :       INTEGER                                         :: assigned_thread = -1
      58             :    END TYPE block_ptr
      59             : 
      60             :    TYPE fast_vec_access_type
      61             :       TYPE(hash_table_type) :: hash_table = hash_table_type()
      62             :       TYPE(block_ptr), DIMENSION(:), ALLOCATABLE :: blk_map
      63             :    END TYPE
      64             : 
      65             :    PUBLIC :: dbcsr_matrix_colvec_multiply, &
      66             :              create_col_vec_from_matrix, &
      67             :              create_row_vec_from_matrix, &
      68             :              create_replicated_col_vec_from_matrix, &
      69             :              create_replicated_row_vec_from_matrix
      70             : 
      71             : CONTAINS
      72             : 
      73             : ! **************************************************************************************************
      74             : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
      75             : !>        and has the same row dist as the template matrix
      76             : !>        the returned matrix is fully allocated and all blocks are set to 0
      77             : !>        this is not a sparse object (and must never be)
      78             : !> \param dbcsr_vec  the vector object to create must be allocated but not initialized
      79             : !> \param matrix a dbcsr matrix used as template
      80             : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
      81             : ! **************************************************************************************************
      82      262218 :    SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      83             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
      84             :       INTEGER                                            :: ncol
      85             : 
      86             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
      87             : 
      88             :       INTEGER                                            :: data_type, handle, npcols
      89      262218 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
      90      262218 :                                                             row_dist
      91             :       TYPE(dbcsr_distribution_type)                      :: dist, dist_col_vec
      92             : 
      93      262218 :       CALL timeset(routineN, handle)
      94             : 
      95      262218 :       CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
      96      262218 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
      97      262218 :       CPASSERT(data_type == dbcsr_type_real_8)
      98             : 
      99      262218 :       ALLOCATE (col_dist(1), col_blk_size(1))
     100      262218 :       col_dist(1) = 0
     101      262218 :       col_blk_size(1) = ncol
     102      262218 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     103             : 
     104             :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
     105             :                         matrix_type=dbcsr_type_no_symmetry, &
     106             :                         row_blk_size=row_blk_size, &
     107             :                         col_blk_size=col_blk_size, &
     108      262218 :                         nze=0, data_type=data_type)
     109      262218 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     110             : 
     111      262218 :       CALL dbcsr_distribution_release(dist_col_vec)
     112      262218 :       DEALLOCATE (col_dist, col_blk_size)
     113      262218 :       CALL timestop(handle)
     114             : 
     115      524436 :    END SUBROUTINE create_col_vec_from_matrix
     116             : 
     117             : ! **************************************************************************************************
     118             : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
     119             : !>        and has the same row dist as the template matrix
     120             : !>        the returned matrix is fully allocated and all blocks are set to 0
     121             : !>        this is not a sparse object (and must never be)
     122             : !> \param dbcsr_vec ...
     123             : !> \param matrix a dbcsr matrix used as template
     124             : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     125             : ! **************************************************************************************************
     126           0 :    SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     127             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     128             :       INTEGER                                            :: nrow
     129             : 
     130             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
     131             : 
     132             :       INTEGER                                            :: data_type, handle, nprows
     133           0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     134           0 :                                                             row_dist
     135             :       TYPE(dbcsr_distribution_type)                      :: dist, dist_row_vec
     136             : 
     137           0 :       CALL timeset(routineN, handle)
     138             : 
     139           0 :       CALL dbcsr_get_info(matrix, data_type=data_type, col_blk_size=col_blk_size, distribution=dist)
     140           0 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     141           0 :       CPASSERT(data_type == dbcsr_type_real_8)
     142             : 
     143           0 :       ALLOCATE (row_dist(1), row_blk_size(1))
     144           0 :       row_dist(1) = 0
     145           0 :       row_blk_size(1) = nrow
     146           0 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     147             : 
     148             :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, &
     149             :                         matrix_type=dbcsr_type_no_symmetry, &
     150             :                         row_blk_size=row_blk_size, &
     151             :                         col_blk_size=col_blk_size, &
     152           0 :                         nze=0, data_type=data_type)
     153           0 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     154             : 
     155           0 :       CALL dbcsr_distribution_release(dist_row_vec)
     156           0 :       DEALLOCATE (row_dist, row_blk_size)
     157           0 :       CALL timestop(handle)
     158             : 
     159           0 :    END SUBROUTINE create_row_vec_from_matrix
     160             : 
     161             : ! **************************************************************************************************
     162             : !> \brief creates a col vector like object whose blocks can be replicated
     163             : !>        along the processor row and has the same row dist as the template matrix
     164             : !>        the returned matrix is fully allocated and all blocks are set to 0
     165             : !>        this is not a sparse object (and must never be)
     166             : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     167             : !> \param matrix a dbcsr matrix used as template
     168             : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     169             : ! **************************************************************************************************
     170      128837 :    SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
     171             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     172             :       INTEGER                                            :: ncol
     173             : 
     174             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
     175             : 
     176             :       INTEGER                                            :: data_type, handle, i, npcols
     177      128837 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     178      128837 :                                                             row_dist
     179             :       TYPE(dbcsr_distribution_type)                      :: dist, dist_col_vec
     180             : 
     181      128837 :       CALL timeset(routineN, handle)
     182             : 
     183      128837 :       CALL dbcsr_get_info(matrix, data_type=data_type, row_blk_size=row_blk_size, distribution=dist)
     184      128837 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
     185      128837 :       CPASSERT(data_type == dbcsr_type_real_8)
     186             : 
     187      644185 :       ALLOCATE (col_dist(npcols), col_blk_size(npcols))
     188      257674 :       col_blk_size(:) = ncol
     189      257674 :       DO i = 0, npcols - 1
     190      257674 :          col_dist(i + 1) = i
     191             :       END DO
     192      128837 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     193             : 
     194             :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, &
     195             :                         matrix_type=dbcsr_type_no_symmetry, &
     196             :                         row_blk_size=row_blk_size, &
     197             :                         col_blk_size=col_blk_size, &
     198      128837 :                         nze=0, data_type=data_type)
     199      128837 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     200             : 
     201      128837 :       CALL dbcsr_distribution_release(dist_col_vec)
     202      128837 :       DEALLOCATE (col_dist, col_blk_size)
     203      128837 :       CALL timestop(handle)
     204             : 
     205      257674 :    END SUBROUTINE create_replicated_col_vec_from_matrix
     206             : 
     207             : ! **************************************************************************************************
     208             : !> \brief creates a row vector like object whose blocks can be replicated
     209             : !>        along the processor col and has the same col dist as the template matrix
     210             : !>        the returned matrix is fully allocated and all blocks are set to 0
     211             : !>        this is not a sparse object (and must never be)
     212             : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     213             : !> \param matrix a dbcsr matrix used as template
     214             : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     215             : ! **************************************************************************************************
     216      128837 :    SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     217             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     218             :       INTEGER                                            :: nrow
     219             : 
     220             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
     221             : 
     222             :       INTEGER                                            :: data_type, handle, i, nprows
     223      128837 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     224      128837 :                                                             row_dist
     225             :       TYPE(dbcsr_distribution_type)                      :: dist, dist_row_vec
     226             : 
     227      128837 :       CALL timeset(routineN, handle)
     228             : 
     229      128837 :       CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size, data_type=data_type)
     230      128837 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     231      128837 :       CPASSERT(data_type == dbcsr_type_real_8)
     232             : 
     233      644185 :       ALLOCATE (row_dist(nprows), row_blk_size(nprows))
     234      384156 :       row_blk_size(:) = nrow
     235      384156 :       DO i = 0, nprows - 1
     236      384156 :          row_dist(i + 1) = i
     237             :       END DO
     238      128837 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     239             : 
     240             :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
     241             :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size, &
     242      128837 :                         nze=0, data_type=data_type)
     243      128837 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     244             : 
     245      128837 :       CALL dbcsr_distribution_release(dist_row_vec)
     246      128837 :       DEALLOCATE (row_dist, row_blk_Size)
     247      128837 :       CALL timestop(handle)
     248             : 
     249      257674 :    END SUBROUTINE create_replicated_row_vec_from_matrix
     250             : 
     251             : ! **************************************************************************************************
     252             : !> \brief given a column vector, prepare the fast_vec_access container
     253             : !> \param vec ...
     254             : !> \param fast_vec_access ...
     255             : ! **************************************************************************************************
     256     1039529 :    SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
     257             :       TYPE(dbcsr_type)                                   :: vec
     258             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     259             : 
     260             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
     261             : 
     262             :       INTEGER                                            :: col, handle, iblock, nblk_local, &
     263             :                                                             nthreads, row
     264     1039529 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_bl
     265             :       TYPE(dbcsr_iterator_type)                          :: iter
     266             : 
     267     1039529 :       CALL timeset(routineN, handle)
     268             : 
     269             :       ! figure out the number of threads
     270     1039529 :       nthreads = 1
     271     1039529 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     272             : !$OMP MASTER
     273             : !$    nthreads = OMP_GET_NUM_THREADS()
     274             : !$OMP END MASTER
     275             : !$OMP END PARALLEL
     276             : 
     277     1039529 :       CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
     278             :       ! 4 times makes sure the table is big enough to limit collisions.
     279     1039529 :       CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     280             :       ! include zero for effective dealing with values not in the hash table (will return 0)
     281     6831408 :       ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
     282             : 
     283     1039529 :       CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
     284     1039529 :       IF (col .GT. 1) CPABORT("BUG")
     285             : 
     286             :       ! go through the blocks of the vector
     287     1039529 :       iblock = 0
     288     1039529 :       CALL dbcsr_iterator_start(iter, vec)
     289     3712821 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     290     2673292 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     291     2673292 :          iblock = iblock + 1
     292     2673292 :          CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
     293     2673292 :          fast_vec_access%blk_map(iblock)%ptr => vec_bl
     294     2673292 :          fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
     295             :       END DO
     296     1039529 :       CALL dbcsr_iterator_stop(iter)
     297     1039529 :       CALL timestop(handle)
     298             : 
     299     3118587 :    END SUBROUTINE create_fast_col_vec_access
     300             : 
     301             : ! **************************************************************************************************
     302             : !> \brief given a row vector, prepare the fast_vec_access_container
     303             : !> \param vec ...
     304             : !> \param fast_vec_access ...
     305             : ! **************************************************************************************************
     306     1039529 :    SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
     307             :       TYPE(dbcsr_type)                                   :: vec
     308             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     309             : 
     310             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
     311             : 
     312             :       INTEGER                                            :: col, handle, iblock, nblk_local, &
     313             :                                                             nthreads, row
     314     1039529 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_bl
     315             :       TYPE(dbcsr_iterator_type)                          :: iter
     316             : 
     317     1039529 :       CALL timeset(routineN, handle)
     318             : 
     319             :       ! figure out the number of threads
     320     1039529 :       nthreads = 1
     321     1039529 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     322             : !$OMP MASTER
     323             : !$    nthreads = OMP_GET_NUM_THREADS()
     324             : !$OMP END MASTER
     325             : !$OMP END PARALLEL
     326             : 
     327     1039529 :       CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
     328             :       ! 4 times makes sure the table is big enough to limit collisions.
     329     1039529 :       CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     330             :       ! include zero for effective dealing with values not in the hash table (will return 0)
     331     9479645 :       ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
     332             : 
     333             :       ! sanity check
     334     1039529 :       CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
     335     1039529 :       IF (row .GT. 1) CPABORT("BUG")
     336             : 
     337             :       ! go through the blocks of the vector
     338     1039529 :       iblock = 0
     339     1039529 :       CALL dbcsr_iterator_start(iter, vec)
     340     6361058 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     341     5321529 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     342     5321529 :          iblock = iblock + 1
     343     5321529 :          CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
     344     5321529 :          fast_vec_access%blk_map(iblock)%ptr => vec_bl
     345     5321529 :          fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
     346             :       END DO
     347     1039529 :       CALL dbcsr_iterator_stop(iter)
     348             : 
     349     1039529 :       CALL timestop(handle)
     350             : 
     351     3118587 :    END SUBROUTINE create_fast_row_vec_access
     352             : 
     353             : ! **************************************************************************************************
     354             : !> \brief release all memory associated with the fast_vec_access type
     355             : !> \param fast_vec_access ...
     356             : ! **************************************************************************************************
     357     2079058 :    SUBROUTINE release_fast_vec_access(fast_vec_access)
     358             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     359             : 
     360             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
     361             : 
     362             :       INTEGER                                            :: handle
     363             : 
     364     2079058 :       CALL timeset(routineN, handle)
     365             : 
     366     2079058 :       CALL hash_table_release(fast_vec_access%hash_table)
     367     2079058 :       DEALLOCATE (fast_vec_access%blk_map)
     368             : 
     369     2079058 :       CALL timestop(handle)
     370             : 
     371     2079058 :    END SUBROUTINE release_fast_vec_access
     372             : 
     373             : ! --------------------------------------------------------------------------------------------------
     374             : ! Beginning of hashtable.
     375             : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
     376             : ! part of the module to guarantee inlining
     377             : ! hashes (c,p) pairs, where p is assumed to be >0
     378             : ! on return (0 is used as a flag for not present)
     379             : !
     380             : !
     381             : ! **************************************************************************************************
     382             : !> \brief finds a prime equal or larger than i, needed at creation
     383             : !> \param i ...
     384             : !> \return ...
     385             : ! **************************************************************************************************
     386     2079058 :    FUNCTION matching_prime(i) RESULT(res)
     387             :       INTEGER, INTENT(IN)                                :: i
     388             :       INTEGER                                            :: res
     389             : 
     390             :       INTEGER                                            :: j
     391             : 
     392     2079058 :       res = i
     393     2079058 :       j = 0
     394     6409986 :       DO WHILE (j < res)
     395    54416043 :          DO j = 2, res - 1
     396    52336985 :             IF (MOD(res, j) == 0) THEN
     397     2251870 :                res = res + 1
     398     2251870 :                EXIT
     399             :             END IF
     400             :          END DO
     401             :       END DO
     402     2079058 :    END FUNCTION
     403             : 
     404             : ! **************************************************************************************************
     405             : !> \brief create a hash_table of given initial size.
     406             : !>        the hash table will expand as needed (but this requires rehashing)
     407             : !> \param hash_table ...
     408             : !> \param table_size ...
     409             : ! **************************************************************************************************
     410     2079058 :    SUBROUTINE hash_table_create(hash_table, table_size)
     411             :       TYPE(hash_table_type)                              :: hash_table
     412             :       INTEGER, INTENT(IN)                                :: table_size
     413             : 
     414             :       INTEGER                                            :: j
     415             : 
     416             :       ! guarantee a minimal hash table size (8), so that expansion works
     417             : 
     418     2079058 :       j = 3
     419     4054786 :       DO WHILE (2**j - 1 < table_size)
     420     1975728 :          j = j + 1
     421             :       END DO
     422     2079058 :       hash_table%nmax = 2**j - 1
     423     2079058 :       hash_table%prime = matching_prime(hash_table%nmax)
     424     2079058 :       hash_table%nele = 0
     425    56731326 :       ALLOCATE (hash_table%table(0:hash_table%nmax))
     426     2079058 :    END SUBROUTINE hash_table_create
     427             : 
     428             : ! **************************************************************************************************
     429             : !> \brief ...
     430             : !> \param hash_table ...
     431             : ! **************************************************************************************************
     432     2079058 :    SUBROUTINE hash_table_release(hash_table)
     433             :       TYPE(hash_table_type)                              :: hash_table
     434             : 
     435     2079058 :       hash_table%nmax = 0
     436     2079058 :       hash_table%nele = 0
     437     2079058 :       DEALLOCATE (hash_table%table)
     438             : 
     439     2079058 :    END SUBROUTINE hash_table_release
     440             : 
     441             : ! **************************************************************************************************
     442             : !> \brief add a pair (c,p) to the hash table
     443             : !> \param hash_table ...
     444             : !> \param c this value is being hashed
     445             : !> \param p this is being stored
     446             : ! **************************************************************************************************
     447     7994821 :    RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
     448             :       TYPE(hash_table_type), INTENT(INOUT)               :: hash_table
     449             :       INTEGER, INTENT(IN)                                :: c, p
     450             : 
     451             :       REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
     452             :          inv_hash_table_fill = 2.5_real_8
     453             : 
     454             :       INTEGER                                            :: i, j
     455     7994821 :       TYPE(ele_type), ALLOCATABLE, DIMENSION(:)          :: tmp_hash
     456             : 
     457             :       ! if too small, make a copy and rehash in a larger table
     458             : 
     459     7994821 :       IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
     460           0 :          ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
     461           0 :          tmp_hash(:) = hash_table%table
     462           0 :          CALL hash_table_release(hash_table)
     463           0 :          CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
     464           0 :          DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
     465           0 :             IF (tmp_hash(i)%c .NE. 0) THEN
     466           0 :                CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
     467             :             END IF
     468             :          END DO
     469           0 :          DEALLOCATE (tmp_hash)
     470             :       END IF
     471             : 
     472     7994821 :       hash_table%nele = hash_table%nele + 1
     473     7994821 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     474             : 
     475     7994821 :       DO j = i, hash_table%nmax
     476     7994821 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     477     7994821 :             hash_table%table(j)%c = c
     478     7994821 :             hash_table%table(j)%p = p
     479     7994821 :             RETURN
     480             :          END IF
     481             :       END DO
     482           0 :       DO j = 0, i - 1
     483           0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     484           0 :             hash_table%table(j)%c = c
     485           0 :             hash_table%table(j)%p = p
     486           0 :             RETURN
     487             :          END IF
     488             :       END DO
     489             : 
     490             :    END SUBROUTINE hash_table_add
     491             : 
     492             : ! **************************************************************************************************
     493             : !> \brief ...
     494             : !> \param hash_table ...
     495             : !> \param c ...
     496             : !> \return ...
     497             : ! **************************************************************************************************
     498   120147260 :    PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
     499             :       TYPE(hash_table_type), INTENT(IN)                  :: hash_table
     500             :       INTEGER, INTENT(IN)                                :: c
     501             :       INTEGER                                            :: p
     502             : 
     503             :       INTEGER                                            :: i, j
     504             : 
     505   120147260 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     506             : 
     507             :       ! catch the likely case first
     508   120147260 :       IF (hash_table%table(i)%c == c) THEN
     509   115851280 :          p = hash_table%table(i)%p
     510   115851280 :          RETURN
     511             :       END IF
     512             : 
     513     4295980 :       DO j = i, hash_table%nmax
     514     4295980 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     515     4295980 :             p = hash_table%table(j)%p
     516     4295980 :             RETURN
     517             :          END IF
     518             :       END DO
     519           0 :       DO j = 0, i - 1
     520           0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     521           0 :             p = hash_table%table(j)%p
     522           0 :             RETURN
     523             :          END IF
     524             :       END DO
     525             : 
     526             :       ! we should never reach this point.
     527   120147260 :       p = HUGE(p)
     528             : 
     529             :    END FUNCTION hash_table_get
     530             : 
     531             : ! End of hashtable
     532             : ! --------------------------------------------------------------------------------------------------
     533             : 
     534             : ! **************************************************************************************************
     535             : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
     536             : !> \param matrix ...
     537             : !> \param vec_in ...
     538             : !> \param vec_out ...
     539             : !> \param alpha ...
     540             : !> \param beta ...
     541             : !> \param work_row ...
     542             : !> \param work_col ...
     543             : ! **************************************************************************************************
     544      769232 :    SUBROUTINE dbcsr_matrix_colvec_multiply(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     545             :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     546             :       REAL(kind=real_8)                                  :: alpha, beta
     547             :       TYPE(dbcsr_type)                                   :: work_row, work_col
     548             : 
     549             :       CHARACTER                                          :: matrix_type
     550             :       INTEGER                                            :: data_type
     551             : 
     552      769232 :       CALL dbcsr_get_info(matrix, data_type=data_type, matrix_type=matrix_type)
     553      769232 :       CPASSERT(data_type == dbcsr_type_real_8)
     554             : 
     555      498935 :       SELECT CASE (matrix_type)
     556             :       CASE (dbcsr_type_no_symmetry)
     557      498935 :          CALL dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     558             :       CASE (dbcsr_type_symmetric)
     559      270297 :          CALL dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     560             :       CASE (dbcsr_type_antisymmetric)
     561             :          ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
     562           0 :          CPABORT("NYI, antisymmetric matrix not permitted")
     563             :       CASE DEFAULT
     564      769232 :          CPABORT("Unknown matrix type, ...")
     565             :       END SELECT
     566             : 
     567      769232 :    END SUBROUTINE dbcsr_matrix_colvec_multiply
     568             : 
     569             : ! **************************************************************************************************
     570             : !> \brief low level routines for matrix vector multiplies
     571             : !> \param matrix ...
     572             : !> \param vec_in ...
     573             : !> \param vec_out ...
     574             : !> \param alpha ...
     575             : !> \param beta ...
     576             : !> \param work_row ...
     577             : !> \param work_col ...
     578             : ! **************************************************************************************************
     579      498935 :    SUBROUTINE dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     580             :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     581             :       REAL(kind=real_8)                                  :: alpha, beta
     582             :       TYPE(dbcsr_type)                                   :: work_row, work_col
     583             : 
     584             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
     585             : 
     586             :       INTEGER                                            :: col, data_type, handle, handle1, &
     587             :                                                             ithread, mypcol, myprow, ncols, nrows, &
     588             :                                                             pcol, prow, prow_handle, row
     589      498935 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     590      498935 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_res
     591             :       TYPE(dbcsr_distribution_type)                      :: dist
     592             :       TYPE(dbcsr_iterator_type)                          :: iter
     593      498935 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row
     594             :       TYPE(mp_comm_type)                                 :: prow_group
     595             : 
     596      498935 :       CALL timeset(routineN, handle)
     597      498935 :       ithread = 0
     598             : 
     599             :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     600      498935 :       CALL dbcsr_get_info(matrix, distribution=dist, data_type=data_type)
     601      498935 :       CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
     602      498935 :       CPASSERT(data_type == dbcsr_type_real_8)
     603             : 
     604      498935 :       CALL prow_group%set_handle(prow_handle)
     605             : 
     606      498935 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     607      498935 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     608             : 
     609             :       ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     610      498935 :       CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
     611             : 
     612             :       ! Set the work vector for the results to 0
     613      498935 :       CALL dbcsr_set(work_col, 0.0_real_8)
     614             : 
     615             :       ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     616             :       ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     617      498935 :       CALL timeset(routineN//"_local_mm", handle1)
     618             : 
     619             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
     620      498935 : !$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
     621             : !$    ithread = omp_get_thread_num()
     622             :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     623             :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     624             :          CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     625             :          prow = hash_table_get(fast_vec_col%hash_table, row)
     626             :          IF (fast_vec_col%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
     627             :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     628             :          IF (SIZE(fast_vec_col%blk_map(prow)%ptr, 1) .EQ. 0 .OR. &
     629             :              SIZE(fast_vec_col%blk_map(prow)%ptr, 2) .EQ. 0 .OR. &
     630             :              SIZE(data_d, 2) .EQ. 0) CYCLE
     631             :          CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
     632             :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
     633             :                     SIZE(data_d, 2), &
     634             :                     1.0_dp, &
     635             :                     data_d, &
     636             :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
     637             :                     fast_vec_row%blk_map(pcol)%ptr, &
     638             :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
     639             :                     1.0_dp, &
     640             :                     fast_vec_col%blk_map(prow)%ptr, &
     641             :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 1))
     642             :       END DO
     643             :       CALL dbcsr_iterator_stop(iter)
     644             : !$OMP END PARALLEL
     645             : 
     646      498935 :       CALL timestop(handle1)
     647             : 
     648             :       ! sum all the data onto the first processor col where the original vector is stored
     649      498935 :       data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
     650      498935 :       CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
     651    10567301 :       CALL prow_group%sum(data_vec(1:nrows*ncols))
     652             : 
     653             :       ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
     654             :       ! from the replicated to the original vector. Let's play it safe and use the iterator
     655      498935 :       CALL dbcsr_iterator_start(iter, vec_out)
     656     1068485 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     657      569550 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
     658      569550 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     659     1068485 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     660     6173283 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
     661             :          ELSE
     662           0 :             vec_res(:, :) = beta*vec_res(:, :)
     663             :          END IF
     664             :       END DO
     665      498935 :       CALL dbcsr_iterator_stop(iter)
     666             : 
     667      498935 :       CALL release_fast_vec_access(fast_vec_row)
     668      498935 :       CALL release_fast_vec_access(fast_vec_col)
     669             : 
     670      498935 :       CALL timestop(handle)
     671             : 
     672      997870 :    END SUBROUTINE dbcsr_matrix_vector_mult
     673             : 
     674             : ! **************************************************************************************************
     675             : !> \brief ...
     676             : !> \param matrix ...
     677             : !> \param vec_in ...
     678             : !> \param vec_out ...
     679             : !> \param alpha ...
     680             : !> \param beta ...
     681             : !> \param work_row ...
     682             : !> \param work_col ...
     683             : !> \param skip_diag ...
     684             : ! **************************************************************************************************
     685           0 :    SUBROUTINE dbcsr_matrixT_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
     686             :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     687             :       REAL(kind=real_8)                                  :: alpha, beta
     688             :       TYPE(dbcsr_type)                                   :: work_row, work_col
     689             :       LOGICAL                                            :: skip_diag
     690             : 
     691             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
     692             : 
     693             :       INTEGER :: col, col_size, data_type, handle, handle1, ithread, mypcol, myprow, ncols, nrows, &
     694             :          pcol, pcol_handle, prow, prow_handle, row, row_size
     695           0 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     696           0 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_bl, vec_res
     697             :       TYPE(dbcsr_distribution_type)                      :: dist
     698             :       TYPE(dbcsr_iterator_type)                          :: iter
     699           0 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row
     700             :       TYPE(mp_comm_type)                                 :: pcol_group, prow_group
     701             : 
     702           0 :       CALL timeset(routineN, handle)
     703           0 :       ithread = 0
     704             : 
     705             :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     706           0 :       CALL dbcsr_get_info(matrix, distribution=dist, data_type=data_type)
     707           0 :       CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     708           0 :       CPASSERT(data_type == dbcsr_type_real_8)
     709             : 
     710           0 :       CALL prow_group%set_handle(prow_handle)
     711           0 :       CALL pcol_group%set_handle(pcol_handle)
     712             : 
     713           0 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     714           0 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     715             : 
     716             :       ! Set the work vector for the results to 0
     717           0 :       CALL dbcsr_set(work_row, 0.0_real_8)
     718             : 
     719             :       ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
     720           0 :       CALL dbcsr_iterator_start(iter, vec_in)
     721           0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     722           0 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
     723           0 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     724           0 :          fast_vec_col%blk_map(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
     725             :       END DO
     726           0 :       CALL dbcsr_iterator_stop(iter)
     727             :       ! Replicate the data on all processore in the row
     728           0 :       data_vec => dbcsr_get_data_p(work_col, select_data_type=0.0_real_8)
     729           0 :       CALL prow_group%bcast(data_vec, 0)
     730             : 
     731             :       ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
     732           0 :       CALL timeset(routineN//"local_mm", handle1)
     733           0 :       CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
     734             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
     735           0 : !$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
     736             : !$    ithread = omp_get_thread_num()
     737             :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     738             :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     739             :          CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
     740             :          IF (skip_diag .AND. col == row) CYCLE
     741             :          prow = hash_table_get(fast_vec_col%hash_table, row)
     742             :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     743             :          IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
     744             :              ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     745             :             IF (fast_vec_row%blk_map(pcol)%assigned_thread .NE. ithread) CYCLE
     746             :             fast_vec_row%blk_map(pcol)%ptr = fast_vec_row%blk_map(pcol)%ptr + &
     747             :                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
     748             :          ELSE
     749             :             prow = hash_table_get(fast_vec_row%hash_table, row)
     750             :             pcol = hash_table_get(fast_vec_col%hash_table, col)
     751             :             IF (fast_vec_row%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
     752             :             fast_vec_row%blk_map(prow)%ptr = fast_vec_row%blk_map(prow)%ptr + &
     753             :                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map(pcol)%ptr), TRANSPOSE(data_d))
     754             :          END IF
     755             :       END DO
     756             :       CALL dbcsr_iterator_stop(iter)
     757             : !$OMP END PARALLEL
     758             : 
     759           0 :       CALL timestop(handle1)
     760             : 
     761             :       ! sum all the data within a processor column to obtain the replicated result
     762           0 :       data_vec => dbcsr_get_data_p(work_row, select_data_type=0.0_real_8)
     763             :       ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
     764           0 :       CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
     765           0 :       CALL pcol_group%sum(data_vec(1:nrows*ncols))
     766             : 
     767             :       ! Convert the result to a column wise distribution
     768           0 :       CALL dbcsr_rep_row_to_rep_col_vec(work_col, work_row, fast_vec_row)
     769             : 
     770             :       ! Create_the final vector by summing it to the result vector which lives on proc_col 0
     771           0 :       CALL dbcsr_iterator_start(iter, vec_out)
     772           0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     773           0 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
     774           0 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     775           0 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     776           0 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
     777             :          ELSE
     778           0 :             vec_res(:, :) = beta*vec_res(:, :)
     779             :          END IF
     780             :       END DO
     781           0 :       CALL dbcsr_iterator_stop(iter)
     782             : 
     783           0 :       CALL timestop(handle)
     784             : 
     785           0 :    END SUBROUTINE dbcsr_matrixT_vector_mult
     786             : 
     787             : ! **************************************************************************************************
     788             : !> \brief ...
     789             : !> \param vec_in ...
     790             : !> \param rep_col_vec ...
     791             : !> \param rep_row_vec ...
     792             : !> \param fast_vec_col ...
     793             : ! **************************************************************************************************
     794     6153856 :    SUBROUTINE dbcsr_col_vec_to_rep_row(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
     795             :       TYPE(dbcsr_type)                                   :: vec_in, rep_col_vec, rep_row_vec
     796             :       TYPE(fast_vec_access_type), INTENT(IN)             :: fast_vec_col
     797             : 
     798             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
     799             : 
     800             :       INTEGER                                            :: col, handle, mypcol, myprow, ncols, &
     801             :                                                             nrows, pcol_handle, prow_handle, row
     802      769232 :       INTEGER, DIMENSION(:), POINTER                     :: local_cols, row_dist
     803      769232 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec, data_vec_rep
     804      769232 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_row
     805             :       TYPE(dbcsr_distribution_type)                      :: dist_in, dist_rep_col
     806             :       TYPE(dbcsr_iterator_type)                          :: iter
     807             :       TYPE(mp_comm_type)                                 :: pcol_group, prow_group
     808             : 
     809      769232 :       CALL timeset(routineN, handle)
     810             : 
     811             :       ! get information about the parallel environment
     812      769232 :       CALL dbcsr_get_info(vec_in, distribution=dist_in)
     813             :       CALL dbcsr_distribution_get(dist_in, &
     814             :                                   prow_group=prow_handle, &
     815             :                                   pcol_group=pcol_handle, &
     816             :                                   myprow=myprow, &
     817      769232 :                                   mypcol=mypcol)
     818             : 
     819      769232 :       CALL prow_group%set_handle(prow_handle)
     820      769232 :       CALL pcol_group%set_handle(pcol_handle)
     821             : 
     822             :       ! Get the vector which tells us which blocks are local to which processor row in the col vec
     823      769232 :       CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
     824      769232 :       CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
     825             : 
     826             :       ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
     827      769232 :       CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     828      769232 :       data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
     829      769232 :       data_vec => dbcsr_get_data_p(vec_in, select_data_type=0.0_real_8)
     830    23628916 :       IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
     831             :       ! Replicate the data along the row
     832    23628916 :       CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
     833             : 
     834             :       ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
     835             :       ! The rep_col_vec contains all blocks local to the row distribution of the vector.
     836             :       ! The rep_row_vec only needs the fraction which is local to the col distribution.
     837             :       ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
     838             :       ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
     839             :       ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
     840             :       ! Hope this clarifies the idea
     841      769232 :       CALL dbcsr_set(rep_row_vec, 0.0_real_8)
     842      769232 :       CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
     843      769232 :       CALL dbcsr_iterator_start(iter, rep_row_vec)
     844     3997874 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     845     3228642 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
     846     3997874 :          IF (row_dist(col) == myprow) THEN
     847    24481105 :             vec_row = TRANSPOSE(fast_vec_col%blk_map(hash_table_get(fast_vec_col%hash_table, col))%ptr)
     848             :          END IF
     849             :       END DO
     850      769232 :       CALL dbcsr_iterator_stop(iter)
     851      769232 :       CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     852      769232 :       data_vec_rep => dbcsr_get_data_p(rep_row_vec, select_data_type=0.0_real_8)
     853    46351516 :       CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
     854             : 
     855      769232 :       CALL timestop(handle)
     856             : 
     857      769232 :    END SUBROUTINE dbcsr_col_vec_to_rep_row
     858             : 
     859             : ! **************************************************************************************************
     860             : !> \brief ...
     861             : !> \param rep_col_vec ...
     862             : !> \param rep_row_vec ...
     863             : !> \param fast_vec_row ...
     864             : !> \param fast_vec_col_add ...
     865             : ! **************************************************************************************************
     866     1621782 :    SUBROUTINE dbcsr_rep_row_to_rep_col_vec(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
     867             :       TYPE(dbcsr_type)                                   :: rep_col_vec, rep_row_vec
     868             :       TYPE(fast_vec_access_type)                         :: fast_vec_row
     869             :       TYPE(fast_vec_access_type), OPTIONAL               :: fast_vec_col_add
     870             : 
     871             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
     872             : 
     873             :       INTEGER                                            :: col, handle, mypcol, myprow, ncols, &
     874             :                                                             nrows, prow_handle, row
     875      270297 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist
     876      270297 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec_rep
     877      270297 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_col
     878             :       TYPE(dbcsr_distribution_type)                      :: dist_rep_col, dist_rep_row
     879             :       TYPE(dbcsr_iterator_type)                          :: iter
     880             :       TYPE(mp_comm_type)                                 :: prow_group
     881             : 
     882      270297 :       CALL timeset(routineN, handle)
     883             : 
     884             :       ! get information about the parallel environment
     885      270297 :       CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
     886             :       CALL dbcsr_distribution_get(dist_rep_col, &
     887             :                                   prow_group=prow_handle, &
     888             :                                   myprow=myprow, &
     889      270297 :                                   mypcol=mypcol)
     890             : 
     891      270297 :       CALL prow_group%set_handle(prow_handle)
     892             : 
     893             :       ! Get the vector which tells us which blocks are local to which processor col in the row vec
     894      270297 :       CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
     895      270297 :       CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
     896             : 
     897             :       ! The same trick as described above with opposite direction
     898      270297 :       CALL dbcsr_set(rep_col_vec, 0.0_real_8)
     899      270297 :       CALL dbcsr_iterator_start(iter, rep_col_vec)
     900     1322168 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     901     1051871 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
     902     1051871 :          IF (col_dist(row) == mypcol) THEN
     903     8499401 :             vec_col = TRANSPOSE(fast_vec_row%blk_map(hash_table_get(fast_vec_row%hash_table, row))%ptr)
     904             :          END IF
     905             :          ! this one is special and allows to add the elements of a not yet summed replicated
     906             :          ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
     907     1322168 :          IF (PRESENT(fast_vec_col_add)) THEN
     908     8499401 :             vec_col = vec_col + fast_vec_col_add%blk_map(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
     909             :          END IF
     910             :       END DO
     911      270297 :       CALL dbcsr_iterator_stop(iter)
     912      270297 :       CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     913      270297 :       data_vec_rep => dbcsr_get_data_p(rep_col_vec, select_data_type=0.0_real_8)
     914    13061615 :       CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
     915             : 
     916      270297 :       CALL timestop(handle)
     917             : 
     918      270297 :    END SUBROUTINE dbcsr_rep_row_to_rep_col_vec
     919             : 
     920             : ! **************************************************************************************************
     921             : !> \brief ...
     922             : !> \param matrix ...
     923             : !> \param vec_in ...
     924             : !> \param vec_out ...
     925             : !> \param alpha ...
     926             : !> \param beta ...
     927             : !> \param work_row ...
     928             : !> \param work_col ...
     929             : ! **************************************************************************************************
     930      270297 :    SUBROUTINE dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     931             :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     932             :       REAL(kind=real_8)                                  :: alpha, beta
     933             :       TYPE(dbcsr_type)                                   :: work_row, work_col
     934             : 
     935             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_matrix_vector_mult'
     936             : 
     937             :       INTEGER                                            :: col, handle, handle1, ithread, mypcol, &
     938             :                                                             myprow, ncols, nrows, pcol, &
     939             :                                                             pcol_handle, prow, row, rpcol, rprow, &
     940             :                                                             vec_dim
     941      270297 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     942      270297 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_res
     943             :       TYPE(dbcsr_distribution_type)                      :: dist
     944             :       TYPE(dbcsr_iterator_type)                          :: iter
     945             :       TYPE(dbcsr_type)                                   :: result_col, result_row
     946      270297 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row, &
     947      270297 :                                                             res_fast_vec_col, res_fast_vec_row
     948             :       TYPE(mp_comm_type)                                 :: pcol_group
     949             : 
     950      270297 :       CALL timeset(routineN, handle)
     951      270297 :       ithread = 0
     952             :       ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
     953      270297 :       CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
     954             :       ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
     955      270297 :       CALL dbcsr_set(work_col, 0.0_real_8)
     956      270297 :       CALL dbcsr_copy(result_col, work_col)
     957      270297 :       CALL dbcsr_set(work_row, 0.0_real_8)
     958      270297 :       CALL dbcsr_copy(result_row, work_row)
     959             : 
     960             :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     961      270297 :       CALL dbcsr_get_info(matrix=matrix, distribution=dist)
     962      270297 :       CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     963             : 
     964      270297 :       CALL pcol_group%set_handle(pcol_handle)
     965             : 
     966      270297 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     967      270297 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     968      270297 :       CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
     969      270297 :       CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
     970             : 
     971             :       ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     972      270297 :       CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
     973             : 
     974             :       ! Probably I should rename the routine above as it delivers both the replicated row and column vector
     975             : 
     976             :       ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     977             :       ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     978      270297 :       CALL timeset(routineN//"_local_mm", handle1)
     979             : 
     980             :       !------ perform the multiplication, we have to take car to take the correct blocks ----------
     981             : 
     982             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
     983      270297 : !$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
     984             : !$    ithread = omp_get_thread_num()
     985             :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     986             :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     987             :          CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     988             :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     989             :          rprow = hash_table_get(res_fast_vec_col%hash_table, row)
     990             :          IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
     991             :              ASSOCIATED(res_fast_vec_col%blk_map(rprow)%ptr)) THEN
     992             :             IF (res_fast_vec_col%blk_map(rprow)%assigned_thread .EQ. ithread) THEN
     993             :                res_fast_vec_col%blk_map(rprow)%ptr = res_fast_vec_col%blk_map(rprow)%ptr + &
     994             :                                                      MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map(pcol)%ptr))
     995             :             END IF
     996             :             prow = hash_table_get(fast_vec_col%hash_table, row)
     997             :             rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
     998             :             IF (res_fast_vec_row%blk_map(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
     999             :                res_fast_vec_row%blk_map(rpcol)%ptr = res_fast_vec_row%blk_map(rpcol)%ptr + &
    1000             :                                                      MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
    1001             :             END IF
    1002             :          ELSE
    1003             :             rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
    1004             :             prow = hash_table_get(fast_vec_row%hash_table, row)
    1005             :             IF (res_fast_vec_col%blk_map(rpcol)%assigned_thread .EQ. ithread) THEN
    1006             :                res_fast_vec_col%blk_map(rpcol)%ptr = res_fast_vec_col%blk_map(rpcol)%ptr + &
    1007             :                                                      TRANSPOSE(MATMUL(fast_vec_row%blk_map(prow)%ptr, data_d))
    1008             :             END IF
    1009             :             rprow = hash_table_get(res_fast_vec_row%hash_table, row)
    1010             :             pcol = hash_table_get(fast_vec_col%hash_table, col)
    1011             :             IF (res_fast_vec_row%blk_map(rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
    1012             :                res_fast_vec_row%blk_map(rprow)%ptr = res_fast_vec_row%blk_map(rprow)%ptr + &
    1013             :                                                      TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map(pcol)%ptr))
    1014             :             END IF
    1015             :          END IF
    1016             :       END DO
    1017             :       CALL dbcsr_iterator_stop(iter)
    1018             : !$OMP END PARALLEL
    1019             : 
    1020      270297 :       CALL timestop(handle1)
    1021             : 
    1022             :       ! sum all the data within a processor column to obtain the replicated result from lower
    1023      270297 :       data_vec => dbcsr_get_data_p(result_row, select_data_type=0.0_real_8)
    1024      270297 :       CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
    1025             : 
    1026    25743015 :       CALL pcol_group%sum(data_vec(1:nrows*ncols))
    1027             : 
    1028             :       ! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
    1029             :       ! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
    1030             :       ! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
    1031             :       ! parallel bits are locally added. The sum magically creates the correct vector
    1032      270297 :       CALL dbcsr_rep_row_to_rep_col_vec(work_col, result_row, res_fast_vec_row, res_fast_vec_col)
    1033             : 
    1034             :       ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
    1035      270297 :       CALL dbcsr_iterator_start(iter, vec_out)
    1036     1322168 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
    1037     1051871 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
    1038     1051871 :          prow = hash_table_get(fast_vec_col%hash_table, row)
    1039     1322168 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
    1040     8499401 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map(prow)%ptr(:, :))
    1041             :          ELSE
    1042           0 :             vec_res(:, :) = beta*vec_res(:, :)
    1043             :          END IF
    1044             :       END DO
    1045      270297 :       CALL dbcsr_iterator_stop(iter)
    1046             : 
    1047      270297 :       CALL release_fast_vec_access(fast_vec_row)
    1048      270297 :       CALL release_fast_vec_access(fast_vec_col)
    1049      270297 :       CALL release_fast_vec_access(res_fast_vec_row)
    1050      270297 :       CALL release_fast_vec_access(res_fast_vec_col)
    1051             : 
    1052      270297 :       CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
    1053             : 
    1054      270297 :       CALL timestop(handle)
    1055             : 
    1056      540594 :    END SUBROUTINE dbcsr_sym_matrix_vector_mult
    1057             : 
    1058           0 : END MODULE arnoldi_vector

Generated by: LCOV version 1.15