LCOV - code coverage report
Current view: top level - src/arnoldi - arnoldi_vector.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:b4bd748) Lines: 272 348 78.2 %
Date: 2025-03-09 07:56:22 Functions: 16 23 69.6 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2025 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief operations for skinny matrices/vectors expressed in dbcsr form
      10             : !> \par History
      11             : !>       2014.10 created [Florian Schiffmann]
      12             : !>       2023.12 Removed support for single-precision [Ole Schuett]
      13             : !>       2024.12 Removed support for complex input matrices [Ole Schuett]
      14             : !> \author Florian Schiffmann
      15             : ! **************************************************************************************************
      16             : MODULE arnoldi_vector
      17             :    USE cp_dbcsr_api,                    ONLY: &
      18             :         dbcsr_copy, dbcsr_create, dbcsr_distribution_get, dbcsr_distribution_new, &
      19             :         dbcsr_distribution_release, dbcsr_distribution_type, dbcsr_get_data_p, dbcsr_get_info, &
      20             :         dbcsr_iterator_blocks_left, dbcsr_iterator_next_block, dbcsr_iterator_start, &
      21             :         dbcsr_iterator_stop, dbcsr_iterator_type, dbcsr_release, dbcsr_set, dbcsr_type, &
      22             :         dbcsr_type_antisymmetric, dbcsr_type_no_symmetry, dbcsr_type_symmetric
      23             :    USE cp_dbcsr_contrib,                ONLY: dbcsr_reserve_all_blocks
      24             :    USE kinds,                           ONLY: dp,&
      25             :                                               real_8
      26             :    USE message_passing,                 ONLY: mp_comm_type
      27             : #include "../base/base_uses.f90"
      28             : 
      29             : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
      30             : 
      31             :    IMPLICIT NONE
      32             : 
      33             :    PRIVATE
      34             : 
      35             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'arnoldi_vector'
      36             : 
      37             : ! **************************************************************************************************
      38             : !> \brief Types needed for the hashtable.
      39             : ! **************************************************************************************************
      40             :    TYPE ele_type
      41             :       INTEGER :: c = 0
      42             :       INTEGER :: p = 0
      43             :    END TYPE ele_type
      44             : 
      45             :    TYPE hash_table_type
      46             :       TYPE(ele_type), DIMENSION(:), POINTER :: table => NULL()
      47             :       INTEGER :: nele = 0
      48             :       INTEGER :: nmax = 0
      49             :       INTEGER :: prime = 0
      50             :    END TYPE hash_table_type
      51             : 
      52             : ! **************************************************************************************************
      53             : !> \brief Types needed for fast access to distributed dbcsr vectors.
      54             : ! **************************************************************************************************
      55             :    TYPE block_ptr
      56             :       REAL(real_8), DIMENSION(:, :), POINTER          :: ptr => NULL()
      57             :       INTEGER                                         :: assigned_thread = -1
      58             :    END TYPE block_ptr
      59             : 
      60             :    TYPE fast_vec_access_type
      61             :       TYPE(hash_table_type) :: hash_table = hash_table_type()
      62             :       TYPE(block_ptr), DIMENSION(:), ALLOCATABLE :: blk_map
      63             :    END TYPE
      64             : 
      65             :    PUBLIC :: dbcsr_matrix_colvec_multiply, &
      66             :              create_col_vec_from_matrix, &
      67             :              create_row_vec_from_matrix, &
      68             :              create_replicated_col_vec_from_matrix, &
      69             :              create_replicated_row_vec_from_matrix
      70             : 
      71             : CONTAINS
      72             : 
      73             : ! **************************************************************************************************
      74             : !> \brief creates a dbcsr col vector like object which lives on proc_col 0
      75             : !>        and has the same row dist as the template matrix
      76             : !>        the returned matrix is fully allocated and all blocks are set to 0
      77             : !>        this is not a sparse object (and must never be)
      78             : !> \param dbcsr_vec  the vector object to create must be allocated but not initialized
      79             : !> \param matrix a dbcsr matrix used as template
      80             : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
      81             : ! **************************************************************************************************
      82      262972 :    SUBROUTINE create_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
      83             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
      84             :       INTEGER                                            :: ncol
      85             : 
      86             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_col_vec_from_matrix'
      87             : 
      88             :       INTEGER                                            :: handle, npcols
      89      262972 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
      90      262972 :                                                             row_dist
      91             :       TYPE(dbcsr_distribution_type)                      :: dist, dist_col_vec
      92             : 
      93      262972 :       CALL timeset(routineN, handle)
      94             : 
      95      262972 :       CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
      96      262972 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
      97             : 
      98      262972 :       ALLOCATE (col_dist(1), col_blk_size(1))
      99      262972 :       col_dist(1) = 0
     100      262972 :       col_blk_size(1) = ncol
     101      262972 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     102             : 
     103             :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
     104      262972 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     105      262972 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     106             : 
     107      262972 :       CALL dbcsr_distribution_release(dist_col_vec)
     108      262972 :       DEALLOCATE (col_dist, col_blk_size)
     109      262972 :       CALL timestop(handle)
     110             : 
     111      525944 :    END SUBROUTINE create_col_vec_from_matrix
     112             : 
     113             : ! **************************************************************************************************
     114             : !> \brief creates a dbcsr row vector like object which lives on proc_row 0
     115             : !>        and has the same row dist as the template matrix
     116             : !>        the returned matrix is fully allocated and all blocks are set to 0
     117             : !>        this is not a sparse object (and must never be)
     118             : !> \param dbcsr_vec ...
     119             : !> \param matrix a dbcsr matrix used as template
     120             : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     121             : ! **************************************************************************************************
     122           0 :    SUBROUTINE create_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     123             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     124             :       INTEGER                                            :: nrow
     125             : 
     126             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_row_vec_from_matrix'
     127             : 
     128             :       INTEGER                                            :: handle, nprows
     129           0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     130           0 :                                                             row_dist
     131             :       TYPE(dbcsr_distribution_type)                      :: dist, dist_row_vec
     132             : 
     133           0 :       CALL timeset(routineN, handle)
     134             : 
     135           0 :       CALL dbcsr_get_info(matrix, col_blk_size=col_blk_size, distribution=dist)
     136           0 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     137             : 
     138           0 :       ALLOCATE (row_dist(1), row_blk_size(1))
     139           0 :       row_dist(1) = 0
     140           0 :       row_blk_size(1) = nrow
     141           0 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     142             : 
     143             :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, matrix_type=dbcsr_type_no_symmetry, &
     144           0 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     145           0 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     146             : 
     147           0 :       CALL dbcsr_distribution_release(dist_row_vec)
     148           0 :       DEALLOCATE (row_dist, row_blk_size)
     149           0 :       CALL timestop(handle)
     150             : 
     151           0 :    END SUBROUTINE create_row_vec_from_matrix
     152             : 
     153             : ! **************************************************************************************************
     154             : !> \brief creates a col vector like object whose blocks can be replicated
     155             : !>        along the processor row and has the same row dist as the template matrix
     156             : !>        the returned matrix is fully allocated and all blocks are set to 0
     157             : !>        this is not a sparse object (and must never be)
     158             : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     159             : !> \param matrix a dbcsr matrix used as template
     160             : !> \param ncol number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     161             : ! **************************************************************************************************
     162      129213 :    SUBROUTINE create_replicated_col_vec_from_matrix(dbcsr_vec, matrix, ncol)
     163             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     164             :       INTEGER                                            :: ncol
     165             : 
     166             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_col_vec_from_matrix'
     167             : 
     168             :       INTEGER                                            :: handle, i, npcols
     169      129213 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     170      129213 :                                                             row_dist
     171             :       TYPE(dbcsr_distribution_type)                      :: dist, dist_col_vec
     172             : 
     173      129213 :       CALL timeset(routineN, handle)
     174             : 
     175      129213 :       CALL dbcsr_get_info(matrix, row_blk_size=row_blk_size, distribution=dist)
     176      129213 :       CALL dbcsr_distribution_get(dist, npcols=npcols, row_dist=row_dist)
     177             : 
     178      646065 :       ALLOCATE (col_dist(npcols), col_blk_size(npcols))
     179      258426 :       col_blk_size(:) = ncol
     180      258426 :       DO i = 0, npcols - 1
     181      258426 :          col_dist(i + 1) = i
     182             :       END DO
     183      129213 :       CALL dbcsr_distribution_new(dist_col_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     184             : 
     185             :       CALL dbcsr_create(dbcsr_vec, "D", dist_col_vec, matrix_type=dbcsr_type_no_symmetry, &
     186      129213 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     187      129213 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     188             : 
     189      129213 :       CALL dbcsr_distribution_release(dist_col_vec)
     190      129213 :       DEALLOCATE (col_dist, col_blk_size)
     191      129213 :       CALL timestop(handle)
     192             : 
     193      258426 :    END SUBROUTINE create_replicated_col_vec_from_matrix
     194             : 
     195             : ! **************************************************************************************************
     196             : !> \brief creates a row vector like object whose blocks can be replicated
     197             : !>        along the processor col and has the same col dist as the template matrix
     198             : !>        the returned matrix is fully allocated and all blocks are set to 0
     199             : !>        this is not a sparse object (and must never be)
     200             : !> \param dbcsr_vec the vector object to create must be allocated but not initialized
     201             : !> \param matrix a dbcsr matrix used as template
     202             : !> \param nrow number of vectors in the dbcsr_typeect (1 for vector, n for skinny matrix)
     203             : ! **************************************************************************************************
     204      129213 :    SUBROUTINE create_replicated_row_vec_from_matrix(dbcsr_vec, matrix, nrow)
     205             :       TYPE(dbcsr_type)                                   :: dbcsr_vec, matrix
     206             :       INTEGER                                            :: nrow
     207             : 
     208             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_replicated_row_vec_from_matrix'
     209             : 
     210             :       INTEGER                                            :: handle, i, nprows
     211      129213 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
     212      129213 :                                                             row_dist
     213             :       TYPE(dbcsr_distribution_type)                      :: dist, dist_row_vec
     214             : 
     215      129213 :       CALL timeset(routineN, handle)
     216             : 
     217      129213 :       CALL dbcsr_get_info(matrix, distribution=dist, col_blk_size=col_blk_size)
     218      129213 :       CALL dbcsr_distribution_get(dist, nprows=nprows, col_dist=col_dist)
     219             : 
     220      646065 :       ALLOCATE (row_dist(nprows), row_blk_size(nprows))
     221      385284 :       row_blk_size(:) = nrow
     222      385284 :       DO i = 0, nprows - 1
     223      385284 :          row_dist(i + 1) = i
     224             :       END DO
     225      129213 :       CALL dbcsr_distribution_new(dist_row_vec, template=dist, row_dist=row_dist, col_dist=col_dist)
     226             : 
     227             :       CALL dbcsr_create(dbcsr_vec, "D", dist_row_vec, dbcsr_type_no_symmetry, &
     228      129213 :                         row_blk_size=row_blk_size, col_blk_size=col_blk_size)
     229      129213 :       CALL dbcsr_reserve_all_blocks(dbcsr_vec)
     230             : 
     231      129213 :       CALL dbcsr_distribution_release(dist_row_vec)
     232      129213 :       DEALLOCATE (row_dist, row_blk_Size)
     233      129213 :       CALL timestop(handle)
     234             : 
     235      258426 :    END SUBROUTINE create_replicated_row_vec_from_matrix
     236             : 
     237             : ! **************************************************************************************************
     238             : !> \brief given a column vector, prepare the fast_vec_access container
     239             : !> \param vec ...
     240             : !> \param fast_vec_access ...
     241             : ! **************************************************************************************************
     242     1040703 :    SUBROUTINE create_fast_col_vec_access(vec, fast_vec_access)
     243             :       TYPE(dbcsr_type)                                   :: vec
     244             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     245             : 
     246             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_col_vec_access'
     247             : 
     248             :       INTEGER                                            :: col, handle, iblock, nblk_local, &
     249             :                                                             nthreads, row
     250     1040703 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_bl
     251             :       TYPE(dbcsr_iterator_type)                          :: iter
     252             : 
     253     1040703 :       CALL timeset(routineN, handle)
     254             : 
     255             :       ! figure out the number of threads
     256     1040703 :       nthreads = 1
     257     1040703 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     258             : !$OMP MASTER
     259             : !$    nthreads = OMP_GET_NUM_THREADS()
     260             : !$OMP END MASTER
     261             : !$OMP END PARALLEL
     262             : 
     263     1040703 :       CALL dbcsr_get_info(matrix=vec, nblkrows_local=nblk_local)
     264             :       ! 4 times makes sure the table is big enough to limit collisions.
     265     1040703 :       CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     266             :       ! include zero for effective dealing with values not in the hash table (will return 0)
     267     6837125 :       ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
     268             : 
     269     1040703 :       CALL dbcsr_get_info(matrix=vec, nblkcols_local=col)
     270     1040703 :       IF (col .GT. 1) CPABORT("BUG")
     271             : 
     272             :       ! go through the blocks of the vector
     273     1040703 :       iblock = 0
     274     1040703 :       CALL dbcsr_iterator_start(iter, vec)
     275     3715016 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     276     2674313 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     277     2674313 :          iblock = iblock + 1
     278     2674313 :          CALL hash_table_add(fast_vec_access%hash_table, row, iblock)
     279     2674313 :          fast_vec_access%blk_map(iblock)%ptr => vec_bl
     280     2674313 :          fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
     281             :       END DO
     282     1040703 :       CALL dbcsr_iterator_stop(iter)
     283     1040703 :       CALL timestop(handle)
     284             : 
     285     3122109 :    END SUBROUTINE create_fast_col_vec_access
     286             : 
     287             : ! **************************************************************************************************
     288             : !> \brief given a row vector, prepare the fast_vec_access_container
     289             : !> \param vec ...
     290             : !> \param fast_vec_access ...
     291             : ! **************************************************************************************************
     292     1040703 :    SUBROUTINE create_fast_row_vec_access(vec, fast_vec_access)
     293             :       TYPE(dbcsr_type)                                   :: vec
     294             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     295             : 
     296             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_fast_row_vec_access'
     297             : 
     298             :       INTEGER                                            :: col, handle, iblock, nblk_local, &
     299             :                                                             nthreads, row
     300     1040703 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_bl
     301             :       TYPE(dbcsr_iterator_type)                          :: iter
     302             : 
     303     1040703 :       CALL timeset(routineN, handle)
     304             : 
     305             :       ! figure out the number of threads
     306     1040703 :       nthreads = 1
     307     1040703 : !$OMP PARALLEL DEFAULT(NONE) SHARED(nthreads)
     308             : !$OMP MASTER
     309             : !$    nthreads = OMP_GET_NUM_THREADS()
     310             : !$OMP END MASTER
     311             : !$OMP END PARALLEL
     312             : 
     313     1040703 :       CALL dbcsr_get_info(matrix=vec, nblkcols_local=nblk_local)
     314             :       ! 4 times makes sure the table is big enough to limit collisions.
     315     1040703 :       CALL hash_table_create(fast_vec_access%hash_table, 4*nblk_local)
     316             :       ! include zero for effective dealing with values not in the hash table (will return 0)
     317     9486383 :       ALLOCATE (fast_vec_access%blk_map(0:nblk_local))
     318             : 
     319             :       ! sanity check
     320     1040703 :       CALL dbcsr_get_info(matrix=vec, nblkrows_local=row)
     321     1040703 :       IF (row .GT. 1) CPABORT("BUG")
     322             : 
     323             :       ! go through the blocks of the vector
     324     1040703 :       iblock = 0
     325     1040703 :       CALL dbcsr_iterator_start(iter, vec)
     326     6364274 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     327     5323571 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl)
     328     5323571 :          iblock = iblock + 1
     329     5323571 :          CALL hash_table_add(fast_vec_access%hash_table, col, iblock)
     330     5323571 :          fast_vec_access%blk_map(iblock)%ptr => vec_bl
     331     5323571 :          fast_vec_access%blk_map(iblock)%assigned_thread = MOD(iblock, nthreads)
     332             :       END DO
     333     1040703 :       CALL dbcsr_iterator_stop(iter)
     334             : 
     335     1040703 :       CALL timestop(handle)
     336             : 
     337     3122109 :    END SUBROUTINE create_fast_row_vec_access
     338             : 
     339             : ! **************************************************************************************************
     340             : !> \brief release all memory associated with the fast_vec_access type
     341             : !> \param fast_vec_access ...
     342             : ! **************************************************************************************************
     343     2081406 :    SUBROUTINE release_fast_vec_access(fast_vec_access)
     344             :       TYPE(fast_vec_access_type)                         :: fast_vec_access
     345             : 
     346             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'release_fast_vec_access'
     347             : 
     348             :       INTEGER                                            :: handle
     349             : 
     350     2081406 :       CALL timeset(routineN, handle)
     351             : 
     352     2081406 :       CALL hash_table_release(fast_vec_access%hash_table)
     353     2081406 :       DEALLOCATE (fast_vec_access%blk_map)
     354             : 
     355     2081406 :       CALL timestop(handle)
     356             : 
     357     2081406 :    END SUBROUTINE release_fast_vec_access
     358             : 
     359             : ! --------------------------------------------------------------------------------------------------
     360             : ! Beginning of hashtable.
     361             : ! this file can be 'INCLUDE'ed verbatim in various place, where it needs to be
     362             : ! part of the module to guarantee inlining
     363             : ! hashes (c,p) pairs, where p is assumed to be >0
     364             : ! on return (0 is used as a flag for not present)
     365             : !
     366             : !
     367             : ! **************************************************************************************************
     368             : !> \brief finds a prime equal or larger than i, needed at creation
     369             : !> \param i ...
     370             : !> \return ...
     371             : ! **************************************************************************************************
     372     2081406 :    FUNCTION matching_prime(i) RESULT(res)
     373             :       INTEGER, INTENT(IN)                                :: i
     374             :       INTEGER                                            :: res
     375             : 
     376             :       INTEGER                                            :: j
     377             : 
     378     2081406 :       res = i
     379     2081406 :       j = 0
     380     6416202 :       DO WHILE (j < res)
     381    54438519 :          DO j = 2, res - 1
     382    52357113 :             IF (MOD(res, j) == 0) THEN
     383     2253390 :                res = res + 1
     384     2253390 :                EXIT
     385             :             END IF
     386             :          END DO
     387             :       END DO
     388     2081406 :    END FUNCTION
     389             : 
     390             : ! **************************************************************************************************
     391             : !> \brief create a hash_table of given initial size.
     392             : !>        the hash table will expand as needed (but this requires rehashing)
     393             : !> \param hash_table ...
     394             : !> \param table_size ...
     395             : ! **************************************************************************************************
     396     2081406 :    SUBROUTINE hash_table_create(hash_table, table_size)
     397             :       TYPE(hash_table_type)                              :: hash_table
     398             :       INTEGER, INTENT(IN)                                :: table_size
     399             : 
     400             :       INTEGER                                            :: j
     401             : 
     402             :       ! guarantee a minimal hash table size (8), so that expansion works
     403             : 
     404     2081406 :       j = 3
     405     4057574 :       DO WHILE (2**j - 1 < table_size)
     406     1976168 :          j = j + 1
     407             :       END DO
     408     2081406 :       hash_table%nmax = 2**j - 1
     409     2081406 :       hash_table%prime = matching_prime(hash_table%nmax)
     410     2081406 :       hash_table%nele = 0
     411    56759394 :       ALLOCATE (hash_table%table(0:hash_table%nmax))
     412     2081406 :    END SUBROUTINE hash_table_create
     413             : 
     414             : ! **************************************************************************************************
     415             : !> \brief ...
     416             : !> \param hash_table ...
     417             : ! **************************************************************************************************
     418     2081406 :    SUBROUTINE hash_table_release(hash_table)
     419             :       TYPE(hash_table_type)                              :: hash_table
     420             : 
     421     2081406 :       hash_table%nmax = 0
     422     2081406 :       hash_table%nele = 0
     423     2081406 :       DEALLOCATE (hash_table%table)
     424             : 
     425     2081406 :    END SUBROUTINE hash_table_release
     426             : 
     427             : ! **************************************************************************************************
     428             : !> \brief add a pair (c,p) to the hash table
     429             : !> \param hash_table ...
     430             : !> \param c this value is being hashed
     431             : !> \param p this is being stored
     432             : ! **************************************************************************************************
     433     7997884 :    RECURSIVE SUBROUTINE hash_table_add(hash_table, c, p)
     434             :       TYPE(hash_table_type), INTENT(INOUT)               :: hash_table
     435             :       INTEGER, INTENT(IN)                                :: c, p
     436             : 
     437             :       REAL(KIND=real_8), PARAMETER :: hash_table_expand = 1.5_real_8, &
     438             :          inv_hash_table_fill = 2.5_real_8
     439             : 
     440             :       INTEGER                                            :: i, j
     441     7997884 :       TYPE(ele_type), ALLOCATABLE, DIMENSION(:)          :: tmp_hash
     442             : 
     443             :       ! if too small, make a copy and rehash in a larger table
     444             : 
     445     7997884 :       IF (hash_table%nele*inv_hash_table_fill > hash_table%nmax) THEN
     446           0 :          ALLOCATE (tmp_hash(LBOUND(hash_table%table, 1):UBOUND(hash_table%table, 1)))
     447           0 :          tmp_hash(:) = hash_table%table
     448           0 :          CALL hash_table_release(hash_table)
     449           0 :          CALL hash_table_create(hash_table, INT((UBOUND(tmp_hash, 1) + 8)*hash_table_expand))
     450           0 :          DO i = LBOUND(tmp_hash, 1), UBOUND(tmp_hash, 1)
     451           0 :             IF (tmp_hash(i)%c .NE. 0) THEN
     452           0 :                CALL hash_table_add(hash_table, tmp_hash(i)%c, tmp_hash(i)%p)
     453             :             END IF
     454             :          END DO
     455           0 :          DEALLOCATE (tmp_hash)
     456             :       END IF
     457             : 
     458     7997884 :       hash_table%nele = hash_table%nele + 1
     459     7997884 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     460             : 
     461     7997884 :       DO j = i, hash_table%nmax
     462     7997884 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     463     7997884 :             hash_table%table(j)%c = c
     464     7997884 :             hash_table%table(j)%p = p
     465     7997884 :             RETURN
     466             :          END IF
     467             :       END DO
     468           0 :       DO j = 0, i - 1
     469           0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     470           0 :             hash_table%table(j)%c = c
     471           0 :             hash_table%table(j)%p = p
     472           0 :             RETURN
     473             :          END IF
     474             :       END DO
     475             : 
     476             :    END SUBROUTINE hash_table_add
     477             : 
     478             : ! **************************************************************************************************
     479             : !> \brief ...
     480             : !> \param hash_table ...
     481             : !> \param c ...
     482             : !> \return ...
     483             : ! **************************************************************************************************
     484   120165378 :    PURE FUNCTION hash_table_get(hash_table, c) RESULT(p)
     485             :       TYPE(hash_table_type), INTENT(IN)                  :: hash_table
     486             :       INTEGER, INTENT(IN)                                :: c
     487             :       INTEGER                                            :: p
     488             : 
     489             :       INTEGER                                            :: i, j
     490             : 
     491   120165378 :       i = IAND(c*hash_table%prime, hash_table%nmax)
     492             : 
     493             :       ! catch the likely case first
     494   120165378 :       IF (hash_table%table(i)%c == c) THEN
     495   115869330 :          p = hash_table%table(i)%p
     496   115869330 :          RETURN
     497             :       END IF
     498             : 
     499     4296048 :       DO j = i, hash_table%nmax
     500     4296048 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     501     4296048 :             p = hash_table%table(j)%p
     502     4296048 :             RETURN
     503             :          END IF
     504             :       END DO
     505           0 :       DO j = 0, i - 1
     506           0 :          IF (hash_table%table(j)%c == 0 .OR. hash_table%table(j)%c == c) THEN
     507           0 :             p = hash_table%table(j)%p
     508           0 :             RETURN
     509             :          END IF
     510             :       END DO
     511             : 
     512             :       ! we should never reach this point.
     513   120165378 :       p = HUGE(p)
     514             : 
     515             :    END FUNCTION hash_table_get
     516             : 
     517             : ! End of hashtable
     518             : ! --------------------------------------------------------------------------------------------------
     519             : 
     520             : ! **************************************************************************************************
     521             : !> \brief the real driver routine for the multiply, not all symmetries implemented yet
     522             : !> \param matrix ...
     523             : !> \param vec_in ...
     524             : !> \param vec_out ...
     525             : !> \param alpha ...
     526             : !> \param beta ...
     527             : !> \param work_row ...
     528             : !> \param work_col ...
     529             : ! **************************************************************************************************
     530      770182 :    SUBROUTINE dbcsr_matrix_colvec_multiply(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     531             :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     532             :       REAL(kind=real_8)                                  :: alpha, beta
     533             :       TYPE(dbcsr_type)                                   :: work_row, work_col
     534             : 
     535             :       CHARACTER                                          :: matrix_type
     536             : 
     537      770182 :       CALL dbcsr_get_info(matrix, matrix_type=matrix_type)
     538             : 
     539      499661 :       SELECT CASE (matrix_type)
     540             :       CASE (dbcsr_type_no_symmetry)
     541      499661 :          CALL dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     542             :       CASE (dbcsr_type_symmetric)
     543      270521 :          CALL dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     544             :       CASE (dbcsr_type_antisymmetric)
     545             :          ! Not yet implemented, should mainly be some prefactor magic, but who knows how antisymmetric matrices are stored???
     546           0 :          CPABORT("NYI, antisymmetric matrix not permitted")
     547             :       CASE DEFAULT
     548      770182 :          CPABORT("Unknown matrix type, ...")
     549             :       END SELECT
     550             : 
     551      770182 :    END SUBROUTINE dbcsr_matrix_colvec_multiply
     552             : 
     553             : ! **************************************************************************************************
     554             : !> \brief low level routines for matrix vector multiplies
     555             : !> \param matrix ...
     556             : !> \param vec_in ...
     557             : !> \param vec_out ...
     558             : !> \param alpha ...
     559             : !> \param beta ...
     560             : !> \param work_row ...
     561             : !> \param work_col ...
     562             : ! **************************************************************************************************
     563      499661 :    SUBROUTINE dbcsr_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     564             :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     565             :       REAL(kind=real_8)                                  :: alpha, beta
     566             :       TYPE(dbcsr_type)                                   :: work_row, work_col
     567             : 
     568             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrix_vector_mult'
     569             : 
     570             :       INTEGER                                            :: col, handle, handle1, ithread, mypcol, &
     571             :                                                             myprow, ncols, nrows, pcol, prow, &
     572             :                                                             prow_handle, row
     573      499661 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     574      499661 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_res
     575             :       TYPE(dbcsr_distribution_type)                      :: dist
     576             :       TYPE(dbcsr_iterator_type)                          :: iter
     577      499661 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row
     578             :       TYPE(mp_comm_type)                                 :: prow_group
     579             : 
     580      499661 :       CALL timeset(routineN, handle)
     581      499661 :       ithread = 0
     582             : 
     583             :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     584      499661 :       CALL dbcsr_get_info(matrix, distribution=dist)
     585      499661 :       CALL dbcsr_distribution_get(dist, prow_group=prow_handle, myprow=myprow, mypcol=mypcol)
     586             : 
     587      499661 :       CALL prow_group%set_handle(prow_handle)
     588             : 
     589      499661 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     590      499661 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     591             : 
     592             :       ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     593      499661 :       CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
     594             : 
     595             :       ! Set the work vector for the results to 0
     596      499661 :       CALL dbcsr_set(work_col, 0.0_real_8)
     597             : 
     598             :       ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     599             :       ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     600      499661 :       CALL timeset(routineN//"_local_mm", handle1)
     601             : 
     602             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow) &
     603      499661 : !$OMP          SHARED(matrix,fast_vec_col,fast_vec_row)
     604             : !$    ithread = omp_get_thread_num()
     605             :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     606             :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     607             :          CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     608             :          prow = hash_table_get(fast_vec_col%hash_table, row)
     609             :          IF (fast_vec_col%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
     610             :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     611             :          IF (SIZE(fast_vec_col%blk_map(prow)%ptr, 1) .EQ. 0 .OR. &
     612             :              SIZE(fast_vec_col%blk_map(prow)%ptr, 2) .EQ. 0 .OR. &
     613             :              SIZE(data_d, 2) .EQ. 0) CYCLE
     614             :          CALL dgemm('N', 'T', SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
     615             :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
     616             :                     SIZE(data_d, 2), &
     617             :                     1.0_dp, &
     618             :                     data_d, &
     619             :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 1), &
     620             :                     fast_vec_row%blk_map(pcol)%ptr, &
     621             :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 2), &
     622             :                     1.0_dp, &
     623             :                     fast_vec_col%blk_map(prow)%ptr, &
     624             :                     SIZE(fast_vec_col%blk_map(prow)%ptr, 1))
     625             :       END DO
     626             :       CALL dbcsr_iterator_stop(iter)
     627             : !$OMP END PARALLEL
     628             : 
     629      499661 :       CALL timestop(handle1)
     630             : 
     631             :       ! sum all the data onto the first processor col where the original vector is stored
     632      499661 :       data_vec => dbcsr_get_data_p(work_col)
     633      499661 :       CALL dbcsr_get_info(matrix=work_col, nfullrows_local=nrows, nfullcols_local=ncols)
     634    10571431 :       CALL prow_group%sum(data_vec(1:nrows*ncols))
     635             : 
     636             :       ! Local copy on the first mpi col (as this is the localtion of the vec_res blocks) of the result vector
     637             :       ! from the replicated to the original vector. Let's play it safe and use the iterator
     638      499661 :       CALL dbcsr_iterator_start(iter, vec_out)
     639     1069624 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     640      569963 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
     641      569963 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     642     1069624 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     643     6175811 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
     644             :          ELSE
     645           0 :             vec_res(:, :) = beta*vec_res(:, :)
     646             :          END IF
     647             :       END DO
     648      499661 :       CALL dbcsr_iterator_stop(iter)
     649             : 
     650      499661 :       CALL release_fast_vec_access(fast_vec_row)
     651      499661 :       CALL release_fast_vec_access(fast_vec_col)
     652             : 
     653      499661 :       CALL timestop(handle)
     654             : 
     655      999322 :    END SUBROUTINE dbcsr_matrix_vector_mult
     656             : 
     657             : ! **************************************************************************************************
     658             : !> \brief ...
     659             : !> \param matrix ...
     660             : !> \param vec_in ...
     661             : !> \param vec_out ...
     662             : !> \param alpha ...
     663             : !> \param beta ...
     664             : !> \param work_row ...
     665             : !> \param work_col ...
     666             : !> \param skip_diag ...
     667             : ! **************************************************************************************************
     668           0 :    SUBROUTINE dbcsr_matrixT_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col, skip_diag)
     669             :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     670             :       REAL(kind=real_8)                                  :: alpha, beta
     671             :       TYPE(dbcsr_type)                                   :: work_row, work_col
     672             :       LOGICAL                                            :: skip_diag
     673             : 
     674             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_matrixT_vector_mult'
     675             : 
     676             :       INTEGER                                            :: col, col_size, handle, handle1, ithread, &
     677             :                                                             mypcol, myprow, ncols, nrows, pcol, &
     678             :                                                             pcol_handle, prow, prow_handle, row, &
     679             :                                                             row_size
     680           0 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     681           0 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_bl, vec_res
     682             :       TYPE(dbcsr_distribution_type)                      :: dist
     683             :       TYPE(dbcsr_iterator_type)                          :: iter
     684           0 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row
     685             :       TYPE(mp_comm_type)                                 :: pcol_group, prow_group
     686             : 
     687           0 :       CALL timeset(routineN, handle)
     688           0 :       ithread = 0
     689             : 
     690             :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     691           0 :       CALL dbcsr_get_info(matrix, distribution=dist)
     692           0 :       CALL dbcsr_distribution_get(dist, prow_group=prow_handle, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     693             : 
     694           0 :       CALL prow_group%set_handle(prow_handle)
     695           0 :       CALL pcol_group%set_handle(pcol_handle)
     696             : 
     697           0 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     698           0 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     699             : 
     700             :       ! Set the work vector for the results to 0
     701           0 :       CALL dbcsr_set(work_row, 0.0_real_8)
     702             : 
     703             :       ! Transfer the correct parts of the input vector to the replicated vector on proc_col 0
     704           0 :       CALL dbcsr_iterator_start(iter, vec_in)
     705           0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     706           0 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_bl, row_size=row_size, col_size=col_size)
     707           0 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     708           0 :          fast_vec_col%blk_map(prow)%ptr(1:row_size, 1:col_size) = vec_bl(1:row_size, 1:col_size)
     709             :       END DO
     710           0 :       CALL dbcsr_iterator_stop(iter)
     711             :       ! Replicate the data on all processore in the row
     712           0 :       data_vec => dbcsr_get_data_p(work_col)
     713           0 :       CALL prow_group%bcast(data_vec, 0)
     714             : 
     715             :       ! Perform the local multiply. Here it is obvious why the vectors are replicated on the mpi rows and cols
     716           0 :       CALL timeset(routineN//"local_mm", handle1)
     717           0 :       CALL dbcsr_get_info(matrix=work_col, nfullcols_local=ncols)
     718             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,row_size,col_size,ithread,prow,pcol) &
     719           0 : !$OMP          SHARED(matrix,fast_vec_row,fast_vec_col,skip_diag,ncols)
     720             : !$    ithread = omp_get_thread_num()
     721             :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     722             :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     723             :          CALL dbcsr_iterator_next_block(iter, row, col, data_d, row_size=row_size, col_size=col_size)
     724             :          IF (skip_diag .AND. col == row) CYCLE
     725             :          prow = hash_table_get(fast_vec_col%hash_table, row)
     726             :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     727             :          IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
     728             :              ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     729             :             IF (fast_vec_row%blk_map(pcol)%assigned_thread .NE. ithread) CYCLE
     730             :             fast_vec_row%blk_map(pcol)%ptr = fast_vec_row%blk_map(pcol)%ptr + &
     731             :                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
     732             :          ELSE
     733             :             prow = hash_table_get(fast_vec_row%hash_table, row)
     734             :             pcol = hash_table_get(fast_vec_col%hash_table, col)
     735             :             IF (fast_vec_row%blk_map(prow)%assigned_thread .NE. ithread) CYCLE
     736             :             fast_vec_row%blk_map(prow)%ptr = fast_vec_row%blk_map(prow)%ptr + &
     737             :                                              MATMUL(TRANSPOSE(fast_vec_col%blk_map(pcol)%ptr), TRANSPOSE(data_d))
     738             :          END IF
     739             :       END DO
     740             :       CALL dbcsr_iterator_stop(iter)
     741             : !$OMP END PARALLEL
     742             : 
     743           0 :       CALL timestop(handle1)
     744             : 
     745             :       ! sum all the data within a processor column to obtain the replicated result
     746           0 :       data_vec => dbcsr_get_data_p(work_row)
     747             :       ! we use the replicated vector but the final answer is only summed to proc_col 0 for efficiency
     748           0 :       CALL dbcsr_get_info(matrix=work_row, nfullrows_local=nrows, nfullcols_local=ncols)
     749           0 :       CALL pcol_group%sum(data_vec(1:nrows*ncols))
     750             : 
     751             :       ! Convert the result to a column wise distribution
     752           0 :       CALL dbcsr_rep_row_to_rep_col_vec(work_col, work_row, fast_vec_row)
     753             : 
     754             :       ! Create_the final vector by summing it to the result vector which lives on proc_col 0
     755           0 :       CALL dbcsr_iterator_start(iter, vec_out)
     756           0 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     757           0 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res, row_size=row_size)
     758           0 :          prow = hash_table_get(fast_vec_col%hash_table, row)
     759           0 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
     760           0 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*fast_vec_col%blk_map(prow)%ptr(:, :)
     761             :          ELSE
     762           0 :             vec_res(:, :) = beta*vec_res(:, :)
     763             :          END IF
     764             :       END DO
     765           0 :       CALL dbcsr_iterator_stop(iter)
     766             : 
     767           0 :       CALL timestop(handle)
     768             : 
     769           0 :    END SUBROUTINE dbcsr_matrixT_vector_mult
     770             : 
     771             : ! **************************************************************************************************
     772             : !> \brief ...
     773             : !> \param vec_in ...
     774             : !> \param rep_col_vec ...
     775             : !> \param rep_row_vec ...
     776             : !> \param fast_vec_col ...
     777             : ! **************************************************************************************************
     778     6161456 :    SUBROUTINE dbcsr_col_vec_to_rep_row(vec_in, rep_col_vec, rep_row_vec, fast_vec_col)
     779             :       TYPE(dbcsr_type)                                   :: vec_in, rep_col_vec, rep_row_vec
     780             :       TYPE(fast_vec_access_type), INTENT(IN)             :: fast_vec_col
     781             : 
     782             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_col_vec_to_rep_row'
     783             : 
     784             :       INTEGER                                            :: col, handle, mypcol, myprow, ncols, &
     785             :                                                             nrows, pcol_handle, prow_handle, row
     786      770182 :       INTEGER, DIMENSION(:), POINTER                     :: local_cols, row_dist
     787      770182 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec, data_vec_rep
     788      770182 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_row
     789             :       TYPE(dbcsr_distribution_type)                      :: dist_in, dist_rep_col
     790             :       TYPE(dbcsr_iterator_type)                          :: iter
     791             :       TYPE(mp_comm_type)                                 :: pcol_group, prow_group
     792             : 
     793      770182 :       CALL timeset(routineN, handle)
     794             : 
     795             :       ! get information about the parallel environment
     796      770182 :       CALL dbcsr_get_info(vec_in, distribution=dist_in)
     797             :       CALL dbcsr_distribution_get(dist_in, &
     798             :                                   prow_group=prow_handle, &
     799             :                                   pcol_group=pcol_handle, &
     800             :                                   myprow=myprow, &
     801      770182 :                                   mypcol=mypcol)
     802             : 
     803      770182 :       CALL prow_group%set_handle(prow_handle)
     804      770182 :       CALL pcol_group%set_handle(pcol_handle)
     805             : 
     806             :       ! Get the vector which tells us which blocks are local to which processor row in the col vec
     807      770182 :       CALL dbcsr_get_info(rep_col_vec, distribution=dist_rep_col)
     808      770182 :       CALL dbcsr_distribution_get(dist_rep_col, row_dist=row_dist)
     809             : 
     810             :       ! Copy the local vector to the replicated on the first processor column (this is where vec_in lives)
     811      770182 :       CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     812      770182 :       data_vec_rep => dbcsr_get_data_p(rep_col_vec)
     813      770182 :       data_vec => dbcsr_get_data_p(vec_in)
     814    23637774 :       IF (mypcol == 0) data_vec_rep(1:nrows*ncols) = data_vec(1:nrows*ncols)
     815             :       ! Replicate the data along the row
     816    23637774 :       CALL prow_group%bcast(data_vec_rep(1:nrows*ncols), 0)
     817             : 
     818             :       ! Here it gets a bit tricky as we are dealing with two different parallel layouts:
     819             :       ! The rep_col_vec contains all blocks local to the row distribution of the vector.
     820             :       ! The rep_row_vec only needs the fraction which is local to the col distribution.
     821             :       ! However in most cases this won't the complete set of block which can be obtained from col_vector p_row i
     822             :       ! Anyway, as the blocks don't repeat in the col_vec, a different fraction of the row vec will be available
     823             :       ! on every replica in the processor column, by summing along the column we end up with the complete vector everywhere
     824             :       ! Hope this clarifies the idea
     825      770182 :       CALL dbcsr_set(rep_row_vec, 0.0_real_8)
     826      770182 :       CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, local_cols=local_cols, nfullcols_local=ncols)
     827      770182 :       CALL dbcsr_iterator_start(iter, rep_row_vec)
     828     4000258 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     829     3230076 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_row)
     830     4000258 :          IF (row_dist(col) == myprow) THEN
     831    24489730 :             vec_row = TRANSPOSE(fast_vec_col%blk_map(hash_table_get(fast_vec_col%hash_table, col))%ptr)
     832             :          END IF
     833             :       END DO
     834      770182 :       CALL dbcsr_iterator_stop(iter)
     835      770182 :       CALL dbcsr_get_info(matrix=rep_row_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     836      770182 :       data_vec_rep => dbcsr_get_data_p(rep_row_vec)
     837    46368282 :       CALL pcol_group%sum(data_vec_rep(1:ncols*nrows))
     838             : 
     839      770182 :       CALL timestop(handle)
     840             : 
     841      770182 :    END SUBROUTINE dbcsr_col_vec_to_rep_row
     842             : 
     843             : ! **************************************************************************************************
     844             : !> \brief ...
     845             : !> \param rep_col_vec ...
     846             : !> \param rep_row_vec ...
     847             : !> \param fast_vec_row ...
     848             : !> \param fast_vec_col_add ...
     849             : ! **************************************************************************************************
     850     1623126 :    SUBROUTINE dbcsr_rep_row_to_rep_col_vec(rep_col_vec, rep_row_vec, fast_vec_row, fast_vec_col_add)
     851             :       TYPE(dbcsr_type)                                   :: rep_col_vec, rep_row_vec
     852             :       TYPE(fast_vec_access_type)                         :: fast_vec_row
     853             :       TYPE(fast_vec_access_type), OPTIONAL               :: fast_vec_col_add
     854             : 
     855             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_rep_row_to_rep_col_vec'
     856             : 
     857             :       INTEGER                                            :: col, handle, mypcol, myprow, ncols, &
     858             :                                                             nrows, prow_handle, row
     859      270521 :       INTEGER, DIMENSION(:), POINTER                     :: col_dist
     860      270521 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec_rep
     861      270521 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: vec_col
     862             :       TYPE(dbcsr_distribution_type)                      :: dist_rep_col, dist_rep_row
     863             :       TYPE(dbcsr_iterator_type)                          :: iter
     864             :       TYPE(mp_comm_type)                                 :: prow_group
     865             : 
     866      270521 :       CALL timeset(routineN, handle)
     867             : 
     868             :       ! get information about the parallel environment
     869      270521 :       CALL dbcsr_get_info(matrix=rep_col_vec, distribution=dist_rep_col)
     870             :       CALL dbcsr_distribution_get(dist_rep_col, &
     871             :                                   prow_group=prow_handle, &
     872             :                                   myprow=myprow, &
     873      270521 :                                   mypcol=mypcol)
     874             : 
     875      270521 :       CALL prow_group%set_handle(prow_handle)
     876             : 
     877             :       ! Get the vector which tells us which blocks are local to which processor col in the row vec
     878      270521 :       CALL dbcsr_get_info(matrix=rep_row_vec, distribution=dist_rep_row)
     879      270521 :       CALL dbcsr_distribution_get(dist_rep_row, col_dist=col_dist)
     880             : 
     881             :       ! The same trick as described above with opposite direction
     882      270521 :       CALL dbcsr_set(rep_col_vec, 0.0_real_8)
     883      270521 :       CALL dbcsr_iterator_start(iter, rep_col_vec)
     884     1322696 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     885     1052175 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_col)
     886     1052175 :          IF (col_dist(row) == mypcol) THEN
     887     8502261 :             vec_col = TRANSPOSE(fast_vec_row%blk_map(hash_table_get(fast_vec_row%hash_table, row))%ptr)
     888             :          END IF
     889             :          ! this one is special and allows to add the elements of a not yet summed replicated
     890             :          ! column vector as it appears in M*V(row_rep) as result. Save an parallel summation in the symmetric case
     891     1322696 :          IF (PRESENT(fast_vec_col_add)) THEN
     892     8502261 :             vec_col = vec_col + fast_vec_col_add%blk_map(hash_table_get(fast_vec_col_add%hash_table, row))%ptr(:, :)
     893             :          END IF
     894             :       END DO
     895      270521 :       CALL dbcsr_iterator_stop(iter)
     896      270521 :       CALL dbcsr_get_info(matrix=rep_col_vec, nfullrows_local=nrows, nfullcols_local=ncols)
     897      270521 :       data_vec_rep => dbcsr_get_data_p(rep_col_vec)
     898    13066343 :       CALL prow_group%sum(data_vec_rep(1:nrows*ncols))
     899             : 
     900      270521 :       CALL timestop(handle)
     901             : 
     902      270521 :    END SUBROUTINE dbcsr_rep_row_to_rep_col_vec
     903             : 
     904             : ! **************************************************************************************************
     905             : !> \brief ...
     906             : !> \param matrix ...
     907             : !> \param vec_in ...
     908             : !> \param vec_out ...
     909             : !> \param alpha ...
     910             : !> \param beta ...
     911             : !> \param work_row ...
     912             : !> \param work_col ...
     913             : ! **************************************************************************************************
     914      270521 :    SUBROUTINE dbcsr_sym_matrix_vector_mult(matrix, vec_in, vec_out, alpha, beta, work_row, work_col)
     915             :       TYPE(dbcsr_type)                                   :: matrix, vec_in, vec_out
     916             :       REAL(kind=real_8)                                  :: alpha, beta
     917             :       TYPE(dbcsr_type)                                   :: work_row, work_col
     918             : 
     919             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_sym_matrix_vector_mult'
     920             : 
     921             :       INTEGER                                            :: col, handle, handle1, ithread, mypcol, &
     922             :                                                             myprow, ncols, nrows, pcol, &
     923             :                                                             pcol_handle, prow, row, rpcol, rprow, &
     924             :                                                             vec_dim
     925      270521 :       REAL(kind=real_8), DIMENSION(:), POINTER           :: data_vec
     926      270521 :       REAL(kind=real_8), DIMENSION(:, :), POINTER        :: data_d, vec_res
     927             :       TYPE(dbcsr_distribution_type)                      :: dist
     928             :       TYPE(dbcsr_iterator_type)                          :: iter
     929             :       TYPE(dbcsr_type)                                   :: result_col, result_row
     930      270521 :       TYPE(fast_vec_access_type)                         :: fast_vec_col, fast_vec_row, &
     931      270521 :                                                             res_fast_vec_col, res_fast_vec_row
     932             :       TYPE(mp_comm_type)                                 :: pcol_group
     933             : 
     934      270521 :       CALL timeset(routineN, handle)
     935      270521 :       ithread = 0
     936             :       ! We need some work matrices as we try to exploit operations on the replicated vectors which are duplicated otherwise
     937      270521 :       CALL dbcsr_get_info(matrix=vec_in, nfullcols_total=vec_dim)
     938             :       ! This is a performance hack as the new creation of a replicated vector is a fair bit more expensive
     939      270521 :       CALL dbcsr_set(work_col, 0.0_real_8)
     940      270521 :       CALL dbcsr_copy(result_col, work_col)
     941      270521 :       CALL dbcsr_set(work_row, 0.0_real_8)
     942      270521 :       CALL dbcsr_copy(result_row, work_row)
     943             : 
     944             :       ! Collect some data about the parallel environment. We will use them later to move the vector around
     945      270521 :       CALL dbcsr_get_info(matrix=matrix, distribution=dist)
     946      270521 :       CALL dbcsr_distribution_get(dist, pcol_group=pcol_handle, myprow=myprow, mypcol=mypcol)
     947             : 
     948      270521 :       CALL pcol_group%set_handle(pcol_handle)
     949             : 
     950      270521 :       CALL create_fast_row_vec_access(work_row, fast_vec_row)
     951      270521 :       CALL create_fast_col_vec_access(work_col, fast_vec_col)
     952      270521 :       CALL create_fast_row_vec_access(result_row, res_fast_vec_row)
     953      270521 :       CALL create_fast_col_vec_access(result_col, res_fast_vec_col)
     954             : 
     955             :       ! Transfer the correct parts of the input vector to the correct locations so we can do a local multiply
     956      270521 :       CALL dbcsr_col_vec_to_rep_row(vec_in, work_col, work_row, fast_vec_col)
     957             : 
     958             :       ! Probably I should rename the routine above as it delivers both the replicated row and column vector
     959             : 
     960             :       ! Perform the local multiply. Here we exploit, that we have the blocks replicated on the mpi processes
     961             :       ! It is important to note, that the input and result vector are distributed differently (row wise, col wise respectively)
     962      270521 :       CALL timeset(routineN//"_local_mm", handle1)
     963             : 
     964             :       !------ perform the multiplication, we have to take car to take the correct blocks ----------
     965             : 
     966             : !$OMP PARALLEL DEFAULT(NONE) PRIVATE(row,col,iter,data_d,ithread,pcol,prow,rpcol,rprow) &
     967      270521 : !$OMP          SHARED(matrix,fast_vec_row,res_fast_vec_col,res_fast_vec_row,fast_vec_col)
     968             : !$    ithread = omp_get_thread_num()
     969             :       CALL dbcsr_iterator_start(iter, matrix, shared=.FALSE.)
     970             :       DO WHILE (dbcsr_iterator_blocks_left(iter))
     971             :          CALL dbcsr_iterator_next_block(iter, row, col, data_d)
     972             :          pcol = hash_table_get(fast_vec_row%hash_table, col)
     973             :          rprow = hash_table_get(res_fast_vec_col%hash_table, row)
     974             :          IF (ASSOCIATED(fast_vec_row%blk_map(pcol)%ptr) .AND. &
     975             :              ASSOCIATED(res_fast_vec_col%blk_map(rprow)%ptr)) THEN
     976             :             IF (res_fast_vec_col%blk_map(rprow)%assigned_thread .EQ. ithread) THEN
     977             :                res_fast_vec_col%blk_map(rprow)%ptr = res_fast_vec_col%blk_map(rprow)%ptr + &
     978             :                                                      MATMUL(data_d, TRANSPOSE(fast_vec_row%blk_map(pcol)%ptr))
     979             :             END IF
     980             :             prow = hash_table_get(fast_vec_col%hash_table, row)
     981             :             rpcol = hash_table_get(res_fast_vec_row%hash_table, col)
     982             :             IF (res_fast_vec_row%blk_map(rpcol)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
     983             :                res_fast_vec_row%blk_map(rpcol)%ptr = res_fast_vec_row%blk_map(rpcol)%ptr + &
     984             :                                                      MATMUL(TRANSPOSE(fast_vec_col%blk_map(prow)%ptr), data_d)
     985             :             END IF
     986             :          ELSE
     987             :             rpcol = hash_table_get(res_fast_vec_col%hash_table, col)
     988             :             prow = hash_table_get(fast_vec_row%hash_table, row)
     989             :             IF (res_fast_vec_col%blk_map(rpcol)%assigned_thread .EQ. ithread) THEN
     990             :                res_fast_vec_col%blk_map(rpcol)%ptr = res_fast_vec_col%blk_map(rpcol)%ptr + &
     991             :                                                      TRANSPOSE(MATMUL(fast_vec_row%blk_map(prow)%ptr, data_d))
     992             :             END IF
     993             :             rprow = hash_table_get(res_fast_vec_row%hash_table, row)
     994             :             pcol = hash_table_get(fast_vec_col%hash_table, col)
     995             :             IF (res_fast_vec_row%blk_map(rprow)%assigned_thread .EQ. ithread .AND. row .NE. col) THEN
     996             :                res_fast_vec_row%blk_map(rprow)%ptr = res_fast_vec_row%blk_map(rprow)%ptr + &
     997             :                                                      TRANSPOSE(MATMUL(data_d, fast_vec_col%blk_map(pcol)%ptr))
     998             :             END IF
     999             :          END IF
    1000             :       END DO
    1001             :       CALL dbcsr_iterator_stop(iter)
    1002             : !$OMP END PARALLEL
    1003             : 
    1004      270521 :       CALL timestop(handle1)
    1005             : 
    1006             :       ! sum all the data within a processor column to obtain the replicated result from lower
    1007      270521 :       data_vec => dbcsr_get_data_p(result_row)
    1008      270521 :       CALL dbcsr_get_info(matrix=result_row, nfullrows_local=nrows, nfullcols_local=ncols)
    1009             : 
    1010    25752247 :       CALL pcol_group%sum(data_vec(1:nrows*ncols))
    1011             : 
    1012             :       ! Convert the results to a column wise distribution, this is a bit involved as the result_row is fully replicated
    1013             :       ! While the result_col still has the partial results in parallel. The routine below takes care of that and saves a
    1014             :       ! parallel summation. Of the res_row vectors are created only taking the appropriate element (0 otherwise) while the res_col
    1015             :       ! parallel bits are locally added. The sum magically creates the correct vector
    1016      270521 :       CALL dbcsr_rep_row_to_rep_col_vec(work_col, result_row, res_fast_vec_row, res_fast_vec_col)
    1017             : 
    1018             :       ! Create_the final vector by summing it to the result vector which lives on proc_col 0 lower
    1019      270521 :       CALL dbcsr_iterator_start(iter, vec_out)
    1020     1322696 :       DO WHILE (dbcsr_iterator_blocks_left(iter))
    1021     1052175 :          CALL dbcsr_iterator_next_block(iter, row, col, vec_res)
    1022     1052175 :          prow = hash_table_get(fast_vec_col%hash_table, row)
    1023     1322696 :          IF (ASSOCIATED(fast_vec_col%blk_map(prow)%ptr)) THEN
    1024     8502261 :             vec_res(:, :) = beta*vec_res(:, :) + alpha*(fast_vec_col%blk_map(prow)%ptr(:, :))
    1025             :          ELSE
    1026           0 :             vec_res(:, :) = beta*vec_res(:, :)
    1027             :          END IF
    1028             :       END DO
    1029      270521 :       CALL dbcsr_iterator_stop(iter)
    1030             : 
    1031      270521 :       CALL release_fast_vec_access(fast_vec_row)
    1032      270521 :       CALL release_fast_vec_access(fast_vec_col)
    1033      270521 :       CALL release_fast_vec_access(res_fast_vec_row)
    1034      270521 :       CALL release_fast_vec_access(res_fast_vec_col)
    1035             : 
    1036      270521 :       CALL dbcsr_release(result_row); CALL dbcsr_release(result_col)
    1037             : 
    1038      270521 :       CALL timestop(handle)
    1039             : 
    1040      541042 :    END SUBROUTINE dbcsr_sym_matrix_vector_mult
    1041             : 
    1042           0 : END MODULE arnoldi_vector

Generated by: LCOV version 1.15