LCOV - code coverage report
Current view: top level - src/fm - cp_fm_types.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:b8e0b09) Lines: 702 822 85.4 %
Date: 2024-08-31 06:31:37 Functions: 36 45 80.0 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief represent a full matrix distributed on many processors
      10             : !> \par History
      11             : !>      3) separated structure object, removed globenv, renamed to full matrix
      12             : !>         many changes (fawzi 08.2002)
      13             : !> \author Matthias Krack (22.05.2001)
      14             : ! **************************************************************************************************
      15             : MODULE cp_fm_types
      16             :    USE cp_blacs_env,                    ONLY: cp_blacs_env_type
      17             :    USE cp_blacs_types,                  ONLY: cp_blacs_type
      18             :    USE cp_fm_struct,                    ONLY: cp_fm_struct_equivalent,&
      19             :                                               cp_fm_struct_get,&
      20             :                                               cp_fm_struct_release,&
      21             :                                               cp_fm_struct_retain,&
      22             :                                               cp_fm_struct_type,&
      23             :                                               cp_fm_struct_write_info
      24             :    USE kinds,                           ONLY: dp,&
      25             :                                               sp
      26             :    USE message_passing,                 ONLY: cp2k_is_parallel,&
      27             :                                               mp_any_source,&
      28             :                                               mp_para_env_type,&
      29             :                                               mp_proc_null,&
      30             :                                               mp_request_null,&
      31             :                                               mp_request_type,&
      32             :                                               mp_waitall
      33             :    USE parallel_rng_types,              ONLY: UNIFORM,&
      34             :                                               rng_stream_type
      35             : #include "../base/base_uses.f90"
      36             : 
      37             :    IMPLICIT NONE
      38             : 
      39             :    PRIVATE
      40             : 
      41             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'cp_fm_types'
      42             :    LOGICAL, PARAMETER          :: debug_this_module = .TRUE.
      43             :    INTEGER, PARAMETER :: src_tag = 3, dest_tag = 5, send_tag = 7, recv_tag = 11
      44             : 
      45             :    INTEGER, PRIVATE :: mm_type = 1
      46             : 
      47             :    PUBLIC :: cp_fm_type, &
      48             :              cp_fm_p_type, copy_info_type
      49             : 
      50             :    PUBLIC :: cp_fm_add_to_element, &
      51             :              cp_fm_create, &
      52             :              cp_fm_release, &
      53             :              cp_fm_get_info, &
      54             :              cp_fm_set_element, &
      55             :              cp_fm_get_element, &
      56             :              cp_fm_get_diag, & ! get diagonal
      57             :              cp_fm_set_all, & ! set all elements and diagonal
      58             :              cp_fm_set_submatrix, & ! set a submatrix to given values
      59             :              cp_fm_get_submatrix, & ! get a submatrix of given values
      60             :              cp_fm_init_random, &
      61             :              cp_fm_maxabsval, & ! find the maximum absolute value
      62             :              cp_fm_maxabsrownorm, & ! find the maximum of the sum of the abs of the elements of a row
      63             :              cp_fm_to_fm, & ! copy (parts of) a fm to a fm
      64             :              cp_fm_vectorsnorm, & ! compute the norm of the column-vectors
      65             :              cp_fm_vectorssum, & ! compute the sum of all elements of the column-vectors
      66             :              cp_fm_to_fm_submat, & ! copy (parts of) a fm to a fm
      67             :              cp_fm_to_fm_triangular, &
      68             :              cp_fm_copy_general, &
      69             :              cp_fm_start_copy_general, &
      70             :              cp_fm_finish_copy_general, &
      71             :              cp_fm_cleanup_copy_general, &
      72             :              cp_fm_write_unformatted, & ! writes a full matrix to an open unit
      73             :              cp_fm_write_formatted, & ! writes a full matrix to an open unit
      74             :              cp_fm_read_unformatted, & ! reads a full matrix from an open unit
      75             :              cp_fm_setup, & ! allows to set flags for fms
      76             :              cp_fm_get_mm_type, &
      77             :              cp_fm_write_info, &
      78             :              cp_fm_to_fm_submat_general ! copy matrix across different contexts
      79             : 
      80             :    PUBLIC :: cp_fm_pilaenv
      81             : 
      82             :    INTERFACE cp_fm_to_fm
      83             :       MODULE PROCEDURE cp_fm_to_fm_matrix, & ! a full matrix
      84             :          cp_fm_to_fm_columns ! just a number of columns
      85             :    END INTERFACE
      86             : 
      87             :    INTERFACE cp_fm_release
      88             :       MODULE PROCEDURE cp_fm_release_aa0, &
      89             :          cp_fm_release_aa1, &
      90             :          cp_fm_release_aa2, &
      91             :          cp_fm_release_aa3, &
      92             :          cp_fm_release_ap1, &
      93             :          cp_fm_release_ap2, &
      94             :          cp_fm_release_pa1, &
      95             :          cp_fm_release_pa2, &
      96             :          cp_fm_release_pa3, &
      97             :          cp_fm_release_pp1, &
      98             :          cp_fm_release_pp2
      99             :    END INTERFACE
     100             : 
     101             : ! **************************************************************************************************
     102             : !> \brief represent a full matrix
     103             : !> \param name the name of the matrix, used for printing
     104             : !> \param matrix_struct structure of this matrix
     105             : !> \param local_data array with the data of the matrix (its contents
     106             : !>        depend on the matrix type used: in parallel runs it will be
     107             : !>        in scalapack format, in sequential, it will simply contain
     108             : !>        the matrix)
     109             : !> \par History
     110             : !>      08.2002 created [fawzi]
     111             : !> \author fawzi
     112             : ! **************************************************************************************************
     113             :    TYPE cp_fm_type
     114             : !    PRIVATE
     115             :       CHARACTER(LEN=60) :: name = ""
     116             :       LOGICAL :: use_sp = .FALSE.
     117             :       TYPE(cp_fm_struct_type), POINTER :: matrix_struct => NULL()
     118             :       REAL(KIND=dp), DIMENSION(:, :), POINTER, CONTIGUOUS :: local_data => NULL()
     119             :       REAL(KIND=sp), DIMENSION(:, :), POINTER, CONTIGUOUS :: local_data_sp => NULL()
     120             :    END TYPE cp_fm_type
     121             : 
     122             : ! **************************************************************************************************
     123             : !> \brief just to build arrays of pointers to matrices
     124             : !> \param matrix the pointer to the matrix
     125             : !> \par History
     126             : !>      08.2002 created [fawzi]
     127             : !> \author fawzi
     128             : ! **************************************************************************************************
     129             :    TYPE cp_fm_p_type
     130             :       TYPE(cp_fm_type), POINTER :: matrix => NULL()
     131             :    END TYPE cp_fm_p_type
     132             : 
     133             : ! **************************************************************************************************
     134             : !> \brief Stores the state of a copy between cp_fm_start_copy_general
     135             : !>        and cp_fm_finish_copy_general
     136             : !> \par History
     137             : !>      Jan 2017  [Mark T]
     138             : ! **************************************************************************************************
     139             :    TYPE copy_info_type
     140             :       INTEGER :: send_size = -1
     141             :       INTEGER, DIMENSION(2) :: nlocal_recv = -1, nblock_src = -1, src_num_pe = -1 ! 1->row  2->col
     142             :       TYPE(mp_request_type), DIMENSION(:), ALLOCATABLE :: send_request, recv_request
     143             :       INTEGER, DIMENSION(:), ALLOCATABLE   :: recv_disp
     144             :       INTEGER, DIMENSION(:), POINTER       :: recv_col_indices => NULL(), recv_row_indices => NULL()
     145             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: src_blacs2mpi
     146             :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE :: recv_buf, send_buf
     147             :    END TYPE copy_info_type
     148             : 
     149             : CONTAINS
     150             : 
     151             : ! **************************************************************************************************
     152             : !> \brief creates a new full matrix with the given structure
     153             : !> \param matrix the matrix to be created
     154             : !> \param matrix_struct the structure of matrix
     155             : !> \param name ...
     156             : !> \param use_sp ...
     157             : !> \par History
     158             : !>      08.2002 created [fawzi]
     159             : !> \author Fawzi Mohamed
     160             : !> \note
     161             : !>      preferred allocation routine
     162             : ! **************************************************************************************************
     163     1231422 :    SUBROUTINE cp_fm_create(matrix, matrix_struct, name, use_sp)
     164             :       TYPE(cp_fm_type), INTENT(OUT)                      :: matrix
     165             :       TYPE(cp_fm_struct_type), INTENT(IN), TARGET        :: matrix_struct
     166             :       CHARACTER(len=*), INTENT(in), OPTIONAL             :: name
     167             :       LOGICAL, INTENT(in), OPTIONAL                      :: use_sp
     168             : 
     169             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'cp_fm_create'
     170             : 
     171             :       INTEGER                                            :: handle, ncol_local, npcol, nprow, &
     172             :                                                             nrow_local
     173             :       TYPE(cp_blacs_env_type), POINTER                   :: context
     174             : 
     175     1231422 :       CALL timeset(routineN, handle)
     176             : 
     177     1231422 :       context => matrix_struct%context
     178     1231422 :       matrix%matrix_struct => matrix_struct
     179     1231422 :       CALL cp_fm_struct_retain(matrix%matrix_struct)
     180             : 
     181     1231422 :       matrix%use_sp = .FALSE.
     182     1231422 :       IF (PRESENT(use_sp)) matrix%use_sp = use_sp
     183             : 
     184     1231422 :       nprow = context%num_pe(1)
     185     1231422 :       npcol = context%num_pe(2)
     186     1231422 :       NULLIFY (matrix%local_data)
     187     1231422 :       NULLIFY (matrix%local_data_sp)
     188             : 
     189             :       ! OK, we allocate here at least a 1 x 1 matrix
     190             :       ! this must (and is) compatible with the descinit call
     191             :       ! in cp_fm_struct
     192     1231422 :       nrow_local = matrix_struct%local_leading_dimension
     193     1231422 :       ncol_local = MAX(1, matrix_struct%ncol_locals(context%mepos(2)))
     194     1231422 :       IF (matrix%use_sp) THEN
     195           0 :          ALLOCATE (matrix%local_data_sp(nrow_local, ncol_local))
     196             :       ELSE
     197     4925688 :          ALLOCATE (matrix%local_data(nrow_local, ncol_local))
     198             :       END IF
     199             : 
     200             :       ! JVDV we should remove this, as it is up to the user to zero afterwards
     201     1231422 :       IF (matrix%use_sp) THEN
     202           0 :          matrix%local_data_sp(1:nrow_local, 1:ncol_local) = 0.0_sp
     203             :       ELSE
     204   606924264 :          matrix%local_data(1:nrow_local, 1:ncol_local) = 0.0_dp
     205             :       END IF
     206             : 
     207     1231422 :       IF (PRESENT(name)) THEN
     208      621797 :          matrix%name = name
     209             :       ELSE
     210      609625 :          matrix%name = 'full matrix'
     211             :       END IF
     212             : 
     213     1231422 :       CALL timestop(handle)
     214             : 
     215     1231422 :    END SUBROUTINE cp_fm_create
     216             : 
     217             : ! **************************************************************************************************
     218             : !> \brief releases a full matrix
     219             : !> \param matrix the matrix to release
     220             : !> \par History
     221             : !>      08.2002 created [fawzi]
     222             : !> \author Fawzi Mohamed
     223             : ! **************************************************************************************************
     224     1242366 :    SUBROUTINE cp_fm_release_aa0(matrix)
     225             :       TYPE(cp_fm_type), INTENT(INOUT)                    :: matrix
     226             : 
     227     1242366 :       IF (ASSOCIATED(matrix%local_data)) THEN
     228     1230734 :          DEALLOCATE (matrix%local_data)
     229             :          NULLIFY (matrix%local_data)
     230             :       END IF
     231     1242366 :       IF (ASSOCIATED(matrix%local_data_sp)) THEN
     232           0 :          DEALLOCATE (matrix%local_data_sp)
     233             :          NULLIFY (matrix%local_data_sp)
     234             :       END IF
     235     1242366 :       matrix%name = ""
     236     1242366 :       CALL cp_fm_struct_release(matrix%matrix_struct)
     237             : 
     238     1242366 :    END SUBROUTINE cp_fm_release_aa0
     239             : 
     240             : ! **************************************************************************************************
     241             : !> \brief ...
     242             : !> \param matrices ...
     243             : ! **************************************************************************************************
     244       71330 :    SUBROUTINE cp_fm_release_aa1(matrices)
     245             :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:)        :: matrices
     246             : 
     247             :       INTEGER                                            :: i
     248             : 
     249       71330 :       IF (ALLOCATED(matrices)) THEN
     250      193419 :          DO i = 1, SIZE(matrices)
     251      193419 :             CALL cp_fm_release(matrices(i))
     252             :          END DO
     253       70246 :          DEALLOCATE (matrices)
     254             :       END IF
     255       71330 :    END SUBROUTINE cp_fm_release_aa1
     256             : 
     257             : ! **************************************************************************************************
     258             : !> \brief ...
     259             : !> \param matrices ...
     260             : ! **************************************************************************************************
     261        7812 :    SUBROUTINE cp_fm_release_aa2(matrices)
     262             :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :)     :: matrices
     263             : 
     264             :       INTEGER                                            :: i, j
     265             : 
     266        7812 :       IF (ALLOCATED(matrices)) THEN
     267       18916 :          DO i = 1, SIZE(matrices, 1)
     268       50836 :             DO j = 1, SIZE(matrices, 2)
     269       45292 :                CALL cp_fm_release(matrices(i, j))
     270             :             END DO
     271             :          END DO
     272        5544 :          DEALLOCATE (matrices)
     273             :       END IF
     274        7812 :    END SUBROUTINE cp_fm_release_aa2
     275             : 
     276             : ! **************************************************************************************************
     277             : !> \brief ...
     278             : !> \param matrices ...
     279             : ! **************************************************************************************************
     280          30 :    SUBROUTINE cp_fm_release_aa3(matrices)
     281             :       TYPE(cp_fm_type), ALLOCATABLE, DIMENSION(:, :, :)  :: matrices
     282             : 
     283             :       INTEGER                                            :: i, j, k
     284             : 
     285          30 :       IF (ALLOCATED(matrices)) THEN
     286         202 :          DO i = 1, SIZE(matrices, 1)
     287         574 :             DO j = 1, SIZE(matrices, 2)
     288        1076 :                DO k = 1, SIZE(matrices, 3)
     289         904 :                   CALL cp_fm_release(matrices(i, j, k))
     290             :                END DO
     291             :             END DO
     292             :          END DO
     293          30 :          DEALLOCATE (matrices)
     294             :       END IF
     295          30 :    END SUBROUTINE cp_fm_release_aa3
     296             : 
     297             : ! **************************************************************************************************
     298             : !> \brief ...
     299             : !> \param matrices ...
     300             : ! **************************************************************************************************
     301      125510 :    SUBROUTINE cp_fm_release_pa1(matrices)
     302             :       TYPE(cp_fm_type), DIMENSION(:), POINTER            :: matrices
     303             : 
     304             :       INTEGER                                            :: i
     305             : 
     306      125510 :       IF (ASSOCIATED(matrices)) THEN
     307      114205 :          DO i = 1, SIZE(matrices)
     308      114205 :             CALL cp_fm_release(matrices(i))
     309             :          END DO
     310       48830 :          DEALLOCATE (matrices)
     311             :          NULLIFY (matrices)
     312             :       END IF
     313      125510 :    END SUBROUTINE cp_fm_release_pa1
     314             : 
     315             : ! **************************************************************************************************
     316             : !> \brief ...
     317             : !> \param matrices ...
     318             : ! **************************************************************************************************
     319       19472 :    SUBROUTINE cp_fm_release_pa2(matrices)
     320             :       TYPE(cp_fm_type), DIMENSION(:, :), POINTER         :: matrices
     321             : 
     322             :       INTEGER                                            :: i, j
     323             : 
     324       19472 :       IF (ASSOCIATED(matrices)) THEN
     325       43570 :          DO i = 1, SIZE(matrices, 1)
     326       85764 :             DO j = 1, SIZE(matrices, 2)
     327       74996 :                CALL cp_fm_release(matrices(i, j))
     328             :             END DO
     329             :          END DO
     330       10768 :          DEALLOCATE (matrices)
     331             :          NULLIFY (matrices)
     332             :       END IF
     333       19472 :    END SUBROUTINE cp_fm_release_pa2
     334             : 
     335             : ! **************************************************************************************************
     336             : !> \brief ...
     337             : !> \param matrices ...
     338             : ! **************************************************************************************************
     339           0 :    SUBROUTINE cp_fm_release_pa3(matrices)
     340             :       TYPE(cp_fm_type), DIMENSION(:, :, :), POINTER      :: matrices
     341             : 
     342             :       INTEGER                                            :: i, j, k
     343             : 
     344           0 :       IF (ASSOCIATED(matrices)) THEN
     345           0 :          DO i = 1, SIZE(matrices, 1)
     346           0 :             DO j = 1, SIZE(matrices, 2)
     347           0 :                DO k = 1, SIZE(matrices, 3)
     348           0 :                   CALL cp_fm_release(matrices(i, j, k))
     349             :                END DO
     350             :             END DO
     351             :          END DO
     352           0 :          DEALLOCATE (matrices)
     353             :          NULLIFY (matrices)
     354             :       END IF
     355           0 :    END SUBROUTINE cp_fm_release_pa3
     356             : 
     357             : ! **************************************************************************************************
     358             : !> \brief ...
     359             : !> \param matrices ...
     360             : ! **************************************************************************************************
     361           0 :    SUBROUTINE cp_fm_release_ap1(matrices)
     362             :       TYPE(cp_fm_p_type), ALLOCATABLE, DIMENSION(:)      :: matrices
     363             : 
     364             :       INTEGER                                            :: i
     365             : 
     366           0 :       IF (ALLOCATED(matrices)) THEN
     367           0 :          DO i = 1, SIZE(matrices)
     368           0 :             CALL cp_fm_release(matrices(i)%matrix)
     369           0 :             DEALLOCATE (matrices(i)%matrix)
     370             :          END DO
     371           0 :          DEALLOCATE (matrices)
     372             :       END IF
     373           0 :    END SUBROUTINE cp_fm_release_ap1
     374             : 
     375             : ! **************************************************************************************************
     376             : !> \brief ...
     377             : !> \param matrices ...
     378             : ! **************************************************************************************************
     379           0 :    SUBROUTINE cp_fm_release_ap2(matrices)
     380             :       TYPE(cp_fm_p_type), ALLOCATABLE, DIMENSION(:, :)   :: matrices
     381             : 
     382             :       INTEGER                                            :: i, j
     383             : 
     384           0 :       IF (ALLOCATED(matrices)) THEN
     385           0 :          DO i = 1, SIZE(matrices, 1)
     386           0 :             DO j = 1, SIZE(matrices, 2)
     387           0 :                CALL cp_fm_release(matrices(i, j)%matrix)
     388           0 :                DEALLOCATE (matrices(i, j)%matrix)
     389             :             END DO
     390             :          END DO
     391           0 :          DEALLOCATE (matrices)
     392             :       END IF
     393           0 :    END SUBROUTINE cp_fm_release_ap2
     394             : 
     395             : ! **************************************************************************************************
     396             : !> \brief ...
     397             : !> \param matrices ...
     398             : ! **************************************************************************************************
     399           0 :    SUBROUTINE cp_fm_release_pp1(matrices)
     400             :       TYPE(cp_fm_p_type), DIMENSION(:), POINTER          :: matrices
     401             : 
     402             :       INTEGER                                            :: i
     403             : 
     404           0 :       IF (ASSOCIATED(matrices)) THEN
     405           0 :          DO i = 1, SIZE(matrices)
     406           0 :             CALL cp_fm_release(matrices(i)%matrix)
     407           0 :             DEALLOCATE (matrices(i)%matrix)
     408             :          END DO
     409           0 :          DEALLOCATE (matrices)
     410             :          NULLIFY (matrices)
     411             :       END IF
     412           0 :    END SUBROUTINE cp_fm_release_pp1
     413             : 
     414             : ! **************************************************************************************************
     415             : !> \brief ...
     416             : !> \param matrices ...
     417             : ! **************************************************************************************************
     418           0 :    SUBROUTINE cp_fm_release_pp2(matrices)
     419             :       TYPE(cp_fm_p_type), DIMENSION(:, :), POINTER       :: matrices
     420             : 
     421             :       INTEGER                                            :: i, j
     422             : 
     423           0 :       IF (ASSOCIATED(matrices)) THEN
     424           0 :          DO i = 1, SIZE(matrices, 1)
     425           0 :             DO j = 1, SIZE(matrices, 2)
     426           0 :                CALL cp_fm_release(matrices(i, j)%matrix)
     427           0 :                DEALLOCATE (matrices(i, j)%matrix)
     428             :             END DO
     429             :          END DO
     430           0 :          DEALLOCATE (matrices)
     431             :          NULLIFY (matrices)
     432             :       END IF
     433           0 :    END SUBROUTINE cp_fm_release_pp2
     434             : 
     435             : ! **************************************************************************************************
     436             : !> \brief fills a matrix with random numbers
     437             : !> \param matrix : to be initialized
     438             : !> \param ncol : numbers of cols to fill
     439             : !> \param start_col : starting at coll number
     440             : !> \author Joost VandeVondele
     441             : !> \note
     442             : !>      the value of a_ij is independent of the number of cpus
     443             : ! **************************************************************************************************
     444        2612 :    SUBROUTINE cp_fm_init_random(matrix, ncol, start_col)
     445             :       TYPE(cp_fm_type), INTENT(IN)                       :: matrix
     446             :       INTEGER, INTENT(IN), OPTIONAL                      :: ncol, start_col
     447             : 
     448             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'cp_fm_init_random'
     449             : 
     450             :       INTEGER :: handle, icol_global, icol_local, irow_local, my_ncol, my_start_col, ncol_global, &
     451             :          ncol_local, nrow_global, nrow_local
     452        5224 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
     453        2612 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: buff
     454             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
     455        2612 :          POINTER                                         :: local_data
     456             :       REAL(KIND=dp), DIMENSION(3, 2), SAVE :: &
     457             :          seed = RESHAPE((/1.0_dp, 2.0_dp, 3.0_dp, 4.0_dp, 5.0_dp, 6.0_dp/), (/3, 2/))
     458             :       TYPE(rng_stream_type)                              :: rng
     459             : 
     460        2612 :       CALL timeset(routineN, handle)
     461             : 
     462             :       ! guarantee same seed over all tasks
     463        2612 :       CALL matrix%matrix_struct%para_env%bcast(seed, 0)
     464             : 
     465             :       rng = rng_stream_type("cp_fm_init_random_stream", distribution_type=UNIFORM, &
     466        2612 :                             extended_precision=.TRUE., seed=seed)
     467             : 
     468        2612 :       CPASSERT(.NOT. matrix%use_sp)
     469             : 
     470             :       CALL cp_fm_get_info(matrix, nrow_global=nrow_global, ncol_global=ncol_global, &
     471             :                           nrow_local=nrow_local, ncol_local=ncol_local, &
     472             :                           local_data=local_data, &
     473        2612 :                           row_indices=row_indices, col_indices=col_indices)
     474             : 
     475        2612 :       my_start_col = 1
     476        2612 :       IF (PRESENT(start_col)) my_start_col = start_col
     477        2612 :       my_ncol = matrix%matrix_struct%ncol_global
     478        2612 :       IF (PRESENT(ncol)) my_ncol = ncol
     479             : 
     480        2612 :       IF (ncol_global < (my_start_col + my_ncol - 1)) &
     481           0 :          CPABORT("ncol_global>=(my_start_col+my_ncol-1)")
     482             : 
     483        7836 :       ALLOCATE (buff(nrow_global))
     484             : 
     485             :       ! each global row has its own substream, in order to reach the stream for the local col,
     486             :       ! we just reset to the next substream
     487             :       ! following this, we fill the full buff with random numbers, and pick those we need
     488        2612 :       icol_global = 0
     489       21379 :       DO icol_local = 1, ncol_local
     490       18767 :          CPASSERT(col_indices(icol_local) > icol_global)
     491             :          DO
     492       18767 :             CALL rng%reset_to_next_substream()
     493       18767 :             icol_global = icol_global + 1
     494       18767 :             IF (icol_global == col_indices(icol_local)) EXIT
     495             :          END DO
     496       18767 :          CALL rng%fill(buff)
     497      786013 :          DO irow_local = 1, nrow_local
     498      783401 :             local_data(irow_local, icol_local) = buff(row_indices(irow_local))
     499             :          END DO
     500             :       END DO
     501             : 
     502        2612 :       DEALLOCATE (buff)
     503             : 
     504             :       ! store seed before deletion (unclear if this is the proper seed)
     505             : 
     506             :       ! Note that, the initial state (rng%ig) instead of the current state (rng%cg) is stored in the
     507             :       ! seed variable. As a consequence, each invocation of cp_fm_init_random uses exactly the same
     508             :       ! stream of random numbers. While this seems odd and contrary to the original design,
     509             :       ! it was probably introduced to improve reproducibility.
     510             :       ! See also https://github.com/cp2k/cp2k/pull/506
     511        2612 :       CALL rng%get(ig=seed)
     512             : 
     513        2612 :       CALL timestop(handle)
     514             : 
     515       73136 :    END SUBROUTINE cp_fm_init_random
     516             : 
     517             : ! **************************************************************************************************
     518             : !> \brief set all elements of a matrix to the same value,
     519             : !>      and optionally the diagonal to a different one
     520             : !> \param matrix input matrix
     521             : !> \param alpha scalar used to set all elements of the matrix
     522             : !> \param beta scalar used to set diagonal of the matrix
     523             : !> \note
     524             : !>      can be used to zero a matrix
     525             : !>      can be used to create a unit matrix (I-matrix) alpha=0.0_dp beta=1.0_dp
     526             : ! **************************************************************************************************
     527      312426 :    SUBROUTINE cp_fm_set_all(matrix, alpha, beta)
     528             : 
     529             :       TYPE(cp_fm_type), INTENT(IN)                       :: matrix
     530             :       REAL(KIND=dp), INTENT(IN)                          :: alpha
     531             :       REAL(KIND=dp), INTENT(IN), OPTIONAL                :: beta
     532             : 
     533             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'cp_fm_set_all'
     534             : 
     535             :       INTEGER                                            :: handle, i, n
     536             : 
     537      312426 :       CALL timeset(routineN, handle)
     538             : 
     539      312426 :       IF (matrix%use_sp) THEN
     540           0 :          matrix%local_data_sp(:, :) = REAL(alpha, sp)
     541             :       ELSE
     542   176289309 :          matrix%local_data(:, :) = alpha
     543             :       END IF
     544             : 
     545      312426 :       IF (PRESENT(beta)) THEN
     546       47420 :          CPASSERT(.NOT. matrix%use_sp)
     547       47420 :          n = MIN(matrix%matrix_struct%nrow_global, matrix%matrix_struct%ncol_global)
     548      458838 :          DO i = 1, n
     549      458838 :             CALL cp_fm_set_element(matrix, i, i, beta)
     550             :          END DO
     551             :       END IF
     552             : 
     553      312426 :       CALL timestop(handle)
     554             : 
     555      312426 :    END SUBROUTINE cp_fm_set_all
     556             : 
     557             : ! **************************************************************************************************
     558             : !> \brief returns the diagonal elements of a fm
     559             : !> \param matrix ...
     560             : !> \param diag ...
     561             : ! **************************************************************************************************
     562       15990 :    SUBROUTINE cp_fm_get_diag(matrix, diag)
     563             : 
     564             :       ! arguments
     565             :       TYPE(cp_fm_type), INTENT(IN)             :: matrix
     566             :       REAL(KIND=dp), DIMENSION(:), INTENT(OUT) :: diag
     567             : 
     568             :       ! locals
     569             :       INTEGER :: i, nrow_global
     570             : 
     571             : #if defined(__parallel)
     572             :       INTEGER, DIMENSION(9) :: desca
     573             :       TYPE(cp_blacs_env_type), POINTER :: context
     574             :       INTEGER :: icol_local, ipcol, iprow, irow_local, mypcol, myprow, npcol, &
     575             :                  nprow
     576       15990 :       REAL(KIND=dp), DIMENSION(:, :), POINTER :: a
     577       15990 :       REAL(KIND=sp), DIMENSION(:, :), POINTER :: a_sp
     578             : #endif
     579             : 
     580       15990 :       CALL cp_fm_get_info(matrix, nrow_global=nrow_global)
     581             : 
     582             : #if defined(__parallel)
     583      200640 :       diag = 0.0_dp
     584       15990 :       context => matrix%matrix_struct%context
     585       15990 :       myprow = context%mepos(1)
     586       15990 :       mypcol = context%mepos(2)
     587       15990 :       nprow = context%num_pe(1)
     588       15990 :       npcol = context%num_pe(2)
     589             : 
     590       15990 :       a => matrix%local_data
     591       15990 :       a_sp => matrix%local_data_sp
     592      159900 :       desca(:) = matrix%matrix_struct%descriptor(:)
     593             : 
     594      200640 :       DO i = 1, nrow_global
     595             :          CALL infog2l(i, i, desca, nprow, npcol, myprow, mypcol, &
     596      184650 :                       irow_local, icol_local, iprow, ipcol)
     597      200640 :          IF ((iprow == myprow) .AND. (ipcol == mypcol)) THEN
     598       92409 :             IF (matrix%use_sp) THEN
     599           0 :                diag(i) = REAL(a_sp(irow_local, icol_local), dp)
     600             :             ELSE
     601       92409 :                diag(i) = a(irow_local, icol_local)
     602             :             END IF
     603             :          END IF
     604             :       END DO
     605             : #else
     606             :       DO i = 1, nrow_global
     607             :          IF (matrix%use_sp) THEN
     608             :             diag(i) = REAL(matrix%local_data_sp(i, i), dp)
     609             :          ELSE
     610             :             diag(i) = matrix%local_data(i, i)
     611             :          END IF
     612             :       END DO
     613             : #endif
     614      385290 :       CALL matrix%matrix_struct%para_env%sum(diag)
     615             : 
     616       15990 :    END SUBROUTINE cp_fm_get_diag
     617             : 
     618             : ! **************************************************************************************************
     619             : !> \brief returns an element of a fm
     620             : !>      this value is valid on every cpu
     621             : !>      using this call is expensive
     622             : !> \param matrix the matrix to read
     623             : !> \param irow_global the row
     624             : !> \param icol_global the col
     625             : !> \param alpha the value of matrix(irow_global, icol_global)
     626             : !> \param local true if the element is on this cpu, false otherwise
     627             : !> \note
     628             : !>      - modified semantics. now this function always returns the value
     629             : !>        previously the value was zero on cpus that didn't own the relevant
     630             : !>        part of the matrix (Joost VandeVondele, May 2003)
     631             : !>      - usage of the function should be avoided, as it is likely to rather slow
     632             : !>        using row_indices/col_indices/local_data + some smart scheme normally
     633             : !>        yields a real parallel code
     634             : ! **************************************************************************************************
     635     2474636 :    SUBROUTINE cp_fm_get_element(matrix, irow_global, icol_global, alpha, local)
     636             : 
     637             :       ! arguments
     638             :       TYPE(cp_fm_type), INTENT(IN)          :: matrix
     639             :       REAL(KIND=dp), INTENT(OUT)                     :: alpha
     640             :       INTEGER, INTENT(IN)                       :: icol_global, &
     641             :                                                    irow_global
     642             :       LOGICAL, INTENT(OUT), OPTIONAL            :: local
     643             : 
     644             :       ! locals
     645             : #if defined(__parallel)
     646             :       INTEGER, DIMENSION(9) :: desca
     647             :       TYPE(cp_blacs_env_type), POINTER :: context
     648             :       INTEGER :: icol_local, ipcol, iprow, irow_local, mypcol, myprow, npcol, &
     649             :                  nprow
     650     2474636 :       REAL(KIND=dp), DIMENSION(:, :), POINTER :: a
     651             : #endif
     652             : 
     653             : #if defined(__parallel)
     654     2474636 :       context => matrix%matrix_struct%context
     655     2474636 :       myprow = context%mepos(1)
     656     2474636 :       mypcol = context%mepos(2)
     657     2474636 :       nprow = context%num_pe(1)
     658     2474636 :       npcol = context%num_pe(2)
     659             : 
     660     2474636 :       a => matrix%local_data
     661    24746360 :       desca(:) = matrix%matrix_struct%descriptor(:)
     662             : 
     663             :       CALL infog2l(irow_global, icol_global, desca, nprow, npcol, myprow, mypcol, &
     664     2474636 :                    irow_local, icol_local, iprow, ipcol)
     665             : 
     666     2474636 :       IF ((iprow == myprow) .AND. (ipcol == mypcol)) THEN
     667     1237318 :          alpha = a(irow_local, icol_local)
     668     1237318 :          CALL context%dgebs2d('All', ' ', 1, 1, alpha, 1)
     669     1237318 :          IF (PRESENT(local)) local = .TRUE.
     670             :       ELSE
     671     1237318 :          CALL context%dgebr2d('All', ' ', 1, 1, alpha, 1, iprow, ipcol)
     672     1237318 :          IF (PRESENT(local)) local = .FALSE.
     673             :       END IF
     674             : 
     675             : #else
     676             :       IF (PRESENT(local)) local = .TRUE.
     677             :       alpha = matrix%local_data(irow_global, icol_global)
     678             : #endif
     679             : 
     680     2474636 :    END SUBROUTINE cp_fm_get_element
     681             : 
     682             : ! **************************************************************************************************
     683             : !> \brief sets an element of a matrix
     684             : !> \param matrix ...
     685             : !> \param irow_global ...
     686             : !> \param icol_global ...
     687             : !> \param alpha ...
     688             : !> \note
     689             : !>      we expect all cpus to have the same arguments in the call to this function
     690             : !>      (otherwise one should use local_data tricks)
     691             : ! **************************************************************************************************
     692      815756 :    SUBROUTINE cp_fm_set_element(matrix, irow_global, icol_global, alpha)
     693             :       TYPE(cp_fm_type), INTENT(IN)          :: matrix
     694             :       INTEGER, INTENT(IN)                      :: irow_global, icol_global
     695             :       REAL(KIND=dp), INTENT(IN)                :: alpha
     696             : 
     697             :       INTEGER                                  :: mypcol, myprow, npcol, nprow
     698             :       TYPE(cp_blacs_env_type), POINTER         :: context
     699             : #if defined(__parallel)
     700             :       INTEGER                                  :: icol_local, ipcol, iprow, &
     701             :                                                   irow_local
     702             :       INTEGER, DIMENSION(9)                    :: desca
     703      815756 :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: a
     704             : #endif
     705             : 
     706      815756 :       context => matrix%matrix_struct%context
     707      815756 :       myprow = context%mepos(1)
     708      815756 :       mypcol = context%mepos(2)
     709      815756 :       nprow = context%num_pe(1)
     710      815756 :       npcol = context%num_pe(2)
     711             : 
     712           0 :       CPASSERT(.NOT. matrix%use_sp)
     713             : 
     714             : #if defined(__parallel)
     715             : 
     716      815756 :       a => matrix%local_data
     717             : 
     718     8157560 :       desca(:) = matrix%matrix_struct%descriptor(:)
     719             : 
     720             :       CALL infog2l(irow_global, icol_global, desca, nprow, npcol, myprow, mypcol, &
     721      815756 :                    irow_local, icol_local, iprow, ipcol)
     722             : 
     723      815756 :       IF ((iprow == myprow) .AND. (ipcol == mypcol)) THEN
     724      412033 :          a(irow_local, icol_local) = alpha
     725             :       END IF
     726             : 
     727             : #else
     728             : 
     729             :       matrix%local_data(irow_global, icol_global) = alpha
     730             : 
     731             : #endif
     732      815756 :    END SUBROUTINE cp_fm_set_element
     733             : 
     734             : ! **************************************************************************************************
     735             : !> \brief sets a submatrix of a full matrix
     736             : !>       fm(start_row:start_row+n_rows,start_col:start_col+n_cols)
     737             : !>       = alpha*op(new_values)(1:n_rows,1:n_cols)+ beta
     738             : !>       * fm(start_row:start_row+n_rows,start_col:start_col+n_cols)
     739             : !> \param fm the full to change
     740             : !> \param new_values a replicated full matrix with the new values
     741             : !> \param start_row the starting row of b_matrix (defaults to 1)
     742             : !> \param start_col the starting col of b_matrix (defaults to 1)
     743             : !> \param n_rows the number of row to change in b (defaults to
     744             : !>        size(op(new_values),1))
     745             : !> \param n_cols the number of columns to change in b (defaults to
     746             : !>        size(op(new_values),2))
     747             : !> \param alpha rescaling factor for the new values (defaults to 1.0)
     748             : !> \param beta rescaling factor for the old values (defaults to 0.0)
     749             : !> \param transpose if new_values should be transposed: if true
     750             : !>        op(new_values)=new_values^T, else op(new_values)=new_values
     751             : !>        (defaults to false)
     752             : !> \par History
     753             : !>      07.2002 created borrowing from Joost's blacs_replicated_copy [fawzi]
     754             : !> \author Fawzi Mohamed
     755             : !> \note
     756             : !>      optimized for full column updates and alpha=1.0, beta=0.0
     757             : !>      the new_values need to be valid on all cpus
     758             : ! **************************************************************************************************
     759       61421 :    SUBROUTINE cp_fm_set_submatrix(fm, new_values, start_row, &
     760             :                                   start_col, n_rows, n_cols, alpha, beta, transpose)
     761             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm
     762             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(in)         :: new_values
     763             :       INTEGER, INTENT(in), OPTIONAL                      :: start_row, start_col, n_rows, n_cols
     764             :       REAL(KIND=dp), INTENT(in), OPTIONAL                :: alpha, beta
     765             :       LOGICAL, INTENT(in), OPTIONAL                      :: transpose
     766             : 
     767             :       INTEGER                                            :: i, i0, j, j0, ncol, ncol_block, &
     768             :                                                             ncol_global, ncol_local, nrow, &
     769             :                                                             nrow_block, nrow_global, nrow_local, &
     770             :                                                             this_col, this_row
     771       61421 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
     772             :       LOGICAL                                            :: tr_a
     773             :       REAL(KIND=dp)                                      :: al, be
     774       61421 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: full_block
     775             : 
     776       61421 :       al = 1.0_dp; be = 0.0_dp; i0 = 1; j0 = 1; tr_a = .FALSE.
     777             :       ! can be called too many times, making it a bit useless
     778             :       ! CALL timeset(routineN//','//moduleN,handle)
     779             : 
     780           0 :       CPASSERT(.NOT. fm%use_sp)
     781             : 
     782       61421 :       IF (PRESENT(alpha)) al = alpha
     783       61421 :       IF (PRESENT(beta)) be = beta
     784       61421 :       IF (PRESENT(start_row)) i0 = start_row
     785       61421 :       IF (PRESENT(start_col)) j0 = start_col
     786       61421 :       IF (PRESENT(transpose)) tr_a = transpose
     787       15683 :       IF (tr_a) THEN
     788       15591 :          nrow = SIZE(new_values, 2)
     789       15591 :          ncol = SIZE(new_values, 1)
     790             :       ELSE
     791       45830 :          nrow = SIZE(new_values, 1)
     792       45830 :          ncol = SIZE(new_values, 2)
     793             :       END IF
     794       61421 :       IF (PRESENT(n_rows)) nrow = n_rows
     795       61421 :       IF (PRESENT(n_cols)) ncol = n_cols
     796             : 
     797       61421 :       full_block => fm%local_data
     798             : 
     799             :       CALL cp_fm_get_info(matrix=fm, &
     800             :                           nrow_global=nrow_global, ncol_global=ncol_global, &
     801             :                           nrow_block=nrow_block, ncol_block=ncol_block, &
     802             :                           nrow_local=nrow_local, ncol_local=ncol_local, &
     803       61421 :                           row_indices=row_indices, col_indices=col_indices)
     804             : 
     805       61421 :       IF (al == 1.0 .AND. be == 0.0) THEN
     806     1287316 :          DO j = 1, ncol_local
     807     1235033 :             this_col = col_indices(j) - j0 + 1
     808     1287316 :             IF (this_col .GE. 1 .AND. this_col .LE. ncol) THEN
     809      282515 :                IF (tr_a) THEN
     810        6453 :                   IF (i0 == 1 .AND. nrow_global == nrow) THEN
     811      157620 :                      DO i = 1, nrow_local
     812      157620 :                         full_block(i, j) = new_values(this_col, row_indices(i))
     813             :                      END DO
     814             :                   ELSE
     815         594 :                      DO i = 1, nrow_local
     816         510 :                         this_row = row_indices(i) - i0 + 1
     817         594 :                         IF (this_row >= 1 .AND. this_row <= nrow) THEN
     818         255 :                            full_block(i, j) = new_values(this_col, this_row)
     819             :                         END IF
     820             :                      END DO
     821             :                   END IF
     822             :                ELSE
     823      276062 :                   IF (i0 == 1 .AND. nrow_global == nrow) THEN
     824     8875977 :                      DO i = 1, nrow_local
     825     8875977 :                         full_block(i, j) = new_values(row_indices(i), this_col)
     826             :                      END DO
     827             :                   ELSE
     828        4347 :                      DO i = 1, nrow_local
     829        3745 :                         this_row = row_indices(i) - i0 + 1
     830        4347 :                         IF (this_row >= 1 .AND. this_row <= nrow) THEN
     831         301 :                            full_block(i, j) = new_values(this_row, this_col)
     832             :                         END IF
     833             :                      END DO
     834             :                   END IF
     835             :                END IF
     836             :             END IF
     837             :          END DO
     838             :       ELSE
     839      831608 :          DO j = 1, ncol_local
     840      822470 :             this_col = col_indices(j) - j0 + 1
     841      831608 :             IF (this_col .GE. 1 .AND. this_col .LE. ncol) THEN
     842      822470 :                IF (tr_a) THEN
     843    88223375 :                   DO i = 1, nrow_local
     844    87400905 :                      this_row = row_indices(i) - i0 + 1
     845    88223375 :                      IF (this_row >= 1 .AND. this_row <= nrow) THEN
     846             :                         full_block(i, j) = al*new_values(this_col, this_row) + &
     847      411235 :                                            be*full_block(i, j)
     848             :                      END IF
     849             :                   END DO
     850             :                ELSE
     851           0 :                   DO i = 1, nrow_local
     852           0 :                      this_row = row_indices(i) - i0 + 1
     853           0 :                      IF (this_row >= 1 .AND. this_row <= nrow) THEN
     854             :                         full_block(i, j) = al*new_values(this_row, this_col) + &
     855           0 :                                            be*full_block(i, j)
     856             :                      END IF
     857             :                   END DO
     858             :                END IF
     859             :             END IF
     860             :          END DO
     861             :       END IF
     862             : 
     863             :       ! CALL timestop(handle)
     864             : 
     865       61421 :    END SUBROUTINE cp_fm_set_submatrix
     866             : 
     867             : ! **************************************************************************************************
     868             : !> \brief gets a submatrix of a full matrix
     869             : !>       op(target_m)(1:n_rows,1:n_cols)
     870             : !>       =fm(start_row:start_row+n_rows,start_col:start_col+n_cols)
     871             : !>      target_m is replicated on all cpus
     872             : !>      using this call is expensive
     873             : !> \param fm the full you want to get the info from
     874             : !> \param target_m a replicated full matrix that will contain the result
     875             : !> \param start_row the starting row of b_matrix (defaults to 1)
     876             : !> \param start_col the starting col of b_matrix (defaults to 1)
     877             : !> \param n_rows the number of row to change in b (defaults to
     878             : !>        size(op(new_values),1))
     879             : !> \param n_cols the number of columns to change in b (defaults to
     880             : !>        size(op(new_values),2))
     881             : !> \param transpose if target_m should be transposed: if true
     882             : !>        op(target_m)=target_m^T, else op(target_m)=target_m
     883             : !>        (defaults to false)
     884             : !> \par History
     885             : !>      07.2002 created borrowing from Joost's blacs_replicated_copy [fawzi]
     886             : !> \author Fawzi Mohamed
     887             : !> \note
     888             : !>      optimized for full column updates. Zeros out a little too much
     889             : !>      of target_m
     890             : !>      the target_m is replicated and valid on all cpus
     891             : ! **************************************************************************************************
     892       70880 :    SUBROUTINE cp_fm_get_submatrix(fm, target_m, start_row, &
     893             :                                   start_col, n_rows, n_cols, transpose)
     894             :       TYPE(cp_fm_type), INTENT(IN)                       :: fm
     895             :       REAL(KIND=dp), DIMENSION(:, :), INTENT(out)        :: target_m
     896             :       INTEGER, INTENT(in), OPTIONAL                      :: start_row, start_col, n_rows, n_cols
     897             :       LOGICAL, INTENT(in), OPTIONAL                      :: transpose
     898             : 
     899             :       CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_get_submatrix'
     900             : 
     901             :       INTEGER                                            :: handle, i, i0, j, j0, ncol, ncol_global, &
     902             :                                                             ncol_local, nrow, nrow_global, &
     903             :                                                             nrow_local, this_col, this_row
     904       70880 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
     905             :       LOGICAL                                            :: tr_a
     906       70880 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: full_block
     907             :       TYPE(mp_para_env_type), POINTER                    :: para_env
     908             : 
     909       70880 :       CALL timeset(routineN, handle)
     910             : 
     911       70880 :       i0 = 1; j0 = 1; tr_a = .FALSE.
     912             : 
     913       70880 :       CPASSERT(.NOT. fm%use_sp)
     914             : 
     915       70880 :       IF (PRESENT(start_row)) i0 = start_row
     916       70880 :       IF (PRESENT(start_col)) j0 = start_col
     917       70880 :       IF (PRESENT(transpose)) tr_a = transpose
     918        6240 :       IF (tr_a) THEN
     919        2362 :          nrow = SIZE(target_m, 2)
     920        2362 :          ncol = SIZE(target_m, 1)
     921             :       ELSE
     922       68518 :          nrow = SIZE(target_m, 1)
     923       68518 :          ncol = SIZE(target_m, 2)
     924             :       END IF
     925       70880 :       IF (PRESENT(n_rows)) nrow = n_rows
     926       70880 :       IF (PRESENT(n_cols)) ncol = n_cols
     927             : 
     928       70880 :       para_env => fm%matrix_struct%para_env
     929             : 
     930       70880 :       full_block => fm%local_data
     931             : #if defined(__parallel)
     932             :       ! zero-out whole target_m
     933       70880 :       IF (SIZE(target_m, 1)*SIZE(target_m, 2) /= 0) THEN
     934       81252 :          CALL dcopy(SIZE(target_m, 1)*SIZE(target_m, 2), [0.0_dp], 0, target_m, 1)
     935             :       END IF
     936             : #endif
     937             : 
     938             :       CALL cp_fm_get_info(matrix=fm, &
     939             :                           nrow_global=nrow_global, ncol_global=ncol_global, &
     940             :                           nrow_local=nrow_local, ncol_local=ncol_local, &
     941       70880 :                           row_indices=row_indices, col_indices=col_indices)
     942             : 
     943      450192 :       DO j = 1, ncol_local
     944      379312 :          this_col = col_indices(j) - j0 + 1
     945      450192 :          IF (this_col .GE. 1 .AND. this_col .LE. ncol) THEN
     946      271516 :             IF (tr_a) THEN
     947        2362 :                IF (i0 == 1 .AND. nrow_global == nrow) THEN
     948       81863 :                   DO i = 1, nrow_local
     949       81863 :                      target_m(this_col, row_indices(i)) = full_block(i, j)
     950             :                   END DO
     951             :                ELSE
     952           0 :                   DO i = 1, nrow_local
     953           0 :                      this_row = row_indices(i) - i0 + 1
     954           0 :                      IF (this_row >= 1 .AND. this_row <= nrow) THEN
     955           0 :                         target_m(this_col, this_row) = full_block(i, j)
     956             :                      END IF
     957             :                   END DO
     958             :                END IF
     959             :             ELSE
     960      269154 :                IF (i0 == 1 .AND. nrow_global == nrow) THEN
     961     5992727 :                   DO i = 1, nrow_local
     962     5992727 :                      target_m(row_indices(i), this_col) = full_block(i, j)
     963             :                   END DO
     964             :                ELSE
     965     1168114 :                   DO i = 1, nrow_local
     966     1141698 :                      this_row = row_indices(i) - i0 + 1
     967     1168114 :                      IF (this_row >= 1 .AND. this_row <= nrow) THEN
     968       24462 :                         target_m(this_row, this_col) = full_block(i, j)
     969             :                      END IF
     970             :                   END DO
     971             :                END IF
     972             :             END IF
     973             :          END IF
     974             :       END DO
     975             : 
     976    15463408 :       CALL para_env%sum(target_m)
     977             : 
     978       70880 :       CALL timestop(handle)
     979             : 
     980       70880 :    END SUBROUTINE cp_fm_get_submatrix
     981             : 
     982             : ! **************************************************************************************************
     983             : !> \brief returns all kind of information about the full matrix
     984             : !> \param matrix ...
     985             : !> \param name ...
     986             : !> \param nrow_global ...
     987             : !> \param ncol_global ...
     988             : !> \param nrow_block ...
     989             : !> \param ncol_block ...
     990             : !> \param nrow_local ...
     991             : !> \param ncol_local ...
     992             : !> \param row_indices ...
     993             : !> \param col_indices ...
     994             : !> \param local_data ...
     995             : !> \param context ...
     996             : !> \param nrow_locals ...
     997             : !> \param ncol_locals ...
     998             : !> \param matrix_struct ...
     999             : !> \param para_env ...
    1000             : !> \note
    1001             : !>       see also cp_fm_struct for explanation
    1002             : !>       - nrow_local, ncol_local, row_indices, col_indices, local_data are hooks for efficient
    1003             : !>         access to the local blacs block
    1004             : ! **************************************************************************************************
    1005     5296475 :    SUBROUTINE cp_fm_get_info(matrix, name, nrow_global, ncol_global, &
    1006             :                              nrow_block, ncol_block, nrow_local, ncol_local, &
    1007             :                              row_indices, col_indices, local_data, context, &
    1008             :                              nrow_locals, ncol_locals, matrix_struct, para_env)
    1009             : 
    1010             :       TYPE(cp_fm_type), INTENT(IN)                       :: matrix
    1011             :       CHARACTER(LEN=*), INTENT(OUT), OPTIONAL            :: name
    1012             :       INTEGER, INTENT(OUT), OPTIONAL                     :: nrow_global, ncol_global, nrow_block, &
    1013             :                                                             ncol_block, nrow_local, ncol_local
    1014             :       INTEGER, DIMENSION(:), OPTIONAL, POINTER           :: row_indices, col_indices
    1015             :       REAL(KIND=dp), CONTIGUOUS, DIMENSION(:, :), &
    1016             :          OPTIONAL, POINTER                               :: local_data
    1017             :       TYPE(cp_blacs_env_type), OPTIONAL, POINTER         :: context
    1018             :       INTEGER, DIMENSION(:), OPTIONAL, POINTER           :: nrow_locals, ncol_locals
    1019             :       TYPE(cp_fm_struct_type), OPTIONAL, POINTER         :: matrix_struct
    1020             :       TYPE(mp_para_env_type), OPTIONAL, POINTER          :: para_env
    1021             : 
    1022           8 :       IF (PRESENT(name)) name = matrix%name
    1023     5296475 :       IF (PRESENT(matrix_struct)) matrix_struct => matrix%matrix_struct
    1024     5296475 :       IF (PRESENT(local_data)) local_data => matrix%local_data ! not hiding things anymore :-(
    1025             : 
    1026             :       CALL cp_fm_struct_get(matrix%matrix_struct, nrow_local=nrow_local, &
    1027             :                             ncol_local=ncol_local, nrow_global=nrow_global, &
    1028             :                             ncol_global=ncol_global, nrow_block=nrow_block, &
    1029             :                             ncol_block=ncol_block, row_indices=row_indices, &
    1030             :                             col_indices=col_indices, nrow_locals=nrow_locals, &
    1031     5296475 :                             ncol_locals=ncol_locals, context=context, para_env=para_env)
    1032             : 
    1033     5296475 :    END SUBROUTINE cp_fm_get_info
    1034             : 
    1035             : ! **************************************************************************************************
    1036             : !> \brief Write nicely formatted info about the FM to the given I/O unit (including the underlying FM struct)
    1037             : !> \param matrix a cp_fm_type instance
    1038             : !> \param io_unit the I/O unit to use for writing
    1039             : ! **************************************************************************************************
    1040           3 :    SUBROUTINE cp_fm_write_info(matrix, io_unit)
    1041             :       TYPE(cp_fm_type), INTENT(IN)                       :: matrix
    1042             :       INTEGER, INTENT(IN)                                :: io_unit
    1043             : 
    1044           3 :       WRITE (io_unit, '(/,A,A12)') "CP_FM | Name:                           ", matrix%name
    1045           3 :       CALL cp_fm_struct_write_info(matrix%matrix_struct, io_unit)
    1046           3 :    END SUBROUTINE cp_fm_write_info
    1047             : 
    1048             : ! **************************************************************************************************
    1049             : !> \brief find the maximum absolute value of the matrix element
    1050             : !>      maxval(abs(matrix))
    1051             : !> \param matrix ...
    1052             : !> \param a_max ...
    1053             : !> \param ir_max ...
    1054             : !> \param ic_max ...
    1055             : ! **************************************************************************************************
    1056       88869 :    SUBROUTINE cp_fm_maxabsval(matrix, a_max, ir_max, ic_max)
    1057             :       TYPE(cp_fm_type), INTENT(IN)                       :: matrix
    1058             :       REAL(KIND=dp), INTENT(OUT)                         :: a_max
    1059             :       INTEGER, INTENT(OUT), OPTIONAL                     :: ir_max, ic_max
    1060             : 
    1061             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'cp_fm_maxabsval'
    1062             : 
    1063             :       INTEGER                                            :: handle, i, ic_max_local, ir_max_local, &
    1064             :                                                             j, mepos, ncol_local, nrow_local, &
    1065             :                                                             num_pe
    1066       88869 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: ic_max_vec, ir_max_vec
    1067       88869 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
    1068             :       REAL(dp)                                           :: my_max
    1069       88869 :       REAL(dp), ALLOCATABLE, DIMENSION(:)                :: a_max_vec
    1070       88869 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: my_block
    1071       88869 :       REAL(KIND=sp), DIMENSION(:, :), POINTER            :: my_block_sp
    1072             : 
    1073       88869 :       CALL timeset(routineN, handle)
    1074             : 
    1075       88869 :       my_block => matrix%local_data
    1076       88869 :       my_block_sp => matrix%local_data_sp
    1077             : 
    1078             :       CALL cp_fm_get_info(matrix, nrow_local=nrow_local, ncol_local=ncol_local, &
    1079       88869 :                           row_indices=row_indices, col_indices=col_indices)
    1080             : 
    1081       88869 :       IF (matrix%use_sp) THEN
    1082           0 :          a_max = REAL(MAXVAL(ABS(my_block_sp(1:nrow_local, 1:ncol_local))), dp)
    1083             :       ELSE
    1084    54237616 :          a_max = MAXVAL(ABS(my_block(1:nrow_local, 1:ncol_local)))
    1085             :       END IF
    1086             : 
    1087       88869 :       IF (PRESENT(ir_max)) THEN
    1088           0 :          num_pe = matrix%matrix_struct%para_env%num_pe
    1089           0 :          mepos = matrix%matrix_struct%para_env%mepos
    1090           0 :          ALLOCATE (ir_max_vec(0:num_pe - 1))
    1091           0 :          ir_max_vec(0:num_pe - 1) = 0
    1092           0 :          ALLOCATE (ic_max_vec(0:num_pe - 1))
    1093           0 :          ic_max_vec(0:num_pe - 1) = 0
    1094           0 :          ALLOCATE (a_max_vec(0:num_pe - 1))
    1095           0 :          a_max_vec(0:num_pe - 1) = 0.0_dp
    1096           0 :          my_max = 0.0_dp
    1097             : 
    1098           0 :          IF ((ncol_local > 0) .AND. (nrow_local > 0)) THEN
    1099           0 :             DO i = 1, ncol_local
    1100           0 :                DO j = 1, nrow_local
    1101           0 :                   IF (matrix%use_sp) THEN
    1102           0 :                      IF (ABS(my_block_sp(j, i)) > my_max) THEN
    1103           0 :                         my_max = REAL(my_block_sp(j, i), dp)
    1104           0 :                         ir_max_local = j
    1105           0 :                         ic_max_local = i
    1106             :                      END IF
    1107             :                   ELSE
    1108           0 :                      IF (ABS(my_block(j, i)) > my_max) THEN
    1109           0 :                         my_max = my_block(j, i)
    1110           0 :                         ir_max_local = j
    1111           0 :                         ic_max_local = i
    1112             :                      END IF
    1113             :                   END IF
    1114             :                END DO
    1115             :             END DO
    1116             : 
    1117           0 :             a_max_vec(mepos) = my_max
    1118           0 :             ir_max_vec(mepos) = row_indices(ir_max_local)
    1119           0 :             ic_max_vec(mepos) = col_indices(ic_max_local)
    1120             : 
    1121             :          END IF
    1122             : 
    1123           0 :          CALL matrix%matrix_struct%para_env%sum(a_max_vec)
    1124           0 :          CALL matrix%matrix_struct%para_env%sum(ir_max_vec)
    1125           0 :          CALL matrix%matrix_struct%para_env%sum(ic_max_vec)
    1126             : 
    1127           0 :          my_max = 0.0_dp
    1128           0 :          DO i = 0, num_pe - 1
    1129           0 :             IF (a_max_vec(i) > my_max) THEN
    1130           0 :                ir_max = ir_max_vec(i)
    1131           0 :                ic_max = ic_max_vec(i)
    1132             :             END IF
    1133             :          END DO
    1134             : 
    1135           0 :          DEALLOCATE (ir_max_vec, ic_max_vec, a_max_vec)
    1136           0 :          CPASSERT(ic_max > 0)
    1137           0 :          CPASSERT(ir_max > 0)
    1138             : 
    1139             :       END IF
    1140             : 
    1141       88869 :       CALL matrix%matrix_struct%para_env%max(a_max)
    1142             : 
    1143       88869 :       CALL timestop(handle)
    1144             : 
    1145      177738 :    END SUBROUTINE cp_fm_maxabsval
    1146             : 
    1147             : ! **************************************************************************************************
    1148             : !> \brief find the maximum over the rows of the sum of the absolute values of the elements of a given row
    1149             : !>      = || A ||_infinity
    1150             : !> \param matrix ...
    1151             : !> \param a_max ...
    1152             : !> \note
    1153             : !>      for a real symmetric matrix it holds that || A ||_2 = |lambda_max| < || A ||_infinity
    1154             : !>      Hence this can be used to estimate an upper bound for the eigenvalues of a matrix
    1155             : !>      http://mathworld.wolfram.com/MatrixNorm.html
    1156             : !>      (but the bound is not so tight in the general case)
    1157             : ! **************************************************************************************************
    1158        4296 :    SUBROUTINE cp_fm_maxabsrownorm(matrix, a_max)
    1159             :       TYPE(cp_fm_type), INTENT(IN)                       :: matrix
    1160             :       REAL(KIND=dp), INTENT(OUT)                         :: a_max
    1161             : 
    1162             :       CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_maxabsrownorm'
    1163             : 
    1164             :       INTEGER                                            :: handle, i, j, ncol_local, nrow_global, &
    1165             :                                                             nrow_local
    1166        4296 :       INTEGER, DIMENSION(:), POINTER                     :: row_indices
    1167             :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: values
    1168        4296 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: my_block
    1169             : 
    1170        4296 :       CALL timeset(routineN, handle)
    1171             : 
    1172        4296 :       my_block => matrix%local_data
    1173             : 
    1174        4296 :       CPASSERT(.NOT. matrix%use_sp)
    1175             : 
    1176             :       CALL cp_fm_get_info(matrix, row_indices=row_indices, nrow_global=nrow_global, &
    1177        4296 :                           nrow_local=nrow_local, ncol_local=ncol_local)
    1178             : 
    1179             :       ! the efficiency could be improved by making use of the row-col distribution of scalapack
    1180       12888 :       ALLOCATE (values(nrow_global))
    1181       62878 :       values = 0.0_dp
    1182       62878 :       DO j = 1, ncol_local
    1183      531187 :          DO i = 1, nrow_local
    1184      526891 :             values(row_indices(i)) = values(row_indices(i)) + ABS(my_block(i, j))
    1185             :          END DO
    1186             :       END DO
    1187        4296 :       CALL matrix%matrix_struct%para_env%sum(values)
    1188       67174 :       a_max = MAXVAL(values)
    1189        4296 :       DEALLOCATE (values)
    1190             : 
    1191        4296 :       CALL timestop(handle)
    1192        8592 :    END SUBROUTINE
    1193             : 
    1194             : ! **************************************************************************************************
    1195             : !> \brief find the inorm of each column norm_{j}= sqrt( \sum_{i} A_{ij}*A_{ij} )
    1196             : !> \param matrix ...
    1197             : !> \param norm_array ...
    1198             : ! **************************************************************************************************
    1199        1282 :    SUBROUTINE cp_fm_vectorsnorm(matrix, norm_array)
    1200             :       TYPE(cp_fm_type), INTENT(IN)                       :: matrix
    1201             :       REAL(KIND=dp), DIMENSION(:), INTENT(OUT)           :: norm_array
    1202             : 
    1203             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'cp_fm_vectorsnorm'
    1204             : 
    1205             :       INTEGER                                            :: handle, i, j, ncol_global, ncol_local, &
    1206             :                                                             nrow_local
    1207        1282 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices
    1208        1282 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: my_block
    1209             : 
    1210        1282 :       CALL timeset(routineN, handle)
    1211             : 
    1212        1282 :       my_block => matrix%local_data
    1213             : 
    1214        1282 :       CPASSERT(.NOT. matrix%use_sp)
    1215             : 
    1216             :       CALL cp_fm_get_info(matrix, col_indices=col_indices, ncol_global=ncol_global, &
    1217        1282 :                           nrow_local=nrow_local, ncol_local=ncol_local)
    1218             : 
    1219             :       ! the efficiency could be improved by making use of the row-col distribution of scalapack
    1220       29746 :       norm_array = 0.0_dp
    1221       29746 :       DO j = 1, ncol_local
    1222     1261016 :          DO i = 1, nrow_local
    1223     1259734 :             norm_array(col_indices(j)) = norm_array(col_indices(j)) + my_block(i, j)*my_block(i, j)
    1224             :          END DO
    1225             :       END DO
    1226       58210 :       CALL matrix%matrix_struct%para_env%sum(norm_array)
    1227       29746 :       norm_array = SQRT(norm_array)
    1228             : 
    1229        1282 :       CALL timestop(handle)
    1230        1282 :    END SUBROUTINE
    1231             : 
    1232             : ! **************************************************************************************************
    1233             : !> \brief summing up all the elements along the matrix's i-th index
    1234             : !>        \f$ \mathrm{sum}_{j} = \sum_{i} A_{ij} \f$
    1235             : !>        or
    1236             : !>        \f$ \mathrm{sum}_{i} = \sum_{j} A_{ij} \f$
    1237             : !> \param matrix     an input matrix A
    1238             : !> \param sum_array  sums of elements in each column/row
    1239             : !> \param dir ...
    1240             : !> \note  forked from cp_fm_vectorsnorm() to be used with
    1241             : !>        the maximum overlap method
    1242             : !>        added row variation
    1243             : ! **************************************************************************************************
    1244        7760 :    SUBROUTINE cp_fm_vectorssum(matrix, sum_array, dir)
    1245             :       TYPE(cp_fm_type), INTENT(IN)                       :: matrix
    1246             :       REAL(KIND=dp), DIMENSION(:), INTENT(OUT)           :: sum_array
    1247             :       CHARACTER(LEN=1), INTENT(IN), OPTIONAL             :: dir
    1248             : 
    1249             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'cp_fm_vectorssum'
    1250             : 
    1251             :       INTEGER                                            :: handle, i, j, ncol_local, nrow_local
    1252        7760 :       INTEGER, DIMENSION(:), POINTER                     :: col_indices, row_indices
    1253             :       LOGICAL                                            :: docol
    1254        7760 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: my_block
    1255             : 
    1256        7760 :       CALL timeset(routineN, handle)
    1257             : 
    1258        7760 :       IF (PRESENT(dir)) THEN
    1259        7720 :          IF (dir == 'c' .OR. dir == 'C') THEN
    1260             :             docol = .TRUE.
    1261             :          ELSEIF (dir == 'r' .OR. dir == 'R') THEN
    1262             :             docol = .FALSE.
    1263             :          ELSE
    1264           0 :             CPABORT('Wrong argument DIR')
    1265             :          END IF
    1266             :       ELSE
    1267             :          docol = .TRUE.
    1268             :       END IF
    1269             : 
    1270        7760 :       my_block => matrix%local_data
    1271             : 
    1272        7760 :       CPASSERT(.NOT. matrix%use_sp)
    1273             : 
    1274             :       CALL cp_fm_get_info(matrix, col_indices=col_indices, row_indices=row_indices, &
    1275        7760 :                           nrow_local=nrow_local, ncol_local=ncol_local)
    1276             : 
    1277             :       ! the efficiency could be improved by making use of the row-col distribution of scalapack
    1278      240690 :       sum_array(:) = 0.0_dp
    1279        7760 :       IF (docol) THEN
    1280         448 :       DO j = 1, ncol_local
    1281        3628 :          DO i = 1, nrow_local
    1282        3588 :             sum_array(col_indices(j)) = sum_array(col_indices(j)) + my_block(i, j)
    1283             :          END DO
    1284             :       END DO
    1285             :       ELSE
    1286       86224 :       DO j = 1, ncol_local
    1287     6633258 :          DO i = 1, nrow_local
    1288     6625538 :             sum_array(row_indices(i)) = sum_array(row_indices(i)) + my_block(i, j)
    1289             :          END DO
    1290             :       END DO
    1291             :       END IF
    1292      473620 :       CALL matrix%matrix_struct%para_env%sum(sum_array)
    1293             : 
    1294        7760 :       CALL timestop(handle)
    1295        7760 :    END SUBROUTINE
    1296             : 
    1297             : ! **************************************************************************************************
    1298             : !> \brief copy one identically sized matrix in the other
    1299             : !> \param source ...
    1300             : !> \param destination ...
    1301             : !> \note
    1302             : !>      see also cp_fm_to_fm_columns
    1303             : ! **************************************************************************************************
    1304      357192 :    SUBROUTINE cp_fm_to_fm_matrix(source, destination)
    1305             : 
    1306             :       TYPE(cp_fm_type), INTENT(IN)                       :: source, destination
    1307             : 
    1308             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_to_fm_matrix'
    1309             : 
    1310             :       INTEGER                                            :: handle, npcol, nprow
    1311             : 
    1312      357192 :       CALL timeset(routineN, handle)
    1313             : 
    1314      357192 :       nprow = source%matrix_struct%context%num_pe(1)
    1315      357192 :       npcol = source%matrix_struct%context%num_pe(2)
    1316             : 
    1317      357192 :       IF ((.NOT. cp2k_is_parallel) .OR. &
    1318             :           cp_fm_struct_equivalent(source%matrix_struct, &
    1319             :                                   destination%matrix_struct)) THEN
    1320      357192 :          IF (source%use_sp .AND. destination%use_sp) THEN
    1321           0 :             IF (SIZE(source%local_data_sp, 1) /= SIZE(destination%local_data_sp, 1) .OR. &
    1322             :                 SIZE(source%local_data_sp, 2) /= SIZE(destination%local_data_sp, 2)) &
    1323             :                CALL cp_abort(__LOCATION__, &
    1324             :                              "Cannot copy full matrix <"//TRIM(source%name)// &
    1325             :                              "> to full matrix <"//TRIM(destination%name)// &
    1326           0 :                              ">. The local_data blocks have different sizes.")
    1327             :             CALL scopy(SIZE(source%local_data_sp, 1)*SIZE(source%local_data_sp, 2), &
    1328           0 :                        source%local_data_sp, 1, destination%local_data_sp, 1)
    1329      357192 :          ELSE IF (source%use_sp .AND. .NOT. destination%use_sp) THEN
    1330           0 :             IF (SIZE(source%local_data_sp, 1) /= SIZE(destination%local_data, 1) .OR. &
    1331             :                 SIZE(source%local_data_sp, 2) /= SIZE(destination%local_data, 2)) &
    1332             :                CALL cp_abort(__LOCATION__, &
    1333             :                              "Cannot copy full matrix <"//TRIM(source%name)// &
    1334             :                              "> to full matrix <"//TRIM(destination%name)// &
    1335           0 :                              ">. The local_data blocks have different sizes.")
    1336           0 :             destination%local_data = REAL(source%local_data_sp, dp)
    1337      357192 :          ELSE IF (.NOT. source%use_sp .AND. destination%use_sp) THEN
    1338           0 :             IF (SIZE(source%local_data, 1) /= SIZE(destination%local_data_sp, 1) .OR. &
    1339             :                 SIZE(source%local_data, 2) /= SIZE(destination%local_data_sp, 2)) &
    1340             :                CALL cp_abort(__LOCATION__, &
    1341             :                              "Cannot copy full matrix <"//TRIM(source%name)// &
    1342             :                              "> to full matrix <"//TRIM(destination%name)// &
    1343           0 :                              ">. The local_data blocks have different sizes.")
    1344           0 :             destination%local_data_sp = REAL(source%local_data, sp)
    1345             :          ELSE
    1346      357192 :             IF (SIZE(source%local_data, 1) /= SIZE(destination%local_data, 1) .OR. &
    1347             :                 SIZE(source%local_data, 2) /= SIZE(destination%local_data, 2)) &
    1348             :                CALL cp_abort(__LOCATION__, &
    1349             :                              "Cannot copy full matrix <"//TRIM(source%name)// &
    1350             :                              "> to full matrix <"//TRIM(destination%name)// &
    1351           0 :                              ">. The local_data blocks have different sizes.")
    1352             :             CALL dcopy(SIZE(source%local_data, 1)*SIZE(source%local_data, 2), &
    1353      357192 :                        source%local_data, 1, destination%local_data, 1)
    1354             :          END IF
    1355             :       ELSE
    1356           0 :          CPABORT("Data structures of source and target full matrix are not equivalent")
    1357             :       END IF
    1358             : 
    1359      357192 :       CALL timestop(handle)
    1360             : 
    1361      357192 :    END SUBROUTINE cp_fm_to_fm_matrix
    1362             : 
    1363             : ! **************************************************************************************************
    1364             : !> \brief copy just a subset of columns of a fm to a fm
    1365             : !> \param msource ...
    1366             : !> \param mtarget ...
    1367             : !> \param ncol ...
    1368             : !> \param source_start ...
    1369             : !> \param target_start ...
    1370             : ! **************************************************************************************************
    1371      162904 :    SUBROUTINE cp_fm_to_fm_columns(msource, mtarget, ncol, source_start, &
    1372             :                                   target_start)
    1373             : 
    1374             :       TYPE(cp_fm_type), INTENT(IN)          :: msource, mtarget
    1375             :       INTEGER, INTENT(IN)                      :: ncol
    1376             :       INTEGER, INTENT(IN), OPTIONAL            :: source_start, target_start
    1377             : 
    1378             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_to_fm_columns'
    1379             : 
    1380             :       INTEGER                                  :: handle, n, ss, ts
    1381             :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: a, b
    1382             : #if defined(__parallel)
    1383             :       INTEGER                                  :: i
    1384             :       INTEGER, DIMENSION(9)                    :: desca, descb
    1385             : #endif
    1386             : 
    1387      162904 :       CALL timeset(routineN, handle)
    1388             : 
    1389      162904 :       ss = 1
    1390      162904 :       ts = 1
    1391             : 
    1392      162904 :       IF (PRESENT(source_start)) ss = source_start
    1393      162904 :       IF (PRESENT(target_start)) ts = target_start
    1394             : 
    1395      162904 :       n = msource%matrix_struct%nrow_global
    1396             : 
    1397      162904 :       a => msource%local_data
    1398      162904 :       b => mtarget%local_data
    1399             : 
    1400             : #if defined(__parallel)
    1401     1629040 :       desca(:) = msource%matrix_struct%descriptor(:)
    1402     1629040 :       descb(:) = mtarget%matrix_struct%descriptor(:)
    1403      643456 :       DO i = 0, ncol - 1
    1404      643456 :          CALL pdcopy(n, a, 1, ss + i, desca, 1, b, 1, ts + i, descb, 1)
    1405             :       END DO
    1406             : #else
    1407             :       IF (ss <= SIZE(a, 2) .AND. ts <= SIZE(b, 2)) THEN
    1408             :          CALL dcopy(ncol*n, a(:, ss), 1, b(:, ts), 1)
    1409             :       END IF
    1410             : #endif
    1411             : 
    1412      162904 :       CALL timestop(handle)
    1413             : 
    1414      162904 :    END SUBROUTINE cp_fm_to_fm_columns
    1415             : 
    1416             : ! **************************************************************************************************
    1417             : !> \brief copy just a triangular matrix
    1418             : !> \param msource ...
    1419             : !> \param mtarget ...
    1420             : !> \param uplo ...
    1421             : ! **************************************************************************************************
    1422         370 :    SUBROUTINE cp_fm_to_fm_triangular(msource, mtarget, uplo)
    1423             : 
    1424             :       TYPE(cp_fm_type), INTENT(IN)          :: msource, mtarget
    1425             :       CHARACTER(LEN=*), INTENT(IN)             :: uplo
    1426             : 
    1427             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_to_fm_triangular'
    1428             : 
    1429             :       INTEGER                                  :: handle, ncol, nrow
    1430         370 :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: a, b
    1431             : #if defined(__parallel)
    1432             :       INTEGER, DIMENSION(9)                    :: desca, descb
    1433             : #endif
    1434             : 
    1435         370 :       CALL timeset(routineN, handle)
    1436             : 
    1437         370 :       nrow = msource%matrix_struct%nrow_global
    1438         370 :       ncol = msource%matrix_struct%ncol_global
    1439             : 
    1440         370 :       a => msource%local_data
    1441         370 :       b => mtarget%local_data
    1442             : 
    1443             : #if defined(__parallel)
    1444        3700 :       desca(:) = msource%matrix_struct%descriptor(:)
    1445        3700 :       descb(:) = mtarget%matrix_struct%descriptor(:)
    1446         370 :       CALL pdlacpy(uplo, nrow, ncol, a(1, 1), 1, 1, desca, b(1, 1), 1, 1, descb)
    1447             : #else
    1448             :       CALL dlacpy(uplo, nrow, ncol, a(1, 1), nrow, b(1, 1), nrow)
    1449             : #endif
    1450             : 
    1451         370 :       CALL timestop(handle)
    1452             : 
    1453         370 :    END SUBROUTINE cp_fm_to_fm_triangular
    1454             : 
    1455             : ! **************************************************************************************************
    1456             : !> \brief copy just a part ot the matrix
    1457             : !> \param msource ...
    1458             : !> \param mtarget ...
    1459             : !> \param nrow ...
    1460             : !> \param ncol ...
    1461             : !> \param s_firstrow ...
    1462             : !> \param s_firstcol ...
    1463             : !> \param t_firstrow ...
    1464             : !> \param t_firstcol ...
    1465             : ! **************************************************************************************************
    1466             : 
    1467       10678 :    SUBROUTINE cp_fm_to_fm_submat(msource, mtarget, nrow, ncol, s_firstrow, s_firstcol, t_firstrow, t_firstcol)
    1468             : 
    1469             :       TYPE(cp_fm_type), INTENT(IN)             :: msource, mtarget
    1470             :       INTEGER, INTENT(IN)                      :: nrow, ncol, s_firstrow, &
    1471             :                                                   s_firstcol, t_firstrow, &
    1472             :                                                   t_firstcol
    1473             : 
    1474             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_to_fm_submat'
    1475             : 
    1476             :       INTEGER                                  :: handle, i, na, nb, ss, ts
    1477             :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: a, b
    1478             : #if defined(__parallel)
    1479             :       INTEGER, DIMENSION(9)                    :: desca, descb
    1480             : #endif
    1481             : 
    1482       10678 :       CALL timeset(routineN, handle)
    1483             : 
    1484       10678 :       a => msource%local_data
    1485       10678 :       b => mtarget%local_data
    1486             : 
    1487       10678 :       na = msource%matrix_struct%nrow_global
    1488       10678 :       nb = mtarget%matrix_struct%nrow_global
    1489             : !    nrow must be <= na and nb
    1490       10678 :       IF (nrow > na) &
    1491           0 :          CPABORT("cannot copy because nrow > number of rows of source matrix")
    1492       10678 :       IF (nrow > nb) &
    1493           0 :          CPABORT("cannot copy because nrow > number of rows of target matrix")
    1494       10678 :       na = msource%matrix_struct%ncol_global
    1495       10678 :       nb = mtarget%matrix_struct%ncol_global
    1496             : !    ncol must be <= na_col and nb_col
    1497       10678 :       IF (ncol > na) &
    1498           0 :          CPABORT("cannot copy because nrow > number of rows of source matrix")
    1499       10678 :       IF (ncol > nb) &
    1500           0 :          CPABORT("cannot copy because nrow > number of rows of target matrix")
    1501             : 
    1502             : #if defined(__parallel)
    1503      106780 :       desca(:) = msource%matrix_struct%descriptor(:)
    1504      106780 :       descb(:) = mtarget%matrix_struct%descriptor(:)
    1505      123858 :       DO i = 0, ncol - 1
    1506      113180 :          ss = s_firstcol + i
    1507      113180 :          ts = t_firstcol + i
    1508      123858 :          CALL pdcopy(nrow, a, s_firstrow, ss, desca, 1, b, t_firstrow, ts, descb, 1)
    1509             :       END DO
    1510             : #else
    1511             :       DO i = 0, ncol - 1
    1512             :          ss = s_firstcol + i
    1513             :          ts = t_firstcol + i
    1514             :          CALL dcopy(nrow, a(s_firstrow:, ss), 1, b(t_firstrow:, ts), 1)
    1515             :       END DO
    1516             : #endif
    1517             : 
    1518       10678 :       CALL timestop(handle)
    1519       10678 :    END SUBROUTINE cp_fm_to_fm_submat
    1520             : 
    1521             : ! **************************************************************************************************
    1522             : !> \brief General copy of a fm matrix to another fm matrix.
    1523             : !>        Uses non-blocking MPI rather than ScaLAPACK.
    1524             : !>
    1525             : !> \param source          input fm matrix
    1526             : !> \param destination     output fm matrix
    1527             : !> \param para_env        parallel environment corresponding to the BLACS env that covers all parts
    1528             : !>                        of the input and output matrices
    1529             : !> \par History
    1530             : !>      31-Jan-2017 : Re-implemented using non-blocking MPI [IainB, MarkT]
    1531             : ! **************************************************************************************************
    1532        8714 :    SUBROUTINE cp_fm_copy_general(source, destination, para_env)
    1533             :       TYPE(cp_fm_type), INTENT(IN)                       :: source, destination
    1534             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    1535             : 
    1536             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_copy_general'
    1537             : 
    1538             :       INTEGER                                            :: handle
    1539       78426 :       TYPE(copy_info_type)                               :: info
    1540             : 
    1541        8714 :       CALL timeset(routineN, handle)
    1542             : 
    1543        8714 :       CALL cp_fm_start_copy_general(source, destination, para_env, info)
    1544        8714 :       IF (ASSOCIATED(destination%matrix_struct)) THEN
    1545        8700 :          CALL cp_fm_finish_copy_general(destination, info)
    1546             :       END IF
    1547        8714 :       IF (ASSOCIATED(source%matrix_struct)) THEN
    1548        8560 :          CALL cp_fm_cleanup_copy_general(info)
    1549             :       END IF
    1550             : 
    1551        8714 :       CALL timestop(handle)
    1552        8714 :    END SUBROUTINE cp_fm_copy_general
    1553             : 
    1554             : ! **************************************************************************************************
    1555             : !> \brief Initiates the copy operation: get distribution data, post MPI isend and irecvs
    1556             : !> \param source input fm matrix
    1557             : !> \param destination output fm matrix
    1558             : !> \param para_env parallel environment corresponding to the BLACS env that covers all parts
    1559             : !>                        of the input and output matrices
    1560             : !> \param info all of the data that will be needed to complete the copy operation
    1561             : ! **************************************************************************************************
    1562     1578580 :    SUBROUTINE cp_fm_start_copy_general(source, destination, para_env, info)
    1563             :       TYPE(cp_fm_type), INTENT(IN)                       :: source, destination
    1564             :       TYPE(mp_para_env_type), INTENT(IN)                 :: para_env
    1565             :       TYPE(copy_info_type), INTENT(OUT)                  :: info
    1566             : 
    1567             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_start_copy_general'
    1568             : 
    1569             :       INTEGER :: dest_p_i, dest_q_j, global_rank, global_size, handle, i, j, k, mpi_rank, &
    1570             :          ncol_block_dest, ncol_block_src, ncol_local_recv, ncol_local_send, ncols, &
    1571             :          nrow_block_dest, nrow_block_src, nrow_local_recv, nrow_local_send, nrows, p, q, &
    1572             :          recv_rank, recv_size, send_rank, send_size
    1573      157858 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: all_ranks, dest2global, dest_p, dest_q, &
    1574      315716 :                                                             recv_count, send_count, send_disp, &
    1575      157858 :                                                             source2global, src_p, src_q
    1576      157858 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: dest_blacs2mpi
    1577             :       INTEGER, DIMENSION(2)                              :: dest_block, dest_block_tmp, dest_num_pe, &
    1578             :                                                             src_block, src_block_tmp, src_num_pe
    1579      315716 :       INTEGER, DIMENSION(:), POINTER                     :: recv_col_indices, recv_row_indices, &
    1580      315716 :                                                             send_col_indices, send_row_indices
    1581             :       TYPE(cp_fm_struct_type), POINTER                   :: recv_dist, send_dist
    1582     2210012 :       TYPE(mp_request_type), DIMENSION(6)                :: recv_req, send_req
    1583             : 
    1584      157858 :       CALL timeset(routineN, handle)
    1585             : 
    1586             :       IF (.NOT. cp2k_is_parallel) THEN
    1587             :          ! Just copy all of the matrix data into a 'send buffer', to be unpacked later
    1588             :          nrow_local_send = SIZE(source%local_data, 1)
    1589             :          ncol_local_send = SIZE(source%local_data, 2)
    1590             :          ALLOCATE (info%send_buf(nrow_local_send*ncol_local_send))
    1591             :          k = 0
    1592             :          DO j = 1, ncol_local_send
    1593             :             DO i = 1, nrow_local_send
    1594             :                k = k + 1
    1595             :                info%send_buf(k) = source%local_data(i, j)
    1596             :             END DO
    1597             :          END DO
    1598             :       ELSE
    1599             :          ! assumption of double precision data is carried forward from ScaLAPACK version
    1600      157858 :          IF (source%use_sp) CPABORT("only DP kind implemented")
    1601      157858 :          IF (destination%use_sp) CPABORT("only DP kind implemented")
    1602             : 
    1603      157858 :          NULLIFY (recv_dist, send_dist)
    1604      157858 :          NULLIFY (recv_col_indices, recv_row_indices, send_col_indices, send_row_indices)
    1605             : 
    1606             :          ! The 'global' communicator contains both the source and destination decompositions
    1607      157858 :          global_size = para_env%num_pe
    1608      157858 :          global_rank = para_env%mepos
    1609             : 
    1610             :          ! The source/send decomposition and destination/recv decompositions may only exist on
    1611             :          ! on a subset of the processes involved in the communication
    1612             :          ! Check if the source and/or destination arguments are .not. ASSOCIATED():
    1613             :          ! if so, skip the send / recv parts (since these processes do not participate in the sending/receiving distribution)
    1614      157858 :          IF (ASSOCIATED(destination%matrix_struct)) THEN
    1615      126184 :             recv_dist => destination%matrix_struct
    1616      126184 :             recv_rank = recv_dist%para_env%mepos
    1617             :          ELSE
    1618       31674 :             recv_rank = mp_proc_null
    1619             :          END IF
    1620             : 
    1621      157858 :          IF (ASSOCIATED(source%matrix_struct)) THEN
    1622      139100 :             send_dist => source%matrix_struct
    1623      139100 :             send_rank = send_dist%para_env%mepos
    1624             :          ELSE
    1625       18758 :             send_rank = mp_proc_null
    1626             :          END IF
    1627             : 
    1628             :          ! Map the rank in the source/dest communicator to the global rank
    1629      473574 :          ALLOCATE (all_ranks(0:global_size - 1))
    1630             : 
    1631      157858 :          CALL para_env%allgather(send_rank, all_ranks)
    1632      157858 :          IF (ASSOCIATED(recv_dist)) THEN
    1633      630920 :             ALLOCATE (source2global(0:COUNT(all_ranks .NE. mp_proc_null) - 1))
    1634      378552 :             DO i = 0, global_size - 1
    1635      378552 :                IF (all_ranks(i) .NE. mp_proc_null) THEN
    1636      214852 :                   source2global(all_ranks(i)) = i
    1637             :                END IF
    1638             :             END DO
    1639             :          END IF
    1640             : 
    1641      157858 :          CALL para_env%allgather(recv_rank, all_ranks)
    1642      157858 :          IF (ASSOCIATED(send_dist)) THEN
    1643      695500 :             ALLOCATE (dest2global(0:COUNT(all_ranks .NE. mp_proc_null) - 1))
    1644      417300 :             DO i = 0, global_size - 1
    1645      417300 :                IF (all_ranks(i) .NE. mp_proc_null) THEN
    1646      214852 :                   dest2global(all_ranks(i)) = i
    1647             :                END IF
    1648             :             END DO
    1649             :          END IF
    1650      157858 :          DEALLOCATE (all_ranks)
    1651             : 
    1652             :          ! Some data from the two decompositions will be needed by all processes in the global group :
    1653             :          ! process grid shape, block size, and the BLACS-to-MPI mapping
    1654             : 
    1655             :          ! The global root process will receive the data (from the root process in each decomposition)
    1656     1105006 :          send_req(:) = mp_request_null
    1657      157858 :          IF (global_rank == 0) THEN
    1658      552503 :             recv_req(:) = mp_request_null
    1659       78929 :             CALL para_env%irecv(src_block, mp_any_source, recv_req(1), tag=src_tag)
    1660       78929 :             CALL para_env%irecv(dest_block, mp_any_source, recv_req(2), tag=dest_tag)
    1661       78929 :             CALL para_env%irecv(src_num_pe, mp_any_source, recv_req(3), tag=src_tag)
    1662       78929 :             CALL para_env%irecv(dest_num_pe, mp_any_source, recv_req(4), tag=dest_tag)
    1663             :          END IF
    1664             : 
    1665      157858 :          IF (ASSOCIATED(send_dist)) THEN
    1666      139100 :             IF ((send_rank .EQ. 0)) THEN
    1667             :                ! need to use separate buffers here in case this is actually global rank 0
    1668      236787 :                src_block_tmp = (/send_dist%nrow_block, send_dist%ncol_block/)
    1669       78929 :                CALL para_env%isend(src_block_tmp, 0, send_req(1), tag=src_tag)
    1670       78929 :                CALL para_env%isend(send_dist%context%num_pe, 0, send_req(2), tag=src_tag)
    1671             :             END IF
    1672             :          END IF
    1673             : 
    1674      157858 :          IF (ASSOCIATED(recv_dist)) THEN
    1675      126184 :             IF ((recv_rank .EQ. 0)) THEN
    1676      236787 :                dest_block_tmp = (/recv_dist%nrow_block, recv_dist%ncol_block/)
    1677       78929 :                CALL para_env%isend(dest_block_tmp, 0, send_req(3), tag=dest_tag)
    1678       78929 :                CALL para_env%isend(recv_dist%context%num_pe, 0, send_req(4), tag=dest_tag)
    1679             :             END IF
    1680             :          END IF
    1681             : 
    1682      157858 :          IF (global_rank == 0) THEN
    1683       78929 :             CALL mp_waitall(recv_req(1:4))
    1684             :             ! Now we know the process decomposition, we can allocate the arrays to hold the blacs2mpi mapping
    1685           0 :             ALLOCATE (info%src_blacs2mpi(0:src_num_pe(1) - 1, 0:src_num_pe(2) - 1), &
    1686             :                       dest_blacs2mpi(0:dest_num_pe(1) - 1, 0:dest_num_pe(2) - 1) &
    1687      552503 :                       )
    1688       78929 :             CALL para_env%irecv(info%src_blacs2mpi, mp_any_source, recv_req(5), tag=src_tag)
    1689       78929 :             CALL para_env%irecv(dest_blacs2mpi, mp_any_source, recv_req(6), tag=dest_tag)
    1690             :          END IF
    1691             : 
    1692      157858 :          IF (ASSOCIATED(send_dist)) THEN
    1693      139100 :             IF ((send_rank .EQ. 0)) THEN
    1694       78929 :                CALL para_env%isend(send_dist%context%blacs2mpi(:, :), 0, send_req(5), tag=src_tag)
    1695             :             END IF
    1696             :          END IF
    1697             : 
    1698      157858 :          IF (ASSOCIATED(recv_dist)) THEN
    1699      126184 :             IF ((recv_rank .EQ. 0)) THEN
    1700       78929 :                CALL para_env%isend(recv_dist%context%blacs2mpi(:, :), 0, send_req(6), tag=dest_tag)
    1701             :             END IF
    1702             :          END IF
    1703             : 
    1704      157858 :          IF (global_rank == 0) THEN
    1705       78929 :             CALL mp_waitall(recv_req(5:6))
    1706             :          END IF
    1707             : 
    1708             :          ! Finally, broadcast the data to all processes in the global communicator
    1709      157858 :          CALL para_env%bcast(src_block, 0)
    1710      157858 :          CALL para_env%bcast(dest_block, 0)
    1711      157858 :          CALL para_env%bcast(src_num_pe, 0)
    1712      157858 :          CALL para_env%bcast(dest_num_pe, 0)
    1713      473574 :          info%src_num_pe(1:2) = src_num_pe(1:2)
    1714      473574 :          info%nblock_src(1:2) = src_block(1:2)
    1715      157858 :          IF (global_rank .NE. 0) THEN
    1716           0 :             ALLOCATE (info%src_blacs2mpi(0:src_num_pe(1) - 1, 0:src_num_pe(2) - 1), &
    1717           0 :                       dest_blacs2mpi(0:dest_num_pe(1) - 1, 0:dest_num_pe(2) - 1) &
    1718      552503 :                       )
    1719             :          END IF
    1720      157858 :          CALL para_env%bcast(info%src_blacs2mpi, 0)
    1721      157858 :          CALL para_env%bcast(dest_blacs2mpi, 0)
    1722             : 
    1723      157858 :          recv_size = dest_num_pe(1)*dest_num_pe(2)
    1724      157858 :          send_size = src_num_pe(1)*src_num_pe(2)
    1725      157858 :          info%send_size = send_size
    1726      157858 :          CALL mp_waitall(send_req(:))
    1727             : 
    1728             :          ! Setup is now complete, we can start the actual communication here.
    1729             :          ! The order implemented here is:
    1730             :          !  DEST_1
    1731             :          !      compute recv sizes
    1732             :          !      call irecv
    1733             :          !  SRC_1
    1734             :          !      compute send sizes
    1735             :          !      pack send buffers
    1736             :          !      call isend
    1737             :          !  DEST_2
    1738             :          !      wait for the recvs and unpack buffers (this part eventually will go into another routine to allow comms to run concurrently)
    1739             :          !  SRC_2
    1740             :          !      wait for the sends
    1741             : 
    1742             :          ! DEST_1
    1743      157858 :          IF (ASSOCIATED(recv_dist)) THEN
    1744             :             CALL cp_fm_struct_get(recv_dist, row_indices=recv_row_indices, &
    1745             :                                   col_indices=recv_col_indices &
    1746      126184 :                                   )
    1747      126184 :             info%recv_col_indices => recv_col_indices
    1748      126184 :             info%recv_row_indices => recv_row_indices
    1749      126184 :             nrow_block_src = src_block(1)
    1750      126184 :             ncol_block_src = src_block(2)
    1751      845772 :             ALLOCATE (recv_count(0:send_size - 1), info%recv_disp(0:send_size - 1), info%recv_request(0:send_size - 1))
    1752             : 
    1753             :             ! Determine the recv counts, allocate the receive buffers, call mpi_irecv for all the non-zero sized receives
    1754      126184 :             nrow_local_recv = recv_dist%nrow_locals(recv_dist%context%mepos(1))
    1755      126184 :             ncol_local_recv = recv_dist%ncol_locals(recv_dist%context%mepos(2))
    1756      126184 :             info%nlocal_recv(1) = nrow_local_recv
    1757      126184 :             info%nlocal_recv(2) = ncol_local_recv
    1758             :             ! Initialise src_p, src_q arrays (sized using number of rows/cols in the receiving distribution)
    1759      630920 :             ALLOCATE (src_p(nrow_local_recv), src_q(ncol_local_recv))
    1760     2160942 :             DO i = 1, nrow_local_recv
    1761             :                ! For each local row we will receive, we look up its global row (in recv_row_indices),
    1762             :                ! then work out which row block it comes from, and which process row that row block comes from.
    1763     2160942 :                src_p(i) = MOD(((recv_row_indices(i) - 1)/nrow_block_src), src_num_pe(1))
    1764             :             END DO
    1765     3592734 :             DO j = 1, ncol_local_recv
    1766             :                ! Similarly for the columns
    1767     3592734 :                src_q(j) = MOD(((recv_col_indices(j) - 1)/ncol_block_src), src_num_pe(2))
    1768             :             END DO
    1769             :             ! src_p/q now contains the process row/column ID that will send data to that row/column
    1770             : 
    1771      252368 :             DO q = 0, src_num_pe(2) - 1
    1772     3592734 :                ncols = COUNT(src_q .EQ. q)
    1773      467220 :                DO p = 0, src_num_pe(1) - 1
    1774     3923212 :                   nrows = COUNT(src_p .EQ. p)
    1775             :                   ! Use the send_dist here as we are looking up the processes where the data comes from
    1776      341036 :                   recv_count(info%src_blacs2mpi(p, q)) = nrows*ncols
    1777             :                END DO
    1778             :             END DO
    1779      126184 :             DEALLOCATE (src_p, src_q)
    1780             : 
    1781             :             ! Use one long buffer (and displacements into that buffer)
    1782             :             !     this prevents the need for a rectangular array where not all elements will be populated
    1783      593404 :             ALLOCATE (info%recv_buf(SUM(recv_count(:))))
    1784      126184 :             info%recv_disp(0) = 0
    1785      214852 :             DO i = 1, send_size - 1
    1786      214852 :                info%recv_disp(i) = info%recv_disp(i - 1) + recv_count(i - 1)
    1787             :             END DO
    1788             : 
    1789             :             ! Issue receive calls on ranks which expect data
    1790      341036 :             DO k = 0, send_size - 1
    1791      341036 :                IF (recv_count(k) .GT. 0) THEN
    1792             :                   CALL para_env%irecv(info%recv_buf(info%recv_disp(k) + 1:info%recv_disp(k) + recv_count(k)), &
    1793      158272 :                                       source2global(k), info%recv_request(k))
    1794             :                END IF
    1795             :             END DO
    1796      126184 :             DEALLOCATE (source2global)
    1797             :          END IF ! ASSOCIATED(recv_dist)
    1798             : 
    1799             :          ! SRC_1
    1800      157858 :          IF (ASSOCIATED(send_dist)) THEN
    1801             :             CALL cp_fm_struct_get(send_dist, row_indices=send_row_indices, &
    1802             :                                   col_indices=send_col_indices &
    1803      139100 :                                   )
    1804      139100 :             nrow_block_dest = dest_block(1)
    1805      139100 :             ncol_block_dest = dest_block(2)
    1806      910352 :             ALLOCATE (send_count(0:recv_size - 1), send_disp(0:recv_size - 1), info%send_request(0:recv_size - 1))
    1807             : 
    1808             :             ! Determine the send counts, allocate the send buffers
    1809      139100 :             nrow_local_send = send_dist%nrow_locals(send_dist%context%mepos(1))
    1810      139100 :             ncol_local_send = send_dist%ncol_locals(send_dist%context%mepos(2))
    1811             : 
    1812             :             ! Initialise dest_p, dest_q arrays (sized nrow_local, ncol_local)
    1813             :             !   i.e. number of rows,cols in the sending distribution
    1814      695500 :             ALLOCATE (dest_p(nrow_local_send), dest_q(ncol_local_send))
    1815             : 
    1816     2173858 :             DO i = 1, nrow_local_send
    1817             :                ! Use the send_dist%row_indices() here (we are looping over the local rows we will send)
    1818     2173858 :                dest_p(i) = MOD(((send_row_indices(i) - 1)/nrow_block_dest), dest_num_pe(1))
    1819             :             END DO
    1820     3842750 :             DO j = 1, ncol_local_send
    1821     3842750 :                dest_q(j) = MOD(((send_col_indices(j) - 1)/ncol_block_dest), dest_num_pe(2))
    1822             :             END DO
    1823             :             ! dest_p/q now contain the process row/column ID that will receive data from that row/column
    1824             : 
    1825      278200 :             DO q = 0, dest_num_pe(2) - 1
    1826     3842750 :                ncols = COUNT(dest_q .EQ. q)
    1827      493052 :                DO p = 0, dest_num_pe(1) - 1
    1828     3686616 :                   nrows = COUNT(dest_p .EQ. p)
    1829      353952 :                   send_count(dest_blacs2mpi(p, q)) = nrows*ncols
    1830             :                END DO
    1831             :             END DO
    1832      139100 :             DEALLOCATE (dest_p, dest_q)
    1833             : 
    1834             :             ! Allocate the send buffer using send_count -- and calculate the offset into the buffer for each process
    1835      632152 :             ALLOCATE (info%send_buf(SUM(send_count(:))))
    1836      139100 :             send_disp(0) = 0
    1837      214852 :             DO k = 1, recv_size - 1
    1838      214852 :                send_disp(k) = send_disp(k - 1) + send_count(k - 1)
    1839             :             END DO
    1840             : 
    1841             :             ! Loop over the smat, pack the send buffers
    1842      353952 :             send_count(:) = 0
    1843     3842750 :             DO j = 1, ncol_local_send
    1844             :                ! Use send_col_indices and row_indices here, as we are looking up the global row/column number of local rows.
    1845     3703650 :                dest_q_j = MOD(((send_col_indices(j) - 1)/ncol_block_dest), dest_num_pe(2))
    1846   113861016 :                DO i = 1, nrow_local_send
    1847   110018266 :                   dest_p_i = MOD(((send_row_indices(i) - 1)/nrow_block_dest), dest_num_pe(1))
    1848   110018266 :                   mpi_rank = dest_blacs2mpi(dest_p_i, dest_q_j)
    1849   110018266 :                   send_count(mpi_rank) = send_count(mpi_rank) + 1
    1850   113721916 :                   info%send_buf(send_disp(mpi_rank) + send_count(mpi_rank)) = source%local_data(i, j)
    1851             :                END DO
    1852             :             END DO
    1853             : 
    1854             :             ! For each non-zero send_count, call mpi_isend
    1855      353952 :             DO k = 0, recv_size - 1
    1856      353952 :                IF (send_count(k) .GT. 0) THEN
    1857             :                   CALL para_env%isend(info%send_buf(send_disp(k) + 1:send_disp(k) + send_count(k)), &
    1858      158272 :                                       dest2global(k), info%send_request(k))
    1859             :                END IF
    1860             :             END DO
    1861      139100 :             DEALLOCATE (send_count, send_disp, dest2global)
    1862             :          END IF ! ASSOCIATED(send_dist)
    1863      157858 :          DEALLOCATE (dest_blacs2mpi)
    1864             : 
    1865             :       END IF !IF (.NOT. cp2k_is_parallel)
    1866             : 
    1867      157858 :       CALL timestop(handle)
    1868             : 
    1869      631432 :    END SUBROUTINE cp_fm_start_copy_general
    1870             : 
    1871             : ! **************************************************************************************************
    1872             : !> \brief Completes the copy operation: wait for comms, unpack, clean up MPI state
    1873             : !> \param destination output fm matrix
    1874             : !> \param info all of the data that will be needed to complete the copy operation
    1875             : ! **************************************************************************************************
    1876      126184 :    SUBROUTINE cp_fm_finish_copy_general(destination, info)
    1877             :       TYPE(cp_fm_type), INTENT(IN)                       :: destination
    1878             :       TYPE(copy_info_type), INTENT(INOUT)                :: info
    1879             : 
    1880             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_finish_copy_general'
    1881             : 
    1882             :       INTEGER                                            :: handle, i, j, k, mpi_rank, send_size, &
    1883             :                                                             src_p_i, src_q_j
    1884      126184 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: recv_count
    1885             :       INTEGER, DIMENSION(2)                              :: nblock_src, nlocal_recv, src_num_pe
    1886      126184 :       INTEGER, DIMENSION(:), POINTER                     :: recv_col_indices, recv_row_indices
    1887             : 
    1888      126184 :       CALL timeset(routineN, handle)
    1889             : 
    1890             :       IF (.NOT. cp2k_is_parallel) THEN
    1891             :          ! Now unpack the data from the 'send buffer'
    1892             :          k = 0
    1893             :          DO j = 1, SIZE(destination%local_data, 2)
    1894             :             DO i = 1, SIZE(destination%local_data, 1)
    1895             :                k = k + 1
    1896             :                destination%local_data(i, j) = info%send_buf(k)
    1897             :             END DO
    1898             :          END DO
    1899             :          DEALLOCATE (info%send_buf)
    1900             :       ELSE
    1901             :          ! Set up local variables ...
    1902      126184 :          send_size = info%send_size
    1903      378552 :          nlocal_recv(1:2) = info%nlocal_recv(:)
    1904      378552 :          nblock_src(1:2) = info%nblock_src(:)
    1905      378552 :          src_num_pe(1:2) = info%src_num_pe(:)
    1906      126184 :          recv_col_indices => info%recv_col_indices
    1907      126184 :          recv_row_indices => info%recv_row_indices
    1908             : 
    1909             :          ! ... use the local variables to do the work
    1910             :          ! DEST_2
    1911      126184 :          CALL mp_waitall(info%recv_request(:))
    1912      378552 :          ALLOCATE (recv_count(0:send_size - 1))
    1913             :          ! Loop over the rmat, filling it in with data from the recv buffers
    1914             :          ! (here the block sizes, num_pes refer to the distribution of the source matrix)
    1915      341036 :          recv_count(:) = 0
    1916     3592734 :          DO j = 1, nlocal_recv(2)
    1917     3466550 :             src_q_j = MOD(((recv_col_indices(j) - 1)/nblock_src(2)), src_num_pe(2))
    1918   113611000 :             DO i = 1, nlocal_recv(1)
    1919   110018266 :                src_p_i = MOD(((recv_row_indices(i) - 1)/nblock_src(1)), src_num_pe(1))
    1920   110018266 :                mpi_rank = info%src_blacs2mpi(src_p_i, src_q_j)
    1921   110018266 :                recv_count(mpi_rank) = recv_count(mpi_rank) + 1
    1922   113484816 :                destination%local_data(i, j) = info%recv_buf(info%recv_disp(mpi_rank) + recv_count(mpi_rank))
    1923             :             END DO
    1924             :          END DO
    1925      126184 :          DEALLOCATE (recv_count, info%recv_disp, info%recv_request, info%recv_buf, info%src_blacs2mpi)
    1926             :          ! Invalidate the stored state
    1927             :          NULLIFY (info%recv_col_indices, &
    1928      126184 :                   info%recv_row_indices)
    1929             : 
    1930             :       END IF
    1931             : 
    1932      126184 :       CALL timestop(handle)
    1933             : 
    1934      126184 :    END SUBROUTINE cp_fm_finish_copy_general
    1935             : 
    1936             : ! **************************************************************************************************
    1937             : !> \brief Completes the copy operation: wait for comms clean up MPI state
    1938             : !> \param info all of the data that will be needed to complete the copy operation
    1939             : ! **************************************************************************************************
    1940      138436 :    SUBROUTINE cp_fm_cleanup_copy_general(info)
    1941             :       TYPE(copy_info_type), INTENT(INOUT)                :: info
    1942             : 
    1943             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_cleanup_copy_general'
    1944             : 
    1945             :       INTEGER                                            :: handle
    1946             : 
    1947      138436 :       CALL timeset(routineN, handle)
    1948             : 
    1949             :       IF (.NOT. cp2k_is_parallel) THEN
    1950             :          ! Don't do anything - no MPI state for the serial case
    1951             :       ELSE
    1952             :          ! SRC_2
    1953             :          ! If this process is also in the destination decomposition, this deallocate
    1954             :          ! Was already done in cp_fm_finish_copy_general
    1955      138436 :          IF (ALLOCATED(info%src_blacs2mpi)) THEN
    1956       31010 :             DEALLOCATE (info%src_blacs2mpi)
    1957             :          END IF
    1958      138436 :          CALL mp_waitall(info%send_request)
    1959      138436 :          DEALLOCATE (info%send_request, info%send_buf)
    1960             : 
    1961             :       END IF
    1962             : 
    1963      138436 :       CALL timestop(handle)
    1964             : 
    1965      138436 :    END SUBROUTINE cp_fm_cleanup_copy_general
    1966             : 
    1967             : ! **************************************************************************************************
    1968             : !> \brief General copy of a submatrix of fm matrix to  a submatrix of another fm matrix.
    1969             : !>        The two matrices can have different contexts.
    1970             : !>
    1971             : !>        Summary of distribution routines for dense matrices
    1972             : !>        The following will copy A(iA:iA+M-1,jA:jA+N-1) to B(iB:iB+M-1,jB:jB+N-1):
    1973             : !>
    1974             : !>        call pdgemr2d(M,N,Aloc,iA,jA,descA,Bloc,iB,jB,descB,context)
    1975             : !>
    1976             : !>        A process that is not a part of the context of A should set descA(2)
    1977             : !>        to -1, and similarly for B.
    1978             : !>
    1979             : !> \param source          input fm matrix
    1980             : !> \param destination     output fm matrix
    1981             : !> \param nrows           number of rows of sub matrix to be copied
    1982             : !> \param ncols           number of cols of sub matrix to be copied
    1983             : !> \param s_firstrow      starting global row index of sub matrix in source
    1984             : !> \param s_firstcol      starting global col index of sub matrix in source
    1985             : !> \param d_firstrow      starting global row index of sub matrix in destination
    1986             : !> \param d_firstcol      starting global col index of sub matrix in destination
    1987             : !> \param global_context  process grid that covers all parts of either A or B.
    1988             : ! **************************************************************************************************
    1989        7974 :    SUBROUTINE cp_fm_to_fm_submat_general(source, &
    1990             :                                          destination, &
    1991             :                                          nrows, &
    1992             :                                          ncols, &
    1993             :                                          s_firstrow, &
    1994             :                                          s_firstcol, &
    1995             :                                          d_firstrow, &
    1996             :                                          d_firstcol, &
    1997             :                                          global_context)
    1998             : 
    1999             :       TYPE(cp_fm_type), INTENT(IN)                       :: source, destination
    2000             :       INTEGER, INTENT(IN)                                :: nrows, ncols, s_firstrow, s_firstcol, &
    2001             :                                                             d_firstrow, d_firstcol
    2002             : 
    2003             :       CLASS(cp_blacs_type), INTENT(IN)        :: global_context
    2004             : 
    2005             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'cp_fm_to_fm_submat_general'
    2006             : 
    2007             :       LOGICAL                                  :: debug
    2008             :       INTEGER                                  :: handle
    2009             : #if defined(__parallel)
    2010             :       INTEGER, DIMENSION(9)                    :: desca, descb
    2011             :       REAL(KIND=dp), DIMENSION(1, 1), TARGET   :: dummy
    2012        7974 :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: smat, dmat
    2013             : #endif
    2014             : 
    2015        7974 :       CALL timeset(routineN, handle)
    2016             : 
    2017        7974 :       debug = debug_this_module
    2018             : 
    2019             :       IF (.NOT. cp2k_is_parallel) THEN
    2020             :          CALL cp_fm_to_fm_submat(source, &
    2021             :                                  destination, &
    2022             :                                  nrows, &
    2023             :                                  ncols, &
    2024             :                                  s_firstrow, &
    2025             :                                  s_firstcol, &
    2026             :                                  d_firstrow, &
    2027             :                                  d_firstcol)
    2028             :       ELSE
    2029             : #ifdef __parallel
    2030        7974 :          NULLIFY (smat, dmat)
    2031             :          ! check whether source is available on this process
    2032        7974 :          IF (ASSOCIATED(source%matrix_struct)) THEN
    2033       79740 :             desca = source%matrix_struct%descriptor
    2034        7974 :             IF (source%use_sp) CPABORT("only DP kind implemented")
    2035        7974 :             IF (nrows .GT. source%matrix_struct%nrow_global) &
    2036           0 :                CPABORT("nrows is greater than nrow_global of source")
    2037        7974 :             IF (ncols .GT. source%matrix_struct%ncol_global) &
    2038           0 :                CPABORT("ncols is greater than ncol_global of source")
    2039        7974 :             smat => source%local_data
    2040             :          ELSE
    2041           0 :             desca = -1
    2042           0 :             smat => dummy
    2043             :          END IF
    2044             :          ! check destination is available on this process
    2045        7974 :          IF (ASSOCIATED(destination%matrix_struct)) THEN
    2046       79740 :             descb = destination%matrix_struct%descriptor
    2047        7974 :             IF (destination%use_sp) CPABORT("only DP kind implemented")
    2048        7974 :             IF (nrows .GT. destination%matrix_struct%nrow_global) &
    2049           0 :                CPABORT("nrows is greater than nrow_global of destination")
    2050        7974 :             IF (ncols .GT. destination%matrix_struct%ncol_global) &
    2051           0 :                CPABORT("ncols is greater than ncol_global of destination")
    2052        7974 :             dmat => destination%local_data
    2053             :          ELSE
    2054           0 :             descb = -1
    2055           0 :             dmat => dummy
    2056             :          END IF
    2057             :          ! do copy
    2058             : 
    2059             :          CALL pdgemr2d(nrows, &
    2060             :                        ncols, &
    2061             :                        smat, &
    2062             :                        s_firstrow, &
    2063             :                        s_firstcol, &
    2064             :                        desca, &
    2065             :                        dmat, &
    2066             :                        d_firstrow, &
    2067             :                        d_firstcol, &
    2068             :                        descb, &
    2069        7974 :                        global_context%get_handle())
    2070             : #else
    2071             :          MARK_USED(global_context)
    2072             :          CPABORT("this subroutine only supports SCALAPACK")
    2073             : #endif
    2074             :       END IF
    2075             : 
    2076        7974 :       CALL timestop(handle)
    2077             : 
    2078        7974 :    END SUBROUTINE cp_fm_to_fm_submat_general
    2079             : 
    2080             : ! **************************************************************************************************
    2081             : !> \brief ...
    2082             : !> \param matrix ...
    2083             : !> \param irow_global ...
    2084             : !> \param icol_global ...
    2085             : !> \param alpha ...
    2086             : ! **************************************************************************************************
    2087         240 :    SUBROUTINE cp_fm_add_to_element(matrix, irow_global, icol_global, alpha)
    2088             : 
    2089             :       ! Add alpha to the matrix element specified by the global indices
    2090             :       ! irow_global and icol_global
    2091             : 
    2092             :       ! - Creation (05.05.06,MK)
    2093             : 
    2094             :       TYPE(cp_fm_type), INTENT(IN)          :: matrix
    2095             :       INTEGER, INTENT(IN)                      :: irow_global, icol_global
    2096             :       REAL(KIND=dp), INTENT(IN)                :: alpha
    2097             : 
    2098             :       INTEGER                                  :: mypcol, myprow, npcol, nprow
    2099         240 :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: a
    2100             :       TYPE(cp_blacs_env_type), POINTER         :: context
    2101             : #if defined(__parallel)
    2102             :       INTEGER                                  :: icol_local, ipcol, iprow, &
    2103             :                                                   irow_local
    2104             :       INTEGER, DIMENSION(9)                    :: desca
    2105             : #endif
    2106             : 
    2107         240 :       context => matrix%matrix_struct%context
    2108             : 
    2109         240 :       myprow = context%mepos(1)
    2110         240 :       mypcol = context%mepos(2)
    2111             : 
    2112         240 :       nprow = context%num_pe(1)
    2113         240 :       npcol = context%num_pe(2)
    2114             : 
    2115         240 :       a => matrix%local_data
    2116             : 
    2117             : #if defined(__parallel)
    2118             : 
    2119        2400 :       desca(:) = matrix%matrix_struct%descriptor(:)
    2120             : 
    2121             :       CALL infog2l(irow_global, icol_global, desca, nprow, npcol, myprow, mypcol, &
    2122         240 :                    irow_local, icol_local, iprow, ipcol)
    2123             : 
    2124         240 :       IF ((iprow == myprow) .AND. (ipcol == mypcol)) THEN
    2125         120 :          a(irow_local, icol_local) = a(irow_local, icol_local) + alpha
    2126             :       END IF
    2127             : 
    2128             : #else
    2129             : 
    2130             :       a(irow_global, icol_global) = a(irow_global, icol_global) + alpha
    2131             : 
    2132             : #endif
    2133             : 
    2134         240 :    END SUBROUTINE cp_fm_add_to_element
    2135             : 
    2136             : ! **************************************************************************************************
    2137             : !> \brief ...
    2138             : !> \param fm ...
    2139             : !> \param unit ...
    2140             : ! **************************************************************************************************
    2141       76114 :    SUBROUTINE cp_fm_write_unformatted(fm, unit)
    2142             :       TYPE(cp_fm_type), INTENT(IN)             :: fm
    2143             :       INTEGER, INTENT(IN)                      :: unit
    2144             : 
    2145             :       CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_write_unformatted'
    2146             : 
    2147             :       INTEGER                                  :: handle, j, max_block, &
    2148             :                                                   ncol_global, nrow_global
    2149             :       TYPE(mp_para_env_type), POINTER          :: para_env
    2150             : #if defined(__parallel)
    2151             :       INTEGER                                  :: i, i_block, icol_local, &
    2152             :                                                   in, info, ipcol, &
    2153             :                                                   iprow, irow_local, &
    2154             :                                                   mepos, &
    2155             :                                                   num_pe, rb, tag
    2156             :       INTEGER, DIMENSION(9)                    :: desc
    2157       76114 :       REAL(KIND=dp), DIMENSION(:), POINTER     :: vecbuf
    2158             :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: newdat
    2159             :       TYPE(cp_blacs_type) :: ictxt_loc
    2160             :       INTEGER, EXTERNAL                        :: numroc
    2161             : #endif
    2162             : 
    2163       76114 :       CALL timeset(routineN, handle)
    2164             :       CALL cp_fm_get_info(fm, nrow_global=nrow_global, ncol_global=ncol_global, ncol_block=max_block, &
    2165       76114 :                           para_env=para_env)
    2166             : 
    2167             : #if defined(__parallel)
    2168       76114 :       num_pe = para_env%num_pe
    2169       76114 :       mepos = para_env%mepos
    2170       76114 :       rb = nrow_global
    2171       76114 :       tag = 0
    2172             :       ! get a new context
    2173       76114 :       CALL ictxt_loc%gridinit(para_env, 'R', 1, num_pe)
    2174       76114 :       CALL descinit(desc, nrow_global, ncol_global, rb, max_block, 0, 0, ictxt_loc%get_handle(), nrow_global, info)
    2175       76114 :       CPASSERT(info == 0)
    2176             :       ASSOCIATE (nprow => ictxt_loc%num_pe(1), npcol => ictxt_loc%num_pe(2), &
    2177             :                  myprow => ictxt_loc%mepos(1), mypcol => ictxt_loc%mepos(2))
    2178      152228 :          in = numroc(ncol_global, max_block, mypcol, 0, npcol)
    2179             : 
    2180      304456 :          ALLOCATE (newdat(nrow_global, MAX(1, in)))
    2181             : 
    2182             :          ! do the actual scalapack to cols reordering
    2183             :          CALL pdgemr2d(nrow_global, ncol_global, fm%local_data, 1, 1, &
    2184             :                        fm%matrix_struct%descriptor, &
    2185       76114 :                        newdat, 1, 1, desc, ictxt_loc%get_handle())
    2186             : 
    2187      228342 :          ALLOCATE (vecbuf(nrow_global*max_block))
    2188    56233821 :          vecbuf = HUGE(1.0_dp) ! init for valgrind
    2189             : 
    2190      284062 :          DO i = 1, ncol_global, MAX(max_block, 1)
    2191      131834 :             i_block = MIN(max_block, ncol_global - i + 1)
    2192             :             CALL infog2l(1, i, desc, nprow, npcol, myprow, mypcol, &
    2193      131834 :                          irow_local, icol_local, iprow, ipcol)
    2194      131834 :             IF (ipcol == mypcol) THEN
    2195      901996 :                DO j = 1, i_block
    2196   145738162 :                   vecbuf((j - 1)*nrow_global + 1:nrow_global*j) = newdat(:, icol_local + j - 1)
    2197             :                END DO
    2198             :             END IF
    2199             : 
    2200      131834 :             IF (ipcol == 0) THEN
    2201             :                ! do nothing
    2202             :             ELSE
    2203       54022 :                IF (ipcol == mypcol) THEN
    2204    35142597 :                   CALL para_env%send(vecbuf(:), 0, tag)
    2205             :                END IF
    2206       54022 :                IF (mypcol == 0) THEN
    2207    70258183 :                   CALL para_env%recv(vecbuf(:), ipcol, tag)
    2208             :                END IF
    2209             :             END IF
    2210             : 
    2211      207948 :             IF (unit > 0) THEN
    2212      872092 :                DO j = 1, i_block
    2213      872092 :                   WRITE (unit) vecbuf((j - 1)*nrow_global + 1:nrow_global*j)
    2214             :                END DO
    2215             :             END IF
    2216             : 
    2217             :          END DO
    2218             :       END ASSOCIATE
    2219       76114 :       DEALLOCATE (vecbuf)
    2220             : 
    2221       76114 :       CALL ictxt_loc%gridexit()
    2222             : 
    2223       76114 :       DEALLOCATE (newdat)
    2224             : 
    2225             : #else
    2226             : 
    2227             :       IF (unit > 0) THEN
    2228             :          DO j = 1, ncol_global
    2229             :             WRITE (unit) fm%local_data(:, j)
    2230             :          END DO
    2231             :       END IF
    2232             : 
    2233             : #endif
    2234       76114 :       CALL timestop(handle)
    2235             : 
    2236      380570 :    END SUBROUTINE cp_fm_write_unformatted
    2237             : 
    2238             : ! **************************************************************************************************
    2239             : !> \brief Write out a full matrix in plain text.
    2240             : !> \param fm the matrix to be outputted
    2241             : !> \param unit the unit number for I/O
    2242             : !> \param header optional header
    2243             : !> \param value_format ...
    2244             : ! **************************************************************************************************
    2245          10 :    SUBROUTINE cp_fm_write_formatted(fm, unit, header, value_format)
    2246             :       TYPE(cp_fm_type), INTENT(IN)             :: fm
    2247             :       INTEGER, INTENT(IN)                      :: unit
    2248             :       CHARACTER(LEN=*), INTENT(IN), OPTIONAL   :: header, value_format
    2249             : 
    2250             :       CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_write_formatted'
    2251             : 
    2252             :       CHARACTER(LEN=21)                        :: my_value_format
    2253             :       INTEGER                                  :: handle, i, j, max_block, &
    2254             :                                                   ncol_global, nrow_global
    2255             :       TYPE(mp_para_env_type), POINTER          :: para_env
    2256             : #if defined(__parallel)
    2257             :       INTEGER                                  :: i_block, icol_local, &
    2258             :                                                   in, info, ipcol, &
    2259             :                                                   iprow, irow_local, &
    2260             :                                                   mepos, num_pe, rb, tag, k, &
    2261             :                                                   icol, irow
    2262             :       INTEGER, DIMENSION(9)                    :: desc
    2263          10 :       REAL(KIND=dp), DIMENSION(:), POINTER     :: vecbuf
    2264          10 :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: newdat
    2265             :       TYPE(cp_blacs_type) :: ictxt_loc
    2266             :       INTEGER, EXTERNAL                        :: numroc
    2267             : #endif
    2268             : 
    2269          10 :       CALL timeset(routineN, handle)
    2270             :       CALL cp_fm_get_info(fm, nrow_global=nrow_global, ncol_global=ncol_global, ncol_block=max_block, &
    2271          10 :                           para_env=para_env)
    2272             : 
    2273          10 :       IF (PRESENT(value_format)) THEN
    2274           0 :          CPASSERT(LEN_TRIM(ADJUSTL(value_format)) < 11)
    2275           0 :          my_value_format = "(I10, I10, "//TRIM(ADJUSTL(value_format))//")"
    2276             :       ELSE
    2277          10 :          my_value_format = "(I10, I10, ES24.12)"
    2278             :       END IF
    2279             : 
    2280          10 :       IF (unit > 0) THEN
    2281           5 :          IF (PRESENT(header)) WRITE (unit, *) header
    2282           5 :          WRITE (unit, "(A2, A8, A10, A24)") "#", "Row", "Column", ADJUSTL("Value")
    2283             :       END IF
    2284             : 
    2285             : #if defined(__parallel)
    2286          10 :       num_pe = para_env%num_pe
    2287          10 :       mepos = para_env%mepos
    2288          10 :       rb = nrow_global
    2289          10 :       tag = 0
    2290             :       ! get a new context
    2291          10 :       CALL ictxt_loc%gridinit(para_env, 'R', 1, num_pe)
    2292          10 :       CALL descinit(desc, nrow_global, ncol_global, rb, max_block, 0, 0, ictxt_loc%get_handle(), nrow_global, info)
    2293          10 :       CPASSERT(info == 0)
    2294             :       ASSOCIATE (nprow => ictxt_loc%num_pe(1), npcol => ictxt_loc%num_pe(2), &
    2295             :                  myprow => ictxt_loc%mepos(1), mypcol => ictxt_loc%mepos(2))
    2296          20 :          in = numroc(ncol_global, max_block, mypcol, 0, npcol)
    2297             : 
    2298          40 :          ALLOCATE (newdat(nrow_global, MAX(1, in)))
    2299             : 
    2300             :          ! do the actual scalapack to cols reordering
    2301             :          CALL pdgemr2d(nrow_global, ncol_global, fm%local_data, 1, 1, &
    2302             :                        fm%matrix_struct%descriptor, &
    2303          10 :                        newdat, 1, 1, desc, ictxt_loc%get_handle())
    2304             : 
    2305          30 :          ALLOCATE (vecbuf(nrow_global*max_block))
    2306        1226 :          vecbuf = HUGE(1.0_dp) ! init for valgrind
    2307          10 :          irow = 1
    2308          10 :          icol = 1
    2309             : 
    2310          32 :          DO i = 1, ncol_global, MAX(max_block, 1)
    2311          12 :             i_block = MIN(max_block, ncol_global - i + 1)
    2312             :             CALL infog2l(1, i, desc, nprow, npcol, myprow, mypcol, &
    2313          12 :                          irow_local, icol_local, iprow, ipcol)
    2314          12 :             IF (ipcol == mypcol) THEN
    2315          36 :                DO j = 1, i_block
    2316        1256 :                   vecbuf((j - 1)*nrow_global + 1:nrow_global*j) = newdat(:, icol_local + j - 1)
    2317             :                END DO
    2318             :             END IF
    2319             : 
    2320          12 :             IF (ipcol == 0) THEN
    2321             :                ! do nothing
    2322             :             ELSE
    2323           2 :                IF (ipcol == mypcol) THEN
    2324           3 :                   CALL para_env%send(vecbuf(:), 0, tag)
    2325             :                END IF
    2326           2 :                IF (mypcol == 0) THEN
    2327           5 :                   CALL para_env%recv(vecbuf(:), ipcol, tag)
    2328             :                END IF
    2329             :             END IF
    2330             : 
    2331          22 :             IF (unit > 0) THEN
    2332          36 :                DO j = 1, i_block
    2333         646 :                   DO k = (j - 1)*nrow_global + 1, nrow_global*j
    2334         610 :                      WRITE (UNIT=unit, FMT=my_value_format) irow, icol, vecbuf(k)
    2335         610 :                      irow = irow + 1
    2336         640 :                      IF (irow > nrow_global) THEN
    2337          30 :                         irow = 1
    2338          30 :                         icol = icol + 1
    2339             :                      END IF
    2340             :                   END DO
    2341             :                END DO
    2342             :             END IF
    2343             : 
    2344             :          END DO
    2345             :       END ASSOCIATE
    2346          10 :       DEALLOCATE (vecbuf)
    2347             : 
    2348          10 :       CALL ictxt_loc%gridexit()
    2349             : 
    2350          10 :       DEALLOCATE (newdat)
    2351             : 
    2352             : #else
    2353             : 
    2354             :       IF (unit > 0) THEN
    2355             :          DO j = 1, ncol_global
    2356             :             DO i = 1, nrow_global
    2357             :                WRITE (UNIT=unit, FMT=my_value_format) i, j, fm%local_data(i, j)
    2358             :             END DO
    2359             :          END DO
    2360             :       END IF
    2361             : 
    2362             : #endif
    2363          10 :       CALL timestop(handle)
    2364             : 
    2365          50 :    END SUBROUTINE cp_fm_write_formatted
    2366             : 
    2367             : ! **************************************************************************************************
    2368             : !> \brief ...
    2369             : !> \param fm ...
    2370             : !> \param unit ...
    2371             : ! **************************************************************************************************
    2372        1136 :    SUBROUTINE cp_fm_read_unformatted(fm, unit)
    2373             :       TYPE(cp_fm_type), INTENT(INOUT)          :: fm
    2374             :       INTEGER, INTENT(IN)                      :: unit
    2375             : 
    2376             :       CHARACTER(len=*), PARAMETER :: routineN = 'cp_fm_read_unformatted'
    2377             : 
    2378             :       INTEGER                                  :: handle, j, max_block, &
    2379             :                                                   ncol_global, nrow_global
    2380             :       TYPE(mp_para_env_type), POINTER          :: para_env
    2381             : #if defined(__parallel)
    2382             :       INTEGER                                  :: k, n_cols
    2383             :       REAL(KIND=dp), DIMENSION(:, :), POINTER  :: vecbuf
    2384             : #endif
    2385             : 
    2386        1136 :       CALL timeset(routineN, handle)
    2387             : 
    2388             :       CALL cp_fm_get_info(fm, nrow_global=nrow_global, ncol_global=ncol_global, ncol_block=max_block, &
    2389        1136 :                           para_env=para_env)
    2390             : 
    2391             : #if defined(__parallel)
    2392             : 
    2393             :       ! the parallel case could be made more efficient (see cp_fm_write_unformatted)
    2394             : 
    2395        4544 :       ALLOCATE (vecbuf(nrow_global, max_block))
    2396             : 
    2397        3392 :       DO j = 1, ncol_global, max_block
    2398             : 
    2399        2256 :          n_cols = MIN(max_block, ncol_global - j + 1)
    2400        2256 :          IF (para_env%mepos == 0) THEN
    2401       29981 :             DO k = 1, n_cols
    2402       29981 :                READ (unit) vecbuf(:, k)
    2403             :             END DO
    2404             :          END IF
    2405    11386656 :          CALL para_env%bcast(vecbuf, 0)
    2406        3392 :          CALL cp_fm_set_submatrix(fm, vecbuf, start_row=1, start_col=j, n_cols=n_cols)
    2407             : 
    2408             :       END DO
    2409             : 
    2410        1136 :       DEALLOCATE (vecbuf)
    2411             : 
    2412             : #else
    2413             : 
    2414             :       DO j = 1, ncol_global
    2415             :          READ (unit) fm%local_data(:, j)
    2416             :       END DO
    2417             : 
    2418             : #endif
    2419             : 
    2420        1136 :       CALL timestop(handle)
    2421             : 
    2422        1136 :    END SUBROUTINE cp_fm_read_unformatted
    2423             : 
    2424             : ! **************************************************************************************************
    2425             : !> \brief ...
    2426             : !> \param mult_type ...
    2427             : ! **************************************************************************************************
    2428        9025 :    SUBROUTINE cp_fm_setup(mult_type)
    2429             :       INTEGER, INTENT(IN)                                :: mult_type
    2430             : 
    2431        9025 :       mm_type = mult_type
    2432        9025 :    END SUBROUTINE cp_fm_setup
    2433             : 
    2434             : ! **************************************************************************************************
    2435             : !> \brief ...
    2436             : !> \return ...
    2437             : ! **************************************************************************************************
    2438     1344662 :    FUNCTION cp_fm_get_mm_type() RESULT(res)
    2439             :       INTEGER                                            :: res
    2440             : 
    2441     1344662 :       res = mm_type
    2442     1344662 :    END FUNCTION
    2443             : 
    2444             : ! **************************************************************************************************
    2445             : !> \brief ...
    2446             : !> \param ictxt ...
    2447             : !> \param prec ...
    2448             : !> \return ...
    2449             : ! **************************************************************************************************
    2450          10 :    FUNCTION cp_fm_pilaenv(ictxt, prec) RESULT(res)
    2451             :       INTEGER                                            :: ictxt
    2452             :       CHARACTER(LEN=1)                                   :: prec
    2453             :       INTEGER                                            :: res
    2454             : #if defined(__parallel)
    2455             :       INTEGER                                            :: pilaenv
    2456          10 :       res = pilaenv(ictxt, prec)
    2457             : #else
    2458             :       MARK_USED(ictxt)
    2459             :       MARK_USED(prec)
    2460             :       res = -1
    2461             : #endif
    2462             : 
    2463          10 :    END FUNCTION
    2464             : 
    2465           0 : END MODULE cp_fm_types

Generated by: LCOV version 1.15