LCOV - code coverage report
Current view: top level - src/pw - realspace_grid_types.F (source / functions)
Test: CP2K Regtests (git:2fce0f8)
Date: 2024-12-21 06:28:57
                 Hit    Total    Coverage
Lines:           739      788      93.8 %
Functions:        19       27      70.4 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \note
      10             : !>      Basic type for real space grid methods
      11             : !> \par History
      12             : !>      JGH (22-May-2002) : New routine rs_grid_zero
      13             : !>      JGH (12-Jun-2002) : Bug fix for mpi groups
      14             : !>      JGH (19-Jun-2003) : Added routine for task distribution
      15             : !>      JGH (23-Nov-2003) : Added routine for task loop separation
      16             : !> \author JGH (18-Mar-2001)
      17             : ! **************************************************************************************************
      18             : MODULE realspace_grid_types
      19             :    USE cp_array_utils,                  ONLY: cp_1d_r_p_type
      20             :    USE cp_log_handling,                 ONLY: cp_to_string
      21             :    USE kahan_sum,                       ONLY: accurate_sum
      22             :    USE kinds,                           ONLY: dp,&
      23             :                                               int_8
      24             :    USE machine,                         ONLY: m_memory
      25             :    USE mathlib,                         ONLY: det_3x3
      26             :    USE message_passing,                 ONLY: mp_comm_null,&
      27             :                                               mp_comm_type,&
      28             :                                               mp_request_null,&
      29             :                                               mp_request_type,&
      30             :                                               mp_waitall,&
      31             :                                               mp_waitany
      32             :    USE offload_api,                     ONLY: offload_buffer_type,&
      33             :                                               offload_create_buffer,&
      34             :                                               offload_free_buffer
      35             :    USE pw_grid_types,                   ONLY: PW_MODE_LOCAL,&
      36             :                                               pw_grid_type
      37             :    USE pw_grids,                        ONLY: pw_grid_release,&
      38             :                                               pw_grid_retain
      39             :    USE pw_methods,                      ONLY: pw_integrate_function
      40             :    USE pw_types,                        ONLY: pw_r3d_rs_type
      41             :    USE util,                            ONLY: get_limit
      42             : 
      43             : !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
      44             : 
      45             : #include "../base/base_uses.f90"
      46             : 
      47             :    IMPLICIT NONE
      48             : 
      49             :    PRIVATE
      50             :    PUBLIC :: realspace_grid_type, &
      51             :              realspace_grid_desc_type, &
      52             :              realspace_grid_p_type, &
      53             :              realspace_grid_desc_p_type, &
      54             :              realspace_grid_input_type
      55             : 
      56             :    PUBLIC :: transfer_rs2pw, &
      57             :              transfer_pw2rs, &
      58             :              rs_grid_zero, &
      59             :              rs_grid_set_box, &
      60             :              rs_grid_create, &
      61             :              rs_grid_create_descriptor, &
      62             :              rs_grid_retain_descriptor, &
      63             :              rs_grid_release, &
      64             :              rs_grid_release_descriptor, &
      65             :              rs_grid_reorder_ranks, &
      66             :              rs_grid_print, &
      67             :              rs_grid_locate_rank, &
      68             :              rs_grid_max_ngpts, &
      69             :              rs_grid_mult_and_add, &
      70             :              map_gaussian_here
      71             : 
      72             :    INTEGER, PARAMETER, PUBLIC               :: rsgrid_distributed = 0, &
      73             :                                                rsgrid_replicated = 1, &
      74             :                                                rsgrid_automatic = 2
      75             : 
      76             :    LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
      77             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'realspace_grid_types'
      78             : 
      79             : ! **************************************************************************************************
      80             :    TYPE realspace_grid_input_type
      81             :       INTEGER       :: distribution_type = rsgrid_replicated
      82             :       INTEGER       :: distribution_layout(3) = -1
      83             :       REAL(KIND=dp) :: memory_factor = 0.0_dp
      84             :       LOGICAL       :: lock_distribution = .FALSE.
      85             :       INTEGER       :: nsmax = -1
      86             :       REAL(KIND=dp) :: halo_reduction_factor = 1.0_dp
      87             :    END TYPE realspace_grid_input_type
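
   A minimal sketch (illustrative, not part of the measured source) of how these settings
   might be filled before calling rs_grid_create_descriptor further below; the values are
   assumptions, the field names come from the type definition above:

      TYPE(realspace_grid_input_type) :: input_settings
      input_settings%distribution_type     = rsgrid_automatic  ! let the heuristic decide
      input_settings%distribution_layout   = (/-1, -1, -1/)    ! no constraint on the process layout
      input_settings%memory_factor         = 2.0_dp            ! tolerance used by rsgrid_automatic when
                                                                ! deciding whether to keep the grid replicated
      input_settings%nsmax                 = 25                ! used to derive the default halo width
      input_settings%halo_reduction_factor = 1.0_dp            ! keep the full halo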
      88             : 
      89             : ! **************************************************************************************************
      90             :    TYPE realspace_grid_desc_type
      91             :       TYPE(pw_grid_type), POINTER   :: pw => NULL() ! the pw grid
      92             : 
      93             :       INTEGER :: ref_count = 0 ! reference count
      94             : 
      95             :       INTEGER(int_8) :: ngpts = 0_int_8 ! # grid points
      96             :       INTEGER, DIMENSION(3) :: npts = 0 ! # grid points per dimension
      97             :       INTEGER, DIMENSION(3) :: lb = 0 ! lower bounds
      98             :       INTEGER, DIMENSION(3) :: ub = 0 ! upper bounds
      99             : 
     100             :       INTEGER :: border = 0 ! border points
     101             : 
     102             :       INTEGER, DIMENSION(3) :: perd = -1 ! periodicity enforced
     103             :       REAL(KIND=dp), DIMENSION(3, 3) :: dh = 0.0_dp ! incremental grid matrix
     104             :       REAL(KIND=dp), DIMENSION(3, 3) :: dh_inv = 0.0_dp ! inverse incremental grid matrix
     105             :       LOGICAL :: orthorhombic = .TRUE. ! grid symmetry
     106             : 
     107             :       LOGICAL :: parallel = .TRUE. ! whether the corresponding pw grid is distributed
     108             :       LOGICAL :: distributed = .TRUE. ! whether the rs grid is distributed
     109             :       ! these MPI related quantities are only meaningful depending on how the grid has been laid out
     110             :       ! they are most useful for fully distributed grids, where they reflect the topology of the grid
     111             :       TYPE(mp_comm_type) :: group = mp_comm_null
     112             :       INTEGER :: my_pos = -1
     113             :       INTEGER :: group_size = 0
     114             :       INTEGER, DIMENSION(3) :: group_dim = -1
     115             :       INTEGER, DIMENSION(3) :: group_coor = -1
     116             :       INTEGER, DIMENSION(3) :: neighbours = -1
     117             :       ! only meaningful on distributed grids
     118             :       ! a list of bounds for each CPU
     119             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: lb_global
     120             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: ub_global
     121             :       ! a mapping from linear rank to 3d coord
     122             :       INTEGER, DIMENSION(:, :), ALLOCATABLE :: rank2coord
     123             :       INTEGER, DIMENSION(:, :, :), ALLOCATABLE :: coord2rank
      124             :       ! a mapping from index to rank (which makes it easy to figure out on which rank a given grid point resides)
     125             :       INTEGER, DIMENSION(:), ALLOCATABLE :: x2coord
     126             :       INTEGER, DIMENSION(:), ALLOCATABLE :: y2coord
     127             :       INTEGER, DIMENSION(:), ALLOCATABLE :: z2coord
     128             : 
     129             :       INTEGER                :: my_virtual_pos = -1
     130             :       INTEGER, DIMENSION(3) :: virtual_group_coor = -1
     131             : 
     132             :       INTEGER, DIMENSION(:), ALLOCATABLE :: virtual2real, real2virtual
     133             : 
     134             :    END TYPE realspace_grid_desc_type
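
   A sketch (illustrative) of how the mapping arrays above are meant to be combined; they are
   only allocated for distributed grids. The x2coord/y2coord/z2coord maps give the process-grid
   coordinate of a global grid point, and coord2rank turns that coordinate into the owning rank:

      INTEGER :: ix, iy, iz, owner_rank
      ! for some global point (ix, iy, iz) within the grid bounds of a distributed descriptor desc
      owner_rank = desc%coord2rank(desc%x2coord(ix), desc%y2coord(iy), desc%z2coord(iz))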
     135             : 
     136             :    TYPE realspace_grid_type
     137             : 
     138             :       TYPE(realspace_grid_desc_type), POINTER :: desc => NULL()
     139             : 
     140             :       INTEGER :: ngpts_local = -1 ! local dimensions
     141             :       INTEGER, DIMENSION(3) :: npts_local = -1
     142             :       INTEGER, DIMENSION(3) :: lb_local = -1
     143             :       INTEGER, DIMENSION(3) :: ub_local = -1
     144             :       INTEGER, DIMENSION(3) :: lb_real = -1 ! lower bounds of the real local data
     145             :       INTEGER, DIMENSION(3) :: ub_real = -1 ! upper bounds of the real local data
     146             : 
     147             :       INTEGER, DIMENSION(:), ALLOCATABLE         :: px, py, pz ! index translators
     148             :       TYPE(offload_buffer_type)                  :: buffer = offload_buffer_type() ! owner of the grid's memory
     149             :       REAL(KIND=dp), DIMENSION(:, :, :), CONTIGUOUS, POINTER :: r => NULL() ! the grid (pointer to buffer%host_buffer)
     150             : 
     151             :    END TYPE realspace_grid_type
     152             : 
     153             : ! **************************************************************************************************
     154             :    TYPE realspace_grid_p_type
     155             :       TYPE(realspace_grid_type), POINTER :: rs_grid => NULL()
     156             :    END TYPE realspace_grid_p_type
     157             : 
     158             :    TYPE realspace_grid_desc_p_type
     159             :       TYPE(realspace_grid_desc_type), POINTER :: rs_desc => NULL()
     160             :    END TYPE realspace_grid_desc_p_type
     161             : 
     162             : CONTAINS
     163             : 
     164             : ! **************************************************************************************************
     165             : !> \brief returns the 1D rank of the task which is a cartesian shift away from 1D rank rank_in
     166             : !>        only possible if rs_grid is a distributed grid
     167             : !> \param rs_desc ...
     168             : !> \param rank_in ...
     169             : !> \param shift ...
     170             : !> \return ...
     171             : ! **************************************************************************************************
     172        2346 :    PURE FUNCTION rs_grid_locate_rank(rs_desc, rank_in, shift) RESULT(rank_out)
     173             :       TYPE(realspace_grid_desc_type), INTENT(IN)         :: rs_desc
     174             :       INTEGER, INTENT(IN)                                :: rank_in
     175             :       INTEGER, DIMENSION(3), INTENT(IN)                  :: shift
     176             :       INTEGER                                            :: rank_out
     177             : 
     178             :       INTEGER                                            :: coord(3)
     179             : 
     180        9384 :       coord = MODULO(rs_desc%rank2coord(:, rank_in) + shift, rs_desc%group_dim)
     181        2346 :       rank_out = rs_desc%coord2rank(coord(1), coord(2), coord(3))
     182        2346 :    END FUNCTION rs_grid_locate_rank
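
   A usage sketch (illustrative): locate the rank one step along +x in the process grid from
   this task's virtual position, with periodic wrap-around provided by the MODULO above; as
   noted in the header, this is only meaningful for a distributed descriptor.

      INTEGER :: right_neighbour
      right_neighbour = rs_grid_locate_rank(rs_desc, rs_desc%my_virtual_pos, (/1, 0, 0/))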
     183             : 
     184             : ! **************************************************************************************************
     185             : !> \brief Determine the setup of real space grids - this is divided up into the
     186             : !>        creation of a descriptor and the actual grid itself (see rs_grid_create)
     187             : !> \param desc ...
     188             : !> \param pw_grid ...
     189             : !> \param input_settings ...
     190             : !> \param border_points ...
     191             : !> \par History
     192             : !>      JGH (08-Jun-2003) : nsmax <= 0 indicates fully replicated grid
     193             : !>      Iain Bethune (05-Sep-2008) : modified cut heuristic
     194             : !>      (c) The Numerical Algorithms Group (NAG) Ltd, 2008 on behalf of the HECToR project
     195             : !>      - Create a descriptor for realspace grids with a number of border
     196             : !>        points as exactly given by the optional argument border_points.
     197             : !>        These grids are always distributed.
     198             : !>        (27.11.2013, Matthias Krack)
     199             : !> \author JGH (18-Mar-2001)
     200             : ! **************************************************************************************************
     201       31718 :    SUBROUTINE rs_grid_create_descriptor(desc, pw_grid, input_settings, border_points)
     202             :       TYPE(realspace_grid_desc_type), POINTER            :: desc
     203             :       TYPE(pw_grid_type), INTENT(INOUT), TARGET          :: pw_grid
     204             :       TYPE(realspace_grid_input_type), INTENT(IN)        :: input_settings
     205             :       INTEGER, INTENT(IN), OPTIONAL                      :: border_points
     206             : 
     207             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_grid_create_descriptor'
     208             : 
     209             :       INTEGER                                            :: border_size, dir, handle, i, j, k, l, &
     210             :                                                             lb(2), min_npts_real, n_slices(3), &
     211             :                                                             n_slices_tmp(3), nmin
     212             :       LOGICAL                                            :: overlap
     213             :       REAL(KIND=dp)                                      :: ratio, ratio_best, volume, volume_dist
     214             : 
     215       31718 :       CALL timeset(routineN, handle)
     216             : 
     217       31718 :       IF (PRESENT(border_points)) THEN
     218         128 :          border_size = border_points
     219             :       ELSE
     220             :          border_size = 0
     221             :       END IF
     222             : 
     223     1554182 :       ALLOCATE (desc)
     224             : 
     225       31718 :       CALL pw_grid%para%group%sync()
     226             : 
     227       31718 :       desc%pw => pw_grid
     228       31718 :       CALL pw_grid_retain(desc%pw)
     229             : 
     230      412334 :       desc%dh = pw_grid%dh
     231      412334 :       desc%dh_inv = pw_grid%dh_inv
     232       31718 :       desc%orthorhombic = pw_grid%orthorhombic
     233       31718 :       desc%ref_count = 1
     234             : 
     235       31718 :       IF (pw_grid%para%mode == PW_MODE_LOCAL) THEN
     236             :          ! The corresponding group has dimension 1
     237             :          ! All operations will be done locally
     238       16328 :          desc%npts = pw_grid%npts
     239       16328 :          desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
     240       16328 :          desc%lb = pw_grid%bounds(1, :)
     241       16328 :          desc%ub = pw_grid%bounds(2, :)
     242        4082 :          desc%border = border_size
     243        4082 :          IF (border_size == 0) THEN
     244       16328 :             desc%perd = 1
     245             :          ELSE
     246           0 :             desc%perd = 0
     247             :          END IF
     248        4082 :          desc%parallel = .FALSE.
     249        4082 :          desc%distributed = .FALSE.
     250        4082 :          desc%group = mp_comm_null
     251        4082 :          desc%group_size = 1
     252       16328 :          desc%group_dim = 1
     253       16328 :          desc%group_coor = 0
     254        4082 :          desc%my_pos = 0
     255             :       ELSE
     256             :          ! group size of desc grid
     257             :          ! global grid dimensions are still the same
     258       27636 :          desc%group_size = pw_grid%para%group%num_pe
     259      110544 :          desc%npts = pw_grid%npts
     260      110544 :          desc%ngpts = PRODUCT(INT(desc%npts, KIND=int_8))
     261      110544 :          desc%lb = pw_grid%bounds(1, :)
     262      110544 :          desc%ub = pw_grid%bounds(2, :)
     263             : 
     264             :          ! this is the eventual border size
     265       27636 :          IF (border_size == 0) THEN
     266       27508 :             nmin = (input_settings%nsmax + 1)/2
     267       27508 :             nmin = MAX(0, NINT(nmin*input_settings%halo_reduction_factor))
     268             :          ELSE
     269             :             ! Set explicitly the requested border size
     270             :             nmin = border_size
     271             :          END IF
     272             : 
     273       27636 :          IF (input_settings%distribution_type == rsgrid_replicated) THEN
     274             : 
     275       44280 :             n_slices = 1
     276       11070 :             IF (border_size > 0) THEN
     277             :                CALL cp_abort(__LOCATION__, &
     278             :                              "An explicit border size > 0 is not yet working for "// &
     279             :                              "replicated realspace grids. Request DISTRIBUTION_TYPE "// &
     280           0 :                              "distributed for RS_GRID explicitly.")
     281             :             END IF
     282             : 
     283             :          ELSE
     284             : 
     285       66264 :             n_slices = 1
     286       16566 :             ratio_best = -HUGE(ratio_best)
     287             : 
     288             :             ! don't allow distributions with more processors than real grid points
     289       49698 :             DO k = 1, MIN(desc%npts(3), desc%group_size)
     290      115962 :             DO j = 1, MIN(desc%npts(2), desc%group_size)
     291       66264 :                i = MIN(desc%npts(1), desc%group_size/(j*k))
     292      265056 :                n_slices_tmp = (/i, j, k/)
     293             : 
      294             :                ! skip combinations that do not match the actual number of CPUs
     295      265056 :                IF (PRODUCT(n_slices_tmp) .NE. desc%group_size) CYCLE
     296             : 
      297             :                ! check whether there has been an input constraint
      298             :                ! i.e. if the layout is not -1 we need to fulfil it
     299      397842 :                IF (.NOT. ALL(PACK(n_slices_tmp == input_settings%distribution_layout, &
     300             :                                   (/-1, -1, -1/) /= input_settings%distribution_layout) &
     301       49698 :                              )) CYCLE
     302             : 
     303             :                ! We can not work with a grid that has more local than global grid points.
     304             :                ! This can happen when a halo region wraps around and overlaps with the other halo.
     305       49472 :                overlap = .FALSE.
     306      197888 :                DO dir = 1, 3
     307      197888 :                   IF (n_slices_tmp(dir) > 1) THEN
     308      148416 :                      DO l = 0, n_slices_tmp(dir) - 1
     309       98944 :                         lb = get_limit(desc%npts(dir), n_slices_tmp(dir), l)
     310      148416 :                         IF (lb(2) - lb(1) + 1 + 2*nmin > desc%npts(dir)) overlap = .TRUE.
     311             :                      END DO
     312             :                   END IF
     313             :                END DO
     314       49472 :                IF (overlap) CYCLE
     315             : 
     316             :                ! a heuristic optimisation to reduce the memory usage
     317             :                ! we go for the smallest local to real volume
     318             :                ! volume of the box without the wings / volume of the box with the wings
      319             :                ! with prefactors to promote fewer cuts in the Z dimension
     320             :                ratio = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp)/ &
     321             :                        PRODUCT(REAL(desc%npts, KIND=dp)/n_slices_tmp + &
     322       58716 :                                MERGE((/0.0, 0.0, 0.0/), 2*(/1.06*nmin, 1.05*nmin, 1.03*nmin/), n_slices_tmp == (/1, 1, 1/)))
     323       41520 :                IF (ratio > ratio_best) THEN
     324        8372 :                   ratio_best = ratio
     325        8372 :                   n_slices = n_slices_tmp
     326             :                END IF
     327             : 
     328             :             END DO
     329             :             END DO
     330             : 
     331             :             ! if automatic we can still decide this is a replicated grid
      332             :             ! if the memory gain (or the gain in messages) is too small.
     333       16566 :             IF (input_settings%distribution_type == rsgrid_automatic) THEN
     334       64064 :                volume = PRODUCT(REAL(desc%npts, KIND=dp))
     335             :                volume_dist = PRODUCT(REAL(desc%npts, KIND=dp)/n_slices + &
     336       64064 :                                      MERGE((/0, 0, 0/), 2*(/nmin, nmin, nmin/), n_slices == (/1, 1, 1/)))
     337       16016 :                IF (volume < volume_dist*input_settings%memory_factor) THEN
     338       64064 :                   n_slices = 1
     339             :                END IF
     340             :             END IF
     341             : 
     342             :          END IF
     343             : 
     344      110544 :          desc%group_dim(:) = n_slices(:)
     345       27636 :          CALL desc%group%from_dup(pw_grid%para%group)
     346       27636 :          desc%group_size = desc%group%num_pe
     347       27636 :          desc%my_pos = desc%group%mepos
     348             : 
     349      110366 :          IF (ALL(n_slices == 1)) THEN
     350             :             ! CASE 1 : only one slice: we do not need overlapping regions and special
     351             :             !          recombination of the total density
     352       27478 :             desc%border = border_size
     353       27478 :             IF (border_size == 0) THEN
     354      109912 :                desc%perd = 1
     355             :             ELSE
     356           0 :                desc%perd = 0
     357             :             END IF
     358       27478 :             desc%distributed = .FALSE.
     359       27478 :             desc%parallel = .TRUE.
     360      109912 :             desc%group_coor(:) = 0
     361       27478 :             desc%my_virtual_pos = 0
     362             : 
     363       82434 :             ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
     364       82434 :             ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
     365             :             ! Start with no reordering
     366       82434 :             DO i = 0, desc%group_size - 1
     367       54956 :                desc%virtual2real(i) = i
     368       82434 :                desc%real2virtual(i) = i
     369             :             END DO
     370             :          ELSE
     371             :             ! CASE 2 : general case
      372             :             ! periodicity is no longer enforced in arbitrary directions
     373         158 :             IF (border_size == 0) THEN
     374         120 :                desc%perd = 1
     375         120 :                DO dir = 1, 3
     376         120 :                   IF (n_slices(dir) > 1) desc%perd(dir) = 0
     377             :                END DO
     378             :             ELSE
     379         512 :                desc%perd(:) = 0
     380             :             END IF
     381             :             ! we keep a border of nmin points
     382         158 :             desc%border = nmin
     383             :             ! we are going parallel on the real space grid
     384         158 :             desc%parallel = .TRUE.
     385         158 :             desc%distributed = .TRUE.
     386             : 
     387             :             ! set up global info about the distribution
     388         474 :             ALLOCATE (desc%rank2coord(3, 0:desc%group_size - 1))
     389         790 :             ALLOCATE (desc%coord2rank(0:desc%group_dim(1) - 1, 0:desc%group_dim(2) - 1, 0:desc%group_dim(3) - 1))
     390         474 :             ALLOCATE (desc%lb_global(3, 0:desc%group_size - 1))
     391         474 :             ALLOCATE (desc%ub_global(3, 0:desc%group_size - 1))
     392         474 :             ALLOCATE (desc%x2coord(desc%lb(1):desc%ub(1)))
     393         474 :             ALLOCATE (desc%y2coord(desc%lb(2):desc%ub(2)))
     394         474 :             ALLOCATE (desc%z2coord(desc%lb(3):desc%ub(3)))
     395             : 
     396         474 :             DO i = 0, desc%group_size - 1
     397             :                ! Calculate coordinates in a row-major order (to be SMP-friendly)
     398         316 :                desc%rank2coord(1, i) = i/(desc%group_dim(2)*desc%group_dim(3))
     399             :                desc%rank2coord(2, i) = MODULO(i, desc%group_dim(2)*desc%group_dim(3)) &
     400         316 :                                        /desc%group_dim(3)
     401         316 :                desc%rank2coord(3, i) = MODULO(i, desc%group_dim(3))
     402             : 
     403         316 :                IF (i == desc%my_pos) THEN
     404         632 :                   desc%group_coor = desc%rank2coord(:, i)
     405             :                END IF
     406             : 
     407         316 :                desc%coord2rank(desc%rank2coord(1, i), desc%rank2coord(2, i), desc%rank2coord(3, i)) = i
     408             :                ! the lb_global and ub_global correspond to lb_real and ub_real of each task
     409        1264 :                desc%lb_global(:, i) = desc%lb
     410        1264 :                desc%ub_global(:, i) = desc%ub
     411        1422 :                DO dir = 1, 3
     412        1264 :                   IF (desc%group_dim(dir) .GT. 1) THEN
     413         316 :                      lb = get_limit(desc%npts(dir), desc%group_dim(dir), desc%rank2coord(dir, i))
     414         316 :                      desc%lb_global(dir, i) = lb(1) + desc%lb(dir) - 1
     415         316 :                      desc%ub_global(dir, i) = lb(2) + desc%lb(dir) - 1
     416             :                   END IF
     417             :                END DO
     418             :             END DO
     419             : 
     420             :             ! map a grid point to a CPU coord
     421         632 :             DO dir = 1, 3
     422        1264 :                DO l = 0, desc%group_dim(dir) - 1
     423         632 :                   IF (desc%group_dim(dir) .GT. 1) THEN
     424         316 :                      lb = get_limit(desc%npts(dir), desc%group_dim(dir), l)
     425         948 :                      lb = lb + desc%lb(dir) - 1
     426             :                   ELSE
     427         316 :                      lb(1) = desc%lb(dir)
     428         316 :                      lb(2) = desc%ub(dir)
     429             :                   END IF
     430         474 :                   SELECT CASE (dir)
     431             :                   CASE (1)
     432       11696 :                      desc%x2coord(lb(1):lb(2)) = l
     433             :                   CASE (2)
     434       12104 :                      desc%y2coord(lb(1):lb(2)) = l
     435             :                   CASE (3)
     436       12200 :                      desc%z2coord(lb(1):lb(2)) = l
     437             :                   END SELECT
     438             :                END DO
     439             :             END DO
     440             : 
     441             :             ! an upper bound for the number of neighbours the border is overlapping with
     442         632 :             DO dir = 1, 3
     443         474 :                desc%neighbours(dir) = 0
     444         632 :                IF ((n_slices(dir) > 1) .OR. (border_size > 0)) THEN
     445         414 :                   min_npts_real = HUGE(0)
     446         986 :                   DO l = 0, n_slices(dir) - 1
     447         572 :                      lb = get_limit(desc%npts(dir), n_slices(dir), l)
     448         986 :                      min_npts_real = MIN(lb(2) - lb(1) + 1, min_npts_real)
     449             :                   END DO
     450         414 :                   desc%neighbours(dir) = (desc%border + min_npts_real - 1)/min_npts_real
     451             :                END IF
     452             :             END DO
     453             : 
     454         474 :             ALLOCATE (desc%virtual2real(0:desc%group_size - 1))
     455         474 :             ALLOCATE (desc%real2virtual(0:desc%group_size - 1))
     456             :             ! Start with no reordering
     457         474 :             DO i = 0, desc%group_size - 1
     458         316 :                desc%virtual2real(i) = i
     459         474 :                desc%real2virtual(i) = i
     460             :             END DO
     461             : 
     462         158 :             desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
     463         632 :             desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
     464             : 
     465             :          END IF
     466             :       END IF
     467             : 
     468       31718 :       CALL timestop(handle)
     469             : 
     470       31718 :    END SUBROUTINE rs_grid_create_descriptor
     471             : 
     472             : ! **************************************************************************************************
     473             : !> \brief ...
     474             : !> \param rs ...
     475             : !> \param desc ...
     476             : ! **************************************************************************************************
     477     4926054 :    SUBROUTINE rs_grid_create(rs, desc)
     478             :       TYPE(realspace_grid_type), INTENT(OUT)             :: rs
     479             :       TYPE(realspace_grid_desc_type), INTENT(INOUT), &
     480             :          TARGET                                          :: desc
     481             : 
     482             :       CHARACTER(LEN=*), PARAMETER                        :: routineN = 'rs_grid_create'
     483             : 
     484             :       INTEGER                                            :: handle
     485             : 
     486      234574 :       CALL timeset(routineN, handle)
     487             : 
     488      234574 :       rs%desc => desc
     489      234574 :       CALL rs_grid_retain_descriptor(rs%desc)
     490             : 
     491      234574 :       IF (desc%pw%para%mode == PW_MODE_LOCAL) THEN
     492             :          ! The corresponding group has dimension 1
     493             :          ! All operations will be done locally
     494       63928 :          rs%lb_real = desc%lb
     495       63928 :          rs%ub_real = desc%ub
     496       63928 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     497       63928 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     498       63928 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     499       63928 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     500             :       END IF
     501             : 
     502      935924 :       IF (ALL(rs%desc%group_dim == 1)) THEN
     503             :          ! CASE 1 : only one slice: we do not need overlapping regions and special
     504             :          !          recombination of the total density
     505      931288 :          rs%lb_real = desc%lb
     506      931288 :          rs%ub_real = desc%ub
     507      931288 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     508      931288 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     509      931288 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     510      931288 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     511             :       ELSE
     512             :          ! CASE 2 : general case
     513             :          ! extract some more derived quantities about the local grid
     514        7008 :          rs%lb_real = desc%lb_global(:, desc%my_virtual_pos)
     515        7008 :          rs%ub_real = desc%ub_global(:, desc%my_virtual_pos)
     516        7008 :          rs%lb_local = rs%lb_real - desc%border*(1 - desc%perd)
     517        7008 :          rs%ub_local = rs%ub_real + desc%border*(1 - desc%perd)
     518        7008 :          rs%npts_local = rs%ub_local - rs%lb_local + 1
     519        7008 :          rs%ngpts_local = PRODUCT(rs%npts_local)
     520             :       END IF
     521             : 
     522      234574 :       CALL offload_create_buffer(rs%ngpts_local, rs%buffer)
     523             :       rs%r(rs%lb_local(1):rs%ub_local(1), &
     524             :            rs%lb_local(2):rs%ub_local(2), &
     525      234574 :            rs%lb_local(3):rs%ub_local(3)) => rs%buffer%host_buffer
     526             : 
     527      703722 :       ALLOCATE (rs%px(desc%npts(1)))
     528      703722 :       ALLOCATE (rs%py(desc%npts(2)))
     529      703722 :       ALLOCATE (rs%pz(desc%npts(3)))
     530             : 
     531      234574 :       CALL timestop(handle)
     532             : 
     533      234574 :    END SUBROUTINE rs_grid_create
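
   A sketch (illustrative) of the two-step setup described in the header of
   rs_grid_create_descriptor: first a descriptor is created from an existing pw_grid, then one
   or more grids are created on that descriptor; pw_grid and input_settings are assumed to be
   already set up.

      TYPE(realspace_grid_desc_type), POINTER :: rs_desc
      TYPE(realspace_grid_type)               :: rs

      NULLIFY (rs_desc)
      CALL rs_grid_create_descriptor(rs_desc, pw_grid, input_settings)
      CALL rs_grid_create(rs, rs_desc)
      CALL rs_grid_zero(rs)
      ! ... work with rs%r ...
      CALL rs_grid_release(rs)
      CALL rs_grid_release_descriptor(rs_desc)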
     534             : 
     535             : ! **************************************************************************************************
     536             : !> \brief Defines a new ordering of ranks on this realspace grid, recalculating
     537             : !>        the data bounds and reallocating the grid.  As a result, each MPI process
     538             : !>        now has a real rank (i.e., its rank in the MPI communicator from the pw grid)
     539             : !>        and a virtual rank (the rank of the process where the data now owned by this
     540             : !>        process would reside in an ordinary cartesian distribution).
     541             : !>        NB. Since the grid size required may change, the caller should be sure to release
     542             : !>        and recreate the corresponding rs_grids
     543             : !>        The desc%real2virtual and desc%virtual2real arrays can be used to map
     544             : !>        a physical rank to the 'rank' of data owned by that process and vice versa
     545             : !> \param desc ...
     546             : !> \param real2virtual ...
     547             : !> \par History
     548             : !>        04-2009 created [Iain Bethune]
     549             : !>          (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
     550             : ! **************************************************************************************************
     551           6 :    PURE SUBROUTINE rs_grid_reorder_ranks(desc, real2virtual)
     552             : 
     553             :       TYPE(realspace_grid_desc_type), INTENT(INOUT)      :: desc
     554             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: real2virtual
     555             : 
     556             :       INTEGER                                            :: i
     557             : 
     558          18 :       desc%real2virtual(:) = real2virtual
     559             : 
     560          18 :       DO i = 0, desc%group_size - 1
     561          18 :          desc%virtual2real(desc%real2virtual(i)) = i
     562             :       END DO
     563             : 
     564           6 :       desc%my_virtual_pos = desc%real2virtual(desc%my_pos)
     565             : 
     566          12 :       IF (.NOT. ALL(desc%group_dim == 1)) THEN
     567          24 :          desc%virtual_group_coor(:) = desc%rank2coord(:, desc%my_virtual_pos)
     568             :       END IF
     569             : 
     570           6 :    END SUBROUTINE
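
   A sketch (illustrative) of a reordering: real2virtual assigns to each real rank the virtual
   rank whose data it should own; here the ownership is shifted cyclically by one. As noted in
   the header above, any grids built on this descriptor must be released and recreated afterwards.

      INTEGER, DIMENSION(:), ALLOCATABLE :: new_real2virtual
      INTEGER :: i

      ALLOCATE (new_real2virtual(0:desc%group_size - 1))
      DO i = 0, desc%group_size - 1
         new_real2virtual(i) = MODULO(i + 1, desc%group_size)
      END DO
      CALL rs_grid_reorder_ranks(desc, new_real2virtual)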
     571             : 
     572             : ! **************************************************************************************************
     573             : !> \brief Print information on grids to output
     574             : !> \param rs ...
     575             : !> \param iounit ...
     576             : !> \author JGH (17-May-2007)
     577             : ! **************************************************************************************************
     578       13782 :    SUBROUTINE rs_grid_print(rs, iounit)
     579             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     580             :       INTEGER, INTENT(in)                                :: iounit
     581             : 
     582             :       INTEGER                                            :: dir, i, nn
     583             :       REAL(KIND=dp)                                      :: pp(3)
     584             : 
     585       13782 :       IF (rs%desc%parallel) THEN
     586       13500 :          IF (iounit > 0) THEN
     587             :             WRITE (iounit, '(/,A,T71,I10)') &
     588        6018 :                " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
     589       24072 :             DO i = 1, 3
     590       18054 :                WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID|   Bounds ", &
     591       42126 :                   i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
     592             :             END DO
     593        6018 :             IF (.NOT. rs%desc%distributed) THEN
     594        6003 :                WRITE (iounit, '(A)') " RS_GRID| Real space fully replicated"
     595             :                WRITE (iounit, '(A,T71,I10)') &
     596        6003 :                   " RS_GRID| Group size ", rs%desc%group_dim(2)
     597             :             ELSE
     598          60 :                DO dir = 1, 3
     599          60 :                   IF (rs%desc%perd(dir) /= 1) THEN
     600             :                      WRITE (iounit, '(A,T71,I3,A)') &
     601          15 :                         " RS_GRID| Real space distribution over ", rs%desc%group_dim(dir), " groups"
     602             :                      WRITE (iounit, '(A,T71,I10)') &
     603          15 :                         " RS_GRID| Real space distribution along direction ", dir
     604             :                      WRITE (iounit, '(A,T71,I10)') &
     605          15 :                         " RS_GRID| Border size ", rs%desc%border
     606             :                   END IF
     607             :                END DO
     608             :             END IF
     609             :          END IF
     610       13500 :          IF (rs%desc%distributed) THEN
     611         120 :             DO dir = 1, 3
     612         120 :                IF (rs%desc%perd(dir) /= 1) THEN
     613          30 :                   nn = rs%npts_local(dir)
     614          30 :                   CALL rs%desc%group%sum(nn)
     615         120 :                   pp(1) = REAL(nn, KIND=dp)/REAL(PRODUCT(rs%desc%group_dim), KIND=dp)
     616          30 :                   nn = rs%npts_local(dir)
     617          30 :                   CALL rs%desc%group%max(nn)
     618          30 :                   pp(2) = REAL(nn, KIND=dp)
     619          30 :                   nn = rs%npts_local(dir)
     620          30 :                   CALL rs%desc%group%min(nn)
     621          30 :                   pp(3) = REAL(nn, KIND=dp)
     622          30 :                   IF (iounit > 0) THEN
     623          15 :                      WRITE (iounit, '(A,T48,A)') " RS_GRID|   Distribution", &
     624          30 :                         "  Average         Max         Min"
     625          15 :                      WRITE (iounit, '(A,T45,F12.1,2I12)') " RS_GRID|   Planes   ", &
     626          30 :                         pp(1), NINT(pp(2)), NINT(pp(3))
     627             :                   END IF
     628             :                END IF
     629             :             END DO
     630             : !          WRITE ( iounit, '(/)' )
     631             :          END IF
     632             :       ELSE
     633         282 :          IF (iounit > 0) THEN
     634             :             WRITE (iounit, '(/,A,T71,I10)') &
     635         172 :                " RS_GRID| Information for grid number ", rs%desc%pw%id_nr
     636         688 :             DO i = 1, 3
     637         516 :                WRITE (iounit, '(A,I3,T30,2I8,T62,A,T71,I10)') " RS_GRID|   Bounds ", &
     638        1204 :                   i, rs%desc%lb(i), rs%desc%ub(i), "Points:", rs%desc%npts(i)
     639             :             END DO
     640             : !         WRITE ( iounit, '(/)' )
     641             :          END IF
     642             :       END IF
     643             : 
     644       13782 :    END SUBROUTINE rs_grid_print
     645             : 
     646             : ! **************************************************************************************************
     647             : !> \brief ...
     648             : !> \param rs ...
     649             : !> \param pw ...
     650             : ! **************************************************************************************************
     651     1059308 :    SUBROUTINE transfer_rs2pw(rs, pw)
     652             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     653             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw
     654             : 
     655             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'transfer_rs2pw'
     656             : 
     657             :       INTEGER                                            :: handle, handle2, i
     658             : 
     659     1059308 :       CALL timeset(routineN, handle2)
     660     1059308 :       CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
     661             : 
     662     1059308 :       IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
     663           0 :          CPABORT("Different rs and pw indentifiers")
     664             : 
     665     1059308 :       IF (rs%desc%distributed) THEN
     666        1828 :          CALL transfer_rs2pw_distributed(rs, pw)
     667     1057480 :       ELSE IF (rs%desc%parallel) THEN
     668      848130 :          CALL transfer_rs2pw_replicated(rs, pw)
     669             :       ELSE ! treat simple serial case locally
     670      209350 :          IF (rs%desc%border == 0) THEN
     671      837400 :             CALL dcopy(SIZE(rs%r), rs%r, 1, pw%array, 1)
     672             :          ELSE
     673           0 :             CPASSERT(LBOUND(pw%array, 3) .EQ. rs%lb_real(3))
     674           0 : !$OMP          PARALLEL DO DEFAULT(NONE) SHARED(pw,rs)
     675             :             DO i = rs%lb_real(3), rs%ub_real(3)
     676             :                pw%array(:, :, i) = rs%r(rs%lb_real(1):rs%ub_real(1), &
     677             :                                         rs%lb_real(2):rs%ub_real(2), i)
     678             :             END DO
     679             : !$OMP          END PARALLEL DO
     680             :          END IF
     681             :       END IF
     682             : 
     683     1059308 :       CALL timestop(handle)
     684     1059308 :       CALL timestop(handle2)
     685             : 
     686     1059308 :    END SUBROUTINE transfer_rs2pw
     687             : 
     688             : ! **************************************************************************************************
     689             : !> \brief ...
     690             : !> \param rs ...
     691             : !> \param pw ...
     692             : ! **************************************************************************************************
     693     1030469 :    SUBROUTINE transfer_pw2rs(rs, pw)
     694             : 
     695             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     696             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     697             : 
     698             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'transfer_pw2rs'
     699             : 
     700             :       INTEGER                                            :: handle, handle2, i, im, j, jm, k, km
     701             : 
     702     1030469 :       CALL timeset(routineN, handle2)
     703     1030469 :       CALL timeset(routineN//"_"//TRIM(ADJUSTL(cp_to_string(CEILING(pw%pw_grid%cutoff/10)*10))), handle)
     704             : 
     705     1030469 :       IF (.NOT. ASSOCIATED(rs%desc%pw, pw%pw_grid)) &
     706           0 :          CPABORT("Different rs and pw indentifiers")
     707             : 
     708     1030469 :       IF (rs%desc%distributed) THEN
     709         864 :          CALL transfer_pw2rs_distributed(rs, pw)
     710     1029605 :       ELSE IF (rs%desc%parallel) THEN
     711      785312 :          CALL transfer_pw2rs_replicated(rs, pw)
     712             :       ELSE ! treat simple serial case locally
     713      244293 :          IF (rs%desc%border == 0) THEN
     714      977172 :             CALL dcopy(SIZE(rs%r), pw%array, 1, rs%r, 1)
     715             :          ELSE
     716             : !$OMP          PARALLEL DO DEFAULT(NONE) &
     717             : !$OMP                      PRIVATE(i,im,j,jm,k,km) &
     718           0 : !$OMP                      SHARED(pw,rs)
     719             :             DO k = rs%lb_local(3), rs%ub_local(3)
     720             :                IF (k < rs%lb_real(3)) THEN
     721             :                   km = k + rs%desc%npts(3)
     722             :                ELSE IF (k > rs%ub_real(3)) THEN
     723             :                   km = k - rs%desc%npts(3)
     724             :                ELSE
     725             :                   km = k
     726             :                END IF
     727             :                DO j = rs%lb_local(2), rs%ub_local(2)
     728             :                   IF (j < rs%lb_real(2)) THEN
     729             :                      jm = j + rs%desc%npts(2)
     730             :                   ELSE IF (j > rs%ub_real(2)) THEN
     731             :                      jm = j - rs%desc%npts(2)
     732             :                   ELSE
     733             :                      jm = j
     734             :                   END IF
     735             :                   DO i = rs%lb_local(1), rs%ub_local(1)
     736             :                      IF (i < rs%lb_real(1)) THEN
     737             :                         im = i + rs%desc%npts(1)
     738             :                      ELSE IF (i > rs%ub_real(1)) THEN
     739             :                         im = i - rs%desc%npts(1)
     740             :                      ELSE
     741             :                         im = i
     742             :                      END IF
     743             :                      rs%r(i, j, k) = pw%array(im, jm, km)
     744             :                   END DO
     745             :                END DO
     746             :             END DO
     747             : !$OMP          END PARALLEL DO
     748             :          END IF
     749             :       END IF
     750             : 
     751     1030469 :       CALL timestop(handle)
     752     1030469 :       CALL timestop(handle2)
     753             : 
     754     1030469 :    END SUBROUTINE transfer_pw2rs
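
   A usage sketch (illustrative): both transfer routines dispatch internally on the grid layout
   (distributed, replicated in parallel, or serial); the caller only needs an rs grid and a pw
   field that share the same underlying pw_grid (checked by the CPABORT above).

      ! rs and density are assumed to be set up on the same pw_grid
      CALL transfer_rs2pw(rs, density)   ! accumulate rs data (including halo regions) into the pw field
      CALL transfer_pw2rs(rs, density)   ! spread the pw data back onto the rs grid and its halos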
     755             : 
     756             : ! **************************************************************************************************
     757             : !> \brief transfer from a realspace grid to a planewave grid
     758             : !> \param rs ...
     759             : !> \param pw ...
     760             : ! **************************************************************************************************
     761      848130 :    SUBROUTINE transfer_rs2pw_replicated(rs, pw)
     762             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     763             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw
     764             : 
     765             :       INTEGER                                            :: dest, ii, ip, ix, iy, iz, nma, nn, s(3), &
     766             :                                                             source
     767      848130 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount
     768             :       INTEGER, DIMENSION(3)                              :: lb, ub
     769      848130 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recvbuf, sendbuf, swaparray
     770             : 
     771             :       ASSOCIATE (np => pw%pw_grid%para%group%num_pe, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group%num_pe - 1, 1), &
     772             :                  pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%group, mepos => pw%pw_grid%para%group%mepos, &
     773             :                  grid => rs%r)
     774     2544390 :          ALLOCATE (rcount(0:np - 1))
     775     2544390 :          DO ip = 1, np
     776     7633170 :             rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
     777             :          END DO
     778     2544390 :          nma = MAXVAL(rcount(0:np - 1))
     779     3392520 :          ALLOCATE (sendbuf(nma), recvbuf(nma))
     780 31846714640 :          sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
     781             : 
     782             :          !sample peak memory
     783      848130 :          CALL m_memory()
     784             : 
     785      848130 :          dest = MODULO(mepos + 1, np)
     786      848130 :          source = MODULO(mepos - 1, np)
     787 15923357320 :          sendbuf = 0.0_dp
     788             : 
     789     1696260 :          DO ip = 1, np
     790             : 
     791     6785040 :             lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
     792     6785040 :             ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
     793             :             ! this loop takes about the same time as the message passing call
     794             :             ! notice that the range of ix is only a small fraction of the first index of grid
     795             :             ! therefore it seems faster to have the second index as the innermost loop
     796             :             ! if this runs on many cpus
     797             :             ! tested on itanium, pentium4, opteron, ultrasparc...
     798     6785040 :             s = ub - lb + 1
     799    42436392 :             DO iz = lb(3), ub(3)
     800   750299854 :                DO ix = lb(1), ub(1)
     801   707863462 :                   ii = (iz - lb(3))*s(1)*s(2) + (ix - lb(1)) + 1
     802 32374406210 :                   DO iy = lb(2), ub(2)
     803 31625802616 :                      sendbuf(ii) = sendbuf(ii) + grid(ix, iy, iz)
     804 32333666078 :                      ii = ii + s(1)
     805             :                   END DO
     806             :                END DO
     807             :             END DO
     808     1696260 :             IF (ip .EQ. np) EXIT
     809      848130 :             CALL group%sendrecv(sendbuf, dest, recvbuf, source, 13)
     810      848130 :             CALL MOVE_ALLOC(sendbuf, swaparray)
     811      848130 :             CALL MOVE_ALLOC(recvbuf, sendbuf)
     812     1696260 :             CALL MOVE_ALLOC(swaparray, recvbuf)
     813             :          END DO
     814      848130 :          nn = rcount(mepos)
     815             :       END ASSOCIATE
     816             : 
     817      848130 :       CALL dcopy(nn, sendbuf, 1, pw%array, 1)
     818             : 
     819      848130 :       DEALLOCATE (rcount)
     820      848130 :       DEALLOCATE (sendbuf)
     821      848130 :       DEALLOCATE (recvbuf)
     822             : 
     823      848130 :    END SUBROUTINE transfer_rs2pw_replicated
     824             : 
     825             : ! **************************************************************************************************
     826             : !> \brief transfer from a planewave grid to a realspace grid
     827             : !> \param rs ...
     828             : !> \param pw ...
     829             : ! **************************************************************************************************
     830      785312 :    SUBROUTINE transfer_pw2rs_replicated(rs, pw)
     831             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     832             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     833             : 
     834             :       INTEGER                                            :: dest, i, ii, im, ip, ix, iy, iz, j, jm, &
     835             :                                                             k, km, nma, nn, source
     836      785312 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rcount
     837             :       INTEGER, DIMENSION(3)                              :: lb, ub
     838      785312 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recvbuf, sendbuf, swaparray
     839     2355936 :       TYPE(mp_request_type), DIMENSION(2)                :: req
     840             : 
     841             :       ASSOCIATE (np => pw%pw_grid%para%group%num_pe, bo => pw%pw_grid%para%bo(1:2, 1:3, 0:pw%pw_grid%para%group%num_pe - 1, 1), &
     842             :                  pbo => pw%pw_grid%bounds, group => pw%pw_grid%para%group, mepos => pw%pw_grid%para%group%mepos, &
     843             :                  grid => rs%r)
     844     2355936 :          ALLOCATE (rcount(0:np - 1))
     845     2355936 :          DO ip = 1, np
     846     7067808 :             rcount(ip - 1) = PRODUCT(bo(2, :, ip) - bo(1, :, ip) + 1)
     847             :          END DO
     848     2355936 :          nma = MAXVAL(rcount(0:np - 1))
     849     3141248 :          ALLOCATE (sendbuf(nma), recvbuf(nma))
     850 30849837776 :          sendbuf = 1.0E99_dp; recvbuf = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
     851             : 
     852             :          !sample peak memory
     853      785312 :          CALL m_memory()
     854             : 
     855      785312 :          nn = rcount(mepos)
     856      785312 :          CALL dcopy(nn, pw%array, 1, sendbuf, 1)
     857             : 
     858      785312 :          dest = MODULO(mepos + 1, np)
     859      785312 :          source = MODULO(mepos - 1, np)
     860             : 
     861     2355936 :          DO ip = 0, np - 1
     862             :             ! we must shift the buffer only np-1 times around
     863     1570624 :             IF (ip .NE. np - 1) THEN
     864             :                CALL group%isendrecv(sendbuf, dest, recvbuf, source, &
     865      785312 :                                     req(1), req(2), 13)
     866             :             END IF
     867     6282496 :             lb = pbo(1, :) + bo(1, :, MODULO(mepos - ip, np) + 1) - 1
     868     6282496 :             ub = pbo(1, :) + bo(2, :, MODULO(mepos - ip, np) + 1) - 1
     869     1570624 :             ii = 0
     870             :             ! this loop takes about the same time as the message passing call;
     871             :             ! it unpacks the received slab into the replicated local grid
     872    40255192 :             DO iz = lb(3), ub(3)
     873  1390460520 :                DO iy = lb(2), ub(2)
     874 32029421078 :                   DO ix = lb(1), ub(1)
     875 30640531182 :                      ii = ii + 1
     876 31990736510 :                      grid(ix, iy, iz) = sendbuf(ii)
     877             :                   END DO
     878             :                END DO
     879             :             END DO
     880     1570624 :             IF (ip .NE. np - 1) THEN
     881      785312 :                CALL mp_waitall(req)
     882             :             END IF
     883     1570624 :             CALL MOVE_ALLOC(sendbuf, swaparray)
     884     1570624 :             CALL MOVE_ALLOC(recvbuf, sendbuf)
     885     2355936 :             CALL MOVE_ALLOC(swaparray, recvbuf)
     886             :          END DO
     887     1570624 :          IF (rs%desc%border > 0) THEN
     888             : !$OMP       PARALLEL DO DEFAULT(NONE) &
     889             : !$OMP                   PRIVATE(i,im,j,jm,k,km) &
     890           0 : !$OMP                   SHARED(rs)
     891             :             DO k = rs%lb_local(3), rs%ub_local(3)
     892             :                IF (k < rs%lb_real(3)) THEN
     893             :                   km = k + rs%desc%npts(3)
     894             :                ELSE IF (k > rs%ub_real(3)) THEN
     895             :                   km = k - rs%desc%npts(3)
     896             :                ELSE
     897             :                   km = k
     898             :                END IF
     899             :                DO j = rs%lb_local(2), rs%ub_local(2)
     900             :                   IF (j < rs%lb_real(2)) THEN
     901             :                      jm = j + rs%desc%npts(2)
     902             :                   ELSE IF (j > rs%ub_real(2)) THEN
     903             :                      jm = j - rs%desc%npts(2)
     904             :                   ELSE
     905             :                      jm = j
     906             :                   END IF
     907             :                   DO i = rs%lb_local(1), rs%ub_local(1)
     908             :                      IF (i < rs%lb_real(1)) THEN
     909             :                         im = i + rs%desc%npts(1)
     910             :                      ELSE IF (i > rs%ub_real(1)) THEN
     911             :                         im = i - rs%desc%npts(1)
     912             :                      ELSE
     913             :                         im = i
     914             :                      END IF
     915             :                      rs%r(i, j, k) = rs%r(im, jm, km)
     916             :                   END DO
     917             :                END DO
     918             :             END DO
     919             : !$OMP       END PARALLEL DO
     920             :          END IF
     921             :       END ASSOCIATE
     922             : 
     923      785312 :       DEALLOCATE (rcount)
     924      785312 :       DEALLOCATE (sendbuf)
     925      785312 :       DEALLOCATE (recvbuf)
     926             : 
     927      785312 :    END SUBROUTINE transfer_pw2rs_replicated
     928             : 
     929             : ! **************************************************************************************************
     930             : !> \brief does the rs2pw transfer in the case where the rs grid is
     931             : !>       distributed (3D domain decomposition)
     932             : !> \param rs ...
     933             : !> \param pw ...
     934             : !> \par History
     935             : !>      12.2007 created [Matt Watkins]
     936             : !>      9.2008 reduced amount of halo data sent [Iain Bethune]
     937             : !>      10.2008 added non-blocking communication [Iain Bethune]
     938             : !>      4.2009 added support for rank-reordering on the grid [Iain Bethune]
     939             : !>      12.2009 added OMP and sparse alltoall [Iain Bethune]
     940             : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
     941             : !> \note
      942             : !>       the transfer is a two-step procedure. For the rs2pw transfer:
     943             : !>
     944             : !>       1) Halo-exchange in 3D so that the local part of the rs_grid contains the full data
     945             : !>       2) an alltoall communication to redistribute the local rs_grid to the local pw_grid
     946             : !>
      947             : !>       the halo exchange is the most expensive step on a large number of CPUs. A peculiarity of this
      948             : !>       halo exchange is that the border region is rather large (e.g. 20 points), so it may overlap
      949             : !>       with the central domains of several CPUs (i.e. next-nearest neighbours)
     950             : ! **************************************************************************************************
     951        1828 :    SUBROUTINE transfer_rs2pw_distributed(rs, pw)
     952             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
     953             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
     954             : 
     955             :       CHARACTER(LEN=200)                                 :: error_string
     956             :       INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
     957             :          n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
     958        1828 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dshifts, recv_disps, recv_sizes, &
     959        1828 :                                                             send_disps, send_sizes, ushifts
     960        3656 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: bounds, recv_tasks, send_tasks
     961             :       INTEGER, DIMENSION(2)                              :: neighbours, pos
     962             :       INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
     963             :          lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
     964             :       LOGICAL, DIMENSION(3)                              :: halo_swapped
     965             :       REAL(KIND=dp)                                      :: pw_sum, rs_sum
     966        1828 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: recv_buf_3d_down, recv_buf_3d_up, &
     967        1828 :                                                             send_buf_3d_down, send_buf_3d_up
     968        3656 :       TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_bufs, send_bufs
     969        1828 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_reqs, send_reqs
     970        9140 :       TYPE(mp_request_type), DIMENSION(4)                :: req
     971             : 
     972        1828 :       num_threads = 1
     973        1828 :       my_id = 0
     974             : 
      975             :       ! safety check, to be removed once we're absolutely sure the routine is correct
     976             :       IF (debug_this_module) THEN
     977             :          rs_sum = accurate_sum(rs%r)*ABS(det_3x3(rs%desc%dh))
     978             :          CALL rs%desc%group%sum(rs_sum)
     979             :       END IF
     980             : 
     981        1828 :       halo_swapped = .FALSE.
      982             :       ! We don't need to send the 'edges' of the halos that have already been sent.
      983             :       ! Halos are contiguous in memory in the z-direction only, so swap that direction first
      984             :       ! and send less data in the more expensive y and x directions
     985             : 
     986        7312 :       DO idir = 3, 1, -1
     987             : 
     988        5484 :          IF (rs%desc%perd(idir) .NE. 1) THEN
     989             : 
     990       14196 :             ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
     991        9464 :             ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
     992             : 
     993       14196 :             ushifts = 0
     994       14196 :             dshifts = 0
     995             : 
      996             :             ! check that we don't try to send data to ourselves
     997        6560 :             DO n_shifts = 1, MIN(rs%desc%neighbours(idir), rs%desc%group_dim(idir) - 1)
     998             : 
     999             :                ! need to take into account the possible varying widths of neighbouring cells
     1000             :                ! ushifts and dshifts hold the real size of the neighbouring cells
    1001        1828 :                position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
    1002        1828 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1003        1828 :                dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1004             : 
    1005        1828 :                position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
    1006        1828 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1007        1828 :                ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1008             : 
     1009             :                ! The border data has to be sent to/received from the neighbours
    1010             :                ! First we calculate the source and destination processes for the shift
    1011             :                ! We do both shifts at once to allow for more overlap of communication and buffer packing/unpacking
    1012             : 
    1013        1828 :                CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
    1014             : 
    1015        7312 :                lb_send_down(:) = rs%lb_local(:)
    1016        7312 :                lb_recv_down(:) = rs%lb_local(:)
    1017        7312 :                ub_recv_down(:) = rs%ub_local(:)
    1018        7312 :                ub_send_down(:) = rs%ub_local(:)
    1019             : 
    1020        1828 :                IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1021        1828 :                   ub_send_down(idir) = lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
    1022             :                   lb_send_down(idir) = MAX(lb_send_down(idir), &
    1023        1828 :                                            lb_send_down(idir) + rs%desc%border - dshifts(n_shifts))
    1024             : 
    1025        1828 :                   ub_recv_down(idir) = ub_recv_down(idir) - rs%desc%border
    1026             :                   lb_recv_down(idir) = MAX(lb_recv_down(idir) + rs%desc%border, &
    1027        1828 :                                            ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
    1028             :                ELSE
    1029           0 :                   lb_send_down(idir) = 0
    1030           0 :                   ub_send_down(idir) = -1
    1031           0 :                   lb_recv_down(idir) = 0
    1032           0 :                   ub_recv_down(idir) = -1
    1033             :                END IF
    1034             : 
    1035        7312 :                DO i = 1, 3
    1036        7312 :                   IF (halo_swapped(i)) THEN
    1037         554 :                      lb_send_down(i) = rs%lb_real(i)
    1038         554 :                      ub_send_down(i) = rs%ub_real(i)
    1039         554 :                      lb_recv_down(i) = rs%lb_real(i)
    1040         554 :                      ub_recv_down(i) = rs%ub_real(i)
    1041             :                   END IF
    1042             :                END DO
    1043             : 
    1044             :                ! post the receive
    1045           0 :                ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
    1046        9140 :                                           lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
    1047        1828 :                CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
    1048             : 
    1049             :                ! now allocate, pack and send the send buffer
    1050        7312 :                nn = PRODUCT(ub_send_down - lb_send_down + 1)
    1051           0 :                ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
    1052        9140 :                                           lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
    1053             : 
    1054             : !$OMP PARALLEL DEFAULT(NONE), &
    1055             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1056        1828 : !$OMP          SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
    1057             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
    1058             : !$             my_id = omp_get_thread_num()
    1059             :                IF (my_id < num_threads) THEN
    1060             :                   lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
    1061             :                   ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
    1062             : 
    1063             :                   send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
    1064             :                                    lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
    1065             :                                                  lb_send_down(2):ub_send_down(2), lb:ub)
    1066             :                END IF
    1067             : !$OMP END PARALLEL
    1068             : 
    1069        1828 :                CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
    1070             : 
    1071             :                ! Now for the other direction
    1072        1828 :                CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
    1073             : 
    1074        7312 :                lb_send_up(:) = rs%lb_local(:)
    1075        7312 :                lb_recv_up(:) = rs%lb_local(:)
    1076        7312 :                ub_recv_up(:) = rs%ub_local(:)
    1077        7312 :                ub_send_up(:) = rs%ub_local(:)
    1078             : 
    1079        1828 :                IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1080             : 
    1081        1828 :                   lb_send_up(idir) = ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
    1082             :                   ub_send_up(idir) = MIN(ub_send_up(idir), &
    1083        1828 :                                          ub_send_up(idir) - rs%desc%border + ushifts(n_shifts))
    1084             : 
    1085        1828 :                   lb_recv_up(idir) = lb_recv_up(idir) + rs%desc%border
    1086             :                   ub_recv_up(idir) = MIN(ub_recv_up(idir) - rs%desc%border, &
    1087        1828 :                                          lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
    1088             :                ELSE
    1089           0 :                   lb_send_up(idir) = 0
    1090           0 :                   ub_send_up(idir) = -1
    1091           0 :                   lb_recv_up(idir) = 0
    1092           0 :                   ub_recv_up(idir) = -1
    1093             :                END IF
    1094             : 
    1095        7312 :                DO i = 1, 3
    1096        7312 :                   IF (halo_swapped(i)) THEN
    1097         554 :                      lb_send_up(i) = rs%lb_real(i)
    1098         554 :                      ub_send_up(i) = rs%ub_real(i)
    1099         554 :                      lb_recv_up(i) = rs%lb_real(i)
    1100         554 :                      ub_recv_up(i) = rs%ub_real(i)
    1101             :                   END IF
    1102             :                END DO
    1103             : 
    1104             :                ! post the receive
    1105           0 :                ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
    1106        9140 :                                         lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
    1107        1828 :                CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
    1108             : 
     1109             :                ! now allocate, pack and send the send buffer
    1110        7312 :                nn = PRODUCT(ub_send_up - lb_send_up + 1)
    1111           0 :                ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
    1112        9140 :                                         lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
    1113             : 
    1114             : !$OMP PARALLEL DEFAULT(NONE), &
    1115             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1116        1828 : !$OMP          SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
    1117             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
    1118             : !$             my_id = omp_get_thread_num()
    1119             :                IF (my_id < num_threads) THEN
    1120             :                   lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
    1121             :                   ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
    1122             : 
    1123             :                   send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
    1124             :                                  lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
    1125             :                                                lb_send_up(2):ub_send_up(2), lb:ub)
    1126             :                END IF
    1127             : !$OMP END PARALLEL
    1128             : 
    1129        1828 :                CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
    1130             : 
    1131             :                ! wait for a recv to complete, then we can unpack
    1132             : 
    1133        5484 :                DO i = 1, 2
    1134             : 
    1135        3656 :                   CALL mp_waitany(req(1:2), completed)
    1136             : 
    1137        5484 :                   IF (completed .EQ. 1) THEN
    1138             : 
    1139             :                      ! only some procs may need later shifts
    1140        1828 :                      IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
    1141             :                         ! Sum the data in the RS Grid
    1142             : !$OMP PARALLEL DEFAULT(NONE), &
    1143             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1144        1828 : !$OMP          SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
    1145             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
    1146             : !$                      my_id = omp_get_thread_num()
    1147             :                         IF (my_id < num_threads) THEN
    1148             :                            lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
    1149             :                            ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
    1150             : 
    1151             :                            rs%r(lb_recv_down(1):ub_recv_down(1), &
    1152             :                                 lb_recv_down(2):ub_recv_down(2), lb:ub) = &
    1153             :                               rs%r(lb_recv_down(1):ub_recv_down(1), &
    1154             :                                    lb_recv_down(2):ub_recv_down(2), lb:ub) + &
    1155             :                               recv_buf_3d_down(:, :, lb:ub)
    1156             :                         END IF
    1157             : !$OMP END PARALLEL
    1158             :                      END IF
    1159        1828 :                      DEALLOCATE (recv_buf_3d_down)
    1160             :                   ELSE
    1161             : 
    1162             :                      ! only some procs may need later shifts
    1163        1828 :                      IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
    1164             :                         ! Sum the data in the RS Grid
    1165             : !$OMP PARALLEL DEFAULT(NONE), &
    1166             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1167        1828 : !$OMP          SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
    1168             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
    1169             : !$                      my_id = omp_get_thread_num()
    1170             :                         IF (my_id < num_threads) THEN
    1171             :                            lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
    1172             :                            ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
    1173             : 
    1174             :                            rs%r(lb_recv_up(1):ub_recv_up(1), &
    1175             :                                 lb_recv_up(2):ub_recv_up(2), lb:ub) = &
    1176             :                               rs%r(lb_recv_up(1):ub_recv_up(1), &
    1177             :                                    lb_recv_up(2):ub_recv_up(2), lb:ub) + &
    1178             :                               recv_buf_3d_up(:, :, lb:ub)
    1179             :                         END IF
    1180             : !$OMP END PARALLEL
    1181             :                      END IF
    1182        1828 :                      DEALLOCATE (recv_buf_3d_up)
    1183             :                   END IF
    1184             : 
    1185             :                END DO
    1186             : 
    1187             :                ! make sure the sends have completed before we deallocate
    1188             : 
    1189        1828 :                CALL mp_waitall(req(3:4))
    1190             : 
    1191        1828 :                DEALLOCATE (send_buf_3d_down)
    1192        8388 :                DEALLOCATE (send_buf_3d_up)
    1193             :             END DO
    1194             : 
    1195        4732 :             DEALLOCATE (dshifts)
    1196        4732 :             DEALLOCATE (ushifts)
    1197             : 
    1198             :          END IF
    1199             : 
    1200        7312 :          halo_swapped(idir) = .TRUE.
    1201             : 
    1202             :       END DO
    1203             : 
    1204             :       ! This is the real redistribution
    1205        7312 :       ALLOCATE (bounds(0:pw%pw_grid%para%group%num_pe - 1, 1:4))
    1206             : 
    1207             :       ! work out the pw grid points each proc holds
    1208        5484 :       DO i = 0, pw%pw_grid%para%group%num_pe - 1
    1209       10968 :          bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
    1210       10968 :          bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
    1211       10968 :          bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
    1212       12796 :          bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
    1213             :       END DO
    1214             : 
    1215        7312 :       ALLOCATE (send_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
    1216        5484 :       ALLOCATE (send_sizes(0:pw%pw_grid%para%group%num_pe - 1))
    1217        3656 :       ALLOCATE (send_disps(0:pw%pw_grid%para%group%num_pe - 1))
    1218        3656 :       ALLOCATE (recv_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
    1219        3656 :       ALLOCATE (recv_sizes(0:pw%pw_grid%para%group%num_pe - 1))
    1220        3656 :       ALLOCATE (recv_disps(0:pw%pw_grid%para%group%num_pe - 1))
    1221        5484 :       send_tasks(:, 1) = 1
    1222        5484 :       send_tasks(:, 2) = 0
    1223        5484 :       send_tasks(:, 3) = 1
    1224        5484 :       send_tasks(:, 4) = 0
    1225        5484 :       send_tasks(:, 5) = 1
    1226        5484 :       send_tasks(:, 6) = 0
    1227        5484 :       send_sizes = 0
    1228        5484 :       recv_sizes = 0
    1229             : 
    1230        1828 :       my_rs_rank = rs%desc%my_pos
    1231        1828 :       my_pw_rank = pw%pw_grid%para%group%mepos
    1232             : 
    1233             :       ! find the processors that should hold our data
    1234             :       ! should be part of the rs grid type
    1235             :       ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
    1236             :       ! do the recv and send tasks in two separate loops which will
    1237             :       ! load balance better for OpenMP with large numbers of MPI tasks
    1238             : 
    1239             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1240             : !$OMP             PRIVATE(coords,idir,pos,lb_send,ub_send), &
    1241        1828 : !$OMP             SHARED(rs,bounds,my_rs_rank,recv_tasks,recv_sizes)
    1242             :       DO i = 0, rs%desc%group_size - 1
    1243             : 
    1244             :          coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
    1245             :          !calculate the rs grid points on each processor
    1246             :          !coords is the part of the grid that rank i actually holds
    1247             :          DO idir = 1, 3
    1248             :             pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1249             :             pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1250             :             lb_send(idir) = pos(1)
    1251             :             ub_send(idir) = pos(2)
    1252             :          END DO
    1253             : 
    1254             :          IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
    1255             :          IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
    1256             :          IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
    1257             :          IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
    1258             : 
    1259             :          recv_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
    1260             :          recv_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
    1261             :          recv_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
    1262             :          recv_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
    1263             :          recv_tasks(i, 5) = lb_send(3)
    1264             :          recv_tasks(i, 6) = ub_send(3)
    1265             :          recv_sizes(i) = (recv_tasks(i, 2) - recv_tasks(i, 1) + 1)* &
    1266             :                          (recv_tasks(i, 4) - recv_tasks(i, 3) + 1)*(recv_tasks(i, 6) - recv_tasks(i, 5) + 1)
    1267             : 
    1268             :       END DO
    1269             : !$OMP END PARALLEL DO
    1270             : 
    1271        7312 :       coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
    1272        7312 :       DO idir = 1, 3
    1273        5484 :          pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1274       16452 :          pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1275        5484 :          lb_send(idir) = pos(1)
    1276        7312 :          ub_send(idir) = pos(2)
    1277             :       END DO
    1278             : 
    1279        1828 :       lb_recv(:) = lb_send(:)
    1280        1828 :       ub_recv(:) = ub_send(:)
    1281             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1282        1828 : !$OMP             SHARED(pw,lb_send,ub_send,bounds,send_tasks,send_sizes)
    1283             :       DO j = 0, pw%pw_grid%para%group%num_pe - 1
    1284             : 
    1285             :          IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
    1286             :          IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
    1287             :          IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
    1288             :          IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
    1289             : 
    1290             :          send_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
    1291             :          send_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
    1292             :          send_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
    1293             :          send_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
    1294             :          send_tasks(j, 5) = lb_send(3)
    1295             :          send_tasks(j, 6) = ub_send(3)
    1296             :          send_sizes(j) = (send_tasks(j, 2) - send_tasks(j, 1) + 1)* &
    1297             :                          (send_tasks(j, 4) - send_tasks(j, 3) + 1)*(send_tasks(j, 6) - send_tasks(j, 5) + 1)
    1298             : 
    1299             :       END DO
    1300             : !$OMP END PARALLEL DO
    1301             : 
    1302        1828 :       send_disps(0) = 0
    1303        1828 :       recv_disps(0) = 0
    1304        3656 :       DO i = 1, pw%pw_grid%para%group%num_pe - 1
    1305        1828 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1306        3656 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1307             :       END DO
    1308             : 
    1309       12796 :       CPASSERT(SUM(send_sizes) == PRODUCT(ub_recv - lb_recv + 1))
    1310             : 
    1311        9140 :       ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
    1312       10968 :       ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
    1313             : 
    1314        5484 :       DO i = 0, rs%desc%group_size - 1
    1315        3656 :          IF (send_sizes(i) .NE. 0) THEN
    1316       10230 :             ALLOCATE (send_bufs(i)%array(send_sizes(i)))
    1317             :          ELSE
    1318         246 :             NULLIFY (send_bufs(i)%array)
    1319             :          END IF
    1320        5484 :          IF (recv_sizes(i) .NE. 0) THEN
    1321       10230 :             ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
    1322             :          ELSE
    1323         246 :             NULLIFY (recv_bufs(i)%array)
    1324             :          END IF
    1325             :       END DO
    1326             : 
    1327        9140 :       ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
    1328        5484 :       recv_reqs = mp_request_null
    1329             : 
    1330        5484 :       DO i = 0, rs%desc%group_size - 1
    1331        5484 :          IF (recv_sizes(i) .NE. 0) THEN
    1332        3410 :             CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
    1333             :          END IF
    1334             :       END DO
    1335             : 
    1336             :       ! do packing
    1337             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1338             : !$OMP             PRIVATE(k,z,y,x), &
    1339        1828 : !$OMP             SHARED(rs,send_tasks,send_bufs,send_disps)
    1340             :       DO i = 0, rs%desc%group_size - 1
    1341             :          k = 0
    1342             :          DO z = send_tasks(i, 5), send_tasks(i, 6)
    1343             :             DO y = send_tasks(i, 3), send_tasks(i, 4)
    1344             :                DO x = send_tasks(i, 1), send_tasks(i, 2)
    1345             :                   k = k + 1
    1346             :                   send_bufs(i)%array(k) = rs%r(x, y, z)
    1347             :                END DO
    1348             :             END DO
    1349             :          END DO
    1350             :       END DO
    1351             : !$OMP END PARALLEL DO
    1352             : 
    1353        9140 :       ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
    1354        5484 :       send_reqs = mp_request_null
    1355             : 
    1356        5484 :       DO i = 0, rs%desc%group_size - 1
    1357        5484 :          IF (send_sizes(i) .NE. 0) THEN
    1358        3410 :             CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
    1359             :          END IF
    1360             :       END DO
    1361             : 
    1362             :       ! do unpacking
    1363             :       ! no OMP here so we can unpack each message as it arrives
    1364        5484 :       DO i = 0, rs%desc%group_size - 1
    1365        3656 :          IF (recv_sizes(i) .EQ. 0) CYCLE
    1366             : 
    1367        3410 :          CALL mp_waitany(recv_reqs, completed)
    1368        3410 :          k = 0
    1369      141980 :          DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
    1370    10628098 :             DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
    1371   447639902 :                DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
    1372   437015460 :                   k = k + 1
    1373   447503160 :                   pw%array(x, y, z) = recv_bufs(completed - 1)%array(k)
    1374             :                END DO
    1375             :             END DO
    1376             :          END DO
    1377             :       END DO
    1378             : 
    1379        1828 :       CALL mp_waitall(send_reqs)
    1380             : 
    1381        1828 :       DEALLOCATE (recv_reqs)
    1382        1828 :       DEALLOCATE (send_reqs)
    1383             : 
    1384        5484 :       DO i = 0, rs%desc%group_size - 1
    1385        3656 :          IF (ASSOCIATED(send_bufs(i)%array)) THEN
    1386        3410 :             DEALLOCATE (send_bufs(i)%array)
    1387             :          END IF
    1388        5484 :          IF (ASSOCIATED(recv_bufs(i)%array)) THEN
    1389        3410 :             DEALLOCATE (recv_bufs(i)%array)
    1390             :          END IF
    1391             :       END DO
    1392             : 
    1393        1828 :       DEALLOCATE (send_bufs)
    1394        1828 :       DEALLOCATE (recv_bufs)
    1395        1828 :       DEALLOCATE (send_tasks)
    1396        1828 :       DEALLOCATE (send_sizes)
    1397        1828 :       DEALLOCATE (send_disps)
    1398        1828 :       DEALLOCATE (recv_tasks)
    1399        1828 :       DEALLOCATE (recv_sizes)
    1400        1828 :       DEALLOCATE (recv_disps)
    1401             : 
    1402             :       IF (debug_this_module) THEN
     1403             :          ! safety check, to be removed once we're absolutely sure the routine is correct
    1404             :          pw_sum = pw_integrate_function(pw)
    1405             :          IF (ABS(pw_sum - rs_sum)/MAX(1.0_dp, ABS(pw_sum), ABS(rs_sum)) > EPSILON(rs_sum)*1000) THEN
    1406             :             WRITE (error_string, '(A,6(1X,I4.4),3F25.16)') "rs_pw_transfer_distributed", &
    1407             :                rs%desc%npts, rs%desc%group_dim, pw_sum, rs_sum, ABS(pw_sum - rs_sum)
    1408             :             CALL cp_abort(__LOCATION__, &
    1409             :                           error_string//" Please report this bug ... quick workaround: use "// &
    1410             :                           "DISTRIBUTION_TYPE REPLICATED")
    1411             :          END IF
    1412             :       END IF
    1413             : 
    1414        1828 :    END SUBROUTINE transfer_rs2pw_distributed
    1415             : 
    1416             : ! **************************************************************************************************
    1417             : !> \brief does the pw2rs transfer in the case where the rs grid is
    1418             : !>       distributed (3D domain decomposition)
    1419             : !> \param rs ...
    1420             : !> \param pw ...
    1421             : !> \par History
    1422             : !>      12.2007 created [Matt Watkins]
    1423             : !>      9.2008 reduced amount of halo data sent [Iain Bethune]
    1424             : !>      10.2008 added non-blocking communication [Iain Bethune]
    1425             : !>      4.2009 added support for rank-reordering on the grid [Iain Bethune]
    1426             : !>      12.2009 added OMP and sparse alltoall [Iain Bethune]
    1427             : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2008-2009 on behalf of the HECToR project
    1428             : !> \note
     1429             : !>       the transfer is a two-step procedure, run in the reverse order of the rs2pw transfer:
     1430             : !>
     1431             : !>       1) an alltoall communication to redistribute the local pw_grid to the local rs_grid
     1432             : !>       2) Halo-exchange in 3D so that the border ('wing') region of the rs_grid is filled
     1433             : !>
     1434             : !>       the halo exchange is the most expensive step on a large number of CPUs. A peculiarity of this
     1435             : !>       halo exchange is that the border region is rather large (e.g. 20 points), so it may overlap
     1436             : !>       with the central domains of several CPUs (i.e. next-nearest neighbours)
    1437             : ! **************************************************************************************************
    1438         864 :    SUBROUTINE transfer_pw2rs_distributed(rs, pw)
    1439             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1440             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw
    1441             : 
    1442             :       INTEGER :: completed, dest_down, dest_up, i, idir, j, k, lb, my_id, my_pw_rank, my_rs_rank, &
    1443             :          n_shifts, nn, num_threads, position, source_down, source_up, ub, x, y, z
    1444         864 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: dshifts, recv_disps, recv_sizes, &
    1445         864 :                                                             send_disps, send_sizes, ushifts
    1446        1728 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: bounds, recv_tasks, send_tasks
    1447             :       INTEGER, DIMENSION(2)                              :: neighbours, pos
    1448             :       INTEGER, DIMENSION(3) :: coords, lb_recv, lb_recv_down, lb_recv_up, lb_send, lb_send_down, &
    1449             :          lb_send_up, ub_recv, ub_recv_down, ub_recv_up, ub_send, ub_send_down, ub_send_up
    1450             :       LOGICAL, DIMENSION(3)                              :: halo_swapped
    1451         864 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :)     :: recv_buf_3d_down, recv_buf_3d_up, &
    1452         864 :                                                             send_buf_3d_down, send_buf_3d_up
    1453        1728 :       TYPE(cp_1d_r_p_type), ALLOCATABLE, DIMENSION(:)    :: recv_bufs, send_bufs
    1454         864 :       TYPE(mp_request_type), ALLOCATABLE, DIMENSION(:)   :: recv_reqs, send_reqs
    1455        4320 :       TYPE(mp_request_type), DIMENSION(4)                :: req
    1456             : 
    1457         864 :       num_threads = 1
    1458         864 :       my_id = 0
    1459             : 
    1460         864 :       CALL rs_grid_zero(rs)
    1461             : 
    1462             :       ! This is the real redistribution
    1463             : 
    1464        3456 :       ALLOCATE (bounds(0:pw%pw_grid%para%group%num_pe - 1, 1:4))
    1465             : 
    1466        2592 :       DO i = 0, pw%pw_grid%para%group%num_pe - 1
    1467        5184 :          bounds(i, 1:2) = pw%pw_grid%para%bo(1:2, 1, i, 1)
    1468        5184 :          bounds(i, 3:4) = pw%pw_grid%para%bo(1:2, 2, i, 1)
    1469        5184 :          bounds(i, 1:2) = bounds(i, 1:2) - pw%pw_grid%npts(1)/2 - 1
    1470        6048 :          bounds(i, 3:4) = bounds(i, 3:4) - pw%pw_grid%npts(2)/2 - 1
    1471             :       END DO
    1472             : 
    1473        3456 :       ALLOCATE (send_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
    1474        2592 :       ALLOCATE (send_sizes(0:pw%pw_grid%para%group%num_pe - 1))
    1475        1728 :       ALLOCATE (send_disps(0:pw%pw_grid%para%group%num_pe - 1))
    1476        1728 :       ALLOCATE (recv_tasks(0:pw%pw_grid%para%group%num_pe - 1, 1:6))
    1477        1728 :       ALLOCATE (recv_sizes(0:pw%pw_grid%para%group%num_pe - 1))
    1478        1728 :       ALLOCATE (recv_disps(0:pw%pw_grid%para%group%num_pe - 1))
    1479             : 
    1480       16416 :       send_tasks = 0
    1481        2592 :       send_tasks(:, 1) = 1
    1482        2592 :       send_tasks(:, 2) = 0
    1483        2592 :       send_tasks(:, 3) = 1
    1484        2592 :       send_tasks(:, 4) = 0
    1485        2592 :       send_tasks(:, 5) = 1
    1486        2592 :       send_tasks(:, 6) = 0
    1487        2592 :       send_sizes = 0
    1488             : 
    1489       16416 :       recv_tasks = 0
    1490        2592 :       recv_tasks(:, 1) = 1
    1491        2592 :       recv_tasks(:, 2) = 0
     1492        2592 :       recv_tasks(:, 3) = 1
     1493        2592 :       recv_tasks(:, 4) = 0
     1494        2592 :       recv_tasks(:, 5) = 1
     1495        2592 :       recv_tasks(:, 6) = 0
    1496        2592 :       recv_sizes = 0
    1497             : 
    1498         864 :       my_rs_rank = rs%desc%my_pos
    1499         864 :       my_pw_rank = pw%pw_grid%para%group%mepos
    1500             : 
    1501             :       ! find the processors that should hold our data
    1502             :       ! should be part of the rs grid type
    1503             :       ! this is a loop over real ranks (i.e. the in-order cartesian ranks)
    1504             :       ! do the recv and send tasks in two separate loops which will
    1505             :       ! load balance better for OpenMP with large numbers of MPI tasks
    1506             : 
    1507             :       ! this is the reverse of rs2pw: what were the sends are now the recvs
    1508             : 
    1509             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1510             : !$OMP             PRIVATE(coords,idir,pos,lb_send,ub_send), &
    1511         864 : !$OMP             SHARED(rs,bounds,my_rs_rank,send_tasks,send_sizes,pw)
    1512             :       DO i = 0, pw%pw_grid%para%group%num_pe - 1
    1513             : 
    1514             :          coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(i))
    1515             :          !calculate the real rs grid points on each processor
    1516             :          !coords is the part of the grid that rank i actually holds
    1517             :          DO idir = 1, 3
    1518             :             pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1519             :             pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1520             :             lb_send(idir) = pos(1)
    1521             :             ub_send(idir) = pos(2)
    1522             :          END DO
    1523             : 
    1524             :          IF (ub_send(1) .LT. bounds(my_rs_rank, 1)) CYCLE
    1525             :          IF (lb_send(1) .GT. bounds(my_rs_rank, 2)) CYCLE
    1526             :          IF (ub_send(2) .LT. bounds(my_rs_rank, 3)) CYCLE
    1527             :          IF (lb_send(2) .GT. bounds(my_rs_rank, 4)) CYCLE
    1528             : 
    1529             :          send_tasks(i, 1) = MAX(lb_send(1), bounds(my_rs_rank, 1))
    1530             :          send_tasks(i, 2) = MIN(ub_send(1), bounds(my_rs_rank, 2))
    1531             :          send_tasks(i, 3) = MAX(lb_send(2), bounds(my_rs_rank, 3))
    1532             :          send_tasks(i, 4) = MIN(ub_send(2), bounds(my_rs_rank, 4))
    1533             :          send_tasks(i, 5) = lb_send(3)
    1534             :          send_tasks(i, 6) = ub_send(3)
    1535             :          send_sizes(i) = (send_tasks(i, 2) - send_tasks(i, 1) + 1)* &
    1536             :                          (send_tasks(i, 4) - send_tasks(i, 3) + 1)*(send_tasks(i, 6) - send_tasks(i, 5) + 1)
    1537             : 
    1538             :       END DO
    1539             : !$OMP END PARALLEL DO
    1540             : 
    1541        3456 :       coords(:) = rs%desc%rank2coord(:, rs%desc%real2virtual(my_rs_rank))
    1542        3456 :       DO idir = 1, 3
    1543        2592 :          pos(:) = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), coords(idir))
    1544        7776 :          pos(:) = pos(:) - rs%desc%npts(idir)/2 - 1
    1545        2592 :          lb_send(idir) = pos(1)
    1546        3456 :          ub_send(idir) = pos(2)
    1547             :       END DO
    1548             : 
    1549         864 :       lb_recv(:) = lb_send(:)
    1550         864 :       ub_recv(:) = ub_send(:)
    1551             : 
    1552             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1553         864 : !$OMP             SHARED(pw,lb_send,ub_send,bounds,recv_tasks,recv_sizes)
    1554             :       DO j = 0, pw%pw_grid%para%group%num_pe - 1
    1555             : 
    1556             :          IF (ub_send(1) .LT. bounds(j, 1)) CYCLE
    1557             :          IF (lb_send(1) .GT. bounds(j, 2)) CYCLE
    1558             :          IF (ub_send(2) .LT. bounds(j, 3)) CYCLE
    1559             :          IF (lb_send(2) .GT. bounds(j, 4)) CYCLE
    1560             : 
    1561             :          recv_tasks(j, 1) = MAX(lb_send(1), bounds(j, 1))
    1562             :          recv_tasks(j, 2) = MIN(ub_send(1), bounds(j, 2))
    1563             :          recv_tasks(j, 3) = MAX(lb_send(2), bounds(j, 3))
    1564             :          recv_tasks(j, 4) = MIN(ub_send(2), bounds(j, 4))
    1565             :          recv_tasks(j, 5) = lb_send(3)
    1566             :          recv_tasks(j, 6) = ub_send(3)
    1567             :          recv_sizes(j) = (recv_tasks(j, 2) - recv_tasks(j, 1) + 1)* &
    1568             :                          (recv_tasks(j, 4) - recv_tasks(j, 3) + 1)*(recv_tasks(j, 6) - recv_tasks(j, 5) + 1)
    1569             : 
    1570             :       END DO
    1571             : !$OMP END PARALLEL DO
    1572             : 
    1573         864 :       send_disps(0) = 0
    1574         864 :       recv_disps(0) = 0
    1575        1728 :       DO i = 1, pw%pw_grid%para%group%num_pe - 1
    1576         864 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1577        1728 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1578             :       END DO
    1579             : 
    1580        6048 :       CPASSERT(SUM(recv_sizes) == PRODUCT(ub_recv - lb_recv + 1))
    1581             : 
    1582        4320 :       ALLOCATE (send_bufs(0:rs%desc%group_size - 1))
    1583        5184 :       ALLOCATE (recv_bufs(0:rs%desc%group_size - 1))
    1584             : 
    1585        2592 :       DO i = 0, rs%desc%group_size - 1
    1586        1728 :          IF (send_sizes(i) .NE. 0) THEN
    1587        4938 :             ALLOCATE (send_bufs(i)%array(send_sizes(i)))
    1588             :          ELSE
    1589          82 :             NULLIFY (send_bufs(i)%array)
    1590             :          END IF
    1591        2592 :          IF (recv_sizes(i) .NE. 0) THEN
    1592        4938 :             ALLOCATE (recv_bufs(i)%array(recv_sizes(i)))
    1593             :          ELSE
    1594          82 :             NULLIFY (recv_bufs(i)%array)
    1595             :          END IF
    1596             :       END DO
    1597             : 
    1598        4320 :       ALLOCATE (recv_reqs(0:rs%desc%group_size - 1))
    1599        2592 :       recv_reqs = mp_request_null
    1600             : 
    1601        2592 :       DO i = 0, rs%desc%group_size - 1
    1602        2592 :          IF (recv_sizes(i) .NE. 0) THEN
    1603        1646 :             CALL rs%desc%group%irecv(recv_bufs(i)%array, i, recv_reqs(i))
    1604             :          END IF
    1605             :       END DO
    1606             : 
    1607             :       ! do packing
    1608             : !$OMP PARALLEL DO DEFAULT(NONE), &
    1609             : !$OMP             PRIVATE(k,z,y,x), &
    1610         864 : !$OMP             SHARED(pw,rs,send_tasks,send_bufs,send_disps)
    1611             :       DO i = 0, rs%desc%group_size - 1
    1612             :          k = 0
    1613             :          DO z = send_tasks(i, 5), send_tasks(i, 6)
    1614             :             DO y = send_tasks(i, 3), send_tasks(i, 4)
    1615             :                DO x = send_tasks(i, 1), send_tasks(i, 2)
    1616             :                   k = k + 1
    1617             :                   send_bufs(i)%array(k) = pw%array(x, y, z)
    1618             :                END DO
    1619             :             END DO
    1620             :          END DO
    1621             :       END DO
    1622             : !$OMP END PARALLEL DO
    1623             : 
    1624        4320 :       ALLOCATE (send_reqs(0:rs%desc%group_size - 1))
    1625        2592 :       send_reqs = mp_request_null
    1626             : 
    1627        2592 :       DO i = 0, rs%desc%group_size - 1
    1628        2592 :          IF (send_sizes(i) .NE. 0) THEN
    1629        1646 :             CALL rs%desc%group%isend(send_bufs(i)%array, i, send_reqs(i))
    1630             :          END IF
    1631             :       END DO
    1632             : 
    1633             :       ! do unpacking
    1634             :       ! no OMP here so we can unpack each message as it arrives
    1635             : 
    1636        2592 :       DO i = 0, rs%desc%group_size - 1
    1637        1728 :          IF (recv_sizes(i) .EQ. 0) CYCLE
    1638             : 
    1639        1646 :          CALL mp_waitany(recv_reqs, completed)
    1640        1646 :          k = 0
    1641       65956 :          DO z = recv_tasks(completed - 1, 5), recv_tasks(completed - 1, 6)
    1642     4721236 :             DO y = recv_tasks(completed - 1, 3), recv_tasks(completed - 1, 4)
    1643   193190191 :                DO x = recv_tasks(completed - 1, 1), recv_tasks(completed - 1, 2)
    1644   188470683 :                   k = k + 1
    1645   193126745 :                   rs%r(x, y, z) = recv_bufs(completed - 1)%array(k)
    1646             :                END DO
    1647             :             END DO
    1648             :          END DO
    1649             :       END DO
    1650             : 
    1651         864 :       CALL mp_waitall(send_reqs)
    1652             : 
    1653         864 :       DEALLOCATE (recv_reqs)
    1654         864 :       DEALLOCATE (send_reqs)
    1655             : 
    1656        2592 :       DO i = 0, rs%desc%group_size - 1
    1657        1728 :          IF (ASSOCIATED(send_bufs(i)%array)) THEN
    1658        1646 :             DEALLOCATE (send_bufs(i)%array)
    1659             :          END IF
    1660        2592 :          IF (ASSOCIATED(recv_bufs(i)%array)) THEN
    1661        1646 :             DEALLOCATE (recv_bufs(i)%array)
    1662             :          END IF
    1663             :       END DO
    1664             : 
    1665         864 :       DEALLOCATE (send_bufs)
    1666         864 :       DEALLOCATE (recv_bufs)
    1667         864 :       DEALLOCATE (send_tasks)
    1668         864 :       DEALLOCATE (send_sizes)
    1669         864 :       DEALLOCATE (send_disps)
    1670         864 :       DEALLOCATE (recv_tasks)
    1671         864 :       DEALLOCATE (recv_sizes)
    1672         864 :       DEALLOCATE (recv_disps)
    1673             : 
     1674             :       ! now pass the halo regions ('wings') around
    1675         864 :       halo_swapped = .FALSE.
    1676             : 
    1677        3456 :       DO idir = 1, 3
    1678             : 
    1679        2592 :          IF (rs%desc%perd(idir) /= 1) THEN
    1680             : 
    1681        5496 :             ALLOCATE (dshifts(0:rs%desc%neighbours(idir)))
    1682        3664 :             ALLOCATE (ushifts(0:rs%desc%neighbours(idir)))
    1683        5496 :             ushifts = 0
    1684        5496 :             dshifts = 0
    1685             : 
    1686        3664 :             DO n_shifts = 1, rs%desc%neighbours(idir)
    1687             : 
    1688             :                ! need to take into account the possible varying widths of neighbouring cells
    1689             :                ! ushifts and dshifts hold the real size of the neighbouring cells
    1690             : 
    1691        1832 :                position = MODULO(rs%desc%virtual_group_coor(idir) - n_shifts, rs%desc%group_dim(idir))
    1692        1832 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1693        1832 :                dshifts(n_shifts) = dshifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1694             : 
    1695        1832 :                position = MODULO(rs%desc%virtual_group_coor(idir) + n_shifts, rs%desc%group_dim(idir))
    1696        1832 :                neighbours = get_limit(rs%desc%npts(idir), rs%desc%group_dim(idir), position)
    1697        1832 :                ushifts(n_shifts) = ushifts(n_shifts - 1) + (neighbours(2) - neighbours(1) + 1)
    1698             : 
     1699             :                ! The border data has to be sent to/received from the neighbours
    1700             :                ! First we calculate the source and destination processes for the shift
    1701             :                ! The first shift is "downwards"
    1702             : 
    1703        1832 :                CALL cart_shift(rs, idir, -1*n_shifts, source_down, dest_down)
    1704             : 
    1705        7328 :                lb_send_down(:) = rs%lb_local(:)
    1706        7328 :                ub_send_down(:) = rs%ub_local(:)
    1707        7328 :                lb_recv_down(:) = rs%lb_local(:)
    1708        7328 :                ub_recv_down(:) = rs%ub_local(:)
    1709             : 
    1710        1832 :                IF (dshifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1711        1832 :                   lb_send_down(idir) = lb_send_down(idir) + rs%desc%border
    1712             :                   ub_send_down(idir) = MIN(ub_send_down(idir) - rs%desc%border, &
    1713        1832 :                                            lb_send_down(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1))
    1714             : 
    1715        1832 :                   lb_recv_down(idir) = ub_recv_down(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1)
    1716             :                   ub_recv_down(idir) = MIN(ub_recv_down(idir), &
    1717        1832 :                                            ub_recv_down(idir) - rs%desc%border + ushifts(n_shifts))
    1718             :                ELSE
    1719           0 :                   lb_send_down(idir) = 0
    1720           0 :                   ub_send_down(idir) = -1
    1721           0 :                   lb_recv_down(idir) = 0
    1722           0 :                   ub_recv_down(idir) = -1
    1723             :                END IF
    1724             : 
    1725        7328 :                DO i = 1, 3
    1726        7328 :                   IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
    1727        1536 :                      lb_send_down(i) = rs%lb_real(i)
    1728        1536 :                      ub_send_down(i) = rs%ub_real(i)
    1729        1536 :                      lb_recv_down(i) = rs%lb_real(i)
    1730        1536 :                      ub_recv_down(i) = rs%ub_real(i)
    1731             :                   END IF
    1732             :                END DO
    1733             : 
    1734             :                ! allocate the recv buffer
    1735        7328 :                nn = PRODUCT(ub_recv_down - lb_recv_down + 1)
    1736           0 :                ALLOCATE (recv_buf_3d_down(lb_recv_down(1):ub_recv_down(1), &
    1737        9160 :                                           lb_recv_down(2):ub_recv_down(2), lb_recv_down(3):ub_recv_down(3)))
    1738             : 
    1739             :                ! recv buffer is now ready, so post the receive
    1740        1832 :                CALL rs%desc%group%irecv(recv_buf_3d_down, source_down, req(1))
    1741             : 
    1742             :                ! now allocate, pack, and send the send buffer
    1743        7328 :                nn = PRODUCT(ub_send_down - lb_send_down + 1)
    1744           0 :                ALLOCATE (send_buf_3d_down(lb_send_down(1):ub_send_down(1), &
    1745        9160 :                                           lb_send_down(2):ub_send_down(2), lb_send_down(3):ub_send_down(3)))
    1746             : 
    1747             : !$OMP PARALLEL DEFAULT(NONE), &
    1748             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1749        1832 : !$OMP          SHARED(send_buf_3d_down,rs,lb_send_down,ub_send_down)
    1750             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_down(3) - lb_send_down(3) + 1)
    1751             : !$             my_id = omp_get_thread_num()
    1752             :                IF (my_id < num_threads) THEN
    1753             :                   lb = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*my_id)/num_threads
    1754             :                   ub = lb_send_down(3) + ((ub_send_down(3) - lb_send_down(3) + 1)*(my_id + 1))/num_threads - 1
    1755             : 
    1756             :                   send_buf_3d_down(lb_send_down(1):ub_send_down(1), lb_send_down(2):ub_send_down(2), &
    1757             :                                    lb:ub) = rs%r(lb_send_down(1):ub_send_down(1), &
    1758             :                                                  lb_send_down(2):ub_send_down(2), lb:ub)
    1759             :                END IF
    1760             : !$OMP END PARALLEL
    1761             : 
    1762        1832 :                CALL rs%desc%group%isend(send_buf_3d_down, dest_down, req(3))
    1763             : 
    1764             :                ! Now for the other direction
    1765             : 
    1766        1832 :                CALL cart_shift(rs, idir, n_shifts, source_up, dest_up)
    1767             : 
    1768        7328 :                lb_send_up(:) = rs%lb_local(:)
    1769        7328 :                ub_send_up(:) = rs%ub_local(:)
    1770        7328 :                lb_recv_up(:) = rs%lb_local(:)
    1771        7328 :                ub_recv_up(:) = rs%ub_local(:)
    1772             : 
    1773        1832 :                IF (ushifts(n_shifts - 1) .LE. rs%desc%border) THEN
    1774        1832 :                   ub_send_up(idir) = ub_send_up(idir) - rs%desc%border
    1775             :                   lb_send_up(idir) = MAX(lb_send_up(idir) + rs%desc%border, &
    1776        1832 :                                          ub_send_up(idir) - rs%desc%border + 1 + ushifts(n_shifts - 1))
    1777             : 
    1778        1832 :                   ub_recv_up(idir) = lb_recv_up(idir) + rs%desc%border - 1 - dshifts(n_shifts - 1)
    1779             :                   lb_recv_up(idir) = MAX(lb_recv_up(idir), &
    1780        1832 :                                          lb_recv_up(idir) + rs%desc%border - dshifts(n_shifts))
    1781             :                ELSE
    1782           0 :                   lb_send_up(idir) = 0
    1783           0 :                   ub_send_up(idir) = -1
    1784           0 :                   lb_recv_up(idir) = 0
    1785           0 :                   ub_recv_up(idir) = -1
    1786             :                END IF
    1787             : 
    1788        7328 :                DO i = 1, 3
    1789        7328 :                   IF (.NOT. (halo_swapped(i) .OR. i .EQ. idir)) THEN
    1790        1536 :                      lb_send_up(i) = rs%lb_real(i)
    1791        1536 :                      ub_send_up(i) = rs%ub_real(i)
    1792        1536 :                      lb_recv_up(i) = rs%lb_real(i)
    1793        1536 :                      ub_recv_up(i) = rs%ub_real(i)
    1794             :                   END IF
    1795             :                END DO
    1796             : 
    1797             :                ! allocate the recv buffer
    1798        7328 :                nn = PRODUCT(ub_recv_up - lb_recv_up + 1)
    1799           0 :                ALLOCATE (recv_buf_3d_up(lb_recv_up(1):ub_recv_up(1), &
    1800        9160 :                                         lb_recv_up(2):ub_recv_up(2), lb_recv_up(3):ub_recv_up(3)))
    1801             : 
    1802             :                ! recv buffer is now ready, so post the receive
    1803             : 
    1804        1832 :                CALL rs%desc%group%irecv(recv_buf_3d_up, source_up, req(2))
    1805             : 
    1806             :                ! now allocate, pack, and send the send buffer
    1807        7328 :                nn = PRODUCT(ub_send_up - lb_send_up + 1)
    1808           0 :                ALLOCATE (send_buf_3d_up(lb_send_up(1):ub_send_up(1), &
    1809        9160 :                                         lb_send_up(2):ub_send_up(2), lb_send_up(3):ub_send_up(3)))
    1810             : 
    1811             : !$OMP PARALLEL DEFAULT(NONE), &
    1812             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1813        1832 : !$OMP          SHARED(send_buf_3d_up,rs,lb_send_up,ub_send_up)
    1814             : !$             num_threads = MIN(omp_get_max_threads(), ub_send_up(3) - lb_send_up(3) + 1)
    1815             : !$             my_id = omp_get_thread_num()
    1816             :                IF (my_id < num_threads) THEN
    1817             :                   lb = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*my_id)/num_threads
    1818             :                   ub = lb_send_up(3) + ((ub_send_up(3) - lb_send_up(3) + 1)*(my_id + 1))/num_threads - 1
    1819             : 
    1820             :                   send_buf_3d_up(lb_send_up(1):ub_send_up(1), lb_send_up(2):ub_send_up(2), &
    1821             :                                  lb:ub) = rs%r(lb_send_up(1):ub_send_up(1), &
    1822             :                                                lb_send_up(2):ub_send_up(2), lb:ub)
    1823             :                END IF
    1824             : !$OMP END PARALLEL
    1825             : 
    1826        1832 :                CALL rs%desc%group%isend(send_buf_3d_up, dest_up, req(4))
    1827             : 
    1828             :                ! wait for a recv to complete, then we can unpack
    1829             : 
    1830        5496 :                DO i = 1, 2
    1831             : 
    1832        3664 :                   CALL mp_waitany(req(1:2), completed)
    1833             : 
    1834        5496 :                   IF (completed .EQ. 1) THEN
    1835             : 
    1836             :                      ! only some procs may need later shifts
    1837        1832 :                      IF (ub_recv_down(idir) .GE. lb_recv_down(idir)) THEN
    1838             : 
    1839             :                         ! Copy the received data into the RS grid
    1840             : !$OMP PARALLEL DEFAULT(NONE), &
    1841             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1842        1832 : !$OMP          SHARED(recv_buf_3d_down,rs,lb_recv_down,ub_recv_down)
    1843             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_down(3) - lb_recv_down(3) + 1)
    1844             : !$                      my_id = omp_get_thread_num()
    1845             :                         IF (my_id < num_threads) THEN
    1846             :                            lb = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*my_id)/num_threads
    1847             :                            ub = lb_recv_down(3) + ((ub_recv_down(3) - lb_recv_down(3) + 1)*(my_id + 1))/num_threads - 1
    1848             : 
    1849             :                            rs%r(lb_recv_down(1):ub_recv_down(1), lb_recv_down(2):ub_recv_down(2), &
    1850             :                                 lb:ub) = recv_buf_3d_down(:, :, lb:ub)
    1851             :                         END IF
    1852             : !$OMP END PARALLEL
    1853             :                      END IF
    1854             : 
    1855        1832 :                      DEALLOCATE (recv_buf_3d_down)
    1856             :                   ELSE
    1857             : 
    1858             :                      ! only some procs may need later shifts
    1859        1832 :                      IF (ub_recv_up(idir) .GE. lb_recv_up(idir)) THEN
    1860             : 
    1861             :                         ! Copy the received data into the RS grid
    1862             : !$OMP PARALLEL DEFAULT(NONE), &
    1863             : !$OMP          PRIVATE(lb,ub,my_id,NUM_THREADS), &
    1864        1832 : !$OMP          SHARED(recv_buf_3d_up,rs,lb_recv_up,ub_recv_up)
    1865             : !$                      num_threads = MIN(omp_get_max_threads(), ub_recv_up(3) - lb_recv_up(3) + 1)
    1866             : !$                      my_id = omp_get_thread_num()
    1867             :                         IF (my_id < num_threads) THEN
    1868             :                            lb = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*my_id)/num_threads
    1869             :                            ub = lb_recv_up(3) + ((ub_recv_up(3) - lb_recv_up(3) + 1)*(my_id + 1))/num_threads - 1
    1870             : 
    1871             :                            rs%r(lb_recv_up(1):ub_recv_up(1), lb_recv_up(2):ub_recv_up(2), &
    1872             :                                 lb:ub) = recv_buf_3d_up(:, :, lb:ub)
    1873             :                         END IF
    1874             : !$OMP END PARALLEL
    1875             :                      END IF
    1876             : 
    1877        1832 :                      DEALLOCATE (recv_buf_3d_up)
    1878             :                   END IF
    1879             :                END DO
    1880             : 
    1881        1832 :                CALL mp_waitall(req(3:4))
    1882             : 
    1883        1832 :                DEALLOCATE (send_buf_3d_down)
    1884        5496 :                DEALLOCATE (send_buf_3d_up)
    1885             :             END DO
    1886             : 
    1887        1832 :             DEALLOCATE (ushifts)
    1888        1832 :             DEALLOCATE (dshifts)
    1889             :          END IF
    1890             : 
    1891        3456 :          halo_swapped(idir) = .TRUE.
    1892             : 
    1893             :       END DO
    1894             : 
    1895         864 :    END SUBROUTINE transfer_pw2rs_distributed
    1896             : 
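The OpenMP pack and unpack regions above split the third (z) index range over threads with lb = lb0 + (n*id)/nthreads and ub = lb0 + (n*(id+1))/nthreads - 1. A minimal standalone sketch (hypothetical bounds, standard Fortran only) showing that this partition tiles the range contiguously without gaps or overlap:

   PROGRAM check_block_partition
      ! Hypothetical check of the per-thread slab partition used in the
      ! packing/unpacking loops of transfer_pw2rs_distributed.
      IMPLICIT NONE
      INTEGER, PARAMETER :: lb0 = -3, ub0 = 17, nthreads = 4
      INTEGER :: id, n, lb, ub, covered

      n = ub0 - lb0 + 1
      covered = 0
      DO id = 0, nthreads - 1
         lb = lb0 + (n*id)/nthreads
         ub = lb0 + (n*(id + 1))/nthreads - 1
         covered = covered + MAX(ub - lb + 1, 0)
         WRITE (*, '(A,I2,A,I4,A,I4)') 'thread ', id, ': lb = ', lb, ', ub = ', ub
      END DO
      ! The slabs are contiguous and together cover all n planes exactly once.
      IF (covered /= n) STOP 'partition does not cover the range'
   END PROGRAM check_block_partition
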
    1897             : ! **************************************************************************************************
    1898             : !> \brief Initialize grid to zero
    1899             : !> \param rs ...
    1900             : !> \par History
    1901             : !>      none
    1902             : !> \author JGH (23-Mar-2002)
    1903             : ! **************************************************************************************************
    1904      343264 :    SUBROUTINE rs_grid_zero(rs)
    1905             : 
    1906             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1907             : 
    1908             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'rs_grid_zero'
    1909             : 
    1910             :       INTEGER                                            :: handle, i, j, k, l(3), u(3)
    1911             : 
    1912      343264 :       CALL timeset(routineN, handle)
    1913     1029792 :       l(1) = LBOUND(rs%r, 1); l(2) = LBOUND(rs%r, 2); l(3) = LBOUND(rs%r, 3)
    1914     1029792 :       u(1) = UBOUND(rs%r, 1); u(2) = UBOUND(rs%r, 2); u(3) = UBOUND(rs%r, 3)
    1915             : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
    1916             : !$OMP             PRIVATE(i,j,k) &
    1917      343264 : !$OMP             SHARED(rs,l,u)
    1918             :       DO k = l(3), u(3)
    1919             :       DO j = l(2), u(2)
    1920             :       DO i = l(1), u(1)
    1921             :          rs%r(i, j, k) = 0.0_dp
    1922             :       END DO
    1923             :       END DO
    1924             :       END DO
    1925             : !$OMP END PARALLEL DO
    1926      343264 :       CALL timestop(handle)
    1927             : 
    1928      343264 :    END SUBROUTINE rs_grid_zero
    1929             : 
    1930             : ! **************************************************************************************************
    1931             : !> \brief rs1(i) = rs1(i) + scalar*rs2(i)*rs3(i)
    1932             : !> \param rs1 ...
    1933             : !> \param rs2 ...
    1934             : !> \param rs3 ...
    1935             : !> \param scalar ...
    1936             : !> \par History
    1937             : !>      none
    1938             : !> \author
    1939             : ! **************************************************************************************************
    1940        1404 :    SUBROUTINE rs_grid_mult_and_add(rs1, rs2, rs3, scalar)
    1941             : 
    1942             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs1, rs2, rs3
    1943             :       REAL(dp), INTENT(IN)                               :: scalar
    1944             : 
    1945             :       CHARACTER(len=*), PARAMETER :: routineN = 'rs_grid_mult_and_add'
    1946             : 
    1947             :       INTEGER                                            :: handle, i, j, k, l(3), u(3)
    1948             : 
    1949             : !-----------------------------------------------------------------------------!
    1950             : 
    1951        1404 :       CALL timeset(routineN, handle)
    1952        1404 :       IF (scalar .NE. 0.0_dp) THEN
    1953        4212 :          l(1) = LBOUND(rs1%r, 1); l(2) = LBOUND(rs1%r, 2); l(3) = LBOUND(rs1%r, 3)
    1954        4212 :          u(1) = UBOUND(rs1%r, 1); u(2) = UBOUND(rs1%r, 2); u(3) = UBOUND(rs1%r, 3)
    1955             : !$OMP PARALLEL DO DEFAULT(NONE) COLLAPSE(3) &
    1956             : !$OMP             PRIVATE(i,j,k) &
    1957        1404 : !$OMP             SHARED(rs1,rs2,rs3,scalar,l,u)
    1958             :          DO k = l(3), u(3)
    1959             :          DO j = l(2), u(2)
    1960             :          DO i = l(1), u(1)
    1961             :             rs1%r(i, j, k) = rs1%r(i, j, k) + scalar*rs2%r(i, j, k)*rs3%r(i, j, k)
    1962             :          END DO
    1963             :          END DO
    1964             :          END DO
    1965             : !$OMP END PARALLEL DO
    1966             :       END IF
    1967        1404 :       CALL timestop(handle)
    1968        1404 :    END SUBROUTINE rs_grid_mult_and_add
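
A hedged usage sketch of the two element-wise kernels above, assuming three grids rs1, rs2, rs3 of TYPE(realspace_grid_type) were already created with identical bounds (the creation calls are outside this excerpt):

   ! Illustrative fragment only; rs1, rs2 and rs3 are assumed to exist and to
   ! share the same local bounds, as required by the element-wise loops above.
   CALL rs_grid_zero(rs1)                             ! rs1 <- 0
   CALL rs_grid_mult_and_add(rs1, rs2, rs3, 0.5_dp)   ! rs1 <- rs1 + 0.5*rs2*rs3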
    1969             : 
    1970             : ! **************************************************************************************************
    1971             : !> \brief Set box matrix info for real space grid
    1972             : !>      This is needed for variable cell simulations
    1973             : !> \param pw_grid ...
    1974             : !> \param rs ...
    1975             : !> \par History
    1976             : !>      none
    1977             : !> \author JGH (15-May-2007)
    1978             : ! **************************************************************************************************
    1979      176154 :    SUBROUTINE rs_grid_set_box(pw_grid, rs)
    1980             : 
    1981             :       TYPE(pw_grid_type), INTENT(IN), TARGET             :: pw_grid
    1982             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs
    1983             : 
    1984      176154 :       CPASSERT(ASSOCIATED(rs%desc%pw, pw_grid))
    1985     2290002 :       rs%desc%dh = pw_grid%dh
    1986     2290002 :       rs%desc%dh_inv = pw_grid%dh_inv
    1987             : 
    1988      176154 :    END SUBROUTINE rs_grid_set_box
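
For variable-cell simulations the spacing matrices must be refreshed whenever the cell changes; a hedged fragment, assuming rs was built on pw_grid and pw_grid%dh/dh_inv have already been updated for the new cell:

   ! Illustrative fragment only: copy the updated dh/dh_inv from the
   ! underlying pw_grid onto the real-space grid descriptor.
   CALL rs_grid_set_box(pw_grid, rs)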
    1989             : 
    1990             : ! **************************************************************************************************
    1991             : !> \brief retains the given rs grid descriptor (see doc/ReferenceCounting.html)
    1992             : !> \param rs_desc the grid descriptor to retain
    1993             : !> \par History
    1994             : !>      04.2009 created [Iain Bethune]
    1995             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    1996             : ! **************************************************************************************************
    1997      236948 :    SUBROUTINE rs_grid_retain_descriptor(rs_desc)
    1998             :       TYPE(realspace_grid_desc_type), INTENT(INOUT)      :: rs_desc
    1999             : 
    2000      236948 :       CPASSERT(rs_desc%ref_count > 0)
    2001      236948 :       rs_desc%ref_count = rs_desc%ref_count + 1
    2002      236948 :    END SUBROUTINE rs_grid_retain_descriptor
    2003             : 
    2004             : ! **************************************************************************************************
    2005             : !> \brief releases the given rs grid (see doc/ReferenceCounting.html)
    2006             : !> \param rs_grid the rs grid to release
    2007             : !> \par History
    2008             : !>      03.2003 created [fawzi]
    2009             : !> \author fawzi
    2010             : ! **************************************************************************************************
    2011      236326 :    SUBROUTINE rs_grid_release(rs_grid)
    2012             :       TYPE(realspace_grid_type), INTENT(INOUT)           :: rs_grid
    2013             : 
    2014      236326 :       CALL rs_grid_release_descriptor(rs_grid%desc)
    2015             : 
    2016      236326 :       CALL offload_free_buffer(rs_grid%buffer)
    2017      236326 :       NULLIFY (rs_grid%r)
    2018             : 
    2019      236326 :       IF (ALLOCATED(rs_grid%px)) DEALLOCATE (rs_grid%px)
    2020      236326 :       IF (ALLOCATED(rs_grid%py)) DEALLOCATE (rs_grid%py)
    2021      236326 :       IF (ALLOCATED(rs_grid%pz)) DEALLOCATE (rs_grid%pz)
    2022      236326 :    END SUBROUTINE rs_grid_release
    2023             : 
    2024             : ! **************************************************************************************************
    2025             : !> \brief releases the given rs grid descriptor (see doc/ReferenceCounting.html)
    2026             : !> \param rs_desc the rs grid descriptor to release
    2027             : !> \par History
    2028             : !>      04.2009 created [Iain Bethune]
    2029             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2030             : ! **************************************************************************************************
    2031      272285 :    SUBROUTINE rs_grid_release_descriptor(rs_desc)
    2032             :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
    2033             : 
    2034      272285 :       IF (ASSOCIATED(rs_desc)) THEN
    2035      268666 :          CPASSERT(rs_desc%ref_count > 0)
    2036      268666 :          rs_desc%ref_count = rs_desc%ref_count - 1
    2037      268666 :          IF (rs_desc%ref_count == 0) THEN
    2038             : 
    2039       31718 :             CALL pw_grid_release(rs_desc%pw)
    2040             : 
    2041       31718 :             IF (rs_desc%parallel) THEN
    2042             :                ! release the group communicator
    2043       27636 :                CALL rs_desc%group%free()
    2044             : 
    2045       27636 :                DEALLOCATE (rs_desc%virtual2real)
    2046       27636 :                DEALLOCATE (rs_desc%real2virtual)
    2047             :             END IF
    2048             : 
    2049       31718 :             IF (rs_desc%distributed) THEN
    2050         158 :                DEALLOCATE (rs_desc%rank2coord)
    2051         158 :                DEALLOCATE (rs_desc%coord2rank)
    2052         158 :                DEALLOCATE (rs_desc%lb_global)
    2053         158 :                DEALLOCATE (rs_desc%ub_global)
    2054         158 :                DEALLOCATE (rs_desc%x2coord)
    2055         158 :                DEALLOCATE (rs_desc%y2coord)
    2056         158 :                DEALLOCATE (rs_desc%z2coord)
    2057             :             END IF
    2058             : 
    2059       31718 :             DEALLOCATE (rs_desc)
    2060             :          END IF
    2061             :       END IF
    2062      272285 :       NULLIFY (rs_desc)
    2063      272285 :    END SUBROUTINE rs_grid_release_descriptor
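
The descriptor uses explicit reference counting: each rs_grid_retain_descriptor must be matched by an rs_grid_release_descriptor, and rs_grid_release drops the owning grid's reference in turn. A hedged lifecycle fragment, assuming desc and rs were obtained from the module's creation routines (not part of this excerpt):

   ! Illustrative fragment only.
   CALL rs_grid_retain_descriptor(desc)    ! take an extra reference
   ! ... use the descriptor ...
   CALL rs_grid_release_descriptor(desc)   ! drop it again; freed once the count reaches 0
   CALL rs_grid_release(rs)                ! also drops the grid's own descriptor reference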
    2064             : 
    2065             : ! **************************************************************************************************
    2066             : !> \brief emulates the function of an MPI_cart_shift operation, but the shift is
    2067             : !>        done in virtual coordinates, and the corresponding real ranks are returned
    2068             : !> \param rs_grid ...
    2069             : !> \param dir ...
    2070             : !> \param disp ...
    2071             : !> \param source ...
    2072             : !> \param dest ...
    2073             : !> \par History
    2074             : !>      04.2009 created [Iain Bethune]
    2075             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2076             : ! **************************************************************************************************
    2077        7320 :    PURE SUBROUTINE cart_shift(rs_grid, dir, disp, source, dest)
    2078             : 
    2079             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs_grid
    2080             :       INTEGER, INTENT(IN)                                :: dir, disp
    2081             :       INTEGER, INTENT(OUT)                               :: source, dest
    2082             : 
    2083             :       INTEGER, DIMENSION(3)                              :: shift_coords
    2084             : 
    2085       29280 :       shift_coords = rs_grid%desc%virtual_group_coor
    2086        7320 :       shift_coords(dir) = MODULO(shift_coords(dir) + disp, rs_grid%desc%group_dim(dir))
    2087        7320 :       dest = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
    2088       29280 :       shift_coords = rs_grid%desc%virtual_group_coor
    2089        7320 :       shift_coords(dir) = MODULO(shift_coords(dir) - disp, rs_grid%desc%group_dim(dir))
    2090        7320 :       source = rs_grid%desc%virtual2real(rs_grid%desc%coord2rank(shift_coords(1), shift_coords(2), shift_coords(3)))
    2091             : 
    2092        7320 :    END SUBROUTINE
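
cart_shift wraps the shifted virtual coordinate with MODULO before mapping it back to a real rank via coord2rank and virtual2real. A tiny standalone sketch of just the wrap arithmetic (hypothetical values, standard Fortran only):

   PROGRAM check_cart_wrap
      ! With group_dim = 4 processes along a direction and own coordinate 0,
      ! a downward shift by 1 wraps to coordinate 3, an upward shift to 1.
      IMPLICIT NONE
      INTEGER, PARAMETER :: group_dim = 4, my_coord = 0
      WRITE (*, *) MODULO(my_coord - 1, group_dim)   ! prints 3 (downward neighbour)
      WRITE (*, *) MODULO(my_coord + 1, group_dim)   ! prints 1 (upward neighbour)
   END PROGRAM check_cart_wrap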
    2093             : 
    2094             : ! **************************************************************************************************
    2095             : !> \brief returns the maximum number of points in the local grid of any process
    2096             : !>        to account for the case where the grid may later be reordered
    2097             : !> \param desc ...
    2098             : !> \return ...
    2099             : !> \par History
    2100             : !>      10.2011 created [Iain Bethune]
    2101             : ! **************************************************************************************************
    2102           0 :    FUNCTION rs_grid_max_ngpts(desc) RESULT(max_ngpts)
    2103             :       TYPE(realspace_grid_desc_type), INTENT(IN)         :: desc
    2104             :       INTEGER                                            :: max_ngpts
    2105             : 
    2106             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'rs_grid_max_ngpts'
    2107             : 
    2108             :       INTEGER                                            :: handle, i
    2109             :       INTEGER, DIMENSION(3)                              :: lb, ub
    2110             : 
    2111           0 :       CALL timeset(routineN, handle)
    2112             : 
    2113           0 :       max_ngpts = 0
    2114           0 :       IF ((desc%pw%para%mode == PW_MODE_LOCAL) .OR. &
    2115             :           (ALL(desc%group_dim == 1))) THEN
    2116           0 :          CPASSERT(PRODUCT(INT(desc%npts, KIND=int_8)) < HUGE(1))
    2117           0 :          max_ngpts = PRODUCT(desc%npts)
    2118             :       ELSE
    2119           0 :          DO i = 0, desc%group_size - 1
    2120           0 :             lb = desc%lb_global(:, i)
    2121           0 :             ub = desc%ub_global(:, i)
    2122           0 :             lb = lb - desc%border*(1 - desc%perd)
    2123           0 :             ub = ub + desc%border*(1 - desc%perd)
    2124           0 :             CPASSERT(PRODUCT(INT(ub - lb + 1, KIND=int_8)) < HUGE(1))
    2125           0 :             max_ngpts = MAX(max_ngpts, PRODUCT(ub - lb + 1))
    2126             :          END DO
    2127             :       END IF
    2128             : 
    2129           0 :       CALL timestop(handle)
    2130             : 
    2131           0 :    END FUNCTION rs_grid_max_ngpts
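
The per-process bounds are widened by border*(1 - perd), which vanishes along periodic (replicated) directions; a small standalone illustration with hypothetical values:

   PROGRAM check_border_widening
      IMPLICIT NONE
      INTEGER, PARAMETER :: border = 10
      INTEGER, DIMENSION(3), PARAMETER :: perd = (/1, 0, 0/)
      ! border*(1 - perd) = (/0, 10, 10/): no widening along the periodic
      ! direction, a halo of 10 points along each distributed direction.
      WRITE (*, *) border*(1 - perd)
   END PROGRAM check_border_widening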
    2132             : 
    2133             : ! **************************************************************************************************
    2134             : !> \brief ...
    2135             : !> \param rs_grid ...
    2136             : !> \param h_inv ...
    2137             : !> \param ra ...
    2138             : !> \param offset ...
    2139             : !> \param group_size ...
    2140             : !> \param my_pos ...
    2141             : !> \return ...
    2142             : ! **************************************************************************************************
    2143     1555487 :    PURE LOGICAL FUNCTION map_gaussian_here(rs_grid, h_inv, ra, offset, group_size, my_pos) RESULT(res)
    2144             :       TYPE(realspace_grid_type), INTENT(IN)              :: rs_grid
    2145             :       REAL(KIND=dp), DIMENSION(3, 3), INTENT(IN)         :: h_inv
    2146             :       REAL(KIND=dp), DIMENSION(3), INTENT(IN)            :: ra
    2147             :       INTEGER, INTENT(IN), OPTIONAL                      :: offset, group_size, my_pos
    2148             : 
    2149             :       INTEGER                                            :: dir, lb(3), location(3), tp(3), ub(3)
    2150             : 
    2151     1555487 :       res = .FALSE.
    2152             : 
    2153     6221940 :       IF (.NOT. ALL(rs_grid%desc%perd == 1)) THEN
    2154          32 :          DO dir = 1, 3
    2155             :             ! bounds of local grid (i.e. without the halo 'wings' along distributed, non-periodic directions)
    2156          96 :             tp(dir) = FLOOR(DOT_PRODUCT(h_inv(dir, :), ra)*rs_grid%desc%npts(dir))
    2157          24 :             tp(dir) = MODULO(tp(dir), rs_grid%desc%npts(dir))
    2158          24 :             IF (rs_grid%desc%perd(dir) .NE. 1) THEN
    2159           8 :                lb(dir) = rs_grid%lb_local(dir) + rs_grid%desc%border
    2160           8 :                ub(dir) = rs_grid%ub_local(dir) - rs_grid%desc%border
    2161             :             ELSE
    2162          16 :                lb(dir) = rs_grid%lb_local(dir)
    2163          16 :                ub(dir) = rs_grid%ub_local(dir)
    2164             :             END IF
    2165             :             ! distributed grid, only map if it is local to the grid
    2166          32 :             location(dir) = tp(dir) + rs_grid%desc%lb(dir)
    2167             :          END DO
    2168          60 :          IF (ALL(lb(:) <= location(:)) .AND. ALL(location(:) <= ub(:))) THEN
    2169           4 :             res = .TRUE.
    2170             :          END IF
    2171             :       ELSE
    2172     1555479 :          IF (PRESENT(offset) .AND. PRESENT(group_size) .AND. PRESENT(my_pos)) THEN
    2173             :             ! not distributed, just a round-robin distribution over the full set of CPUs
    2174     1555479 :             IF (MODULO(offset, group_size) == my_pos) res = .TRUE.
    2175             :          END IF
    2176             :       END IF
    2177             : 
    2178     1555487 :    END FUNCTION map_gaussian_here
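
In the distributed branch the centre ra is converted to a fractional coordinate with h_inv, scaled by the number of grid points, floored and wrapped back into the box. A minimal standalone sketch of that mapping for a single direction of a hypothetical orthorhombic cell (10.0 bohr, 100 points, standard Fortran only):

   PROGRAM check_point_to_grid
      ! Map a Cartesian coordinate to a wrapped grid index along one direction,
      ! mirroring tp = MODULO(FLOOR(frac*npts), npts) as used in map_gaussian_here.
      IMPLICIT NONE
      INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(14, 200)
      INTEGER, PARAMETER :: npts = 100
      REAL(KIND=dp), PARAMETER :: cell_len = 10.0_dp, ra = -2.5_dp
      INTEGER :: tp
      ! For an orthorhombic cell, h_inv reduces to 1/cell_len in this direction.
      tp = FLOOR((ra/cell_len)*npts)
      tp = MODULO(tp, npts)          ! ra = -2.5 wraps to grid point 75
      WRITE (*, *) tp
   END PROGRAM check_point_to_grid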
    2179             : 
    2180           0 : END MODULE realspace_grid_types

Generated by: LCOV version 1.15