LCOV - code coverage report
Current view: top level - src/pw - pw_gpu.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:4dc10b3) Lines: 4 273 1.5 %
Date: 2024-11-21 06:45:46 Functions: 2 13 15.4 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \note
      10             : !> This module contains routines necessary to operate on plane waves on GPUs
      11             : !     >  independently of the GPU platform.
      12             : !> \par History
      13             : !>      BGL (06-Mar-2008)  : Created
      14             : !>      AG  (18-May-2012)  : Refacturing:
      15             : !>                           - added explicit interfaces to C routines
      16             : !>                           - enable double precision complex transformations
      17             : !>      AG  (11-Sept-2012) : Modifications:
      18             : !>                          - use pointers if precision mapping is not required
      19             : !>                          - use OMP for mapping
      20             : !>      MT  (Jan 2022)     : Modifications
      21             : !>                          - use a generic interface for fft calls to GPUs
      22             : !>                          - Support both Nvidia and AMD GPUs. Other GPUs manufacturers
      23             : !>                            can be added easily.
      24             : !> \author Benjamin G. Levine
      25             : ! **************************************************************************************************
      26             : MODULE pw_gpu
      27             :    USE ISO_C_BINDING,                   ONLY: C_DOUBLE,&
      28             :                                               C_INT,&
      29             :                                               C_LOC,&
      30             :                                               C_PTR
      31             :    USE fft_tools,                       ONLY: &
      32             :         cube_transpose_1, cube_transpose_2, fft_scratch_sizes, fft_scratch_type, get_fft_scratch, &
      33             :         release_fft_scratch, x_to_yz, xz_to_yz, yz_to_x, yz_to_xz
      34             :    USE kinds,                           ONLY: dp
      35             :    USE mathconstants,                   ONLY: z_zero
      36             :    USE message_passing,                 ONLY: mp_cart_type
      37             :    USE pw_grid_types,                   ONLY: FULLSPACE
      38             :    USE pw_types,                        ONLY: pw_c1d_gs_type,&
      39             :                                               pw_r3d_rs_type
      40             : #include "../base/base_uses.f90"
      41             : 
      42             :    IMPLICIT NONE
      43             : 
      44             :    PRIVATE
      45             : 
      46             :    PUBLIC :: pw_gpu_r3dc1d_3d
      47             :    PUBLIC :: pw_gpu_c1dr3d_3d
      48             :    PUBLIC :: pw_gpu_r3dc1d_3d_ps
      49             :    PUBLIC :: pw_gpu_c1dr3d_3d_ps
      50             :    PUBLIC :: pw_gpu_init, pw_gpu_finalize
      51             : 
      52             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_gpu'
      53             :    LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .FALSE.
      54             : 
      55             : CONTAINS
      56             : 
      57             : ! **************************************************************************************************
      58             : !> \brief Allocates resources on the gpu device for gpu fft acceleration
      59             : !> \author Ole Schuett
      60             : ! **************************************************************************************************
      61        8530 :    SUBROUTINE pw_gpu_init()
      62             :       INTEGER                                            :: dummy
      63             :       INTERFACE
      64             :          SUBROUTINE pw_gpu_init_c() BIND(C, name="pw_gpu_init")
      65             :          END SUBROUTINE pw_gpu_init_c
      66             :       END INTERFACE
      67             : 
      68             :       MARK_USED(dummy) ! TODO: fix fpretty
      69             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
      70             :       CALL pw_gpu_init_c()
      71             : #else
      72             :       ! Nothing to do.
      73             : #endif
      74        8530 :    END SUBROUTINE pw_gpu_init
      75             : 
      76             : ! **************************************************************************************************
      77             : !> \brief Releases resources on the gpu device for gpu fft acceleration
      78             : !> \author Ole Schuett
      79             : ! **************************************************************************************************
      80        8530 :    SUBROUTINE pw_gpu_finalize()
      81             :       INTEGER                                            :: dummy
      82             :       INTERFACE
      83             :          SUBROUTINE pw_gpu_finalize_c() BIND(C, name="pw_gpu_finalize")
      84             :          END SUBROUTINE pw_gpu_finalize_c
      85             :       END INTERFACE
      86             : 
      87             :       MARK_USED(dummy) ! TODO: fix fpretty
      88             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
      89             :       CALL pw_gpu_finalize_c()
      90             : #else
      91             :       ! Nothing to do.
      92             : #endif
      93        8530 :    END SUBROUTINE pw_gpu_finalize
      94             : 
      95             : ! **************************************************************************************************
      96             : !> \brief perform an fft followed by a gather on the gpu
      97             : !> \param pw1 ...
      98             : !> \param pw2 ...
      99             : !> \author Benjamin G Levine
     100             : ! **************************************************************************************************
     101           0 :    SUBROUTINE pw_gpu_r3dc1d_3d(pw1, pw2)
     102             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     103             :       TYPE(pw_c1d_gs_type), INTENT(INOUT)                :: pw2
     104             : 
     105             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_r3dc1d_3d'
     106             : 
     107             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     108             :       INTEGER                                            :: handle, l1, l2, l3, ngpts
     109           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     110             :       INTEGER, POINTER                                   :: ptr_ghatmap
     111             :       REAL(KIND=dp)                                      :: scale
     112             :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     113             :       INTERFACE
     114             :          SUBROUTINE pw_gpu_cfffg_c(din, zout, ghatmap, npts, ngpts, scale) BIND(C, name="pw_gpu_cfffg")
     115             :             IMPORT
     116             :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     117             :             TYPE(C_PTR), VALUE                           :: zout
     118             :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     119             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     120             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: ngpts
     121             :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     122             : 
     123             :          END SUBROUTINE pw_gpu_cfffg_c
     124             :       END INTERFACE
     125             : 
     126           0 :       CALL timeset(routineN, handle)
     127             : 
     128           0 :       scale = 1.0_dp/REAL(pw1%pw_grid%ngpts, KIND=dp)
     129             : 
     130           0 :       ngpts = SIZE(pw2%pw_grid%gsq)
     131             :       l1 = LBOUND(pw1%array, 1)
     132             :       l2 = LBOUND(pw1%array, 2)
     133             :       l3 = LBOUND(pw1%array, 3)
     134           0 :       npts => pw1%pw_grid%npts
     135             : 
     136             :       ! pointers to data arrays
     137           0 :       ptr_pwin => pw1%array(l1, l2, l3)
     138           0 :       ptr_pwout => pw2%array(1)
     139             : 
     140             :       ! pointer to map array
     141           0 :       ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
     142             : 
     143             :       ! invoke the combined transformation
     144             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     145             :       CALL pw_gpu_cfffg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, scale)
     146             : #else
     147           0 :       CPABORT("Compiled without pw offloading.")
     148             : #endif
     149             : 
     150           0 :       CALL timestop(handle)
     151           0 :    END SUBROUTINE pw_gpu_r3dc1d_3d
     152             : 
     153             : ! **************************************************************************************************
     154             : !> \brief perform an scatter followed by a fft on the gpu
     155             : !> \param pw1 ...
     156             : !> \param pw2 ...
     157             : !> \author Benjamin G Levine
     158             : ! **************************************************************************************************
     159           0 :    SUBROUTINE pw_gpu_c1dr3d_3d(pw1, pw2)
     160             :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     161             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw2
     162             : 
     163             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_c1dr3d_3d'
     164             : 
     165             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     166             :       INTEGER                                            :: handle, l1, l2, l3, ngpts, nmaps
     167           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     168             :       INTEGER, POINTER                                   :: ptr_ghatmap
     169             :       REAL(KIND=dp)                                      :: scale
     170             :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     171             :       INTERFACE
     172             :          SUBROUTINE pw_gpu_sfffc_c(zin, dout, ghatmap, npts, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sfffc")
     173             :             IMPORT
     174             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     175             :             TYPE(C_PTR), VALUE                           :: dout
     176             :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     177             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     178             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: ngpts, nmaps
     179             :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     180             :          END SUBROUTINE pw_gpu_sfffc_c
     181             :       END INTERFACE
     182             : 
     183           0 :       CALL timeset(routineN, handle)
     184             : 
     185           0 :       scale = 1.0_dp
     186             : 
     187           0 :       ngpts = SIZE(pw1%pw_grid%gsq)
     188             :       l1 = LBOUND(pw2%array, 1)
     189             :       l2 = LBOUND(pw2%array, 2)
     190             :       l3 = LBOUND(pw2%array, 3)
     191           0 :       npts => pw1%pw_grid%npts
     192             : 
     193             :       ! pointers to data arrays
     194           0 :       ptr_pwin => pw1%array(1)
     195           0 :       ptr_pwout => pw2%array(l1, l2, l3)
     196             : 
     197             :       ! pointer to map array
     198           0 :       nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
     199           0 :       ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
     200             : 
     201             :       ! invoke the combined transformation
     202             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     203             :       CALL pw_gpu_sfffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, nmaps, scale)
     204             : #else
     205           0 :       CPABORT("Compiled without pw offloading")
     206             : #endif
     207             : 
     208           0 :       CALL timestop(handle)
     209           0 :    END SUBROUTINE pw_gpu_c1dr3d_3d
     210             : 
     211             : ! **************************************************************************************************
     212             : !> \brief perform an parallel fft followed by a gather on the gpu
     213             : !> \param pw1 ...
     214             : !> \param pw2 ...
     215             : !> \author Andreas Gloess
     216             : ! **************************************************************************************************
     217           0 :    SUBROUTINE pw_gpu_r3dc1d_3d_ps(pw1, pw2)
     218             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     219             :       TYPE(pw_c1d_gs_type), INTENT(INOUT)                :: pw2
     220             : 
     221             :       CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_r3dc1d_3d_ps'
     222             : 
     223           0 :       COMPLEX(KIND=dp), DIMENSION(:, :), POINTER         :: grays, pbuf, qbuf, rbuf, sbuf
     224           0 :       COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER      :: tbuf
     225             :       INTEGER                                            :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
     226             :                                                             mz2, n1, n2, ngpts, nmax, numtask, rp
     227           0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: p2p
     228             :       INTEGER, DIMENSION(2)                              :: r_dim, r_pos
     229           0 :       INTEGER, DIMENSION(:), POINTER                     :: n, nloc, nyzray
     230           0 :       INTEGER, DIMENSION(:, :, :, :), POINTER            :: bo
     231             :       REAL(KIND=dp)                                      :: scale
     232           0 :       TYPE(fft_scratch_sizes)                            :: fft_scratch_size
     233             :       TYPE(fft_scratch_type), POINTER                    :: fft_scratch
     234           0 :       TYPE(mp_cart_type)                                 :: rs_group
     235             : 
     236           0 :       CALL timeset(routineN, handle)
     237             : 
     238           0 :       scale = 1.0_dp/REAL(pw1%pw_grid%ngpts, KIND=dp)
     239             : 
     240             :       ! dimensions
     241           0 :       n => pw1%pw_grid%npts
     242           0 :       nloc => pw1%pw_grid%npts_local
     243           0 :       grays => pw1%pw_grid%grays
     244           0 :       ngpts = nloc(1)*nloc(2)*nloc(3)
     245             : 
     246             :       !..transform
     247           0 :       IF (pw1%pw_grid%para%ray_distribution) THEN
     248           0 :          rs_group = pw1%pw_grid%para%group
     249           0 :          nyzray => pw1%pw_grid%para%nyzray
     250           0 :          bo => pw1%pw_grid%para%bo
     251             : 
     252           0 :          g_pos = rs_group%mepos
     253           0 :          numtask = rs_group%num_pe
     254           0 :          r_dim = rs_group%num_pe_cart
     255           0 :          r_pos = rs_group%mepos_cart
     256             : 
     257           0 :          lg = SIZE(grays, 1)
     258           0 :          mg = SIZE(grays, 2)
     259           0 :          mmax = MAX(mg, 1)
     260           0 :          lmax = MAX(lg, (ngpts/mmax + 1))
     261             : 
     262           0 :          ALLOCATE (p2p(0:numtask - 1))
     263             : 
     264           0 :          CALL rs_group%rank_compare(rs_group, p2p)
     265             : 
     266           0 :          rp = p2p(g_pos)
     267           0 :          mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
     268           0 :          mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
     269           0 :          n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
     270           0 :          n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
     271           0 :          nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
     272           0 :          nmax = MAX(nmax, n1*MAXVAL(nyzray))
     273             : 
     274           0 :          fft_scratch_size%nx = nloc(1)
     275           0 :          fft_scratch_size%ny = nloc(2)
     276           0 :          fft_scratch_size%nz = nloc(3)
     277           0 :          fft_scratch_size%lmax = lmax
     278           0 :          fft_scratch_size%mmax = mmax
     279           0 :          fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
     280           0 :          fft_scratch_size%mx2 = mx2
     281           0 :          fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
     282           0 :          fft_scratch_size%mz2 = mz2
     283           0 :          fft_scratch_size%lg = lg
     284           0 :          fft_scratch_size%mg = mg
     285           0 :          fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
     286           0 :          fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
     287           0 :          fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
     288           0 :          fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
     289           0 :          fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
     290           0 :          fft_scratch_size%nmax = nmax
     291           0 :          fft_scratch_size%nmray = MAXVAL(nyzray)
     292           0 :          fft_scratch_size%nyzray = nyzray(g_pos)
     293           0 :          fft_scratch_size%rs_group = rs_group
     294           0 :          fft_scratch_size%g_pos = g_pos
     295           0 :          fft_scratch_size%r_pos = r_pos
     296           0 :          fft_scratch_size%r_dim = r_dim
     297           0 :          fft_scratch_size%numtask = numtask
     298             : 
     299           0 :          IF (r_dim(2) > 1) THEN
     300             :             !
     301             :             ! real space is distributed over x and y coordinate
     302             :             ! we have two stages of communication
     303             :             !
     304           0 :             IF (r_dim(1) == 1) &
     305           0 :                CPABORT("This processor distribution is not supported.")
     306             : 
     307           0 :             CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
     308             : 
     309             :             ! assign buffers
     310           0 :             qbuf => fft_scratch%p2buf
     311           0 :             rbuf => fft_scratch%p3buf
     312           0 :             pbuf => fft_scratch%p4buf
     313           0 :             sbuf => fft_scratch%p5buf
     314             : 
     315             :             ! FFT along z
     316           0 :             CALL pw_gpu_cf(pw1, qbuf)
     317             : 
     318             :             ! Exchange data ( transpose of matrix )
     319           0 :             CALL cube_transpose_2(qbuf, bo(:, :, :, 1), bo(:, :, :, 2), rbuf, fft_scratch)
     320             : 
     321             :             ! FFT along y
     322             :             ! use the inbuild fft-lib
     323             :             ! CALL fft_1dm(fft_scratch%fft_plan(2), rbuf, pbuf, 1.0_dp, stat)
     324             :             ! or cufft (works faster, but is only faster if plans are stored)
     325           0 :             CALL pw_gpu_f(rbuf, pbuf, +1, n(2), mx2*mz2)
     326             : 
     327             :             ! Exchange data ( transpose of matrix ) and sort
     328             :             CALL xz_to_yz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     329           0 :                           bo(:, :, :, 2), sbuf, fft_scratch)
     330             : 
     331             :             ! FFT along x
     332           0 :             CALL pw_gpu_fg(sbuf, pw2, scale)
     333             : 
     334           0 :             CALL release_fft_scratch(fft_scratch)
     335             : 
     336             :          ELSE
     337             :             !
     338             :             ! real space is only distributed over x coordinate
     339             :             ! we have one stage of communication, after the transform of
     340             :             ! direction x
     341             :             !
     342             : 
     343           0 :             CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
     344             : 
     345             :             ! assign buffers
     346           0 :             tbuf => fft_scratch%tbuf
     347           0 :             sbuf => fft_scratch%r1buf
     348             : 
     349             :             ! FFT along y and z
     350           0 :             CALL pw_gpu_cff(pw1, tbuf)
     351             : 
     352             :             ! Exchange data ( transpose of matrix ) and sort
     353             :             CALL yz_to_x(tbuf, rs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     354           0 :                          bo(:, :, :, 2), sbuf, fft_scratch)
     355             : 
     356             :             ! FFT along x
     357           0 :             CALL pw_gpu_fg(sbuf, pw2, scale)
     358             : 
     359           0 :             CALL release_fft_scratch(fft_scratch)
     360             : 
     361             :          END IF
     362             : 
     363           0 :          DEALLOCATE (p2p)
     364             : 
     365             : !--------------------------------------------------------------------------
     366             :       ELSE
     367           0 :          CPABORT("Not implemented (no ray_distr.) in: pw_gpu_r3dc1d_3d_ps.")
     368             :       END IF
     369             : 
     370           0 :       CALL timestop(handle)
     371           0 :    END SUBROUTINE pw_gpu_r3dc1d_3d_ps
     372             : 
     373             : ! **************************************************************************************************
     374             : !> \brief perform an parallel scatter followed by a fft on the gpu
     375             : !> \param pw1 ...
     376             : !> \param pw2 ...
     377             : !> \author Andreas Gloess
     378             : ! **************************************************************************************************
     379           0 :    SUBROUTINE pw_gpu_c1dr3d_3d_ps(pw1, pw2)
     380             :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     381             :       TYPE(pw_r3d_rs_type), INTENT(INOUT)                :: pw2
     382             : 
     383             :       CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_c1dr3d_3d_ps'
     384             : 
     385           0 :       COMPLEX(KIND=dp), DIMENSION(:, :), POINTER         :: grays, pbuf, qbuf, rbuf, sbuf
     386           0 :       COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER      :: tbuf
     387             :       INTEGER                                            :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
     388             :                                                             mz2, n1, n2, ngpts, nmax, numtask, rp
     389           0 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: p2p
     390             :       INTEGER, DIMENSION(2)                              :: r_dim, r_pos
     391           0 :       INTEGER, DIMENSION(:), POINTER                     :: n, nloc, nyzray
     392           0 :       INTEGER, DIMENSION(:, :, :, :), POINTER            :: bo
     393             :       REAL(KIND=dp)                                      :: scale
     394           0 :       TYPE(fft_scratch_sizes)                            :: fft_scratch_size
     395             :       TYPE(fft_scratch_type), POINTER                    :: fft_scratch
     396           0 :       TYPE(mp_cart_type)                                 :: rs_group
     397             : 
     398           0 :       CALL timeset(routineN, handle)
     399             : 
     400           0 :       scale = 1.0_dp
     401             : 
     402             :       ! dimensions
     403           0 :       n => pw1%pw_grid%npts
     404           0 :       nloc => pw1%pw_grid%npts_local
     405           0 :       grays => pw1%pw_grid%grays
     406           0 :       ngpts = nloc(1)*nloc(2)*nloc(3)
     407             : 
     408             :       !..transform
     409           0 :       IF (pw1%pw_grid%para%ray_distribution) THEN
     410           0 :          rs_group = pw1%pw_grid%para%group
     411           0 :          nyzray => pw1%pw_grid%para%nyzray
     412           0 :          bo => pw1%pw_grid%para%bo
     413             : 
     414           0 :          g_pos = rs_group%mepos
     415           0 :          numtask = rs_group%num_pe
     416           0 :          r_dim = rs_group%num_pe_cart
     417           0 :          r_pos = rs_group%mepos_cart
     418             : 
     419           0 :          lg = SIZE(grays, 1)
     420           0 :          mg = SIZE(grays, 2)
     421           0 :          mmax = MAX(mg, 1)
     422           0 :          lmax = MAX(lg, (ngpts/mmax + 1))
     423             : 
     424           0 :          ALLOCATE (p2p(0:numtask - 1))
     425             : 
     426           0 :          CALL rs_group%rank_compare(rs_group, p2p)
     427             : 
     428           0 :          rp = p2p(g_pos)
     429           0 :          mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
     430           0 :          mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
     431           0 :          n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
     432           0 :          n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
     433           0 :          nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
     434           0 :          nmax = MAX(nmax, n1*MAXVAL(nyzray))
     435             : 
     436           0 :          fft_scratch_size%nx = nloc(1)
     437           0 :          fft_scratch_size%ny = nloc(2)
     438           0 :          fft_scratch_size%nz = nloc(3)
     439           0 :          fft_scratch_size%lmax = lmax
     440           0 :          fft_scratch_size%mmax = mmax
     441           0 :          fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
     442           0 :          fft_scratch_size%mx2 = mx2
     443           0 :          fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
     444           0 :          fft_scratch_size%mz2 = mz2
     445           0 :          fft_scratch_size%lg = lg
     446           0 :          fft_scratch_size%mg = mg
     447           0 :          fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
     448           0 :          fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
     449           0 :          fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
     450           0 :          fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
     451           0 :          fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
     452           0 :          fft_scratch_size%nmax = nmax
     453           0 :          fft_scratch_size%nmray = MAXVAL(nyzray)
     454           0 :          fft_scratch_size%nyzray = nyzray(g_pos)
     455           0 :          fft_scratch_size%rs_group = rs_group
     456           0 :          fft_scratch_size%g_pos = g_pos
     457           0 :          fft_scratch_size%r_pos = r_pos
     458           0 :          fft_scratch_size%r_dim = r_dim
     459           0 :          fft_scratch_size%numtask = numtask
     460             : 
     461           0 :          IF (r_dim(2) > 1) THEN
     462             :             !
     463             :             ! real space is distributed over x and y coordinate
     464             :             ! we have two stages of communication
     465             :             !
     466           0 :             IF (r_dim(1) == 1) &
     467           0 :                CPABORT("This processor distribution is not supported.")
     468             : 
     469           0 :             CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
     470             : 
     471             :             ! assign buffers
     472           0 :             pbuf => fft_scratch%p7buf
     473           0 :             qbuf => fft_scratch%p4buf
     474           0 :             rbuf => fft_scratch%p3buf
     475           0 :             sbuf => fft_scratch%p2buf
     476             : 
     477             :             ! FFT along x
     478           0 :             CALL pw_gpu_sf(pw1, pbuf, scale)
     479             : 
     480             :             ! Exchange data ( transpose of matrix ) and sort
     481           0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) qbuf = z_zero
     482             :             CALL yz_to_xz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     483           0 :                           bo(:, :, :, 2), qbuf, fft_scratch)
     484             : 
     485             :             ! FFT along y
     486             :             ! use the inbuild fft-lib
     487             :             ! CALL fft_1dm(fft_scratch%fft_plan(5), qbuf, rbuf, 1.0_dp, stat)
     488             :             ! or cufft (works faster, but is only faster if plans are stored)
     489           0 :             CALL pw_gpu_f(qbuf, rbuf, -1, n(2), mx2*mz2)
     490             : 
     491             :             ! Exchange data ( transpose of matrix )
     492           0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) sbuf = z_zero
     493             : 
     494           0 :             CALL cube_transpose_1(rbuf, bo(:, :, :, 2), bo(:, :, :, 1), sbuf, fft_scratch)
     495             : 
     496             :             ! FFT along z
     497           0 :             CALL pw_gpu_fc(sbuf, pw2)
     498             : 
     499           0 :             CALL release_fft_scratch(fft_scratch)
     500             : 
     501             :          ELSE
     502             :             !
     503             :             ! real space is only distributed over x coordinate
     504             :             ! we have one stage of communication, after the transform of
     505             :             ! direction x
     506             :             !
     507             : 
     508           0 :             CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
     509             : 
     510             :             ! assign buffers
     511           0 :             sbuf => fft_scratch%r1buf
     512           0 :             tbuf => fft_scratch%tbuf
     513             : 
     514             :             ! FFT along x
     515           0 :             CALL pw_gpu_sf(pw1, sbuf, scale)
     516             : 
     517             :             ! Exchange data ( transpose of matrix ) and sort
     518           0 :             IF (pw1%pw_grid%grid_span /= FULLSPACE) tbuf = z_zero
     519             :             CALL x_to_yz(sbuf, rs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
     520           0 :                          bo(:, :, :, 2), tbuf, fft_scratch)
     521             : 
     522             :             ! FFT along y and z
     523           0 :             CALL pw_gpu_ffc(tbuf, pw2)
     524             : 
     525           0 :             CALL release_fft_scratch(fft_scratch)
     526             : 
     527             :          END IF
     528             : 
     529           0 :          DEALLOCATE (p2p)
     530             : 
     531             : !--------------------------------------------------------------------------
     532             :       ELSE
     533           0 :          CPABORT("Not implemented (no ray_distr.) in: pw_gpu_c1dr3d_3d_ps.")
     534             :       END IF
     535             : 
     536           0 :       CALL timestop(handle)
     537           0 :    END SUBROUTINE pw_gpu_c1dr3d_3d_ps
     538             : 
     539             : ! **************************************************************************************************
     540             : !> \brief perform a parallel real_to_complex copy followed by a 2D-FFT on the gpu
     541             : !> \param pw1 ...
     542             : !> \param pwbuf ...
     543             : !> \author Andreas Gloess
     544             : ! **************************************************************************************************
     545           0 :    SUBROUTINE pw_gpu_cff(pw1, pwbuf)
     546             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     547             :       COMPLEX(KIND=dp), DIMENSION(:, :, :), &
     548             :          INTENT(INOUT), TARGET                           :: pwbuf
     549             : 
     550             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_cff'
     551             : 
     552             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     553             :       INTEGER                                            :: handle, l1, l2, l3
     554           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     555             :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     556             :       INTERFACE
     557             :          SUBROUTINE pw_gpu_cff_c(din, zout, npts) BIND(C, name="pw_gpu_cff")
     558             :             IMPORT
     559             :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     560             :             TYPE(C_PTR), VALUE                           :: zout
     561             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     562             :          END SUBROUTINE pw_gpu_cff_c
     563             :       END INTERFACE
     564             : 
     565           0 :       CALL timeset(routineN, handle)
     566             : 
     567             :       ! dimensions
     568           0 :       npts => pw1%pw_grid%npts_local
     569             :       l1 = LBOUND(pw1%array, 1)
     570             :       l2 = LBOUND(pw1%array, 2)
     571             :       l3 = LBOUND(pw1%array, 3)
     572             : 
     573             :       ! pointers to data arrays
     574           0 :       ptr_pwin => pw1%array(l1, l2, l3)
     575           0 :       ptr_pwout => pwbuf(1, 1, 1)
     576             : 
     577             :       ! invoke the combined transformation
     578             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     579             :       CALL pw_gpu_cff_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     580             : #else
     581           0 :       CPABORT("Compiled without pw offloading")
     582             : #endif
     583             : 
     584           0 :       CALL timestop(handle)
     585           0 :    END SUBROUTINE pw_gpu_cff
     586             : 
     587             : ! **************************************************************************************************
     588             : !> \brief perform a parallel 2D-FFT followed by a complex_to_real copy on the gpu
     589             : !> \param pwbuf ...
     590             : !> \param pw2 ...
     591             : !> \author Andreas Gloess
     592             : ! **************************************************************************************************
     593           0 :    SUBROUTINE pw_gpu_ffc(pwbuf, pw2)
     594             :       COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
     595             :          TARGET                                          :: pwbuf
     596             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw2
     597             : 
     598             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_ffc'
     599             : 
     600             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     601             :       INTEGER                                            :: handle, l1, l2, l3
     602           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     603             :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     604             :       INTERFACE
     605             :          SUBROUTINE pw_gpu_ffc_c(zin, dout, npts) BIND(C, name="pw_gpu_ffc")
     606             :             IMPORT
     607             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     608             :             TYPE(C_PTR), VALUE                           :: dout
     609             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     610             :          END SUBROUTINE pw_gpu_ffc_c
     611             :       END INTERFACE
     612             : 
     613           0 :       CALL timeset(routineN, handle)
     614             : 
     615             :       ! dimensions
     616           0 :       npts => pw2%pw_grid%npts_local
     617             :       l1 = LBOUND(pw2%array, 1)
     618             :       l2 = LBOUND(pw2%array, 2)
     619             :       l3 = LBOUND(pw2%array, 3)
     620             : 
     621             :       ! pointers to data arrays
     622           0 :       ptr_pwin => pwbuf(1, 1, 1)
     623           0 :       ptr_pwout => pw2%array(l1, l2, l3)
     624             : 
     625             :       ! invoke the combined transformation
     626             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     627             :       CALL pw_gpu_ffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     628             : #else
     629           0 :       CPABORT("Compiled without pw offloading")
     630             : #endif
     631             : 
     632           0 :       CALL timestop(handle)
     633           0 :    END SUBROUTINE pw_gpu_ffc
     634             : 
     635             : ! **************************************************************************************************
     636             : !> \brief perform a parallel real_to_complex copy followed by a 1D-FFT on the gpu
     637             : !> \param pw1 ...
     638             : !> \param pwbuf ...
     639             : !> \author Andreas Gloess
     640             : ! **************************************************************************************************
     641           0 :    SUBROUTINE pw_gpu_cf(pw1, pwbuf)
     642             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw1
     643             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     644             :          TARGET                                          :: pwbuf
     645             : 
     646             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_cf'
     647             : 
     648             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwout
     649             :       INTEGER                                            :: handle, l1, l2, l3
     650           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     651             :       REAL(KIND=dp), POINTER                             :: ptr_pwin
     652             :       INTERFACE
     653             :          SUBROUTINE pw_gpu_cf_c(din, zout, npts) BIND(C, name="pw_gpu_cf")
     654             :             IMPORT
     655             :             TYPE(C_PTR), INTENT(IN), VALUE               :: din
     656             :             TYPE(C_PTR), VALUE                           :: zout
     657             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     658             :          END SUBROUTINE pw_gpu_cf_c
     659             :       END INTERFACE
     660             : 
     661           0 :       CALL timeset(routineN, handle)
     662             : 
     663             :       ! dimensions
     664           0 :       npts => pw1%pw_grid%npts_local
     665             :       l1 = LBOUND(pw1%array, 1)
     666             :       l2 = LBOUND(pw1%array, 2)
     667             :       l3 = LBOUND(pw1%array, 3)
     668             : 
     669             :       ! pointers to data arrays
     670           0 :       ptr_pwin => pw1%array(l1, l2, l3)
     671           0 :       ptr_pwout => pwbuf(1, 1)
     672             : 
     673             :       ! invoke the combined transformation
     674             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     675             :       CALL pw_gpu_cf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     676             : #else
     677           0 :       CPABORT("Compiled without pw offloading")
     678             : #endif
     679           0 :       CALL timestop(handle)
     680           0 :    END SUBROUTINE pw_gpu_cf
     681             : 
     682             : ! **************************************************************************************************
     683             : !> \brief perform a parallel 1D-FFT followed by a complex_to_real copy on the gpu
     684             : !> \param pwbuf ...
     685             : !> \param pw2 ...
     686             : !> \author Andreas Gloess
     687             : ! **************************************************************************************************
     688           0 :    SUBROUTINE pw_gpu_fc(pwbuf, pw2)
     689             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     690             :          TARGET                                          :: pwbuf
     691             :       TYPE(pw_r3d_rs_type), INTENT(IN)                   :: pw2
     692             : 
     693             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_fc'
     694             : 
     695             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin
     696             :       INTEGER                                            :: handle, l1, l2, l3
     697           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     698             :       REAL(KIND=dp), POINTER                             :: ptr_pwout
     699             :       INTERFACE
     700             :          SUBROUTINE pw_gpu_fc_c(zin, dout, npts) BIND(C, name="pw_gpu_fc")
     701             :             IMPORT
     702             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     703             :             TYPE(C_PTR), VALUE                           :: dout
     704             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     705             :          END SUBROUTINE pw_gpu_fc_c
     706             :       END INTERFACE
     707             : 
     708           0 :       CALL timeset(routineN, handle)
     709             : 
     710           0 :       npts => pw2%pw_grid%npts_local
     711             :       l1 = LBOUND(pw2%array, 1)
     712             :       l2 = LBOUND(pw2%array, 2)
     713             :       l3 = LBOUND(pw2%array, 3)
     714             : 
     715             :       ! pointers to data arrays
     716           0 :       ptr_pwin => pwbuf(1, 1)
     717           0 :       ptr_pwout => pw2%array(l1, l2, l3)
     718             : 
     719             :       ! invoke the combined transformation
     720             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     721             :       CALL pw_gpu_fc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
     722             : #else
     723           0 :       CPABORT("Compiled without pw offloading")
     724             : #endif
     725             : 
     726           0 :       CALL timestop(handle)
     727           0 :    END SUBROUTINE pw_gpu_fc
     728             : 
     729             : ! **************************************************************************************************
     730             : !> \brief perform a parallel 1D-FFT on the gpu
     731             : !> \param pwbuf1 ...
     732             : !> \param pwbuf2 ...
     733             : !> \param dir ...
     734             : !> \param n ...
     735             : !> \param m ...
     736             : !> \author Andreas Gloess
     737             : ! **************************************************************************************************
     738           0 :    SUBROUTINE pw_gpu_f(pwbuf1, pwbuf2, dir, n, m)
     739             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     740             :          TARGET                                          :: pwbuf1
     741             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     742             :          TARGET                                          :: pwbuf2
     743             :       INTEGER, INTENT(IN)                                :: dir, n, m
     744             : 
     745             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_f'
     746             : 
     747             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     748             :       INTEGER                                            :: handle
     749             :       INTERFACE
     750             :          SUBROUTINE pw_gpu_f_c(zin, zout, dir, n, m) BIND(C, name="pw_gpu_f")
     751             :             IMPORT
     752             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     753             :             TYPE(C_PTR), VALUE                           :: zout
     754             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: dir, n, m
     755             :          END SUBROUTINE pw_gpu_f_c
     756             :       END INTERFACE
     757             : 
     758           0 :       CALL timeset(routineN, handle)
     759             : 
     760           0 :       IF (n*m /= 0) THEN
     761             :          ! pointers to data arrays
     762           0 :          ptr_pwin => pwbuf1(1, 1)
     763           0 :          ptr_pwout => pwbuf2(1, 1)
     764             : 
     765             :          ! invoke the combined transformation
     766             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     767             :          CALL pw_gpu_f_c(c_loc(ptr_pwin), c_loc(ptr_pwout), dir, n, m)
     768             : #else
     769             :          MARK_USED(dir)
     770           0 :          CPABORT("Compiled without pw offloading")
     771             : #endif
     772             :       END IF
     773             : 
     774           0 :       CALL timestop(handle)
     775           0 :    END SUBROUTINE pw_gpu_f
     776             : ! **************************************************************************************************
     777             : !> \brief perform a parallel 1D-FFT followed by a gather on the gpu
     778             : !> \param pwbuf ...
     779             : !> \param pw2 ...
     780             : !> \param scale ...
     781             : !> \author Andreas Gloess
     782             : ! **************************************************************************************************
     783           0 :    SUBROUTINE pw_gpu_fg(pwbuf, pw2, scale)
     784             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
     785             :          TARGET                                          :: pwbuf
     786             :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw2
     787             :       REAL(KIND=dp), INTENT(IN)                          :: scale
     788             : 
     789             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_fg'
     790             : 
     791             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     792             :       INTEGER                                            :: handle, mg, mmax, ngpts
     793           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     794             :       INTEGER, POINTER                                   :: ptr_ghatmap
     795             :       INTERFACE
     796             :          SUBROUTINE pw_gpu_fg_c(zin, zout, ghatmap, npts, mmax, ngpts, scale) BIND(C, name="pw_gpu_fg")
     797             :             IMPORT
     798             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     799             :             TYPE(C_PTR), VALUE                           :: zout
     800             :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     801             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     802             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: mmax, ngpts
     803             :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     804             : 
     805             :          END SUBROUTINE pw_gpu_fg_c
     806             :       END INTERFACE
     807             : 
     808           0 :       CALL timeset(routineN, handle)
     809             : 
     810           0 :       ngpts = SIZE(pw2%pw_grid%gsq)
     811           0 :       npts => pw2%pw_grid%npts
     812             : 
     813           0 :       IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
     814           0 :          mg = SIZE(pw2%pw_grid%grays, 2)
     815           0 :          mmax = MAX(mg, 1)
     816             : 
     817             :          ! pointers to data arrays
     818           0 :          ptr_pwin => pwbuf(1, 1)
     819           0 :          ptr_pwout => pw2%array(1)
     820             : 
     821             :          ! pointer to map array
     822           0 :          ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
     823             : 
     824             :          ! invoke the combined transformation
     825             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     826             :          CALL pw_gpu_fg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, scale)
     827             : #else
     828             :          MARK_USED(scale)
     829           0 :          CPABORT("Compiled without pw offloading")
     830             : #endif
     831             :       END IF
     832             : 
     833           0 :       CALL timestop(handle)
     834           0 :    END SUBROUTINE pw_gpu_fg
     835             : 
     836             : ! **************************************************************************************************
     837             : !> \brief perform a parallel scatter followed by a 1D-FFT on the gpu
     838             : !> \param pw1 ...
     839             : !> \param pwbuf ...
     840             : !> \param scale ...
     841             : !> \author Andreas Gloess
     842             : ! **************************************************************************************************
     843           0 :    SUBROUTINE pw_gpu_sf(pw1, pwbuf, scale)
     844             :       TYPE(pw_c1d_gs_type), INTENT(IN)                   :: pw1
     845             :       COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
     846             :          TARGET                                          :: pwbuf
     847             :       REAL(KIND=dp), INTENT(IN)                          :: scale
     848             : 
     849             :       CHARACTER(len=*), PARAMETER                        :: routineN = 'pw_gpu_sf'
     850             : 
     851             :       COMPLEX(KIND=dp), POINTER                          :: ptr_pwin, ptr_pwout
     852             :       INTEGER                                            :: handle, mg, mmax, ngpts, nmaps
     853           0 :       INTEGER, DIMENSION(:), POINTER                     :: npts
     854             :       INTEGER, POINTER                                   :: ptr_ghatmap
     855             :       INTERFACE
     856             :          SUBROUTINE pw_gpu_sf_c(zin, zout, ghatmap, npts, mmax, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sf")
     857             :             IMPORT
     858             :             TYPE(C_PTR), INTENT(IN), VALUE               :: zin
     859             :             TYPE(C_PTR), VALUE                           :: zout
     860             :             TYPE(C_PTR), INTENT(IN), VALUE               :: ghatmap
     861             :             INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
     862             :             INTEGER(KIND=C_INT), INTENT(IN), VALUE       :: mmax, ngpts, nmaps
     863             :             REAL(KIND=C_DOUBLE), INTENT(IN), VALUE       :: scale
     864             : 
     865             :          END SUBROUTINE pw_gpu_sf_c
     866             :       END INTERFACE
     867             : 
     868           0 :       CALL timeset(routineN, handle)
     869             : 
     870           0 :       ngpts = SIZE(pw1%pw_grid%gsq)
     871           0 :       npts => pw1%pw_grid%npts
     872             : 
     873           0 :       IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
     874           0 :          mg = SIZE(pw1%pw_grid%grays, 2)
     875           0 :          mmax = MAX(mg, 1)
     876             : 
     877             :          ! pointers to data arrays
     878           0 :          ptr_pwin => pw1%array(1)
     879           0 :          ptr_pwout => pwbuf(1, 1)
     880             : 
     881             :          ! pointer to map array
     882           0 :          nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
     883           0 :          ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
     884             : 
     885             :          ! invoke the combined transformation
     886             : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
     887             :          CALL pw_gpu_sf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, nmaps, scale)
     888             : #else
     889             :          MARK_USED(scale)
     890           0 :          CPABORT("Compiled without pw offloading")
     891             : #endif
     892             :       END IF
     893             : 
     894           0 :       CALL timestop(handle)
     895           0 :    END SUBROUTINE pw_gpu_sf
     896             : 
     897             : END MODULE pw_gpu
     898             : 

Generated by: LCOV version 1.15