Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \note
10 : !> This module contains routines necessary to operate on plane waves on GPUs
11 : ! > independently of the GPU platform.
12 : !> \par History
13 : !> BGL (06-Mar-2008) : Created
14 : !> AG (18-May-2012) : Refacturing:
15 : !> - added explicit interfaces to C routines
16 : !> - enable double precision complex transformations
17 : !> AG (11-Sept-2012) : Modifications:
18 : !> - use pointers if precision mapping is not required
19 : !> - use OMP for mapping
20 : !> MT (Jan 2022) : Modifications
21 : !> - use a generic interface for fft calls to GPUs
22 : !> - Support both Nvidia and AMD GPUs. Other GPUs manufacturers
23 : !> can be added easily.
24 : !> \author Benjamin G. Levine
25 : ! **************************************************************************************************
26 : MODULE pw_gpu
27 : USE ISO_C_BINDING, ONLY: C_DOUBLE,&
28 : C_INT,&
29 : C_LOC,&
30 : C_PTR
31 : USE fft_tools, ONLY: &
32 : cube_transpose_1, cube_transpose_2, fft_scratch_sizes, fft_scratch_type, get_fft_scratch, &
33 : release_fft_scratch, x_to_yz, xz_to_yz, yz_to_x, yz_to_xz
34 : USE kinds, ONLY: dp
35 : USE mathconstants, ONLY: z_zero
36 : USE message_passing, ONLY: mp_cart_type
37 : USE pw_grid_types, ONLY: FULLSPACE
38 : USE pw_types, ONLY: pw_c1d_gs_type,&
39 : pw_r3d_rs_type
40 : #include "../base/base_uses.f90"
41 :
42 : IMPLICIT NONE
43 :
44 : PRIVATE
45 :
46 : PUBLIC :: pw_gpu_r3dc1d_3d
47 : PUBLIC :: pw_gpu_c1dr3d_3d
48 : PUBLIC :: pw_gpu_r3dc1d_3d_ps
49 : PUBLIC :: pw_gpu_c1dr3d_3d_ps
50 : PUBLIC :: pw_gpu_init, pw_gpu_finalize
51 :
52 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'pw_gpu'
53 : LOGICAL, PARAMETER, PRIVATE :: debug_this_module = .FALSE.
54 :
55 : CONTAINS
56 :
57 : ! **************************************************************************************************
58 : !> \brief Allocates resources on the gpu device for gpu fft acceleration
59 : !> \author Ole Schuett
60 : ! **************************************************************************************************
61 9174 : SUBROUTINE pw_gpu_init()
62 : INTEGER :: dummy
63 : INTERFACE
64 : SUBROUTINE pw_gpu_init_c() BIND(C, name="pw_gpu_init")
65 : END SUBROUTINE pw_gpu_init_c
66 : END INTERFACE
67 :
68 : MARK_USED(dummy) ! TODO: fix fpretty
69 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
70 : CALL pw_gpu_init_c()
71 : #else
72 : ! Nothing to do.
73 : #endif
74 9174 : END SUBROUTINE pw_gpu_init
75 :
76 : ! **************************************************************************************************
77 : !> \brief Releases resources on the gpu device for gpu fft acceleration
78 : !> \author Ole Schuett
79 : ! **************************************************************************************************
80 9174 : SUBROUTINE pw_gpu_finalize()
81 : INTEGER :: dummy
82 : INTERFACE
83 : SUBROUTINE pw_gpu_finalize_c() BIND(C, name="pw_gpu_finalize")
84 : END SUBROUTINE pw_gpu_finalize_c
85 : END INTERFACE
86 :
87 : MARK_USED(dummy) ! TODO: fix fpretty
88 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
89 : CALL pw_gpu_finalize_c()
90 : #else
91 : ! Nothing to do.
92 : #endif
93 9174 : END SUBROUTINE pw_gpu_finalize
94 :
95 : ! **************************************************************************************************
96 : !> \brief perform an fft followed by a gather on the gpu
97 : !> \param pw1 ...
98 : !> \param pw2 ...
99 : !> \author Benjamin G Levine
100 : ! **************************************************************************************************
101 0 : SUBROUTINE pw_gpu_r3dc1d_3d(pw1, pw2)
102 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
103 : TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
104 :
105 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_r3dc1d_3d'
106 :
107 : COMPLEX(KIND=dp), POINTER :: ptr_pwout
108 : INTEGER :: handle, l1, l2, l3, ngpts
109 0 : INTEGER, DIMENSION(:), POINTER :: npts
110 : INTEGER, POINTER :: ptr_ghatmap
111 : REAL(KIND=dp) :: scale
112 : REAL(KIND=dp), POINTER :: ptr_pwin
113 : INTERFACE
114 : SUBROUTINE pw_gpu_cfffg_c(din, zout, ghatmap, npts, ngpts, scale) BIND(C, name="pw_gpu_cfffg")
115 : IMPORT
116 : TYPE(C_PTR), INTENT(IN), VALUE :: din
117 : TYPE(C_PTR), VALUE :: zout
118 : TYPE(C_PTR), INTENT(IN), VALUE :: ghatmap
119 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
120 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts
121 : REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: scale
122 :
123 : END SUBROUTINE pw_gpu_cfffg_c
124 : END INTERFACE
125 :
126 0 : CALL timeset(routineN, handle)
127 :
128 0 : scale = 1.0_dp/REAL(pw1%pw_grid%ngpts, KIND=dp)
129 :
130 0 : ngpts = SIZE(pw2%pw_grid%gsq)
131 : l1 = LBOUND(pw1%array, 1)
132 : l2 = LBOUND(pw1%array, 2)
133 : l3 = LBOUND(pw1%array, 3)
134 0 : npts => pw1%pw_grid%npts
135 :
136 : ! pointers to data arrays
137 0 : ptr_pwin => pw1%array(l1, l2, l3)
138 0 : ptr_pwout => pw2%array(1)
139 :
140 : ! pointer to map array
141 0 : ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
142 :
143 : ! invoke the combined transformation
144 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
145 : CALL pw_gpu_cfffg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, scale)
146 : #else
147 0 : CPABORT("Compiled without pw offloading.")
148 : #endif
149 :
150 0 : CALL timestop(handle)
151 0 : END SUBROUTINE pw_gpu_r3dc1d_3d
152 :
153 : ! **************************************************************************************************
154 : !> \brief perform an scatter followed by a fft on the gpu
155 : !> \param pw1 ...
156 : !> \param pw2 ...
157 : !> \author Benjamin G Levine
158 : ! **************************************************************************************************
159 0 : SUBROUTINE pw_gpu_c1dr3d_3d(pw1, pw2)
160 : TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
161 : TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
162 :
163 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_c1dr3d_3d'
164 :
165 : COMPLEX(KIND=dp), POINTER :: ptr_pwin
166 : INTEGER :: handle, l1, l2, l3, ngpts, nmaps
167 0 : INTEGER, DIMENSION(:), POINTER :: npts
168 : INTEGER, POINTER :: ptr_ghatmap
169 : REAL(KIND=dp) :: scale
170 : REAL(KIND=dp), POINTER :: ptr_pwout
171 : INTERFACE
172 : SUBROUTINE pw_gpu_sfffc_c(zin, dout, ghatmap, npts, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sfffc")
173 : IMPORT
174 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
175 : TYPE(C_PTR), VALUE :: dout
176 : TYPE(C_PTR), INTENT(IN), VALUE :: ghatmap
177 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
178 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: ngpts, nmaps
179 : REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: scale
180 : END SUBROUTINE pw_gpu_sfffc_c
181 : END INTERFACE
182 :
183 0 : CALL timeset(routineN, handle)
184 :
185 0 : scale = 1.0_dp
186 :
187 0 : ngpts = SIZE(pw1%pw_grid%gsq)
188 : l1 = LBOUND(pw2%array, 1)
189 : l2 = LBOUND(pw2%array, 2)
190 : l3 = LBOUND(pw2%array, 3)
191 0 : npts => pw1%pw_grid%npts
192 :
193 : ! pointers to data arrays
194 0 : ptr_pwin => pw1%array(1)
195 0 : ptr_pwout => pw2%array(l1, l2, l3)
196 :
197 : ! pointer to map array
198 0 : nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
199 0 : ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
200 :
201 : ! invoke the combined transformation
202 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
203 : CALL pw_gpu_sfffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, ngpts, nmaps, scale)
204 : #else
205 0 : CPABORT("Compiled without pw offloading")
206 : #endif
207 :
208 0 : CALL timestop(handle)
209 0 : END SUBROUTINE pw_gpu_c1dr3d_3d
210 :
211 : ! **************************************************************************************************
212 : !> \brief perform an parallel fft followed by a gather on the gpu
213 : !> \param pw1 ...
214 : !> \param pw2 ...
215 : !> \author Andreas Gloess
216 : ! **************************************************************************************************
217 0 : SUBROUTINE pw_gpu_r3dc1d_3d_ps(pw1, pw2)
218 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
219 : TYPE(pw_c1d_gs_type), INTENT(INOUT) :: pw2
220 :
221 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_r3dc1d_3d_ps'
222 :
223 0 : COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
224 0 : COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
225 : INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
226 : mz2, n1, n2, ngpts, nmax, numtask, rp
227 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
228 : INTEGER, DIMENSION(2) :: r_dim, r_pos
229 0 : INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
230 0 : INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
231 : REAL(KIND=dp) :: scale
232 0 : TYPE(fft_scratch_sizes) :: fft_scratch_size
233 : TYPE(fft_scratch_type), POINTER :: fft_scratch
234 0 : TYPE(mp_cart_type) :: rs_group
235 :
236 0 : CALL timeset(routineN, handle)
237 :
238 0 : scale = 1.0_dp/REAL(pw1%pw_grid%ngpts, KIND=dp)
239 :
240 : ! dimensions
241 0 : n => pw1%pw_grid%npts
242 0 : nloc => pw1%pw_grid%npts_local
243 0 : grays => pw1%pw_grid%grays
244 0 : ngpts = nloc(1)*nloc(2)*nloc(3)
245 :
246 : !..transform
247 0 : IF (pw1%pw_grid%para%ray_distribution) THEN
248 0 : rs_group = pw1%pw_grid%para%group
249 0 : nyzray => pw1%pw_grid%para%nyzray
250 0 : bo => pw1%pw_grid%para%bo
251 :
252 0 : g_pos = rs_group%mepos
253 0 : numtask = rs_group%num_pe
254 0 : r_dim = rs_group%num_pe_cart
255 0 : r_pos = rs_group%mepos_cart
256 :
257 0 : lg = SIZE(grays, 1)
258 0 : mg = SIZE(grays, 2)
259 0 : mmax = MAX(mg, 1)
260 0 : lmax = MAX(lg, (ngpts/mmax + 1))
261 :
262 0 : ALLOCATE (p2p(0:numtask - 1))
263 :
264 0 : CALL rs_group%rank_compare(rs_group, p2p)
265 :
266 0 : rp = p2p(g_pos)
267 0 : mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
268 0 : mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
269 0 : n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
270 0 : n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
271 0 : nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
272 0 : nmax = MAX(nmax, n1*MAXVAL(nyzray))
273 :
274 0 : fft_scratch_size%nx = nloc(1)
275 0 : fft_scratch_size%ny = nloc(2)
276 0 : fft_scratch_size%nz = nloc(3)
277 0 : fft_scratch_size%lmax = lmax
278 0 : fft_scratch_size%mmax = mmax
279 0 : fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
280 0 : fft_scratch_size%mx2 = mx2
281 0 : fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
282 0 : fft_scratch_size%mz2 = mz2
283 0 : fft_scratch_size%lg = lg
284 0 : fft_scratch_size%mg = mg
285 0 : fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
286 0 : fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
287 0 : fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
288 0 : fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
289 0 : fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
290 0 : fft_scratch_size%nmax = nmax
291 0 : fft_scratch_size%nmray = MAXVAL(nyzray)
292 0 : fft_scratch_size%nyzray = nyzray(g_pos)
293 0 : fft_scratch_size%rs_group = rs_group
294 0 : fft_scratch_size%g_pos = g_pos
295 0 : fft_scratch_size%r_pos = r_pos
296 0 : fft_scratch_size%r_dim = r_dim
297 0 : fft_scratch_size%numtask = numtask
298 :
299 0 : IF (r_dim(2) > 1) THEN
300 : !
301 : ! real space is distributed over x and y coordinate
302 : ! we have two stages of communication
303 : !
304 0 : IF (r_dim(1) == 1) &
305 0 : CPABORT("This processor distribution is not supported.")
306 :
307 0 : CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
308 :
309 : ! assign buffers
310 0 : qbuf => fft_scratch%p2buf
311 0 : rbuf => fft_scratch%p3buf
312 0 : pbuf => fft_scratch%p4buf
313 0 : sbuf => fft_scratch%p5buf
314 :
315 : ! FFT along z
316 0 : CALL pw_gpu_cf(pw1, qbuf)
317 :
318 : ! Exchange data ( transpose of matrix )
319 0 : CALL cube_transpose_2(qbuf, bo(:, :, :, 1), bo(:, :, :, 2), rbuf, fft_scratch)
320 :
321 : ! FFT along y
322 : ! use the inbuild fft-lib
323 : ! CALL fft_1dm(fft_scratch%fft_plan(2), rbuf, pbuf, 1.0_dp, stat)
324 : ! or cufft (works faster, but is only faster if plans are stored)
325 0 : CALL pw_gpu_f(rbuf, pbuf, +1, n(2), mx2*mz2)
326 :
327 : ! Exchange data ( transpose of matrix ) and sort
328 : CALL xz_to_yz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
329 0 : bo(:, :, :, 2), sbuf, fft_scratch)
330 :
331 : ! FFT along x
332 0 : CALL pw_gpu_fg(sbuf, pw2, scale)
333 :
334 0 : CALL release_fft_scratch(fft_scratch)
335 :
336 : ELSE
337 : !
338 : ! real space is only distributed over x coordinate
339 : ! we have one stage of communication, after the transform of
340 : ! direction x
341 : !
342 :
343 0 : CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
344 :
345 : ! assign buffers
346 0 : tbuf => fft_scratch%tbuf
347 0 : sbuf => fft_scratch%r1buf
348 :
349 : ! FFT along y and z
350 0 : CALL pw_gpu_cff(pw1, tbuf)
351 :
352 : ! Exchange data ( transpose of matrix ) and sort
353 : CALL yz_to_x(tbuf, rs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
354 0 : bo(:, :, :, 2), sbuf, fft_scratch)
355 :
356 : ! FFT along x
357 0 : CALL pw_gpu_fg(sbuf, pw2, scale)
358 :
359 0 : CALL release_fft_scratch(fft_scratch)
360 :
361 : END IF
362 :
363 0 : DEALLOCATE (p2p)
364 :
365 : !--------------------------------------------------------------------------
366 : ELSE
367 0 : CPABORT("Not implemented (no ray_distr.) in: pw_gpu_r3dc1d_3d_ps.")
368 : END IF
369 :
370 0 : CALL timestop(handle)
371 0 : END SUBROUTINE pw_gpu_r3dc1d_3d_ps
372 :
373 : ! **************************************************************************************************
374 : !> \brief perform an parallel scatter followed by a fft on the gpu
375 : !> \param pw1 ...
376 : !> \param pw2 ...
377 : !> \author Andreas Gloess
378 : ! **************************************************************************************************
379 0 : SUBROUTINE pw_gpu_c1dr3d_3d_ps(pw1, pw2)
380 : TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
381 : TYPE(pw_r3d_rs_type), INTENT(INOUT) :: pw2
382 :
383 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_c1dr3d_3d_ps'
384 :
385 0 : COMPLEX(KIND=dp), DIMENSION(:, :), POINTER :: grays, pbuf, qbuf, rbuf, sbuf
386 0 : COMPLEX(KIND=dp), DIMENSION(:, :, :), POINTER :: tbuf
387 : INTEGER :: g_pos, handle, lg, lmax, mg, mmax, mx2, &
388 : mz2, n1, n2, ngpts, nmax, numtask, rp
389 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: p2p
390 : INTEGER, DIMENSION(2) :: r_dim, r_pos
391 0 : INTEGER, DIMENSION(:), POINTER :: n, nloc, nyzray
392 0 : INTEGER, DIMENSION(:, :, :, :), POINTER :: bo
393 : REAL(KIND=dp) :: scale
394 0 : TYPE(fft_scratch_sizes) :: fft_scratch_size
395 : TYPE(fft_scratch_type), POINTER :: fft_scratch
396 0 : TYPE(mp_cart_type) :: rs_group
397 :
398 0 : CALL timeset(routineN, handle)
399 :
400 0 : scale = 1.0_dp
401 :
402 : ! dimensions
403 0 : n => pw1%pw_grid%npts
404 0 : nloc => pw1%pw_grid%npts_local
405 0 : grays => pw1%pw_grid%grays
406 0 : ngpts = nloc(1)*nloc(2)*nloc(3)
407 :
408 : !..transform
409 0 : IF (pw1%pw_grid%para%ray_distribution) THEN
410 0 : rs_group = pw1%pw_grid%para%group
411 0 : nyzray => pw1%pw_grid%para%nyzray
412 0 : bo => pw1%pw_grid%para%bo
413 :
414 0 : g_pos = rs_group%mepos
415 0 : numtask = rs_group%num_pe
416 0 : r_dim = rs_group%num_pe_cart
417 0 : r_pos = rs_group%mepos_cart
418 :
419 0 : lg = SIZE(grays, 1)
420 0 : mg = SIZE(grays, 2)
421 0 : mmax = MAX(mg, 1)
422 0 : lmax = MAX(lg, (ngpts/mmax + 1))
423 :
424 0 : ALLOCATE (p2p(0:numtask - 1))
425 :
426 0 : CALL rs_group%rank_compare(rs_group, p2p)
427 :
428 0 : rp = p2p(g_pos)
429 0 : mx2 = bo(2, 1, rp, 2) - bo(1, 1, rp, 2) + 1
430 0 : mz2 = bo(2, 3, rp, 2) - bo(1, 3, rp, 2) + 1
431 0 : n1 = MAXVAL(bo(2, 1, :, 1) - bo(1, 1, :, 1) + 1)
432 0 : n2 = MAXVAL(bo(2, 2, :, 1) - bo(1, 2, :, 1) + 1)
433 0 : nmax = MAX((2*n2)/numtask, 2)*mx2*mz2
434 0 : nmax = MAX(nmax, n1*MAXVAL(nyzray))
435 :
436 0 : fft_scratch_size%nx = nloc(1)
437 0 : fft_scratch_size%ny = nloc(2)
438 0 : fft_scratch_size%nz = nloc(3)
439 0 : fft_scratch_size%lmax = lmax
440 0 : fft_scratch_size%mmax = mmax
441 0 : fft_scratch_size%mx1 = bo(2, 1, rp, 1) - bo(1, 1, rp, 1) + 1
442 0 : fft_scratch_size%mx2 = mx2
443 0 : fft_scratch_size%my1 = bo(2, 2, rp, 1) - bo(1, 2, rp, 1) + 1
444 0 : fft_scratch_size%mz2 = mz2
445 0 : fft_scratch_size%lg = lg
446 0 : fft_scratch_size%mg = mg
447 0 : fft_scratch_size%nbx = MAXVAL(bo(2, 1, :, 2))
448 0 : fft_scratch_size%nbz = MAXVAL(bo(2, 3, :, 2))
449 0 : fft_scratch_size%mcz1 = MAXVAL(bo(2, 3, :, 1) - bo(1, 3, :, 1) + 1)
450 0 : fft_scratch_size%mcx2 = MAXVAL(bo(2, 1, :, 2) - bo(1, 1, :, 2) + 1)
451 0 : fft_scratch_size%mcz2 = MAXVAL(bo(2, 3, :, 2) - bo(1, 3, :, 2) + 1)
452 0 : fft_scratch_size%nmax = nmax
453 0 : fft_scratch_size%nmray = MAXVAL(nyzray)
454 0 : fft_scratch_size%nyzray = nyzray(g_pos)
455 0 : fft_scratch_size%rs_group = rs_group
456 0 : fft_scratch_size%g_pos = g_pos
457 0 : fft_scratch_size%r_pos = r_pos
458 0 : fft_scratch_size%r_dim = r_dim
459 0 : fft_scratch_size%numtask = numtask
460 :
461 0 : IF (r_dim(2) > 1) THEN
462 : !
463 : ! real space is distributed over x and y coordinate
464 : ! we have two stages of communication
465 : !
466 0 : IF (r_dim(1) == 1) &
467 0 : CPABORT("This processor distribution is not supported.")
468 :
469 0 : CALL get_fft_scratch(fft_scratch, tf_type=300, n=n, fft_sizes=fft_scratch_size)
470 :
471 : ! assign buffers
472 0 : pbuf => fft_scratch%p7buf
473 0 : qbuf => fft_scratch%p4buf
474 0 : rbuf => fft_scratch%p3buf
475 0 : sbuf => fft_scratch%p2buf
476 :
477 : ! FFT along x
478 0 : CALL pw_gpu_sf(pw1, pbuf, scale)
479 :
480 : ! Exchange data ( transpose of matrix ) and sort
481 0 : IF (pw1%pw_grid%grid_span /= FULLSPACE) qbuf = z_zero
482 : CALL yz_to_xz(pbuf, rs_group, r_dim, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
483 0 : bo(:, :, :, 2), qbuf, fft_scratch)
484 :
485 : ! FFT along y
486 : ! use the inbuild fft-lib
487 : ! CALL fft_1dm(fft_scratch%fft_plan(5), qbuf, rbuf, 1.0_dp, stat)
488 : ! or cufft (works faster, but is only faster if plans are stored)
489 0 : CALL pw_gpu_f(qbuf, rbuf, -1, n(2), mx2*mz2)
490 :
491 : ! Exchange data ( transpose of matrix )
492 0 : IF (pw1%pw_grid%grid_span /= FULLSPACE) sbuf = z_zero
493 :
494 0 : CALL cube_transpose_1(rbuf, bo(:, :, :, 2), bo(:, :, :, 1), sbuf, fft_scratch)
495 :
496 : ! FFT along z
497 0 : CALL pw_gpu_fc(sbuf, pw2)
498 :
499 0 : CALL release_fft_scratch(fft_scratch)
500 :
501 : ELSE
502 : !
503 : ! real space is only distributed over x coordinate
504 : ! we have one stage of communication, after the transform of
505 : ! direction x
506 : !
507 :
508 0 : CALL get_fft_scratch(fft_scratch, tf_type=200, n=n, fft_sizes=fft_scratch_size)
509 :
510 : ! assign buffers
511 0 : sbuf => fft_scratch%r1buf
512 0 : tbuf => fft_scratch%tbuf
513 :
514 : ! FFT along x
515 0 : CALL pw_gpu_sf(pw1, sbuf, scale)
516 :
517 : ! Exchange data ( transpose of matrix ) and sort
518 0 : IF (pw1%pw_grid%grid_span /= FULLSPACE) tbuf = z_zero
519 : CALL x_to_yz(sbuf, rs_group, g_pos, p2p, pw1%pw_grid%para%yzp, nyzray, &
520 0 : bo(:, :, :, 2), tbuf, fft_scratch)
521 :
522 : ! FFT along y and z
523 0 : CALL pw_gpu_ffc(tbuf, pw2)
524 :
525 0 : CALL release_fft_scratch(fft_scratch)
526 :
527 : END IF
528 :
529 0 : DEALLOCATE (p2p)
530 :
531 : !--------------------------------------------------------------------------
532 : ELSE
533 0 : CPABORT("Not implemented (no ray_distr.) in: pw_gpu_c1dr3d_3d_ps.")
534 : END IF
535 :
536 0 : CALL timestop(handle)
537 0 : END SUBROUTINE pw_gpu_c1dr3d_3d_ps
538 :
539 : ! **************************************************************************************************
540 : !> \brief perform a parallel real_to_complex copy followed by a 2D-FFT on the gpu
541 : !> \param pw1 ...
542 : !> \param pwbuf ...
543 : !> \author Andreas Gloess
544 : ! **************************************************************************************************
545 0 : SUBROUTINE pw_gpu_cff(pw1, pwbuf)
546 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
547 : COMPLEX(KIND=dp), DIMENSION(:, :, :), &
548 : INTENT(INOUT), TARGET :: pwbuf
549 :
550 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_cff'
551 :
552 : COMPLEX(KIND=dp), POINTER :: ptr_pwout
553 : INTEGER :: handle, l1, l2, l3
554 0 : INTEGER, DIMENSION(:), POINTER :: npts
555 : REAL(KIND=dp), POINTER :: ptr_pwin
556 : INTERFACE
557 : SUBROUTINE pw_gpu_cff_c(din, zout, npts) BIND(C, name="pw_gpu_cff")
558 : IMPORT
559 : TYPE(C_PTR), INTENT(IN), VALUE :: din
560 : TYPE(C_PTR), VALUE :: zout
561 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
562 : END SUBROUTINE pw_gpu_cff_c
563 : END INTERFACE
564 :
565 0 : CALL timeset(routineN, handle)
566 :
567 : ! dimensions
568 0 : npts => pw1%pw_grid%npts_local
569 : l1 = LBOUND(pw1%array, 1)
570 : l2 = LBOUND(pw1%array, 2)
571 : l3 = LBOUND(pw1%array, 3)
572 :
573 : ! pointers to data arrays
574 0 : ptr_pwin => pw1%array(l1, l2, l3)
575 0 : ptr_pwout => pwbuf(1, 1, 1)
576 :
577 : ! invoke the combined transformation
578 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
579 : CALL pw_gpu_cff_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
580 : #else
581 0 : CPABORT("Compiled without pw offloading")
582 : #endif
583 :
584 0 : CALL timestop(handle)
585 0 : END SUBROUTINE pw_gpu_cff
586 :
587 : ! **************************************************************************************************
588 : !> \brief perform a parallel 2D-FFT followed by a complex_to_real copy on the gpu
589 : !> \param pwbuf ...
590 : !> \param pw2 ...
591 : !> \author Andreas Gloess
592 : ! **************************************************************************************************
593 0 : SUBROUTINE pw_gpu_ffc(pwbuf, pw2)
594 : COMPLEX(KIND=dp), DIMENSION(:, :, :), INTENT(IN), &
595 : TARGET :: pwbuf
596 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
597 :
598 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_ffc'
599 :
600 : COMPLEX(KIND=dp), POINTER :: ptr_pwin
601 : INTEGER :: handle, l1, l2, l3
602 0 : INTEGER, DIMENSION(:), POINTER :: npts
603 : REAL(KIND=dp), POINTER :: ptr_pwout
604 : INTERFACE
605 : SUBROUTINE pw_gpu_ffc_c(zin, dout, npts) BIND(C, name="pw_gpu_ffc")
606 : IMPORT
607 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
608 : TYPE(C_PTR), VALUE :: dout
609 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
610 : END SUBROUTINE pw_gpu_ffc_c
611 : END INTERFACE
612 :
613 0 : CALL timeset(routineN, handle)
614 :
615 : ! dimensions
616 0 : npts => pw2%pw_grid%npts_local
617 : l1 = LBOUND(pw2%array, 1)
618 : l2 = LBOUND(pw2%array, 2)
619 : l3 = LBOUND(pw2%array, 3)
620 :
621 : ! pointers to data arrays
622 0 : ptr_pwin => pwbuf(1, 1, 1)
623 0 : ptr_pwout => pw2%array(l1, l2, l3)
624 :
625 : ! invoke the combined transformation
626 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
627 : CALL pw_gpu_ffc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
628 : #else
629 0 : CPABORT("Compiled without pw offloading")
630 : #endif
631 :
632 0 : CALL timestop(handle)
633 0 : END SUBROUTINE pw_gpu_ffc
634 :
635 : ! **************************************************************************************************
636 : !> \brief perform a parallel real_to_complex copy followed by a 1D-FFT on the gpu
637 : !> \param pw1 ...
638 : !> \param pwbuf ...
639 : !> \author Andreas Gloess
640 : ! **************************************************************************************************
641 0 : SUBROUTINE pw_gpu_cf(pw1, pwbuf)
642 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw1
643 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
644 : TARGET :: pwbuf
645 :
646 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_cf'
647 :
648 : COMPLEX(KIND=dp), POINTER :: ptr_pwout
649 : INTEGER :: handle, l1, l2, l3
650 0 : INTEGER, DIMENSION(:), POINTER :: npts
651 : REAL(KIND=dp), POINTER :: ptr_pwin
652 : INTERFACE
653 : SUBROUTINE pw_gpu_cf_c(din, zout, npts) BIND(C, name="pw_gpu_cf")
654 : IMPORT
655 : TYPE(C_PTR), INTENT(IN), VALUE :: din
656 : TYPE(C_PTR), VALUE :: zout
657 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
658 : END SUBROUTINE pw_gpu_cf_c
659 : END INTERFACE
660 :
661 0 : CALL timeset(routineN, handle)
662 :
663 : ! dimensions
664 0 : npts => pw1%pw_grid%npts_local
665 : l1 = LBOUND(pw1%array, 1)
666 : l2 = LBOUND(pw1%array, 2)
667 : l3 = LBOUND(pw1%array, 3)
668 :
669 : ! pointers to data arrays
670 0 : ptr_pwin => pw1%array(l1, l2, l3)
671 0 : ptr_pwout => pwbuf(1, 1)
672 :
673 : ! invoke the combined transformation
674 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
675 : CALL pw_gpu_cf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
676 : #else
677 0 : CPABORT("Compiled without pw offloading")
678 : #endif
679 0 : CALL timestop(handle)
680 0 : END SUBROUTINE pw_gpu_cf
681 :
682 : ! **************************************************************************************************
683 : !> \brief perform a parallel 1D-FFT followed by a complex_to_real copy on the gpu
684 : !> \param pwbuf ...
685 : !> \param pw2 ...
686 : !> \author Andreas Gloess
687 : ! **************************************************************************************************
688 0 : SUBROUTINE pw_gpu_fc(pwbuf, pw2)
689 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
690 : TARGET :: pwbuf
691 : TYPE(pw_r3d_rs_type), INTENT(IN) :: pw2
692 :
693 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_fc'
694 :
695 : COMPLEX(KIND=dp), POINTER :: ptr_pwin
696 : INTEGER :: handle, l1, l2, l3
697 0 : INTEGER, DIMENSION(:), POINTER :: npts
698 : REAL(KIND=dp), POINTER :: ptr_pwout
699 : INTERFACE
700 : SUBROUTINE pw_gpu_fc_c(zin, dout, npts) BIND(C, name="pw_gpu_fc")
701 : IMPORT
702 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
703 : TYPE(C_PTR), VALUE :: dout
704 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
705 : END SUBROUTINE pw_gpu_fc_c
706 : END INTERFACE
707 :
708 0 : CALL timeset(routineN, handle)
709 :
710 0 : npts => pw2%pw_grid%npts_local
711 : l1 = LBOUND(pw2%array, 1)
712 : l2 = LBOUND(pw2%array, 2)
713 : l3 = LBOUND(pw2%array, 3)
714 :
715 : ! pointers to data arrays
716 0 : ptr_pwin => pwbuf(1, 1)
717 0 : ptr_pwout => pw2%array(l1, l2, l3)
718 :
719 : ! invoke the combined transformation
720 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
721 : CALL pw_gpu_fc_c(c_loc(ptr_pwin), c_loc(ptr_pwout), npts)
722 : #else
723 0 : CPABORT("Compiled without pw offloading")
724 : #endif
725 :
726 0 : CALL timestop(handle)
727 0 : END SUBROUTINE pw_gpu_fc
728 :
729 : ! **************************************************************************************************
730 : !> \brief perform a parallel 1D-FFT on the gpu
731 : !> \param pwbuf1 ...
732 : !> \param pwbuf2 ...
733 : !> \param dir ...
734 : !> \param n ...
735 : !> \param m ...
736 : !> \author Andreas Gloess
737 : ! **************************************************************************************************
738 0 : SUBROUTINE pw_gpu_f(pwbuf1, pwbuf2, dir, n, m)
739 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
740 : TARGET :: pwbuf1
741 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
742 : TARGET :: pwbuf2
743 : INTEGER, INTENT(IN) :: dir, n, m
744 :
745 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_f'
746 :
747 : COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
748 : INTEGER :: handle
749 : INTERFACE
750 : SUBROUTINE pw_gpu_f_c(zin, zout, dir, n, m) BIND(C, name="pw_gpu_f")
751 : IMPORT
752 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
753 : TYPE(C_PTR), VALUE :: zout
754 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: dir, n, m
755 : END SUBROUTINE pw_gpu_f_c
756 : END INTERFACE
757 :
758 0 : CALL timeset(routineN, handle)
759 :
760 0 : IF (n*m /= 0) THEN
761 : ! pointers to data arrays
762 0 : ptr_pwin => pwbuf1(1, 1)
763 0 : ptr_pwout => pwbuf2(1, 1)
764 :
765 : ! invoke the combined transformation
766 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
767 : CALL pw_gpu_f_c(c_loc(ptr_pwin), c_loc(ptr_pwout), dir, n, m)
768 : #else
769 : MARK_USED(dir)
770 0 : CPABORT("Compiled without pw offloading")
771 : #endif
772 : END IF
773 :
774 0 : CALL timestop(handle)
775 0 : END SUBROUTINE pw_gpu_f
776 : ! **************************************************************************************************
777 : !> \brief perform a parallel 1D-FFT followed by a gather on the gpu
778 : !> \param pwbuf ...
779 : !> \param pw2 ...
780 : !> \param scale ...
781 : !> \author Andreas Gloess
782 : ! **************************************************************************************************
783 0 : SUBROUTINE pw_gpu_fg(pwbuf, pw2, scale)
784 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(IN), &
785 : TARGET :: pwbuf
786 : TYPE(pw_c1d_gs_type), INTENT(IN) :: pw2
787 : REAL(KIND=dp), INTENT(IN) :: scale
788 :
789 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_fg'
790 :
791 : COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
792 : INTEGER :: handle, mg, mmax, ngpts
793 0 : INTEGER, DIMENSION(:), POINTER :: npts
794 : INTEGER, POINTER :: ptr_ghatmap
795 : INTERFACE
796 : SUBROUTINE pw_gpu_fg_c(zin, zout, ghatmap, npts, mmax, ngpts, scale) BIND(C, name="pw_gpu_fg")
797 : IMPORT
798 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
799 : TYPE(C_PTR), VALUE :: zout
800 : TYPE(C_PTR), INTENT(IN), VALUE :: ghatmap
801 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
802 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts
803 : REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: scale
804 :
805 : END SUBROUTINE pw_gpu_fg_c
806 : END INTERFACE
807 :
808 0 : CALL timeset(routineN, handle)
809 :
810 0 : ngpts = SIZE(pw2%pw_grid%gsq)
811 0 : npts => pw2%pw_grid%npts
812 :
813 0 : IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
814 0 : mg = SIZE(pw2%pw_grid%grays, 2)
815 0 : mmax = MAX(mg, 1)
816 :
817 : ! pointers to data arrays
818 0 : ptr_pwin => pwbuf(1, 1)
819 0 : ptr_pwout => pw2%array(1)
820 :
821 : ! pointer to map array
822 0 : ptr_ghatmap => pw2%pw_grid%g_hatmap(1, 1)
823 :
824 : ! invoke the combined transformation
825 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
826 : CALL pw_gpu_fg_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, scale)
827 : #else
828 : MARK_USED(scale)
829 0 : CPABORT("Compiled without pw offloading")
830 : #endif
831 : END IF
832 :
833 0 : CALL timestop(handle)
834 0 : END SUBROUTINE pw_gpu_fg
835 :
836 : ! **************************************************************************************************
837 : !> \brief perform a parallel scatter followed by a 1D-FFT on the gpu
838 : !> \param pw1 ...
839 : !> \param pwbuf ...
840 : !> \param scale ...
841 : !> \author Andreas Gloess
842 : ! **************************************************************************************************
843 0 : SUBROUTINE pw_gpu_sf(pw1, pwbuf, scale)
844 : TYPE(pw_c1d_gs_type), INTENT(IN) :: pw1
845 : COMPLEX(KIND=dp), DIMENSION(:, :), INTENT(INOUT), &
846 : TARGET :: pwbuf
847 : REAL(KIND=dp), INTENT(IN) :: scale
848 :
849 : CHARACTER(len=*), PARAMETER :: routineN = 'pw_gpu_sf'
850 :
851 : COMPLEX(KIND=dp), POINTER :: ptr_pwin, ptr_pwout
852 : INTEGER :: handle, mg, mmax, ngpts, nmaps
853 0 : INTEGER, DIMENSION(:), POINTER :: npts
854 : INTEGER, POINTER :: ptr_ghatmap
855 : INTERFACE
856 : SUBROUTINE pw_gpu_sf_c(zin, zout, ghatmap, npts, mmax, ngpts, nmaps, scale) BIND(C, name="pw_gpu_sf")
857 : IMPORT
858 : TYPE(C_PTR), INTENT(IN), VALUE :: zin
859 : TYPE(C_PTR), VALUE :: zout
860 : TYPE(C_PTR), INTENT(IN), VALUE :: ghatmap
861 : INTEGER(KIND=C_INT), DIMENSION(*), INTENT(IN):: npts
862 : INTEGER(KIND=C_INT), INTENT(IN), VALUE :: mmax, ngpts, nmaps
863 : REAL(KIND=C_DOUBLE), INTENT(IN), VALUE :: scale
864 :
865 : END SUBROUTINE pw_gpu_sf_c
866 : END INTERFACE
867 :
868 0 : CALL timeset(routineN, handle)
869 :
870 0 : ngpts = SIZE(pw1%pw_grid%gsq)
871 0 : npts => pw1%pw_grid%npts
872 :
873 0 : IF ((npts(1) /= 0) .AND. (ngpts /= 0)) THEN
874 0 : mg = SIZE(pw1%pw_grid%grays, 2)
875 0 : mmax = MAX(mg, 1)
876 :
877 : ! pointers to data arrays
878 0 : ptr_pwin => pw1%array(1)
879 0 : ptr_pwout => pwbuf(1, 1)
880 :
881 : ! pointer to map array
882 0 : nmaps = SIZE(pw1%pw_grid%g_hatmap, 2)
883 0 : ptr_ghatmap => pw1%pw_grid%g_hatmap(1, 1)
884 :
885 : ! invoke the combined transformation
886 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_PW)
887 : CALL pw_gpu_sf_c(c_loc(ptr_pwin), c_loc(ptr_pwout), c_loc(ptr_ghatmap), npts, mmax, ngpts, nmaps, scale)
888 : #else
889 : MARK_USED(scale)
890 0 : CPABORT("Compiled without pw offloading")
891 : #endif
892 : END IF
893 :
894 0 : CALL timestop(handle)
895 0 : END SUBROUTINE pw_gpu_sf
896 :
897 : END MODULE pw_gpu
898 :
|