Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Creates the wavelet kernel for the wavelet based poisson solver.
10 : !> \author Florian Schiffmann (09.2007,fschiff)
11 : ! **************************************************************************************************
12 : MODULE ps_wavelet_base
13 :
14 : USE kinds, ONLY: dp
15 : USE message_passing, ONLY: mp_comm_type
16 : USE ps_wavelet_fft3d, ONLY: ctrig,&
17 : ctrig_length,&
18 : fftstp
19 : #include "../base/base_uses.f90"
20 :
21 : IMPLICIT NONE
22 :
23 : PRIVATE
24 :
25 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'ps_wavelet_base'
26 :
27 : PUBLIC :: scramble_unpack, p_poissonsolver, s_poissonsolver, f_poissonsolver
28 :
29 : CONTAINS
30 :
31 : ! **************************************************************************************************
32 : !> \brief ...
33 : !> \param n1 ...
34 : !> \param n2 ...
35 : !> \param n3 ...
36 : !> \param nd1 ...
37 : !> \param nd2 ...
38 : !> \param nd3 ...
39 : !> \param md1 ...
40 : !> \param md2 ...
41 : !> \param md3 ...
42 : !> \param nproc ...
43 : !> \param iproc ...
44 : !> \param zf ...
45 : !> \param scal ...
46 : !> \param hx ...
47 : !> \param hy ...
48 : !> \param hz ...
49 : !> \param mpi_group ...
50 : ! **************************************************************************************************
51 16561 : SUBROUTINE P_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, zf &
52 : , scal, hx, hy, hz, mpi_group)
53 : INTEGER, INTENT(in) :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
54 : md3, nproc, iproc
55 : REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
56 : INTENT(inout) :: zf
57 : REAL(KIND=dp), INTENT(in) :: scal, hx, hy, hz
58 :
59 : CLASS(mp_comm_type), INTENT(in) :: mpi_group
60 :
61 : INTEGER, PARAMETER :: ncache_optimal = 8*1024
62 :
63 : INTEGER :: i, i1, i3, ic1, ic2, ic3, inzee, j, j2, &
64 : J2stb, J2stf, j3, Jp2stb, Jp2stf, lot, &
65 : lzt, ma, mb, ncache, nfft
66 16561 : INTEGER, ALLOCATABLE, DIMENSION(:) :: after1, after2, after3, before1, &
67 16561 : before2, before3, now1, now2, now3
68 16561 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: btrig1, btrig2, btrig3, ftrig1, ftrig2, &
69 16561 : ftrig3
70 16561 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: zt, zw
71 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: zmpi2
72 : REAL(KIND=dp), ALLOCATABLE, &
73 16561 : DIMENSION(:, :, :, :, :) :: zmpi1
74 :
75 16561 : IF (nd1 .LT. n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
76 16561 : IF (nd2 .LT. n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
77 16561 : IF (nd3 .LT. n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
78 16561 : IF (md1 .LT. n1) CPABORT("Parallel convolution:ERROR:md1")
79 16561 : IF (md2 .LT. n2) CPABORT("Parallel convolution:ERROR:md2")
80 16561 : IF (md3 .LT. n3) CPABORT("Parallel convolution:ERROR:md3")
81 16561 : IF (MOD(nd3, nproc) .NE. 0) CPABORT("Parallel convolution:ERROR:nd3")
82 16561 : IF (MOD(md2, nproc) .NE. 0) CPABORT("Parallel convolution:ERROR:md2")
83 :
84 : !defining work arrays dimensions
85 16561 : ncache = ncache_optimal
86 16561 : IF (ncache <= MAX(n1, n2, n3)*4) ncache = MAX(n1, n2, n3)*4
87 :
88 16561 : lzt = n2
89 16561 : IF (MOD(n2, 2) .EQ. 0) lzt = lzt + 1
90 16561 : IF (MOD(n2, 4) .EQ. 0) lzt = lzt + 1 !maybe this is useless
91 :
92 : !Allocations
93 16561 : ALLOCATE (btrig1(2, ctrig_length))
94 16561 : ALLOCATE (ftrig1(2, ctrig_length))
95 16561 : ALLOCATE (after1(7))
96 16561 : ALLOCATE (now1(7))
97 16561 : ALLOCATE (before1(7))
98 16561 : ALLOCATE (btrig2(2, ctrig_length))
99 16561 : ALLOCATE (ftrig2(2, ctrig_length))
100 16561 : ALLOCATE (after2(7))
101 16561 : ALLOCATE (now2(7))
102 16561 : ALLOCATE (before2(7))
103 16561 : ALLOCATE (btrig3(2, ctrig_length))
104 16561 : ALLOCATE (ftrig3(2, ctrig_length))
105 16561 : ALLOCATE (after3(7))
106 16561 : ALLOCATE (now3(7))
107 16561 : ALLOCATE (before3(7))
108 66244 : ALLOCATE (zw(2, ncache/4, 2))
109 66244 : ALLOCATE (zt(2, lzt, n1))
110 82805 : ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
111 40101 : IF (nproc .GT. 1) ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
112 :
113 : !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
114 16561 : CALL ctrig(n3, btrig3, after3, before3, now3, 1, ic3)
115 16561 : CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
116 16561 : CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
117 360028 : DO j = 1, n1
118 343467 : ftrig1(1, j) = btrig1(1, j)
119 360028 : ftrig1(2, j) = -btrig1(2, j)
120 : END DO
121 360028 : DO j = 1, n2
122 343467 : ftrig2(1, j) = btrig2(1, j)
123 360028 : ftrig2(2, j) = -btrig2(2, j)
124 : END DO
125 360028 : DO j = 1, n3
126 343467 : ftrig3(1, j) = btrig3(1, j)
127 360028 : ftrig3(2, j) = -btrig3(2, j)
128 : END DO
129 :
130 : ! transform along z axis
131 16561 : lot = ncache/(4*n3)
132 16561 : IF (lot .LT. 1) THEN
133 : WRITE (*, *) &
134 : 'convolxc_off:ncache has to be enlarged to be able to hold at'// &
135 : 'least one 1-d FFT of this size even though this will'// &
136 0 : 'reduce the performance for shorter transform lengths'
137 0 : CPABORT("")
138 : END IF
139 306338 : DO j2 = 1, md2/nproc
140 : !this condition ensures that we manage only the interesting part for the FFT
141 306338 : IF (iproc*(md2/nproc) + j2 .LE. n2) THEN
142 587324 : DO i1 = 1, n1, lot
143 298171 : ma = i1
144 298171 : mb = MIN(i1 + (lot - 1), n1)
145 298171 : nfft = mb - ma + 1
146 : !inserting real data into complex array of half length
147 298171 : CALL P_fill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
148 :
149 : !performing FFT
150 : !input: I1,I3,J2,(Jp2)
151 298171 : inzee = 1
152 931908 : DO i = 1, ic3
153 : CALL fftstp(lot, nfft, n3, lot, n3, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
154 633737 : btrig3, after3(i), now3(i), before3(i), 1)
155 931908 : inzee = 3 - inzee
156 : END DO
157 :
158 : !output: I1,i3,J2,(Jp2)
159 : !exchanging components
160 : !input: I1,i3,J2,(Jp2)
161 587324 : CALL scramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2)
162 : !output: I1,J2,i3,(Jp2)
163 : END DO
164 : END IF
165 : END DO
166 :
167 : !Interprocessor data transposition
168 : !input: I1,J2,j3,jp3,(Jp2)
169 16561 : IF (nproc .GT. 1) THEN
170 : !communication scheduling
171 4708 : CALL mpi_group%alltoall(zmpi2, zmpi1, 2*n1*(md2/nproc)*(nd3/nproc))
172 : END IF
173 : !output: I1,J2,j3,Jp2,(jp3)
174 :
175 : !now each process perform complete convolution of its planes
176 174643 : DO j3 = 1, nd3/nproc
177 : !this condition ensures that we manage only the interesting part for the FFT
178 174643 : IF (iproc*(nd3/nproc) + j3 .LE. n3/2 + 1) THEN
179 157193 : Jp2stb = 1
180 157193 : J2stb = 1
181 157193 : Jp2stf = 1
182 157193 : J2stf = 1
183 :
184 : ! transform along x axis
185 157193 : lot = ncache/(4*n1)
186 157193 : IF (lot .LT. 1) THEN
187 : WRITE (*, *) &
188 : 'convolxc_off:ncache has to be enlarged to be able to hold at'// &
189 : 'least one 1-d FFT of this size even though this will'// &
190 0 : 'reduce the performance for shorter transform lengths'
191 0 : CPABORT("")
192 : END IF
193 :
194 319062 : DO j = 1, n2, lot
195 161869 : ma = j
196 161869 : mb = MIN(j + (lot - 1), n2)
197 161869 : nfft = mb - ma + 1
198 :
199 : !reverse index ordering, leaving the planes to be transformed at the end
200 : !input: I1,J2,j3,Jp2,(jp3)
201 161869 : IF (nproc .EQ. 1) THEN
202 130318 : CALL P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
203 : ELSE
204 31551 : CALL P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
205 : END IF
206 : !output: J2,Jp2,I1,j3,(jp3)
207 :
208 : !performing FFT
209 : !input: I2,I1,j3,(jp3)
210 161869 : inzee = 1
211 343294 : DO i = 1, ic1 - 1
212 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
213 181425 : btrig1, after1(i), now1(i), before1(i), 1)
214 343294 : inzee = 3 - inzee
215 : END DO
216 :
217 : !storing the last step into zt array
218 161869 : i = ic1
219 : CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
220 319062 : btrig1, after1(i), now1(i), before1(i), 1)
221 : !output: I2,i1,j3,(jp3)
222 : END DO
223 :
224 : !transform along y axis
225 157193 : lot = ncache/(4*n2)
226 157193 : IF (lot .LT. 1) THEN
227 : WRITE (*, *) &
228 : 'convolxc_off:ncache has to be enlarged to be able to hold at'// &
229 : 'least one 1-d FFT of this size even though this will'// &
230 0 : 'reduce the performance for shorter transform lengths'
231 0 : CPABORT("")
232 : END IF
233 :
234 319062 : DO j = 1, n1, lot
235 161869 : ma = j
236 161869 : mb = MIN(j + (lot - 1), n1)
237 161869 : nfft = mb - ma + 1
238 :
239 : !reverse ordering
240 : !input: I2,i1,j3,(jp3)
241 161869 : CALL P_switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
242 : !output: i1,I2,j3,(jp3)
243 :
244 : !performing FFT
245 : !input: i1,I2,j3,(jp3)
246 161869 : inzee = 1
247 505163 : DO i = 1, ic2
248 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
249 343294 : btrig2, after2(i), now2(i), before2(i), 1)
250 505163 : inzee = 3 - inzee
251 : END DO
252 : !output: i1,i2,j3,(jp3)
253 :
254 : !Multiply with kernel in fourier space
255 161869 : i3 = iproc*(nd3/nproc) + j3
256 161869 : CALL P_multkernel(n1, n2, n3, lot, nfft, j, i3, zw(1, 1, inzee), hx, hy, hz)
257 :
258 : !TRANSFORM BACK IN REAL SPACE
259 :
260 : !transform along y axis
261 : !input: i1,i2,j3,(jp3)
262 505163 : DO i = 1, ic2
263 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
264 343294 : ftrig2, after2(i), now2(i), before2(i), -1)
265 505163 : inzee = 3 - inzee
266 : END DO
267 :
268 : !reverse ordering
269 : !input: i1,I2,j3,(jp3)
270 319062 : CALL P_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
271 : !output: I2,i1,j3,(jp3)
272 : END DO
273 :
274 : !transform along x axis
275 : !input: I2,i1,j3,(jp3)
276 157193 : lot = ncache/(4*n1)
277 319062 : DO j = 1, n2, lot
278 161869 : ma = j
279 161869 : mb = MIN(j + (lot - 1), n2)
280 161869 : nfft = mb - ma + 1
281 :
282 : !performing FFT
283 161869 : i = 1
284 : CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
285 161869 : ftrig1, after1(i), now1(i), before1(i), -1)
286 :
287 161869 : inzee = 1
288 343294 : DO i = 2, ic1
289 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
290 181425 : ftrig1, after1(i), now1(i), before1(i), -1)
291 343294 : inzee = 3 - inzee
292 : END DO
293 : !output: I2,I1,j3,(jp3)
294 :
295 : !reverse ordering
296 : !input: J2,Jp2,I1,j3,(jp3)
297 319062 : IF (nproc .EQ. 1) THEN
298 130318 : CALL P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
299 : ELSE
300 31551 : CALL P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
301 : END IF
302 : ! output: I1,J2,j3,Jp2,(jp3)
303 : END DO
304 : END IF
305 : END DO
306 :
307 : !Interprocessor data transposition
308 : !input: I1,J2,j3,Jp2,(jp3)
309 16561 : IF (nproc .GT. 1) THEN
310 : !communication scheduling
311 4708 : CALL mpi_group%alltoall(zmpi1, zmpi2, 2*n1*(md2/nproc)*(nd3/nproc))
312 : END IF
313 : !output: I1,J2,j3,jp3,(Jp2)
314 : !transform along z axis
315 : !input: I1,J2,i3,(Jp2)
316 16561 : lot = ncache/(4*n3)
317 306338 : DO j2 = 1, md2/nproc
318 : !this condition ensures that we manage only the interesting part for the FFT
319 306338 : IF (iproc*(md2/nproc) + j2 .LE. n2) THEN
320 587324 : DO i1 = 1, n1, lot
321 298171 : ma = i1
322 298171 : mb = MIN(i1 + (lot - 1), n1)
323 298171 : nfft = mb - ma + 1
324 :
325 : !reverse ordering
326 : !input: I1,J2,i3,(Jp2)
327 298171 : CALL unscramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1))
328 : !output: I1,i3,J2,(Jp2)
329 :
330 : !performing FFT
331 : !input: I1,i3,J2,(Jp2)
332 298171 : inzee = 1
333 931908 : DO i = 1, ic3
334 : CALL fftstp(lot, nfft, n3, lot, n3, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
335 633737 : ftrig3, after3(i), now3(i), before3(i), -1)
336 931908 : inzee = 3 - inzee
337 : END DO
338 : !output: I1,I3,J2,(Jp2)
339 :
340 : !rebuild the output array
341 587324 : CALL P_unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2), scal)
342 :
343 : END DO
344 : END IF
345 : END DO
346 :
347 : !De-allocations
348 16561 : DEALLOCATE (btrig1)
349 16561 : DEALLOCATE (ftrig1)
350 16561 : DEALLOCATE (after1)
351 16561 : DEALLOCATE (now1)
352 16561 : DEALLOCATE (before1)
353 16561 : DEALLOCATE (btrig2)
354 16561 : DEALLOCATE (ftrig2)
355 16561 : DEALLOCATE (after2)
356 16561 : DEALLOCATE (now2)
357 16561 : DEALLOCATE (before2)
358 16561 : DEALLOCATE (btrig3)
359 16561 : DEALLOCATE (ftrig3)
360 16561 : DEALLOCATE (after3)
361 16561 : DEALLOCATE (now3)
362 16561 : DEALLOCATE (before3)
363 16561 : DEALLOCATE (zmpi2)
364 16561 : DEALLOCATE (zw)
365 16561 : DEALLOCATE (zt)
366 16561 : IF (nproc .GT. 1) DEALLOCATE (zmpi1)
367 : ! call timing(iproc,'PSolv_comput ','OF')
368 16561 : END SUBROUTINE P_PoissonSolver
369 :
370 : ! **************************************************************************************************
371 : !> \brief ...
372 : !> \param j3 ...
373 : !> \param nfft ...
374 : !> \param Jp2stb ...
375 : !> \param J2stb ...
376 : !> \param lot ...
377 : !> \param n1 ...
378 : !> \param md2 ...
379 : !> \param nd3 ...
380 : !> \param nproc ...
381 : !> \param zmpi1 ...
382 : !> \param zw ...
383 : ! **************************************************************************************************
384 161869 : SUBROUTINE P_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
385 : INTEGER, INTENT(in) :: j3, nfft
386 : INTEGER, INTENT(inout) :: Jp2stb, J2stb
387 : INTEGER, INTENT(in) :: lot, n1, md2, nd3, nproc
388 : REAL(KIND=dp), &
389 : DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
390 : INTENT(in) :: zmpi1
391 : REAL(KIND=dp), DIMENSION(2, lot, n1), &
392 : INTENT(inout) :: zw
393 :
394 : INTEGER :: I1, J2, Jp2, mfft
395 :
396 161869 : mfft = 0
397 340096 : DO Jp2 = Jp2stb, nproc
398 3697451 : DO J2 = J2stb, md2/nproc
399 3519224 : mfft = mfft + 1
400 3519224 : IF (mfft .GT. nfft) THEN
401 12841 : Jp2stb = Jp2
402 12841 : J2stb = J2
403 12841 : RETURN
404 : END IF
405 93233601 : DO I1 = 1, n1
406 89548991 : zw(1, mfft, I1) = zmpi1(1, I1, J2, j3, Jp2)
407 93055374 : zw(2, mfft, I1) = zmpi1(2, I1, J2, j3, Jp2)
408 : END DO
409 : END DO
410 327255 : J2stb = 1
411 : END DO
412 : END SUBROUTINE P_mpiswitch_upcorn
413 :
414 : ! **************************************************************************************************
415 : !> \brief ...
416 : !> \param nfft ...
417 : !> \param n2 ...
418 : !> \param lot ...
419 : !> \param n1 ...
420 : !> \param lzt ...
421 : !> \param zt ...
422 : !> \param zw ...
423 : ! **************************************************************************************************
424 161869 : SUBROUTINE P_switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
425 : INTEGER, INTENT(in) :: nfft, n2, lot, n1, lzt
426 : REAL(KIND=dp), DIMENSION(2, lzt, n1), INTENT(in) :: zt
427 : REAL(KIND=dp), DIMENSION(2, lot, n2), &
428 : INTENT(inout) :: zw
429 :
430 : INTEGER :: i, j
431 :
432 3668252 : DO j = 1, nfft
433 93217243 : DO i = 1, n2
434 89548991 : zw(1, j, i) = zt(1, i, j)
435 93055374 : zw(2, j, i) = zt(2, i, j)
436 : END DO
437 : END DO
438 :
439 161869 : END SUBROUTINE P_switch_upcorn
440 :
441 : ! **************************************************************************************************
442 : !> \brief ...
443 : !> \param nfft ...
444 : !> \param n2 ...
445 : !> \param lot ...
446 : !> \param n1 ...
447 : !> \param lzt ...
448 : !> \param zw ...
449 : !> \param zt ...
450 : ! **************************************************************************************************
451 161869 : SUBROUTINE P_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
452 : INTEGER, INTENT(in) :: nfft, n2, lot, n1, lzt
453 : REAL(KIND=dp), DIMENSION(2, lot, n2), INTENT(in) :: zw
454 : REAL(KIND=dp), DIMENSION(2, lzt, n1), &
455 : INTENT(inout) :: zt
456 :
457 : INTEGER :: i, j
458 :
459 3668252 : DO j = 1, nfft
460 93217243 : DO i = 1, n2
461 89548991 : zt(1, i, j) = zw(1, j, i)
462 93055374 : zt(2, i, j) = zw(2, j, i)
463 : END DO
464 : END DO
465 :
466 161869 : END SUBROUTINE P_unswitch_downcorn
467 :
468 : ! **************************************************************************************************
469 : !> \brief ...
470 : !> \param j3 ...
471 : !> \param nfft ...
472 : !> \param Jp2stf ...
473 : !> \param J2stf ...
474 : !> \param lot ...
475 : !> \param n1 ...
476 : !> \param md2 ...
477 : !> \param nd3 ...
478 : !> \param nproc ...
479 : !> \param zw ...
480 : !> \param zmpi1 ...
481 : ! **************************************************************************************************
482 161869 : SUBROUTINE P_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
483 : INTEGER, INTENT(in) :: j3, nfft
484 : INTEGER, INTENT(inout) :: Jp2stf, J2stf
485 : INTEGER, INTENT(in) :: lot, n1, md2, nd3, nproc
486 : REAL(KIND=dp), DIMENSION(2, lot, n1), INTENT(in) :: zw
487 : REAL(KIND=dp), &
488 : DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
489 : INTENT(inout) :: zmpi1
490 :
491 : INTEGER :: I1, J2, Jp2, mfft
492 :
493 161869 : mfft = 0
494 340096 : DO Jp2 = Jp2stf, nproc
495 3697451 : DO J2 = J2stf, md2/nproc
496 3519224 : mfft = mfft + 1
497 3519224 : IF (mfft .GT. nfft) THEN
498 12841 : Jp2stf = Jp2
499 12841 : J2stf = J2
500 12841 : RETURN
501 : END IF
502 93233601 : DO I1 = 1, n1
503 89548991 : zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
504 93055374 : zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
505 : END DO
506 : END DO
507 327255 : J2stf = 1
508 : END DO
509 : END SUBROUTINE P_unmpiswitch_downcorn
510 :
511 : ! **************************************************************************************************
512 : !> \brief (Based on suitable modifications of S.Goedecker routines)
513 : !> Restore data into output array
514 : !> \param md1 Dimensions of the undistributed part of the real grid
515 : !> \param md3 Dimensions of the undistributed part of the real grid
516 : !> \param lot ...
517 : !> \param nfft number of planes
518 : !> \param n3 (twice the) dimension of the last FFTtransform
519 : !> \param zw FFT work array
520 : !> \param zf Original distributed density as well as
521 : !> Distributed solution of the poisson equation (inout)
522 : !> \param scal Needed to achieve unitarity and correct dimensions
523 : !> \date February 2006
524 : !> \author S. Goedecker, L. Genovese
525 : !> \note Assuming that high frequencies are in the corners
526 : !> and that n3 is multiple of 4
527 : !>
528 : !> RESTRICTIONS on USAGE
529 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
530 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
531 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
532 : !> This file is distributed under the terms of the
533 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
534 : ! **************************************************************************************************
535 298171 : SUBROUTINE P_unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal)
536 : INTEGER, INTENT(in) :: md1, md3, lot, nfft, n3
537 : REAL(KIND=dp), DIMENSION(2, lot, n3), INTENT(in) :: zw
538 : REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout) :: zf
539 : REAL(KIND=dp), INTENT(in) :: scal
540 :
541 : INTEGER :: i1, i3
542 : REAL(KIND=dp) :: pot1
543 :
544 7275604 : DO i3 = 1, n3
545 174441101 : DO i1 = 1, nfft
546 167165497 : pot1 = scal*zw(1, i1, i3)
547 174142930 : zf(i1, i3) = pot1
548 : END DO
549 : END DO
550 :
551 298171 : END SUBROUTINE P_unfill_downcorn
552 :
553 : ! **************************************************************************************************
554 : !> \brief ...
555 : !> \param md1 ...
556 : !> \param md3 ...
557 : !> \param lot ...
558 : !> \param nfft ...
559 : !> \param n3 ...
560 : !> \param zf ...
561 : !> \param zw ...
562 : ! **************************************************************************************************
563 298171 : SUBROUTINE P_fill_upcorn(md1, md3, lot, nfft, n3, zf, zw)
564 : INTEGER, INTENT(in) :: md1, md3, lot, nfft, n3
565 : REAL(KIND=dp), DIMENSION(md1, md3), INTENT(in) :: zf
566 : REAL(KIND=dp), DIMENSION(2, lot, n3), &
567 : INTENT(inout) :: zw
568 :
569 : INTEGER :: i1, i3
570 :
571 7275604 : DO i3 = 1, n3
572 174441101 : DO i1 = 1, nfft
573 167165497 : zw(1, i1, i3) = zf(i1, i3)
574 174142930 : zw(2, i1, i3) = 0._dp
575 : END DO
576 : END DO
577 :
578 298171 : END SUBROUTINE P_fill_upcorn
579 :
580 : ! **************************************************************************************************
581 : !> \brief (Based on suitable modifications of S.Goedecker routines)
582 : !> Assign the correct planes to the work array zmpi2
583 : !> in order to prepare for interprocessor data transposition.
584 : !> \param i1 Starting points of the plane and number of remaining lines
585 : !> \param j2 Starting points of the plane and number of remaining lines
586 : !> \param lot Starting points of the plane and number of remaining lines
587 : !> \param nfft Starting points of the plane and number of remaining lines
588 : !> \param n1 logical dimension of the FFT transform, reference for work arrays
589 : !> \param n3 logical dimension of the FFT transform, reference for work arrays
590 : !> \param md2 Dimensions of real grid
591 : !> \param nproc ...
592 : !> \param nd3 Dimensions of the kernel
593 : !> \param zw Work array (input)
594 : !> \param zmpi2 Work array for multiprocessor manipulation (output)
595 : !> \date February 2006
596 : !> \author S. Goedecker, L. Genovese
597 : !> \note
598 : !> RESTRICTIONS on USAGE
599 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
600 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
601 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
602 : !> This file is distributed under the terms of the
603 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
604 : ! **************************************************************************************************
605 298171 : SUBROUTINE scramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw, zmpi2)
606 : INTEGER, INTENT(in) :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
607 : nd3
608 : REAL(KIND=dp), DIMENSION(2, lot, n3), INTENT(in) :: zw
609 : REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
610 : INTENT(inout) :: zmpi2
611 :
612 : INTEGER :: i, i3
613 :
614 4057058 : DO i3 = 1, n3/2 + 1
615 93606049 : DO i = 0, nfft - 1
616 89548991 : zmpi2(1, i1 + i, j2, i3) = zw(1, i + 1, i3)
617 93307878 : zmpi2(2, i1 + i, j2, i3) = zw(2, i + 1, i3)
618 : END DO
619 : END DO
620 :
621 298171 : END SUBROUTINE scramble_P
622 :
623 : ! **************************************************************************************************
624 : !> \brief (Based on suitable modifications of S.Goedecker routines)
625 : !> Insert the correct planes of the work array zmpi2
626 : !> in order to prepare for backward FFT transform
627 : !> \param i1 Starting points of the plane and number of remaining lines
628 : !> \param j2 Starting points of the plane and number of remaining lines
629 : !> \param lot Starting points of the plane and number of remaining lines
630 : !> \param nfft Starting points of the plane and number of remaining lines
631 : !> \param n1 logical dimension of the FFT transform, reference for work arrays
632 : !> \param n3 logical dimension of the FFT transform, reference for work arrays
633 : !> \param md2 Dimensions of real grid
634 : !> \param nproc ...
635 : !> \param nd3 Dimensions of the kernel
636 : !> \param zmpi2 Work array for multiprocessor manipulation (output)
637 : !> \param zw Work array (input)
638 : !> \date February 2006
639 : !> \author S. Goedecker, L. Genovese
640 : !> \note
641 : !> RESTRICTIONS on USAGE
642 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
643 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
644 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
645 : !> This file is distributed under the terms of the
646 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
647 : ! **************************************************************************************************
648 298171 : SUBROUTINE unscramble_P(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw)
649 : INTEGER, INTENT(in) :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
650 : nd3
651 : REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
652 : INTENT(in) :: zmpi2
653 : REAL(KIND=dp), DIMENSION(2, lot, n3), &
654 : INTENT(inout) :: zw
655 :
656 : INTEGER :: i, i3, j3
657 :
658 298171 : i3 = 1
659 6788632 : DO i = 0, nfft - 1
660 6490461 : zw(1, i + 1, i3) = zmpi2(1, i1 + i, j2, i3)
661 6788632 : zw(2, i + 1, i3) = zmpi2(2, i1 + i, j2, i3)
662 : END DO
663 :
664 3758887 : DO i3 = 2, n3/2 + 1
665 3460716 : j3 = n3 + 2 - i3
666 86817417 : DO i = 0, nfft - 1
667 83058530 : zw(1, i + 1, j3) = zmpi2(1, i1 + i, j2, i3)
668 83058530 : zw(2, i + 1, j3) = -zmpi2(2, i1 + i, j2, i3)
669 83058530 : zw(1, i + 1, i3) = zmpi2(1, i1 + i, j2, i3)
670 86519246 : zw(2, i + 1, i3) = zmpi2(2, i1 + i, j2, i3)
671 : END DO
672 : END DO
673 :
674 298171 : END SUBROUTINE unscramble_P
675 :
676 : ! **************************************************************************************************
677 : !> \brief (Based on suitable modifications of S.Goedecker routines)
678 : !> Multiply with the kernel taking into account its symmetry
679 : !> Conceived to be used into convolution loops
680 : !> \param n1 ...
681 : !> \param n2 ...
682 : !> \param n3 ...
683 : !> \param lot ...
684 : !> \param nfft ...
685 : !> \param jS ...
686 : !> \param i3 ...
687 : !> \param zw Work array (input/output)
688 : !> n1,n2: logical dimension of the FFT transform, reference for zw
689 : !> nd1,nd2: Dimensions of POT
690 : !> jS,j3,nfft: starting point of the plane and number of remaining lines
691 : !>
692 : !> RESTRICTIONS on USAGE
693 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
694 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
695 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
696 : !> This file is distributed under the terms of the
697 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
698 : !> \param hx ...
699 : !> \param hy ...
700 : !> \param hz ...
701 : !> \date February 2006
702 : !> \author S. Goedecker, L. Genovese
703 : ! **************************************************************************************************
704 161869 : SUBROUTINE P_multkernel(n1, n2, n3, lot, nfft, jS, i3, zw, hx, hy, hz)
705 : INTEGER, INTENT(in) :: n1, n2, n3, lot, nfft, jS, i3
706 : REAL(KIND=dp), DIMENSION(2, lot, n2), &
707 : INTENT(inout) :: zw
708 : REAL(KIND=dp), INTENT(in) :: hx, hy, hz
709 :
710 : INTEGER :: i1, i2, j1, j2, j3
711 : REAL(KIND=dp) :: fourpi2, ker, mu3, p1, p2, pi
712 :
713 161869 : pi = 4._dp*ATAN(1._dp)
714 161869 : fourpi2 = 4._dp*pi**2
715 161869 : j3 = i3 !n3/2+1-abs(n3/2+2-i3)
716 161869 : mu3 = REAL(j3 - 1, KIND=dp)/REAL(n3, KIND=dp)
717 161869 : mu3 = (mu3/hy)**2 !beware of the exchanged dimension
718 : !Body
719 : !generic case
720 3920756 : DO i2 = 1, n2
721 93469747 : DO i1 = 1, nfft
722 89548991 : j1 = i1 + jS - 1
723 89548991 : j1 = j1 - (j1/(n1/2 + 2))*n1 !n1/2+1-abs(n1/2+2-jS-i1)
724 89548991 : j2 = i2 - (i2/(n2/2 + 2))*n2 !n2/2+1-abs(n2/2+1-i2)
725 89548991 : p1 = REAL(j1 - 1, KIND=dp)/REAL(n1, KIND=dp)
726 89548991 : p2 = REAL(j2 - 1, KIND=dp)/REAL(n2, KIND=dp)
727 89548991 : ker = -fourpi2*((p1/hx)**2 + (p2/hz)**2 + mu3) !beware of the exchanged dimension
728 89548991 : IF (ker /= 0._dp) ker = 1._dp/ker
729 89548991 : zw(1, i1, i2) = zw(1, i1, i2)*ker
730 93307878 : zw(2, i1, i2) = zw(2, i1, i2)*ker
731 : END DO
732 : END DO
733 :
734 161869 : END SUBROUTINE P_multkernel
735 :
736 : ! **************************************************************************************************
737 : !> \brief (Based on suitable modifications of S.Goedecker routines)
738 : !> Multiply with the kernel taking into account its symmetry
739 : !> Conceived to be used into convolution loops
740 : !> \param nd1 ...
741 : !> \param nd2 ...
742 : !> \param n1 ...
743 : !> \param n2 ...
744 : !> \param lot ...
745 : !> \param nfft ...
746 : !> \param jS ...
747 : !> \param pot Kernel, symmetric and real, half the length
748 : !> \param zw Work array (input/output)
749 : !> n1,n2: logical dimension of the FFT transform, reference for zw
750 : !> nd1,nd2: Dimensions of POT
751 : !> jS, nfft: starting point of the plane and number of remaining lines
752 : !>
753 : !> RESTRICTIONS on USAGE
754 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
755 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
756 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
757 : !> This file is distributed under the terms of the
758 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
759 : !> \date February 2006
760 : !> \author S. Goedecker, L. Genovese
761 : ! **************************************************************************************************
762 1701650 : SUBROUTINE multkernel(nd1, nd2, n1, n2, lot, nfft, jS, pot, zw)
763 : INTEGER, INTENT(in) :: nd1, nd2, n1, n2, lot, nfft, jS
764 : REAL(KIND=dp), DIMENSION(nd1, nd2), INTENT(in) :: pot
765 : REAL(KIND=dp), DIMENSION(2, lot, n2), &
766 : INTENT(inout) :: zw
767 :
768 : INTEGER :: i2, j, j1, j2
769 :
770 34628440 : DO j = 1, nfft
771 32926790 : j1 = j + jS - 1
772 32926790 : j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
773 32926790 : zw(1, j, 1) = zw(1, j, 1)*pot(j1, 1)
774 34628440 : zw(2, j, 1) = zw(2, j, 1)*pot(j1, 1)
775 : END DO
776 :
777 : !generic case
778 82273442 : DO i2 = 2, n2/2
779 1521396758 : DO j = 1, nfft
780 1439123316 : j1 = j + jS - 1
781 1439123316 : j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
782 1439123316 : j2 = n2 + 2 - i2
783 1439123316 : zw(1, j, i2) = zw(1, j, i2)*pot(j1, i2)
784 1439123316 : zw(2, j, i2) = zw(2, j, i2)*pot(j1, i2)
785 1439123316 : zw(1, j, j2) = zw(1, j, j2)*pot(j1, i2)
786 1519695108 : zw(2, j, j2) = zw(2, j, j2)*pot(j1, i2)
787 : END DO
788 : END DO
789 :
790 : !case i2=n2/2+1
791 34628440 : DO j = 1, nfft
792 32926790 : j1 = j + jS - 1
793 32926790 : j1 = j1 + (j1/(n1/2 + 2))*(n1 + 2 - 2*j1)
794 32926790 : j2 = n2/2 + 1
795 32926790 : zw(1, j, j2) = zw(1, j, j2)*pot(j1, j2)
796 34628440 : zw(2, j, j2) = zw(2, j, j2)*pot(j1, j2)
797 : END DO
798 :
799 1701650 : END SUBROUTINE multkernel
800 :
801 : ! **************************************************************************************************
802 : !> \brief !HERE POT MUST BE THE KERNEL (BEWARE THE HALF DIMENSION)
803 : !> ****h* BigDFT/S_PoissonSolver
804 : !> (Based on suitable modifications of S.Goedecker routines)
805 : !> Applies the local FFT space Kernel to the density in Real space.
806 : !> Does NOT calculate the LDA exchange-correlation terms
807 : !> \param n1 logical dimension of the transform.
808 : !> \param n2 logical dimension of the transform.
809 : !> \param n3 logical dimension of the transform.
810 : !> \param nd1 Dimension of POT
811 : !> \param nd2 Dimension of POT
812 : !> \param nd3 Dimension of POT
813 : !> \param md1 Dimension of ZF
814 : !> \param md2 Dimension of ZF
815 : !> \param md3 Dimension of ZF
816 : !> \param nproc number of processors used as returned by MPI_COMM_SIZE
817 : !> \param iproc [0:nproc-1] number of processor as returned by MPI_COMM_RANK
818 : !> \param pot Kernel, only the distributed part (REAL)
819 : !> POT(i1,i2,i3)
820 : !> i1=1,nd1 , i2=1,nd2 , i3=1,nd3/nproc
821 : !> \param zf Density (input/output)
822 : !> ZF(i1,i3,i2)
823 : !> i1=1,md1 , i2=1,md2/nproc , i3=1,md3
824 : !> \param scal factor of renormalization of the FFT in order to acheve unitarity
825 : !> and the correct dimension
826 : !> \param mpi_group ...
827 : !> \date October 2006
828 : !> \author S. Goedecker, L. Genovese
829 : !> \note As transform lengths most products of the prime factors 2,3,5 are allowed.
830 : !> The detailed table with allowed transform lengths can
831 : !> be found in subroutine CTRIG
832 : !> \note
833 : !> RESTRICTIONS on USAGE
834 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
835 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
836 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
837 : !> This file is distributed under the terms of the
838 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
839 : ! **************************************************************************************************
840 18 : SUBROUTINE S_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, pot, zf, &
841 : scal, mpi_group)
842 : INTEGER, INTENT(in) :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
843 : md3, nproc, iproc
844 : REAL(KIND=dp), DIMENSION(nd1, nd2, nd3/nproc), &
845 : INTENT(in) :: pot
846 : REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
847 : INTENT(inout) :: zf
848 : REAL(KIND=dp), INTENT(in) :: scal
849 :
850 : CLASS(mp_comm_type), INTENT(in) :: mpi_group
851 :
852 : CHARACTER(len=*), PARAMETER :: routineN = 'S_PoissonSolver'
853 : INTEGER, PARAMETER :: ncache_optimal = 8*1024
854 :
855 : INTEGER :: handle, i, i1, i3, ic1, ic2, ic3, inzee, &
856 : j, j2, J2stb, J2stf, j3, Jp2stb, &
857 : Jp2stf, lot, lzt, ma, mb, ncache, nfft
858 18 : INTEGER, ALLOCATABLE, DIMENSION(:) :: after1, after2, after3, before1, &
859 18 : before2, before3, now1, now2, now3
860 : REAL(kind=dp) :: twopion
861 18 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: btrig1, btrig2, btrig3, cosinarr, &
862 18 : ftrig1, ftrig2, ftrig3
863 18 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: zt, zw
864 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: zmpi2
865 : REAL(KIND=dp), ALLOCATABLE, &
866 18 : DIMENSION(:, :, :, :, :) :: zmpi1
867 :
868 18 : CALL timeset(routineN, handle)
869 : ! check input
870 18 : IF (MOD(n3, 2) .NE. 0) CPABORT("Parallel convolution:ERROR:n3")
871 18 : IF (nd1 .LT. n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
872 18 : IF (nd2 .LT. n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
873 18 : IF (nd3 .LT. n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
874 18 : IF (md1 .LT. n1) CPABORT("Parallel convolution:ERROR:md1")
875 18 : IF (md2 .LT. n2) CPABORT("Parallel convolution:ERROR:md2")
876 18 : IF (md3 .LT. n3/2) CPABORT("Parallel convolution:ERROR:md3")
877 18 : IF (MOD(nd3, nproc) .NE. 0) CPABORT("Parallel convolution:ERROR:nd3")
878 18 : IF (MOD(md2, nproc) .NE. 0) CPABORT("Parallel convolution:ERROR:md2")
879 :
880 : !defining work arrays dimensions
881 18 : ncache = ncache_optimal
882 18 : IF (ncache <= MAX(n1, n2, n3/2)*4) ncache = MAX(n1, n2, n3/2)*4
883 :
884 : ! if (timing_flag == 1 .and. iproc ==0) print *,'parallel ncache=',ncache
885 :
886 18 : lzt = n2
887 18 : IF (MOD(n2, 2) .EQ. 0) lzt = lzt + 1
888 18 : IF (MOD(n2, 4) .EQ. 0) lzt = lzt + 1 !maybe this is useless
889 :
890 : !Allocations
891 18 : ALLOCATE (btrig1(2, ctrig_length))
892 18 : ALLOCATE (ftrig1(2, ctrig_length))
893 18 : ALLOCATE (after1(7))
894 18 : ALLOCATE (now1(7))
895 18 : ALLOCATE (before1(7))
896 18 : ALLOCATE (btrig2(2, ctrig_length))
897 18 : ALLOCATE (ftrig2(2, ctrig_length))
898 18 : ALLOCATE (after2(7))
899 18 : ALLOCATE (now2(7))
900 18 : ALLOCATE (before2(7))
901 18 : ALLOCATE (btrig3(2, ctrig_length))
902 18 : ALLOCATE (ftrig3(2, ctrig_length))
903 18 : ALLOCATE (after3(7))
904 18 : ALLOCATE (now3(7))
905 18 : ALLOCATE (before3(7))
906 72 : ALLOCATE (zw(2, ncache/4, 2))
907 72 : ALLOCATE (zt(2, lzt, n1))
908 90 : ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
909 72 : ALLOCATE (cosinarr(2, n3/2))
910 18 : IF (nproc .GT. 1) THEN
911 108 : ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
912 4437270 : zmpi1 = 0.0_dp
913 : END IF
914 4437234 : zmpi2 = 0.0_dp
915 221238 : zw = 0.0_dp
916 161370 : zt = 0.0_dp
917 :
918 : !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
919 18 : CALL ctrig(n3/2, btrig3, after3, before3, now3, 1, ic3)
920 18 : CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
921 18 : CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
922 990 : DO j = 1, n1
923 972 : ftrig1(1, j) = btrig1(1, j)
924 990 : ftrig1(2, j) = -btrig1(2, j)
925 : END DO
926 990 : DO j = 1, n2
927 972 : ftrig2(1, j) = btrig2(1, j)
928 990 : ftrig2(2, j) = -btrig2(2, j)
929 : END DO
930 990 : DO j = 1, n3/2
931 972 : ftrig3(1, j) = btrig3(1, j)
932 990 : ftrig3(2, j) = -btrig3(2, j)
933 : END DO
934 :
935 : !Calculating array of phases for HalFFT decoding
936 18 : twopion = 8._dp*ATAN(1._dp)/REAL(n3, KIND=dp)
937 990 : DO i3 = 1, n3/2
938 972 : cosinarr(1, i3) = COS(twopion*(i3 - 1))
939 990 : cosinarr(2, i3) = -SIN(twopion*(i3 - 1))
940 : END DO
941 :
942 : !initializing integral
943 : !ehartree=0._dp
944 :
945 : ! transform along z axis
946 18 : lot = ncache/(2*n3)
947 18 : IF (lot .LT. 1) THEN
948 : WRITE (*, *) &
949 : 'convolxc_off:ncache has to be enlarged to be able to hold at'// &
950 : 'least one 1-d FFT of this size even though this will'// &
951 0 : 'reduce the performance for shorter transform lengths', n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc
952 0 : CPABORT("")
953 : END IF
954 :
955 504 : DO j2 = 1, md2/nproc
956 : !this condition ensures that we manage only the interesting part for the FFT
957 504 : IF (iproc*(md2/nproc) + j2 .LE. n2) THEN
958 1458 : DO i1 = 1, n1, lot
959 972 : ma = i1
960 972 : mb = MIN(i1 + (lot - 1), n1)
961 972 : nfft = mb - ma + 1
962 :
963 : !inserting real data into complex array of half length
964 972 : CALL halfill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
965 :
966 : !performing FFT
967 : !input: I1,I3,J2,(Jp2)
968 972 : inzee = 1
969 3888 : DO i = 1, ic3
970 : CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
971 2916 : btrig3, after3(i), now3(i), before3(i), 1)
972 3888 : inzee = 3 - inzee
973 : END DO
974 : !output: I1,i3,J2,(Jp2)
975 : !unpacking FFT in order to restore correct result,
976 : !while exchanging components
977 : !input: I1,i3,J2,(Jp2)
978 1458 : CALL scramble_unpack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2, cosinarr)
979 : !output: I1,J2,i3,(Jp2)
980 : END DO
981 : END IF
982 : END DO
983 : !Interprocessor data transposition
984 : !input: I1,J2,j3,jp3,(Jp2)
985 18 : IF (nproc .GT. 1) THEN
986 18 : CALL mpi_group%alltoall(zmpi2, zmpi1, 2*n1*(md2/nproc)*(nd3/nproc))
987 : END IF
988 : !output: I1,J2,j3,Jp2,(jp3)
989 :
990 : !now each process perform complete convolution of its planes
991 522 : DO j3 = 1, nd3/nproc
992 : !this condition ensures that we manage only the interesting part for the FFT
993 522 : IF (iproc*(nd3/nproc) + j3 .LE. n3/2 + 1) THEN
994 495 : Jp2stb = 1
995 495 : J2stb = 1
996 495 : Jp2stf = 1
997 495 : J2stf = 1
998 :
999 : ! transform along x axis
1000 495 : lot = ncache/(4*n1)
1001 495 : IF (lot .LT. 1) THEN
1002 : WRITE (*, *) &
1003 : 'convolxc_off:ncache has to be enlarged to be able to hold at'// &
1004 : 'least one 1-d FFT of this size even though this will'// &
1005 0 : 'reduce the performance for shorter transform lengths'
1006 0 : CPABORT("")
1007 : END IF
1008 :
1009 1485 : DO j = 1, n2, lot
1010 990 : ma = j
1011 990 : mb = MIN(j + (lot - 1), n2)
1012 990 : nfft = mb - ma + 1
1013 :
1014 : !reverse index ordering, leaving the planes to be transformed at the end
1015 : !input: I1,J2,j3,Jp2,(jp3)
1016 990 : IF (nproc .EQ. 1) THEN
1017 0 : CALL S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
1018 : ELSE
1019 990 : CALL S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
1020 : END IF
1021 : !output: J2,Jp2,I1,j3,(jp3)
1022 :
1023 : !performing FFT
1024 : !input: I2,I1,j3,(jp3)
1025 990 : inzee = 1
1026 2970 : DO i = 1, ic1 - 1
1027 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1028 1980 : btrig1, after1(i), now1(i), before1(i), 1)
1029 2970 : inzee = 3 - inzee
1030 : END DO
1031 :
1032 : !storing the last step into zt array
1033 990 : i = ic1
1034 : CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
1035 1485 : btrig1, after1(i), now1(i), before1(i), 1)
1036 : !output: I2,i1,j3,(jp3)
1037 : END DO
1038 :
1039 : !transform along y axis
1040 495 : lot = ncache/(4*n2)
1041 495 : IF (lot .LT. 1) THEN
1042 : WRITE (*, *) &
1043 : 'convolxc_off:ncache has to be enlarged to be able to hold at'// &
1044 : 'least one 1-d FFT of this size even though this will'// &
1045 0 : 'reduce the performance for shorter transform lengths'
1046 0 : CPABORT("")
1047 : END IF
1048 :
1049 1485 : DO j = 1, n1, lot
1050 990 : ma = j
1051 990 : mb = MIN(j + (lot - 1), n1)
1052 990 : nfft = mb - ma + 1
1053 :
1054 : !reverse ordering
1055 : !input: I2,i1,j3,(jp3)
1056 990 : CALL S_switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
1057 : !output: i1,I2,j3,(jp3)
1058 :
1059 : !performing FFT
1060 : !input: i1,I2,j3,(jp3)
1061 990 : inzee = 1
1062 3960 : DO i = 1, ic2
1063 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1064 2970 : btrig2, after2(i), now2(i), before2(i), 1)
1065 3960 : inzee = 3 - inzee
1066 : END DO
1067 : !output: i1,i2,j3,(jp3)
1068 :
1069 : !Multiply with kernel in fourier space
1070 990 : CALL multkernel(nd1, nd2, n1, n2, lot, nfft, j, pot(1, 1, j3), zw(1, 1, inzee))
1071 :
1072 : !TRANSFORM BACK IN REAL SPACE
1073 :
1074 : !transform along y axis
1075 : !input: i1,i2,j3,(jp3)
1076 3960 : DO i = 1, ic2
1077 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1078 2970 : ftrig2, after2(i), now2(i), before2(i), -1)
1079 3960 : inzee = 3 - inzee
1080 : END DO
1081 :
1082 : !reverse ordering
1083 : !input: i1,I2,j3,(jp3)
1084 1485 : CALL S_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
1085 : !output: I2,i1,j3,(jp3)
1086 : END DO
1087 :
1088 : !transform along x axis
1089 : !input: I2,i1,j3,(jp3)
1090 495 : lot = ncache/(4*n1)
1091 1485 : DO j = 1, n2, lot
1092 990 : ma = j
1093 990 : mb = MIN(j + (lot - 1), n2)
1094 990 : nfft = mb - ma + 1
1095 :
1096 : !performing FFT
1097 990 : i = 1
1098 : CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
1099 990 : ftrig1, after1(i), now1(i), before1(i), -1)
1100 :
1101 990 : inzee = 1
1102 2970 : DO i = 2, ic1
1103 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1104 1980 : ftrig1, after1(i), now1(i), before1(i), -1)
1105 2970 : inzee = 3 - inzee
1106 : END DO
1107 : !output: I2,I1,j3,(jp3)
1108 :
1109 : !reverse ordering
1110 : !input: J2,Jp2,I1,j3,(jp3)
1111 1485 : IF (nproc .EQ. 1) THEN
1112 0 : CALL S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
1113 : ELSE
1114 990 : CALL S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
1115 : END IF
1116 : ! output: I1,J2,j3,Jp2,(jp3)
1117 : END DO
1118 : END IF
1119 : END DO
1120 :
1121 : !Interprocessor data transposition
1122 : !input: I1,J2,j3,Jp2,(jp3)
1123 18 : IF (nproc .GT. 1) THEN
1124 : !communication scheduling
1125 18 : CALL mpi_group%alltoall(zmpi1, zmpi2, 2*n1*(md2/nproc)*(nd3/nproc))
1126 : END IF
1127 :
1128 : !output: I1,J2,j3,jp3,(Jp2)
1129 :
1130 : !transform along z axis
1131 : !input: I1,J2,i3,(Jp2)
1132 18 : lot = ncache/(2*n3)
1133 504 : DO j2 = 1, md2/nproc
1134 : !this condition ensures that we manage only the interesting part for the FFT
1135 504 : IF (iproc*(md2/nproc) + j2 .LE. n2) THEN
1136 1458 : DO i1 = 1, n1, lot
1137 972 : ma = i1
1138 972 : mb = MIN(i1 + (lot - 1), n1)
1139 972 : nfft = mb - ma + 1
1140 :
1141 : !reverse ordering and repack the FFT data in order to be backward HalFFT transformed
1142 : !input: I1,J2,i3,(Jp2)
1143 972 : CALL unscramble_pack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1), cosinarr)
1144 : !output: I1,i3,J2,(Jp2)
1145 :
1146 : !performing FFT
1147 : !input: I1,i3,J2,(Jp2)
1148 972 : inzee = 1
1149 3888 : DO i = 1, ic3
1150 : CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1151 2916 : ftrig3, after3(i), now3(i), before3(i), -1)
1152 3888 : inzee = 3 - inzee
1153 : END DO
1154 : !output: I1,I3,J2,(Jp2)
1155 :
1156 : !rebuild the output array
1157 : CALL unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2) &
1158 1458 : , scal)
1159 :
1160 : !integrate local pieces together
1161 : !ehartree=ehartree+0.5_dp*ehartreetmp*hx*hy*hz
1162 : END DO
1163 : END IF
1164 : END DO
1165 :
1166 : !De-allocations
1167 18 : DEALLOCATE (btrig1)
1168 18 : DEALLOCATE (ftrig1)
1169 18 : DEALLOCATE (after1)
1170 18 : DEALLOCATE (now1)
1171 18 : DEALLOCATE (before1)
1172 18 : DEALLOCATE (btrig2)
1173 18 : DEALLOCATE (ftrig2)
1174 18 : DEALLOCATE (after2)
1175 18 : DEALLOCATE (now2)
1176 18 : DEALLOCATE (before2)
1177 18 : DEALLOCATE (btrig3)
1178 18 : DEALLOCATE (ftrig3)
1179 18 : DEALLOCATE (after3)
1180 18 : DEALLOCATE (now3)
1181 18 : DEALLOCATE (before3)
1182 18 : DEALLOCATE (zmpi2)
1183 18 : DEALLOCATE (zw)
1184 18 : DEALLOCATE (zt)
1185 18 : DEALLOCATE (cosinarr)
1186 18 : IF (nproc .GT. 1) DEALLOCATE (zmpi1)
1187 :
1188 : ! call timing(iproc,'PSolv_comput ','OF')
1189 18 : CALL timestop(handle)
1190 36 : END SUBROUTINE S_PoissonSolver
1191 :
1192 : ! **************************************************************************************************
1193 : !> \brief ...
1194 : !> \param j3 ...
1195 : !> \param nfft ...
1196 : !> \param Jp2stb ...
1197 : !> \param J2stb ...
1198 : !> \param lot ...
1199 : !> \param n1 ...
1200 : !> \param md2 ...
1201 : !> \param nd3 ...
1202 : !> \param nproc ...
1203 : !> \param zmpi1 ...
1204 : !> \param zw ...
1205 : ! **************************************************************************************************
1206 990 : SUBROUTINE S_mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
1207 : INTEGER, INTENT(in) :: j3, nfft
1208 : INTEGER, INTENT(inout) :: Jp2stb, J2stb
1209 : INTEGER, INTENT(in) :: lot, n1, md2, nd3, nproc
1210 : REAL(KIND=dp), &
1211 : DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
1212 : INTENT(in) :: zmpi1
1213 : REAL(KIND=dp), DIMENSION(2, lot, n1), &
1214 : INTENT(inout) :: zw
1215 :
1216 : INTEGER :: I1, J2, Jp2, mfft
1217 :
1218 990 : mfft = 0
1219 1980 : DO Jp2 = Jp2stb, nproc
1220 28215 : DO J2 = J2stb, md2/nproc
1221 27225 : mfft = mfft + 1
1222 27225 : IF (mfft .GT. nfft) THEN
1223 495 : Jp2stb = Jp2
1224 495 : J2stb = J2
1225 495 : RETURN
1226 : END IF
1227 1471140 : DO I1 = 1, n1
1228 1443420 : zw(1, mfft, I1) = zmpi1(1, I1, J2, j3, Jp2)
1229 1470150 : zw(2, mfft, I1) = zmpi1(2, I1, J2, j3, Jp2)
1230 : END DO
1231 : END DO
1232 1485 : J2stb = 1
1233 : END DO
1234 : END SUBROUTINE S_mpiswitch_upcorn
1235 :
1236 : ! **************************************************************************************************
1237 : !> \brief ...
1238 : !> \param nfft ...
1239 : !> \param n2 ...
1240 : !> \param lot ...
1241 : !> \param n1 ...
1242 : !> \param lzt ...
1243 : !> \param zt ...
1244 : !> \param zw ...
1245 : ! **************************************************************************************************
1246 990 : SUBROUTINE S_switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
1247 : INTEGER, INTENT(in) :: nfft, n2, lot, n1, lzt
1248 : REAL(KIND=dp), DIMENSION(2, lzt, n1), INTENT(in) :: zt
1249 : REAL(KIND=dp), DIMENSION(2, lot, n2), &
1250 : INTENT(inout) :: zw
1251 :
1252 : INTEGER :: i, j
1253 :
1254 27720 : DO j = 1, nfft
1255 1471140 : DO i = 1, n2
1256 1443420 : zw(1, j, i) = zt(1, i, j)
1257 1470150 : zw(2, j, i) = zt(2, i, j)
1258 : END DO
1259 : END DO
1260 990 : END SUBROUTINE S_switch_upcorn
1261 :
1262 : ! **************************************************************************************************
1263 : !> \brief ...
1264 : !> \param nfft ...
1265 : !> \param n2 ...
1266 : !> \param lot ...
1267 : !> \param n1 ...
1268 : !> \param lzt ...
1269 : !> \param zw ...
1270 : !> \param zt ...
1271 : ! **************************************************************************************************
1272 990 : SUBROUTINE S_unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
1273 : INTEGER, INTENT(in) :: nfft, n2, lot, n1, lzt
1274 : REAL(KIND=dp), DIMENSION(2, lot, n2), INTENT(in) :: zw
1275 : REAL(KIND=dp), DIMENSION(2, lzt, n1), &
1276 : INTENT(inout) :: zt
1277 :
1278 : INTEGER :: i, j
1279 :
1280 27720 : DO j = 1, nfft
1281 1471140 : DO i = 1, n2
1282 1443420 : zt(1, i, j) = zw(1, j, i)
1283 1470150 : zt(2, i, j) = zw(2, j, i)
1284 : END DO
1285 : END DO
1286 990 : END SUBROUTINE S_unswitch_downcorn
1287 :
1288 : ! **************************************************************************************************
1289 : !> \brief ...
1290 : !> \param j3 ...
1291 : !> \param nfft ...
1292 : !> \param Jp2stf ...
1293 : !> \param J2stf ...
1294 : !> \param lot ...
1295 : !> \param n1 ...
1296 : !> \param md2 ...
1297 : !> \param nd3 ...
1298 : !> \param nproc ...
1299 : !> \param zw ...
1300 : !> \param zmpi1 ...
1301 : ! **************************************************************************************************
1302 990 : SUBROUTINE S_unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
1303 : INTEGER, INTENT(in) :: j3, nfft
1304 : INTEGER, INTENT(inout) :: Jp2stf, J2stf
1305 : INTEGER, INTENT(in) :: lot, n1, md2, nd3, nproc
1306 : REAL(KIND=dp), DIMENSION(2, lot, n1), INTENT(in) :: zw
1307 : REAL(KIND=dp), &
1308 : DIMENSION(2, n1, md2/nproc, nd3/nproc, nproc), &
1309 : INTENT(inout) :: zmpi1
1310 :
1311 : INTEGER :: I1, J2, Jp2, mfft
1312 :
1313 990 : mfft = 0
1314 1980 : DO Jp2 = Jp2stf, nproc
1315 28215 : DO J2 = J2stf, md2/nproc
1316 27225 : mfft = mfft + 1
1317 27225 : IF (mfft .GT. nfft) THEN
1318 495 : Jp2stf = Jp2
1319 495 : J2stf = J2
1320 495 : RETURN
1321 : END IF
1322 1471140 : DO I1 = 1, n1
1323 1443420 : zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
1324 1470150 : zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
1325 : END DO
1326 : END DO
1327 1485 : J2stf = 1
1328 : END DO
1329 : END SUBROUTINE S_unmpiswitch_downcorn
1330 :
1331 : ! **************************************************************************************************
1332 : !> \brief (Based on suitable modifications of S.Goedecker routines)
1333 : !> Restore data into output array, calculating in the meanwhile
1334 : !> Hartree energy of the potential
1335 : !> \param md1 Dimensions of the undistributed part of the real grid
1336 : !> \param md3 Dimensions of the undistributed part of the real grid
1337 : !> \param lot ...
1338 : !> \param nfft number of planes
1339 : !> \param n3 (twice the) dimension of the last FFTtransform.
1340 : !> \param zw FFT work array
1341 : !> \param zf Original distributed density as well as
1342 : !> Distributed solution of the poisson equation (inout)
1343 : !> \param scal Needed to achieve unitarity and correct dimensions
1344 : !> \date February 2006
1345 : !> \author S. Goedecker, L. Genovese
1346 : !> \note Assuming that high frequencies are in the corners
1347 : !> and that n3 is multiple of 4
1348 : !>
1349 : !> RESTRICTIONS on USAGE
1350 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1351 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1352 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1353 : !> This file is distributed under the terms of the
1354 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1355 : ! **************************************************************************************************
1356 542732 : SUBROUTINE unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal)
1357 : INTEGER, INTENT(in) :: md1, md3, lot, nfft, n3
1358 : REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
1359 : REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout) :: zf
1360 : REAL(KIND=dp), INTENT(in) :: scal
1361 :
1362 : INTEGER :: i1, i3
1363 : REAL(KIND=dp) :: pot1
1364 :
1365 13101356 : DO i3 = 1, n3/4
1366 373748418 : DO i1 = 1, nfft
1367 360647062 : pot1 = scal*zw(1, i1, i3)
1368 : !ehartreetmp =ehartreetmp + pot1* zf(i1,2*i3-1)
1369 360647062 : zf(i1, 2*i3 - 1) = pot1
1370 360647062 : pot1 = scal*zw(2, i1, i3)
1371 : !ehartreetmp =ehartreetmp + pot1* zf(i1,2*i3)
1372 373205686 : zf(i1, 2*i3) = pot1
1373 : END DO
1374 : END DO
1375 542732 : END SUBROUTINE unfill_downcorn
1376 :
1377 : ! **************************************************************************************************
1378 : !> \brief ...
1379 : !> \param md1 ...
1380 : !> \param md3 ...
1381 : !> \param lot ...
1382 : !> \param nfft ...
1383 : !> \param n3 ...
1384 : !> \param zf ...
1385 : !> \param zw ...
1386 : ! **************************************************************************************************
1387 542732 : SUBROUTINE halfill_upcorn(md1, md3, lot, nfft, n3, zf, zw)
1388 : INTEGER :: md1, md3, lot, nfft, n3
1389 : REAL(KIND=dp) :: zf(md1, md3), zw(2, lot, n3/2)
1390 :
1391 : INTEGER :: i1, i3
1392 :
1393 13101356 : DO i3 = 1, n3/4
1394 : ! WARNING: Assuming that high frequencies are in the corners
1395 : ! and that n3 is multiple of 4
1396 : !in principle we can relax this condition
1397 373748418 : DO i1 = 1, nfft
1398 360647062 : zw(1, i1, i3) = 0._dp
1399 373205686 : zw(2, i1, i3) = 0._dp
1400 : END DO
1401 : END DO
1402 13101356 : DO i3 = n3/4 + 1, n3/2
1403 373748418 : DO i1 = 1, nfft
1404 360647062 : zw(1, i1, i3) = zf(i1, 2*i3 - 1 - n3/2)
1405 373205686 : zw(2, i1, i3) = zf(i1, 2*i3 - n3/2)
1406 : END DO
1407 : END DO
1408 :
1409 542732 : END SUBROUTINE halfill_upcorn
1410 :
1411 : ! **************************************************************************************************
1412 : !> \brief (Based on suitable modifications of S.Goedecker routines)
1413 : !> Assign the correct planes to the work array zmpi2
1414 : !> in order to prepare for interprocessor data transposition.
1415 : !> In the meanwhile, it unpacks the data of the HalFFT in order to prepare for
1416 : !> multiplication with the kernel
1417 : !> \param i1 Starting points of the plane and number of remaining lines
1418 : !> \param j2 Starting points of the plane and number of remaining lines
1419 : !> \param lot Starting points of the plane and number of remaining lines
1420 : !> \param nfft Starting points of the plane and number of remaining lines
1421 : !> \param n1 logical dimension of the FFT transform, reference for work arrays
1422 : !> \param n3 logical dimension of the FFT transform, reference for work arrays
1423 : !> \param md2 Dimensions of real grid
1424 : !> \param nproc ...
1425 : !> \param nd3 Dimensions of the kernel
1426 : !> \param zw Work array (input)
1427 : !> \param zmpi2 Work array for multiprocessor manipulation (output)
1428 : !> \param cosinarr Array of the phases needed for unpacking
1429 : !> \date February 2006
1430 : !> \author S. Goedecker, L. Genovese
1431 : !> \note
1432 : !> RESTRICTIONS on USAGE
1433 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1434 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1435 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1436 : !> This file is distributed under the terms of the
1437 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1438 : ! **************************************************************************************************
1439 614864 : SUBROUTINE scramble_unpack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zw, zmpi2, cosinarr)
1440 : INTEGER, INTENT(in) :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
1441 : nd3
1442 : REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
1443 : REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
1444 : INTENT(inout) :: zmpi2
1445 : REAL(KIND=dp), DIMENSION(2, n3/2), INTENT(in) :: cosinarr
1446 :
1447 : INTEGER :: i, i3, ind1, ind2
1448 : REAL(KIND=dp) :: a, b, c, cp, d, feI, feR, fI, foI, foR, &
1449 : fR, sp
1450 :
1451 : !case i3=1 and i3=n3/2+1
1452 :
1453 18862834 : DO i = 0, nfft - 1
1454 18247970 : a = zw(1, i + 1, 1)
1455 18247970 : b = zw(2, i + 1, 1)
1456 18247970 : zmpi2(1, i1 + i, j2, 1) = a + b
1457 18247970 : zmpi2(2, i1 + i, j2, 1) = 0._dp
1458 18247970 : zmpi2(1, i1 + i, j2, n3/2 + 1) = a - b
1459 18862834 : zmpi2(2, i1 + i, j2, n3/2 + 1) = 0._dp
1460 : END DO
1461 : !case 2<=i3<=n3/2
1462 29130144 : DO i3 = 2, n3/2
1463 28515280 : ind1 = i3
1464 28515280 : ind2 = n3/2 - i3 + 2
1465 28515280 : cp = cosinarr(1, i3)
1466 28515280 : sp = cosinarr(2, i3)
1467 856427058 : DO i = 0, nfft - 1
1468 827296914 : a = zw(1, i + 1, ind1)
1469 827296914 : b = zw(2, i + 1, ind1)
1470 827296914 : c = zw(1, i + 1, ind2)
1471 827296914 : d = zw(2, i + 1, ind2)
1472 827296914 : feR = .5_dp*(a + c)
1473 827296914 : feI = .5_dp*(b - d)
1474 827296914 : foR = .5_dp*(a - c)
1475 827296914 : foI = .5_dp*(b + d)
1476 827296914 : fR = feR + cp*foI - sp*foR
1477 827296914 : fI = feI - cp*foR - sp*foI
1478 827296914 : zmpi2(1, i1 + i, j2, ind1) = fR
1479 855812194 : zmpi2(2, i1 + i, j2, ind1) = fI
1480 : END DO
1481 : END DO
1482 :
1483 614864 : END SUBROUTINE scramble_unpack
1484 :
1485 : ! **************************************************************************************************
1486 : !> \brief (Based on suitable modifications of S.Goedecker routines)
1487 : !> Insert the correct planes of the work array zmpi2
1488 : !> in order to prepare for backward FFT transform
1489 : !> In the meanwhile, it packs the data in order to be transformed with the HalFFT
1490 : !> procedure
1491 : !> \param i1 Starting points of the plane and number of remaining lines
1492 : !> \param j2 Starting points of the plane and number of remaining lines
1493 : !> \param lot Starting points of the plane and number of remaining lines
1494 : !> \param nfft Starting points of the plane and number of remaining lines
1495 : !> \param n1 logical dimension of the FFT transform, reference for work arrays
1496 : !> \param n3 logical dimension of the FFT transform, reference for work arrays
1497 : !> \param md2 Dimensions of real grid
1498 : !> \param nproc ...
1499 : !> \param nd3 Dimensions of the kernel
1500 : !> \param zmpi2 Work array for multiprocessor manipulation (output)
1501 : !> \param zw Work array (inout)
1502 : !> \param cosinarr Array of the phases needed for packing
1503 : !> \date February 2006
1504 : !> \author S. Goedecker, L. Genovese
1505 : !> \note
1506 : !> RESTRICTIONS on USAGE
1507 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1508 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1509 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1510 : !> This file is distributed under the terms of the
1511 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1512 : ! **************************************************************************************************
1513 542732 : SUBROUTINE unscramble_pack(i1, j2, lot, nfft, n1, n3, md2, nproc, nd3, zmpi2, zw, cosinarr)
1514 : INTEGER, INTENT(in) :: i1, j2, lot, nfft, n1, n3, md2, nproc, &
1515 : nd3
1516 : REAL(KIND=dp), DIMENSION(2, n1, md2/nproc, nd3), &
1517 : INTENT(in) :: zmpi2
1518 : REAL(KIND=dp), DIMENSION(2, lot, n3/2), &
1519 : INTENT(inout) :: zw
1520 : REAL(KIND=dp), DIMENSION(2, n3/2), INTENT(in) :: cosinarr
1521 :
1522 : INTEGER :: i, i3, indA, indB
1523 : REAL(KIND=dp) :: a, b, c, cp, d, ie, ih, io, re, rh, ro, &
1524 : sp
1525 :
1526 25659980 : DO i3 = 1, n3/2
1527 25117248 : indA = i3
1528 25117248 : indB = n3/2 + 2 - i3
1529 25117248 : cp = cosinarr(1, i3)
1530 25117248 : sp = cosinarr(2, i3)
1531 746954104 : DO i = 0, nfft - 1
1532 721294124 : a = zmpi2(1, i1 + i, j2, indA)
1533 721294124 : b = zmpi2(2, i1 + i, j2, indA)
1534 721294124 : c = zmpi2(1, i1 + i, j2, indB)
1535 721294124 : d = -zmpi2(2, i1 + i, j2, indB)
1536 721294124 : re = (a + c)
1537 721294124 : ie = (b + d)
1538 721294124 : ro = (a - c)*cp - (b - d)*sp
1539 721294124 : io = (a - c)*sp + (b - d)*cp
1540 721294124 : rh = re - io
1541 721294124 : ih = ie + ro
1542 721294124 : zw(1, i + 1, indA) = rh
1543 746411372 : zw(2, i + 1, indA) = ih
1544 : END DO
1545 : END DO
1546 :
1547 542732 : END SUBROUTINE unscramble_pack
1548 :
1549 : ! **************************************************************************************************
1550 : !> \brief (Based on suitable modifications of S.Goedecker routines)
1551 : !> Applies the local FFT space Kernel to the density in Real space.
1552 : !> Calculates also the LDA exchange-correlation terms
1553 : !> \param n1 logical dimension of the transform.
1554 : !> \param n2 logical dimension of the transform.
1555 : !> \param n3 logical dimension of the transform.
1556 : !> \param nd1 Dimension of POT
1557 : !> \param nd2 Dimension of POT
1558 : !> \param nd3 Dimension of POT
1559 : !> \param md1 Dimension of ZF
1560 : !> \param md2 Dimension of ZF
1561 : !> \param md3 Dimension of ZF
1562 : !> \param nproc number of processors used as returned by MPI_COMM_SIZE
1563 : !> \param iproc [0:nproc-1] number of processor as returned by MPI_COMM_RANK
1564 : !> \param pot Kernel, only the distributed part (REAL)
1565 : !> POT(i1,i2,i3)
1566 : !> i1=1,nd1 , i2=1,nd2 , i3=1,nd3/nproc
1567 : !> \param zf Density (input/output)
1568 : !> ZF(i1,i3,i2)
1569 : !> i1=1,md1 , i2=1,md2/nproc , i3=1,md3
1570 : !> \param scal factor of renormalization of the FFT in order to acheve unitarity
1571 : !> and the correct dimension
1572 : !> \param mpi_group ...
1573 : !> \date February 2006
1574 : !> \author S. Goedecker, L. Genovese
1575 : !> \note As transform lengths
1576 : !> most products of the prime factors 2,3,5 are allowed.
1577 : !> The detailed table with allowed transform lengths can
1578 : !> be found in subroutine CTRIG
1579 : !> \note
1580 : !> RESTRICTIONS on USAGE
1581 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
1582 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
1583 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
1584 : !> This file is distributed under the terms of the
1585 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
1586 : ! **************************************************************************************************
1587 15250 : SUBROUTINE F_PoissonSolver(n1, n2, n3, nd1, nd2, nd3, md1, md2, md3, nproc, iproc, pot, zf, &
1588 : scal, mpi_group)
1589 : INTEGER, INTENT(in) :: n1, n2, n3, nd1, nd2, nd3, md1, md2, &
1590 : md3, nproc, iproc
1591 : REAL(KIND=dp), DIMENSION(nd1, nd2, nd3/nproc), &
1592 : INTENT(in) :: pot
1593 : REAL(KIND=dp), DIMENSION(md1, md3, md2/nproc), &
1594 : INTENT(inout) :: zf
1595 : REAL(KIND=dp), INTENT(in) :: scal
1596 :
1597 : CLASS(mp_comm_type), INTENT(in) :: mpi_group
1598 :
1599 : INTEGER, PARAMETER :: ncache_optimal = 8*1024
1600 :
1601 : INTEGER :: i, i1, i3, ic1, ic2, ic3, inzee, j, j2, &
1602 : J2stb, J2stf, j3, Jp2stb, Jp2stf, lot, &
1603 : lzt, ma, mb, ncache, nfft
1604 15250 : INTEGER, ALLOCATABLE, DIMENSION(:) :: after1, after2, after3, before1, &
1605 15250 : before2, before3, now1, now2, now3
1606 : REAL(kind=dp) :: twopion
1607 15250 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :) :: btrig1, btrig2, btrig3, cosinarr, &
1608 15250 : ftrig1, ftrig2, ftrig3
1609 15250 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :) :: zt, zw
1610 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :, :, :) :: zmpi2
1611 : REAL(KIND=dp), ALLOCATABLE, &
1612 15250 : DIMENSION(:, :, :, :, :) :: zmpi1
1613 :
1614 15250 : IF (MOD(n1, 2) .NE. 0) CPABORT("Parallel convolution:ERROR:n1")
1615 15250 : IF (MOD(n2, 2) .NE. 0) CPABORT("Parallel convolution:ERROR:n2")
1616 15250 : IF (MOD(n3, 2) .NE. 0) CPABORT("Parallel convolution:ERROR:n3")
1617 15250 : IF (nd1 .LT. n1/2 + 1) CPABORT("Parallel convolution:ERROR:nd1")
1618 15250 : IF (nd2 .LT. n2/2 + 1) CPABORT("Parallel convolution:ERROR:nd2")
1619 15250 : IF (nd3 .LT. n3/2 + 1) CPABORT("Parallel convolution:ERROR:nd3")
1620 15250 : IF (md1 .LT. n1/2) CPABORT("Parallel convolution:ERROR:md1")
1621 15250 : IF (md2 .LT. n2/2) CPABORT("Parallel convolution:ERROR:md2")
1622 15250 : IF (md3 .LT. n3/2) CPABORT("Parallel convolution:ERROR:md3")
1623 15250 : IF (MOD(nd3, nproc) .NE. 0) CPABORT("Parallel convolution:ERROR:nd3")
1624 15250 : IF (MOD(md2, nproc) .NE. 0) CPABORT("Parallel convolution:ERROR:md2")
1625 :
1626 : !defining work arrays dimensions
1627 :
1628 15250 : ncache = ncache_optimal
1629 15250 : IF (ncache <= MAX(n1, n2, n3/2)*4) ncache = MAX(n1, n2, n3/2)*4
1630 15250 : lzt = n2/2
1631 15250 : IF (MOD(n2/2, 2) .EQ. 0) lzt = lzt + 1
1632 15250 : IF (MOD(n2/2, 4) .EQ. 0) lzt = lzt + 1
1633 :
1634 : !Allocations
1635 15250 : ALLOCATE (btrig1(2, ctrig_length))
1636 15250 : ALLOCATE (ftrig1(2, ctrig_length))
1637 15250 : ALLOCATE (after1(7))
1638 15250 : ALLOCATE (now1(7))
1639 15250 : ALLOCATE (before1(7))
1640 15250 : ALLOCATE (btrig2(2, ctrig_length))
1641 15250 : ALLOCATE (ftrig2(2, ctrig_length))
1642 15250 : ALLOCATE (after2(7))
1643 15250 : ALLOCATE (now2(7))
1644 15250 : ALLOCATE (before2(7))
1645 15250 : ALLOCATE (btrig3(2, ctrig_length))
1646 15250 : ALLOCATE (ftrig3(2, ctrig_length))
1647 15250 : ALLOCATE (after3(7))
1648 15250 : ALLOCATE (now3(7))
1649 15250 : ALLOCATE (before3(7))
1650 61000 : ALLOCATE (zw(2, ncache/4, 2))
1651 61000 : ALLOCATE (zt(2, lzt, n1))
1652 76250 : ALLOCATE (zmpi2(2, n1, md2/nproc, nd3))
1653 4489667096 : zmpi2 = 1.0E99_dp ! init mpi'ed buffers to silence warnings under valgrind
1654 61000 : ALLOCATE (cosinarr(2, n3/2))
1655 53940 : IF (nproc .GT. 1) ALLOCATE (zmpi1(2, n1, md2/nproc, nd3/nproc, nproc))
1656 :
1657 : !calculating the FFT work arrays (beware on the HalFFT in n3 dimension)
1658 15250 : CALL ctrig(n3/2, btrig3, after3, before3, now3, 1, ic3)
1659 15250 : CALL ctrig(n1, btrig1, after1, before1, now1, 1, ic1)
1660 15250 : CALL ctrig(n2, btrig2, after2, before2, now2, 1, ic2)
1661 1146722 : DO j = 1, n1
1662 1131472 : ftrig1(1, j) = btrig1(1, j)
1663 1146722 : ftrig1(2, j) = -btrig1(2, j)
1664 : END DO
1665 1146722 : DO j = 1, n2
1666 1131472 : ftrig2(1, j) = btrig2(1, j)
1667 1146722 : ftrig2(2, j) = -btrig2(2, j)
1668 : END DO
1669 1169930 : DO j = 1, n3
1670 1154680 : ftrig3(1, j) = btrig3(1, j)
1671 1169930 : ftrig3(2, j) = -btrig3(2, j)
1672 : END DO
1673 :
1674 : !Calculating array of phases for HalFFT decoding
1675 15250 : twopion = 8._dp*ATAN(1._dp)/REAL(n3, KIND=dp)
1676 592590 : DO i3 = 1, n3/2
1677 577340 : cosinarr(1, i3) = COS(twopion*(i3 - 1))
1678 592590 : cosinarr(2, i3) = -SIN(twopion*(i3 - 1))
1679 : END DO
1680 :
1681 : ! transform along z axis
1682 15250 : lot = ncache/(2*n3)
1683 15250 : IF (lot .LT. 1) THEN
1684 : WRITE (*, *) &
1685 : 'convolxc_on:ncache has to be enlarged to be able to hold at'// &
1686 : 'least one 1-d FFT of this size even though this will'// &
1687 0 : 'reduce the performance for shorter transform lengths'
1688 0 : CPABORT("")
1689 : END IF
1690 :
1691 411022 : DO j2 = 1, md2/nproc
1692 : !this condition ensures that we manage only the interesting part for the FFT
1693 411022 : IF (iproc*(md2/nproc) + j2 .LE. n2/2) THEN
1694 936466 : DO i1 = 1, (n1/2), lot
1695 541760 : ma = i1
1696 541760 : mb = MIN(i1 + (lot - 1), (n1/2))
1697 541760 : nfft = mb - ma + 1
1698 :
1699 : !inserting real data into complex array of half length
1700 541760 : CALL halfill_upcorn(md1, md3, lot, nfft, n3, zf(i1, 1, j2), zw(1, 1, 1))
1701 :
1702 : !performing FFT
1703 : !input: I1,I3,J2,(Jp2)
1704 541760 : inzee = 1
1705 1912076 : DO i = 1, ic3
1706 : CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1707 1370316 : btrig3, after3(i), now3(i), before3(i), 1)
1708 1912076 : inzee = 3 - inzee
1709 : END DO
1710 : !output: I1,i3,J2,(Jp2)
1711 :
1712 : !unpacking FFT in order to restore correct result,
1713 : !while exchanging components
1714 : !input: I1,i3,J2,(Jp2)
1715 936466 : CALL scramble_unpack(i1, j2, lot, nfft, n1/2, n3, md2, nproc, nd3, zw(1, 1, inzee), zmpi2, cosinarr)
1716 : !output: I1,J2,i3,(Jp2)
1717 : END DO
1718 : END IF
1719 : END DO
1720 :
1721 : !Interprocessor data transposition
1722 : !input: I1,J2,j3,jp3,(Jp2)
1723 15250 : IF (nproc .GT. 1) THEN
1724 : !communication scheduling
1725 7738 : CALL mpi_group%alltoall(zmpi2, zmpi1, n1*(md2/nproc)*(nd3/nproc))
1726 : END IF
1727 : !output: I1,J2,j3,Jp2,(jp3)
1728 :
1729 : !now each process perform complete convolution of its planes
1730 433432 : DO j3 = 1, nd3/nproc
1731 : !this condition ensures that we manage only the interesting part for the FFT
1732 433432 : IF (iproc*(nd3/nproc) + j3 .LE. n3/2 + 1) THEN
1733 414313 : Jp2stb = 1
1734 414313 : J2stb = 1
1735 414313 : Jp2stf = 1
1736 414313 : J2stf = 1
1737 :
1738 : ! transform along x axis
1739 414313 : lot = ncache/(4*n1)
1740 414313 : IF (lot .LT. 1) THEN
1741 : WRITE (*, *) &
1742 : 'convolxc_on:ncache has to be enlarged to be able to hold at'// &
1743 : 'least one 1-d FFT of this size even though this will'// &
1744 0 : 'reduce the performance for shorter transform lengths'
1745 0 : CPABORT("")
1746 : END IF
1747 :
1748 1329862 : DO j = 1, n2/2, lot
1749 915549 : ma = j
1750 915549 : mb = MIN(j + (lot - 1), n2/2)
1751 915549 : nfft = mb - ma + 1
1752 :
1753 : !reverse index ordering, leaving the planes to be transformed at the end
1754 : !input: I1,J2,j3,Jp2,(jp3)
1755 915549 : IF (nproc .EQ. 1) THEN
1756 412311 : CALL mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi2, zw(1, 1, 1))
1757 : ELSE
1758 503238 : CALL mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw(1, 1, 1))
1759 : END IF
1760 : !output: J2,Jp2,I1,j3,(jp3)
1761 :
1762 : !performing FFT
1763 : !input: I2,I1,j3,(jp3)
1764 915549 : inzee = 1
1765 2910285 : DO i = 1, ic1 - 1
1766 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1767 1994736 : btrig1, after1(i), now1(i), before1(i), 1)
1768 2910285 : inzee = 3 - inzee
1769 : END DO
1770 :
1771 : !storing the last step into zt array
1772 915549 : i = ic1
1773 : CALL fftstp(lot, nfft, n1, lzt, n1, zw(1, 1, inzee), zt(1, j, 1), &
1774 1329862 : btrig1, after1(i), now1(i), before1(i), 1)
1775 : !output: I2,i1,j3,(jp3)
1776 : END DO
1777 :
1778 : !transform along y axis
1779 414313 : lot = ncache/(4*n2)
1780 414313 : IF (lot .LT. 1) THEN
1781 : WRITE (*, *) &
1782 : 'convolxc_on:ncache has to be enlarged to be able to hold at'// &
1783 : 'least one 1-d FFT of this size even though this will'// &
1784 0 : 'reduce the performance for shorter transform lengths'
1785 0 : CPABORT("")
1786 : END IF
1787 :
1788 2114973 : DO j = 1, n1, lot
1789 1700660 : ma = j
1790 1700660 : mb = MIN(j + (lot - 1), n1)
1791 1700660 : nfft = mb - ma + 1
1792 :
1793 : !reverse ordering
1794 : !input: I2,i1,j3,(jp3)
1795 1700660 : CALL switch_upcorn(nfft, n2, lot, n1, lzt, zt(1, 1, j), zw(1, 1, 1))
1796 : !output: i1,I2,j3,(jp3)
1797 :
1798 : !performing FFT
1799 : !input: i1,I2,j3,(jp3)
1800 1700660 : inzee = 1
1801 7139300 : DO i = 1, ic2
1802 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1803 5438640 : btrig2, after2(i), now2(i), before2(i), 1)
1804 7139300 : inzee = 3 - inzee
1805 : END DO
1806 : !output: i1,i2,j3,(jp3)
1807 :
1808 : !Multiply with kernel in fourier space
1809 1700660 : CALL multkernel(nd1, nd2, n1, n2, lot, nfft, j, pot(1, 1, j3), zw(1, 1, inzee))
1810 :
1811 : !TRANSFORM BACK IN REAL SPACE
1812 :
1813 : !transform along y axis
1814 : !input: i1,i2,j3,(jp3)
1815 7139300 : DO i = 1, ic2
1816 : CALL fftstp(lot, nfft, n2, lot, n2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1817 5438640 : ftrig2, after2(i), now2(i), before2(i), -1)
1818 7139300 : inzee = 3 - inzee
1819 : END DO
1820 :
1821 : !reverse ordering
1822 : !input: i1,I2,j3,(jp3)
1823 2114973 : CALL unswitch_downcorn(nfft, n2, lot, n1, lzt, zw(1, 1, inzee), zt(1, 1, j))
1824 : !output: I2,i1,j3,(jp3)
1825 : END DO
1826 :
1827 : !transform along x axis
1828 : !input: I2,i1,j3,(jp3)
1829 414313 : lot = ncache/(4*n1)
1830 1329862 : DO j = 1, n2/2, lot
1831 915549 : ma = j
1832 915549 : mb = MIN(j + (lot - 1), n2/2)
1833 915549 : nfft = mb - ma + 1
1834 :
1835 : !performing FFT
1836 915549 : i = 1
1837 : CALL fftstp(lzt, nfft, n1, lot, n1, zt(1, j, 1), zw(1, 1, 1), &
1838 915549 : ftrig1, after1(i), now1(i), before1(i), -1)
1839 :
1840 915549 : inzee = 1
1841 2910285 : DO i = 2, ic1
1842 : CALL fftstp(lot, nfft, n1, lot, n1, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1843 1994736 : ftrig1, after1(i), now1(i), before1(i), -1)
1844 2910285 : inzee = 3 - inzee
1845 : END DO
1846 : !output: I2,I1,j3,(jp3)
1847 :
1848 : !reverse ordering
1849 : !input: J2,Jp2,I1,j3,(jp3)
1850 1329862 : IF (nproc .EQ. 1) THEN
1851 412311 : CALL unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi2)
1852 : ELSE
1853 503238 : CALL unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw(1, 1, inzee), zmpi1)
1854 : END IF
1855 : ! output: I1,J2,j3,Jp2,(jp3)
1856 : END DO
1857 : END IF
1858 : END DO
1859 :
1860 : !Interprocessor data transposition
1861 : !input: I1,J2,j3,Jp2,(jp3)
1862 15250 : IF (nproc .GT. 1) THEN
1863 : !communication scheduling
1864 7738 : CALL mpi_group%alltoall(zmpi1, zmpi2, n1*(md2/nproc)*(nd3/nproc))
1865 : !output: I1,J2,j3,jp3,(Jp2)
1866 : END IF
1867 :
1868 : !transform along z axis
1869 : !input: I1,J2,i3,(Jp2)
1870 15250 : lot = ncache/(2*n3)
1871 411022 : DO j2 = 1, md2/nproc
1872 : !this condition ensures that we manage only the interesting part for the FFT
1873 411022 : IF (iproc*(md2/nproc) + j2 .LE. n2/2) THEN
1874 936466 : DO i1 = 1, (n1/2), lot
1875 541760 : ma = i1
1876 541760 : mb = MIN(i1 + (lot - 1), (n1/2))
1877 541760 : nfft = mb - ma + 1
1878 :
1879 : !reverse ordering and repack the FFT data in order to be backward HalFFT transformed
1880 : !input: I1,J2,i3,(Jp2)
1881 541760 : CALL unscramble_pack(i1, j2, lot, nfft, n1/2, n3, md2, nproc, nd3, zmpi2, zw(1, 1, 1), cosinarr)
1882 : !output: I1,i3,J2,(Jp2)
1883 :
1884 : !performing FFT
1885 : !input: I1,i3,J2,(Jp2)
1886 541760 : inzee = 1
1887 1912076 : DO i = 1, ic3
1888 : CALL fftstp(lot, nfft, n3/2, lot, n3/2, zw(1, 1, inzee), zw(1, 1, 3 - inzee), &
1889 1370316 : ftrig3, after3(i), now3(i), before3(i), -1)
1890 1912076 : inzee = 3 - inzee
1891 : END DO
1892 : !output: I1,I3,J2,(Jp2)
1893 :
1894 : !calculates the exchange correlation terms locally and rebuild the output array
1895 : CALL unfill_downcorn(md1, md3, lot, nfft, n3, zw(1, 1, inzee), zf(i1, 1, j2) &
1896 936466 : , scal) !,ehartreetmp)
1897 :
1898 : !integrate local pieces together
1899 : !ehartree=ehartree+0.5_dp*ehartreetmp*hgrid**3
1900 : END DO
1901 : END IF
1902 : END DO
1903 :
1904 : !De-allocations
1905 15250 : DEALLOCATE (btrig1)
1906 15250 : DEALLOCATE (ftrig1)
1907 15250 : DEALLOCATE (after1)
1908 15250 : DEALLOCATE (now1)
1909 15250 : DEALLOCATE (before1)
1910 15250 : DEALLOCATE (btrig2)
1911 15250 : DEALLOCATE (ftrig2)
1912 15250 : DEALLOCATE (after2)
1913 15250 : DEALLOCATE (now2)
1914 15250 : DEALLOCATE (before2)
1915 15250 : DEALLOCATE (btrig3)
1916 15250 : DEALLOCATE (ftrig3)
1917 15250 : DEALLOCATE (after3)
1918 15250 : DEALLOCATE (now3)
1919 15250 : DEALLOCATE (before3)
1920 15250 : DEALLOCATE (zmpi2)
1921 15250 : DEALLOCATE (zw)
1922 15250 : DEALLOCATE (zt)
1923 15250 : DEALLOCATE (cosinarr)
1924 15250 : IF (nproc .GT. 1) DEALLOCATE (zmpi1)
1925 :
1926 15250 : END SUBROUTINE F_PoissonSolver
1927 :
1928 : ! **************************************************************************************************
1929 : !> \brief ...
1930 : !> \param nfft ...
1931 : !> \param n2 ...
1932 : !> \param lot ...
1933 : !> \param n1 ...
1934 : !> \param lzt ...
1935 : !> \param zt ...
1936 : !> \param zw ...
1937 : ! **************************************************************************************************
1938 1700660 : SUBROUTINE switch_upcorn(nfft, n2, lot, n1, lzt, zt, zw)
1939 : INTEGER :: nfft, n2, lot, n1, lzt
1940 : REAL(KIND=dp) :: zt(2, lzt, n1), zw(2, lot, n2)
1941 :
1942 : INTEGER :: i, j
1943 :
1944 : ! WARNING: Assuming that high frequencies are in the corners
1945 : ! and that n2 is multiple of 2
1946 : ! Low frequencies
1947 :
1948 34600720 : DO j = 1, nfft
1949 1505929116 : DO i = n2/2 + 1, n2
1950 1471328396 : zw(1, j, i) = zt(1, i - n2/2, j)
1951 1504228456 : zw(2, j, i) = zt(2, i - n2/2, j)
1952 : END DO
1953 : END DO
1954 : ! High frequencies
1955 83947372 : DO i = 1, n2/2
1956 1555275768 : DO j = 1, nfft
1957 1471328396 : zw(1, j, i) = 0._dp
1958 1553575108 : zw(2, j, i) = 0._dp
1959 : END DO
1960 : END DO
1961 1700660 : END SUBROUTINE switch_upcorn
1962 :
1963 : ! **************************************************************************************************
1964 : !> \brief ...
1965 : !> \param j3 ...
1966 : !> \param nfft ...
1967 : !> \param Jp2stb ...
1968 : !> \param J2stb ...
1969 : !> \param lot ...
1970 : !> \param n1 ...
1971 : !> \param md2 ...
1972 : !> \param nd3 ...
1973 : !> \param nproc ...
1974 : !> \param zmpi1 ...
1975 : !> \param zw ...
1976 : ! **************************************************************************************************
1977 915549 : SUBROUTINE mpiswitch_upcorn(j3, nfft, Jp2stb, J2stb, lot, n1, md2, nd3, nproc, zmpi1, zw)
1978 : INTEGER :: j3, nfft, Jp2stb, J2stb, lot, n1, md2, &
1979 : nd3, nproc
1980 : REAL(KIND=dp) :: zmpi1(2, n1/2, md2/nproc, nd3/nproc, nproc), zw(2, lot, n1)
1981 :
1982 : INTEGER :: i1, j2, jp2, mfft
1983 :
1984 : ! WARNING: Assuming that high frequencies are in the corners
1985 : ! and that n1 is multiple of 2
1986 :
1987 915549 : mfft = 0
1988 1468055 : Main: DO Jp2 = Jp2stb, nproc
1989 17543856 : DO J2 = J2stb, md2/nproc
1990 16991350 : mfft = mfft + 1
1991 16991350 : IF (mfft .GT. nfft) THEN
1992 541320 : Jp2stb = Jp2
1993 541320 : J2stb = J2
1994 541320 : EXIT Main
1995 : END IF
1996 752114228 : DO I1 = 1, n1/2
1997 735664198 : zw(1, mfft, I1) = 0._dp
1998 752114228 : zw(2, mfft, I1) = 0._dp
1999 : END DO
2000 752666734 : DO I1 = n1/2 + 1, n1
2001 735664198 : zw(1, mfft, I1) = zmpi1(1, I1 - n1/2, J2, j3, Jp2)
2002 752114228 : zw(2, mfft, I1) = zmpi1(2, I1 - n1/2, J2, j3, Jp2)
2003 : END DO
2004 : END DO
2005 926735 : J2stb = 1
2006 : END DO Main
2007 915549 : END SUBROUTINE mpiswitch_upcorn
2008 :
2009 : ! **************************************************************************************************
2010 : !> \brief ...
2011 : !> \param nfft ...
2012 : !> \param n2 ...
2013 : !> \param lot ...
2014 : !> \param n1 ...
2015 : !> \param lzt ...
2016 : !> \param zw ...
2017 : !> \param zt ...
2018 : ! **************************************************************************************************
2019 1700660 : SUBROUTINE unswitch_downcorn(nfft, n2, lot, n1, lzt, zw, zt)
2020 : INTEGER :: nfft, n2, lot, n1, lzt
2021 : REAL(KIND=dp) :: zw(2, lot, n2), zt(2, lzt, n1)
2022 :
2023 : INTEGER :: i, j
2024 :
2025 : ! WARNING: Assuming that high frequencies are in the corners
2026 : ! and that n2 is multiple of 2
2027 : ! Low frequencies
2028 :
2029 34600720 : DO j = 1, nfft
2030 1505929116 : DO i = 1, n2/2
2031 1471328396 : zt(1, i, j) = zw(1, j, i)
2032 1504228456 : zt(2, i, j) = zw(2, j, i)
2033 : END DO
2034 : END DO
2035 1700660 : RETURN
2036 : END SUBROUTINE unswitch_downcorn
2037 :
2038 : ! **************************************************************************************************
2039 : !> \brief ...
2040 : !> \param j3 ...
2041 : !> \param nfft ...
2042 : !> \param Jp2stf ...
2043 : !> \param J2stf ...
2044 : !> \param lot ...
2045 : !> \param n1 ...
2046 : !> \param md2 ...
2047 : !> \param nd3 ...
2048 : !> \param nproc ...
2049 : !> \param zw ...
2050 : !> \param zmpi1 ...
2051 : ! **************************************************************************************************
2052 915549 : SUBROUTINE unmpiswitch_downcorn(j3, nfft, Jp2stf, J2stf, lot, n1, md2, nd3, nproc, zw, zmpi1)
2053 : INTEGER :: j3, nfft, Jp2stf, J2stf, lot, n1, md2, &
2054 : nd3, nproc
2055 : REAL(KIND=dp) :: zw(2, lot, n1), zmpi1(2, n1/2, md2/nproc, nd3/nproc, nproc)
2056 :
2057 : INTEGER :: i1, j2, jp2, mfft
2058 :
2059 : ! WARNING: Assuming that high frequencies are in the corners
2060 : ! and that n1 is multiple of 2
2061 :
2062 915549 : mfft = 0
2063 1468055 : Main: DO Jp2 = Jp2stf, nproc
2064 17543856 : DO J2 = J2stf, md2/nproc
2065 16991350 : mfft = mfft + 1
2066 16991350 : IF (mfft .GT. nfft) THEN
2067 541320 : Jp2stf = Jp2
2068 541320 : J2stf = J2
2069 541320 : EXIT Main
2070 : END IF
2071 752666734 : DO I1 = 1, n1/2
2072 735664198 : zmpi1(1, I1, J2, j3, Jp2) = zw(1, mfft, I1)
2073 752114228 : zmpi1(2, I1, J2, j3, Jp2) = zw(2, mfft, I1)
2074 : END DO
2075 : END DO
2076 926735 : J2stf = 1
2077 : END DO Main
2078 915549 : END SUBROUTINE unmpiswitch_downcorn
2079 :
2080 : ! **************************************************************************************************
2081 : !> \brief (Based on suitable modifications of S.Goedecker routines)
2082 : !> Restore data into output array, calculating in the meanwhile
2083 : !> Hartree energy of the potential
2084 : !> \param md1 Dimensions of the undistributed part of the real grid
2085 : !> \param md3 Dimensions of the undistributed part of the real grid
2086 : !> \param lot ...
2087 : !> \param nfft number of planes
2088 : !> \param n3 (twice the) dimension of the last FFTtransform.
2089 : !> \param zw FFT work array
2090 : !> \param zf Original distributed density as well as
2091 : !> Distributed solution of the poisson equation (inout)
2092 : !> \param scal Needed to achieve unitarity and correct dimensions
2093 : !> \param ehartreetmp Hartree energy
2094 : !> \date February 2006
2095 : !> \author S. Goedecker, L. Genovese
2096 : !> \note Assuming that high frequencies are in the corners
2097 : !> and that n3 is multiple of 4
2098 : !>
2099 : !> RESTRICTIONS on USAGE
2100 : !> Copyright (C) Stefan Goedecker, Cornell University, Ithaca, USA, 1994
2101 : !> Copyright (C) Stefan Goedecker, MPI Stuttgart, Germany, 1999
2102 : !> Copyright (C) 2002 Stefan Goedecker, CEA Grenoble
2103 : !> This file is distributed under the terms of the
2104 : !> GNU General Public License, see http://www.gnu.org/copyleft/gpl.txt .
2105 : ! **************************************************************************************************
2106 0 : SUBROUTINE F_unfill_downcorn(md1, md3, lot, nfft, n3, zw, zf, scal, ehartreetmp)
2107 : INTEGER, INTENT(in) :: md1, md3, lot, nfft, n3
2108 : REAL(KIND=dp), DIMENSION(2, lot, n3/2), INTENT(in) :: zw
2109 : REAL(KIND=dp), DIMENSION(md1, md3), INTENT(inout) :: zf
2110 : REAL(KIND=dp), INTENT(in) :: scal
2111 : REAL(KIND=dp), INTENT(out) :: ehartreetmp
2112 :
2113 : INTEGER :: i1, i3
2114 : REAL(KIND=dp) :: pot1
2115 :
2116 0 : ehartreetmp = 0._dp
2117 0 : DO i3 = 1, n3/4
2118 0 : DO i1 = 1, nfft
2119 0 : pot1 = scal*zw(1, i1, i3)
2120 0 : ehartreetmp = ehartreetmp + pot1*zf(i1, 2*i3 - 1)
2121 0 : zf(i1, 2*i3 - 1) = pot1
2122 0 : pot1 = scal*zw(2, i1, i3)
2123 0 : ehartreetmp = ehartreetmp + pot1*zf(i1, 2*i3)
2124 0 : zf(i1, 2*i3) = pot1
2125 : END DO
2126 : END DO
2127 0 : END SUBROUTINE F_unfill_downcorn
2128 :
2129 : END MODULE ps_wavelet_base
|