Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <limits.h>
10 : #include <math.h>
11 : #include <stdio.h>
12 : #include <stdlib.h>
13 : #include <string.h>
14 :
15 : #if defined(__AVX2__) && defined(__FMA__)
16 : #include <immintrin.h>
17 : #endif
18 :
19 : #include "../common/grid_common.h"
20 : #include "../common/grid_library.h"
21 : #include "../common/grid_sphere_cache.h"
22 :
23 : #define GRID_MAX_LP_OPTIMIZED 9
24 :
25 : #if (GRID_DO_COLLOCATE)
26 : #define GRID_CONST_WHEN_COLLOCATE const
27 : #define GRID_CONST_WHEN_INTEGRATE
28 : #else
29 : #define GRID_CONST_WHEN_COLLOCATE
30 : #define GRID_CONST_WHEN_INTEGRATE const
31 : #endif
32 :
33 : /*******************************************************************************
34 : * \brief Simple loop body for ortho_cx_to_grid using plain C.
35 : * \author Ole Schuett
36 : ******************************************************************************/
37 : static inline void __attribute__((always_inline))
38 27595618182 : ortho_cx_to_grid_scalar(const int lp, const int cmax, const int i,
39 27595618182 : const double pol[3][lp + 1][2 * cmax + 1],
40 : GRID_CONST_WHEN_COLLOCATE double *cx,
41 : GRID_CONST_WHEN_INTEGRATE double *grid_0,
42 : GRID_CONST_WHEN_INTEGRATE double *grid_1,
43 : GRID_CONST_WHEN_INTEGRATE double *grid_2,
44 : GRID_CONST_WHEN_INTEGRATE double *grid_3) {
45 :
46 : #if (GRID_DO_COLLOCATE)
47 : // collocate
48 14472002262 : double reg[4] = {0.0, 0.0, 0.0, 0.0};
49 >11577*10^7 : #pragma omp simd reduction(+ : reg)
50 : for (int lxp = 0; lxp <= lp; lxp++) {
51 38682043496 : const double p = pol[0][lxp][i + cmax];
52 38682043496 : reg[0] += cx[lxp * 4 + 0] * p;
53 38682043496 : reg[1] += cx[lxp * 4 + 1] * p;
54 38682043496 : reg[2] += cx[lxp * 4 + 2] * p;
55 38682043496 : reg[3] += cx[lxp * 4 + 3] * p;
56 : }
57 14472002262 : *grid_0 += reg[0];
58 14472002262 : *grid_1 += reg[1];
59 14472002262 : *grid_2 += reg[2];
60 14472002262 : *grid_3 += reg[3];
61 :
62 : #else
63 : // integrate
64 13123615920 : const double reg[4] = {*grid_0, *grid_1, *grid_2, *grid_3};
65 : #pragma omp simd
66 13123615920 : for (int lxp = 0; lxp <= lp; lxp++) {
67 35809607248 : const double p = pol[0][lxp][i + cmax];
68 35809607248 : cx[lxp * 4 + 0] += reg[0] * p;
69 35809607248 : cx[lxp * 4 + 1] += reg[1] * p;
70 35809607248 : cx[lxp * 4 + 2] += reg[2] * p;
71 35809607248 : cx[lxp * 4 + 3] += reg[3] * p;
72 : }
73 : #endif
74 : }
75 :
76 : /*******************************************************************************
77 : * \brief Optimized loop body for ortho_cx_to_grid using AVX2 Intel Intrinsics.
78 : * This routine always processes four consecutive grid elements at once.
79 : * \author Ole Schuett
80 : ******************************************************************************/
81 : #if defined(__AVX2__) && defined(__FMA__)
82 : static inline void __attribute__((always_inline))
83 43582031961 : ortho_cx_to_grid_avx2(const int lp, const int cmax, const int i,
84 43582031961 : const double pol[3][lp + 1][2 * cmax + 1],
85 : GRID_CONST_WHEN_COLLOCATE double *cx,
86 : GRID_CONST_WHEN_INTEGRATE double *grid_0,
87 : GRID_CONST_WHEN_INTEGRATE double *grid_1,
88 : GRID_CONST_WHEN_INTEGRATE double *grid_2,
89 : GRID_CONST_WHEN_INTEGRATE double *grid_3) {
90 :
91 43582031961 : const int icmax = i + cmax;
92 :
93 : #if (GRID_DO_COLLOCATE)
94 : // collocate
95 : // First iteration for lxp == 0 does not need add instructions.
96 22989638082 : __m256d p_vec = _mm256_loadu_pd(&pol[0][0][icmax]);
97 22989638082 : __m256d r_vec_0 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[0]));
98 22989638082 : __m256d r_vec_1 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[1]));
99 22989638082 : __m256d r_vec_2 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[2]));
100 22989638082 : __m256d r_vec_3 = _mm256_mul_pd(p_vec, _mm256_set1_pd(cx[3]));
101 :
102 : // Remaining iterations for lxp > 0 use fused multiply adds.
103 : GRID_PRAGMA_UNROLL_UP_TO(GRID_MAX_LP_OPTIMIZED)
104 60785750173 : for (int lxp = 1; lxp <= lp; lxp++) {
105 37796112091 : const double *cx_base = &cx[lxp * 4];
106 37796112091 : p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
107 37796112091 : r_vec_0 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[0]), r_vec_0);
108 37796112091 : r_vec_1 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[1]), r_vec_1);
109 37796112091 : r_vec_2 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[2]), r_vec_2);
110 37796112091 : r_vec_3 = _mm256_fmadd_pd(p_vec, _mm256_set1_pd(cx_base[3]), r_vec_3);
111 : }
112 :
113 : // Add vectors to grid one at a time, because they can aliase when cube wraps.
114 22989638082 : _mm256_storeu_pd(grid_0, _mm256_add_pd(_mm256_loadu_pd(grid_0), r_vec_0));
115 22989638082 : _mm256_storeu_pd(grid_1, _mm256_add_pd(_mm256_loadu_pd(grid_1), r_vec_1));
116 22989638082 : _mm256_storeu_pd(grid_2, _mm256_add_pd(_mm256_loadu_pd(grid_2), r_vec_2));
117 22989638082 : _mm256_storeu_pd(grid_3, _mm256_add_pd(_mm256_loadu_pd(grid_3), r_vec_3));
118 :
119 : #else
120 : // integrate
121 20592393879 : __m256d grid_vec_0 = _mm256_loadu_pd(grid_0);
122 20592393879 : __m256d grid_vec_1 = _mm256_loadu_pd(grid_1);
123 20592393879 : __m256d grid_vec_2 = _mm256_loadu_pd(grid_2);
124 20592393879 : __m256d grid_vec_3 = _mm256_loadu_pd(grid_3);
125 :
126 : GRID_PRAGMA_UNROLL_UP_TO(GRID_MAX_LP_OPTIMIZED + 1)
127 76091103019 : for (int lxp = 0; lxp <= lp; lxp++) {
128 55498709140 : __m256d p_vec = _mm256_loadu_pd(&pol[0][lxp][icmax]);
129 :
130 : // Do 4 dot products at once. https://stackoverflow.com/a/10454420
131 55498709140 : __m256d xy0 = _mm256_mul_pd(p_vec, grid_vec_0);
132 55498709140 : __m256d xy1 = _mm256_mul_pd(p_vec, grid_vec_1);
133 55498709140 : __m256d xy2 = _mm256_mul_pd(p_vec, grid_vec_2);
134 55498709140 : __m256d xy3 = _mm256_mul_pd(p_vec, grid_vec_3);
135 :
136 : // low to high: xy00+xy01 xy10+xy11 xy02+xy03 xy12+xy13
137 55498709140 : __m256d temp01 = _mm256_hadd_pd(xy0, xy1);
138 :
139 : // low to high: xy20+xy21 xy30+xy31 xy22+xy23 xy32+xy33
140 55498709140 : __m256d temp23 = _mm256_hadd_pd(xy2, xy3);
141 :
142 : // low to high: xy02+xy03 xy12+xy13 xy20+xy21 xy30+xy31
143 55498709140 : __m256d swapped = _mm256_permute2f128_pd(temp01, temp23, 0x21);
144 :
145 : // low to high: xy00+xy01 xy10+xy11 xy22+xy23 xy32+xy33
146 55498709140 : __m256d blended = _mm256_blend_pd(temp01, temp23, 0b1100);
147 :
148 55498709140 : __m256d r_vec = _mm256_add_pd(swapped, blended);
149 :
150 : // cx += r_vec
151 55498709140 : double *cx_base = &cx[lxp * 4];
152 55498709140 : _mm256_storeu_pd(cx_base, _mm256_add_pd(r_vec, _mm256_loadu_pd(cx_base)));
153 : }
154 : #endif
155 : }
156 : #endif // __AVX2__ && __FMA__
157 :
158 : /*******************************************************************************
159 : * \brief Collocates coefficients C_x onto the grid for orthorhombic case.
160 : * \author Ole Schuett
161 : ******************************************************************************/
162 : static inline void __attribute__((always_inline))
163 11046662572 : ortho_cx_to_grid(const int lp, const int kg1, const int kg2, const int jg1,
164 : const int jg2, const int cmax,
165 11046662572 : const double pol[3][lp + 1][2 * cmax + 1],
166 : const int map[3][2 * cmax + 1],
167 : const int sections[3][2 * cmax + 1], const int npts_local[3],
168 : int **sphere_bounds_iter, GRID_CONST_WHEN_COLLOCATE double *cx,
169 : GRID_CONST_WHEN_INTEGRATE double *grid) {
170 :
171 : // Lower and upper sphere bounds relative to center, ie. in cube coordinates.
172 11046662572 : const int lb = *((*sphere_bounds_iter)++);
173 11046662572 : const int ub = 1 - lb;
174 :
175 : // AVX instructions can only load/store from evenly spaced memory locations.
176 : // Since the sphere bounds can wrap around due to the grid's periodicity,
177 : // the inner loop runs over sections with homogeneous cube to grid mapping.
178 31255744235 : for (int istart = lb; istart <= ub; istart++) {
179 20209081663 : const int istop = imin(ub, istart + sections[0][istart + cmax]);
180 20209081663 : const int cube2grid = map[0][istart + cmax] - istart;
181 :
182 20209081663 : const int stride = npts_local[1] * npts_local[0];
183 20209081663 : const int grid_index_0 = kg1 * stride + jg1 * npts_local[0];
184 20209081663 : const int grid_index_1 = kg2 * stride + jg1 * npts_local[0];
185 20209081663 : const int grid_index_2 = kg1 * stride + jg2 * npts_local[0];
186 20209081663 : const int grid_index_3 = kg2 * stride + jg2 * npts_local[0];
187 20209081663 : GRID_CONST_WHEN_INTEGRATE double *grid_base_0 = &grid[grid_index_0];
188 20209081663 : GRID_CONST_WHEN_INTEGRATE double *grid_base_1 = &grid[grid_index_1];
189 20209081663 : GRID_CONST_WHEN_INTEGRATE double *grid_base_2 = &grid[grid_index_2];
190 20209081663 : GRID_CONST_WHEN_INTEGRATE double *grid_base_3 = &grid[grid_index_3];
191 :
192 : // Use AVX2 to process grid points in chunks of four, ie. 256 bit vectors.
193 : #if defined(__AVX2__) && defined(__FMA__)
194 20209081663 : const int istop_vec = istart + 4 * ((istop - istart + 1) / 4) - 1;
195 63791113624 : for (int i = istart; i <= istop_vec; i += 4) {
196 43582031961 : const int ig = i + cube2grid;
197 43582031961 : ortho_cx_to_grid_avx2(lp, cmax, i, pol, cx, &grid_base_0[ig],
198 : &grid_base_1[ig], &grid_base_2[ig],
199 43582031961 : &grid_base_3[ig]);
200 : }
201 47804699845 : istart = istop_vec + 1;
202 : #endif
203 :
204 : // Process up to 3 remaining points - or everything if AVX2 isn't available.
205 47804699845 : for (int i = istart; i <= istop; i++) {
206 27595618182 : const int ig = i + cube2grid;
207 27595618182 : ortho_cx_to_grid_scalar(lp, cmax, i, pol, cx, &grid_base_0[ig],
208 : &grid_base_1[ig], &grid_base_2[ig],
209 27595618182 : &grid_base_3[ig]);
210 : }
211 20209081663 : istart = istop;
212 : }
213 : }
214 :
215 : /*******************************************************************************
216 : * \brief Transforms coefficients C_xy into C_x by fixing grid index j.
217 : * \author Ole Schuett
218 : ******************************************************************************/
219 : static inline void __attribute__((always_inline))
220 11046662572 : ortho_cxy_to_cx(const int lp, const int j1, const int j2, const int cmax,
221 11046662572 : const double pol[3][lp + 1][2 * cmax + 1],
222 : GRID_CONST_WHEN_COLLOCATE double *cxy,
223 : GRID_CONST_WHEN_INTEGRATE double *cx) {
224 :
225 40269907756 : for (int lyp = 0; lyp <= lp; lyp++) {
226 90765785027 : for (int lxp = 0; lxp <= lp - lyp; lxp++) {
227 61542539843 : const double p1 = pol[1][lyp][j1 + cmax];
228 61542539843 : const double p2 = pol[1][lyp][j2 + cmax];
229 61542539843 : const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
230 :
231 : #if (GRID_DO_COLLOCATE)
232 : // collocate
233 31547639963 : cx[lxp * 4 + 0] += cxy[cxy_index + 0] * p1;
234 31547639963 : cx[lxp * 4 + 1] += cxy[cxy_index + 1] * p1;
235 31547639963 : cx[lxp * 4 + 2] += cxy[cxy_index + 0] * p2;
236 31547639963 : cx[lxp * 4 + 3] += cxy[cxy_index + 1] * p2;
237 : #else
238 : // integrate
239 29994899880 : cxy[cxy_index + 0] += cx[lxp * 4 + 0] * p1;
240 29994899880 : cxy[cxy_index + 1] += cx[lxp * 4 + 1] * p1;
241 29994899880 : cxy[cxy_index + 0] += cx[lxp * 4 + 2] * p2;
242 29994899880 : cxy[cxy_index + 1] += cx[lxp * 4 + 3] * p2;
243 : #endif
244 : }
245 : }
246 : }
247 :
248 : /*******************************************************************************
249 : * \brief Loop body of ortho_cxy_to_grid to be inlined for low values of lp.
250 : * \author Ole Schuett
251 : ******************************************************************************/
252 11046662572 : static inline void __attribute__((always_inline)) ortho_cxy_to_grid_low(
253 : const int lp, const int j1, const int j2, const int kg1, const int kg2,
254 : const int jg1, const int jg2, const int cmax,
255 11046662572 : const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
256 : const int sections[3][2 * cmax + 1], const int npts_local[3],
257 : int **sphere_bounds_iter, double *cx, GRID_CONST_WHEN_COLLOCATE double *cxy,
258 : GRID_CONST_WHEN_INTEGRATE double *grid) {
259 :
260 : #if (GRID_DO_COLLOCATE)
261 : // collocate
262 11512776380 : ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
263 5756388190 : ortho_cx_to_grid(lp, kg1, kg2, jg1, jg2, cmax, pol, map, sections, npts_local,
264 : sphere_bounds_iter, cx, grid);
265 : #else
266 : // integrate
267 10580548764 : ortho_cx_to_grid(lp, kg1, kg2, jg1, jg2, cmax, pol, map, sections, npts_local,
268 : sphere_bounds_iter, cx, grid);
269 5290274382 : ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
270 : #endif
271 : }
272 :
273 : /*******************************************************************************
274 : * \brief Collocates coefficients C_xy onto the grid for orthorhombic case.
275 : * \author Ole Schuett
276 : ******************************************************************************/
277 1247170183 : static inline void ortho_cxy_to_grid(
278 : const int lp, const int kg1, const int kg2, const int cmax,
279 1247170183 : const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
280 : const int sections[3][2 * cmax + 1], const int npts_local[3],
281 : int **sphere_bounds_iter, GRID_CONST_WHEN_COLLOCATE double *cxy,
282 1247170183 : GRID_CONST_WHEN_INTEGRATE double *grid) {
283 :
284 : // The cube contains an even number of grid points in each direction and
285 : // collocation is always performed on a pair of two opposing grid points.
286 : // Hence, the points with index 0 and 1 are both assigned distance zero via
287 : // the formular distance=(2*index-1)/2.
288 :
289 1247170183 : const int jstart = *((*sphere_bounds_iter)++);
290 1247170183 : const size_t cx_size = (lp + 1) * 4;
291 1247170183 : double cx[cx_size];
292 12293832755 : for (int j1 = jstart; j1 <= 0; j1++) {
293 11046662572 : const int j2 = 1 - j1;
294 11046662572 : const int jg1 = map[1][j1 + cmax];
295 11046662572 : const int jg2 = map[1][j2 + cmax];
296 :
297 11046662572 : memset(cx, 0, cx_size * sizeof(double));
298 :
299 : // Generate separate branches for low values of lp gives up to 30% speedup.
300 11046662572 : if (lp <= GRID_MAX_LP_OPTIMIZED) {
301 : GRID_PRAGMA_UNROLL(GRID_MAX_LP_OPTIMIZED + 1)
302 >12151*10^7 : for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
303 >11046*10^7 : if (lp == ilp) {
304 >12151*10^7 : ortho_cxy_to_grid_low(ilp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
305 : sections, npts_local, sphere_bounds_iter, cx,
306 : cxy, grid);
307 : }
308 : }
309 : } else {
310 11046666190 : ortho_cxy_to_grid_low(lp, j1, j2, kg1, kg2, jg1, jg2, cmax, pol, map,
311 : sections, npts_local, sphere_bounds_iter, cx, cxy,
312 : grid);
313 : }
314 : }
315 1247170183 : }
316 :
317 : /*******************************************************************************
318 : * \brief Transforms coefficients C_xyz into C_xz by fixing grid index k.
319 : * \author Ole Schuett
320 : ******************************************************************************/
321 1247170183 : static inline void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2,
322 : const int cmax,
323 1247170183 : const double pol[3][lp + 1][2 * cmax + 1],
324 : GRID_CONST_WHEN_COLLOCATE double *cxyz,
325 : GRID_CONST_WHEN_INTEGRATE double *cxy) {
326 :
327 4481087763 : for (int lzp = 0; lzp <= lp; lzp++) {
328 9963551915 : for (int lyp = 0; lyp <= lp - lzp; lyp++) {
329 19114200491 : for (int lxp = 0; lxp <= lp - lzp - lyp; lxp++) {
330 12384566156 : const double p1 = pol[2][lzp][k1 + cmax];
331 12384566156 : const double p2 = pol[2][lzp][k2 + cmax];
332 12384566156 : const int cxyz_index =
333 12384566156 : lzp * (lp + 1) * (lp + 1) + lyp * (lp + 1) + lxp; // [lzp, lyp, lxp]
334 12384566156 : const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
335 :
336 : #if (GRID_DO_COLLOCATE)
337 : // collocate
338 6128957420 : cxy[cxy_index + 0] += cxyz[cxyz_index] * p1;
339 6128957420 : cxy[cxy_index + 1] += cxyz[cxyz_index] * p2;
340 : #else
341 : // integrate
342 6255608736 : cxyz[cxyz_index] += cxy[cxy_index + 0] * p1;
343 6255608736 : cxyz[cxyz_index] += cxy[cxy_index + 1] * p2;
344 : #endif
345 : }
346 : }
347 : }
348 1247170183 : }
349 :
350 : /*******************************************************************************
351 : * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
352 : * \author Ole Schuett
353 : ******************************************************************************/
354 : static inline void
355 121891892 : ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3],
356 : const double dh_inv[3][3], const double rp[3],
357 : const int npts_global[3], const int npts_local[3],
358 : const int shift_local[3], const double radius,
359 : GRID_CONST_WHEN_COLLOCATE double *cxyz,
360 121891892 : GRID_CONST_WHEN_INTEGRATE double *grid) {
361 :
362 : // *** position of the gaussian product
363 : //
364 : // this is the actual definition of the position on the grid
365 : // i.e. a point rp(:) gets here grid coordinates
366 : // MODULO(rp(:)/dr(:),npts_global(:))+1
367 : // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
368 : // and (1,1,1) on grid here in C.
369 :
370 : // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
371 121891892 : int cubecenter[3];
372 487567568 : for (int i = 0; i < 3; i++) {
373 : double dh_inv_rp = 0.0;
374 1462702704 : for (int j = 0; j < 3; j++) {
375 1097027028 : dh_inv_rp += dh_inv[j][i] * rp[j];
376 : }
377 365675676 : cubecenter[i] = (int)floor(dh_inv_rp);
378 : }
379 :
380 : double roffset[3];
381 487567568 : for (int i = 0; i < 3; i++) {
382 365675676 : roffset[i] = rp[i] - ((double)cubecenter[i]) * dh[i][i];
383 : }
384 :
385 : // Lookup loop bounds for spherical cutoff.
386 121891892 : int *sphere_bounds;
387 121891892 : double disr_radius;
388 121891892 : grid_sphere_cache_lookup(radius, dh, dh_inv, &sphere_bounds, &disr_radius);
389 121891892 : int **sphere_bounds_iter = &sphere_bounds;
390 :
391 : // Cube bounds.
392 121891892 : int lb_cube[3], ub_cube[3];
393 487567568 : for (int i = 0; i < 3; i++) {
394 365675676 : lb_cube[i] = (int)ceil(-1e-8 - disr_radius * dh_inv[i][i]);
395 365675676 : ub_cube[i] = 1 - lb_cube[i];
396 : // If grid is not period check that cube fits without wrapping.
397 365675676 : if (npts_global[i] != npts_local[i]) {
398 17826 : const int offset =
399 17826 : modulo(cubecenter[i] + lb_cube[i] - shift_local[i], npts_global[i]) -
400 : lb_cube[i];
401 17826 : assert(offset + ub_cube[i] < npts_local[i]);
402 17826 : assert(offset + lb_cube[i] >= 0);
403 : }
404 : }
405 :
406 : // cmax = MAXVAL(ub_cube)
407 121891892 : const int cmax = imax(imax(ub_cube[0], ub_cube[1]), ub_cube[2]);
408 :
409 : // Precompute (x-xp)**lp*exp(..) for each direction.
410 121891892 : double pol_mutable[3][lp + 1][2 * cmax + 1];
411 487567568 : for (int idir = 0; idir < 3; idir++) {
412 365675676 : const double dr = dh[idir][idir];
413 365675676 : const double ro = roffset[idir];
414 : // Reuse the result from the previous gridpoint to avoid to many exps:
415 : // exp( -a*(x+d)**2) = exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
416 : // exp(-2*a*(x+d)*d) = exp(-2*a*x*d)*exp(-2*a*d**2)
417 365675676 : const double t_exp_1 = exp(-zetp * pow(dr, 2));
418 365675676 : const double t_exp_2 = pow(t_exp_1, 2);
419 365675676 : double t_exp_min_1 = exp(-zetp * pow(+dr - ro, 2));
420 365675676 : double t_exp_min_2 = exp(-2 * zetp * (+dr - ro) * (-dr));
421 4109127766 : for (int ig = 0; ig >= lb_cube[idir]; ig--) {
422 3743452090 : const double rpg = ig * dr - ro;
423 3743452090 : t_exp_min_1 *= t_exp_min_2 * t_exp_1;
424 3743452090 : t_exp_min_2 *= t_exp_2;
425 3743452090 : double pg = t_exp_min_1;
426 13451995697 : for (int icoef = 0; icoef <= lp; icoef++) {
427 9708543607 : pol_mutable[idir][icoef][ig + cmax] = pg; // exp(-zetp*rpg**2)
428 9708543607 : pg *= rpg;
429 : }
430 : }
431 365675676 : double t_exp_plus_1 = exp(-zetp * pow(-ro, 2));
432 365675676 : double t_exp_plus_2 = exp(-2 * zetp * (-ro) * (+dr));
433 4109127766 : for (int ig = 0; ig >= lb_cube[idir]; ig--) {
434 3743452090 : const double rpg = (1 - ig) * dr - ro;
435 3743452090 : t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
436 3743452090 : t_exp_plus_2 *= t_exp_2;
437 3743452090 : double pg = t_exp_plus_1;
438 13451995697 : for (int icoef = 0; icoef <= lp; icoef++) {
439 9708543607 : pol_mutable[idir][icoef][1 - ig + cmax] = pg; // exp(-zetp*rpg**2)
440 9708543607 : pg *= rpg;
441 : }
442 : }
443 : }
444 121891892 : const double(*pol)[lp + 1][2 * cmax + 1] =
445 121891892 : (const double(*)[lp + 1][2 * cmax + 1]) pol_mutable;
446 :
447 : // Precompute mapping from cube to grid indices for each direction
448 121891892 : int map_mutable[3][2 * cmax + 1];
449 487567568 : for (int i = 0; i < 3; i++) {
450 8279433564 : for (int k = -cmax; k <= +cmax; k++) {
451 7913757888 : map_mutable[i][k + cmax] =
452 7913757888 : modulo(cubecenter[i] + k - shift_local[i], npts_global[i]);
453 : }
454 : }
455 121891892 : const int(*map)[2 * cmax + 1] = (const int(*)[2 * cmax + 1]) map_mutable;
456 :
457 : // Precompute length of sections with homogeneous cube to grid mapping.
458 121891892 : int sections_mutable[3][2 * cmax + 1];
459 487567568 : for (int i = 0; i < 3; i++) {
460 8279433564 : for (int kg = 2 * cmax; kg >= 0; kg--) {
461 7913757888 : if (kg == 2 * cmax || map[i][kg] != map[i][kg + 1] - 1) {
462 746893342 : sections_mutable[i][kg] = 0;
463 : } else {
464 7166864546 : sections_mutable[i][kg] = sections_mutable[i][kg + 1] + 1;
465 : }
466 : }
467 : }
468 121891892 : const int(*sections)[2 * cmax + 1] =
469 121891892 : (const int(*)[2 * cmax + 1]) sections_mutable;
470 :
471 : // Loop over k dimension of the cube.
472 121891892 : const int kstart = *((*sphere_bounds_iter)++);
473 121891892 : const size_t cxy_size = (lp + 1) * (lp + 1) * 2;
474 121891892 : double cxy[cxy_size];
475 1369062075 : for (int k1 = kstart; k1 <= 0; k1++) {
476 1247170183 : const int k2 = 1 - k1;
477 1247170183 : const int kg1 = map[2][k1 + cmax];
478 1247170183 : const int kg2 = map[2][k2 + cmax];
479 :
480 1247170183 : memset(cxy, 0, cxy_size * sizeof(double));
481 :
482 : #if (GRID_DO_COLLOCATE)
483 : // collocate
484 636505121 : ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
485 636505121 : ortho_cxy_to_grid(lp, kg1, kg2, cmax, pol, map, sections, npts_local,
486 : sphere_bounds_iter, cxy, grid);
487 : #else
488 : // integrate
489 610665062 : ortho_cxy_to_grid(lp, kg1, kg2, cmax, pol, map, sections, npts_local,
490 : sphere_bounds_iter, cxy, grid);
491 610665062 : ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
492 : #endif
493 : }
494 121891892 : }
495 :
496 : /*******************************************************************************
497 : * \brief Collocates coefficients C_i onto the grid for general case.
498 : * \author Ole Schuett
499 : ******************************************************************************/
500 2092171736 : static inline void __attribute__((always_inline)) general_ci_to_grid(
501 : const int lp, const int jg, const int kg, const int ismin, const int ismax,
502 : const int npts_local[3], const int index_min[3], const int index_max[3],
503 : const int map_i[], const int sections_i[], const double gp[3], const int k,
504 : const int j, const double exp_ij[], const double exp_jk[],
505 : const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci,
506 : GRID_CONST_WHEN_INTEGRATE double *grid) {
507 :
508 2092171736 : const int base = kg * npts_local[1] * npts_local[0] + jg * npts_local[0];
509 :
510 : // AVX instructions can only load/store from evenly spaced memory locations.
511 : // Since the cube can wrap around due to the grid's periodicity,
512 : // the inner loop runs over sections with homogeneous cube to grid mapping.
513 6182874029 : for (int istart = ismin; istart <= ismax; istart++) {
514 4090702293 : const int istop = imin(ismax, istart + sections_i[istart - index_min[0]]);
515 4090702293 : if (map_i[istart - index_min[0]] < 0) {
516 0 : istart = istop; // skip over out-of-bounds indicies
517 0 : continue;
518 : }
519 :
520 4090702293 : const int cube2grid = map_i[istart - index_min[0]] - istart;
521 39563370053 : for (int i = istart; i <= istop; i++) {
522 35472667760 : const int ig = i + cube2grid;
523 35472667760 : const double di = i - gp[0];
524 :
525 35472667760 : const int stride_i = index_max[0] - index_min[0] + 1;
526 35472667760 : const int stride_j = index_max[1] - index_min[1] + 1;
527 35472667760 : const int stride_k = index_max[2] - index_min[2] + 1;
528 35472667760 : const int idx_ij = (j - index_min[1]) * stride_i + i - index_min[0];
529 35472667760 : const int idx_jk = (k - index_min[2]) * stride_j + j - index_min[1];
530 35472667760 : const int idx_ki = (i - index_min[0]) * stride_k + k - index_min[2];
531 :
532 : // Mathieu's trick: Calculate 3D Gaussian from three precomputed 2D tables
533 : //
534 : // r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
535 : // = a + b + c
536 : //
537 : // r**2 = (a + b + c)**2 = a**2 + b**2 + c**2 + 2ab + 2bc + 2ca
538 : //
539 : // exp(-r**2) = exp(-a(a+2b)) * exp(-b*(b+2c)) * exp(-c*(c+2a))
540 : //
541 35472667760 : const double gaussian = exp_ij[idx_ij] * exp_jk[idx_jk] * exp_ki[idx_ki];
542 :
543 35472667760 : const int grid_index = base + ig; // [kg, jg, ig]
544 35472667760 : double dip = gaussian;
545 :
546 : #if (GRID_DO_COLLOCATE)
547 : // collocate
548 19993388331 : double reg = 0.0;
549 93159902691 : for (int il = 0; il <= lp; il++) {
550 73166514360 : reg += ci[il] * dip;
551 73166514360 : dip *= di;
552 : }
553 19993388331 : grid[grid_index] += reg;
554 : #else
555 : // integrate
556 15479279429 : const double reg = grid[grid_index];
557 72115747733 : for (int il = 0; il <= lp; il++) {
558 56636468304 : ci[il] += reg * dip;
559 56636468304 : dip *= di;
560 : }
561 : #endif
562 : }
563 : istart = istop;
564 : }
565 : }
566 :
567 : /*******************************************************************************
568 : * \brief Transforms coefficients C_ij into C_i by fixing grid index j.
569 : * \author Ole Schuett
570 : ******************************************************************************/
571 : static inline void __attribute__((always_inline))
572 : general_cij_to_ci(const int lp, const double dj,
573 : GRID_CONST_WHEN_COLLOCATE double *cij,
574 : GRID_CONST_WHEN_INTEGRATE double *ci) {
575 : double djp = 1.0;
576 13164673351 : for (int jl = 0; jl <= lp; jl++) {
577 27242714842 : for (int il = 0; il <= lp - jl; il++) {
578 19542613154 : const int cij_index = jl * (lp + 1) + il; // [jl, il]
579 : #if (GRID_DO_COLLOCATE)
580 10881360674 : ci[il] += cij[cij_index] * djp; // collocate
581 : #else
582 8661252480 : cij[cij_index] += ci[il] * djp; // integrate
583 : #endif
584 : }
585 7700101688 : djp *= dj;
586 : }
587 : }
588 :
589 : /*******************************************************************************
590 : * \brief Loop body of general_cij_to_grid to be inlined for low values of lp.
591 : * \author Ole Schuett
592 : ******************************************************************************/
593 912938799 : static inline void __attribute__((always_inline)) general_cij_to_grid_low(
594 : const int lp, const int jg, const int kg, const int ismin, const int ismax,
595 : const int npts_local[3], const int index_min[3], const int index_max[3],
596 : const int map_i[], const int sections_i[], const double gp[3], const int k,
597 : const int j, const double exp_ij[], const double exp_jk[],
598 : const double exp_ki[], const double dj, double *ci,
599 : GRID_CONST_WHEN_COLLOCATE double *cij,
600 : GRID_CONST_WHEN_INTEGRATE double *grid) {
601 :
602 : #if (GRID_DO_COLLOCATE)
603 : // collocate
604 1179232937 : general_cij_to_ci(lp, dj, cij, ci);
605 1179232937 : general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
606 : map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
607 : grid);
608 : #else
609 : // integrate
610 4285338726 : general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min, index_max,
611 : map_i, sections_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
612 : grid);
613 : general_cij_to_ci(lp, dj, cij, ci);
614 : #endif
615 : }
616 :
617 : /*******************************************************************************
618 : * \brief Collocates coefficients C_ij onto the grid for general case.
619 : * \author Ole Schuett
620 : ******************************************************************************/
621 125347127 : static inline void general_cij_to_grid(
622 : const int lp, const int k, const int kg, const int npts_local[3],
623 : const int index_min[3], const int index_max[3], const int map_i[],
624 : const int map_j[], const int sections_i[], const int sections_j[],
625 : const double dh[3][3], const double gp[3], const double radius,
626 : const double exp_ij[], const double exp_jk[], const double exp_ki[],
627 : GRID_CONST_WHEN_COLLOCATE double *cij,
628 : GRID_CONST_WHEN_INTEGRATE double *grid) {
629 :
630 4001125782 : for (int j = index_min[1]; j <= index_max[1]; j++) {
631 3875778655 : const int jg = map_j[j - index_min[1]];
632 3875778655 : if (jg < 0) {
633 560 : j += sections_j[j - index_min[1]]; // skip over out-of-bounds indicies
634 560 : continue;
635 : }
636 :
637 : //--------------------------------------------------------------------
638 : // Find bounds for the inner loop based on a quadratic equation in i.
639 : //
640 : // The real-space vector from the center of the gaussian to the
641 : // grid point i,j,k is given by:
642 : // r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
643 : //
644 : // Separating the term that depends on i:
645 : // r = i*dh[0,:] - gp[0]*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
646 : // = i*dh[0,:] + v
647 : //
648 : // The squared distance works out to:
649 : // r**2 = dh[0,:]**2 * i**2 + 2 * v * dh[0,:] * i + v**2
650 : // = a * i**2 + b * i + c
651 : //
652 : // Solving r**2==radius**2 for i yields:
653 : // d = b**2 - 4 * a * (c - radius**2)
654 : // i = (-b \pm sqrt(d)) / (2*a)
655 : //
656 : double a = 0.0, b = 0.0, c = 0.0;
657 15503112380 : for (int i = 0; i < 3; i++) {
658 11627334285 : const double v = (0 - gp[0]) * dh[0][i] + (j - gp[1]) * dh[1][i] +
659 11627334285 : (k - gp[2]) * dh[2][i];
660 11627334285 : a += dh[0][i] * dh[0][i];
661 11627334285 : b += 2.0 * v * dh[0][i];
662 11627334285 : c += v * v;
663 : }
664 3875778095 : const double d = b * b - 4.0 * a * (c - radius * radius);
665 :
666 3875778095 : if (0.0 < d) {
667 2092171736 : const double sqrt_d = sqrt(d);
668 2092171736 : const double inv_2a = 1.0 / (2.0 * a);
669 2092171736 : const int ismin = (int)ceil((-b - sqrt_d) * inv_2a);
670 2092171736 : const int ismax = (int)floor((-b + sqrt_d) * inv_2a);
671 2092171736 : const double dj = j - gp[1];
672 :
673 2092171736 : double ci[lp + 1];
674 2092171736 : memset(ci, 0, sizeof(ci));
675 :
676 : // Generate separate branches for low values of lp.
677 2092171736 : if (lp <= GRID_MAX_LP_OPTIMIZED) {
678 : GRID_PRAGMA_UNROLL(GRID_MAX_LP_OPTIMIZED + 1)
679 23013889096 : for (int ilp = 0; ilp <= GRID_MAX_LP_OPTIMIZED; ilp++) {
680 20921717360 : if (lp == ilp) {
681 21834656159 : general_cij_to_grid_low(ilp, jg, kg, ismin, ismax, npts_local,
682 : index_min, index_max, map_i, sections_i, gp,
683 : k, j, exp_ij, exp_jk, exp_ki, dj, ci, cij,
684 : grid);
685 : }
686 : }
687 : } else {
688 2092171736 : general_cij_to_grid_low(lp, jg, kg, ismin, ismax, npts_local, index_min,
689 : index_max, map_i, sections_i, gp, k, j, exp_ij,
690 : exp_jk, exp_ki, dj, ci, cij, grid);
691 : }
692 : }
693 : }
694 125347127 : }
695 :
696 : /*******************************************************************************
697 : * \brief Transforms coefficients C_ijk into C_ij by fixing grid index k.
698 : * \author Ole Schuett
699 : ******************************************************************************/
700 125347127 : static inline void general_cijk_to_cij(const int lp, const double dk,
701 : GRID_CONST_WHEN_COLLOCATE double *cijk,
702 : GRID_CONST_WHEN_INTEGRATE double *cij) {
703 125347127 : double dkp = 1.0;
704 584297542 : for (int kl = 0; kl <= lp; kl++) {
705 1620633379 : for (int jl = 0; jl <= lp - kl; jl++) {
706 3625510355 : for (int il = 0; il <= lp - kl - jl; il++) {
707 2463827391 : const int cij_index = jl * (lp + 1) + il; // [jl, il]
708 2463827391 : const int cijk_index =
709 2463827391 : kl * (lp + 1) * (lp + 1) + jl * (lp + 1) + il; // [kl, jl, il]
710 : #if (GRID_DO_COLLOCATE)
711 1333748191 : cij[cij_index] += cijk[cijk_index] * dkp; // collocate
712 : #else
713 1130079200 : cijk[cijk_index] += cij[cij_index] * dkp; // integrate
714 : #endif
715 : }
716 : }
717 458950415 : dkp *= dk;
718 : }
719 125347127 : }
720 :
721 : /*******************************************************************************
722 : * \brief Precompute mapping of grid indices and its homogeneous sections.
723 : * \author Ole Schuett
724 : ******************************************************************************/
725 : static inline void
726 15863001 : general_precompute_mapping(const int index_min, const int index_max,
727 : const int shift_local, const int npts_global,
728 : const int bounds[2], int map[], int sections[]) {
729 :
730 : // Precompute mapping from continous grid indices to pbc wraped.
731 434699481 : for (int k = index_min; k <= index_max; k++) {
732 418836480 : const int kg = modulo(k - shift_local, npts_global);
733 418836480 : if (bounds[0] <= kg && kg <= bounds[1]) {
734 418832059 : map[k - index_min] = kg;
735 : } else {
736 4421 : map[k - index_min] = INT_MIN; // out of bounds - not mapped
737 : }
738 : }
739 :
740 : // Precompute length of sections with homogeneous cube to grid mapping.
741 15863001 : const int range = index_max - index_min + 1;
742 434699481 : for (int kg = range - 1; kg >= 0; kg--) {
743 418836480 : if (kg == range - 1 || map[kg] != map[kg + 1] - 1) {
744 40070540 : sections[kg] = 0;
745 : } else {
746 378765940 : sections[kg] = sections[kg + 1] + 1;
747 : }
748 : }
749 15863001 : }
750 :
751 : /*******************************************************************************
752 : * \brief Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
753 : * \author Ole Schuett
754 : ******************************************************************************/
755 : static inline void
756 15863001 : general_fill_exp_table(const int idir, const int jdir, const int index_min[3],
757 : const int index_max[3], const double zetp,
758 : const double dh[3][3], const double gp[3],
759 : double exp_table[]) {
760 :
761 15863001 : const int stride_i = index_max[idir] - index_min[idir] + 1;
762 15863001 : const double h_ii = dh[idir][0] * dh[idir][0] + dh[idir][1] * dh[idir][1] +
763 15863001 : dh[idir][2] * dh[idir][2];
764 15863001 : const double h_ij = dh[idir][0] * dh[jdir][0] + dh[idir][1] * dh[jdir][1] +
765 15863001 : dh[idir][2] * dh[jdir][2];
766 :
767 434699481 : for (int i = index_min[idir]; i <= index_max[idir]; i++) {
768 418836480 : const double di = i - gp[idir];
769 418836480 : const double rii = di * di * h_ii;
770 418836480 : const double rij_unit = di * h_ij;
771 418836480 : const double exp_ij_unit = exp(-zetp * 2.0 * rij_unit);
772 :
773 : // compute exponentials symmetrically around cube center
774 418836480 : const int j_center = (int)gp[jdir];
775 418836480 : const double dj_center = j_center - gp[jdir];
776 418836480 : const double rij_center = dj_center * rij_unit;
777 418836480 : const double exp_ij_center = exp(-zetp * (rii + 2.0 * rij_center));
778 :
779 : // above center
780 418836480 : double exp_ij = exp_ij_center;
781 6778928462 : for (int j = j_center; j <= index_max[jdir]; j++) {
782 6360091982 : const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
783 6360091982 : exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
784 6360091982 : exp_ij *= exp_ij_unit;
785 : }
786 :
787 : // below center
788 418836480 : const double exp_ij_unit_inv = 1.0 / exp_ij_unit;
789 418836480 : exp_ij = exp_ij_center * exp_ij_unit_inv;
790 6324031206 : for (int j = j_center - 1; j >= index_min[jdir]; j--) {
791 5905194726 : const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
792 5905194726 : exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
793 5905194726 : exp_ij *= exp_ij_unit_inv;
794 : }
795 : }
796 15863001 : }
797 :
798 : /*******************************************************************************
799 : * \brief Collocates coefficients C_ijk onto the grid for general case.
800 : * \author Ole Schuett
801 : ******************************************************************************/
802 : static inline void
803 5287667 : general_cijk_to_grid(const int border_mask, const int lp, const double zetp,
804 : const double dh[3][3], const double dh_inv[3][3],
805 : const double rp[3], const int npts_global[3],
806 : const int npts_local[3], const int shift_local[3],
807 : const int border_width[3], const double radius,
808 : GRID_CONST_WHEN_COLLOCATE double *cijk,
809 5287667 : GRID_CONST_WHEN_INTEGRATE double *grid) {
810 :
811 : // Default for border_mask == 0.
812 5287667 : int bounds_i[2] = {0, npts_local[0] - 1};
813 5287667 : int bounds_j[2] = {0, npts_local[1] - 1};
814 5287667 : int bounds_k[2] = {0, npts_local[2] - 1};
815 :
816 : // See also rs_find_node() in task_list_methods.F.
817 : // If the bit is set then we need to exclude the border in that direction.
818 5287667 : if (border_mask & (1 << 0))
819 103324 : bounds_i[0] += border_width[0];
820 5287667 : if (border_mask & (1 << 1))
821 103324 : bounds_i[1] -= border_width[0];
822 5287667 : if (border_mask & (1 << 2))
823 103324 : bounds_j[0] += border_width[1];
824 5287667 : if (border_mask & (1 << 3))
825 103324 : bounds_j[1] -= border_width[1];
826 5287667 : if (border_mask & (1 << 4))
827 103316 : bounds_k[0] += border_width[2];
828 5287667 : if (border_mask & (1 << 5))
829 103324 : bounds_k[1] -= border_width[2];
830 :
831 : // center in grid coords
832 : // gp = MATMUL(dh_inv, rp)
833 5287667 : double gp[3] = {0.0, 0.0, 0.0};
834 21150668 : for (int i = 0; i < 3; i++) {
835 63452004 : for (int j = 0; j < 3; j++) {
836 47589003 : gp[i] += dh_inv[j][i] * rp[j];
837 : }
838 : }
839 :
840 : // Get the min max indices that contain at least the cube that contains a
841 : // sphere around rp of radius radius if the cell is very non-orthogonal this
842 : // implies that many useless points are included this estimate can be improved
843 : // (i.e. not box but sphere should be used)
844 5287667 : int index_min[3] = {INT_MAX, INT_MAX, INT_MAX};
845 5287667 : int index_max[3] = {INT_MIN, INT_MIN, INT_MIN};
846 21150668 : for (int i = -1; i <= 1; i++) {
847 63452004 : for (int j = -1; j <= 1; j++) {
848 190356012 : for (int k = -1; k <= 1; k++) {
849 142767009 : const double x = rp[0] + i * radius;
850 142767009 : const double y = rp[1] + j * radius;
851 142767009 : const double z = rp[2] + k * radius;
852 571068036 : for (int idir = 0; idir < 3; idir++) {
853 428301027 : const double resc =
854 428301027 : dh_inv[0][idir] * x + dh_inv[1][idir] * y + dh_inv[2][idir] * z;
855 428301027 : index_min[idir] = imin(index_min[idir], (int)floor(resc));
856 428301027 : index_max[idir] = imax(index_max[idir], (int)ceil(resc));
857 : }
858 : }
859 : }
860 : }
861 :
862 : // Precompute mappings
863 5287667 : const int range_i = index_max[0] - index_min[0] + 1;
864 5287667 : int map_i[range_i], sections_i[range_i];
865 5287667 : general_precompute_mapping(index_min[0], index_max[0], shift_local[0],
866 : npts_global[0], bounds_i, map_i, sections_i);
867 5287667 : const int range_j = index_max[1] - index_min[1] + 1;
868 5287667 : int map_j[range_j], sections_j[range_j];
869 5287667 : general_precompute_mapping(index_min[1], index_max[1], shift_local[1],
870 : npts_global[1], bounds_j, map_j, sections_j);
871 5287667 : const int range_k = index_max[2] - index_min[2] + 1;
872 5287667 : int map_k[range_k], sections_k[range_k];
873 5287667 : general_precompute_mapping(index_min[2], index_max[2], shift_local[2],
874 : npts_global[2], bounds_k, map_k, sections_k);
875 :
876 : // Precompute exponentials
877 5287667 : double exp_ij[range_i * range_j];
878 5287667 : general_fill_exp_table(0, 1, index_min, index_max, zetp, dh, gp, exp_ij);
879 5287667 : double exp_jk[range_j * range_k];
880 5287667 : general_fill_exp_table(1, 2, index_min, index_max, zetp, dh, gp, exp_jk);
881 5287667 : double exp_ki[range_k * range_i];
882 5287667 : general_fill_exp_table(2, 0, index_min, index_max, zetp, dh, gp, exp_ki);
883 :
884 : // go over the grid, but cycle if the point is not within the radius
885 5287667 : const int cij_size = (lp + 1) * (lp + 1);
886 5287667 : double cij[cij_size];
887 130639175 : for (int k = index_min[2]; k <= index_max[2]; k++) {
888 125351508 : const int kg = map_k[k - index_min[2]];
889 125351508 : if (kg < 0) {
890 4381 : k += sections_k[k - index_min[2]]; // skip over out-of-bounds indicies
891 4381 : continue;
892 : }
893 :
894 : // zero coef_xyt
895 125347127 : memset(cij, 0, cij_size * sizeof(double));
896 :
897 : #if (GRID_DO_COLLOCATE)
898 : // collocate
899 70071001 : general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
900 70071001 : general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
901 : map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
902 : exp_jk, exp_ki, cij, grid);
903 : #else
904 : // integrate
905 55276126 : general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
906 : map_j, sections_i, sections_j, dh, gp, radius, exp_ij,
907 : exp_jk, exp_ki, cij, grid);
908 55276126 : general_cijk_to_cij(lp, (double)k - gp[2], cijk, cij);
909 : #endif
910 : }
911 5287667 : }
912 :
913 : /*******************************************************************************
914 : * \brief Transforms coefficients C_xyz into C_ijk.
915 : * \author Ole Schuett
916 : ******************************************************************************/
917 : static inline void
918 5287667 : general_cxyz_to_cijk(const int lp, const double dh[3][3],
919 : GRID_CONST_WHEN_COLLOCATE double *cxyz,
920 5287667 : GRID_CONST_WHEN_INTEGRATE double *cijk) {
921 :
922 : // transform P_{lxp,lyp,lzp} into a P_{lip,ljp,lkp} such that
923 : // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-x_p)**lxp (y-y_p)**lyp (z-z_p)**lzp =
924 : // sum_{lip,ljp,lkp} P_{lip,ljp,lkp} (i-i_p)**lip (j-j_p)**ljp (k-k_p)**lkp
925 :
926 : // transform using multinomials
927 5287667 : double hmatgridp[lp + 1][3][3];
928 21150668 : for (int i = 0; i < 3; i++) {
929 63452004 : for (int j = 0; j < 3; j++) {
930 47589003 : hmatgridp[0][j][i] = 1.0;
931 171393651 : for (int k = 1; k <= lp; k++) {
932 123804648 : hmatgridp[k][j][i] = hmatgridp[k - 1][j][i] * dh[j][i];
933 : }
934 : }
935 : }
936 :
937 : const int lpx = lp;
938 24331406 : for (int klx = 0; klx <= lpx; klx++) {
939 66916583 : for (int jlx = 0; jlx <= lpx - klx; jlx++) {
940 149059499 : for (int ilx = 0; ilx <= lpx - klx - jlx; ilx++) {
941 101186655 : const int lx = ilx + jlx + klx;
942 101186655 : const int lpy = lp - lx;
943 293738564 : for (int kly = 0; kly <= lpy; kly++) {
944 533607929 : for (int jly = 0; jly <= lpy - kly; jly++) {
945 914068643 : for (int ily = 0; ily <= lpy - kly - jly; ily++) {
946 573012623 : const int ly = ily + jly + kly;
947 573012623 : const int lpz = lp - lx - ly;
948 1497060859 : for (int klz = 0; klz <= lpz; klz++) {
949 2365660473 : for (int jlz = 0; jlz <= lpz - klz; jlz++) {
950 3629565658 : for (int ilz = 0; ilz <= lpz - klz - jlz; ilz++) {
951 2187953421 : const int lz = ilz + jlz + klz;
952 2187953421 : const int il = ilx + ily + ilz;
953 2187953421 : const int jl = jlx + jly + jlz;
954 2187953421 : const int kl = klx + kly + klz;
955 2187953421 : const int lp1 = lp + 1;
956 2187953421 : const int cijk_index =
957 2187953421 : kl * lp1 * lp1 + jl * lp1 + il; // [kl,jl,il]
958 2187953421 : const int cxyz_index =
959 2187953421 : lz * lp1 * lp1 + ly * lp1 + lx; // [lz,ly,lx]
960 2187953421 : const double p =
961 2187953421 : hmatgridp[ilx][0][0] * hmatgridp[jlx][1][0] *
962 2187953421 : hmatgridp[klx][2][0] * hmatgridp[ily][0][1] *
963 2187953421 : hmatgridp[jly][1][1] * hmatgridp[kly][2][1] *
964 2187953421 : hmatgridp[ilz][0][2] * hmatgridp[jlz][1][2] *
965 2187953421 : hmatgridp[klz][2][2] * fac(lx) * fac(ly) * fac(lz) /
966 2187953421 : (fac(ilx) * fac(ily) * fac(ilz) * fac(jlx) * fac(jly) *
967 2187953421 : fac(jlz) * fac(klx) * fac(kly) * fac(klz));
968 : #if (GRID_DO_COLLOCATE)
969 958630799 : cijk[cijk_index] += cxyz[cxyz_index] * p; // collocate
970 : #else
971 1229322622 : cxyz[cxyz_index] += cijk[cijk_index] * p; // integrate
972 : #endif
973 : }
974 : }
975 : }
976 : }
977 : }
978 : }
979 : }
980 : }
981 : }
982 5287667 : }
983 :
984 : /*******************************************************************************
985 : * \brief Collocates coefficients C_xyz onto the grid for general case.
986 : * \author Ole Schuett
987 : ******************************************************************************/
988 : static inline void
989 5287667 : general_cxyz_to_grid(const int border_mask, const int lp, const double zetp,
990 : const double dh[3][3], const double dh_inv[3][3],
991 : const double rp[3], const int npts_global[3],
992 : const int npts_local[3], const int shift_local[3],
993 : const int border_width[3], const double radius,
994 : GRID_CONST_WHEN_COLLOCATE double *cxyz,
995 5287667 : GRID_CONST_WHEN_INTEGRATE double *grid) {
996 :
997 5287667 : const size_t cijk_size = (lp + 1) * (lp + 1) * (lp + 1);
998 5287667 : double cijk[cijk_size];
999 5287667 : memset(cijk, 0, cijk_size * sizeof(double));
1000 :
1001 : #if (GRID_DO_COLLOCATE)
1002 : // collocate
1003 2929421 : general_cxyz_to_cijk(lp, dh, cxyz, cijk);
1004 2929421 : general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1005 : npts_local, shift_local, border_width, radius, cijk,
1006 : grid);
1007 : #else
1008 : // integrate
1009 2358246 : general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1010 : npts_local, shift_local, border_width, radius, cijk,
1011 : grid);
1012 2358246 : general_cxyz_to_cijk(lp, dh, cxyz, cijk);
1013 : #endif
1014 5287667 : }
1015 :
1016 : /*******************************************************************************
1017 : * \brief Collocates coefficients C_xyz onto the grid.
1018 : * \author Ole Schuett
1019 : ******************************************************************************/
1020 : static inline void
1021 127179559 : cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
1022 : const double zetp, const double dh[3][3],
1023 : const double dh_inv[3][3], const double rp[3],
1024 : const int npts_global[3], const int npts_local[3],
1025 : const int shift_local[3], const int border_width[3],
1026 : const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
1027 : GRID_CONST_WHEN_INTEGRATE double *grid) {
1028 :
1029 127179559 : enum grid_library_kernel k;
1030 127179559 : if (orthorhombic && border_mask == 0) {
1031 121891892 : k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_ORTHO : GRID_INTEGRATE_ORTHO;
1032 121891892 : ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
1033 : shift_local, radius, cxyz, grid);
1034 : } else {
1035 5287667 : k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_GENERAL : GRID_INTEGRATE_GENERAL;
1036 5287667 : general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1037 : npts_local, shift_local, border_width, radius, cxyz,
1038 : grid);
1039 : }
1040 127179559 : grid_library_counter_add(lp, GRID_BACKEND_CPU, k, 1);
1041 127179559 : }
1042 :
1043 : /*******************************************************************************
1044 : * \brief Transforms coefficients C_ab into C_xyz.
1045 : * \author Ole Schuett
1046 : ******************************************************************************/
1047 127179559 : static inline void cab_to_cxyz(const int la_max, const int la_min,
1048 : const int lb_max, const int lb_min,
1049 : const double prefactor, const double ra[3],
1050 : const double rb[3], const double rp[3],
1051 : GRID_CONST_WHEN_COLLOCATE double *cab,
1052 127179559 : GRID_CONST_WHEN_INTEGRATE double *cxyz) {
1053 :
1054 : // Computes the polynomial expansion coefficients:
1055 : // (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
1056 127179559 : const int lp = la_max + lb_max;
1057 127179559 : double alpha[3][lb_max + 1][la_max + 1][lp + 1];
1058 127179559 : memset(alpha, 0, 3 * (lb_max + 1) * (la_max + 1) * (lp + 1) * sizeof(double));
1059 :
1060 508718236 : for (int i = 0; i < 3; i++) {
1061 381538677 : const double drpa = rp[i] - ra[i];
1062 381538677 : const double drpb = rp[i] - rb[i];
1063 1077146592 : for (int lxa = 0; lxa <= la_max; lxa++) {
1064 1998660033 : for (int lxb = 0; lxb <= lb_max; lxb++) {
1065 : double binomial_k_lxa = 1.0;
1066 : double a = 1.0;
1067 3426529068 : for (int k = 0; k <= lxa; k++) {
1068 : double binomial_l_lxb = 1.0;
1069 : double b = 1.0;
1070 5556530970 : for (int l = 0; l <= lxb; l++) {
1071 3433054020 : alpha[i][lxb][lxa][lxa - l + lxb - k] +=
1072 3433054020 : binomial_k_lxa * binomial_l_lxb * a * b;
1073 3433054020 : binomial_l_lxb *= ((double)(lxb - l)) / ((double)(l + 1));
1074 3433054020 : b *= drpb;
1075 : }
1076 2123476950 : binomial_k_lxa *= ((double)(lxa - k)) / ((double)(k + 1));
1077 2123476950 : a *= drpa;
1078 : }
1079 : }
1080 : }
1081 : }
1082 :
1083 : // *** initialise the coefficient matrix, we transform the sum
1084 : //
1085 : // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
1086 : // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
1087 : // (z-a_z)**lza
1088 : //
1089 : // into
1090 : //
1091 : // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
1092 : //
1093 : // where p is center of the product gaussian, and lp = la_max + lb_max
1094 : // (current implementation is l**7)
1095 : //
1096 :
1097 348665001 : for (int lzb = 0; lzb <= lb_max; lzb++) {
1098 655836148 : for (int lza = 0; lza <= la_max; lza++) {
1099 1117779176 : for (int lyb = 0; lyb <= lb_max - lzb; lyb++) {
1100 1827779810 : for (int lya = 0; lya <= la_max - lza; lya++) {
1101 1144351340 : const int lxb_min = imax(lb_min - lzb - lyb, 0);
1102 1144351340 : const int lxa_min = imax(la_min - lza - lya, 0);
1103 2714406086 : for (int lxb = lxb_min; lxb <= lb_max - lzb - lyb; lxb++) {
1104 3872443160 : for (int lxa = lxa_min; lxa <= la_max - lza - lya; lxa++) {
1105 2302388414 : const int ico = coset(lxa, lya, lza);
1106 2302388414 : const int jco = coset(lxb, lyb, lzb);
1107 2302388414 : const int cab_index = jco * ncoset(la_max) + ico; // [jco, ico]
1108 6653207872 : for (int lzp = 0; lzp <= lza + lzb; lzp++) {
1109 18928791076 : for (int lyp = 0; lyp <= lp - lza - lzb; lyp++) {
1110 50836224488 : for (int lxp = 0; lxp <= lp - lza - lzb - lyp; lxp++) {
1111 36258252870 : const double p = alpha[0][lxb][lxa][lxp] *
1112 36258252870 : alpha[1][lyb][lya][lyp] *
1113 36258252870 : alpha[2][lzb][lza][lzp] * prefactor;
1114 36258252870 : const int lp1 = lp + 1;
1115 36258252870 : const int cxyz_index =
1116 36258252870 : lzp * lp1 * lp1 + lyp * lp1 + lxp; // [lzp, lyp, lxp]
1117 : #if (GRID_DO_COLLOCATE)
1118 15264082205 : cxyz[cxyz_index] += cab[cab_index] * p; // collocate
1119 : #else
1120 20994170665 : cab[cab_index] += cxyz[cxyz_index] * p; // integrate
1121 : #endif
1122 : }
1123 : }
1124 : }
1125 : }
1126 : }
1127 : }
1128 : }
1129 : }
1130 : }
1131 127179559 : }
1132 :
1133 : /*******************************************************************************
1134 : * \brief Collocates coefficients C_ab onto the grid.
1135 : * \author Ole Schuett
1136 : ******************************************************************************/
1137 : static inline void
1138 173768215 : cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max,
1139 : const int la_min, const int lb_max, const int lb_min,
1140 : const double zeta, const double zetb, const double rscale,
1141 : const double dh[3][3], const double dh_inv[3][3],
1142 : const double ra[3], const double rab[3], const int npts_global[3],
1143 : const int npts_local[3], const int shift_local[3],
1144 : const int border_width[3], const double radius,
1145 : GRID_CONST_WHEN_COLLOCATE double *cab,
1146 173768215 : GRID_CONST_WHEN_INTEGRATE double *grid) {
1147 :
1148 : // Check if radius is too small to be mapped onto grid of given resolution.
1149 173768215 : double dh_max = 0.0;
1150 695072860 : for (int i = 0; i < 3; i++) {
1151 2085218580 : for (int j = 0; j < 3; j++) {
1152 1563913935 : dh_max = fmax(dh_max, fabs(dh[i][j]));
1153 : }
1154 : }
1155 173768215 : if (2.0 * radius < dh_max) {
1156 46588656 : return;
1157 : }
1158 :
1159 127179559 : const double zetp = zeta + zetb;
1160 127179559 : const double f = zetb / zetp;
1161 127179559 : const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
1162 127179559 : const double prefactor = rscale * exp(-zeta * f * rab2);
1163 127179559 : double rp[3], rb[3];
1164 508718236 : for (int i = 0; i < 3; i++) {
1165 381538677 : rp[i] = ra[i] + f * rab[i];
1166 381538677 : rb[i] = ra[i] + rab[i];
1167 : }
1168 :
1169 127179559 : const int lp = la_max + lb_max;
1170 127179559 : const size_t cxyz_size = (lp + 1) * (lp + 1) * (lp + 1);
1171 127179559 : double cxyz[cxyz_size];
1172 127179559 : memset(cxyz, 0, cxyz_size * sizeof(double));
1173 :
1174 : #if (GRID_DO_COLLOCATE)
1175 : // collocate
1176 63026371 : cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
1177 63026371 : cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1178 : npts_local, shift_local, border_width, radius, cxyz, grid);
1179 : #else
1180 : // integrate
1181 64153188 : cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
1182 : npts_local, shift_local, border_width, radius, cxyz, grid);
1183 64153188 : cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
1184 : #endif
1185 : }
1186 :
1187 : // EOF
|