1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <limits.h>
10 : #include <math.h>
11 : #include <omp.h>
12 : #include <stdbool.h>
13 : #include <stdio.h>
14 : #include <stdlib.h>
15 : #include <string.h>
16 :
17 : #ifdef __LIBXSMM
18 : #include <libxsmm.h>
19 : #endif
20 :
21 : #include "../common/grid_basis_set.h"
22 : #include "../common/grid_common.h"
23 : #include "../common/grid_constants.h"
24 : #include "grid_dgemm_coefficients.h"
25 : #include "grid_dgemm_collocate.h"
26 : #include "grid_dgemm_collocation_integration.h"
27 : #include "grid_dgemm_non_orthorombic_corrections.h"
28 : #include "grid_dgemm_prepare_pab.h"
29 : #include "grid_dgemm_private_header.h"
30 : #include "grid_dgemm_task_list.h"
31 : #include "grid_dgemm_tensor_local.h"
32 :
33 : void collocate_l0(double *scratch, const double alpha, const bool orthogonal,
34 : const struct tensor_ *exp_xy,
35 : const struct tensor_ *p_alpha_beta_reduced_,
36 : struct tensor_ *cube);
37 :
38 5040 : void rotate_to_cartesian_harmonics(const grid_basis_set *ibasis,
39 : const grid_basis_set *jbasis,
40 : const int iatom, const int jatom,
41 : const int iset, const int jset,
42 : double *const block, tensor *work,
43 : tensor *pab) {
44 5040 : dgemm_params m1, m2;
45 5040 : memset(&m1, 0, sizeof(dgemm_params));
46 5040 : memset(&m2, 0, sizeof(dgemm_params));
47 :
48 : // Define some more convenient aliases.
49 5040 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
50 5040 : const int nsgf_setb = jbasis->nsgf_set[jset];
51 5040 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
52 5040 : const int nsgfb = jbasis->nsgf;
53 5040 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
54 5040 : const int sgfb = jbasis->first_sgf[jset] - 1;
55 5040 : const int maxcoa = ibasis->maxco;
56 5040 : const int maxcob = jbasis->maxco;
57 5040 : const int ncoseta = ncoset(ibasis->lmax[iset]);
58 5040 : const int ncosetb = ncoset(jbasis->lmax[jset]);
59 5040 : const int ncoa = ibasis->npgf[iset] * ncoseta; // size of cartesian set
60 5040 : const int ncob = jbasis->npgf[jset] * ncosetb;
61 :
62 5040 : initialize_tensor_2(work, nsgf_setb, ncoa);
63 5040 : realloc_tensor(work);
64 :
65 5040 : initialize_tensor_2(pab, ncob, ncoa);
66 5040 : realloc_tensor(pab);
67 :
68 : // the rotations happen here.
69 5040 : if (iatom <= jatom) {
70 3352 : m1.op1 = 'N';
71 3352 : m1.op2 = 'N';
72 3352 : m1.m = work->size[0];
73 3352 : m1.n = work->size[1];
74 3352 : m1.k = nsgf_seta;
75 3352 : m1.alpha = 1.0;
76 3352 : m1.beta = 0.0;
77 3352 : m1.a = block + sgfb * nsgfa + sgfa;
78 3352 : m1.lda = nsgfa;
79 3352 : m1.b = &ibasis->sphi[sgfa * maxcoa];
80 3352 : m1.ldb = maxcoa;
81 3352 : m1.c = work->data;
82 3352 : m1.ldc = work->ld_;
83 : } else {
84 1688 : m1.op1 = 'T';
85 1688 : m1.op2 = 'N';
86 1688 : m1.m = work->size[0];
87 1688 : m1.n = work->size[1];
88 1688 : m1.k = nsgf_seta;
89 1688 : m1.alpha = 1.0;
90 1688 : m1.beta = 0.0;
91 1688 : m1.a = block + sgfa * nsgfb + sgfb;
92 1688 : m1.lda = nsgfb;
93 1688 : m1.b = &ibasis->sphi[sgfa * maxcoa];
94 1688 : m1.ldb = maxcoa;
95 1688 : m1.c = work->data;
96 1688 : m1.ldc = work->ld_;
97 : }
98 5040 : m1.use_libxsmm = true;
99 5040 : dgemm_simplified(&m1);
100 :
101 5040 : m2.op1 = 'T';
102 5040 : m2.op2 = 'N';
103 5040 : m2.m = pab->size[0];
104 5040 : m2.n = pab->size[1];
105 5040 : m2.k = work->size[0];
106 5040 : m2.alpha = 1.0;
107 5040 : m2.beta = 0.0;
108 5040 : m2.a = &jbasis->sphi[sgfb * maxcob];
109 5040 : m2.lda = maxcob;
110 5040 : m2.b = work->data;
111 5040 : m2.ldb = work->ld_;
112 5040 : m2.c = pab->data;
113 5040 : m2.ldc = pab->ld_;
114 5040 : m2.use_libxsmm = true;
115 5040 : dgemm_simplified(&m2);
116 5040 : }
117 :
118 : void tensor_reduction_for_collocate_integrate(
119 : double *scratch, const double alpha, const bool *const orthogonal,
120 : const struct tensor_ *Exp, const struct tensor_ *co,
121 : const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube);
122 :
123 : /* compute the functions (x - x_i)^l exp(-eta (x - x_i)^2) for l = 0..lp
124 : * using a recurrence to avoid computing the exponential at each grid point.
125 : * This is probably no longer essential since the exponential is *not* the
126 : * dominating contribution to the cost of collocate and integrate */
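
/* A naive reference for the routine below (a hedged sketch for validation
 * only; grid_fill_pol_reference is a hypothetical helper, not part of CP2K).
 * It assumes the non-transposed, unpadded row-major (lp + 1) x cmax layout
 * pol[l][ig - xmin], pol_offset == 0 and xmin <= 0 <= xmax, and simply
 * recomputes the exponential at every grid point instead of recurring: */
static inline void grid_fill_pol_reference(const double dr,
                                           const double roffset,
                                           const int xmin, const int xmax,
                                           const int lp, const int cmax,
                                           const double zetp, double *pol) {
  for (int ig = xmin; ig <= xmax; ig++) {
    const double rpg = ig * dr - roffset;
    double pg = exp(-zetp * rpg * rpg);
    for (int l = 0; l <= lp; l++) {
      pol[l * cmax + (ig - xmin)] = pg; /* pol[l][ig - xmin] = rpg^l * exp */
      pg *= rpg;
    }
  }
}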
127 :
128 264000 : void grid_fill_pol_dgemm(const bool transpose, const double dr,
129 : const double roffset, const int pol_offset,
130 : const int xmin, const int xmax, const int lp,
131 : const int cmax, const double zetp, double *pol_) {
132 264000 : tensor pol;
133 264000 : const double t_exp_1 = exp(-zetp * dr * dr);
134 264000 : const double t_exp_2 = t_exp_1 * t_exp_1;
135 :
136 264000 : double t_exp_min_1 = exp(-zetp * (dr - roffset) * (dr - roffset));
137 264000 : double t_exp_min_2 = exp(2.0 * zetp * (dr - roffset) * dr);
138 :
139 264000 : double t_exp_plus_1 = exp(-zetp * roffset * roffset);
140 264000 : double t_exp_plus_2 = exp(2.0 * zetp * roffset * dr);
141 :
142 264000 : if (transpose) {
143 102993 : initialize_tensor_2(&pol, cmax, lp + 1);
144 102993 : pol.data = pol_;
145 : /* This is Ole's original code. The polynomials need to be transposed for
146 : * the integration routine, and Ole's code already does that. */
147 1397272 : for (int ig = 0; ig >= xmin; ig--) {
148 1294279 : const double rpg = ig * dr - roffset;
149 1294279 : t_exp_min_1 *= t_exp_min_2 * t_exp_1;
150 1294279 : t_exp_min_2 *= t_exp_2;
151 1294279 : double pg = t_exp_min_1;
152 5304293 : for (int icoef = 0; icoef <= lp; icoef++) {
153 4010014 : idx2(pol, pol_offset + ig - xmin, icoef) = pg;
154 4010014 : pg *= rpg;
155 : }
156 : }
157 :
158 102993 : double t_exp_plus_1 = exp(-zetp * roffset * roffset);
159 102993 : double t_exp_plus_2 = exp(2 * zetp * roffset * dr);
160 1343614 : for (int ig = 1; ig <= xmax; ig++) {
161 1240621 : const double rpg = ig * dr - roffset;
162 1240621 : t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
163 1240621 : t_exp_plus_2 *= t_exp_2;
164 1240621 : double pg = t_exp_plus_1;
165 5085185 : for (int icoef = 0; icoef <= lp; icoef++) {
166 3844564 : idx2(pol, pol_offset + ig - xmin, icoef) = pg;
167 3844564 : pg *= rpg;
168 : }
169 : }
170 :
171 : } else {
172 161007 : initialize_tensor_2(&pol, lp + 1, cmax);
173 161007 : pol.data = pol_;
174 : /* memset(pol.data, 0, sizeof(double) * pol.alloc_size_); */
175 : /*
176 : * compute the values of all (x-xp)**lp*exp(..)
177 : *
178 : * trick to avoid too many exp evaluations (reuse the result from the
179 : * previous grid point):
180 : * exp(-a*(x+d)**2) = exp(-a*x**2) * exp(-2*a*x*d) * exp(-a*d**2)
181 : * exp(-2*a*(x+d)*d) = exp(-2*a*x*d) * exp(-2*a*d**2)
182 : */
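/* In the loops below, t_exp_min_1 / t_exp_plus_1 track exp(-a*(x+d)**2) and
 * t_exp_min_2 / t_exp_plus_2 track exp(-2*a*(x+d)*d), so advancing by one
 * grid point costs a few multiplications instead of an exp() call. */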
183 :
184 : /* compute the exponential recursively and store the polynomial prefactors
185 : * as well */
186 2162340 : for (int ig = 0; ig >= xmin; ig--) {
187 2001333 : const double rpg = ig * dr - roffset;
188 2001333 : t_exp_min_1 *= t_exp_min_2 * t_exp_1;
189 2001333 : t_exp_min_2 *= t_exp_2;
190 2001333 : double pg = t_exp_min_1;
191 2001333 : idx2(pol, 0, pol_offset + ig - xmin) = pg;
192 2001333 : if (lp > 0)
193 1273689 : idx2(pol, 1, pol_offset + ig - xmin) = rpg;
194 : }
195 :
196 2078682 : for (int ig = 1; ig <= xmax; ig++) {
197 1917675 : const double rpg = ig * dr - roffset;
198 1917675 : t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
199 1917675 : t_exp_plus_2 *= t_exp_2;
200 1917675 : double pg = t_exp_plus_1;
201 1917675 : idx2(pol, 0, pol_offset + ig - xmin) = pg;
202 1917675 : if (lp > 0)
203 1219605 : idx2(pol, 1, pol_offset + ig - xmin) = rpg;
204 : }
205 :
206 : /* compute the remaining powers using previously computed stuff */
207 161007 : if (lp >= 2) {
208 56541 : double *__restrict__ poly = &idx2(pol, 1, 0);
209 56541 : double *__restrict__ src1 = &idx2(pol, 0, 0);
210 56541 : double *__restrict__ dst = &idx2(pol, 2, 0);
211 : // #pragma omp simd linear(dst, src1, poly) simdlen(8)
212 230814 : GRID_PRAGMA_SIMD((dst, src1, poly), 8)
213 56541 : for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++)
214 1405620 : dst[ig] = src1[ig] * poly[ig] * poly[ig];
215 : }
216 :
217 174273 : for (int icoef = 3; icoef <= lp; icoef++) {
218 13266 : double *__restrict__ poly = &idx2(pol, 1, 0);
219 13266 : double *__restrict__ src1 = &idx2(pol, icoef - 1, 0);
220 13266 : double *__restrict__ dst = &idx2(pol, icoef, 0);
221 13266 : GRID_PRAGMA_SIMD((dst, src1, poly), 8)
222 13266 : for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
223 372702 : dst[ig] = src1[ig] * poly[ig];
224 : }
225 : }
226 :
227 : //
228 161007 : if (lp > 0) {
229 : // The src variable cannot be declared const because that would
230 : // violate the OpenMP standard.
231 101487 : double *__restrict__ dst = &idx2(pol, 1, 0);
232 101487 : double *__restrict__ src = &idx2(pol, 0, 0);
233 365487 : GRID_PRAGMA_SIMD((dst, src), 8)
234 101487 : for (int ig = 0; ig < (xmax - xmin + 1 + pol_offset); ig++) {
235 2493294 : dst[ig] *= src[ig];
236 : }
237 : }
238 : }
239 264000 : }
240 :
241 0 : void apply_sphere_cutoff_ortho(struct collocation_integration_ *const handler,
242 : const double disr_radius, const int cmax,
243 : const int *const lb_cube,
244 : const int *const cube_center) {
245 : // a mapping so that index ig corresponds to the right grid point
246 0 : int **map = handler->map;
247 0 : map[1] = map[0] + 2 * cmax + 1;
248 0 : map[2] = map[1] + 2 * cmax + 1;
249 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
250 :
251 0 : for (int i = 0; i < 3; i++) {
252 0 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
253 0 : map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
254 0 : handler->grid.lower_corner[i],
255 : handler->grid.full_size[i]);
256 : }
257 : }
258 :
259 0 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
260 0 : .xmax = handler->grid.window_size[0]};
261 0 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
262 0 : .xmax = handler->grid.window_size[1]};
263 0 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
264 0 : .xmax = handler->grid.window_size[2]};
265 :
266 0 : for (int kg = 0; kg < handler->cube.size[0]; kg++) {
267 0 : const int k = map[0][kg];
268 0 : const int kd =
269 0 : (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
270 0 : const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
271 0 : const double kremain = disr_radius * disr_radius - kr * kr;
272 :
273 0 : if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
274 :
275 0 : const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
276 0 : for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
277 0 : const int j = map[1][jg - lb_cube[1]];
278 0 : const double jr = ((2 * jg - 1) >> 1) *
279 0 : handler->dh[1][1]; // distance from center in a.u.
280 0 : const double jremain = kremain - jr * jr;
281 :
282 0 : if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
283 0 : const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
284 0 : const int xmax = 1 - xmin;
285 :
286 : // printf("xmin %d, xmax %d\n", xmin, xmax);
287 0 : for (int x = xmin - lb_cube[2];
288 0 : x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
289 0 : const int x1 = map[2][x];
290 :
291 0 : if (!is_point_in_interval(x1, xwindow))
292 0 : continue;
293 :
294 0 : int lower_corner[3] = {k, j, x1};
295 0 : int upper_corner[3] = {k + 1, j + 1, x1 + 1};
296 :
297 0 : compute_interval(map[2], handler->grid.full_size[2],
298 : handler->grid.size[2], xmax - lb_cube[2], x1, &x,
299 : lower_corner + 2, upper_corner + 2, xwindow);
300 :
301 0 : if (upper_corner[2] - lower_corner[2]) {
302 0 : const int position1[3] = {kg, jg - lb_cube[1], x};
303 :
304 0 : double *restrict dst = &idx3(handler->grid, lower_corner[0],
305 : lower_corner[1], lower_corner[2]);
306 0 : double *restrict src = &idx3(handler->cube, position1[0],
307 : position1[1], position1[2]);
308 :
309 0 : const int sizex = upper_corner[2] - lower_corner[2];
310 0 : GRID_PRAGMA_SIMD((dst, src), 8)
311 0 : for (int x = 0; x < sizex; x++) {
312 0 : dst[x] += src[x];
313 : }
314 : }
315 :
316 0 : if (handler->grid.size[2] == handler->grid.full_size[2])
317 0 : update_loop_index(handler->grid.full_size[2], x1, &x);
318 : else
319 0 : x += upper_corner[2] - lower_corner[2] - 1;
320 : }
321 : }
322 : }
323 : }
324 : }
325 0 : }
326 :
327 0 : void apply_spherical_cutoff_generic(
328 : struct collocation_integration_ *const handler, const double disr_radius,
329 : const int cmax, const int *const lb_cube, const int *const ub_cube,
330 : const double *const roffset, const int *const cube_center) {
331 :
332 0 : const double a = handler->dh[0][0] * handler->dh[0][0] +
333 0 : handler->dh[0][1] * handler->dh[0][1] +
334 0 : handler->dh[0][2] * handler->dh[0][2];
335 0 : const double a_inv = 1.0 / a;
336 : // a mapping so that index ig corresponds to the right grid point
337 0 : int **map = handler->map;
338 0 : map[1] = map[0] + 2 * cmax + 1;
339 0 : map[2] = map[1] + 2 * cmax + 1;
340 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
341 :
342 0 : for (int i = 0; i < 3; i++) {
343 0 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
344 0 : map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
345 0 : handler->grid.lower_corner[i],
346 : handler->grid.full_size[i]);
347 : }
348 : }
349 :
350 0 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
351 0 : .xmax = handler->grid.window_size[0]};
352 0 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
353 0 : .xmax = handler->grid.window_size[1]};
354 0 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
355 0 : .xmax = handler->grid.window_size[2]};
356 :
357 0 : for (int k = 0; k < handler->cube.size[0]; k++) {
358 0 : const int iz = map[0][k];
359 :
360 0 : if (!is_point_in_interval(iz, zwindow))
361 0 : continue;
362 :
363 0 : const double tz = (k + lb_cube[0] - roffset[0]);
364 0 : const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
365 0 : tz * handler->dh[2][2]};
366 :
367 0 : for (int j = 0; j < handler->cube.size[1]; j++) {
368 0 : const int iy = map[1][j];
369 :
370 0 : if (!is_point_in_interval(iy, ywindow))
371 0 : continue;
372 :
373 0 : const double ty = (j - roffset[1] + lb_cube[1]);
374 0 : const double y[3] = {z[0] + ty * handler->dh[1][0],
375 0 : z[1] + ty * handler->dh[1][1],
376 0 : z[2] + ty * handler->dh[1][2]};
377 :
378 : /* Sqrt[(-2 x1 \[Alpha] - 2 y1 \[Beta] - 2 z1 \[Gamma])^2 -
379 : * 4 (x1^2 + y1^2 + z1^2) (\[Alpha]^2 + \[Beta]^2 + \[Gamma]^2)] */
381 :
382 0 : const double b =
383 0 : -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
384 0 : handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
385 0 : handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
386 :
387 0 : const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
388 0 : (roffset[2] * handler->dh[0][0] - y[0]) +
389 0 : (roffset[2] * handler->dh[0][1] - y[1]) *
390 0 : (roffset[2] * handler->dh[0][1] - y[1]) +
391 0 : (roffset[2] * handler->dh[0][2] - y[2]) *
392 : (roffset[2] * handler->dh[0][2] - y[2]) -
393 0 : disr_radius * disr_radius;
394 :
395 0 : double delta = b * b - 4.0 * a * c;
396 :
397 0 : if (delta < 0.0)
398 0 : continue;
399 :
400 0 : delta = sqrt(delta);
401 :
402 0 : const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
403 0 : const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
404 :
405 0 : int lower_corner[3] = {iz, iy, xmin};
406 0 : int upper_corner[3] = {iz + 1, iy + 1, xmin};
407 :
408 0 : for (int x = xmin - lb_cube[2];
409 0 : x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
410 0 : const int x1 = map[2][x];
411 :
412 0 : if (!is_point_in_interval(x1, xwindow))
413 0 : continue;
414 :
415 0 : compute_interval(map[2], handler->grid.full_size[2],
416 : handler->grid.size[2], xmax - lb_cube[2], x1, &x,
417 : lower_corner + 2, upper_corner + 2, xwindow);
418 :
419 0 : if (upper_corner[2] - lower_corner[2]) {
420 0 : const int position1[3] = {k, j, x};
421 :
422 : /* the function will internally take care of the local vs global
423 : * grid */
424 :
425 0 : double *__restrict__ dst = &idx3(handler->grid, lower_corner[0],
426 : lower_corner[1], lower_corner[2]);
427 0 : double *__restrict__ src =
428 0 : &idx3(handler->cube, position1[0], position1[1], position1[2]);
429 :
430 0 : const int sizex = upper_corner[2] - lower_corner[2];
431 0 : GRID_PRAGMA_SIMD((dst, src), 8)
432 0 : for (int x = 0; x < sizex; x++) {
433 0 : dst[x] += src[x];
434 : }
435 :
436 0 : if (handler->grid.size[2] == handler->grid.full_size[2]) // x direction, consistent with apply_sphere_cutoff_ortho
437 0 : update_loop_index(handler->grid.full_size[2], x1, &x);
438 : else
439 0 : x += upper_corner[2] - lower_corner[2] - 1;
440 : }
441 : }
442 : }
443 : }
444 0 : }
445 :
446 10560 : void collocate_l0(double *scratch, const double alpha, const bool orthogonal_xy,
447 : const struct tensor_ *exp_xy,
448 : const struct tensor_ *p_alpha_beta_reduced_,
449 : struct tensor_ *cube) {
450 10560 : const double *__restrict pz =
451 : &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k index */
452 10560 : const double *__restrict py =
453 10560 : &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j index */
454 10560 : const double *__restrict px =
455 10560 : &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i index */
456 :
457 10560 : memset(&idx3(cube[0], 0, 0, 0), 0, sizeof(double) * cube->alloc_size_);
458 10560 : memset(scratch, 0, sizeof(double) * cube->size[1] * cube->ld_);
459 :
460 10560 : cblas_dger(CblasRowMajor, cube->size[1], cube->size[2], alpha, py, 1, px, 1,
461 : scratch, cube->ld_);
462 :
463 10560 : if (exp_xy && !orthogonal_xy) {
464 0 : for (int y = 0; y < cube->size[1]; y++) {
465 0 : const double *__restrict src = &idx2(exp_xy[0], y, 0);
466 0 : double *__restrict dst = &scratch[y * cube->ld_];
467 : // #pragma omp simd linear(dst, src) simdlen(8)
468 0 : GRID_PRAGMA_SIMD((dst, src), 8)
469 0 : for (int x = 0; x < cube->size[2]; x++) {
470 0 : dst[x] *= src[x];
471 : }
472 : }
473 : }
474 :
475 10560 : cblas_dger(CblasRowMajor, cube->size[0], cube->size[1] * cube->ld_, 1.0, pz,
476 10560 : 1, scratch, 1, &idx3(cube[0], 0, 0, 0), cube->size[1] * cube->ld_);
477 10560 : }
478 :
479 : /* compute the following operation (variant) it is a tensor contraction
480 :
481 : V_{kji} = \sum_{\alpha\beta\gamma}
482 : C_{\alpha\gamma\beta} T_{2,\alpha,i} T_{1,\beta,j} T_{0,\gamma,k}
483 :
484 : */
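
/* A naive reference for this contraction (a hedged sketch for validation
 * only; tensor_reduction_reference is a hypothetical helper, not part of
 * CP2K, and ignores the overall scale factor alpha threaded through the
 * dgemms of the production implementation below): */
static inline void
tensor_reduction_reference(const struct tensor_ *co,
                           const struct tensor_ *p_alpha_beta_reduced_,
                           struct tensor_ *cube) {
  for (int k = 0; k < cube->size[0]; k++)
    for (int j = 0; j < cube->size[1]; j++)
      for (int i = 0; i < cube->size[2]; i++) {
        double acc = 0.0;
        for (int a = 0; a < co->size[0]; a++)     /* alpha */
          for (int g = 0; g < co->size[1]; g++)   /* gamma */
            for (int b = 0; b < co->size[2]; b++) /* beta */
              acc += idx3(co[0], a, g, b) *
                     idx3(p_alpha_beta_reduced_[0], 2, a, i) *
                     idx3(p_alpha_beta_reduced_[0], 1, b, j) *
                     idx3(p_alpha_beta_reduced_[0], 0, g, k);
        idx3(cube[0], k, j, i) = acc;
      }
}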
485 78720 : void tensor_reduction_for_collocate_integrate(
486 : double *scratch, const double alpha, const bool *const orthogonal,
487 : const struct tensor_ *Exp, const struct tensor_ *co,
488 : const struct tensor_ *p_alpha_beta_reduced_, struct tensor_ *cube) {
489 78720 : if (co->size[0] > 1) {
490 68160 : dgemm_params m1, m2, m3;
491 :
492 68160 : memset(&m1, 0, sizeof(dgemm_params));
493 68160 : memset(&m2, 0, sizeof(dgemm_params));
494 68160 : memset(&m3, 0, sizeof(dgemm_params));
495 68160 : tensor T;
496 68160 : tensor W;
497 :
498 68160 : double *__restrict const pz =
499 : &idx3(p_alpha_beta_reduced_[0], 0, 0, 0); /* k index */
500 68160 : double *__restrict const py =
501 68160 : &idx3(p_alpha_beta_reduced_[0], 1, 0, 0); /* j index */
502 68160 : double *__restrict const px =
503 68160 : &idx3(p_alpha_beta_reduced_[0], 2, 0, 0); /* i index */
504 :
505 68160 : initialize_tensor_3(&T, co->size[0] /* alpha */, co->size[1] /* gamma */,
506 : cube->size[1] /* j */);
507 68160 : initialize_tensor_3(&W, co->size[1] /* gamma */, cube->size[1] /* j */,
508 : cube->size[2] /* i */);
509 :
510 68160 : T.data = scratch;
511 68160 : W.data = scratch + T.alloc_size_;
512 : /* WARNING: we are in row-major layout. cblas allows it, and it is more
513 : * natural to read left to right than top to bottom.
514 : *
515 : * We first compute T_{\alpha,\gamma,j} = \sum_\beta C_{\alpha\gamma\beta}
516 : * Y_{\beta, j}.
517 : *
518 : * Keep in mind that Y_{\beta, j} = p_alpha_beta_reduced(1, \beta, j)
519 : * and that the order of indices matters: the last index is the
520 : * fastest one. The whole step can be done with one dgemm.
521 : */
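/* Loop equivalent of m1 (illustration): T[a][g][j] = sum_b co[a][g][b] *
 * py[b][j], the dgemm row index being the flattened pair (a, g). */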
522 :
523 68160 : m1.op1 = 'N';
524 68160 : m1.op2 = 'N';
525 68160 : m1.alpha = alpha;
526 68160 : m1.beta = 0.0;
527 68160 : m1.m = co->size[0] * co->size[1]; /* alpha gamma */
528 68160 : m1.n = cube->size[1]; /* j */
529 68160 : m1.k = co->size[2]; /* beta */
530 68160 : m1.a = co->data; // Coef_{alpha,gamma,beta} Coef_xzy
531 68160 : m1.lda = co->ld_;
532 68160 : m1.b = py; // Y_{beta, j} = p_alpha_beta_reduced(1, beta, j)
533 68160 : m1.ldb = p_alpha_beta_reduced_->ld_;
534 68160 : m1.c = T.data; // T_{\alpha, \gamma, j} = T(alpha, gamma, j)
535 68160 : m1.ldc = T.ld_;
536 68160 : m1.use_libxsmm = true;
537 : /*
538 : * the next step is a reduction along the alpha index.
539 : *
540 : * We compute then
541 : *
542 : * W_{gamma, j, i} = sum_{\alpha} T_{\gamma, j, alpha} X_{\alpha, i}
543 : *
544 : * which means we need to transpose T_{\alpha, \gamma, j} to get
545 : * T_{\gamma, j, \alpha}. Fortunately we can do it while doing the
546 : * matrix - matrix multiplication
547 : */
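/* Loop equivalent of m2 (illustration): W[g][j][i] = sum_a T[a][g][j] *
 * px[a][i]; op1 = 'T' with lda = T.ld_ * co->size[1] realizes the
 * transposition of the alpha index on the fly. */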
548 :
549 68160 : m2.op1 = 'T';
550 68160 : m2.op2 = 'N';
551 68160 : m2.alpha = 1.0;
552 68160 : m2.beta = 0.0;
553 68160 : m2.m = cube->size[1] * co->size[1]; // (\gamma j) direction
554 68160 : m2.n = cube->size[2]; // i
555 68160 : m2.k = co->size[0]; // alpha
556 68160 : m2.a = T.data; // T_{\alpha, \gamma, j}
557 68160 : m2.lda = T.ld_ * co->size[1];
558 68160 : m2.b = px; // X_{alpha, i} = p_alpha_beta_reduced(0, alpha, i)
559 68160 : m2.ldb = p_alpha_beta_reduced_->ld_;
560 68160 : m2.c = W.data; // W_{\gamma, j, i}
561 68160 : m2.ldc = W.ld_;
562 68160 : m2.use_libxsmm = true;
563 : /* the final step is again a reduction, along the gamma index. It can
564 : * again be done with one dgemm. The operation is simply
565 : *
566 : * Cube_{k, j, i} = \sum_{\gamma} Z_{k, \gamma} W_{\gamma, j, i}
567 : *
568 : * which means we need to transpose Z_{\gamma, k}.
569 : */
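/* Loop equivalent of m3 (illustration): cube[k][j][i] = sum_g pz[g][k] *
 * W[g][j][i]; pz is stored as Z_{\gamma, k}, hence op1 = 'T'. */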
570 :
571 68160 : m3.op1 = 'T';
572 68160 : m3.op2 = 'N';
573 68160 : m3.alpha = alpha;
574 68160 : m3.beta = 0.0;
575 68160 : m3.m = cube->size[0]; // Z_{k \gamma}
576 68160 : m3.n = cube->size[1] * cube->size[2]; // (ji) direction
577 68160 : m3.k = co->size[1]; // \gamma
578 68160 : m3.a = pz; // Z_{\gamma, k} = p_alpha_beta_reduced(0, gamma, k)
579 68160 : m3.lda = p_alpha_beta_reduced_->ld_;
580 68160 : m3.b = &idx3(W, 0, 0, 0); // W_{\gamma, j, i}
581 68160 : m3.ldb = W.size[1] * W.ld_;
582 68160 : m3.c = &idx3(cube[0], 0, 0, 0); // cube_{kji}
583 68160 : m3.ldc = cube->ld_ * cube->size[1];
584 68160 : m3.use_libxsmm = true;
585 68160 : dgemm_simplified(&m1);
586 68160 : dgemm_simplified(&m2);
587 :
588 : // apply the non-orthorhombic corrections in the xy plane
589 68160 : if (Exp && !orthogonal[2]) {
590 3112 : tensor exp_xy;
591 3112 : initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
592 3112 : exp_xy.data = &idx3(Exp[0], 2, 0, 0);
593 3112 : apply_non_orthorombic_corrections_xy_blocked(&exp_xy, &W);
594 : }
595 :
596 68160 : dgemm_simplified(&m3);
597 : } else {
598 10560 : if (Exp && !orthogonal[2]) {
599 0 : tensor exp_xy;
600 0 : initialize_tensor_2(&exp_xy, Exp->size[1], Exp->size[2]);
601 :
602 0 : exp_xy.data = &idx3(Exp[0], 2, 0, 0);
603 0 : collocate_l0(scratch, co->data[0] * alpha, orthogonal[2], &exp_xy,
604 : p_alpha_beta_reduced_, cube);
605 : } else {
606 10560 : collocate_l0(scratch, co->data[0] * alpha, true, NULL,
607 : p_alpha_beta_reduced_, cube);
608 : }
609 : }
610 :
611 78720 : if (Exp == NULL)
612 : return;
613 :
614 44389 : if (Exp && (!orthogonal[0] && !orthogonal[1])) {
615 0 : tensor exp_yz;
616 0 : tensor exp_xz;
617 0 : initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
618 0 : initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
619 0 : exp_xz.data = &idx3(Exp[0], 0, 0, 0);
620 0 : exp_yz.data = &idx3(Exp[0], 1, 0, 0);
621 0 : apply_non_orthorombic_corrections_xz_yz_blocked(&exp_xz, &exp_yz, cube);
622 0 : return;
623 : }
624 :
625 44389 : if (Exp && (!orthogonal[0] || !orthogonal[1])) {
626 20163 : if (!orthogonal[0]) {
627 20163 : tensor exp_xz;
628 20163 : initialize_tensor_2(&exp_xz, Exp->size[1], Exp->size[2]);
629 20163 : exp_xz.data = &idx3(Exp[0], 0, 0, 0);
630 20163 : apply_non_orthorombic_corrections_xz_blocked(&exp_xz, cube);
631 : }
632 :
633 20163 : if (!orthogonal[1]) {
634 0 : tensor exp_yz;
635 0 : initialize_tensor_2(&exp_yz, Exp->size[1], Exp->size[2]);
636 0 : exp_yz.data = &idx3(Exp[0], 1, 0, 0);
637 0 : apply_non_orthorombic_corrections_yz_blocked(&exp_yz, cube);
638 : }
639 : }
640 :
641 : return;
642 : }
643 :
644 : /* It is a sub-optimal version of the mapping for a cubic cutoff, but it is
645 : * very general and does not depend on the orthorhombic nature of the grid. For
646 : * orthorhombic cases, it is faster to apply PBC directly to the polynomials. */
647 :
648 44389 : void apply_mapping_cubic(struct collocation_integration_ *handler,
649 : const int cmax, const int *const lower_boundaries_cube,
650 : const int *const cube_center) {
651 :
652 : // a mapping so that index ig corresponds to the right grid point
653 44389 : int **map = handler->map;
654 44389 : map[1] = map[0] + 2 * cmax + 1;
655 44389 : map[2] = map[1] + 2 * cmax + 1;
656 :
657 177556 : for (int i = 0; i < 3; i++) {
658 3385300 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
659 3252133 : map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
660 3252133 : handler->grid.lower_corner[i],
661 : handler->grid.full_size[i]);
662 : }
663 : }
664 :
665 44389 : int lower_corner[3];
666 44389 : int upper_corner[3];
667 :
668 44389 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
669 44389 : .xmax = handler->grid.window_size[0]};
670 44389 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
671 44389 : .xmax = handler->grid.window_size[1]};
672 44389 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
673 44389 : .xmax = handler->grid.window_size[2]};
674 :
675 : /* this code makes a decomposition of the cube such that we can add blocks
676 : * of data in a vectorized way. */
677 :
678 : /* the decomposition unfortunately depends on the way the grid is split over
679 : * mpi ranks. If each mpi rank has the full grid, then it is simple. A 1D
680 : * example explains it best (see the sketch after this comment). We have an
681 : * interval [x1, x1 + cube_size - 1] (and a second index x in
682 : * [0, cube_size - 1]) and a grid that goes from [0 .. grid_size - 1].
683 : *
684 : * We start from x1 and compute the largest interval [x1 .. x1 + diff] that
685 : * fits into [0 .. grid_size - 1]. The difference diff is simply
686 : * min(grid_size - x1, cube_size - x). Then we add the result in a vectorized
687 : * fashion. We iterate the process, shrinking the interval
688 : * [x1, x1 + cube_size - 1] until it is empty. */
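/* A 1D sketch of the decomposition described above (illustration only,
 * assuming the full periodic grid is present on this rank):
 *
 *   x = 0; x1 = x1_start;
 *   while (x < cube_size) {
 *     const int diff = imin(grid_size - x1, cube_size - x);
 *     for (int i = 0; i < diff; i++)
 *       grid[x1 + i] += cube[x + i];
 *     x += diff;
 *     x1 = 0; // wrap around the periodic boundary
 *   }
 */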
689 :
690 130455 : for (int z = 0; (z < handler->cube.size[0]); z++) {
691 86066 : const int z1 = map[0][z];
692 :
693 172132 : if (!is_point_in_interval(z1, zwindow))
694 0 : continue;
695 :
696 86066 : compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
697 : handler->cube.size[0], z1, &z, lower_corner, upper_corner,
698 : zwindow);
699 :
700 : /* We have a full plane. */
701 86066 : if (upper_corner[0] - lower_corner[0]) {
702 261386 : for (int y = 0; y < handler->cube.size[1]; y++) {
703 175320 : const int y1 = map[1][y];
704 :
705 : // this check is completely irrelevant when running without MPI.
706 350640 : if (!is_point_in_interval(y1, ywindow))
707 0 : continue;
708 :
709 175320 : compute_interval(map[1], handler->grid.full_size[1],
710 : handler->grid.size[1], handler->cube.size[1], y1, &y,
711 : lower_corner + 1, upper_corner + 1, ywindow);
712 :
713 175320 : if (upper_corner[1] - lower_corner[1]) {
714 570824 : for (int x = 0; x < handler->cube.size[2]; x++) {
715 395504 : const int x1 = map[2][x];
716 :
717 791008 : if (!is_point_in_interval(x1, xwindow))
718 0 : continue;
719 :
720 395504 : compute_interval(map[2], handler->grid.full_size[2],
721 : handler->grid.size[2], handler->cube.size[2], x1,
722 : &x, lower_corner + 2, upper_corner + 2, xwindow);
723 :
724 395504 : if (upper_corner[2] - lower_corner[2]) {
725 395504 : const int position1[3] = {z, y, x};
726 : /* the function will internally take care of the local vs global
727 : * grid */
728 395504 : add_sub_grid(
729 : lower_corner, // lower corner of the portion of cube (in the
730 : // full grid)
731 : upper_corner, // upper corner of the portion of cube (in the
732 : // full grid)
733 : position1, // starting position subblock inside the cube
734 395504 : &handler->cube, // the cube to extract data from
735 : &handler->grid); // the grid to add data from
736 395504 : if (handler->grid.size[2] == handler->grid.full_size[2])
737 395504 : update_loop_index(handler->grid.full_size[2], x1, &x);
738 : else
739 0 : x += upper_corner[2] - lower_corner[2] - 1;
740 : }
741 : }
742 175320 : if (handler->grid.size[1] == handler->grid.full_size[1])
743 175320 : update_loop_index(handler->grid.full_size[1], y1, &y);
744 : else
745 0 : y += upper_corner[1] - lower_corner[1] - 1;
746 : }
747 : }
748 86066 : if (handler->grid.size[0] == handler->grid.full_size[0])
749 86066 : update_loop_index(handler->grid.full_size[0], z1, &z);
750 : else
751 0 : z += upper_corner[0] - lower_corner[0] - 1;
752 : }
753 : }
754 44389 : }
755 :
756 : // *****************************************************************************
757 44389 : void grid_collocate(collocation_integration *const handler,
758 : const bool use_ortho, const double zetp, const double rp[3],
759 : const double radius) {
760 : // *** position of the gaussian product
761 : //
762 : // this is the actual definition of the position on the grid
763 : // i.e. a point rp(:) gets here grid coordinates
764 : // MODULO(rp(:)/dr(:),npts(:))+1
765 : // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid ((1,1,1) on
766 : // grid)
767 :
768 : // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
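/* Equivalent C sketch (index conventions are an assumption here;
 * compute_cube_properties below is the authoritative implementation):
 *
 *   for (int i = 0; i < 3; i++)
 *     cubecenter[i] = (int)floor(dh_inv[i][0] * rp[0] +
 *                                dh_inv[i][1] * rp[1] +
 *                                dh_inv[i][2] * rp[2]);
 */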
769 44389 : int cubecenter[3];
770 44389 : int cube_size[3];
771 44389 : int lb_cube[3], ub_cube[3];
772 44389 : int pol_offset[3] = {0, 0, 0};
773 44389 : double roffset[3];
774 44389 : double disr_radius;
775 : /* cube : grid containing pointlike product between polynomials
776 : *
777 : * pol : grid containing the polynomials in all three directions
778 : *
779 : * pol_folded : grid containing the polynomials after folding for periodic
780 : * boundaries conditions
781 : */
782 :
783 : /* setting up the cube parameters */
784 88778 : int cmax = compute_cube_properties(
785 44389 : use_ortho, radius, (const double(*)[3])handler->dh,
786 44389 : (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
787 : cubecenter, lb_cube, ub_cube, cube_size);
788 :
789 : /* initialize the multidimensional array containing the polynomials */
790 44389 : initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
791 44389 : realloc_tensor(&handler->pol);
792 44389 : memset(handler->pol.data, 0, sizeof(double) * handler->pol.alloc_size_);
793 :
794 : /* compute the polynomials */
795 :
796 : // WARNING : do not reverse the order in pol otherwise you will have to
797 : // reverse the order in collocate_dgemm as well.
798 :
799 44389 : if (use_ortho) {
800 21114 : grid_fill_pol_dgemm(false, handler->dh[0][0], roffset[2], pol_offset[2],
801 21114 : lb_cube[2], ub_cube[2], handler->coef.size[2] - 1, cmax,
802 21114 : zetp, &idx3(handler->pol, 2, 0, 0)); /* i index */
803 21114 : grid_fill_pol_dgemm(false, handler->dh[1][1], roffset[1], pol_offset[1],
804 21114 : lb_cube[1], ub_cube[1], handler->coef.size[1] - 1, cmax,
805 21114 : zetp, &idx3(handler->pol, 1, 0, 0)); /* j index */
806 21114 : grid_fill_pol_dgemm(false, handler->dh[2][2], roffset[0], pol_offset[0],
807 21114 : lb_cube[0], ub_cube[0], handler->coef.size[0] - 1, cmax,
808 : zetp, &idx3(handler->pol, 0, 0, 0)); /* k index */
809 : } else {
810 23275 : grid_fill_pol_dgemm(false, 1.0, roffset[0], pol_offset[2], lb_cube[0],
811 23275 : ub_cube[0], handler->coef.size[0] - 1, cmax,
812 23275 : zetp * handler->dx[0],
813 : &idx3(handler->pol, 0, 0, 0)); /* k index */
814 23275 : grid_fill_pol_dgemm(false, 1.0, roffset[1], pol_offset[1], lb_cube[1],
815 23275 : ub_cube[1], handler->coef.size[1] - 1, cmax,
816 23275 : zetp * handler->dx[1],
817 23275 : &idx3(handler->pol, 1, 0, 0)); /* j index */
818 23275 : grid_fill_pol_dgemm(false, 1.0, roffset[2], pol_offset[0], lb_cube[2],
819 23275 : ub_cube[2], handler->coef.size[2] - 1, cmax,
820 23275 : zetp * handler->dx[2],
821 23275 : &idx3(handler->pol, 2, 0, 0)); /* i index */
822 :
823 23275 : calculate_non_orthorombic_corrections_tensor(
824 : zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
825 23275 : handler->orthogonal, &handler->Exp);
826 :
827 : /* Use a slightly modified version of Ole's code */
828 23275 : grid_transform_coef_xzy_to_ikj((const double(*)[3])handler->dh,
829 23275 : &handler->coef);
830 : }
831 :
832 : /* allocate memory for the polynomial and the cube */
833 :
834 44389 : initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
835 :
836 44389 : realloc_tensor(&handler->cube);
837 44389 : initialize_W_and_T(handler, &handler->cube, &handler->coef);
838 :
839 44389 : tensor_reduction_for_collocate_integrate(
840 44389 : handler->scratch, // pointer to scratch memory
841 44389 : 1.0, handler->orthogonal, &handler->Exp, &handler->coef, &handler->pol,
842 : &handler->cube);
843 :
844 44389 : if (handler->apply_cutoff) {
845 0 : if (use_ortho) {
846 0 : apply_sphere_cutoff_ortho(handler, disr_radius, cmax, lb_cube,
847 : cubecenter);
848 : } else {
849 0 : apply_spherical_cutoff_generic(handler, radius, cmax, lb_cube, ub_cube,
850 : roffset, cubecenter);
851 : }
852 0 : return;
853 : }
854 44389 : apply_mapping_cubic(handler, cmax, lb_cube, cubecenter);
855 : }
856 :
857 : //******************************************************************************
858 0 : void grid_dgemm_collocate_pgf_product(
859 : const bool use_ortho, const int border_mask, const enum grid_func func,
860 : const int la_max, const int la_min, const int lb_max, const int lb_min,
861 : const double zeta, const double zetb, const double rscale,
862 : const double dh[3][3], const double dh_inv[3][3], const double ra[3],
863 : const double rab[3], const int grid_global_size[3],
864 : const int grid_local_size[3], const int shift_local[3],
865 : const int border_width[3], const double radius, const int o1, const int o2,
866 0 : const int n1, const int n2, const double pab_[n2][n1],
867 : double *const grid_) {
868 0 : collocation_integration *handler = collocate_create_handle();
869 :
870 0 : tensor pab;
871 0 : initialize_tensor_2(&pab, n2, n1);
872 0 : alloc_tensor(&pab);
873 0 : memcpy(pab.data, pab_, sizeof(double) * n1 * n2);
874 : // Uncomment this to dump all tasks to file.
875 : // #define __GRID_DUMP_TASKS
876 0 : int offset[2] = {o1, o2};
877 :
878 0 : int lmax[2] = {la_max, lb_max};
879 0 : int lmin[2] = {la_min, lb_min};
880 :
881 0 : const double zetp = zeta + zetb;
882 0 : const double f = zetb / zetp;
883 0 : const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
884 0 : const double prefactor = rscale * exp(-zeta * f * rab2);
885 0 : const double zeta_pair[2] = {zeta, zetb};
886 0 : initialize_basis_vectors(handler, dh, dh_inv);
887 0 : verify_orthogonality((const double(*)[3])dh, handler->orthogonal);
888 :
889 0 : initialize_tensor_3(&(handler->grid), grid_local_size[2], grid_local_size[1],
890 : grid_local_size[0]);
891 :
892 0 : handler->grid.ld_ = grid_local_size[0];
893 0 : handler->grid.data = grid_;
894 :
895 0 : setup_global_grid_size(&handler->grid, (const int *)grid_global_size);
896 :
897 0 : initialize_tensor_3(&handler->grid, grid_local_size[2], grid_local_size[1],
898 : grid_local_size[0]);
899 0 : handler->grid.ld_ = grid_local_size[0];
900 :
901 0 : setup_grid_window(&handler->grid, shift_local, border_width, border_mask);
902 :
903 0 : double rp[3], rb[3];
904 0 : for (int i = 0; i < 3; i++) {
905 0 : rp[i] = ra[i] + f * rab[i];
906 0 : rb[i] = ra[i] + rab[i];
907 : }
908 :
909 0 : int lmin_diff[2], lmax_diff[2];
910 0 : grid_prepare_get_ldiffs_dgemm(func, lmin_diff, lmax_diff);
911 :
912 0 : int lmin_prep[2];
913 0 : int lmax_prep[2];
914 :
915 0 : lmin_prep[0] = imax(lmin[0] + lmin_diff[0], 0);
916 0 : lmin_prep[1] = imax(lmin[1] + lmin_diff[1], 0);
917 :
918 0 : lmax_prep[0] = lmax[0] + lmax_diff[0];
919 0 : lmax_prep[1] = lmax[1] + lmax_diff[1];
920 :
921 0 : const int n1_prep = ncoset(lmax_prep[0]);
922 0 : const int n2_prep = ncoset(lmax_prep[1]);
923 :
924 : /* I really do not like this. This will disappear */
925 0 : tensor pab_prep;
926 0 : initialize_tensor_2(&pab_prep, n2_prep, n1_prep);
927 0 : alloc_tensor(&pab_prep);
928 0 : memset(pab_prep.data, 0, pab_prep.alloc_size_ * sizeof(double));
929 :
930 0 : grid_prepare_pab_dgemm(func, offset, lmax, lmin, &zeta_pair[0], &pab,
931 : &pab_prep);
932 :
933 : // *** initialise the coefficient matrix, we transform the sum
934 : //
935 : // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
936 : // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-b_y)**lyb
937 : // (z-b_z)**lzb
938 : //
939 : // into
940 : //
941 : // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
942 : //
943 : // where p is center of the product gaussian, and lp = la_max + lb_max
944 : // (current implementation is l**7)
945 : //
946 :
947 : /* precautionary step: data is probably initialized to NULL when a new
948 : * tensor is initialized, and we want to keep the memory (a generous
949 : * amount is allocated already) */
950 :
951 0 : initialize_tensor_4(&(handler->alpha), 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
952 0 : lmax_prep[0] + lmax_prep[1] + 1);
953 0 : realloc_tensor(&(handler->alpha));
954 :
955 0 : const int lp = lmax_prep[0] + lmax_prep[1];
956 :
957 0 : initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
958 0 : realloc_tensor(&(handler->coef));
959 :
960 : // initially cp2k stores coef_xyz as coef[z][y][x]. This is fine, but here
961 : // they need to be stored as coef[x][z][y].
962 :
963 0 : grid_prepare_alpha_dgemm(ra, rb, rp, lmax_prep, &(handler->alpha));
964 :
965 : //
966 : // compute P_{lxp,lyp,lzp} given P_{lxa,lya,lza,lxb,lyb,lzb} and
967 : // alpha(ls,lxa,lxb,1) use a three step procedure we don't store zeros, so
968 : // counting is done using lxyz,lxy in order to have contiguous memory access
969 : // in collocate_fast.F
970 : //
971 :
972 : // coef[x][z][y]
973 0 : grid_compute_coefficients_dgemm(lmin_prep, lmax_prep, lp, prefactor,
974 : &(handler->alpha), &pab_prep,
975 : &(handler->coef));
976 :
977 0 : grid_collocate(handler, use_ortho, zetp, rp, radius);
978 :
979 0 : collocate_destroy_handle(handler);
980 0 : free(pab_prep.data);
981 0 : }
982 :
983 5040 : void extract_blocks(grid_context *const ctx, const _task *const task,
984 : const offload_buffer *pab_blocks, tensor *const work,
985 : tensor *const pab) {
986 5040 : const int iatom = task->iatom;
987 5040 : const int jatom = task->jatom;
988 5040 : const int iset = task->iset;
989 5040 : const int jset = task->jset;
990 5040 : const int ikind = ctx->atom_kinds[iatom];
991 5040 : const int jkind = ctx->atom_kinds[jatom];
992 5040 : const grid_basis_set *ibasis = ctx->basis_sets[ikind];
993 5040 : const grid_basis_set *jbasis = ctx->basis_sets[jkind];
994 :
995 5040 : const int block_num = task->block_num;
996 :
997 : // Locate the current matrix block within the buffer. This block
998 : // contains the weights of the gaussian pairs in the spherical
999 : // harmonic basis, but we do the computation in the cartesian
1000 : // harmonic basis, so we have to rotate the coefficients. This is
1001 : // nothing more than a basis change, done with two dgemms.
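// Schematically (a sketch, ignoring the iatom <= jatom transpose handling
// inside rotate_to_cartesian_harmonics):
// pab = jbasis->sphi^T * block_{ab} * ibasis->sphi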
1002 :
1003 5040 : const int block_offset = ctx->block_offsets[block_num]; // zero based
1004 5040 : double *const block = &pab_blocks->host_buffer[block_offset];
1005 :
1006 5040 : rotate_to_cartesian_harmonics(ibasis, jbasis, iatom, jatom, iset, jset, block,
1007 : work, pab);
1008 5040 : }
1009 :
1010 44389 : void compute_coefficients(grid_context *const ctx,
1011 : struct collocation_integration_ *handler,
1012 : const _task *const previous_task,
1013 : const _task *const task,
1014 : const offload_buffer *pab_blocks, tensor *const pab,
1015 : tensor *const work, tensor *const pab_prep) {
1016 : // Load the subblock from the buffer and decontract it into the Cartesian subblock pab.
1017 : // The previous pab can be reused when only ipgf or jpgf has changed.
1018 44389 : if (task->update_block_ || (previous_task == NULL)) {
1019 4482 : extract_blocks(ctx, task, pab_blocks, work, pab);
1020 : }
1021 :
1022 44389 : int lmin_prep[2];
1023 44389 : int lmax_prep[2];
1024 :
1025 44389 : lmin_prep[0] = imax(task->lmin[0] + handler->lmin_diff[0], 0);
1026 44389 : lmin_prep[1] = imax(task->lmin[1] + handler->lmin_diff[1], 0);
1027 :
1028 44389 : lmax_prep[0] = task->lmax[0] + handler->lmax_diff[0];
1029 44389 : lmax_prep[1] = task->lmax[1] + handler->lmax_diff[1];
1030 :
1031 44389 : const int n1_prep = ncoset(lmax_prep[0]);
1032 44389 : const int n2_prep = ncoset(lmax_prep[1]);
1033 :
1034 : /* we do not reallocate memory. We initialized the structure with the
1035 : * maximum lmax of the whole task list already.
1036 : */
1037 44389 : initialize_tensor_2(pab_prep, n2_prep, n1_prep);
1038 44389 : realloc_tensor(pab_prep);
1039 :
1040 44389 : grid_prepare_pab_dgemm(handler->func, task->offset, task->lmin, task->lmax,
1041 : &task->zeta[0], pab, pab_prep);
1042 :
1043 : // *** initialise the coefficient matrix, we transform the sum
1044 : //
1045 : // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
1046 : // (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb
1047 : // (y-b_y)**lyb (z-b_z)**lzb
1048 : //
1049 : // into
1050 : //
1051 : // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp
1052 : // (z-p_z)**lzp
1053 : //
1054 : // where p is center of the product gaussian, and lp = la_max + lb_max
1055 : // (current implementation is l**7)
1056 : //
1057 :
1058 : /* precautionary step: data is probably initialized to NULL when a new
1059 : * tensor is initialized, and we want to keep the memory (a generous
1060 : * amount is allocated already) */
1061 :
1062 44389 : initialize_tensor_4(&handler->alpha, 3, lmax_prep[1] + 1, lmax_prep[0] + 1,
1063 44389 : lmax_prep[0] + lmax_prep[1] + 1);
1064 44389 : realloc_tensor(&handler->alpha);
1065 :
1066 44389 : const int lp = lmax_prep[0] + lmax_prep[1];
1067 :
1068 44389 : initialize_tensor_3(&handler->coef, lp + 1, lp + 1, lp + 1);
1069 44389 : realloc_tensor(&handler->coef);
1070 :
1071 : // these two functions can be done with dgemm again....
1072 :
1073 44389 : grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax_prep,
1074 : &handler->alpha);
1075 :
1076 : // compute the coefficients after applying the function of interest
1077 : // coef[x][z][y]
1078 44389 : grid_compute_coefficients_dgemm(
1079 : lmin_prep, lmax_prep, lp,
1080 44389 : task->prefactor * ((task->iatom == task->jatom) ? 1.0 : 2.0),
1081 : &handler->alpha, pab_prep, &handler->coef);
1082 44389 : }
1083 :
1084 632 : void collocate_one_grid_level_dgemm(grid_context *const ctx,
1085 : const int *const border_width,
1086 : const int *const shift_local,
1087 : const enum grid_func func, const int level,
1088 : const offload_buffer *pab_blocks) {
1089 632 : tensor *const grid = &ctx->grid[level];
1090 : // Using default(shared) because with GCC 9 the behavior around const changed:
1091 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
1092 632 : #pragma omp parallel default(shared)
1093 : {
1094 : const int num_threads = omp_get_num_threads();
1095 : const int thread_id = omp_get_thread_num();
1096 :
1097 : tensor work, pab, pab_prep;
1098 :
1099 : struct collocation_integration_ *handler = ctx->handler[thread_id];
1100 :
1101 : handler->func = func;
1102 : grid_prepare_get_ldiffs_dgemm(handler->func, handler->lmin_diff,
1103 : handler->lmax_diff);
1104 :
1105 : handler->apply_cutoff = ctx->apply_cutoff;
1106 :
1107 : // Allocate pab matrix for re-use across tasks.
1108 : initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
1109 : alloc_tensor(&pab);
1110 :
1111 : initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
1112 : alloc_tensor(&work);
1113 :
1114 : initialize_tensor_2(&pab_prep, ctx->maxco, ctx->maxco);
1115 : alloc_tensor(&pab_prep);
1116 :
1117 : initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
1118 : (const double(*)[3])grid->dh_inv);
1119 :
1120 : /* setup the grid parameters, window parameters (if the grid is split), etc
1121 : */
1122 :
1123 : tensor_copy(&handler->grid, grid);
1124 :
1125 : for (int d = 0; d < 3; d++)
1126 : handler->orthogonal[d] = handler->grid.orthogonal[d];
1127 :
1128 : if ((thread_id == 0) || (num_threads == 1)) {
1129 : // thread id 0 directly stores the results in the final storage space
1130 : handler->grid.data = ctx->grid[level].data;
1131 : }
1132 :
1133 : if ((num_threads > 1) && (thread_id > 0)) {
1134 : handler->grid.data = ((double *)ctx->scratch) +
1135 : (thread_id - 1) * handler->grid.alloc_size_;
1136 : memset(handler->grid.data, 0, sizeof(double) * grid->alloc_size_);
1137 : }
1138 :
1139 : /* it is only useful when we split the list over multiple threads. The first
1140 : * iteration should load the block regardless of the status of
1141 : * task->update_block_ */
1142 : const _task *prev_task = NULL;
1143 : #pragma omp for schedule(static)
1144 : for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
1145 : // Define some convenient aliases.
1146 : const _task *task = &ctx->tasks[level][itask];
1147 :
1148 : if (task->level != level) {
1149 : printf("level %d, %d\n", task->level, level);
1150 : abort();
1151 : }
1152 : /* the grid is divided over several ranks or not periodic */
1153 : if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
1154 : (handler->grid.size[1] != handler->grid.full_size[1]) ||
1155 : (handler->grid.size[2] != handler->grid.full_size[2])) {
1156 : /* unfortunately the window where the gaussian should be added depends
1157 : * on the bonds. So I have to adjust the window all the time. */
1158 :
1159 : setup_grid_window(&handler->grid, shift_local, border_width,
1160 : task->border_mask);
1161 : }
1162 :
1163 : /* this is a three steps procedure. pab_blocks contains the coefficients
1164 : * of the operator in the spherical harmonic basis while we do computation
1165 : * in the cartesian harmonic basis.
1166 : *
1167 : * step 1 : rotate the coefficients from the harmonic to the cartesian
1168 : * basis
1169 :
1170 : * step 2 : extract the subblock and apply additional transformations
1171 : * corresponding to the spatial derivatives of the operator (this last
1172 : * step is not always needed)
1173 :
1174 : * step 3 : change from (x - x_1)^\alpha (x - x_2)^\beta to (x -
1175 : * x_{12})^k. It is a transformation which involves binomial
1176 : * coefficients.
1177 : *
1178 : * \f[ (x - x_1)^\alpha (x - x_2)^\beta = \sum_{k_1=0}^{\alpha}
1179 : * \sum_{k_2=0}^{\beta} \binom{\alpha}{k_1} \binom{\beta}{k_2}
1180 : * (x - x_{12})^{k_1 + k_2} (x_{12} - x_1)^{\alpha - k_1}
1181 : * (x_{12} - x_2)^{\beta - k_2} \f]
1182 : *
1183 : * step 1 is done only when necessary, the two remaining steps are done
1184 : * for each bond.
1185 : */
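/* Worked 1D example of step 3 (illustration, \alpha = \beta = 1): with
 * u = x - x_{12}, a_1 = x_{12} - x_1 and a_2 = x_{12} - x_2,
 *
 *   (x - x_1)(x - x_2) = (u + a_1)(u + a_2)
 *                      = u^2 + (a_1 + a_2) u + a_1 a_2,
 *
 * i.e. a single polynomial centered at the product center x_{12}. */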
1186 :
1187 : compute_coefficients(ctx, handler, prev_task, task, pab_blocks, &pab,
1188 : &work, &pab_prep);
1189 :
1190 : grid_collocate(handler, ctx->orthorhombic, task->zetp, task->rp,
1191 : task->radius);
1192 : prev_task = task;
1193 : }
1194 :
1195 : // Merge thread local grids into shared grid. Could be improved though....
1196 :
1197 : // thread 0 does nothing since the data are already placed in the final
1198 : // destination
1199 : if (num_threads > 1) {
1200 : if ((grid->alloc_size_ / (num_threads - 1)) >= 2) {
1201 : const int block_size = grid->alloc_size_ / (num_threads - 1) +
1202 : (grid->alloc_size_ % (num_threads - 1));
1203 :
1204 : for (int bk = 0; bk < num_threads; bk++) {
1205 : if (thread_id > 0) {
1206 : int bk_id = (bk + thread_id - 1) % num_threads;
1207 : int begin = bk_id * block_size;
1208 : int end = imin((bk_id + 1) * block_size, grid->alloc_size_);
1209 : cblas_daxpy(end - begin, 1.0, handler->grid.data + begin, 1,
1210 : grid->data + begin, 1);
1211 : }
1212 : #pragma omp barrier
1213 : }
1214 : }
1215 : } else {
1216 : #pragma omp critical
1217 : if (thread_id > 0)
1218 : cblas_daxpy(handler->grid.alloc_size_, 1.0,
1219 : &idx3(handler->grid, 0, 0, 0), 1, &idx3(grid[0], 0, 0, 0),
1220 : 1);
1221 : }
1222 : handler->grid.data = NULL;
1223 : free(pab.data);
1224 : free(pab_prep.data);
1225 : free(work.data);
1226 : }
1227 632 : }
1228 :
1229 : /*******************************************************************************
1230 : * \brief Collocate all tasks of a given list onto given grids.
1231 : * See grid_task_list.h for details.
1232 : ******************************************************************************/
1233 158 : void grid_dgemm_collocate_task_list(grid_dgemm_task_list *const ptr,
1234 : const enum grid_func func,
1235 : const int nlevels,
1236 : const offload_buffer *pab_blocks,
1237 : offload_buffer *grids[nlevels]) {
1238 :
1239 158 : grid_context *const ctx = (grid_context *)ptr;
1240 :
1241 158 : assert(ctx->checksum == ctx_checksum);
1242 :
1243 158 : const int max_threads = omp_get_max_threads();
1244 :
1245 158 : assert(ctx->nlevels == nlevels);
1246 :
1247 : // #pragma omp parallel for
1248 790 : for (int level = 0; level < ctx->nlevels; level++) {
1249 632 : const _layout *layout = &ctx->layouts[level];
1250 632 : set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
1251 632 : layout->npts_global, layout->npts_local,
1252 632 : layout->shift_local, layout->border_width, layout->dh,
1253 632 : layout->dh_inv, grids[level]);
1254 632 : memset(ctx->grid[level].data, 0,
1255 632 : sizeof(double) * ctx->grid[level].alloc_size_);
1256 : }
1257 :
1258 158 : if (ctx->scratch == NULL) {
1259 158 : int max_size = ctx->grid[0].alloc_size_;
1260 :
1261 : /* compute the size of the largest grid. It is used afterwards to allocate
1262 : * scratch memory for the grid on each omp thread */
1263 632 : for (int x = 1; x < nlevels; x++) {
1264 474 : max_size = imax(ctx->grid[x].alloc_size_, max_size);
1265 : }
1266 :
1267 158 : max_size = ((max_size / 4096) + (max_size % 4096 != 0)) * 4096;
1268 :
1269 : /* scratch is a void pointer !!!!! */
1270 158 : ctx->scratch =
1271 158 : grid_allocate_scratch(max_size * max_threads * sizeof(double));
1272 : }
1273 :
1274 790 : for (int level = 0; level < ctx->nlevels; level++) {
1275 632 : const _layout *layout = &ctx->layouts[level];
1276 632 : collocate_one_grid_level_dgemm(ctx, layout->border_width,
1277 632 : layout->shift_local, func, level,
1278 : pab_blocks);
1279 : }
1280 :
1281 158 : grid_free_scratch(ctx->scratch);
1282 158 : ctx->scratch = NULL;
1283 158 : }
|