Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <limits.h>
10 : #include <math.h>
11 : #include <stdbool.h>
12 : #include <stdio.h>
13 : #include <stdlib.h>
14 : #include <string.h>
15 : #include <unistd.h>
16 :
17 : #include <omp.h>
18 :
19 : #ifdef __LIBXSMM
20 : #include <libxsmm.h>
21 : #endif
22 :
23 : #include "../common/grid_common.h"
24 : #include "grid_dgemm_coefficients.h"
25 : #include "grid_dgemm_collocate.h"
26 : #include "grid_dgemm_collocation_integration.h"
27 : #include "grid_dgemm_non_orthorombic_corrections.h"
28 : #include "grid_dgemm_private_header.h"
29 : #include "grid_dgemm_tensor_local.h"
30 : #include "grid_dgemm_utils.h"
31 :
32 0 : void extract_cube_within_spherical_cutoff_ortho(
33 : struct collocation_integration_ *const handler, const double disr_radius,
34 : const int cmax, const int *const lb_cube, const int *const cube_center) {
35 : // a mapping so that index ig corresponds to the right grid point
36 0 : int **map = handler->map;
37 0 : map[1] = map[0] + 2 * cmax + 1;
38 0 : map[2] = map[1] + 2 * cmax + 1;
39 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
40 :
41 0 : for (int i = 0; i < 3; i++) {
42 0 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
43 0 : map[i][ig] = modulo(cube_center[i] + lb_cube[i] + ig -
44 0 : handler->grid.lower_corner[i],
45 : handler->grid.full_size[i]);
46 : }
47 : }
48 :
49 0 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
50 0 : .xmax = handler->grid.window_size[0]};
51 0 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
52 0 : .xmax = handler->grid.window_size[1]};
53 0 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
54 0 : .xmax = handler->grid.window_size[2]};
55 :
56 0 : for (int kg = 0; kg < handler->cube.size[0]; kg++) {
57 0 : const int k = map[0][kg];
58 0 : const int kd =
59 0 : (2 * (kg + lb_cube[0]) - 1) / 2; // distance from center in grid points
60 0 : const double kr = kd * handler->dh[2][2]; // distance from center in a.u.
61 0 : const double kremain = disr_radius * disr_radius - kr * kr;
62 :
63 0 : if ((kremain >= 0.0) && is_point_in_interval(k, zwindow)) {
64 :
65 0 : const int jgmin = ceil(-1e-8 - sqrt(kremain) * handler->dh_inv[1][1]);
66 0 : for (int jg = jgmin; jg <= (1 - jgmin); jg++) {
67 0 : const int j = map[1][jg - lb_cube[1]];
68 0 : const double jr = ((2 * jg - 1) >> 1) *
69 0 : handler->dh[1][1]; // distance from center in a.u.
70 0 : const double jremain = kremain - jr * jr;
71 :
72 0 : if ((jremain >= 0.0) && is_point_in_interval(j, ywindow)) {
73 0 : const int xmin = ceil(-1e-8 - sqrt(jremain) * handler->dh_inv[0][0]);
74 0 : const int xmax = 1 - xmin;
75 :
76 : // printf("xmin %d, xmax %d\n", xmin, xmax);
77 0 : for (int x = xmin - lb_cube[2];
78 0 : x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
79 0 : const int x1 = map[2][x];
80 :
81 0 : if (!is_point_in_interval(x1, xwindow))
82 0 : continue;
83 :
84 0 : int lower_corner[3] = {k, j, x1};
85 0 : int upper_corner[3] = {k + 1, j + 1, x1 + 1};
86 :
87 0 : compute_interval(map[2], handler->grid.full_size[2],
88 : handler->grid.size[2], handler->cube.size[2], x1,
89 : &x, lower_corner + 2, upper_corner + 2, xwindow);
90 :
91 0 : if (upper_corner[2] - lower_corner[2]) {
92 0 : const int position1[3] = {kg, jg - lb_cube[1], x};
93 :
94 : /* the function will internally take care of the local vs global
95 : * grid */
96 :
97 0 : double *restrict dst = &idx3(handler->cube, position1[0],
98 : position1[1], position1[2]);
99 :
100 0 : double *restrict src = &idx3(handler->grid, lower_corner[0],
101 : lower_corner[1], lower_corner[2]);
102 :
103 0 : const int sizex = upper_corner[2] - lower_corner[2];
104 0 : GRID_PRAGMA_SIMD((dst, src), 8)
105 0 : for (int x = 0; x < sizex; x++) {
106 0 : dst[x] = src[x];
107 : }
108 : }
109 :
110 0 : if (handler->grid.size[2] == handler->grid.full_size[2])
111 0 : update_loop_index(handler->grid.full_size[2], x1, &x);
112 : else
113 0 : x += upper_corner[2] - lower_corner[2] - 1;
114 : }
115 : }
116 : }
117 : }
118 : }
119 0 : }
120 :
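/* For reference, a minimal stand-alone version of the wrap-around mapping
 * built into map[] at the top of extract_cube_within_spherical_cutoff_ortho
 * (and its generic sibling below). This sketch assumes only a periodic grid
 * of size n; the production helper `modulo` lives in ../common/grid_common.h
 * and wrap_index is a hypothetical local name.
 */
static int wrap_index(const int a, const int n) {
  /* C's % may be negative for a < 0; fold the remainder back into [0, n). */
  const int r = a % n;
  return (r < 0) ? r + n : r;
}
/* usage, mirroring the loops above:
 *   map[i][ig] = wrap_index(cube_center[i] + lb_cube[i] + ig -
 *                           lower_corner[i], full_size[i]); */
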
121 0 : void extract_cube_within_spherical_cutoff_generic(
122 : struct collocation_integration_ *const handler, const double disr_radius,
123 : const int cmax, const int *const lb_cube, const int *const ub_cube,
124 : const double *const roffset, const int *const cube_center) {
125 :
126 0 : const double a = handler->dh[0][0] * handler->dh[0][0] +
127 0 : handler->dh[0][1] * handler->dh[0][1] +
128 0 : handler->dh[0][2] * handler->dh[0][2];
129 0 : const double a_inv = 1.0 / a;
130 : // a mapping so that index ig corresponds to the right grid point
131 0 : int **map = handler->map;
132 0 : map[1] = map[0] + 2 * cmax + 1;
133 0 : map[2] = map[1] + 2 * cmax + 1;
134 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
135 :
136 0 : for (int i = 0; i < 3; i++) {
137 0 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
138 0 : map[i][ig] = modulo(cube_center[i] + ig + lb_cube[i] -
139 0 : handler->grid.lower_corner[i],
140 : handler->grid.full_size[i]);
141 : }
142 : }
143 :
144 0 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
145 0 : .xmax = handler->grid.window_size[0]};
146 0 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
147 0 : .xmax = handler->grid.window_size[1]};
148 0 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
149 0 : .xmax = handler->grid.window_size[2]};
150 :
151 0 : for (int k = 0; k < handler->cube.size[0]; k++) {
152 0 : const int iz = map[0][k];
153 :
154 0 : if (!is_point_in_interval(iz, zwindow))
155 0 : continue;
156 :
157 0 : const double tz = (k + lb_cube[0] - roffset[0]);
158 0 : const double z[3] = {tz * handler->dh[2][0], tz * handler->dh[2][1],
159 0 : tz * handler->dh[2][2]};
160 :
161 0 : for (int j = 0; j < handler->cube.size[1]; j++) {
162 0 : const int iy = map[1][j];
163 :
164 0 : if (!is_point_in_interval(iy, ywindow))
165 0 : continue;
166 :
167 0 : const double ty = (j - roffset[1] + lb_cube[1]);
168 0 : const double y[3] = {z[0] + ty * handler->dh[1][0],
169 0 : z[1] + ty * handler->dh[1][1],
170 0 : z[2] + ty * handler->dh[1][2]};
171 :
172 : /* sqrt[(-2 x1 alpha - 2 y1 beta - 2 z1 gamma)^2
173 : * - 4 (x1^2 + y1^2 + z1^2) (alpha^2 + beta^2 + gamma^2)],
174 : * i.e. the discriminant of the quadratic solved just below. */
175 :
176 0 : const double b =
177 0 : -2.0 * (handler->dh[0][0] * (roffset[2] * handler->dh[0][0] - y[0]) +
178 0 : handler->dh[0][1] * (roffset[2] * handler->dh[0][1] - y[1]) +
179 0 : handler->dh[0][2] * (roffset[2] * handler->dh[0][2] - y[2]));
180 :
181 0 : const double c = (roffset[2] * handler->dh[0][0] - y[0]) *
182 0 : (roffset[2] * handler->dh[0][0] - y[0]) +
183 0 : (roffset[2] * handler->dh[0][1] - y[1]) *
184 0 : (roffset[2] * handler->dh[0][1] - y[1]) +
185 0 : (roffset[2] * handler->dh[0][2] - y[2]) *
186 : (roffset[2] * handler->dh[0][2] - y[2]) -
187 0 : disr_radius * disr_radius;
188 :
189 0 : double delta = b * b - 4.0 * a * c;
190 :
191 0 : if (delta < 0.0)
192 0 : continue;
193 :
194 0 : delta = sqrt(delta);
195 :
196 0 : const int xmin = imax(ceil((-b - delta) * 0.5 * a_inv), lb_cube[2]);
197 0 : const int xmax = imin(floor((-b + delta) * 0.5 * a_inv), ub_cube[2]);
198 :
199 0 : int lower_corner[3] = {iz, iy, xmin};
200 0 : int upper_corner[3] = {iz + 1, iy + 1, xmin};
201 :
202 0 : for (int x = xmin - lb_cube[2];
203 0 : x < imin((xmax - lb_cube[2]), handler->cube.size[2]); x++) {
204 0 : const int x1 = map[2][x];
205 :
206 0 : if (!is_point_in_interval(x1, xwindow))
207 0 : continue;
208 :
209 0 : compute_interval(map[2], handler->grid.full_size[2],
210 : handler->grid.size[2], handler->cube.size[2], x1, &x,
211 : lower_corner + 2, upper_corner + 2, xwindow);
212 :
213 0 : if (upper_corner[2] - lower_corner[2]) {
214 0 : const int position1[3] = {k, j, x};
215 :
216 : /* the function will internally take care of the local vs global
217 : * grid */
218 :
219 0 : double *__restrict__ src = &idx3(handler->grid, lower_corner[0],
220 : lower_corner[1], lower_corner[2]);
221 0 : double *__restrict__ dst =
222 0 : &idx3(handler->cube, position1[0], position1[1], position1[2]);
223 :
224 0 : const int sizex = upper_corner[2] - lower_corner[2];
225 :
226 : // #pragma omp simd linear(dst, src) simdlen(8)
227 0 : GRID_PRAGMA_SIMD((dst, src), 8)
228 0 : for (int x = 0; x < sizex; x++) {
229 0 : dst[x] = src[x];
230 : }
231 :
232 0 : if (handler->grid.size[2] == handler->grid.full_size[2])
233 0 : update_loop_index(handler->grid.full_size[2], x1, &x);
234 : else
235 0 : x += upper_corner[2] - lower_corner[2] - 1;
236 : }
237 : }
238 : }
239 : }
240 0 : }
241 :
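/* The xmin/xmax bounds above come from intersecting a grid line along the
 * first lattice vector with the cutoff sphere: the points at parameter t
 * satisfy a t^2 + b t + c <= 0 with a, b, c as computed above. A stand-alone
 * sketch of that bound computation (hypothetical helper name; relies on
 * math.h and stdbool.h already included at the top of this file):
 */
static bool sphere_line_bounds(const double a, const double b, const double c,
                               int *const tmin, int *const tmax) {
  const double delta = b * b - 4.0 * a * c;
  if (delta < 0.0)
    return false; /* the line misses the sphere entirely */
  const double s = sqrt(delta);
  *tmin = (int)ceil((-b - s) / (2.0 * a));  /* first integer point inside */
  *tmax = (int)floor((-b + s) / (2.0 * a)); /* last integer point inside */
  return *tmax >= *tmin;
}
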
242 5076 : static void rotate_and_store_coefficients(grid_context *const ctx,
243 : const _task *prev_task,
244 : const _task *task, tensor *const hab,
245 : tensor *work, // some scratch matrix
246 : double *blocks) {
247 5076 : if (prev_task != NULL) {
248 : /* prev_task is NULL when we deal with the first iteration. In that case
249 : * we just need to initialize the hab matrix to the proper size. Note
250 : * that resizing does not really occur since memory allocation is done
251 : * for the maximum size the matrix can have. */
252 4452 : const int iatom = prev_task->iatom;
253 4452 : const int jatom = prev_task->jatom;
254 4452 : const int iset = prev_task->iset;
255 4452 : const int jset = prev_task->jset;
256 4452 : const int ikind = ctx->atom_kinds[iatom];
257 4452 : const int jkind = ctx->atom_kinds[jatom];
258 4452 : const grid_basis_set *ibasis = ctx->basis_sets[ikind];
259 4452 : const grid_basis_set *jbasis = ctx->basis_sets[jkind];
260 :
261 4452 : const int block_num = prev_task->block_num;
262 4452 : double *const block = &blocks[ctx->block_offsets[block_num]];
263 :
264 4452 : const int ncoseta = ncoset(ibasis->lmax[iset]);
265 4452 : const int ncosetb = ncoset(jbasis->lmax[jset]);
266 :
267 4452 : const int ncoa = ibasis->npgf[iset] * ncoseta;
268 4452 : const int ncob = jbasis->npgf[jset] * ncosetb;
269 :
270 4452 : const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
271 4452 : const int nsgf_setb = jbasis->nsgf_set[jset];
272 4452 : const int nsgfa = ibasis->nsgf; // size of entire spherical basis
273 4452 : const int nsgfb = jbasis->nsgf;
274 4452 : const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
275 4452 : const int sgfb = jbasis->first_sgf[jset] - 1;
276 4452 : const int maxcoa = ibasis->maxco;
277 4452 : const int maxcob = jbasis->maxco;
278 :
279 4452 : initialize_tensor_2(work, nsgf_setb, ncoa);
280 4452 : realloc_tensor(work);
281 :
282 : // Warning: these matrices are row major.
283 :
284 4452 : dgemm_params m1, m2;
285 4452 : memset(&m1, 0, sizeof(dgemm_params));
286 4452 : memset(&m2, 0, sizeof(dgemm_params));
287 :
288 4452 : m1.op1 = 'N';
289 4452 : m1.op2 = 'N';
290 4452 : m1.m = nsgf_setb;
291 4452 : m1.n = ncoa;
292 4452 : m1.k = ncob;
293 4452 : m1.alpha = 1.0;
294 4452 : m1.beta = 0.0;
295 4452 : m1.a = &jbasis->sphi[sgfb * maxcob];
296 4452 : m1.lda = maxcob;
297 4452 : m1.b = hab->data;
298 4452 : m1.ldb = ncoa;
299 4452 : m1.c = work->data;
300 4452 : m1.ldc = work->ld_;
301 :
302 : // phi[b][ncob]
303 : // work[b][ncoa] = phi[b][ncob] * hab[ncob][ncoa]
304 :
305 4452 : if (iatom <= jatom) {
306 : // I need to have the final result in the form
307 :
308 : // block[b][a] = work[b][ncoa] transpose(phi[a][ncoa])
309 2964 : m2.op1 = 'N';
310 2964 : m2.op2 = 'T';
311 2964 : m2.m = nsgf_setb;
312 2964 : m2.n = nsgf_seta;
313 2964 : m2.k = ncoa;
314 2964 : m2.a = work->data;
315 2964 : m2.lda = work->ld_;
316 2964 : m2.b = &ibasis->sphi[sgfa * maxcoa];
317 2964 : m2.ldb = maxcoa;
318 2964 : m2.c = block + sgfb * nsgfa + sgfa;
319 2964 : m2.ldc = nsgfa;
320 2964 : m2.alpha = 1.0;
321 2964 : m2.beta = 1.0;
322 : } else {
323 : // block[a][b] = phi[a][ncoa] Transpose(work[b][ncoa])
324 1488 : m2.op1 = 'N';
325 1488 : m2.op2 = 'T';
326 1488 : m2.m = nsgf_seta;
327 1488 : m2.n = nsgf_setb;
328 1488 : m2.k = ncoa;
329 1488 : m2.a = &ibasis->sphi[sgfa * maxcoa];
330 1488 : m2.lda = maxcoa;
331 1488 : m2.b = work->data;
332 1488 : m2.ldb = work->ld_;
333 1488 : m2.c = block + sgfa * nsgfb + sgfb;
334 1488 : m2.ldc = nsgfb;
335 1488 : m2.alpha = 1.0;
336 1488 : m2.beta = 1.0;
337 : }
338 :
339 4452 : m1.use_libxsmm = true;
340 4452 : m2.use_libxsmm = true;
341 :
342 : /* these dgemms are *row* major */
343 4452 : dgemm_simplified(&m1);
344 4452 : dgemm_simplified(&m2);
345 : }
346 :
347 5076 : if (task != NULL) {
348 4452 : const int iatom = task->iatom;
349 4452 : const int jatom = task->jatom;
350 4452 : const int ikind = ctx->atom_kinds[iatom];
351 4452 : const int jkind = ctx->atom_kinds[jatom];
352 4452 : const int iset = task->iset;
353 4452 : const int jset = task->jset;
354 4452 : const grid_basis_set *ibasis = ctx->basis_sets[ikind];
355 4452 : const grid_basis_set *jbasis = ctx->basis_sets[jkind];
356 4452 : const int ncoseta = ncoset(ibasis->lmax[iset]);
357 4452 : const int ncosetb = ncoset(jbasis->lmax[jset]);
358 :
359 4452 : const int ncoa = ibasis->npgf[iset] * ncoseta;
360 4452 : const int ncob = jbasis->npgf[jset] * ncosetb;
361 :
362 4452 : initialize_tensor_2(hab, ncob, ncoa);
363 4452 : realloc_tensor(hab);
364 : }
365 5076 : }
366 :
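/* Reference semantics of the two row-major dgemms above, written as naive
 * loops. This is only meant to document what dgemm_simplified(&m1) and
 * dgemm_simplified(&m2) compute in the iatom <= jatom branch; the leading
 * dimensions are taken equal to the row lengths for simplicity, whereas the
 * real code uses maxcob/maxcoa strides into sphi.
 */
static void rotate_reference(const int nsgfb, const int nsgfa, const int ncob,
                             const int ncoa,
                             const double *sphi_b, /* nsgfb x ncob */
                             const double *sphi_a, /* nsgfa x ncoa */
                             const double *hab,    /* ncob  x ncoa */
                             double *work,         /* nsgfb x ncoa */
                             double *block) {      /* nsgfb x nsgfa */
  /* m1: work[b][ca] = sum_cb sphi_b[b][cb] * hab[cb][ca] */
  for (int b = 0; b < nsgfb; b++)
    for (int ca = 0; ca < ncoa; ca++) {
      double s = 0.0;
      for (int cb = 0; cb < ncob; cb++)
        s += sphi_b[b * ncob + cb] * hab[cb * ncoa + ca];
      work[b * ncoa + ca] = s;
    }
  /* m2: block[b][a] += sum_ca work[b][ca] * sphi_a[a][ca]  (beta = 1.0) */
  for (int b = 0; b < nsgfb; b++)
    for (int a = 0; a < nsgfa; a++) {
      double s = 0.0;
      for (int ca = 0; ca < ncoa; ca++)
        s += work[b * ncoa + ca] * sphi_a[a * ncoa + ca];
      block[b * nsgfa + a] += s;
    }
}
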
367 49036 : void update_force_pair(orbital a, orbital b, const double pab,
368 : const double ftz[2], const double *const rab,
369 : const tensor *const vab, tensor *force) {
370 49036 : const double axpm0 = idx2(vab[0], idx(b), idx(a));
371 196144 : for (int i = 0; i < 3; i++) {
372 147108 : const double aip1 = idx2(vab[0], idx(b), idx(up(i, a)));
373 147108 : const double aim1 = idx2(vab[0], idx(b), idx(down(i, a)));
374 147108 : const double bim1 = idx2(vab[0], idx(down(i, b)), idx(a));
375 147108 : idx2(force[0], 0, i) += pab * (ftz[0] * aip1 - a.l[i] * aim1);
376 147108 : idx2(force[0], 1, i) +=
377 147108 : pab * (ftz[1] * (aip1 - rab[i] * axpm0) - b.l[i] * bim1);
378 : }
379 49036 : }
380 :
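/* The ftz[.] * up(i, .) - l[i] * down(i, .) combinations above implement the
 * derivative of a cartesian Gaussian,
 *   d/dx [ x^l exp(-zeta x^2) ] = l x^(l-1) exp(-zeta x^2)
 *                                 - 2 zeta x^(l+1) exp(-zeta x^2),
 * with ftz = 2 * zeta. A quick stand-alone numeric check of that identity
 * (hypothetical helper name; relies on math.h and stdio.h included above):
 */
static void check_gaussian_derivative(void) {
  const double x = 0.7, zeta = 1.3, h = 1e-6;
  const int l = 2;
#define CGF(x, l) (pow((x), (l)) * exp(-zeta * (x) * (x)))
  const double numeric = (CGF(x + h, l) - CGF(x - h, l)) / (2.0 * h);
  const double analytic = l * CGF(x, l - 1) - 2.0 * zeta * CGF(x, l + 1);
#undef CGF
  printf("d/dx numeric %.10f vs analytic %.10f\n", numeric, analytic);
}
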
381 34952 : void update_virial_pair(orbital a, orbital b, const double pab,
382 : const double ftz[2], const double *const rab,
383 : const tensor *const vab, tensor *virial) {
384 139808 : for (int i = 0; i < 3; i++) {
385 419424 : for (int j = 0; j < 3; j++) {
386 314568 : idx3(virial[0], 0, i, j) +=
387 314568 : pab * ftz[0] * idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
388 314568 : pab * a.l[j] * idx2(vab[0], idx(b), idx(up(i, down(j, a))));
389 : }
390 : }
391 :
392 139808 : for (int i = 0; i < 3; i++) {
393 419424 : for (int j = 0; j < 3; j++) {
394 314568 : idx3(virial[0], 1, i, j) +=
395 314568 : pab * ftz[1] *
396 314568 : (idx2(vab[0], idx(b), idx(up(i, up(j, a)))) -
397 314568 : idx2(vab[0], idx(b), idx(up(i, a))) * rab[j] -
398 314568 : idx2(vab[0], idx(b), idx(up(j, a))) * rab[i] +
399 314568 : idx2(vab[0], idx(b), idx(a)) * rab[j] * rab[i]) -
400 314568 : pab * b.l[j] * idx2(vab[0], idx(up(i, down(j, b))), idx(a));
401 : }
402 : }
403 34952 : }
404 :
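/* Note on the structure above: each virial entry (i, j) accumulates the same
 * j-derivative combination as update_force_pair (ftz times the raised
 * orbital minus l[j] times the lowered one) multiplied by one extra
 * position factor along i, hence the additional up(i, .) on every vab
 * element. For the second center the position enters as r - rab, which is
 * where the rab[i]/rab[j] cross terms in the second loop nest come from.
 */
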
405 340422 : void update_all(const bool compute_forces, const bool compute_virial,
406 : const orbital a, const orbital b, const double f,
407 : const double *const ftz, const double *rab, const tensor *vab,
408 : const double pab, double *hab, tensor *forces,
409 : tensor *virials) {
410 :
411 340422 : *hab += f * idx2(vab[0], idx(b), idx(a));
412 :
413 340422 : if (compute_forces) {
414 49036 : update_force_pair(a, b, f * pab, ftz, rab, vab, forces);
415 : }
416 :
417 340422 : if (compute_virial) {
418 34952 : update_virial_pair(a, b, f * pab, ftz, rab, vab, virials);
419 : }
420 340422 : }
421 :
422 0 : static void update_tau(const bool compute_forces, const bool compute_virial,
423 : const orbital a, const orbital b, const double ftz[2],
424 : const double *const rab, const tensor *const vab,
425 : const double pab, double *const hab, tensor *forces,
426 : tensor *virials) {
427 :
428 0 : for (int i = 0; i < 3; i++) {
429 0 : update_all(compute_forces, compute_virial, down(i, a), down(i, b),
430 0 : 0.5 * a.l[i] * b.l[i], ftz, rab, vab, pab, hab, forces, virials);
431 0 : update_all(compute_forces, compute_virial, up(i, a), down(i, b),
432 0 : -0.5 * ftz[0] * b.l[i], ftz, rab, vab, pab, hab, forces,
433 : virials);
434 0 : update_all(compute_forces, compute_virial, down(i, a), up(i, b),
435 0 : -0.5 * a.l[i] * ftz[1], ftz, rab, vab, pab, hab, forces,
436 : virials);
437 0 : update_all(compute_forces, compute_virial, up(i, a), up(i, b),
438 0 : 0.5 * ftz[0] * ftz[1], ftz, rab, vab, pab, hab, forces, virials);
439 : }
440 0 : }
441 :
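/* For reference, update_tau encodes the kinetic-energy-density integrand
 *   tau_ab = 1/2 * sum_i (d_i a)(d_i b).
 * With d_i x^l e^(-z x^2) = l x^(l-1) e^(-z x^2) - 2 z x^(l+1) e^(-z x^2),
 * expanding the product gives exactly the four update_all calls above:
 *   +1/2 * l_a * l_b          on (down a, down b)
 *   -1/2 * (2 z_a) * l_b      on (up a,   down b)
 *   -1/2 * l_a * (2 z_b)      on (down a, up b)
 *   +1/2 * (2 z_a) * (2 z_b)  on (up a,   up b)
 * where ftz[0] = 2 z_a and ftz[1] = 2 z_b, cf. the zeta[2] initialization in
 * update_hab_forces_and_stress below.
 */
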
442 : static void
443 43611 : update_hab_forces_and_stress(const _task *task, const tensor *const vab,
444 : const tensor *const pab, const bool compute_tau,
445 : const bool compute_forces,
446 : const bool compute_virial, tensor *const forces,
447 : tensor *const virial, tensor *const hab) {
448 43611 : double zeta[2] = {task->zeta[0] * 2.0, task->zeta[1] * 2.0};
449 101793 : for (int lb = task->lmin[1]; lb <= task->lmax[1]; lb++) {
450 138321 : for (int la = task->lmin[0]; la <= task->lmax[0]; la++) {
451 200037 : for (int bx = 0; bx <= lb; bx++) {
452 284562 : for (int by = 0; by <= lb - bx; by++) {
453 164664 : const int bz = lb - bx - by;
454 164664 : const orbital b = {{bx, by, bz}};
455 164664 : const int idx_b = task->offset[1] + idx(b);
456 412083 : for (int ax = 0; ax <= la; ax++) {
457 587841 : for (int ay = 0; ay <= la - ax; ay++) {
458 340422 : const int az = la - ax - ay;
459 340422 : const orbital a = {{ax, ay, az}};
460 340422 : const int idx_a = task->offset[0] + idx(a);
461 340422 : double *habval = &idx2(hab[0], idx_b, idx_a);
462 340422 : const double prefactor = idx2(pab[0], idx_b, idx_a);
463 :
464 : // now compute the forces
465 340422 : if (compute_tau) {
466 0 : update_tau(compute_forces, compute_virial, a, b, zeta,
467 0 : task->rab, vab, prefactor, habval, forces, virial);
468 : } else {
469 340422 : update_all(compute_forces, compute_virial, a, b, 1.0, zeta,
470 340422 : task->rab, vab, prefactor, habval, forces, virial);
471 : }
472 : }
473 : }
474 : }
475 : }
476 : }
477 : }
478 43611 : }
479 :
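/* The triangular loops in update_hab_forces_and_stress above enumerate all
 * cartesian exponents (ax, ay, az) with ax + ay + az = l. A stand-alone
 * enumeration in the same loop order (hypothetical helper; whether this
 * coincides with the internal idx() numbering is not asserted here):
 */
static int enumerate_shell(const int l, int exps[][3]) {
  int n = 0;
  for (int ax = 0; ax <= l; ax++)
    for (int ay = 0; ay <= l - ax; ay++) {
      exps[n][0] = ax;
      exps[n][1] = ay;
      exps[n][2] = l - ax - ay; /* az is fixed by the shell condition */
      n++;
    }
  return n; /* always (l + 1) * (l + 2) / 2 functions per shell */
}
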
480 : /* This is a sub-optimal version of the mapping in the case of a cubic cutoff,
481 : * but it is very general and does not depend on the orthorhombic nature of the
482 : * grid. For orthorhombic cases, it is faster to apply PBC directly on the polynomials. */
483 :
484 43611 : void extract_cube(struct collocation_integration_ *handler, const int cmax,
485 : const int *lower_boundaries_cube, const int *cube_center) {
486 :
487 : // a mapping so that index ig corresponds to the right grid point
488 43611 : int **map = handler->map;
489 43611 : map[1] = map[0] + 2 * cmax + 1;
490 43611 : map[2] = map[1] + 2 * cmax + 1;
491 : // memset(map[0], 0xff, sizeof(int) * 3 * (2 * cmax + 1));
492 174444 : for (int i = 0; i < 3; i++) {
493 3332608 : for (int ig = 0; ig < handler->cube.size[i]; ig++) {
494 3201775 : map[i][ig] = modulo(cube_center[i] + ig + lower_boundaries_cube[i] -
495 3201775 : handler->grid.lower_corner[i],
496 : handler->grid.full_size[i]);
497 : }
498 : }
499 :
500 43611 : int lower_corner[3];
501 43611 : int upper_corner[3];
502 :
503 43611 : const Interval zwindow = {.xmin = handler->grid.window_shift[0],
504 43611 : .xmax = handler->grid.window_size[0]};
505 43611 : const Interval ywindow = {.xmin = handler->grid.window_shift[1],
506 43611 : .xmax = handler->grid.window_size[1]};
507 43611 : const Interval xwindow = {.xmin = handler->grid.window_shift[2],
508 43611 : .xmax = handler->grid.window_size[2]};
509 :
510 128484 : for (int z = 0; (z < handler->cube.size[0]); z++) {
511 84873 : const int z1 = map[0][z];
512 :
513 169746 : if (!is_point_in_interval(z1, zwindow))
514 0 : continue;
515 :
516 84873 : compute_interval(map[0], handler->grid.full_size[0], handler->grid.size[0],
517 : handler->cube.size[0], z1, &z, lower_corner, upper_corner,
518 : zwindow);
519 :
520 : /* We have a full plane. */
521 84873 : if (upper_corner[0] - lower_corner[0]) {
522 255507 : for (int y = 0; y < handler->cube.size[1]; y++) {
523 170634 : const int y1 = map[1][y];
524 :
525 : // this check is completely irrelevant when running without MPI.
526 341268 : if (!is_point_in_interval(y1, ywindow))
527 0 : continue;
528 :
529 170634 : compute_interval(map[1], handler->grid.full_size[1],
530 : handler->grid.size[1], handler->cube.size[1], y1, &y,
531 : lower_corner + 1, upper_corner + 1, ywindow);
532 :
533 170634 : if (upper_corner[1] - lower_corner[1]) {
534 545794 : for (int x = 0; x < handler->cube.size[2]; x++) {
535 375160 : const int x1 = map[2][x];
536 :
537 750320 : if (!is_point_in_interval(x1, xwindow))
538 0 : continue;
539 :
540 375160 : compute_interval(map[2], handler->grid.full_size[2],
541 : handler->grid.size[2], handler->cube.size[2], x1,
542 : &x, lower_corner + 2, upper_corner + 2, xwindow);
543 :
544 375160 : if (upper_corner[2] - lower_corner[2]) { // should be non zero
545 375160 : const int position1[3] = {z, y, x};
546 :
547 : /* the function will internally take care of the local vs global
548 : * grid */
549 375160 : extract_sub_grid(
550 : lower_corner, // lower corner of the portion of cube (in the
551 : // full grid)
552 : upper_corner, // upper corner of the portion of cube (in the
553 : // full grid)
554 : position1, // starting position subblock inside the cube
555 375160 : &handler->grid,
556 : &handler->cube); // the grid to add data from
557 :
558 375160 : if (handler->grid.size[2] == handler->grid.full_size[2])
559 375160 : update_loop_index(handler->grid.full_size[2], x1, &x);
560 : else
561 0 : x += upper_corner[2] - lower_corner[2] - 1;
562 : }
563 : }
564 170634 : if (handler->grid.size[1] == handler->grid.full_size[1])
565 170634 : update_loop_index(handler->grid.full_size[1], y1, &y);
566 : else
567 0 : y += upper_corner[1] - lower_corner[1] - 1;
568 : }
569 : }
570 84873 : if (handler->grid.size[0] == handler->grid.full_size[0])
571 84873 : update_loop_index(handler->grid.full_size[0], z1, &z);
572 : else
573 0 : z += upper_corner[0] - lower_corner[0] - 1;
574 : }
575 : }
576 43611 : }
577 :
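/* Stand-alone sketch of the contiguous copy that extract_sub_grid performs
 * for one [lower, upper) brick (hypothetical helper; assumes a flat
 * row-major [z][y][x] layout rather than the tensor layout behind idx3, and
 * omits the MPI window handling of the real routine):
 */
static void copy_brick(const double *grid, const int gsize[3], double *cube,
                       const int csize[3], const int lower[3],
                       const int upper[3], const int pos[3]) {
  for (int z = 0; z < upper[0] - lower[0]; z++)
    for (int y = 0; y < upper[1] - lower[1]; y++) {
      const double *src =
          grid + ((size_t)(lower[0] + z) * gsize[1] + lower[1] + y) * gsize[2] +
          lower[2];
      double *dst =
          cube + ((size_t)(pos[0] + z) * csize[1] + pos[1] + y) * csize[2] +
          pos[2];
      for (int x = 0; x < upper[2] - lower[2]; x++)
        dst[x] = src[x]; /* integrate direction: grid -> cube */
    }
}
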
578 43611 : void grid_integrate(collocation_integration *const handler,
579 : const bool use_ortho, const double zetp, const double rp[3],
580 : const double radius) {
581 43611 : if (handler == NULL)
582 0 : abort();
583 :
584 43611 : const int lp = handler->coef.size[0] - 1;
585 :
586 43611 : int cubecenter[3];
587 43611 : int cube_size[3];
588 43611 : int lb_cube[3], ub_cube[3];
589 43611 : double roffset[3];
590 43611 : double disr_radius;
591 :
592 : /* cube : grid containing the pointwise product of the polynomials
593 : *
594 : * pol : grid containing the polynomials in all three directions
595 : */
596 :
597 : /* setting up the cube parameters */
598 87222 : int cmax = compute_cube_properties(
599 43611 : use_ortho, radius, (const double(*)[3])handler->dh,
600 43611 : (const double(*)[3])handler->dh_inv, rp, &disr_radius, roffset,
601 : cubecenter, lb_cube, ub_cube, cube_size);
602 :
603 : /* initialize the multidimensional array containing the polynomials */
604 43611 : if (lp != 0) {
605 34331 : initialize_tensor_3(&handler->pol, 3, cmax, handler->coef.size[0]);
606 : } else {
607 9280 : initialize_tensor_3(&handler->pol, 3, handler->coef.size[0], cmax);
608 : }
609 43611 : handler->pol_alloc_size = realloc_tensor(&handler->pol);
610 :
611 : /* allocate memory for the polynomial and the cube */
612 :
613 43611 : initialize_tensor_3(&handler->cube, cube_size[0], cube_size[1], cube_size[2]);
614 :
615 43611 : handler->cube_alloc_size = realloc_tensor(&handler->cube);
616 :
617 43611 : initialize_W_and_T(handler, &handler->coef, &handler->cube);
618 :
619 : /* compute the polynomials */
620 :
621 : // WARNING : do not reverse the order in pol otherwise you will have to
622 : // reverse the order in collocate_dgemm as well.
623 :
624 : /* the tensor contraction is done for a given order so I have to be careful
625 : * how the tensors X, Y, Z are stored. In collocate, they are stored
626 : * normally 0 (Z), (1) Y, (2) X in the table pol. But in integrate (which
627 : * uses the same tensor reduction), I have a special treatment for l = 0. In
628 : * that case the order *should* be the same as for collocate. For l > 0,
629 : * we need a different storage order which is (X) 2, (Y) 0, and (Z) 1.
630 : *
631 : * the reason for this is that the cube is stored as cube[z][y][x] so the
632 : * first direction taken for the dgemm is along x.
633 : */
634 :
635 43611 : int perm[3] = {2, 0, 1};
636 :
637 43611 : if (lp == 0) {
638 : /* I need to restore the same order as for collocate */
639 9280 : perm[0] = 0;
640 9280 : perm[1] = 1;
641 9280 : perm[2] = 2;
642 : }
643 :
644 43611 : bool use_ortho_forced = handler->orthogonal[0] && handler->orthogonal[1] &&
645 23448 : handler->orthogonal[2];
646 43611 : if (use_ortho) {
647 21114 : grid_fill_pol_dgemm((lp != 0), handler->dh[0][0], roffset[2], 0, lb_cube[2],
648 : ub_cube[2], lp, cmax, zetp,
649 21114 : &idx3(handler->pol, perm[2], 0, 0)); /* i index */
650 21114 : grid_fill_pol_dgemm((lp != 0), handler->dh[1][1], roffset[1], 0, lb_cube[1],
651 : ub_cube[1], lp, cmax, zetp,
652 21114 : &idx3(handler->pol, perm[1], 0, 0)); /* j index */
653 21114 : grid_fill_pol_dgemm((lp != 0), handler->dh[2][2], roffset[0], 0, lb_cube[0],
654 : ub_cube[0], lp, cmax, zetp,
655 21114 : &idx3(handler->pol, perm[0], 0, 0)); /* k index */
656 : } else {
657 22497 : double dx[3];
658 :
659 22497 : dx[2] = handler->dh[0][0] * handler->dh[0][0] +
660 22497 : handler->dh[0][1] * handler->dh[0][1] +
661 22497 : handler->dh[0][2] * handler->dh[0][2];
662 :
663 22497 : dx[1] = handler->dh[1][0] * handler->dh[1][0] +
664 22497 : handler->dh[1][1] * handler->dh[1][1] +
665 22497 : handler->dh[1][2] * handler->dh[1][2];
666 :
667 22497 : dx[0] = handler->dh[2][0] * handler->dh[2][0] +
668 22497 : handler->dh[2][1] * handler->dh[2][1] +
669 22497 : handler->dh[2][2] * handler->dh[2][2];
670 :
671 22497 : grid_fill_pol_dgemm((lp != 0), 1.0, roffset[2], 0, lb_cube[2], ub_cube[2],
672 : lp, cmax, zetp * dx[2],
673 22497 : &idx3(handler->pol, perm[2], 0, 0)); /* i index */
674 22497 : grid_fill_pol_dgemm((lp != 0), 1.0, roffset[1], 0, lb_cube[1], ub_cube[1],
675 : lp, cmax, zetp * dx[1],
676 22497 : &idx3(handler->pol, perm[1], 0, 0)); /* j index */
677 22497 : grid_fill_pol_dgemm((lp != 0), 1.0, roffset[0], 0, lb_cube[0], ub_cube[0],
678 : lp, cmax, zetp * dx[0],
679 22497 : &idx3(handler->pol, perm[0], 0, 0)); /* k index */
680 :
681 : /* the three remaining tensors are initialized in the function */
682 22497 : calculate_non_orthorombic_corrections_tensor(
683 : zetp, roffset, (const double(*)[3])handler->dh, lb_cube, ub_cube,
684 22497 : handler->orthogonal, &handler->Exp);
685 : }
686 :
687 43611 : if (handler->apply_cutoff) {
688 0 : memset(handler->cube.data, 0, sizeof(double) * handler->cube.alloc_size_);
689 0 : if (!use_ortho && !use_ortho_forced) {
690 0 : extract_cube_within_spherical_cutoff_generic(
691 : handler, disr_radius, cmax, lb_cube, ub_cube, roffset, cubecenter);
692 : } else {
693 0 : extract_cube_within_spherical_cutoff_ortho(handler, disr_radius, cmax,
694 : lb_cube, cubecenter);
695 : }
696 : } else {
697 43611 : extract_cube(handler, cmax, lb_cube, cubecenter);
698 : }
699 :
700 43611 : if (!use_ortho && !use_ortho_forced)
701 22497 : apply_non_orthorombic_corrections(handler->orthogonal, &handler->Exp,
702 : &handler->cube);
703 :
704 43611 : if (lp != 0) {
705 34331 : tensor_reduction_for_collocate_integrate(handler->scratch,
706 : // pointer to scratch memory
707 34331 : 1.0, handler->orthogonal, NULL,
708 : &handler->cube, &handler->pol,
709 : &handler->coef);
710 : } else {
711 : /* it is very specific to integrate because we might end up with a
712 : * single element after the tensor product/contraction. In that case, I
713 : * compute the cube and then do a scalar product between the two. */
714 :
715 : /* we could also do this with 2 matrix-vector multiplications and a scalar
716 : * product
717 : *
718 : * H_{jk} = C_{ijk} . P_i (along x) C_{ijk} is *stored C[k][j][i]* !!!!!!
719 : * L_{k} = H_{jk} . P_j (along y)
720 : * v_{ab} = L_k . P_k (along z)
721 : */
722 :
723 9280 : tensor cube_tmp;
724 9280 : initialize_tensor_2(&cube_tmp, cube_size[0], cube_size[1]);
725 9280 : alloc_tensor(&cube_tmp);
726 :
727 : /* first along x */
728 9280 : cblas_dgemv(CblasRowMajor, CblasNoTrans,
729 9280 : handler->cube.size[0] * handler->cube.size[1],
730 9280 : handler->cube.size[2], 1.0, handler->cube.data,
731 9280 : handler->cube.ld_, &idx3(handler->pol, 2, 0, 0), 1, 0.0,
732 : cube_tmp.data, 1);
733 :
734 : /* second along j */
735 9280 : cblas_dgemv(CblasRowMajor, CblasNoTrans, handler->cube.size[0],
736 9280 : handler->cube.size[1], 1.0, cube_tmp.data, cube_tmp.ld_,
737 9280 : &idx3(handler->pol, 1, 0, 0), 1, 0.0, handler->scratch, 1);
738 :
739 : /* finally along k, it is a scalar product. */
740 18560 : handler->coef.data[0] = cblas_ddot(handler->cube.size[0], handler->scratch,
741 9280 : 1, &idx3(handler->pol, 0, 0, 0), 1);
742 :
743 9280 : free(cube_tmp.data);
744 : }
745 :
746 : /* go from ijk -> xyz */
747 43611 : if (!use_ortho)
748 22497 : grid_transform_coef_jik_to_yxz((const double(*)[3])handler->dh,
749 : &handler->coef);
750 43611 : }
751 :
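/* Reference (naive) form of the separable contraction that grid_integrate
 * delegates to tensor_reduction_for_collocate_integrate for lp > 0,
 * ignoring the permuted storage orders discussed above. Done literally it
 * costs O(nz * ny * nx * (lp+1)^3); the production code factorizes it into
 * three matrix products instead. All names and flat layouts here are
 * assumptions of this sketch:
 */
static void contract_naive(const int nz, const int ny, const int nx,
                           const int lp1, const double *cube, /* nz*ny*nx */
                           const double *pz,                  /* lp1 x nz */
                           const double *py,                  /* lp1 x ny */
                           const double *px,                  /* lp1 x nx */
                           double *coef) { /* lp1^3, row-major [k][j][i] */
  for (int k = 0; k < lp1; k++)
    for (int j = 0; j < lp1; j++)
      for (int i = 0; i < lp1; i++) {
        double s = 0.0;
        for (int z = 0; z < nz; z++)
          for (int y = 0; y < ny; y++)
            for (int x = 0; x < nx; x++)
              s += cube[(z * ny + y) * nx + x] * pz[k * nz + z] *
                   py[j * ny + y] * px[i * nx + x];
        coef[(k * lp1 + j) * lp1 + i] = s;
      }
}
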
752 624 : void integrate_one_grid_level_dgemm(
753 : grid_context *const ctx, const int level, const bool calculate_tau,
754 : const bool calculate_forces, const bool calculate_virial,
755 : const int *const shift_local, const int *const border_width,
756 : const offload_buffer *const pab_blocks, offload_buffer *const hab_blocks,
757 : tensor *forces_, tensor *virial_) {
758 624 : tensor *const grid = &ctx->grid[level];
759 :
760 : // Using default(shared) because with GCC 9 the behavior around const changed:
761 : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
762 624 : #pragma omp parallel default(shared)
763 : {
764 : const int num_threads = omp_get_num_threads();
765 : const int thread_id = omp_get_thread_num();
766 :
767 : double *hab_block_local = NULL;
768 :
769 : if (num_threads == 1) {
770 : hab_block_local = hab_blocks->host_buffer;
771 : } else {
772 : hab_block_local = ((double *)ctx->scratch) +
773 : thread_id * (hab_blocks->size / sizeof(double));
774 : memset(hab_block_local, 0, hab_blocks->size);
775 : }
776 :
777 : tensor work, pab, hab, vab, forces_local_, virial_local_,
778 : forces_local_pair_, virial_local_pair_;
779 : memset(&work, 0, sizeof(tensor));
780 : memset(&pab, 0, sizeof(tensor));
781 : memset(&hab, 0, sizeof(tensor));
782 : memset(&vab, 0, sizeof(tensor));
783 : memset(&forces_local_, 0, sizeof(tensor));
784 : memset(&virial_local_, 0, sizeof(tensor));
785 : memset(&forces_local_pair_, 0, sizeof(tensor));
786 : memset(&virial_local_pair_, 0, sizeof(tensor));
787 :
788 : struct collocation_integration_ *handler = ctx->handler[thread_id];
789 : handler->apply_cutoff = ctx->apply_cutoff;
790 : handler->lmax_diff[0] = 0;
791 : handler->lmax_diff[1] = 0;
792 : handler->lmin_diff[0] = 0;
793 : handler->lmin_diff[1] = 0;
794 :
795 : if (calculate_tau || calculate_forces || calculate_virial) {
796 : handler->lmax_diff[0] = 1;
797 : handler->lmax_diff[1] = 0;
798 : handler->lmin_diff[0] = -1;
799 : handler->lmin_diff[1] = -1;
800 : }
801 :
802 : if (calculate_virial) {
803 : handler->lmax_diff[0]++;
804 : handler->lmax_diff[1]++;
805 : }
806 :
807 : if (calculate_tau) {
808 : handler->lmax_diff[0]++;
809 : handler->lmax_diff[1]++;
810 : handler->lmin_diff[0]--;
811 : handler->lmin_diff[1]--;
812 : }
813 :
814 : // Allocate pab matrix for re-use across tasks.
815 : initialize_tensor_2(&pab, ctx->maxco, ctx->maxco);
816 : alloc_tensor(&pab);
817 :
818 : initialize_tensor_2(&vab, ctx->maxco, ctx->maxco);
819 : alloc_tensor(&vab);
820 :
821 : initialize_tensor_2(&work, ctx->maxco, ctx->maxco);
822 : alloc_tensor(&work);
823 :
824 : initialize_tensor_2(&hab, ctx->maxco, ctx->maxco);
825 : alloc_tensor(&hab);
826 :
827 : if (calculate_forces) {
828 : initialize_tensor_2(&forces_local_, forces_->size[0], forces_->size[1]);
829 : alloc_tensor(&forces_local_);
830 : initialize_tensor_2(&virial_local_, 3, 3);
831 : alloc_tensor(&virial_local_);
832 : memset(forces_local_.data, 0, sizeof(double) * forces_local_.alloc_size_);
833 : memset(virial_local_.data, 0, sizeof(double) * virial_local_.alloc_size_);
834 : initialize_tensor_2(&forces_local_pair_, 2, 3);
835 : alloc_tensor(&forces_local_pair_);
836 : initialize_tensor_3(&virial_local_pair_, 2, 3, 3);
837 : alloc_tensor(&virial_local_pair_);
838 : }
839 :
840 : initialize_basis_vectors(handler, (const double(*)[3])grid->dh,
841 : (const double(*)[3])grid->dh_inv);
842 :
843 : tensor_copy(&handler->grid, grid);
844 : handler->grid.data = grid->data;
845 : for (int d = 0; d < 3; d++)
846 : handler->orthogonal[d] = grid->orthogonal[d];
847 :
848 : /* this is only useful when we split the list over multiple threads. The
849 : * first iteration should load the block whatever the status of the
850 : * task->update_block_ variable is */
851 : const _task *prev_task = NULL;
852 : #pragma omp for schedule(static)
853 : for (int itask = 0; itask < ctx->tasks_per_level[level]; itask++) {
854 : // Define some convenient aliases.
855 : const _task *task = &ctx->tasks[level][itask];
856 :
857 : if (task->level != level) {
858 : printf("level %d, %d\n", task->level, level);
859 : abort();
860 : }
861 :
862 : if (task->update_block_ || (prev_task == NULL)) {
863 : /* need to load pab if forces are needed */
864 : if (calculate_forces) {
865 : extract_blocks(ctx, task, pab_blocks, &work, &pab);
866 : }
867 : /* store the coefficients of the operator after rotation to
868 : * the spherical harmonic basis */
869 :
870 : rotate_and_store_coefficients(ctx, prev_task, task, &hab, &work,
871 : hab_block_local);
872 :
873 : /* There is a difference here between collocate and integrate.
874 : * For collocate, I do not need to know the task where blocks
875 : * have been updated the last time. For integrate this
876 : * information is crucial to update the coefficients of the
877 : * potential */
878 : prev_task = task;
879 : memset(hab.data, 0, sizeof(double) * hab.alloc_size_);
880 : }
881 :
882 : /* the grid is divided over several ranks or not periodic */
883 : if ((handler->grid.size[0] != handler->grid.full_size[0]) ||
884 : (handler->grid.size[1] != handler->grid.full_size[1]) ||
885 : (handler->grid.size[2] != handler->grid.full_size[2])) {
886 : /* unfortunately the window where the gaussian should be added depends
887 : * on the bonds. So I have to adjust the window all the time. */
888 :
889 : setup_grid_window(&handler->grid, shift_local, border_width,
890 : task->border_mask);
891 : }
892 :
893 : int lmax[2] = {task->lmax[0] + handler->lmax_diff[0],
894 : task->lmax[1] + handler->lmax_diff[1]};
895 : int lmin[2] = {task->lmin[0] + handler->lmin_diff[0],
896 : task->lmin[1] + handler->lmin_diff[1]};
897 :
898 : lmin[0] = imax(lmin[0], 0);
899 : lmin[1] = imax(lmin[1], 0);
900 :
901 : initialize_tensor_4(&(handler->alpha), 3, lmax[1] + 1, lmax[0] + 1,
902 : lmax[0] + lmax[1] + 1);
903 : realloc_tensor(&(handler->alpha));
904 :
905 : const int lp = lmax[0] + lmax[1];
906 :
907 : initialize_tensor_3(&(handler->coef), lp + 1, lp + 1, lp + 1);
908 : realloc_tensor(&(handler->coef));
909 : grid_integrate(handler, ctx->orthorhombic, task->zetp, task->rp,
910 : task->radius);
911 : /*
912 : handler->coef contains coefficients in the (x - x_12) basis. Now
913 : we need to rotate them into the (x - x_1) (x - x_2) basis
914 : */
915 :
916 : /* compute the transformation matrix */
917 : grid_prepare_alpha_dgemm(task->ra, task->rb, task->rp, lmax,
918 : &(handler->alpha));
919 :
920 : initialize_tensor_2(&vab, ncoset(lmax[1]), ncoset(lmax[0]));
921 : realloc_tensor(&vab);
922 :
923 : grid_compute_vab(lmin, lmax, lp, task->prefactor,
924 : &handler->alpha, // contains the matrix for the rotation
925 : &handler->coef, // contains the coefficients of
926 : // the potential in the
927 : // (x - x_12) basis
928 : &vab); // contains the coefficients of the potential
929 : // in the (x - x_1)(x - x_2) basis
930 :
931 : if (calculate_forces) {
932 : memset(forces_local_pair_.data, 0,
933 : sizeof(double) * forces_local_pair_.alloc_size_);
934 : memset(virial_local_pair_.data, 0,
935 : sizeof(double) * virial_local_pair_.alloc_size_);
936 : }
937 :
938 : update_hab_forces_and_stress(
939 : task, &vab, &pab, calculate_tau, calculate_forces, calculate_virial,
940 : &forces_local_pair_, /* matrix
941 : * containing the
942 : * contribution of
943 : * the gaussian
944 : * pair for each
945 : * atom */
946 : &virial_local_pair_, /* same but for the virial term (stress tensor)
947 : */
948 : &hab);
949 :
950 : if (calculate_forces) {
951 : const double scaling = (task->iatom == task->jatom) ? 1.0 : 2.0;
952 : idx2(forces_local_, task->iatom, 0) +=
953 : scaling * idx2(forces_local_pair_, 0, 0);
954 : idx2(forces_local_, task->iatom, 1) +=
955 : scaling * idx2(forces_local_pair_, 0, 1);
956 : idx2(forces_local_, task->iatom, 2) +=
957 : scaling * idx2(forces_local_pair_, 0, 2);
958 :
959 : idx2(forces_local_, task->jatom, 0) +=
960 : scaling * idx2(forces_local_pair_, 1, 0);
961 : idx2(forces_local_, task->jatom, 1) +=
962 : scaling * idx2(forces_local_pair_, 1, 1);
963 : idx2(forces_local_, task->jatom, 2) +=
964 : scaling * idx2(forces_local_pair_, 1, 2);
965 : if (calculate_virial) {
966 : for (int i = 0; i < 3; i++) {
967 : for (int j = 0; j < 3; j++) {
968 : idx2(virial_local_, i, j) +=
969 : scaling * (idx3(virial_local_pair_, 0, i, j) +
970 : idx3(virial_local_pair_, 1, i, j));
971 : }
972 : }
973 : }
974 : }
975 : }
976 :
977 : rotate_and_store_coefficients(ctx, prev_task, NULL, &hab, &work,
978 : hab_block_local);
979 :
980 : // now reduction over the hab blocks
981 : if (num_threads > 1) {
982 : // does not store the number of elements but the amount of memory
983 : // occupied. That's a strange choice.
984 : const int hab_size = hab_blocks->size / sizeof(double);
985 : if ((hab_size / num_threads) >= 2) {
986 : const int block_size =
987 : hab_size / num_threads + (hab_size % num_threads);
988 :
989 : for (int bk = 0; bk < num_threads; bk++) {
990 : int bk_id = (bk + thread_id) % num_threads;
991 : size_t begin = bk_id * block_size;
992 : size_t end = imin((bk_id + 1) * block_size, hab_size);
993 : cblas_daxpy(end - begin, 1.0, hab_block_local + begin, 1,
994 : hab_blocks->host_buffer + begin, 1);
995 : #pragma omp barrier
996 : }
997 : } else {
998 : const int hab_size = hab_blocks->size / sizeof(double);
999 : #pragma omp critical
1000 : cblas_daxpy(hab_size, 1.0, hab_block_local, 1, hab_blocks->host_buffer,
1001 : 1);
1002 : }
1003 : }
1004 :
1005 : if (calculate_forces) {
1006 : if (num_threads > 1) {
1007 : if ((forces_->alloc_size_ / num_threads) >= 2) {
1008 : const int block_size = forces_->alloc_size_ / num_threads +
1009 : (forces_->alloc_size_ % num_threads);
1010 :
1011 : for (int bk = 0; bk < num_threads; bk++) {
1012 : int bk_id = (bk + thread_id) % num_threads;
1013 : int begin = bk_id * block_size;
1014 : int end = imin((bk_id + 1) * block_size, forces_->alloc_size_);
1015 : cblas_daxpy(end - begin, 1.0, forces_local_.data + begin, 1,
1016 : forces_->data + begin, 1);
1017 : #pragma omp barrier
1018 : }
1019 : } else {
1020 : #pragma omp critical
1021 : cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
1022 : forces_->data, 1);
1023 : }
1024 : } else {
1025 : // we are running with OMP_NUM_THREADS=1
1026 : cblas_daxpy(forces_local_.alloc_size_, 1.0, forces_local_.data, 1,
1027 : forces_->data, 1);
1028 : }
1029 : }
1030 :
1031 : if (calculate_virial) {
1032 : #pragma omp critical
1033 : for (int i = 0; i < 3; i++) {
1034 : for (int j = 0; j < 3; j++) {
1035 : idx2(virial_[0], i, j) += idx2(virial_local_, i, j);
1036 : }
1037 : }
1038 : }
1039 :
1040 : handler->grid.data = NULL;
1041 : free(vab.data);
1042 : free(work.data);
1043 : free(pab.data);
1044 : free(hab.data);
1045 :
1046 : if (calculate_forces) {
1047 : free(forces_local_.data);
1048 : free(virial_local_.data);
1049 : free(virial_local_pair_.data);
1050 : free(forces_local_pair_.data);
1051 : }
1052 : }
1053 624 : }
1054 :
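/* Sketch of the rotating block reduction used above to merge the per-thread
 * hab (and forces) buffers: thread t starts at block t and walks all blocks,
 * so between consecutive barriers every block is written by exactly one
 * thread, avoiding both a global critical section and write contention.
 * Hypothetical helper; must be executed by all threads of the parallel team:
 */
static void reduce_rotating(double *const global, double *const *const local,
                            const int n, const int num_threads,
                            const int thread_id) {
  const int block = n / num_threads + (n % num_threads);
  for (int bk = 0; bk < num_threads; bk++) {
    const int id = (bk + thread_id) % num_threads;
    const int begin = id * block;
    const int end = imin(begin + block, n); /* imin from grid_common.h */
    for (int i = begin; i < end; i++)
      global[i] += local[thread_id][i];
#pragma omp barrier
  }
}
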
1055 : /*******************************************************************************
1056 : * \brief Integrate all tasks of the given list from the given grids using
1057 : * matrix-matrix multiplication.
1058 : ******************************************************************************/
1059 156 : void grid_dgemm_integrate_task_list(
1060 : void *ptr, const bool compute_tau, const int natoms, const int nlevels,
1061 : const offload_buffer *const pab_blocks, offload_buffer *grids[nlevels],
1062 : offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
1063 :
1064 156 : grid_context *const ctx = (grid_context *)ptr;
1065 :
1066 156 : assert(ctx->checksum == ctx_checksum);
1067 156 : assert(ctx->nlevels == nlevels);
1068 156 : assert(ctx->natoms == natoms);
1069 :
1070 : // Zero result arrays.
1071 156 : memset(hab_blocks->host_buffer, 0, hab_blocks->size);
1072 :
1073 156 : const int max_threads = omp_get_max_threads();
1074 :
1075 156 : if (ctx->scratch == NULL)
1076 156 : ctx->scratch = malloc(hab_blocks->size * max_threads);
1077 156 : assert(ctx->scratch != NULL);
1078 :
1079 : // #pragma omp parallel for
1080 780 : for (int level = 0; level < ctx->nlevels; level++) {
1081 624 : const _layout *layout = &ctx->layouts[level];
1082 624 : set_grid_parameters(&ctx->grid[level], ctx->orthorhombic,
1083 624 : layout->npts_global, layout->npts_local,
1084 624 : layout->shift_local, layout->border_width, layout->dh,
1085 624 : layout->dh_inv, grids[level]);
1086 624 : ctx->grid[level].data = grids[level]->host_buffer;
1087 : }
1088 :
1089 156 : bool calculate_virial = (virial != NULL);
1090 156 : bool calculate_forces = (forces != NULL);
1091 :
1092 156 : tensor forces_, virial_;
1093 156 : if (calculate_forces) {
1094 20 : initialize_tensor_2(&forces_, natoms, 3);
1095 20 : alloc_tensor(&forces_);
1096 20 : initialize_tensor_2(&virial_, 3, 3);
1097 20 : alloc_tensor(&virial_);
1098 20 : memset(forces_.data, 0, sizeof(double) * forces_.alloc_size_);
1099 20 : memset(virial_.data, 0, sizeof(double) * virial_.alloc_size_);
1100 : }
1101 :
1102 780 : for (int level = 0; level < ctx->nlevels; level++) {
1103 624 : const _layout *layout = &ctx->layouts[level];
1104 624 : integrate_one_grid_level_dgemm(ctx, level, compute_tau, calculate_forces,
1105 624 : calculate_virial, layout->shift_local,
1106 624 : layout->border_width, pab_blocks, hab_blocks,
1107 : &forces_, &virial_);
1108 624 : ctx->grid[level].data = NULL;
1109 : }
1110 156 : if (calculate_forces) {
1111 20 : if (calculate_virial) {
1112 16 : virial[0][0] = idx2(virial_, 0, 0);
1113 16 : virial[0][1] = idx2(virial_, 0, 1);
1114 16 : virial[0][2] = idx2(virial_, 0, 2);
1115 16 : virial[1][0] = idx2(virial_, 1, 0);
1116 16 : virial[1][1] = idx2(virial_, 1, 1);
1117 16 : virial[1][2] = idx2(virial_, 1, 2);
1118 16 : virial[2][0] = idx2(virial_, 2, 0);
1119 16 : virial[2][1] = idx2(virial_, 2, 1);
1120 16 : virial[2][2] = idx2(virial_, 2, 2);
1121 : }
1122 :
1123 20 : memcpy(forces[0], forces_.data, sizeof(double) * forces_.alloc_size_);
1124 20 : free(forces_.data);
1125 20 : free(virial_.data);
1126 : }
1127 :
1128 156 : free(ctx->scratch);
1129 156 : ctx->scratch = NULL;
1130 156 : }
|