LCOV - code coverage report
Current view: top level - src/grid/ref - grid_ref_collint.h (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:262480d) Lines: 434 449 96.7 %
Date: 2024-11-22 07:00:40 Functions: 17 17 100.0 %

          Line data    Source code
       1             : /*----------------------------------------------------------------------------*/
       2             : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3             : /*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
       4             : /*                                                                            */
       5             : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6             : /*----------------------------------------------------------------------------*/
       7             : 
       8             : #include <assert.h>
       9             : #include <limits.h>
      10             : #include <math.h>
      11             : #include <stdio.h>
      12             : #include <stdlib.h>
      13             : #include <string.h>
      14             : 
      15             : #include "../common/grid_common.h"
      16             : #include "../common/grid_library.h"
      17             : 
      18             : #if (GRID_DO_COLLOCATE)
      19             : #define GRID_CONST_WHEN_COLLOCATE const
      20             : #define GRID_CONST_WHEN_INTEGRATE
      21             : #else
      22             : #define GRID_CONST_WHEN_COLLOCATE
      23             : #define GRID_CONST_WHEN_INTEGRATE const
      24             : #endif
      25             : 
      26             : /*******************************************************************************
      27             :  * \brief Collocates coefficients C_x onto the grid for orthorhombic case.
      28             :  * \author Ole Schuett
      29             :  ******************************************************************************/
      30             : static inline void
      31      221940 : ortho_cx_to_grid(const int lp, const int k1, const int k2, const int j1,
      32             :                  const int j2, const int cmax,
      33      221940 :                  const double pol[3][lp + 1][2 * cmax + 1],
      34             :                  const int map[3][2 * cmax + 1], const double dh[3][3],
      35             :                  const double dh_inv[3][3], const double kremain,
      36             :                  const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cx,
      37             :                  GRID_CONST_WHEN_INTEGRATE double *grid) {
      38             : 
      39      221940 :   const int kg1 = map[2][k1 + cmax];
      40      221940 :   const int kg2 = map[2][k2 + cmax];
      41      221940 :   const int jg1 = map[1][j1 + cmax];
      42      221940 :   const int jg2 = map[1][j2 + cmax];
      43             : 
      44             :   // For this innermost dimension we're going sequentially instead of in pairs.
      45             :   // Hence we're processing four points at once, which is well suited for AVX2.
      46      221940 :   const int jd = (2 * j1 - 1) / 2; // distance from center in grid points
      47      221940 :   const double jr = jd * dh[1][1]; // distance from center in a.u.
      48      221940 :   const double jremain = kremain - jr * jr;
      49      221940 :   const int istart = (int)ceil(-1e-8 - sqrt(fmax(0.0, jremain)) * dh_inv[0][0]);
      50      221940 :   const int iend = 1 - istart;
      51             : 
      52     3833244 :   for (int i = istart; i <= iend; i++) {
      53     3611304 :     const int ig = map[0][i + cmax];
      54             : 
      55     3611304 :     const int stride = npts_local[1] * npts_local[0];
      56     3611304 :     const int grid_index_0 = kg1 * stride + jg1 * npts_local[0] + ig;
      57     3611304 :     const int grid_index_1 = kg2 * stride + jg1 * npts_local[0] + ig;
      58     3611304 :     const int grid_index_2 = kg1 * stride + jg2 * npts_local[0] + ig;
      59     3611304 :     const int grid_index_3 = kg2 * stride + jg2 * npts_local[0] + ig;
      60             : 
      61     3611304 :     GRID_CONST_WHEN_INTEGRATE double *grid_0 = &grid[grid_index_0];
      62     3611304 :     GRID_CONST_WHEN_INTEGRATE double *grid_1 = &grid[grid_index_1];
      63     3611304 :     GRID_CONST_WHEN_INTEGRATE double *grid_2 = &grid[grid_index_2];
      64     3611304 :     GRID_CONST_WHEN_INTEGRATE double *grid_3 = &grid[grid_index_3];
      65             : 
      66             : #if (GRID_DO_COLLOCATE)
      67             :     // collocate
      68     1805652 :     double reg[4] = {0.0, 0.0, 0.0, 0.0};
      69     7897392 :     for (int lxp = 0; lxp <= lp; lxp++) {
      70     6091740 :       const double p = pol[0][lxp][i + cmax];
      71     6091740 :       reg[0] += cx[lxp * 4 + 0] * p;
      72     6091740 :       reg[1] += cx[lxp * 4 + 1] * p;
      73     6091740 :       reg[2] += cx[lxp * 4 + 2] * p;
      74     6091740 :       reg[3] += cx[lxp * 4 + 3] * p;
      75             :     }
      76     1805652 :     *grid_0 += reg[0];
      77     1805652 :     *grid_1 += reg[1];
      78     1805652 :     *grid_2 += reg[2];
      79     1805652 :     *grid_3 += reg[3];
      80             : 
      81             : #else
      82             :     // integrate
      83     1805652 :     const double reg[4] = {*grid_0, *grid_1, *grid_2, *grid_3};
      84     8098020 :     for (int lxp = 0; lxp <= lp; lxp++) {
      85     6292368 :       const double p = pol[0][lxp][i + cmax];
      86     6292368 :       cx[lxp * 4 + 0] += reg[0] * p;
      87     6292368 :       cx[lxp * 4 + 1] += reg[1] * p;
      88     6292368 :       cx[lxp * 4 + 2] += reg[2] * p;
      89     6292368 :       cx[lxp * 4 + 3] += reg[3] * p;
      90             :     }
      91             : #endif
      92             :   }
      93      221940 : }
      94             : 
      95             : /*******************************************************************************
      96             :  * \brief Transforms coefficients C_xy into C_x by fixing grid index j.
      97             :  * \author Ole Schuett
      98             :  ******************************************************************************/
      99      221940 : static inline void ortho_cxy_to_cx(const int lp, const int j1, const int j2,
     100             :                                    const int cmax,
     101      221940 :                                    const double pol[3][lp + 1][2 * cmax + 1],
     102             :                                    GRID_CONST_WHEN_COLLOCATE double *cxy,
     103             :                                    GRID_CONST_WHEN_INTEGRATE double *cx) {
     104             : 
     105      973818 :   for (int lyp = 0; lyp <= lp; lyp++) {
     106     2441602 :     for (int lxp = 0; lxp <= lp - lyp; lxp++) {
     107     1689724 :       const double p1 = pol[1][lyp][j1 + cmax];
     108     1689724 :       const double p2 = pol[1][lyp][j2 + cmax];
     109     1689724 :       const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2; // [lyp, lxp, 0]
     110             : 
     111             : #if (GRID_DO_COLLOCATE)
     112             :       // collocate
     113      818154 :       cx[lxp * 4 + 0] += cxy[cxy_index + 0] * p1;
     114      818154 :       cx[lxp * 4 + 1] += cxy[cxy_index + 1] * p1;
     115      818154 :       cx[lxp * 4 + 2] += cxy[cxy_index + 0] * p2;
     116      818154 :       cx[lxp * 4 + 3] += cxy[cxy_index + 1] * p2;
     117             : #else
     118             :       // integrate
     119      871570 :       cxy[cxy_index + 0] += cx[lxp * 4 + 0] * p1;
     120      871570 :       cxy[cxy_index + 1] += cx[lxp * 4 + 1] * p1;
     121      871570 :       cxy[cxy_index + 0] += cx[lxp * 4 + 2] * p2;
     122      871570 :       cxy[cxy_index + 1] += cx[lxp * 4 + 3] * p2;
     123             : #endif
     124             :     }
     125             :   }
     126      221940 : }
     127             : 
     128             : /*******************************************************************************
     129             :  * \brief Collocates coefficients C_xy onto the grid for orthorhombic case.
     130             :  * \author Ole Schuett
     131             :  ******************************************************************************/
     132       28620 : static inline void ortho_cxy_to_grid(
     133             :     const int lp, const int k1, const int k2, const int cmax,
     134       28620 :     const double pol[3][lp + 1][2 * cmax + 1], const int map[3][2 * cmax + 1],
     135             :     const double dh[3][3], const double dh_inv[3][3], const double disr_radius,
     136             :     const int npts_local[3], GRID_CONST_WHEN_COLLOCATE double *cxy,
     137       28620 :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     138             : 
     139             :   // The cube contains an even number of grid points in each direction and
     140             :   // collocation is always performed on a pair of two opposing grid points.
     141             :   // Hence, the points with index 0 and 1 are both assigned distance zero via
     142             :   // the formular distance=(2*index-1)/2.
     143             : 
     144       28620 :   const int kd = (2 * k1 - 1) / 2; // distance from center in grid points
     145       28620 :   const double kr = kd * dh[2][2]; // distance from center in a.u.
     146       28620 :   const double kremain = disr_radius * disr_radius - kr * kr;
     147       28620 :   const int jstart = (int)ceil(-1e-8 - sqrt(fmax(0.0, kremain)) * dh_inv[1][1]);
     148             : 
     149       28620 :   const size_t cx_size = (lp + 1) * 4;
     150       28620 :   double cx[cx_size];
     151      250560 :   for (int j1 = jstart; j1 <= 0; j1++) {
     152      221940 :     const int j2 = 1 - j1;
     153      221940 :     memset(cx, 0, cx_size * sizeof(double));
     154             : 
     155             : #if (GRID_DO_COLLOCATE)
     156             :     // collocate
     157      110970 :     ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
     158      110970 :     ortho_cx_to_grid(lp, k1, k2, j1, j2, cmax, pol, map, dh, dh_inv, kremain,
     159             :                      npts_local, cx, grid);
     160             : #else
     161             :     // integrate
     162      110970 :     ortho_cx_to_grid(lp, k1, k2, j1, j2, cmax, pol, map, dh, dh_inv, kremain,
     163             :                      npts_local, cx, grid);
     164      110970 :     ortho_cxy_to_cx(lp, j1, j2, cmax, pol, cxy, cx);
     165             : #endif
     166             :   }
     167       28620 : }
     168             : 
     169             : /*******************************************************************************
     170             :  * \brief Transforms coefficients C_xyz into C_xz by fixing grid index k.
     171             :  * \author Ole Schuett
     172             :  ******************************************************************************/
     173       28620 : static inline void ortho_cxyz_to_cxy(const int lp, const int k1, const int k2,
     174             :                                      const int cmax,
     175       28620 :                                      const double pol[3][lp + 1][2 * cmax + 1],
     176             :                                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     177             :                                      GRID_CONST_WHEN_INTEGRATE double *cxy) {
     178             : 
     179      124350 :   for (int lzp = 0; lzp <= lp; lzp++) {
     180      308074 :     for (int lyp = 0; lyp <= lp - lzp; lyp++) {
     181      603002 :       for (int lxp = 0; lxp <= lp - lzp - lyp; lxp++) {
     182      390658 :         const double p1 = pol[2][lzp][k1 + cmax];
     183      390658 :         const double p2 = pol[2][lzp][k2 + cmax];
     184      390658 :         const int cxyz_index =
     185      390658 :             lzp * (lp + 1) * (lp + 1) + lyp * (lp + 1) + lxp; // [lzp, lyp, lxp]
     186      390658 :         const int cxy_index = lyp * (lp + 1) * 2 + lxp * 2;   // [lyp, lxp, 0]
     187             : 
     188             : #if (GRID_DO_COLLOCATE)
     189             :         // collocate
     190      186210 :         cxy[cxy_index + 0] += cxyz[cxyz_index] * p1;
     191      186210 :         cxy[cxy_index + 1] += cxyz[cxyz_index] * p2;
     192             : #else
     193             :         // integrate
     194      204448 :         cxyz[cxyz_index] += cxy[cxy_index + 0] * p1;
     195      204448 :         cxyz[cxyz_index] += cxy[cxy_index + 1] * p2;
     196             : #endif
     197             :       }
     198             :     }
     199             :   }
     200       28620 : }
     201             : 
     202             : /*******************************************************************************
     203             :  * \brief Collocates coefficients C_xyz onto the grid for orthorhombic case.
     204             :  * \author Ole Schuett
     205             :  ******************************************************************************/
     206             : static inline void
     207        3276 : ortho_cxyz_to_grid(const int lp, const double zetp, const double dh[3][3],
     208             :                    const double dh_inv[3][3], const double rp[3],
     209             :                    const int npts_global[3], const int npts_local[3],
     210             :                    const int shift_local[3], const double radius,
     211             :                    GRID_CONST_WHEN_COLLOCATE double *cxyz,
     212        3276 :                    GRID_CONST_WHEN_INTEGRATE double *grid) {
     213             : 
     214             :   // *** position of the gaussian product
     215             :   //
     216             :   // this is the actual definition of the position on the grid
     217             :   // i.e. a point rp(:) gets here grid coordinates
     218             :   // MODULO(rp(:)/dr(:),npts_global(:))+1
     219             :   // hence (0.0,0.0,0.0) in real space is rsgrid%lb on the rsgrid in Fortran
     220             :   // and (1,1,1) on grid here in C.
     221             : 
     222             :   // cubecenter(:) = FLOOR(MATMUL(dh_inv, rp))
     223        3276 :   int cubecenter[3];
     224       13104 :   for (int i = 0; i < 3; i++) {
     225             :     double dh_inv_rp = 0.0;
     226       39312 :     for (int j = 0; j < 3; j++) {
     227       29484 :       dh_inv_rp += dh_inv[j][i] * rp[j];
     228             :     }
     229        9828 :     cubecenter[i] = (int)floor(dh_inv_rp);
     230             :   }
     231             : 
     232             :   double roffset[3];
     233       13104 :   for (int i = 0; i < 3; i++) {
     234        9828 :     roffset[i] = rp[i] - ((double)cubecenter[i]) * dh[i][i];
     235             :   }
     236             : 
     237             :   // Historically, the radius gets discretized to make sphere bounds cacheable.
     238        3276 :   const double drmin = fmin(dh[0][0], fmin(dh[1][1], dh[2][2]));
     239        3276 :   const double disr_radius = drmin * fmax(1.0, ceil(radius / drmin));
     240             : 
     241             :   // Cube bounds.
     242        3276 :   int lb_cube[3], ub_cube[3];
     243       13104 :   for (int i = 0; i < 3; i++) {
     244        9828 :     lb_cube[i] = (int)ceil(-1e-8 - disr_radius * dh_inv[i][i]);
     245        9828 :     ub_cube[i] = 1 - lb_cube[i];
     246             :     // If grid is not period check that cube fits without wrapping.
     247        9828 :     if (npts_global[i] != npts_local[i]) {
     248           0 :       const int offset =
     249           0 :           modulo(cubecenter[i] + lb_cube[i] - shift_local[i], npts_global[i]) -
     250             :           lb_cube[i];
     251           0 :       assert(offset + ub_cube[i] < npts_local[i]);
     252           0 :       assert(offset + lb_cube[i] >= 0);
     253             :     }
     254             :   }
     255             : 
     256             :   // cmax = MAXVAL(ub_cube)
     257        3276 :   const int cmax = imax(imax(ub_cube[0], ub_cube[1]), ub_cube[2]);
     258             : 
     259             :   // Precompute (x-xp)**lp*exp(..) for each direction.
     260        3276 :   double pol_mutable[3][lp + 1][2 * cmax + 1];
     261       13104 :   for (int idir = 0; idir < 3; idir++) {
     262        9828 :     const double dr = dh[idir][idir];
     263        9828 :     const double ro = roffset[idir];
     264             :     //  Reuse the result from the previous gridpoint to avoid to many exps:
     265             :     //  exp( -a*(x+d)**2) = exp(-a*x**2)*exp(-2*a*x*d)*exp(-a*d**2)
     266             :     //  exp(-2*a*(x+d)*d) = exp(-2*a*x*d)*exp(-2*a*d**2)
     267        9828 :     const double t_exp_1 = exp(-zetp * pow(dr, 2));
     268        9828 :     const double t_exp_2 = pow(t_exp_1, 2);
     269        9828 :     double t_exp_min_1 = exp(-zetp * pow(+dr - ro, 2));
     270        9828 :     double t_exp_min_2 = exp(-2 * zetp * (+dr - ro) * (-dr));
     271       95688 :     for (int ig = 0; ig >= lb_cube[idir]; ig--) {
     272       85860 :       const double rpg = ig * dr - ro;
     273       85860 :       t_exp_min_1 *= t_exp_min_2 * t_exp_1;
     274       85860 :       t_exp_min_2 *= t_exp_2;
     275       85860 :       double pg = t_exp_min_1;
     276      373050 :       for (int icoef = 0; icoef <= lp; icoef++) {
     277      287190 :         pol_mutable[idir][icoef][ig + cmax] = pg; // exp(-zetp*rpg**2)
     278      287190 :         pg *= rpg;
     279             :       }
     280             :     }
     281        9828 :     double t_exp_plus_1 = exp(-zetp * pow(-ro, 2));
     282        9828 :     double t_exp_plus_2 = exp(-2 * zetp * (-ro) * (+dr));
     283       95688 :     for (int ig = 0; ig >= lb_cube[idir]; ig--) {
     284       85860 :       const double rpg = (1 - ig) * dr - ro;
     285       85860 :       t_exp_plus_1 *= t_exp_plus_2 * t_exp_1;
     286       85860 :       t_exp_plus_2 *= t_exp_2;
     287       85860 :       double pg = t_exp_plus_1;
     288      373050 :       for (int icoef = 0; icoef <= lp; icoef++) {
     289      287190 :         pol_mutable[idir][icoef][1 - ig + cmax] = pg; // exp(-zetp*rpg**2)
     290      287190 :         pg *= rpg;
     291             :       }
     292             :     }
     293             :   }
     294        3276 :   const double(*pol)[lp + 1][2 * cmax + 1] =
     295        3276 :       (const double(*)[lp + 1][2 * cmax + 1]) pol_mutable;
     296             : 
     297             :   // Precompute mapping from cube to grid indices for each direction
     298        3276 :   int map_mutable[3][2 * cmax + 1];
     299       13104 :   for (int i = 0; i < 3; i++) {
     300      191376 :     for (int k = -cmax; k <= +cmax; k++) {
     301      181548 :       map_mutable[i][k + cmax] =
     302      181548 :           modulo(cubecenter[i] + k - shift_local[i], npts_global[i]);
     303             :     }
     304             :   }
     305        3276 :   const int(*map)[2 * cmax + 1] = (const int(*)[2 * cmax + 1]) map_mutable;
     306             : 
     307             :   // Loop over k dimension of the cube.
     308        3276 :   const int kstart = (int)ceil(-1e-8 - disr_radius * dh_inv[2][2]);
     309        3276 :   const size_t cxy_size = (lp + 1) * (lp + 1) * 2;
     310        3276 :   double cxy[cxy_size];
     311       31896 :   for (int k1 = kstart; k1 <= 0; k1++) {
     312       28620 :     const int k2 = 1 - k1;
     313       28620 :     memset(cxy, 0, cxy_size * sizeof(double));
     314             : 
     315             : #if (GRID_DO_COLLOCATE)
     316             :     // collocate
     317       14310 :     ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
     318       14310 :     ortho_cxy_to_grid(lp, k1, k2, cmax, pol, map, dh, dh_inv, disr_radius,
     319             :                       npts_local, cxy, grid);
     320             : #else
     321             :     // integrate
     322       14310 :     ortho_cxy_to_grid(lp, k1, k2, cmax, pol, map, dh, dh_inv, disr_radius,
     323             :                       npts_local, cxy, grid);
     324       14310 :     ortho_cxyz_to_cxy(lp, k1, k2, cmax, pol, cxyz, cxy);
     325             : #endif
     326             :   }
     327        3276 : }
     328             : 
     329             : /*******************************************************************************
     330             :  * \brief Collocates coefficients C_i onto the grid for general case.
     331             :  * \author Ole Schuett
     332             :  ******************************************************************************/
     333             : static inline void
     334     2079910 : general_ci_to_grid(const int lp, const int jg, const int kg, const int ismin,
     335             :                    const int ismax, const int npts_local[3],
     336             :                    const int index_min[3], const int index_max[3],
     337             :                    const int map_i[], const double gp[3], const int k,
     338             :                    const int j, const double exp_ij[], const double exp_jk[],
     339             :                    const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *ci,
     340             :                    GRID_CONST_WHEN_INTEGRATE double *grid) {
     341             : 
     342     2079910 :   const int base = kg * npts_local[1] * npts_local[0] + jg * npts_local[0];
     343             : 
     344    23376248 :   for (int i = ismin; i <= ismax; i++) {
     345    21296338 :     const int ig = map_i[i - index_min[0]];
     346    21296338 :     if (ig < 0) {
     347           0 :       continue; // skip over out-of-bounds indicies
     348             :     }
     349    21296338 :     const double di = i - gp[0];
     350             : 
     351    21296338 :     const int stride_i = index_max[0] - index_min[0] + 1;
     352    21296338 :     const int stride_j = index_max[1] - index_min[1] + 1;
     353    21296338 :     const int stride_k = index_max[2] - index_min[2] + 1;
     354    21296338 :     const int idx_ij = (j - index_min[1]) * stride_i + i - index_min[0];
     355    21296338 :     const int idx_jk = (k - index_min[2]) * stride_j + j - index_min[1];
     356    21296338 :     const int idx_ki = (i - index_min[0]) * stride_k + k - index_min[2];
     357             : 
     358             :     // Mathieu's trick: Calculate 3D Gaussian from three precomputed 2D tables
     359             :     //
     360             :     // r   =  (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     361             :     //     =  a                 + b                 + c
     362             :     //
     363             :     // r**2  =  (a + b + c)**2  =  a**2 + b**2 + c**2 + 2ab + 2bc + 2ca
     364             :     //
     365             :     // exp(-r**2)  =  exp(-a(a+2b)) * exp(-b*(b+2c)) * exp(-c*(c+2a))
     366             :     //
     367    21296338 :     const double gaussian = exp_ij[idx_ij] * exp_jk[idx_jk] * exp_ki[idx_ki];
     368             : 
     369    21296338 :     const int grid_index = base + ig; // [kg, jg, ig]
     370    21296338 :     double dip = gaussian;
     371             : 
     372             : #if (GRID_DO_COLLOCATE)
     373             :     // collocate
     374    12169336 :     double reg = 0.0;
     375    48677344 :     for (int il = 0; il <= lp; il++) {
     376    36508008 :       reg += ci[il] * dip;
     377    36508008 :       dip *= di;
     378             :     }
     379    12169336 :     grid[grid_index] += reg;
     380             : #else
     381             :     // integrate
     382     9127002 :     const double reg = grid[grid_index];
     383    39550342 :     for (int il = 0; il <= lp; il++) {
     384    30423340 :       ci[il] += reg * dip;
     385    30423340 :       dip *= di;
     386             :     }
     387             : #endif
     388             :   }
     389     2079910 : }
     390             : 
     391             : /*******************************************************************************
     392             :  * \brief Transforms coefficients C_ij into C_i by fixing grid index j.
     393             :  * \author Ole Schuett
     394             :  ******************************************************************************/
     395     2079910 : static inline void general_cij_to_ci(const int lp, const double dj,
     396             :                                      GRID_CONST_WHEN_COLLOCATE double *cij,
     397             :                                      GRID_CONST_WHEN_INTEGRATE double *ci) {
     398     2079910 :   double djp = 1.0;
     399     8616770 :   for (int jl = 0; jl <= lp; jl++) {
     400    20204840 :     for (int il = 0; il <= lp - jl; il++) {
     401    13667980 :       const int cij_index = jl * (lp + 1) + il; // [jl, il]
     402             : #if (GRID_DO_COLLOCATE)
     403     7131120 :       ci[il] += cij[cij_index] * djp; // collocate
     404             : #else
     405     6536860 :       cij[cij_index] += ci[il] * djp; // integrate
     406             : #endif
     407             :     }
     408     6536860 :     djp *= dj;
     409             :   }
     410     2079910 : }
     411             : 
     412             : /*******************************************************************************
     413             :  * \brief Collocates coefficients C_ij onto the grid for general case.
     414             :  * \author Ole Schuett
     415             :  ******************************************************************************/
     416      175910 : static inline void general_cij_to_grid(
     417             :     const int lp, const int k, const int kg, const int npts_local[3],
     418             :     const int index_min[3], const int index_max[3], const int map_i[],
     419             :     const int map_j[], const double dh[3][3], const double gp[3],
     420             :     const double radius, const double exp_ij[], const double exp_jk[],
     421             :     const double exp_ki[], GRID_CONST_WHEN_COLLOCATE double *cij,
     422             :     GRID_CONST_WHEN_INTEGRATE double *grid) {
     423             : 
     424     4593960 :   for (int j = index_min[1]; j <= index_max[1]; j++) {
     425     4418050 :     const int jg = map_j[j - index_min[1]];
     426     4418050 :     if (jg < 0) {
     427           0 :       continue; // skip over out-of-bounds indicies
     428             :     }
     429     4418050 :     const double dj = j - gp[1];
     430             : 
     431             :     //--------------------------------------------------------------------
     432             :     // Find bounds for the inner loop based on a quadratic equation in i.
     433             :     //
     434             :     // The real-space vector from the center of the gaussian to the
     435             :     // grid point i,j,k is given by:
     436             :     //   r = (i-gp[0])*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     437             :     //
     438             :     // Separating the term that depends on i:
     439             :     //   r = i*dh[0,:] - gp[0]*dh[0,:] + (j-gp[1])*dh[1,:] + (k-gp[2])*dh[2,:]
     440             :     //     = i*dh[0,:] + v
     441             :     //
     442             :     // The squared distance works out to:
     443             :     //   r**2 = dh[0,:]**2 * i**2  +  2 * v * dh[0,:] * i  +  v**2
     444             :     //        = a * i**2           +  b * i                +  c
     445             :     //
     446             :     // Solving r**2==radius**2 for i yields:
     447             :     //    d =  b**2  -  4 * a * (c - radius**2)
     448             :     //    i = (-b \pm sqrt(d)) / (2*a)
     449             :     //
     450     4418050 :     double a = 0.0, b = 0.0, c = 0.0;
     451    17672200 :     for (int i = 0; i < 3; i++) {
     452    13254150 :       const double v = (0 - gp[0]) * dh[0][i] + (j - gp[1]) * dh[1][i] +
     453    13254150 :                        (k - gp[2]) * dh[2][i];
     454    13254150 :       a += dh[0][i] * dh[0][i];
     455    13254150 :       b += 2.0 * v * dh[0][i];
     456    13254150 :       c += v * v;
     457             :     }
     458     4418050 :     const double d = b * b - 4.0 * a * (c - radius * radius);
     459     4418050 :     if (0.0 < d) {
     460     2079910 :       const double sqrt_d = sqrt(d);
     461     2079910 :       const double inv_2a = 1.0 / (2.0 * a);
     462     2079910 :       const int ismin = (int)ceil((-b - sqrt_d) * inv_2a);
     463     2079910 :       const int ismax = (int)floor((-b + sqrt_d) * inv_2a);
     464             : 
     465     2079910 :       double ci[lp + 1];
     466     2079910 :       memset(ci, 0, sizeof(ci));
     467             : 
     468             : #if (GRID_DO_COLLOCATE)
     469             :       // collocate
     470     1188520 :       general_cij_to_ci(lp, dj, cij, ci);
     471     1188520 :       general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min,
     472             :                          index_max, map_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
     473             :                          grid);
     474             : #else
     475             :       // integrate
     476      891390 :       general_ci_to_grid(lp, jg, kg, ismin, ismax, npts_local, index_min,
     477             :                          index_max, map_i, gp, k, j, exp_ij, exp_jk, exp_ki, ci,
     478             :                          grid);
     479      891390 :       general_cij_to_ci(lp, dj, cij, ci);
     480             : #endif
     481             :     }
     482             :   }
     483      175910 : }
     484             : 
     485             : /*******************************************************************************
     486             :  * \brief Transforms coefficients C_ijk into C_ij by fixing grid index k.
     487             :  * \author Ole Schuett
     488             :  ******************************************************************************/
     489      175910 : static inline void general_cijk_to_cij(const int lp, const double dk,
     490             :                                        GRID_CONST_WHEN_COLLOCATE double *cijk,
     491             :                                        GRID_CONST_WHEN_INTEGRATE double *cij) {
     492      175910 :   double dkp = 1.0;
     493      728770 :   for (int kl = 0; kl <= lp; kl++) {
     494     1708840 :     for (int jl = 0; jl <= lp - kl; jl++) {
     495     3166380 :       for (int il = 0; il <= lp - kl - jl; il++) {
     496     2010400 :         const int cij_index = jl * (lp + 1) + il; // [jl, il]
     497     2010400 :         const int cijk_index =
     498     2010400 :             kl * (lp + 1) * (lp + 1) + jl * (lp + 1) + il; // [kl, jl, il]
     499             : #if (GRID_DO_COLLOCATE)
     500     1005200 :         cij[cij_index] += cijk[cijk_index] * dkp; // collocate
     501             : #else
     502     1005200 :         cijk[cijk_index] += cij[cij_index] * dkp; // integrate
     503             : #endif
     504             :       }
     505             :     }
     506      552860 :     dkp *= dk;
     507             :   }
     508      175910 : }
     509             : 
     510             : /*******************************************************************************
     511             :  * \brief Precompute mapping of grid indices for general case.
     512             :  * \author Ole Schuett
     513             :  ******************************************************************************/
     514       32676 : static inline void general_precompute_mapping(const int index_min,
     515             :                                               const int index_max,
     516             :                                               const int shift_local,
     517             :                                               const int npts_global,
     518             :                                               const int bounds[2], int map[]) {
     519             : 
     520             :   // Precompute mapping from continous grid indices to pbc wraped.
     521      736498 :   for (int k = index_min; k <= index_max; k++) {
     522      703822 :     const int kg = modulo(k - shift_local, npts_global);
     523      703822 :     if (bounds[0] <= kg && kg <= bounds[1]) {
     524      703822 :       map[k - index_min] = kg;
     525             :     } else {
     526           0 :       map[k - index_min] = INT_MIN; // out of bounds - not mapped
     527             :     }
     528             :   }
     529       32676 : }
     530             : 
     531             : /*******************************************************************************
     532             :  * \brief Fill one of the 2D tables that speedup 3D Gaussian (Mathieu's trick).
     533             :  * \author Ole Schuett
     534             :  ******************************************************************************/
     535             : static inline void
     536       32676 : general_fill_exp_table(const int idir, const int jdir, const int index_min[3],
     537             :                        const int index_max[3], const double zetp,
     538             :                        const double dh[3][3], const double gp[3],
     539             :                        double exp_table[]) {
     540             : 
     541       32676 :   const int stride_i = index_max[idir] - index_min[idir] + 1;
     542       32676 :   const double h_ii = dh[idir][0] * dh[idir][0] + dh[idir][1] * dh[idir][1] +
     543       32676 :                       dh[idir][2] * dh[idir][2];
     544       32676 :   const double h_ij = dh[idir][0] * dh[jdir][0] + dh[idir][1] * dh[jdir][1] +
     545       32676 :                       dh[idir][2] * dh[jdir][2];
     546             : 
     547      736498 :   for (int i = index_min[idir]; i <= index_max[idir]; i++) {
     548      703822 :     const double di = i - gp[idir];
     549      703822 :     const double rii = di * di * h_ii;
     550      703822 :     const double rij_unit = di * h_ij;
     551      703822 :     const double exp_ij_unit = exp(-zetp * 2.0 * rij_unit);
     552             : 
     553             :     // compute exponentials symmetrically around cube center
     554      703822 :     const int j_center = (int)gp[jdir];
     555      703822 :     const double dj_center = j_center - gp[jdir];
     556      703822 :     const double rij_center = dj_center * rij_unit;
     557      703822 :     const double exp_ij_center = exp(-zetp * (rii + 2.0 * rij_center));
     558             : 
     559             :     // above center
     560      703822 :     double exp_ij = exp_ij_center;
     561     8769656 :     for (int j = j_center; j <= index_max[jdir]; j++) {
     562     8065834 :       const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
     563     8065834 :       exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
     564     8065834 :       exp_ij *= exp_ij_unit;
     565             :     }
     566             : 
     567             :     // below center
     568      703822 :     const double exp_ij_unit_inv = 1.0 / exp_ij_unit;
     569      703822 :     exp_ij = exp_ij_center * exp_ij_unit_inv;
     570     8150996 :     for (int j = j_center - 1; j >= index_min[jdir]; j--) {
     571     7447174 :       const int idx = (j - index_min[jdir]) * stride_i + i - index_min[idir];
     572     7447174 :       exp_table[idx] = exp_ij; // exp(-zetp * (di*di*h_ii + 2*di*dj*h_ij));
     573     7447174 :       exp_ij *= exp_ij_unit_inv;
     574             :     }
     575             :   }
     576       32676 : }
     577             : 
     578             : /*******************************************************************************
     579             :  * \brief Collocates coefficients C_ijk onto the grid for general case.
     580             :  * \author Ole Schuett
     581             :  ******************************************************************************/
     582             : static inline void
     583       10892 : general_cijk_to_grid(const int border_mask, const int lp, const double zetp,
     584             :                      const double dh[3][3], const double dh_inv[3][3],
     585             :                      const double rp[3], const int npts_global[3],
     586             :                      const int npts_local[3], const int shift_local[3],
     587             :                      const int border_width[3], const double radius,
     588             :                      GRID_CONST_WHEN_COLLOCATE double *cijk,
     589       10892 :                      GRID_CONST_WHEN_INTEGRATE double *grid) {
     590             : 
     591             :   // Default for border_mask == 0.
     592       10892 :   int bounds_i[2] = {0, npts_local[0] - 1};
     593       10892 :   int bounds_j[2] = {0, npts_local[1] - 1};
     594       10892 :   int bounds_k[2] = {0, npts_local[2] - 1};
     595             : 
     596             :   // See also rs_find_node() in task_list_methods.F.
     597             :   // If the bit is set then we need to exclude the border in that direction.
     598       10892 :   if (border_mask & (1 << 0))
     599           0 :     bounds_i[0] += border_width[0];
     600       10892 :   if (border_mask & (1 << 1))
     601           0 :     bounds_i[1] -= border_width[0];
     602       10892 :   if (border_mask & (1 << 2))
     603           0 :     bounds_j[0] += border_width[1];
     604       10892 :   if (border_mask & (1 << 3))
     605           0 :     bounds_j[1] -= border_width[1];
     606       10892 :   if (border_mask & (1 << 4))
     607           0 :     bounds_k[0] += border_width[2];
     608       10892 :   if (border_mask & (1 << 5))
     609           0 :     bounds_k[1] -= border_width[2];
     610             : 
     611             :   // center in grid coords
     612             :   // gp = MATMUL(dh_inv, rp)
     613       10892 :   double gp[3] = {0.0, 0.0, 0.0};
     614       43568 :   for (int i = 0; i < 3; i++) {
     615      130704 :     for (int j = 0; j < 3; j++) {
     616       98028 :       gp[i] += dh_inv[j][i] * rp[j];
     617             :     }
     618             :   }
     619             : 
     620             :   // Get the min max indices that contain at least the cube that contains a
     621             :   // sphere around rp of radius radius if the cell is very non-orthogonal this
     622             :   // implies that many useless points are included this estimate can be improved
     623             :   // (i.e. not box but sphere should be used)
     624       10892 :   int index_min[3] = {INT_MAX, INT_MAX, INT_MAX};
     625       10892 :   int index_max[3] = {INT_MIN, INT_MIN, INT_MIN};
     626       43568 :   for (int i = -1; i <= 1; i++) {
     627      130704 :     for (int j = -1; j <= 1; j++) {
     628      392112 :       for (int k = -1; k <= 1; k++) {
     629      294084 :         const double x = rp[0] + i * radius;
     630      294084 :         const double y = rp[1] + j * radius;
     631      294084 :         const double z = rp[2] + k * radius;
     632     1176336 :         for (int idir = 0; idir < 3; idir++) {
     633      882252 :           const double resc =
     634      882252 :               dh_inv[0][idir] * x + dh_inv[1][idir] * y + dh_inv[2][idir] * z;
     635      882252 :           index_min[idir] = imin(index_min[idir], (int)floor(resc));
     636      882252 :           index_max[idir] = imax(index_max[idir], (int)ceil(resc));
     637             :         }
     638             :       }
     639             :     }
     640             :   }
     641             : 
     642             :   // Precompute mappings
     643       10892 :   const int range_i = index_max[0] - index_min[0] + 1;
     644       10892 :   int map_i[range_i];
     645       10892 :   general_precompute_mapping(index_min[0], index_max[0], shift_local[0],
     646             :                              npts_global[0], bounds_i, map_i);
     647       10892 :   const int range_j = index_max[1] - index_min[1] + 1;
     648       10892 :   int map_j[range_j];
     649       10892 :   general_precompute_mapping(index_min[1], index_max[1], shift_local[1],
     650             :                              npts_global[1], bounds_j, map_j);
     651       10892 :   const int range_k = index_max[2] - index_min[2] + 1;
     652       10892 :   int map_k[range_k];
     653       10892 :   general_precompute_mapping(index_min[2], index_max[2], shift_local[2],
     654             :                              npts_global[2], bounds_k, map_k);
     655             : 
     656             :   // Precompute exponentials
     657       10892 :   double exp_ij[range_i * range_j];
     658       10892 :   general_fill_exp_table(0, 1, index_min, index_max, zetp, dh, gp, exp_ij);
     659       10892 :   double exp_jk[range_j * range_k];
     660       10892 :   general_fill_exp_table(1, 2, index_min, index_max, zetp, dh, gp, exp_jk);
     661       10892 :   double exp_ki[range_k * range_i];
     662       10892 :   general_fill_exp_table(2, 0, index_min, index_max, zetp, dh, gp, exp_ki);
     663             : 
     664             :   // go over the grid, but cycle if the point is not within the radius
     665       10892 :   const int cij_size = (lp + 1) * (lp + 1);
     666       10892 :   double cij[cij_size];
     667      186802 :   for (int k = index_min[2]; k <= index_max[2]; k++) {
     668      175910 :     const int kg = map_k[k - index_min[2]];
     669      175910 :     if (kg < bounds_k[0] || bounds_k[1] < kg) {
     670           0 :       continue; // skip over out-of-bounds indicies
     671             :     }
     672      175910 :     const double dk = k - gp[2];
     673             : 
     674             :     // zero coef_xyt
     675      175910 :     memset(cij, 0, cij_size * sizeof(double));
     676             : 
     677             : #if (GRID_DO_COLLOCATE)
     678             :     // collocate
     679      100520 :     general_cijk_to_cij(lp, dk, cijk, cij);
     680      100520 :     general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
     681             :                         map_j, dh, gp, radius, exp_ij, exp_jk, exp_ki, cij,
     682             :                         grid);
     683             : #else
     684             :     // integrate
     685       75390 :     general_cij_to_grid(lp, k, kg, npts_local, index_min, index_max, map_i,
     686             :                         map_j, dh, gp, radius, exp_ij, exp_jk, exp_ki, cij,
     687             :                         grid);
     688       75390 :     general_cijk_to_cij(lp, dk, cijk, cij);
     689             : #endif
     690             :   }
     691       10892 : }
     692             : 
     693             : /*******************************************************************************
     694             :  * \brief Transforms coefficients C_xyz into C_ijk.
     695             :  * \author Ole Schuett
     696             :  ******************************************************************************/
     697             : static inline void
     698       10892 : general_cxyz_to_cijk(const int lp, const double dh[3][3],
     699             :                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     700       10892 :                      GRID_CONST_WHEN_INTEGRATE double *cijk) {
     701             : 
     702             :   // transform P_{lxp,lyp,lzp} into a P_{lip,ljp,lkp} such that
     703             :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-x_p)**lxp (y-y_p)**lyp (z-z_p)**lzp =
     704             :   // sum_{lip,ljp,lkp} P_{lip,ljp,lkp} (i-i_p)**lip (j-j_p)**ljp (k-k_p)**lkp
     705             : 
     706             :   // transform using multinomials
     707       10892 :   double hmatgridp[lp + 1][3][3];
     708       43568 :   for (int i = 0; i < 3; i++) {
     709      130704 :     for (int j = 0; j < 3; j++) {
     710       98028 :       hmatgridp[0][j][i] = 1.0;
     711      308088 :       for (int k = 1; k <= lp; k++) {
     712      210060 :         hmatgridp[k][j][i] = hmatgridp[k - 1][j][i] * dh[j][i];
     713             :       }
     714             :     }
     715             :   }
     716             : 
     717             :   const int lpx = lp;
     718       45124 :   for (int klx = 0; klx <= lpx; klx++) {
     719      105808 :     for (int jlx = 0; jlx <= lpx - klx; jlx++) {
     720      196056 :       for (int ilx = 0; ilx <= lpx - klx - jlx; ilx++) {
     721      124480 :         const int lx = ilx + jlx + klx;
     722      124480 :         const int lpy = lp - lx;
     723      318980 :         for (int kly = 0; kly <= lpy; kly++) {
     724      477692 :           for (int jly = 0; jly <= lpy - kly; jly++) {
     725      675304 :             for (int ily = 0; ily <= lpy - kly - jly; ily++) {
     726      392112 :               const int ly = ily + jly + kly;
     727      392112 :               const int lpz = lp - lx - ly;
     728      914928 :               for (int klz = 0; klz <= lpz; klz++) {
     729     1199676 :                 for (int jlz = 0; jlz <= lpz - klz; jlz++) {
     730     1532660 :                   for (int ilz = 0; ilz <= lpz - klz - jlz; ilz++) {
     731      855800 :                     const int lz = ilz + jlz + klz;
     732      855800 :                     const int il = ilx + ily + ilz;
     733      855800 :                     const int jl = jlx + jly + jlz;
     734      855800 :                     const int kl = klx + kly + klz;
     735      855800 :                     const int lp1 = lp + 1;
     736      855800 :                     const int cijk_index =
     737      855800 :                         kl * lp1 * lp1 + jl * lp1 + il; // [kl,jl,il]
     738      855800 :                     const int cxyz_index =
     739      855800 :                         lz * lp1 * lp1 + ly * lp1 + lx; // [lz,ly,lx]
     740      855800 :                     const double p =
     741      855800 :                         hmatgridp[ilx][0][0] * hmatgridp[jlx][1][0] *
     742      855800 :                         hmatgridp[klx][2][0] * hmatgridp[ily][0][1] *
     743      855800 :                         hmatgridp[jly][1][1] * hmatgridp[kly][2][1] *
     744      855800 :                         hmatgridp[ilz][0][2] * hmatgridp[jlz][1][2] *
     745      855800 :                         hmatgridp[klz][2][2] * fac(lx) * fac(ly) * fac(lz) /
     746      855800 :                         (fac(ilx) * fac(ily) * fac(ilz) * fac(jlx) * fac(jly) *
     747      855800 :                          fac(jlz) * fac(klx) * fac(kly) * fac(klz));
     748             : #if (GRID_DO_COLLOCATE)
     749      342320 :                     cijk[cijk_index] += cxyz[cxyz_index] * p; // collocate
     750             : #else
     751      513480 :                     cxyz[cxyz_index] += cijk[cijk_index] * p; // integrate
     752             : #endif
     753             :                   }
     754             :                 }
     755             :               }
     756             :             }
     757             :           }
     758             :         }
     759             :       }
     760             :     }
     761             :   }
     762       10892 : }
     763             : 
     764             : /*******************************************************************************
     765             :  * \brief Collocates coefficients C_xyz onto the grid for general case.
     766             :  * \author Ole Schuett
     767             :  ******************************************************************************/
     768             : static inline void
     769       10892 : general_cxyz_to_grid(const int border_mask, const int lp, const double zetp,
     770             :                      const double dh[3][3], const double dh_inv[3][3],
     771             :                      const double rp[3], const int npts_global[3],
     772             :                      const int npts_local[3], const int shift_local[3],
     773             :                      const int border_width[3], const double radius,
     774             :                      GRID_CONST_WHEN_COLLOCATE double *cxyz,
     775       10892 :                      GRID_CONST_WHEN_INTEGRATE double *grid) {
     776             : 
     777       10892 :   const size_t cijk_size = (lp + 1) * (lp + 1) * (lp + 1);
     778       10892 :   double cijk[cijk_size];
     779       10892 :   memset(cijk, 0, cijk_size * sizeof(double));
     780             : 
     781             : #if (GRID_DO_COLLOCATE)
     782             :   // collocate
     783        6224 :   general_cxyz_to_cijk(lp, dh, cxyz, cijk);
     784        6224 :   general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     785             :                        npts_local, shift_local, border_width, radius, cijk,
     786             :                        grid);
     787             : #else
     788             :   // integrate
     789        4668 :   general_cijk_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     790             :                        npts_local, shift_local, border_width, radius, cijk,
     791             :                        grid);
     792        4668 :   general_cxyz_to_cijk(lp, dh, cxyz, cijk);
     793             : #endif
     794       10892 : }
     795             : 
     796             : /*******************************************************************************
     797             :  * \brief Collocates coefficients C_xyz onto the grid.
     798             :  * \author Ole Schuett
     799             :  ******************************************************************************/
     800             : static inline void
     801       14168 : cxyz_to_grid(const bool orthorhombic, const int border_mask, const int lp,
     802             :              const double zetp, const double dh[3][3],
     803             :              const double dh_inv[3][3], const double rp[3],
     804             :              const int npts_global[3], const int npts_local[3],
     805             :              const int shift_local[3], const int border_width[3],
     806             :              const double radius, GRID_CONST_WHEN_COLLOCATE double *cxyz,
     807             :              GRID_CONST_WHEN_INTEGRATE double *grid) {
     808             : 
     809       14168 :   enum grid_library_kernel k;
     810       14168 :   if (orthorhombic && border_mask == 0) {
     811        3276 :     k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_ORTHO : GRID_INTEGRATE_ORTHO;
     812        3276 :     ortho_cxyz_to_grid(lp, zetp, dh, dh_inv, rp, npts_global, npts_local,
     813             :                        shift_local, radius, cxyz, grid);
     814             :   } else {
     815       10892 :     k = (GRID_DO_COLLOCATE) ? GRID_COLLOCATE_GENERAL : GRID_INTEGRATE_GENERAL;
     816       10892 :     general_cxyz_to_grid(border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     817             :                          npts_local, shift_local, border_width, radius, cxyz,
     818             :                          grid);
     819             :   }
     820       14168 :   grid_library_counter_add(lp, GRID_BACKEND_REF, k, 1);
     821       14168 : }
     822             : 
     823             : /*******************************************************************************
     824             :  * \brief Transforms coefficients C_ab into C_xyz.
     825             :  * \author Ole Schuett
     826             :  ******************************************************************************/
     827       14168 : static inline void cab_to_cxyz(const int la_max, const int la_min,
     828             :                                const int lb_max, const int lb_min,
     829             :                                const double prefactor, const double ra[3],
     830             :                                const double rb[3], const double rp[3],
     831             :                                GRID_CONST_WHEN_COLLOCATE double *cab,
     832       14168 :                                GRID_CONST_WHEN_INTEGRATE double *cxyz) {
     833             : 
     834             :   // Computes the polynomial expansion coefficients:
     835             :   //     (x-a)**lxa (x-b)**lxb -> sum_{ls} alpha(ls,lxa,lxb,1)*(x-p)**ls
     836       14168 :   const int lp = la_max + lb_max;
     837       14168 :   double alpha[3][lb_max + 1][la_max + 1][lp + 1];
     838       14168 :   memset(alpha, 0, 3 * (lb_max + 1) * (la_max + 1) * (lp + 1) * sizeof(double));
     839             : 
     840       56672 :   for (int i = 0; i < 3; i++) {
     841       42504 :     const double drpa = rp[i] - ra[i];
     842       42504 :     const double drpb = rp[i] - rb[i];
     843      133914 :     for (int lxa = 0; lxa <= la_max; lxa++) {
     844      276780 :       for (int lxb = 0; lxb <= lb_max; lxb++) {
     845             :         double binomial_k_lxa = 1.0;
     846             :         double a = 1.0;
     847      483030 :         for (int k = 0; k <= lxa; k++) {
     848             :           double binomial_l_lxb = 1.0;
     849             :           double b = 1.0;
     850      750288 :           for (int l = 0; l <= lxb; l++) {
     851      452628 :             alpha[i][lxb][lxa][lxa - l + lxb - k] +=
     852      452628 :                 binomial_k_lxa * binomial_l_lxb * a * b;
     853      452628 :             binomial_l_lxb *= ((double)(lxb - l)) / ((double)(l + 1));
     854      452628 :             b *= drpb;
     855             :           }
     856      297660 :           binomial_k_lxa *= ((double)(lxa - k)) / ((double)(k + 1));
     857      297660 :           a *= drpa;
     858             :         }
     859             :       }
     860             :     }
     861             :   }
     862             : 
     863             :   //   *** initialise the coefficient matrix, we transform the sum
     864             :   //
     865             :   // sum_{lxa,lya,lza,lxb,lyb,lzb} P_{lxa,lya,lza,lxb,lyb,lzb} *
     866             :   //         (x-a_x)**lxa (y-a_y)**lya (z-a_z)**lza (x-b_x)**lxb (y-a_y)**lya
     867             :   //         (z-a_z)**lza
     868             :   //
     869             :   // into
     870             :   //
     871             :   // sum_{lxp,lyp,lzp} P_{lxp,lyp,lzp} (x-p_x)**lxp (y-p_y)**lyp (z-p_z)**lzp
     872             :   //
     873             :   // where p is center of the product gaussian, and lp = la_max + lb_max
     874             :   // (current implementation is l**7)
     875             :   //
     876             : 
     877       42900 :   for (int lzb = 0; lzb <= lb_max; lzb++) {
     878       90522 :     for (int lza = 0; lza <= la_max; lza++) {
     879      155750 :       for (int lyb = 0; lyb <= lb_max - lzb; lyb++) {
     880      244836 :         for (int lya = 0; lya <= la_max - lza; lya++) {
     881      150876 :           const int lxb_min = imax(lb_min - lzb - lyb, 0);
     882      150876 :           const int lxa_min = imax(la_min - lza - lya, 0);
     883      349736 :           for (int lxb = lxb_min; lxb <= lb_max - lzb - lyb; lxb++) {
     884      475362 :             for (int lxa = lxa_min; lxa <= la_max - lza - lya; lxa++) {
     885      276502 :               const int ico = coset(lxa, lya, lza);
     886      276502 :               const int jco = coset(lxb, lyb, lzb);
     887      276502 :               const int cab_index = jco * ncoset(la_max) + ico; // [jco, ico]
     888      718144 :               for (int lzp = 0; lzp <= lza + lzb; lzp++) {
     889     1537350 :                 for (int lyp = 0; lyp <= lp - lza - lzb; lyp++) {
     890     3148618 :                   for (int lxp = 0; lxp <= lp - lza - lzb - lyp; lxp++) {
     891     2052910 :                     const double p = alpha[0][lxb][lxa][lxp] *
     892     2052910 :                                      alpha[1][lyb][lya][lyp] *
     893     2052910 :                                      alpha[2][lzb][lza][lzp] * prefactor;
     894     2052910 :                     const int lp1 = lp + 1;
     895     2052910 :                     const int cxyz_index =
     896     2052910 :                         lzp * lp1 * lp1 + lyp * lp1 + lxp; // [lzp, lyp, lxp]
     897             : #if (GRID_DO_COLLOCATE)
     898      795876 :                     cxyz[cxyz_index] += cab[cab_index] * p; // collocate
     899             : #else
     900     1257034 :                     cab[cab_index] += cxyz[cxyz_index] * p; // integrate
     901             : #endif
     902             :                   }
     903             :                 }
     904             :               }
     905             :             }
     906             :           }
     907             :         }
     908             :       }
     909             :     }
     910             :   }
     911       14168 : }
     912             : 
     913             : /*******************************************************************************
     914             :  * \brief Collocates coefficients C_ab onto the grid.
     915             :  * \author Ole Schuett
     916             :  ******************************************************************************/
     917             : static inline void
     918       14168 : cab_to_grid(const bool orthorhombic, const int border_mask, const int la_max,
     919             :             const int la_min, const int lb_max, const int lb_min,
     920             :             const double zeta, const double zetb, const double rscale,
     921             :             const double dh[3][3], const double dh_inv[3][3],
     922             :             const double ra[3], const double rab[3], const int npts_global[3],
     923             :             const int npts_local[3], const int shift_local[3],
     924             :             const int border_width[3], const double radius,
     925             :             GRID_CONST_WHEN_COLLOCATE double *cab,
     926       14168 :             GRID_CONST_WHEN_INTEGRATE double *grid) {
     927             : 
     928             :   // Check if radius is too small to be mapped onto grid of given resolution.
     929       14168 :   double dh_max = 0.0;
     930       56672 :   for (int i = 0; i < 3; i++) {
     931      170016 :     for (int j = 0; j < 3; j++) {
     932      127512 :       dh_max = fmax(dh_max, fabs(dh[i][j]));
     933             :     }
     934             :   }
     935       14168 :   if (2.0 * radius < dh_max) {
     936           0 :     return;
     937             :   }
     938             : 
     939       14168 :   const double zetp = zeta + zetb;
     940       14168 :   const double f = zetb / zetp;
     941       14168 :   const double rab2 = rab[0] * rab[0] + rab[1] * rab[1] + rab[2] * rab[2];
     942       14168 :   const double prefactor = rscale * exp(-zeta * f * rab2);
     943       14168 :   double rp[3], rb[3];
     944       56672 :   for (int i = 0; i < 3; i++) {
     945       42504 :     rp[i] = ra[i] + f * rab[i];
     946       42504 :     rb[i] = ra[i] + rab[i];
     947             :   }
     948             : 
     949       14168 :   const int lp = la_max + lb_max;
     950       14168 :   const size_t cxyz_size = (lp + 1) * (lp + 1) * (lp + 1);
     951       14168 :   double cxyz[cxyz_size];
     952       14168 :   memset(cxyz, 0, cxyz_size * sizeof(double));
     953             : 
     954             : #if (GRID_DO_COLLOCATE)
     955             :   // collocate
     956        7862 :   cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
     957        7862 :   cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     958             :                npts_local, shift_local, border_width, radius, cxyz, grid);
     959             : #else
     960             :   // integrate
     961        6306 :   cxyz_to_grid(orthorhombic, border_mask, lp, zetp, dh, dh_inv, rp, npts_global,
     962             :                npts_local, shift_local, border_width, radius, cxyz, grid);
     963        6306 :   cab_to_cxyz(la_max, la_min, lb_max, lb_min, prefactor, ra, rb, rp, cab, cxyz);
     964             : #endif
     965             : }
     966             : 
     967             : // EOF

Generated by: LCOV version 1.15