LCOV - code coverage report
Current view: top level - src/grid/cpu - grid_cpu_task_list.c (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:4dc10b3) Lines: 195 195 100.0 %
Date: 2024-11-21 06:45:46 Functions: 10 10 100.0 %

          Line data    Source code
       1             : /*----------------------------------------------------------------------------*/
       2             : /*  CP2K: A general program to perform molecular dynamics simulations         */
       3             : /*  Copyright 2000-2024 CP2K developers group <https://cp2k.org>              */
       4             : /*                                                                            */
       5             : /*  SPDX-License-Identifier: BSD-3-Clause                                     */
       6             : /*----------------------------------------------------------------------------*/
       7             : 
       8             : #include <assert.h>
       9             : #include <math.h>
      10             : #include <omp.h>
      11             : #include <stdint.h>
      12             : #include <stdio.h>
      13             : #include <stdlib.h>
      14             : #include <string.h>
      15             : 
      16             : #include "../common/grid_common.h"
      17             : #include "grid_cpu_collocate.h"
      18             : #include "grid_cpu_integrate.h"
      19             : #include "grid_cpu_task_list.h"
      20             : 
      21             : /*******************************************************************************
      22             :  * \brief Comperator passed to qsort to compare two tasks.
      23             :  * \author Ole Schuett
      24             :  ******************************************************************************/
      25    47794798 : static int compare_tasks(const void *a, const void *b) {
      26    47794798 :   const grid_cpu_task *task_a = a, *task_b = b;
      27    47794798 :   if (task_a->level != task_b->level) {
      28     3608493 :     return task_a->level - task_b->level;
      29    44186305 :   } else if (task_a->block_num != task_b->block_num) {
      30    20497004 :     return task_a->block_num - task_b->block_num;
      31    23689301 :   } else if (task_a->iset != task_b->iset) {
      32     2811903 :     return task_a->iset - task_b->iset;
      33             :   } else {
      34    20877398 :     return task_a->jset - task_b->jset;
      35             :   }
      36             : }
      37             : /*******************************************************************************
      38             :  * \brief Allocates a task list for the cpu backend.
      39             :  *        See grid_task_list.h for details.
      40             :  * \author Ole Schuett
      41             :  ******************************************************************************/
      42       13438 : void grid_cpu_create_task_list(
      43             :     const bool orthorhombic, const int ntasks, const int nlevels,
      44             :     const int natoms, const int nkinds, const int nblocks,
      45             :     const int block_offsets[nblocks], const double atom_positions[natoms][3],
      46             :     const int atom_kinds[natoms], const grid_basis_set *basis_sets[nkinds],
      47             :     const int level_list[ntasks], const int iatom_list[ntasks],
      48             :     const int jatom_list[ntasks], const int iset_list[ntasks],
      49             :     const int jset_list[ntasks], const int ipgf_list[ntasks],
      50             :     const int jpgf_list[ntasks], const int border_mask_list[ntasks],
      51             :     const int block_num_list[ntasks], const double radius_list[ntasks],
      52             :     const double rab_list[ntasks][3], const int npts_global[nlevels][3],
      53             :     const int npts_local[nlevels][3], const int shift_local[nlevels][3],
      54             :     const int border_width[nlevels][3], const double dh[nlevels][3][3],
      55             :     const double dh_inv[nlevels][3][3], grid_cpu_task_list **task_list_out) {
      56             : 
      57       13438 :   if (*task_list_out != NULL) {
      58             :     // This is actually an opportunity to reuse some buffers.
      59        5382 :     grid_cpu_free_task_list(*task_list_out);
      60             :   }
      61             : 
      62       13438 :   grid_cpu_task_list *task_list = malloc(sizeof(grid_cpu_task_list));
      63       13438 :   assert(task_list != NULL);
      64             : 
      65       13438 :   task_list->orthorhombic = orthorhombic;
      66       13438 :   task_list->ntasks = ntasks;
      67       13438 :   task_list->nlevels = nlevels;
      68       13438 :   task_list->natoms = natoms;
      69       13438 :   task_list->nkinds = nkinds;
      70       13438 :   task_list->nblocks = nblocks;
      71             : 
      72       13438 :   size_t size = nblocks * sizeof(int);
      73       13438 :   task_list->block_offsets = malloc(size);
      74       13438 :   assert(task_list->block_offsets != NULL);
      75       13438 :   memcpy(task_list->block_offsets, block_offsets, size);
      76             : 
      77       13438 :   size = 3 * natoms * sizeof(double);
      78       13438 :   task_list->atom_positions = malloc(size);
      79       13438 :   assert(task_list->atom_positions != NULL);
      80       13438 :   memcpy(task_list->atom_positions, atom_positions, size);
      81             : 
      82       13438 :   size = natoms * sizeof(int);
      83       13438 :   task_list->atom_kinds = malloc(size);
      84       13438 :   assert(task_list->atom_kinds != NULL);
      85       13438 :   memcpy(task_list->atom_kinds, atom_kinds, size);
      86             : 
      87       13438 :   size = nkinds * sizeof(grid_basis_set *);
      88       13438 :   task_list->basis_sets = malloc(size);
      89       13438 :   assert(task_list->basis_sets != NULL);
      90       13438 :   memcpy(task_list->basis_sets, basis_sets, size);
      91             : 
      92       13438 :   size = ntasks * sizeof(grid_cpu_task);
      93       13438 :   task_list->tasks = malloc(size);
      94       13438 :   assert(task_list->tasks != NULL);
      95     7668086 :   for (int i = 0; i < ntasks; i++) {
      96     7654648 :     task_list->tasks[i].level = level_list[i];
      97     7654648 :     task_list->tasks[i].iatom = iatom_list[i];
      98     7654648 :     task_list->tasks[i].jatom = jatom_list[i];
      99     7654648 :     task_list->tasks[i].iset = iset_list[i];
     100     7654648 :     task_list->tasks[i].jset = jset_list[i];
     101     7654648 :     task_list->tasks[i].ipgf = ipgf_list[i];
     102     7654648 :     task_list->tasks[i].jpgf = jpgf_list[i];
     103     7654648 :     task_list->tasks[i].border_mask = border_mask_list[i];
     104     7654648 :     task_list->tasks[i].block_num = block_num_list[i];
     105     7654648 :     task_list->tasks[i].radius = radius_list[i];
     106     7654648 :     task_list->tasks[i].rab[0] = rab_list[i][0];
     107     7654648 :     task_list->tasks[i].rab[1] = rab_list[i][1];
     108     7654648 :     task_list->tasks[i].rab[2] = rab_list[i][2];
     109             :   }
     110             : 
     111             :   // Store grid layouts.
     112       13438 :   size = nlevels * sizeof(grid_cpu_layout);
     113       13438 :   task_list->layouts = malloc(size);
     114       13438 :   assert(task_list->layouts != NULL);
     115       66582 :   for (int level = 0; level < nlevels; level++) {
     116      212576 :     for (int i = 0; i < 3; i++) {
     117      159432 :       task_list->layouts[level].npts_global[i] = npts_global[level][i];
     118      159432 :       task_list->layouts[level].npts_local[i] = npts_local[level][i];
     119      159432 :       task_list->layouts[level].shift_local[i] = shift_local[level][i];
     120      159432 :       task_list->layouts[level].border_width[i] = border_width[level][i];
     121      637728 :       for (int j = 0; j < 3; j++) {
     122      478296 :         task_list->layouts[level].dh[i][j] = dh[level][i][j];
     123      478296 :         task_list->layouts[level].dh_inv[i][j] = dh_inv[level][i][j];
     124             :       }
     125             :     }
     126             :   }
     127             : 
     128             :   // Sort tasks by level, block_num, iset, and jset.
     129       13438 :   qsort(task_list->tasks, ntasks, sizeof(grid_cpu_task), &compare_tasks);
     130             : 
     131             :   // Find first and last task for each level and block.
     132       13438 :   size = nlevels * nblocks * sizeof(int);
     133       13438 :   task_list->first_level_block_task = malloc(size);
     134       13438 :   assert(task_list->first_level_block_task != NULL);
     135       13438 :   task_list->last_level_block_task = malloc(size);
     136       13438 :   assert(task_list->last_level_block_task != NULL);
     137     1044151 :   for (int i = 0; i < nlevels * nblocks; i++) {
     138     1030713 :     task_list->first_level_block_task[i] = 0;
     139     1030713 :     task_list->last_level_block_task[i] = -1; // last < first means no tasks
     140             :   }
     141     7668086 :   for (int itask = 0; itask < ntasks; itask++) {
     142     7654648 :     const int level = task_list->tasks[itask].level - 1;
     143     7654648 :     const int block_num = task_list->tasks[itask].block_num - 1;
     144     7654648 :     if (itask == 0 || task_list->tasks[itask - 1].level - 1 != level ||
     145     7618298 :         task_list->tasks[itask - 1].block_num - 1 != block_num) {
     146      522700 :       task_list->first_level_block_task[level * nblocks + block_num] = itask;
     147             :     }
     148     7654648 :     task_list->last_level_block_task[level * nblocks + block_num] = itask;
     149             :   }
     150             : 
     151             :   // Find largest Cartesian subblock size.
     152       13438 :   task_list->maxco = 0;
     153       37559 :   for (int i = 0; i < nkinds; i++) {
     154       24121 :     task_list->maxco = imax(task_list->maxco, task_list->basis_sets[i]->maxco);
     155             :   }
     156             : 
     157             :   // Initialize thread-local storage.
     158       13438 :   size = omp_get_max_threads() * sizeof(double *);
     159       13438 :   task_list->threadlocals = malloc(size);
     160       13438 :   assert(task_list->threadlocals != NULL);
     161       13438 :   memset(task_list->threadlocals, 0, size);
     162       13438 :   size = omp_get_max_threads() * sizeof(size_t);
     163       13438 :   task_list->threadlocal_sizes = malloc(size);
     164       13438 :   assert(task_list->threadlocal_sizes != NULL);
     165       13438 :   memset(task_list->threadlocal_sizes, 0, size);
     166             : 
     167       13438 :   *task_list_out = task_list;
     168       13438 : }
     169             : 
     170             : /*******************************************************************************
     171             :  * \brief Deallocates given task list, basis_sets have to be freed separately.
     172             :  * \author Ole Schuett
     173             :  ******************************************************************************/
     174       13438 : void grid_cpu_free_task_list(grid_cpu_task_list *task_list) {
     175       13438 :   free(task_list->block_offsets);
     176       13438 :   free(task_list->atom_positions);
     177       13438 :   free(task_list->atom_kinds);
     178       13438 :   free(task_list->basis_sets);
     179       13438 :   free(task_list->tasks);
     180       13438 :   free(task_list->layouts);
     181       13438 :   free(task_list->first_level_block_task);
     182       13438 :   free(task_list->last_level_block_task);
     183       26876 :   for (int i = 0; i < omp_get_max_threads(); i++) {
     184       13438 :     if (task_list->threadlocals[i] != NULL) {
     185       12024 :       free(task_list->threadlocals[i]);
     186             :     }
     187             :   }
     188       13438 :   free(task_list->threadlocals);
     189       13438 :   free(task_list->threadlocal_sizes);
     190       13438 :   free(task_list);
     191       13438 : }
     192             : 
     193             : /*******************************************************************************
     194             :  * \brief Prototype for BLAS dgemm.
     195             :  * \author Ole Schuett
     196             :  ******************************************************************************/
     197             : void dgemm_(const char *transa, const char *transb, const int *m, const int *n,
     198             :             const int *k, const double *alpha, const double *a, const int *lda,
     199             :             const double *b, const int *ldb, const double *beta, double *c,
     200             :             const int *ldc);
     201             : 
     202             : /*******************************************************************************
     203             :  * \brief Convenient wrapper to hide Fortran nature of dgemm_, swapping a and b.
     204             :  * \author Ole Schuett
     205             :  ******************************************************************************/
     206    37277846 : static void dgemm(const char transa, const char transb, const int m,
     207             :                   const int n, const int k, const double alpha, const double *a,
     208             :                   const int lda, const double *b, const int ldb,
     209             :                   const double beta, double *c, const int ldc) {
     210    37277846 :   dgemm_(&transb, &transa, &n, &m, &k, &alpha, b, &ldb, a, &lda, &beta, c,
     211             :          &ldc);
     212    37277846 : }
     213             : 
     214             : /*******************************************************************************
     215             :  * \brief Transforms pab from contracted spherical to prim. cartesian basis.
     216             :  * \author Ole Schuett
     217             :  ******************************************************************************/
     218     9888920 : static void load_pab(const grid_basis_set *ibasis, const grid_basis_set *jbasis,
     219             :                      const int iset, const int jset, const bool transpose,
     220     9888920 :                      const double *block, double *pab) {
     221             : 
     222             :   // Define some more convenient aliases.
     223     9888920 :   const int ncoseta = ncoset(ibasis->lmax[iset]);
     224     9888920 :   const int ncosetb = ncoset(jbasis->lmax[jset]);
     225     9888920 :   const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
     226     9888920 :   const int ncob = jbasis->npgf[jset] * ncosetb;
     227             : 
     228     9888920 :   const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
     229     9888920 :   const int nsgf_setb = jbasis->nsgf_set[jset];
     230     9888920 :   const int nsgfa = ibasis->nsgf; // size of entire spherical basis
     231     9888920 :   const int nsgfb = jbasis->nsgf;
     232     9888920 :   const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
     233     9888920 :   const int sgfb = jbasis->first_sgf[jset] - 1;
     234     9888920 :   const int maxcoa = ibasis->maxco;
     235     9888920 :   const int maxcob = jbasis->maxco;
     236             : 
     237     9888920 :   double work[nsgf_setb * ncoa];
     238     9888920 :   if (transpose) {
     239             :     // work[nsgf_setb][ncoa] = MATMUL(subblock, ibasis->sphi)
     240     6025576 :     dgemm('N', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
     241     6025576 :           &block[sgfb * nsgfa + sgfa], nsgfa, &ibasis->sphi[sgfa * maxcoa],
     242             :           maxcoa, 0.0, work, ncoa);
     243             :   } else {
     244             :     // work[nsgf_setb][ncoa] = MATMUL(TRANSPOSE(subblock), ibasis->sphi)
     245     3863344 :     dgemm('T', 'N', nsgf_setb, ncoa, nsgf_seta, 1.0,
     246     3863344 :           &block[sgfa * nsgfb + sgfb], nsgfb, &ibasis->sphi[sgfa * maxcoa],
     247             :           maxcoa, 0.0, work, ncoa);
     248             :   }
     249             :   // pab[ncob][ncoa] = MATMUL(TRANSPOSE(jbasis->sphi), work)
     250     9888920 :   dgemm('T', 'N', ncob, ncoa, nsgf_setb, 1.0, &jbasis->sphi[sgfb * maxcob],
     251             :         maxcob, work, ncoa, 0.0, pab, ncoa);
     252     9888920 : }
     253             : 
     254             : /*******************************************************************************
     255             :  * \brief Collocate a range of tasks which are destined for the same grid level.
     256             :  * \author Ole Schuett
     257             :  ******************************************************************************/
     258      766712 : static void collocate_one_grid_level(
     259             :     const grid_cpu_task_list *task_list, const int *first_block_task,
     260             :     const int *last_block_task, const enum grid_func func,
     261             :     const int npts_global[3], const int npts_local[3], const int shift_local[3],
     262             :     const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
     263             :     const double *pab_blocks, offload_buffer *grid) {
     264             : 
     265             : // Using default(shared) because with GCC 9 the behavior around const changed:
     266             : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
     267      766712 : #pragma omp parallel default(shared)
     268             :   {
     269             :     const int ithread = omp_get_thread_num();
     270             :     const int nthreads = omp_get_num_threads();
     271             : 
     272             :     // Initialize variables to detect when a new subblock has to be fetched.
     273             :     int old_offset = -1, old_iset = -1, old_jset = -1;
     274             : 
     275             :     // Matrix pab is re-used across tasks.
     276             :     double pab[task_list->maxco * task_list->maxco];
     277             : 
     278             :     // Ensure that grid can fit into thread-local storage, reallocate if needed.
     279             :     const int npts_local_total = npts_local[0] * npts_local[1] * npts_local[2];
     280             :     const size_t grid_size = npts_local_total * sizeof(double);
     281             :     if (task_list->threadlocal_sizes[ithread] < grid_size) {
     282             :       if (task_list->threadlocals[ithread] != NULL) {
     283             :         free(task_list->threadlocals[ithread]);
     284             :       }
     285             :       task_list->threadlocals[ithread] = malloc(grid_size);
     286             :       assert(task_list->threadlocals[ithread] != NULL);
     287             :       task_list->threadlocal_sizes[ithread] = grid_size;
     288             :     }
     289             : 
     290             :     // Zero thread-local copy of the grid.
     291             :     double *const my_grid = task_list->threadlocals[ithread];
     292             :     memset(my_grid, 0, grid_size);
     293             : 
     294             :     // Parallelize over blocks to avoid unnecessary calls to load_pab.
     295             :     const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
     296             : #pragma omp for schedule(dynamic, chunk_size)
     297             :     for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
     298             :       const int first_task = first_block_task[block_num];
     299             :       const int last_task = last_block_task[block_num];
     300             : 
     301             :       for (int itask = first_task; itask <= last_task; itask++) {
     302             :         // Define some convenient aliases.
     303             :         const grid_cpu_task *task = &task_list->tasks[itask];
     304             :         const int iatom = task->iatom - 1;
     305             :         const int jatom = task->jatom - 1;
     306             :         const int iset = task->iset - 1;
     307             :         const int jset = task->jset - 1;
     308             :         const int ipgf = task->ipgf - 1;
     309             :         const int jpgf = task->jpgf - 1;
     310             :         const int ikind = task_list->atom_kinds[iatom] - 1;
     311             :         const int jkind = task_list->atom_kinds[jatom] - 1;
     312             :         const grid_basis_set *ibasis = task_list->basis_sets[ikind];
     313             :         const grid_basis_set *jbasis = task_list->basis_sets[jkind];
     314             :         const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
     315             :         const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
     316             :         const int ncoseta = ncoset(ibasis->lmax[iset]);
     317             :         const int ncosetb = ncoset(jbasis->lmax[jset]);
     318             :         const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
     319             :         const int ncob = jbasis->npgf[jset] * ncosetb;
     320             :         const int block_num = task->block_num - 1;
     321             :         const int block_offset = task_list->block_offsets[block_num];
     322             :         const double *block = &pab_blocks[block_offset];
     323             :         const bool transpose = (iatom <= jatom);
     324             : 
     325             :         // Load subblock from buffer and decontract into Cartesian sublock pab.
     326             :         // The previous pab can be reused when only ipgf or jpgf has changed.
     327             :         if (block_offset != old_offset || iset != old_iset ||
     328             :             jset != old_jset) {
     329             :           old_offset = block_offset;
     330             :           old_iset = iset;
     331             :           old_jset = jset;
     332             :           load_pab(ibasis, jbasis, iset, jset, transpose, block, pab);
     333             :         }
     334             : 
     335             :         grid_cpu_collocate_pgf_product(
     336             :             /*orthorhombic=*/task_list->orthorhombic,
     337             :             /*border_mask=*/task->border_mask,
     338             :             /*func=*/func,
     339             :             /*la_max=*/ibasis->lmax[iset],
     340             :             /*la_min=*/ibasis->lmin[iset],
     341             :             /*lb_max=*/jbasis->lmax[jset],
     342             :             /*lb_min=*/jbasis->lmin[jset],
     343             :             /*zeta=*/zeta,
     344             :             /*zetb=*/zetb,
     345             :             /*rscale=*/(iatom == jatom) ? 1 : 2,
     346             :             /*dh=*/dh,
     347             :             /*dh_inv=*/dh_inv,
     348             :             /*ra=*/&task_list->atom_positions[3 * iatom],
     349             :             /*rab=*/task->rab,
     350             :             /*npts_global=*/npts_global,
     351             :             /*npts_local=*/npts_local,
     352             :             /*shift_local=*/shift_local,
     353             :             /*border_width=*/border_width,
     354             :             /*radius=*/task->radius,
     355             :             /*o1=*/ipgf * ncoseta,
     356             :             /*o2=*/jpgf * ncosetb,
     357             :             /*n1=*/ncoa,
     358             :             /*n2=*/ncob,
     359             :             /*pab=*/(const double(*)[ncoa])pab,
     360             :             /*grid=*/my_grid);
     361             : 
     362             :       } // end of task loop
     363             :     } // end of block loop
     364             : 
     365             : // While there should be an implicit barrier at the end of the block loop, this
     366             : // explicit barrier eliminates occasional seg faults with icc compiled binaries.
     367             : #pragma omp barrier
     368             : 
     369             :     // Merge thread-local grids via an efficient tree reduction.
     370             :     const int nreduction_cycles = ceil(log(nthreads) / log(2)); // tree depth
     371             :     for (int icycle = 1; icycle <= nreduction_cycles; icycle++) {
     372             :       // Threads are divided into groups, whose size doubles with each cycle.
     373             :       // After a cycle the reduced data is stored at first thread of each group.
     374             :       const int group_size = 1 << icycle; // 2**icycle
     375             :       const int igroup = ithread / group_size;
     376             :       const int dest_thread = igroup * group_size;
     377             :       const int src_thread = dest_thread + group_size / 2;
     378             :       // The last group might actually be a bit smaller.
     379             :       const int actual_group_size = imin(group_size, nthreads - dest_thread);
     380             :       // Parallelize summation by dividing grid points across group members.
     381             :       const int rank = modulo(ithread, group_size); // position within the group
     382             :       const int lb = (npts_local_total * rank) / actual_group_size;
     383             :       const int ub = (npts_local_total * (rank + 1)) / actual_group_size;
     384             :       if (src_thread < nthreads) {
     385             :         for (int i = lb; i < ub; i++) {
     386             :           task_list->threadlocals[dest_thread][i] +=
     387             :               task_list->threadlocals[src_thread][i];
     388             :         }
     389             :       }
     390             : #pragma omp barrier
     391             :     }
     392             : 
     393             :     // Copy final result from first thread into shared grid.
     394             :     const int lb = (npts_local_total * ithread) / nthreads;
     395             :     const int ub = (npts_local_total * (ithread + 1)) / nthreads;
     396             :     for (int i = lb; i < ub; i++) {
     397             :       grid->host_buffer[i] = task_list->threadlocals[0][i];
     398             :     }
     399             : 
     400             :   } // end of omp parallel region
     401      766712 : }
     402             : 
     403             : /*******************************************************************************
     404             :  * \brief Collocate all tasks of in given list onto given grids.
     405             :  *        See grid_task_list.h for details.
     406             :  * \author Ole Schuett
     407             :  ******************************************************************************/
     408      193699 : void grid_cpu_collocate_task_list(const grid_cpu_task_list *task_list,
     409             :                                   const enum grid_func func, const int nlevels,
     410             :                                   const offload_buffer *pab_blocks,
     411             :                                   offload_buffer *grids[nlevels]) {
     412             : 
     413      193699 :   assert(task_list->nlevels == nlevels);
     414             : 
     415      960411 :   for (int level = 0; level < task_list->nlevels; level++) {
     416      766712 :     const int idx = level * task_list->nblocks;
     417      766712 :     const int *first_block_task = &task_list->first_level_block_task[idx];
     418      766712 :     const int *last_block_task = &task_list->last_level_block_task[idx];
     419      766712 :     const grid_cpu_layout *layout = &task_list->layouts[level];
     420      766712 :     collocate_one_grid_level(
     421      766712 :         task_list, first_block_task, last_block_task, func, layout->npts_global,
     422      766712 :         layout->npts_local, layout->shift_local, layout->border_width,
     423      766712 :         layout->dh, layout->dh_inv, pab_blocks->host_buffer, grids[level]);
     424             :   }
     425      193699 : }
     426             : 
     427             : /*******************************************************************************
     428             :  * \brief Transforms hab from prim. cartesian to contracted spherical basis.
     429             :  * \author Ole Schuett
     430             :  ******************************************************************************/
     431     8750003 : static inline void store_hab(const grid_basis_set *ibasis,
     432             :                              const grid_basis_set *jbasis, const int iset,
     433             :                              const int jset, const bool transpose,
     434     8750003 :                              const double *hab, double *block) {
     435             : 
     436             :   // Define some more convenient aliases.
     437     8750003 :   const int ncoseta = ncoset(ibasis->lmax[iset]);
     438     8750003 :   const int ncosetb = ncoset(jbasis->lmax[jset]);
     439     8750003 :   const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
     440     8750003 :   const int ncob = jbasis->npgf[jset] * ncosetb;
     441             : 
     442     8750003 :   const int nsgf_seta = ibasis->nsgf_set[iset]; // size of spherical set
     443     8750003 :   const int nsgf_setb = jbasis->nsgf_set[jset];
     444     8750003 :   const int nsgfa = ibasis->nsgf; // size of entire spherical basis
     445     8750003 :   const int nsgfb = jbasis->nsgf;
     446     8750003 :   const int sgfa = ibasis->first_sgf[iset] - 1; // start of spherical set
     447     8750003 :   const int sgfb = jbasis->first_sgf[jset] - 1;
     448     8750003 :   const int maxcoa = ibasis->maxco;
     449     8750003 :   const int maxcob = jbasis->maxco;
     450             : 
     451     8750003 :   double work[nsgf_setb * ncoa];
     452             : 
     453             :   // work[nsgf_setb][ncoa] = MATMUL(jbasis->sphi, hab)
     454     8750003 :   dgemm('N', 'N', nsgf_setb, ncoa, ncob, 1.0, &jbasis->sphi[sgfb * maxcob],
     455             :         maxcob, hab, ncoa, 0.0, work, ncoa);
     456             : 
     457     8750003 :   if (transpose) {
     458             :     // subblock[nsgf_setb][nsgf_seta] += MATMUL(work, TRANSPOSE(ibasis->sphi))
     459     5325192 :     dgemm('N', 'T', nsgf_setb, nsgf_seta, ncoa, 1.0, work, ncoa,
     460     5325192 :           &ibasis->sphi[sgfa * maxcoa], maxcoa, 1.0,
     461     5325192 :           &block[sgfb * nsgfa + sgfa], nsgfa);
     462             :   } else {
     463             :     // subblock[nsgf_seta][nsgf_setb] += MATMUL(ibasis->sphi, TRANSPOSE(work))
     464     3424811 :     dgemm('N', 'T', nsgf_seta, nsgf_setb, ncoa, 1.0,
     465     3424811 :           &ibasis->sphi[sgfa * maxcoa], maxcoa, work, ncoa, 1.0,
     466     3424811 :           &block[sgfa * nsgfb + sgfb], nsgfb);
     467             :   }
     468     8750003 : }
     469             : 
     470             : /*******************************************************************************
     471             :  * \brief Integrate a range of tasks that belong to the same grid level.
     472             :  * \author Ole Schuett
     473             :  ******************************************************************************/
     474      719210 : static void integrate_one_grid_level(
     475             :     const grid_cpu_task_list *task_list, const int *first_block_task,
     476             :     const int *last_block_task, const bool compute_tau, const int natoms,
     477             :     const int npts_global[3], const int npts_local[3], const int shift_local[3],
     478             :     const int border_width[3], const double dh[3][3], const double dh_inv[3][3],
     479             :     const offload_buffer *pab_blocks, const offload_buffer *grid,
     480             :     offload_buffer *hab_blocks, double forces[natoms][3], double virial[3][3]) {
     481             : 
     482             : // Using default(shared) because with GCC 9 the behavior around const changed:
     483             : // https://www.gnu.org/software/gcc/gcc-9/porting_to.html
     484      719210 : #pragma omp parallel default(shared)
     485             :   {
     486             :     // Initialize variables to detect when a new subblock has to be fetched.
     487             :     int old_offset = -1, old_iset = -1, old_jset = -1;
     488             :     grid_basis_set *old_ibasis = NULL, *old_jbasis = NULL;
     489             :     bool old_transpose = false;
     490             : 
     491             :     // Matrix pab and hab are re-used across tasks.
     492             :     double pab[task_list->maxco * task_list->maxco];
     493             :     double hab[task_list->maxco * task_list->maxco];
     494             : 
     495             :     // Parallelize over blocks to avoid concurred access to hab_blocks.
     496             :     const int nthreads = omp_get_num_threads();
     497             :     const int chunk_size = imax(1, task_list->nblocks / (nthreads * 50));
     498             : #pragma omp for schedule(dynamic, chunk_size)
     499             :     for (int block_num = 0; block_num < task_list->nblocks; block_num++) {
     500             :       const int first_task = first_block_task[block_num];
     501             :       const int last_task = last_block_task[block_num];
     502             : 
     503             :       // Accumulate forces per block as it corresponds to a pair of atoms.
     504             :       const int iatom = task_list->tasks[first_task].iatom - 1;
     505             :       const int jatom = task_list->tasks[first_task].jatom - 1;
     506             :       double my_forces[2][3] = {0};
     507             :       double my_virials[2][3][3] = {0};
     508             : 
     509             :       for (int itask = first_task; itask <= last_task; itask++) {
     510             :         // Define some convenient aliases.
     511             :         const grid_cpu_task *task = &task_list->tasks[itask];
     512             :         assert(task->block_num - 1 == block_num);
     513             :         assert(task->iatom - 1 == iatom && task->jatom - 1 == jatom);
     514             :         const int ikind = task_list->atom_kinds[iatom] - 1;
     515             :         const int jkind = task_list->atom_kinds[jatom] - 1;
     516             :         grid_basis_set *ibasis = task_list->basis_sets[ikind];
     517             :         grid_basis_set *jbasis = task_list->basis_sets[jkind];
     518             :         const int iset = task->iset - 1;
     519             :         const int jset = task->jset - 1;
     520             :         const int ipgf = task->ipgf - 1;
     521             :         const int jpgf = task->jpgf - 1;
     522             :         const double zeta = ibasis->zet[iset * ibasis->maxpgf + ipgf];
     523             :         const double zetb = jbasis->zet[jset * jbasis->maxpgf + jpgf];
     524             :         const int ncoseta = ncoset(ibasis->lmax[iset]);
     525             :         const int ncosetb = ncoset(jbasis->lmax[jset]);
     526             :         const int ncoa = ibasis->npgf[iset] * ncoseta; // size of carthesian set
     527             :         const int ncob = jbasis->npgf[jset] * ncosetb;
     528             :         const int block_offset = task_list->block_offsets[block_num];
     529             :         const bool transpose = (iatom <= jatom);
     530             :         const bool pab_required = (forces != NULL || virial != NULL);
     531             : 
     532             :         // Load pab and store hab subblocks when needed.
     533             :         // Previous hab and pab can be reused when only ipgf or jpgf changed.
     534             :         if (block_offset != old_offset || iset != old_iset ||
     535             :             jset != old_jset) {
     536             :           if (pab_required) {
     537             :             load_pab(ibasis, jbasis, iset, jset, transpose,
     538             :                      &pab_blocks->host_buffer[block_offset], pab);
     539             :           }
     540             :           if (old_offset >= 0) { // skip first iteration
     541             :             store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose,
     542             :                       hab, &hab_blocks->host_buffer[old_offset]);
     543             :           }
     544             :           memset(hab, 0, ncoa * ncob * sizeof(double));
     545             :           old_offset = block_offset;
     546             :           old_iset = iset;
     547             :           old_jset = jset;
     548             :           old_ibasis = ibasis;
     549             :           old_jbasis = jbasis;
     550             :           old_transpose = transpose;
     551             :         }
     552             : 
     553             :         grid_cpu_integrate_pgf_product(
     554             :             /*orthorhombic=*/task_list->orthorhombic,
     555             :             /*compute_tau=*/compute_tau,
     556             :             /*border_mask=*/task->border_mask,
     557             :             /*la_max=*/ibasis->lmax[iset],
     558             :             /*la_min=*/ibasis->lmin[iset],
     559             :             /*lb_max=*/jbasis->lmax[jset],
     560             :             /*lb_min=*/jbasis->lmin[jset],
     561             :             /*zeta=*/zeta,
     562             :             /*zetb=*/zetb,
     563             :             /*dh=*/dh,
     564             :             /*dh_inv=*/dh_inv,
     565             :             /*ra=*/&task_list->atom_positions[3 * iatom],
     566             :             /*rab=*/task->rab,
     567             :             /*npts_global=*/npts_global,
     568             :             /*npts_local=*/npts_local,
     569             :             /*shift_local=*/shift_local,
     570             :             /*border_width=*/border_width,
     571             :             /*radius=*/task->radius,
     572             :             /*o1=*/ipgf * ncoseta,
     573             :             /*o2=*/jpgf * ncosetb,
     574             :             /*n1=*/ncoa,
     575             :             /*n2=*/ncob,
     576             :             /*grid=*/grid->host_buffer,
     577             :             /*hab=*/(double(*)[ncoa])hab,
     578             :             /*pab=*/(pab_required) ? (const double(*)[ncoa])pab : NULL,
     579             :             /*forces=*/(forces != NULL) ? my_forces : NULL,
     580             :             /*virials=*/(virial != NULL) ? my_virials : NULL,
     581             :             /*hdab=*/NULL,
     582             :             /*hadb=*/NULL,
     583             :             /*a_hdab=*/NULL);
     584             : 
     585             :       } // end of task loop
     586             : 
     587             :       // Merge thread-local forces and virial into shared ones.
     588             :       // It does not seem worth the trouble to accumulate them thread-locally.
     589             :       const double scalef = (iatom == jatom) ? 1.0 : 2.0;
     590             :       if (forces != NULL) {
     591             : #pragma omp critical(forces)
     592             :         for (int i = 0; i < 3; i++) {
     593             :           forces[iatom][i] += scalef * my_forces[0][i];
     594             :           forces[jatom][i] += scalef * my_forces[1][i];
     595             :         }
     596             :       }
     597             :       if (virial != NULL) {
     598             : #pragma omp critical(virial)
     599             :         for (int i = 0; i < 3; i++) {
     600             :           for (int j = 0; j < 3; j++) {
     601             :             virial[i][j] += scalef * my_virials[0][i][j];
     602             :             virial[i][j] += scalef * my_virials[1][i][j];
     603             :           }
     604             :         }
     605             :       }
     606             : 
     607             :     } // end of block loop
     608             : 
     609             :     // store final hab
     610             :     if (old_offset >= 0) {
     611             :       store_hab(old_ibasis, old_jbasis, old_iset, old_jset, old_transpose, hab,
     612             :                 &hab_blocks->host_buffer[old_offset]);
     613             :     }
     614             : 
     615             :   } // end of omp parallel region
     616      719210 : }
     617             : 
     618             : /*******************************************************************************
     619             :  * \brief Integrate all tasks of in given list from given grids.
     620             :  *        See grid_task_list.h for details.
     621             :  * \author Ole Schuett
     622             :  ******************************************************************************/
     623      181801 : void grid_cpu_integrate_task_list(
     624             :     const grid_cpu_task_list *task_list, const bool compute_tau,
     625             :     const int natoms, const int nlevels, const offload_buffer *pab_blocks,
     626             :     const offload_buffer *grids[nlevels], offload_buffer *hab_blocks,
     627             :     double forces[natoms][3], double virial[3][3]) {
     628             : 
     629      181801 :   assert(task_list->nlevels == nlevels);
     630      181801 :   assert(task_list->natoms == natoms);
     631             : 
     632             :   // Zero result arrays.
     633      181801 :   memset(hab_blocks->host_buffer, 0, hab_blocks->size);
     634      181801 :   if (forces != NULL) {
     635       22765 :     memset(forces, 0, natoms * 3 * sizeof(double));
     636             :   }
     637      181801 :   if (virial != NULL) {
     638        3595 :     memset(virial, 0, 9 * sizeof(double));
     639             :   }
     640             : 
     641      901011 :   for (int level = 0; level < task_list->nlevels; level++) {
     642      719210 :     const int idx = level * task_list->nblocks;
     643      719210 :     const int *first_block_task = &task_list->first_level_block_task[idx];
     644      719210 :     const int *last_block_task = &task_list->last_level_block_task[idx];
     645      719210 :     const grid_cpu_layout *layout = &task_list->layouts[level];
     646      719210 :     integrate_one_grid_level(
     647             :         task_list, first_block_task, last_block_task, compute_tau, natoms,
     648      719210 :         layout->npts_global, layout->npts_local, layout->shift_local,
     649      719210 :         layout->border_width, layout->dh, layout->dh_inv, pab_blocks,
     650      719210 :         grids[level], hab_blocks, forces, virial);
     651             :   }
     652      181801 : }
     653             : 
     654             : // EOF

Generated by: LCOV version 1.15