Line data Source code
1 : /*----------------------------------------------------------------------------*/ 2 : /* CP2K: A general program to perform molecular dynamics simulations */ 3 : /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */ 4 : /* */ 5 : /* SPDX-License-Identifier: BSD-3-Clause */ 6 : /*----------------------------------------------------------------------------*/ 7 : 8 : #include <assert.h> 9 : #include <omp.h> 10 : #include <stddef.h> 11 : #include <stdio.h> 12 : #include <stdlib.h> 13 : #include <string.h> 14 : 15 : #include "../../offload/offload_runtime.h" 16 : #include "grid_common.h" 17 : #include "grid_constants.h" 18 : #include "grid_library.h" 19 : 20 : // counter dimensions 21 : #define GRID_NBACKENDS 5 22 : #define GRID_NKERNELS 4 23 : #define GRID_MAX_LP 20 24 : 25 : typedef struct { 26 : grid_sphere_cache sphere_cache; 27 : long counters[GRID_NBACKENDS * GRID_NKERNELS * GRID_MAX_LP]; 28 : } grid_library_globals; 29 : 30 : static grid_library_globals **per_thread_globals = NULL; 31 : static bool library_initialized = false; 32 : static int max_threads = 0; 33 : static grid_library_config config = { 34 : .backend = GRID_BACKEND_AUTO, .validate = false, .apply_cutoff = false}; 35 : 36 : #if !defined(_OPENMP) 37 : #error "OpenMP is required. Please add -fopenmp to your C compiler flags." 38 : #endif 39 : 40 : #if defined(NDEBUG) 41 : #error \ 42 : "Please do not build CP2K with NDEBUG. There is no performance advantage and asserts will save your neck." 43 : #endif 44 : 45 : /******************************************************************************* 46 : * \brief Initializes the grid library. 47 : * \author Ole Schuett 48 : ******************************************************************************/ 49 8414 : void grid_library_init(void) { 50 8414 : if (library_initialized) { 51 0 : printf("Error: Grid library was already initialized.\n"); 52 0 : abort(); 53 : } 54 : 55 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID) 56 : // Reserve global GPU memory for storing the intermediate Cab matrix blocks. 57 : // CUDA does not allow to increase this limit after a kernel was launched. 58 : // Unfortunately, the required memory is hard to predict because we neither 59 : // know which tasks will be run nor how many thread blocks the available GPU 60 : // can execute in parallel... 64 MiB ought to be enough for anybody ;-) 61 : offloadEnsureMallocHeapSize(64 * 1024 * 1024); 62 : #endif 63 : 64 8414 : max_threads = omp_get_max_threads(); 65 8414 : per_thread_globals = malloc(max_threads * sizeof(grid_library_globals *)); 66 : 67 : // Using parallel regions to ensure memory is allocated near a thread's core. 68 : #pragma omp parallel default(none) shared(per_thread_globals) \ 69 : num_threads(max_threads) 70 : { 71 : const int ithread = omp_get_thread_num(); 72 : per_thread_globals[ithread] = malloc(sizeof(grid_library_globals)); 73 : memset(per_thread_globals[ithread], 0, sizeof(grid_library_globals)); 74 : } 75 : 76 8414 : library_initialized = true; 77 8414 : } 78 : 79 : /******************************************************************************* 80 : * \brief Finalizes the grid library. 81 : * \author Ole Schuett 82 : ******************************************************************************/ 83 8414 : void grid_library_finalize(void) { 84 8414 : if (!library_initialized) { 85 0 : printf("Error: Grid library is not initialized.\n"); 86 0 : abort(); 87 : } 88 : 89 16828 : for (int i = 0; i < max_threads; i++) { 90 8414 : grid_sphere_cache_free(&per_thread_globals[i]->sphere_cache); 91 8414 : free(per_thread_globals[i]); 92 : } 93 8414 : free(per_thread_globals); 94 8414 : per_thread_globals = NULL; 95 8414 : library_initialized = false; 96 8414 : } 97 : 98 : /******************************************************************************* 99 : * \brief Returns a pointer to the thread local sphere cache. 100 : * \author Ole Schuett 101 : ******************************************************************************/ 102 120173281 : grid_sphere_cache *grid_library_get_sphere_cache(void) { 103 120173281 : const int ithread = omp_get_thread_num(); 104 120173281 : assert(ithread < max_threads); 105 120173281 : return &per_thread_globals[ithread]->sphere_cache; 106 : } 107 : 108 : /******************************************************************************* 109 : * \brief Configures the grid library. 110 : * \author Ole Schuett 111 : ******************************************************************************/ 112 8530 : void grid_library_set_config(const enum grid_backend backend, 113 : const bool validate, const bool apply_cutoff) { 114 8530 : config.backend = backend; 115 8530 : config.validate = validate; 116 8530 : config.apply_cutoff = apply_cutoff; 117 8530 : } 118 : 119 : /******************************************************************************* 120 : * \brief Returns the library config. 121 : * \author Ole Schuett 122 : ******************************************************************************/ 123 384243 : grid_library_config grid_library_get_config(void) { return config; } 124 : 125 : /******************************************************************************* 126 : * \brief Adds given increment to counter specified by lp, backend, and kernel. 127 : * \author Ole Schuett 128 : ******************************************************************************/ 129 125522199 : void grid_library_counter_add(const int lp, const enum grid_backend backend, 130 : const enum grid_library_kernel kernel, 131 : const int increment) { 132 125522199 : assert(lp >= 0); 133 125522199 : assert(kernel < GRID_NKERNELS); 134 125522199 : const int back = backend - GRID_BACKEND_REF; 135 125522199 : assert(back < GRID_NBACKENDS); 136 125522199 : const int idx = back * GRID_NKERNELS * GRID_MAX_LP + kernel * GRID_MAX_LP + 137 125522199 : imin(lp, GRID_MAX_LP - 1); 138 125522199 : const int ithread = omp_get_thread_num(); 139 125522199 : assert(ithread < max_threads); 140 125522199 : per_thread_globals[ithread]->counters[idx] += increment; 141 125522199 : } 142 : 143 : /******************************************************************************* 144 : * \brief Comperator passed to qsort to compare two counters. 145 : * \author Ole Schuett 146 : ******************************************************************************/ 147 14330416 : static int compare_counters(const void *a, const void *b) { 148 14330416 : return *(long *)b - *(long *)a; 149 : } 150 : 151 : /******************************************************************************* 152 : * \brief Prints statistics gathered by the grid library. 153 : * \author Ole Schuett 154 : ******************************************************************************/ 155 8532 : void grid_library_print_stats(void (*mpi_sum_func)(long *, int), 156 : const int mpi_comm, 157 : void (*print_func)(char *, int), 158 8532 : const int output_unit) { 159 8532 : if (!library_initialized) { 160 0 : printf("Error: Grid library is not initialized.\n"); 161 0 : abort(); 162 : } 163 : 164 : // Sum all counters across threads and mpi ranks. 165 8532 : const int ncounters = GRID_NBACKENDS * GRID_NKERNELS * GRID_MAX_LP; 166 8532 : long counters[ncounters][2]; 167 8532 : memset(counters, 0, ncounters * 2 * sizeof(long)); 168 8532 : double total = 0.0; 169 3421332 : for (int i = 0; i < ncounters; i++) { 170 3412800 : counters[i][1] = i; // needed as inverse index after qsort 171 6825600 : for (int j = 0; j < max_threads; j++) { 172 3412800 : counters[i][0] += per_thread_globals[j]->counters[i]; 173 : } 174 3412800 : mpi_sum_func(&counters[i][0], mpi_comm); 175 3412800 : total += counters[i][0]; 176 : } 177 : 178 : // Sort counters. 179 8532 : qsort(counters, ncounters, 2 * sizeof(long), &compare_counters); 180 : 181 : // Print counters. 182 8532 : print_func("\n", output_unit); 183 8532 : print_func(" ----------------------------------------------------------------" 184 : "---------------\n", 185 : output_unit); 186 8532 : print_func(" - " 187 : " -\n", 188 : output_unit); 189 8532 : print_func(" - GRID STATISTICS " 190 : " -\n", 191 : output_unit); 192 8532 : print_func(" - " 193 : " -\n", 194 : output_unit); 195 8532 : print_func(" ----------------------------------------------------------------" 196 : "---------------\n", 197 : output_unit); 198 8532 : print_func(" LP KERNEL BACKEND " 199 : "COUNT PERCENT\n", 200 : output_unit); 201 : 202 8532 : const char *kernel_names[] = {"collocate ortho", "integrate ortho", 203 : "collocate general", "integrate general"}; 204 8532 : const char *backend_names[] = {"REF", "CPU", "DGEMM", "GPU", "HIP"}; 205 : 206 3421332 : for (int i = 0; i < ncounters; i++) { 207 3412800 : if (counters[i][0] == 0) 208 3363576 : continue; // skip empty counters 209 49224 : const double percent = 100.0 * counters[i][0] / total; 210 49224 : const int idx = counters[i][1]; 211 49224 : const int backend_stride = GRID_NKERNELS * GRID_MAX_LP; 212 49224 : const int back = idx / backend_stride; 213 49224 : const int kern = (idx % backend_stride) / GRID_MAX_LP; 214 49224 : const int lp = (idx % backend_stride) % GRID_MAX_LP; 215 49224 : char buffer[100]; 216 49224 : snprintf(buffer, sizeof(buffer), " %-5i %-17s %-6s %34li %10.2f%%\n", lp, 217 : kernel_names[kern], backend_names[back], counters[i][0], percent); 218 49224 : print_func(buffer, output_unit); 219 : } 220 : 221 8532 : print_func(" ----------------------------------------------------------------" 222 : "---------------\n", 223 : output_unit); 224 8532 : } 225 : 226 : // EOF