Line data Source code
1 : /*----------------------------------------------------------------------------*/ 2 : /* CP2K: A general program to perform molecular dynamics simulations */ 3 : /* Copyright 2000-2024 CP2K developers group <https://cp2k.org> */ 4 : /* */ 5 : /* SPDX-License-Identifier: BSD-3-Clause */ 6 : /*----------------------------------------------------------------------------*/ 7 : 8 : #include <assert.h> 9 : #include <omp.h> 10 : #include <stddef.h> 11 : #include <stdio.h> 12 : #include <stdlib.h> 13 : #include <string.h> 14 : 15 : #include "../../offload/offload_runtime.h" 16 : #include "grid_common.h" 17 : #include "grid_constants.h" 18 : #include "grid_library.h" 19 : 20 : // counter dimensions 21 : #define GRID_NBACKENDS 5 22 : #define GRID_NKERNELS 4 23 : #define GRID_MAX_LP 20 24 : 25 : typedef struct { 26 : grid_sphere_cache sphere_cache; 27 : long counters[GRID_NBACKENDS * GRID_NKERNELS * GRID_MAX_LP]; 28 : } grid_library_globals; 29 : 30 : static grid_library_globals **per_thread_globals = NULL; 31 : static bool library_initialized = false; 32 : static int max_threads = 0; 33 : static grid_library_config config = { 34 : .backend = GRID_BACKEND_AUTO, .validate = false, .apply_cutoff = false}; 35 : 36 : #if !defined(_OPENMP) 37 : #error "OpenMP is required. Please add -fopenmp to your C compiler flags." 38 : #endif 39 : 40 : #if defined(NDEBUG) 41 : #error \ 42 : "Please do not build CP2K with NDEBUG. There is no performance advantage and asserts will save your neck." 43 : #endif 44 : 45 : /******************************************************************************* 46 : * \brief Initializes the grid library. 47 : * \author Ole Schuett 48 : ******************************************************************************/ 49 8532 : void grid_library_init(void) { 50 8532 : if (library_initialized) { 51 0 : printf("Error: Grid library was already initialized.\n"); 52 0 : abort(); 53 : } 54 : 55 : #if defined(__OFFLOAD) && !defined(__NO_OFFLOAD_GRID) 56 : // Reserve global GPU memory for storing the intermediate Cab matrix blocks. 57 : // CUDA does not allow to increase this limit after a kernel was launched. 58 : // Unfortunately, the required memory is hard to predict because we neither 59 : // know which tasks will be run nor how many thread blocks the available GPU 60 : // can execute in parallel... 64 MiB ought to be enough for anybody ;-) 61 : offloadEnsureMallocHeapSize(64 * 1024 * 1024); 62 : #endif 63 : 64 8532 : max_threads = omp_get_max_threads(); 65 8532 : per_thread_globals = malloc(max_threads * sizeof(grid_library_globals *)); 66 8532 : assert(per_thread_globals != NULL); 67 : 68 : // Using parallel regions to ensure memory is allocated near a thread's core. 69 : #pragma omp parallel default(none) shared(per_thread_globals) \ 70 : num_threads(max_threads) 71 : { 72 : const int ithread = omp_get_thread_num(); 73 : per_thread_globals[ithread] = malloc(sizeof(grid_library_globals)); 74 : assert(per_thread_globals[ithread] != NULL); 75 : memset(per_thread_globals[ithread], 0, sizeof(grid_library_globals)); 76 : } 77 : 78 8532 : library_initialized = true; 79 8532 : } 80 : 81 : /******************************************************************************* 82 : * \brief Finalizes the grid library. 83 : * \author Ole Schuett 84 : ******************************************************************************/ 85 8532 : void grid_library_finalize(void) { 86 8532 : if (!library_initialized) { 87 0 : printf("Error: Grid library is not initialized.\n"); 88 0 : abort(); 89 : } 90 : 91 17064 : for (int i = 0; i < max_threads; i++) { 92 8532 : grid_sphere_cache_free(&per_thread_globals[i]->sphere_cache); 93 8532 : free(per_thread_globals[i]); 94 : } 95 8532 : free(per_thread_globals); 96 8532 : per_thread_globals = NULL; 97 8532 : library_initialized = false; 98 8532 : } 99 : 100 : /******************************************************************************* 101 : * \brief Returns a pointer to the thread local sphere cache. 102 : * \author Ole Schuett 103 : ******************************************************************************/ 104 121891892 : grid_sphere_cache *grid_library_get_sphere_cache(void) { 105 121891892 : const int ithread = omp_get_thread_num(); 106 121891892 : assert(ithread < max_threads); 107 121891892 : return &per_thread_globals[ithread]->sphere_cache; 108 : } 109 : 110 : /******************************************************************************* 111 : * \brief Configures the grid library. 112 : * \author Ole Schuett 113 : ******************************************************************************/ 114 8648 : void grid_library_set_config(const enum grid_backend backend, 115 : const bool validate, const bool apply_cutoff) { 116 8648 : config.backend = backend; 117 8648 : config.validate = validate; 118 8648 : config.apply_cutoff = apply_cutoff; 119 8648 : } 120 : 121 : /******************************************************************************* 122 : * \brief Returns the library config. 123 : * \author Ole Schuett 124 : ******************************************************************************/ 125 389346 : grid_library_config grid_library_get_config(void) { return config; } 126 : 127 : /******************************************************************************* 128 : * \brief Adds given increment to counter specified by lp, backend, and kernel. 129 : * \author Ole Schuett 130 : ******************************************************************************/ 131 127193727 : void grid_library_counter_add(const int lp, const enum grid_backend backend, 132 : const enum grid_library_kernel kernel, 133 : const int increment) { 134 127193727 : assert(lp >= 0); 135 127193727 : assert(kernel < GRID_NKERNELS); 136 127193727 : const int back = backend - GRID_BACKEND_REF; 137 127193727 : assert(back < GRID_NBACKENDS); 138 127193727 : const int idx = back * GRID_NKERNELS * GRID_MAX_LP + kernel * GRID_MAX_LP + 139 127193727 : imin(lp, GRID_MAX_LP - 1); 140 127193727 : const int ithread = omp_get_thread_num(); 141 127193727 : assert(ithread < max_threads); 142 127193727 : per_thread_globals[ithread]->counters[idx] += increment; 143 127193727 : } 144 : 145 : /******************************************************************************* 146 : * \brief Comperator passed to qsort to compare two counters. 147 : * \author Ole Schuett 148 : ******************************************************************************/ 149 14528770 : static int compare_counters(const void *a, const void *b) { 150 14528770 : return *(long *)b - *(long *)a; 151 : } 152 : 153 : /******************************************************************************* 154 : * \brief Prints statistics gathered by the grid library. 155 : * \author Ole Schuett 156 : ******************************************************************************/ 157 8650 : void grid_library_print_stats(void (*mpi_sum_func)(long *, int), 158 : const int mpi_comm, 159 : void (*print_func)(char *, int), 160 8650 : const int output_unit) { 161 8650 : if (!library_initialized) { 162 0 : printf("Error: Grid library is not initialized.\n"); 163 0 : abort(); 164 : } 165 : 166 : // Sum all counters across threads and mpi ranks. 167 8650 : const int ncounters = GRID_NBACKENDS * GRID_NKERNELS * GRID_MAX_LP; 168 8650 : long counters[ncounters][2]; 169 8650 : memset(counters, 0, ncounters * 2 * sizeof(long)); 170 8650 : double total = 0.0; 171 3468650 : for (int i = 0; i < ncounters; i++) { 172 3460000 : counters[i][1] = i; // needed as inverse index after qsort 173 6920000 : for (int j = 0; j < max_threads; j++) { 174 3460000 : counters[i][0] += per_thread_globals[j]->counters[i]; 175 : } 176 3460000 : mpi_sum_func(&counters[i][0], mpi_comm); 177 3460000 : total += counters[i][0]; 178 : } 179 : 180 : // Sort counters. 181 8650 : qsort(counters, ncounters, 2 * sizeof(long), &compare_counters); 182 : 183 : // Print counters. 184 8650 : print_func("\n", output_unit); 185 8650 : print_func(" ----------------------------------------------------------------" 186 : "---------------\n", 187 : output_unit); 188 8650 : print_func(" - " 189 : " -\n", 190 : output_unit); 191 8650 : print_func(" - GRID STATISTICS " 192 : " -\n", 193 : output_unit); 194 8650 : print_func(" - " 195 : " -\n", 196 : output_unit); 197 8650 : print_func(" ----------------------------------------------------------------" 198 : "---------------\n", 199 : output_unit); 200 8650 : print_func(" LP KERNEL BACKEND " 201 : "COUNT PERCENT\n", 202 : output_unit); 203 : 204 8650 : const char *kernel_names[] = {"collocate ortho", "integrate ortho", 205 : "collocate general", "integrate general"}; 206 8650 : const char *backend_names[] = {"REF", "CPU", "DGEMM", "GPU", "HIP"}; 207 : 208 3468650 : for (int i = 0; i < ncounters; i++) { 209 3460000 : if (counters[i][0] == 0) 210 3409990 : continue; // skip empty counters 211 50010 : const double percent = 100.0 * counters[i][0] / total; 212 50010 : const int idx = counters[i][1]; 213 50010 : const int backend_stride = GRID_NKERNELS * GRID_MAX_LP; 214 50010 : const int back = idx / backend_stride; 215 50010 : const int kern = (idx % backend_stride) / GRID_MAX_LP; 216 50010 : const int lp = (idx % backend_stride) % GRID_MAX_LP; 217 50010 : char buffer[100]; 218 50010 : snprintf(buffer, sizeof(buffer), " %-5i %-17s %-6s %34li %10.2f%%\n", lp, 219 : kernel_names[kern], backend_names[back], counters[i][0], percent); 220 50010 : print_func(buffer, output_unit); 221 : } 222 : 223 8650 : print_func(" ----------------------------------------------------------------" 224 : "---------------\n", 225 : output_unit); 226 8650 : } 227 : 228 : // EOF