Line data Source code
1 : /*----------------------------------------------------------------------------*/
2 : /* CP2K: A general program to perform molecular dynamics simulations */
3 : /* Copyright 2000-2025 CP2K developers group <https://cp2k.org> */
4 : /* */
5 : /* SPDX-License-Identifier: BSD-3-Clause */
6 : /*----------------------------------------------------------------------------*/
7 :
8 : #include <assert.h>
9 : #include <inttypes.h>
10 : #include <omp.h>
11 : #include <stdio.h>
12 : #include <stdlib.h>
13 : #include <string.h>
14 :
15 : #include "dbm_library.h"
16 : #include "dbm_mempool.h"
17 : #include "dbm_mpi.h"
18 :
19 : #define DBM_NUM_COUNTERS 64
20 :
21 : static int64_t **per_thread_counters = NULL;
22 : static bool library_initialized = false;
23 : static int max_threads = 0;
24 :
25 : #if !defined(_OPENMP)
26 : #error "OpenMP is required. Please add -fopenmp to your C compiler flags."
27 : #endif
28 :
29 : /*******************************************************************************
30 : * \brief Initializes the DBM library.
31 : * \author Ole Schuett
32 : ******************************************************************************/
33 9208 : void dbm_library_init(void) {
34 9208 : assert(omp_get_num_threads() == 1);
35 :
36 9208 : if (library_initialized) {
37 0 : fprintf(stderr, "DBM library was already initialized.\n");
38 0 : abort();
39 : }
40 :
41 9208 : max_threads = omp_get_max_threads();
42 9208 : per_thread_counters = malloc(max_threads * sizeof(int64_t *));
43 9208 : assert(per_thread_counters != NULL);
44 :
45 : // Using parallel regions to ensure memory is allocated near a thread's core.
46 : #pragma omp parallel default(none) shared(per_thread_counters) \
47 : num_threads(max_threads)
48 : {
49 : const int ithread = omp_get_thread_num();
50 : const size_t counters_size = DBM_NUM_COUNTERS * sizeof(int64_t);
51 : per_thread_counters[ithread] = malloc(counters_size);
52 : assert(per_thread_counters[ithread] != NULL);
53 : memset(per_thread_counters[ithread], 0, counters_size);
54 : }
55 :
56 9208 : library_initialized = true;
57 9208 : }
58 :
59 : /*******************************************************************************
60 : * \brief Finalizes the DBM library.
61 : * \author Ole Schuett
62 : ******************************************************************************/
63 9208 : void dbm_library_finalize(void) {
64 9208 : assert(omp_get_num_threads() == 1);
65 :
66 9208 : if (!library_initialized) {
67 0 : fprintf(stderr, "Error: DBM library is not initialized.\n");
68 0 : abort();
69 : }
70 :
71 18416 : for (int i = 0; i < max_threads; i++) {
72 9208 : free(per_thread_counters[i]);
73 : }
74 9208 : free(per_thread_counters);
75 9208 : per_thread_counters = NULL;
76 :
77 9208 : dbm_mempool_clear();
78 9208 : library_initialized = false;
79 9208 : }
80 :
81 : /*******************************************************************************
82 : * \brief Computes min(3, floor(log10(x))).
83 : * \author Ole Schuett
84 : ******************************************************************************/
85 65010777 : static int floorlog10(const int x) {
86 65010777 : if (x >= 1000) {
87 : return 3;
88 : }
89 65010393 : if (x >= 100) {
90 : return 2;
91 : }
92 62425394 : if (x >= 10) {
93 18550700 : return 1;
94 : }
95 : return 0;
96 : }
97 :
98 : /*******************************************************************************
99 : * \brief Add given block multiplication to stats. This routine is thread-safe.
100 : * \author Ole Schuett
101 : ******************************************************************************/
102 21670259 : void dbm_library_counter_increment(const int m, const int n, const int k) {
103 21670259 : const int ithread = omp_get_thread_num();
104 21670259 : assert(ithread < max_threads);
105 21670259 : const int idx = 16 * floorlog10(m) + 4 * floorlog10(n) + floorlog10(k);
106 21670259 : per_thread_counters[ithread][idx]++;
107 21670259 : }
108 :
109 : /*******************************************************************************
110 : * \brief Comperator passed to qsort to compare two counters.
111 : * \author Ole Schuett
112 : ******************************************************************************/
113 1793420 : static int compare_counters(const void *a, const void *b) {
114 1793420 : return *(const int64_t *)b - *(const int64_t *)a;
115 : }
116 :
117 : /*******************************************************************************
118 : * \brief Prints statistics gathered by the DBM library.
119 : * \author Ole Schuett
120 : ******************************************************************************/
121 9326 : void dbm_library_print_stats(const int fortran_comm,
122 : void (*print_func)(char *, int),
123 : const int output_unit) {
124 9326 : assert(omp_get_num_threads() == 1);
125 :
126 9326 : if (!library_initialized) {
127 0 : fprintf(stderr, "Error: DBM library is not initialized.\n");
128 0 : abort();
129 : }
130 :
131 9326 : const dbm_mpi_comm_t comm = dbm_mpi_comm_f2c(fortran_comm);
132 : // Sum all counters across threads and mpi ranks.
133 9326 : int64_t counters[DBM_NUM_COUNTERS][2] = {{0}};
134 9326 : double total = 0.0;
135 606190 : for (int i = 0; i < DBM_NUM_COUNTERS; i++) {
136 596864 : counters[i][1] = i; // needed as inverse index after qsort
137 1193728 : for (int j = 0; j < max_threads; j++) {
138 596864 : counters[i][0] += per_thread_counters[j][i];
139 : }
140 596864 : dbm_mpi_sum_int64(&counters[i][0], 1, comm);
141 596864 : total += counters[i][0];
142 : }
143 :
144 : // Sort counters.
145 9326 : qsort(counters, DBM_NUM_COUNTERS, 2 * sizeof(int64_t), &compare_counters);
146 :
147 : // Print counters.
148 9326 : print_func("\n", output_unit);
149 9326 : print_func(" ----------------------------------------------------------------"
150 : "---------------\n",
151 : output_unit);
152 9326 : print_func(" - "
153 : " -\n",
154 : output_unit);
155 9326 : print_func(" - DBM STATISTICS "
156 : " -\n",
157 : output_unit);
158 9326 : print_func(" - "
159 : " -\n",
160 : output_unit);
161 9326 : print_func(" ----------------------------------------------------------------"
162 : "---------------\n",
163 : output_unit);
164 9326 : print_func(" M x N x K "
165 : "COUNT PERCENT\n",
166 : output_unit);
167 :
168 9326 : const char *labels[] = {"?", "??", "???", ">999"};
169 9326 : char buffer[100];
170 606190 : for (int i = 0; i < DBM_NUM_COUNTERS; i++) {
171 596864 : if (counters[i][0] == 0) {
172 594088 : continue; // skip empty counters
173 : }
174 2776 : const double percent = 100.0 * counters[i][0] / total;
175 2776 : const int idx = counters[i][1];
176 2776 : const int m = (idx % 64) / 16;
177 2776 : const int n = (idx % 16) / 4;
178 2776 : const int k = (idx % 4) / 1;
179 2776 : snprintf(buffer, sizeof(buffer),
180 : " %4s x %4s x %4s %46" PRId64 " %10.2f%%\n", labels[m],
181 : labels[n], labels[k], counters[i][0], percent);
182 2776 : print_func(buffer, output_unit);
183 : }
184 :
185 9326 : print_func(" ----------------------------------------------------------------"
186 : "---------------\n",
187 : output_unit);
188 :
189 9326 : dbm_memstats_t memstats;
190 9326 : dbm_mempool_statistics(&memstats);
191 9326 : dbm_mpi_max_uint64(&memstats.device_mallocs, 1, comm);
192 9326 : dbm_mpi_max_uint64(&memstats.host_mallocs, 1, comm);
193 :
194 9326 : if (0 != memstats.device_mallocs || 0 != memstats.host_mallocs) {
195 360 : print_func(" Memory consumption "
196 : " Number of allocations Size [MiB]\n",
197 : output_unit);
198 : }
199 9326 : if (0 < memstats.device_mallocs) {
200 0 : dbm_mpi_max_uint64(&memstats.device_size, 1, comm);
201 0 : snprintf(buffer, sizeof(buffer),
202 : " Device "
203 : " %20" PRIuPTR " %10" PRIuPTR "\n",
204 0 : (uintptr_t)memstats.device_mallocs,
205 0 : (uintptr_t)((memstats.device_size + (512U << 10)) >> 20));
206 0 : print_func(buffer, output_unit);
207 : }
208 9326 : if (0 < memstats.host_mallocs) {
209 360 : dbm_mpi_max_uint64(&memstats.host_size, 1, comm);
210 360 : snprintf(buffer, sizeof(buffer),
211 : " Host "
212 : " %20" PRIuPTR " %10" PRIuPTR "\n",
213 360 : (uintptr_t)memstats.host_mallocs,
214 360 : (uintptr_t)((memstats.host_size + (512U << 10)) >> 20));
215 360 : print_func(buffer, output_unit);
216 : }
217 9326 : if (0 < memstats.device_mallocs || 0 < memstats.host_mallocs) {
218 360 : print_func(
219 : " ----------------------------------------------------------------"
220 : "---------------\n",
221 : output_unit);
222 : }
223 9326 : }
224 :
225 : // EOF
|