Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Interface to the message passing library MPI
10 : !> \par History
11 : !> JGH (02-Jan-2001): New error handling
12 : !> Performance tools
13 : !> JGH (14-Jan-2001): New routines mp_comm_compare, mp_cart_coords,
14 : !> mp_rank_compare, mp_alltoall
15 : !> JGH (06-Feb-2001): New routines mp_comm_free
16 : !> JGH (22-Mar-2001): New routines mp_comm_dup
17 : !> fawzi (04-NOV-2004): storable performance info (for f77 interface)
18 : !> Wrapper routine for mpi_gatherv added (22.12.2005,MK)
19 : !> JGH (13-Feb-2006): Flexible precision
20 : !> JGH (15-Feb-2006): single precision mp_alltoall
21 : !> \author JGH
22 : ! **************************************************************************************************
23 : MODULE mp_perf_test
24 : USE kinds, ONLY: dp
25 : USE message_passing, ONLY: mp_comm_type
26 : ! some benchmarking code
27 : #include "../base/base_uses.f90"
28 :
29 : #if defined(__parallel)
30 : #if defined(__MPI_F08)
31 : USE mpi_f08, ONLY: mpi_wtime
32 : #else
33 : USE mpi, ONLY: mpi_wtime
34 : #endif
35 : #endif
36 :
37 : PRIVATE
38 :
39 : PUBLIC :: mpi_perf_test
40 :
41 : CONTAINS
42 :
43 : ! **************************************************************************************************
44 : !> \brief Tests the MPI library
45 : !> \param comm the relevant, initialized communicator
46 : !> \param npow number of sizes to test, 10**1 .. 10**npow
47 : !> \param output_unit where to direct output
48 : !> \par History
49 : !> JGH 6-Feb-2001 : Test and performance code
50 : !> \author JGH 1-JAN-2001
51 : !> \note
52 : !> quickly adapted benchmark code, will only work on an even number of CPUs.
53 : ! **************************************************************************************************
54 2 : SUBROUTINE mpi_perf_test(comm, npow, output_unit)
55 : CLASS(mp_comm_type), INTENT(IN) :: comm
56 : INTEGER, INTENT(IN) :: npow, output_unit
57 :
58 : #if defined(__parallel)
59 :
60 : INTEGER :: I, itask, itests, J, jtask, left, nbufmax, &
61 : ncount, Ngrid, Nloc, nprocs, Ntot, partner, right, taskid, tag, source
62 2 : INTEGER, ALLOCATABLE, DIMENSION(:) :: rcount, rdispl, scount, sdispl
63 : LOGICAL :: ionode
64 : REAL(KIND=dp) :: maxdiff, t1, &
65 : t2, t3, t4, t5
66 2 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: buffer1, buffer2, buffer3, &
67 2 : lgrid, lgrid2, lgrid3
68 : REAL(KIND=dp), ALLOCATABLE, &
69 2 : DIMENSION(:, :) :: grid, grid2, grid3, &
70 2 : send_timings, send_timings2
71 : REAL(KIND=dp), PARAMETER :: threshold = 1.0E-8_dp
72 :
73 : ! set system sizes !
74 2 : ngrid = 10**npow
75 :
76 2 : taskid = comm%mepos
77 2 : nprocs = comm%num_pe
78 2 : ionode = comm%is_source()
79 2 : IF (ionode .AND. output_unit > 0) THEN
80 1 : WRITE (output_unit, *) "Running with ", nprocs
81 1 : WRITE (output_unit, *) "running messages with npow = ", npow
82 1 : WRITE (output_unit, *) "use MPI X in the input for larger (e.g. 6) of smaller (e.g. 3) messages"
83 1 : IF (MODULO(nprocs, 2) .NE. 0) WRITE (output_unit, *) "Testing only with an even number of tasks"
84 : END IF
85 :
86 2 : IF (MODULO(nprocs, 2) .NE. 0) RETURN
87 :
88 : ! equal loads
89 2 : Nloc = Ngrid/nprocs
90 2 : Ntot = Nprocs*Nloc
91 2 : nbufmax = 10**npow
92 : !
93 6 : ALLOCATE (rcount(nprocs))
94 4 : ALLOCATE (scount(nprocs))
95 4 : ALLOCATE (sdispl(nprocs))
96 4 : ALLOCATE (rdispl(nprocs))
97 6 : ALLOCATE (buffer1(nbufmax))
98 4 : ALLOCATE (buffer2(nbufmax))
99 4 : ALLOCATE (buffer3(nbufmax))
100 8 : ALLOCATE (grid(Nloc, Nprocs))
101 6 : ALLOCATE (grid2(Nloc, Nprocs))
102 6 : ALLOCATE (grid3(Nloc, Nprocs))
103 6 : ALLOCATE (lgrid(Nloc))
104 4 : ALLOCATE (lgrid2(Nloc))
105 4 : ALLOCATE (lgrid3(Nloc))
106 8 : ALLOCATE (send_timings(0:nprocs - 1, 0:nprocs - 1))
107 6 : ALLOCATE (send_timings2(0:nprocs - 1, 0:nprocs - 1))
108 20002 : buffer1 = 0.0_dp
109 20002 : buffer2 = 0.0_dp
110 20002 : buffer3 = 0.0_dp
111 : ! timings
112 14 : send_timings = 0.0_dp
113 14 : send_timings2 = 0.0_dp
114 : ! -------------------------------------------------------------------------------------------
115 : ! ------------------------------ some in memory tests ---------------------
116 : ! -------------------------------------------------------------------------------------------
117 2 : CALL comm%sync()
118 2 : IF (ionode .AND. output_unit > 0) THEN
119 1 : WRITE (output_unit, *) "Testing in memory copies just 1 CPU "
120 1 : WRITE (output_unit, *) " could tell something about the motherboard / cache / compiler "
121 : END IF
122 10 : DO i = 1, npow
123 8 : ncount = 10**i
124 8 : t2 = 0.0E0_dp
125 8 : IF (ncount .GT. nbufmax) CPABORT("")
126 88 : DO j = 1, 3**(npow - i)
127 80 : CALL comm%sync()
128 80 : t1 = MPI_WTIME()
129 28420 : buffer2(1:ncount) = buffer1(1:ncount)
130 88 : t2 = t2 + MPI_WTIME() - t1 + threshold
131 : END DO
132 8 : CALL comm%max(t2, 0)
133 10 : IF (ionode .AND. output_unit > 0) THEN
134 4 : WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0E-6_dp/t2, " MB/s"
135 : END IF
136 : END DO
137 : ! -------------------------------------------------------------------------------------------
138 : ! ------------------------------ some in memory tests ---------------------
139 : ! -------------------------------------------------------------------------------------------
140 2 : CALL comm%sync()
141 2 : IF (ionode .AND. output_unit > 0) THEN
142 1 : WRITE (output_unit, *) "Testing in memory copies all cpus"
143 1 : WRITE (output_unit, *) " is the memory bandwidth affected on an SMP machine ?"
144 : END IF
145 10 : DO i = 1, npow
146 8 : ncount = 10**i
147 8 : t2 = 0.0E0_dp
148 8 : IF (ncount .GT. nbufmax) CPABORT("")
149 88 : DO j = 1, 3**(npow - i)
150 80 : CALL comm%sync()
151 80 : t1 = MPI_WTIME()
152 28420 : buffer2(1:ncount) = buffer1(1:ncount)
153 88 : t2 = t2 + MPI_WTIME() - t1 + threshold
154 : END DO
155 8 : CALL comm%max(t2, 0)
156 10 : IF (ionode .AND. output_unit > 0) THEN
157 4 : WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0E-6_dp/t2, " MB/s"
158 : END IF
159 : END DO
160 : ! -------------------------------------------------------------------------------------------
161 : ! ------------------------------ first test point to point communication ---------------------
162 : ! -------------------------------------------------------------------------------------------
163 2 : CALL comm%sync()
164 2 : IF (ionode .AND. output_unit > 0) THEN
165 1 : WRITE (output_unit, *) "Testing truly point to point communication (i with j only)"
166 1 : WRITE (output_unit, *) " is there some different connection between i j (e.g. shared memory comm)"
167 : END IF
168 2 : ncount = 10**npow
169 2 : IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "For messages of ", ncount*8, " bytes"
170 : IF (ncount .GT. nbufmax) CPABORT("")
171 6 : DO itask = 0, nprocs - 1
172 8 : DO jtask = itask + 1, nprocs - 1
173 2 : CALL comm%sync()
174 2 : t1 = MPI_WTIME()
175 2 : IF (taskid .EQ. itask) THEN
176 1 : CALL comm%send(buffer1, jtask, itask*jtask)
177 : END IF
178 2 : IF (taskid .EQ. jtask) THEN
179 1 : source = itask
180 1 : tag = itask*jtask
181 1 : CALL comm%recv(buffer1, source, tag)
182 : END IF
183 6 : send_timings(itask, jtask) = MPI_WTIME() - t1 + threshold
184 : END DO
185 : END DO
186 2 : CALL comm%max(send_timings, 0)
187 2 : IF (ionode .AND. output_unit > 0) THEN
188 3 : DO itask = 0, nprocs - 1
189 4 : DO jtask = itask + 1, nprocs - 1
190 3 : WRITE (output_unit, '(I4,I4,F12.4,A)') itask, jtask, ncount*8.0E-6_dp/send_timings(itask, jtask), " MB/s"
191 : END DO
192 : END DO
193 : END IF
194 2 : CALL comm%sync()
195 : ! -------------------------------------------------------------------------------------------
196 : ! ------------------------------ second test point to point communication -------------------
197 : ! -------------------------------------------------------------------------------------------
198 2 : IF (ionode .AND. output_unit > 0) THEN
199 1 : WRITE (output_unit, *) "Testing all nearby point to point communication (0,1)(2,3)..."
200 1 : WRITE (output_unit, *) " these could / should all be on the same shared memory node "
201 : END IF
202 10 : DO i = 1, npow
203 8 : ncount = 10**i
204 8 : t2 = 0.0E0_dp
205 8 : IF (ncount .GT. nbufmax) CPABORT("")
206 88 : DO j = 1, 3**(npow - i)
207 80 : CALL comm%sync()
208 80 : t1 = MPI_WTIME()
209 80 : IF (MODULO(taskid, 2) == 0) THEN
210 40 : CALL comm%send(buffer1, taskid + 1, 0)
211 : ELSE
212 40 : source = taskid - 1
213 40 : tag = 0
214 40 : CALL comm%recv(buffer1, source, tag)
215 : END IF
216 88 : t2 = t2 + MPI_WTIME() - t1 + threshold
217 : END DO
218 8 : CALL comm%max(t2, 0)
219 10 : IF (ionode .AND. output_unit > 0) THEN
220 4 : WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0E-6_dp/t2, " MB/s"
221 : END IF
222 : END DO
223 2 : CALL comm%sync()
224 : ! -------------------------------------------------------------------------------------------
225 : ! ------------------------------ third test point to point communication -------------------
226 : ! -------------------------------------------------------------------------------------------
227 2 : IF (ionode .AND. output_unit > 0) THEN
228 1 : WRITE (output_unit, *) "Testing all far point to point communication (0,nprocs/2),(1,nprocs/2+1),.."
229 1 : WRITE (output_unit, *) " these could all be going over the network, and stress it a lot"
230 : END IF
231 10 : DO i = 1, npow
232 8 : ncount = 10**i
233 8 : t2 = 0.0E0_dp
234 8 : IF (ncount .GT. nbufmax) CPABORT("")
235 88 : DO j = 1, 3**(npow - i)
236 80 : CALL comm%sync()
237 80 : t1 = MPI_WTIME()
238 : ! first half with partner
239 80 : IF (taskid .LT. nprocs/2) THEN
240 40 : CALL comm%send(buffer1, taskid + nprocs/2, 0)
241 : ELSE
242 40 : source = taskid - nprocs/2
243 40 : tag = 0
244 40 : CALL comm%recv(buffer1, source, tag)
245 : END IF
246 88 : t2 = t2 + MPI_WTIME() - t1 + threshold
247 : END DO
248 8 : CALL comm%max(t2, 0)
249 10 : IF (ionode .AND. output_unit > 0) THEN
250 4 : WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0E-6_dp/t2, " MB/s"
251 : END IF
252 : END DO
253 : ! -------------------------------------------------------------------------------------------
254 : ! ------------------------------ test root to all broadcast -------------------
255 : ! -------------------------------------------------------------------------------------------
256 2 : CALL comm%sync()
257 2 : IF (ionode .AND. output_unit > 0) THEN
258 1 : WRITE (output_unit, *) "Testing root to all broadcast "
259 1 : WRITE (output_unit, *) " using trees at least ? "
260 : END IF
261 10 : DO i = 1, npow
262 8 : ncount = 10**i
263 8 : t2 = 0.0E0_dp
264 8 : IF (ncount .GT. nbufmax) CPABORT("")
265 88 : DO j = 1, 3**(npow - i)
266 80 : CALL comm%sync()
267 80 : t1 = MPI_WTIME()
268 80 : CALL comm%bcast(buffer1, 0)
269 88 : t2 = t2 + MPI_WTIME() - t1 + threshold
270 : END DO
271 8 : CALL comm%max(t2, 0)
272 10 : IF (ionode .AND. output_unit > 0) THEN
273 4 : WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0E-6_dp/t2, " MB/s"
274 : END IF
275 : END DO
276 : ! -------------------------------------------------------------------------------------------
277 : ! ------------------------------ test parallel sum like behavior -------------------
278 : ! -------------------------------------------------------------------------------------------
279 2 : CALL comm%sync()
280 2 : IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "Test global summation (mpi_allreduce) "
281 10 : DO i = 1, npow
282 8 : ncount = 10**i
283 8 : t2 = 0.0E0_dp
284 8 : IF (ncount .GT. nbufmax) CPABORT("")
285 88 : DO j = 1, 3**(npow - i)
286 800080 : buffer2(:) = buffer1
287 80 : CALL comm%sync()
288 80 : t1 = MPI_WTIME()
289 80 : CALL comm%sum(buffer2)
290 88 : t2 = t2 + MPI_WTIME() - t1 + threshold
291 : END DO
292 8 : CALL comm%max(t2, 0)
293 10 : IF (ionode .AND. output_unit > 0) THEN
294 4 : WRITE (output_unit, '(I9,A,F12.4,A)') 8*ncount, " Bytes ", (3**(npow - i))*ncount*8.0E-6_dp/t2, " MB/s"
295 : END IF
296 : END DO
297 : ! -------------------------------------------------------------------------------------------
298 : ! ------------------------------ test all to all communication -------------------
299 : ! -------------------------------------------------------------------------------------------
300 2 : CALL comm%sync()
301 2 : IF (ionode .AND. output_unit > 0) THEN
302 1 : WRITE (output_unit, *) "Test all to all communication (mpi_alltoallv)"
303 1 : WRITE (output_unit, *) " mpi/network getting confused ? "
304 : END IF
305 10 : DO i = 1, npow
306 8 : ncount = 10**i
307 8 : t2 = 0.0E0_dp
308 8 : IF (ncount .GT. nbufmax) CPABORT("")
309 24 : scount = ncount/nprocs
310 24 : rcount = ncount/nprocs
311 24 : DO j = 1, nprocs
312 16 : sdispl(j) = (j - 1)*(ncount/nprocs)
313 24 : rdispl(j) = (j - 1)*(ncount/nprocs)
314 : END DO
315 88 : DO j = 1, 3**(npow - i)
316 80 : CALL comm%sync()
317 80 : t1 = MPI_WTIME()
318 80 : CALL comm%alltoall(buffer1, scount, sdispl, buffer2, rcount, rdispl)
319 88 : t2 = t2 + MPI_WTIME() - t1 + threshold
320 : END DO
321 8 : CALL comm%max(t2, 0)
322 10 : IF (ionode .AND. output_unit > 0) THEN
323 4 : WRITE (output_unit, '(I9,A,F12.4,A)') 8*(ncount/nprocs)*nprocs, " Bytes ", &
324 8 : (3**(npow - i))*(ncount/nprocs)*nprocs*8.0E-6_dp/t2, " MB/s"
325 : END IF
326 : END DO
327 :
328 : ! -------------------------------------------------------------------------------------------
329 : ! ------------------------------ other stuff ---------------------
330 : ! -------------------------------------------------------------------------------------------
331 2 : IF (ionode .AND. output_unit > 0) THEN
332 1 : WRITE (output_unit, *) " Clean tests completed "
333 1 : WRITE (output_unit, *) " Testing MPI_REDUCE scatter"
334 : END IF
335 6 : rcount = Nloc
336 8 : DO itests = 1, 3
337 6 : IF (ionode .AND. output_unit > 0) &
338 3 : WRITE (output_unit, *) "------------------------------- test ", itests, " ------------------------"
339 : ! *** reference ***
340 18 : DO j = 1, Nprocs
341 60018 : DO i = 1, Nloc
342 60012 : grid(i, j) = MODULO(i*j*taskid, itests)
343 : END DO
344 : END DO
345 6 : t1 = MPI_WTIME()
346 6 : CALL comm%mp_sum_scatter_dv(grid, lgrid, rcount)
347 6 : t2 = MPI_WTIME() - t1 + threshold
348 6 : CALL comm%max(t2)
349 6 : IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "MPI_REDUCE_SCATTER ", t2
350 : ! *** simple shift ***
351 18 : DO j = 1, Nprocs
352 60018 : DO i = 1, Nloc
353 60012 : grid2(i, j) = MODULO(i*j*taskid, itests)
354 : END DO
355 : END DO
356 6 : t3 = MPI_WTIME()
357 30006 : lgrid2 = 0.0E0_dp
358 12 : DO i = 1, Nprocs
359 60012 : lgrid2(:) = lgrid2 + grid(:, MODULO(taskid - i, Nprocs) + 1)
360 12 : IF (i .EQ. nprocs) EXIT
361 12 : CALL comm%shift(lgrid2, 1)
362 : END DO
363 6 : t4 = MPI_WTIME() - t3 + threshold
364 6 : CALL comm%max(t4)
365 30012 : maxdiff = MAXVAL(ABS(lgrid2 - lgrid))
366 6 : CALL comm%max(maxdiff)
367 6 : IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "MPI_SENDRECV_REPLACE ", t4, maxdiff
368 : ! *** involved shift ****
369 : IF (MODULO(nprocs, 2) /= 0) CPABORT("")
370 18 : DO j = 1, Nprocs
371 60018 : DO i = 1, Nloc
372 60012 : grid3(i, j) = MODULO(i*j*taskid, itests)
373 : END DO
374 : END DO
375 6 : t3 = MPI_WTIME()
376 : ! first sum the grid in pairs (0,1),(2,3) should be within an LPAR and fast XXXXXXXXX
377 : ! 0 will only need parts 0,2,4,... correctly summed
378 : ! 1 will only need parts 1,3,5,... correctly summed
379 : ! *** could nicely be generalised ****
380 6 : IF (MODULO(taskid, 2) == 0) THEN
381 3 : partner = taskid + 1
382 6 : DO i = 1, Nprocs, 2 ! sum the full grid with the partner
383 3 : CALL comm%sendrecv(grid3(:, i + 1), partner, lgrid3, partner, 17)
384 15006 : grid3(:, i) = grid3(:, i) + lgrid3(:)
385 : END DO
386 : ELSE
387 3 : partner = taskid - 1
388 6 : DO i = 1, Nprocs, 2
389 3 : CALL comm%sendrecv(grid3(:, i), partner, lgrid3, partner, 17)
390 15006 : grid3(:, i + 1) = grid3(:, i + 1) + lgrid3(:)
391 : END DO
392 : END IF
393 6 : t4 = MPI_WTIME() - t3 + threshold
394 : ! now send a given buffer from 1 to 3 to 5 .. adding the right part of the data
395 : ! since we've summed an lgrid does only need to pass by even or odd tasks
396 6 : left = MODULO(taskid - 2, Nprocs)
397 6 : right = MODULO(taskid + 2, Nprocs)
398 6 : t3 = MPI_WTIME()
399 30006 : lgrid3 = 0.0E0_dp
400 6 : DO i = 1, Nprocs, 2
401 30006 : lgrid3(:) = lgrid3 + grid3(:, MODULO(taskid - i - 1, Nprocs) + 1)
402 6 : IF (i .EQ. nprocs - 1) EXIT
403 6 : CALL comm%shift(lgrid3, 2)
404 : END DO
405 6 : t5 = MPI_WTIME() - t3 + threshold
406 6 : CALL comm%max(t4)
407 6 : CALL comm%max(t5)
408 30012 : maxdiff = MAXVAL(ABS(lgrid3 - lgrid))
409 6 : CALL comm%max(maxdiff)
410 8 : IF (ionode .AND. output_unit > 0) WRITE (output_unit, *) "INVOLVED SHIFT ", t4 + t5, "(", t4, ",", t5, ")", maxdiff
411 : END DO
412 2 : DEALLOCATE (rcount)
413 2 : DEALLOCATE (scount)
414 2 : DEALLOCATE (sdispl)
415 2 : DEALLOCATE (rdispl)
416 2 : DEALLOCATE (buffer1)
417 2 : DEALLOCATE (buffer2)
418 2 : DEALLOCATE (buffer3)
419 2 : DEALLOCATE (grid)
420 2 : DEALLOCATE (grid2)
421 2 : DEALLOCATE (grid3)
422 2 : DEALLOCATE (lgrid)
423 2 : DEALLOCATE (lgrid2)
424 2 : DEALLOCATE (lgrid3)
425 2 : DEALLOCATE (send_timings)
426 2 : DEALLOCATE (send_timings2)
427 : #else
428 : MARK_USED(comm)
429 : MARK_USED(npow)
430 : IF (output_unit > 0) WRITE (output_unit, *) "No MPI tests for a serial program"
431 : #endif
432 2 : END SUBROUTINE mpi_perf_test
433 :
434 : END MODULE mp_perf_test
|