Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Master's routines for the swarm-framework
10 : !> \author Ole Schuett
11 : ! **************************************************************************************************
12 : MODULE swarm_master
13 : USE cp_external_control, ONLY: external_control
14 : USE cp_log_handling, ONLY: cp_get_default_logger,&
15 : cp_logger_type
16 : USE cp_output_handling, ONLY: cp_print_key_finished_output,&
17 : cp_print_key_unit_nr
18 : USE cp_parser_types, ONLY: cp_parser_type,&
19 : parser_create,&
20 : parser_release
21 : USE glbopt_master, ONLY: glbopt_master_finalize,&
22 : glbopt_master_init,&
23 : glbopt_master_steer,&
24 : glbopt_master_type
25 : USE global_types, ONLY: global_environment_type
26 : USE input_constants, ONLY: swarm_do_glbopt
27 : USE input_section_types, ONLY: section_vals_get_subs_vals,&
28 : section_vals_type,&
29 : section_vals_val_get
30 : USE kinds, ONLY: default_path_length,&
31 : default_string_length
32 : USE message_passing, ONLY: mp_para_env_type
33 : USE swarm_message, ONLY: swarm_message_add,&
34 : swarm_message_equal,&
35 : swarm_message_file_read,&
36 : swarm_message_file_write,&
37 : swarm_message_free,&
38 : swarm_message_get,&
39 : swarm_message_type
40 : #include "../base/base_uses.f90"
41 :
42 : IMPLICIT NONE
43 : PRIVATE
44 :
45 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'swarm_master'
46 :
47 : PUBLIC :: swarm_master_type
48 : PUBLIC :: swarm_master_init, swarm_master_finalize
49 : PUBLIC :: swarm_master_steer
50 :
51 : TYPE swarm_message_p_type
52 : TYPE(swarm_message_type), POINTER :: p => Null()
53 : END TYPE swarm_message_p_type
54 :
55 : TYPE swarm_master_type
56 : PRIVATE
57 : INTEGER :: behavior = -1
58 : TYPE(glbopt_master_type), POINTER :: glbopt => Null()
59 : !possibly more behaviors ...
60 : INTEGER :: iw = 0
61 : INTEGER :: i_iteration = 0
62 : INTEGER :: max_iter = 0
63 : LOGICAL :: should_stop = .FALSE.
64 : INTEGER :: n_workers = -1
65 : INTEGER :: comlog_unit = -1
66 : TYPE(section_vals_type), POINTER :: swarm_section => Null()
67 : TYPE(mp_para_env_type), POINTER :: para_env => Null()
68 : TYPE(swarm_message_p_type), DIMENSION(:), POINTER :: queued_commands => Null()
69 : TYPE(global_environment_type), POINTER :: globenv => Null()
70 : LOGICAL :: ignore_last_iteration = .FALSE.
71 : INTEGER :: n_waiting = 0
72 : END TYPE swarm_master_type
73 :
74 : CONTAINS
75 :
76 : ! **************************************************************************************************
77 : !> \brief Initializes the swarm master
78 : !> \param master ...
79 : !> \param para_env ...
80 : !> \param globenv ...
81 : !> \param root_section ...
82 : !> \param n_workers ...
83 : !> \author Ole Schuett
84 : ! **************************************************************************************************
85 3 : SUBROUTINE swarm_master_init(master, para_env, globenv, root_section, n_workers)
86 : TYPE(swarm_master_type) :: master
87 : TYPE(mp_para_env_type), POINTER :: para_env
88 : TYPE(global_environment_type), POINTER :: globenv
89 : TYPE(section_vals_type), POINTER :: root_section
90 : INTEGER, INTENT(IN) :: n_workers
91 :
92 : TYPE(cp_logger_type), POINTER :: logger
93 :
94 3 : master%swarm_section => section_vals_get_subs_vals(root_section, "SWARM")
95 :
96 3 : logger => cp_get_default_logger()
97 3 : master%n_workers = n_workers
98 3 : master%para_env => para_env
99 3 : master%globenv => globenv
100 12 : ALLOCATE (master%queued_commands(master%n_workers))
101 : master%iw = cp_print_key_unit_nr(logger, master%swarm_section, &
102 3 : "PRINT%MASTER_RUN_INFO", extension=".masterLog")
103 :
104 3 : CALL section_vals_val_get(master%swarm_section, "BEHAVIOR", i_val=master%behavior)
105 :
106 : ! uses logger%iter_info%project_name to construct filename
107 : master%comlog_unit = cp_print_key_unit_nr(logger, master%swarm_section, "PRINT%COMMUNICATION_LOG", &
108 : !middle_name="comlog", extension=".xyz", &
109 : extension=".comlog", &
110 3 : file_action="WRITE", file_position="REWIND")
111 :
112 3 : CALL section_vals_val_get(master%swarm_section, "MAX_ITER", i_val=master%max_iter)
113 :
114 6 : SELECT CASE (master%behavior)
115 : CASE (swarm_do_glbopt)
116 3 : ALLOCATE (master%glbopt)
117 3 : CALL glbopt_master_init(master%glbopt, para_env, root_section, n_workers, master%iw)
118 : CASE DEFAULT
119 3 : CPABORT("got unknown behavior")
120 : END SELECT
121 :
122 3 : CALL replay_comlog(master)
123 3 : END SUBROUTINE swarm_master_init
124 :
125 : ! **************************************************************************************************
126 : !> \brief Helper routine for swarm_master_init, restarts a calculation
127 : !> \param master ...
128 : !> \author Ole Schuett
129 : ! **************************************************************************************************
130 3 : SUBROUTINE replay_comlog(master)
131 : TYPE(swarm_master_type) :: master
132 :
133 : CHARACTER(LEN=default_path_length) :: filename
134 : CHARACTER(LEN=default_string_length) :: command_log
135 : INTEGER :: handle, i, worker_id
136 : LOGICAL :: at_end, explicit
137 : TYPE(cp_parser_type) :: parser
138 : TYPE(swarm_message_type) :: cmd_log, report_log
139 : TYPE(swarm_message_type), &
140 9 : DIMENSION(master%n_workers) :: last_commands
141 : TYPE(swarm_message_type), POINTER :: cmd_now
142 :
143 : ! Initialize parser for trajectory
144 : CALL section_vals_val_get(master%swarm_section, "REPLAY_COMMUNICATION_LOG", &
145 3 : c_val=filename, explicit=explicit)
146 :
147 3 : IF (.NOT. explicit) RETURN
148 1 : IF (master%iw > 0) WRITE (master%iw, '(A,A)') &
149 1 : " SWARM| Starting replay of communication-log: ", TRIM(filename)
150 :
151 1 : CALL timeset("swarm_master_replay_comlog", handle)
152 1 : CALL parser_create(parser, filename, para_env=master%para_env)
153 :
154 1 : at_end = .FALSE.
155 21 : DO
156 6 : CALL swarm_message_file_read(report_log, parser, at_end)
157 6 : IF (at_end) EXIT
158 :
159 5 : CALL swarm_message_file_read(cmd_log, parser, at_end)
160 5 : IF (at_end) EXIT
161 :
162 5 : ALLOCATE (cmd_now)
163 5 : CALL swarm_master_steer(master, report_log, cmd_now)
164 :
165 : !TODO: maybe we should just exit the loop instead of stopping?
166 5 : CALL swarm_message_get(cmd_log, "command", command_log)
167 5 : IF (TRIM(command_log) /= "shutdown") THEN
168 4 : IF (.NOT. commands_equal(cmd_now, cmd_log, master%iw)) CPABORT("wrong behaviour")
169 : END IF
170 :
171 5 : CALL swarm_message_free(cmd_log)
172 5 : CALL swarm_message_free(report_log)
173 5 : CALL swarm_message_get(cmd_now, "worker_id", worker_id)
174 5 : CALL swarm_message_free(last_commands(worker_id))
175 5 : last_commands(worker_id) = cmd_now
176 5 : DEALLOCATE (cmd_now)
177 : END DO
178 :
179 1 : CALL swarm_message_free(report_log) !don't worry about double-frees
180 1 : CALL swarm_message_free(cmd_log)
181 :
182 1 : IF (master%iw > 0) WRITE (master%iw, '(A,A)') &
183 1 : " SWARM| Reached end of communication log. Queueing last commands."
184 :
185 2 : DO i = 1, master%n_workers
186 1 : ALLOCATE (master%queued_commands(i)%p)
187 2 : master%queued_commands(i)%p = last_commands(i)
188 : END DO
189 :
190 1 : CALL parser_release(parser)
191 1 : CALL timestop(handle)
192 9 : END SUBROUTINE replay_comlog
193 :
194 : ! **************************************************************************************************
195 : !> \brief Helper routine for replay_comlog, compares two commands
196 : !> \param cmd1 ...
197 : !> \param cmd2 ...
198 : !> \param iw ...
199 : !> \return ...
200 : !> \author Ole Schuett
201 : ! **************************************************************************************************
202 4 : FUNCTION commands_equal(cmd1, cmd2, iw) RESULT(res)
203 : TYPE(swarm_message_type) :: cmd1, cmd2
204 : INTEGER :: iw
205 : LOGICAL :: res
206 :
207 4 : res = swarm_message_equal(cmd1, cmd2)
208 4 : IF (.NOT. res .AND. iw > 0) THEN
209 0 : WRITE (iw, *) "Command 1:"
210 0 : CALL swarm_message_file_write(cmd1, iw)
211 0 : WRITE (iw, *) "Command 2:"
212 0 : CALL swarm_message_file_write(cmd2, iw)
213 : END IF
214 4 : END FUNCTION commands_equal
215 :
216 : ! **************************************************************************************************
217 : !> \brief Central steering routine of the swarm master
218 : !> \param master ...
219 : !> \param report ...
220 : !> \param cmd ...
221 : !> \author Ole Schuett
222 : ! **************************************************************************************************
223 112 : SUBROUTINE swarm_master_steer(master, report, cmd)
224 : TYPE(swarm_master_type), INTENT(INOUT) :: master
225 : TYPE(swarm_message_type), INTENT(IN) :: report
226 : TYPE(swarm_message_type), INTENT(OUT) :: cmd
227 :
228 : CHARACTER(len=default_string_length) :: command, status
229 : INTEGER :: handle, worker_id
230 : LOGICAL :: should_stop
231 :
232 56 : should_stop = .FALSE.
233 :
234 56 : CALL timeset("swarm_master_steer", handle)
235 :
236 : ! First check if there are queued commands for this worker
237 56 : CALL swarm_message_get(report, "worker_id", worker_id)
238 :
239 56 : IF (ASSOCIATED(master%queued_commands(worker_id)%p)) THEN
240 1 : cmd = master%queued_commands(worker_id)%p
241 1 : DEALLOCATE (master%queued_commands(worker_id)%p)
242 2 : IF (master%iw > 0) WRITE (master%iw, '(A,A,A,I9,1X,A)') ' SWARM| ', &
243 1 : REPEAT("*", 9), " Sending out queued command to worker: ", &
244 2 : worker_id, REPEAT("*", 9)
245 1 : CALL timestop(handle)
246 1 : RETURN
247 : END IF
248 :
249 55 : IF (.NOT. master%ignore_last_iteration) THEN
250 : ! There are no queued commands. Do the normal processing.
251 55 : master%i_iteration = master%i_iteration + 1
252 :
253 110 : IF (master%iw > 0) WRITE (master%iw, '(A,A,1X,I8,A,A)') ' SWARM| ', REPEAT("*", 15), &
254 110 : master%i_iteration, ' Master / Worker Communication ', REPEAT("*", 15)
255 : END IF
256 :
257 55 : IF (master%i_iteration >= master%max_iter .AND. .NOT. master%should_stop) THEN
258 1 : IF (master%iw > 0) WRITE (master%iw, '(A)') " SWARM| Reached MAX_ITER. Quitting."
259 1 : master%should_stop = .TRUE.
260 : END IF
261 :
262 55 : IF (.NOT. master%should_stop) THEN
263 54 : CALL external_control(master%should_stop, "SWARM", master%globenv)
264 54 : IF (master%should_stop .AND. master%iw > 0) &
265 0 : WRITE (master%iw, *) " SWARM| Received stop from external_control. Quitting."
266 : END IF
267 :
268 : !IF(unit > 0) &
269 :
270 55 : IF (master%should_stop) THEN
271 1 : CALL swarm_message_add(cmd, "command", "shutdown")
272 1 : IF (master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
273 1 : "SWARM| Sending shutdown command to worker", worker_id
274 : ELSE
275 108 : SELECT CASE (master%behavior)
276 : CASE (swarm_do_glbopt)
277 54 : CALL glbopt_master_steer(master%glbopt, report, cmd, should_stop)
278 : CASE DEFAULT
279 54 : CPABORT("got unknown behavior")
280 : END SELECT
281 :
282 54 : IF (should_stop) THEN
283 2 : CALL swarm_message_free(cmd)
284 2 : CALL swarm_message_add(cmd, "command", "shutdown") !overwrite command
285 2 : IF (master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
286 2 : "SWARM| Sending shutdown command to worker", worker_id
287 2 : master%should_stop = .TRUE.
288 : END IF
289 : END IF
290 :
291 55 : CALL swarm_message_add(cmd, "worker_id", worker_id)
292 :
293 : ! Don't pollute comlog with "continue waiting"-commands.
294 55 : CALL swarm_message_get(report, "status", status)
295 55 : CALL swarm_message_get(cmd, "command", command)
296 55 : IF (TRIM(status) == "wait_done") master%n_waiting = master%n_waiting - 1
297 55 : IF (TRIM(command) == "wait") master%n_waiting = master%n_waiting + 1
298 55 : IF (master%n_waiting < 0) CPABORT("master%n_waiting < 0")
299 55 : IF (TRIM(status) /= "wait_done" .OR. TRIM(command) /= "wait") THEN
300 55 : CALL swarm_message_file_write(report, master%comlog_unit)
301 55 : CALL swarm_message_file_write(cmd, master%comlog_unit)
302 55 : IF (master%n_waiting > 0 .AND. master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
303 0 : "SWARM| Number of waiting workers:", master%n_waiting
304 55 : master%ignore_last_iteration = .FALSE.
305 : ELSE
306 0 : master%ignore_last_iteration = .TRUE.
307 : END IF
308 55 : CALL timestop(handle)
309 : END SUBROUTINE swarm_master_steer
310 :
311 : ! **************************************************************************************************
312 : !> \brief Finalizes the swarm master
313 : !> \param master ...
314 : !> \author Ole Schuett
315 : ! **************************************************************************************************
316 3 : SUBROUTINE swarm_master_finalize(master)
317 : TYPE(swarm_master_type) :: master
318 :
319 : TYPE(cp_logger_type), POINTER :: logger
320 :
321 3 : IF (master%iw > 0) THEN
322 3 : WRITE (master%iw, "(1X,A,T71,I10)") "SWARM| Total number of iterations ", master%i_iteration
323 3 : WRITE (master%iw, "(A)") " SWARM| Shutting down the master."
324 : END IF
325 :
326 6 : SELECT CASE (master%behavior)
327 : CASE (swarm_do_glbopt)
328 3 : CALL glbopt_master_finalize(master%glbopt)
329 3 : DEALLOCATE (master%glbopt)
330 : CASE DEFAULT
331 3 : CPABORT("got unknown behavior")
332 : END SELECT
333 :
334 3 : DEALLOCATE (master%queued_commands)
335 :
336 3 : logger => cp_get_default_logger()
337 : CALL cp_print_key_finished_output(master%iw, logger, &
338 3 : master%swarm_section, "PRINT%MASTER_RUN_INFO")
339 : CALL cp_print_key_finished_output(master%comlog_unit, logger, &
340 3 : master%swarm_section, "PRINT%COMMUNICATION_LOG")
341 :
342 : !CALL rm_timer_env() !pops the top-most timer
343 3 : END SUBROUTINE swarm_master_finalize
344 :
345 0 : END MODULE swarm_master
346 :
|