LCOV - code coverage report
Current view: top level - src/swarm - swarm_master.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:2fce0f8) Lines: 116 124 93.5 %
Date: 2024-12-21 06:28:57 Functions: 5 7 71.4 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief Master's routines for the swarm-framework
      10             : !> \author Ole Schuett
      11             : ! **************************************************************************************************
      12             : MODULE swarm_master
      13             :    USE cp_external_control,             ONLY: external_control
      14             :    USE cp_log_handling,                 ONLY: cp_get_default_logger,&
      15             :                                               cp_logger_type
      16             :    USE cp_output_handling,              ONLY: cp_print_key_finished_output,&
      17             :                                               cp_print_key_unit_nr
      18             :    USE cp_parser_types,                 ONLY: cp_parser_type,&
      19             :                                               parser_create,&
      20             :                                               parser_release
      21             :    USE glbopt_master,                   ONLY: glbopt_master_finalize,&
      22             :                                               glbopt_master_init,&
      23             :                                               glbopt_master_steer,&
      24             :                                               glbopt_master_type
      25             :    USE global_types,                    ONLY: global_environment_type
      26             :    USE input_constants,                 ONLY: swarm_do_glbopt
      27             :    USE input_section_types,             ONLY: section_vals_get_subs_vals,&
      28             :                                               section_vals_type,&
      29             :                                               section_vals_val_get
      30             :    USE kinds,                           ONLY: default_path_length,&
      31             :                                               default_string_length
      32             :    USE message_passing,                 ONLY: mp_para_env_type
      33             :    USE swarm_message,                   ONLY: swarm_message_add,&
      34             :                                               swarm_message_equal,&
      35             :                                               swarm_message_file_read,&
      36             :                                               swarm_message_file_write,&
      37             :                                               swarm_message_free,&
      38             :                                               swarm_message_get,&
      39             :                                               swarm_message_type
      40             : #include "../base/base_uses.f90"
      41             : 
      42             :    IMPLICIT NONE
      43             :    PRIVATE
      44             : 
      45             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'swarm_master'
      46             : 
      47             :    PUBLIC :: swarm_master_type
      48             :    PUBLIC :: swarm_master_init, swarm_master_finalize
      49             :    PUBLIC :: swarm_master_steer
      50             : 
      51             :    TYPE swarm_message_p_type
      52             :       TYPE(swarm_message_type), POINTER                   :: p => Null()
      53             :    END TYPE swarm_message_p_type
      54             : 
      55             :    TYPE swarm_master_type
      56             :       PRIVATE
      57             :       INTEGER                                             :: behavior = -1
      58             :       TYPE(glbopt_master_type), POINTER                   :: glbopt => Null()
      59             :       !possibly more behaviors ...
      60             :       INTEGER                                             :: iw = 0
      61             :       INTEGER                                             :: i_iteration = 0
      62             :       INTEGER                                             :: max_iter = 0
      63             :       LOGICAL                                             :: should_stop = .FALSE.
      64             :       INTEGER                                             :: n_workers = -1
      65             :       INTEGER                                             :: comlog_unit = -1
      66             :       TYPE(section_vals_type), POINTER                    :: swarm_section => Null()
      67             :       TYPE(mp_para_env_type), POINTER                     :: para_env => Null()
      68             :       TYPE(swarm_message_p_type), DIMENSION(:), POINTER   :: queued_commands => Null()
      69             :       TYPE(global_environment_type), POINTER              :: globenv => Null()
      70             :       LOGICAL                                             :: ignore_last_iteration = .FALSE.
      71             :       INTEGER                                             :: n_waiting = 0
      72             :    END TYPE swarm_master_type
      73             : 
      74             : CONTAINS
      75             : 
      76             : ! **************************************************************************************************
      77             : !> \brief Initializes the swarm master
      78             : !> \param master ...
      79             : !> \param para_env ...
      80             : !> \param globenv ...
      81             : !> \param root_section ...
      82             : !> \param n_workers ...
      83             : !> \author Ole Schuett
      84             : ! **************************************************************************************************
      85           3 :    SUBROUTINE swarm_master_init(master, para_env, globenv, root_section, n_workers)
      86             :       TYPE(swarm_master_type)                            :: master
      87             :       TYPE(mp_para_env_type), POINTER                    :: para_env
      88             :       TYPE(global_environment_type), POINTER             :: globenv
      89             :       TYPE(section_vals_type), POINTER                   :: root_section
      90             :       INTEGER, INTENT(IN)                                :: n_workers
      91             : 
      92             :       TYPE(cp_logger_type), POINTER                      :: logger
      93             : 
      94           3 :       master%swarm_section => section_vals_get_subs_vals(root_section, "SWARM")
      95             : 
      96           3 :       logger => cp_get_default_logger()
      97           3 :       master%n_workers = n_workers
      98           3 :       master%para_env => para_env
      99           3 :       master%globenv => globenv
     100          12 :       ALLOCATE (master%queued_commands(master%n_workers))
     101             :       master%iw = cp_print_key_unit_nr(logger, master%swarm_section, &
     102           3 :                                        "PRINT%MASTER_RUN_INFO", extension=".masterLog")
     103             : 
     104           3 :       CALL section_vals_val_get(master%swarm_section, "BEHAVIOR", i_val=master%behavior)
     105             : 
     106             :       ! uses logger%iter_info%project_name to construct filename
     107             :       master%comlog_unit = cp_print_key_unit_nr(logger, master%swarm_section, "PRINT%COMMUNICATION_LOG", &
     108             :                                                 !middle_name="comlog", extension=".xyz", &
     109             :                                                 extension=".comlog", &
     110           3 :                                                 file_action="WRITE", file_position="REWIND")
     111             : 
     112           3 :       CALL section_vals_val_get(master%swarm_section, "MAX_ITER", i_val=master%max_iter)
     113             : 
     114           6 :       SELECT CASE (master%behavior)
     115             :       CASE (swarm_do_glbopt)
     116           3 :          ALLOCATE (master%glbopt)
     117           3 :          CALL glbopt_master_init(master%glbopt, para_env, root_section, n_workers, master%iw)
     118             :       CASE DEFAULT
     119           3 :          CPABORT("got unknown behavior")
     120             :       END SELECT
     121             : 
     122           3 :       CALL replay_comlog(master)
     123           3 :    END SUBROUTINE swarm_master_init
     124             : 
     125             : ! **************************************************************************************************
     126             : !> \brief Helper routine for swarm_master_init, restarts a calculation
     127             : !> \param master ...
     128             : !> \author Ole Schuett
     129             : ! **************************************************************************************************
     130           3 :    SUBROUTINE replay_comlog(master)
     131             :       TYPE(swarm_master_type)                            :: master
     132             : 
     133             :       CHARACTER(LEN=default_path_length)                 :: filename
     134             :       CHARACTER(LEN=default_string_length)               :: command_log
     135             :       INTEGER                                            :: handle, i, worker_id
     136             :       LOGICAL                                            :: at_end, explicit
     137             :       TYPE(cp_parser_type)                               :: parser
     138             :       TYPE(swarm_message_type)                           :: cmd_log, report_log
     139             :       TYPE(swarm_message_type), &
     140           9 :          DIMENSION(master%n_workers)                     :: last_commands
     141             :       TYPE(swarm_message_type), POINTER                  :: cmd_now
     142             : 
     143             :       ! Initialize parser for trajectory
     144             :       CALL section_vals_val_get(master%swarm_section, "REPLAY_COMMUNICATION_LOG", &
     145           3 :                                 c_val=filename, explicit=explicit)
     146             : 
     147           3 :       IF (.NOT. explicit) RETURN
     148           1 :       IF (master%iw > 0) WRITE (master%iw, '(A,A)') &
     149           1 :          " SWARM| Starting replay of communication-log: ", TRIM(filename)
     150             : 
     151           1 :       CALL timeset("swarm_master_replay_comlog", handle)
     152           1 :       CALL parser_create(parser, filename, para_env=master%para_env)
     153             : 
     154           1 :       at_end = .FALSE.
     155          21 :       DO
     156           6 :          CALL swarm_message_file_read(report_log, parser, at_end)
     157           6 :          IF (at_end) EXIT
     158             : 
     159           5 :          CALL swarm_message_file_read(cmd_log, parser, at_end)
     160           5 :          IF (at_end) EXIT
     161             : 
     162           5 :          ALLOCATE (cmd_now)
     163           5 :          CALL swarm_master_steer(master, report_log, cmd_now)
     164             : 
     165             :          !TODO: maybe we should just exit the loop instead of stopping?
     166           5 :          CALL swarm_message_get(cmd_log, "command", command_log)
     167           5 :          IF (TRIM(command_log) /= "shutdown") THEN
     168           4 :             IF (.NOT. commands_equal(cmd_now, cmd_log, master%iw)) CPABORT("wrong behaviour")
     169             :          END IF
     170             : 
     171           5 :          CALL swarm_message_free(cmd_log)
     172           5 :          CALL swarm_message_free(report_log)
     173           5 :          CALL swarm_message_get(cmd_now, "worker_id", worker_id)
     174           5 :          CALL swarm_message_free(last_commands(worker_id))
     175           5 :          last_commands(worker_id) = cmd_now
     176           5 :          DEALLOCATE (cmd_now)
     177             :       END DO
     178             : 
     179           1 :       CALL swarm_message_free(report_log) !don't worry about double-frees
     180           1 :       CALL swarm_message_free(cmd_log)
     181             : 
     182           1 :       IF (master%iw > 0) WRITE (master%iw, '(A,A)') &
     183           1 :          " SWARM| Reached end of communication log. Queueing last commands."
     184             : 
     185           2 :       DO i = 1, master%n_workers
     186           1 :          ALLOCATE (master%queued_commands(i)%p)
     187           2 :          master%queued_commands(i)%p = last_commands(i)
     188             :       END DO
     189             : 
     190           1 :       CALL parser_release(parser)
     191           1 :       CALL timestop(handle)
     192           9 :    END SUBROUTINE replay_comlog
     193             : 
     194             : ! **************************************************************************************************
     195             : !> \brief Helper routine for replay_comlog, compares two commands
     196             : !> \param cmd1 ...
     197             : !> \param cmd2 ...
     198             : !> \param iw ...
     199             : !> \return ...
     200             : !> \author Ole Schuett
     201             : ! **************************************************************************************************
     202           4 :    FUNCTION commands_equal(cmd1, cmd2, iw) RESULT(res)
     203             :       TYPE(swarm_message_type)                           :: cmd1, cmd2
     204             :       INTEGER                                            :: iw
     205             :       LOGICAL                                            :: res
     206             : 
     207           4 :       res = swarm_message_equal(cmd1, cmd2)
     208           4 :       IF (.NOT. res .AND. iw > 0) THEN
     209           0 :          WRITE (iw, *) "Command 1:"
     210           0 :          CALL swarm_message_file_write(cmd1, iw)
     211           0 :          WRITE (iw, *) "Command 2:"
     212           0 :          CALL swarm_message_file_write(cmd2, iw)
     213             :       END IF
     214           4 :    END FUNCTION commands_equal
     215             : 
     216             : ! **************************************************************************************************
     217             : !> \brief Central steering routine of the swarm master
     218             : !> \param master ...
     219             : !> \param report ...
     220             : !> \param cmd ...
     221             : !> \author Ole Schuett
     222             : ! **************************************************************************************************
     223         112 :    SUBROUTINE swarm_master_steer(master, report, cmd)
     224             :       TYPE(swarm_master_type), INTENT(INOUT)             :: master
     225             :       TYPE(swarm_message_type), INTENT(IN)               :: report
     226             :       TYPE(swarm_message_type), INTENT(OUT)              :: cmd
     227             : 
     228             :       CHARACTER(len=default_string_length)               :: command, status
     229             :       INTEGER                                            :: handle, worker_id
     230             :       LOGICAL                                            :: should_stop
     231             : 
     232          56 :       should_stop = .FALSE.
     233             : 
     234          56 :       CALL timeset("swarm_master_steer", handle)
     235             : 
     236             :       ! First check if there are queued commands for this worker
     237          56 :       CALL swarm_message_get(report, "worker_id", worker_id)
     238             : 
     239          56 :       IF (ASSOCIATED(master%queued_commands(worker_id)%p)) THEN
     240           1 :          cmd = master%queued_commands(worker_id)%p
     241           1 :          DEALLOCATE (master%queued_commands(worker_id)%p)
     242           2 :          IF (master%iw > 0) WRITE (master%iw, '(A,A,A,I9,1X,A)') ' SWARM| ', &
     243           1 :             REPEAT("*", 9), " Sending out queued command to worker: ", &
     244           2 :             worker_id, REPEAT("*", 9)
     245           1 :          CALL timestop(handle)
     246           1 :          RETURN
     247             :       END IF
     248             : 
     249          55 :       IF (.NOT. master%ignore_last_iteration) THEN
     250             :          ! There are no queued commands. Do the normal processing.
     251          55 :          master%i_iteration = master%i_iteration + 1
     252             : 
     253         110 :          IF (master%iw > 0) WRITE (master%iw, '(A,A,1X,I8,A,A)') ' SWARM| ', REPEAT("*", 15), &
     254         110 :             master%i_iteration, ' Master / Worker Communication  ', REPEAT("*", 15)
     255             :       END IF
     256             : 
     257          55 :       IF (master%i_iteration >= master%max_iter .AND. .NOT. master%should_stop) THEN
     258           1 :          IF (master%iw > 0) WRITE (master%iw, '(A)') " SWARM| Reached MAX_ITER. Quitting."
     259           1 :          master%should_stop = .TRUE.
     260             :       END IF
     261             : 
     262          55 :       IF (.NOT. master%should_stop) THEN
     263          54 :          CALL external_control(master%should_stop, "SWARM", master%globenv)
     264          54 :          IF (master%should_stop .AND. master%iw > 0) &
     265           0 :             WRITE (master%iw, *) " SWARM| Received stop from external_control. Quitting."
     266             :       END IF
     267             : 
     268             :       !IF(unit > 0) &
     269             : 
     270          55 :       IF (master%should_stop) THEN
     271           1 :          CALL swarm_message_add(cmd, "command", "shutdown")
     272           1 :          IF (master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
     273           1 :             "SWARM| Sending shutdown command to worker", worker_id
     274             :       ELSE
     275         108 :          SELECT CASE (master%behavior)
     276             :          CASE (swarm_do_glbopt)
     277          54 :             CALL glbopt_master_steer(master%glbopt, report, cmd, should_stop)
     278             :          CASE DEFAULT
     279          54 :             CPABORT("got unknown behavior")
     280             :          END SELECT
     281             : 
     282          54 :          IF (should_stop) THEN
     283           2 :             CALL swarm_message_free(cmd)
     284           2 :             CALL swarm_message_add(cmd, "command", "shutdown") !overwrite command
     285           2 :             IF (master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
     286           2 :                "SWARM| Sending shutdown command to worker", worker_id
     287           2 :             master%should_stop = .TRUE.
     288             :          END IF
     289             :       END IF
     290             : 
     291          55 :       CALL swarm_message_add(cmd, "worker_id", worker_id)
     292             : 
     293             :       ! Don't pollute comlog with "continue waiting"-commands.
     294          55 :       CALL swarm_message_get(report, "status", status)
     295          55 :       CALL swarm_message_get(cmd, "command", command)
     296          55 :       IF (TRIM(status) == "wait_done") master%n_waiting = master%n_waiting - 1
     297          55 :       IF (TRIM(command) == "wait") master%n_waiting = master%n_waiting + 1
     298          55 :       IF (master%n_waiting < 0) CPABORT("master%n_waiting < 0")
     299          55 :       IF (TRIM(status) /= "wait_done" .OR. TRIM(command) /= "wait") THEN
     300          55 :          CALL swarm_message_file_write(report, master%comlog_unit)
     301          55 :          CALL swarm_message_file_write(cmd, master%comlog_unit)
     302          55 :          IF (master%n_waiting > 0 .AND. master%iw > 0) WRITE (master%iw, '(1X,A,T71,I10)') &
     303           0 :             "SWARM| Number of waiting workers:", master%n_waiting
     304          55 :          master%ignore_last_iteration = .FALSE.
     305             :       ELSE
     306           0 :          master%ignore_last_iteration = .TRUE.
     307             :       END IF
     308          55 :       CALL timestop(handle)
     309             :    END SUBROUTINE swarm_master_steer
     310             : 
     311             : ! **************************************************************************************************
     312             : !> \brief Finalizes the swarm master
     313             : !> \param master ...
     314             : !> \author Ole Schuett
     315             : ! **************************************************************************************************
     316           3 :    SUBROUTINE swarm_master_finalize(master)
     317             :       TYPE(swarm_master_type)                            :: master
     318             : 
     319             :       TYPE(cp_logger_type), POINTER                      :: logger
     320             : 
     321           3 :       IF (master%iw > 0) THEN
     322           3 :          WRITE (master%iw, "(1X,A,T71,I10)") "SWARM| Total number of iterations ", master%i_iteration
     323           3 :          WRITE (master%iw, "(A)") " SWARM| Shutting down the master."
     324             :       END IF
     325             : 
     326           6 :       SELECT CASE (master%behavior)
     327             :       CASE (swarm_do_glbopt)
     328           3 :          CALL glbopt_master_finalize(master%glbopt)
     329           3 :          DEALLOCATE (master%glbopt)
     330             :       CASE DEFAULT
     331           3 :          CPABORT("got unknown behavior")
     332             :       END SELECT
     333             : 
     334           3 :       DEALLOCATE (master%queued_commands)
     335             : 
     336           3 :       logger => cp_get_default_logger()
     337             :       CALL cp_print_key_finished_output(master%iw, logger, &
     338           3 :                                         master%swarm_section, "PRINT%MASTER_RUN_INFO")
     339             :       CALL cp_print_key_finished_output(master%comlog_unit, logger, &
     340           3 :                                         master%swarm_section, "PRINT%COMMUNICATION_LOG")
     341             : 
     342             :       !CALL rm_timer_env() !pops the top-most timer
     343           3 :    END SUBROUTINE swarm_master_finalize
     344             : 
     345           0 : END MODULE swarm_master
     346             : 

Generated by: LCOV version 1.15