LCOV - code coverage report
Current view: top level - src/swarm - swarm_mpi.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:2fce0f8) Lines: 121 131 92.4 %
Date: 2024-12-21 06:28:57 Functions: 11 13 84.6 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief Handles the MPI communication of the swarm framework.
      10             : !> \author Ole Schuett
      11             : ! **************************************************************************************************
      12             : MODULE swarm_mpi
      13             :    USE cp_files,                        ONLY: close_file,&
      14             :                                               open_file
      15             :    USE cp_iter_types,                   ONLY: cp_iteration_info_create,&
      16             :                                               cp_iteration_info_release,&
      17             :                                               cp_iteration_info_type
      18             :    USE cp_log_handling,                 ONLY: cp_add_default_logger,&
      19             :                                               cp_get_default_logger,&
      20             :                                               cp_logger_create,&
      21             :                                               cp_logger_release,&
      22             :                                               cp_logger_type,&
      23             :                                               cp_rm_default_logger
      24             :    USE input_section_types,             ONLY: section_vals_type,&
      25             :                                               section_vals_val_set
      26             :    USE kinds,                           ONLY: default_path_length,&
      27             :                                               default_string_length
      28             :    USE machine,                         ONLY: default_output_unit
      29             :    USE message_passing,                 ONLY: mp_any_source,&
      30             :                                               mp_comm_type,&
      31             :                                               mp_para_env_release,&
      32             :                                               mp_para_env_type
      33             :    USE swarm_message,                   ONLY: swarm_message_get,&
      34             :                                               swarm_message_mpi_bcast,&
      35             :                                               swarm_message_mpi_recv,&
      36             :                                               swarm_message_mpi_send,&
      37             :                                               swarm_message_type
      38             : #include "../base/base_uses.f90"
      39             : 
      40             :    IMPLICIT NONE
      41             :    PRIVATE
      42             : 
      43             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'swarm_mpi'
      44             : 
      45             :    PUBLIC :: swarm_mpi_type, swarm_mpi_init, swarm_mpi_finalize
      46             :    PUBLIC :: swarm_mpi_send_report, swarm_mpi_recv_report
      47             :    PUBLIC :: swarm_mpi_send_command, swarm_mpi_recv_command
      48             : 
      49             :    TYPE swarm_mpi_type
      50             :       TYPE(mp_para_env_type), POINTER          :: world => Null()
      51             :       TYPE(mp_para_env_type), POINTER          :: worker => Null()
      52             :       TYPE(mp_para_env_type), POINTER          :: master => Null()
      53             :       INTEGER, DIMENSION(:), ALLOCATABLE       :: wid2group
      54             :       CHARACTER(LEN=default_path_length)       :: master_output_path = ""
      55             :    END TYPE swarm_mpi_type
      56             : 
      57             : CONTAINS
      58             : 
      59             : ! **************************************************************************************************
      60             : !> \brief Initialize MPI communicators for a swarm run.
      61             : !> \param swarm_mpi ...
      62             : !> \param world_para_env ...
      63             : !> \param root_section ...
      64             : !> \param n_workers ...
      65             : !> \param worker_id ...
      66             : !> \param iw ...
      67             : !> \author Ole Schuett
      68             : ! **************************************************************************************************
      69           6 :    SUBROUTINE swarm_mpi_init(swarm_mpi, world_para_env, root_section, n_workers, worker_id, iw)
      70             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
      71             :       TYPE(mp_para_env_type), POINTER                    :: world_para_env
      72             :       TYPE(section_vals_type), POINTER                   :: root_section
      73             :       INTEGER, INTENT(IN)                                :: n_workers
      74             :       INTEGER, INTENT(OUT)                               :: worker_id
      75             :       INTEGER, INTENT(IN)                                :: iw
      76             : 
      77             :       INTEGER                                            :: n_groups_created, pe_per_worker, &
      78             :                                                             subgroup_rank, subgroup_size
      79             :       TYPE(mp_comm_type)                                 :: subgroup
      80             :       LOGICAL                                            :: im_the_master
      81           6 :       INTEGER, DIMENSION(:), POINTER                     :: group_distribution_p
      82             :       INTEGER, DIMENSION(0:world_para_env%num_pe-2), &
      83          12 :          TARGET                                          :: group_distribution
      84             : 
      85             : ! ====== Setup of MPI-Groups ======
      86             : 
      87           6 :       worker_id = -1
      88           6 :       swarm_mpi%world => world_para_env
      89             : 
      90           6 :       IF (MOD(swarm_mpi%world%num_pe - 1, n_workers) /= 0) THEN
      91           0 :          CPABORT("number of processors-1 is not divisible by n_workers.")
      92             :       END IF
      93           6 :       IF (swarm_mpi%world%num_pe < n_workers + 1) THEN
      94           0 :          CPABORT("There are not enough processes for n_workers + 1. Aborting.")
      95             :       END IF
      96             : 
      97           6 :       pe_per_worker = (swarm_mpi%world%num_pe - 1)/n_workers
      98             : 
      99           6 :       IF (iw > 0) THEN
     100           3 :          WRITE (iw, '(A,45X,I8)') " SWARM| Number of mpi ranks", swarm_mpi%world%num_pe
     101           3 :          WRITE (iw, '(A,47X,I8)') " SWARM| Number of workers", n_workers
     102             :       END IF
     103             : 
     104             :       ! the last task becomes the master. Preseves node-alignment of other tasks.
     105           6 :       im_the_master = (swarm_mpi%world%mepos == swarm_mpi%world%num_pe - 1)
     106             : 
     107             :       ! First split split para_env into a master- and a workers-groups...
     108           6 :       IF (im_the_master) THEN
     109           3 :          ALLOCATE (swarm_mpi%master)
     110           3 :          CALL swarm_mpi%master%from_split(swarm_mpi%world, 1)
     111           3 :          subgroup_size = swarm_mpi%master%num_pe
     112           3 :          subgroup_rank = swarm_mpi%master%mepos
     113           3 :          IF (swarm_mpi%master%num_pe /= 1) CPABORT("mp_comm_split_direct failed (master)")
     114             :       ELSE
     115           3 :          CALL subgroup%from_split(swarm_mpi%world, 2)
     116           3 :          subgroup_size = subgroup%num_pe
     117           3 :          subgroup_rank = subgroup%mepos
     118           3 :          IF (subgroup_size /= swarm_mpi%world%num_pe - 1) CPABORT("mp_comm_split_direct failed (worker)")
     119             :       END IF
     120             : 
     121          18 :       ALLOCATE (swarm_mpi%wid2group(n_workers))
     122          12 :       swarm_mpi%wid2group = 0
     123             : 
     124           6 :       IF (.NOT. im_the_master) THEN
     125             :          ! ...then split workers-group into n_workers groups - one for each worker.
     126           3 :          group_distribution_p => group_distribution
     127           3 :          ALLOCATE (swarm_mpi%worker)
     128           3 :          CALL swarm_mpi%worker%from_split(subgroup, n_groups_created, group_distribution_p, n_subgroups=n_workers)
     129           3 :          worker_id = group_distribution(subgroup_rank) + 1 ! shall start by 1
     130           3 :          IF (n_groups_created /= n_workers) CPABORT("mp_comm_split failed.")
     131           3 :          CALL subgroup%free()
     132             : 
     133             :          !WRITE (*,*) "this is worker ", worker_id, swarm_mpi%worker%mepos, swarm_mpi%worker%num_pe
     134             : 
     135             :          ! collect world-ranks of each worker groups rank-0 node
     136           3 :          IF (swarm_mpi%worker%mepos == 0) &
     137           3 :             swarm_mpi%wid2group(worker_id) = swarm_mpi%world%mepos
     138             : 
     139             :       END IF
     140             : 
     141           6 :       CALL swarm_mpi%world%sum(swarm_mpi%wid2group)
     142             :       !WRITE (*,*), "wid2group table: ",swarm_mpi%wid2group
     143             : 
     144           6 :       CALL logger_init_master(swarm_mpi)
     145           6 :       CALL logger_init_worker(swarm_mpi, root_section, worker_id)
     146           6 :    END SUBROUTINE swarm_mpi_init
     147             : 
     148             : ! **************************************************************************************************
     149             : !> \brief Helper routine for swarm_mpi_init, configures the master's logger.
     150             : !> \param swarm_mpi ...
     151             : !> \author Ole Schuett
     152             : ! **************************************************************************************************
     153           6 :    SUBROUTINE logger_init_master(swarm_mpi)
     154             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     155             : 
     156             :       INTEGER                                            :: output_unit
     157             :       TYPE(cp_logger_type), POINTER                      :: logger
     158             : 
     159             : ! broadcast master_output_path to all ranks
     160             : 
     161           6 :       IF (swarm_mpi%world%is_source()) THEN
     162           3 :          logger => cp_get_default_logger()
     163           3 :          output_unit = logger%default_local_unit_nr
     164           3 :          swarm_mpi%master_output_path = output_unit2path(output_unit)
     165           3 :          IF (output_unit /= default_output_unit) &
     166           0 :             CLOSE (output_unit)
     167             :       END IF
     168             : 
     169           6 :       CALL swarm_mpi%world%bcast(swarm_mpi%master_output_path)
     170             : 
     171           6 :       IF (ASSOCIATED(swarm_mpi%master)) &
     172           3 :          CALL error_add_new_logger(swarm_mpi%master, swarm_mpi%master_output_path)
     173           6 :    END SUBROUTINE logger_init_master
     174             : 
     175             : ! **************************************************************************************************
     176             : !> \brief Helper routine for logger_init_master, inquires filename for given unit.
     177             : !> \param output_unit ...
     178             : !> \return ...
     179             : !> \author Ole Schuett
     180             : ! **************************************************************************************************
     181           3 :    FUNCTION output_unit2path(output_unit) RESULT(output_path)
     182             :       INTEGER, INTENT(IN)                                :: output_unit
     183             :       CHARACTER(LEN=default_path_length)                 :: output_path
     184             : 
     185           3 :       output_path = "__STD_OUT__"
     186           3 :       IF (output_unit /= default_output_unit) &
     187           0 :          INQUIRE (unit=output_unit, name=output_path)
     188           3 :    END FUNCTION output_unit2path
     189             : 
     190             : ! **************************************************************************************************
     191             : !> \brief Helper routine for swarm_mpi_init, configures the workers's logger.
     192             : !> \param swarm_mpi ...
     193             : !> \param root_section ...
     194             : !> \param worker_id ...
     195             : !> \author Ole Schuett
     196             : ! **************************************************************************************************
     197           6 :    SUBROUTINE logger_init_worker(swarm_mpi, root_section, worker_id)
     198             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     199             :       TYPE(section_vals_type), POINTER                   :: root_section
     200             :       INTEGER                                            :: worker_id
     201             : 
     202             :       CHARACTER(LEN=default_path_length)                 :: output_path
     203             :       CHARACTER(len=default_string_length)               :: new_project_name, project_name, &
     204             :                                                             worker_name
     205             :       TYPE(cp_iteration_info_type), POINTER              :: new_iter_info
     206             :       TYPE(cp_logger_type), POINTER                      :: old_logger
     207             : 
     208           6 :       NULLIFY (old_logger, new_iter_info)
     209           6 :       IF (ASSOCIATED(swarm_mpi%worker)) THEN
     210           3 :          old_logger => cp_get_default_logger()
     211           3 :          project_name = old_logger%iter_info%project_name
     212           3 :          IF (worker_id > 99999) THEN
     213           0 :             CPABORT("Did not expect so many workers.")
     214             :          END IF
     215           3 :          WRITE (worker_name, "(A,I5.5)") 'WORKER', worker_id
     216           3 :          IF (LEN_TRIM(project_name) + 1 + LEN_TRIM(worker_name) > default_string_length) THEN
     217           0 :             CPABORT("project name too long")
     218             :          END IF
     219           3 :          output_path = TRIM(project_name)//"-"//TRIM(worker_name)//".out"
     220           3 :          new_project_name = TRIM(project_name)//"-"//TRIM(worker_name)
     221           3 :          CALL section_vals_val_set(root_section, "GLOBAL%PROJECT_NAME", c_val=new_project_name)
     222           3 :          CALL cp_iteration_info_create(new_iter_info, new_project_name)
     223           3 :          CALL error_add_new_logger(swarm_mpi%worker, output_path, new_iter_info)
     224           3 :          CALL cp_iteration_info_release(new_iter_info)
     225             :       END IF
     226           6 :    END SUBROUTINE logger_init_worker
     227             : 
     228             : ! **************************************************************************************************
     229             : !> \brief Helper routine for logger_init_master and logger_init_worker
     230             : !> \param para_env ...
     231             : !> \param output_path ...
     232             : !> \param iter_info ...
     233             : !> \author Ole Schuett
     234             : ! **************************************************************************************************
     235           6 :    SUBROUTINE error_add_new_logger(para_env, output_path, iter_info)
     236             :       TYPE(mp_para_env_type), POINTER                    :: para_env
     237             :       CHARACTER(LEN=default_path_length)                 :: output_path
     238             :       TYPE(cp_iteration_info_type), OPTIONAL, POINTER    :: iter_info
     239             : 
     240             :       INTEGER                                            :: output_unit
     241             :       TYPE(cp_logger_type), POINTER                      :: new_logger, old_logger
     242             : 
     243           6 :       NULLIFY (new_logger, old_logger)
     244           6 :       output_unit = -1
     245           6 :       IF (para_env%is_source()) THEN
     246             :          ! open output_unit according to output_path
     247           6 :          output_unit = default_output_unit
     248           6 :          IF (output_path /= "__STD_OUT__") &
     249             :             CALL open_file(file_name=output_path, file_status="UNKNOWN", &
     250           3 :                            file_action="WRITE", file_position="APPEND", unit_number=output_unit)
     251             :       END IF
     252             : 
     253           6 :       old_logger => cp_get_default_logger()
     254             :       CALL cp_logger_create(new_logger, para_env=para_env, &
     255             :                             default_global_unit_nr=output_unit, close_global_unit_on_dealloc=.FALSE., &
     256           6 :                             template_logger=old_logger, iter_info=iter_info)
     257             : 
     258           6 :       CALL cp_add_default_logger(new_logger)
     259           6 :       CALL cp_logger_release(new_logger)
     260           6 :    END SUBROUTINE error_add_new_logger
     261             : 
     262             : ! **************************************************************************************************
     263             : !> \brief Finalizes the MPI communicators of a swarm run.
     264             : !> \param swarm_mpi ...
     265             : !> \param root_section ...
     266             : !> \author Ole Schuett
     267             : ! **************************************************************************************************
     268           6 :    SUBROUTINE swarm_mpi_finalize(swarm_mpi, root_section)
     269             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     270             :       TYPE(section_vals_type), POINTER                   :: root_section
     271             : 
     272           6 :       CALL swarm_mpi%world%sync()
     273           6 :       CALL logger_finalize(swarm_mpi, root_section)
     274             : 
     275           6 :       IF (ASSOCIATED(swarm_mpi%worker)) CALL mp_para_env_release(swarm_mpi%worker)
     276           6 :       IF (ASSOCIATED(swarm_mpi%master)) CALL mp_para_env_release(swarm_mpi%master)
     277           6 :       NULLIFY (swarm_mpi%worker, swarm_mpi%master)
     278           6 :       DEALLOCATE (swarm_mpi%wid2group)
     279           6 :    END SUBROUTINE swarm_mpi_finalize
     280             : 
     281             : ! **************************************************************************************************
     282             : !> \brief Helper routine for swarm_mpi_finalize, restores the original loggers
     283             : !> \param swarm_mpi ...
     284             : !> \param root_section ...
     285             : !> \author Ole Schuett
     286             : ! **************************************************************************************************
     287           6 :    SUBROUTINE logger_finalize(swarm_mpi, root_section)
     288             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     289             :       TYPE(section_vals_type), POINTER                   :: root_section
     290             : 
     291             :       INTEGER                                            :: output_unit
     292             :       TYPE(cp_logger_type), POINTER                      :: logger, old_logger
     293             : 
     294           6 :       NULLIFY (logger, old_logger)
     295           6 :       logger => cp_get_default_logger()
     296           6 :       output_unit = logger%default_local_unit_nr
     297           6 :       IF (output_unit > 0 .AND. output_unit /= default_output_unit) &
     298           0 :          CALL close_file(output_unit)
     299             : 
     300           6 :       CALL cp_rm_default_logger() !pops the top-most logger
     301           6 :       old_logger => cp_get_default_logger()
     302             : 
     303             :       ! restore GLOBAL%PROJECT_NAME
     304             :       CALL section_vals_val_set(root_section, "GLOBAL%PROJECT_NAME", &
     305           6 :                                 c_val=old_logger%iter_info%project_name)
     306             : 
     307           6 :       CALL swarm_mpi%world%sync()
     308             : 
     309             :       ! do this only on master's rank 0
     310           6 :       IF (swarm_mpi%world%is_source() .AND. output_unit /= default_output_unit) THEN
     311           0 :          output_unit = old_logger%default_local_unit_nr
     312             :          OPEN (unit=output_unit, file=swarm_mpi%master_output_path, &
     313           0 :                status="UNKNOWN", action="WRITE", position="APPEND")
     314             :       END IF
     315           6 :    END SUBROUTINE logger_finalize
     316             : 
     317             : ! **************************************************************************************************
     318             : !> \brief Sends a report via MPI
     319             : !> \param swarm_mpi ...
     320             : !> \param report ...
     321             : !> \author Ole Schuett
     322             : ! **************************************************************************************************
     323          51 :    SUBROUTINE swarm_mpi_send_report(swarm_mpi, report)
     324             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     325             :       TYPE(swarm_message_type)                           :: report
     326             : 
     327             :       INTEGER                                            :: dest, tag
     328             : 
     329             : ! Only rank-0 of worker group sends its report
     330             : 
     331          51 :       IF (swarm_mpi%worker%is_source()) THEN
     332          51 :          dest = swarm_mpi%world%num_pe - 1
     333          51 :          tag = 42
     334          51 :          CALL swarm_message_mpi_send(report, group=swarm_mpi%world, dest=dest, tag=tag)
     335             :       END IF
     336             : 
     337          51 :    END SUBROUTINE swarm_mpi_send_report
     338             : 
     339             : ! **************************************************************************************************
     340             : !> \brief Receives a report via MPI
     341             : !> \param swarm_mpi ...
     342             : !> \param report ...
     343             : !> \author Ole Schuett
     344             : ! **************************************************************************************************
     345          51 :    SUBROUTINE swarm_mpi_recv_report(swarm_mpi, report)
     346             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     347             :       TYPE(swarm_message_type), INTENT(OUT)              :: report
     348             : 
     349             :       INTEGER                                            :: src, tag
     350             : 
     351          51 :       tag = 42
     352          51 :       src = mp_any_source
     353             : 
     354          51 :       CALL swarm_message_mpi_recv(report, group=swarm_mpi%world, src=src, tag=tag)
     355             : 
     356          51 :    END SUBROUTINE swarm_mpi_recv_report
     357             : 
     358             : ! **************************************************************************************************
     359             : !> \brief Sends a command via MPI
     360             : !> \param swarm_mpi ...
     361             : !> \param cmd ...
     362             : !> \author Ole Schuett
     363             : ! **************************************************************************************************
     364          51 :    SUBROUTINE swarm_mpi_send_command(swarm_mpi, cmd)
     365             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     366             :       TYPE(swarm_message_type)                           :: cmd
     367             : 
     368             :       INTEGER                                            :: dest, tag, worker_id
     369             : 
     370          51 :       CALL swarm_message_get(cmd, "worker_id", worker_id)
     371          51 :       tag = 42
     372          51 :       dest = swarm_mpi%wid2group(worker_id)
     373             : 
     374          51 :       CALL swarm_message_mpi_send(cmd, group=swarm_mpi%world, dest=dest, tag=tag)
     375             : 
     376          51 :    END SUBROUTINE swarm_mpi_send_command
     377             : 
     378             : ! **************************************************************************************************
     379             : !> \brief Receives a command via MPI and broadcasts it within a worker.
     380             : !> \param swarm_mpi ...
     381             : !> \param cmd ...
     382             : !> \author Ole Schuett
     383             : ! **************************************************************************************************
     384          51 :    SUBROUTINE swarm_mpi_recv_command(swarm_mpi, cmd)
     385             :       TYPE(swarm_mpi_type)                               :: swarm_mpi
     386             :       TYPE(swarm_message_type), INTENT(OUT)              :: cmd
     387             : 
     388             :       INTEGER                                            :: src, tag
     389             : 
     390             : ! This is a two step communication schema.
     391             : ! First: The rank-0 of the worker groups receives the command from the master.
     392             : 
     393          51 :       IF (swarm_mpi%worker%is_source()) THEN
     394          51 :          src = swarm_mpi%world%num_pe - 1 !
     395          51 :          tag = 42
     396          51 :          CALL swarm_message_mpi_recv(cmd, group=swarm_mpi%world, src=src, tag=tag)
     397             : 
     398             :       END IF
     399             : 
     400             : !     ! Second: The command is broadcasted within the worker group.
     401          51 :       CALL swarm_message_mpi_bcast(cmd, src=swarm_mpi%worker%source, group=swarm_mpi%worker)
     402             : 
     403          51 :    END SUBROUTINE swarm_mpi_recv_command
     404             : 
     405           0 : END MODULE swarm_mpi
     406             : 

Generated by: LCOV version 1.15