LCOV - code coverage report
Current view: top level - src - task_list_methods.F (source / functions) Hit Total Coverage
Test: CP2K Regtests (git:4dc10b3) Lines: 921 1079 85.4 %
Date: 2024-11-21 06:45:46 Functions: 34 35 97.1 %

          Line data    Source code
       1             : !--------------------------------------------------------------------------------------------------!
       2             : !   CP2K: A general program to perform molecular dynamics simulations                              !
       3             : !   Copyright 2000-2024 CP2K developers group <https://cp2k.org>                                   !
       4             : !                                                                                                  !
       5             : !   SPDX-License-Identifier: GPL-2.0-or-later                                                      !
       6             : !--------------------------------------------------------------------------------------------------!
       7             : 
       8             : ! **************************************************************************************************
       9             : !> \brief generate the tasks lists used by collocate and integrate routines
      10             : !> \par History
      11             : !>      01.2008 [Joost VandeVondele] refactered out of qs_collocate / qs_integrate
      12             : !> \author Joost VandeVondele
      13             : ! **************************************************************************************************
      14             : MODULE task_list_methods
      15             :    USE offload_api, ONLY: offload_create_buffer, offload_buffer_type
      16             :    USE grid_api, ONLY: grid_create_basis_set, grid_create_task_list
      17             :    USE ao_util, ONLY: exp_radius_very_extended
      18             :    USE basis_set_types, ONLY: get_gto_basis_set, &
      19             :                               gto_basis_set_p_type, &
      20             :                               gto_basis_set_type
      21             :    USE cell_types, ONLY: cell_type, &
      22             :                          pbc
      23             :    USE cp_control_types, ONLY: dft_control_type
      24             :    USE cube_utils, ONLY: compute_cube_center, &
      25             :                          cube_info_type, &
      26             :                          return_cube, &
      27             :                          return_cube_nonortho
      28             :    USE cp_dbcsr_api, ONLY: dbcsr_convert_sizes_to_offsets, &
      29             :                            dbcsr_finalize, &
      30             :                            dbcsr_get_block_p, &
      31             :                            dbcsr_get_info, &
      32             :                            dbcsr_p_type, &
      33             :                            dbcsr_put_block, &
      34             :                            dbcsr_type, &
      35             :                            dbcsr_work_create
      36             :    USE gaussian_gridlevels, ONLY: gaussian_gridlevel, &
      37             :                                   gridlevel_info_type
      38             :    USE kinds, ONLY: default_string_length, &
      39             :                     dp, &
      40             :                     int_8
      41             :    USE kpoint_types, ONLY: get_kpoint_info, &
      42             :                            kpoint_type
      43             :    USE memory_utilities, ONLY: reallocate
      44             :    USE message_passing, ONLY: &
      45             :       mp_comm_type
      46             :    USE particle_types, ONLY: particle_type
      47             :    USE particle_methods, ONLY: get_particle_set
      48             :    USE pw_env_types, ONLY: pw_env_get, &
      49             :                            pw_env_type
      50             :    USE qs_kind_types, ONLY: get_qs_kind, &
      51             :                             qs_kind_type
      52             :    USE qs_ks_types, ONLY: get_ks_env, &
      53             :                           qs_ks_env_type
      54             :    USE qs_neighbor_list_types, ONLY: neighbor_list_set_p_type
      55             :    USE realspace_grid_types, ONLY: realspace_grid_desc_p_type, &
      56             :                                    realspace_grid_desc_type, &
      57             :                                    rs_grid_create, &
      58             :                                    rs_grid_locate_rank, &
      59             :                                    rs_grid_release, &
      60             :                                    rs_grid_reorder_ranks, realspace_grid_type
      61             :    USE task_list_types, ONLY: deserialize_task, &
      62             :                               reallocate_tasks, &
      63             :                               serialize_task, &
      64             :                               task_list_type, &
      65             :                               atom_pair_type, &
      66             :                               task_size_in_int8, &
      67             :                               task_type
      68             :    USE util, ONLY: sort
      69             : 
      70             : !$ USE OMP_LIB, ONLY: omp_destroy_lock, omp_get_num_threads, omp_init_lock, &
      71             : !$                    omp_lock_kind, omp_set_lock, omp_unset_lock, omp_get_max_threads
      72             : #include "./base/base_uses.f90"
      73             : 
      74             :    #:include './common/array_sort.fypp'
      75             : 
      76             :    IMPLICIT NONE
      77             : 
      78             :    LOGICAL, PRIVATE, PARAMETER :: debug_this_module = .FALSE.
      79             : 
      80             :    PRIVATE
      81             : 
      82             :    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'task_list_methods'
      83             : 
      84             :    PUBLIC :: generate_qs_task_list, &
      85             :              task_list_inner_loop
      86             :    PUBLIC :: distribute_tasks, &
      87             :              rs_distribute_matrix, &
      88             :              rs_scatter_matrices, &
      89             :              rs_gather_matrices, &
      90             :              rs_copy_to_buffer, &
      91             :              rs_copy_to_matrices
      92             : 
      93             : CONTAINS
      94             : 
      95             : ! **************************************************************************************************
      96             : !> \brief ...
      97             : !> \param ks_env ...
      98             : !> \param task_list ...
      99             : !> \param reorder_rs_grid_ranks Flag that indicates if this routine should
     100             : !>        or should not overwrite the rs descriptor (see comment below)
     101             : !> \param skip_load_balance_distributed ...
     102             : !> \param soft_valid ...
     103             : !> \param basis_type ...
     104             : !> \param pw_env_external ...
     105             : !> \param sab_orb_external ...
     106             : !> \par History
     107             : !>      01.2008 factored out of calculate_rho_elec [Joost VandeVondele]
     108             : !>      04.2010 divides tasks into grid levels and atom pairs for integrate/collocate [Iain Bethune]
     109             : !>              (c) The Numerical Algorithms Group (NAG) Ltd, 2010 on behalf of the HECToR project
     110             : !>      06.2015 adjusted to be used with multiple images (k-points) [JGH]
     111             : !> \note  If this routine is called several times with different task lists,
     112             : !>        the default behaviour is to re-optimize the grid ranks and overwrite
     113             : !>        the rs descriptor and grids. reorder_rs_grid_ranks = .FALSE. prevents the code
     114             : !>        of performing a new optimization by leaving the rank order in
     115             : !>        its current state.
     116             : ! **************************************************************************************************
     117       13410 :    SUBROUTINE generate_qs_task_list(ks_env, task_list, &
     118             :                                     reorder_rs_grid_ranks, skip_load_balance_distributed, &
     119             :                                     soft_valid, basis_type, pw_env_external, sab_orb_external)
     120             : 
     121             :       TYPE(qs_ks_env_type), POINTER                      :: ks_env
     122             :       TYPE(task_list_type), POINTER                      :: task_list
     123             :       LOGICAL, INTENT(IN)                                :: reorder_rs_grid_ranks, &
     124             :                                                             skip_load_balance_distributed
     125             :       LOGICAL, INTENT(IN), OPTIONAL                      :: soft_valid
     126             :       CHARACTER(LEN=*), INTENT(IN), OPTIONAL             :: basis_type
     127             :       TYPE(pw_env_type), OPTIONAL, POINTER               :: pw_env_external
     128             :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
     129             :          OPTIONAL, POINTER                               :: sab_orb_external
     130             : 
     131             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'generate_qs_task_list'
     132             :       INTEGER, PARAMETER                                 :: max_tasks = 2000
     133             : 
     134             :       CHARACTER(LEN=default_string_length)               :: my_basis_type
     135             :       INTEGER :: cindex, curr_tasks, handle, i, iatom, iatom_old, igrid_level, igrid_level_old, &
     136             :                  ikind, ilevel, img, img_old, ipair, ipgf, iset, itask, jatom, jatom_old, jkind, jpgf, &
     137             :                  jset, maxpgf, maxset, natoms, nimages, nkind, nseta, nsetb, slot
     138       13410 :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: blocks
     139             :       INTEGER, DIMENSION(3)                              :: cellind
     140       13410 :       INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
     141       13410 :                                                             npgfb, nsgf
     142       13410 :       INTEGER, DIMENSION(:, :, :), POINTER               :: cell_to_index
     143             :       LOGICAL                                            :: dokp, my_soft
     144             :       REAL(KIND=dp)                                      :: kind_radius_a, kind_radius_b
     145             :       REAL(KIND=dp), DIMENSION(3)                        :: ra, rab
     146       13410 :       REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
     147       13410 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
     148             :       TYPE(cell_type), POINTER                           :: cell
     149       13410 :       TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
     150             :       TYPE(dft_control_type), POINTER                    :: dft_control
     151             :       TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
     152       13410 :       TYPE(gto_basis_set_p_type), DIMENSION(:), POINTER  :: basis_set_list
     153             :       TYPE(gto_basis_set_type), POINTER                  :: basis_set_a, basis_set_b, orb_basis_set
     154             :       TYPE(kpoint_type), POINTER                         :: kpoints
     155             :       TYPE(neighbor_list_set_p_type), DIMENSION(:), &
     156       13410 :          POINTER                                         :: sab_orb
     157       13410 :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     158             :       TYPE(pw_env_type), POINTER                         :: pw_env
     159       13410 :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     160             :       TYPE(qs_kind_type), POINTER                        :: qs_kind
     161             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     162       13410 :          POINTER                                         :: rs_descs
     163       13410 :       TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
     164       13410 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     165             : 
     166       13410 :       CALL timeset(routineN, handle)
     167             : 
     168             :       CALL get_ks_env(ks_env, &
     169             :                       qs_kind_set=qs_kind_set, &
     170             :                       cell=cell, &
     171             :                       particle_set=particle_set, &
     172       13410 :                       dft_control=dft_control)
     173             : 
     174             :       ! OPTION 1) basis is set through input
     175             :       ! OPTION 2) soft orb basis is requested
     176             :       ! OPTION 3) by default, the full orb basis density is calculated
     177       13410 :       my_soft = .FALSE.
     178       13410 :       IF (PRESENT(soft_valid)) my_soft = soft_valid
     179       13410 :       IF (PRESENT(basis_type)) THEN
     180        1576 :          CPASSERT(.NOT. my_soft)
     181        1576 :          my_basis_type = basis_type
     182       11834 :       ELSEIF (my_soft) THEN
     183        1596 :          my_basis_type = "ORB_SOFT"
     184             :       ELSE
     185       10238 :          my_basis_type = "ORB"
     186             :       END IF
     187             : 
     188       13410 :       CALL get_ks_env(ks_env, sab_orb=sab_orb)
     189       13410 :       IF (PRESENT(sab_orb_external)) sab_orb => sab_orb_external
     190             : 
     191       13410 :       CALL get_ks_env(ks_env, pw_env=pw_env)
     192       13410 :       IF (PRESENT(pw_env_external)) pw_env => pw_env_external
     193       13410 :       CALL pw_env_get(pw_env, rs_descs=rs_descs, rs_grids=rs_grids)
     194             : 
     195             :       ! *** assign from pw_env
     196       13410 :       gridlevel_info => pw_env%gridlevel_info
     197       13410 :       cube_info => pw_env%cube_info
     198             : 
     199             :       ! find maximum numbers
     200       13410 :       nkind = SIZE(qs_kind_set)
     201       13410 :       natoms = SIZE(particle_set)
     202       13410 :       maxset = 0
     203       13410 :       maxpgf = 0
     204       37467 :       DO ikind = 1, nkind
     205       24057 :          qs_kind => qs_kind_set(ikind)
     206             :          CALL get_qs_kind(qs_kind=qs_kind, &
     207       24057 :                           basis_set=orb_basis_set, basis_type=my_basis_type)
     208             : 
     209       24057 :          IF (.NOT. ASSOCIATED(orb_basis_set)) CYCLE
     210       24057 :          CALL get_gto_basis_set(gto_basis_set=orb_basis_set, npgf=npgfa, nset=nseta)
     211             : 
     212       24057 :          maxset = MAX(nseta, maxset)
     213       87694 :          maxpgf = MAX(MAXVAL(npgfa), maxpgf)
     214             :       END DO
     215             : 
     216             :       ! kpoint related
     217       13410 :       nimages = dft_control%nimages
     218       13410 :       IF (nimages > 1) THEN
     219         258 :          dokp = .TRUE.
     220         258 :          NULLIFY (kpoints)
     221         258 :          CALL get_ks_env(ks_env=ks_env, kpoints=kpoints)
     222         258 :          CALL get_kpoint_info(kpoint=kpoints, cell_to_index=cell_to_index)
     223             :       ELSE
     224       13152 :          dokp = .FALSE.
     225       13152 :          NULLIFY (cell_to_index)
     226             :       END IF
     227             : 
     228             :       ! free the atom_pair lists if allocated
     229       13410 :       IF (ASSOCIATED(task_list%atom_pair_send)) DEALLOCATE (task_list%atom_pair_send)
     230       13410 :       IF (ASSOCIATED(task_list%atom_pair_recv)) DEALLOCATE (task_list%atom_pair_recv)
     231             : 
     232             :       ! construct a list of all tasks
     233       13410 :       IF (.NOT. ASSOCIATED(task_list%tasks)) THEN
     234        8016 :          CALL reallocate_tasks(task_list%tasks, max_tasks)
     235             :       END IF
     236       13410 :       task_list%ntasks = 0
     237       13410 :       curr_tasks = SIZE(task_list%tasks)
     238             : 
     239       64287 :       ALLOCATE (basis_set_list(nkind))
     240       37467 :       DO ikind = 1, nkind
     241       24057 :          qs_kind => qs_kind_set(ikind)
     242             :          CALL get_qs_kind(qs_kind=qs_kind, basis_set=basis_set_a, &
     243       24057 :                           basis_type=my_basis_type)
     244       37467 :          IF (ASSOCIATED(basis_set_a)) THEN
     245       24057 :             basis_set_list(ikind)%gto_basis_set => basis_set_a
     246             :          ELSE
     247           0 :             NULLIFY (basis_set_list(ikind)%gto_basis_set)
     248             :          END IF
     249             :       END DO
     250             : !!$OMP PARALLEL DEFAULT(NONE) &
     251             : !!$OMP SHARED (sab_orb, dokp, basis_set_list, task_list, rs_descs, dft_control, cube_info, gridlevel_info,  &
     252             : !!$OMP         curr_tasks, maxpgf, maxset, natoms, nimages, particle_set, cell_to_index, cell) &
     253             : !!$OMP PRIVATE (ikind, jkind, iatom, jatom, rab, cellind, basis_set_a, basis_set_b, ra, &
     254             : !!$OMP          la_max, la_min, npgfa, nseta, rpgfa, set_radius_a, kind_radius_a, zeta, &
     255             : !!$OMP          lb_max, lb_min, npgfb, nsetb, rpgfb, set_radius_b, kind_radius_b, zetb, &
     256             : !!$OMP          cindex, slot)
     257             :       ! Loop over neighbor list
     258             : !!$OMP DO SCHEDULE(GUIDED)
     259     1351369 :       DO slot = 1, sab_orb(1)%nl_size
     260     1337959 :          ikind = sab_orb(1)%nlist_task(slot)%ikind
     261     1337959 :          jkind = sab_orb(1)%nlist_task(slot)%jkind
     262     1337959 :          iatom = sab_orb(1)%nlist_task(slot)%iatom
     263     1337959 :          jatom = sab_orb(1)%nlist_task(slot)%jatom
     264     5351836 :          rab(1:3) = sab_orb(1)%nlist_task(slot)%r(1:3)
     265     5351836 :          cellind(1:3) = sab_orb(1)%nlist_task(slot)%cell(1:3)
     266             : 
     267     1337959 :          basis_set_a => basis_set_list(ikind)%gto_basis_set
     268     1337959 :          IF (.NOT. ASSOCIATED(basis_set_a)) CYCLE
     269     1337959 :          basis_set_b => basis_set_list(jkind)%gto_basis_set
     270     1337959 :          IF (.NOT. ASSOCIATED(basis_set_b)) CYCLE
     271     1337959 :          ra(:) = pbc(particle_set(iatom)%r, cell)
     272             :          ! basis ikind
     273     1337959 :          la_max => basis_set_a%lmax
     274     1337959 :          la_min => basis_set_a%lmin
     275     1337959 :          npgfa => basis_set_a%npgf
     276     1337959 :          nseta = basis_set_a%nset
     277     1337959 :          rpgfa => basis_set_a%pgf_radius
     278     1337959 :          set_radius_a => basis_set_a%set_radius
     279     1337959 :          kind_radius_a = basis_set_a%kind_radius
     280     1337959 :          zeta => basis_set_a%zet
     281             :          ! basis jkind
     282     1337959 :          lb_max => basis_set_b%lmax
     283     1337959 :          lb_min => basis_set_b%lmin
     284     1337959 :          npgfb => basis_set_b%npgf
     285     1337959 :          nsetb = basis_set_b%nset
     286     1337959 :          rpgfb => basis_set_b%pgf_radius
     287     1337959 :          set_radius_b => basis_set_b%set_radius
     288     1337959 :          kind_radius_b = basis_set_b%kind_radius
     289     1337959 :          zetb => basis_set_b%zet
     290             : 
     291     1337959 :          IF (dokp) THEN
     292      141472 :             cindex = cell_to_index(cellind(1), cellind(2), cellind(3))
     293             :          ELSE
     294     1196487 :             cindex = 1
     295             :          END IF
     296             : 
     297             :          CALL task_list_inner_loop(task_list%tasks, task_list%ntasks, curr_tasks, &
     298             :                                    rs_descs, dft_control, cube_info, gridlevel_info, cindex, &
     299             :                                    iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, &
     300             :                                    set_radius_a, set_radius_b, ra, rab, &
     301     1351369 :                                    la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
     302             : 
     303             :       END DO
     304             : !!$OMP END PARALLEL
     305             : 
     306             :       ! redistribute the task list so that all tasks map on the local rs grids
     307             :       CALL distribute_tasks( &
     308             :          rs_descs=rs_descs, ntasks=task_list%ntasks, natoms=natoms, &
     309             :          tasks=task_list%tasks, atom_pair_send=task_list%atom_pair_send, &
     310             :          atom_pair_recv=task_list%atom_pair_recv, symmetric=.TRUE., &
     311             :          reorder_rs_grid_ranks=reorder_rs_grid_ranks, &
     312       13410 :          skip_load_balance_distributed=skip_load_balance_distributed)
     313             : 
     314             :       ! compute offsets for rs_scatter_matrix / rs_copy_matrix
     315       40230 :       ALLOCATE (nsgf(natoms))
     316       13410 :       CALL get_particle_set(particle_set, qs_kind_set, basis=basis_set_list, nsgf=nsgf)
     317       13410 :       IF (ASSOCIATED(task_list%atom_pair_send)) THEN
     318             :          ! only needed when there is a distributed grid
     319             :          CALL rs_calc_offsets(pairs=task_list%atom_pair_send, &
     320             :                               nsgf=nsgf, &
     321             :                               group_size=rs_descs(1)%rs_desc%group_size, &
     322             :                               pair_offsets=task_list%pair_offsets_send, &
     323             :                               rank_offsets=task_list%rank_offsets_send, &
     324             :                               rank_sizes=task_list%rank_sizes_send, &
     325          48 :                               buffer_size=task_list%buffer_size_send)
     326             :       END IF
     327             :       CALL rs_calc_offsets(pairs=task_list%atom_pair_recv, &
     328             :                            nsgf=nsgf, &
     329             :                            group_size=rs_descs(1)%rs_desc%group_size, &
     330             :                            pair_offsets=task_list%pair_offsets_recv, &
     331             :                            rank_offsets=task_list%rank_offsets_recv, &
     332             :                            rank_sizes=task_list%rank_sizes_recv, &
     333       13410 :                            buffer_size=task_list%buffer_size_recv)
     334       13410 :       DEALLOCATE (basis_set_list, nsgf)
     335             : 
     336             :       ! If the rank order has changed, reallocate any of the distributed rs_grids
     337       13410 :       IF (reorder_rs_grid_ranks) THEN
     338       57906 :          DO i = 1, gridlevel_info%ngrid_levels
     339       57906 :             IF (rs_descs(i)%rs_desc%distributed) THEN
     340          54 :                CALL rs_grid_release(rs_grids(i))
     341          54 :                CALL rs_grid_create(rs_grids(i), rs_descs(i)%rs_desc)
     342             :             END IF
     343             :          END DO
     344             :       END IF
     345             : 
     346             :       CALL create_grid_task_list(task_list=task_list, &
     347             :                                  qs_kind_set=qs_kind_set, &
     348             :                                  particle_set=particle_set, &
     349             :                                  cell=cell, &
     350             :                                  basis_type=my_basis_type, &
     351       13410 :                                  rs_grids=rs_grids)
     352             : 
     353             :       ! Now we have the final list of tasks, setup the task_list with the
     354             :       ! data needed for the loops in integrate_v/calculate_rho
     355             : 
     356       13410 :       IF (ASSOCIATED(task_list%taskstart)) THEN
     357        5394 :          DEALLOCATE (task_list%taskstart)
     358             :       END IF
     359       13410 :       IF (ASSOCIATED(task_list%taskstop)) THEN
     360        5394 :          DEALLOCATE (task_list%taskstop)
     361             :       END IF
     362       13410 :       IF (ASSOCIATED(task_list%npairs)) THEN
     363        5394 :          DEALLOCATE (task_list%npairs)
     364             :       END IF
     365             : 
     366             :       ! First, count the number of unique atom pairs per grid level
     367             : 
     368       40230 :       ALLOCATE (task_list%npairs(SIZE(rs_descs)))
     369             : 
     370       13410 :       iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
     371       13410 :       ipair = 0
     372       66598 :       task_list%npairs = 0
     373             : 
     374     7674648 :       DO i = 1, task_list%ntasks
     375     7661238 :          igrid_level = task_list%tasks(i)%grid_level
     376     7661238 :          img = task_list%tasks(i)%image
     377     7661238 :          iatom = task_list%tasks(i)%iatom
     378     7661238 :          jatom = task_list%tasks(i)%jatom
     379     7661238 :          iset = task_list%tasks(i)%iset
     380     7661238 :          jset = task_list%tasks(i)%jset
     381     7661238 :          ipgf = task_list%tasks(i)%ipgf
     382     7661238 :          jpgf = task_list%tasks(i)%jpgf
     383     7674648 :          IF (igrid_level .NE. igrid_level_old) THEN
     384       36364 :             IF (igrid_level_old .NE. -1) THEN
     385       23181 :                task_list%npairs(igrid_level_old) = ipair
     386             :             END IF
     387             :             ipair = 1
     388             :             igrid_level_old = igrid_level
     389             :             iatom_old = iatom
     390             :             jatom_old = jatom
     391             :             img_old = img
     392     7624874 :          ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN
     393      486494 :             ipair = ipair + 1
     394      486494 :             iatom_old = iatom
     395      486494 :             jatom_old = jatom
     396      486494 :             img_old = img
     397             :          END IF
     398             :       END DO
     399             :       ! Take care of the last iteration
     400       13410 :       IF (task_list%ntasks /= 0) THEN
     401       13183 :          task_list%npairs(igrid_level) = ipair
     402             :       END IF
     403             : 
     404             :       ! Second, for each atom pair, find the indices in the task list
     405             :       ! of the first and last task
     406             : 
     407             :       ! Array sized for worst case
     408      106601 :       ALLOCATE (task_list%taskstart(MAXVAL(task_list%npairs), SIZE(rs_descs)))
     409      106601 :       ALLOCATE (task_list%taskstop(MAXVAL(task_list%npairs), SIZE(rs_descs)))
     410             : 
     411       13410 :       iatom_old = -1; jatom_old = -1; igrid_level_old = -1; img_old = -1
     412       13410 :       ipair = 0
     413     1096835 :       task_list%taskstart = 0
     414     1096835 :       task_list%taskstop = 0
     415             : 
     416     7674648 :       DO i = 1, task_list%ntasks
     417     7661238 :          igrid_level = task_list%tasks(i)%grid_level
     418     7661238 :          img = task_list%tasks(i)%image
     419     7661238 :          iatom = task_list%tasks(i)%iatom
     420     7661238 :          jatom = task_list%tasks(i)%jatom
     421     7661238 :          iset = task_list%tasks(i)%iset
     422     7661238 :          jset = task_list%tasks(i)%jset
     423     7661238 :          ipgf = task_list%tasks(i)%ipgf
     424     7661238 :          jpgf = task_list%tasks(i)%jpgf
     425     7674648 :          IF (igrid_level .NE. igrid_level_old) THEN
     426       36364 :             IF (igrid_level_old .NE. -1) THEN
     427       23181 :                task_list%taskstop(ipair, igrid_level_old) = i - 1
     428             :             END IF
     429       36364 :             ipair = 1
     430       36364 :             task_list%taskstart(ipair, igrid_level) = i
     431       36364 :             igrid_level_old = igrid_level
     432       36364 :             iatom_old = iatom
     433       36364 :             jatom_old = jatom
     434       36364 :             img_old = img
     435     7624874 :          ELSE IF (iatom .NE. iatom_old .OR. jatom .NE. jatom_old .OR. img .NE. img_old) THEN
     436      486494 :             ipair = ipair + 1
     437      486494 :             task_list%taskstart(ipair, igrid_level) = i
     438      486494 :             task_list%taskstop(ipair - 1, igrid_level) = i - 1
     439      486494 :             iatom_old = iatom
     440      486494 :             jatom_old = jatom
     441      486494 :             img_old = img
     442             :          END IF
     443             :       END DO
     444             :       ! Take care of the last iteration
     445       13410 :       IF (task_list%ntasks /= 0) THEN
     446       13183 :          task_list%taskstop(ipair, igrid_level) = task_list%ntasks
     447             :       END IF
     448             : 
     449             :       ! Debug task destribution
     450             :       IF (debug_this_module) THEN
     451             :          tasks => task_list%tasks
     452             :          WRITE (6, *)
     453             :          WRITE (6, *) "Total number of tasks              ", task_list%ntasks
     454             :          DO igrid_level = 1, gridlevel_info%ngrid_levels
     455             :             WRITE (6, *) "Total number of pairs(grid_level)  ", &
     456             :                igrid_level, task_list%npairs(igrid_level)
     457             :          END DO
     458             :          WRITE (6, *)
     459             : 
     460             :          DO igrid_level = 1, gridlevel_info%ngrid_levels
     461             : 
     462             :             ALLOCATE (blocks(natoms, natoms, nimages))
     463             :             blocks = -1
     464             :             DO ipair = 1, task_list%npairs(igrid_level)
     465             :                itask = task_list%taskstart(ipair, igrid_level)
     466             :                ilevel = task_list%tasks(itask)%grid_level
     467             :                img = task_list%tasks(itask)%image
     468             :                iatom = task_list%tasks(itask)%iatom
     469             :                jatom = task_list%tasks(itask)%jatom
     470             :                iset = task_list%tasks(itask)%iset
     471             :                jset = task_list%tasks(itask)%jset
     472             :                ipgf = task_list%tasks(itask)%ipgf
     473             :                jpgf = task_list%tasks(itask)%jpgf
     474             :                IF (blocks(iatom, jatom, img) == -1 .AND. blocks(jatom, iatom, img) == -1) THEN
     475             :                   blocks(iatom, jatom, img) = 1
     476             :                   blocks(jatom, iatom, img) = 1
     477             :                ELSE
     478             :                   WRITE (6, *) "TASK LIST CONFLICT IN PAIR       ", ipair
     479             :                   WRITE (6, *) "Reuse of iatom, jatom, image     ", iatom, jatom, img
     480             :                END IF
     481             : 
     482             :                iatom_old = iatom
     483             :                jatom_old = jatom
     484             :                img_old = img
     485             :                DO itask = task_list%taskstart(ipair, igrid_level), task_list%taskstop(ipair, igrid_level)
     486             :                   ilevel = task_list%tasks(itask)%grid_level
     487             :                   img = task_list%tasks(itask)%image
     488             :                   iatom = task_list%tasks(itask)%iatom
     489             :                   jatom = task_list%tasks(itask)%jatom
     490             :                   iset = task_list%tasks(itask)%iset
     491             :                   jset = task_list%tasks(itask)%jset
     492             :                   ipgf = task_list%tasks(itask)%ipgf
     493             :                   jpgf = task_list%tasks(itask)%jpgf
     494             :                   IF (iatom /= iatom_old .OR. jatom /= jatom_old .OR. img /= img_old) THEN
     495             :                      WRITE (6, *) "TASK LIST CONFLICT IN TASK       ", itask
     496             :                      WRITE (6, *) "Inconsistent iatom, jatom, image ", iatom, jatom, img
     497             :                      WRITE (6, *) "Should be    iatom, jatom, image ", iatom_old, jatom_old, img_old
     498             :                   END IF
     499             : 
     500             :                END DO
     501             :             END DO
     502             :             DEALLOCATE (blocks)
     503             : 
     504             :          END DO
     505             : 
     506             :       END IF
     507             : 
     508       13410 :       CALL timestop(handle)
     509             : 
     510       13410 :    END SUBROUTINE generate_qs_task_list
     511             : 
     512             : ! **************************************************************************************************
     513             : !> \brief Sends the task list data to the grid API.
     514             : !> \author Ole Schuett
     515             : ! **************************************************************************************************
     516       13410 :    SUBROUTINE create_grid_task_list(task_list, qs_kind_set, particle_set, cell, basis_type, rs_grids)
     517             :       TYPE(task_list_type), POINTER                      :: task_list
     518             :       TYPE(qs_kind_type), DIMENSION(:), POINTER          :: qs_kind_set
     519             :       TYPE(particle_type), DIMENSION(:), POINTER         :: particle_set
     520             :       TYPE(cell_type), POINTER                           :: cell
     521             :       CHARACTER(LEN=default_string_length)               :: basis_type
     522             :       TYPE(realspace_grid_type), DIMENSION(:), POINTER :: rs_grids
     523             : 
     524             :       TYPE(gto_basis_set_type), POINTER                  :: orb_basis_set
     525             :       INTEGER                                            :: nset, natoms, nkinds, ntasks, &
     526             :                                                             ikind, iatom, itask, nsgf
     527       13410 :       INTEGER, DIMENSION(:), ALLOCATABLE                 :: atom_kinds, level_list, iatom_list, jatom_list, &
     528       13410 :                                                             iset_list, jset_list, ipgf_list, jpgf_list, &
     529       13410 :                                                             border_mask_list, block_num_list
     530       13410 :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: radius_list
     531       13410 :       REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE        :: rab_list, atom_positions
     532       13410 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     533       13410 :       INTEGER, DIMENSION(:, :), POINTER                  :: first_sgf
     534       13410 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: sphi, zet
     535       13410 :       INTEGER, DIMENSION(:), POINTER                     :: lmax, lmin, npgf, nsgf_set
     536             : 
     537       13410 :       nkinds = SIZE(qs_kind_set)
     538       13410 :       natoms = SIZE(particle_set)
     539       13410 :       ntasks = task_list%ntasks
     540       13410 :       tasks => task_list%tasks
     541             : 
     542       13410 :       IF (.NOT. ASSOCIATED(task_list%grid_basis_sets)) THEN
     543             :          ! Basis sets do not change during simulation - only need to create them once.
     544       38383 :          ALLOCATE (task_list%grid_basis_sets(nkinds))
     545       22351 :          DO ikind = 1, nkinds
     546       14335 :             CALL get_qs_kind(qs_kind_set(ikind), basis_type=basis_type, basis_set=orb_basis_set)
     547             :             CALL get_gto_basis_set(gto_basis_set=orb_basis_set, &
     548             :                                    nset=nset, &
     549             :                                    nsgf=nsgf, &
     550             :                                    nsgf_set=nsgf_set, &
     551             :                                    npgf=npgf, &
     552             :                                    first_sgf=first_sgf, &
     553             :                                    lmax=lmax, &
     554             :                                    lmin=lmin, &
     555             :                                    sphi=sphi, &
     556       14335 :                                    zet=zet)
     557             :             CALL grid_create_basis_set(nset=nset, &
     558             :                                        nsgf=nsgf, &
     559             :                                        maxco=SIZE(sphi, 1), &
     560             :                                        maxpgf=SIZE(zet, 1), &
     561             :                                        lmin=lmin, &
     562             :                                        lmax=lmax, &
     563             :                                        npgf=npgf, &
     564             :                                        nsgf_set=nsgf_set, &
     565             :                                        first_sgf=first_sgf, &
     566             :                                        sphi=sphi, &
     567             :                                        zet=zet, &
     568       22351 :                                        basis_set=task_list%grid_basis_sets(ikind))
     569             :          END DO
     570             :       END IF
     571             : 
     572             :       ! Pack task list infos
     573       67050 :       ALLOCATE (atom_kinds(natoms), atom_positions(3, natoms))
     574       63099 :       DO iatom = 1, natoms
     575       49689 :          atom_kinds(iatom) = particle_set(iatom)%atomic_kind%kind_number
     576       63099 :          atom_positions(:, iatom) = pbc(particle_set(iatom)%r, cell)
     577             :       END DO
     578             : 
     579       66369 :       ALLOCATE (level_list(ntasks), iatom_list(ntasks), jatom_list(ntasks))
     580       66142 :       ALLOCATE (iset_list(ntasks), jset_list(ntasks), ipgf_list(ntasks), jpgf_list(ntasks))
     581       39776 :       ALLOCATE (border_mask_list(ntasks), block_num_list(ntasks))
     582       66596 :       ALLOCATE (radius_list(ntasks), rab_list(3, ntasks))
     583             : 
     584     7674648 :       DO itask = 1, ntasks
     585     7661238 :          level_list(itask) = tasks(itask)%grid_level
     586     7661238 :          iatom_list(itask) = tasks(itask)%iatom
     587     7661238 :          jatom_list(itask) = tasks(itask)%jatom
     588     7661238 :          iset_list(itask) = tasks(itask)%iset
     589     7661238 :          jset_list(itask) = tasks(itask)%jset
     590     7661238 :          ipgf_list(itask) = tasks(itask)%ipgf
     591     7661238 :          jpgf_list(itask) = tasks(itask)%jpgf
     592     7661238 :          IF (tasks(itask)%dist_type == 2) THEN
     593           0 :             border_mask_list(itask) = IAND(63, NOT(tasks(itask)%subpatch_pattern))  ! invert last 6 bits
     594             :          ELSE
     595     7661238 :             border_mask_list(itask) = 0 ! no masking
     596             :          END IF
     597     7661238 :          block_num_list(itask) = tasks(itask)%pair_index  ! change of nomenclature pair_index -> block_num
     598     7661238 :          radius_list(itask) = tasks(itask)%radius
     599    30658362 :          rab_list(:, itask) = tasks(itask)%rab(:)
     600             :       END DO
     601             : 
     602             :       CALL grid_create_task_list(ntasks=ntasks, &
     603             :                                  natoms=natoms, &
     604             :                                  nkinds=nkinds, &
     605             :                                  nblocks=SIZE(task_list%pair_offsets_recv), &
     606             :                                  block_offsets=task_list%pair_offsets_recv, &
     607             :                                  atom_positions=atom_positions, &
     608             :                                  atom_kinds=atom_kinds, &
     609             :                                  basis_sets=task_list%grid_basis_sets, &
     610             :                                  level_list=level_list, &
     611             :                                  iatom_list=iatom_list, &
     612             :                                  jatom_list=jatom_list, &
     613             :                                  iset_list=iset_list, &
     614             :                                  jset_list=jset_list, &
     615             :                                  ipgf_list=ipgf_list, &
     616             :                                  jpgf_list=jpgf_list, &
     617             :                                  border_mask_list=border_mask_list, &
     618             :                                  block_num_list=block_num_list, &
     619             :                                  radius_list=radius_list, &
     620             :                                  rab_list=rab_list, &
     621             :                                  rs_grids=rs_grids, &
     622       13410 :                                  task_list=task_list%grid_task_list)
     623             : 
     624       13410 :       CALL offload_create_buffer(task_list%buffer_size_recv, task_list%pab_buffer)
     625       13410 :       CALL offload_create_buffer(task_list%buffer_size_recv, task_list%hab_buffer)
     626             : 
     627       26820 :    END SUBROUTINE create_grid_task_list
     628             : 
     629             : ! **************************************************************************************************
     630             : !> \brief ...
     631             : !> \param tasks ...
     632             : !> \param ntasks ...
     633             : !> \param curr_tasks ...
     634             : !> \param rs_descs ...
     635             : !> \param dft_control ...
     636             : !> \param cube_info ...
     637             : !> \param gridlevel_info ...
     638             : !> \param cindex ...
     639             : !> \param iatom ...
     640             : !> \param jatom ...
     641             : !> \param rpgfa ...
     642             : !> \param rpgfb ...
     643             : !> \param zeta ...
     644             : !> \param zetb ...
     645             : !> \param kind_radius_b ...
     646             : !> \param set_radius_a ...
     647             : !> \param set_radius_b ...
     648             : !> \param ra ...
     649             : !> \param rab ...
     650             : !> \param la_max ...
     651             : !> \param la_min ...
     652             : !> \param lb_max ...
     653             : !> \param lb_min ...
     654             : !> \param npgfa ...
     655             : !> \param npgfb ...
     656             : !> \param nseta ...
     657             : !> \param nsetb ...
     658             : !> \par History
     659             : !>      Joost VandeVondele: 10.2008 refactored
     660             : ! **************************************************************************************************
     661     1349449 :    SUBROUTINE task_list_inner_loop(tasks, ntasks, curr_tasks, rs_descs, dft_control, &
     662             :                                    cube_info, gridlevel_info, cindex, &
     663             :                                    iatom, jatom, rpgfa, rpgfb, zeta, zetb, kind_radius_b, set_radius_a, set_radius_b, ra, rab, &
     664             :                                    la_max, la_min, lb_max, lb_min, npgfa, npgfb, nseta, nsetb)
     665             : 
     666             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     667             :       INTEGER                                            :: ntasks, curr_tasks
     668             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     669             :          POINTER                                         :: rs_descs
     670             :       TYPE(dft_control_type), POINTER                    :: dft_control
     671             :       TYPE(cube_info_type), DIMENSION(:), POINTER        :: cube_info
     672             :       TYPE(gridlevel_info_type), POINTER                 :: gridlevel_info
     673             :       INTEGER                                            :: cindex, iatom, jatom
     674             :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: rpgfa, rpgfb, zeta, zetb
     675             :       REAL(KIND=dp)                                      :: kind_radius_b
     676             :       REAL(KIND=dp), DIMENSION(:), POINTER               :: set_radius_a, set_radius_b
     677             :       REAL(KIND=dp), DIMENSION(3)                        :: ra, rab
     678             :       INTEGER, DIMENSION(:), POINTER                     :: la_max, la_min, lb_max, lb_min, npgfa, &
     679             :                                                             npgfb
     680             :       INTEGER                                            :: nseta, nsetb
     681             : 
     682             :       INTEGER                                            :: cube_center(3), igrid_level, ipgf, iset, &
     683             :                                                             jpgf, jset, lb_cube(3), ub_cube(3)
     684             :       REAL(KIND=dp)                                      :: dab, rab2, radius, zetp
     685             : 
     686     1349449 :       rab2 = rab(1)*rab(1) + rab(2)*rab(2) + rab(3)*rab(3)
     687     1349449 :       dab = SQRT(rab2)
     688             : 
     689     4335318 :       loop_iset: DO iset = 1, nseta
     690             : 
     691     2985869 :          IF (set_radius_a(iset) + kind_radius_b < dab) CYCLE
     692             : 
     693     9009817 :          loop_jset: DO jset = 1, nsetb
     694             : 
     695     5590208 :             IF (set_radius_a(iset) + set_radius_b(jset) < dab) CYCLE
     696             : 
     697    14567997 :             loop_ipgf: DO ipgf = 1, npgfa(iset)
     698             : 
     699     8158777 :                IF (rpgfa(ipgf, iset) + set_radius_b(jset) < dab) CYCLE
     700             : 
     701    24657352 :                loop_jpgf: DO jpgf = 1, npgfb(jset)
     702             : 
     703    14031024 :                   IF (rpgfa(ipgf, iset) + rpgfb(jpgf, jset) < dab) CYCLE
     704             : 
     705     7825701 :                   zetp = zeta(ipgf, iset) + zetb(jpgf, jset)
     706     7825701 :                   igrid_level = gaussian_gridlevel(gridlevel_info, zetp)
     707             : 
     708             :                   CALL compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
     709             :                                               rs_descs(igrid_level)%rs_desc, cube_info(igrid_level), &
     710             :                                               la_max(iset), zeta(ipgf, iset), la_min(iset), &
     711             :                                               lb_max(jset), zetb(jpgf, jset), lb_min(jset), &
     712     7825701 :                                               ra, rab, rab2, dft_control%qs_control%eps_rho_rspace)
     713             : 
     714             :                   CALL pgf_to_tasks(tasks, ntasks, curr_tasks, &
     715             :                                     rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
     716             :                                     la_max(iset), lb_max(jset), rs_descs(igrid_level)%rs_desc, &
     717             :                                     igrid_level, gridlevel_info%ngrid_levels, cube_center, &
     718    22189801 :                                     lb_cube, ub_cube, radius)
     719             : 
     720             :                END DO loop_jpgf
     721             : 
     722             :             END DO loop_ipgf
     723             : 
     724             :          END DO loop_jset
     725             : 
     726             :       END DO loop_iset
     727             : 
     728     1349449 :    END SUBROUTINE task_list_inner_loop
     729             : 
     730             : ! **************************************************************************************************
     731             : !> \brief combines the calculation of several basic properties of a given pgf:
     732             : !>  its center, the bounding cube, the radius, the cost,
     733             : !>  tries to predict the time needed for processing this task
     734             : !>      in this way an improved load balance might be obtained
     735             : !> \param cube_center ...
     736             : !> \param lb_cube ...
     737             : !> \param ub_cube ...
     738             : !> \param radius ...
     739             : !> \param rs_desc ...
     740             : !> \param cube_info ...
     741             : !> \param la_max ...
     742             : !> \param zeta ...
     743             : !> \param la_min ...
     744             : !> \param lb_max ...
     745             : !> \param zetb ...
     746             : !> \param lb_min ...
     747             : !> \param ra ...
     748             : !> \param rab ...
     749             : !> \param rab2 ...
     750             : !> \param eps ...
     751             : !> \par History
     752             : !>      10.2008 refactored [Joost VandeVondele]
     753             : !> \note
     754             : !>      -) this requires the radius to be computed in the same way as
     755             : !>      collocate_pgf_product, we should factor that part into a subroutine
     756             : !>      -) we're assuming that integrate_pgf and collocate_pgf are the same cost for load balancing
     757             : !>         this is more or less true for map_consistent
     758             : !>      -) in principle, the computed radius could be recycled in integrate_pgf/collocate_pgf if it is certainly
     759             : !>         the same, this could lead to a small speedup
     760             : !>      -) the cost function is a fit through the median cost of mapping a pgf with a given l and a given radius (in grid points)
     761             : !>         fitting the measured data on an opteron/g95 using the expression
     762             : !>         a*(l+b)(r+c)**3+d which is based on the innerloop of the collocating routines
     763             : ! **************************************************************************************************
     764     7825701 :    SUBROUTINE compute_pgf_properties(cube_center, lb_cube, ub_cube, radius, &
     765             :                                      rs_desc, cube_info, la_max, zeta, la_min, lb_max, zetb, lb_min, ra, rab, rab2, eps)
     766             : 
     767             :       INTEGER, DIMENSION(3), INTENT(OUT)                 :: cube_center, lb_cube, ub_cube
     768             :       REAL(KIND=dp), INTENT(OUT)                         :: radius
     769             :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
     770             :       TYPE(cube_info_type), INTENT(IN)                   :: cube_info
     771             :       INTEGER, INTENT(IN)                                :: la_max
     772             :       REAL(KIND=dp), INTENT(IN)                          :: zeta
     773             :       INTEGER, INTENT(IN)                                :: la_min, lb_max
     774             :       REAL(KIND=dp), INTENT(IN)                          :: zetb
     775             :       INTEGER, INTENT(IN)                                :: lb_min
     776             :       REAL(KIND=dp), INTENT(IN)                          :: ra(3), rab(3), rab2, eps
     777             : 
     778             :       INTEGER                                            :: extent(3)
     779     7825701 :       INTEGER, DIMENSION(:), POINTER                     :: sphere_bounds
     780             :       REAL(KIND=dp)                                      :: cutoff, f, prefactor, rb(3), zetp
     781             :       REAL(KIND=dp), DIMENSION(3)                        :: rp
     782             : 
     783             : ! the radius for this task
     784             : 
     785     7825701 :       zetp = zeta + zetb
     786    31302804 :       rp(:) = ra(:) + zetb/zetp*rab(:)
     787    31302804 :       rb(:) = ra(:) + rab(:)
     788     7825701 :       cutoff = 1.0_dp
     789     7825701 :       f = zetb/zetp
     790     7825701 :       prefactor = EXP(-zeta*f*rab2)
     791             :       radius = exp_radius_very_extended(la_min, la_max, lb_min, lb_max, ra=ra, rb=rb, rp=rp, &
     792     7825701 :                                         zetp=zetp, eps=eps, prefactor=prefactor, cutoff=cutoff)
     793             : 
     794     7825701 :       CALL compute_cube_center(cube_center, rs_desc, zeta, zetb, ra, rab)
     795             :       ! compute cube_center, the center of the gaussian product to map (folded to within the unit cell)
     796    31302804 :       cube_center(:) = MODULO(cube_center(:), rs_desc%npts(:))
     797    31302804 :       cube_center(:) = cube_center(:) + rs_desc%lb(:)
     798             : 
     799     7825701 :       IF (rs_desc%orthorhombic) THEN
     800     7387320 :          CALL return_cube(cube_info, radius, lb_cube, ub_cube, sphere_bounds)
     801             :       ELSE
     802      438381 :          CALL return_cube_nonortho(cube_info, radius, lb_cube, ub_cube, rp)
     803             :          !? unclear if extent is computed correctly.
     804     1753524 :          extent(:) = ub_cube(:) - lb_cube(:)
     805     1753524 :          lb_cube(:) = -extent(:)/2 - 1
     806     1753524 :          ub_cube(:) = extent(:)/2
     807             :       END IF
     808             : 
     809     7825701 :    END SUBROUTINE compute_pgf_properties
     810             : ! **************************************************************************************************
     811             : !> \brief predicts the cost of a task in kcycles for a given task
     812             : !>        the model is based on a fit of actual data, and might need updating
     813             : !>        as collocate_pgf_product changes (or CPUs/compilers change)
     814             : !>        maybe some dynamic approach, improving the cost model on the fly could
     815             : !>        work as well
     816             : !>        the cost model does not yet take into account the fraction of space
     817             : !>        that is mapped locally for a given cube and rs_grid (generalised tasks)
     818             : !> \param lb_cube ...
     819             : !> \param ub_cube ...
     820             : !> \param fraction ...
     821             : !> \param lmax ...
     822             : !> \param is_ortho ...
     823             : !> \return ...
     824             : ! **************************************************************************************************
     825     7825701 :    INTEGER FUNCTION cost_model(lb_cube, ub_cube, fraction, lmax, is_ortho)
     826             :       INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
     827             :       REAL(KIND=dp), INTENT(IN)                          :: fraction
     828             :       INTEGER                                            :: lmax
     829             :       LOGICAL                                            :: is_ortho
     830             : 
     831             :       INTEGER                                            :: cmax
     832             :       REAL(KIND=dp)                                      :: v1, v2, v3, v4, v5
     833             : 
     834    31302804 :       cmax = MAXVAL(((ub_cube - lb_cube) + 1)/2)
     835             : 
     836     7825701 :       IF (is_ortho) THEN
     837             :          v1 = 1.504760E+00_dp
     838             :          v2 = 3.126770E+00_dp
     839             :          v3 = 5.074106E+00_dp
     840             :          v4 = 1.091568E+00_dp
     841             :          v5 = 1.070187E+00_dp
     842             :       ELSE
     843      438381 :          v1 = 7.831105E+00_dp
     844      438381 :          v2 = 2.675174E+00_dp
     845      438381 :          v3 = 7.546553E+00_dp
     846      438381 :          v4 = 6.122446E-01_dp
     847      438381 :          v5 = 3.886382E+00_dp
     848             :       END IF
     849     7825701 :       cost_model = CEILING(((lmax + v1)*(cmax + v2)**3*v3*fraction + v4 + v5*lmax**7)/1000.0_dp)
     850             : 
     851     7825701 :    END FUNCTION cost_model
     852             : ! **************************************************************************************************
     853             : !> \brief pgf_to_tasks converts a given pgf to one or more tasks, in particular
     854             : !>        this determines by which CPUs a given pgf gets collocated
     855             : !>        the format of the task array is as follows
     856             : !>        tasks(1,i) := destination
     857             : !>        tasks(2,i) := source
     858             : !>        tasks(3,i) := compressed type (iatom, jatom, ....)
     859             : !>        tasks(4,i) := type (0: replicated, 1: distributed local, 2: distributed generalised)
     860             : !>        tasks(5,i) := cost
     861             : !>        tasks(6,i) := alternate destination code (0 if none available)
     862             : !>
     863             : !> \param tasks ...
     864             : !> \param ntasks ...
     865             : !> \param curr_tasks ...
     866             : !> \param rab ...
     867             : !> \param cindex ...
     868             : !> \param iatom ...
     869             : !> \param jatom ...
     870             : !> \param iset ...
     871             : !> \param jset ...
     872             : !> \param ipgf ...
     873             : !> \param jpgf ...
     874             : !> \param la_max ...
     875             : !> \param lb_max ...
     876             : !> \param rs_desc ...
     877             : !> \param igrid_level ...
     878             : !> \param n_levels ...
     879             : !> \param cube_center ...
     880             : !> \param lb_cube ...
     881             : !> \param ub_cube ...
     882             : !> \par History
     883             : !>      10.2008 Refactored based on earlier routines by MattW [Joost VandeVondele]
     884             : ! **************************************************************************************************
     885     7825701 :    SUBROUTINE pgf_to_tasks(tasks, ntasks, curr_tasks, &
     886             :                            rab, cindex, iatom, jatom, iset, jset, ipgf, jpgf, &
     887             :                            la_max, lb_max, rs_desc, igrid_level, n_levels, &
     888             :                            cube_center, lb_cube, ub_cube, radius)
     889             : 
     890             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     891             :       INTEGER, INTENT(INOUT)                             :: ntasks, curr_tasks
     892             :       REAL(KIND=dp), DIMENSION(3), INTENT(IN)            :: rab
     893             :       INTEGER, INTENT(IN)                                :: cindex, iatom, jatom, iset, jset, ipgf, &
     894             :                                                             jpgf, la_max, lb_max
     895             :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
     896             :       INTEGER, INTENT(IN)                                :: igrid_level, n_levels
     897             :       INTEGER, DIMENSION(3), INTENT(IN)                  :: cube_center, lb_cube, ub_cube
     898             :       REAL(KIND=dp), INTENT(IN)                          :: radius
     899             : 
     900             :       INTEGER, PARAMETER                                 :: add_tasks = 1000
     901             :       REAL(kind=dp), PARAMETER                           :: mult_tasks = 2.0_dp
     902             : 
     903             :       INTEGER                                            :: added_tasks, cost, j, lmax
     904             :       LOGICAL                                            :: is_ortho
     905             :       REAL(KIND=dp)                                      :: tfraction
     906             : 
     907             : !$OMP SINGLE
     908     7825701 :       ntasks = ntasks + 1
     909     7825701 :       IF (ntasks > curr_tasks) THEN
     910         979 :          curr_tasks = INT((curr_tasks + add_tasks)*mult_tasks)
     911         979 :          CALL reallocate_tasks(tasks, curr_tasks)
     912             :       END IF
     913             : !$OMP END SINGLE
     914             : 
     915     7825701 :       IF (rs_desc%distributed) THEN
     916             : 
     917             :          ! finds the node(s) that need to process this task
     918             :          ! on exit tasks(:)%dist_type is 1 for distributed tasks and 2 for generalised tasks
     919             :          CALL rs_find_node(rs_desc, igrid_level, n_levels, cube_center, &
     920        1380 :                            ntasks=ntasks, tasks=tasks, lb_cube=lb_cube, ub_cube=ub_cube, added_tasks=added_tasks)
     921             : 
     922             :       ELSE
     923     7824321 :          tasks(ntasks)%destination = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
     924     7824321 :          tasks(ntasks)%dist_type = 0
     925     7824321 :          tasks(ntasks)%subpatch_pattern = 0
     926     7824321 :          added_tasks = 1
     927             :       END IF
     928             : 
     929     7825701 :       lmax = la_max + lb_max
     930     7825701 :       is_ortho = (tasks(ntasks)%dist_type == 0 .OR. tasks(ntasks)%dist_type == 1) .AND. rs_desc%orthorhombic
     931             :       ! we assume the load is shared equally between processes dealing with a generalised Gaussian.
     932             :       ! this could be refined in the future
     933     7825701 :       tfraction = 1.0_dp/added_tasks
     934             : 
     935     7825701 :       cost = cost_model(lb_cube, ub_cube, tfraction, lmax, is_ortho)
     936             : 
     937    15651402 :       DO j = 1, added_tasks
     938     7825701 :          tasks(ntasks - added_tasks + j)%source = encode_rank(rs_desc%my_pos, igrid_level, n_levels)
     939     7825701 :          tasks(ntasks - added_tasks + j)%cost = cost
     940     7825701 :          tasks(ntasks - added_tasks + j)%grid_level = igrid_level
     941     7825701 :          tasks(ntasks - added_tasks + j)%image = cindex
     942     7825701 :          tasks(ntasks - added_tasks + j)%iatom = iatom
     943     7825701 :          tasks(ntasks - added_tasks + j)%jatom = jatom
     944     7825701 :          tasks(ntasks - added_tasks + j)%iset = iset
     945     7825701 :          tasks(ntasks - added_tasks + j)%jset = jset
     946     7825701 :          tasks(ntasks - added_tasks + j)%ipgf = ipgf
     947     7825701 :          tasks(ntasks - added_tasks + j)%jpgf = jpgf
     948    31302804 :          tasks(ntasks - added_tasks + j)%rab = rab
     949    15651402 :          tasks(ntasks - added_tasks + j)%radius = radius
     950             :       END DO
     951             : 
     952     7825701 :    END SUBROUTINE pgf_to_tasks
     953             : 
     954             : ! **************************************************************************************************
     955             : !> \brief performs load balancing of the tasks on the distributed grids
     956             : !> \param tasks ...
     957             : !> \param ntasks ...
     958             : !> \param rs_descs ...
     959             : !> \param grid_level ...
     960             : !> \param natoms ...
     961             : !> \par History
     962             : !>      created 2008-10-03 [Joost VandeVondele]
     963             : ! **************************************************************************************************
     964          54 :    SUBROUTINE load_balance_distributed(tasks, ntasks, rs_descs, grid_level, natoms)
     965             : 
     966             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
     967             :       INTEGER                                            :: ntasks
     968             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
     969             :          POINTER                                         :: rs_descs
     970             :       INTEGER                                            :: grid_level, natoms
     971             : 
     972             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_distributed'
     973             : 
     974             :       INTEGER                                            :: handle
     975          54 :       INTEGER, DIMENSION(:, :, :), POINTER               :: list
     976             : 
     977          54 :       CALL timeset(routineN, handle)
     978             : 
     979          54 :       NULLIFY (list)
     980             :       ! here we create for each cpu (0:ncpu-1) a list of possible destinations.
     981             :       ! if a destination would not be in this list, it is a bug
     982          54 :       CALL create_destination_list(list, rs_descs, grid_level)
     983             : 
     984             :       ! now, walk over the tasks, filling in the loads of each destination
     985          54 :       CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.TRUE.)
     986             : 
     987             :       ! optimize loads & fluxes
     988          54 :       CALL optimize_load_list(list, rs_descs(1)%rs_desc%group, rs_descs(1)%rs_desc%my_pos)
     989             : 
     990             :       ! now, walk over the tasks, using the list to set the destinations
     991          54 :       CALL compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list=.FALSE.)
     992             : 
     993          54 :       DEALLOCATE (list)
     994             : 
     995          54 :       CALL timestop(handle)
     996             : 
     997          54 :    END SUBROUTINE load_balance_distributed
     998             : 
     999             : ! **************************************************************************************************
    1000             : !> \brief this serial routine adjusts the fluxes in the global list
    1001             : !>
    1002             : !> \param list_global ...
    1003             : !> \par History
    1004             : !>      created 2008-10-06 [Joost VandeVondele]
    1005             : ! **************************************************************************************************
    1006          27 :    SUBROUTINE balance_global_list(list_global)
    1007             :       INTEGER, DIMENSION(:, :, 0:)                       :: list_global
    1008             : 
    1009             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'balance_global_list'
    1010             :       INTEGER, PARAMETER                                 :: Max_Iter = 100
    1011             :       REAL(KIND=dp), PARAMETER                           :: Tolerance_factor = 0.005_dp
    1012             : 
    1013             :       INTEGER                                            :: dest, handle, icpu, idest, iflux, &
    1014             :                                                             ilocal, k, maxdest, Ncpu, Nflux
    1015          27 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: flux_connections
    1016             :       LOGICAL                                            :: solution_optimal
    1017             :       REAL(KIND=dp)                                      :: average, load_shift, max_load_shift, &
    1018             :                                                             tolerance
    1019          27 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: load, optimized_flux, optimized_load
    1020          27 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:, :)        :: flux_limits
    1021             : 
    1022          27 :       CALL timeset(routineN, handle)
    1023             : 
    1024          27 :       Ncpu = SIZE(list_global, 3)
    1025          27 :       maxdest = SIZE(list_global, 2)
    1026          81 :       ALLOCATE (load(0:Ncpu - 1))
    1027          81 :       load = 0.0_dp
    1028          54 :       ALLOCATE (optimized_load(0:Ncpu - 1))
    1029             : 
    1030             :       ! figure out the number of fluxes
    1031             :       ! we assume that the global_list is symmetric
    1032          81 :       Nflux = 0
    1033          81 :       DO icpu = 0, ncpu - 1
    1034         189 :          DO idest = 1, maxdest
    1035         108 :             dest = list_global(1, idest, icpu)
    1036         162 :             IF (dest < ncpu .AND. dest > icpu) Nflux = Nflux + 1
    1037             :          END DO
    1038             :       END DO
    1039          81 :       ALLOCATE (optimized_flux(Nflux))
    1040          81 :       ALLOCATE (flux_limits(2, Nflux))
    1041          81 :       ALLOCATE (flux_connections(2, Nflux))
    1042             : 
    1043             :       ! reorder data
    1044         108 :       flux_limits = 0
    1045             :       Nflux = 0
    1046          81 :       DO icpu = 0, ncpu - 1
    1047         162 :          load(icpu) = SUM(list_global(2, :, icpu))
    1048         189 :          DO idest = 1, maxdest
    1049         108 :             dest = list_global(1, idest, icpu)
    1050         162 :             IF (dest < ncpu) THEN
    1051         108 :                IF (dest .NE. icpu) THEN
    1052          54 :                   IF (dest > icpu) THEN
    1053          27 :                      Nflux = Nflux + 1
    1054          27 :                      flux_limits(2, Nflux) = list_global(2, idest, icpu)
    1055          27 :                      flux_connections(1, Nflux) = icpu
    1056          27 :                      flux_connections(2, Nflux) = dest
    1057             :                   ELSE
    1058          27 :                      DO iflux = 1, Nflux
    1059          27 :                         IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
    1060          27 :                            flux_limits(1, iflux) = -list_global(2, idest, icpu)
    1061          27 :                            EXIT
    1062             :                         END IF
    1063             :                      END DO
    1064             :                   END IF
    1065             :                END IF
    1066             :             END IF
    1067             :          END DO
    1068             :       END DO
    1069             : 
    1070             :       solution_optimal = .FALSE.
    1071          54 :       optimized_flux = 0.0_dp
    1072             : 
    1073             :       ! an iterative solver, if iterated till convergence the maximum load is minimal
    1074             :       ! we terminate before things are fully converged, since this does show up in the timings
    1075             :       ! once the largest shift becomes less than a small fraction of the average load, we're done
    1076             :       ! we're perfectly happy if the load balance is within 1 percent or so
    1077             :       ! the maximum load normally converges even faster
    1078          81 :       average = SUM(load)/SIZE(load)
    1079          27 :       tolerance = Tolerance_factor*average
    1080             : 
    1081          81 :       optimized_load(:) = load
    1082         332 :       DO k = 1, Max_iter
    1083             :          max_load_shift = 0.0_dp
    1084         658 :          DO iflux = 1, Nflux
    1085         329 :             load_shift = (optimized_load(flux_connections(1, iflux)) - optimized_load(flux_connections(2, iflux)))/2
    1086         329 :             load_shift = MAX(flux_limits(1, iflux) - optimized_flux(iflux), load_shift)
    1087         329 :             load_shift = MIN(flux_limits(2, iflux) - optimized_flux(iflux), load_shift)
    1088         329 :             max_load_shift = MAX(ABS(load_shift), max_load_shift)
    1089         329 :             optimized_load(flux_connections(1, iflux)) = optimized_load(flux_connections(1, iflux)) - load_shift
    1090         329 :             optimized_load(flux_connections(2, iflux)) = optimized_load(flux_connections(2, iflux)) + load_shift
    1091         658 :             optimized_flux(iflux) = optimized_flux(iflux) + load_shift
    1092             :          END DO
    1093         332 :          IF (max_load_shift < tolerance) THEN
    1094             :             solution_optimal = .TRUE.
    1095             :             EXIT
    1096             :          END IF
    1097             :       END DO
    1098             : 
    1099             :       ! now adjust the load list to reflect the optimized fluxes
    1100             :       ! reorder data
    1101             :       Nflux = 0
    1102          81 :       DO icpu = 0, ncpu - 1
    1103         162 :          DO idest = 1, maxdest
    1104         162 :             IF (list_global(1, idest, icpu) == icpu) ilocal = idest
    1105             :          END DO
    1106         189 :          DO idest = 1, maxdest
    1107         108 :             dest = list_global(1, idest, icpu)
    1108         162 :             IF (dest < ncpu) THEN
    1109         108 :                IF (dest .NE. icpu) THEN
    1110          54 :                   IF (dest > icpu) THEN
    1111          27 :                      Nflux = Nflux + 1
    1112          27 :                      IF (optimized_flux(Nflux) > 0) THEN
    1113             :                         list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1114           0 :                                                        list_global(2, idest, icpu) - NINT(optimized_flux(Nflux))
    1115           0 :                         list_global(2, idest, icpu) = NINT(optimized_flux(Nflux))
    1116             :                      ELSE
    1117             :                         list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1118          27 :                                                        list_global(2, idest, icpu)
    1119          27 :                         list_global(2, idest, icpu) = 0
    1120             :                      END IF
    1121             :                   ELSE
    1122          27 :                      DO iflux = 1, Nflux
    1123          27 :                         IF (flux_connections(1, iflux) == dest .AND. flux_connections(2, iflux) == icpu) THEN
    1124          27 :                            IF (optimized_flux(iflux) > 0) THEN
    1125             :                               list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1126           0 :                                                              list_global(2, idest, icpu)
    1127           0 :                               list_global(2, idest, icpu) = 0
    1128             :                            ELSE
    1129             :                               list_global(2, ilocal, icpu) = list_global(2, ilocal, icpu) + &
    1130          27 :                                                              list_global(2, idest, icpu) + NINT(optimized_flux(iflux))
    1131          27 :                               list_global(2, idest, icpu) = -NINT(optimized_flux(iflux))
    1132             :                            END IF
    1133             :                            EXIT
    1134             :                         END IF
    1135             :                      END DO
    1136             :                   END IF
    1137             :                END IF
    1138             :             END IF
    1139             :          END DO
    1140             :       END DO
    1141             : 
    1142          27 :       CALL timestop(handle)
    1143             : 
    1144          54 :    END SUBROUTINE balance_global_list
    1145             : 
    1146             : ! **************************************************************************************************
    1147             : !> \brief this routine gets back optimized loads for all destinations
    1148             : !>
    1149             : !> \param list ...
    1150             : !> \param group ...
    1151             : !> \param my_pos ...
    1152             : !> \par History
    1153             : !>      created 2008-10-06 [Joost VandeVondele]
    1154             : !>      Modified 2016-01   [EPCC] Reduce memory requirements on P processes
    1155             : !>                                from O(P^2) to O(P)
    1156             : ! **************************************************************************************************
    1157          54 :    SUBROUTINE optimize_load_list(list, group, my_pos)
    1158             :       INTEGER, DIMENSION(:, :, 0:)                       :: list
    1159             :       TYPE(mp_comm_type), INTENT(IN) :: group
    1160             :       INTEGER, INTENT(IN)                                :: my_pos
    1161             : 
    1162             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'optimize_load_list'
    1163             :       INTEGER, PARAMETER                                 :: rank_of_root = 0
    1164             : 
    1165             :       INTEGER                                            :: handle, icpu, idest, maxdest, ncpu
    1166             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: load_all
    1167          54 :       INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: load_partial
    1168             :       INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: list_global
    1169             : 
    1170          54 :       CALL timeset(routineN, handle)
    1171             : 
    1172          54 :       ncpu = SIZE(list, 3)
    1173          54 :       maxdest = SIZE(list, 2)
    1174             : 
    1175             :       !find total workload ...
    1176         162 :       ALLOCATE (load_all(maxdest*ncpu))
    1177         108 :       load_all(:) = RESHAPE(list(2, :, :), (/maxdest*ncpu/))
    1178          54 :       CALL group%sum(load_all(:), rank_of_root)
    1179             : 
    1180             :       ! ... and optimise the work per process
    1181         216 :       ALLOCATE (list_global(2, maxdest, ncpu))
    1182          54 :       IF (rank_of_root .EQ. my_pos) THEN
    1183         189 :          list_global(1, :, :) = list(1, :, :)
    1184         243 :          list_global(2, :, :) = RESHAPE(load_all, (/maxdest, ncpu/))
    1185          27 :          CALL balance_global_list(list_global)
    1186             :       END IF
    1187          54 :       CALL group%bcast(list_global, rank_of_root)
    1188             : 
    1189             :       !figure out how much can be sent to other processes
    1190         216 :       ALLOCATE (load_partial(maxdest, ncpu))
    1191             :       ! send 'load_all', which is a copy of 'list' (but without leading dimension/stride)
    1192         162 :       CALL group%sum_partial(RESHAPE(load_all, (/maxdest, ncpu/)), load_partial(:, :))
    1193             : 
    1194         162 :       DO icpu = 1, ncpu
    1195         378 :          DO idest = 1, maxdest
    1196             : 
    1197             :             !need to deduct 1 because `list' was passed in to this routine as being indexed from zero
    1198         324 :             IF (load_partial(idest, icpu) > list_global(2, idest, icpu)) THEN
    1199          37 :                IF (load_partial(idest, icpu) - list(2, idest, icpu - 1) < list_global(2, idest, icpu)) THEN
    1200             :                   list(2, idest, icpu - 1) = list_global(2, idest, icpu) &
    1201          10 :                                              - (load_partial(idest, icpu) - list(2, idest, icpu - 1))
    1202             :                ELSE
    1203          27 :                   list(2, idest, icpu - 1) = 0
    1204             :                END IF
    1205             :             END IF
    1206             : 
    1207             :          END DO
    1208             :       END DO
    1209             : 
    1210             :       !clean up before leaving
    1211          54 :       DEALLOCATE (load_all)
    1212          54 :       DEALLOCATE (list_global)
    1213          54 :       DEALLOCATE (load_partial)
    1214             : 
    1215          54 :       CALL timestop(handle)
    1216          54 :    END SUBROUTINE optimize_load_list
    1217             : 
    1218             : ! **************************************************************************************************
    1219             : !> \brief fill the load list with values derived from the tasks array
    1220             : !>        from the alternate locations, we select the alternate location that
    1221             : !>        can be used without increasing the number of matrix blocks needed to
    1222             : !>        distribute.
    1223             : !>        Replicated tasks are not yet considered
    1224             : !>
    1225             : !> \param list ...
    1226             : !> \param rs_descs ...
    1227             : !> \param grid_level ...
    1228             : !> \param tasks ...
    1229             : !> \param ntasks ...
    1230             : !> \param natoms ...
    1231             : !> \param create_list ...
    1232             : !> \par History
    1233             : !>      created 2008-10-06 [Joost VandeVondele]
    1234             : ! **************************************************************************************************
    1235         108 :    SUBROUTINE compute_load_list(list, rs_descs, grid_level, tasks, ntasks, natoms, create_list)
    1236             :       INTEGER, DIMENSION(:, :, 0:)                       :: list
    1237             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1238             :          POINTER                                         :: rs_descs
    1239             :       INTEGER                                            :: grid_level
    1240             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1241             :       INTEGER                                            :: ntasks, natoms
    1242             :       LOGICAL                                            :: create_list
    1243             : 
    1244             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'compute_load_list'
    1245             : 
    1246             :       INTEGER :: cost, dest, handle, i, iatom, ilevel, img, img_old, iopt, ipgf, iset, itask, &
    1247             :                  itask_start, itask_stop, jatom, jpgf, jset, li, maxdest, ncpu, ndest_pair, nopt, nshort, &
    1248             :                  rank
    1249             :       INTEGER(KIND=int_8)                                :: bit_pattern, ipair, ipair_old, natom8
    1250             :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: loads
    1251         108 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: all_dests, index
    1252             :       INTEGER, DIMENSION(6)                              :: options
    1253             : 
    1254         108 :       CALL timeset(routineN, handle)
    1255             : 
    1256         324 :       ALLOCATE (loads(0:rs_descs(grid_level)%rs_desc%group_size - 1))
    1257         108 :       CALL get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks=.FALSE.)
    1258             : 
    1259         108 :       maxdest = SIZE(list, 2)
    1260         108 :       ncpu = SIZE(list, 3)
    1261         108 :       natom8 = natoms
    1262             : 
    1263             :       ! first find the tasks that deal with the same atom pair
    1264         108 :       itask_stop = 0
    1265         108 :       ipair_old = HUGE(ipair_old)
    1266         108 :       img_old = -1
    1267         108 :       ALLOCATE (all_dests(0))
    1268         108 :       ALLOCATE (INDEX(0))
    1269             : 
    1270             :       DO
    1271             : 
    1272             :          ! first find the range of tasks that deal with the same atom pair
    1273         290 :          itask_start = itask_stop + 1
    1274         290 :          itask_stop = itask_start
    1275         290 :          IF (itask_stop > ntasks) EXIT
    1276         182 :          ilevel = tasks(itask_stop)%grid_level
    1277         182 :          img_old = tasks(itask_stop)%image
    1278         182 :          iatom = tasks(itask_stop)%iatom
    1279         182 :          jatom = tasks(itask_stop)%jatom
    1280         182 :          iset = tasks(itask_stop)%iset
    1281         182 :          jset = tasks(itask_stop)%jset
    1282         182 :          ipgf = tasks(itask_stop)%ipgf
    1283         182 :          jpgf = tasks(itask_stop)%jpgf
    1284             : 
    1285         182 :          ipair_old = (iatom - 1)*natom8 + (jatom - 1)
    1286             :          DO
    1287        7386 :             IF (itask_stop + 1 > ntasks) EXIT
    1288        7298 :             ilevel = tasks(itask_stop + 1)%grid_level
    1289        7298 :             img = tasks(itask_stop + 1)%image
    1290        7298 :             iatom = tasks(itask_stop + 1)%iatom
    1291        7298 :             jatom = tasks(itask_stop + 1)%jatom
    1292        7298 :             iset = tasks(itask_stop + 1)%iset
    1293        7298 :             jset = tasks(itask_stop + 1)%jset
    1294        7298 :             ipgf = tasks(itask_stop + 1)%ipgf
    1295        7298 :             jpgf = tasks(itask_stop + 1)%jpgf
    1296             : 
    1297        7298 :             ipair = (iatom - 1)*natom8 + (jatom - 1)
    1298        7386 :             IF (ipair == ipair_old .AND. img == img_old) THEN
    1299             :                itask_stop = itask_stop + 1
    1300             :             ELSE
    1301             :                EXIT
    1302             :             END IF
    1303             :          END DO
    1304         182 :          ipair = ipair_old
    1305         182 :          nshort = itask_stop - itask_start + 1
    1306             : 
    1307             :          ! find the unique list of destinations on this grid level only
    1308         182 :          DEALLOCATE (all_dests)
    1309         546 :          ALLOCATE (all_dests(nshort))
    1310         182 :          DEALLOCATE (index)
    1311         364 :          ALLOCATE (INDEX(nshort))
    1312        7568 :          DO i = 1, nshort
    1313        7386 :             ilevel = tasks(itask_start + i - 1)%grid_level
    1314        7386 :             img = tasks(itask_start + i - 1)%image
    1315        7386 :             iatom = tasks(itask_start + i - 1)%iatom
    1316        7386 :             jatom = tasks(itask_start + i - 1)%jatom
    1317        7386 :             iset = tasks(itask_start + i - 1)%iset
    1318        7386 :             jset = tasks(itask_start + i - 1)%jset
    1319        7386 :             ipgf = tasks(itask_start + i - 1)%ipgf
    1320        7386 :             jpgf = tasks(itask_start + i - 1)%jpgf
    1321             : 
    1322        7568 :             IF (ilevel .EQ. grid_level) THEN
    1323        2760 :                all_dests(i) = decode_rank(tasks(itask_start + i - 1)%destination, SIZE(rs_descs))
    1324             :             ELSE
    1325        4626 :                all_dests(i) = HUGE(all_dests(i))
    1326             :             END IF
    1327             :          END DO
    1328         182 :          CALL sort(all_dests, nshort, index)
    1329         182 :          ndest_pair = 1
    1330        7386 :          DO i = 2, nshort
    1331        7386 :             IF ((all_dests(ndest_pair) .NE. all_dests(i)) .AND. (all_dests(i) .NE. HUGE(all_dests(i)))) THEN
    1332          10 :                ndest_pair = ndest_pair + 1
    1333          10 :                all_dests(ndest_pair) = all_dests(i)
    1334             :             END IF
    1335             :          END DO
    1336             : 
    1337        7676 :          DO itask = itask_start, itask_stop
    1338             : 
    1339        7386 :             dest = decode_rank(tasks(itask)%destination, SIZE(rs_descs)) ! notice that dest can be changed
    1340        7386 :             ilevel = tasks(itask)%grid_level
    1341        7386 :             img = tasks(itask)%image
    1342        7386 :             iatom = tasks(itask)%iatom
    1343        7386 :             jatom = tasks(itask)%jatom
    1344        7386 :             iset = tasks(itask)%iset
    1345        7386 :             jset = tasks(itask)%jset
    1346        7386 :             ipgf = tasks(itask)%ipgf
    1347        7386 :             jpgf = tasks(itask)%jpgf
    1348             : 
    1349             :             ! Only proceed with tasks which are on this grid level
    1350        7386 :             IF (ilevel .NE. grid_level) CYCLE
    1351        2760 :             ipair = (iatom - 1)*natom8 + (jatom - 1)
    1352        2760 :             cost = INT(tasks(itask)%cost)
    1353             : 
    1354         182 :             SELECT CASE (tasks(itask)%dist_type)
    1355             :             CASE (1)
    1356        2760 :                bit_pattern = tasks(itask)%subpatch_pattern
    1357        2760 :                nopt = 0
    1358        2760 :                IF (BTEST(bit_pattern, 0)) THEN
    1359           0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/-1, 0, 0/))
    1360           0 :                   IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
    1361           0 :                      nopt = nopt + 1
    1362           0 :                      options(nopt) = rank
    1363             :                   END IF
    1364             :                END IF
    1365        2760 :                IF (BTEST(bit_pattern, 1)) THEN
    1366           0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/+1, 0, 0/))
    1367           0 :                   IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
    1368           0 :                      nopt = nopt + 1
    1369           0 :                      options(nopt) = rank
    1370             :                   END IF
    1371             :                END IF
    1372        2760 :                IF (BTEST(bit_pattern, 2)) THEN
    1373          48 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, -1, 0/))
    1374          96 :                   IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
    1375           0 :                      nopt = nopt + 1
    1376           0 :                      options(nopt) = rank
    1377             :                   END IF
    1378             :                END IF
    1379        2760 :                IF (BTEST(bit_pattern, 3)) THEN
    1380           0 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, +1, 0/))
    1381           0 :                   IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
    1382           0 :                      nopt = nopt + 1
    1383           0 :                      options(nopt) = rank
    1384             :                   END IF
    1385             :                END IF
    1386        2760 :                IF (BTEST(bit_pattern, 4)) THEN
    1387        1150 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, -1/))
    1388        2200 :                   IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
    1389         100 :                      nopt = nopt + 1
    1390         100 :                      options(nopt) = rank
    1391             :                   END IF
    1392             :                END IF
    1393        2760 :                IF (BTEST(bit_pattern, 5)) THEN
    1394         500 :                   rank = rs_grid_locate_rank(rs_descs(ilevel)%rs_desc, dest, (/0, 0, +1/))
    1395        1000 :                   IF (ANY(all_dests(1:ndest_pair) .EQ. rank)) THEN
    1396          80 :                      nopt = nopt + 1
    1397          80 :                      options(nopt) = rank
    1398             :                   END IF
    1399             :                END IF
    1400        2760 :                IF (nopt > 0) THEN
    1401             :                   ! set it to the rank with the lowest load
    1402         180 :                   rank = options(1)
    1403         180 :                   DO iopt = 2, nopt
    1404         180 :                      IF (loads(rank) > loads(options(iopt))) rank = options(iopt)
    1405             :                   END DO
    1406             :                ELSE
    1407        2580 :                   rank = dest
    1408             :                END IF
    1409        2760 :                li = list_index(list, rank, dest)
    1410        2760 :                IF (create_list) THEN
    1411        1380 :                   list(2, li, dest) = list(2, li, dest) + cost
    1412             :                ELSE
    1413        1380 :                   IF (list(1, li, dest) == dest) THEN
    1414        1290 :                      tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1415             :                   ELSE
    1416          90 :                      IF (list(2, li, dest) >= cost) THEN
    1417          10 :                         list(2, li, dest) = list(2, li, dest) - cost
    1418          10 :                         tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
    1419             :                      ELSE
    1420          80 :                         tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1421             :                      END IF
    1422             :                   END IF
    1423             :                END IF
    1424             :             CASE (2) ! generalised
    1425           0 :                li = list_index(list, dest, dest)
    1426           0 :                IF (create_list) THEN
    1427           0 :                   list(2, li, dest) = list(2, li, dest) + cost
    1428             :                ELSE
    1429           0 :                   IF (list(1, li, dest) == dest) THEN
    1430           0 :                      tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1431             :                   ELSE
    1432           0 :                      IF (list(2, li, dest) >= cost) THEN
    1433           0 :                         list(2, li, dest) = list(2, li, dest) - cost
    1434           0 :                         tasks(itask)%destination = encode_rank(list(1, li, dest), ilevel, SIZE(rs_descs))
    1435             :                      ELSE
    1436           0 :                         tasks(itask)%destination = encode_rank(dest, ilevel, SIZE(rs_descs))
    1437             :                      END IF
    1438             :                   END IF
    1439             :                END IF
    1440             :             CASE DEFAULT
    1441        7386 :                CPABORT("")
    1442             :             END SELECT
    1443             : 
    1444             :          END DO
    1445             : 
    1446             :       END DO
    1447             : 
    1448         108 :       CALL timestop(handle)
    1449             : 
    1450         216 :    END SUBROUTINE compute_load_list
    1451             : ! **************************************************************************************************
    1452             : !> \brief small helper function to return the proper index in the list array
    1453             : !>
    1454             : !> \param list ...
    1455             : !> \param rank ...
    1456             : !> \param dest ...
    1457             : !> \return ...
    1458             : !> \par History
    1459             : !>      created 2008-10-06 [Joost VandeVondele]
    1460             : ! **************************************************************************************************
    1461        2760 :    INTEGER FUNCTION list_index(list, rank, dest)
    1462             :       INTEGER, DIMENSION(:, :, 0:), INTENT(IN)           :: list
    1463             :       INTEGER, INTENT(IN)                                :: rank, dest
    1464             : 
    1465        2760 :       list_index = 1
    1466        1424 :       DO
    1467        4184 :          IF (list(1, list_index, dest) == rank) EXIT
    1468        1424 :          list_index = list_index + 1
    1469             :       END DO
    1470        2760 :    END FUNCTION list_index
    1471             : ! **************************************************************************************************
    1472             : !> \brief create a list with possible destinations (i.e. the central cpu and neighbors) for each cpu
    1473             : !>        note that we allocate it with an additional field to store the load of this destination
    1474             : !>
    1475             : !> \param list ...
    1476             : !> \param rs_descs ...
    1477             : !> \param grid_level ...
    1478             : !> \par History
    1479             : !>      created 2008-10-06 [Joost VandeVondele]
    1480             : ! **************************************************************************************************
    1481          54 :    SUBROUTINE create_destination_list(list, rs_descs, grid_level)
    1482             :       INTEGER, DIMENSION(:, :, :), POINTER               :: list
    1483             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1484             :          POINTER                                         :: rs_descs
    1485             :       INTEGER, INTENT(IN)                                :: grid_level
    1486             : 
    1487             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_destination_list'
    1488             : 
    1489             :       INTEGER                                            :: handle, i, icpu, j, maxcount, ncpu, &
    1490             :                                                             ultimate_max
    1491          54 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index, sublist
    1492             : 
    1493          54 :       CALL timeset(routineN, handle)
    1494             : 
    1495          54 :       CPASSERT(.NOT. ASSOCIATED(list))
    1496          54 :       ncpu = rs_descs(grid_level)%rs_desc%group_size
    1497          54 :       ultimate_max = 7
    1498             : 
    1499         162 :       ALLOCATE (list(2, ultimate_max, 0:ncpu - 1))
    1500             : 
    1501          54 :       ALLOCATE (INDEX(ultimate_max))
    1502          54 :       ALLOCATE (sublist(ultimate_max))
    1503         432 :       sublist = HUGE(sublist)
    1504             : 
    1505          54 :       maxcount = 1
    1506         162 :       DO icpu = 0, ncpu - 1
    1507         108 :          sublist(1) = icpu
    1508         108 :          sublist(2) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/-1, 0, 0/))
    1509         108 :          sublist(3) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/+1, 0, 0/))
    1510         108 :          sublist(4) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, -1, 0/))
    1511         108 :          sublist(5) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, +1, 0/))
    1512         108 :          sublist(6) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, -1/))
    1513         108 :          sublist(7) = rs_grid_locate_rank(rs_descs(grid_level)%rs_desc, icpu, (/0, 0, +1/))
    1514             :          ! only retain unique values of the destination
    1515         108 :          CALL sort(sublist, ultimate_max, index)
    1516         108 :          j = 1
    1517         756 :          DO i = 2, 7
    1518         756 :             IF (sublist(i) .NE. sublist(j)) THEN
    1519         108 :                j = j + 1
    1520         108 :                sublist(j) = sublist(i)
    1521             :             END IF
    1522             :          END DO
    1523         108 :          maxcount = MAX(maxcount, j)
    1524         648 :          sublist(j + 1:ultimate_max) = HUGE(sublist)
    1525         864 :          list(1, :, icpu) = sublist
    1526         918 :          list(2, :, icpu) = 0
    1527             :       END DO
    1528             : 
    1529          54 :       CALL reallocate(list, 1, 2, 1, maxcount, 0, ncpu - 1)
    1530             : 
    1531          54 :       CALL timestop(handle)
    1532             : 
    1533         108 :    END SUBROUTINE create_destination_list
    1534             : 
    1535             : ! **************************************************************************************************
    1536             : !> \brief given a task list, compute the load of each process everywhere
    1537             : !>        giving this function the ability to loop over a (sub)set of rs_grids,
    1538             : !>        and do all the communication in one shot, would speed it up
    1539             : !> \param loads ...
    1540             : !> \param rs_descs ...
    1541             : !> \param grid_level ...
    1542             : !> \param ntasks ...
    1543             : !> \param tasks ...
    1544             : !> \param use_reordered_ranks ...
    1545             : !> \par History
    1546             : !>      none
    1547             : !> \author MattW 21/11/2007
    1548             : ! **************************************************************************************************
    1549         480 :    SUBROUTINE get_current_loads(loads, rs_descs, grid_level, ntasks, tasks, use_reordered_ranks)
    1550             :       INTEGER(KIND=int_8), DIMENSION(:)                  :: loads
    1551             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1552             :          POINTER                                         :: rs_descs
    1553             :       INTEGER                                            :: grid_level, ntasks
    1554             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1555             :       LOGICAL, INTENT(IN)                                :: use_reordered_ranks
    1556             : 
    1557             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'get_current_loads'
    1558             : 
    1559             :       INTEGER                                            :: handle, i, iatom, ilevel, img, ipgf, &
    1560             :                                                             iset, jatom, jpgf, jset
    1561             :       INTEGER(KIND=int_8)                                :: total_cost_local
    1562             :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: recv_buf_i, send_buf_i
    1563             :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1564             : 
    1565         480 :       CALL timeset(routineN, handle)
    1566             : 
    1567         480 :       desc => rs_descs(grid_level)%rs_desc
    1568             : 
    1569             :       ! allocate local arrays
    1570        1440 :       ALLOCATE (send_buf_i(desc%group_size))
    1571         960 :       ALLOCATE (recv_buf_i(desc%group_size))
    1572             : 
    1573             :       ! communication step 1 : compute the total local cost of the tasks
    1574             :       !                        each proc needs to know the amount of work he will receive
    1575             : 
    1576             :       ! send buffer now contains for each target the cost of the tasks it will receive
    1577        1440 :       send_buf_i = 0
    1578       36792 :       DO i = 1, ntasks
    1579       36312 :          ilevel = tasks(i)%grid_level
    1580       36312 :          img = tasks(i)%image
    1581       36312 :          iatom = tasks(i)%iatom
    1582       36312 :          jatom = tasks(i)%jatom
    1583       36312 :          iset = tasks(i)%iset
    1584       36312 :          jset = tasks(i)%jset
    1585       36312 :          ipgf = tasks(i)%ipgf
    1586       36312 :          jpgf = tasks(i)%jpgf
    1587       36312 :          IF (ilevel .NE. grid_level) CYCLE
    1588       10476 :          IF (use_reordered_ranks) THEN
    1589             :             send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) = &
    1590             :                send_buf_i(rs_descs(ilevel)%rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs))) + 1) &
    1591        3618 :                + tasks(i)%cost
    1592             :          ELSE
    1593             :             send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) = &
    1594             :                send_buf_i(decode_rank(tasks(i)%destination, SIZE(rs_descs)) + 1) &
    1595        6378 :                + tasks(i)%cost
    1596             :          END IF
    1597             :       END DO
    1598         480 :       CALL desc%group%alltoall(send_buf_i, recv_buf_i, 1)
    1599             : 
    1600             :       ! communication step 2 : compute the global cost of the tasks
    1601        1440 :       total_cost_local = SUM(recv_buf_i)
    1602             : 
    1603             :       ! after this step, the recv buffer contains the local cost for each CPU
    1604        1440 :       CALL desc%group%allgather(total_cost_local, loads)
    1605             : 
    1606         480 :       CALL timestop(handle)
    1607             : 
    1608         960 :    END SUBROUTINE get_current_loads
    1609             : ! **************************************************************************************************
    1610             : !> \brief performs load balancing shifting tasks on the replicated grids
    1611             : !>        this modifies the destination of some of the tasks on replicated
    1612             : !>        grids, and in this way balances the load
    1613             : !> \param rs_descs ...
    1614             : !> \param ntasks ...
    1615             : !> \param tasks ...
    1616             : !> \par History
    1617             : !>      none
    1618             : !> \author MattW 21/11/2007
    1619             : ! **************************************************************************************************
    1620          48 :    SUBROUTINE load_balance_replicated(rs_descs, ntasks, tasks)
    1621             : 
    1622             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1623             :          POINTER                                         :: rs_descs
    1624             :       INTEGER                                            :: ntasks
    1625             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1626             : 
    1627             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'load_balance_replicated'
    1628             : 
    1629             :       INTEGER                                            :: handle, i, iatom, ilevel, img, ipgf, &
    1630             :                                                             iset, j, jatom, jpgf, jset, &
    1631             :                                                             no_overloaded, no_underloaded, &
    1632             :                                                             proc_receiving
    1633             :       INTEGER(KIND=int_8)                                :: average_cost, cost_task_rep, count, &
    1634             :                                                             offset, total_cost_global
    1635          48 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: load_imbalance, loads, recv_buf_i
    1636          48 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: index
    1637             :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1638             : 
    1639          48 :       CALL timeset(routineN, handle)
    1640             : 
    1641          48 :       desc => rs_descs(1)%rs_desc
    1642             : 
    1643             :       ! allocate local arrays
    1644         144 :       ALLOCATE (recv_buf_i(desc%group_size))
    1645          96 :       ALLOCATE (loads(desc%group_size))
    1646             : 
    1647         144 :       recv_buf_i = 0
    1648         234 :       DO i = 1, SIZE(rs_descs)
    1649         186 :          CALL get_current_loads(loads, rs_descs, i, ntasks, tasks, use_reordered_ranks=.TRUE.)
    1650         606 :          recv_buf_i(:) = recv_buf_i + loads
    1651             :       END DO
    1652             : 
    1653         144 :       total_cost_global = SUM(recv_buf_i)
    1654          48 :       average_cost = total_cost_global/desc%group_size
    1655             : 
    1656             :       !
    1657             :       ! compute how to redistribute the replicated tasks so that the average cost is reached
    1658             :       !
    1659             : 
    1660             :       ! load imbalance measures the load of a given CPU relative
    1661             :       ! to the optimal load distribution (load=average)
    1662         144 :       ALLOCATE (load_imbalance(desc%group_size))
    1663         144 :       ALLOCATE (INDEX(desc%group_size))
    1664             : 
    1665         144 :       load_imbalance(:) = recv_buf_i - average_cost
    1666          48 :       no_overloaded = 0
    1667          48 :       no_underloaded = 0
    1668             : 
    1669         144 :       DO i = 1, desc%group_size
    1670          96 :          IF (load_imbalance(i) .GT. 0) no_overloaded = no_overloaded + 1
    1671         144 :          IF (load_imbalance(i) .LT. 0) no_underloaded = no_underloaded + 1
    1672             :       END DO
    1673             : 
    1674             :       ! sort the recv_buffer on number of tasks, gives us index which provides a
    1675             :       ! mapping between processor ranks and how overloaded the processor
    1676          48 :       CALL sort(recv_buf_i, SIZE(recv_buf_i), index)
    1677             : 
    1678             :       ! find out the number of replicated tasks each proc has
    1679             :       ! but only those tasks which have not yet been assigned
    1680          48 :       cost_task_rep = 0
    1681        3666 :       DO i = 1, ntasks
    1682             :          IF (tasks(i)%dist_type .EQ. 0 &
    1683        3666 :              .AND. decode_rank(tasks(i)%destination, SIZE(rs_descs)) == decode_rank(tasks(i)%source, SIZE(rs_descs))) THEN
    1684        2238 :             cost_task_rep = cost_task_rep + tasks(i)%cost
    1685             :          END IF
    1686             :       END DO
    1687             : 
    1688             :       ! now, correct the load imbalance for the overloaded CPUs
    1689             :       ! they will send away not more than the total load of replicated tasks
    1690          48 :       CALL desc%group%allgather(cost_task_rep, recv_buf_i)
    1691             : 
    1692         144 :       DO i = 1, desc%group_size
    1693             :          ! At the moment we can only offload replicated tasks
    1694          96 :          IF (load_imbalance(i) .GT. 0) &
    1695          96 :             load_imbalance(i) = MIN(load_imbalance(i), recv_buf_i(i))
    1696             :       END DO
    1697             : 
    1698             :       ! simplest algorithm I can think of of is that the processor with the most
    1699             :       ! excess tasks fills up the process needing most, then moves on to next most.
    1700             :       ! At the moment if we've got less replicated tasks than we're overloaded then
    1701             :       ! task balancing will be incomplete
    1702             : 
    1703             :       ! only need to do anything if I've excess tasks
    1704          48 :       IF (load_imbalance(desc%my_pos + 1) .GT. 0) THEN
    1705             : 
    1706          22 :          count = 0 ! weighted amount of tasks offloaded
    1707          22 :          offset = 0 ! no of underloaded processes already filled by other more overloaded procs
    1708             : 
    1709             :          ! calculate offset
    1710          22 :          DO i = desc%group_size, desc%group_size - no_overloaded + 1, -1
    1711          22 :             IF (INDEX(i) .EQ. desc%my_pos + 1) THEN
    1712             :                EXIT
    1713             :             ELSE
    1714           0 :                offset = offset + load_imbalance(INDEX(i))
    1715             :             END IF
    1716             :          END DO
    1717             : 
    1718             :          ! find my starting processor to send to
    1719          22 :          proc_receiving = HUGE(proc_receiving)
    1720          22 :          DO i = 1, no_underloaded
    1721          22 :             offset = offset + load_imbalance(INDEX(i))
    1722          22 :             IF (offset .LE. 0) THEN
    1723             :                proc_receiving = i
    1724             :                EXIT
    1725             :             END IF
    1726             :          END DO
    1727             : 
    1728             :          ! offset now contains minus the number of tasks proc_receiving requires
    1729             :          ! we fill this up by adjusting the destination of tasks on the replicated grid,
    1730             :          ! then move to next most underloaded proc
    1731        1380 :          DO j = 1, ntasks
    1732             :             IF (tasks(j)%dist_type .EQ. 0 &
    1733        1380 :                 .AND. decode_rank(tasks(j)%destination, SIZE(rs_descs)) == decode_rank(tasks(j)%source, SIZE(rs_descs))) THEN
    1734             :                ! just avoid sending to non existing procs due to integer truncation
    1735             :                ! in the computation of the average
    1736         808 :                IF (proc_receiving .GT. no_underloaded) EXIT
    1737             :                ! set new destination
    1738         808 :                ilevel = tasks(j)%grid_level
    1739         808 :                img = tasks(j)%image
    1740         808 :                iatom = tasks(j)%iatom
    1741         808 :                jatom = tasks(j)%jatom
    1742         808 :                iset = tasks(j)%iset
    1743         808 :                jset = tasks(j)%jset
    1744         808 :                ipgf = tasks(j)%ipgf
    1745         808 :                jpgf = tasks(j)%jpgf
    1746         808 :                tasks(j)%destination = encode_rank(INDEX(proc_receiving) - 1, ilevel, SIZE(rs_descs))
    1747         808 :                offset = offset + tasks(j)%cost
    1748         808 :                count = count + tasks(j)%cost
    1749         808 :                IF (count .GE. load_imbalance(desc%my_pos + 1)) EXIT
    1750         786 :                IF (offset .GT. 0) THEN
    1751           0 :                   proc_receiving = proc_receiving + 1
    1752             :                   ! just avoid sending to non existing procs due to integer truncation
    1753             :                   ! in the computation of the average
    1754           0 :                   IF (proc_receiving .GT. no_underloaded) EXIT
    1755           0 :                   offset = load_imbalance(INDEX(proc_receiving))
    1756             :                END IF
    1757             :             END IF
    1758             :          END DO
    1759             :       END IF
    1760             : 
    1761          48 :       DEALLOCATE (index)
    1762          48 :       DEALLOCATE (load_imbalance)
    1763             : 
    1764          48 :       CALL timestop(handle)
    1765             : 
    1766          96 :    END SUBROUTINE load_balance_replicated
    1767             : 
    1768             : ! **************************************************************************************************
    1769             : !> \brief given an input task list, redistribute so that all tasks can be processed locally,
    1770             : !>        i.e. dest equals rank
    1771             : !> \param rs_descs ...
    1772             : !> \param ntasks ...
    1773             : !> \param tasks ...
    1774             : !> \param ntasks_recv ...
    1775             : !> \param tasks_recv ...
    1776             : !> \par History
    1777             : !>      none
    1778             : !> \author MattW 21/11/2007
    1779             : ! **************************************************************************************************
    1780          48 :    SUBROUTINE create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
    1781             : 
    1782             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1783             :          POINTER                                         :: rs_descs
    1784             :       INTEGER                                            :: ntasks
    1785             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1786             :       INTEGER                                            :: ntasks_recv
    1787             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks_recv
    1788             : 
    1789             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'create_local_tasks'
    1790             : 
    1791             :       INTEGER                                            :: handle, i, j, k, l, rank
    1792             :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: recv_buf, send_buf
    1793             :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: recv_disps, recv_sizes, send_disps, &
    1794             :                                                             send_sizes
    1795             :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1796             : 
    1797          48 :       CALL timeset(routineN, handle)
    1798             : 
    1799          48 :       desc => rs_descs(1)%rs_desc
    1800             : 
    1801             :       ! allocate local arrays
    1802         144 :       ALLOCATE (send_sizes(desc%group_size))
    1803          96 :       ALLOCATE (recv_sizes(desc%group_size))
    1804          96 :       ALLOCATE (send_disps(desc%group_size))
    1805          96 :       ALLOCATE (recv_disps(desc%group_size))
    1806         144 :       ALLOCATE (send_buf(desc%group_size))
    1807          96 :       ALLOCATE (recv_buf(desc%group_size))
    1808             : 
    1809             :       ! fill send buffer, now counting how many tasks will be send (stored in an int8 array for convenience only).
    1810         144 :       send_buf = 0
    1811        3666 :       DO i = 1, ntasks
    1812             :          rank = rs_descs(decode_level(tasks(i)%destination, SIZE(rs_descs))) &
    1813        3618 :                 %rs_desc%virtual2real(decode_rank(tasks(i)%destination, SIZE(rs_descs)))
    1814        3666 :          send_buf(rank + 1) = send_buf(rank + 1) + 1
    1815             :       END DO
    1816             : 
    1817          48 :       CALL desc%group%alltoall(send_buf, recv_buf, 1)
    1818             : 
    1819             :       ! pack the tasks, and send them around
    1820             : 
    1821         144 :       send_sizes = 0
    1822         144 :       send_disps = 0
    1823         144 :       recv_sizes = 0
    1824         144 :       recv_disps = 0
    1825             : 
    1826          48 :       send_sizes(1) = INT(send_buf(1)*task_size_in_int8)
    1827          48 :       recv_sizes(1) = INT(recv_buf(1)*task_size_in_int8)
    1828          96 :       DO i = 2, desc%group_size
    1829          48 :          send_sizes(i) = INT(send_buf(i)*task_size_in_int8)
    1830          48 :          recv_sizes(i) = INT(recv_buf(i)*task_size_in_int8)
    1831          48 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    1832          96 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    1833             :       END DO
    1834             : 
    1835             :       ! deallocate old send/recv buffers
    1836          48 :       DEALLOCATE (send_buf)
    1837          48 :       DEALLOCATE (recv_buf)
    1838             : 
    1839             :       ! allocate them with new sizes
    1840         233 :       ALLOCATE (send_buf(SUM(send_sizes)))
    1841         238 :       ALLOCATE (recv_buf(SUM(recv_sizes)))
    1842             : 
    1843             :       ! do packing
    1844       61554 :       send_buf = 0
    1845         144 :       send_sizes = 0
    1846        3666 :       DO j = 1, ntasks
    1847             :          i = rs_descs(decode_level(tasks(j)%destination, SIZE(rs_descs))) &
    1848        3618 :              %rs_desc%virtual2real(decode_rank(tasks(j)%destination, SIZE(rs_descs))) + 1
    1849        3618 :          l = send_disps(i) + send_sizes(i)
    1850        3618 :          CALL serialize_task(tasks(j), send_buf(l + 1:l + task_size_in_int8))
    1851        3666 :          send_sizes(i) = send_sizes(i) + task_size_in_int8
    1852             :       END DO
    1853             : 
    1854             :       ! do communication
    1855          48 :       CALL desc%group%alltoall(send_buf, send_sizes, send_disps, recv_buf, recv_sizes, recv_disps)
    1856             : 
    1857          48 :       DEALLOCATE (send_buf)
    1858             : 
    1859         144 :       ntasks_recv = SUM(recv_sizes)/task_size_in_int8
    1860        3904 :       ALLOCATE (tasks_recv(ntasks_recv))
    1861             : 
    1862             :       ! do unpacking
    1863          48 :       l = 0
    1864         144 :       DO i = 1, desc%group_size
    1865        3762 :          DO j = 0, recv_sizes(i)/task_size_in_int8 - 1
    1866        3618 :             l = l + 1
    1867        3618 :             k = recv_disps(i) + j*task_size_in_int8
    1868        3714 :             CALL deserialize_task(tasks_recv(l), recv_buf(k + 1:k + task_size_in_int8))
    1869             :          END DO
    1870             :       END DO
    1871             : 
    1872          48 :       DEALLOCATE (recv_buf)
    1873          48 :       DEALLOCATE (send_sizes)
    1874          48 :       DEALLOCATE (recv_sizes)
    1875          48 :       DEALLOCATE (send_disps)
    1876          48 :       DEALLOCATE (recv_disps)
    1877             : 
    1878          48 :       CALL timestop(handle)
    1879             : 
    1880          48 :    END SUBROUTINE create_local_tasks
    1881             : 
    1882             : ! **************************************************************************************************
    1883             : !> \brief Assembles tasks to be performed on local grid
    1884             : !> \param rs_descs the grids
    1885             : !> \param ntasks Number of tasks for local processing
    1886             : !> \param natoms ...
    1887             : !> \param nimages ...
    1888             : !> \param tasks the task set generated on this processor
    1889             : !> \param rval ...
    1890             : !> \param atom_pair_send ...
    1891             : !> \param atom_pair_recv ...
    1892             : !> \param symmetric ...
    1893             : !> \param reorder_rs_grid_ranks ...
    1894             : !> \param skip_load_balance_distributed ...
    1895             : !> \par History
    1896             : !>      none
    1897             : !> \author MattW 21/11/2007
    1898             : ! **************************************************************************************************
    1899       15804 :    SUBROUTINE distribute_tasks(rs_descs, ntasks, natoms, &
    1900             :                                tasks, atom_pair_send, atom_pair_recv, &
    1901             :                                symmetric, reorder_rs_grid_ranks, skip_load_balance_distributed)
    1902             : 
    1903             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    1904             :          POINTER                                         :: rs_descs
    1905             :       INTEGER                                            :: ntasks, natoms
    1906             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    1907             :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair_send, atom_pair_recv
    1908             :       LOGICAL, INTENT(IN)                                :: symmetric, reorder_rs_grid_ranks, &
    1909             :                                                             skip_load_balance_distributed
    1910             : 
    1911             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'distribute_tasks'
    1912             : 
    1913             :       INTEGER                                            :: handle, igrid_level, irank, ntasks_recv
    1914             :       INTEGER(KIND=int_8)                                :: load_gap, max_load, replicated_load
    1915       15804 :       INTEGER(KIND=int_8), ALLOCATABLE, DIMENSION(:)     :: total_loads, total_loads_tmp, trial_loads
    1916       15804 :       INTEGER(KIND=int_8), DIMENSION(:, :), POINTER      :: loads
    1917       15804 :       INTEGER, ALLOCATABLE, DIMENSION(:)                 :: indices, real2virtual, total_index
    1918             :       LOGICAL                                            :: distributed_grids, fixed_first_grid
    1919             :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    1920       15804 :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks_recv
    1921             : 
    1922       15804 :       CALL timeset(routineN, handle)
    1923             : 
    1924       15804 :       CPASSERT(ASSOCIATED(tasks))
    1925             : 
    1926             :       ! *** figure out if we have distributed grids
    1927       15804 :       distributed_grids = .FALSE.
    1928       78532 :       DO igrid_level = 1, SIZE(rs_descs)
    1929       78532 :          IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1930          54 :             distributed_grids = .TRUE.
    1931             :          END IF
    1932             :       END DO
    1933       15804 :       desc => rs_descs(1)%rs_desc
    1934             : 
    1935       15804 :       IF (distributed_grids) THEN
    1936             : 
    1937         192 :          ALLOCATE (loads(0:desc%group_size - 1, SIZE(rs_descs)))
    1938         144 :          ALLOCATE (total_loads(0:desc%group_size - 1))
    1939             : 
    1940         144 :          total_loads = 0
    1941             : 
    1942             :          ! First round of balancing on the distributed grids
    1943             :          ! we just balance each of the distributed grids independently
    1944         234 :          DO igrid_level = 1, SIZE(rs_descs)
    1945         234 :             IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1946             : 
    1947          54 :                IF (.NOT. skip_load_balance_distributed) &
    1948          54 :                   CALL load_balance_distributed(tasks, ntasks, rs_descs, igrid_level, natoms)
    1949             : 
    1950             :                CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
    1951          54 :                                       tasks, use_reordered_ranks=.FALSE.)
    1952             : 
    1953         162 :                total_loads(:) = total_loads + loads(:, igrid_level)
    1954             : 
    1955             :             END IF
    1956             :          END DO
    1957             : 
    1958             :          ! calculate the total load of replicated tasks, so we can decide if it is worth
    1959             :          ! reordering the distributed grid levels
    1960             : 
    1961          48 :          replicated_load = 0
    1962         234 :          DO igrid_level = 1, SIZE(rs_descs)
    1963         234 :             IF (.NOT. rs_descs(igrid_level)%rs_desc%distributed) THEN
    1964             :                CALL get_current_loads(loads(:, igrid_level), rs_descs, igrid_level, ntasks, &
    1965         132 :                                       tasks, use_reordered_ranks=.FALSE.)
    1966         396 :                replicated_load = replicated_load + SUM(loads(:, igrid_level))
    1967             :             END IF
    1968             :          END DO
    1969             : 
    1970             :          !IF (desc%my_pos==0) THEN
    1971             :          ! WRITE(*,*) "Total replicated load is ",replicated_load
    1972             :          !END IF
    1973             : 
    1974             :          ! Now we adjust the rank ordering based on the current loads
    1975             :          ! we leave the first distributed level and all the replicated levels in the default order
    1976          48 :          IF (reorder_rs_grid_ranks) THEN
    1977          48 :             fixed_first_grid = .FALSE.
    1978         234 :             DO igrid_level = 1, SIZE(rs_descs)
    1979         234 :                IF (rs_descs(igrid_level)%rs_desc%distributed) THEN
    1980          54 :                   IF (fixed_first_grid .EQV. .FALSE.) THEN
    1981         144 :                      total_loads(:) = loads(:, igrid_level)
    1982             :                      fixed_first_grid = .TRUE.
    1983             :                   ELSE
    1984          18 :                      ALLOCATE (trial_loads(0:desc%group_size - 1))
    1985             : 
    1986          18 :                      trial_loads(:) = total_loads + loads(:, igrid_level)
    1987          18 :                      max_load = MAXVAL(trial_loads)
    1988             :                      load_gap = 0
    1989          18 :                      DO irank = 0, desc%group_size - 1
    1990          18 :                         load_gap = load_gap + max_load - trial_loads(irank)
    1991             :                      END DO
    1992             : 
    1993             :                      ! If there is not enough replicated load to load balance well enough
    1994             :                      ! then we will reorder this grid level
    1995           6 :                      IF (load_gap > replicated_load*1.05_dp) THEN
    1996             : 
    1997          18 :                         ALLOCATE (indices(0:desc%group_size - 1))
    1998          12 :                         ALLOCATE (total_index(0:desc%group_size - 1))
    1999          12 :                         ALLOCATE (total_loads_tmp(0:desc%group_size - 1))
    2000          12 :                         ALLOCATE (real2virtual(0:desc%group_size - 1))
    2001             : 
    2002          18 :                         total_loads_tmp(:) = total_loads
    2003           6 :                         CALL sort(total_loads_tmp, desc%group_size, total_index)
    2004           6 :                         CALL sort(loads(:, igrid_level), desc%group_size, indices)
    2005             : 
    2006             :                         ! Reorder so that the rank with smallest load on this grid level is paired with
    2007             :                         ! the highest load in total
    2008          18 :                         DO irank = 0, desc%group_size - 1
    2009             :                            total_loads(total_index(irank) - 1) = total_loads(total_index(irank) - 1) + &
    2010          12 :                                                                  loads(desc%group_size - irank - 1, igrid_level)
    2011          18 :                            real2virtual(total_index(irank) - 1) = indices(desc%group_size - irank - 1) - 1
    2012             :                         END DO
    2013             : 
    2014           6 :                         CALL rs_grid_reorder_ranks(rs_descs(igrid_level)%rs_desc, real2virtual)
    2015             : 
    2016           6 :                         DEALLOCATE (indices)
    2017           6 :                         DEALLOCATE (total_index)
    2018           6 :                         DEALLOCATE (total_loads_tmp)
    2019           6 :                         DEALLOCATE (real2virtual)
    2020             :                      ELSE
    2021           0 :                         total_loads(:) = trial_loads
    2022             :                      END IF
    2023             : 
    2024           6 :                      DEALLOCATE (trial_loads)
    2025             : 
    2026             :                   END IF
    2027             :                END IF
    2028             :             END DO
    2029             :          END IF
    2030             : 
    2031             :          ! Now we use the replicated tasks to balance out the rest of the load
    2032          48 :          CALL load_balance_replicated(rs_descs, ntasks, tasks)
    2033             : 
    2034             :          !total_loads = 0
    2035             :          !DO igrid_level=1,SIZE(rs_descs)
    2036             :          !  CALL get_current_loads(loads(:,igrid_level), rs_descs, igrid_level, ntasks, &
    2037             :          !                         tasks, use_reordered_ranks=.TRUE.)
    2038             :          !  total_loads = total_loads + loads(:, igrid_level)
    2039             :          !END DO
    2040             : 
    2041             :          !IF (desc%my_pos==0) THEN
    2042             :          !  WRITE(*,*) ""
    2043             :          !  WRITE(*,*) "At the end of the load balancing procedure"
    2044             :          !  WRITE(*,*) "Maximum  load:",MAXVAL(total_loads)
    2045             :          !  WRITE(*,*) "Average  load:",SUM(total_loads)/SIZE(total_loads)
    2046             :          !  WRITE(*,*) "Minimum  load:",MINVAL(total_loads)
    2047             :          !ENDIF
    2048             : 
    2049             :          ! given a list of tasks, this will do the needed reshuffle so that all tasks will be local
    2050          48 :          CALL create_local_tasks(rs_descs, ntasks, tasks, ntasks_recv, tasks_recv)
    2051             : 
    2052             :          !
    2053             :          ! tasks list are complete, we can compute the list of atomic blocks (atom pairs)
    2054             :          ! we will be sending. These lists are needed for redistribute_matrix.
    2055             :          !
    2056          48 :          CALL get_atom_pair(atom_pair_send, tasks, ntasks=ntasks, send=.TRUE., symmetric=symmetric, rs_descs=rs_descs)
    2057             : 
    2058             :          ! natom_send=SIZE(atom_pair_send)
    2059             :          ! CALL desc%group%sum(natom_send)
    2060             :          ! IF (desc%my_pos==0) THEN
    2061             :          !     WRITE(*,*) ""
    2062             :          !     WRITE(*,*) "Total number of atomic blocks to be send:",natom_send
    2063             :          ! ENDIF
    2064             : 
    2065          48 :          CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
    2066             : 
    2067             :          ! cleanup, at this point we  don't need the original tasks anymore
    2068          48 :          DEALLOCATE (tasks)
    2069          48 :          DEALLOCATE (loads)
    2070          48 :          DEALLOCATE (total_loads)
    2071             : 
    2072             :       ELSE
    2073       15756 :          tasks_recv => tasks
    2074       15756 :          ntasks_recv = ntasks
    2075       15756 :          CALL get_atom_pair(atom_pair_recv, tasks_recv, ntasks=ntasks_recv, send=.FALSE., symmetric=symmetric, rs_descs=rs_descs)
    2076             :          ! not distributed, hence atom_pair_send not needed
    2077             :       END IF
    2078             : 
    2079             :       ! here we sort the task list we will process locally.
    2080       47134 :       ALLOCATE (indices(ntasks_recv))
    2081       15804 :       CALL tasks_sort(tasks_recv, ntasks_recv, indices)
    2082       15804 :       DEALLOCATE (indices)
    2083             : 
    2084             :       !
    2085             :       ! final lists are ready
    2086             :       !
    2087             : 
    2088       15804 :       tasks => tasks_recv
    2089       15804 :       ntasks = ntasks_recv
    2090             : 
    2091       15804 :       CALL timestop(handle)
    2092             : 
    2093       31608 :    END SUBROUTINE distribute_tasks
    2094             : 
    2095             : ! **************************************************************************************************
    2096             : !> \brief ...
    2097             : !> \param atom_pair ...
    2098             : !> \param my_tasks ...
    2099             : !> \param send ...
    2100             : !> \param symmetric ...
    2101             : !> \param natoms ...
    2102             : !> \param nimages ...
    2103             : !> \param rs_descs ...
    2104             : ! **************************************************************************************************
    2105       15852 :    SUBROUTINE get_atom_pair(atom_pair, tasks, ntasks, send, symmetric, rs_descs)
    2106             : 
    2107             :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair
    2108             :       TYPE(task_type), DIMENSION(:), INTENT(INOUT)       :: tasks
    2109             :       INTEGER, INTENT(IN)                                :: ntasks
    2110             :       LOGICAL, INTENT(IN)                                :: send, symmetric
    2111             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), INTENT(IN) :: rs_descs
    2112             : 
    2113             :       INTEGER                                            :: i, ilevel, iatom, jatom, npairs, virt_rank
    2114       15852 :       INTEGER, DIMENSION(:), ALLOCATABLE                 :: indices
    2115       15852 :       TYPE(atom_pair_type), DIMENSION(:), ALLOCATABLE    :: atom_pair_tmp
    2116             : 
    2117       15852 :       CPASSERT(.NOT. ASSOCIATED(atom_pair))
    2118       15852 :       IF (ntasks == 0) THEN
    2119         285 :          ALLOCATE (atom_pair(0))
    2120             :          RETURN
    2121             :       END IF
    2122             : 
    2123             :       ! calculate list of atom pairs
    2124             :       ! fill pair list taking into account symmetry
    2125     7876020 :       ALLOCATE (atom_pair_tmp(ntasks))
    2126     7844886 :       DO i = 1, ntasks
    2127     7829319 :          atom_pair_tmp(i)%image = tasks(i)%image
    2128     7829319 :          iatom = tasks(i)%iatom
    2129     7829319 :          jatom = tasks(i)%jatom
    2130     7829319 :          IF (symmetric .AND. iatom > jatom) THEN
    2131             :             ! iatom / jatom swapped
    2132     2950548 :             atom_pair_tmp(i)%row = jatom
    2133     2950548 :             atom_pair_tmp(i)%col = iatom
    2134             :          ELSE
    2135     4878771 :             atom_pair_tmp(i)%row = iatom
    2136     4878771 :             atom_pair_tmp(i)%col = jatom
    2137             :          END IF
    2138             : 
    2139     7844886 :          IF (send) THEN
    2140             :             ! If sending, we need to use the 'real rank' as the pair has to be sent to the process which
    2141             :             ! actually has the correct part of the rs_grid to do the mapping
    2142        3618 :             ilevel = tasks(i)%grid_level
    2143        3618 :             virt_rank = decode_rank(tasks(i)%destination, SIZE(rs_descs))
    2144        3618 :             atom_pair_tmp(i)%rank = rs_descs(ilevel)%rs_desc%virtual2real(virt_rank)
    2145             :          ELSE
    2146             :             ! If we are receiving, then no conversion is needed as the rank is that of the process with the
    2147             :             ! required matrix block, and the ordering of the rs grid is irrelevant
    2148     7825701 :             atom_pair_tmp(i)%rank = decode_rank(tasks(i)%source, SIZE(rs_descs))
    2149             :          END IF
    2150             :       END DO
    2151             : 
    2152             :       ! find unique atom pairs that I'm sending/receiving
    2153       46701 :       ALLOCATE (indices(ntasks))
    2154       15567 :       CALL atom_pair_sort(atom_pair_tmp, ntasks, indices)
    2155       15567 :       npairs = 1
    2156       15567 :       tasks(indices(1))%pair_index = 1
    2157     7829319 :       DO i = 2, ntasks
    2158     7813752 :          IF (atom_pair_less_than(atom_pair_tmp(i - 1), atom_pair_tmp(i))) THEN
    2159      253387 :             npairs = npairs + 1
    2160      253387 :             atom_pair_tmp(npairs) = atom_pair_tmp(i)
    2161             :          END IF
    2162     7829319 :          tasks(indices(i))%pair_index = npairs
    2163             :       END DO
    2164       15567 :       DEALLOCATE (indices)
    2165             : 
    2166             :       ! Copy unique pairs to final location.
    2167      315655 :       ALLOCATE (atom_pair(npairs))
    2168      284521 :       atom_pair(:) = atom_pair_tmp(:npairs)
    2169       15567 :       DEALLOCATE (atom_pair_tmp)
    2170             : 
    2171             :    END SUBROUTINE get_atom_pair
    2172             : 
    2173             : ! **************************************************************************************************
    2174             : !> \brief redistributes the matrix so that it can be used in realspace operations
    2175             : !>        i.e. according to the task lists for collocate and integrate.
    2176             : !>        This routine can become a bottleneck in large calculations.
    2177             : !> \param rs_descs ...
    2178             : !> \param pmats ...
    2179             : !> \param atom_pair_send ...
    2180             : !> \param atom_pair_recv ...
    2181             : !> \param natoms ...
    2182             : !> \param nimages ...
    2183             : !> \param scatter ...
    2184             : !> \param hmats ...
    2185             : ! **************************************************************************************************
    2186           0 :    SUBROUTINE rs_distribute_matrix(rs_descs, pmats, atom_pair_send, atom_pair_recv, &
    2187             :                                    nimages, scatter, hmats)
    2188             : 
    2189             :       TYPE(realspace_grid_desc_p_type), DIMENSION(:), &
    2190             :          POINTER                                         :: rs_descs
    2191             :       TYPE(dbcsr_p_type), DIMENSION(:), POINTER          :: pmats
    2192             :       TYPE(atom_pair_type), DIMENSION(:), POINTER        :: atom_pair_send, atom_pair_recv
    2193             :       INTEGER                                            :: nimages
    2194             :       LOGICAL                                            :: scatter
    2195             :       TYPE(dbcsr_p_type), DIMENSION(:), OPTIONAL, &
    2196             :          POINTER                                         :: hmats
    2197             : 
    2198             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_distribute_matrix'
    2199             : 
    2200             :       INTEGER                                            :: acol, arow, handle, i, img, j, k, l, me, &
    2201             :                                                             nblkcols_total, nblkrows_total, ncol, &
    2202             :                                                             nrow, nthread, nthread_left
    2203           0 :       INTEGER, ALLOCATABLE, DIMENSION(:) :: first_col, first_row, last_col, last_row, recv_disps, &
    2204           0 :                                             recv_pair_count, recv_pair_disps, recv_sizes, send_disps, send_pair_count, &
    2205           0 :                                             send_pair_disps, send_sizes
    2206           0 :       INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, row_blk_size
    2207             :       LOGICAL                                            :: found
    2208           0 :       REAL(KIND=dp), ALLOCATABLE, DIMENSION(:)           :: recv_buf_r, send_buf_r
    2209           0 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: h_block, p_block
    2210             :       TYPE(dbcsr_type), POINTER                          :: hmat, pmat
    2211             :       TYPE(realspace_grid_desc_type), POINTER            :: desc
    2212             : 
    2213           0 : !$    INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
    2214             : 
    2215           0 :       CALL timeset(routineN, handle)
    2216             : 
    2217           0 :       IF (.NOT. scatter) THEN
    2218           0 :          CPASSERT(PRESENT(hmats))
    2219             :       END IF
    2220             : 
    2221           0 :       desc => rs_descs(1)%rs_desc
    2222           0 :       me = desc%my_pos + 1
    2223             : 
    2224             :       ! allocate local arrays
    2225           0 :       ALLOCATE (send_sizes(desc%group_size))
    2226           0 :       ALLOCATE (recv_sizes(desc%group_size))
    2227           0 :       ALLOCATE (send_disps(desc%group_size))
    2228           0 :       ALLOCATE (recv_disps(desc%group_size))
    2229           0 :       ALLOCATE (send_pair_count(desc%group_size))
    2230           0 :       ALLOCATE (recv_pair_count(desc%group_size))
    2231           0 :       ALLOCATE (send_pair_disps(desc%group_size))
    2232           0 :       ALLOCATE (recv_pair_disps(desc%group_size))
    2233             : 
    2234           0 :       pmat => pmats(1)%matrix
    2235             :       CALL dbcsr_get_info(pmat, &
    2236             :                           row_blk_size=row_blk_size, &
    2237             :                           col_blk_size=col_blk_size, &
    2238             :                           nblkrows_total=nblkrows_total, &
    2239           0 :                           nblkcols_total=nblkcols_total)
    2240           0 :       ALLOCATE (first_row(nblkrows_total), last_row(nblkrows_total), &
    2241           0 :                 first_col(nblkcols_total), last_col(nblkcols_total))
    2242           0 :       CALL dbcsr_convert_sizes_to_offsets(row_blk_size, first_row, last_row)
    2243           0 :       CALL dbcsr_convert_sizes_to_offsets(col_blk_size, first_col, last_col)
    2244             : 
    2245             :       ! set up send buffer sizes
    2246           0 :       send_sizes = 0
    2247           0 :       send_pair_count = 0
    2248           0 :       DO i = 1, SIZE(atom_pair_send)
    2249           0 :          k = atom_pair_send(i)%rank + 1 ! proc we're sending this block to
    2250           0 :          arow = atom_pair_send(i)%row
    2251           0 :          acol = atom_pair_send(i)%col
    2252           0 :          nrow = last_row(arow) - first_row(arow) + 1
    2253           0 :          ncol = last_col(acol) - first_col(acol) + 1
    2254           0 :          send_sizes(k) = send_sizes(k) + nrow*ncol
    2255           0 :          send_pair_count(k) = send_pair_count(k) + 1
    2256             :       END DO
    2257             : 
    2258           0 :       send_disps = 0
    2259           0 :       send_pair_disps = 0
    2260           0 :       DO i = 2, desc%group_size
    2261           0 :          send_disps(i) = send_disps(i - 1) + send_sizes(i - 1)
    2262           0 :          send_pair_disps(i) = send_pair_disps(i - 1) + send_pair_count(i - 1)
    2263             :       END DO
    2264             : 
    2265           0 :       ALLOCATE (send_buf_r(SUM(send_sizes)))
    2266             : 
    2267             :       ! set up recv buffer
    2268             : 
    2269           0 :       recv_sizes = 0
    2270           0 :       recv_pair_count = 0
    2271           0 :       DO i = 1, SIZE(atom_pair_recv)
    2272           0 :          k = atom_pair_recv(i)%rank + 1 ! proc we're receiving this data from
    2273           0 :          arow = atom_pair_recv(i)%row
    2274           0 :          acol = atom_pair_recv(i)%col
    2275           0 :          nrow = last_row(arow) - first_row(arow) + 1
    2276           0 :          ncol = last_col(acol) - first_col(acol) + 1
    2277           0 :          recv_sizes(k) = recv_sizes(k) + nrow*ncol
    2278           0 :          recv_pair_count(k) = recv_pair_count(k) + 1
    2279             :       END DO
    2280             : 
    2281           0 :       recv_disps = 0
    2282           0 :       recv_pair_disps = 0
    2283           0 :       DO i = 2, desc%group_size
    2284           0 :          recv_disps(i) = recv_disps(i - 1) + recv_sizes(i - 1)
    2285           0 :          recv_pair_disps(i) = recv_pair_disps(i - 1) + recv_pair_count(i - 1)
    2286             :       END DO
    2287           0 :       ALLOCATE (recv_buf_r(SUM(recv_sizes)))
    2288             : 
    2289             : !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
    2290             : !$OMP          SHARED(desc,send_pair_count,send_pair_disps,nimages),&
    2291             : !$OMP          SHARED(last_row,first_row,last_col,first_col),&
    2292             : !$OMP          SHARED(pmats,send_buf_r,send_disps,send_sizes),&
    2293             : !$OMP          SHARED(atom_pair_send,me,hmats,nblkrows_total),&
    2294             : !$OMP          SHARED(atom_pair_recv,recv_buf_r,scatter,recv_pair_disps), &
    2295             : !$OMP          SHARED(recv_sizes,recv_disps,recv_pair_count,locks), &
    2296             : !$OMP          PRIVATE(i,img,arow,acol,nrow,ncol,p_block,found,j,k,l),&
    2297           0 : !$OMP          PRIVATE(nthread,h_block,nthread_left,hmat,pmat)
    2298             : 
    2299             :       nthread = 1
    2300             : !$    nthread = omp_get_num_threads()
    2301             :       nthread_left = 1
    2302             : !$    nthread_left = MAX(1, nthread - 1)
    2303             : 
    2304             :       ! do packing
    2305             : !$OMP DO schedule(guided)
    2306             :       DO l = 1, desc%group_size
    2307             :          IF (l .EQ. me) CYCLE
    2308             :          send_sizes(l) = 0
    2309             :          DO i = 1, send_pair_count(l)
    2310             :             arow = atom_pair_send(send_pair_disps(l) + i)%row
    2311             :             acol = atom_pair_send(send_pair_disps(l) + i)%col
    2312             :             img = atom_pair_send(send_pair_disps(l) + i)%image
    2313             :             nrow = last_row(arow) - first_row(arow) + 1
    2314             :             ncol = last_col(acol) - first_col(acol) + 1
    2315             :             pmat => pmats(img)%matrix
    2316             :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, block=p_block, found=found)
    2317             :             CPASSERT(found)
    2318             : 
    2319             :             DO k = 1, ncol
    2320             :                DO j = 1, nrow
    2321             :                   send_buf_r(send_disps(l) + send_sizes(l) + j + (k - 1)*nrow) = p_block(j, k)
    2322             :                END DO
    2323             :             END DO
    2324             :             send_sizes(l) = send_sizes(l) + nrow*ncol
    2325             :          END DO
    2326             :       END DO
    2327             : !$OMP END DO
    2328             : 
    2329             :       IF (.NOT. scatter) THEN
    2330             :          ! We need locks to protect concurrent summation into H
    2331             : !$OMP SINGLE
    2332             : !$       ALLOCATE (locks(nthread*10))
    2333             : !$OMP END SINGLE
    2334             : 
    2335             : !$OMP DO
    2336             : !$       do i = 1, nthread*10
    2337             : !$          call omp_init_lock(locks(i))
    2338             : !$       end do
    2339             : !$OMP END DO
    2340             :       END IF
    2341             : 
    2342             : !$OMP MASTER
    2343             :       ! do communication
    2344             :       CALL desc%group%alltoall(send_buf_r, send_sizes, send_disps, &
    2345             :                                recv_buf_r, recv_sizes, recv_disps)
    2346             : !$OMP END MASTER
    2347             : 
    2348             :       ! If this is a scatter, then no need to copy local blocks,
    2349             :       ! If not, we sum them directly into H (bypassing the alltoall)
    2350             :       IF (.NOT. scatter) THEN
    2351             : 
    2352             :          ! Distribute work over remaining threads assuming one is still in the alltoall
    2353             : !$OMP DO schedule(dynamic,MAX(1,send_pair_count(me)/nthread_left))
    2354             :          DO i = 1, send_pair_count(me)
    2355             :             arow = atom_pair_send(send_pair_disps(me) + i)%row
    2356             :             acol = atom_pair_send(send_pair_disps(me) + i)%col
    2357             :             img = atom_pair_send(send_pair_disps(me) + i)%image
    2358             :             nrow = last_row(arow) - first_row(arow) + 1
    2359             :             ncol = last_col(acol) - first_col(acol) + 1
    2360             :             hmat => hmats(img)%matrix
    2361             :             pmat => pmats(img)%matrix
    2362             :             CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
    2363             :             CPASSERT(found)
    2364             :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
    2365             :             CPASSERT(found)
    2366             : 
    2367             : !$          call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2368             :             DO k = 1, ncol
    2369             :                DO j = 1, nrow
    2370             :                   h_block(j, k) = h_block(j, k) + p_block(j, k)
    2371             :                END DO
    2372             :             END DO
    2373             : !$          call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2374             :          END DO
    2375             : !$OMP END DO
    2376             :       ELSE
    2377             :          ! We will insert new blocks into P, so create mutable work matrices
    2378             :          DO img = 1, nimages
    2379             :             pmat => pmats(img)%matrix
    2380             :             CALL dbcsr_work_create(pmat, work_mutable=.TRUE., &
    2381             :                                    nblks_guess=SIZE(atom_pair_recv)/nthread, sizedata_guess=SIZE(recv_buf_r)/nthread, &
    2382             :                                    n=nthread)
    2383             :          END DO
    2384             :       END IF
    2385             : 
    2386             : ! wait for comm and setup to finish
    2387             : !$OMP BARRIER
    2388             : 
    2389             :       !do unpacking
    2390             : !$OMP DO schedule(guided)
    2391             :       DO l = 1, desc%group_size
    2392             :          IF (l .EQ. me) CYCLE
    2393             :          recv_sizes(l) = 0
    2394             :          DO i = 1, recv_pair_count(l)
    2395             :             arow = atom_pair_recv(recv_pair_disps(l) + i)%row
    2396             :             acol = atom_pair_recv(recv_pair_disps(l) + i)%col
    2397             :             img = atom_pair_recv(recv_pair_disps(l) + i)%image
    2398             :             nrow = last_row(arow) - first_row(arow) + 1
    2399             :             ncol = last_col(acol) - first_col(acol) + 1
    2400             :             pmat => pmats(img)%matrix
    2401             :             NULLIFY (p_block)
    2402             :             CALL dbcsr_get_block_p(matrix=pmat, row=arow, col=acol, BLOCK=p_block, found=found)
    2403             : 
    2404             :             IF (PRESENT(hmats)) THEN
    2405             :                hmat => hmats(img)%matrix
    2406             :                CALL dbcsr_get_block_p(matrix=hmat, row=arow, col=acol, BLOCK=h_block, found=found)
    2407             :                CPASSERT(found)
    2408             :             END IF
    2409             : 
    2410             :             IF (scatter .AND. .NOT. ASSOCIATED(p_block)) THEN
    2411             :                CALL dbcsr_put_block(pmat, arow, acol, &
    2412             :                                     block=recv_buf_r(recv_disps(l) + recv_sizes(l) + 1:recv_disps(l) + recv_sizes(l) + nrow*ncol))
    2413             :             END IF
    2414             :             IF (.NOT. scatter) THEN
    2415             : !$             call omp_set_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2416             :                DO k = 1, ncol
    2417             :                   DO j = 1, nrow
    2418             :                      h_block(j, k) = h_block(j, k) + recv_buf_r(recv_disps(l) + recv_sizes(l) + j + (k - 1)*nrow)
    2419             :                   END DO
    2420             :                END DO
    2421             : !$             call omp_unset_lock(locks((arow - 1)*nthread*10/nblkrows_total + 1))
    2422             :             END IF
    2423             :             recv_sizes(l) = recv_sizes(l) + nrow*ncol
    2424             :          END DO
    2425             :       END DO
    2426             : !$OMP END DO
    2427             : 
    2428             : !$    IF (.not. scatter) THEN
    2429             : !$OMP DO
    2430             : !$       do i = 1, nthread*10
    2431             : !$          call omp_destroy_lock(locks(i))
    2432             : !$       end do
    2433             : !$OMP END DO
    2434             : !$    END IF
    2435             : 
    2436             : !$OMP SINGLE
    2437             : !$    IF (.not. scatter) THEN
    2438             : !$       DEALLOCATE (locks)
    2439             : !$    END IF
    2440             : !$OMP END SINGLE NOWAIT
    2441             : 
    2442             :       IF (scatter) THEN
    2443             :          ! Blocks were added to P
    2444             :          DO img = 1, nimages
    2445             :             pmat => pmats(img)%matrix
    2446             :             CALL dbcsr_finalize(pmat)
    2447             :          END DO
    2448             :       END IF
    2449             : !$OMP END PARALLEL
    2450             : 
    2451           0 :       DEALLOCATE (send_buf_r)
    2452           0 :       DEALLOCATE (recv_buf_r)
    2453             : 
    2454           0 :       DEALLOCATE (send_sizes)
    2455           0 :       DEALLOCATE (recv_sizes)
    2456           0 :       DEALLOCATE (send_disps)
    2457           0 :       DEALLOCATE (recv_disps)
    2458           0 :       DEALLOCATE (send_pair_count)
    2459           0 :       DEALLOCATE (recv_pair_count)
    2460           0 :       DEALLOCATE (send_pair_disps)
    2461           0 :       DEALLOCATE (recv_pair_disps)
    2462             : 
    2463           0 :       DEALLOCATE (first_row, last_row, first_col, last_col)
    2464             : 
    2465           0 :       CALL timestop(handle)
    2466             : 
    2467           0 :    END SUBROUTINE rs_distribute_matrix
    2468             : 
    2469             : ! **************************************************************************************************
    2470             : !> \brief Calculates offsets and sizes for rs_scatter_matrix and rs_copy_matrix.
    2471             : !> \author Ole Schuett
    2472             : ! **************************************************************************************************
    2473       13458 :    SUBROUTINE rs_calc_offsets(pairs, nsgf, group_size, &
    2474             :                               pair_offsets, rank_offsets, rank_sizes, buffer_size)
    2475             :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: pairs
    2476             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: nsgf
    2477             :       INTEGER, INTENT(IN)                                :: group_size
    2478             :       INTEGER, DIMENSION(:), POINTER                     :: pair_offsets, rank_offsets, rank_sizes
    2479             :       INTEGER, INTENT(INOUT)                             :: buffer_size
    2480             : 
    2481             :       INTEGER                                            :: acol, arow, i, block_size, total_size, k, prev_k
    2482             : 
    2483       13458 :       IF (ASSOCIATED(pair_offsets)) DEALLOCATE (pair_offsets)
    2484       13458 :       IF (ASSOCIATED(rank_offsets)) DEALLOCATE (rank_offsets)
    2485       13458 :       IF (ASSOCIATED(rank_sizes)) DEALLOCATE (rank_sizes)
    2486             : 
    2487             :       ! calculate buffer_size and pair_offsets
    2488       40140 :       ALLOCATE (pair_offsets(SIZE(pairs)))
    2489       13458 :       total_size = 0
    2490      270967 :       DO i = 1, SIZE(pairs)
    2491      257509 :          pair_offsets(i) = total_size
    2492      257509 :          arow = pairs(i)%row
    2493      257509 :          acol = pairs(i)%col
    2494      257509 :          block_size = nsgf(arow)*nsgf(acol)
    2495      270967 :          total_size = total_size + block_size
    2496             :       END DO
    2497       13458 :       buffer_size = total_size
    2498             : 
    2499             :       ! calculate rank_offsets and rank_sizes
    2500       40374 :       ALLOCATE (rank_offsets(group_size))
    2501       26916 :       ALLOCATE (rank_sizes(group_size))
    2502       39182 :       rank_offsets = 0
    2503       39182 :       rank_sizes = 0
    2504       13458 :       IF (SIZE(pairs) > 0) THEN
    2505       13224 :          prev_k = pairs(1)%rank + 1
    2506      270733 :          DO i = 1, SIZE(pairs)
    2507      257509 :             k = pairs(i)%rank + 1
    2508      257509 :             CPASSERT(k >= prev_k) ! expecting the pairs to be ordered by rank
    2509      270733 :             IF (k > prev_k) THEN
    2510          61 :                rank_offsets(k) = pair_offsets(i)
    2511          61 :                rank_sizes(prev_k) = rank_offsets(k) - rank_offsets(prev_k)
    2512          61 :                prev_k = k
    2513             :             END IF
    2514             :          END DO
    2515       13224 :          rank_sizes(k) = buffer_size - rank_offsets(k) ! complete last rank
    2516             :       END IF
    2517             : 
    2518       13458 :    END SUBROUTINE rs_calc_offsets
    2519             : 
    2520             : ! **************************************************************************************************
    2521             : !> \brief Scatters dbcsr matrix blocks and receives them into a buffer as needed before collocation.
    2522             : !> \author Ole Schuett
    2523             : ! **************************************************************************************************
    2524         266 :    SUBROUTINE rs_scatter_matrices(src_matrices, dest_buffer, task_list, group)
    2525             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2526             :       TYPE(offload_buffer_type), INTENT(INOUT)           :: dest_buffer
    2527             :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2528             :       TYPE(mp_comm_type), INTENT(IN)                     :: group
    2529             : 
    2530             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_scatter_matrices'
    2531             : 
    2532             :       INTEGER                                            :: handle
    2533             :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: buffer_send
    2534             : 
    2535         266 :       CALL timeset(routineN, handle)
    2536         777 :       ALLOCATE (buffer_send(task_list%buffer_size_send))
    2537             : 
    2538             :       ! pack dbcsr blocks into send buffer
    2539         266 :       CPASSERT(ASSOCIATED(task_list%atom_pair_send))
    2540             :       CALL rs_pack_buffer(src_matrices=src_matrices, &
    2541             :                           dest_buffer=buffer_send, &
    2542             :                           atom_pair=task_list%atom_pair_send, &
    2543         266 :                           pair_offsets=task_list%pair_offsets_send)
    2544             : 
    2545             :       ! mpi all-to-all communication, receiving directly into blocks_recv%buffer.
    2546             :       CALL group%alltoall(buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send, &
    2547             :                           dest_buffer%host_buffer, &
    2548        2394 :                           task_list%rank_sizes_recv, task_list%rank_offsets_recv)
    2549             : 
    2550         266 :       DEALLOCATE (buffer_send)
    2551         266 :       CALL timestop(handle)
    2552             : 
    2553         266 :    END SUBROUTINE rs_scatter_matrices
    2554             : 
    2555             : ! **************************************************************************************************
    2556             : !> \brief Gather the dbcsr matrix blocks and receives them into a buffer as needed after integration.
    2557             : !> \author Ole Schuett
    2558             : ! **************************************************************************************************
    2559         218 :    SUBROUTINE rs_gather_matrices(src_buffer, dest_matrices, task_list, group)
    2560             :       TYPE(offload_buffer_type), INTENT(IN)              :: src_buffer
    2561             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2562             :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2563             :       TYPE(mp_comm_type), INTENT(IN)                     :: group
    2564             : 
    2565             :       CHARACTER(LEN=*), PARAMETER :: routineN = 'rs_gather_matrices'
    2566             : 
    2567             :       INTEGER                                            :: handle
    2568             :       REAL(KIND=dp), DIMENSION(:), ALLOCATABLE           :: buffer_send
    2569             : 
    2570         218 :       CALL timeset(routineN, handle)
    2571             : 
    2572             :       ! Caution: The meaning of send and recv are reversed in this routine.
    2573         640 :       ALLOCATE (buffer_send(task_list%buffer_size_send)) ! e.g. this is actually used for receiving
    2574             : 
    2575             :       ! mpi all-to-all communication
    2576             :       CALL group%alltoall(src_buffer%host_buffer, task_list%rank_sizes_recv, task_list%rank_offsets_recv, &
    2577        1962 :                           buffer_send, task_list%rank_sizes_send, task_list%rank_offsets_send)
    2578             : 
    2579             :       ! unpack dbcsr blocks from send buffer
    2580         218 :       CPASSERT(ASSOCIATED(task_list%atom_pair_send))
    2581             :       CALL rs_unpack_buffer(src_buffer=buffer_send, &
    2582             :                             dest_matrices=dest_matrices, &
    2583             :                             atom_pair=task_list%atom_pair_send, &
    2584         218 :                             pair_offsets=task_list%pair_offsets_send)
    2585             : 
    2586         218 :       DEALLOCATE (buffer_send)
    2587         218 :       CALL timestop(handle)
    2588             : 
    2589         218 :    END SUBROUTINE rs_gather_matrices
    2590             : 
    2591             : ! **************************************************************************************************
    2592             : !> \brief Copies the DBCSR blocks into buffer, replaces rs_scatter_matrix for non-distributed grids.
    2593             : !> \author Ole Schuett
    2594             : ! **************************************************************************************************
    2595      216354 :    SUBROUTINE rs_copy_to_buffer(src_matrices, dest_buffer, task_list)
    2596             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2597             :       TYPE(offload_buffer_type), INTENT(INOUT)           :: dest_buffer
    2598             :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2599             : 
    2600             :       CALL rs_pack_buffer(src_matrices=src_matrices, &
    2601             :                           dest_buffer=dest_buffer%host_buffer, &
    2602             :                           atom_pair=task_list%atom_pair_recv, &
    2603      216354 :                           pair_offsets=task_list%pair_offsets_recv)
    2604             : 
    2605      216354 :    END SUBROUTINE rs_copy_to_buffer
    2606             : 
    2607             : ! **************************************************************************************************
    2608             : !> \brief Copies from buffer into DBCSR matrics, replaces rs_gather_matrix for non-distributed grids.
    2609             : !> \author Ole Schuett
    2610             : ! **************************************************************************************************
    2611      181737 :    SUBROUTINE rs_copy_to_matrices(src_buffer, dest_matrices, task_list)
    2612             :       TYPE(offload_buffer_type), INTENT(IN)              :: src_buffer
    2613             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2614             :       TYPE(task_list_type), INTENT(IN)                   :: task_list
    2615             : 
    2616             :       CALL rs_unpack_buffer(src_buffer=src_buffer%host_buffer, &
    2617             :                             dest_matrices=dest_matrices, &
    2618             :                             atom_pair=task_list%atom_pair_recv, &
    2619      181737 :                             pair_offsets=task_list%pair_offsets_recv)
    2620             : 
    2621      181737 :    END SUBROUTINE rs_copy_to_matrices
    2622             : 
    2623             : ! **************************************************************************************************
    2624             : !> \brief Helper routine for rs_scatter_matrix and rs_copy_to_buffer.
    2625             : !> \author Ole Schuett
    2626             : ! **************************************************************************************************
    2627      216620 :    SUBROUTINE rs_pack_buffer(src_matrices, dest_buffer, atom_pair, pair_offsets)
    2628             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(IN)       :: src_matrices
    2629             :       REAL(KIND=dp), DIMENSION(:), INTENT(INOUT)         :: dest_buffer
    2630             :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: atom_pair
    2631             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: pair_offsets
    2632             : 
    2633             :       INTEGER                                            :: acol, arow, img, i, offset, block_size
    2634             :       LOGICAL                                            :: found
    2635      216620 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: block
    2636             : 
    2637             : !$OMP PARALLEL DEFAULT(NONE), &
    2638             : !$OMP          SHARED(src_matrices,atom_pair,pair_offsets,dest_buffer), &
    2639      216620 : !$OMP          PRIVATE(acol,arow,img,i,offset,block_size,found,block)
    2640             : !$OMP DO schedule(guided)
    2641             :       DO i = 1, SIZE(atom_pair)
    2642             :          arow = atom_pair(i)%row
    2643             :          acol = atom_pair(i)%col
    2644             :          img = atom_pair(i)%image
    2645             :          CALL dbcsr_get_block_p(matrix=src_matrices(img)%matrix, row=arow, col=acol, &
    2646             :                                 block=block, found=found)
    2647             :          CPASSERT(found)
    2648             :          block_size = SIZE(block)
    2649             :          offset = pair_offsets(i)
    2650             :          dest_buffer(offset + 1:offset + block_size) = RESHAPE(block, shape=(/block_size/))
    2651             :       END DO
    2652             : !$OMP END DO
    2653             : !$OMP END PARALLEL
    2654             : 
    2655      216620 :    END SUBROUTINE rs_pack_buffer
    2656             : 
    2657             : ! **************************************************************************************************
    2658             : !> \brief Helper routine for rs_gather_matrix and rs_copy_to_matrices.
    2659             : !> \author Ole Schuett
    2660             : ! **************************************************************************************************
    2661      181955 :    SUBROUTINE rs_unpack_buffer(src_buffer, dest_matrices, atom_pair, pair_offsets)
    2662             :       REAL(KIND=dp), DIMENSION(:), INTENT(IN)            :: src_buffer
    2663             :       TYPE(dbcsr_p_type), DIMENSION(:), INTENT(INOUT)    :: dest_matrices
    2664             :       TYPE(atom_pair_type), DIMENSION(:), INTENT(IN)     :: atom_pair
    2665             :       INTEGER, DIMENSION(:), INTENT(IN)                  :: pair_offsets
    2666             : 
    2667             :       INTEGER                                            :: acol, arow, img, i, offset, &
    2668             :                                                             nrows, ncols, lock_num
    2669             :       LOGICAL                                            :: found
    2670      181955 :       REAL(KIND=dp), DIMENSION(:, :), POINTER            :: block
    2671      181955 :       INTEGER(kind=omp_lock_kind), ALLOCATABLE, DIMENSION(:) :: locks
    2672             : 
    2673             :       ! initialize locks
    2674      545865 :       ALLOCATE (locks(10*omp_get_max_threads()))
    2675     2001505 :       DO i = 1, SIZE(locks)
    2676     2001505 :          CALL omp_init_lock(locks(i))
    2677             :       END DO
    2678             : 
    2679             : !$OMP PARALLEL DEFAULT(NONE), &
    2680             : !$OMP          SHARED(src_buffer,atom_pair,pair_offsets,dest_matrices,locks), &
    2681      181955 : !$OMP          PRIVATE(acol,arow,img,i,offset,nrows,ncols,lock_num,found,block)
    2682             : !$OMP DO schedule(guided)
    2683             :       DO i = 1, SIZE(atom_pair)
    2684             :          arow = atom_pair(i)%row
    2685             :          acol = atom_pair(i)%col
    2686             :          img = atom_pair(i)%image
    2687             :          CALL dbcsr_get_block_p(matrix=dest_matrices(img)%matrix, row=arow, col=acol, &
    2688             :                                 block=block, found=found)
    2689             :          CPASSERT(found)
    2690             :          nrows = SIZE(block, 1)
    2691             :          ncols = SIZE(block, 2)
    2692             :          offset = pair_offsets(i)
    2693             :          lock_num = MODULO(arow, SIZE(locks)) + 1  ! map matrix rows round-robin to available locks
    2694             : 
    2695             :          CALL omp_set_lock(locks(lock_num))
    2696             :          block = block + RESHAPE(src_buffer(offset + 1:offset + nrows*ncols), shape=(/nrows, ncols/))
    2697             :          CALL omp_unset_lock(locks(lock_num))
    2698             :       END DO
    2699             : !$OMP END DO
    2700             : !$OMP END PARALLEL
    2701             : 
    2702             :       ! destroy locks
    2703     2001505 :       DO i = 1, SIZE(locks)
    2704     2001505 :          CALL omp_destroy_lock(locks(i))
    2705             :       END DO
    2706      181955 :       DEALLOCATE (locks)
    2707             : 
    2708      181955 :    END SUBROUTINE rs_unpack_buffer
    2709             : 
    2710             : ! **************************************************************************************************
    2711             : !> \brief determines the rank of the processor who's real rs grid contains point
    2712             : !> \param rs_desc ...
    2713             : !> \param igrid_level ...
    2714             : !> \param n_levels ...
    2715             : !> \param cube_center ...
    2716             : !> \param ntasks ...
    2717             : !> \param tasks ...
    2718             : !> \param lb_cube ...
    2719             : !> \param ub_cube ...
    2720             : !> \param added_tasks ...
    2721             : !> \par History
    2722             : !>      11.2007 created [MattW]
    2723             : !>      10.2008 rewritten [Joost VandeVondele]
    2724             : !> \author MattW
    2725             : ! **************************************************************************************************
    2726        1380 :    SUBROUTINE rs_find_node(rs_desc, igrid_level, n_levels, cube_center, ntasks, tasks, &
    2727             :                            lb_cube, ub_cube, added_tasks)
    2728             : 
    2729             :       TYPE(realspace_grid_desc_type), POINTER            :: rs_desc
    2730             :       INTEGER, INTENT(IN)                                :: igrid_level, n_levels
    2731             :       INTEGER, DIMENSION(3), INTENT(IN)                  :: cube_center
    2732             :       INTEGER, INTENT(INOUT)                             :: ntasks
    2733             :       TYPE(task_type), DIMENSION(:), POINTER             :: tasks
    2734             :       INTEGER, DIMENSION(3), INTENT(IN)                  :: lb_cube, ub_cube
    2735             :       INTEGER, INTENT(OUT)                               :: added_tasks
    2736             : 
    2737             :       INTEGER, PARAMETER                                 :: add_tasks = 1000
    2738             :       REAL(kind=dp), PARAMETER                           :: mult_tasks = 2.0_dp
    2739             : 
    2740             :       INTEGER :: bit_index, coord(3), curr_tasks, dest, i, icoord(3), idest, itask, ix, iy, iz, &
    2741             :                  lb_coord(3), lb_domain(3), lbc(3), ub_coord(3), ub_domain(3), ubc(3)
    2742             :       INTEGER                                            :: bit_pattern
    2743             :       LOGICAL                                            :: dir_periodic(3)
    2744             : 
    2745        1380 :       coord(1) = rs_desc%x2coord(cube_center(1))
    2746        1380 :       coord(2) = rs_desc%y2coord(cube_center(2))
    2747        1380 :       coord(3) = rs_desc%z2coord(cube_center(3))
    2748        1380 :       dest = rs_desc%coord2rank(coord(1), coord(2), coord(3))
    2749             : 
    2750             :       ! the real cube coordinates
    2751        5520 :       lbc = lb_cube + cube_center
    2752        5520 :       ubc = ub_cube + cube_center
    2753             : 
    2754        9660 :       IF (ALL((rs_desc%lb_global(:, dest) - rs_desc%border) .LE. lbc) .AND. &
    2755             :           ALL((rs_desc%ub_global(:, dest) + rs_desc%border) .GE. ubc)) THEN
    2756             :          !standard distributed collocation/integration
    2757        1380 :          tasks(ntasks)%destination = encode_rank(dest, igrid_level, n_levels)
    2758        1380 :          tasks(ntasks)%dist_type = 1
    2759        1380 :          tasks(ntasks)%subpatch_pattern = 0
    2760        1380 :          added_tasks = 1
    2761             : 
    2762             :          ! here we figure out if there is an alternate location for this task
    2763             :          ! i.e. even though the cube_center is not in the real local domain,
    2764             :          ! it might fully fit in the halo of the neighbor
    2765             :          ! if its radius is smaller than the maximum radius
    2766             :          ! the list of possible neighbors is stored here as a bit pattern
    2767             :          ! to reconstruct what the rank of the neigbor is,
    2768             :          ! we can use (note this requires the correct rs_grid)
    2769             :          !  IF (BTEST(bit_pattern,0)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/-1,0,0/))
    2770             :          !  IF (BTEST(bit_pattern,1)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/+1,0,0/))
    2771             :          !  IF (BTEST(bit_pattern,2)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,-1,0/))
    2772             :          !  IF (BTEST(bit_pattern,3)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,+1,0/))
    2773             :          !  IF (BTEST(bit_pattern,4)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,0,-1/))
    2774             :          !  IF (BTEST(bit_pattern,5)) rank=rs_grid_locate_rank(rs_desc,tasks(ntasks)%destination,(/0,0,+1/))
    2775        1380 :          bit_index = 0
    2776        1380 :          bit_pattern = 0
    2777        5520 :          DO i = 1, 3
    2778        5520 :             IF (rs_desc%perd(i) == 1) THEN
    2779        2760 :                bit_pattern = IBCLR(bit_pattern, bit_index)
    2780        2760 :                bit_index = bit_index + 1
    2781        2760 :                bit_pattern = IBCLR(bit_pattern, bit_index)
    2782        2760 :                bit_index = bit_index + 1
    2783             :             ELSE
    2784             :                ! fits the left neighbor ?
    2785        1380 :                IF (ubc(i) <= rs_desc%lb_global(i, dest) - 1 + rs_desc%border) THEN
    2786         599 :                   bit_pattern = IBSET(bit_pattern, bit_index)
    2787         599 :                   bit_index = bit_index + 1
    2788             :                ELSE
    2789         781 :                   bit_pattern = IBCLR(bit_pattern, bit_index)
    2790         781 :                   bit_index = bit_index + 1
    2791             :                END IF
    2792             :                ! fits the right neighbor ?
    2793        1380 :                IF (lbc(i) >= rs_desc%ub_global(i, dest) + 1 - rs_desc%border) THEN
    2794         250 :                   bit_pattern = IBSET(bit_pattern, bit_index)
    2795         250 :                   bit_index = bit_index + 1
    2796             :                ELSE
    2797        1130 :                   bit_pattern = IBCLR(bit_pattern, bit_index)
    2798        1130 :                   bit_index = bit_index + 1
    2799             :                END IF
    2800             :             END IF
    2801             :          END DO
    2802        1380 :          tasks(ntasks)%subpatch_pattern = bit_pattern
    2803             : 
    2804             :       ELSE
    2805             :          ! generalised collocation/integration needed
    2806             :          ! first we figure out how many neighbors we have to add to include the lbc/ubc
    2807             :          ! in the available domains (inclusive of halo points)
    2808             :          ! first we 'ignore' periodic boundary conditions
    2809             :          ! i.e. ub_coord-lb_coord+1 might be larger than group_dim
    2810           0 :          lb_coord = coord
    2811           0 :          ub_coord = coord
    2812           0 :          lb_domain = rs_desc%lb_global(:, dest) - rs_desc%border
    2813           0 :          ub_domain = rs_desc%ub_global(:, dest) + rs_desc%border
    2814           0 :          DO i = 1, 3
    2815             :             ! only if the grid is not periodic in this direction we need to take care of adding neighbors
    2816           0 :             IF (rs_desc%perd(i) == 0) THEN
    2817             :                ! if the domain lower bound is greater than the lbc we need to add the size of the neighbor domain
    2818           0 :                DO
    2819           0 :                   IF (lb_domain(i) > lbc(i)) THEN
    2820           0 :                      lb_coord(i) = lb_coord(i) - 1
    2821           0 :                      icoord = MODULO(lb_coord, rs_desc%group_dim)
    2822           0 :                      idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2823           0 :                      lb_domain(i) = lb_domain(i) - (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
    2824             :                   ELSE
    2825             :                      EXIT
    2826             :                   END IF
    2827             :                END DO
    2828             :                ! same for the upper bound
    2829           0 :                DO
    2830           0 :                   IF (ub_domain(i) < ubc(i)) THEN
    2831           0 :                      ub_coord(i) = ub_coord(i) + 1
    2832           0 :                      icoord = MODULO(ub_coord, rs_desc%group_dim)
    2833           0 :                      idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2834           0 :                      ub_domain(i) = ub_domain(i) + (rs_desc%ub_global(i, idest) - rs_desc%lb_global(i, idest) + 1)
    2835             :                   ELSE
    2836             :                      EXIT
    2837             :                   END IF
    2838             :                END DO
    2839             :             END IF
    2840             :          END DO
    2841             : 
    2842             :          ! some care is needed for the periodic boundaries
    2843           0 :          DO i = 1, 3
    2844           0 :             IF (ub_domain(i) - lb_domain(i) + 1 >= rs_desc%npts(i)) THEN
    2845           0 :                dir_periodic(i) = .TRUE.
    2846           0 :                lb_coord(i) = 0
    2847           0 :                ub_coord(i) = rs_desc%group_dim(i) - 1
    2848             :             ELSE
    2849           0 :                dir_periodic(i) = .FALSE.
    2850             :             END IF
    2851             :          END DO
    2852             : 
    2853           0 :          added_tasks = PRODUCT(ub_coord - lb_coord + 1)
    2854           0 :          itask = ntasks
    2855           0 :          ntasks = ntasks + added_tasks - 1
    2856           0 :          IF (ntasks > SIZE(tasks)) THEN
    2857           0 :             curr_tasks = INT((SIZE(tasks) + add_tasks)*mult_tasks)
    2858           0 :             CALL reallocate_tasks(tasks, curr_tasks)
    2859             :          END IF
    2860           0 :          DO iz = lb_coord(3), ub_coord(3)
    2861           0 :          DO iy = lb_coord(2), ub_coord(2)
    2862           0 :          DO ix = lb_coord(1), ub_coord(1)
    2863           0 :             icoord = MODULO((/ix, iy, iz/), rs_desc%group_dim)
    2864           0 :             idest = rs_desc%coord2rank(icoord(1), icoord(2), icoord(3))
    2865           0 :             tasks(itask)%destination = encode_rank(idest, igrid_level, n_levels)
    2866           0 :             tasks(itask)%dist_type = 2
    2867           0 :             tasks(itask)%subpatch_pattern = 0
    2868             :             ! encode the domain size for this task
    2869             :             ! if the bit is set, we need to add the border in that direction
    2870           0 :             IF (ix == lb_coord(1) .AND. .NOT. dir_periodic(1)) &
    2871           0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 0)
    2872           0 :             IF (ix == ub_coord(1) .AND. .NOT. dir_periodic(1)) &
    2873           0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 1)
    2874           0 :             IF (iy == lb_coord(2) .AND. .NOT. dir_periodic(2)) &
    2875           0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 2)
    2876           0 :             IF (iy == ub_coord(2) .AND. .NOT. dir_periodic(2)) &
    2877           0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 3)
    2878           0 :             IF (iz == lb_coord(3) .AND. .NOT. dir_periodic(3)) &
    2879           0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 4)
    2880           0 :             IF (iz == ub_coord(3) .AND. .NOT. dir_periodic(3)) &
    2881           0 :                tasks(itask)%subpatch_pattern = IBSET(tasks(itask)%subpatch_pattern, 5)
    2882           0 :             itask = itask + 1
    2883             :          END DO
    2884             :          END DO
    2885             :          END DO
    2886             :       END IF
    2887             : 
    2888        1380 :    END SUBROUTINE rs_find_node
    2889             : 
    2890             : ! **************************************************************************************************
    2891             : !> \brief utility functions for encoding the grid level with a rank, allowing
    2892             : !>        different grid levels to maintain a different rank ordering without
    2893             : !>        losing information.  These encoded_ints are stored in the tasks(1:2,:) array
    2894             : !> \param rank ...
    2895             : !> \param grid_level ...
    2896             : !> \param n_levels ...
    2897             : !> \return ...
    2898             : !> \par History
    2899             : !>      4.2009 created [Iain Bethune]
    2900             : !>        (c) The Numerical Algorithms Group (NAG) Ltd, 2009 on behalf of the HECToR project
    2901             : ! **************************************************************************************************
    2902    15653590 :    FUNCTION encode_rank(rank, grid_level, n_levels) RESULT(encoded_int)
    2903             : 
    2904             :       INTEGER, INTENT(IN)                                :: rank, grid_level, n_levels
    2905             :       INTEGER                                            :: encoded_int
    2906             : 
    2907             : ! ordered so can still sort by rank
    2908             : 
    2909    15653590 :       encoded_int = rank*n_levels + grid_level - 1
    2910             : 
    2911    15653590 :    END FUNCTION
    2912             : 
    2913             : ! **************************************************************************************************
    2914             : !> \brief ...
    2915             : !> \param encoded_int ...
    2916             : !> \param n_levels ...
    2917             : !> \return ...
    2918             : ! **************************************************************************************************
    2919     7872785 :    FUNCTION decode_rank(encoded_int, n_levels) RESULT(rank)
    2920             : 
    2921             :       INTEGER, INTENT(IN)                                :: encoded_int
    2922             :       INTEGER, INTENT(IN)                                :: n_levels
    2923             :       INTEGER                                            :: rank
    2924             : 
    2925     7872785 :       rank = INT(encoded_int/n_levels)
    2926             : 
    2927     7872785 :    END FUNCTION
    2928             : 
    2929             : ! **************************************************************************************************
    2930             : !> \brief ...
    2931             : !> \param encoded_int ...
    2932             : !> \param n_levels ...
    2933             : !> \return ...
    2934             : ! **************************************************************************************************
    2935        7236 :    FUNCTION decode_level(encoded_int, n_levels) RESULT(grid_level)
    2936             : 
    2937             :       INTEGER, INTENT(IN)                                :: encoded_int
    2938             :       INTEGER, INTENT(IN)                                :: n_levels
    2939             :       INTEGER                                            :: grid_level
    2940             : 
    2941        7236 :       grid_level = INT(MODULO(encoded_int, n_levels)) + 1
    2942             : 
    2943        7236 :    END FUNCTION decode_level
    2944             : 
    2945             : ! **************************************************************************************************
    2946             : !> \brief Sort pgf index pair (ipgf,iset,iatom),(jpgf,jset,jatom) for which all atom pairs are
    2947             : !>        grouped, and for each atom pair all set pairs are grouped and for each set pair,
    2948             : !>        all pgfs are grouped.
    2949             : !>        This yields the right order of the tasks for collocation after the sort
    2950             : !>        in distribute_tasks. E.g. for a atom pair, all sets and pgfs are computed in one go.
    2951             : !>        The exception is the gridlevel. Tasks are first ordered wrt to grid_level. This implies
    2952             : !>        that a given density matrix block will be decontracted several times,
    2953             : !>        but cache effects on the grid make up for this.
    2954             : !> \param a ...
    2955             : !> \param b ...
    2956             : !> \return ...
    2957             : !> \author Ole Schuett
    2958             : ! **************************************************************************************************
    2959    73910348 :    PURE FUNCTION tasks_less_than(a, b) RESULT(res)
    2960             :       TYPE(task_type), INTENT(IN)                        :: a, b
    2961             :       LOGICAL                                            :: res
    2962             : 
    2963    73910348 :       IF (a%grid_level /= b%grid_level) THEN
    2964    18937732 :          res = a%grid_level < b%grid_level
    2965             : 
    2966    54972616 :       ELSE IF (a%image /= b%image) THEN
    2967     5229246 :          res = a%image < b%image
    2968             : 
    2969    49743370 :       ELSE IF (a%iatom /= b%iatom) THEN
    2970     6390010 :          res = a%iatom < b%iatom
    2971             : 
    2972    43353360 :       ELSE IF (a%jatom /= b%jatom) THEN
    2973     6889574 :          res = a%jatom < b%jatom
    2974             : 
    2975    36463786 :       ELSE IF (a%iset /= b%iset) THEN
    2976     5818361 :          res = a%iset < b%iset
    2977             : 
    2978    30645425 :       ELSE IF (a%jset /= b%jset) THEN
    2979     6228658 :          res = a%jset < b%jset
    2980             : 
    2981    24416767 :       ELSE IF (a%ipgf /= b%ipgf) THEN
    2982     6218530 :          res = a%ipgf < b%ipgf
    2983             : 
    2984             :       ELSE
    2985    18198237 :          res = a%jpgf < b%jpgf
    2986             : 
    2987             :       END IF
    2988    73910348 :    END FUNCTION tasks_less_than
    2989             : 
    2990   175403383 :    #:call array_sort(prefix='tasks', type='TYPE(task_type)')
    2991             :    #:endcall
    2992             : 
    2993             : ! **************************************************************************************************
    2994             : !> \brief Order atom pairs to find duplicates.
    2995             : !> \param a ...
    2996             : !> \param b ...
    2997             : !> \return ...
    2998             : !> \author Ole Schuett
    2999             : ! **************************************************************************************************
    3000    31430743 :    PURE FUNCTION atom_pair_less_than(a, b) RESULT(res)
    3001             :       TYPE(atom_pair_type), INTENT(IN)                   :: a, b
    3002             :       LOGICAL                                            :: res
    3003             : 
    3004    31430743 :       IF (a%rank /= b%rank) THEN
    3005        4243 :          res = a%rank < b%rank
    3006             : 
    3007    31426500 :       ELSE IF (a%row /= b%row) THEN
    3008     5144387 :          res = a%row < b%row
    3009             : 
    3010    26282113 :       ELSE IF (a%col /= b%col) THEN
    3011     5164594 :          res = a%col < b%col
    3012             : 
    3013             :       ELSE
    3014    21117519 :          res = a%image < b%image
    3015             : 
    3016             :       END IF
    3017    31430743 :    END FUNCTION atom_pair_less_than
    3018             : 
    3019    66593341 :    #:call array_sort(prefix='atom_pair', type='TYPE(atom_pair_type)')
    3020             :    #:endcall
    3021             : 
    3022             : END MODULE task_list_methods

Generated by: LCOV version 1.15