Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \note
10 : !> This module implements a modified version of the submatrix method, proposed in
11 : !> M. Lass, S. Mohr, H. Wiebeler, T. Kuehne, C. Plessl
12 : !> A Massively Parallel Algorithm for the Approximate Calculation of Inverse p-th Roots of Large Sparse Matrices
13 : !> Proc. Platform for Advanced Scientific Computing (PASC) Conference, ACM, 2018
14 : !>
15 : !> The method is extended to minimize the required data transfers and floating-point operations under the assumption that non-zero
16 : !> blocks of the matrix are close to its diagonal.
17 : !>
18 : !> Submatrices can be constructed not for single columns of the matrix but for a set of w consecutive submatrices. The underlying
19 : !> assumption is that columns next to each other have relatively similar occupation patterns, i.e., constructing a submatrix from
20 : !> columns x and x+1 should not lead to a much bigger submatrix than contructing it only from column x.
21 : !>
22 : !> The construction of the submatrices requires communication between all ranks. It is crucial to implement this communication as
23 : !> efficient as possible, i.e., data should only ever be transferred once between two ranks and message sizes need to be
24 : !> sufficiently large to utilize the communication bandwidth. To achieve this, we communicate the required blocks for all
25 : !> submatrices at once and copy them to large buffers before transmitting them via MPI.
26 : !>
27 : !> Note on multi-threading:
28 : !> Submatrices can be constructed and processed in parallel by multiple threads. However, generate_submatrix, get_sm_ids_for_rank
29 : !> and copy_resultcol are the only thread-safe routines in this module. All other routines involve MPI communication or operate on
30 : !> common, non-protected data and are hence not thread-safe.
31 : !>
32 : !> TODO:
33 : !> * generic types (for now only dp supported)
34 : !> * optimization of threaded initialization
35 : !> * sanity checks at the beginning of all methods
36 : !>
37 : !> \author Michael Lass
38 : ! **************************************************************************************************
39 :
40 : MODULE submatrix_dissection
41 :
42 : USE bibliography, ONLY: Lass2018,&
43 : cite_reference
44 : USE cp_dbcsr_api, ONLY: &
45 : dbcsr_distribution_get, dbcsr_distribution_type, dbcsr_finalize, dbcsr_get_block_p, &
46 : dbcsr_get_info, dbcsr_get_stored_coordinates, dbcsr_iterator_blocks_left, &
47 : dbcsr_iterator_next_block, dbcsr_iterator_start, dbcsr_iterator_stop, dbcsr_iterator_type, &
48 : dbcsr_put_block, dbcsr_type
49 : USE kinds, ONLY: dp
50 : USE message_passing, ONLY: mp_comm_type
51 : USE submatrix_types, ONLY: buffer_type,&
52 : bufptr_type,&
53 : intBuffer_type,&
54 : set_type,&
55 : setarray_type
56 :
57 : !$ USE omp_lib, ONLY: omp_get_max_threads, omp_get_thread_num
58 :
59 : #include "./base/base_uses.f90"
60 :
61 : IMPLICIT NONE
62 : PRIVATE
63 :
64 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'submatrix_dissection'
65 :
66 : TYPE, PUBLIC :: submatrix_dissection_type
67 : TYPE(dbcsr_type) :: dbcsr_mat
68 : TYPE(dbcsr_distribution_type) :: dist
69 : LOGICAL :: initialized = .FALSE.
70 : TYPE(mp_comm_type) :: group = mp_comm_type()
71 : INTEGER :: numnodes = -1, myrank = -1, nblkcols = -1, &
72 : nblkrows = -1, nblks = -1, local_blocks = -1, &
73 : cols_per_sm = -1, number_of_submatrices = -1
74 : INTEGER, DIMENSION(:), POINTER :: row_blk_size => NULL(), col_blk_size => NULL()
75 : INTEGER, DIMENSION(:), ALLOCATABLE :: coo_cols, coo_rows, coo_col_offsets, coo_cols_local, coo_rows_local, &
76 : coo_col_offsets_local, submatrix_owners, submatrix_sizes
77 : TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: recvbufs, result_sendbufs ! Indexing starts with 0 to match rank ids!
78 : TYPE(set_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank, result_blocks_from_rank
79 : TYPE(bufptr_type), DIMENSION(:), ALLOCATABLE :: coo_dptr
80 : TYPE(intBuffer_type), DIMENSION(:), ALLOCATABLE :: result_blocks_for_rank_buf_offsets
81 : CONTAINS
82 : PROCEDURE :: init => submatrix_dissection_init
83 : PROCEDURE :: final => submatrix_dissection_final
84 : PROCEDURE :: get_sm_ids_for_rank => submatrix_get_sm_ids_for_rank
85 : PROCEDURE :: generate_submatrix => submatrix_generate_sm
86 : PROCEDURE :: copy_resultcol => submatrix_cpy_resultcol
87 : PROCEDURE :: communicate_results => submatrix_communicate_results
88 : PROCEDURE :: get_relevant_sm_columns => submatrix_get_relevant_sm_columns
89 : END TYPE submatrix_dissection_type
90 :
91 : CONTAINS
92 :
93 : ! **************************************************************************************************
94 : !> \brief determine which columns of the submatrix are relevant for the result matrix
95 : !> \param this - object of class submatrix_dissection_type
96 : !> \param sm_id - id of the submatrix
97 : !> \param first - first column of submatrix that is relevant
98 : !> \param last - last column of submatrix that is relevant
99 : ! **************************************************************************************************
100 36 : SUBROUTINE submatrix_get_relevant_sm_columns(this, sm_id, first, last)
101 : CLASS(submatrix_dissection_type), INTENT(IN) :: this
102 : INTEGER, INTENT(IN) :: sm_id
103 : INTEGER, INTENT(OUT) :: first, last
104 : INTEGER :: i, j, blkid
105 9288 : TYPE(set_type) :: nonzero_rows
106 :
107 : ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
108 72 : DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
109 108 : DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
110 72 : CALL nonzero_rows%insert(this%coo_rows(j))
111 : END DO
112 : END DO
113 :
114 36 : first = 1
115 36 : DO i = 1, nonzero_rows%elements
116 36 : blkid = nonzero_rows%get(i)
117 36 : IF (blkid .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
118 : ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
119 : ! Now add up block sizes to determine the last one as well
120 36 : last = first - 1
121 36 : DO j = i, nonzero_rows%elements
122 36 : blkid = nonzero_rows%get(j)
123 36 : last = last + this%col_blk_size(blkid)
124 36 : IF (blkid .EQ. (sm_id)*this%cols_per_sm) EXIT
125 : END DO
126 : EXIT
127 : END IF
128 0 : first = first + this%col_blk_size(blkid)
129 : END DO
130 :
131 36 : CALL nonzero_rows%reset
132 :
133 9288 : END SUBROUTINE submatrix_get_relevant_sm_columns
134 :
135 : ! **************************************************************************************************
136 : !> \brief initialize submatrix dissection and communicate, needs to be called before constructing any submatrices.
137 : !> \param this - object of class submatrix_dissection_type
138 : !> \param matrix_p - dbcsr input matrix
139 : !> \par History
140 : !> 2020.02 created [Michael Lass]
141 : !> 2020.05 add time measurements [Michael Lass]
142 : ! **************************************************************************************************
143 10 : SUBROUTINE submatrix_dissection_init(this, matrix_p) ! Should be PURE but the iterator routines are not
144 : CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
145 : TYPE(dbcsr_type), INTENT(IN) :: matrix_p
146 :
147 : INTEGER :: cur_row, cur_col, cur_blk, i, j, k, l, m, l_limit_left, l_limit_right, &
148 : bufsize, bufsize_next
149 20 : INTEGER, DIMENSION(:), ALLOCATABLE :: blocks_per_rank, coo_dsplcmnts, num_blockids_send, num_blockids_recv
150 : TYPE(dbcsr_iterator_type) :: iter
151 2580 : TYPE(set_type) :: nonzero_rows
152 : REAL(KIND=dp) :: flops_total, flops_per_rank, flops_per_sm, flops_threshold, &
153 : flops_current, flops_remaining
154 :
155 : ! Indexing for the following arrays starts with 0 to match rank ids
156 10 : TYPE(set_type), DIMENSION(:), ALLOCATABLE :: blocks_from_rank
157 10 : TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: sendbufs
158 10 : TYPE(intBuffer_type), DIMENSION(:), ALLOCATABLE :: blocks_for_rank
159 :
160 : ! Additional structures for threaded parts
161 : INTEGER :: numthreads, mytid, group_handle
162 : ! Indexing starts at 0 to match thread ids
163 10 : TYPE(setarray_type), DIMENSION(:), ALLOCATABLE :: nonzero_rows_t
164 10 : TYPE(setarray_type), DIMENSION(:), ALLOCATABLE :: result_blocks_from_rank_t, result_blocks_for_rank_t, &
165 10 : blocks_from_rank_t
166 :
167 : LOGICAL :: valid
168 10 : REAL(KIND=dp), DIMENSION(:), POINTER :: blockp
169 :
170 : CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_dissection_init'
171 : INTEGER :: handle
172 :
173 10 : CALL timeset(routineN, handle)
174 10 : CALL cite_reference(Lass2018)
175 :
176 10 : this%dbcsr_mat = matrix_p
177 : CALL dbcsr_get_info(matrix=this%dbcsr_mat, nblkcols_total=this%nblkcols, nblkrows_total=this%nblkrows, &
178 10 : row_blk_size=this%row_blk_size, col_blk_size=this%col_blk_size, group=group_handle, distribution=this%dist)
179 10 : CALL dbcsr_distribution_get(dist=this%dist, mynode=this%myrank, numnodes=this%numnodes)
180 :
181 10 : CALL this%group%set_handle(group_handle)
182 :
183 10 : IF (this%nblkcols .NE. this%nblkrows) THEN
184 0 : CPABORT("Number of block rows and cols need to be identical")
185 : END IF
186 :
187 20 : DO i = 1, this%nblkcols
188 20 : IF (this%col_blk_size(i) .NE. this%row_blk_size(i)) THEN
189 0 : CPABORT("Block row sizes and col sizes need to be identical")
190 : END IF
191 : END DO
192 :
193 : ! TODO: We could do even more sanity checks here, e.g., the matrix must not be stored symmetrically
194 :
195 : ! For the submatrix method, we need global knwoledge about which blocks are actually used. Therefore, we create a COO
196 : ! representation of the blocks (without their contents) on all ranks.
197 :
198 : ! TODO: Right now, the COO contains all blocks. Also we transfer all blocks. We could skip half of them due to the matrix
199 : ! being symmetric (however, we need to transpose the blocks). This can increase performance only by a factor of 2 and is
200 : ! therefore deferred.
201 :
202 : ! Determine number of locally stored blocks
203 10 : this%local_blocks = 0
204 10 : CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.TRUE.)
205 15 : DO WHILE (dbcsr_iterator_blocks_left(iter))
206 5 : CALL dbcsr_iterator_next_block(iter, cur_row, cur_col, cur_blk)
207 5 : this%local_blocks = this%local_blocks + 1
208 : END DO
209 10 : CALL dbcsr_iterator_stop(iter)
210 :
211 0 : ALLOCATE (this%coo_cols_local(this%local_blocks), this%coo_rows_local(this%local_blocks), blocks_per_rank(this%numnodes), &
212 0 : coo_dsplcmnts(this%numnodes), this%coo_col_offsets_local(this%nblkcols + 1), &
213 : blocks_for_rank(0:this%numnodes - 1), blocks_from_rank(0:this%numnodes - 1), &
214 0 : sendbufs(0:this%numnodes - 1), this%recvbufs(0:this%numnodes - 1), this%result_sendbufs(0:this%numnodes - 1), &
215 0 : this%result_blocks_for_rank(0:this%numnodes - 1), this%result_blocks_from_rank(0:this%numnodes - 1), &
216 23540 : this%result_blocks_for_rank_buf_offsets(0:this%numnodes - 1))
217 :
218 10 : i = 1
219 10 : CALL dbcsr_iterator_start(iter, this%dbcsr_mat, read_only=.TRUE.)
220 15 : DO WHILE (dbcsr_iterator_blocks_left(iter))
221 5 : CALL dbcsr_iterator_next_block(iter, cur_row, cur_col, cur_blk)
222 5 : this%coo_cols_local(i) = cur_col
223 5 : this%coo_rows_local(i) = cur_row
224 5 : i = i + 1
225 : END DO
226 10 : CALL dbcsr_iterator_stop(iter)
227 :
228 : ! We only know how many blocks we own. What's with the other ranks?
229 10 : CALL this%group%allgather(msgout=this%local_blocks, msgin=blocks_per_rank)
230 10 : coo_dsplcmnts(1) = 0
231 20 : DO i = 1, this%numnodes - 1
232 20 : coo_dsplcmnts(i + 1) = coo_dsplcmnts(i) + blocks_per_rank(i)
233 : END DO
234 :
235 : ! Get a global view on the matrix
236 30 : this%nblks = SUM(blocks_per_rank)
237 0 : ALLOCATE (this%coo_cols(this%nblks), this%coo_rows(this%nblks), this%coo_col_offsets(this%nblkcols + 1), &
238 100 : this%coo_dptr(this%nblks))
239 : CALL this%group%allgatherv(msgout=this%coo_rows_local, msgin=this%coo_rows, rcount=blocks_per_rank, &
240 10 : rdispl=coo_dsplcmnts)
241 : CALL this%group%allgatherv(msgout=this%coo_cols_local, msgin=this%coo_cols, rcount=blocks_per_rank, &
242 10 : rdispl=coo_dsplcmnts)
243 :
244 10 : DEALLOCATE (blocks_per_rank, coo_dsplcmnts)
245 :
246 : ! Sort COO arrays according to their columns
247 10 : CALL qsort_two(this%coo_cols_local, this%coo_rows_local, 1, this%local_blocks)
248 10 : CALL qsort_two(this%coo_cols, this%coo_rows, 1, this%nblks)
249 :
250 : ! Get COO array offsets for columns to accelerate lookups
251 10 : this%coo_col_offsets(this%nblkcols + 1) = this%nblks + 1
252 10 : j = 1
253 20 : DO i = 1, this%nblkcols
254 10 : DO WHILE ((j .LE. this%nblks))
255 10 : IF (this%coo_cols(j) .GE. i) EXIT
256 10 : j = j + 1
257 : END DO
258 20 : this%coo_col_offsets(i) = j
259 : END DO
260 :
261 : ! Same for local COO
262 10 : this%coo_col_offsets_local(this%nblkcols + 1) = this%local_blocks + 1
263 10 : j = 1
264 20 : DO i = 1, this%nblkcols
265 10 : DO WHILE ((j .LE. this%local_blocks))
266 5 : IF (this%coo_cols_local(j) .GE. i) EXIT
267 5 : j = j + 1
268 : END DO
269 20 : this%coo_col_offsets_local(i) = j
270 : END DO
271 :
272 : ! We could combine multiple columns to generate a single submatrix. For now, we have not found a practical use-case for this
273 : ! so we only look at single columns for now.
274 10 : this%cols_per_sm = 1
275 :
276 : ! Determine sizes of all submatrices. This is required in order to assess the computational effort that is required to process
277 : ! the submatrices.
278 10 : this%number_of_submatrices = this%nblkcols/this%cols_per_sm
279 30 : ALLOCATE (this%submatrix_sizes(this%number_of_submatrices))
280 20 : this%submatrix_sizes = 0
281 10 : flops_total = 0.0D0
282 20 : DO i = 1, this%number_of_submatrices
283 10 : CALL nonzero_rows%reset
284 20 : DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm ! all colums that determine submatrix i
285 30 : DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
286 20 : CALL nonzero_rows%insert(this%coo_rows(k))
287 : END DO
288 : END DO
289 20 : DO j = 1, nonzero_rows%elements
290 20 : this%submatrix_sizes(i) = this%submatrix_sizes(i) + this%col_blk_size(nonzero_rows%get(j))
291 : END DO
292 20 : flops_total = flops_total + 2.0D0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
293 : END DO
294 :
295 : ! Create mapping from submatrices to ranks. Since submatrices can be of different sizes, we need to perform some load
296 : ! balancing here. For that we assume that arithmetic operations performed on the submatrices scale cubically.
297 30 : ALLOCATE (this%submatrix_owners(this%number_of_submatrices))
298 10 : flops_per_sm = flops_total/this%number_of_submatrices
299 10 : flops_per_rank = flops_total/this%numnodes
300 10 : flops_current = 0.0D0
301 10 : j = 0
302 20 : DO i = 1, this%number_of_submatrices
303 10 : this%submatrix_owners(i) = j
304 10 : flops_current = flops_current + 2.0D0*this%submatrix_sizes(i)*this%submatrix_sizes(i)*this%submatrix_sizes(i)
305 10 : flops_remaining = flops_total - flops_current
306 10 : flops_threshold = (this%numnodes - j - 1)*flops_per_rank
307 : IF ((j .LT. (this%numnodes - 1)) &
308 10 : .AND. ((flops_remaining .LE. flops_threshold &
309 10 : .OR. (this%number_of_submatrices - i) .LT. (this%numnodes - j)))) THEN
310 10 : j = j + 1
311 10 : flops_total = flops_total - flops_current
312 10 : flops_current = 0.0D0
313 : END IF
314 : END DO
315 :
316 : ! Prepare data structures for multithreaded loop
317 10 : numthreads = 1
318 10 : !$ numthreads = omp_get_max_threads()
319 :
320 0 : ALLOCATE (result_blocks_from_rank_t(0:numthreads - 1), &
321 0 : result_blocks_for_rank_t(0:numthreads - 1), &
322 0 : blocks_from_rank_t(0:numthreads - 1), &
323 100 : nonzero_rows_t(0:numthreads - 1))
324 :
325 : ! Figure out which blocks we need to receive. Blocks are identified here as indices into our COO representation.
326 : ! TODO: This currently shows limited parallel efficiency. Investigate further.
327 :
328 : !$OMP PARALLEL DEFAULT(OMP_DEFAULT_NONE_WITH_OOP) &
329 : !$OMP NUM_THREADS(numthreads) &
330 : !$OMP PRIVATE(i,j,k,l,m,l_limit_left,l_limit_right,cur_col,cur_row,mytid) &
331 : !$OMP SHARED(result_blocks_from_rank_t,result_blocks_for_rank_t,blocks_from_rank_t,this,numthreads,nonzero_rows_t)
332 : mytid = 0
333 : !$ mytid = omp_get_thread_num()
334 :
335 : ALLOCATE (nonzero_rows_t(mytid)%sets(1), &
336 : result_blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1), &
337 : result_blocks_for_rank_t(mytid)%sets(0:this%numnodes - 1), &
338 : blocks_from_rank_t(mytid)%sets(0:this%numnodes - 1))
339 :
340 10 : !$OMP DO schedule(guided)
341 : DO i = 1, this%number_of_submatrices
342 : CALL nonzero_rows_t(mytid)%sets(1)%reset
343 : DO j = (i - 1)*this%cols_per_sm + 1, i*this%cols_per_sm ! all colums that determine submatrix i
344 : DO k = this%coo_col_offsets(j), this%coo_col_offsets(j + 1) - 1 ! all blocks that are within this column
345 : ! This block will be required to assemble the final block matrix as it is within an inducing column for submatrix i.
346 : ! Figure out who stores it and insert it into the result_blocks_* sets.
347 : CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=j, processor=m)
348 : IF (m .EQ. this%myrank) THEN
349 : CALL result_blocks_from_rank_t(mytid)%sets(this%submatrix_owners(i))%insert(k)
350 : END IF
351 : IF (this%submatrix_owners(i) .EQ. this%myrank) THEN
352 : CALL nonzero_rows_t(mytid)%sets(1)%insert(this%coo_rows(k))
353 : CALL result_blocks_for_rank_t(mytid)%sets(m)%insert(k)
354 : END IF
355 : END DO
356 : END DO
357 :
358 : IF (this%submatrix_owners(i) .NE. this%myrank) CYCLE
359 :
360 : ! In the following, we determine all blocks required to build the submatrix. We interpret nonzero_rows_t(mytid)(j) as
361 : ! column and nonzero_rows_t(mytid)(k) as row.
362 : DO j = 1, nonzero_rows_t(mytid)%sets(1)%elements
363 : cur_col = nonzero_rows_t(mytid)%sets(1)%get(j)
364 : l_limit_left = this%coo_col_offsets(cur_col)
365 : l_limit_right = this%coo_col_offsets(cur_col + 1) - 1
366 : DO k = 1, nonzero_rows_t(mytid)%sets(1)%elements
367 : cur_row = nonzero_rows_t(mytid)%sets(1)%get(k)
368 : l = l_limit_left
369 : DO WHILE (l .LE. l_limit_right)
370 : IF (this%coo_rows(l) .GE. cur_row) EXIT
371 : l = l + 1
372 : END DO
373 : l_limit_left = l
374 : IF (l .LE. l_limit_right) THEN
375 : IF (this%coo_rows(l) .EQ. cur_row) THEN
376 : ! We found a valid block. Figure out what to do with it.
377 : CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(l), &
378 : column=this%coo_cols(l), processor=m)
379 : CALL blocks_from_rank_t(mytid)%sets(m)%insert(l)
380 : END IF
381 : END IF
382 : END DO
383 : END DO
384 : END DO
385 : !$OMP END DO
386 : !$OMP END PARALLEL
387 :
388 : ! Merge partial results from threads
389 30 : DO i = 0, this%numnodes - 1
390 50 : DO j = 0, numthreads - 1
391 25 : DO k = 1, result_blocks_from_rank_t(j)%sets(i)%elements
392 25 : CALL this%result_blocks_from_rank(i)%insert(result_blocks_from_rank_t(j)%sets(i)%get(k))
393 : END DO
394 20 : CALL result_blocks_from_rank_t(j)%sets(i)%reset
395 25 : DO k = 1, result_blocks_for_rank_t(j)%sets(i)%elements
396 25 : CALL this%result_blocks_for_rank(i)%insert(result_blocks_for_rank_t(j)%sets(i)%get(k))
397 : END DO
398 20 : CALL result_blocks_for_rank_t(j)%sets(i)%reset
399 25 : DO k = 1, blocks_from_rank_t(j)%sets(i)%elements
400 25 : CALL blocks_from_rank(i)%insert(blocks_from_rank_t(j)%sets(i)%get(k))
401 : END DO
402 40 : CALL blocks_from_rank_t(j)%sets(i)%reset
403 : END DO
404 : END DO
405 20 : DO i = 0, numthreads - 1
406 10 : CALL nonzero_rows_t(i)%sets(1)%reset
407 0 : DEALLOCATE (result_blocks_from_rank_t(i)%sets, result_blocks_for_rank_t(i)%sets, blocks_from_rank_t(i)%sets, &
408 18080 : nonzero_rows_t(i)%sets)
409 : END DO
410 50 : DEALLOCATE (result_blocks_from_rank_t, result_blocks_for_rank_t, blocks_from_rank_t, nonzero_rows_t)
411 :
412 : ! Make other ranks aware of our needs
413 40 : ALLOCATE (num_blockids_send(0:this%numnodes - 1), num_blockids_recv(0:this%numnodes - 1))
414 30 : DO i = 0, this%numnodes - 1
415 30 : num_blockids_send(i) = blocks_from_rank(i)%elements
416 : END DO
417 10 : CALL this%group%alltoall(num_blockids_send, num_blockids_recv, 1)
418 30 : DO i = 0, this%numnodes - 1
419 30 : CALL blocks_for_rank(i)%alloc(num_blockids_recv(i))
420 : END DO
421 10 : DEALLOCATE (num_blockids_send, num_blockids_recv)
422 :
423 10 : IF (this%numnodes .GT. 1) THEN
424 30 : DO i = 1, this%numnodes
425 20 : k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
426 30 : CALL this%group%irecv(msgout=blocks_for_rank(k)%data, source=k, request=blocks_for_rank(k)%mpi_request)
427 : END DO
428 30 : DO i = 1, this%numnodes
429 20 : k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
430 30 : CALL this%group%send(blocks_from_rank(k)%getall(), k, 0)
431 : END DO
432 30 : DO i = 0, this%numnodes - 1
433 30 : CALL blocks_for_rank(i)%mpi_request%wait()
434 : END DO
435 : ELSE
436 0 : blocks_for_rank(0)%data = blocks_from_rank(0)%getall()
437 : END IF
438 :
439 : ! Free memory allocated in nonzero_rows
440 10 : CALL nonzero_rows%reset
441 :
442 : ! Make get calls on this%result_blocks_for_rank(...) threadsafe in other routines by updating the internal sorted list
443 30 : DO m = 0, this%numnodes - 1
444 30 : IF ((.NOT. this%result_blocks_for_rank(m)%sorted_up_to_date) .AND. (this%result_blocks_for_rank(m)%elements .GT. 0)) THEN
445 5 : CALL this%result_blocks_for_rank(m)%update_sorted
446 : END IF
447 : END DO
448 :
449 : ! Create and fill send buffers
450 30 : DO i = 0, this%numnodes - 1
451 20 : bufsize = 0
452 25 : DO j = 1, blocks_for_rank(i)%size
453 5 : k = blocks_for_rank(i)%data(j)
454 25 : bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
455 : END DO
456 20 : CALL sendbufs(i)%alloc(bufsize)
457 :
458 20 : bufsize = 0
459 20 : CALL this%result_blocks_for_rank_buf_offsets(i)%alloc(this%result_blocks_for_rank(i)%elements)
460 25 : DO j = 1, this%result_blocks_for_rank(i)%elements
461 5 : k = this%result_blocks_for_rank(i)%get(j)
462 5 : this%result_blocks_for_rank_buf_offsets(i)%data(j) = bufsize
463 25 : bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
464 : END DO
465 20 : CALL this%result_sendbufs(i)%alloc(bufsize)
466 :
467 20 : bufsize = 0
468 35 : DO j = 1, blocks_for_rank(i)%size
469 5 : k = blocks_for_rank(i)%data(j)
470 5 : CALL dbcsr_get_block_p(this%dbcsr_mat, row=this%coo_rows(k), col=this%coo_cols(k), block=blockp, found=valid)
471 5 : IF (.NOT. valid) THEN
472 0 : CPABORT("Block included in our COO and placed on our rank could not be fetched!")
473 : END IF
474 5 : bufsize_next = bufsize + SIZE(blockp)
475 185 : sendbufs(i)%data(bufsize + 1:bufsize_next) = blockp
476 30 : bufsize = bufsize_next
477 : END DO
478 : END DO
479 :
480 : ! Create receive buffers and mapping from blocks to memory locations
481 30 : DO i = 0, this%numnodes - 1
482 20 : bufsize = 0
483 25 : DO j = 1, blocks_from_rank(i)%elements
484 5 : k = blocks_from_rank(i)%get(j)
485 25 : bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
486 : END DO
487 20 : CALL this%recvbufs(i)%alloc(bufsize)
488 20 : bufsize = 0
489 35 : DO j = 1, blocks_from_rank(i)%elements
490 5 : k = blocks_from_rank(i)%get(j)
491 5 : bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
492 5 : this%coo_dptr(k)%target => this%recvbufs(i)%data(bufsize + 1:bufsize_next)
493 25 : bufsize = bufsize_next
494 : END DO
495 : END DO
496 :
497 30 : DO i = 0, this%numnodes - 1
498 20 : CALL blocks_for_rank(i)%dealloc
499 30 : CALL blocks_from_rank(i)%reset
500 : END DO
501 5170 : DEALLOCATE (blocks_for_rank, blocks_from_rank)
502 :
503 10 : IF (this%numnodes .GT. 1) THEN
504 : ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
505 : ! and first trying to receive from our left neighbor.
506 30 : DO i = 1, this%numnodes
507 20 : k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
508 20 : CALL this%group%irecv(msgout=this%recvbufs(k)%data, source=k, request=this%recvbufs(k)%mpi_request)
509 20 : k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
510 30 : CALL this%group%isend(msgin=sendbufs(k)%data, dest=k, request=sendbufs(k)%mpi_request)
511 : END DO
512 30 : DO i = 0, this%numnodes - 1
513 20 : CALL sendbufs(i)%mpi_request%wait()
514 30 : CALL this%recvbufs(i)%mpi_request%wait()
515 : END DO
516 : ELSE
517 0 : this%recvbufs(0)%data = sendbufs(0)%data
518 : END IF
519 :
520 30 : DO i = 0, this%numnodes - 1
521 30 : CALL sendbufs(i)%dealloc
522 : END DO
523 10 : DEALLOCATE (sendbufs)
524 :
525 10 : this%initialized = .TRUE.
526 :
527 10 : CALL timestop(handle)
528 2620 : END SUBROUTINE submatrix_dissection_init
529 :
530 : ! **************************************************************************************************
531 : !> \brief free all associated memory, afterwards submatrix_dissection_init needs to be called again
532 : !> \param this - object of class submatrix_dissection_type
533 : ! **************************************************************************************************
534 10 : PURE SUBROUTINE submatrix_dissection_final(this)
535 : CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
536 : INTEGER :: i
537 :
538 10 : this%initialized = .FALSE.
539 :
540 10 : IF (ALLOCATED(this%submatrix_sizes)) DEALLOCATE (this%submatrix_sizes)
541 10 : IF (ALLOCATED(this%coo_cols_local)) DEALLOCATE (this%coo_cols_local)
542 10 : IF (ALLOCATED(this%coo_rows_local)) DEALLOCATE (this%coo_rows_local)
543 10 : IF (ALLOCATED(this%coo_col_offsets_local)) DEALLOCATE (this%coo_col_offsets_local)
544 10 : IF (ALLOCATED(this%result_blocks_for_rank_buf_offsets)) THEN
545 30 : DO i = 0, this%numnodes - 1
546 30 : CALL this%result_blocks_for_rank_buf_offsets(i)%dealloc
547 : END DO
548 10 : DEALLOCATE (this%result_blocks_for_rank_buf_offsets)
549 : END IF
550 10 : IF (ALLOCATED(this%recvbufs)) THEN
551 30 : DO i = 0, this%numnodes - 1
552 30 : CALL this%recvbufs(i)%dealloc
553 : END DO
554 10 : DEALLOCATE (this%recvbufs)
555 : END IF
556 10 : IF (ALLOCATED(this%result_sendbufs)) THEN
557 30 : DO i = 0, this%numnodes - 1
558 30 : CALL this%result_sendbufs(i)%dealloc
559 : END DO
560 10 : DEALLOCATE (this%result_sendbufs)
561 : END IF
562 10 : IF (ALLOCATED(this%result_blocks_for_rank)) THEN
563 30 : DO i = 0, this%numnodes - 1
564 30 : CALL this%result_blocks_for_rank(i)%reset
565 : END DO
566 5170 : DEALLOCATE (this%result_blocks_for_rank)
567 : END IF
568 10 : IF (ALLOCATED(this%result_blocks_from_rank)) THEN
569 30 : DO i = 0, this%numnodes - 1
570 30 : CALL this%result_blocks_from_rank(i)%reset
571 : END DO
572 5170 : DEALLOCATE (this%result_blocks_from_rank)
573 : END IF
574 10 : IF (ALLOCATED(this%coo_cols)) DEALLOCATE (this%coo_cols)
575 10 : IF (ALLOCATED(this%coo_rows)) DEALLOCATE (this%coo_rows)
576 10 : IF (ALLOCATED(this%coo_col_offsets)) DEALLOCATE (this%coo_col_offsets)
577 10 : IF (ALLOCATED(this%coo_dptr)) DEALLOCATE (this%coo_dptr)
578 10 : IF (ALLOCATED(this%submatrix_owners)) DEALLOCATE (this%submatrix_owners)
579 :
580 10 : END SUBROUTINE submatrix_dissection_final
581 :
582 : ! **************************************************************************************************
583 : !> \brief generate a specific submatrix
584 : !> \param this - object of class submatrix_dissection_type
585 : !> \param sm_id - id of the submatrix to generate
586 : !> \param sm - generated submatrix
587 : ! **************************************************************************************************
588 6 : SUBROUTINE submatrix_generate_sm(this, sm_id, sm)
589 : CLASS(submatrix_dissection_type), INTENT(IN) :: this
590 : INTEGER, INTENT(IN) :: sm_id
591 : REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(OUT) :: sm
592 :
593 : INTEGER :: sm_dim, i, j, k, offset_x1, offset_x2, offset_y1, &
594 : offset_y2, k_limit_left, k_limit_right
595 1548 : TYPE(set_type) :: nonzero_rows
596 :
597 6 : IF (.NOT. this%initialized) THEN
598 0 : CPABORT("Submatrix dissection not initialized")
599 : END IF
600 :
601 6 : IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
602 0 : CPABORT("This rank is not supposed to construct this submatrix")
603 : END IF
604 :
605 : ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
606 6 : CALL nonzero_rows%reset
607 12 : DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
608 18 : DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
609 12 : CALL nonzero_rows%insert(this%coo_rows(j))
610 : END DO
611 : END DO
612 6 : sm_dim = 0
613 12 : DO i = 1, nonzero_rows%elements
614 12 : sm_dim = sm_dim + this%col_blk_size(nonzero_rows%get(i))
615 : END DO
616 :
617 24 : ALLOCATE (sm(sm_dim, sm_dim))
618 258 : sm = 0
619 :
620 6 : offset_x1 = 0
621 12 : DO j = 1, nonzero_rows%elements
622 6 : offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
623 6 : offset_y1 = 0
624 6 : k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
625 6 : k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
626 12 : DO i = 1, nonzero_rows%elements
627 6 : offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
628 : ! Copy block nonzero_rows(i),nonzero_rows(j) to sm(i,j) (if the block actually exists)
629 6 : k = k_limit_left
630 6 : DO WHILE (k .LE. k_limit_right)
631 6 : IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
632 0 : k = k + 1
633 : END DO
634 6 : k_limit_left = k
635 6 : IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
636 : sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2) = RESHAPE(this%coo_dptr(k)%target, &
637 270 : (/offset_y2 - offset_y1, offset_x2 - offset_x1/))
638 : END IF
639 12 : offset_y1 = offset_y2
640 : END DO
641 12 : offset_x1 = offset_x2
642 : END DO
643 :
644 : ! Free memory allocated in nonzero_rows
645 6 : CALL nonzero_rows%reset
646 :
647 1548 : END SUBROUTINE submatrix_generate_sm
648 :
649 : ! **************************************************************************************************
650 : !> \brief determine submatrix ids that are handled by a specific rank
651 : !> \param this - object of class submatrix_dissection_type
652 : !> \param rank - rank id of interest
653 : !> \param sm_ids - list of submatrix ids handled by that rank
654 : ! **************************************************************************************************
655 10 : SUBROUTINE submatrix_get_sm_ids_for_rank(this, rank, sm_ids)
656 : CLASS(submatrix_dissection_type), INTENT(IN) :: this
657 : INTEGER, INTENT(IN) :: rank
658 : INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(OUT) :: sm_ids
659 : INTEGER :: count, i
660 :
661 10 : IF (.NOT. this%initialized) THEN
662 0 : CPABORT("Submatrix dissection not initialized")
663 : END IF
664 :
665 10 : count = 0
666 20 : DO i = 1, this%number_of_submatrices
667 20 : IF (this%submatrix_owners(i) .EQ. rank) count = count + 1
668 : END DO
669 :
670 25 : ALLOCATE (sm_ids(count))
671 :
672 10 : count = 0
673 20 : DO i = 1, this%number_of_submatrices
674 20 : IF (this%submatrix_owners(i) .EQ. rank) THEN
675 5 : count = count + 1
676 5 : sm_ids(count) = i
677 : END IF
678 : END DO
679 :
680 10 : END SUBROUTINE submatrix_get_sm_ids_for_rank
681 :
682 : ! **************************************************************************************************
683 : !> \brief copy result columns from a submatrix into result buffer
684 : !> \param this - object of class submatrix_dissection_type
685 : !> \param sm_id - id of the submatrix
686 : !> \param sm - result-submatrix
687 : ! **************************************************************************************************
688 6 : SUBROUTINE submatrix_cpy_resultcol(this, sm_id, sm)
689 : CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
690 : INTEGER, INTENT(in) :: sm_id
691 : REAL(KIND=dp), DIMENSION(:, :), ALLOCATABLE, INTENT(IN) :: sm
692 :
693 1548 : TYPE(set_type) :: nonzero_rows
694 : INTEGER :: i, j, k, m, sm_col_offset, offset_x1, offset_x2, offset_y1, &
695 : offset_y2, bufsize, bufsize_next, k_limit_left, k_limit_right
696 6 : INTEGER, DIMENSION(:), ALLOCATABLE :: buf_offset_idxs
697 :
698 6 : IF (.NOT. this%initialized) THEN
699 0 : CPABORT("Submatrix dissection is not initizalized")
700 : END IF
701 :
702 6 : IF (this%myrank .NE. this%submatrix_owners(sm_id)) THEN
703 0 : CPABORT("This rank is not supposed to operate on this submatrix")
704 : END IF
705 :
706 18 : ALLOCATE (buf_offset_idxs(0:this%numnodes - 1))
707 18 : buf_offset_idxs = 1
708 :
709 : ! TODO: Should we buffer the list of non-zero rows for each submatrix instead of recalculating it each time?
710 6 : sm_col_offset = 0
711 12 : DO i = (sm_id - 1)*this%cols_per_sm + 1, sm_id*this%cols_per_sm ! all colums that determine submatrix sm_id
712 18 : DO j = this%coo_col_offsets(i), this%coo_col_offsets(i + 1) - 1 ! all blocks that are within this column
713 12 : CALL nonzero_rows%insert(this%coo_rows(j))
714 : END DO
715 : END DO
716 :
717 6 : sm_col_offset = 0
718 6 : DO i = 1, nonzero_rows%elements
719 6 : IF (nonzero_rows%get(i) .EQ. (sm_id - 1)*this%cols_per_sm + 1) THEN
720 : ! We just found the nonzero row that corresponds to the first inducing column of our submatrix
721 6 : sm_col_offset = i
722 6 : EXIT
723 : END IF
724 : END DO
725 6 : IF (sm_col_offset .EQ. 0) THEN
726 0 : CPABORT("Could not determine relevant result columns of submatrix")
727 : END IF
728 :
729 6 : offset_x1 = 0
730 12 : DO j = 1, nonzero_rows%elements
731 6 : offset_x2 = offset_x1 + this%col_blk_size(nonzero_rows%get(j))
732 : ! We only want to copy the blocks from the result columns
733 6 : IF ((j .GE. sm_col_offset) .AND. (j .LT. sm_col_offset + this%cols_per_sm)) THEN
734 6 : offset_y1 = 0
735 6 : k_limit_left = this%coo_col_offsets(nonzero_rows%get(j))
736 6 : k_limit_right = this%coo_col_offsets(nonzero_rows%get(j) + 1) - 1
737 12 : DO i = 1, nonzero_rows%elements
738 6 : offset_y2 = offset_y1 + this%col_blk_size(nonzero_rows%get(i))
739 : ! Check if sm(i,j), i.e., (nonzero_rows(i),nonzero_rows(j)) exists in the original matrix and if so, copy it into the
740 : ! correct send buffer.
741 6 : k = k_limit_left
742 6 : DO WHILE (k .LE. k_limit_right)
743 6 : IF (this%coo_rows(k) .GE. nonzero_rows%get(i)) EXIT
744 0 : k = k + 1
745 : END DO
746 6 : k_limit_left = k
747 6 : IF (this%coo_rows(k) .EQ. nonzero_rows%get(i)) THEN ! it does exist and k is our block id
748 : CALL dbcsr_get_stored_coordinates(matrix=this%dbcsr_mat, row=this%coo_rows(k), column=this%coo_cols(k), &
749 6 : processor=m)
750 6 : DO WHILE (buf_offset_idxs(m) .LE. this%result_blocks_for_rank(m)%elements)
751 6 : IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .GE. k) EXIT
752 0 : buf_offset_idxs(m) = buf_offset_idxs(m) + 1
753 : END DO
754 6 : IF (this%result_blocks_for_rank(m)%get(buf_offset_idxs(m)) .NE. k) THEN
755 0 : CPABORT("Could not determine buffer offset for block")
756 : END IF
757 6 : bufsize = this%result_blocks_for_rank_buf_offsets(m)%data(buf_offset_idxs(m))
758 6 : bufsize_next = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
759 : this%result_sendbufs(m)%data(bufsize + 1:bufsize_next) = RESHAPE( &
760 : sm(offset_y1 + 1:offset_y2, offset_x1 + 1:offset_x2), &
761 228 : (/bufsize_next - bufsize/))
762 : END IF
763 12 : offset_y1 = offset_y2
764 : END DO
765 : END IF
766 12 : offset_x1 = offset_x2
767 : END DO
768 :
769 6 : DEALLOCATE (buf_offset_idxs)
770 :
771 : ! Free memory in set
772 6 : CALL nonzero_rows%reset
773 :
774 1548 : END SUBROUTINE submatrix_cpy_resultcol
775 :
776 : ! **************************************************************************************************
777 : !> \brief finalize results back into a dbcsr matrix
778 : !> \param this - object of class submatrix_dissection_type
779 : !> \param resultmat - result dbcsr matrix
780 : !> \par History
781 : !> 2020.02 created [Michael Lass]
782 : !> 2020.05 add time measurements [Michael Lass]
783 : ! **************************************************************************************************
784 10 : SUBROUTINE submatrix_communicate_results(this, resultmat)
785 : CLASS(submatrix_dissection_type), INTENT(INOUT) :: this
786 : TYPE(dbcsr_type), INTENT(INOUT) :: resultmat
787 :
788 : INTEGER :: i, j, k, cur_row, cur_col, cur_sm, cur_buf, last_buf, &
789 : bufsize, bufsize_next
790 10 : TYPE(buffer_type), DIMENSION(:), ALLOCATABLE :: result_recvbufs
791 :
792 : CHARACTER(LEN=*), PARAMETER :: routineN = 'submatrix_communicate_results'
793 : INTEGER :: handle
794 :
795 10 : CALL timeset(routineN, handle)
796 :
797 60 : ALLOCATE (result_recvbufs(0:this%numnodes - 1))
798 30 : DO i = 0, this%numnodes - 1
799 20 : bufsize = 0
800 25 : DO j = 1, this%result_blocks_from_rank(i)%elements
801 5 : k = this%result_blocks_from_rank(i)%get(j)
802 25 : bufsize = bufsize + this%col_blk_size(this%coo_cols(k))*this%col_blk_size(this%coo_rows(k))
803 : END DO
804 30 : CALL result_recvbufs(i)%alloc(bufsize)
805 : END DO
806 :
807 : ! Communicate. We attempt to balance communication load in the network here by starting our sends with our right neighbor
808 : ! and first trying to receive from our left neighbor.
809 10 : IF (this%numnodes .GT. 1) THEN
810 30 : DO i = 1, this%numnodes
811 20 : k = MODULO(this%myrank - i, this%numnodes) ! rank to receive from
812 20 : CALL this%group%irecv(msgout=result_recvbufs(k)%data, source=k, request=result_recvbufs(k)%mpi_request)
813 20 : k = MODULO(this%myrank + i, this%numnodes) ! rank to send to
814 30 : CALL this%group%isend(msgin=this%result_sendbufs(k)%data, dest=k, request=this%result_sendbufs(k)%mpi_request)
815 : END DO
816 30 : DO i = 0, this%numnodes - 1
817 20 : CALL this%result_sendbufs(i)%mpi_request%wait()
818 30 : CALL result_recvbufs(i)%mpi_request%wait()
819 : END DO
820 : ELSE
821 0 : result_recvbufs(0)%data = this%result_sendbufs(0)%data
822 : END IF
823 :
824 10 : last_buf = -1
825 10 : bufsize = 0
826 15 : DO i = 1, this%local_blocks
827 5 : cur_col = this%coo_cols_local(i)
828 5 : cur_row = this%coo_rows_local(i)
829 5 : cur_sm = (cur_col - 1)/this%cols_per_sm + 1
830 5 : cur_buf = this%submatrix_owners(cur_sm)
831 5 : IF (cur_buf .GT. last_buf) bufsize = 0
832 5 : bufsize_next = bufsize + this%col_blk_size(cur_row)*this%col_blk_size(cur_col)
833 : CALL dbcsr_put_block(matrix=resultmat, row=cur_row, col=cur_col, &
834 5 : block=result_recvbufs(cur_buf)%data(bufsize + 1:bufsize_next))
835 5 : bufsize = bufsize_next
836 15 : last_buf = cur_buf
837 : END DO
838 :
839 30 : DO i = 0, this%numnodes - 1
840 30 : CALL result_recvbufs(i)%dealloc
841 : END DO
842 10 : DEALLOCATE (result_recvbufs)
843 :
844 10 : CALL dbcsr_finalize(resultmat)
845 :
846 10 : CALL timestop(handle)
847 10 : END SUBROUTINE submatrix_communicate_results
848 :
849 : ! **************************************************************************************************
850 : !> \brief sort two integer arrays using quicksort, using the second list as second-level sorting criterion
851 : !> \param arr_a - first input array
852 : !> \param arr_b - second input array
853 : !> \param left - left boundary of region to be sorted
854 : !> \param right - right boundary of region to be sorted
855 : ! **************************************************************************************************
856 20 : RECURSIVE PURE SUBROUTINE qsort_two(arr_a, arr_b, left, right)
857 :
858 : INTEGER, DIMENSION(:), INTENT(inout) :: arr_a, arr_b
859 : INTEGER, INTENT(in) :: left, right
860 :
861 : INTEGER :: i, j, pivot_a, pivot_b, tmp
862 :
863 20 : IF (right - left .LT. 1) RETURN
864 :
865 0 : i = left
866 0 : j = right - 1
867 0 : pivot_a = arr_a(right)
868 0 : pivot_b = arr_b(right)
869 :
870 0 : DO
871 0 : DO WHILE ((arr_a(i) .LT. pivot_a) .OR. ((arr_a(i) .EQ. pivot_a) .AND. (arr_b(i) .LT. pivot_b)))
872 0 : i = i + 1
873 : END DO
874 0 : DO WHILE ((j .GT. i) .AND. ((arr_a(j) .GT. pivot_a) .OR. ((arr_a(j) .EQ. pivot_a) .AND. (arr_b(j) .GE. pivot_b))))
875 0 : j = j - 1
876 : END DO
877 0 : IF (i .LT. j) THEN
878 0 : tmp = arr_a(i)
879 0 : arr_a(i) = arr_a(j)
880 0 : arr_a(j) = tmp
881 0 : tmp = arr_b(i)
882 0 : arr_b(i) = arr_b(j)
883 0 : arr_b(j) = tmp
884 : ELSE
885 : EXIT
886 : END IF
887 : END DO
888 :
889 0 : IF ((arr_a(i) .GT. pivot_a) .OR. (arr_a(i) .EQ. pivot_a .AND. arr_b(i) .GT. pivot_b)) THEN
890 0 : tmp = arr_a(i)
891 0 : arr_a(i) = arr_a(right)
892 0 : arr_a(right) = tmp
893 0 : tmp = arr_b(i)
894 0 : arr_b(i) = arr_b(right)
895 0 : arr_b(right) = tmp
896 : END IF
897 :
898 0 : IF (i - 1 .GT. left) CALL qsort_two(arr_a, arr_b, left, i - 1)
899 0 : IF (i + 1 .LT. right) CALL qsort_two(arr_a, arr_b, i + 1, right)
900 :
901 : END SUBROUTINE qsort_two
902 :
903 0 : END MODULE submatrix_dissection
|