Line data Source code
1 : !--------------------------------------------------------------------------------------------------!
2 : ! CP2K: A general program to perform molecular dynamics simulations !
3 : ! Copyright 2000-2024 CP2K developers group <https://cp2k.org> !
4 : ! !
5 : ! SPDX-License-Identifier: GPL-2.0-or-later !
6 : !--------------------------------------------------------------------------------------------------!
7 :
8 : ! **************************************************************************************************
9 : !> \brief Methods to perform on the fly statistical analysis of data
10 : !> -) Schiferl and Wallace, J. Chem. Phys. 83 (10) 1985
11 : !> \author Teodoro Laino (01.2007) [tlaino]
12 : !> \par History
13 : !> - Teodoro Laino (10.2008) [tlaino] - University of Zurich
14 : !> module made publicly available
15 : ! **************************************************************************************************
16 : MODULE statistical_methods
17 : USE cp_log_handling, ONLY: cp_logger_get_default_io_unit
18 : USE global_types, ONLY: global_environment_type
19 : USE kinds, ONLY: dp
20 : USE util, ONLY: sort
21 : #include "./base/base_uses.f90"
22 :
23 : IMPLICIT NONE
24 :
25 : PRIVATE
26 : CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'statistical_methods'
27 : LOGICAL, PARAMETER :: debug_this_module = .FALSE.
28 : INTEGER, PARAMETER, PUBLIC :: min_sample_size = 20
29 : PUBLIC :: sw_test, &
30 : k_test, &
31 : vn_test
32 :
33 : CONTAINS
34 :
35 : ! **************************************************************************************************
36 : !> \brief Shapiro - Wilk's test or W-statistic to test normality of a distribution
37 : !> R94 APPL. STATIST. (1995) VOL.44, NO.4
38 : !> Calculates the Shapiro-Wilk W test and its significance level
39 : !> \param ix ...
40 : !> \param n ...
41 : !> \param w ...
42 : !> \param pw ...
43 : !> \par History
44 : !> Teodoro Laino (02.2007) [tlaino]
45 : ! **************************************************************************************************
46 0 : SUBROUTINE sw_test(ix, n, w, pw)
47 : REAL(KIND=dp), DIMENSION(:), POINTER :: ix
48 : INTEGER, INTENT(IN) :: n
49 : REAL(KIND=dp), INTENT(OUT) :: w, pw
50 :
51 : REAL(KIND=dp), PARAMETER :: c1(6) = (/0.000000_dp, 0.221157_dp, -0.147981_dp, -2.071190_dp, &
52 : 4.434685_dp, -2.706056_dp/), c2(6) = (/0.000000_dp, 0.042981_dp, -0.293762_dp, &
53 : -1.752461_dp, 5.682633_dp, -3.582633_dp/), &
54 : c3(4) = (/0.544000_dp, -0.399780_dp, 0.025054_dp, -0.6714E-3_dp/), &
55 : c4(4) = (/1.3822_dp, -0.77857_dp, 0.062767_dp, -0.0020322_dp/), &
56 : c5(4) = (/-1.5861_dp, -0.31082_dp, -0.083751_dp, 0.0038915_dp/), &
57 : c6(3) = (/-0.4803_dp, -0.082676_dp, 0.0030302_dp/), g(2) = (/-2.273_dp, 0.459_dp/), &
58 : one = 1.0_dp, pi6 = 1.909859_dp, qtr = 0.25_dp, small = EPSILON(0.0_dp), &
59 : sqrth = 0.70711_dp
60 : REAL(KIND=dp), PARAMETER :: stqr = 1.047198_dp, th = 0.375_dp, two = 2.0_dp, zero = 0.0_dp
61 :
62 : INTEGER :: i, i1, j, n2, output_unit
63 0 : INTEGER, ALLOCATABLE, DIMENSION(:) :: itmp
64 : LOGICAL :: failure
65 : REAL(KIND=dp) :: a1, a2, an, an25, asa, fac, gamma, m, &
66 : range, rsn, s, sa, sax, ssa, ssassx, &
67 : ssumm2, ssx, summ2, sx, w1, xi, xsx, &
68 : xx, y
69 0 : REAL(KIND=dp), ALLOCATABLE, DIMENSION(:) :: a, x
70 :
71 0 : failure = .FALSE.
72 0 : output_unit = cp_logger_get_default_io_unit()
73 : ! Check for N < 3
74 0 : IF (n < 3 .OR. n > 5000) THEN
75 0 : IF (output_unit > 0) WRITE (output_unit, '(A)') &
76 0 : "Shapiro Wilk test: number of points less than 3 or greated than 5000."
77 : IF (output_unit > 0) WRITE (output_unit, '(A)') &
78 0 : "Not able to perform the test!"
79 : END IF
80 : ! Sort input array of data in ascending order
81 0 : IF (MOD(n, 2) == 0) THEN
82 0 : n2 = n/2
83 : ELSE
84 0 : n2 = (n - 1)/2
85 : END IF
86 0 : ALLOCATE (x(n))
87 0 : ALLOCATE (itmp(n))
88 0 : ALLOCATE (a(n2))
89 0 : x(:) = ix
90 0 : CALL sort(x, n, itmp)
91 : ! Check for zero range
92 0 : range = x(n) - x(1)
93 0 : IF (range < small) failure = .TRUE.
94 0 : IF (failure .AND. (output_unit > 0)) THEN
95 0 : WRITE (output_unit, '(A)') "Shapiro Wilk test: two data points are numerically identical."
96 0 : WRITE (output_unit, '(A)') "Not able to perform the test!"
97 : END IF
98 0 : pw = one
99 0 : w = one
100 0 : an = n
101 : ! Calculates coefficients for the test
102 0 : IF (n == 3) THEN
103 0 : a(1) = sqrth
104 : ELSE
105 0 : an25 = an + qtr
106 0 : summ2 = zero
107 0 : DO i = 1, n2
108 0 : CALL ppnd7((i - th)/an25, a(i))
109 0 : summ2 = summ2 + a(i)**2
110 : END DO
111 0 : summ2 = summ2*two
112 0 : ssumm2 = SQRT(summ2)
113 0 : rsn = one/SQRT(an)
114 0 : a1 = poly(c1, 6, rsn) - a(1)/ssumm2
115 : ! Normalize coefficients
116 0 : IF (n > 5) THEN
117 0 : i1 = 3
118 0 : a2 = -a(2)/ssumm2 + poly(c2, 6, rsn)
119 0 : fac = SQRT((summ2 - two*a(1)**2 - two*a(2)**2)/(one - two*a1**2 - two*a2**2))
120 0 : a(1) = a1
121 0 : a(2) = a2
122 : ELSE
123 0 : i1 = 2
124 0 : fac = SQRT((summ2 - two*a(1)**2)/(one - two*a1**2))
125 0 : a(1) = a1
126 : END IF
127 0 : DO i = i1, n2
128 0 : a(i) = -a(i)/fac
129 : END DO
130 : END IF
131 : ! scaled X
132 0 : xx = x(1)/range
133 0 : sx = xx
134 0 : sa = -a(1)
135 0 : j = n - 1
136 0 : DO i = 2, n
137 0 : xi = x(i)/range
138 0 : sx = sx + xi
139 0 : IF (i /= j) sa = sa + SIGN(1, i - j)*a(MIN(i, j))
140 : xx = xi
141 0 : j = j - 1
142 : END DO
143 : ! Calculate W statistic as squared correlation
144 : ! between data and coefficients
145 0 : sa = sa/n
146 0 : sx = sx/n
147 0 : ssa = zero
148 0 : ssx = zero
149 0 : sax = zero
150 0 : j = n
151 0 : DO i = 1, n
152 0 : IF (i /= j) THEN
153 0 : asa = SIGN(1, i - j)*a(MIN(i, j)) - sa
154 : ELSE
155 0 : asa = -sa
156 : END IF
157 0 : xsx = x(i)/range - sx
158 0 : ssa = ssa + asa*asa
159 0 : ssx = ssx + xsx*xsx
160 0 : sax = sax + asa*xsx
161 0 : j = j - 1
162 : END DO
163 : ! W1 equals (1-W) calculated to avoid excessive rounding error
164 : ! for W very near 1 (a potential problem in very large samples)
165 0 : ssassx = SQRT(ssa*ssx)
166 0 : w1 = (ssassx - sax)*(ssassx + sax)/(ssa*ssx)
167 0 : w = one - w1
168 : ! Calculate significance level for W (exact for N=3)
169 0 : IF (n == 3) THEN
170 0 : pw = pi6*(ASIN(SQRT(w)) - stqr)
171 : ELSE
172 0 : y = LOG(w1)
173 0 : xx = LOG(an)
174 0 : m = zero
175 0 : s = one
176 0 : IF (n <= 11) THEN
177 0 : gamma = poly(g, 2, an)
178 0 : IF (y >= gamma) THEN
179 0 : pw = small
180 : ELSE
181 0 : y = -LOG(gamma - y)
182 0 : m = poly(c3, 4, an)
183 0 : s = EXP(poly(c4, 4, an))
184 0 : pw = alnorm((y - m)/s, .TRUE.)
185 : END IF
186 : ELSE
187 0 : m = poly(c5, 4, xx)
188 0 : s = EXP(poly(c6, 3, xx))
189 0 : pw = alnorm((y - m)/s, .TRUE.)
190 : END IF
191 : END IF
192 0 : DEALLOCATE (x)
193 0 : DEALLOCATE (itmp)
194 0 : DEALLOCATE (a)
195 :
196 0 : END SUBROUTINE sw_test
197 :
198 : ! **************************************************************************************************
199 : !> \brief Produces the normal deviate Z corresponding to a given lower tail area of P
200 : !> Z is accurate to about 1 part in 10**7.
201 : !> AS241 APPL. STATIST. (1988) VOL. 37, NO. 3, 477- 484.
202 : !> \param p ...
203 : !> \param normal_dev ...
204 : !> \par History
205 : !> Original version by Alain J. Miller - 1996
206 : !> Teodoro Laino (02.2007) [tlaino]
207 : ! **************************************************************************************************
208 0 : SUBROUTINE ppnd7(p, normal_dev)
209 : REAL(KIND=dp), INTENT(IN) :: p
210 : REAL(KIND=dp), INTENT(OUT) :: normal_dev
211 :
212 : REAL(KIND=dp), PARAMETER :: a0 = 3.3871327179E+00_dp, a1 = 5.0434271938E+01_dp, &
213 : a2 = 1.5929113202E+02_dp, a3 = 5.9109374720E+01_dp, b1 = 1.7895169469E+01_dp, &
214 : b2 = 7.8757757664E+01_dp, b3 = 6.7187563600E+01_dp, c0 = 1.4234372777E+00_dp, &
215 : c1 = 2.7568153900E+00_dp, c2 = 1.3067284816E+00_dp, c3 = 1.7023821103E-01_dp, &
216 : const1 = 0.180625_dp, const2 = 1.6_dp, d1 = 7.3700164250E-01_dp, &
217 : d2 = 1.2021132975E-01_dp, e0 = 6.6579051150E+00_dp, e1 = 3.0812263860E+00_dp, &
218 : e2 = 4.2868294337E-01_dp, e3 = 1.7337203997E-02_dp, f1 = 2.4197894225E-01_dp, &
219 : f2 = 1.2258202635E-02_dp, half = 0.5_dp, one = 1.0_dp
220 : REAL(KIND=dp), PARAMETER :: split1 = 0.425_dp, split2 = 5.0_dp, zero = 0.0_dp
221 :
222 : REAL(KIND=dp) :: q, r
223 :
224 0 : q = p - half
225 0 : IF (ABS(q) <= split1) THEN
226 0 : r = const1 - q*q
227 : normal_dev = q*(((a3*r + a2)*r + a1)*r + a0)/ &
228 0 : (((b3*r + b2)*r + b1)*r + one)
229 0 : RETURN
230 : ELSE
231 0 : IF (q < zero) THEN
232 : r = p
233 : ELSE
234 0 : r = one - p
235 : END IF
236 0 : IF (r <= zero) THEN
237 0 : normal_dev = zero
238 0 : RETURN
239 : END IF
240 0 : r = SQRT(-LOG(r))
241 0 : IF (r <= split2) THEN
242 0 : r = r - const2
243 0 : normal_dev = (((c3*r + c2)*r + c1)*r + c0)/((d2*r + d1)*r + one)
244 : ELSE
245 0 : r = r - split2
246 0 : normal_dev = (((e3*r + e2)*r + e1)*r + e0)/((f2*r + f1)*r + one)
247 : END IF
248 0 : IF (q < zero) normal_dev = -normal_dev
249 0 : RETURN
250 : END IF
251 : END SUBROUTINE ppnd7
252 :
253 : ! **************************************************************************************************
254 : !> \brief Evaluates the tail area of the standardised normal curve
255 : !> from x to infinity if upper is .true. or
256 : !> from minus infinity to x if upper is .false.
257 : !> AS66 Applied Statistics (1973) vol.22, no.3
258 : !> \param x ...
259 : !> \param upper ...
260 : !> \return ...
261 : !> \par History
262 : !> Original version by Alain J. Miller - 1996
263 : !> Teodoro Laino (02.2007) [tlaino]
264 : ! **************************************************************************************************
265 0 : FUNCTION alnorm(x, upper) RESULT(fn_val)
266 : REAL(KIND=dp), INTENT(IN) :: x
267 : LOGICAL, INTENT(IN) :: upper
268 : REAL(KIND=dp) :: fn_val
269 :
270 : REAL(KIND=dp), PARAMETER :: a1 = 5.75885480458_dp, a2 = 2.62433121679_dp, &
271 : a3 = 5.92885724438_dp, b1 = -29.8213557807_dp, b2 = 48.6959930692_dp, c1 = -3.8052E-8_dp, &
272 : c2 = 3.98064794E-4_dp, c3 = -0.151679116635_dp, c4 = 4.8385912808_dp, &
273 : c5 = 0.742380924027_dp, c6 = 3.99019417011_dp, con = 1.28_dp, d1 = 1.00000615302_dp, &
274 : d2 = 1.98615381364_dp, d3 = 5.29330324926_dp, d4 = -15.1508972451_dp, &
275 : d5 = 30.789933034_dp, half = 0.5_dp, ltone = 7.0_dp, one = 1.0_dp, p = 0.398942280444_dp, &
276 : q = 0.39990348504_dp, r = 0.398942280385_dp, utzero = 18.66_dp, zero = 0.0_dp
277 :
278 : LOGICAL :: up
279 : REAL(KIND=dp) :: y, z
280 :
281 0 : up = upper
282 0 : z = x
283 0 : IF (z < zero) THEN
284 0 : up = .NOT. up
285 0 : z = -z
286 : END IF
287 0 : IF (.NOT. (z <= ltone .OR. up .AND. z <= utzero)) THEN
288 0 : fn_val = zero
289 0 : IF (.NOT. up) fn_val = one - fn_val
290 0 : RETURN
291 : END IF
292 0 : y = half*z*z
293 0 : IF (z <= con) THEN
294 0 : fn_val = r*EXP(-y)/(z + c1 + d1/(z + c2 + d2/(z + c3 + d3/(z + c4 + d4/(z + c5 + d5/(z + c6))))))
295 : ELSE
296 0 : fn_val = half - z*(p - q*y/(y + a1 + b1/(y + a2 + b2/(y + a3))))
297 : END IF
298 0 : IF (.NOT. up) fn_val = one - fn_val
299 :
300 : END FUNCTION alnorm
301 :
302 : ! **************************************************************************************************
303 : !> \brief Calculates the algebraic polynomial of order nored-1 with
304 : !> array of coefficients c. Zero order coefficient is c(1)
305 : !> AS 181.2 Appl. Statist. (1982) Vol. 31, No. 2
306 : !> \param c ...
307 : !> \param nord ...
308 : !> \param x ...
309 : !> \return ...
310 : !> \par History
311 : !> Original version by Alain J. Miller - 1996
312 : !> Teodoro Laino (02.2007) [tlaino]
313 : ! **************************************************************************************************
314 0 : FUNCTION poly(c, nord, x) RESULT(fn_val)
315 :
316 : REAL(KIND=dp), INTENT(IN) :: c(:)
317 : INTEGER, INTENT(IN) :: nord
318 : REAL(KIND=dp), INTENT(IN) :: x
319 : REAL(KIND=dp) :: fn_val
320 :
321 : INTEGER :: i, j, n2
322 : REAL(KIND=dp) :: p
323 :
324 0 : fn_val = c(1)
325 0 : IF (nord == 1) RETURN
326 0 : p = x*c(nord)
327 0 : IF (nord == 2) THEN
328 0 : fn_val = fn_val + p
329 0 : RETURN
330 : END IF
331 0 : n2 = nord - 2
332 0 : j = n2 + 1
333 0 : DO i = 1, n2
334 0 : p = (p + c(j))*x
335 0 : j = j - 1
336 : END DO
337 0 : fn_val = fn_val + p
338 0 : END FUNCTION poly
339 :
340 : ! **************************************************************************************************
341 : !> \brief Kandall's test for correlation
342 : !> \param xdata ...
343 : !> \param istart ...
344 : !> \param n ...
345 : !> \param tau ...
346 : !> \param z ...
347 : !> \param prob ...
348 : !> \par History
349 : !> Teodoro Laino (02.2007) [tlaino]
350 : !> \note
351 : !> tau: Kendall's Tau
352 : !> z: number of std devs from 0 of tau
353 : !> prob: tau's probability
354 : ! **************************************************************************************************
355 0 : SUBROUTINE k_test(xdata, istart, n, tau, z, prob)
356 : REAL(KIND=dp), DIMENSION(:), POINTER :: xdata
357 : INTEGER, INTENT(IN) :: istart, n
358 : REAL(KIND=dp) :: tau, z, prob
359 :
360 : INTEGER :: is, j, k, nt
361 : REAL(KIND=dp) :: a1, var
362 :
363 0 : nt = n - istart + 1
364 0 : IF (nt .GE. min_sample_size) THEN
365 : is = 0
366 0 : DO j = istart, n - 1
367 0 : DO k = j + 1, n
368 0 : a1 = xdata(j) - xdata(k)
369 0 : IF (a1 .GT. 0.0_dp) THEN
370 0 : is = is + 1
371 0 : ELSE IF (a1 .LT. 0.0_dp) THEN
372 0 : is = is - 1
373 : END IF
374 : END DO
375 : END DO
376 0 : tau = REAL(is, KIND=dp)
377 0 : var = REAL(nt, KIND=dp)*REAL(nt - 1, KIND=dp)*REAL(2*nt + 5, KIND=dp)/18.0_dp
378 0 : z = tau/SQRT(var)
379 0 : prob = erf(ABS(z)/SQRT(2.0_dp))
380 : ELSE
381 0 : tau = 0.0_dp
382 0 : z = 0.0_dp
383 0 : prob = 1.0_dp
384 : END IF
385 0 : END SUBROUTINE k_test
386 :
387 : ! **************************************************************************************************
388 : !> \brief Von Neumann test for serial correlation
389 : !> \param xdata ...
390 : !> \param n ...
391 : !> \param r ...
392 : !> \param u ...
393 : !> \param prob ...
394 : !> \par History
395 : !> Teodoro Laino (02.2007) [tlaino]
396 : ! **************************************************************************************************
397 0 : SUBROUTINE vn_test(xdata, n, r, u, prob)
398 : REAL(KIND=dp), DIMENSION(:), POINTER :: xdata
399 : INTEGER, INTENT(IN) :: n
400 : REAL(KIND=dp) :: r, u, prob
401 :
402 : INTEGER :: i
403 : REAL(KIND=dp) :: q, s, var, x
404 :
405 0 : IF (n .GE. min_sample_size) THEN
406 : x = 0.0_dp
407 : q = 0.0_dp
408 0 : s = 0.0_dp
409 0 : DO i = 1, n - 1
410 0 : x = x + xdata(i)
411 0 : q = q + (xdata(i + 1) - xdata(i))**2
412 : END DO
413 0 : x = x + xdata(n)
414 0 : x = x/REAL(n, KIND=dp)
415 0 : DO i = 1, n
416 0 : s = s + (xdata(i) - x)**2
417 : END DO
418 0 : s = s/REAL(n - 1, KIND=dp)
419 0 : q = q/REAL(2*(n - 1), KIND=dp)
420 0 : r = q/s
421 0 : var = SQRT(1.0_dp/REAL(n + 1, KIND=dp)*(1.0_dp + 1.0_dp/REAL(n - 1, KIND=dp)))
422 0 : u = (r - 1.0_dp)/var
423 0 : prob = erf(ABS(u)/SQRT(2.0_dp))
424 : ELSE
425 0 : r = 0.0_dp
426 0 : u = 0.0_dp
427 0 : prob = 1.0_dp
428 : END IF
429 :
430 0 : END SUBROUTINE vn_test
431 :
432 : ! **************************************************************************************************
433 : !> \brief Performs tests on statistical methods
434 : !> Debug use only
435 : !> \param xdata ...
436 : !> \param globenv ...
437 : !> \par History
438 : !> Teodoro Laino (02.2007) [tlaino]
439 : ! **************************************************************************************************
440 0 : SUBROUTINE tests(xdata, globenv)
441 : REAL(KIND=dp), DIMENSION(:), POINTER :: xdata
442 : TYPE(global_environment_type), POINTER :: globenv
443 :
444 : INTEGER :: i, n
445 : REAL(KINd=dp) :: prob, pw, r, tau, u, w, z
446 0 : REAL(KIND=dp), DIMENSION(:), POINTER :: ydata
447 :
448 : IF (debug_this_module) THEN
449 : n = 50 ! original sample size
450 : NULLIFY (xdata)
451 : ALLOCATE (xdata(n))
452 : DO i = 1, 10
453 : xdata(i) = 5.0_dp - REAL(i, KIND=dp)/2.0_dp + 0.1*globenv%gaussian_rng_stream%next()
454 : WRITE (3, *) xdata(i)
455 : END DO
456 : DO i = 11, n
457 : xdata(i) = 0.1*globenv%gaussian_rng_stream%next()
458 : END DO
459 :
460 : ! Test for trend
461 : DO i = 1, n
462 : CALL k_test(xdata, i, n, tau, z, prob)
463 : IF (prob <= 0.2_dp) EXIT
464 : END DO
465 : WRITE (*, *) "Mann-Kendall test", i
466 :
467 : ! Test for normality distribution and for serial correlation
468 : DO i = 1, n
469 : ALLOCATE (ydata(n - i + 1))
470 : ydata = xdata(i:n)
471 : CALL sw_test(ydata, n - i + 1, w, pw)
472 : CALL vn_test(ydata, n - i + 1, r, u, prob)
473 : WRITE (*, *) "Shapiro Wilks test", i, w, pw
474 : WRITE (*, *) "Von Neu", i, r, u, prob
475 : DEALLOCATE (ydata)
476 : END DO
477 :
478 : DEALLOCATE (xdata)
479 : END IF
480 0 : END SUBROUTINE tests
481 :
482 : END MODULE statistical_methods
483 :
|