Matrix Science Mascot Parser toolkit
 
Loading...
Searching...
No Matches
tools_stats.py

Using statistical routines.

1#!/usr/bin/python
2
16
17import msparser
18
19# ms_range function has been added to make the range supplied consistent with other languages such as Perl and C#
20def ms_range(start, stop, step=1):
21 i = start
22 while i <= stop:
23 yield i
24 i += step
25
26print("\n")
27print("Bionominal coefficients for sample size 6:")
28
29for i in ms_range(0, 6):
30 print("\t6 choose %d = %f" %(i, msparser.ms_quant_stats.binomialCoefficient(6, i)))
31
32print("\n")
33print("First 10 values of the Poisson(lambda = 1) distribution:")
34
35for i in ms_range(0, 10):
36 print("\tPoisson(lambda 1, k %d) = %f" %(i, msparser.ms_quant_stats.poissonDensity(1, i)))
37
38print("\n")
39print("Common values of the standard normal distribution:")
40
41for i in [0.001, 0.01, 0.025, 0.05, 0.1, 0.5, 0.9, 0.95, 0.975, 0.99, 0.999]:
42# Note that normalCriticalValue() takes an argument between 0.5 and 1.0;
43# that is, it returns only the right tail critical values.
44 if (i < 0.5):
45 critval = msparser.ms_quant_stats.normalCriticalValue(1 - i)
46 else:
47 critval = msparser.ms_quant_stats.normalCriticalValue(i)
48 print("\tNormal-1(%f) = %f" %(i, critval))
49
50print("\n")
51for i in [-3.09, -2.32, -1.96, -1.64, -1.28, 0, 1.28, 1.64, 1.96, 2.32, 3.09]:
52# Note that normalCumulativeProbability() takes an argument between 0 and
53# 4.09; that is, it reports only the right tail probability.
54 if (i < 0):
55 p = 0.5 - msparser.ms_quant_stats.normalCumulativeProbability(-i)
56 else:
57 p = 0.5 + msparser.ms_quant_stats.normalCumulativeProbability(i)
58 print("\tNormal(x <= %f) = %f" %(i, p))
59
60print("\n")
61print("Common values of the chi-square distribution at 10 degrees of freedom:")
62
63# Note the significance levels the chi-square functions support.
64for i in [0.001, 0.01, 0.025, 0.05, 0.1]:
65 lowcrit = msparser.ms_quant_stats.chisqLowerCriticalValue(10, i)
66 upcrit = msparser.ms_quant_stats.chisqUpperCriticalValue(10, i)
67 print("\tchi-sq(df 10, sigLevel %f) = <%f, %f>" %(i, lowcrit, upcrit))
68
69print("\n")
70print("Common values of the Student's t distribution at 10 degrees of freedom:")
71# Note the significance levels the Student's t function supports.
72for i in [0.001, 0.005, 0.01, 0.025, 0.05, 0.1]:
73 print("\tt(df 10, sigLevel %f) = %f" %(i, msparser.ms_quant_stats.studentsCriticalValue(10, i)))
74
75print("\n")
76print("Common values of the F distribution at 10 and 1 degrees of freedom:")
77# Note the significance levels the F function supports.
78for i in [0.01, 0.05, 0.1]:
79 print("\tF(df1 10, df2 1, sigLevel %f) = %f" %(i, msparser.ms_quant_stats.snedecorsCriticalValue(10, 1, i)))
80
81print("\n")
82
83samples = [
84# Uniform data generated with
85# perl -e 'print join "\n", map {+ rand() } 0 .. 9'
86['uniform(0,1)', 0.5, [0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424, 0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575]],
87# Normal(5, 2.5) data generated in R with
88# > rnorm(10, 5, 2.5)
89['normal(5,2.5)', 5, [3.1476833, 9.2061900, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.0542840, 0.6643241, 4.3697818]],
90# Beta(0.5, 0.5) data generated in R with
91# > rbeta(10, 0.5, 0.5)
92['beta(0.5,0.5)', 0.5, [0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737, 0.877643550, 0.128030530]]
93]
94
95
96
97# Auxiliary helper functions:
98def aux_helper(vec, vec_sorted):
99 print("\tSum = %g" %(msparser.ms_quant_stats.sum(vec)))
100 print("\tSum of squares = %g" %(msparser.ms_quant_stats.sumsq(vec)))
101 print("\tLog transformed = [%s]" %str((msparser.ms_quant_stats.logTransform(vec))))
102 print("\tExp transformed = [%s]" %str((msparser.ms_quant_stats.expTransform(vec))))
103
104# Basic sample statistics:
105
106 xbar = msparser.ms_quant_stats.arithmeticMean(vec)
107 stdev = msparser.ms_quant_stats.arithmeticStandardDeviation(vec, xbar)
108 median = msparser.ms_quant_stats.sortedMedian(vec_sorted)
109
110# If the vector isn't sorted:
111# median = msparser.ms_quant_stats.unsortedMedian(vec)
112
113 geombar = msparser.ms_quant_stats.geometricMean(vec)
114 geomdev = msparser.ms_quant_stats.geometricStandardDeviation(vec, geombar)
115
116 print("\tArithmetic stdev = %g" %(stdev))
117 print("\tArithmetic mean = %g" %(xbar))
118 print("\tStandard error of mean = %g" %(msparser.ms_quant_stats.arithmeticStandardErrorOfMean(stdev, len(vec))))
119 print("\tMedian = %g" %(median))
120 print("\tStandard error of median = %g" %(msparser.ms_quant_stats.arithmeticStandardErrorOfMedian(stdev, len(vec))))
121 print("\tGeometric mean = %g" %(geombar))
122 print("\tGeometric stdev = %g" %(geomdev))
123
124# Basic distribution tests for normally distributed data:
125
126 print("\tHypothesis: distribution is N(%g, sigma^2) (mean test). p-value = %g" %(sample[1], msparser.ms_quant_stats.calculateNormalMeanPvalue(sample[1], xbar, stdev, len(vec))))
127 print("\tHypothesis: distribution is N(%g, sigma^2) (median test). p-value = %g" %(sample[1], msparser.ms_quant_stats.calculateNormalMedianPvalue(sample[1], median, stdev, len(vec))))
128
129# (We would expect the normal sample to have the highest p-value, i.e.
130# least evidence against the hypothesis, while the uniform and beta
131# distribution should have fairly low p-values. However, sample size is
132# so small that the tests are not very reliable.)
133
134 ok, W, p = msparser.ms_quant_stats.testShapiroWilkW(vec_sorted)
135 if W == None:
136 W = 'None'
137 if p == None:
138 p = 'None'
139 print("\tShapiro-Wilk ok = %d, W = %s, p-value = %s" %(ok, W, p))
140
141# Outlier detection can be done even when the above tests indicate that
142# the sample is not normally distributed. However, don't trust the results!
143
144
145# Note that the input vector must be sorted for outlier testing to
146# work.
147 numL, numH = msparser.ms_quant_stats.detectOutliers(vec_sorted, "auto", 0.05)
148 if numL == None:
149 numL = 'None'
150 if numH == None:
151 numH = 'None'
152# numL and numH are the number of elements at each end of vec_sorted
153# which seem to be outliers. That is, elements between 0 and numL-1,
154# and len(vec_sorted)-numH to len(vec_sorted)-1.
155
156 print("\tOutlier detection('auto', 0.05') = <%s, %s>" %(numL, numH))
157
158
159for sample in samples:
160 print("Sample '%s' = [%s]" %(sample[0], sample[2]))
161# ms_quant_stats takes as input STL vectors of type double. We can easily
162# convert a Python list into one:
163 vec = msparser.vectord()
164 vectors = sample[2]
165 for i in ms_range(0, len(vectors)-1):
166 vec.append(vectors[i])
167
168# The sorted sample vector is needed for the Shapiro-Wilk W test.
169# It also makes finding the sample median fast.
170 vec_sorted = msparser.vectord()
171 vectors_sorted = sample[2]
172 vectors_sorted.sort()
173 for i in ms_range(0, len(vectors_sorted)-1):
174 vec_sorted.append(vectors_sorted[i])
175 aux_helper(vec, vec_sorted)
176 print("\n")
177
178'''
179Running the program as
180
181python tools_stats.pl
182
183will give the following output under Mascot 2.5:
184
185Bionominal coefficients for sample size 6:
186 6 choose 0 = 1.000000
187 6 choose 1 = 6.000000
188 6 choose 2 = 15.000000
189 6 choose 3 = 20.000000
190 6 choose 4 = 15.000000
191 6 choose 5 = 6.000000
192
193
194First 10 values of the Poisson(lambda = 1) distribution:
195 Poisson(lambda 1, k 0) = 0.367879
196 Poisson(lambda 1, k 1) = 0.367879
197 Poisson(lambda 1, k 2) = 0.183940
198 Poisson(lambda 1, k 3) = 0.061313
199 Poisson(lambda 1, k 4) = 0.015328
200 Poisson(lambda 1, k 5) = 0.003066
201 Poisson(lambda 1, k 6) = 0.000511
202 Poisson(lambda 1, k 7) = 0.000073
203 Poisson(lambda 1, k 8) = 0.000009
204 Poisson(lambda 1, k 9) = 0.000001
205
206
207Common values of the standard normal distribution:
208 Normal-1(0.001000) = 3.090000
209 Normal-1(0.010000) = 2.318000
210 Normal-1(0.025000) = 1.959000
211 Normal-1(0.050000) = 1.642000
212 Normal-1(0.100000) = 1.282000
213 Normal-1(0.500000) = 0.000000
214 Normal-1(0.900000) = 1.282000
215 Normal-1(0.950000) = 1.642000
216 Normal-1(0.975000) = 1.959000
217 Normal-1(0.990000) = 2.318000
218 Normal-1(0.999000) = 3.090000
219
220
221 Normal(x <= -3.090000) = 0.001000
222 Normal(x <= -2.320000) = 0.005540
223 Normal(x <= -1.960000) = 0.025000
224 Normal(x <= -1.640000) = 0.050500
225 Normal(x <= -1.280000) = 0.100270
226 Normal(x <= 0.000000) = 0.500000
227 Normal(x <= 1.280000) = 0.899730
228 Normal(x <= 1.640000) = 0.949500
229 Normal(x <= 1.960000) = 0.975000
230 Normal(x <= 2.320000) = 0.994460
231 Normal(x <= 3.090000) = 0.999000
232
233
234Common values of the chi-square distribution at 10 degrees of freedom:
235 chi-sq(df 10, sigLevel 0.001000) = <1.479000, 29.588000>
236 chi-sq(df 10, sigLevel 0.010000) = <2.558000, 23.209000>
237 chi-sq(df 10, sigLevel 0.025000) = <3.247000, 20.483000>
238 chi-sq(df 10, sigLevel 0.050000) = <3.940000, 18.307000>
239 chi-sq(df 10, sigLevel 0.100000) = <4.865000, 15.987000>
240
241
242Common values of the Student's t distribution at 10 degrees of freedom: t(df 10, sigLevel 0.001000) = 4.143000
243 t(df 10, sigLevel 0.005000) = 3.169000
244 t(df 10, sigLevel 0.010000) = 2.764000
245 t(df 10, sigLevel 0.025000) = 2.228000
246 t(df 10, sigLevel 0.050000) = 1.812000
247 t(df 10, sigLevel 0.100000) = 1.372000
248
249
250Common values of the F distribution at 10 and 1 degrees of freedom:
251 F(df1 10, df2 1, sigLevel 0.010000) = 6055.850000
252 F(df1 10, df2 1, sigLevel 0.050000) = 241.882000
253 F(df1 10, df2 1, sigLevel 0.100000) = 60.195000
254
255
256Sample 'uniform(0,1)' = [[0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424, 0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575]]
257 Sum = 5.21488
258 Sum of squares = 3.10813
259 Log transformed = [(-0.915507673489646, -0.4538866166025002, -0.22900060856873583, -0.7655475726582238, -0.342958886300134, -0.2723101916281718, -0.48251263548864914, -1.2637809832124323, -1.379492883616736, -1.216715346244483)]
260 Exp transformed = [(1.4922922282212572, 1.8873141716084079, 2.215167562218903, 1.592140275968374, 2.03331484127, 2.1417387024328853, 1.853786986176205, 1.3265526176931346, 1.2862180459002401, 1.3447410816291303)]
261 Arithmetic stdev = 0.2078
262 Arithmetic mean = 0.521488
263 Standard error of mean = 0.065712
264 Median = 0.541155
265 Standard error of median = 0.0773299
266 Geometric mean = 0.480864
267 Geometric stdev = 1.54969
268 Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.75114
269 Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.607474
270 Shapiro-Wilk ok = 1, W = 0.904955259788, p-value = 0.248103833546
271 Outlier detection('auto', 0.05') = <0, 0>
272
273
274Sample 'normal(5,2.5)' = [[3.1476833, 9.20619, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.054284, 0.6643241, 4.3697818]]
275 Sum = 41.8331
276 Sum of squares = 222.941
277 Log transformed = [(1.146666721936465, 2.2198760838959566, 0.9585605993203508, 1.489776829350127, 1.8583828156064155, 1.47581277827312, 1.2694288887485337, 1.1165451952692114, -0.40898514617929754, 1.474713076510773)]
278 Exp transformed = [(23.282064495279258, 9958.582289145075, 13.571064284632554, 84.44541928145347, 609.9376873108102, 79.40726795332382, 35.1217114826069, 21.205996596360794, 1.9431766842547191, 79.02638626067342)]
279 Arithmetic stdev = 2.30796
280 Arithmetic mean = 4.18331
281 Standard error of mean = 0.72984
282 Median = 3.9643
283 Standard error of median = 0.858876
284 Geometric mean = 3.5257
285 Geometric stdev = 2.00172
286 Hypothesis: distribution is N(5, sigma^2) (mean test). p-value = 0.292114
287 Hypothesis: distribution is N(5, sigma^2) (median test). p-value = 0.258609
288 Shapiro-Wilk ok = 1, W = 0.91741836864, p-value = 0.335925333025
289 Outlier detection('auto', 0.05') = <0, 0>
290
291
292Sample 'beta(0.5,0.5)' = [[0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737, 0.87764355, 0.12803053]]
293 Sum = 3.85904
294 Sum of squares = 2.3588
295 Log transformed = [(-0.28216072584468, -4.7113090044449795, -1.506710521909531, -0.8529711682072173, -0.42095286360724493, -1.4924630121200262, -3.8144033127847012, -0.6177730701548017, -0.13051474727786935, -2.055486527877849)]
296 Exp transformed = [(2.1258090578036466, 1.0090335564961697, 1.2481192933509702, 1.5313457056396895, 1.9278801704488464, 1.252095118456305, 1.0222957852898442, 1.7145381378978928, 2.4052252302395134, 1.1365877021899178)]
297 Arithmetic stdev = 0.310837
298 Arithmetic mean = 0.385904
299 Standard error of mean = 0.0982953
300 Median = 0.325483
301 Standard error of median = 0.115674
302 Geometric mean = 0.204237
303 Geometric stdev = 4.71093
304 Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.275606
305 Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.165651
306 Shapiro-Wilk ok = 1, W = 0.930482521074, p-value = 0.452655082151
307 Outlier detection('auto', 0.05') = <0, 0>
308
309'''