Matrix Science Mascot Parser toolkit
 
Loading...
Searching...
No Matches
tools_stats.pl

Using statistical routines.

#!/usr/local/bin/perl
##############################################################################
# file: tools_stats.pl #
# Mascot Parser toolkit example code #
##############################################################################
# COPYRIGHT NOTICE #
# Copyright 1998-2012 Matrix Science Limited All Rights Reserved. #
# #
##############################################################################
# $Source: parser/examples/test_perl/tools_stats.pl $
# $Author: villek@matrixscience.com $
# $Date: 2018-07-30 16:23:53 +0100 $
# $Revision: 1b450440f9c97e1e41d0fc6016a27d68951d4532 | MSPARSER_REL_3_0_0-2024-09-24-0-g93ebaeb4f4 $
##############################################################################
use strict;
##############################################################################
use msparser;
print "Binomial coefficients for sample size 6:\n";
for my $i (0 .. 6) {
printf "\t6 choose %d = %g\n", $i, msparser::ms_quant_stats::binomialCoefficient(6, $i);
}
print "First 10 values of the Poisson(lambda = 1) distribution:\n";
for my $i (1 .. 10) {
printf "\tPoisson(lambda 1, k %d) = %g\n", $i, msparser::ms_quant_stats::poissonDensity(1, $i);
}
print "Common values of the standard normal distribution:\n";
for (0.001, 0.01, 0.025, 0.05, 0.1, 0.5, 0.9, 0.95, 0.975, 0.99, 0.999) {
# Note that normalCriticalValue() takes an argument between 0.5 and 1.0;
# that is, it returns only the right tail critical values.
my $critval;
if ($_ < 0.5) {
$critval = -msparser::ms_quant_stats::normalCriticalValue(1 - $_);
} else {
$critval = msparser::ms_quant_stats::normalCriticalValue($_);
}
printf "\tNormal-1(%g) = %g\n", $_, $critval;
}
for (-3.09, -2.32, -1.96, -1.64, -1.28, 0, 1.28, 1.64, 1.96, 2.32, 3.09) {
# Note that normalCumulativeProbability() takes an argument between 0 and
# 4.09; that is, it reports only the right tail probability.
my $p;
if ($_ < 0) {
$p = 0.5 - msparser::ms_quant_stats::normalCumulativeProbability(-$_);
} else {
$p = 0.5 + msparser::ms_quant_stats::normalCumulativeProbability($_);
}
printf "\tNormal(x <= %g) = %g\n", $_, $p;
}
print "Common values of the chi-square distribution at 10 degrees of freedom:\n";
# Note the significance levels the chi-square functions support.
for (0.001, 0.01, 0.025, 0.05, 0.1) {
my $lowcrit = msparser::ms_quant_stats::chisqLowerCriticalValue(10, $_);
my $upcrit = msparser::ms_quant_stats::chisqUpperCriticalValue(10, $_);
printf "\tchi-sq(df 10, sigLevel %g) = <%g, %g>\n", $_, $lowcrit, $upcrit;
}
print "Common values of the Student's t distribution at 10 degrees of freedom:\n";
# Note the significance levels the Student's t function supports.
for (0.001, 0.005, 0.01, 0.025, 0.05, 0.1) {
printf "\tt(df 10, sigLevel %g) = %g\n",
$_,
msparser::ms_quant_stats::studentsCriticalValue(10, $_);
}
print "Common values of the F distribution at 10 and 1 degrees of freedom:\n";
# Note the significance levels the F function supports.
for (0.01, 0.05, 0.1) {
printf "\tF(df1 10, df2 1, sigLevel %g) = %g\n",
$_,
msparser::ms_quant_stats::snedecorsCriticalValue(10, 1, $_);
}
my @samples = (
# Uniform data generated with
# perl -e 'print join "\n", map {+ rand() } 0 .. 9'
{ name => 'uniform(0,1)', popmean => 0.5, data => [split /\s+/, <<EOD] },
0.400313346021907
0.635154745166318
0.795328049493161
0.465079196585926
0.709667388012424
0.76161797692474
0.617230566355229
0.282583560369936
0.251706165020014
0.2962014901575
EOD
# Normal(5, 2.5) data generated in R with
# > rnorm(10, 5, 2.5)
{ name => 'normal(5,2.5)', popmean => 5, data => [split /\s+/, <<EOD] },
3.1476833
9.2061900
2.6079399
4.4361054
6.4133568
4.3745899
3.5588195
3.0542840
0.6643241
4.3697818
EOD
# Beta(0.5, 0.5) data generated in R with
# > rbeta(10, 0.5, 0.5)
{ name => 'beta(0.5,0.5)', popmean => 0.5, data => [split /\s+/, <<EOD] },
0.754152463
0.008992998
0.221637853
0.426146895
0.656421042
0.224818243
0.022050868
0.539143737
0.877643550
0.128030530
EOD
);
print "\n";
for my $sample (@samples) {
printf "Sample '%s' = [%s]\n", $sample->{'name'}, join(', ', @{ $$sample{'data'} });
# ms_quant_stats takes as input STL vectors of type double. We can easily
# convert a Perl array into one:
my $vec = msparser::vectord->new(scalar(@{ $$sample{'data'} }));
for (0 .. $#{ $$sample{'data'} }) {
$vec->set($_, $$sample{'data'}[$_]);
}
# The sorted sample vector is needed for the Shapiro-Wilk W test.
# It also makes finding the sample median fast.
my $vec_sorted;
do {
my @sorted = sort { $a <=> $b } @{ $$sample{'data'} };
$vec_sorted = msparser::vectord->new(scalar(@sorted));
for (0 .. $#{ $$sample{'data'} }) {
$vec_sorted->set($_, $sorted[$_]);
}
};
# Auxiliary helper functions:
printf "\tSum = %g\n", msparser::ms_quant_stats::sum($vec);
printf "\tSum of squares = %g\n", msparser::ms_quant_stats::sumsq($vec);
printf "\tLog transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::logTransform($vec) });
printf "\tExp transformed = [%s]\n", join(', ', @{ msparser::ms_quant_stats::expTransform($vec) });
# Basic sample statistics:
my $xbar = msparser::ms_quant_stats::arithmeticMean($vec);
my $stdev = msparser::ms_quant_stats::arithmeticStandardDeviation($vec, $xbar);
my $median = msparser::ms_quant_stats::sortedMedian($vec_sorted);
# If the vector isn't sorted:
# my $median = msparser::ms_quant_stats::unsortedMedian($vec);
my $geombar = msparser::ms_quant_stats::geometricMean($vec);
my $geomdev = msparser::ms_quant_stats::geometricStandardDeviation($vec, $geombar);
printf "\tArithmetic stdev = %g\n", $stdev;
printf "\tArithmetic mean = %g\n", $xbar;
printf "\tStandard error of mean = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMean($stdev, $vec->size());
printf "\tMedian = %g\n", $median;
printf "\tStandard error of median = %g\n", msparser::ms_quant_stats::arithmeticStandardErrorOfMedian($stdev, $vec->size());
printf "\tGeometric mean = %g\n", $geombar;
printf "\tGeometric stdev = %g\n", $geomdev;
# Basic distribution tests for normally distributed data:
printf "\tHypothesis: distribution is N(%g, sigma^2) (mean test). p-value = %g\n",
$$sample{'popmean'},
msparser::ms_quant_stats::calculateNormalMeanPvalue($$sample{'popmean'}, $xbar, $stdev, $vec->size());
printf "\tHypothesis: distribution is N(%g, sigma^2) (median test). p-value = %g\n",
$$sample{'popmean'},
msparser::ms_quant_stats::calculateNormalMedianPvalue($$sample{'popmean'}, $median, $stdev, $vec->size());
# (We would expect the normal sample to have the highest p-value, i.e.
# least evidence against the hypothesis, while the uniform and beta
# distribution should have fairly low p-values. However, sample size is
# so small that the tests are not very reliable.)
do {
my ($ok, $W, $p) = msparser::ms_quant_stats::testShapiroWilkW($vec_sorted);
printf "\tShapiro-Wilk ok = %d, W = %s, p-value = %s\n", $ok, (defined $W ? $W : '(none)'), (defined $p ? $p : '(none)');
};
# Outlier detection can be done even when the above tests indicate that
# the sample is not normally distributed. However, don't trust the results!
do {
# Note that the input vector must be sorted for outlier testing to
# work.
my ($numL, $numH) = msparser::ms_quant_stats::detectOutliers($vec_sorted, "auto", 0.05);
# $numL and $numH are the number of elements at each end of $vec_sorted
# which seem to be outliers. That is, elements between 0 and $numL-1,
# and $vec_sorted->size()-$numH to $vec_sorted->size()-1.
printf "\tOutlier detection('auto', 0.05') = <%s, %s>\n", (defined $numL ? $numL : '(none)'), (defined $numH ? $numH : '(none)');
};
print "\n";
}
=pod
Running the program as
perl -I../bin tools_stats.pl
will give the following output under Mascot 2.5:
Binomial coefficients for sample size 6:
6 choose 0 = 1
6 choose 1 = 6
6 choose 2 = 15
6 choose 3 = 20
6 choose 4 = 15
6 choose 5 = 6
6 choose 6 = 1
First 10 values of the Poisson(lambda = 1) distribution:
Poisson(lambda 1, k 1) = 0.367879
Poisson(lambda 1, k 2) = 0.18394
Poisson(lambda 1, k 3) = 0.0613132
Poisson(lambda 1, k 4) = 0.0153283
Poisson(lambda 1, k 5) = 0.00306566
Poisson(lambda 1, k 6) = 0.000510944
Poisson(lambda 1, k 7) = 7.2992e-05
Poisson(lambda 1, k 8) = 9.12399e-06
Poisson(lambda 1, k 9) = 1.01378e-06
Poisson(lambda 1, k 10) = 1.01378e-07
Common values of the standard normal distribution:
Normal-1(0.001) = -3.09
Normal-1(0.01) = -2.318
Normal-1(0.025) = -1.959
Normal-1(0.05) = -1.642
Normal-1(0.1) = -1.282
Normal-1(0.5) = 0
Normal-1(0.9) = 1.282
Normal-1(0.95) = 1.642
Normal-1(0.975) = 1.959
Normal-1(0.99) = 2.318
Normal-1(0.999) = 3.09
Normal(x <= -3.09) = 0.001
Normal(x <= -2.32) = 0.00554
Normal(x <= -1.96) = 0.025
Normal(x <= -1.64) = 0.0505
Normal(x <= -1.28) = 0.10027
Normal(x <= 0) = 0.5
Normal(x <= 1.28) = 0.89973
Normal(x <= 1.64) = 0.9495
Normal(x <= 1.96) = 0.975
Normal(x <= 2.32) = 0.99446
Normal(x <= 3.09) = 0.999
Common values of the chi-square distribution at 10 degrees of freedom:
chi-sq(df 10, sigLevel 0.001) = <1.479, 29.588>
chi-sq(df 10, sigLevel 0.01) = <2.558, 23.209>
chi-sq(df 10, sigLevel 0.025) = <3.247, 20.483>
chi-sq(df 10, sigLevel 0.05) = <3.94, 18.307>
chi-sq(df 10, sigLevel 0.1) = <4.865, 15.987>
Common values of the Student's t distribution at 10 degrees of freedom:
t(df 10, sigLevel 0.001) = 4.143
t(df 10, sigLevel 0.005) = 3.169
t(df 10, sigLevel 0.01) = 2.764
t(df 10, sigLevel 0.025) = 2.228
t(df 10, sigLevel 0.05) = 1.812
t(df 10, sigLevel 0.1) = 1.372
Common values of the F distribution at 10 and 1 degrees of freedom:
F(df1 10, df2 1, sigLevel 0.01) = 6055.85
F(df1 10, df2 1, sigLevel 0.05) = 241.882
F(df1 10, df2 1, sigLevel 0.1) = 60.195
Sample 'uniform(0,1)' = [0.400313346021907, 0.635154745166318, 0.795328049493161, 0.465079196585926, 0.709667388012424, 0.76161797692474, 0.617230566355229, 0.282583560369936, 0.251706165020014, 0.2962014901575]
Sum = 5.21488
Sum of squares = 3.10813
Log transformed = [-0.915507673489646, -0.4538866166025, -0.229000608568736, -0.765547572658224, -0.342958886300134, -0.272310191628172, -0.482512635488649, -1.26378098321243, -1.37949288361674, -1.21671534624448]
Exp transformed = [1.49229222822126, 1.88731417160841, 2.2151675622189, 1.59214027596837, 2.03331484127, 2.14173870243289, 1.85378698617621, 1.32655261769313, 1.28621804590024, 1.34474108162913]
Arithmetic stdev = 0.2078
Arithmetic mean = 0.521488
Standard error of mean = 0.065712
Median = 0.541155
Standard error of median = 0.0773299
Geometric mean = 0.480864
Geometric stdev = 1.54969
Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.75114
Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.607474
Shapiro-Wilk ok = 1, W = 0.904955259787869, p-value = 0.248103833545881
Outlier detection('auto', 0.05') = <0, 0>
Sample 'normal(5,2.5)' = [3.1476833, 9.2061900, 2.6079399, 4.4361054, 6.4133568, 4.3745899, 3.5588195, 3.0542840, 0.6643241, 4.3697818]
Sum = 41.8331
Sum of squares = 222.941
Log transformed = [1.14666672193647, 2.21987608389596, 0.958560599320351, 1.48977682935013, 1.85838281560642, 1.47581277827312, 1.26942888874853, 1.11654519526921, -0.408985146179298, 1.47471307651077]
Exp transformed = [23.2820644952793, 9958.58228914508, 13.5710642846326, 84.4454192814535, 609.93768731081, 79.4072679533238, 35.1217114826069, 21.2059965963608, 1.94317668425472, 79.0263862606734]
Arithmetic stdev = 2.30796
Arithmetic mean = 4.18331
Standard error of mean = 0.72984
Median = 3.9643
Standard error of median = 0.858876
Geometric mean = 3.5257
Geometric stdev = 2.00172
Hypothesis: distribution is N(5, sigma^2) (mean test). p-value = 0.292114
Hypothesis: distribution is N(5, sigma^2) (median test). p-value = 0.258609
Shapiro-Wilk ok = 1, W = 0.917418368639979, p-value = 0.335925333025062
Outlier detection('auto', 0.05') = <0, 0>
Sample 'beta(0.5,0.5)' = [0.754152463, 0.008992998, 0.221637853, 0.426146895, 0.656421042, 0.224818243, 0.022050868, 0.539143737, 0.877643550, 0.128030530]
Sum = 3.85904
Sum of squares = 2.3588
Log transformed = [-0.28216072584468, -4.71130900444498, -1.50671052190953, -0.852971168207217, -0.420952863607245, -1.49246301212003, -3.8144033127847, -0.617773070154802, -0.130514747277869, -2.05548652787785]
Exp transformed = [2.12580905780365, 1.00903355649617, 1.24811929335097, 1.53134570563969, 1.92788017044885, 1.2520951184563, 1.02229578528984, 1.71453813789789, 2.40522523023951, 1.13658770218992]
Arithmetic stdev = 0.310837
Arithmetic mean = 0.385904
Standard error of mean = 0.0982953
Median = 0.325483
Standard error of median = 0.115674
Geometric mean = 0.204237
Geometric stdev = 4.71093
Hypothesis: distribution is N(0.5, sigma^2) (mean test). p-value = 0.275606
Hypothesis: distribution is N(0.5, sigma^2) (median test). p-value = 0.165651
Shapiro-Wilk ok = 1, W = 0.930482521074392, p-value = 0.452655082151213
Outlier detection('auto', 0.05') = <0, 0>
=cut