Mathematical and statistical routines for peptide and protein quantitation.
#include <ms_quant_stats.hpp>
Static Public Member Functions

static double arithmeticMean (const std::vector< double > &data)
    Calculate the arithmetic mean.
static double arithmeticStandardDeviation (const std::vector< double > &data, double arithmeticMean)
    Calculate the arithmetic sample standard deviation.
static double arithmeticStandardErrorOfMean (double arithmeticStdev, unsigned long sampleSize)
    Calculate the standard error of the arithmetic sample mean.
static double arithmeticStandardErrorOfMedian (double arithmeticStdev, unsigned long sampleSize)
    Calculate the standard error of the sample median.
static double binomialCoefficient (unsigned int n, unsigned int k)
    Compute the binomial coefficient.
static double calculateNormalMeanPvalue (double populationMean, double mean, double stdev, unsigned long sampleSize)
    Calculate the p-value that the population mean is populationMean, given the sample statistics and assuming the population is normally distributed.
static double calculateNormalMedianPvalue (double populationMedian, double median, double stdev, unsigned long sampleSize)
    Calculate the p-value that the population median is populationMedian, given the sample statistics and assuming the population is normally distributed.
static double calculateNormalWeightedPvalue (double populationMean, double weightedMean, const std::vector< double > &weights, double weightedStdev, unsigned long sampleSize)
    Calculate the p-value that the population mean is populationMean, given the weighted sample statistics and assuming the population is normally distributed.
static double chisqLowerCriticalValue (unsigned int df, double sigLevel)
    Return the lower critical value of the chi-square distribution at the given significance level.
static double chisqUpperCriticalValue (unsigned int df, double sigLevel)
    Return the upper critical value of the chi-square distribution at the given significance level.
static void detectOutliers (const std::vector< double > &sample, std::string outlierMethod, double sigLevel, unsigned int *const numLow, unsigned int *const numHigh)
    Given a sample and an outlier method, return the number of outliers at low and high extreme at the given significance level.
static void detectOutliers (const std::vector< double > &sample, std::string outlierMethod, unsigned int *const numLow, unsigned int *const numHigh)
    Given a sample and an outlier method, return the number of outliers at low and high extreme at significance level 0.05.
static std::vector< double > expTransform (const std::vector< double > &logData)
    Antilog (exp) transform of the input data.
static double geometricMean (const std::vector< double > &data)
    Calculate the geometric mean.
static double geometricStandardDeviation (const std::vector< double > &data, double geometricMean)
    Calculate the geometric sample standard deviation.
static std::vector< double > logTransform (const std::vector< double > &data)
    Log transform of the input data.
static std::vector< double > meanCentre (const std::vector< double > &data, double mean)
    Centre the data values about the given mean.
static double medianStandardErrorCorrection (unsigned long sampleSize)
    Return the correction for the standard error of the median.
static double normalCriticalValue (double p)
    Return the critical value of the standard normal distribution at the given cumulative probability.
static double normalCumulativeProbability (double x0)
    Return the cumulative probability that the standard normal statistic is at most x0.
static double poissonDensity (double lambda, unsigned int k)
    Calculate the probability density of the Poisson distribution.
static double snedecorsCriticalValue (unsigned int df1, unsigned int df2, double sigLevel)
    Return the critical value of Snedecor's F distribution at the given significance level.
static double sortedMedian (const std::deque< double > &data)
    Find the median of a sorted data deque.
static double sortedMedian (const std::vector< double > &data)
    Find the median of a sorted data vector.
static double studentsCriticalValue (unsigned int df, double sigLevel)
    Return the critical value of Student's t distribution at the given significance level.
static double sum (const std::vector< double > &data)
    Calculate the sum of a sample vector.
static double sumsq (const std::vector< double > &data)
    Calculate the sum of squares of a sample vector.
static int testDixonsN9 (unsigned long sampleSize, double min, double x2, double xn_1, double max, double sigLevel)
    Apply Dixon's N9 (r11) outlier test to the sample.
static int testGrubbsN2 (unsigned long sampleSize, double min, double mean, double max, double stdev, double sigLevel)
    Apply Grubbs' N2 outlier test to the sample statistics.
static void testRosners (const std::vector< double > &sample, unsigned int maxOutliers, double sigLevel, unsigned int *const numLow, unsigned int *const numHigh)
    Apply the generalised extreme Studentised deviate (ESD) outlier test (aka Rosner's test) to the sample.
static bool testShapiroWilkW (const std::vector< double > &sample, double *const W, double *const pValue)
    Apply the Shapiro-Wilk W normality test to the sample.
static double unsortedMedian (const std::vector< double > &data)
    Find the median of an unsorted data vector.
static double weightedArithmeticMean (const std::vector< double > &data, const std::vector< double > &weights)
    Calculate the weighted arithmetic mean.
static double weightedArithmeticStandardDeviation (const std::vector< double > &data, const std::vector< double > &weights, double weightedMean)
    Calculate the weighted arithmetic sample standard deviation.
static double weightedArithmeticStandardError (const std::vector< double > &weights, double weightedStdev, unsigned long sampleSize)
    Calculate the standard error of the weighted arithmetic sample mean.
static double weightedGeometricMean (const std::vector< double > &data, const std::vector< double > &weights)
    Calculate the weighted geometric mean.
static double weightedGeometricStandardDeviation (const std::vector< double > &data, const std::vector< double > &weights, double weightedMean)
    Calculate the weighted geometric sample standard deviation.
Mathematical and statistical routines for peptide and protein quantitation.
ms_quant_stats is a collection of mathematical and statistical routines used in peptide and protein quantitation by ms_ms2quantitation and ms_customquantitation. It is possible to use the routines outside this context, as they are not tied to either class.
The class contains only static class methods; it is impossible and meaningless to create instances of ms_quant_stats.
static double arithmeticMean (const std::vector< double > &data)
Calculate the arithmetic mean.
The arithmetic mean is the usual average:
mean(data) = (1/N) * sum_i x_i
where N is the number of elements in data and the summation is over all elements x_i.
data | Any vector of numbers. |
static double arithmeticStandardDeviation (const std::vector< double > &data, double arithmeticMean)
Calculate the arithmetic sample standard deviation.
The arithmetic sample standard deviation is the square root of the arithmetic sample variance. The sample variance is the second sample moment about the sample mean, scaled by N - 1, where N is the sample size. That is,
var(data) = (1/(N - 1)) * sum_i (x_i - arithmeticMean)^2
where the sum is over all elements x_i.
data | A sample of data. |
arithmeticMean | Sample arithmetic mean. |
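For illustration, a minimal sketch combining arithmeticMean() and arithmeticStandardDeviation(); the matrix_science namespace and the data values are assumptions of the example, not part of the interface:

```cpp
#include <ms_quant_stats.hpp>

#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // Illustrative peptide ratio values.
    std::vector<double> data = {0.95, 1.10, 1.02, 0.98, 1.05};

    double mean  = ms_quant_stats::arithmeticMean(data);
    double stdev = ms_quant_stats::arithmeticStandardDeviation(data, mean);

    std::cout << "mean = " << mean << ", stdev = " << stdev << std::endl;
    return 0;
}
```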
static double arithmeticStandardErrorOfMean (double arithmeticStdev, unsigned long sampleSize)
Calculate the standard error of the arithmetic sample mean.
The standard error of the arithmetic sample mean in a normally distributed sample is simple: standard deviation divided by the square root of the sample size.
arithmeticStdev | Arithmetic standard deviation, for example from arithmeticStandardDeviation(). |
sampleSize | The sample size. |
static double arithmeticStandardErrorOfMedian (double arithmeticStdev, unsigned long sampleSize)
Calculate the standard error of the sample median.
The standard error of the sample median in a normally distributed sample is the same as the standard error of the sample mean, times a correction factor. The correction factor is documented in medianStandardErrorCorrection().
arithmeticStdev | Arithmetic standard deviation, for example from arithmeticStandardDeviation(). |
sampleSize | The sample size. |
static double binomialCoefficient (unsigned int n, unsigned int k)
Compute the binomial coefficient.
The method returns 1 in the following cases, by definition: when k = 0 and when k = n.
Note that although the binomial coefficient is an integer, the return value is double to allow values beyond about 4 billion on 32-bit systems. The value is exact up to about 74C37.
n | Population size. |
k | Sample size. |
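A small example call; the values are arbitrary and the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // C(5, 2) = 10: the number of ways to choose 2 items out of 5.
    std::cout << ms_quant_stats::binomialCoefficient(5, 2) << std::endl;
    return 0;
}
```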
static double calculateNormalMeanPvalue (double populationMean, double mean, double stdev, unsigned long sampleSize)
Calculate the p-value that the population mean is populationMean, given the sample statistics and assuming the population is normally distributed.
Hypothesis: the population you are sampling from is normally distributed with mean populationMean and unknown variance.
Statistic: Student's t = (mean - populationMean) / (stdev / sqrt(sampleSize))
Assume the hypothesis is true. This method returns the probability (the p-value) that the statistic t could have a value as extreme (or more extreme) as the one observed. Low p-value means that either the hypothesis is wrong or the sample is exceptionally rare.
(Note that there is no alternative hypothesis. The p-value with this interpretation is part of Fisher's significance test, which does not decide between hypotheses. Recommended reading: Hubbard, R. et al. (2003): Confusion over Measures of Evidence (p's) versus Errors (alphas) in Classical Statistical Testing. The American Statistician 57(3):171-182.)
populationMean | Assumed population mean. (Population variance is assumed unknown.) |
mean | Sample arithmetic mean. |
stdev | Sample standard deviation. This must be positive. |
sampleSize | Sample size. This must be at least 1. |
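As a sketch of typical use, the sample statistics can be computed with the other routines in this class and passed in; the data values and the matrix_science namespace are assumptions of the example. Ratios are log transformed first, as recommended elsewhere in this class for multiplicative data:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // Illustrative peptide ratios, log transformed because ratios are multiplicative.
    std::vector<double> logRatios = ms_quant_stats::logTransform(
        std::vector<double>{1.08, 0.97, 1.21, 1.15, 1.04, 1.11});

    double mean  = ms_quant_stats::arithmeticMean(logRatios);
    double stdev = ms_quant_stats::arithmeticStandardDeviation(logRatios, mean);

    // p-value for the hypothesis that the population mean log ratio is 0,
    // i.e. that the underlying ratio is 1.
    double p = ms_quant_stats::calculateNormalMeanPvalue(0.0, mean, stdev,
                                                         logRatios.size());
    std::cout << "p-value = " << p << std::endl;
    return 0;
}
```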
static double calculateNormalMedianPvalue (double populationMedian, double median, double stdev, unsigned long sampleSize)
Calculate the p-value that the population median is populationMedian, given the sample statistics and assuming the population is normally distributed.
Hypothesis: the population you are sampling from is normally distributed with median populationMedian and unknown variance. In particular, the population median is equal to the population mean.
Statistic: Student's t for the sample median, t = (1/C) * (median - populationMedian) / (stdev / sqrt(sampleSize))
Assume the hypothesis is true. This method returns the probability (the p-value) that the statistic t could have a value as extreme (or more extreme) as the one observed. Low p-value means that either the hypothesis is wrong or the sample is exceptionally rare.
(Note that there is no alternative hypothesis. The p-value with this interpretation is part of Fisher's significance test, which does not decide between hypotheses. Recommended reading: Hubbard, R. et al. (2003): Confusion over Measures of Evidence (p's) versus Errors (alphas) in Classical Statistical Testing. The American Statistician 57(3):171-182.)
The constant C is a correction to the standard error of the median, retrieved from medianStandardErrorCorrection().
populationMedian | Assumed population median. (Population variance is assumed unknown.) |
median | Sample median. |
stdev | Sample standard deviation. This must be positive. |
sampleSize | Sample size. This must be at least 1. |
static double calculateNormalWeightedPvalue (double populationMean, double weightedMean, const std::vector< double > &weights, double weightedStdev, unsigned long sampleSize)
Calculate the p-value that the population mean is populationMean, given the weighted sample statistics and assuming the population is normally distributed.
Hypothesis: the population you are sampling from is normally distributed with mean populationMean and unknown variance.
Statistic: Student's t = (weightedMean - populationMean) / (stdev * sqrt(W) / b),
where stdev is the sample standard deviation, W is the sum of weights and b is the effective base, as defined in weightedArithmeticStandardError(). Note that this function takes the weighted standard deviation and the weights as arguments, not stdev.
Assume the hypothesis is true. This method returns the probability (the p-value) that the statistic t could have a value as extreme (or more extreme) as the one observed. Low p-value means that either the hypothesis is wrong or the sample is exceptionally rare.
(Note that there is no alternative hypothesis. The p-value with this interpretation is part of Fisher's significance test, which does not decide between hypotheses. Recommended reading: Hubbard, R. et al. (2003): Confusion over Measures of Evidence (p's) versus Errors (alphas) in Classical Statistical Testing. The American Statistician 57(3):171-182.)
populationMean | Assumed population mean. (Population variance is assumed unknown.) |
weightedMean | Weighted sample arithmetic mean. |
weights | Positive weight for each datum. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than sampleSize, the weights of those elements are taken to be 1.0. |
weightedStdev | Weighted sample standard deviation. This must be positive. |
sampleSize | Sample size. This must be at least 1. |
static double chisqLowerCriticalValue (unsigned int df, double sigLevel)
Return the lower critical value of the chi-square distribution at the given significance level.
Suppose the random variable X has the chi-square distribution with df degrees of freedom. This routine returns the lower critical value x0 for which
P(X <= x0) = p
where p is the desired significance level. Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.050 < s <= 0.100 | 0.1 |
0.025 < s <= 0.050 | 0.05 |
0.010 < s <= 0.025 | 0.025 |
0.001 < s <= 0.010 | 0.01 |
s <= 0.001 | 0.001 |
If sigLevel exceeds 0.100, it is truncated to 0.100.
df | The degrees of freedom between 1 and 100 inclusive. |
sigLevel | Significance level. See the above table for supported values. |
static double chisqUpperCriticalValue (unsigned int df, double sigLevel)
Return the upper critical value of the chi-square distribution at the given significance level.
Suppose the random variable X has the chi-square distribution with df degrees of freedom. This routine returns the upper critical value x0 for which
P(X <= x0) = 1 - p
where p is the desired significance level. Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.050 < s <= 0.100 | 0.1 |
0.025 < s <= 0.050 | 0.05 |
0.010 < s <= 0.025 | 0.025 |
0.005 < s <= 0.010 | 0.01 |
0.001 < s <= 0.005 | 0.005 |
s <= 0.001 | 0.001 |
If sigLevel exceeds 0.100, it is truncated to 0.100.
df | The degrees of freedom between 1 and 100 inclusive. |
sigLevel | Significance level. See the above table for supported values. |
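One common use of the two chi-square critical values is to bracket a sample variance; a sketch under the usual normality assumption, with illustrative data and an assumed matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> data = {0.95, 1.10, 1.02, 0.98, 1.05, 1.07, 0.99};
    unsigned int df = static_cast<unsigned int>(data.size()) - 1;

    double mean  = ms_quant_stats::arithmeticMean(data);
    double stdev = ms_quant_stats::arithmeticStandardDeviation(data, mean);
    double ss    = stdev * stdev * df; // (N - 1) * s^2

    // Two-sided interval for the population variance with 5% in each tail
    // (about 90% coverage): [ss / upper, ss / lower].
    double lower = ms_quant_stats::chisqLowerCriticalValue(df, 0.05);
    double upper = ms_quant_stats::chisqUpperCriticalValue(df, 0.05);

    std::cout << "variance interval: [" << ss / upper << ", "
              << ss / lower << "]" << std::endl;
    return 0;
}
```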
static void detectOutliers (const std::vector< double > &sample, std::string outlierMethod, double sigLevel, unsigned int *const numLow, unsigned int *const numHigh)
Given a sample and an outlier method, return the number of outliers at low and high extreme at the given significance level.
If outlierMethod is "none"
or the empty string, or if outlierMethod is not listed in Type mqm:outlierType, then the method simply returns with 0 outliers. Otherwise the corresponding outlier test will be used to count the number of low and high outliers at the given significance level.
If outlierMethod is "auto"
, then the method uses Dixon's N9 if sample size is less than 25 and Rosner's test otherwise.
Note that not all tests support the same set of significance levels. The common values between all three tests are as follows:
sigLevel (s ) | p |
---|---|
0.100 < s <= 0.200 | 0.2 |
0.050 < s <= 0.100 | 0.1 |
0.020 < s <= 0.050 | 0.05 |
0.010 < s <= 0.020 | 0.02 |
0.005 < s <= 0.010 | 0.01 |
If you specify a significance level outside these ranges, the effective significance level depends on which test is being used. Please refer to testDixonsN9(), testGrubbsN2() and testRosners() for more details.
Note that this method does not modify sample. You will need to remove the indicated outliers yourself if desired.
See Multiple return values in Perl, Java, Python and C#.
[in] | sample | A sample of data sorted in increasing numerical order. If the sample size is not large enough for outlierMethod, both numLow and numHigh will be 0 and no testing will be done. |
[in] | outlierMethod | Outlier method to use. Outlier methods are listed in Type mqm:outlierType. Note that the value is case sensitive. |
[in] | sigLevel | Significance level to use in tests. |
[out] | numLow | Number of outliers at the low end of sample; values at indices from 0 to numLow - 1 are outliers based on the chosen test. If numLow is 0, the sample has no outliers at the low end. |
[out] | numHigh | Number of outliers at the high end of sample; values at indices from N - numHigh to N - 1 are outliers based on the chosen test (N is the size of sample). If numHigh is 0, the sample has no outliers at the high end. |
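A sketch of the out-parameter pattern and of trimming the flagged values afterwards (the method itself leaves sample unchanged); the data and the choice of "auto" at significance level 0.05 are illustrative, and the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // The sample must be sorted in increasing order (illustrative values only).
    std::vector<double> sample = {0.12, 0.93, 0.97, 1.01, 1.02, 1.05,
                                  1.06, 1.08, 1.10, 1.13, 2.80};
    std::sort(sample.begin(), sample.end());

    unsigned int numLow = 0, numHigh = 0;
    ms_quant_stats::detectOutliers(sample, "auto", 0.05, &numLow, &numHigh);

    // The sample itself is not modified; trim the flagged values manually.
    sample.erase(sample.end() - numHigh, sample.end());
    sample.erase(sample.begin(), sample.begin() + numLow);

    std::cout << numLow << " low and " << numHigh
              << " high outliers removed; " << sample.size()
              << " values remain" << std::endl;
    return 0;
}
```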
static void detectOutliers (const std::vector< double > &sample, std::string outlierMethod, unsigned int *const numLow, unsigned int *const numHigh)
Given a sample and an outlier method, return the number of outliers at low and high extreme at significance level 0.05.
If outlierMethod is "none"
or the empty string, or if outlierMethod is not listed in Type mqm:outlierType, then the method simply returns with 0 outliers. Otherwise the corresponding outlier test will be used to count the number of low and high outliers at significance level 0.05. If you wish to specify a different significance level, use detectOutliers(const std::vector<double>&, std::string, double, unsigned int*const, unsigned int*const).
If outlierMethod is "auto"
, then the method uses Dixon's N9 if sample size is less than 25 and Rosner's test otherwise.
Note that this method does not modify sample. You will need to remove the indicated outliers yourself if desired.
See Multiple return values in Perl, Java, Python and C#.
[in] | sample | A sample of data sorted in increasing numerical order. If the sample size is not large enough for outlierMethod, both numLow and numHigh will be 0 and no testing will be done. |
[in] | outlierMethod | Outlier method to use. Outlier methods are listed in Type mqm:outlierType. Note that the value is case sensitive. |
[out] | numLow | Number of outliers at the low end of sample; values at indices from 0 to numLow - 1 are outliers based on the chosen test. If numLow is 0, the sample has no outliers at the low end. |
[out] | numHigh | Number of outliers at the high end of sample; values at indices from N - numHigh to N - 1 are outliers based on the chosen test (N is the size of sample). If numHigh is 0, the sample has no outliers at the high end. |
static std::vector< double > expTransform (const std::vector< double > &logData)
Antilog (exp) transform of the input data.
logData | Any vector of logarithms. |
static double geometricMean (const std::vector< double > &data)
Calculate the geometric mean.
The geometric mean is
geom(data) = (prod_i x_i)^(1/N)
where N is the number of elements in data and the product is over all the elements x_i. That is, the geometric mean is the N-th root of the product of all input values.
Equivalently, the logarithm of the geometric mean is equal to the arithmetic mean of the data in log space. This is actually how the mean is calculated by the method.
data | Any vector of numbers. |
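The log-space equivalence described above can be seen in a short sketch; it assumes logTransform() uses natural logarithms and that the class lives in the matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> ratios = {0.8, 1.0, 1.25, 2.0, 0.5};

    double geom = ms_quant_stats::geometricMean(ratios);

    // Equivalent route via log space (assuming natural logarithms in logTransform()).
    std::vector<double> logs = ms_quant_stats::logTransform(ratios);
    double viaLogs = std::exp(ms_quant_stats::arithmeticMean(logs));

    std::cout << geom << " == " << viaLogs << std::endl; // both 1.0 for this data
    return 0;
}
```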
static double geometricStandardDeviation (const std::vector< double > &data, double geometricMean)
Calculate the geometric sample standard deviation.
The geometric sample standard deviation is the square root of the geometric sample variance. The geometric sample variance is the exponential of the second sample moment of the log-transformed data about the log of the sample geometric mean, scaled by N - 1, where N is the sample size. That is,
var(data) = exp( (1/(N - 1)) * sum_i (log(x_i) - log(geometricMean))^2 )
where the sum is over all elements x_i.
Equivalently, the logarithm of the geometric standard deviation is equal to the arithmetic standard deviation of the data in log space.
data | A sample of data. |
geometricMean | Sample geometric mean. |
static std::vector< double > logTransform (const std::vector< double > &data)
Log transform of the input data.
data | Any vector of positive numbers. |
static std::vector< double > meanCentre (const std::vector< double > &data, double mean)
Centre the data values about the given mean.
data | Any vector of numbers. |
mean | Sample mean. |
static double medianStandardErrorCorrection (unsigned long sampleSize)
Return the correction for the standard error of the median.
The standard error of the median is the same as the standard error of the mean, times a correction factor. That is, the standard error of the mean is s/sqrt(n), where s is the estimated standard deviation of the sample and n is the sample size. The standard error of the median is C * s/sqrt(n), where the constant C is the correction factor returned by this function.
The correction factor C is a function of sample size and is always between 1 and about 1.25. The sample median is an unbiased estimator of the population mean, but it has a larger sampling variance than the sample mean (it is less efficient). For a normal population, the ratio between the variance of the sample median and that of the sample mean tends to pi/2 as the sample size tends to infinity, which means C = sqrt(pi/2).
If the sample size is small (below about 100), then the constant is less than sqrt(pi/2) and depends on whether the sample size is even or odd. This method uses exact values for sample sizes between 1 and 12, and an accurate polynomial approximation for sample sizes between 13 and 100. These numbers and the polynomial are described in the following two publications, respectively:
(Lower efficiency means that although the median is a robust estimator, this statistic always has a less extreme value than if the sample mean had been used. In other words, the p-value will be higher than for calculateNormalMeanPvalue() for the same sample.)
sampleSize | The sample size. |
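As documented above, the correction links the two standard error routines; a sketch that computes the standard error of the median both ways, with illustrative inputs and an assumed matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    double stdev    = 0.2;  // illustrative sample standard deviation
    unsigned long n = 30;   // illustrative sample size

    double seMean   = ms_quant_stats::arithmeticStandardErrorOfMean(stdev, n);
    double C        = ms_quant_stats::medianStandardErrorCorrection(n);
    double seMedian = ms_quant_stats::arithmeticStandardErrorOfMedian(stdev, n);

    // As documented, the two routes should give the same standard error.
    std::cout << "C * SE(mean) = " << C * seMean
              << ", SE(median) = " << seMedian << std::endl;
    return 0;
}
```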
static double normalCriticalValue (double p)
Return the critical value of the standard normal distribution at the given cumulative probability.
Suppose the random variable X has standard normal distribution. This routine returns the critical value x0 in the right tail such that
P(X <= abs(x0)) = p
where p is the desired cumulative probability.
Note that due to implementation reasons, the precision of the calculated critical value is limited to 3 decimal places. That is, the return value ranges from 0.000 to 4.090.
p | The cumulative probability between 0.5 and 1.0. If you need critical values for the left tail, give 1 - p as the argument and negate the result. |
static double normalCumulativeProbability (double x0)
Return the cumulative probability that the standard normal statistic is at most x0.
Suppose the random variable X has standard normal distribution. This routine returns the cumulative probability p such that
p = P(0 <= X <= abs(x0))
where abs is the absolute value function. That is, the routine calculates the cumulative probability in the right tail. Since the normal distribution is symmetric, this can be used to calculate any cumulative probability of the distribution. Refer to a standard text on statistics for details.
Note that due to implementation reasons, the precision of the calculated cumulative probability is limited to 6 decimal places. That is, the return value ranges from 0.000000 to 0.500000.
x0 | The critical value between 0.000 and 4.090, inclusive. Values beyond 4.090 are taken to be infinite (so cumulative probability is 0.50000), while the sign of negative value is changed to positive. |
Returns the cumulative probability that 0 <= X <= abs(x0).
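A sketch relating this routine to normalCriticalValue(); note the two different conventions (cumulative probability in [0.5, 1.0] versus probability between 0 and abs(x0)). The matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // Two-sided 95% coverage => 0.975 cumulative probability in the right tail.
    double z = ms_quant_stats::normalCriticalValue(0.975); // about 1.96

    // normalCumulativeProbability() returns P(0 <= X <= |x0|), so for
    // z = 1.96 it should be about 0.475 (= 0.975 - 0.5).
    double p = ms_quant_stats::normalCumulativeProbability(z);

    std::cout << "z = " << z << ", P(0 <= X <= z) = " << p << std::endl;
    return 0;
}
```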
static double poissonDensity (double lambda, unsigned int k)
Calculate the probability density of the Poisson distribution.
The Poisson probability density is the probability of observing exactly k events when events occur at a mean rate of lambda per unit.
lambda | The rate parameter, which must be positive. If lambda is negative or zero, the function returns 0. |
k | The number of events. |
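A sketch comparing the routine with the textbook form exp(-lambda) * lambda^k / k!; the inputs are illustrative and the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <cmath>
#include <iostream>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    double lambda = 3.0; // mean number of events per unit
    unsigned int k = 2;  // observed number of events

    double p = ms_quant_stats::poissonDensity(lambda, k);

    // Textbook form for comparison: exp(-lambda) * lambda^k / k! (here 2! = 2).
    double reference = std::exp(-lambda) * std::pow(lambda, 2) / 2.0;
    std::cout << p << " vs " << reference << std::endl; // about 0.224
    return 0;
}
```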
static double snedecorsCriticalValue (unsigned int df1, unsigned int df2, double sigLevel)
Return the critical value of Snedecor's F distribution at the given significance level.
Suppose the random variable F has Snedecor's F distribution with df1 and df2 degrees of freedom. This routine returns the critical value f0 for which
P(F <= f0) = 1 - p
where p is the desired significance level. Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.050 < s <= 0.100 | 0.1 |
0.010 < s <= 0.050 | 0.05 |
s <= 0.010 | 0.01 |
If sigLevel exceeds 0.100, it is truncated to 0.100.
df1 | The degrees of freedom for the "numerator" between 1 and 20, inclusive. |
df2 | The degrees of freedom for the "denominator" between 1 and 100 inclusive. |
sigLevel | Significance level. See the above table for supported values. |
static double sortedMedian (const std::deque< double > &data)
Find the median of a sorted data deque.
The median of a sorted input deque can be found very easily:
If the number of elements N is odd, then the median is simply the element at index m = (N-1)/2.
If N is even, then the median is the mean of the two central elements at indices m1 = floor((N-1)/2) = N/2 - 1 and m2 = ceil((N-1)/2) = N/2.
If data is empty the method returns zero.
data | Any sorted deque of numbers. |
static double sortedMedian (const std::vector< double > &data)
Find the median of a sorted data vector.
The median of a sorted input vector can be found very easily:
If the number of elements N is odd, then the median is simply the element at index m = (N-1)/2.
If N is even, then the median is the mean of the two central elements at indices m1 = floor((N-1)/2) = N/2 - 1 and m2 = ceil((N-1)/2) = N/2.
If data is empty the method returns zero.
data | Any sorted vector of numbers. |
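A short sketch of the odd and even cases; the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> odd  = {1.0, 2.0, 5.0};       // already sorted
    std::vector<double> even = {1.0, 2.0, 5.0, 10.0}; // already sorted

    std::cout << ms_quant_stats::sortedMedian(odd)  << std::endl; // 2 (middle element)
    std::cout << ms_quant_stats::sortedMedian(even) << std::endl; // 3.5 (mean of 2 and 5)
    return 0;
}
```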
static double studentsCriticalValue (unsigned int df, double sigLevel)
Return the critical value of Student's t distribution at the given significance level.
Suppose the random variable t has Student's t distribution with df degrees of freedom. This routine returns the critical value t0 for which
P(t <= t0) = 1 - p
where p is the desired significance level. Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.050 < s <= 0.100 | 0.1 |
0.025 < s <= 0.050 | 0.05 |
0.010 < s <= 0.025 | 0.025 |
0.005 < s <= 0.010 | 0.01 |
0.001 < s <= 0.005 | 0.005 |
s <= 0.001 | 0.001 |
If sigLevel exceeds 0.100, it is truncated to 0.100.
If you need critical values for df exceeding 100, you can use the standard normal critical values as a good approximation; see normalCriticalValue().
df | The degrees of freedom. If df exceeds 100, it is taken to be infinite. |
sigLevel | Significance level. See the above table for supported values. |
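A typical use of the critical value is a confidence interval for the mean of a (presumed normal) sample; a sketch with illustrative data and an assumed matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> data = {0.95, 1.10, 1.02, 0.98, 1.05, 1.07, 0.99};
    unsigned long n = data.size();

    double mean  = ms_quant_stats::arithmeticMean(data);
    double stdev = ms_quant_stats::arithmeticStandardDeviation(data, mean);
    double se    = ms_quant_stats::arithmeticStandardErrorOfMean(stdev, n);

    // Two-sided interval with 5% in each tail (about 90% coverage).
    double t = ms_quant_stats::studentsCriticalValue(
        static_cast<unsigned int>(n) - 1, 0.05);

    std::cout << "mean interval: [" << mean - t * se << ", "
              << mean + t * se << "]" << std::endl;
    return 0;
}
```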
static double sum (const std::vector< double > &data)
Calculate the sum of a sample vector.
data | Any vector of numbers. |
static double sumsq (const std::vector< double > &data)
Calculate the sum of squares of a sample vector.
data | Any vector of numbers. |
static int testDixonsN9 (unsigned long sampleSize, double min, double x2, double xn_1, double max, double sigLevel)
Apply Dixon's N9 (r11) outlier test to the sample.
Dixon's N9 outlier test assumes that the data is normally distributed with unknown mean and variance, and that it could be contaminated by value(s) from another normal distribution with a different mean or variance. (For example, such contamination could result from gross measurement errors.) A related but different test is Grubbs N2; see testGrubbsN2().
Dixon's N9 is based on sample order statistics, which are used to estimate sample standard deviation. The test tries to remove one of two extremes: sample maximum or sample minimum. The extreme with the higher statistic will be chosen for possible removal. The method returns -1 or +1 if the chosen extreme is an outlier (statistic exceeds a critical value) and 0 otherwise. If you suspect the sample contains multiple outliers, use the test repeatedly until it returns 0, removing the corresponding outlier at each step. (Note that in sequential use, sigLevel is a significance level strictly speaking only for the first test.)
If your data is multiplicative, for example log-normal (such as peptide ratios), then you need to do a log transformation first before calling this method. See logTransform().
Description of the test procedure and critical values are presented in the following references:
Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.200 < s <= 0.300 | 0.3 |
0.100 < s <= 0.200 | 0.2 |
0.050 < s <= 0.100 | 0.1 |
0.020 < s <= 0.050 | 0.05 |
0.010 < s <= 0.020 | 0.02 |
0.005 < s <= 0.010 | 0.01 |
s <= 0.005 | 0.005 |
If sigLevel exceeds 0.300, it is truncated to 0.300.
sampleSize | Sample size. If this is not between 4 and 100, the method returns 0. |
min | Minimum value in the sample. |
x2 | Second smallest value in the sample. |
xn_1 | Second largest value in the sample. |
max | Maximum value in the sample. |
sigLevel | Significance level. See the above table for supported values. |
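A sketch of the repeated application described above. The sign convention is an assumption of the example (a negative return is read as flagging the minimum, a positive return as flagging the maximum); the data and the matrix_science namespace are likewise assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // Sorted sample (illustrative values only).
    std::vector<double> s = {0.10, 0.96, 0.99, 1.01, 1.03, 1.05, 1.08, 3.20};
    std::sort(s.begin(), s.end());

    // Apply the test repeatedly, removing one extreme per pass, until it
    // reports no outlier.
    for (;;) {
        const std::size_t n = s.size();
        if (n < 4) break; // the test requires at least 4 values

        int r = ms_quant_stats::testDixonsN9(n, s[0], s[1], s[n - 2],
                                             s[n - 1], 0.05);
        if (r < 0)      s.erase(s.begin()); // assumed: minimum is the outlier
        else if (r > 0) s.pop_back();       // assumed: maximum is the outlier
        else            break;              // no outlier detected
    }

    std::cout << s.size() << " values remain" << std::endl;
    return 0;
}
```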
static int testGrubbsN2 (unsigned long sampleSize, double min, double mean, double max, double stdev, double sigLevel)
Apply Grubbs' N2 outlier test to the sample statistics.
Grubbs N2 outlier test tries to remove one of two extremes: sample minimum or sample maximum. The data is assumed normally distributed with unknown mean and variance, and a statistic is computed for both extremes. The extreme with the higher statistic will be chosen for possible removal. The method returns -1 or +1 if the chosen extreme is an outlier (statistic exceeds a critical value) and 0 otherwise.
It is usually best to use a normal probability plot to assess whether outlier removal is needed and how many outliers there seem to be. The Grubbs N2 test assumes the data has at most one outlier, and may fail to remove the outlier if there are actually two or more outliers in the data (this is called masking). If you suspect more than one outlier, the generalised ESD test is more appropriate; see testRosners().
If your data is multiplicative, for example log-normal (such as peptide ratios), then you need to do a log transformation first before calling this method. See logTransform().
If you suspect the data is contaminated by values from another normal distribution with a different mean or variance, then Dixon's N9 test could be more appropriate. See testDixonsN9().
Details of the test procedure and critical values are presented in the following references:
Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.200 < s <= 0.300 | 0.3 |
0.100 < s <= 0.200 | 0.2 |
0.050 < s <= 0.100 | 0.1 |
0.020 < s <= 0.050 | 0.05 |
0.010 < s <= 0.020 | 0.02 |
0.005 < s <= 0.010 | 0.01 |
s <= 0.005 | 0.005 |
If sigLevel exceeds 0.300, it is truncated to 0.300.
sampleSize | Sample size. If this is not between 3 and 100, the method returns 0. |
min | Minimum value in the sample. |
mean | Sample arithmetic mean. |
max | Maximum value in the sample. |
stdev | Sample standard deviation. If stdev is not positive, the method returns 0. |
sigLevel | Significance level. See the above table for supported values. |
static void testRosners (const std::vector< double > &sample, unsigned int maxOutliers, double sigLevel, unsigned int *const numLow, unsigned int *const numHigh)
Apply the generalised extreme Studentised deviate (ESD) outlier test (aka Rosner's test) to the sample.
The generalised ESD test tries to remove at least 1 and up to maxOutliers outliers from the sample. If you suspect there is at most one outlier in the data, the Grubbs N2 test may be more appropriate; see testGrubbsN2().
The data is assumed normally distributed with unknown mean and variance. If your data is multiplicative, for example log-normal (such as peptide ratios), then you need to do a log transformation first before calling this method. See logTransform().
Description of the test procedure can be found in the following references:
Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.100 < s <= 0.200 | 0.2 |
0.050 < s <= 0.100 | 0.1 |
0.020 < s <= 0.050 | 0.05 |
0.010 < s <= 0.020 | 0.02 |
0.002 < s <= 0.010 | 0.01 |
s <= 0.002 | 0.002 |
If sigLevel exceeds 0.200, it is truncated to 0.200.
See Multiple return values in Perl, Java, Python and C#.
[in] | sample | A sample of data sorted in increasing numerical order. Sample size must be at least 25 for meaningful results. |
[in] | maxOutliers | Number between 1 and 10 (inclusive) indicating the maximum number of suspected outliers in the sample. |
[in] | sigLevel | Significance level. See the above table for supported values. |
[out] | numLow | Number of outliers at the low end of sample; values at indices from 0 to numLow - 1 are outliers based on the test. If numLow is 0, the sample has no outliers at the low end. |
[out] | numHigh | Number of outliers at the high end of sample; values at indices from N - numHigh to the end of the vector are outliers based on the test (N is the size of sample). If numHigh is 0, the sample has no outliers at the high end. |
static bool testShapiroWilkW (const std::vector< double > &sample, double *const W, double *const pValue)
Apply Shapiro-Wilk W normality test to the sample.
See Multiple return values in Perl, Java, Python and C#.
See ms_shapiro_wilk for the test details. This method returns false if any of the following conditions apply:
In these cases the test is not conducted, and the values of W and pValue are not defined.
The number of right-censored values is assumed to be 0. If your sample contains censored values, use ms_shapiro_wilk directly.
[in] | sample | A sample of data sorted in increasing numerical order. The number of elements should be between 3 and 5000, inclusive. If sample size exceeds 5000, the test statistic and p-value will be calculated successfully, but the p-value is not guaranteed to be accurate. |
[out] | W | The Shapiro-Wilk W statistic calculated from the data. |
[out] | pValue | The p-value of W under the null hypothesis. |
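A minimal sketch of the call pattern, with illustrative data and an assumed matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> sample = {1.02, 0.97, 1.10, 1.05, 0.99,
                                  1.01, 1.08, 0.95, 1.04, 1.03};
    std::sort(sample.begin(), sample.end()); // the test expects sorted input

    double W = 0.0, pValue = 0.0;
    if (ms_quant_stats::testShapiroWilkW(sample, &W, &pValue)) {
        std::cout << "W = " << W << ", p = " << pValue << std::endl;
    } else {
        std::cout << "test could not be run on this sample" << std::endl;
    }
    return 0;
}
```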
static double unsortedMedian (const std::vector< double > &data)
Find the median of an unsorted data vector.
This method works for both sorted and unsorted input, although in the former case it is faster to call sortedMedian() instead.
The trivial implementation would be to create a copy of the input data, sort it and then call sortedMedian(). This method is slightly faster: if the input vector has an odd number of elements N, then the median is at index m = (N-1)/2; otherwise the median is the mean of the elements at indices m1 = floor((N-1)/2) and m2 = ceil((N-1)/2). The method uses partial sorting to find the m-th element (or the m1-th and m2-th elements), which means the time complexity is slightly less than O(N log(N)) (the average time complexity of quicksort).
data | Any vector of numbers. |
static double weightedArithmeticMean (const std::vector< double > &data, const std::vector< double > &weights)
Calculate the weighted arithmetic mean.
The weighted arithmetic mean is
mean(data, weights) = (1/W) * sum_i w_i * x_i
where W is the sum of weights, w_i is an individual weight, and the summation is over all elements x_i.
data | Any vector of numbers. |
weights | Positive weight for each datum in data. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than data, the missing weights are taken to be 1.0. |
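A short numeric sketch of the formula; the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> data    = {1.0, 2.0, 4.0};
    std::vector<double> weights = {1.0, 1.0, 2.0};

    // (1*1.0 + 1*2.0 + 2*4.0) / (1 + 1 + 2) = 11 / 4 = 2.75
    double wmean = ms_quant_stats::weightedArithmeticMean(data, weights);
    std::cout << "weighted mean = " << wmean << std::endl;
    return 0;
}
```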
static double weightedArithmeticStandardDeviation (const std::vector< double > &data, const std::vector< double > &weights, double weightedMean)
Calculate the weighted arithmetic sample standard deviation.
The weighted arithmetic sample standard deviation is the square root of the weighted arithmetic sample variance. The weighted sample variance is
var(data, weights) = var(data) / b(weights)
where the effective base b is
b(weights) = (sum_i w_i)^2 / sum_i(w_i^2)
data | A sample of data. |
weights | Positive weight for each datum in data. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than data, the weights of those elements are taken to be 1.0. |
weightedMean | Sample weighted arithmetic mean. |
static double weightedArithmeticStandardError (const std::vector< double > &weights, double weightedStdev, unsigned long sampleSize)
Calculate the standard error of the weighted arithmetic sample mean.
The standard error of the weighted arithmetic mean is slightly different from the standard error of the unweighted mean: as with the weighted standard deviation, the weights must be taken into account. The effective base b is
b(weights) = (sum_i w_i)^2 / sum_i(w_i^2) = W^2/S,
where W^2 is the squared sum of the weights and S is the sum of squared weights. As the weighted mean is
mean(data, weights) = (1/W) * sum_i w_i * x_i,
we can use the usual sample properties (independent, identically distributed random variables) to derive
var(mean) = sigma^2 / (b/W)
where sigma^2 is the population variance. The sample estimate of the standard error is then
sqrt(W * s^2 / b^2) = sqrt(W) * s / b,
which is the value returned by this function.
weights | Positive weight for each datum. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than sampleSize, the weights of those elements are taken to be 1.0. |
weightedStdev | Sample weighted arithmetic standard deviation. It is recommended to use weightedArithmeticStandardDeviation() with the same weights. |
sampleSize | The sample size. |
static double weightedGeometricMean (const std::vector< double > &data, const std::vector< double > &weights)
Calculate the weighted geometric mean.
The weighted geometric mean is
geom(data, weights) = (prod_i x_i^(w_i))^(1/W)
where W is the sum of weights, w_i is an individual weight and the product is over all the elements x_i. That is, the weighted geometric mean is the W-th root of the product of the input values, each raised to the power of its own weight.
Equivalently, the logarithm of the weighted geometric mean is equal to the weighted arithmetic mean of the data in log space. This is actually how the mean is calculated by the method.
data | Any vector of numbers. |
weights | Positive weight for each datum in data. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than data, the missing weights are taken to be 1.0. |
static double weightedGeometricStandardDeviation (const std::vector< double > &data, const std::vector< double > &weights, double weightedMean)
Calculate the weighted geometric sample standard deviation.
The weighted geometric sample standard deviation is the square root of the weighted geometric sample variance. The weighted geometric sample variance is
var(logdata, weights) = exp( var(logdata) / b )
where the effective base b is
b(weights) = (sum_i w_i)^2 / sum_i(w_i^2)
and logdata is the log-transformed data.
Equivalently, the logarithm of the weighted geometric standard deviation is equal to the weighted arithmetic standard deviation of the data in log space.
data | A sample of data. |
weights | Positive weight for each datum in data. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than data, the weights of those elements are taken to be 1.0. |
weightedMean | Sample weighted geometric mean. |
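A sketch of the log-space equivalence noted for the weighted geometric routines; it assumes logTransform() uses natural logarithms and that the class lives in the matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> data    = {0.5, 1.0, 2.0, 4.0};
    std::vector<double> weights = {1.0, 2.0, 2.0, 1.0};

    double geom = ms_quant_stats::weightedGeometricMean(data, weights);

    // Same value via log space (assuming natural logarithms in logTransform()).
    std::vector<double> logs = ms_quant_stats::logTransform(data);
    double viaLogs = std::exp(ms_quant_stats::weightedArithmeticMean(logs, weights));

    std::cout << geom << " == " << viaLogs << std::endl;
    return 0;
}
```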