Mathematical and statistical routines for peptide and protein quantitation.
#include <ms_quant_stats.hpp>
Static Public Member Functions

static double arithmeticMean (const std::vector< double > &data)
    Calculate the arithmetic mean.
static double arithmeticStandardDeviation (const std::vector< double > &data, double arithmeticMean)
    Calculate the arithmetic sample standard deviation.
static double arithmeticStandardErrorOfMean (double arithmeticStdev, unsigned long sampleSize)
    Calculate the standard error of the arithmetic sample mean.
static double arithmeticStandardErrorOfMedian (double arithmeticStdev, unsigned long sampleSize)
    Calculate the standard error of the sample median.
static double binomialCoefficient (unsigned int n, unsigned int k)
    Compute the binomial coefficient.
static double calculateNormalMeanPvalue (double populationMean, double mean, double stdev, unsigned long sampleSize)
    Calculate the p-value that the population mean is populationMean, given the sample statistics and assuming the population is normally distributed.
static double calculateNormalMedianPvalue (double populationMedian, double median, double stdev, unsigned long sampleSize)
    Calculate the p-value that the population median is populationMedian, given the sample statistics and assuming the population is normally distributed.
static double calculateNormalWeightedPvalue (double populationMean, double weightedMean, const std::vector< double > &weights, double weightedStdev, unsigned long sampleSize)
    Calculate the p-value that the population mean is populationMean, given the weighted sample statistics and assuming the population is normally distributed.
static double chisqLowerCriticalValue (unsigned int df, double sigLevel)
    Return the lower critical value of the chi-square distribution at the given significance level.
static double chisqUpperCriticalValue (unsigned int df, double sigLevel)
    Return the upper critical value of the chi-square distribution at the given significance level.
static void detectOutliers (const std::vector< double > &sample, std::string outlierMethod, double sigLevel, unsigned int *const numLow, unsigned int *const numHigh)
    Given a sample and an outlier method, return the number of outliers at low and high extreme at the given significance level.
static void detectOutliers (const std::vector< double > &sample, std::string outlierMethod, unsigned int *const numLow, unsigned int *const numHigh)
    Given a sample and an outlier method, return the number of outliers at low and high extreme at significance level 0.05.
static std::vector< double > expTransform (const std::vector< double > &logData)
    Antilog (exp) transform of the input data.
static double geometricMean (const std::vector< double > &data)
    Calculate the geometric mean.
static double geometricStandardDeviation (const std::vector< double > &data, double geometricMean)
    Calculate the geometric sample standard deviation.
static std::vector< double > logTransform (const std::vector< double > &data)
    Log transform of the input data.
static std::vector< double > meanCentre (const std::vector< double > &data, double mean)
    Centre the data values about the given mean.
static double medianStandardErrorCorrection (unsigned long sampleSize)
    Return the correction for the standard error of the median.
static double normalCriticalValue (double p)
    Return the critical value of the standard normal distribution at the given cumulative probability.
static double normalCumulativeProbability (double x0)
    Return the cumulative probability that the standard normal statistic is at most x0.
static double poissonDensity (double lambda, unsigned int k)
    Calculate the probability density of the Poisson distribution.
static double snedecorsCriticalValue (unsigned int df1, unsigned int df2, double sigLevel)
    Return the critical value of Snedecor's F distribution at the given significance level.
static double sortedMedian (const std::deque< double > &data)
    Find the median of a sorted data deque.
static double sortedMedian (const std::vector< double > &data)
    Find the median of a sorted data vector.
static double studentsCriticalValue (unsigned int df, double sigLevel)
    Return the critical value of Student's t distribution at the given significance level.
static double sum (const std::vector< double > &data)
    Calculate the sum of a sample vector.
static double sumsq (const std::vector< double > &data)
    Calculate the sum of squares of a sample vector.
static int testDixonsN9 (unsigned long sampleSize, double min, double x2, double xn_1, double max, double sigLevel)
    Apply Dixon's N9 (r11) outlier test to the sample.
static int testGrubbsN2 (unsigned long sampleSize, double min, double mean, double max, double stdev, double sigLevel)
    Apply Grubbs' N2 outlier test to the sample statistics.
static void testRosners (const std::vector< double > &sample, unsigned int maxOutliers, double sigLevel, unsigned int *const numLow, unsigned int *const numHigh)
    Apply the generalised extreme Studentised deviate (ESD) outlier test (aka Rosner's test) to the sample.
static bool testShapiroWilkW (const std::vector< double > &sample, double *const W, double *const pValue)
    Apply the Shapiro-Wilk W normality test to the sample.
static double unsortedMedian (const std::vector< double > &data)
    Find the median of an unsorted data vector.
static double weightedArithmeticMean (const std::vector< double > &data, const std::vector< double > &weights)
    Calculate the weighted arithmetic mean.
static double weightedArithmeticStandardDeviation (const std::vector< double > &data, const std::vector< double > &weights, double weightedMean)
    Calculate the weighted arithmetic sample standard deviation.
static double weightedArithmeticStandardError (const std::vector< double > &weights, double weightedStdev, unsigned long sampleSize)
    Calculate the standard error of the weighted arithmetic sample mean.
static double weightedGeometricMean (const std::vector< double > &data, const std::vector< double > &weights)
    Calculate the weighted geometric mean.
static double weightedGeometricStandardDeviation (const std::vector< double > &data, const std::vector< double > &weights, double weightedMean)
    Calculate the weighted geometric sample standard deviation.
Mathematical and statistical routines for peptide and protein quantitation.
ms_quant_stats is a collection of mathematical and statistical routines used in peptide and protein quantitation by ms_ms2quantitation and ms_customquantitation. It is possible to use the routines outside this context, as they are not tied to either class.
The class contains only static class methods; it is impossible and meaningless to create instances of ms_quant_stats.
static double arithmeticMean (const std::vector< double > &data)
Calculate the arithmetic mean.
The arithmetic mean is the usual average:
mean(data) = (1/N) * sum_i x_i
where N is the number of elements in data and the summation is over all elements x_i.
data | Any vector of numbers. |
static double arithmeticStandardDeviation (const std::vector< double > &data, double arithmeticMean)
Calculate the arithmetic sample standard deviation.
The arithmetic sample standard deviation is the square root of the arithmetic sample variance. The sample variance is the second sample moment about the sample mean, scaled by N - 1, where N is the sample size. That is,
var(data) = (1/(N - 1)) * sum_i (x_i - arithmeticMean)^2
where the sum is over all elements x_i.
data | A sample of data. |
arithmeticMean | Sample arithmetic mean. |
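For illustration, a minimal sketch combining arithmeticMean() and arithmeticStandardDeviation(); the matrix_science namespace and the data values are assumptions of the example, not part of the interface:

```cpp
#include <ms_quant_stats.hpp>

#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // Illustrative peptide ratio values.
    std::vector<double> data = {0.95, 1.10, 1.02, 0.98, 1.05};

    double mean  = ms_quant_stats::arithmeticMean(data);
    double stdev = ms_quant_stats::arithmeticStandardDeviation(data, mean);

    std::cout << "mean = " << mean << ", stdev = " << stdev << std::endl;
    return 0;
}
```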
static double arithmeticStandardErrorOfMean (double arithmeticStdev, unsigned long sampleSize)
Calculate the standard error of the arithmetic sample mean.
The standard error of the arithmetic sample mean in a normally distributed sample is simple: standard deviation divided by the square root of the sample size.
arithmeticStdev | Arithmetic standard deviation, for example from arithmeticStandardDeviation(). |
sampleSize | The sample size. |
static double arithmeticStandardErrorOfMedian (double arithmeticStdev, unsigned long sampleSize)
Calculate the standard error of the sample median.
The standard error of the sample median in a normally distributed sample is the same as the standard error of the sample mean, times a correction factor. The correction factor is documented in medianStandardErrorCorrection().
arithmeticStdev | Arithmetic standard deviation, for example from arithmeticStandardDeviation(). |
sampleSize | The sample size. |
static double binomialCoefficient (unsigned int n, unsigned int k)
Compute the binomial coefficient.
The method returns 1 in the following cases, by definition: when k = 0 and when k = n.
Note that although the binomial coefficient is an integer, the return value is double to allow values beyond about 4 billion on 32-bit systems. The value is exact up to about 74C37.
n | Population size. |
k | Sample size. |
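A small example call; the values are arbitrary and the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // C(5, 2) = 10: the number of ways to choose 2 items out of 5.
    std::cout << ms_quant_stats::binomialCoefficient(5, 2) << std::endl;
    return 0;
}
```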
static double calculateNormalMeanPvalue (double populationMean, double mean, double stdev, unsigned long sampleSize)
Calculate the p-value that the population mean is populationMean, given the sample statistics and assuming the population is normally distributed.
Hypothesis: the population you are sampling from is normally distributed with mean populationMean and unknown variance.
Statistic: Student's t = (mean - populationMean) / (stdev / sqrt(sampleSize))
Assume the hypothesis is true. This method returns the probability (the p-value) that the statistic t could have a value as extreme (or more extreme) as the one observed. Low p-value means that either the hypothesis is wrong or the sample is exceptionally rare.
(Note that there is no alternative hypothesis. The p-value with this interpretation is part of Fisher's significance test, which does not decide between hypotheses. Recommended reading: Hubbard, R. et al. (2003): Confusion over Measures of Evidence (p's) versus Errors (alphas) in Classical Statistical Testing. The American Statistician 57(3):171-182.)
populationMean | Assumed population mean. (Population variance is assumed unknown.) |
mean | Sample arithmetic mean. |
stdev | Sample standard deviation. This must be positive. |
sampleSize | Sample size. This must be at least 1. |
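As a sketch of typical use, the sample statistics can be computed with the other routines in this class and passed in; the data values and the matrix_science namespace are assumptions of the example. Ratios are log transformed first, as recommended elsewhere in this class for multiplicative data:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // Illustrative peptide ratios, log transformed because ratios are multiplicative.
    std::vector<double> logRatios = ms_quant_stats::logTransform(
        std::vector<double>{1.08, 0.97, 1.21, 1.15, 1.04, 1.11});

    double mean  = ms_quant_stats::arithmeticMean(logRatios);
    double stdev = ms_quant_stats::arithmeticStandardDeviation(logRatios, mean);

    // p-value for the hypothesis that the population mean log ratio is 0,
    // i.e. that the underlying ratio is 1.
    double p = ms_quant_stats::calculateNormalMeanPvalue(0.0, mean, stdev,
                                                         logRatios.size());
    std::cout << "p-value = " << p << std::endl;
    return 0;
}
```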
static double calculateNormalMedianPvalue (double populationMedian, double median, double stdev, unsigned long sampleSize)
Calculate the p-value that the population median is populationMedian, given the sample statistics and assuming the population is normally distributed.
Hypothesis: the population you are sampling from is normally distributed with median populationMedian and unknown variance. In particular, the population median is equal to the population mean.
Statistic: Student's t for the sample median, t = (1/C) * (median - populationMedian) / (stdev / sqrt(sampleSize))
Assume the hypothesis is true. This method returns the probability (the p-value) that the statistic t could have a value as extreme (or more extreme) as the one observed. Low p-value means that either the hypothesis is wrong or the sample is exceptionally rare.
(Note that there is no alternative hypothesis. The p-value with this interpretation is part of Fisher's significance test, which does not decide between hypotheses. Recommended reading: Hubbard, R. et al. (2003): Confusion over Measures of Evidence (p's) versus Errors (alphas) in Classical Statistical Testing. The American Statistician 57(3):171-182.)
The constant C is a correction to the standard error of the median, retrieved from medianStandardErrorCorrection().
populationMedian | Assumed population median. (Population variance is assumed unknown.) |
median | Sample median. |
stdev | Sample standard deviation. This must be positive. |
sampleSize | Sample size. This must be at least 1. |
static double calculateNormalWeightedPvalue (double populationMean, double weightedMean, const std::vector< double > &weights, double weightedStdev, unsigned long sampleSize)
Calculate the p-value that the population mean is populationMean, given the weighted sample statistics and assuming the population is normally distributed.
Hypothesis: the population you are sampling from is normally distributed with mean populationMean and unknown variance.
Statistic: Student's t = (weightedMean - populationMean) / (stdev * sqrt(W) / b),
where stdev is the sample standard deviation, W is the sum of weights and b is the effective base, as defined in weightedArithmeticStandardError(). Note that this function takes the weighted standard deviation and the weights as arguments, not stdev.
Assume the hypothesis is true. This method returns the probability (the p-value) that the statistic t could have a value as extreme (or more extreme) as the one observed. Low p-value means that either the hypothesis is wrong or the sample is exceptionally rare.
(Note that there is no alternative hypothesis. The p-value with this interpretation is part of Fisher's significance test, which does not decide between hypotheses. Recommended reading: Hubbard, R. et al. (2003): Confusion over Measures of Evidence (p's) versus Errors (alphas) in Classical Statistical Testing. The American Statistician 57(3):171-182.)
populationMean | Assumed population mean. (Population variance is assumed unknown.) |
weightedMean | Weighted sample arithmetic mean. |
weights | Positive weight for each datum. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than sampleSize, the weights of those elements are taken to be 1.0. |
weightedStdev | Weighted sample standard deviation. This must be positive. |
sampleSize | Sample size. This must be at least 1. |
static double chisqLowerCriticalValue (unsigned int df, double sigLevel)
Return the lower critical value of the chi-square distribution at the given significance level.
Suppose the random variable X has the chi-square distribution with df degrees of freedom. This routine returns the lower critical value x0 for which
P(X <= x0) = p
where p is the desired significance level. Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.050 < s <= 0.100 | 0.1 |
0.025 < s <= 0.050 | 0.05 |
0.010 < s <= 0.025 | 0.025 |
0.001 < s <= 0.010 | 0.01 |
s <= 0.001 | 0.001 |
If sigLevel exceeds 0.100, it is truncated to 0.100.
df | The degrees of freedom between 1 and 100 inclusive. |
sigLevel | Significance level. See the above table for supported values. |
static double chisqUpperCriticalValue (unsigned int df, double sigLevel)
Return the upper critical value of the chi-square distribution at the given significance level.
Suppose the random variable X has the chi-square distribution with df degrees of freedom. This routine returns the upper critical value x0 for which
P(X <= x0) = 1 - p
where p is the desired significance level. Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.050 < s <= 0.100 | 0.1 |
0.025 < s <= 0.050 | 0.05 |
0.010 < s <= 0.025 | 0.025 |
0.005 < s <= 0.010 | 0.01 |
0.001 < s <= 0.005 | 0.005 |
s <= 0.001 | 0.001 |
If sigLevel exceeds 0.100, it is truncated to 0.100.
df | The degrees of freedom between 1 and 100 inclusive. |
sigLevel | Significance level. See the above table for supported values. |
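One common use of the two chi-square critical values is to bracket a sample variance; a sketch under the usual normality assumption, with illustrative data and an assumed matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> data = {0.95, 1.10, 1.02, 0.98, 1.05, 1.07, 0.99};
    unsigned int df = static_cast<unsigned int>(data.size()) - 1;

    double mean  = ms_quant_stats::arithmeticMean(data);
    double stdev = ms_quant_stats::arithmeticStandardDeviation(data, mean);
    double ss    = stdev * stdev * df; // (N - 1) * s^2

    // Two-sided interval for the population variance with 5% in each tail
    // (about 90% coverage): [ss / upper, ss / lower].
    double lower = ms_quant_stats::chisqLowerCriticalValue(df, 0.05);
    double upper = ms_quant_stats::chisqUpperCriticalValue(df, 0.05);

    std::cout << "variance interval: [" << ss / upper << ", "
              << ss / lower << "]" << std::endl;
    return 0;
}
```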
static void detectOutliers (const std::vector< double > &sample, std::string outlierMethod, double sigLevel, unsigned int *const numLow, unsigned int *const numHigh)
Given a sample and an outlier method, return the number of outliers at low and high extreme at the given significance level.
If outlierMethod is "none"
or the empty string, or if outlierMethod is not listed in Type mqm:outlierType, then the method simply returns with 0 outliers. Otherwise the corresponding outlier test will be used to count the number of low and high outliers at the given significance level.
If outlierMethod is "auto"
, then the method uses Dixon's N9 if sample size is less than 25 and Rosner's test otherwise.
Note that not all tests support the same set of significance levels. The common values between all three tests are as follows:
sigLevel (s ) | p |
---|---|
0.100 < s <= 0.200 | 0.2 |
0.050 < s <= 0.100 | 0.1 |
0.020 < s <= 0.050 | 0.05 |
0.010 < s <= 0.020 | 0.02 |
0.005 < s <= 0.010 | 0.01 |
If you specify a significance level outside these ranges, the effective significance level depends on which test is being used. Please refer to testDixonsN9(), testGrubbsN2() and testRosners() for more details.
Note that this method does not modify sample. You will need to remove the indicated outliers yourself if desired.
See Multiple return values in Perl, Java, Python and C#.
[in] | sample | A sample of data sorted in increasing numerical order. If the sample size is not large enough for outlierMethod, both numLow and numHigh will be 0 and no testing will be done. |
[in] | outlierMethod | Outlier method to use. Outlier methods are listed in Type mqm:outlierType. Note that the value is case sensitive. |
[in] | sigLevel | Significance level to use in tests. |
[out] | numLow | Number of outliers at the low end of sample; values at indices from 0 to numLow - 1 are outliers based on the chosen test. If numLow is 0, the sample has no outliers at the low end. |
[out] | numHigh | Number of outliers at the high end of sample; values at indices from N - numHigh to N - 1 are outliers based on the chosen test (N is the size of sample). If numHigh is 0, the sample has no outliers at the high end. |
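A sketch of the out-parameter pattern and of trimming the flagged values afterwards (the method itself leaves sample unchanged); the data and the choice of "auto" at significance level 0.05 are illustrative, and the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // The sample must be sorted in increasing order (illustrative values only).
    std::vector<double> sample = {0.12, 0.93, 0.97, 1.01, 1.02, 1.05,
                                  1.06, 1.08, 1.10, 1.13, 2.80};
    std::sort(sample.begin(), sample.end());

    unsigned int numLow = 0, numHigh = 0;
    ms_quant_stats::detectOutliers(sample, "auto", 0.05, &numLow, &numHigh);

    // The sample itself is not modified; trim the flagged values manually.
    sample.erase(sample.end() - numHigh, sample.end());
    sample.erase(sample.begin(), sample.begin() + numLow);

    std::cout << numLow << " low and " << numHigh
              << " high outliers removed; " << sample.size()
              << " values remain" << std::endl;
    return 0;
}
```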
static void detectOutliers (const std::vector< double > &sample, std::string outlierMethod, unsigned int *const numLow, unsigned int *const numHigh)
Given a sample and an outlier method, return the number of outliers at low and high extreme at significance level 0.05.
If outlierMethod is "none"
or the empty string, or if outlierMethod is not listed in Type mqm:outlierType, then the method simply returns with 0 outliers. Otherwise the corresponding outlier test will be used to count the number of low and high outliers at significance level 0.05. If you wish to specify a different significance level, use detectOutliers(const std::vector<double>&, std::string, double, unsigned int*const, unsigned int*const).
If outlierMethod is "auto"
, then the method uses Dixon's N9 if sample size is less than 25 and Rosner's test otherwise.
Note that this method does not modify sample. You will need to remove the indicated outliers yourself if desired.
See Multiple return values in Perl, Java, Python and C#.
[in] | sample | A sample of data sorted in increasing numerical order. If the sample size is not large enough for outlierMethod, both numLow and numHigh will be 0 and no testing will be done. |
[in] | outlierMethod | Outlier method to use. Outlier methods are listed in Type mqm:outlierType. Note that the value is case sensitive. |
[out] | numLow | Number of outliers at the low end of sample; values at indices from 0 to numLow - 1 are outliers based on the chosen test. If numLow is 0, the sample has no outliers at the low end. |
[out] | numHigh | Number of outliers at the high end of sample; values at indices from N - numHigh to N - 1 are outliers based on the chosen test (N is the size of sample). If numHigh is 0, the sample has no outliers at the high end. |
static std::vector< double > expTransform (const std::vector< double > &logData)
Antilog (exp) transform of the input data.
logData | Any vector of logarithms. |
static double geometricMean (const std::vector< double > &data)
Calculate the geometric mean.
The geometric mean is
geom(data) = (prod_i x_i)^(1/N)
where N is the number of elements in data and the product is over all the elements x_i. That is, the geometric mean is the N-th root of the product of all input values.
Equivalently, the logarithm of the geometric mean is equal to the arithmetic mean of the data in log space. This is actually how the mean is calculated by the method.
data | Any vector of numbers. |
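The log-space equivalence described above can be seen in a short sketch; it assumes logTransform() uses natural logarithms and that the class lives in the matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> ratios = {0.8, 1.0, 1.25, 2.0, 0.5};

    double geom = ms_quant_stats::geometricMean(ratios);

    // Equivalent route via log space (assuming natural logarithms in logTransform()).
    std::vector<double> logs = ms_quant_stats::logTransform(ratios);
    double viaLogs = std::exp(ms_quant_stats::arithmeticMean(logs));

    std::cout << geom << " == " << viaLogs << std::endl; // both 1.0 for this data
    return 0;
}
```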
static double geometricStandardDeviation (const std::vector< double > &data, double geometricMean)
Calculate the geometric sample standard deviation.
The geometric sample standard deviation is the square root of the geometric sample variance. The geometric sample variance is the exponential of the second sample moment of the log-transformed data about the log of the sample geometric mean, scaled by N - 1, where N is the sample size. That is,
var(data) = exp( (1/(N - 1)) * sum_i (log(x_i) - log(geometricMean))^2 )
where the sum is over all elements x_i.
Equivalently, the logarithm of the geometric standard deviation is equal to the arithmetic standard deviation of the data in log space.
data | A sample of data. |
geometricMean | Sample geometric mean. |
static std::vector< double > logTransform (const std::vector< double > &data)
Log transform of the input data.
data | Any vector of positive numbers. |
static std::vector< double > meanCentre (const std::vector< double > &data, double mean)
Centre the data values about the given mean.
data | Any vector of numbers. |
mean | Sample mean. |
static double medianStandardErrorCorrection (unsigned long sampleSize)
Return the correction for the standard error of the median.
The standard error of the median is the same as the standard error of the mean, times a correction factor. That is, the standard error of the mean is s/sqrt(n), where s is the estimated standard deviation of the sample and n is the sample size. The standard error of the median is C * s/sqrt(n), where the constant C is the correction factor returned by this function.
The correction factor C is a function of sample size and is always between 1 and about 1.25. The sample median is an unbiased estimator of the population mean, but it has a larger sampling variance than the sample mean (it is less efficient). For a normal population, the ratio between the variance of the sample median and that of the sample mean tends to pi/2 as the sample size tends to infinity, which means C = sqrt(pi/2).
If the sample size is small (below about 100), then the constant is less than sqrt(pi/2) and depends on whether the sample size is even or odd. This method uses exact values for sample sizes between 1 and 12, and an accurate polynomial approximation for sample sizes between 13 and 100. These numbers and the polynomial are described in the following two publications, respectively:
(Lower efficiency means that although the median is a robust estimator, this statistic always has a less extreme value than if the sample mean had been used. In other words, the p-value will be higher than for calculateNormalMeanPvalue() for the same sample.)
sampleSize | The sample size. |
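As documented above, the correction links the two standard error routines; a sketch that computes the standard error of the median both ways, with illustrative inputs and an assumed matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    double stdev    = 0.2;  // illustrative sample standard deviation
    unsigned long n = 30;   // illustrative sample size

    double seMean   = ms_quant_stats::arithmeticStandardErrorOfMean(stdev, n);
    double C        = ms_quant_stats::medianStandardErrorCorrection(n);
    double seMedian = ms_quant_stats::arithmeticStandardErrorOfMedian(stdev, n);

    // As documented, the two routes should give the same standard error.
    std::cout << "C * SE(mean) = " << C * seMean
              << ", SE(median) = " << seMedian << std::endl;
    return 0;
}
```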
static double normalCriticalValue (double p)
Return the critical value of the standard normal distribution at the given cumulative probability.
Suppose the random variable X has standard normal distribution. This routine returns the critical value x0 in the right tail such that
P(X <= abs(x0)) = p
where p is the desired cumulative probability.
Note that due to implementation reasons, the precision of the calculated critical value is limited to 3 decimal places. That is, the return value ranges from 0.000 to 4.090.
p | The cumulative probability between 0.5 and 1.0. If you need critical values for the left tail, give 1 - p as the argument and negate the result. |
static double normalCumulativeProbability (double x0)
Return the cumulative probability that the standard normal statistic is at most x0.
Suppose the random variable X has standard normal distribution. This routine returns the cumulative probability p such that
p = P(0 <= X <= abs(x0))
where abs is the absolute value function. That is, the routine calculates the cumulative probability in the right tail. Since the normal distribution is symmetric, this can be used to calculate any cumulative probability of the distribution. Refer to a standard text on statistics for details.
Note that due to implementation reasons, the precision of the calculated cumulative probability is limited to 6 decimal places. That is, the return value ranges from 0.000000 to 0.500000.
x0 | The critical value between 0.000 and 4.090, inclusive. Values beyond 4.090 are taken to be infinite (so cumulative probability is 0.50000), while the sign of negative value is changed to positive. |
Returns the cumulative probability that 0 <= X <= abs(x0).
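A sketch relating this routine to normalCriticalValue(); note the two different conventions (cumulative probability in [0.5, 1.0] versus probability between 0 and abs(x0)). The matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // Two-sided 95% coverage => 0.975 cumulative probability in the right tail.
    double z = ms_quant_stats::normalCriticalValue(0.975); // about 1.96

    // normalCumulativeProbability() returns P(0 <= X <= |x0|), so for
    // z = 1.96 it should be about 0.475 (= 0.975 - 0.5).
    double p = ms_quant_stats::normalCumulativeProbability(z);

    std::cout << "z = " << z << ", P(0 <= X <= z) = " << p << std::endl;
    return 0;
}
```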
static double poissonDensity (double lambda, unsigned int k)
Calculate the probability density of the Poisson distribution.
The Poisson probability density is the probability of observing exactly k events when events occur at a mean rate of lambda per unit.
lambda | The rate parameter, which must be positive. If lambda is negative or zero, the function returns 0. |
k | The number of events. |
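A sketch comparing the routine with the textbook form exp(-lambda) * lambda^k / k!; the inputs are illustrative and the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <cmath>
#include <iostream>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    double lambda = 3.0; // mean number of events per unit
    unsigned int k = 2;  // observed number of events

    double p = ms_quant_stats::poissonDensity(lambda, k);

    // Textbook form for comparison: exp(-lambda) * lambda^k / k! (here 2! = 2).
    double reference = std::exp(-lambda) * std::pow(lambda, 2) / 2.0;
    std::cout << p << " vs " << reference << std::endl; // about 0.224
    return 0;
}
```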
static double snedecorsCriticalValue (unsigned int df1, unsigned int df2, double sigLevel)
Return the critical value of Snedecor's F distribution at the given significance level.
Suppose the random variable F has Snedecor's F distribution with df1 and df2 degrees of freedom. This routine returns the critical value f0 for which
P(F <= f0) = 1 - p
where p is the desired significance level. Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.050 < s <= 0.100 | 0.1 |
0.010 < s <= 0.050 | 0.05 |
s <= 0.010 | 0.01 |
If sigLevel exceeds 0.100, it is truncated to 0.100.
df1 | The degrees of freedom for the "numerator" between 1 and 20, inclusive. |
df2 | The degrees of freedom for the "denominator" between 1 and 100 inclusive. |
sigLevel | Significance level. See the above table for supported values. |
static double sortedMedian (const std::deque< double > &data)
Find the median of a sorted data deque.
The median of a sorted input deque can be found very easily:
If the number of elements N is odd, then the median is simply the element at index m = (N-1)/2.
If N is even, then the median is the mean of the two central elements at indices m1 = floor((N-1)/2) = N/2 - 1 and m2 = ceil((N-1)/2) = N/2.
If data is empty the method returns zero.
data | Any sorted deque of numbers. |
static double sortedMedian (const std::vector< double > &data)
Find the median of a sorted data vector.
The median of a sorted input vector can be found very easily:
If the number of elements N is odd, then the median is simply the element at index m = (N-1)/2.
If N is even, then the median is the mean of the two central elements at indices m1 = floor((N-1)/2) = N/2 - 1 and m2 = ceil((N-1)/2) = N/2.
If data is empty the method returns zero.
data | Any sorted vector of numbers. |
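A short sketch of the odd and even cases; the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> odd  = {1.0, 2.0, 5.0};       // already sorted
    std::vector<double> even = {1.0, 2.0, 5.0, 10.0}; // already sorted

    std::cout << ms_quant_stats::sortedMedian(odd)  << std::endl; // 2 (middle element)
    std::cout << ms_quant_stats::sortedMedian(even) << std::endl; // 3.5 (mean of 2 and 5)
    return 0;
}
```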
static double studentsCriticalValue (unsigned int df, double sigLevel)
Return the critical value of Student's t distribution at the given significance level.
Suppose the random variable t has Student's t distribution with df degrees of freedom. This routine returns the critical value t0 for which
P(t <= t0) = 1 - p
where p is the desired significance level. Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.050 < s <= 0.100 | 0.1 |
0.025 < s <= 0.050 | 0.05 |
0.010 < s <= 0.025 | 0.025 |
0.005 < s <= 0.010 | 0.01 |
0.001 < s <= 0.005 | 0.005 |
s <= 0.001 | 0.001 |
If sigLevel exceeds 0.100, it is truncated to 0.100.
If you need critical values for df exceeding 100, you can use the standard normal critical values as a good approximation; see normalCriticalValue().
df | The degrees of freedom. If df exceeds 100, it is taken to be infinite. |
sigLevel | Significance level. See the above table for supported values. |
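A typical use of the critical value is a confidence interval for the mean of a (presumed normal) sample; a sketch with illustrative data and an assumed matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> data = {0.95, 1.10, 1.02, 0.98, 1.05, 1.07, 0.99};
    unsigned long n = data.size();

    double mean  = ms_quant_stats::arithmeticMean(data);
    double stdev = ms_quant_stats::arithmeticStandardDeviation(data, mean);
    double se    = ms_quant_stats::arithmeticStandardErrorOfMean(stdev, n);

    // Two-sided interval with 5% in each tail (about 90% coverage).
    double t = ms_quant_stats::studentsCriticalValue(
        static_cast<unsigned int>(n) - 1, 0.05);

    std::cout << "mean interval: [" << mean - t * se << ", "
              << mean + t * se << "]" << std::endl;
    return 0;
}
```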
static double sum (const std::vector< double > &data)
Calculate the sum of a sample vector.
data | Any vector of numbers. |
static double sumsq (const std::vector< double > &data)
Calculate the sum of squares of a sample vector.
data | Any vector of numbers. |
static int testDixonsN9 (unsigned long sampleSize, double min, double x2, double xn_1, double max, double sigLevel)
Apply Dixon's N9 (r11) outlier test to the sample.
Dixon's N9 outlier test assumes that the data is normally distributed with unknown mean and variance, and that it could be contaminated by value(s) from another normal distribution with a different mean or variance. (For example, such contamination could result from gross measurement errors.) A related but different test is Grubbs N2; see testGrubbsN2().
Dixon's N9 is based on sample order statistics, which are used to estimate sample standard deviation. The test tries to remove one of two extremes: sample maximum or sample minimum. The extreme with the higher statistic will be chosen for possible removal. The method returns -1 or +1 if the chosen extreme is an outlier (statistic exceeds a critical value) and 0 otherwise. If you suspect the sample contains multiple outliers, use the test repeatedly until it returns 0, removing the corresponding outlier at each step. (Note that in sequential use, sigLevel is a significance level strictly speaking only for the first test.)
If your data is multiplicative, for example log-normal (such as peptide ratios), then you need to do a log transformation first before calling this method. See logTransform().
Description of the test procedure and critical values are presented in the following references:
Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.200 < s <= 0.300 | 0.3 |
0.100 < s <= 0.200 | 0.2 |
0.050 < s <= 0.100 | 0.1 |
0.020 < s <= 0.050 | 0.05 |
0.010 < s <= 0.020 | 0.02 |
0.005 < s <= 0.010 | 0.01 |
s <= 0.005 | 0.005 |
If sigLevel exceeds 0.300, it is truncated to 0.300.
sampleSize | Sample size. If this is not between 4 and 100, the method returns 0. |
min | Minimum value in the sample. |
x2 | Second smallest value in the sample. |
xn_1 | Second largest value in the sample. |
max | Maximum value in the sample. |
sigLevel | Significance level. See the above table for supported values. |
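A sketch of the repeated application described above. The sign convention is an assumption of the example (a negative return is read as flagging the minimum, a positive return as flagging the maximum); the data and the matrix_science namespace are likewise assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    // Sorted sample (illustrative values only).
    std::vector<double> s = {0.10, 0.96, 0.99, 1.01, 1.03, 1.05, 1.08, 3.20};
    std::sort(s.begin(), s.end());

    // Apply the test repeatedly, removing one extreme per pass, until it
    // reports no outlier.
    for (;;) {
        const std::size_t n = s.size();
        if (n < 4) break; // the test requires at least 4 values

        int r = ms_quant_stats::testDixonsN9(n, s[0], s[1], s[n - 2],
                                             s[n - 1], 0.05);
        if (r < 0)      s.erase(s.begin()); // assumed: minimum is the outlier
        else if (r > 0) s.pop_back();       // assumed: maximum is the outlier
        else            break;              // no outlier detected
    }

    std::cout << s.size() << " values remain" << std::endl;
    return 0;
}
```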
static int testGrubbsN2 (unsigned long sampleSize, double min, double mean, double max, double stdev, double sigLevel)
Apply Grubbs' N2 outlier test to the sample statistics.
Grubbs N2 outlier test tries to remove one of two extremes: sample minimum or sample maximum. The data is assumed normally distributed with unknown mean and variance, and a statistic is computed for both extremes. The extreme with the higher statistic will be chosen for possible removal. The method returns -1 or +1 if the chosen extreme is an outlier (statistic exceeds a critical value) and 0 otherwise.
It is usually best to use a normal probability plot to assess whether outlier removal is needed and how many outliers there seem to be. The Grubbs N2 test assumes the data has at most one outlier, and may fail to remove the outlier if there are actually two or more outliers in the data (this is called masking). If you suspect more than one outlier, the generalised ESD test is more appropriate; see testRosners().
If your data is multiplicative, for example log-normal (such as peptide ratios), then you need to do a log transformation first before calling this method. See logTransform().
If you suspect the data is contaminated by values from another normal distribution with a different mean or variance, then Dixon's N9 test could be more appropriate. See testDixonsN9().
Details of the test procedure and critical values are presented in the following references:
Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.200 < s <= 0.300 | 0.3 |
0.100 < s <= 0.200 | 0.2 |
0.050 < s <= 0.100 | 0.1 |
0.020 < s <= 0.050 | 0.05 |
0.010 < s <= 0.020 | 0.02 |
0.005 < s <= 0.010 | 0.01 |
s <= 0.005 | 0.005 |
If sigLevel exceeds 0.300, it is truncated to 0.300.
sampleSize | Sample size. If this is not between 3 and 100, the method returns 0. |
min | Minimum value in the sample. |
mean | Sample arithmetic mean. |
max | Maximum value in the sample. |
stdev | Sample standard deviation. If stdev is not positive, the method returns 0. |
sigLevel | Significance level. See the above table for supported values. |
static void testRosners (const std::vector< double > &sample, unsigned int maxOutliers, double sigLevel, unsigned int *const numLow, unsigned int *const numHigh)
Apply the generalised extreme Studentised deviate (ESD) outlier test (aka Rosner's test) to the sample.
The generalised ESD test tries to remove at least 1 and up to maxOutliers outliers from the sample. If you suspect there is at most one outlier in the data, the Grubbs N2 test may be more appropriate; see testGrubbsN2().
The data is assumed normally distributed with unknown mean and variance. If your data is multiplicative, for example log-normal (such as peptide ratios), then you need to do a log transformation first before calling this method. See logTransform().
Description of the test procedure can be found in the following references:
Significance level is determined by finding the smallest standard significance level that exceeds sigLevel:
sigLevel (s ) | p |
---|---|
0.100 < s <= 0.200 | 0.2 |
0.050 < s <= 0.100 | 0.1 |
0.020 < s <= 0.050 | 0.05 |
0.010 < s <= 0.020 | 0.02 |
0.002 < s <= 0.010 | 0.01 |
s <= 0.002 | 0.002 |
If sigLevel exceeds 0.200, it is truncated to 0.200.
See Multiple return values in Perl, Java, Python and C#.
[in] | sample | A sample of data sorted in increasing numerical order. Sample size must be at least 25 for meaningful results. |
[in] | maxOutliers | Number between 1 and 10 (inclusive) indicating the maximum number of suspected outliers in the sample. |
[in] | sigLevel | Significance level. See the above table for supported values. |
[out] | numLow | Number of outliers at the low end of sample; values at indices from 0 to numLow - 1 are outliers based on the test. If numLow is 0, the sample has no outliers at the low end. |
[out] | numHigh | Number of outliers at the high end of sample; values at indices from N - numHigh to the end of the vector are outliers based on the test (N is the size of sample). If numHigh is 0, the sample has no outliers at the high end. |
static bool testShapiroWilkW (const std::vector< double > &sample, double *const W, double *const pValue)
Apply Shapiro-Wilk W normality test to the sample.
See Multiple return values in Perl, Java, Python and C#.
See ms_shapiro_wilk for the test details. This method returns false if any of the following conditions apply:
In these cases the test is not conducted, and the values of W and pValue are not defined.
The number of right-censored values is assumed to be 0. If your sample contains censored values, use ms_shapiro_wilk directly.
[in] | sample | A sample of data sorted in increasing numerical order. The number of elements should be between 3 and 5000, inclusive. If sample size exceeds 5000, the test statistic and p-value will be calculated successfully, but the p-value is not guaranteed to be accurate. |
[out] | W | The Shapiro-Wilk W statistic calculated from the data. |
[out] | pValue | The p-value of W under the null hypothesis. |
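A minimal sketch of the call pattern, with illustrative data and an assumed matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <algorithm>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> sample = {1.02, 0.97, 1.10, 1.05, 0.99,
                                  1.01, 1.08, 0.95, 1.04, 1.03};
    std::sort(sample.begin(), sample.end()); // the test expects sorted input

    double W = 0.0, pValue = 0.0;
    if (ms_quant_stats::testShapiroWilkW(sample, &W, &pValue)) {
        std::cout << "W = " << W << ", p = " << pValue << std::endl;
    } else {
        std::cout << "test could not be run on this sample" << std::endl;
    }
    return 0;
}
```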
static double unsortedMedian (const std::vector< double > &data)
Find the median of an unsorted data vector.
This method works for both sorted and unsorted input, although in the former case it is faster to call sortedMedian() instead.
The trivial implementation would be to create a copy of the input data, sort it and then call sortedMedian(). This method is slightly faster: if the input vector has an odd number of elements N, then the median is at index m = (N-1)/2; otherwise the median is the mean of the elements at indices m1 = floor((N-1)/2) and m2 = ceil((N-1)/2). The method uses partial sorting to find the m-th element (or the m1-th and m2-th elements), which means the time complexity is slightly less than O(N log(N)) (the average time complexity of quicksort).
data | Any vector of numbers. |
static double weightedArithmeticMean (const std::vector< double > &data, const std::vector< double > &weights)
Calculate the weighted arithmetic mean.
The weighted arithmetic mean is
mean(data, weights) = (1/W) * sum_i w_i * x_i
where W is the sum of weights, w_i is an individual weight, and the summation is over all elements x_i.
data | Any vector of numbers. |
weights | Positive weight for each datum in data. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than data, the missing weights are taken to be 1.0. |
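A short numeric sketch of the formula; the matrix_science namespace is assumed:

```cpp
#include <ms_quant_stats.hpp>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> data    = {1.0, 2.0, 4.0};
    std::vector<double> weights = {1.0, 1.0, 2.0};

    // (1*1.0 + 1*2.0 + 2*4.0) / (1 + 1 + 2) = 11 / 4 = 2.75
    double wmean = ms_quant_stats::weightedArithmeticMean(data, weights);
    std::cout << "weighted mean = " << wmean << std::endl;
    return 0;
}
```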
static double weightedArithmeticStandardDeviation (const std::vector< double > &data, const std::vector< double > &weights, double weightedMean)
Calculate the weighted arithmetic sample standard deviation.
The weighted arithmetic sample standard deviation is the square root of the weighted arithmetic sample variance. The weighted sample variance is
var(data, weights) = var(data) / b(weights)
where the effective base b is
b(weights) = (sum_i w_i)^2 / sum_i(w_i^2)
data | A sample of data. |
weights | Positive weight for each datum in data. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than data, the weights of those elements are taken to be 1.0. |
weightedMean | Sample weighted arithmetic mean. |
static double weightedArithmeticStandardError (const std::vector< double > &weights, double weightedStdev, unsigned long sampleSize)
Calculate the standard error of the weighted arithmetic sample mean.
The standard error of the weighted arithmetic mean is slightly different from the standard error of the unweighted mean: as with the weighted standard deviation, the weights must be taken into account. The effective base b is
b(weights) = (sum_i w_i)^2 / sum_i(w_i^2) = W^2/S,
where W^2 is the squared sum of the weights and S is the sum of squared weights. As the weighted mean is
mean(data, weights) = (1/W) * sum_i w_i * x_i,
we can use the usual sample properties (independent, identically distributed random variables) to derive
var(mean) = sigma^2 / (b/W)
where sigma^2 is the population variance. The sample estimate of the standard error is then
sqrt(W * s^2 / b^2) = sqrt(W) * s / b,
which is the value returned by this function.
weights | Positive weight for each datum. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than sampleSize, the weights of those elements are taken to be 1.0. |
weightedStdev | Sample weighted arithmetic standard deviation. It is recommended to use weightedArithmeticStandardDeviation() with the same weights. |
sampleSize | The sample size. |
static double weightedGeometricMean (const std::vector< double > &data, const std::vector< double > &weights)
Calculate the weighted geometric mean.
The weighted geometric mean is
geom(data, weights) = (prod_i x_i^(w_i))^(1/W)
where W is the sum of weights, w_i is an individual weight and the product is over all the elements x_i. That is, the weighted geometric mean is the W-th root of the product of the input values, each raised to the power of its own weight.
Equivalently, the logarithm of the weighted geometric mean is equal to the weighted arithmetic mean of the data in log space. This is actually how the mean is calculated by the method.
data | Any vector of numbers. |
weights | Positive weight for each datum in data. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than data, the missing weights are taken to be 1.0. |
static double weightedGeometricStandardDeviation (const std::vector< double > &data, const std::vector< double > &weights, double weightedMean)
Calculate the weighted geometric sample standard deviation.
The weighted geometric sample standard deviation is the square root of the weighted geometric sample variance. The weighted geometric sample variance is
var(logdata, weights) = exp( var(logdata) / b )
where the effective base b is
b(weights) = (sum_i w_i)^2 / sum_i(w_i^2)
and logdata is the log-transformed data.
Equivalently, the logarithm of the weighted geometric standard deviation is equal to the weighted arithmetic standard deviation of the data in log space.
data | A sample of data. |
weights | Positive weight for each datum in data. Negative and zero weights are coerced to 1.0, and if weights has fewer elements than data, the weights of those elements are taken to be 1.0. |
weightedMean | Sample weighted geometric mean. |
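A sketch of the log-space equivalence noted for the weighted geometric routines; it assumes logTransform() uses natural logarithms and that the class lives in the matrix_science namespace:

```cpp
#include <ms_quant_stats.hpp>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    using matrix_science::ms_quant_stats; // namespace assumed for this sketch

    std::vector<double> data    = {0.5, 1.0, 2.0, 4.0};
    std::vector<double> weights = {1.0, 2.0, 2.0, 1.0};

    double geom = ms_quant_stats::weightedGeometricMean(data, weights);

    // Same value via log space (assuming natural logarithms in logTransform()).
    std::vector<double> logs = ms_quant_stats::logTransform(data);
    double viaLogs = std::exp(ms_quant_stats::weightedArithmeticMean(logs, weights));

    std::cout << geom << " == " << viaLogs << std::endl;
    return 0;
}
```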