@Internal
public class VarianceFn<T extends java.lang.Number>
extends org.apache.beam.sdk.transforms.Combine.CombineFn<T,org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator,T>
Combine.CombineFn for Variance on Number types.
Calculates Population Variance and Sample Variance using incremental formulas described, for example, by Chan, Golub, and LeVeque in "Algorithms for computing the sample variance: analysis and recommendations", The American Statistician, 37 (1983) pp. 242--247.
If variance is defined like this:
Input elements: (x[1], ... , x[n])
Sum of elements: sum(x) = x[1] + ... + x[n]
Average of all elements in the input: mean(x) = sum(x) / n
Deviation of the ith element from the current mean: deviation(x, i) = x[i] - mean(x)
variance(x) = deviation(x, 1)^2 + ... + deviation(x, n)^2
Then the variance of the combined input of 2 samples (x[1], ... , x[n]) and (y[1], ... , y[m]) is calculated using this formula:
variance(concat(x,y)) = variance(x) + variance(y) + increment, where:
increment = n/(m(m+n)) * (m/n * sum(x) - sum(y))^2 (n and m being the number of elements in x and y, respectively)
This is also applicable for a single-element increment, assuming that the variance of a single-element input is zero.
To implement the above formula we keep track of the current variance, sum, and count of elements, and then apply the formula whenever a new element arrives or we need to merge the variances of 2 samples.
| Modifier and Type | Method and Description |
|---|---|
| org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator | addInput(org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator currentVariance, T rawInput) |
| org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator | createAccumulator() |
| T | extractOutput(org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator accumulator) |
| org.apache.beam.sdk.coders.Coder<org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator> | getAccumulatorCoder(org.apache.beam.sdk.coders.CoderRegistry registry, org.apache.beam.sdk.coders.Coder<T> inputCoder) |
| org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator | mergeAccumulators(java.lang.Iterable<org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator> variances) |
| static <V extends java.lang.Number> VarianceFn | newPopulation(org.apache.beam.sdk.schemas.Schema.TypeName typeName) |
| static <V extends java.lang.Number> VarianceFn | newPopulation(org.apache.beam.sdk.transforms.SerializableFunction<java.math.BigDecimal,V> decimalConverter) |
| static <V extends java.lang.Number> VarianceFn | newSample(org.apache.beam.sdk.schemas.Schema.TypeName typeName) |
| static <V extends java.lang.Number> VarianceFn | newSample(org.apache.beam.sdk.transforms.SerializableFunction<java.math.BigDecimal,V> decimalConverter) |
public static <V extends java.lang.Number> VarianceFn newPopulation(org.apache.beam.sdk.schemas.Schema.TypeName typeName)
public static <V extends java.lang.Number> VarianceFn newPopulation(org.apache.beam.sdk.transforms.SerializableFunction<java.math.BigDecimal,V> decimalConverter)
public static <V extends java.lang.Number> VarianceFn newSample(org.apache.beam.sdk.schemas.Schema.TypeName typeName)
public static <V extends java.lang.Number> VarianceFn newSample(org.apache.beam.sdk.transforms.SerializableFunction<java.math.BigDecimal,V> decimalConverter)
public org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator createAccumulator()
public org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator addInput(org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator currentVariance,
T rawInput)
public org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator mergeAccumulators(java.lang.Iterable<org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator> variances)
public org.apache.beam.sdk.coders.Coder<org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator> getAccumulatorCoder(org.apache.beam.sdk.coders.CoderRegistry registry,
org.apache.beam.sdk.coders.Coder<T> inputCoder)
public T extractOutput(org.apache.beam.sdk.extensions.sql.impl.transform.agg.VarianceAccumulator accumulator)