// // Math.NET Numerics, part of the Math.NET Project // http://numerics.mathdotnet.com // http://github.com/mathnet/mathnet-numerics // // Copyright (c) 2009-2014 Math.NET // // Permission is hereby granted, free of charge, to any person // obtaining a copy of this software and associated documentation // files (the "Software"), to deal in the Software without // restriction, including without limitation the rights to use, // copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following // conditions: // // The above copyright notice and this permission notice shall be // included in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. // using System; using System.Collections.Generic; using System.Linq; using IStation.Numerics.Random; using IStation.Numerics.Statistics; using IStation.Numerics.Threading; namespace IStation.Numerics.Distributions { /// /// Discrete Univariate Categorical distribution. /// For details about this distribution, see /// Wikipedia - Categorical distribution. This /// distribution is sometimes called the Discrete distribution. /// /// /// The distribution is parameterized by a vector of ratios: in other words, the parameter /// does not have to be normalized and sum to 1. The reason is that some vectors can't be exactly normalized /// to sum to 1 in floating point representation. /// /// /// Support: 0..k where k = length(probability mass array)-1 /// public class Categorical : IDiscreteDistribution { System.Random _random; readonly double[] _pmfNormalized; readonly double[] _cdfUnnormalized; /// /// Initializes a new instance of the Categorical class. /// /// An array of nonnegative ratios: this array does not need to be normalized /// as this is often impossible using floating point arithmetic. /// If any of the probabilities are negative or do not sum to one. public Categorical(double[] probabilityMass) : this(probabilityMass, SystemRandomSource.Default) { } /// /// Initializes a new instance of the Categorical class. /// /// An array of nonnegative ratios: this array does not need to be normalized /// as this is often impossible using floating point arithmetic. /// The random number generator which is used to draw random samples. /// If any of the probabilities are negative or do not sum to one. public Categorical(double[] probabilityMass, System.Random randomSource) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } _random = randomSource ?? SystemRandomSource.Default; // Extract unnormalized cumulative distribution _cdfUnnormalized = new double[probabilityMass.Length]; _cdfUnnormalized[0] = probabilityMass[0]; for (int i = 1; i < probabilityMass.Length; i++) { _cdfUnnormalized[i] = _cdfUnnormalized[i - 1] + probabilityMass[i]; } // Extract normalized probability mass var sum = _cdfUnnormalized[_cdfUnnormalized.Length - 1]; _pmfNormalized = new double[probabilityMass.Length]; for (int i = 0; i < probabilityMass.Length; i++) { _pmfNormalized[i] = probabilityMass[i]/sum; } } /// /// Initializes a new instance of the Categorical class from a . The distribution /// will not be automatically updated when the histogram changes. The categorical distribution will have /// one value for each bucket and a probability for that value proportional to the bucket count. /// /// The histogram from which to create the categorical variable. public Categorical(Histogram histogram) { if (histogram == null) { throw new ArgumentNullException(nameof(histogram)); } // The probability distribution vector. var p = new double[histogram.BucketCount]; // Fill in the distribution vector. for (var i = 0; i < histogram.BucketCount; i++) { p[i] = histogram[i].Count; } _random = SystemRandomSource.Default; if (Control.CheckDistributionParameters && !IsValidProbabilityMass(p)) { throw new ArgumentException("Invalid parametrization for the distribution."); } // Extract unnormalized cumulative distribution _cdfUnnormalized = new double[p.Length]; _cdfUnnormalized[0] = p[0]; for (int i1 = 1; i1 < p.Length; i1++) { _cdfUnnormalized[i1] = _cdfUnnormalized[i1 - 1] + p[i1]; } // Extract normalized probability mass var sum = _cdfUnnormalized[_cdfUnnormalized.Length - 1]; _pmfNormalized = new double[p.Length]; for (int i2 = 0; i2 < p.Length; i2++) { _pmfNormalized[i2] = p[i2]/sum; } } /// /// A string representation of the distribution. /// /// a string representation of the distribution. public override string ToString() { return $"Categorical(Dimension = {_pmfNormalized.Length})"; } /// /// Checks whether the parameters of the distribution are valid. /// /// An array of nonnegative ratios: this array does not need to be normalized as this is often impossible using floating point arithmetic. /// If any of the probabilities are negative returns false, or if the sum of parameters is 0.0; otherwise true public static bool IsValidProbabilityMass(double[] p) { var sum = 0.0; for (int i = 0; i < p.Length; i++) { double t = p[i]; if (t < 0.0 || double.IsNaN(t)) { return false; } sum += t; } return sum > 0.0; } /// /// Checks whether the parameters of the distribution are valid. /// /// An array of nonnegative ratios: this array does not need to be normalized as this is often impossible using floating point arithmetic. /// If any of the probabilities are negative returns false, or if the sum of parameters is 0.0; otherwise true public static bool IsValidCumulativeDistribution(double[] cdf) { var last = 0.0; for (int i = 0; i < cdf.Length; i++) { double t = cdf[i]; if (t < 0.0 || double.IsNaN(t) || t < last) { return false; } last = t; } return last > 0.0; } /// /// Gets the probability mass vector (non-negative ratios) of the multinomial. /// /// Sometimes the normalized probability vector cannot be represented exactly in a floating point representation. public double[] P => (double[])_pmfNormalized.Clone(); /// /// Gets or sets the random number generator which is used to draw random samples. /// public System.Random RandomSource { get => _random; set => _random = value ?? SystemRandomSource.Default; } /// /// Gets the mean of the distribution. /// public double Mean { get { // Mean = E[X] = Sum(x * p(x), x=0..N-1) // where f(x) is the probability mass function, and N is the number of categories. var sum = 0.0; for (int i = 0; i < _pmfNormalized.Length; i++) { sum += i*_pmfNormalized[i]; } return sum; } } /// /// Gets the standard deviation of the distribution. /// public double StdDev => Math.Sqrt(Variance); /// /// Gets the variance of the distribution. /// public double Variance { get { // Variance = E[(X-E[X])^2] = E[X^2] - (E[X])^2 = Sum(p(x) * (x - E[X])^2), x=0..N-1) var m = Mean; var sum = 0.0; for (int i = 0; i < _pmfNormalized.Length; i++) { var r = i - m; sum += r*r*_pmfNormalized[i]; } return sum; } } /// /// Gets the entropy of the distribution. /// public double Entropy { get { return -_pmfNormalized.Sum(p => p*Math.Log(p)); } } /// /// Gets the skewness of the distribution. /// /// Throws a . public double Skewness => throw new NotSupportedException(); /// /// Gets the smallest element in the domain of the distributions which can be represented by an integer. /// public int Minimum => 0; /// /// Gets the largest element in the domain of the distributions which can be represented by an integer. /// public int Maximum => _pmfNormalized.Length - 1; /// /// Gets he mode of the distribution. /// /// Throws a . public int Mode => throw new NotSupportedException(); /// /// Gets the median of the distribution. /// public double Median => InverseCumulativeDistribution(0.5); /// /// Computes the probability mass (PMF) at k, i.e. P(X = k). /// /// The location in the domain where we want to evaluate the probability mass function. /// the probability mass at location . public double Probability(int k) { if (k < 0) { return 0.0; } if (k >= _pmfNormalized.Length) { return 0.0; } return _pmfNormalized[k]; } /// /// Computes the log probability mass (lnPMF) at k, i.e. ln(P(X = k)). /// /// The location in the domain where we want to evaluate the log probability mass function. /// the log probability mass at location . public double ProbabilityLn(int k) { if (k < 0) { return 0.0; } if (k >= _pmfNormalized.Length) { return 0.0; } return Math.Log(_pmfNormalized[k]); } /// /// Computes the cumulative distribution (CDF) of the distribution at x, i.e. P(X ≤ x). /// /// The location at which to compute the cumulative distribution function. /// the cumulative distribution at location . public double CumulativeDistribution(double x) { if (x < 0.0) { return 0.0; } if (x >= _cdfUnnormalized.Length) { return 1.0; } return _cdfUnnormalized[(int)Math.Floor(x)]/_cdfUnnormalized[_cdfUnnormalized.Length - 1]; } /// /// Computes the inverse of the cumulative distribution function (InvCDF) for the distribution /// at the given probability. /// /// A real number between 0 and 1. /// An integer between 0 and the size of the categorical (exclusive), that corresponds to the inverse CDF for the given probability. public int InverseCumulativeDistribution(double probability) { if (probability < 0.0 || probability > 1.0 || double.IsNaN(probability)) { throw new ArgumentOutOfRangeException(nameof(probability)); } var denormalizedProbability = probability*_cdfUnnormalized[_cdfUnnormalized.Length - 1]; int idx = Array.BinarySearch(_cdfUnnormalized, denormalizedProbability); if (idx < 0) { idx = ~idx; } return idx; } /// /// Computes the probability mass (PMF) at k, i.e. P(X = k). /// /// The location in the domain where we want to evaluate the probability mass function. /// An array of nonnegative ratios: this array does not need to be normalized /// as this is often impossible using floating point arithmetic. /// the probability mass at location . public static double PMF(double[] probabilityMass, int k) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } if (k < 0) { return 0.0; } if (k >= probabilityMass.Length) { return 0.0; } return probabilityMass[k]/probabilityMass.Sum(); } /// /// Computes the log probability mass (lnPMF) at k, i.e. ln(P(X = k)). /// /// The location in the domain where we want to evaluate the log probability mass function. /// An array of nonnegative ratios: this array does not need to be normalized /// as this is often impossible using floating point arithmetic. /// the log probability mass at location . public static double PMFLn(double[] probabilityMass, int k) { return Math.Log(PMF(probabilityMass, k)); } /// /// Computes the cumulative distribution (CDF) of the distribution at x, i.e. P(X ≤ x). /// /// The location at which to compute the cumulative distribution function. /// An array of nonnegative ratios: this array does not need to be normalized /// as this is often impossible using floating point arithmetic. /// the cumulative distribution at location . /// public static double CDF(double[] probabilityMass, double x) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } if (x < 0.0) { return 0.0; } if (x >= probabilityMass.Length) { return 1.0; } var cdfUnnormalized = ProbabilityMassToCumulativeDistribution(probabilityMass); return cdfUnnormalized[(int)Math.Floor(x)]/cdfUnnormalized[cdfUnnormalized.Length - 1]; } /// /// Computes the inverse of the cumulative distribution function (InvCDF) for the distribution /// at the given probability. /// /// An array of nonnegative ratios: this array does not need to be normalized /// as this is often impossible using floating point arithmetic. /// A real number between 0 and 1. /// An integer between 0 and the size of the categorical (exclusive), that corresponds to the inverse CDF for the given probability. public static int InvCDF(double[] probabilityMass, double probability) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } if (probability < 0.0 || probability > 1.0 || double.IsNaN(probability)) { throw new ArgumentOutOfRangeException(nameof(probability)); } var cdfUnnormalized = ProbabilityMassToCumulativeDistribution(probabilityMass); var denormalizedProbability = probability*cdfUnnormalized[cdfUnnormalized.Length - 1]; int idx = Array.BinarySearch(cdfUnnormalized, denormalizedProbability); if (idx < 0) { idx = ~idx; } return idx; } /// /// Computes the inverse of the cumulative distribution function (InvCDF) for the distribution /// at the given probability. /// /// An array corresponding to a CDF for a categorical distribution. Not assumed to be normalized. /// A real number between 0 and 1. /// An integer between 0 and the size of the categorical (exclusive), that corresponds to the inverse CDF for the given probability. public static int InvCDFWithCumulativeDistribution(double[] cdfUnnormalized, double probability) { if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized)) { throw new ArgumentException("Invalid parametrization for the distribution."); } if (probability < 0.0 || probability > 1.0 || double.IsNaN(probability)) { throw new ArgumentOutOfRangeException(nameof(probability)); } var denormalizedProbability = probability*cdfUnnormalized[cdfUnnormalized.Length - 1]; int idx = Array.BinarySearch(cdfUnnormalized, denormalizedProbability); if (idx < 0) { idx = ~idx; } return idx; } /// /// Computes the cumulative distribution function. This method performs no parameter checking. /// If the probability mass was normalized, the resulting cumulative distribution is normalized as well (up to numerical errors). /// /// An array of nonnegative ratios: this array does not need to be normalized /// as this is often impossible using floating point arithmetic. /// An array representing the unnormalized cumulative distribution function. internal static double[] ProbabilityMassToCumulativeDistribution(double[] probabilityMass) { var cdfUnnormalized = new double[probabilityMass.Length]; cdfUnnormalized[0] = probabilityMass[0]; for (int i = 1; i < probabilityMass.Length; i++) { cdfUnnormalized[i] = cdfUnnormalized[i - 1] + probabilityMass[i]; } return cdfUnnormalized; } /// /// Returns one trials from the categorical distribution. /// /// The random number generator to use. /// The (unnormalized) cumulative distribution of the probability distribution. /// One sample from the categorical distribution implied by . internal static int SampleUnchecked(System.Random rnd, double[] cdfUnnormalized) { // TODO : use binary search to speed up this procedure. double u = rnd.NextDouble()*cdfUnnormalized[cdfUnnormalized.Length - 1]; var idx = 0; if (u == 0.0d) { // skip zero-probability categories while (0.0d == cdfUnnormalized[idx]) { idx++; } } while (u > cdfUnnormalized[idx]) { idx++; } return idx; } static void SamplesUnchecked(System.Random rnd, int[] values, double[] cdfUnnormalized) { // TODO : use binary search to speed up this procedure. double[] uniform = rnd.NextDoubles(values.Length); double w = cdfUnnormalized[cdfUnnormalized.Length - 1]; CommonParallel.For(0, values.Length, 4096, (a, b) => { for (int i = a; i < b; i++) { var u = uniform[i]*w; var idx = 0; if (u == 0.0d) { // skip zero-probability categories while (0.0d == cdfUnnormalized[idx]) { idx++; } } while (u > cdfUnnormalized[idx]) { idx++; } values[i] = idx; } }); } static IEnumerable SamplesUnchecked(System.Random rnd, double[] cdfUnnormalized) { while (true) { yield return SampleUnchecked(rnd, cdfUnnormalized); } } /// /// Samples a Binomially distributed random variable. /// /// The number of successful trials. public int Sample() { return SampleUnchecked(_random, _cdfUnnormalized); } /// /// Fills an array with samples generated from the distribution. /// public void Samples(int[] values) { SamplesUnchecked(_random, values, _cdfUnnormalized); } /// /// Samples an array of Bernoulli distributed random variables. /// /// a sequence of successful trial counts. public IEnumerable Samples() { return SamplesUnchecked(_random, _cdfUnnormalized); } /// /// Samples one categorical distributed random variable; also known as the Discrete distribution. /// /// The random number generator to use. /// An array of nonnegative ratios. Not assumed to be normalized. /// One random integer between 0 and the size of the categorical (exclusive). public static int Sample(System.Random rnd, double[] probabilityMass) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass); return SampleUnchecked(rnd, cdf); } /// /// Samples a categorically distributed random variable. /// /// The random number generator to use. /// An array of nonnegative ratios. Not assumed to be normalized. /// random integers between 0 and the size of the categorical (exclusive). public static IEnumerable Samples(System.Random rnd, double[] probabilityMass) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass); return SamplesUnchecked(rnd, cdf); } /// /// Fills an array with samples generated from the distribution. /// /// The random number generator to use. /// The array to fill with the samples. /// An array of nonnegative ratios. Not assumed to be normalized. /// random integers between 0 and the size of the categorical (exclusive). public static void Samples(System.Random rnd, int[] values, double[] probabilityMass) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass); SamplesUnchecked(rnd, values, cdf); } /// /// Samples one categorical distributed random variable; also known as the Discrete distribution. /// /// An array of nonnegative ratios. Not assumed to be normalized. /// One random integer between 0 and the size of the categorical (exclusive). public static int Sample(double[] probabilityMass) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass); return SampleUnchecked(SystemRandomSource.Default, cdf); } /// /// Samples a categorically distributed random variable. /// /// An array of nonnegative ratios. Not assumed to be normalized. /// random integers between 0 and the size of the categorical (exclusive). public static IEnumerable Samples(double[] probabilityMass) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass); return SamplesUnchecked(SystemRandomSource.Default, cdf); } /// /// Fills an array with samples generated from the distribution. /// /// The array to fill with the samples. /// An array of nonnegative ratios. Not assumed to be normalized. /// random integers between 0 and the size of the categorical (exclusive). public static void Samples(int[] values, double[] probabilityMass) { if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass)) { throw new ArgumentException("Invalid parametrization for the distribution."); } var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass); SamplesUnchecked(SystemRandomSource.Default, values, cdf); } /// /// Samples one categorical distributed random variable; also known as the Discrete distribution. /// /// The random number generator to use. /// An array of the cumulative distribution. Not assumed to be normalized. /// One random integer between 0 and the size of the categorical (exclusive). public static int SampleWithCumulativeDistribution(System.Random rnd, double[] cdfUnnormalized) { if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized)) { throw new ArgumentException("Invalid parametrization for the distribution."); } return SampleUnchecked(rnd, cdfUnnormalized); } /// /// Samples a categorically distributed random variable. /// /// The random number generator to use. /// An array of the cumulative distribution. Not assumed to be normalized. /// random integers between 0 and the size of the categorical (exclusive). public static IEnumerable SamplesWithCumulativeDistribution(System.Random rnd, double[] cdfUnnormalized) { if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized)) { throw new ArgumentException("Invalid parametrization for the distribution."); } return SamplesUnchecked(rnd, cdfUnnormalized); } /// /// Fills an array with samples generated from the distribution. /// /// The random number generator to use. /// The array to fill with the samples. /// An array of the cumulative distribution. Not assumed to be normalized. /// random integers between 0 and the size of the categorical (exclusive). public static void SamplesWithCumulativeDistribution(System.Random rnd, int[] values, double[] cdfUnnormalized) { if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized)) { throw new ArgumentException("Invalid parametrization for the distribution."); } SamplesUnchecked(rnd, values, cdfUnnormalized); } /// /// Samples one categorical distributed random variable; also known as the Discrete distribution. /// /// An array of the cumulative distribution. Not assumed to be normalized. /// One random integer between 0 and the size of the categorical (exclusive). public static int SampleWithCumulativeDistribution(double[] cdfUnnormalized) { if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized)) { throw new ArgumentException("Invalid parametrization for the distribution."); } return SampleUnchecked(SystemRandomSource.Default, cdfUnnormalized); } /// /// Samples a categorically distributed random variable. /// /// An array of the cumulative distribution. Not assumed to be normalized. /// random integers between 0 and the size of the categorical (exclusive). public static IEnumerable SamplesWithCumulativeDistribution(double[] cdfUnnormalized) { if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized)) { throw new ArgumentException("Invalid parametrization for the distribution."); } return SamplesUnchecked(SystemRandomSource.Default, cdfUnnormalized); } /// /// Fills an array with samples generated from the distribution. /// /// The array to fill with the samples. /// An array of the cumulative distribution. Not assumed to be normalized. /// random integers between 0 and the size of the categorical (exclusive). public static void SamplesWithCumulativeDistribution(int[] values, double[] cdfUnnormalized) { if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized)) { throw new ArgumentException("Invalid parametrization for the distribution."); } SamplesUnchecked(SystemRandomSource.Default, values, cdfUnnormalized); } } }