//
// Math.NET Numerics, part of the Math.NET Project
// http://numerics.mathdotnet.com
// http://github.com/mathnet/mathnet-numerics
//
// Copyright (c) 2009-2014 Math.NET
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
using System;
using System.Collections.Generic;
using System.Linq;
using IStation.Numerics.Random;
using IStation.Numerics.Statistics;
using IStation.Numerics.Threading;
namespace IStation.Numerics.Distributions
{
///
/// Discrete Univariate Categorical distribution.
/// For details about this distribution, see
/// Wikipedia - Categorical distribution. This
/// distribution is sometimes called the Discrete distribution.
///
///
/// The distribution is parameterized by a vector of ratios: in other words, the parameter
/// does not have to be normalized and sum to 1. The reason is that some vectors can't be exactly normalized
/// to sum to 1 in floating point representation.
///
///
/// Support: 0..k where k = length(probability mass array)-1
///
public class Categorical : IDiscreteDistribution
{
System.Random _random;
readonly double[] _pmfNormalized;
readonly double[] _cdfUnnormalized;
///
/// Initializes a new instance of the Categorical class.
///
/// An array of nonnegative ratios: this array does not need to be normalized
/// as this is often impossible using floating point arithmetic.
/// If any of the probabilities are negative or do not sum to one.
public Categorical(double[] probabilityMass)
: this(probabilityMass, SystemRandomSource.Default)
{
}
///
/// Initializes a new instance of the Categorical class.
///
/// An array of nonnegative ratios: this array does not need to be normalized
/// as this is often impossible using floating point arithmetic.
/// The random number generator which is used to draw random samples.
/// If any of the probabilities are negative or do not sum to one.
public Categorical(double[] probabilityMass, System.Random randomSource)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
_random = randomSource ?? SystemRandomSource.Default;
// Extract unnormalized cumulative distribution
_cdfUnnormalized = new double[probabilityMass.Length];
_cdfUnnormalized[0] = probabilityMass[0];
for (int i = 1; i < probabilityMass.Length; i++)
{
_cdfUnnormalized[i] = _cdfUnnormalized[i - 1] + probabilityMass[i];
}
// Extract normalized probability mass
var sum = _cdfUnnormalized[_cdfUnnormalized.Length - 1];
_pmfNormalized = new double[probabilityMass.Length];
for (int i = 0; i < probabilityMass.Length; i++)
{
_pmfNormalized[i] = probabilityMass[i]/sum;
}
}
///
/// Initializes a new instance of the Categorical class from a . The distribution
/// will not be automatically updated when the histogram changes. The categorical distribution will have
/// one value for each bucket and a probability for that value proportional to the bucket count.
///
/// The histogram from which to create the categorical variable.
public Categorical(Histogram histogram)
{
if (histogram == null)
{
throw new ArgumentNullException(nameof(histogram));
}
// The probability distribution vector.
var p = new double[histogram.BucketCount];
// Fill in the distribution vector.
for (var i = 0; i < histogram.BucketCount; i++)
{
p[i] = histogram[i].Count;
}
_random = SystemRandomSource.Default;
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(p))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
// Extract unnormalized cumulative distribution
_cdfUnnormalized = new double[p.Length];
_cdfUnnormalized[0] = p[0];
for (int i1 = 1; i1 < p.Length; i1++)
{
_cdfUnnormalized[i1] = _cdfUnnormalized[i1 - 1] + p[i1];
}
// Extract normalized probability mass
var sum = _cdfUnnormalized[_cdfUnnormalized.Length - 1];
_pmfNormalized = new double[p.Length];
for (int i2 = 0; i2 < p.Length; i2++)
{
_pmfNormalized[i2] = p[i2]/sum;
}
}
///
/// A string representation of the distribution.
///
/// a string representation of the distribution.
public override string ToString()
{
return $"Categorical(Dimension = {_pmfNormalized.Length})";
}
///
/// Checks whether the parameters of the distribution are valid.
///
/// An array of nonnegative ratios: this array does not need to be normalized as this is often impossible using floating point arithmetic.
/// If any of the probabilities are negative returns false, or if the sum of parameters is 0.0; otherwise true
public static bool IsValidProbabilityMass(double[] p)
{
var sum = 0.0;
for (int i = 0; i < p.Length; i++)
{
double t = p[i];
if (t < 0.0 || double.IsNaN(t))
{
return false;
}
sum += t;
}
return sum > 0.0;
}
///
/// Checks whether the parameters of the distribution are valid.
///
/// An array of nonnegative ratios: this array does not need to be normalized as this is often impossible using floating point arithmetic.
/// If any of the probabilities are negative returns false, or if the sum of parameters is 0.0; otherwise true
public static bool IsValidCumulativeDistribution(double[] cdf)
{
var last = 0.0;
for (int i = 0; i < cdf.Length; i++)
{
double t = cdf[i];
if (t < 0.0 || double.IsNaN(t) || t < last)
{
return false;
}
last = t;
}
return last > 0.0;
}
///
/// Gets the probability mass vector (non-negative ratios) of the multinomial.
///
/// Sometimes the normalized probability vector cannot be represented exactly in a floating point representation.
public double[] P => (double[])_pmfNormalized.Clone();
///
/// Gets or sets the random number generator which is used to draw random samples.
///
public System.Random RandomSource
{
get => _random;
set => _random = value ?? SystemRandomSource.Default;
}
///
/// Gets the mean of the distribution.
///
public double Mean
{
get
{
// Mean = E[X] = Sum(x * p(x), x=0..N-1)
// where f(x) is the probability mass function, and N is the number of categories.
var sum = 0.0;
for (int i = 0; i < _pmfNormalized.Length; i++)
{
sum += i*_pmfNormalized[i];
}
return sum;
}
}
///
/// Gets the standard deviation of the distribution.
///
public double StdDev => Math.Sqrt(Variance);
///
/// Gets the variance of the distribution.
///
public double Variance
{
get
{
// Variance = E[(X-E[X])^2] = E[X^2] - (E[X])^2 = Sum(p(x) * (x - E[X])^2), x=0..N-1)
var m = Mean;
var sum = 0.0;
for (int i = 0; i < _pmfNormalized.Length; i++)
{
var r = i - m;
sum += r*r*_pmfNormalized[i];
}
return sum;
}
}
///
/// Gets the entropy of the distribution.
///
public double Entropy
{
get { return -_pmfNormalized.Sum(p => p*Math.Log(p)); }
}
///
/// Gets the skewness of the distribution.
///
/// Throws a .
public double Skewness => throw new NotSupportedException();
///
/// Gets the smallest element in the domain of the distributions which can be represented by an integer.
///
public int Minimum => 0;
///
/// Gets the largest element in the domain of the distributions which can be represented by an integer.
///
public int Maximum => _pmfNormalized.Length - 1;
///
/// Gets he mode of the distribution.
///
/// Throws a .
public int Mode => throw new NotSupportedException();
///
/// Gets the median of the distribution.
///
public double Median => InverseCumulativeDistribution(0.5);
///
/// Computes the probability mass (PMF) at k, i.e. P(X = k).
///
/// The location in the domain where we want to evaluate the probability mass function.
/// the probability mass at location .
public double Probability(int k)
{
if (k < 0)
{
return 0.0;
}
if (k >= _pmfNormalized.Length)
{
return 0.0;
}
return _pmfNormalized[k];
}
///
/// Computes the log probability mass (lnPMF) at k, i.e. ln(P(X = k)).
///
/// The location in the domain where we want to evaluate the log probability mass function.
/// the log probability mass at location .
public double ProbabilityLn(int k)
{
if (k < 0)
{
return 0.0;
}
if (k >= _pmfNormalized.Length)
{
return 0.0;
}
return Math.Log(_pmfNormalized[k]);
}
///
/// Computes the cumulative distribution (CDF) of the distribution at x, i.e. P(X ≤ x).
///
/// The location at which to compute the cumulative distribution function.
/// the cumulative distribution at location .
public double CumulativeDistribution(double x)
{
if (x < 0.0)
{
return 0.0;
}
if (x >= _cdfUnnormalized.Length)
{
return 1.0;
}
return _cdfUnnormalized[(int)Math.Floor(x)]/_cdfUnnormalized[_cdfUnnormalized.Length - 1];
}
///
/// Computes the inverse of the cumulative distribution function (InvCDF) for the distribution
/// at the given probability.
///
/// A real number between 0 and 1.
/// An integer between 0 and the size of the categorical (exclusive), that corresponds to the inverse CDF for the given probability.
public int InverseCumulativeDistribution(double probability)
{
if (probability < 0.0 || probability > 1.0 || double.IsNaN(probability))
{
throw new ArgumentOutOfRangeException(nameof(probability));
}
var denormalizedProbability = probability*_cdfUnnormalized[_cdfUnnormalized.Length - 1];
int idx = Array.BinarySearch(_cdfUnnormalized, denormalizedProbability);
if (idx < 0)
{
idx = ~idx;
}
return idx;
}
///
/// Computes the probability mass (PMF) at k, i.e. P(X = k).
///
/// The location in the domain where we want to evaluate the probability mass function.
/// An array of nonnegative ratios: this array does not need to be normalized
/// as this is often impossible using floating point arithmetic.
/// the probability mass at location .
public static double PMF(double[] probabilityMass, int k)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
if (k < 0)
{
return 0.0;
}
if (k >= probabilityMass.Length)
{
return 0.0;
}
return probabilityMass[k]/probabilityMass.Sum();
}
///
/// Computes the log probability mass (lnPMF) at k, i.e. ln(P(X = k)).
///
/// The location in the domain where we want to evaluate the log probability mass function.
/// An array of nonnegative ratios: this array does not need to be normalized
/// as this is often impossible using floating point arithmetic.
/// the log probability mass at location .
public static double PMFLn(double[] probabilityMass, int k)
{
return Math.Log(PMF(probabilityMass, k));
}
///
/// Computes the cumulative distribution (CDF) of the distribution at x, i.e. P(X ≤ x).
///
/// The location at which to compute the cumulative distribution function.
/// An array of nonnegative ratios: this array does not need to be normalized
/// as this is often impossible using floating point arithmetic.
/// the cumulative distribution at location .
///
public static double CDF(double[] probabilityMass, double x)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
if (x < 0.0)
{
return 0.0;
}
if (x >= probabilityMass.Length)
{
return 1.0;
}
var cdfUnnormalized = ProbabilityMassToCumulativeDistribution(probabilityMass);
return cdfUnnormalized[(int)Math.Floor(x)]/cdfUnnormalized[cdfUnnormalized.Length - 1];
}
///
/// Computes the inverse of the cumulative distribution function (InvCDF) for the distribution
/// at the given probability.
///
/// An array of nonnegative ratios: this array does not need to be normalized
/// as this is often impossible using floating point arithmetic.
/// A real number between 0 and 1.
/// An integer between 0 and the size of the categorical (exclusive), that corresponds to the inverse CDF for the given probability.
public static int InvCDF(double[] probabilityMass, double probability)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
if (probability < 0.0 || probability > 1.0 || double.IsNaN(probability))
{
throw new ArgumentOutOfRangeException(nameof(probability));
}
var cdfUnnormalized = ProbabilityMassToCumulativeDistribution(probabilityMass);
var denormalizedProbability = probability*cdfUnnormalized[cdfUnnormalized.Length - 1];
int idx = Array.BinarySearch(cdfUnnormalized, denormalizedProbability);
if (idx < 0)
{
idx = ~idx;
}
return idx;
}
///
/// Computes the inverse of the cumulative distribution function (InvCDF) for the distribution
/// at the given probability.
///
/// An array corresponding to a CDF for a categorical distribution. Not assumed to be normalized.
/// A real number between 0 and 1.
/// An integer between 0 and the size of the categorical (exclusive), that corresponds to the inverse CDF for the given probability.
public static int InvCDFWithCumulativeDistribution(double[] cdfUnnormalized, double probability)
{
if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
if (probability < 0.0 || probability > 1.0 || double.IsNaN(probability))
{
throw new ArgumentOutOfRangeException(nameof(probability));
}
var denormalizedProbability = probability*cdfUnnormalized[cdfUnnormalized.Length - 1];
int idx = Array.BinarySearch(cdfUnnormalized, denormalizedProbability);
if (idx < 0)
{
idx = ~idx;
}
return idx;
}
///
/// Computes the cumulative distribution function. This method performs no parameter checking.
/// If the probability mass was normalized, the resulting cumulative distribution is normalized as well (up to numerical errors).
///
/// An array of nonnegative ratios: this array does not need to be normalized
/// as this is often impossible using floating point arithmetic.
/// An array representing the unnormalized cumulative distribution function.
internal static double[] ProbabilityMassToCumulativeDistribution(double[] probabilityMass)
{
var cdfUnnormalized = new double[probabilityMass.Length];
cdfUnnormalized[0] = probabilityMass[0];
for (int i = 1; i < probabilityMass.Length; i++)
{
cdfUnnormalized[i] = cdfUnnormalized[i - 1] + probabilityMass[i];
}
return cdfUnnormalized;
}
///
/// Returns one trials from the categorical distribution.
///
/// The random number generator to use.
/// The (unnormalized) cumulative distribution of the probability distribution.
/// One sample from the categorical distribution implied by .
internal static int SampleUnchecked(System.Random rnd, double[] cdfUnnormalized)
{
// TODO : use binary search to speed up this procedure.
double u = rnd.NextDouble()*cdfUnnormalized[cdfUnnormalized.Length - 1];
var idx = 0;
if (u == 0.0d)
{
// skip zero-probability categories
while (0.0d == cdfUnnormalized[idx])
{
idx++;
}
}
while (u > cdfUnnormalized[idx])
{
idx++;
}
return idx;
}
static void SamplesUnchecked(System.Random rnd, int[] values, double[] cdfUnnormalized)
{
// TODO : use binary search to speed up this procedure.
double[] uniform = rnd.NextDoubles(values.Length);
double w = cdfUnnormalized[cdfUnnormalized.Length - 1];
CommonParallel.For(0, values.Length, 4096, (a, b) =>
{
for (int i = a; i < b; i++)
{
var u = uniform[i]*w;
var idx = 0;
if (u == 0.0d)
{
// skip zero-probability categories
while (0.0d == cdfUnnormalized[idx])
{
idx++;
}
}
while (u > cdfUnnormalized[idx])
{
idx++;
}
values[i] = idx;
}
});
}
static IEnumerable SamplesUnchecked(System.Random rnd, double[] cdfUnnormalized)
{
while (true)
{
yield return SampleUnchecked(rnd, cdfUnnormalized);
}
}
///
/// Samples a Binomially distributed random variable.
///
/// The number of successful trials.
public int Sample()
{
return SampleUnchecked(_random, _cdfUnnormalized);
}
///
/// Fills an array with samples generated from the distribution.
///
public void Samples(int[] values)
{
SamplesUnchecked(_random, values, _cdfUnnormalized);
}
///
/// Samples an array of Bernoulli distributed random variables.
///
/// a sequence of successful trial counts.
public IEnumerable Samples()
{
return SamplesUnchecked(_random, _cdfUnnormalized);
}
///
/// Samples one categorical distributed random variable; also known as the Discrete distribution.
///
/// The random number generator to use.
/// An array of nonnegative ratios. Not assumed to be normalized.
/// One random integer between 0 and the size of the categorical (exclusive).
public static int Sample(System.Random rnd, double[] probabilityMass)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass);
return SampleUnchecked(rnd, cdf);
}
///
/// Samples a categorically distributed random variable.
///
/// The random number generator to use.
/// An array of nonnegative ratios. Not assumed to be normalized.
/// random integers between 0 and the size of the categorical (exclusive).
public static IEnumerable Samples(System.Random rnd, double[] probabilityMass)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass);
return SamplesUnchecked(rnd, cdf);
}
///
/// Fills an array with samples generated from the distribution.
///
/// The random number generator to use.
/// The array to fill with the samples.
/// An array of nonnegative ratios. Not assumed to be normalized.
/// random integers between 0 and the size of the categorical (exclusive).
public static void Samples(System.Random rnd, int[] values, double[] probabilityMass)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass);
SamplesUnchecked(rnd, values, cdf);
}
///
/// Samples one categorical distributed random variable; also known as the Discrete distribution.
///
/// An array of nonnegative ratios. Not assumed to be normalized.
/// One random integer between 0 and the size of the categorical (exclusive).
public static int Sample(double[] probabilityMass)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass);
return SampleUnchecked(SystemRandomSource.Default, cdf);
}
///
/// Samples a categorically distributed random variable.
///
/// An array of nonnegative ratios. Not assumed to be normalized.
/// random integers between 0 and the size of the categorical (exclusive).
public static IEnumerable Samples(double[] probabilityMass)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass);
return SamplesUnchecked(SystemRandomSource.Default, cdf);
}
///
/// Fills an array with samples generated from the distribution.
///
/// The array to fill with the samples.
/// An array of nonnegative ratios. Not assumed to be normalized.
/// random integers between 0 and the size of the categorical (exclusive).
public static void Samples(int[] values, double[] probabilityMass)
{
if (Control.CheckDistributionParameters && !IsValidProbabilityMass(probabilityMass))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
var cdf = ProbabilityMassToCumulativeDistribution(probabilityMass);
SamplesUnchecked(SystemRandomSource.Default, values, cdf);
}
///
/// Samples one categorical distributed random variable; also known as the Discrete distribution.
///
/// The random number generator to use.
/// An array of the cumulative distribution. Not assumed to be normalized.
/// One random integer between 0 and the size of the categorical (exclusive).
public static int SampleWithCumulativeDistribution(System.Random rnd, double[] cdfUnnormalized)
{
if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
return SampleUnchecked(rnd, cdfUnnormalized);
}
///
/// Samples a categorically distributed random variable.
///
/// The random number generator to use.
/// An array of the cumulative distribution. Not assumed to be normalized.
/// random integers between 0 and the size of the categorical (exclusive).
public static IEnumerable SamplesWithCumulativeDistribution(System.Random rnd, double[] cdfUnnormalized)
{
if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
return SamplesUnchecked(rnd, cdfUnnormalized);
}
///
/// Fills an array with samples generated from the distribution.
///
/// The random number generator to use.
/// The array to fill with the samples.
/// An array of the cumulative distribution. Not assumed to be normalized.
/// random integers between 0 and the size of the categorical (exclusive).
public static void SamplesWithCumulativeDistribution(System.Random rnd, int[] values, double[] cdfUnnormalized)
{
if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
SamplesUnchecked(rnd, values, cdfUnnormalized);
}
///
/// Samples one categorical distributed random variable; also known as the Discrete distribution.
///
/// An array of the cumulative distribution. Not assumed to be normalized.
/// One random integer between 0 and the size of the categorical (exclusive).
public static int SampleWithCumulativeDistribution(double[] cdfUnnormalized)
{
if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
return SampleUnchecked(SystemRandomSource.Default, cdfUnnormalized);
}
///
/// Samples a categorically distributed random variable.
///
/// An array of the cumulative distribution. Not assumed to be normalized.
/// random integers between 0 and the size of the categorical (exclusive).
public static IEnumerable SamplesWithCumulativeDistribution(double[] cdfUnnormalized)
{
if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
return SamplesUnchecked(SystemRandomSource.Default, cdfUnnormalized);
}
///
/// Fills an array with samples generated from the distribution.
///
/// The array to fill with the samples.
/// An array of the cumulative distribution. Not assumed to be normalized.
/// random integers between 0 and the size of the categorical (exclusive).
public static void SamplesWithCumulativeDistribution(int[] values, double[] cdfUnnormalized)
{
if (Control.CheckDistributionParameters && !IsValidCumulativeDistribution(cdfUnnormalized))
{
throw new ArgumentException("Invalid parametrization for the distribution.");
}
SamplesUnchecked(SystemRandomSource.Default, values, cdfUnnormalized);
}
}
}