Source code for optbayesexpt.obe_utils

import numpy as np

try:
    rng = np.random.default_rng()
except AttributeError:
    rng = np.random
    

[docs]
class MeasurementSimulator():
    """
    Provides simulated measurement data

    Evaluates the model function and adds noise.

    Args:
        model_function (func): Generally the same as the function used
            by OptBayesExpt
        true_params (tuple): Parameter values, typically the "true values"
            of the simulated experiment.
        cons (tuple): The constants.
        noise_level (float): standard deviation of the added noise.

    """
    def __init__(self, model_function, true_params, cons, noise_level):

        self.model_function = model_function
        self.params = true_params
        self.cons = cons
        self.noise_level = noise_level


[docs]
    def simdata(self, setting, params=None, noise_level=None):
        """ Simulate a measurement

        Args:
            setting (tuple of floats): The setting values
            params (tuple of floats): if not ``None``, temporarily used
                instead of the initial values. (opt)
            noise_level (float): if not ``None``, temporarily used instead of
                the initial value. (opt)

        Returns:
            Simulated measurement value(s)
        """
        if params is None:
            params = self.params

        if noise_level is None:
            noise_level = self.noise_level

        y = np.array(self.model_function(setting, params, self.cons))
        tmpnoise = rng.standard_normal(y.shape) * noise_level
        yn = y + tmpnoise

        return yn





[docs]
def trace_sort(settings, measurements):
    """Combine measurements at identical settings values

    Analyzes input arrays of setttings and corresponding measurement
    values, data where settings values may repeat, i. e. more than one
    measurement was done at some of the settings.  The function
    bins the measurements by setting value and calculates some statistics
    for measurments in each bin.

    Args:
        settings: (ndarray) Setting values
        measurements: (ndarray) measurement values

    Returns:
        A tuple, (sorted_settings, m_average, m_std, n_of_m)
            - sorted_settings (list): setting values (sorted, none repeated)
            - m_average (list): average measurement value at each setting
            - m_sigma (list): standard deviation of measurement values at
              each setting
            - n_of_m (list): number of measurements at each setting.

    """

    # Sort the arrays by the setting values
    sortindices = np.argsort(settings)
    sarr = np.array(settings)[sortindices]
    marr = np.array(measurements)[sortindices]

    oldx = sarr[0]
    sorted_settings = []
    m_average = []
    m_std = []
    n_of_m = []
    m_list = []

    for x, y in zip(sarr, marr):
        # accumulate batches having the same x
        # check if the new x value is different
        if x != oldx:
            # new x value, so batch is complete
            # process the accumulated data for the old x value
            sorted_settings.append(oldx)
            m_average.append(np.mean(np.array(m_list)))
            m_std.append(np.std(m_list)/np.sqrt(len(m_list)))
            n_of_m.append(len(m_list))
            # reset accumulation & start a new batch
            oldx = x
            m_list = [y, ]
        else:
            # same setting value, so just accumulate the y value
            m_list.append(y)
    # process the last accumulated batch
    sorted_settings.append(oldx)
    m_average.append(np.mean(np.array(m_list)))
    n_of_m.append(len(m_list))
    m_std.append(np.std(np.array(m_list))/np.sqrt(len(m_list)))

    return sorted_settings, m_average, m_std, n_of_m




[docs]
def differential_entropy(values, window_length=None, base=None,
                         axis=0, method='auto'):
    """Given a sample of a distribution, estimate the differential entropy.

    This code is copied from scipy.stats with reformatted docstrings.  When the module is
    loaded, __init__.py attempts to import ``differential_entropy()`` from scipy.stats, and loads
    this version from obe_utils.py if an ``ImportError`` is raised.

    Several estimation methods are available using the `method` parameter. By
    default, a method is selected based the size of the sample.

    Args:
        values (:obj:`sequence`): Samples from a continuous distribution.

        window_length (:obj:`int`, optional): Window length for computing Vasicek estimate.
            Must be an integer between 1 and half of  the sample size. If ``None``
            (the default), it uses the heuristic value

            .. math::
                \left \lfloor \\sqrt{n} + 0.5 \\right \\rfloor

            where :math:`n` is the sample size. This heuristic was originally
            proposed in [2]_ and has become common in the literature.

        base (:obj:`float`, optional)
            The logarithmic base to use, defaults to ``e`` (natural logarithm).

        axis (:obj:`int`, optional)
            The axis along which the differential entropy is calculated.
            Default is 0.

        method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
            The method used to estimate the differential entropy from the sample.
            Default is ``'auto'``.  See Notes for more information.

    Returns:
        entropy (:obj:`float`):
            The calculated differential entropy.

    Notes:
        This function will converge to the true differential entropy in the limit

        .. math::
            n \\to \\infty, \\quad m \\to \\infty, \\quad \\frac{m}{n} \\to 0

        The optimal choice of ``window_length`` for a given sample size depends on
        the (unknown) distribution. Typically, the smoother the density of the
        distribution, the larger the optimal value of ``window_length`` [1]_.
        The following options are available for the `method` parameter.

        * ``'vasicek'`` uses the estimator presented in [1]_. This is one of the
          first and most influential estimators of differential entropy.
        * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
          is not only consistent but, under some conditions, asymptotically normal.
        * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
          in simulation to have smaller bias and mean squared error than
          the Vasicek estimator.
        * ``'correa'`` uses the estimator presented in [5]_ based on local linear
          regression. In a simulation study, it had consistently smaller mean
          square error than the Vasiceck estimator, but it is more expensive to
          compute.
        * ``'auto'`` selects the method automatically (default). Currently,
          this selects ``'van es'`` for very small samples (<10), ``'ebrahimi'``
          for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
          samples, but this behavior is subject to change in future versions.

        All estimators are implemented as described in [6]_.

    References:

        .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
               Journal of the Royal Statistical Society:
               Series B (Methodological), 38(1), 54-59.
        .. [2] Crzcgorzewski, P., & Wirczorkowski, R. (1999). Entropy-based
               goodness-of-fit test for exponentiality. Communications in
               Statistics-Theory and Methods, 28(5), 1183-1202.
        .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
               class of statistics based on spacings. Scandinavian Journal of
               Statistics, 61-72.
        .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
               of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
        .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
               in Statistics-Theory and Methods, 24(10), 2439-2449.
        .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical
               Methods. Annals of Data Science, 2(2), 231-241.
               https://link.springer.com/article/10.1007/s40745-015-0045-9

    """
    values = np.asarray(values)
    values = np.moveaxis(values, axis, -1)
    n = values.shape[-1]  # number of observations

    if window_length is None:
        window_length = int(np.floor(np.sqrt(n) + 0.5))

    if not 2 <= 2 * window_length < n:
        raise ValueError(
            f"Window length ({window_length}) must be positive and less "
            f"than half the sample size ({n}).",
        )

    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    sorted_data = np.sort(values, axis=-1)

    methods = {"vasicek": _vasicek_entropy,
               "van es": _van_es_entropy,
               "correa": _correa_entropy,
               "ebrahimi": _ebrahimi_entropy,
               "auto": _vasicek_entropy}
    method = method.lower()
    if method not in methods:
        message = f"`method` must be one of {set(methods)}"
        raise ValueError(message)

    if method == "auto":
        if n <= 10:
            method = 'van es'
        elif n <= 1000:
            method = 'ebrahimi'
        else:
            method = 'vasicek'

    res = methods[method](sorted_data, window_length)

    if base is not None:
        res /= np.log(base)

    return res



def _pad_along_last_axis(X, m):
    # Pad the data for computing the rolling window difference.
    # scales a  bit better than method in _vasicek_like_entropy
    shape = np.array(X.shape)
    shape[-1] = m
    Xl = np.broadcast_to(X[..., [0]], shape)  # [0] vs 0 to maintain shape
    Xr = np.broadcast_to(X[..., [-1]], shape)
    return np.concatenate((Xl, X, Xr), axis=-1)


def _vasicek_entropy(X, m):
    # Compute the Vasicek estimator as described in [7] Eq. 1.3.
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)
    differences = X[..., 2 * m:] - X[..., : -2 * m:]
    logs = np.log(n/(2*m) * differences)
    return np.mean(logs, axis=-1)


def _van_es_entropy(X, m):
    # Compute the van Es estimator as described in [7].
    # No equation number, but referred to as HVE_mn.
    # Typo: there should be a log within the summation.
    n = X.shape[-1]
    difference = X[..., m:] - X[..., :-m]
    term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
    k = np.arange(m, n+1)
    return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)


def _ebrahimi_entropy(X, m):
    # Compute the Ebrahimi estimator as described in [7].
    # No equation number, but referred to as HE_mn
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)

    differences = X[..., 2 * m:] - X[..., : -2 * m:]

    i = np.arange(1, n+1).astype(float)
    ci = np.ones_like(i)*2
    ci[i <= m] = 1 + (i[i <= m] - 1)/m
    ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m

    logs = np.log(n * differences / (ci * m))
    return np.mean(logs, axis=-1)


def _correa_entropy(X, m):
    # Compute the Correa estimator as described in [7].
    # No equation number, but referred to as HC_mn
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)

    i = np.arange(1, n+1)
    dj = np.arange(-m, m+1)[:, None]
    j = i + dj
    j0 = j + m - 1  # 0-indexed version of j

    Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
    difference = X[..., j0] - Xibar
    num = np.sum(difference*dj, axis=-2)  # dj is d-i
    den = n*np.sum(difference**2, axis=-2)
    return -np.mean(np.log(num/den), axis=-1)