Source code for pyhdust.stats

# -*- coding:utf-8 -*-

"""PyHdust *stats* module: statistical tools

:license: GNU GPL v3.0 https://github.com/danmoser/pyhdust/blob/master/LICENSE
"""
from __future__ import print_function
import numpy as _np
import pyhdust.phc as _phc
import warnings as _warn

try:
    import matplotlib.pyplot as _plt
    from scipy.stats import mode as _mode
except ImportError:
    _warn.warn("Matplotlib and/or Scipy module not installed!!!")


__author__ = "Daniel Moser"
__email__ = "dmfaes@gmail.com"


[docs]def mad(data, axis=None):
    """Return 1.48xMAD (median absolute deviation)

    The MAD is a robust statistic, being more resilient to outliers in a data
    set than the standard deviation."""
    return 1.4826 * _np.median(_np.abs(data - _np.median(data, axis)), axis)


[docs]def summary(x, verbose=False):
    """Returns the summary of the variable: "median", "minus sigma" and
    "plus sigma" ROBUST values (i.e., median and [15.9, 84.1] percentiles).

    Example:

    .. code::

        import pyhdust.stats as stt

        for i in range(8):
            a = _np.random.randn(10**i)+2
            print(np.average(a), np.std(a), stt.summary(a))
    """
    data = _np.hstack((_np.median(x), _np.percentile(x, (15.87, 84.13))))
    if verbose:
        print("# median and [15.9, 84.1] percentiles: ")
        print("# {0} {1} {2}".format(*data))
    return data


[docs]def cdf(x, xlim=None, savefig=False):
    """Display the CDF (Cumulative Density Distribution) of a sample `x`.

    A comparison with a gaussian and a linear one are made.
    """
    n = len(x)
    # probability
    p = _np.arange(n) / (n - 1.0)
    sortedx = _np.sort(x)
    if xlim is None:
        xlim = [sortedx[0], sortedx[-1]]
    linx = _np.linspace(xlim[0], xlim[1], n)
    madx = mad(x)
    gausx = _np.random.randn(n) * madx / 2.0 + (xlim[0] + xlim[1]) / 2.0
    gausx = _np.sort(gausx)

    fig, ax = _plt.subplots()
    ax.plot(sortedx, p, label="Data")
    ax.plot(linx, p, ls=":", label="linearized", color="gray")
    ax.plot(gausx, p, ls="-.", label="Gauss equiv.", color="gray")
    ax.set_xlim(xlim)
    ax.legend(loc="best", fancybox=True, framealpha=0.5, fontsize=8, labelspacing=0.05)
    ax.set_ylabel("c.d.f.")
    ax.set_ylim([0, 1])
    if savefig:
        _phc.savefig(fig)  # figname='outname')
    return fig, ax


[docs]def means(inarr, wtharr=None, quiet=False):
    """Calculate many "means" for a given input array `inarr`.

    `wtharr` is the weights array (e.g., inverse of the uncertainty).

    Return simple, geom, harm, rms, median, mode
    """
    inarr = _np.array(inarr).astype(float)
    if wtharr is None:
        wtharr = _np.ones(len(inarr))
    simp = _np.sum(wtharr * inarr) / _np.sum(wtharr)
    geom = _np.exp(_np.sum(wtharr * _np.log(inarr)) / _np.sum(wtharr))
    harm = _np.sum(wtharr) / _np.sum(wtharr / inarr)
    rms = _np.sqrt(_np.sum(wtharr * inarr**2) / _np.sum(wtharr))
    medi = _np.median(inarr)
    mode = _mode(inarr)[0][0]
    if not quiet:
        print("# Simple: {}".format(simp))
        print("# Geometric: {}".format(geom))
        print("# Harmonic: {}".format(harm))
        print("# RMS: {}".format(harm))
        print("# Median: {}".format(medi))
        print("# Mode: {}".format(mode))
    return simp, geom, harm, rms, medi, mode


[docs]def snr(count_rate, texp=1.0, nexp=1, npix=10.0, bg=10.0, dk=0.0, ron=2.0, var=0.0):
    """Calcute the Signal-to-Noise ratio based on Poisson statistics.

    :param count_rate: = rate of counts (e-/time)
    :param npix: = number os pixels for the given count
    :param bg: = background rate per pixel (e-/time)
    :param dk: = dark rate per pixel (e-/time)
    :param ron: = readout noise (single pixel, in e-)
    :param var: = variance on the source erroes (e-)
    """
    return (
        count_rate
        * texp
        * nexp
        / _np.sqrt(
            texp * nexp * (count_rate + bg * npix + dk * npix)
            + ron**2 * npix * nexp
            + var
        )
    )


[docs]def corr_coef(x, y, clear_nan=True):
    """Pearson correlation coefficient for two ``x`` and ``y`` arrays (same
    length).

    See also ``scipy.stats.pearsonr()``
    """
    if len(x) != len(y):
        _warn("# length of x and y arrays do not match!")
        return None
    x = _np.array(x)
    y = _np.array(y)
    if clear_nan:
        idx = _np.isnan(x)
        idy = _np.isnan(y)
        idxy = idx + idy
        x = x[~idxy]
        y = y[~idxy]
    avgx = _np.average(x)
    avgy = _np.average(y)
    return (_np.sum((x - avgx) * (y - avgy))) / (
        _np.sqrt(_np.sum((x - avgx) ** 2) * _np.sum((y - avgy) ** 2))
    )


[docs]def corr_coef_spearman(x, y, clear_nan=True):
    """Spearman's correlation coefficient for two ``x`` and ``y`` arrays (same
    length).

    See also ``scipy.stats.spearmanr()``
    """
    if len(x) != len(y):
        _warn("# length of x and y arrays do not match!")
        return None
    x = _np.array(x)
    y = _np.array(y)
    if clear_nan:
        idx = _np.isnan(x)
        idy = _np.isnan(y)
        idxy = idx + idy
        x = x[~idxy]
        y = y[~idxy]
    idx = _np.argsort(x)
    x = x[idx]
    y = y[idx]
    n = len(x)
    diff = _np.arange(n) - _np.argsort(y)
    diff2 = diff**2
    return 1 - 6 * _np.sum(diff2) / (n * (n**2 - 1))


[docs]def corr_coef_cov(x, y, clear_nan=True):
    r"""Correlation coefficient based on the Covariance of two ``x`` and ``y``
    arrays (same length).

    :math:`\rho(x,y)=Cov(x,y)/sqrt(Var(x)*Var(y))`

    If :math:`\rho(x,y)= 0` we say that X and Y are "uncorrelated." If two
    variables are independent, then their correlation will be 0. However,
    like with covariance. it doesn't go the other way. A correlation of 0
    does not imply independence.
    """
    if len(x) != len(y):
        _warn("# length of x and y arrays do not match!")
        return None
    x = _np.array(x)
    y = _np.array(y)
    if clear_nan:
        idx = _np.isnan(x)
        idy = _np.isnan(y)
        idxy = idx + idy
        x = x[~idxy]
        y = y[~idxy]
    idx = _np.argsort(x)
    c, cov = _np.polyfit(x, y, 1, cov=True)
    return cov[0, 1] / _np.sqrt(cov[0, 0] * cov[1, 1])


[docs]def corr_coef_cov_with_err(x, y, yerr, xerr=None, clear_nan=True, nsample=1000):
    r"""TO BE DONE
    Correlation coefficient based on the Covariance of two ``x`` and ``y``
    arrays (same length).

    :math:`\rho(x,y)=Cov(x,y)/sqrt(Var(x)*Var(y))`

    If :math:`\rho(x,y)= 0` we say that X and Y are "uncorrelated." If two
    variables are independent, then their correlation will be 0. However,
    like with covariance. it doesn't go the other way. A correlation of 0
    does not imply independence.
    """
    if xerr is None:
        xerr = _np.zeros(len(x))
    if len(x) != len(y) or len(x) != len(xerr) or len(xerr) != len(yerr):
        _warn("# length of input arrays do not match!")
        return None
    x = _np.array(x)
    xerr = _np.array(xerr)
    y = _np.array(y)
    yerr = _np.array(yerr)
    if clear_nan:
        idx = _np.isnan(x)
        idxe = _np.isnan(xerr)
        idy = _np.isnan(y)
        idye = _np.isnan(yerr)
        idxy = idx + idy + idxe + idye
        x = x[~idxy]
        xerr = xerr[~idxy]
        y = y[~idxy]
        yerr = yerr[~idxy]
    mx = _np.random.multivariate_normal(x, _np.diag(xerr**2), nsample)
    my = _np.random.multivariate_normal(y, _np.diag(yerr**2), nsample)
    return corr_coef_cov(mx.reshape(-1), my.reshape(-1))