"""Simple statistical functions (less powerful but faster than their *numpy*
equivalents).

"""
from __future__ import division

from math import modf, floor, sqrt
from itertools import product, combinations

try:
    import scipy.stats
    ttable = scipy.stats.t.ppf
except ImportError:
    def ttable(*_args):
        raise NotImplementedError

# =============================================================================
# standard statistics for list of real values
# =============================================================================

def mean(l):
    """Mean of a finite, non-empty list of real or integer values."""
    return sum(l) / len(l)

def median(l, issorted=False):
    """Median of a finite, non-empty list of real or integer values.

    If `sorted` is ``True``, `l` is assumed to be sorted already.

    """
    n = len(l)
    if n == 1:
        return l[0]
    l = l if issorted else sorted(l)
    if n % 2:
        return l[int(n/2)]
    else:
        x = int(n/2-1)
        return l[x] + 0.5 * abs(l[x+1] - l[x])

def quartiles(l, issorted=False):
    """Quartiles of a finite, non-empty list of real or integer values.

    >>> for x in range(2,12):
    ...     print range(1,x), "->", quartiles(range(1,x))
    [1] -> (1, 1, 1)
    [1, 2] -> (1.25, 1.5, 1.75)
    [1, 2, 3] -> (1.5, 2, 2.5)
    [1, 2, 3, 4] -> (1.75, 2.5, 3.25)
    [1, 2, 3, 4, 5] -> (2, 3, 4)
    [1, 2, 3, 4, 5, 6] -> (2.25, 3.5, 4.75)
    [1, 2, 3, 4, 5, 6, 7] -> (2.5, 4, 5.5)
    [1, 2, 3, 4, 5, 6, 7, 8] -> (2.75, 4.5, 6.25)
    [1, 2, 3, 4, 5, 6, 7, 8, 9] -> (3, 5, 7)
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] -> (3.25, 5.5, 7.75)

    >>> quartiles([1, 1, 1, 2])
    (1.0, 1.0, 1.25)
    >>> quartiles([0, 0.25, 0.5, 0.75, 1.0])
    (0.25, 0.5, 0.75)
    >>> quartiles([0, 0, 0.25, 0.5, 0.75, 1.0])
    (0.0625, 0.375, 0.6875)
    >>> quartiles([6, 47, 49, 15, 42, 41, 7, 39, 43, 40, 36])
    (25.5, 40, 42.5)
    >>> quartiles([5, 3, 2, 2, 2])
    (2, 2, 3)

    """
    def quantile(x, q, qtype=7, issorted=False):
        """Compute quantile `q` for samples `x`, using algorithm `qtype`.

        References:
            http://reference.wolfram.com/mathematica/ref/Quantile.html
            http://wiki.r-project.org/rwiki/doku.php?id=rdoc:stats:quantile

        Author:
            Ernesto P.Adorio Ph.D.
            UP Extension Program in Pampanga, Clark Field.

        -----------------------------------------------------------------------
        Source: from http://adorio-research.org/wordpress/?p=125

        """
        if not issorted:
            y = sorted(x)
        else:
            y = x
        assert 1 <= qtype <= 9

        # Parameters for the Hyndman and Fan algorithm (needs float division!)
        abcd = (
            (0,   0,   1, 0), # inverse empirical distrib.function, R type 1
            (1/2, 0,   1, 0), # similar to type 1, averaged, R type 2
            (1/2, 0,   0, 0), # nearest order statistic, (SAS) R type 3
            (0,   0,   0, 1), # California linear interpolation, R type 4
            (1/2, 0,   0, 1), # hydrologists method, R type 5
            (0,   1,   0, 1), # mean-based estimate (Weibull method), (SPSS, Minitab), R type 6
            (1,  -1,   0, 1), # mode-based method, (S, S-Plus), R type 7
            (1/3, 1/3, 0, 1), # median-unbiased , R type 8
            (3/8, 1/4, 0, 1), # normal-unbiased, R type 9.
        )

        a, b, c, d = abcd[qtype-1]
        n = len(x)
        g, j = modf( a + (n+b) * q -1)
        if j < 0:
            return y[0]
        elif j >= n:
            return y[n-1] # oct. 8, 2010 y[n]???!! uncaught  off by 1 error!!!

        j = int(floor(j))
        if g ==  0:
            return y[j]
        else:
            return y[j] + (y[j+1]- y[j])* (c + d * g)

    return tuple(quantile(l, x, issorted=issorted) for x in (0.25, 0.5, 0.75))

def var(l):
    """Variance of a finite, non-empty list of real or integer values.

    Assumes same probability for each element (i.e. E=1/N).
    """
    m = mean(l)
    return sum([(x - m)**2 for x in l]) / len(l)

def sd(l):
    """Standard deviation of a finite, non-empty list of real or integer values.

    Assumes same probability for each element (i.e. E=1/N).
    """
    return sqrt(var(l))

def ci(l, cl=0.95):
    """Confidence interval.

    Note: uses *sample* (not population) standard deviation (see
    http://en.wikipedia.org/wiki/Standard_deviation#Estimation)

    """
    assert len(l) > 1
    m = mean(l)
    sample_sd = sqrt(sum((x - m)**2 for x in l) / (len(l) - 1))
    t = ttable((1 + cl) / 2, len(l) - 1)
    error = sample_sd / sqrt(len(l)) * t
    return error

# =============================================================================
# poset analysis functions
# =============================================================================

def posetwidth(poset):
    """Calculate width of given poset.

    >>> posetwidth(set())
    0
    >>> posetwidth([[]])
    1
    >>> posetwidth([[], [0], [1], [0,1]])
    2
    >>> posetwidth([[], [0], [0,1]])
    1
    >>> posetwidth([[1], [0, 1], [2], [3]])
    3
    >>> posetwidth([[0], [1], [2], [3]])
    4
    >>> posetwidth([[], [0], [1], [2], [3]])
    4
    >>> posetwidth([[], [0], [1], [0, 1], [2], [0, 2], [0, 2, 3]])
    3
    >>> posetwidth([[], [0], [1], [0, 1], [2], [0, 2], [3], [0, 2, 3]])
    4
    >>> posetwidth([[], [3], [0, 2, 3], [2], [0, 2], [0, 1, 2]])
    2
    >>> posetwidth([[0], [1], [2], [0, 1], [2, 3], [0, 1, 2, 3]])
    3

    """
    def hasnext(i):
        if i < 0:
            return True
        if visited[i]:
            return False
        visited[i] = True
        for j in indice:
            if gt[i][j]:
                if hasnext(pv[j]):
                    pv[j] = i
                    return True
        return False

    poset = tuple(frozenset(x) for x in poset)
    l = len(poset)
    indice = tuple(range(l))

    gt = [[False] * l for _ in poset]

    for i, j in product(indice, indice):
        gt[i][j] = poset[i] > poset[j]  # if elements are of class Label, then
                                        # the size comparison recognizes
                                        # hierarchical relations

    pv = [-1] * l
    visited = [False] * l

    width = 0

    for i in indice:
        for j in indice:
            visited[j] = False
        if not hasnext(i):
            width += 1

    return width

def _minimaxi(poset, cop):
    """Get minimal or maximal elements of `poset`, depending on the compare
    operator `cop`.

    """
    remove = set()
    for x1, x2 in combinations(poset, 2):
        if cop(x1, x2):
            remove.add(x1)
        elif cop(x2, x1):
            remove.add(x2)
    return tuple(x for x in poset if x not in remove)

def minimals(poset, itemgetter=lambda x: x):
    """Get the minimal elements of a poset.

    Elements in `poset` either must be of type `frozenset` ...

    >>> s = [frozenset(x) for x in ("abc", "ab", "abd")]
    >>> sorted(minimals(s))
    [frozenset(['a', 'b'])]

    ... or some other type where `itemgetter` provides the frozenset needed by
    this function:

    >>> s = [(frozenset(x), "foo") for x in ("abc", "ab", "abd")]
    >>> ig = lambda x: x[0]
    >>> sorted(minimals(s, itemgetter=ig))
    [(frozenset(['a', 'b']), 'foo')]

    """
    cop = lambda a, b: itemgetter(a) > itemgetter(b)
    return _minimaxi(poset, cop)

def maximals(poset, itemgetter=lambda x: x):
    """Get the maximal elements of a poset.

    Elements in `poset` either must be of type `frozenset` ...

    >>> s = [frozenset(x) for x in ("abc", "ab", "abd")]
    >>> sorted(maximals(s))
    [frozenset(['a', 'c', 'b']), frozenset(['a', 'b', 'd'])]

    ... or some other type where `itemgetter` provides the frozenset needed by
    this function:

    >>> s = [(frozenset(x), "foo") for x in ("abc", "ab", "abd")]
    >>> ig = lambda x: x[0]
    >>> sorted(maximals(s, itemgetter=ig))
    [(frozenset(['a', 'c', 'b']), 'foo'), (frozenset(['a', 'b', 'd']), 'foo')]

    """
    cop = lambda a, b: itemgetter(a) < itemgetter(b)
    return _minimaxi(poset, cop)

def posetchains(poset):
    """Get all maximal chains of `poset`."""

    def subchains(prefix):
        children = minimals([x for x in poset if not prefix or x > prefix[-1]])
        if children:
            for child in children:
                for chain in subchains(prefix + [child]):
                    yield chain
        else:
            yield prefix
    return subchains([])

def majorposetchains(poset, quartile=None):
    """Get those maximal chains of poset whose length is at least the mean of
    all chain lengths or whose length is in the given `quartile` (1st, 2nd, or
    3rd) of chain lengths.

    """
    chains = [(c, len(c)) for c in posetchains(poset)]
    if quartile is None:
        t = mean([l for _, l in chains])
    else:
        t = quartiles([l for _, l in chains])[quartile-1]
    return [c for c, l in chains if l >= t]

def inchain(chain, elem):
    """Check if an element is part of a chain.

    >>> inchain([set([0]), set([0,1,2])], set([2]))
    False
    >>> inchain([set([0]), set([0,1,2])], set([0,2]))
    True

    """
    return elem in chain or all(elem < x or elem > x for x in chain)

def atchain(chain, elem):
    """Check if an element is connected to a chain.

    >>> atchain([set([0]), set([0,1,2])], set([2]))
    True
    >>> atchain([set([0]), set([0,1,2])], set([0,2]))
    True
    >>> atchain([set([0]), set([0,1,2])], set([3]))
    False

    """
    return elem in chain or any(elem < x or elem > x for x in chain)

# =============================================================================
# tests
# =============================================================================

def __doctests_statfuncs():
    """
    Compare with numpy:

    >>> import random
    >>> import numpy
    >>> random.seed(2010)
    >>> for _ in range(100):
    ...     n = random.randint(1,20)
    ...     l = [random.random() * 10 - 5 for __ in range(n)]
    ...     for nfn, lfn in [(numpy.mean, mean), (numpy.median, median),
    ...                      (numpy.var, var), (numpy.std, sd)]:
    ...         a, b = nfn(l), lfn(l)
    ...         if abs(a - b) > 1e-12:
    ...             print "%s failed:" % lfn.__name__, l, a - b

    """

def __doctests_posetchains():
    """
    >>> from diles.learn import Label
    >>> fn = lambda *l: posetchains([Label(x) for x in l])
    >>> for c in fn(["a"], ["b"], ["a", "b"], ["a.b"]):
    ...     print c
    [{'b'}, {'a', 'b'}]
    [{'a.b'}, {'a'}, {'a', 'b'}]
    >>> for c in fn(["a"], ["a.d"], ["a.b.c"], ["a", "b"], ["a.b"]):
    ...     print c
    [{'a.d'}, {'a'}, {'a', 'b'}]
    [{'a.b.c'}, {'a.b'}, {'a'}, {'a', 'b'}]
    >>> for c in fn([], [0], [1], [0, 1], [2], [0, 2], [3], [0, 2, 3]):
    ...     print c
    [{}, {0}, {0, 1}]
    [{}, {0}, {0, 2}, {0, 2, 3}]
    [{}, {1}, {0, 1}]
    [{}, {2}, {0, 2}, {0, 2, 3}]
    [{}, {3}, {0, 2, 3}]

    >>> p = [frozenset(x) for x in ("a", "ab", "abc", "abcd", "abcde",
    ...                             "ax", "ay", "au", "av", "b", "c")]
    >>> for c in majorposetchains(p):
    ...    print ["".join(x) for x in c]
    ['a', 'ab', 'acb', 'acbd', 'acbed']
    ['b', 'ab', 'acb', 'acbd', 'acbed']
    ['c', 'acb', 'acbd', 'acbed']
    >>> for c in majorposetchains(p, quartile=3):
    ...    print ["".join(x) for x in c]
    ['a', 'ab', 'acb', 'acbd', 'acbed']
    ['b', 'ab', 'acb', 'acbd', 'acbed']

    """