"""Simple statistical functions (less powerful but faster than their *numpy* equivalents). """ from __future__ import division from math import modf, floor, sqrt from itertools import product, combinations try: import scipy.stats ttable = scipy.stats.t.ppf except ImportError: def ttable(*_args): raise NotImplementedError # ============================================================================= # standard statistics for list of real values # ============================================================================= def mean(l): """Mean of a finite, non-empty list of real or integer values.""" return sum(l) / len(l) def median(l, issorted=False): """Median of a finite, non-empty list of real or integer values. If `sorted` is ``True``, `l` is assumed to be sorted already. """ n = len(l) if n == 1: return l[0] l = l if issorted else sorted(l) if n % 2: return l[int(n/2)] else: x = int(n/2-1) return l[x] + 0.5 * abs(l[x+1] - l[x]) def quartiles(l, issorted=False): """Quartiles of a finite, non-empty list of real or integer values. >>> for x in range(2,12): ... print range(1,x), "->", quartiles(range(1,x)) [1] -> (1, 1, 1) [1, 2] -> (1.25, 1.5, 1.75) [1, 2, 3] -> (1.5, 2, 2.5) [1, 2, 3, 4] -> (1.75, 2.5, 3.25) [1, 2, 3, 4, 5] -> (2, 3, 4) [1, 2, 3, 4, 5, 6] -> (2.25, 3.5, 4.75) [1, 2, 3, 4, 5, 6, 7] -> (2.5, 4, 5.5) [1, 2, 3, 4, 5, 6, 7, 8] -> (2.75, 4.5, 6.25) [1, 2, 3, 4, 5, 6, 7, 8, 9] -> (3, 5, 7) [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] -> (3.25, 5.5, 7.75) >>> quartiles([1, 1, 1, 2]) (1.0, 1.0, 1.25) >>> quartiles([0, 0.25, 0.5, 0.75, 1.0]) (0.25, 0.5, 0.75) >>> quartiles([0, 0, 0.25, 0.5, 0.75, 1.0]) (0.0625, 0.375, 0.6875) >>> quartiles([6, 47, 49, 15, 42, 41, 7, 39, 43, 40, 36]) (25.5, 40, 42.5) >>> quartiles([5, 3, 2, 2, 2]) (2, 2, 3) """ def quantile(x, q, qtype=7, issorted=False): """Compute quantile `q` for samples `x`, using algorithm `qtype`. References: http://reference.wolfram.com/mathematica/ref/Quantile.html http://wiki.r-project.org/rwiki/doku.php?id=rdoc:stats:quantile Author: Ernesto P.Adorio Ph.D. UP Extension Program in Pampanga, Clark Field. ----------------------------------------------------------------------- Source: from http://adorio-research.org/wordpress/?p=125 """ if not issorted: y = sorted(x) else: y = x assert 1 <= qtype <= 9 # Parameters for the Hyndman and Fan algorithm (needs float division!) abcd = ( (0, 0, 1, 0), # inverse empirical distrib.function, R type 1 (1/2, 0, 1, 0), # similar to type 1, averaged, R type 2 (1/2, 0, 0, 0), # nearest order statistic, (SAS) R type 3 (0, 0, 0, 1), # California linear interpolation, R type 4 (1/2, 0, 0, 1), # hydrologists method, R type 5 (0, 1, 0, 1), # mean-based estimate (Weibull method), (SPSS, Minitab), R type 6 (1, -1, 0, 1), # mode-based method, (S, S-Plus), R type 7 (1/3, 1/3, 0, 1), # median-unbiased , R type 8 (3/8, 1/4, 0, 1), # normal-unbiased, R type 9. ) a, b, c, d = abcd[qtype-1] n = len(x) g, j = modf( a + (n+b) * q -1) if j < 0: return y[0] elif j >= n: return y[n-1] # oct. 8, 2010 y[n]???!! uncaught off by 1 error!!! j = int(floor(j)) if g == 0: return y[j] else: return y[j] + (y[j+1]- y[j])* (c + d * g) return tuple(quantile(l, x, issorted=issorted) for x in (0.25, 0.5, 0.75)) def var(l): """Variance of a finite, non-empty list of real or integer values. Assumes same probability for each element (i.e. E=1/N). """ m = mean(l) return sum([(x - m)**2 for x in l]) / len(l) def sd(l): """Standard deviation of a finite, non-empty list of real or integer values. Assumes same probability for each element (i.e. E=1/N). """ return sqrt(var(l)) def ci(l, cl=0.95): """Confidence interval. Note: uses *sample* (not population) standard deviation (see http://en.wikipedia.org/wiki/Standard_deviation#Estimation) """ assert len(l) > 1 m = mean(l) sample_sd = sqrt(sum((x - m)**2 for x in l) / (len(l) - 1)) t = ttable((1 + cl) / 2, len(l) - 1) error = sample_sd / sqrt(len(l)) * t return error # ============================================================================= # poset analysis functions # ============================================================================= def posetwidth(poset): """Calculate width of given poset. >>> posetwidth(set()) 0 >>> posetwidth([[]]) 1 >>> posetwidth([[], [0], [1], [0,1]]) 2 >>> posetwidth([[], [0], [0,1]]) 1 >>> posetwidth([[1], [0, 1], [2], [3]]) 3 >>> posetwidth([[0], [1], [2], [3]]) 4 >>> posetwidth([[], [0], [1], [2], [3]]) 4 >>> posetwidth([[], [0], [1], [0, 1], [2], [0, 2], [0, 2, 3]]) 3 >>> posetwidth([[], [0], [1], [0, 1], [2], [0, 2], [3], [0, 2, 3]]) 4 >>> posetwidth([[], [3], [0, 2, 3], [2], [0, 2], [0, 1, 2]]) 2 >>> posetwidth([[0], [1], [2], [0, 1], [2, 3], [0, 1, 2, 3]]) 3 """ def hasnext(i): if i < 0: return True if visited[i]: return False visited[i] = True for j in indice: if gt[i][j]: if hasnext(pv[j]): pv[j] = i return True return False poset = tuple(frozenset(x) for x in poset) l = len(poset) indice = tuple(range(l)) gt = [[False] * l for _ in poset] for i, j in product(indice, indice): gt[i][j] = poset[i] > poset[j] # if elements are of class Label, then # the size comparison recognizes # hierarchical relations pv = [-1] * l visited = [False] * l width = 0 for i in indice: for j in indice: visited[j] = False if not hasnext(i): width += 1 return width def _minimaxi(poset, cop): """Get minimal or maximal elements of `poset`, depending on the compare operator `cop`. """ remove = set() for x1, x2 in combinations(poset, 2): if cop(x1, x2): remove.add(x1) elif cop(x2, x1): remove.add(x2) return tuple(x for x in poset if x not in remove) def minimals(poset, itemgetter=lambda x: x): """Get the minimal elements of a poset. Elements in `poset` either must be of type `frozenset` ... >>> s = [frozenset(x) for x in ("abc", "ab", "abd")] >>> sorted(minimals(s)) [frozenset(['a', 'b'])] ... or some other type where `itemgetter` provides the frozenset needed by this function: >>> s = [(frozenset(x), "foo") for x in ("abc", "ab", "abd")] >>> ig = lambda x: x[0] >>> sorted(minimals(s, itemgetter=ig)) [(frozenset(['a', 'b']), 'foo')] """ cop = lambda a, b: itemgetter(a) > itemgetter(b) return _minimaxi(poset, cop) def maximals(poset, itemgetter=lambda x: x): """Get the maximal elements of a poset. Elements in `poset` either must be of type `frozenset` ... >>> s = [frozenset(x) for x in ("abc", "ab", "abd")] >>> sorted(maximals(s)) [frozenset(['a', 'c', 'b']), frozenset(['a', 'b', 'd'])] ... or some other type where `itemgetter` provides the frozenset needed by this function: >>> s = [(frozenset(x), "foo") for x in ("abc", "ab", "abd")] >>> ig = lambda x: x[0] >>> sorted(maximals(s, itemgetter=ig)) [(frozenset(['a', 'c', 'b']), 'foo'), (frozenset(['a', 'b', 'd']), 'foo')] """ cop = lambda a, b: itemgetter(a) < itemgetter(b) return _minimaxi(poset, cop) def posetchains(poset): """Get all maximal chains of `poset`.""" def subchains(prefix): children = minimals([x for x in poset if not prefix or x > prefix[-1]]) if children: for child in children: for chain in subchains(prefix + [child]): yield chain else: yield prefix return subchains([]) def majorposetchains(poset, quartile=None): """Get those maximal chains of poset whose length is at least the mean of all chain lengths or whose length is in the given `quartile` (1st, 2nd, or 3rd) of chain lengths. """ chains = [(c, len(c)) for c in posetchains(poset)] if quartile is None: t = mean([l for _, l in chains]) else: t = quartiles([l for _, l in chains])[quartile-1] return [c for c, l in chains if l >= t] def inchain(chain, elem): """Check if an element is part of a chain. >>> inchain([set([0]), set([0,1,2])], set([2])) False >>> inchain([set([0]), set([0,1,2])], set([0,2])) True """ return elem in chain or all(elem < x or elem > x for x in chain) def atchain(chain, elem): """Check if an element is connected to a chain. >>> atchain([set([0]), set([0,1,2])], set([2])) True >>> atchain([set([0]), set([0,1,2])], set([0,2])) True >>> atchain([set([0]), set([0,1,2])], set([3])) False """ return elem in chain or any(elem < x or elem > x for x in chain) # ============================================================================= # tests # ============================================================================= def __doctests_statfuncs(): """ Compare with numpy: >>> import random >>> import numpy >>> random.seed(2010) >>> for _ in range(100): ... n = random.randint(1,20) ... l = [random.random() * 10 - 5 for __ in range(n)] ... for nfn, lfn in [(numpy.mean, mean), (numpy.median, median), ... (numpy.var, var), (numpy.std, sd)]: ... a, b = nfn(l), lfn(l) ... if abs(a - b) > 1e-12: ... print "%s failed:" % lfn.__name__, l, a - b """ def __doctests_posetchains(): """ >>> from diles.learn import Label >>> fn = lambda *l: posetchains([Label(x) for x in l]) >>> for c in fn(["a"], ["b"], ["a", "b"], ["a.b"]): ... print c [{'b'}, {'a', 'b'}] [{'a.b'}, {'a'}, {'a', 'b'}] >>> for c in fn(["a"], ["a.d"], ["a.b.c"], ["a", "b"], ["a.b"]): ... print c [{'a.d'}, {'a'}, {'a', 'b'}] [{'a.b.c'}, {'a.b'}, {'a'}, {'a', 'b'}] >>> for c in fn([], [0], [1], [0, 1], [2], [0, 2], [3], [0, 2, 3]): ... print c [{}, {0}, {0, 1}] [{}, {0}, {0, 2}, {0, 2, 3}] [{}, {1}, {0, 1}] [{}, {2}, {0, 2}, {0, 2, 3}] [{}, {3}, {0, 2, 3}] >>> p = [frozenset(x) for x in ("a", "ab", "abc", "abcd", "abcde", ... "ax", "ay", "au", "av", "b", "c")] >>> for c in majorposetchains(p): ... print ["".join(x) for x in c] ['a', 'ab', 'acb', 'acbd', 'acbed'] ['b', 'ab', 'acb', 'acbd', 'acbed'] ['c', 'acb', 'acbd', 'acbed'] >>> for c in majorposetchains(p, quartile=3): ... print ["".join(x) for x in c] ['a', 'ab', 'acb', 'acbd', 'acbed'] ['b', 'ab', 'acb', 'acbd', 'acbed'] """