"""Learning related functionality: learning algorithms, optimization functions, performance evaluation functions and metrics. """ from __future__ import division from functools import wraps from itertools import combinations, product import time from types import MethodType _eval = globals()['__builtins__']['eval'] # FIXME: need to rename submodule `eval` def trackruntime(meth): runtimes = [] if getattr(meth, 'tracked', False): return meth # already tracked setattr(meth.im_self, "runtimes_%s" % meth.__name__, runtimes) @wraps(meth) def wrapper(selph, *args, **kwds): start = time.time() ret = meth(selph, *args, **kwds) runtimes.append(time.time() - start) return ret wrapper.tracked = True return wrapper class Learner(object): """General interface of a learner.""" def train(self, samples, labels, fwl=None, fwt=0): """Train using the given samples and labels. Each sample in `samples` is a list of feature values. `fwl` is a list of values between 0 and 1 (inclusive) specifying weights for individual features. `fwt` is a feature weight threshold. All features with a weight less than or equal to `fwt` are ignored. """ raise NotImplementedError def predict(self, sample): """Predict the label for the given sample. Returns: - predicted label - confidence in prediction, between 0 and 1 (exclusive), not comparable across different learner implementations - a prediction ranking of all known labels (also not comparable across different learners), e.g. ``[('a', 0.6), ('b', 0.5), ...]`` """ raise NotImplementedError _labelpool = {} class Label(frozenset): """A simple wrapper class for label objects. Learners handling (hierarchical) multi-label problems expect labels to be of this type. Practically Label objects are pooled frozen sets. Iterable constructor arguments are interpretaed as sets, all others are interpretated as one-element sets: >>> Label('1') {'1'} >>> Label('12') {'1', '2'} >>> Label(['12']) {'12'} >>> Label(1) {1} >>> Label(12) {12} >>> Label([1, 2]) {1, 2} Labels are pooled and not labelized again: >>> l = Label([1, 2]) >>> l is Label(l) True >>> l is Label([1,2]) True >>> hash(l) == hash(Label([1,2])) True >>> Label([1,2]) in {l:l} True Greater and lesser comparisons consider hierarchical structures where a label item `a` is considered to be greater than `a.b` because the latter one semantically expresses a subset of `a`: >>> Label(['a']) > Label([]) True >>> Label(['a']) > Label(['']) False >>> Label(['a']) > Label(['a']) False >>> Label(['a']) > Label(['a.b']) True >>> Label(['a']) > Label(['a.b', 'b']) False >>> Label(['a', 'b']) > Label(['a.b', 'b']) True >>> Label(['a', 'b.c']) > Label(['a.b', 'b']) False >>> Label(['a', 'b.c']) < Label(['a.b', 'b']) False >>> Label(['a', 'b.c', 'c']) > Label(['a.b', 'b.c.d']) True Hierarchical unions and intersections: >>> Label(['a', 'b.c', 'c']).hunion(Label(['a.b', 'b.c.d'])) {'a', 'b.c', 'c'} >>> Label(['a', 'b.c', 'c']).hunion(Label([1])) {1, 'a', 'b.c', 'c'} >>> Label(['a', 'b.c']).hunion(Label(['a.b', 'b'])) {'a', 'b'} >>> Label(['a', 'b.c']).hunion(Label(['a.b', 'b']), Label(['a.b', 'b.c'])) {'a', 'b'} >>> Label(['a.b', 'b.c']).hunion(Label(['a.b', 'b']), Label(['a.d', 'c'])) {'a.b', 'a.d', 'b', 'c'} >>> Label(['a', 'b.c', 'c']).hintersection(Label(['a.b', 'b.c.d'])) {'a.b', 'b.c.d'} >>> Label([1, 'a', 'b.c', 'c']).hintersection(Label([1])) {1} >>> Label(['a', 'b.c']).hintersection(Label(['a.b', 'b'])) {'a.b', 'b.c'} >>> Label(['a.x', 'b.c']).hintersection(Label(['a.b', 'b', 'a.d'])) {'b.c'} >>> Label(['a.x', 'b.c']).hintersection(Label(['a.b', 'b', 'a.d']), Label(['b.d'])) {} >>> Label(['a.x', 'b.c']).hintersection(Label(['a.b', 'b', 'a.d']), Label(['b.c'])) {'b.c'} >>> Label(['a.b.c']).hintersection(Label(['a.b']), Label(['a'])) {'a.b.c'} >>> Label(['a']).hintersection(Label(['a.b']), Label(['a.b.c'])) {'a.b.c'} Labels may be created using their string representations: >>> Label.fromstring("{'a.b.c'}") {'a.b.c'} >>> Label.fromstring("{'a', 1, 2, 'a'}") {1, 2, 'a'} >>> Label.fromstring("{}") {} """ @staticmethod def fromstring(s): s = "[%s]" % s[1:-1] return Label(_eval(s, {}, {})) def __new__(cls, obj): if type(obj) == Label: return obj try: l = super(Label, cls).__new__(cls, obj) except TypeError: l = super(Label, cls).__new__(cls, (obj,)) return _labelpool.setdefault(l, l) def __gt__(self, other): if super(Label, self).__gt__(other): return True if super(Label, self).__le__(other): return False for x in other.difference(self): try: levels = x.split('.') except AttributeError: return False for n in range(1, len(levels)): if ".".join(levels[:n]) in self: break else: return False return True def __lt__(self, other): return other.__gt__(self) def __str__(self): return "{%s}" % ", ".join([repr(x) for x in sorted(self)]) def __repr__(self): return str(self) def hunion(self, *others): """Hierarchical union.""" union = set(self).union(*others) for x, y in tuple(combinations(union, 2)): try: if x.startswith("%s." % y): union.remove(x) elif y.startswith("%s." % x): union.remove(y) except AttributeError: # x or y not a string pass return Label(union) def hintersection(self, *others): """Hierarchical intersection.""" def two(a, b): intersection = set() for x, y in product(a, b): try: if x == y: intersection.add(x) elif x.startswith("%s." % y): intersection.add(x) elif y.startswith("%s." % x): intersection.add(y) except AttributeError: # x or y not a string pass return intersection return Label(reduce(two, (self,) + others)) def expanded(self): xlab = set() for x in self: xlab.add(x) try: levels = x.split('.') except AttributeError: pass else: for n in range(1, len(levels)): xlab.add(".".join(levels[:n])) return Label(xlab) def __doctests(): """ Check if pooling also works with pickle: >>> import pickle >>> l = Label(1) >>> s = pickle.dumps(l) >>> _labelpool.clear() >>> _labelpool {} >>> l = pickle.loads(s) >>> _labelpool {{1}: {1}} >>> l is Label(1) True >>> l is pickle.loads(s) True >>> l = Label(["a.b.c", "a", "b"]) >>> l.expanded() {'a', 'a.b', 'a.b.c', 'b'} """