"""Case based reasoning learning methods."""

from __future__ import division

import math

from diles.learn import Learner
from diles.learn.util import euclidean_dist, normalized, ratioconf # pylint: disable-msg=E0611


class KNNLearner(Learner):
    """A k-nearest neighbor learner using distance-weighted votes."""

    # Candidate values for each constructor parameter.
    paramspace = {'k': range(2, 5), 'dwsigma': [2**x for x in range(-2, 3)]}

    def __init__(self, k=3, dwsigma=2.0):
        """Create a learner which lets the `k` closest neighbors vote.

        Every vote is weighted by the neighbor's distance through a
        Gaussian function parameterized by `dwsigma`. A greater *sigma*
        gives farther neighbors more weight, a smaller *sigma* reduces
        their influence.

        """
        super(KNNLearner, self).__init__()
        self.k = k
        self.dwsigma = dwsigma

    def train(self, samples, labels, fwl=None, fwt=0):
        """See :meth:`diles.learn.Learner.train`.

        Training only memorizes its arguments. If `fwl` is missing (or
        empty), each feature of the first sample gets a weight of ``1``.

        """
        self.samples = samples
        self.labels = labels
        if fwl:
            self.fwl = fwl
        else:
            # one unit weight per feature of the first sample
            self.fwl = [1] * len(samples[0])
        self.fwt = fwt

    def predict(self, sample):
        """See :meth:`diles.learn.Learner.predict`.

        The ranking values returned form a voting-based probability
        distribution over the labels seen in training.

        """

        def vote_weight(neighbor):
            """Gaussian weight of the distance to a training sample."""
            dist = euclidean_dist(sample, neighbor, self.fwl, self.fwt)
            return math.e ** (-dist ** 2 / (2 * self.dwsigma ** 2))

        # (weight, index) pairs of all training samples, greatest first;
        # equal weights are ordered by descending sample index
        weighted = []
        for index, neighbor in enumerate(self.samples):
            weighted.append((vote_weight(neighbor), index))
        weighted.sort(reverse=True)
        voters = weighted[:self.k]

        # every known label starts at zero, then the `k` voters add
        # their weight to the label they carry
        votes = dict.fromkeys(self.labels, 0)
        for weight, index in voters:
            votes[self.labels[index]] += weight

        distribution = normalized(votes.items())
        ranking = sorted(distribution, key=lambda pair: pair[1], reverse=True)

        best = ranking[0][0]
        conf = ratioconf(ranking) # TODO: why not ranking[0][1]?
        return best, conf, ranking


class GuessLearner(Learner):
    """A guessing learner which always predicts the most frequent label."""

    id = "GS"
    paramspace = {}

    def train(self, samples, labels, fwl=None, fwt=0):
        """See :meth:`diles.learn.Learner.train`.

        Only `labels` is looked at: the prediction is derived from the
        label frequencies once and stored for later use.

        """
        counts = []
        for lbl in set(labels):
            counts.append((lbl, labels.count(lbl)))
        ranking = sorted(normalized(counts), key=lambda pair: pair[1],
                         reverse=True)
        best = ranking[0][0]
        conf = ratioconf(ranking)
        self.prediction = best, conf, ranking

    def predict(self, sample): # pylint: disable-msg=W0613
        """See :meth:`diles.learn.Learner.predict`.

        The sample is ignored, the result computed while training is
        returned. Its ranking is a probability distribution.

        """
        return self.prediction


# =============================================================================
# tests
# =============================================================================

# Doctest container for KNNLearner; the function itself does nothing.
def __doctests_knn():
    """
    >>> import random

    >>> samples = 'ab', 'ac', 'ad', 'bb', 'bd'
    >>> labels = 0, 0, 1, 0, 1
    >>> l1 = KNNLearner(k=1)
    >>> l2 = KNNLearner(k=2)
    >>> l3 = KNNLearner(k=3)

    >>> l1.train(samples, labels)
    >>> l1.predict('ab')
    (0, 0.9999..., [(0, 1.0), (1, 0.0)])
    >>> l2.train(samples, labels)
    >>> l2.predict('ab')
    (0, 0.9999..., [(0, 1.0), (1, 0.0)])
    >>> l3.train(samples, labels)
    >>> l3.predict('ab')
    (0, 0.53..., [(0, 0.680...), (1, 0.319...)])

    >>> l1.train(samples, labels)
    >>> l1.predict('bd')
    (1, 0.9999..., [(1, 1.0), (0, 0.0)])
    >>> l2.train(samples, labels)
    >>> l2.predict('bd')
    (1, 0.11..., [(1, 0.531...), (0, 0.468...)])
    >>> l3.train(samples, labels)
    >>> l3.predict('bd')
    (1, 0.53..., [(1, 0.680...), (0, 0.319...)])

    >>> l1.train(samples, labels)
    >>> l1.predict('bc')
    (1, 0.9999..., [(1, 1.0), (0, 0.0)])
    >>> l2.train(samples, labels)
    >>> l2.predict('bc')
    (0, ...e-16, [(0, 0.5), (1, 0.5)])
    >>> l3.train(samples, labels)
    >>> l3.predict('bc')
    (0, 0.5, [(0, 0.666...), (1, 0.333...)])

    Test different distance weight configurations:

    >>> samples = 'abc', 'abc', 'aef'
    >>> labels = 0, 0, 1

    >>> l = KNNLearner(k=3, dwsigma=0.5)
    >>> l.train(samples, labels)
    >>> l.predict('def')
    (1, 0.96..., [(1, 0.964...), (0, 0.035...)])

    >>> l = KNNLearner(k=3, dwsigma=1.0)
    >>> l.train(samples, labels)
    >>> l.predict('def')
    (1, 0.26..., [(1, 0.576...), (0, 0.423...)])

    >>> l = KNNLearner(k=3, dwsigma=2.0)
    >>> l.train(samples, labels)
    >>> l.predict('def')
    (0, 0.35..., [(0, 0.609...), (1, 0.390...)])

    >>> l = KNNLearner(k=3, dwsigma=3.0)
    >>> l.train(samples, labels)
    >>> l.predict('def')
    (0, 0.44..., [(0, 0.641...), (1, 0.358...)])

    Test feature weights:

    >>> l = KNNLearner(k=3, dwsigma=0.5)
    >>> samples = "ab", "ab", "ba"
    >>> labels = 'x', 'x', 'y'

    >>> l.train(samples, labels, [1,1])
    >>> l.predict("bb") # second feature says it is `x`
    ('x', 0.5, [('x', 0.666...), ('y', 0.333...)])

    >>> l.train(samples, labels, [1,0.1])
    >>> l.predict("bb") # second feature unimportant -> `y`
    ('y', 0.72..., [('y', 0.783...), ('x', 0.216...)])

    """


# Doctest container for GuessLearner; the function itself does nothing.
def __doctests_guess():
    """
    >>> import random

    >>> samples = "abcde"
    >>> labels = [1, 1, 0, 0, 0]
    >>> l = GuessLearner()

    >>> l.train(samples, labels)
    >>> l.predict(random.sample("abcdefghij", 1))
    (0, 0.33..., [(0, 0.599...), (1, 0.400...)])

    >>> l.train(samples[:3], labels[:3])
    >>> l.predict(random.sample("abcdefghij", 1))
    (1, 0.5, [(1, 0.666...), (0, 0.333...)])

    """