"""Decision tree learning methods. This implementation is based on example code provided by Christopher Roach in his article *Building Decision Trees in Python* (http://onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html). """ from __future__ import division from itertools import izip import collections import math from diles.learn import Label, Learner # ============================================================================= # information gain utilities (ID3 heuristic) # ============================================================================= def entropy(data, target_attr): """ Calculates the entropy of the given data set for the target attribute. """ val_freq = {} data_entropy = 0.0 # Calculate the frequency of each of the values in the target attr for record in data: if (val_freq.has_key(record[target_attr])): val_freq[record[target_attr]] += 1.0 else: val_freq[record[target_attr]] = 1.0 # Calculate the entropy of the data for the target attribute for freq in val_freq.values(): data_entropy += (-freq/len(data)) * math.log(freq/len(data), 2) return data_entropy def gain(data, attr, target_attr): """ Calculates the information gain (reduction in entropy) that would result by splitting the data on the chosen attribute (attr). """ val_freq = {} subset_entropy = 0.0 # Calculate the frequency of each of the values in the target attribute for record in data: if (val_freq.has_key(record[attr])): val_freq[record[attr]] += 1.0 else: val_freq[record[attr]] = 1.0 # Calculate the sum of the entropy for each subset of records weighted # by their probability of occuring in the training set. for val in val_freq.keys(): val_prob = val_freq[val] / sum(val_freq.values()) data_subset = [record for record in data if record[attr] == val] subset_entropy += val_prob * entropy(data_subset, target_attr) # Subtract the entropy of the chosen attribute from the entropy of the # whole data set with respect to the target attribute (and return it) return (entropy(data, target_attr) - subset_entropy) # ============================================================================= # tree builder and classifier utilities # ============================================================================= def majority_value(data, target_attr): """ Creates a list of all values in the target attribute for each record in the data list object, and returns the value that appears in this list the most frequently. """ data = data[:] return most_frequent([record[target_attr] for record in data]) def most_frequent(lst): """ Returns the item that appears most frequently in the given list. """ lst = lst[:] highest_freq = 0 most_freq = None for val in unique(lst): if lst.count(val) > highest_freq: most_freq = val highest_freq = lst.count(val) return most_freq def unique(lst): """ Returns a list made up of the unique values found in lst. i.e., it removes the redundant values in lst. """ lst = lst[:] unique_lst = [] # Cycle through the list and add each value to the unique list only once. for item in lst: if unique_lst.count(item) <= 0: unique_lst.append(item) # Return the list with all redundant values removed. return unique_lst def get_values(data, attr): """ Creates a list of values in the chosen attribut for each record in data, prunes out all of the redundant values, and return the list. 
""" data = data[:] return unique([record[attr] for record in data]) def choose_attribute(data, attributes, target_attr, fitness): """ Cycles through all the attributes and returns the attribute with the highest information gain (or lowest entropy). """ data = data[:] best_gain = 0.0 best_attr = None for attr in attributes: gain = fitness(data, attr, target_attr) if (gain >= best_gain and attr != target_attr): best_gain = gain best_attr = attr return best_attr def get_examples(data, attr, value): """ Returns a list of all the records in with the value of matching the given value. """ data = data[:] rtn_lst = [] if not data: return rtn_lst else: record = data.pop() if record[attr] == value: rtn_lst.append(record) rtn_lst.extend(get_examples(data, attr, value)) return rtn_lst else: rtn_lst.extend(get_examples(data, attr, value)) return rtn_lst def get_classification(record, tree): """ This function recursively traverses the decision tree and returns a classification for the given record. """ # If the current node is a string, then we've reached a leaf node and # we can return it as our answer #if type(tree) == type("string"): # return tree if type(tree) != dict: return tree # a leaf # Traverse the tree further until a leaf node is found. else: attr = tree.keys()[0] t = tree[attr][record[attr]] return get_classification(record, t) def classify(tree, data): """ Returns a list of classifications for each of the records in the data list as determined by the given decision tree. """ data = data[:] classification = [] for record in data: classification.append(get_classification(record, tree)) return classification def create_decision_tree(data, attributes, target_attr, fitness_func): """ Returns a new decision tree based on the examples given. """ data = data[:] vals = [record[target_attr] for record in data] default = majority_value(data, target_attr) # If the dataset is empty or the attributes list is empty, return the # default value. When checking the attributes list for emptiness, we # need to subtract 1 to account for the target attribute. if not data or (len(attributes) - 1) <= 0: return default # If all the records in the dataset have the same classification, # return that classification. elif vals.count(vals[0]) == len(vals): return vals[0] else: # Choose the next best attribute to best classify our data best = choose_attribute(data, attributes, target_attr, fitness_func) # Create a new decision tree/node with the best attribute and an empty # dictionary object--we'll fill that up next. # We use the collections.defaultdict function to add a function to the # new tree that will be called whenever we query the tree with an # attribute that does not exist. This way we return the default value # for the target attribute whenever, we have an attribute combination # that wasn't seen during training. tree = {best:collections.defaultdict(lambda: default)} # Create a new decision tree/sub-node for each of the values in the # best attribute field for val in get_values(data, best): # Create a subtree for the current value under the "best" field subtree = create_decision_tree( get_examples(data, best, val), [attr for attr in attributes if attr != best], target_attr, fitness_func) # Add the new subtree to the empty dictionary object in our new # tree/node we just created. 


# =============================================================================
# data converter utilities
# =============================================================================

TARGET_ATTR = 'l'

def convert_data(samples, labels):
    """Convert data from the diles format to the format supported by the DT
    learner."""
    attributes = ['a%02d' % i for i in range(len(samples[0]))]
    attributes.append(TARGET_ATTR)
    data = []
    for sample, label in izip(samples, labels):
        values = list(sample) + [label]
        dic = dict((a, v) for a, v in izip(attributes, values))
        data.append(dic)
    return attributes, data


# =============================================================================
# learner class
# =============================================================================

class DecisionTreeLearner(Learner):
    """Learner using a decision tree classifier (ID3 algorithm)."""

    id = "DT"

    paramspace = {
        'fitness': ['gain'],
    }

    fitfuncs = {
        'gain': gain,
    }

    def __init__(self, fitness):
        """Create the learner using the named fitness function."""
        super(DecisionTreeLearner, self).__init__()
        self.fitness = self.fitfuncs[fitness]

    def train(self, samples, labels, fwl=None, fwt=0):
        """See :meth:`diles.learn.Learner.train`."""
        self.fwl = fwl or [1] * len(samples[0])
        self.fwt = fwt
        samples = [[x for w, x in izip(self.fwl, s) if w > fwt]
                   for s in samples]
        self.attributes, data = convert_data(samples, labels)
        self.model = create_decision_tree(data, self.attributes, TARGET_ATTR,
                                          self.fitness)

    def predict(self, sample):
        """See :meth:`diles.learn.Learner.predict`.

        The decision tree yields a single label without a ranking, so the
        returned ranking is ``None`` and the returned confidence is a fixed
        placeholder value.
        """
        sample = [x for w, x in izip(self.fwl, sample) if w > self.fwt]
        data = [dict((a, v) for a, v in izip(self.attributes[:-1], sample))]
        label = classify(self.model, data)[0]
        return label, 2/3, None


# =============================================================================
# tests
# =============================================================================

def __doctests_convert_data():
    """
    >>> samples = ["aaabbb", "aaaccc", "ababac"]
    >>> labels = "xyx"
    >>> attributes, data = convert_data(samples, labels)
    >>> for d in data:
    ...     print sorted(d.items())
    [('a00', 'a'), ('a01', 'a'), ('a02', 'a'), ('a03', 'b'), ('a04', 'b'), ('a05', 'b'), ('l', 'x')]
    [('a00', 'a'), ('a01', 'a'), ('a02', 'a'), ('a03', 'c'), ('a04', 'c'), ('a05', 'c'), ('l', 'y')]
    [('a00', 'a'), ('a01', 'b'), ('a02', 'a'), ('a03', 'b'), ('a04', 'a'), ('a05', 'c'), ('l', 'x')]
    >>> print attributes
    ['a00', 'a01', 'a02', 'a03', 'a04', 'a05', 'l']

    >>> samples = [("aaa", "bbb"), ("aaa", "ccc"), ("aba", "bac")]
    >>> labels = [("x.x", "x.y"), ("x.x", "x.z"), ("x.y", "x.z")]
    >>> labels = [Label(x) for x in labels]
    >>> attributes, data = convert_data(samples, labels)
    >>> for d in data:
    ...     print sorted(d.items())
    [('a00', 'aaa'), ('a01', 'bbb'), ('l', {'x.x', 'x.y'})]
    [('a00', 'aaa'), ('a01', 'ccc'), ('l', {'x.x', 'x.z'})]
    [('a00', 'aba'), ('a01', 'bac'), ('l', {'x.y', 'x.z'})]
    >>> print attributes
    ['a00', 'a01', 'l']
    """
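

# A quick illustration (not part of the original test suite) of what the
# learned model looks like: a nested dict keyed by the generated attribute
# names ('a00', 'a01', ...), with labels at the leaves.
def __doctests_model_structure():
    """
    >>> l = DecisionTreeLearner(fitness="gain")
    >>> l.train(["ab", "ac"], "xy")
    >>> sorted(l.model.keys())
    ['a01']
    >>> l.model['a01']['b'], l.model['a01']['c']
    ('x', 'y')
    """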


def __doctests_decision_tree_learner():
    """
    Tests on small training sets.  Attribute value combinations that were not
    seen during training fall back to a majority label:

    >>> l = DecisionTreeLearner(fitness="gain")
    >>> samples = "aab"
    >>> labels = 'xxy'
    >>> l.train(samples, labels)
    >>> l.predict("a")
    ('x', 0.666..., None)
    >>> l.predict("b")
    ('y', 0.666..., None)
    >>> l.predict("c")
    ('...', 0.666..., None)

    >>> samples = "aa", "ab", "bc"
    >>> labels = 'x', 'x', 'y'
    >>> l.train(samples, labels)
    >>> l.predict("aa")
    ('x', 0.666..., None)
    >>> l.predict("ab")
    ('x', 0.666..., None)
    >>> l.predict("bc")
    ('y', 0.666..., None)
    >>> l.predict("ac")
    ('...', 0.666..., None)

    >>> samples = "aa", "ab", "bb", "ab"
    >>> labels = 'x', 'x', 'x', 'y'
    >>> labels = [Label(x) for x in labels]
    >>> l.train(samples, labels)
    >>> l.predict("aa")
    ({'x'}, 0.666..., None)
    >>> l.predict("ab")
    ({'x'}, 0.666..., None)
    >>> l.predict("ac")
    (..., 0.666..., None)

    >>> from diles.tests import TSETS
    >>> l.train(*TSETS['easy'])
    >>> l.predict("a")
    ({'0'}, 0.666..., None)
    >>> l.predict("b")
    ({'1'}, 0.666..., None)

    >>> l.train(*TSETS['moderate'])
    >>> l.predict("ac")
    ({'x'}, 0.666..., None)
    >>> l.predict("bc")
    ({'y'}, 0.666..., None)
    >>> l.predict("ab")
    ({'x'}, 0.666..., None)
    >>> l.predict("bb")
    ({'y'}, 0.666..., None)

    >>> samples = 'ab', 'ac', 'ad', 'bb', 'bd'
    >>> labels = 1, 0, 1, 0, 1
    >>> l.train(samples, labels)
    >>> l.predict("aa")
    (1, 0.666..., None)
    >>> l.predict("xd")
    (1, 0.666..., None)
    >>> l.predict("xc")
    (0, 0.666..., None)

    With a feature weight list that masks out the first feature, only the
    second feature is used:

    >>> samples = 'ab', 'ac', 'ad', 'bb', 'bd'
    >>> labels = 1, 0, 1, 0, 1
    >>> l.train(samples, labels, fwl=[0, 1])
    >>> l.predict("aa")
    (..., 0.666..., None)
    >>> l.predict("xd")
    (1, 0.666..., None)
    >>> l.predict("xc")
    (0, 0.666..., None)
    """
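

# The doctests above rely on ellipsis matching (e.g. ``0.666...``), so a
# runner needs the ELLIPSIS option enabled.  A minimal sketch (this block is
# not part of the original module):
if __name__ == '__main__':
    import doctest
    doctest.testmod(optionflags=doctest.ELLIPSIS)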