"""Decision tree learning methods.

This implementation is based on example code provided by Christopher Roach
in his article *Building Decision Trees in Python*
(http://onlamp.com/pub/a/python/2006/02/09/ai_decision_trees.html).

"""

from __future__ import division

from itertools import izip
import collections
import math

from diles.learn import Label, Learner

# =============================================================================
# information gain utilities (ID3 heuristic)
# =============================================================================

def entropy(data, target_attr):
    """
    Calculates the entropy of the given data set for the target attribute.
    """
    val_freq = {}
    data_entropy = 0.0

    # Calculate the frequency of each of the values in the target attr
    for record in data:
        if (val_freq.has_key(record[target_attr])):
            val_freq[record[target_attr]] += 1.0
        else:
            val_freq[record[target_attr]] = 1.0

    # Calculate the entropy of the data for the target attribute
    for freq in val_freq.values():
        data_entropy += (-freq/len(data)) * math.log(freq/len(data), 2)

    return data_entropy

def gain(data, attr, target_attr):
    """
    Calculates the information gain (reduction in entropy) that would
    result by splitting the data on the chosen attribute (attr).
    """
    val_freq = {}
    subset_entropy = 0.0

    # Calculate the frequency of each of the values in the target attribute
    for record in data:
        if (val_freq.has_key(record[attr])):
            val_freq[record[attr]] += 1.0
        else:
            val_freq[record[attr]] = 1.0

    # Calculate the sum of the entropy for each subset of records weighted
    # by their probability of occuring in the training set.
    for val in val_freq.keys():
        val_prob = val_freq[val] / sum(val_freq.values())
        data_subset = [record for record in data if record[attr] == val]
        subset_entropy += val_prob * entropy(data_subset, target_attr)

    # Subtract the entropy of the chosen attribute from the entropy of the
    # whole data set with respect to the target attribute (and return it)
    return (entropy(data, target_attr) - subset_entropy)

# =============================================================================
# tree builder and classifier utilities
# =============================================================================

def majority_value(data, target_attr):
    """
    Creates a list of all values in the target attribute for each record
    in the data list object, and returns the value that appears in this list
    the most frequently.
    """
    data = data[:]
    return most_frequent([record[target_attr] for record in data])

def most_frequent(lst):
    """
    Returns the item that appears most frequently in the given list.
    """
    lst = lst[:]
    highest_freq = 0
    most_freq = None

    for val in unique(lst):
        if lst.count(val) > highest_freq:
            most_freq = val
            highest_freq = lst.count(val)

    return most_freq

def unique(lst):
    """
    Returns a list made up of the unique values found in lst.  i.e., it
    removes the redundant values in lst.
    """
    lst = lst[:]
    unique_lst = []

    # Cycle through the list and add each value to the unique list only once.
    for item in lst:
        if unique_lst.count(item) <= 0:
            unique_lst.append(item)

    # Return the list with all redundant values removed.
    return unique_lst

def get_values(data, attr):
    """
    Creates a list of values in the chosen attribut for each record in data,
    prunes out all of the redundant values, and return the list.
    """
    data = data[:]
    return unique([record[attr] for record in data])

def choose_attribute(data, attributes, target_attr, fitness):
    """
    Cycles through all the attributes and returns the attribute with the
    highest information gain (or lowest entropy).
    """
    data = data[:]
    best_gain = 0.0
    best_attr = None

    for attr in attributes:
        gain = fitness(data, attr, target_attr)
        if (gain >= best_gain and attr != target_attr):
            best_gain = gain
            best_attr = attr

    return best_attr

def get_examples(data, attr, value):
    """
    Returns a list of all the records in <data> with the value of <attr>
    matching the given value.
    """
    data = data[:]
    rtn_lst = []

    if not data:
        return rtn_lst
    else:
        record = data.pop()
        if record[attr] == value:
            rtn_lst.append(record)
            rtn_lst.extend(get_examples(data, attr, value))
            return rtn_lst
        else:
            rtn_lst.extend(get_examples(data, attr, value))
            return rtn_lst

def get_classification(record, tree):
    """
    This function recursively traverses the decision tree and returns a
    classification for the given record.
    """
    # If the current node is a string, then we've reached a leaf node and
    # we can return it as our answer
    #if type(tree) == type("string"):
    #    return tree
    if type(tree) != dict:
        return tree # a leaf

    # Traverse the tree further until a leaf node is found.
    else:
        attr = tree.keys()[0]
        t = tree[attr][record[attr]]
        return get_classification(record, t)

def classify(tree, data):
    """
    Returns a list of classifications for each of the records in the data
    list as determined by the given decision tree.
    """
    data = data[:]
    classification = []

    for record in data:
        classification.append(get_classification(record, tree))

    return classification

def create_decision_tree(data, attributes, target_attr, fitness_func):
    """
    Returns a new decision tree based on the examples given.
    """
    data = data[:]
    vals = [record[target_attr] for record in data]
    default = majority_value(data, target_attr)

    # If the dataset is empty or the attributes list is empty, return the
    # default value. When checking the attributes list for emptiness, we
    # need to subtract 1 to account for the target attribute.
    if not data or (len(attributes) - 1) <= 0:
        return default
    # If all the records in the dataset have the same classification,
    # return that classification.
    elif vals.count(vals[0]) == len(vals):
        return vals[0]
    else:
        # Choose the next best attribute to best classify our data
        best = choose_attribute(data, attributes, target_attr,
                                fitness_func)

        # Create a new decision tree/node with the best attribute and an empty
        # dictionary object--we'll fill that up next.
        # We use the collections.defaultdict function to add a function to the
        # new tree that will be called whenever we query the tree with an
        # attribute that does not exist.  This way we return the default value
        # for the target attribute whenever, we have an attribute combination
        # that wasn't seen during training.
        tree = {best:collections.defaultdict(lambda: default)}

        # Create a new decision tree/sub-node for each of the values in the
        # best attribute field
        for val in get_values(data, best):
            # Create a subtree for the current value under the "best" field
            subtree = create_decision_tree(
                get_examples(data, best, val),
                [attr for attr in attributes if attr != best],
                target_attr,
                fitness_func)

            # Add the new subtree to the empty dictionary object in our new
            # tree/node we just created.
            tree[best][val] = subtree

    return tree

# =============================================================================
# date converter utilities
# =============================================================================

TARGET_ATTR = 'l'

def convert_data(samples, labels):
    """Convert data form diles format to the format support by DT learner."""

    attributes = ['a%02d' % i for i in range(len(samples[0]))]
    attributes.append(TARGET_ATTR)
    data = []
    for sample, label in izip(samples, labels):
        values = list(sample) + [label]
        dic = dict((a, v) for a, v in izip(attributes, values))
        data.append(dic)
    return attributes, data

# =============================================================================
# learner class
# =============================================================================

class DecisionTreeLearner(Learner):
    """Learner using a naive Bayes classifier algorithm."""

    id = "DT"

    paramspace = {
        'fitness': ['gain']
    }

    fitfuncs = {
        'gain': gain,
    }

    def __init__(self, fitness):
        """..."""
        super(DecisionTreeLearner, self).__init__()
        self.fitness = self.fitfuncs[fitness]

    def train(self, samples, labels, fwl=None, fwt=0):
        """See :meth:`diles.learn.Learner.train`."""

        self.fwl = fwl or [1] * len(samples[0])
        self.fwt = fwt
        samples = [[x for w, x in izip(self.fwl, s) if w > fwt]
                   for s in samples]
        self.attributes, data = convert_data(samples, labels)
        self.model = create_decision_tree(data, self.attributes, TARGET_ATTR,
                                          self.fitness)

    def predict(self, sample):
        """See :meth:`diles.learn.Learner.predict`.

        The returned ranking values are a probability distribution.

        The returned confidence is the ratio between the best 2 labels.

        """
        sample = [x for w, x in izip(self.fwl, sample) if w > self.fwt]
        data = [dict((a, v) for a, v in izip(self.attributes[:-1], sample))]
        label = classify(self.model, data)[0]
        return label, 2/3, None

# =============================================================================
# tests
# =============================================================================

def __doctests_convert_data():
    """
    >>> samples = ["aaabbb", "aaaccc", "ababac"]
    >>> labels = "xyx"
    >>> attributes, data = convert_data(samples, labels)
    >>> for d in data:
    ...     print sorted(d.items())
    [('a00', 'a'), ('a01', 'a'), ('a02', 'a'), ('a03', 'b'), ('a04', 'b'), ('a05', 'b'), ('l', 'x')]
    [('a00', 'a'), ('a01', 'a'), ('a02', 'a'), ('a03', 'c'), ('a04', 'c'), ('a05', 'c'), ('l', 'y')]
    [('a00', 'a'), ('a01', 'b'), ('a02', 'a'), ('a03', 'b'), ('a04', 'a'), ('a05', 'c'), ('l', 'x')]
    >>> print attributes
    ['a00', 'a01', 'a02', 'a03', 'a04', 'a05', 'l']

    >>> samples = [("aaa", "bbb"), ("aaa", "ccc"), ("aba", "bac")]
    >>> labels = [("x.x", "x.y"), ("x.x", "x.z"), ("x.y", "x.z")]
    >>> labels = [Label(x) for x in labels]
    >>> attributes, data = convert_data(samples, labels)
    >>> for d in data:
    ...     print sorted(d.items())
    [('a00', 'aaa'), ('a01', 'bbb'), ('l', {'x.x', 'x.y'})]
    [('a00', 'aaa'), ('a01', 'ccc'), ('l', {'x.x', 'x.z'})]
    [('a00', 'aba'), ('a01', 'bac'), ('l', {'x.y', 'x.z'})]
    >>> print attributes
    ['a00', 'a01', 'l']

    """

def __doctests_decistion_tree_learner():
    """
    >>> import random

    First tests without an initial probability for yet unseen feature values:

    >>> l = DecisionTreeLearner(fitness="gain")

    >>> samples = "aab"
    >>> labels = 'xxy'
    >>> l.train(samples, labels)
    >>> l.predict("a")
    ('x', 0.666..., None)
    >>> l.predict("b")
    ('y', 0.666..., None)
    >>> l.predict("c")
    ('...', 0.666..., None)

    >>> samples = "aa", "ab", "bc"
    >>> labels =  'x',  'x',  'y'
    >>> l.train(samples, labels)
    >>> l.predict("aa")
    ('x', 0.666..., None)
    >>> l.predict("ab")
    ('x', 0.666..., None)
    >>> l.predict("bc")
    ('y', 0.666..., None)
    >>> l.predict("ac")
    ('...', 0.666..., None)

    >>> samples = "aa", "ab", "bb", "ab"
    >>> labels =  'x',  'x',  'x',  'y'
    >>> labels = [Label(x) for x in labels]
    >>> l.train(samples, labels)
    >>> l.predict("aa")
    ({'x'}, 0.666..., None)
    >>> l.predict("ab")
    ({'x'}, 0.666..., None)
    >>> l.predict("ac")
    (..., 0.666..., None)

    >>> from diles.tests import TSETS

    >>> l.train(*TSETS['easy'])
    >>> l.predict("a")
    ({'0'}, 0.666..., None)
    >>> l.predict("b")
    ({'1'}, 0.666..., None)

    >>> l.train(*TSETS['moderate'])
    >>> l.predict("ac")
    ({'x'}, 0.666..., None)
    >>> l.predict("bc")
    ({'y'}, 0.666..., None)
    >>> l.predict("ab")
    ({'x'}, 0.666..., None)
    >>> l.predict("bb")
    ({'y'}, 0.666..., None)

    >>> samples = 'ab', 'ac', 'ad', 'bb', 'bd'
    >>> labels =  1,    0,    1,    0,    1
    >>> l.train(samples, labels)
    >>> l.predict("aa")
    (1, 0.666..., None)
    >>> l.predict("xd")
    (1, 0.666..., None)
    >>> l.predict("xc")
    (0, 0.666..., None)

    >>> samples = 'ab', 'ac', 'ad', 'bb', 'bd'
    >>> labels =  1,    0,    1,    0,    1
    >>> l.train(samples, labels, fwl=[0, 1])
    >>> l.predict("aa")
    (..., 0.666..., None)
    >>> l.predict("xd")
    (1, 0.666..., None)
    >>> l.predict("xc")
    (0, 0.666..., None)
    """