"""Miscellaneous utilities for scenario evaluation.""" from __future__ import division from itertools import product, izip from hashlib import md5 import optparse import os import shelve import shutil def dictokey(dic): """Convert a dictionary to a key usable for a :class:`ResultDB`. >>> dictokey({'a': 1, 'b': 2, 'c': 4}) 'ddfce457873ac799ea0b80470dd6a8c6' Keys are independent of item order: >>> k1 = dictokey({'a': 1, 'c': 2, 'b': 3}) >>> k1 '497fd5957b5df58f5b4094feec7ff849' >>> k1 == dictokey({'a': 1, 'b': 3, 'c': 2}) True `dic` may also be an already sorted list of key-value-tuples: >>> k1 == dictokey([('a', 1), ('b', 3), ('c', 2)]) True Tuples don't work: >>> k1 == dictokey((('a', 1), ('b', 3), ('c', 2))) False Also, item lists must be sorted to yield the same key: >>> k1 == dictokey([('a', 1), ('c', 2), ('b', 3)]) False """ try: items = sorted(dic.iteritems()) except AttributeError: # not a dict, assume an already sorted items list items = dic return md5(str(items)).hexdigest() class ResultDB(object): """Evaluation results database. This is a thin wrapper around Python's *shelve* module. The main purpose is to efficiently access results without the need to load them all into memory. >>> rdb = ResultDB("/var/tmp/rdb", flag='n') >>> rdb.add(({'a': 0, 'b': 0}, "r1")) >>> rdb.add(({'a': 0, 'b': 1}, "r2")) >>> rdb.add(({'a': 1, 'b': 0}, "r3")) >>> sorted(rdb.filter({'a': [0]})) [({'a': 0, 'b': 0}, 'r1'), ({'a': 0, 'b': 1}, 'r2')] >>> rdb.close() """ def __init__(self, fname, flag='r'): """The database is read from respectively written to `fname`. By default it is opened in *read-only* mode, i.e. `flag` is *r*. A flag of *w* opens a database in read-write mode while *n* always creates a new database. """ self._flag = flag if self._flag == 'n' and os.path.exists(fname): os.remove(fname) self._db = shelve.open(fname, flag=self._flag, protocol=2) self._km = self._db.get('__keymap__', {}) def all(self): """Returns an iterator over all results.""" isresultkey = lambda k: not k.startswith("__") and not k.endswith("__") return (self._db[k] for k in self._db.iterkeys() if isresultkey(k)) def filter(self, mask): """Returns an iterator over the results yielded by the meta information mask `mask` (a positive filter specifying allowed values for certain keys). Keys not known (according to the global keymap) are ignore, i.e. not used for filtering. """ mask = dict(self._km, **mask) # complete mask mitems = sorted((k, tuple(vl)) for k, vl in mask.iteritems() if k in self._km) mkeys, mvlists = zip(*mitems) for mvalues in product(*mvlists): key = dictokey(zip(mkeys, mvalues)) try: yield self._db[key] except KeyError: pass # not every possible meta must exist raise StopIteration def get(self, meta): """Returns the specific result identified by the meta information `meta`. """ key = dictokey(meta) return self._db[key] def add(self, result): """Add a new result.""" assert self._flag != 'r' meta = sorted(result[0].iteritems()) assert not self._km or set(self._km.keys()) == set([k for k, _ in meta]) for k, v in meta: self._km.setdefault(k, set()).add(v) key = dictokey(meta) self._db[key] = result def extend(self, results): """Add a list of results.""" for result in results: self.add(result) def close(self): """Close the result database, writing changes back to disk.""" if self._flag != 'r': self._db['__keymap__'] = self._km self._db.close() def xget(self, key): """Get some extra information, i.e. an item which is not a result.""" return self._db["__%s" % key] def xadd(self, key, value): """Add some extra information, i.e. an item which is not a result.""" self._db["__%s" % key] = value def join(rdbl, fname): """Join two or more result databases. Source DBs are given by their filenames in `rdbl`. The new joined DB is saved using file `fname`. Results available in one DB but not in another are okay (i.e. both will be in the joined DB) but probably this is not useful in case of results from randomized order validations both probably used different randomized orders. If a result is available in both DBs and in case it refers to a randomized order validation, the corresponding performance lists are joined. The extra information is not joined, i.e. the one from the first DB listed in `rdbl` is used. In all other cases, i.e. results from original order validations, results are not joined (they should be identical at all) but the one from the first DB listed in `rdbl` is used. """ shutil.copy(rdbl[0], fname) jdb = ResultDB(fname, flag='w') for rdb in rdbl[1:]: rdb = ResultDB(rdb) for meta, pfl, xinfo in rdb.all(): try: jmeta, jpfl, jxinfo = jdb.get(meta) except KeyError: jdb.add((meta, pfl, xinfo)) continue # a new result if meta['order'] != 'randomized': continue # do not duplicate results from original order # join performance lists from randomized orders for pf, jpf in izip(pfl, jpfl): jpf.extend(pf) jdb.add((jmeta, jpfl, jxinfo)) jdb.close() # ============================================================================= # command line interface # ============================================================================= def options(): op = optparse.OptionParser("%prog -s FILE [-s FILE ...] -d FILE") op.add_option("-s", "--source", action="append", help="source database [+]", metavar="FILE") op.add_option("-d", "--destination", help="destination for joined database", metavar="FILE") opts, args = op.parse_args() if args: op.error("invalid arguments") if not opts.source or len(opts.source) < 2: op.error("need at least 2 source database") if not opts.destination: op.error("need a destination file name") return opts def main(): opts = options() join(opts.source, opts.destination) if __name__ == '__main__': main() # ============================================================================= # tests # ============================================================================= def __doctests_resultdb(): """ >>> rdb = ResultDB("/var/tmp/rdb", flag='n') >>> tuple(rdb.all()) () >>> rdb.add(({'a': 0, 'b': 0}, "r1")) >>> tuple(rdb.all()) (({'a': 0, 'b': 0}, 'r1'),) >>> rdb.get({'a': 0, 'b': 0}) ({'a': 0, 'b': 0}, 'r1') >>> rdb.add(({'a': 0, 'b': 1}, "r2")) >>> rdb.add(({'a': 1, 'b': 1}, "r3")) >>> tuple(rdb.filter({'b': [1]})) (({'a': 0, 'b': 1}, 'r2'), ({'a': 1, 'b': 1}, 'r3')) >>> rdb.add(({'a': 1}, "r4")) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): AssertionError >>> rdb.extend([({'a': 1, 'b': 0}, "r4"), ({'a': 2, 'b': 1}, "r5")]) >>> sorted(x[1] for x in rdb.all()) ['r1', 'r2', 'r3', 'r4', 'r5'] Add some extra items and check if all() hides them: >>> rdb.xadd('foo', 'bar') >>> sorted(x[1] for x in rdb.all()) ['r1', 'r2', 'r3', 'r4', 'r5'] >>> rdb.xget('foo') 'bar' >>> sorted(k for k in rdb._db.iterkeys() if k.startswith("__")) ['__foo'] >>> rdb.close() >>> rdb = ResultDB("/var/tmp/rdb") >>> sorted(k for k in rdb._db.iterkeys() if k.startswith("__")) ['__foo', '__keymap__'] >>> sorted(x[1] for x in rdb.all()) ['r1', 'r2', 'r3', 'r4', 'r5'] >>> tuple(rdb.filter({'b': [1], 'a': [0,5]})) (({'a': 0, 'b': 1}, 'r2'),) >>> rdb.add(({'a': 1, 'b': 0}, "rx")) # doctest: +IGNORE_EXCEPTION_DETAIL Traceback (most recent call last): AssertionError >>> rdb.close() >>> rdb = ResultDB("/var/tmp/rdb", flag='n') >>> tuple(rdb.all()) () >>> rdb.close() """ def __doctests_join(): """ >>> fn1, fn2, fn3, fn4 = ("/var/tmp/rdb%d" % i for i in (1,2,3,4)) >>> rdb = ResultDB(fn1, flag='n') >>> rdb.add(({'a': 0, 'b': 0, 'order': "randomized"}, [[1], [10], [100]], "x")) >>> rdb.add(({'a': 0, 'b': 1, 'order': "randomized"}, [[2], [20], [200]], "x")) >>> rdb.close() >>> rdb = ResultDB(fn2, flag='n') >>> rdb.add(({'a': 0, 'b': 0, 'order': "randomized"}, [[1], [10], [100]], "x")) >>> rdb.add(({'a': 1, 'b': 1, 'order': "randomized"}, [[3], [30], [300]], "x")) >>> rdb.close() >>> rdb = ResultDB(fn3, flag='n') >>> rdb.add(({'a': 0, 'b': 0, 'order': "original"}, [[1], [10], [100]], "x")) >>> rdb.add(({'a': 1, 'b': 1, 'order': "randomized"}, [[3], [30], [300]], "x")) >>> rdb.close() >>> join([fn1, fn2, fn3], fn4) >>> rdb = ResultDB(fn4) >>> for result in sorted(rdb.all()): ... print sorted(result[0].items()), " ".join(repr(x) for x in result[1:]) [('a', 0), ('b', 0), ('order', 'original')] [[1], [10], [100]] 'x' [('a', 0), ('b', 0), ('order', 'randomized')] [[1, 1], [10, 10], [100, 100]] 'x' [('a', 0), ('b', 1), ('order', 'randomized')] [[2], [20], [200]] 'x' [('a', 1), ('b', 1), ('order', 'randomized')] [[3, 3], [30, 30], [300, 300]] 'x' """