| """ fontTools.misc.classifyTools.py -- tools for classifying things. |
| """ |
| |
| |
| class Classifier(object): |
| """ |
| Main Classifier object, used to classify things into similar sets. |
| """ |
| |
| def __init__(self, sort=True): |
| self._things = set() # set of all things known so far |
| self._sets = [] # list of class sets produced so far |
| self._mapping = {} # map from things to their class set |
| self._dirty = False |
| self._sort = sort |
| |
| def add(self, set_of_things): |
| """ |
| Add a set to the classifier. Any iterable is accepted. |
| """ |
| if not set_of_things: |
| return |
| |
| self._dirty = True |
| |
| things, sets, mapping = self._things, self._sets, self._mapping |
| |
| s = set(set_of_things) |
| intersection = s.intersection(things) # existing things |
| s.difference_update(intersection) # new things |
| difference = s |
| del s |
| |
| # Add new class for new things |
| if difference: |
| things.update(difference) |
| sets.append(difference) |
| for thing in difference: |
| mapping[thing] = difference |
| del difference |
| |
| while intersection: |
| # Take one item and process the old class it belongs to |
| old_class = mapping[next(iter(intersection))] |
| old_class_intersection = old_class.intersection(intersection) |
| |
| # Update old class to remove items from new set |
| old_class.difference_update(old_class_intersection) |
| |
| # Remove processed items from todo list |
| intersection.difference_update(old_class_intersection) |
| |
| # Add new class for the intersection with old class |
| sets.append(old_class_intersection) |
| for thing in old_class_intersection: |
| mapping[thing] = old_class_intersection |
| del old_class_intersection |
| |
| def update(self, list_of_sets): |
| """ |
| Add a a list of sets to the classifier. Any iterable of iterables is accepted. |
| """ |
| for s in list_of_sets: |
| self.add(s) |
| |
| def _process(self): |
| if not self._dirty: |
| return |
| |
| # Do any deferred processing |
| sets = self._sets |
| self._sets = [s for s in sets if s] |
| |
| if self._sort: |
| self._sets = sorted(self._sets, key=lambda s: (-len(s), sorted(s))) |
| |
| self._dirty = False |
| |
| # Output methods |
| |
| def getThings(self): |
| """Returns the set of all things known so far. |
| |
| The return value belongs to the Classifier object and should NOT |
| be modified while the classifier is still in use. |
| """ |
| self._process() |
| return self._things |
| |
| def getMapping(self): |
| """Returns the mapping from things to their class set. |
| |
| The return value belongs to the Classifier object and should NOT |
| be modified while the classifier is still in use. |
| """ |
| self._process() |
| return self._mapping |
| |
| def getClasses(self): |
| """Returns the list of class sets. |
| |
| The return value belongs to the Classifier object and should NOT |
| be modified while the classifier is still in use. |
| """ |
| self._process() |
| return self._sets |
| |
| |
| def classify(list_of_sets, sort=True): |
| """ |
| Takes a iterable of iterables (list of sets from here on; but any |
| iterable works.), and returns the smallest list of sets such that |
| each set, is either a subset, or is disjoint from, each of the input |
| sets. |
| |
| In other words, this function classifies all the things present in |
| any of the input sets, into similar classes, based on which sets |
| things are a member of. |
| |
| If sort=True, return class sets are sorted by decreasing size and |
| their natural sort order within each class size. Otherwise, class |
| sets are returned in the order that they were identified, which is |
| generally not significant. |
| |
| >>> classify([]) == ([], {}) |
| True |
| >>> classify([[]]) == ([], {}) |
| True |
| >>> classify([[], []]) == ([], {}) |
| True |
| >>> classify([[1]]) == ([{1}], {1: {1}}) |
| True |
| >>> classify([[1,2]]) == ([{1, 2}], {1: {1, 2}, 2: {1, 2}}) |
| True |
| >>> classify([[1],[2]]) == ([{1}, {2}], {1: {1}, 2: {2}}) |
| True |
| >>> classify([[1,2],[2]]) == ([{1}, {2}], {1: {1}, 2: {2}}) |
| True |
| >>> classify([[1,2],[2,4]]) == ([{1}, {2}, {4}], {1: {1}, 2: {2}, 4: {4}}) |
| True |
| >>> classify([[1,2],[2,4,5]]) == ( |
| ... [{4, 5}, {1}, {2}], {1: {1}, 2: {2}, 4: {4, 5}, 5: {4, 5}}) |
| True |
| >>> classify([[1,2],[2,4,5]], sort=False) == ( |
| ... [{1}, {4, 5}, {2}], {1: {1}, 2: {2}, 4: {4, 5}, 5: {4, 5}}) |
| True |
| >>> classify([[1,2,9],[2,4,5]], sort=False) == ( |
| ... [{1, 9}, {4, 5}, {2}], {1: {1, 9}, 2: {2}, 4: {4, 5}, 5: {4, 5}, |
| ... 9: {1, 9}}) |
| True |
| >>> classify([[1,2,9,15],[2,4,5]], sort=False) == ( |
| ... [{1, 9, 15}, {4, 5}, {2}], {1: {1, 9, 15}, 2: {2}, 4: {4, 5}, |
| ... 5: {4, 5}, 9: {1, 9, 15}, 15: {1, 9, 15}}) |
| True |
| >>> classes, mapping = classify([[1,2,9,15],[2,4,5],[15,5]], sort=False) |
| >>> set([frozenset(c) for c in classes]) == set( |
| ... [frozenset(s) for s in ({1, 9}, {4}, {2}, {5}, {15})]) |
| True |
| >>> mapping == {1: {1, 9}, 2: {2}, 4: {4}, 5: {5}, 9: {1, 9}, 15: {15}} |
| True |
| """ |
| classifier = Classifier(sort=sort) |
| classifier.update(list_of_sets) |
| return classifier.getClasses(), classifier.getMapping() |
| |
| |
| if __name__ == "__main__": |
| import sys, doctest |
| |
| sys.exit(doctest.testmod(optionflags=doctest.ELLIPSIS).failed) |