commit 1befc9a07f30490decb01c9ab873d37ea77e1fae Author: Isis Lovecruft isis@torproject.org Date: Sat Apr 18 00:04:34 2015 +0000
Refactor b.p.descriptors.deduplicate() to improve benchmark results. --- lib/bridgedb/parse/descriptors.py | 104 +++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 51 deletions(-)
diff --git a/lib/bridgedb/parse/descriptors.py b/lib/bridgedb/parse/descriptors.py index efdedc9..bec10b3 100644 --- a/lib/bridgedb/parse/descriptors.py +++ b/lib/bridgedb/parse/descriptors.py @@ -144,7 +144,29 @@ def parseServerDescriptorsFile(filename, validate=True): routers = list(document) return routers
-def deduplicate(descriptors): +def __cmp_published__(x, y): + """A custom ``cmp()`` which sorts descriptors by published date. + + :rtype: int + :returns: Return negative if x<y, zero if x==y, positive if x>y. + """ + if x.published < y.published: + return -1 + elif x.published == y.published: + # This *shouldn't* happen. It would mean that two descriptors for + # the same router had the same timestamps, probably meaning there + # is a severely-messed up OR implementation out there. Let's log + # its fingerprint (no matter what!) so that we can look up its + # ``platform`` line in its server-descriptor and tell whoever + # wrote that code that they're probably (D)DOSing the Tor network. + logging.warn(("Duplicate descriptor with identical timestamp (%s) " + "for bridge %s with fingerprint %s !") % + (x.published, x.nickname, x.fingerprint)) + return 0 + elif x.published > y.published: + return 1 + +def deduplicate(descriptors, statistics=False): """Deduplicate some descriptors, returning only the newest for each router.
.. note:: If two descriptors for the same router are discovered, AND both @@ -155,68 +177,48 @@ def deduplicate(descriptors): :param list descriptors: A list of :api:`stem.descriptor.server_descriptor.RelayDescriptor`s, :api:`stem.descriptor.extrainfo_descriptor.BridgeExtraInfoDescriptor`s, - :api:`stem.descriptor.router_status_entry.RouterStatusEntryV2`s. + or :api:`stem.descriptor.router_status_entry.RouterStatusEntryV2`s. + :param bool statistics: If ``True``, log some extra statistics about the + number of duplicates. + :rtype: dict + :returns: A dictionary mapping router fingerprints to their newest + available descriptor. """ duplicates = {} - nonDuplicates = {} + newest = {}
for descriptor in descriptors: fingerprint = descriptor.fingerprint - logging.debug("Deduplicating %s descriptor for router %s" % (str(descriptor.__class__).rsplit('.', 1)[1], safelog.logSafely(fingerprint))) - - if fingerprint in nonDuplicates.keys(): - # We already found a descriptor for this fingerprint: - conflict = nonDuplicates[fingerprint] - - # If the descriptor we are currently parsing is newer than the - # last one we found: - if descriptor.published > conflict.published: - # And if this is the first duplicate we've found for this - # router, then create a list in the ``duplicates`` dictionary - # for the router: - if not fingerprint in duplicates.keys(): - duplicates[fingerprint] = list() - # Add this to the list of duplicates for this router: - duplicates[fingerprint].append(conflict) - # Finally, put the newest descriptor in the ``nonDuplicates`` - # dictionary: - nonDuplicates[fingerprint] = descriptor - # Same thing, but this time the one we're parsing is older: - elif descriptor.published < conflict.published: - if not fingerprint in duplicates.keys(): - duplicates[fingerprint] = list() - duplicates[fingerprint].append(descriptor) - # This *shouldn't* happen. It would mean that two descriptors for - # the same router had the same timestamps, probably meaning there - # is a severely-messed up OR implementation out there. Let's log - # its fingerprint (no matter what!) so that we can look up its - # ``platform`` line in its server-descriptor and tell whoever - # wrote that code that they're probably (D)DOSing the Tor network. - else: - try: - raise DescriptorWarning( - ("Duplicate descriptor with identical timestamp (%s) " - "for router with fingerprint '%s'!") - % (descriptor.published, fingerprint)) - # And just in case it does happen, catch the warning: - except DescriptorWarning as descwarn: - logging.warn("DescriptorWarning: %s" % str(descwarn)) - - # Hoorah! No duplicates! (yet...) 
+ if fingerprint in duplicates: + duplicates[fingerprint].append(descriptor) else: - nonDuplicates[fingerprint] = descriptor + duplicates[fingerprint] = [descriptor,] + + for fingerprint, dupes in duplicates.items(): + dupes.sort(cmp=__cmp_published__) + first = dupes.pop() + newest[fingerprint] = first + duplicates[fingerprint] = dupes + + if statistics: + # Build (count, fingerprint) tuples so that sorted() orders the pairs + # by duplicate count — tuples compare element-wise, first item first. + # (sorted() can also sort by value via its ``key=`` argument; the + # tuple form is just convenient for the slicing below.) + # NOTE(review): ``totals`` is ascending, so ``totals[:top]`` picks the + # bridges with the *fewest* duplicates; the most-duplicated would be + # ``totals[-top:]`` reversed. Also, ``len(duplicates)`` counts every + # bridge seen, including those left with an empty dupe list — confirm + # both before trusting these statistics. + totals = sorted([(len(v), k,) for k, v in duplicates.viewitems()]) + total = sum([k for (k, v) in totals]) + bridges = len(duplicates) + top = 10 if bridges >= 10 else bridges + logging.info("Number of bridges with duplicates: %5d" % bridges) + logging.info("Total duplicate descriptors: %5d" % total) + logging.info("Bridges with the most duplicates (Top %d):" % top) + for i, (subtotal, bridge) in zip(range(1, top + 1), totals[:top]): + logging.info(" #%d %s: %d duplicates" % (i, bridge, subtotal))
logging.info("Descriptor deduplication finished.") - logging.info("Number of duplicates: %d" % len(duplicates)) - for (fingerprint, dittos) in duplicates.items(): - logging.info(" For %s: %d duplicates" - % (safelog.logSafely(fingerprint), len(dittos))) - logging.info("Number of non-duplicates: %d" % len(nonDuplicates))
- return nonDuplicates + return newest
def parseExtraInfoFiles(*filenames, **kwargs): """Parse files which contain ``@type bridge-extrainfo-descriptor``s.