[tor-commits] [metrics-tasks/master] Add simple Python database lookup code (#6471).

karsten at torproject.org karsten at torproject.org
Tue Nov 6 15:33:22 UTC 2012


commit 5a202017333cb64f5b9ca7d862e7ef9677761221
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Tue Nov 6 10:19:40 2012 -0500

    Add simple Python database lookup code (#6471).
---
 task-6471/python/.gitignore   |    3 +
 task-6471/python/pygeodate.py |  140 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 0 deletions(-)

diff --git a/task-6471/python/.gitignore b/task-6471/python/.gitignore
new file mode 100644
index 0000000..4d9663e
--- /dev/null
+++ b/task-6471/python/.gitignore
@@ -0,0 +1,3 @@
+*.csv
+*.csv.bz2
+
diff --git a/task-6471/python/pygeodate.py b/task-6471/python/pygeodate.py
new file mode 100644
index 0000000..6480f20
--- /dev/null
+++ b/task-6471/python/pygeodate.py
@@ -0,0 +1,140 @@
+import bisect
+import socket
+import struct
+import datetime
+
+class Range:
+  """One line of the combined database file.
+
+  A line holds at least five comma-separated fields: start address, end
+  address, code, start date and end date.  Addresses and dates are kept
+  in the numeric forms produced by Database.address_ston and
+  Database.date_ston; the code is kept as the string found in the line.
+  """
+
+  def __init__(self, line):
+    fields = line.split(',')
+    # TODO Extend to IPv6.  The cool thing in Python is that ints have
+    # variable size, so we can use 48 bit keys for IPv4 (32 bit for the
+    # IPv4 address and 16 bit for the database date) and 144 bit keys for
+    # IPv6.
+    self.start_address, self.end_address = (
+        Database.address_ston(fields[0]),
+        Database.address_ston(fields[1]))
+    self.code = fields[2]
+    self.start_date, self.end_date = (
+        Database.date_ston(fields[3]),
+        Database.date_ston(fields[4]))
+    # Sort/lookup key built from where the range starts, both in address
+    # space and in time.
+    self.key = Database.create_key(self.start_address, self.start_date)
+
+  def __str__(self):
+    # Render the range in the same five-field layout it was parsed from.
+    rendered = (Database.address_ntos(self.start_address),
+                Database.address_ntos(self.end_address),
+                self.code,
+                Database.date_ntos(self.start_date),
+                Database.date_ntos(self.end_date))
+    return "%s,%s,%s,%s,%s" % rendered
+
+class Database:
+  """In-memory table for looking up the code assigned to an IPv4 address
+  on a given date.
+
+  Ranges are kept in a list sorted by an integer key that holds the
+  range's start address in the upper bits and its start date (in days
+  since 1970-01-01) in the lower 16 bits, so lookups can use bisect.
+  """
+
+  # Day number 0.  Dates are handled as whole days since this day.
+  _EPOCH = datetime.datetime(1970, 1, 1)
+
+  def __init__(self):
+    # TODO Replace with crit-bit tree if performance becomes a problem
+    # (key, Range) tuples; sorted by key by load_combined_databases.
+    self.data = []
+    # Sorted, duplicate-free date strings taken from the '!' lines.
+    self.dates = []
+    # The keys of self.data in the same order, for use with bisect.
+    self.keys = []
+
+  @staticmethod
+  def address_ston(address_string):
+    """Convert a dotted-quad IPv4 string to its 32-bit integer value.
+
+    Raises ValueError if the string is not a valid IPv4 address.
+    """
+    try:
+      packed = socket.inet_pton(socket.AF_INET, address_string)
+    except socket.error:
+      raise ValueError("invalid IPv4 address: %r" % (address_string,))
+    return struct.unpack('!I', packed)[0]
+
+  @staticmethod
+  def address_ntos(address):
+    """Convert a 32-bit integer back to a dotted-quad IPv4 string."""
+    return socket.inet_ntop(socket.AF_INET, struct.pack('!I', address))
+
+  @staticmethod
+  def date_ston(date_string):
+    """Convert a 'YYYYMMDD' string to whole days since 1970-01-01.
+
+    Raises ValueError if the string does not match that format.
+    """
+    parsed = datetime.datetime.strptime(date_string, '%Y%m%d')
+    # Plain date arithmetic keeps the result an int and independent of
+    # the local time zone and of platform-specific strftime directives.
+    return (parsed - Database._EPOCH).days
+
+  @staticmethod
+  def date_ntos(date):
+    """Convert whole days since 1970-01-01 to a 'YYYYMMDD' string."""
+    day = Database._EPOCH + datetime.timedelta(days=date)
+    return day.strftime('%Y%m%d')
+
+  @staticmethod
+  def address_kton(key):
+    """Extract the numeric address from a key."""
+    return key >> 16
+
+  @staticmethod
+  def date_kton(key):
+    """Extract the day number (lower 16 bits) from a key."""
+    return key & 0xffff
+
+  @staticmethod
+  def address_ktos(key):
+    """Extract the address from a key as a dotted-quad string."""
+    return Database.address_ntos(Database.address_kton(key))
+
+  @staticmethod
+  def date_ktos(key):
+    """Extract the date from a key as a 'YYYYMMDD' string."""
+    return Database.date_ntos(Database.date_kton(key))
+
+  @staticmethod
+  def create_key(address, date):
+    """Combine a numeric address and a day number into one sortable key.
+
+    The day number has to fit into 16 bits (0..65535); larger values
+    would spill into the address part of the key.
+    """
+    return (address << 16) + date
+
+  def load_combined_databases(self, path):
+    """Read database dates and ranges from the file at path.
+
+    Lines starting with '!' are handed to add_date, all other non-empty
+    lines to add_range.  Afterwards self.data is sorted by key and
+    self.keys is rebuilt to match.
+    """
+    with open(path) as input_file:
+      for line in input_file:
+        line = line.strip()
+        if not line:
+          # Tolerate blank lines, e.g. at the end of the file.
+          continue
+        if line.startswith('!'):
+          self.add_date(line)
+        else:
+          self.add_range(line)
+    # Sort on the key alone so that two ranges with the same key never
+    # make the sort fall back to comparing Range objects.
+    self.data.sort(key=lambda entry: entry[0])
+    self.keys = [entry[0] for entry in self.data]
+
+  def add_date(self, line):
+    """Remember the date of a '!<date>' line, keeping self.dates sorted
+    and free of duplicates."""
+    date = line.split("!")[1]
+    pos = bisect.bisect_left(self.dates, date)
+    if pos == len(self.dates) or self.dates[pos] != date:
+      self.dates.insert(pos, date)
+
+  def add_range(self, line):
+    """Parse a range line and append it, together with its key, to
+    self.data.  The list is not re-sorted here."""
+    r = Range(line)
+    self.data.append((r.key, r))
+
+  def lookup_address_and_date(self, address_string, date_string):
+    """Return the code assigned to an address on a date, or '??'.
+
+    The requested date is mapped to the latest known database date that
+    is not later than it, or to the earliest known database date if it
+    precedes all of them.  Date strings are compared as strings, so
+    date_string must use the same 'YYYYMMDD' format as the '!' lines.
+
+    Raises ValueError if address_string is not a valid IPv4 address.
+    """
+    if not self.data or not self.dates:
+      return '??'
+    dates_pos = max(0, bisect.bisect(self.dates, date_string) - 1)
+    address = Database.address_ston(address_string)
+    date = Database.date_ston(self.dates[dates_pos])
+    key = Database.create_key(address, date)
+    pos = bisect.bisect(self.keys, key + 1)
+    # Look up address and date by iterating backwards over possibly
+    # matching ranges.
+    # NOTE(review): the early '??' below presumably relies on how the
+    # combined database file lays out its ranges -- confirm against the
+    # code that generates that file.
+    for index in range(pos - 1, -1, -1):
+      r = self.data[index][1]
+      # If either the end address or end date of the range we're looking
+      # at is smaller than the values we're looking for, we can be sure
+      # not to find it anymore.
+      if r.end_address < address or r.end_date < date:
+        return '??'
+      # If the range starts at a later date, skip it and look at the next
+      # one.
+      if r.start_date > date:
+        continue
+      # Both address and date ranges match, so return the assigned
+      # code.
+      return r.code
+    # No ranges (left) to look at.  We don't have what we were looking
+    # for.
+    return '??'
+
+if __name__ == "__main__":
+  # Self-test: load the combined database, then check every line of the
+  # test-case file (address,date,expected code) against the lookup and
+  # stop at the first mismatch.
+  db = Database()
+  db.load_combined_databases('geoip-2007-10-2012-09.csv')
+  with open('test-cases-2007-10-2012-09.csv') as input_file:
+    for line in input_file:
+      line = line.strip()
+      if not line:
+        # Tolerate blank lines, e.g. at the end of the file.
+        continue
+      parts = line.split(',')
+      address_string = parts[0]
+      date_string = parts[1]
+      expected = parts[2]
+      result = db.lookup_address_and_date(address_string, date_string)
+      if expected != result:
+        # Parenthesised form prints the same text on Python 2 and 3.
+        print("! %s -> %s" % (line, result))
+        break
+



More information about the tor-commits mailing list