[tor-commits] [metrics-tasks/master] Implement a multi-GeoIP database in Java (#6471).

karsten at torproject.org karsten at torproject.org
Wed Sep 12 23:13:58 UTC 2012


commit 1ad8f400c93d1178d83a91a3a33e107a652e985e
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Wed Sep 12 18:49:43 2012 -0400

    Implement a multi-GeoIP database in Java (#6471).
---
 task-6471/.gitignore                               |    2 +
 task-6471/README                                   |    3 +
 task-6471/java/.gitignore                          |    5 +
 task-6471/java/build.xml                           |   42 ++
 .../java/src/org/torproject/task6471/Database.java |   70 +++
 .../src/org/torproject/task6471/DatabaseImpl.java  |  652 ++++++++++++++++++++
 .../task6471/DatabasePerformanceExample.java       |  146 +++++
 .../src/org/torproject/task6471/DatabaseTest.java  |  141 +++++
 8 files changed, 1061 insertions(+), 0 deletions(-)

diff --git a/task-6471/.gitignore b/task-6471/.gitignore
new file mode 100644
index 0000000..6dbccb1
--- /dev/null
+++ b/task-6471/.gitignore
@@ -0,0 +1,2 @@
+data/
+
diff --git a/task-6471/README b/task-6471/README
new file mode 100644
index 0000000..bb1a572
--- /dev/null
+++ b/task-6471/README
@@ -0,0 +1,3 @@
+Task 6471 -- Designing a file format and Python/Java library for multiple
+             GeoIP or AS databases
+
diff --git a/task-6471/java/.gitignore b/task-6471/java/.gitignore
new file mode 100644
index 0000000..86b7b14
--- /dev/null
+++ b/task-6471/java/.gitignore
@@ -0,0 +1,5 @@
+.classpath
+.project
+lib/
+classes/
+
diff --git a/task-6471/java/build.xml b/task-6471/java/build.xml
new file mode 100644
index 0000000..9eb1223
--- /dev/null
+++ b/task-6471/java/build.xml
@@ -0,0 +1,42 @@
+<project default="test" name="task-6471" basedir=".">
+  <property name="sources" value="src"/>
+  <property name="classes" value="classes"/>
+  <property name="libs" value="lib"/>
+  <path id="classpath">
+    <pathelement path="${classes}"/>
+    <fileset dir="${libs}">
+      <include name="*.jar"/>
+    </fileset>
+  </path>
+  <target name="init">
+    <mkdir dir="${classes}"/>
+  </target>
+  <target name="compile"
+          depends="init">
+    <javac destdir="${classes}"
+           srcdir="${sources}"
+           source="1.5"
+           target="1.5"
+           debug="true"
+           deprecation="true"
+           optimize="false"
+           failonerror="true"
+           includeantruntime="false">
+      <classpath refid="classpath"/>
+    </javac>
+  </target>
+  <target name="test" depends="compile">
+    <junit fork="true"
+           haltonfailure="true"
+           maxmemory="1g"
+           printsummary="off">
+      <classpath refid="classpath"/>
+      <formatter type="plain" usefile="false"/>
+      <batchtest>
+        <fileset dir="${classes}"
+                 includes="**/*Test.class"/>
+      </batchtest>
+    </junit>
+  </target>
+</project>
+
diff --git a/task-6471/java/src/org/torproject/task6471/Database.java b/task-6471/java/src/org/torproject/task6471/Database.java
new file mode 100644
index 0000000..447b3c2
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/Database.java
@@ -0,0 +1,70 @@
+/* Copyright 2012 The Tor Project */
+package org.torproject.task6471;
+
+/**
+ * Database storing multiple GeoIP databases and supporting efficient
+ * ip-to-country lookups in the most recent of those databases for any
+ * given date.
+ *
+ * A typical query for this GeoIP database is: "to which country was IPv4
+ * address 1.2.3.4 assigned on date 20120912?"  This query is answered by
+ * looking at the entries from the most recent database published on or
+ * before 20120912.  If the earliest known database was published after
+ * 20120912, the earliest known database is used to resolve the request.
+ */
+public interface Database {
+
+  /**
+   * Import the contents of one or more IP address assignments files
+   * published by the Regional Internet Registries.  The file or files
+   * are expected to conform to the RIR Statistics Exchange Format.
+   * Only IPv4 address ranges are imported, whereas ASN and IPv6 lines are
+   * ignored.  Only the country code, start address, and address range
+   * length fields are imported.  (TODO Extend to IPv6 and find similar
+   * data source for ASN.)
+   *
+   * A typical entry from a RIR file is:
+   *   "ripencc|FR|ipv4|2.0.0.0|1048576|20100712|allocated".
+   *
+   * It is important to note that all five registry files (AfriNIC, APNIC,
+   * ARIN, LACNIC, and RIPE NCC) published on a given day should be
+   * imported, or the missing address ranges will be considered as
+   * unassigned from that day until the next database publication day.
+   * (TODO We could be smarter here by checking whether fewer than five
+   * registry files have been imported for the same day, or something.)
+   *
+   * @param path Path to a stats file or directory.
+   * @return True if importing the file or directory was successful,
+   *         false otherwise.
+   */
+  public boolean importRegionalRegistryStatsFileOrDirectory(String path);
+
+  /**
+   * Save the combined databases in a format that can later be loaded much
+   * more efficiently than importing the original RIR files again.
+   *
+   * @param path Path to the combined database file.
+   * @return True if saving the combined database file was successful,
+   *         false otherwise.
+   */
+  public boolean saveCombinedDatabases(String path);
+
+  /**
+   * Load a combined databases file.
+   *
+   * @param path Path to the combined database file.
+   * @return True if loading the combined database file was successful,
+   *         false otherwise.
+   */
+  public boolean loadCombinedDatabases(String path);
+
+  /**
+   * Query the database for an IPv4 address and assignment date.
+   *
+   * @param address IPv4 address in dotted-quad notation.
+   * @param date Assignment date in format yyyymmdd.
+   * @return Assigned country code, or null if no assignment could be
+   *         found.
+   */
+  public String lookupAddress(String address, String date);
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java b/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java
new file mode 100644
index 0000000..43e5a95
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java
@@ -0,0 +1,652 @@
+/* Copyright 2012 The Tor Project */
+package org.torproject.task6471;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.Stack;
+import java.util.TimeZone;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+/**
+ * Implementation of database holding multiple GeoIP databases with
+ * special focus on lookup performance, import performance, and memory
+ * consumption (in that order).
+ *
+ * This implementation uses a single tree to store IP address and date
+ * ranges.  Each tree element is stored under a long integer consisting of
+ * the start IPv4 address in the higher bits and the first database
+ * publication date containing that range in the lower bits.  The tree
+ * element itself contains the end IPv4 address, last database publication
+ * date, last database index number (see explanation further below), and
+ * country code.
+ *
+ * Lookups for a given address and random date only require iterating
+ * backwards over ranges with start address smaller than or equal to the
+ * requested address and can terminate as soon as a range with a smaller
+ * end address is encountered.
+ *
+ * As a requirement for lookups to work correctly, address ranges may
+ * never overlap for different assignment periods.  Similarly, date
+ * assignment ranges for a given address range may not overlap.  These
+ * requirements make the import process somewhat more complex and
+ * time-consuming, which is a tradeoff for faster lookup performance.
+ *
+ * The purpose of storing the last database index number is to fix ranges
+ * that are contained in two or more databases, but that are missing in a
+ * database that was published between the others but imported after them.
+ * The database index number defines that the range is only valid for
+ * databases imported until a given database, not necessarily for
+ * databases imported later on.  A separate repair run is necessary to
+ * check whether later imported databases require splitting a range into
+ * two or more sub ranges to correctly reflect that the range was not
+ * contained in those databases.
+ */
+public class DatabaseImpl implements Database {
+
+  /**
+   * Tree element containing an end IPv4 address, last database date,
+   * last database index, and country code.  Start IPv4 address and first
+   * database date are encoded in the key under which the element is
+   * stored.
+   */
+  private static class TreeElement {
+    private long endAddress;
+    private int lastDbDate;
+    private int lastKnownDbIndex;
+    private String countryCode;
+    TreeElement(long endAddress, int lastDbDate, int lastKnownDbIndex,
+        String countryCode) {
+      this.endAddress = endAddress;
+      this.lastDbDate = lastDbDate;
+      this.lastKnownDbIndex = lastKnownDbIndex;
+      this.countryCode = countryCode;
+    }
+  }
+
+  /**
+   * IPv4 address and date ranges, ordered backwards by start address and
+   * first database date.
+   */
+  private SortedMap<Long, TreeElement> ranges =
+      new TreeMap<Long, TreeElement>(Collections.reverseOrder());
+
+  /**
+   * Return number of contained ranges.
+   */
+  int getNumberOfElements() {
+    return this.ranges.size();
+  }
+
+  /**
+   * Database dates ordered from oldest to youngest.
+   */
+  private SortedSet<Integer> databaseDates = new TreeSet<Integer>();
+
+  /**
+   * List of database dates in import order to find out their indexes.
+   */
+  private List<Integer> databaseIndexes = new ArrayList<Integer>();
+
+  /**
+   * Parse one or more stats files.
+   */
+  public boolean importRegionalRegistryStatsFileOrDirectory(String path) {
+    boolean allImportsSuccessful = true;
+    Stack<File> files = new Stack<File>();
+    files.add(new File(path));
+    while (!files.isEmpty()) {
+      File file = files.pop();
+      if (file.isDirectory()) {
+        files.addAll(Arrays.asList(file.listFiles()));
+      } else if (file.getName().endsWith(".md5") ||
+          file.getName().endsWith(".md5.gz") ||
+          file.getName().endsWith(".asc") ||
+          file.getName().endsWith(".asc.gz")) {
+        System.err.println("Signature and digest files are not supported "
+            + "yet: '" + file.getAbsolutePath() + "'.  Skipping.");
+        /* TODO Implement checking signatures/digests. */
+      } else if (file.getName().endsWith(".gz") ||
+          file.getName().endsWith(".bz2")) {
+        System.err.println("Parsing compressed files is not supported "
+            + "yet: '" + file.getAbsolutePath() + "'.  Skipping.");
+        /* TODO Implement parsing compressed files. */
+      } else if (!this.importRegionalRegistryStatsFile(file)) {
+        allImportsSuccessful = false;
+      }
+    }
+    return allImportsSuccessful;
+  }
+
+  /**
+   * Simple and not very robust implementation of an RIR stats file
+   * parser.
+   */
+  private boolean importRegionalRegistryStatsFile(File file) {
+    try {
+      BufferedReader br = new BufferedReader(new FileReader(file));
+      String line;
+      String databaseDateString =
+          file.getName().substring(file.getName().length() - 8);
+      while ((line = br.readLine()) != null) {
+        if (line.startsWith("#") || line.length() == 0) {
+          /* Skip comment or empty line. */
+          continue;
+        }
+        String[] parts = line.split("\\|");
+        if (parts[0].equals("2")) {
+          continue;
+        }
+        if (parts[1].equals("*")) {
+          /* Skip summary line. */
+          continue;
+        }
+        String type = parts[2];
+        if (type.equals("asn")) {
+          continue;
+        } else if (type.equals("ipv6")) {
+          continue;
+        }
+        String countryCode = parts[1].toLowerCase();
+        String startAddressString = parts[3];
+        long addresses = Long.parseLong(parts[4]);
+        this.addRange(databaseDateString, countryCode, startAddressString,
+            addresses);
+      }
+      br.close();
+      this.repairIndex();
+    } catch (IOException e) {
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * Internal counters for import statistics.
+   */
+  private int rangeImports = 0, rangeImportsKeyLookups = 0;
+
+  /**
+   * Add a single address and date range to the database, which may
+   * require splitting up existing ranges.
+   *
+   * This method has default visibility and is not specified in the
+   * interface, because the caller needs to make sure that repairIndex()
+   * is called prior to any lookupAddress() calls.  No further checks are
+   * performed that the tree is repaired before looking up an address.
+   */
+  void addRange(String databaseDateString, String countryCode,
+      String startAddressString, long addresses) {
+
+    this.rangeImports++;
+    int databaseDate = convertDateStringToNumber(databaseDateString);
+    long startAddress = convertAddressStringToNumber(startAddressString);
+    long endAddress = startAddress + addresses - 1L;
+
+    /* Add new database date if it's not yet contained. */
+    if (!this.databaseDates.contains(databaseDate)) {
+      this.databaseDates.add(databaseDate);
+      this.databaseIndexes.add(databaseDate);
+      if (this.databaseIndexes.size() > 1) {
+        this.needToFixDatabase = true;
+      }
+    }
+
+    /* We might have to split existing ranges or the new range before
+     * adding it to the database, and we might have to remove existing
+     * ranges.  We shouldn't mess with the tree directly while iterating
+     * over it, so let's for now only calculate what changes we want to
+     * make. */
+    SortedMap<Long, TreeElement> updateElements =
+        this.getUpdatesForAddingRange(databaseDate, countryCode,
+            startAddress, endAddress);
+
+    /* Apply updates.  Elements with non-null values are added, elements
+     * with null values are removed. */
+    for (Map.Entry<Long, TreeElement> e : updateElements.entrySet()) {
+      if (e.getValue() == null) {
+        this.ranges.remove(e.getKey());
+      } else {
+        this.ranges.put(e.getKey(), e.getValue());
+      }
+    }
+  }
+
+  /**
+   * Calculate necessary changes to the tree to add a range.
+   */
+  private SortedMap<Long, TreeElement> getUpdatesForAddingRange(
+      int databaseDate, String countryCode, long startAddress,
+      long endAddress) {
+
+    /* Keep updates in a single tree where non-null values will later be
+     * added, possibly replacing existing elements, and null values will
+     * be removed from the tree. */
+    SortedMap<Long, TreeElement> updateElements =
+        new TreeMap<Long, TreeElement>();
+
+    /* Find out previous and next database, so that we can possibly merge
+     * ranges. */
+    int previousDatabaseDate =
+        this.databaseDates.headSet(databaseDate).isEmpty() ? -1 :
+        this.databaseDates.headSet(databaseDate).last();
+    int nextDatabaseDate =
+        this.databaseDates.tailSet(databaseDate + 1).isEmpty() ? -1 :
+        this.databaseDates.tailSet(databaseDate + 1).first();
+
+    /* Look up database index number of the range to be added. */
+    int dbIndex = this.databaseIndexes.indexOf(databaseDate);
+
+    /* Remember the address boundaries of the next (partial) range to be
+     * added. */
+    long nextStartAddress = startAddress, nextEndAddress = endAddress;
+    int nextFirstDbDate = databaseDate, nextLastDbDate = databaseDate;
+
+    /* Iterate backwards over the existing ranges, starting at the end
+     * address of the range to be added and at the last conceivable
+     * database publication date. */
+    for (Map.Entry<Long, TreeElement> e :
+        this.ranges.tailMap(((endAddress + 1L) << 16) - 1L).entrySet()) {
+      this.rangeImportsKeyLookups++;
+
+      /* Extract everything we need to know from the next existing range
+       * we're looking at. */
+      long eStartAddress = e.getKey() >> 16;
+      long eEndAddress = e.getValue().endAddress;
+      int eFirstDbDate = (int) (e.getKey() & ((1L << 16) - 1L));
+      int eLastDbDate = e.getValue().lastDbDate;
+      int eLastKnownDbIndex = e.getValue().lastKnownDbIndex;
+      String eCountryCode = e.getValue().countryCode;
+
+      /* If the next (partial) range starts after the current element
+       * ends, add the new range. */
+      if (nextStartAddress > eEndAddress &&
+          nextEndAddress >= startAddress) {
+        updateElements.put((nextStartAddress << 16) + nextFirstDbDate,
+            new TreeElement(nextEndAddress, nextLastDbDate, dbIndex,
+            countryCode));
+        nextEndAddress = nextStartAddress - 1L;
+        nextStartAddress = startAddress;
+        nextFirstDbDate = databaseDate;
+        nextLastDbDate = databaseDate;
+      }
+
+      /* If the next (partial) range still ends after the current element
+       * ends, add the new range. */
+      if (nextEndAddress > eEndAddress &&
+          nextEndAddress >= startAddress) {
+        updateElements.put(((eEndAddress + 1L) << 16) + databaseDate,
+            new TreeElement(nextEndAddress, databaseDate, dbIndex,
+            countryCode));
+        nextEndAddress = eEndAddress;
+        nextStartAddress = startAddress;
+        nextFirstDbDate = databaseDate;
+        nextLastDbDate = databaseDate;
+      }
+
+      /* If the existing range ends before the new range starts, we don't
+       * have to look at any more existing ranges. */
+      if (eEndAddress < startAddress) {
+        break;
+      }
+
+      /* Cut off any address range parts of the existing element that are
+       * not contained in the currently added range.  First check whether
+       * the existing range ends after the newly added range.  In that
+       * case cut off the overlapping part and store it as a new
+       * element.*/
+      if (eStartAddress <= endAddress && eEndAddress > endAddress) {
+        updateElements.put(((endAddress + 1L) << 16) + eFirstDbDate,
+            new TreeElement(eEndAddress, eLastDbDate, eLastKnownDbIndex,
+            eCountryCode));
+        updateElements.put((eStartAddress << 16) + eFirstDbDate,
+            new TreeElement(endAddress, eLastDbDate, eLastKnownDbIndex,
+            eCountryCode));
+        eEndAddress = endAddress;
+      }
+
+      /* Similarly, check whether the existing range starts before the
+       * newly added one.  If so, cut off the overlapping part and store
+       * it as new element. */
+      if (eStartAddress < startAddress && eEndAddress >= startAddress) {
+        updateElements.put((eStartAddress << 16) + eFirstDbDate,
+            new TreeElement(startAddress - 1L, eLastDbDate,
+            eLastKnownDbIndex, eCountryCode));
+        updateElements.put((startAddress << 16) + eFirstDbDate,
+            new TreeElement(eEndAddress, eLastDbDate, eLastKnownDbIndex,
+            eCountryCode));
+        eStartAddress = startAddress;
+      }
+
+      /* Now we're sure the existing element doesn't exceed the newly
+       * added element, address-wise. */
+      nextStartAddress = eStartAddress;
+      nextEndAddress = eEndAddress;
+
+      /* If the range is already contained, maybe even with different
+       * country code, ignore it. */
+      if (eFirstDbDate <= databaseDate && eLastDbDate >= databaseDate) {
+        updateElements.clear();
+        return updateElements;
+      }
+
+      /* See if we can merge the new range with the previous or next
+       * range.  If so, extend our database range and mark the existing
+       * element for deletion. */
+      if (eCountryCode.equals(countryCode)) {
+        if (eLastDbDate == previousDatabaseDate) {
+          nextFirstDbDate = eFirstDbDate;
+          updateElements.put((eStartAddress << 16) + eFirstDbDate, null);
+        } else if (eFirstDbDate == nextDatabaseDate) {
+          nextLastDbDate = eLastDbDate;
+          updateElements.put((eStartAddress << 16) + eFirstDbDate, null);
+        }
+      }
+    }
+
+    /* If there's still some part (or the whole?) address range left to
+     * add, add it now. */
+    while (nextEndAddress >= startAddress) {
+      updateElements.put((nextStartAddress << 16) + nextFirstDbDate,
+          new TreeElement(nextEndAddress, nextLastDbDate, dbIndex,
+          countryCode));
+      nextEndAddress = nextStartAddress - 1L;
+      nextStartAddress = startAddress;
+      nextFirstDbDate = databaseDate;
+      nextLastDbDate = databaseDate;
+    }
+
+    /* Return the tree updates that will add the given range. */
+    return updateElements;
+  }
+
+  /* Do we have to repair the tree? */
+  private boolean needToFixDatabase = false;
+
+  /**
+   * Repair tree by making sure that any range from a given database date
+   * to another is still valid when considering any other database that
+   * was imported later.
+   */
+  void repairIndex() {
+    if (!needToFixDatabase) {
+      return;
+    }
+    int maxDatabaseIndex = databaseIndexes.size() - 1;
+    if (maxDatabaseIndex < 1) {
+      return;
+    }
+    SortedMap<Long, TreeElement> updateElements =
+        new TreeMap<Long, TreeElement>();
+    for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) {
+      if (e.getValue().lastKnownDbIndex < maxDatabaseIndex) {
+        int eFirstDbDate = (int) (e.getKey() & ((1L << 16) - 1L));
+        int eLastDbDate = e.getValue().lastDbDate;
+        List<Integer> splitAtDates = new ArrayList<Integer>();
+        for (int dbIndex = e.getValue().lastKnownDbIndex + 1;
+            dbIndex <= maxDatabaseIndex; dbIndex++) {
+          int dbDate = databaseIndexes.get(dbIndex);
+          if (eFirstDbDate < dbDate && eLastDbDate > dbDate) {
+            splitAtDates.add(dbDate);
+          }
+        }
+        if (splitAtDates.isEmpty()) {
+          e.getValue().lastKnownDbIndex = maxDatabaseIndex;
+        } else {
+          long eStartAddress = e.getKey() >> 16;
+          long eEndAddress = e.getValue().endAddress;
+          String eCountryCode = e.getValue().countryCode;
+          int start = eFirstDbDate, end = eFirstDbDate;
+          for (int cur : this.databaseDates.tailSet(eFirstDbDate)) {
+            if (cur > eLastDbDate) {
+              break;
+            }
+            if (splitAtDates.contains(cur)) {
+              if (start >= 0 && end >= 0) {
+                updateElements.put((eStartAddress << 16) + start,
+                    new TreeElement(eEndAddress, end,
+                    maxDatabaseIndex, eCountryCode));
+                start = end = -1;
+              }
+            } else if (start < 0) {
+              start = end = cur;
+            } else {
+              end = cur;
+            }
+          }
+          if (start >= 0 && end >= 0) {
+            updateElements.put((eStartAddress << 16) + start,
+                new TreeElement(eEndAddress, end,
+                maxDatabaseIndex, eCountryCode));
+          }
+        }
+      }
+    }
+    for (Map.Entry<Long, TreeElement> e : updateElements.entrySet()) {
+      this.ranges.put(e.getKey(), e.getValue());
+    }
+    this.needToFixDatabase = false;
+  }
+
+  /**
+   * Internal counters for lookup statistics.
+   */
+  private int addressLookups = 0, addressLookupsKeyLookups = 0;
+
+  /**
+   * Look up address and date by iterating backwards over possibly
+   * matching ranges.
+   */
+  public String lookupAddress(String addressString, String dateString) {
+    this.addressLookups++;
+
+    long address = convertAddressStringToNumber(addressString);
+    int date = convertDateStringToNumber(dateString);
+
+    if (this.databaseDates.isEmpty()) {
+      return null;
+    }
+
+    /* Look up which database we want. */
+    int databaseDate = this.databaseDates.headSet(date + 1).isEmpty() ?
+        this.databaseDates.first() :
+        this.databaseDates.headSet(date + 1).last();
+
+    /* Iterate backwards over the existing ranges, starting at the last
+     * possible date of the address to be found. */
+    for (Map.Entry<Long, TreeElement> e :
+      this.ranges.tailMap(((address + 1L) << 16) - 1L).entrySet()) {
+      this.addressLookupsKeyLookups++;
+
+      /* If either the end address or end date of the range we're looking
+       * at is smaller than the values we're looking for, we can be sure
+       * not to find it anymore. */
+      if (e.getValue().endAddress < address ||
+          e.getValue().lastDbDate < databaseDate) {
+        return null;
+      }
+
+      /* If the range starts at a later date, skip it and look at the next
+       * one. */
+      long startDate = e.getKey() & ((1L << 16) - 1L);
+      if (startDate > databaseDate) {
+        continue;
+      }
+
+      /* Both address and date ranges match, so return the assigned
+       * country code. */
+      return e.getValue().countryCode;
+    }
+
+    /* No ranges (left) to look at.  We don't have what we were looking
+     * for. */
+    return null;
+  }
+
+  /* Helper: convert a dotted-quad formatted address string to its
+   * corresponding long integer number. */
+  static long convertAddressStringToNumber(String addressString) {
+    long address = 0;
+    String[] addressParts = addressString.split("\\.");
+    for (int i = 0; i < 4; i++) {
+      address += Long.parseLong(addressParts[i]) << ((3 - i) * 8);
+    }
+    return address;
+  }
+
+  /* Helper: convert a long integer address number to its corresponding
+   * dotted-quad formatted string. */
+  static String convertAddressNumberToString(long address) {
+    return "" + (address / 256 / 256 / 256) + "."
+        + ((address / 256 / 256) % 256) + "."
+        + ((address / 256) % 256) + "." + (address % 256);
+  }
+
+  /* Helper: date format parser/formatter. */
+  private static SimpleDateFormat dateFormat;
+  static {
+    dateFormat = new SimpleDateFormat("yyyyMMdd");
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+  }
+
+  /* Helper: convert date string in format yyyymmdd to integer containing
+   * days passed since 1970-01-01. */
+  static int convertDateStringToNumber(String dateString)
+      throws IllegalArgumentException {
+    try {
+      SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMdd");
+      dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+      return (int) (dateFormat.parse(dateString).getTime() / 86400000);
+    } catch (ParseException e) {
+      throw new IllegalArgumentException(e);
+    }
+  }
+
+  /* Helper: convert integer containing days passed since 1970-01-01 to
+   * date string in format yyyymmdd. */
+  static String convertDateNumberToString(int date) {
+    return dateFormat.format(((long) date) * 86400000);
+  }
+
+  /* Return a nicely formatted string summarizing database contents and
+   * usage statistics. */
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(String.format("Database contains %d databases and %d "
+        + "combined address ranges.\n"
+        + "Performed %d address range imports requiring %d lookups.\n"
+        + "Performed %d address lookups requiring %d lookups.\n"
+        + "First 10 entries, in reverse order, are:",
+        this.databaseDates.size(), this.ranges.size(), rangeImports,
+        rangeImportsKeyLookups, addressLookups,
+        addressLookupsKeyLookups));
+    int entries = 10;
+    for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) {
+      sb.append(String.format("%n  %s %s %s %s %s %d",
+          convertAddressNumberToString(e.getKey() >> 16),
+          convertAddressNumberToString(e.getValue().endAddress),
+          e.getValue().countryCode,
+          convertDateNumberToString(
+              (int) (e.getKey() & ((1L << 16) - 1L))),
+          convertDateNumberToString(e.getValue().lastDbDate),
+          e.getValue().lastKnownDbIndex));
+      if (--entries <= 0) {
+        break;
+      }
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Save the combined databases to disk.
+   */
+  public boolean saveCombinedDatabases(String path) {
+    try {
+
+      /* Create parent directories if necessary. */
+      File file = new File(path);
+      if (file.getParentFile() != null) {
+        file.getParentFile().mkdirs();
+      }
+
+      /* Start with writing all contained database dates to the file
+       * header. */
+      BufferedWriter bw = new BufferedWriter(new FileWriter(file));
+      for (int dbDate : this.databaseDates) {
+        bw.write("!" + convertDateNumberToString(dbDate) + "\n");
+      }
+
+      /* Next write all database ranges in the same order as they are
+       * currently contained in memory.  The only information we can drop
+       * is the last known database index of each range, because we assume
+       * the tree is already in repaired state. */
+      for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) {
+        bw.write(String.format("%s,%s,%s,%s,%s%n",
+            convertAddressNumberToString(e.getKey() >> 16),
+            convertAddressNumberToString(e.getValue().endAddress),
+            e.getValue().countryCode,
+            convertDateNumberToString(
+                (int) (e.getKey() & ((1L << 16) - 1L))),
+            convertDateNumberToString(e.getValue().lastDbDate)));
+      }
+      bw.close();
+    } catch (IOException e) {
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * Load previously saved combined databases from disk.  This code is not
+   * at all robust against external changes of the combined database file.
+   */
+  public boolean loadCombinedDatabases(String path) {
+    try {
+      File file = new File(path);
+      BufferedReader br = new BufferedReader(new FileReader(file));
+      String line;
+      int maxDbIndex = -1;
+      while ((line = br.readLine()) != null) {
+        if (line.startsWith("!")) {
+
+          /* First read file header containing database dates. */
+          int dbDate = convertDateStringToNumber(line.substring(1));
+          this.databaseDates.add(dbDate);
+          this.databaseIndexes.add(dbDate);
+          maxDbIndex = this.databaseIndexes.size() - 1;
+        } else {
+
+          /* Next read all ranges.  Set last known database index for each
+           * range to the last database we read from the header, because
+           * the tree will immediately be in repaired state. */
+          String[] parts = line.split(",");
+          long startAddress = convertAddressStringToNumber(parts[0]);
+          long endAddress = convertAddressStringToNumber(parts[1]);
+          String countryCode = parts[2];
+          int firstDbDate = convertDateStringToNumber(parts[3]);
+          int lastDbDate = convertDateStringToNumber(parts[4]);
+          this.ranges.put((startAddress << 16) + firstDbDate,
+              new TreeElement(endAddress, lastDbDate, maxDbIndex,
+              countryCode));
+        }
+      }
+      br.close();
+    } catch (IOException e) {
+      return false;
+    }
+    return true;
+  }
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java b/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java
new file mode 100644
index 0000000..3b273bc
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java
@@ -0,0 +1,146 @@
+package org.torproject.task6471;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+import java.util.SortedMap;
+import java.util.SortedSet;
+import java.util.Stack;
+import java.util.TreeMap;
+import java.util.TreeSet;
+
+public class DatabasePerformanceExample {
+  /**
+   * Generates random test lookups, derives the expected results from
+   * single-file databases, and then times importing all files, looking
+   * up the tests, and saving and re-loading the combined databases.
+   */
+  public static void main(String[] args) {
+
+    System.out.print("Generating test cases... ");
+    long startMillis = System.currentTimeMillis();
+    List<Long> tests = new ArrayList<Long>();
+    SortedMap<Long, String> results = new TreeMap<Long, String>();
+    Random rnd = new Random(1L);
+    int startDate = DatabaseImpl.convertDateStringToNumber("20071001");
+    int endDate = DatabaseImpl.convertDateStringToNumber("20120930");
+    /* Skipping Dec 1--3, 2009, because the first available database from
+     * December 2009 was published on the 4th, and generating test cases
+     * was just too confusing when taking that into account. */
+    List<Integer> skipDates = new ArrayList<Integer>();
+    skipDates.add(DatabaseImpl.convertDateStringToNumber("20091201"));
+    skipDates.add(DatabaseImpl.convertDateStringToNumber("20091202"));
+    skipDates.add(DatabaseImpl.convertDateStringToNumber("20091203"));
+    /* A test is the address shifted left by 16 bits plus the date. */
+    while (tests.size() < 100000) {
+      long testAddress = rnd.nextLong() & ((1L << 32) - 1L);
+      int testDate = startDate + rnd.nextInt(endDate - startDate);
+      if (!skipDates.contains(testDate)) {
+        tests.add((testAddress << 16) + testDate);
+      }
+    }
+    /* Collect files below ../data, skipping .md5[.gz] and .asc[.gz]. */
+    Stack<File> stackedFiles = new Stack<File>();
+    stackedFiles.add(new File("../data"));
+    SortedSet<File> files = new TreeSet<File>();
+    while (!stackedFiles.isEmpty()) {
+      File file = stackedFiles.pop();
+      String name = file.getName();
+      if (file.isDirectory()) {
+        /* listFiles() returns null if the directory cannot be read. */
+        File[] children = file.listFiles();
+        if (children != null) {
+          stackedFiles.addAll(Arrays.asList(children));
+        }
+      } else if (file.isFile() && !name.endsWith(".md5") &&
+          !name.endsWith(".md5.gz") && !name.endsWith(".asc") &&
+          !name.endsWith(".asc.gz")) {
+        files.add(file);
+      }
+    }
+    /* Expected results come from single-file databases; a test is looked
+     * up if its month starts the last eight characters of the file name. */
+    for (File file : files) {
+      String fileName = file.getName();
+      String dbMonth = fileName.substring(fileName.length() - 8,
+          fileName.length() - 2);
+      Database temp = new DatabaseImpl();
+      temp.importRegionalRegistryStatsFileOrDirectory(
+          file.getAbsolutePath());
+      for (long test : tests) {
+        int testDate = (int) (test & ((1 << 16) - 1));
+        String testDateString = DatabaseImpl.convertDateNumberToString(
+            testDate);
+        if (testDateString.substring(0, 6).equals(dbMonth)) {
+          String countryCode = temp.lookupAddress(DatabaseImpl.
+              convertAddressNumberToString(test >> 16), testDateString);
+          if (countryCode != null) {
+            results.put(test, countryCode);
+          }
+        }
+      }
+    }
+    long endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+
+    System.out.print("Importing files... ");
+    startMillis = endMillis;
+    Database database = new DatabaseImpl();
+    database.importRegionalRegistryStatsFileOrDirectory("../data");
+    endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+
+    System.out.print("Making test requests... ");
+    startMillis = endMillis;
+    int failures = countFailures(database, tests, results);
+    endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis, " + failures
+        + " out of " + tests.size() + " tests failed.");
+
+    System.out.println(database);
+
+    System.out.print("Saving combined databases to disk... ");
+    /* Restart the clock: printing the database is not part of saving. */
+    startMillis = System.currentTimeMillis();
+    database.saveCombinedDatabases("geoip-2007-10-2012-09.csv");
+    endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+
+    System.out.print("Loading combined databases from disk... ");
+    startMillis = endMillis;
+    database = new DatabaseImpl();
+    database.loadCombinedDatabases("geoip-2007-10-2012-09.csv");
+    endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+
+    System.out.print("Making a second round of test requests... ");
+    startMillis = endMillis;
+    failures = countFailures(database, tests, results);
+    endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis, " + failures
+        + " out of " + tests.size() + " tests failed.");
+  }
+
+  /**
+   * Looks up all tests in the given database and returns how many results
+   * differ from results, where a missing entry means null is expected.
+   */
+  private static int countFailures(Database database, List<Long> tests,
+      SortedMap<Long, String> results) {
+    int failures = 0;
+    for (long test : tests) {
+      String testAddress = DatabaseImpl.convertAddressNumberToString(
+          test >> 16);
+      String testDate = DatabaseImpl.convertDateNumberToString(
+          (int) (test & ((1 << 16) - 1)));
+      String expected = results.get(test);
+      String result = database.lookupAddress(testAddress, testDate);
+      if (expected == null ? result != null : !expected.equals(result)) {
+        failures++;
+      }
+    }
+    return failures;
+  }
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
new file mode 100644
index 0000000..8f90b3d
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
@@ -0,0 +1,141 @@
+/* Copyright 2012 The Tor Project */
+package org.torproject.task6471;
+
+import static org.junit.Assert.assertEquals;
+import org.junit.Test;
+
+/**
+ * Test the multi-GeoIP database implementation.
+ */
+public class DatabaseTest {
+
+  @Test()
+  public void testSingleIpRangeSingleDatebase() {
+    DatabaseImpl database = new DatabaseImpl();
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.repairIndex();
+    assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+    assertEquals(null, database.lookupAddress("2.255.255.255",
+        "19920901"));
+    assertEquals(null, database.lookupAddress("2.255.255.255",
+        "20020901"));
+    assertEquals(null, database.lookupAddress("2.255.255.255",
+        "20120901"));
+    assertEquals(null, database.lookupAddress("2.255.255.255",
+        "20220901"));
+    assertEquals("us", database.lookupAddress("3.0.0.0", "19920901"));
+    assertEquals("us", database.lookupAddress("3.0.0.0", "20020901"));
+    assertEquals("us", database.lookupAddress("3.0.0.0", "20120901"));
+    assertEquals("us", database.lookupAddress("3.0.0.0", "20220901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "19920901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20020901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20220901"));
+    assertEquals("us", database.lookupAddress("3.255.255.255",
+        "19920901"));
+    assertEquals("us", database.lookupAddress("3.255.255.255",
+        "20020901"));
+    assertEquals("us", database.lookupAddress("3.255.255.255",
+        "20120901"));
+    assertEquals("us", database.lookupAddress("3.255.255.255",
+        "20220901"));
+    assertEquals(null, database.lookupAddress("4.0.0.0", "19920901"));
+    assertEquals(null, database.lookupAddress("4.0.0.0", "20020901"));
+    assertEquals(null, database.lookupAddress("4.0.0.0", "20120901"));
+    assertEquals(null, database.lookupAddress("4.0.0.0", "20220901"));
+  }
+
+  @Test()
+  public void testTwoAdjacentIpRangesSingleDatabase() {
+    DatabaseImpl database = new DatabaseImpl();
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRange("20120901", "ca", "4.0.0.0", 16777216);
+    database.repairIndex();
+    assertEquals(2, ((DatabaseImpl) database).getNumberOfElements());
+    assertEquals(null, database.lookupAddress("2.255.255.255",
+        "20120901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901"));
+    assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901"));
+    assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901"));
+    assertEquals(null, database.lookupAddress("5.0.0.0", "20120901"));
+  }
+
+  @Test()
+  public void testTwoNonAdjacentIpDateRangesSingleDatabase() {
+    DatabaseImpl database = new DatabaseImpl();
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRange("20120901", "ca", "6.0.0.0", 16777216);
+    database.repairIndex();
+    assertEquals(2, ((DatabaseImpl) database).getNumberOfElements());
+    assertEquals(null, database.lookupAddress("2.255.255.255", "20120901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals(null, database.lookupAddress("4.255.255.255", "20120901"));
+    assertEquals("ca", database.lookupAddress("6.127.0.0", "20120901"));
+    assertEquals(null, database.lookupAddress("7.0.0.0", "20120901"));
+  }
+
+  @Test()
+  public void testDuplicateImport() {
+    DatabaseImpl database = new DatabaseImpl();
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.repairIndex();
+    assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+    assertEquals(null, database.lookupAddress("2.255.255.255", "20120901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals(null, database.lookupAddress("4.0.0.0", "20120901"));
+  }
+
+  @Test()
+  public void testDuplicateImportDifferentCountryCode() {
+    DatabaseImpl database = new DatabaseImpl();
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRange("20120901", "ca", "3.0.0.0", 16777216);
+    database.repairIndex();
+    assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+  }
+
+  @Test()
+  public void testLeaveIpChangeUnchanged() {
+    DatabaseImpl database = new DatabaseImpl();
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRange("20121001", "us", "3.0.0.0", 16777216);
+    database.repairIndex();
+    assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120801"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20121001"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20121101"));
+  }
+
+  @Test()
+  public void testLeaveIpChangeUnchangedReverseOrder() {
+    DatabaseImpl database = new DatabaseImpl();
+    database.addRange("20121001", "us", "3.0.0.0", 16777216);
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.repairIndex();
+    assertEquals(1, ((DatabaseImpl) database).getNumberOfElements());
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120801"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20121001"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20121101"));
+  }
+
+  @Test()
+  public void testMissingIpRange() {
+    DatabaseImpl database = new DatabaseImpl();
+    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRange("20121101", "us", "3.0.0.0", 16777216);
+    database.addRange("20121001", "us", "6.0.0.0", 16777216);
+    database.repairIndex();
+    assertEquals(3, ((DatabaseImpl) database).getNumberOfElements());
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120801"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20120901"));
+    assertEquals(null, database.lookupAddress("3.127.0.0", "20121001"));
+    assertEquals("us", database.lookupAddress("3.127.0.0", "20121101"));
+  }
+}



More information about the tor-commits mailing list