[tor-commits] [metrics-tasks/master] Support parsing of Maxmind's formats (#6471).

karsten at torproject.org karsten at torproject.org
Tue Nov 6 15:33:21 UTC 2012


commit 2d1bd61d767f07f430106b35f67068ccb873c252
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Sun Nov 4 14:55:50 2012 -0500

    Support parsing of Maxmind's formats (#6471).
---
 task-6471/java/build.xml                           |    7 +
 .../org/torproject/task6471/ConvertExample.java    |   34 ++++
 .../org/torproject/task6471/DatabaseImporter.java  |   37 ++++-
 .../torproject/task6471/DatabaseImporterImpl.java  |  193 ++++++++++++++++++--
 .../src/org/torproject/task6471/DatabaseTest.java  |   48 ++++--
 5 files changed, 289 insertions(+), 30 deletions(-)

diff --git a/task-6471/java/build.xml b/task-6471/java/build.xml
index bda8d25..40969ab 100644
--- a/task-6471/java/build.xml
+++ b/task-6471/java/build.xml
@@ -45,5 +45,12 @@
       <classpath refid="classpath"/>
     </java>
   </target>
+  <target name="convert" depends="compile">
+    <java fork="true"
+          maxmemory="2048m"
+          classname="org.torproject.task6471.ConvertExample">
+      <classpath refid="classpath"/>
+    </java>
+  </target>
 </project>
 
diff --git a/task-6471/java/src/org/torproject/task6471/ConvertExample.java b/task-6471/java/src/org/torproject/task6471/ConvertExample.java
new file mode 100644
index 0000000..c96047c
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/ConvertExample.java
@@ -0,0 +1,34 @@
+package org.torproject.task6471;
+
+public class ConvertExample {
+  public static void main(String[] args) {
+    System.out.print("Importing ASN database files... ");
+    long startMillis = System.currentTimeMillis();
+    DatabaseImporter combinedDatabase = new DatabaseImporterImpl();
+    combinedDatabase.importGeoIPASNum2FileOrDirectory("../asn");
+    long endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+
+    System.out.print("Saving combined ASN database to disk... ");
+    startMillis = endMillis;
+    combinedDatabase.saveCombinedDatabases("asn-2012-07-2012-10.csv");
+    endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+    startMillis = endMillis;
+
+    System.out.print("Importing city database files... ");
+    startMillis = System.currentTimeMillis();
+    combinedDatabase = new DatabaseImporterImpl();
+    combinedDatabase.importGeoLiteCityFileOrDirectory("../city");
+    endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+
+    System.out.print("Saving combined city database to disk... ");
+    startMillis = endMillis;
+    combinedDatabase.saveCombinedDatabases("city-2012-07-2012-10.csv");
+    endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+    startMillis = endMillis;
+
+  }
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java b/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
index 641def4..330ecea 100644
--- a/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
@@ -8,8 +8,7 @@ public interface DatabaseImporter extends Database {
    * are expected to conform to the RIR Statistics Exchange Format.
    * Only IPv4 address ranges are imported, whereas ASN and IPv6 lines are
    * ignored.  Only the country code, start address, and address range
-   * length fields are imported.  (TODO Extend to IPv6 and find similar
-   * data source for ASN.)
+   * length fields are imported.
    *
    * A typical entry from a RIR file is:
    *   "ripencc|FR|ipv4|2.0.0.0|1048576|20100712|allocated".
@@ -28,6 +27,40 @@ public interface DatabaseImporter extends Database {
   public boolean importRegionalRegistryStatsFileOrDirectory(String path);
 
   /**
+   * Import the contents of one or more Maxmind GeoLiteCity databases,
+   * each of them consisting of two files: GeoLiteCity-Blocks.csv contains
+   * address ranges and location IDs, and GeoLiteCity-Location.csv
+   * contains country codes for location IDs, among other things.  Only
+   * the range start and end addresses and the country code are imported.
+   * The database date is taken from the file modification time of the
+   * GeoLiteCity-Blocks.csv file.
+   *
+   * A typical entry from the GeoLiteCity-Blocks.csv file is:
+   *   ""3758093312","3758094335","108612""
+   *
+   * A typical entry from the GeoLiteCity-Location.csv file is:
+   *   "108612,"IN","09","Rajkot","",22.3000,70.7833,,"
+   */
+  public boolean importGeoLiteCityFileOrDirectory(String path);
+
+  /**
+   * Import the contents of one or more Maxmind GeoIPASNum2.csv databases.
+   * Only the range start and end addresses and the AS number are
+   * imported.  The database date is taken from the file modification
+   * time.
+   *
+   * A typical entry from such a database file is:
+   *   "3758063616,3758079999,"AS9381 Wharf T&T Ltd.""
+   *
+   * @param path Path to a GeoIPASNum2.csv file or directory.
+   * @return True if importing the file or directory was successful,
+   *         false otherwise.
+   */
+  public boolean importGeoIPASNum2FileOrDirectory(String path);
+
+  /* TODO Extend all imported formats to IPv6 equivalents. */
+
+  /**
    * Save the combined databases in a format that can later be loaded much
    * more efficiently than importing the original RIR files again.
    *
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java b/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java
index 4795883..2a6c203 100644
--- a/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java
@@ -6,13 +6,18 @@ import java.io.File;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
+import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.Stack;
+import java.util.TimeZone;
 import java.util.TreeMap;
 
 public class DatabaseImporterImpl extends DatabaseImpl
@@ -74,7 +79,7 @@ public class DatabaseImporterImpl extends DatabaseImpl
       String databaseFileName = file.getName();
       while ((line = br.readLine()) != null) {
         if (line.startsWith("#") || line.length() == 0) {
-          /* Skip comment line. */
+          /* Skip comment or empty line. */
           continue;
         }
         String[] parts = line.split("\\|");
@@ -101,8 +106,8 @@ public class DatabaseImporterImpl extends DatabaseImpl
         }
         String startAddressString = parts[3];
         long addresses = Long.parseLong(parts[4]);
-        this.addRange(databaseFileName, code, startAddressString,
-            addresses);
+        this.addRegionalRegistryStatsFileRange(databaseFileName, code,
+            startAddressString, addresses);
       }
       br.close();
       this.repairTree();
@@ -112,6 +117,174 @@ public class DatabaseImporterImpl extends DatabaseImpl
     return true;
   }
 
+  void addRegionalRegistryStatsFileRange(String databaseFileName,
+      String code, String startAddressString, long addresses) {
+    String databaseDateString =
+        databaseFileName.substring(databaseFileName.length() - 8);
+    int databaseDate = convertDateStringToNumber(databaseDateString);
+    long startAddress = convertAddressStringToNumber(startAddressString);
+    long endAddress = startAddress + addresses - 1L;
+    this.addRange(databaseFileName, databaseDate, startAddress,
+        endAddress, code);
+  }
+
+  public boolean importGeoLiteCityFileOrDirectory(String path) {
+    boolean allImportsSuccessful = true;
+    Stack<File> stackedFiles = new Stack<File>();
+    stackedFiles.add(new File(path));
+    SortedMap<File, Set<File>> filesByDirectory =
+        new TreeMap<File, Set<File>>();
+    while (!stackedFiles.isEmpty()) {
+      File file = stackedFiles.pop();
+      if (file.isDirectory()) {
+        stackedFiles.addAll(Arrays.asList(file.listFiles()));
+      } else if (!file.getName().endsWith(".csv")) {
+        System.err.println("Parsing other files than .csv is not "
+            + "supported: '" + file.getAbsolutePath() + "'.  Skipping.");
+      } else {
+        if (!filesByDirectory.containsKey(file.getParentFile())) {
+          filesByDirectory.put(file.getParentFile(), new HashSet<File>());
+        }
+        filesByDirectory.get(file.getParentFile()).add(file);
+      }
+    }
+    for (Set<File> files : filesByDirectory.values()) {
+      File blocksFile = null, locationFile = null;
+      for (File file : files) {
+        if (file.getName().equals("GeoLiteCity-Blocks.csv")) {
+          blocksFile = file;
+        } else if (file.getName().equals("GeoLiteCity-Location.csv")) {
+          locationFile = file;
+        }
+      }
+      if (blocksFile != null && locationFile != null) {
+        if (!this.importGeoLiteCityBlocksAndLocationFiles(blocksFile,
+            locationFile)) {
+          allImportsSuccessful = false;
+        }
+        files.remove(blocksFile);
+        files.remove(locationFile);
+        if (!files.isEmpty()) {
+          System.err.println("Did not recognize the following files, or "
+              + "did not find the blocks/location equivalent:");
+          for (File file : files) {
+            System.err.println(file.getAbsolutePath());
+          }
+        }
+      }
+    }
+    return allImportsSuccessful;
+  }
+
+  boolean importGeoLiteCityBlocksAndLocationFiles(File blocksFile,
+      File locationFile) {
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    long lastModifiedMillis = blocksFile.lastModified();
+    String databaseFileName = blocksFile.getName() + " "
+        + locationFile.getName() + " "
+        + dateFormat.format(lastModifiedMillis);
+    int databaseDate = (int) (lastModifiedMillis / 86400000);
+    try {
+      /* Parse location file first and remember country codes for given
+       * locations. */
+      Map<Integer, String> locations = new HashMap<Integer, String>();
+      BufferedReader br = new BufferedReader(new FileReader(
+          locationFile));
+      String line;
+      while ((line = br.readLine()) != null) {
+        if (line.startsWith("Copyright") || line.startsWith("locId")) {
+          /* Skip copyright notice and column headers. */
+          continue;
+        }
+        String[] parts = line.split(",");
+        int location = Integer.parseInt(parts[0]);
+        String code = parts[1].replaceAll("\"", "").toLowerCase();
+        locations.put(location, code);
+      }
+      br.close();
+      /* Parse blocks file and add ranges to the database. */
+      br = new BufferedReader(new FileReader(blocksFile));
+      while ((line = br.readLine()) != null) {
+        if (!line.startsWith("\"")) {
+          /* Skip copyright notice and column headers. */
+          continue;
+        }
+        String[] parts = line.replaceAll("\"", "").split(",");
+        long startAddress = Long.parseLong(parts[0]),
+            endAddress = Long.parseLong(parts[1]);
+        int location = Integer.parseInt(parts[2]);
+        if (!locations.containsKey(location)) {
+          System.err.println(blocksFile.getAbsolutePath() + " contains "
+              + "line '" + line + "' that doesn't match any line in "
+              + locationFile.getAbsolutePath() + ".  Aborting.");
+          break;
+        }
+        String code = locations.get(location);
+        this.addRange(databaseFileName, databaseDate, startAddress,
+            endAddress, code);
+      }
+      br.close();
+    } catch (IOException e) {
+      return false;
+    }
+    return true;
+  }
+
+  public boolean importGeoIPASNum2FileOrDirectory(String path) {
+    boolean allImportsSuccessful = true;
+    Stack<File> stackedFiles = new Stack<File>();
+    stackedFiles.add(new File(path));
+    List<File> allFiles = new ArrayList<File>();
+    while (!stackedFiles.isEmpty()) {
+      File file = stackedFiles.pop();
+      if (file.isDirectory()) {
+        stackedFiles.addAll(Arrays.asList(file.listFiles()));
+      } else if (!file.getName().endsWith(".csv")) {
+        System.err.println("Parsing other files than .csv is not "
+            + "supported: '" + file.getAbsolutePath() + "'.  Skipping.");
+      } else {
+        allFiles.add(file);
+      }
+    }
+    Collections.sort(allFiles, Collections.reverseOrder());
+    for (File file : allFiles) {
+      if (!this.importGeoIPASNum2File(file)) {
+        allImportsSuccessful = false;
+      }
+    }
+    return allImportsSuccessful;
+  }
+
+  private boolean importGeoIPASNum2File(File file) {
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    long lastModifiedMillis = file.lastModified();
+    String databaseFileName = file.getName() + " "
+        + dateFormat.format(lastModifiedMillis);
+    int databaseDate = (int) (lastModifiedMillis / 86400000);
+    try {
+      BufferedReader br = new BufferedReader(new FileReader(file));
+      String line;
+      while ((line = br.readLine()) != null) {
+        String[] parts = line.split(",");
+        long startAddress = Long.parseLong(parts[0]),
+            endAddress = Long.parseLong(parts[1]);
+        String code = parts[2].split(" ")[0].replaceAll("\"", "");
+        if (!code.startsWith("AS")) {
+          /* Skip ranges whose third field doesn't start with "AS". */
+          continue;
+        }
+        this.addRange(databaseFileName, databaseDate, startAddress,
+            endAddress, code);
+      }
+      br.close();
+      this.repairTree();
+    } catch (IOException e) {
+      return false;
+    }
+    return true;
+  }
 
   /**
    * Internal counters for import statistics.
@@ -127,15 +300,9 @@ public class DatabaseImporterImpl extends DatabaseImpl
    * is called prior to any lookupAddress() calls.  No further checks are
    * performed that the tree is repaired before looking up an address.
    */
-  void addRange(String databaseFileName, String code,
-      String startAddressString, long addresses) {
-
+  void addRange(String databaseFileName, int databaseDate,
+      long startAddress, long endAddress, String code) {
     this.rangeImports++;
-    String databaseDateString =
-        databaseFileName.substring(databaseFileName.length() - 8);
-    int databaseDate = convertDateStringToNumber(databaseDateString);
-    long startAddress = convertAddressStringToNumber(startAddressString);
-    long endAddress = startAddress + addresses - 1L;
 
     /* Add new database date and file name if we didn't know them yet,
      * and note that we need to repair the tree after importing. */
@@ -337,6 +504,9 @@ public class DatabaseImporterImpl extends DatabaseImpl
    * interface, because the caller needs to make sure that repairTree()
    * is called prior to any lookupAddress() calls.  No further checks are
    * performed that the tree is repaired before look up an address.
+   *
+   * TODO While repairing the tree, we might also optimize it by merging
+   * adjacent address ranges with the same database date ranges.
    */
   void repairTree() {
     if (this.addedDatabaseDate < 0) {
@@ -429,7 +599,6 @@ public class DatabaseImporterImpl extends DatabaseImpl
     }
     return true;
   }
-  
 
   /* Return a nicely formatted string summarizing database contents and
    * usage statistics. */
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
index fb0f19d..236810b 100644
--- a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
@@ -12,7 +12,8 @@ public class DatabaseTest {
   @Test()
   public void testSingleIpRangeSingleDatebase() {
     DatabaseImporterImpl database = new DatabaseImporterImpl();
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
     database.repairTree();
     assertEquals(1, database.getNumberOfElements());
     assertEquals(null, database.lookupIpv4AddressAndDate(
@@ -60,8 +61,10 @@ public class DatabaseTest {
   @Test()
   public void testTwoAdjacentIpRangesSingleDatabase() {
     DatabaseImporterImpl database = new DatabaseImporterImpl();
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
-    database.addRange("20120901", "ca", "4.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "ca",
+        "4.0.0.0", 16777216);
     database.repairTree();
     assertEquals(2, database.getNumberOfElements());
     assertEquals(null, database.lookupIpv4AddressAndDate(
@@ -85,8 +88,10 @@ public class DatabaseTest {
   @Test()
   public void testTwoNonAdjacentIpDateRangesSingleDatabase() {
     DatabaseImporterImpl database = new DatabaseImporterImpl();
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
-    database.addRange("20120901", "ca", "6.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "ca",
+        "6.0.0.0", 16777216);
     database.repairTree();
     assertEquals(2, database.getNumberOfElements());
     assertEquals(null, database.lookupIpv4AddressAndDate(
@@ -104,8 +109,10 @@ public class DatabaseTest {
   @Test()
   public void testDuplicateImport() {
     DatabaseImporterImpl database = new DatabaseImporterImpl();
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
     database.repairTree();
     assertEquals(1, database.getNumberOfElements());
     assertEquals(null, database.lookupIpv4AddressAndDate(
@@ -119,8 +126,10 @@ public class DatabaseTest {
   @Test()
   public void testDuplicateImportDifferentCode() {
     DatabaseImporterImpl database = new DatabaseImporterImpl();
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
-    database.addRange("20120901", "ca", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "ca",
+        "3.0.0.0", 16777216);
     database.repairTree();
     assertEquals(1, database.getNumberOfElements());
     assertEquals("us", database.lookupIpv4AddressAndDate(
@@ -130,9 +139,11 @@ public class DatabaseTest {
   @Test()
   public void testLeaveIpChangeUnchanged() {
     DatabaseImporterImpl database = new DatabaseImporterImpl();
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
     database.repairTree();
-    database.addRange("20121001", "us", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20121001", "us",
+        "3.0.0.0", 16777216);
     database.repairTree();
     assertEquals(1, database.getNumberOfElements());
     assertEquals("us", database.lookupIpv4AddressAndDate(
@@ -148,9 +159,11 @@ public class DatabaseTest {
   @Test()
   public void testLeaveIpChangeUnchangedReverseOrder() {
     DatabaseImporterImpl database = new DatabaseImporterImpl();
-    database.addRange("20121001", "us", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20121001", "us",
+        "3.0.0.0", 16777216);
     database.repairTree();
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
     database.repairTree();
     assertEquals(1, database.getNumberOfElements());
     assertEquals("us", database.lookupIpv4AddressAndDate(
@@ -166,11 +179,14 @@ public class DatabaseTest {
   @Test()
   public void testMissingIpRange() {
     DatabaseImporterImpl database = new DatabaseImporterImpl();
-    database.addRange("20120901", "us", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20120901", "us",
+        "3.0.0.0", 16777216);
     database.repairTree();
-    database.addRange("20121101", "us", "3.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20121101", "us",
+        "3.0.0.0", 16777216);
     database.repairTree();
-    database.addRange("20121001", "us", "6.0.0.0", 16777216);
+    database.addRegionalRegistryStatsFileRange("20121001", "us",
+        "6.0.0.0", 16777216);
     database.repairTree();
     assertEquals(3, database.getNumberOfElements());
     assertEquals("us", database.lookupIpv4AddressAndDate(





More information about the tor-commits mailing list