[tor-commits] [metrics-tasks/master] Support parsing of Maxmind's formats (#6471).
karsten at torproject.org
karsten at torproject.org
Tue Nov 6 15:33:21 UTC 2012
commit 2d1bd61d767f07f430106b35f67068ccb873c252
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date: Sun Nov 4 14:55:50 2012 -0500
Support parsing of Maxmind's formats (#6471).
---
task-6471/java/build.xml | 7 +
.../org/torproject/task6471/ConvertExample.java | 34 ++++
.../org/torproject/task6471/DatabaseImporter.java | 37 ++++-
.../torproject/task6471/DatabaseImporterImpl.java | 193 ++++++++++++++++++--
.../src/org/torproject/task6471/DatabaseTest.java | 48 ++++--
5 files changed, 289 insertions(+), 30 deletions(-)
diff --git a/task-6471/java/build.xml b/task-6471/java/build.xml
index bda8d25..40969ab 100644
--- a/task-6471/java/build.xml
+++ b/task-6471/java/build.xml
@@ -45,5 +45,12 @@
<classpath refid="classpath"/>
</java>
</target>
+ <target name="convert" depends="compile">
+ <java fork="true"
+ maxmemory="2048m"
+ classname="org.torproject.task6471.ConvertExample">
+ <classpath refid="classpath"/>
+ </java>
+ </target>
</project>
diff --git a/task-6471/java/src/org/torproject/task6471/ConvertExample.java b/task-6471/java/src/org/torproject/task6471/ConvertExample.java
new file mode 100644
index 0000000..c96047c
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/ConvertExample.java
@@ -0,0 +1,34 @@
+package org.torproject.task6471;
+
+public class ConvertExample {
+ public static void main(String[] args) {
+ System.out.print("Importing ASN database files... ");
+ long startMillis = System.currentTimeMillis();
+ DatabaseImporter combinedDatabase = new DatabaseImporterImpl();
+ combinedDatabase.importGeoIPASNum2FileOrDirectory("../asn");
+ long endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis.");
+
+ System.out.print("Saving combined ASN database to disk... ");
+ startMillis = endMillis;
+ combinedDatabase.saveCombinedDatabases("asn-2012-07-2012-10.csv");
+ endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis.");
+ startMillis = endMillis;
+
+ System.out.print("Importing city database files... ");
+ startMillis = System.currentTimeMillis();
+ combinedDatabase = new DatabaseImporterImpl();
+ combinedDatabase.importGeoLiteCityFileOrDirectory("../city");
+ endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis.");
+
+ System.out.print("Saving combined city database to disk... ");
+ startMillis = endMillis;
+ combinedDatabase.saveCombinedDatabases("city-2012-07-2012-10.csv");
+ endMillis = System.currentTimeMillis();
+ System.out.println((endMillis - startMillis) + " millis.");
+ startMillis = endMillis;
+
+ }
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java b/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
index 641def4..330ecea 100644
--- a/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
@@ -8,8 +8,7 @@ public interface DatabaseImporter extends Database {
* are expected to conform to the RIR Statistics Exchange Format.
* Only IPv4 address ranges are imported, whereas ASN and IPv6 lines are
* ignored. Only the country code, start address, and address range
- * length fields are imported. (TODO Extend to IPv6 and find similar
- * data source for ASN.)
+ * length fields are imported.
*
* A typical entry from a RIR file is:
* "ripencc|FR|ipv4|2.0.0.0|1048576|20100712|allocated".
@@ -28,6 +27,40 @@ public interface DatabaseImporter extends Database {
public boolean importRegionalRegistryStatsFileOrDirectory(String path);
/**
+ * Import the contents of one or more Maxmind GeoLiteCity databases,
+ * each of them consisting of two files: GeoLiteCity-Blocks.csv contains
+ * address ranges and location IDs, and GeoLiteCity-Location.csv
+ * contains country codes for location IDs, among other things. Only
+ * the range start and end addresses and the country code are imported.
+ * The database date is taken from the file modification time of the
+ * GeoLiteCity-Blocks.csv file.
+ *
+ * A typical entry from the GeoLiteCity-Blocks.csv file is:
+ * ""3758093312","3758094335","108612""
+ *
+ * A typical entry from the GeoLiteCity-Location.csv file is:
+ * "108612,"IN","09","Rajkot","",22.3000,70.7833,,"
+ */
+ public boolean importGeoLiteCityFileOrDirectory(String path);
+
+ /**
+ * Import the contents of one or more Maxmind GeoIPASNum2.csv databases.
+ * Only the range start and end addresses and the AS number are
+ * imported. The database date is taken from the file modification
+ * time.
+ *
+ * A typical entry from such a database file is:
+ * "3758063616,3758079999,"AS9381 Wharf T&T Ltd.""
+ *
+ * @param path Path to a GeoIPASNum2 .csv file or directory.
+ * @return True if importing the file or directory was successful,
+ * false otherwise.
+ */
+ public boolean importGeoIPASNum2FileOrDirectory(String path);
+
+ /* TODO Extend all imported formats to IPv6 equivalents. */
+
+ /**
* Save the combined databases in a format that can later be loaded much
* more efficiently than importing the original RIR files again.
*
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java b/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java
index 4795883..2a6c203 100644
--- a/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java
@@ -6,13 +6,18 @@ import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
+import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.SortedMap;
import java.util.Stack;
+import java.util.TimeZone;
import java.util.TreeMap;
public class DatabaseImporterImpl extends DatabaseImpl
@@ -74,7 +79,7 @@ public class DatabaseImporterImpl extends DatabaseImpl
String databaseFileName = file.getName();
while ((line = br.readLine()) != null) {
if (line.startsWith("#") || line.length() == 0) {
- /* Skip comment line. */
+ /* Skip comment or empty line. */
continue;
}
String[] parts = line.split("\\|");
@@ -101,8 +106,8 @@ public class DatabaseImporterImpl extends DatabaseImpl
}
String startAddressString = parts[3];
long addresses = Long.parseLong(parts[4]);
- this.addRange(databaseFileName, code, startAddressString,
- addresses);
+ this.addRegionalRegistryStatsFileRange(databaseFileName, code,
+ startAddressString, addresses);
}
br.close();
this.repairTree();
@@ -112,6 +117,174 @@ public class DatabaseImporterImpl extends DatabaseImpl
return true;
}
+ void addRegionalRegistryStatsFileRange(String databaseFileName,
+ String code, String startAddressString, long addresses) {
+ String databaseDateString =
+ databaseFileName.substring(databaseFileName.length() - 8);
+ int databaseDate = convertDateStringToNumber(databaseDateString);
+ long startAddress = convertAddressStringToNumber(startAddressString);
+ long endAddress = startAddress + addresses - 1L;
+ this.addRange(databaseFileName, databaseDate, startAddress,
+ endAddress, code);
+ }
+
+ public boolean importGeoLiteCityFileOrDirectory(String path) {
+ boolean allImportsSuccessful = true;
+ Stack<File> stackedFiles = new Stack<File>();
+ stackedFiles.add(new File(path));
+ SortedMap<File, Set<File>> filesByDirectory =
+ new TreeMap<File, Set<File>>();
+ while (!stackedFiles.isEmpty()) {
+ File file = stackedFiles.pop();
+ if (file.isDirectory()) {
+ stackedFiles.addAll(Arrays.asList(file.listFiles()));
+ } else if (!file.getName().endsWith(".csv")) {
+ System.err.println("Parsing other files than .csv is not "
+ + "supported: '" + file.getAbsolutePath() + "'. Skipping.");
+ } else {
+ if (!filesByDirectory.containsKey(file.getParentFile())) {
+ filesByDirectory.put(file.getParentFile(), new HashSet<File>());
+ }
+ filesByDirectory.get(file.getParentFile()).add(file);
+ }
+ }
+ for (Set<File> files : filesByDirectory.values()) {
+ File blocksFile = null, locationFile = null;
+ for (File file : files) {
+ if (file.getName().equals("GeoLiteCity-Blocks.csv")) {
+ blocksFile = file;
+ } else if (file.getName().equals("GeoLiteCity-Location.csv")) {
+ locationFile = file;
+ }
+ }
+ if (blocksFile != null && locationFile != null) {
+ if (!this.importGeoLiteCityBlocksAndLocationFiles(blocksFile,
+ locationFile)) {
+ allImportsSuccessful = false;
+ }
+ files.remove(blocksFile);
+ files.remove(locationFile);
+ if (!files.isEmpty()) {
+ System.err.println("Did not recognize the following files, or "
+ + "did not find the blocks/location equivalent:");
+ for (File file : files) {
+ System.err.println(file.getAbsolutePath());
+ }
+ }
+ }
+ }
+ return allImportsSuccessful;
+ }
+
+ boolean importGeoLiteCityBlocksAndLocationFiles(File blocksFile,
+ File locationFile) {
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+ dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ long lastModifiedMillis = blocksFile.lastModified();
+ String databaseFileName = blocksFile.getName() + " "
+ + locationFile.getName() + " "
+ + dateFormat.format(lastModifiedMillis);
+ int databaseDate = (int) (lastModifiedMillis / 86400000);
+ try {
+ /* Parse location file first and remember country codes for given
+ * locations. */
+ Map<Integer, String> locations = new HashMap<Integer, String>();
+ BufferedReader br = new BufferedReader(new FileReader(
+ locationFile));
+ String line;
+ while ((line = br.readLine()) != null) {
+ if (line.startsWith("Copyright") || line.startsWith("locId")) {
+ /* Skip copyright notice and column headers. */
+ continue;
+ }
+ String[] parts = line.split(",");
+ int location = Integer.parseInt(parts[0]);
+ String code = parts[1].replaceAll("\"", "").toLowerCase();
+ locations.put(location, code);
+ }
+ br.close();
+ /* Parse blocks file and add ranges to the database. */
+ br = new BufferedReader(new FileReader(blocksFile));
+ while ((line = br.readLine()) != null) {
+ if (!line.startsWith("\"")) {
+ /* Skip copyright notice and column headers. */
+ continue;
+ }
+ String[] parts = line.replaceAll("\"", "").split(",");
+ long startAddress = Long.parseLong(parts[0]),
+ endAddress = Long.parseLong(parts[1]);
+ int location = Integer.parseInt(parts[2]);
+ if (!locations.containsKey(location)) {
+ System.err.println(blocksFile.getAbsolutePath() + " contains "
+ + "line '" + line + "' that doesn't match any line in "
+ + locationFile.getAbsolutePath() + ". Aborting.");
+ break;
+ }
+ String code = locations.get(location);
+ this.addRange(databaseFileName, databaseDate, startAddress,
+ endAddress, code);
+ }
+ br.close();
+ } catch (IOException e) {
+ return false;
+ }
+ return true;
+ }
+
+ public boolean importGeoIPASNum2FileOrDirectory(String path) {
+ boolean allImportsSuccessful = true;
+ Stack<File> stackedFiles = new Stack<File>();
+ stackedFiles.add(new File(path));
+ List<File> allFiles = new ArrayList<File>();
+ while (!stackedFiles.isEmpty()) {
+ File file = stackedFiles.pop();
+ if (file.isDirectory()) {
+ stackedFiles.addAll(Arrays.asList(file.listFiles()));
+ } else if (!file.getName().endsWith(".csv")) {
+ System.err.println("Parsing other files than .csv is not "
+ + "supported: '" + file.getAbsolutePath() + "'. Skipping.");
+ } else {
+ allFiles.add(file);
+ }
+ }
+ Collections.sort(allFiles, Collections.reverseOrder());
+ for (File file : allFiles) {
+ if (!this.importGeoIPASNum2File(file)) {
+ allImportsSuccessful = false;
+ }
+ }
+ return allImportsSuccessful;
+ }
+
+ private boolean importGeoIPASNum2File(File file) {
+ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+ dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+ long lastModifiedMillis = file.lastModified();
+ String databaseFileName = file.getName() + " "
+ + dateFormat.format(lastModifiedMillis);
+ int databaseDate = (int) (lastModifiedMillis / 86400000);
+ try {
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ String line;
+ while ((line = br.readLine()) != null) {
+ String[] parts = line.split(",");
+ long startAddress = Long.parseLong(parts[0]),
+ endAddress = Long.parseLong(parts[1]);
+ String code = parts[2].split(" ")[0].replaceAll("\"", "");
+ if (!code.startsWith("AS")) {
+ /* Don't import illegal range. */
+ continue;
+ }
+ this.addRange(databaseFileName, databaseDate, startAddress,
+ endAddress, code);
+ }
+ br.close();
+ this.repairTree();
+ } catch (IOException e) {
+ return false;
+ }
+ return true;
+ }
/**
* Internal counters for import statistics.
@@ -127,15 +300,9 @@ public class DatabaseImporterImpl extends DatabaseImpl
* is called prior to any lookupAddress() calls. No further checks are
* performed that the tree is repaired before looking up an address.
*/
- void addRange(String databaseFileName, String code,
- String startAddressString, long addresses) {
-
+ void addRange(String databaseFileName, int databaseDate,
+ long startAddress, long endAddress, String code) {
this.rangeImports++;
- String databaseDateString =
- databaseFileName.substring(databaseFileName.length() - 8);
- int databaseDate = convertDateStringToNumber(databaseDateString);
- long startAddress = convertAddressStringToNumber(startAddressString);
- long endAddress = startAddress + addresses - 1L;
/* Add new database date and file name if we didn't know them yet,
* and note that we need to repair the tree after importing. */
@@ -337,6 +504,9 @@ public class DatabaseImporterImpl extends DatabaseImpl
* interface, because the caller needs to make sure that repairTree()
* is called prior to any lookupAddress() calls. No further checks are
* performed that the tree is repaired before look up an address.
+ *
+ * TODO While repairing the tree, we might also optimize it by merging
+ * adjacent address ranges with the same database date ranges.
*/
void repairTree() {
if (this.addedDatabaseDate < 0) {
@@ -429,7 +599,6 @@ public class DatabaseImporterImpl extends DatabaseImpl
}
return true;
}
-
/* Return a nicely formatted string summarizing database contents and
* usage statistics. */
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
index fb0f19d..236810b 100644
--- a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java
@@ -12,7 +12,8 @@ public class DatabaseTest {
@Test()
public void testSingleIpRangeSingleDatebase() {
DatabaseImporterImpl database = new DatabaseImporterImpl();
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
database.repairTree();
assertEquals(1, database.getNumberOfElements());
assertEquals(null, database.lookupIpv4AddressAndDate(
@@ -60,8 +61,10 @@ public class DatabaseTest {
@Test()
public void testTwoAdjacentIpRangesSingleDatabase() {
DatabaseImporterImpl database = new DatabaseImporterImpl();
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
- database.addRange("20120901", "ca", "4.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "ca",
+ "4.0.0.0", 16777216);
database.repairTree();
assertEquals(2, database.getNumberOfElements());
assertEquals(null, database.lookupIpv4AddressAndDate(
@@ -85,8 +88,10 @@ public class DatabaseTest {
@Test()
public void testTwoNonAdjacentIpDateRangesSingleDatabase() {
DatabaseImporterImpl database = new DatabaseImporterImpl();
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
- database.addRange("20120901", "ca", "6.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "ca",
+ "6.0.0.0", 16777216);
database.repairTree();
assertEquals(2, database.getNumberOfElements());
assertEquals(null, database.lookupIpv4AddressAndDate(
@@ -104,8 +109,10 @@ public class DatabaseTest {
@Test()
public void testDuplicateImport() {
DatabaseImporterImpl database = new DatabaseImporterImpl();
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
database.repairTree();
assertEquals(1, database.getNumberOfElements());
assertEquals(null, database.lookupIpv4AddressAndDate(
@@ -119,8 +126,10 @@ public class DatabaseTest {
@Test()
public void testDuplicateImportDifferentCode() {
DatabaseImporterImpl database = new DatabaseImporterImpl();
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
- database.addRange("20120901", "ca", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "ca",
+ "3.0.0.0", 16777216);
database.repairTree();
assertEquals(1, database.getNumberOfElements());
assertEquals("us", database.lookupIpv4AddressAndDate(
@@ -130,9 +139,11 @@ public class DatabaseTest {
@Test()
public void testLeaveIpChangeUnchanged() {
DatabaseImporterImpl database = new DatabaseImporterImpl();
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
database.repairTree();
- database.addRange("20121001", "us", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20121001", "us",
+ "3.0.0.0", 16777216);
database.repairTree();
assertEquals(1, database.getNumberOfElements());
assertEquals("us", database.lookupIpv4AddressAndDate(
@@ -148,9 +159,11 @@ public class DatabaseTest {
@Test()
public void testLeaveIpChangeUnchangedReverseOrder() {
DatabaseImporterImpl database = new DatabaseImporterImpl();
- database.addRange("20121001", "us", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20121001", "us",
+ "3.0.0.0", 16777216);
database.repairTree();
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
database.repairTree();
assertEquals(1, database.getNumberOfElements());
assertEquals("us", database.lookupIpv4AddressAndDate(
@@ -166,11 +179,14 @@ public class DatabaseTest {
@Test()
public void testMissingIpRange() {
DatabaseImporterImpl database = new DatabaseImporterImpl();
- database.addRange("20120901", "us", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20120901", "us",
+ "3.0.0.0", 16777216);
database.repairTree();
- database.addRange("20121101", "us", "3.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20121101", "us",
+ "3.0.0.0", 16777216);
database.repairTree();
- database.addRange("20121001", "us", "6.0.0.0", 16777216);
+ database.addRegionalRegistryStatsFileRange("20121001", "us",
+ "6.0.0.0", 16777216);
database.repairTree();
assertEquals(3, database.getNumberOfElements());
assertEquals("us", database.lookupIpv4AddressAndDate(
More information about the tor-commits
mailing list