commit 2d1bd61d767f07f430106b35f67068ccb873c252 Author: Karsten Loesing karsten.loesing@gmx.net Date: Sun Nov 4 14:55:50 2012 -0500
Support parsing of Maxmind's formats (#6471). --- task-6471/java/build.xml | 7 + .../org/torproject/task6471/ConvertExample.java | 34 ++++ .../org/torproject/task6471/DatabaseImporter.java | 37 ++++- .../torproject/task6471/DatabaseImporterImpl.java | 193 ++++++++++++++++++-- .../src/org/torproject/task6471/DatabaseTest.java | 48 ++++-- 5 files changed, 289 insertions(+), 30 deletions(-)
diff --git a/task-6471/java/build.xml b/task-6471/java/build.xml index bda8d25..40969ab 100644 --- a/task-6471/java/build.xml +++ b/task-6471/java/build.xml @@ -45,5 +45,12 @@ <classpath refid="classpath"/> </java> </target> + <target name="convert" depends="compile"> + <java fork="true" + maxmemory="2048m" + classname="org.torproject.task6471.ConvertExample"> + <classpath refid="classpath"/> + </java> + </target> </project>
diff --git a/task-6471/java/src/org/torproject/task6471/ConvertExample.java b/task-6471/java/src/org/torproject/task6471/ConvertExample.java
new file mode 100644
index 0000000..c96047c
--- /dev/null
+++ b/task-6471/java/src/org/torproject/task6471/ConvertExample.java
@@ -0,0 +1,53 @@
+package org.torproject.task6471;
+
+/**
+ * Example program that imports Maxmind's ASN databases from "../asn" and
+ * Maxmind's city databases from "../city", and saves each of the two as
+ * a combined database file.  Progress and timings go to stdout.
+ */
+public class ConvertExample {
+  public static void main(String[] args) {
+    System.out.print("Importing ASN database files... ");
+    long startMillis = System.currentTimeMillis();
+    DatabaseImporter combinedDatabase = new DatabaseImporterImpl();
+    boolean importedAll =
+        combinedDatabase.importGeoIPASNum2FileOrDirectory("../asn");
+    startMillis = printElapsedMillis(startMillis);
+    warnIfIncomplete(importedAll, "../asn");
+
+    System.out.print("Saving combined ASN database to disk... ");
+    combinedDatabase.saveCombinedDatabases("asn-2012-07-2012-10.csv");
+    startMillis = printElapsedMillis(startMillis);
+
+    /* Start over with an empty importer for the city databases. */
+    System.out.print("Importing city database files... ");
+    combinedDatabase = new DatabaseImporterImpl();
+    importedAll =
+        combinedDatabase.importGeoLiteCityFileOrDirectory("../city");
+    startMillis = printElapsedMillis(startMillis);
+    warnIfIncomplete(importedAll, "../city");
+
+    System.out.print("Saving combined city database to disk... ");
+    combinedDatabase.saveCombinedDatabases("city-2012-07-2012-10.csv");
+    printElapsedMillis(startMillis);
+  }
+
+  /* Finish the current progress line by printing the milliseconds
+   * elapsed since startMillis, and return the current time to be used
+   * as start time of the next step. */
+  private static long printElapsedMillis(long startMillis) {
+    long endMillis = System.currentTimeMillis();
+    System.out.println((endMillis - startMillis) + " millis.");
+    return endMillis;
+  }
+
+  /* Warn on stderr if an import step reported a failure, rather than
+   * silently saving a possibly incomplete database. */
+  private static void warnIfIncomplete(boolean importedAll,
+      String path) {
+    if (!importedAll) {
+      System.err.println("Warning: Not all database files in '" + path
+          + "' could be imported.");
+    }
+  }
+}
diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java b/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
index 641def4..330ecea 100644
--- a/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
+++ b/task-6471/java/src/org/torproject/task6471/DatabaseImporter.java
@@ -8,8 +8,7 @@ public interface DatabaseImporter extends Database {
    * are expected to conform to the RIR Statistics Exchange Format.
    * Only IPv4 address ranges are imported, whereas ASN and IPv6 lines are
    * ignored. Only the country code, start address, and address range
-   * length fields are imported. (TODO Extend to IPv6 and find similar
-   * data source for ASN.)
+   * length fields are imported.
    *
    * A typical entry from a RIR file is:
    * "ripencc|FR|ipv4|2.0.0.0|1048576|20100712|allocated".
@@ -28,6 +27,45 @@ public interface DatabaseImporter extends Database {
   public boolean importRegionalRegistryStatsFileOrDirectory(String path);
 
   /**
+   * Import the contents of one or more Maxmind GeoLiteCity databases,
+   * each of them consisting of two files: GeoLiteCity-Blocks.csv contains
+   * address ranges and block numbers, and GeoLiteCity-Location.csv
+   * contains country codes for block numbers, among other things. Only
+   * the range start and end addresses and the country code are imported.
+   * The database date is taken from the file modification time of the
+   * GeoLiteCity-Blocks.csv file.
+   *
+   * A typical entry from the GeoLiteCity-Blocks.csv file is:
+   * ""3758093312","3758094335","108612""
+   *
+   * A typical entry from the GeoLiteCity-Location.csv file is:
+   * "108612,"IN","09","Rajkot","",22.3000,70.7833,,"
+   *
+   * @param path Path to a directory containing the two files of a
+   * database, or to a parent directory of such directories.
+   * @return True if importing the file or directory was successful,
+   * false otherwise.
+   */
+  public boolean importGeoLiteCityFileOrDirectory(String path);
+
+  /**
+   * Import the contents of one or more Maxmind GeoIPASNum2.csv databases.
+   * Only the range start and end addresses and the AS number are
+   * imported. The database date is taken from the file modification
+   * time.
+   *
+   * A typical entry from such a database file is:
+   * "3758063616,3758079999,"AS9381 Wharf T&T Ltd.""
+   *
+   * @param path Path to a GeoIPASNum2.csv file or directory.
+   * @return True if importing the file or directory was successful,
+   * false otherwise.
+   */
+  public boolean importGeoIPASNum2FileOrDirectory(String path);
+
+  /* TODO Extend all imported formats to IPv6 equivalents. */
+
+  /**
    * Save the combined databases in a format that can later be loaded much
    * more efficiently than importing the original RIR files again.
* diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java b/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java index 4795883..2a6c203 100644 --- a/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java +++ b/task-6471/java/src/org/torproject/task6471/DatabaseImporterImpl.java @@ -6,13 +6,18 @@ import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; +import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.SortedMap; import java.util.Stack; +import java.util.TimeZone; import java.util.TreeMap;
public class DatabaseImporterImpl extends DatabaseImpl @@ -74,7 +79,7 @@ public class DatabaseImporterImpl extends DatabaseImpl String databaseFileName = file.getName(); while ((line = br.readLine()) != null) { if (line.startsWith("#") || line.length() == 0) { - /* Skip comment line. */ + /* Skip comment or empty line. */ continue; } String[] parts = line.split("\|"); @@ -101,8 +106,8 @@ public class DatabaseImporterImpl extends DatabaseImpl } String startAddressString = parts[3]; long addresses = Long.parseLong(parts[4]); - this.addRange(databaseFileName, code, startAddressString, - addresses); + this.addRegionalRegistryStatsFileRange(databaseFileName, code, + startAddressString, addresses); } br.close(); this.repairTree(); @@ -112,6 +117,174 @@ public class DatabaseImporterImpl extends DatabaseImpl return true; }
+  /**
+   * Add a single address range parsed from a regional registry stats
+   * file.  The database date is taken from the last eight characters of
+   * the database file name, e.g., "20120901", and the end address is
+   * computed from the start address and the number of addresses.  As
+   * with addRange(), repairTree() must be called before any lookups.
+   */
+  void addRegionalRegistryStatsFileRange(String databaseFileName,
+      String code, String startAddressString, long addresses) {
+    String databaseDateString =
+        databaseFileName.substring(databaseFileName.length() - 8);
+    int databaseDate = convertDateStringToNumber(databaseDateString);
+    long startAddress = convertAddressStringToNumber(startAddressString);
+    long endAddress = startAddress + addresses - 1L;
+    this.addRange(databaseFileName, databaseDate, startAddress,
+        endAddress, code);
+  }
+
+  /**
+   * Import all GeoLiteCity databases found in the given directory or its
+   * subdirectories.  A database consists of a GeoLiteCity-Blocks.csv and
+   * a GeoLiteCity-Location.csv file in the same directory.  All other
+   * .csv files, including a blocks or location file without its
+   * counterpart, are reported on stderr.
+   *
+   * @param path Path to a file or directory to search for .csv files.
+   * @return True if every database found was imported successfully,
+   * false if an import failed or a database was incomplete.
+   */
+  public boolean importGeoLiteCityFileOrDirectory(String path) {
+    boolean allImportsSuccessful = true;
+    SortedMap<File, Set<File>> filesByDirectory =
+        new TreeMap<File, Set<File>>();
+    for (File file : listCsvFilesRecursively(path)) {
+      /* Resolve to an absolute file first: a relative file name without
+       * directory part has no parent, and the map rejects null keys. */
+      File directory = file.getAbsoluteFile().getParentFile();
+      Set<File> files = filesByDirectory.get(directory);
+      if (files == null) {
+        files = new HashSet<File>();
+        filesByDirectory.put(directory, files);
+      }
+      files.add(file);
+    }
+    for (Set<File> files : filesByDirectory.values()) {
+      File blocksFile = null, locationFile = null;
+      for (File file : files) {
+        if (file.getName().equals("GeoLiteCity-Blocks.csv")) {
+          blocksFile = file;
+        } else if (file.getName().equals("GeoLiteCity-Location.csv")) {
+          locationFile = file;
+        }
+      }
+      if (blocksFile != null && locationFile != null) {
+        if (!this.importGeoLiteCityBlocksAndLocationFiles(blocksFile,
+            locationFile)) {
+          allImportsSuccessful = false;
+        }
+        files.remove(blocksFile);
+        files.remove(locationFile);
+      } else if (blocksFile != null || locationFile != null) {
+        /* Half a database cannot be imported.  The lone file stays in
+         * the set and is listed in the report below. */
+        allImportsSuccessful = false;
+      }
+      /* Report leftovers for every directory, not only for directories
+       * with a complete database; otherwise a missing blocks/location
+       * equivalent would never be reported. */
+      if (!files.isEmpty()) {
+        System.err.println("Did not recognize the following files, or "
+            + "did not find the blocks/location equivalent:");
+        for (File file : files) {
+          System.err.println(file.getAbsolutePath());
+        }
+      }
+    }
+    return allImportsSuccessful;
+  }
+
+  /**
+   * Import a single GeoLiteCity database consisting of a blocks and a
+   * location file.  The database date is the modification time of the
+   * blocks file, in days since the epoch.
+   *
+   * @return True if both files were read and parsed completely, false
+   * otherwise.
+   */
+  boolean importGeoLiteCityBlocksAndLocationFiles(File blocksFile,
+      File locationFile) {
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    long lastModifiedMillis = blocksFile.lastModified();
+    String databaseFileName = blocksFile.getName() + " "
+        + locationFile.getName() + " "
+        + dateFormat.format(Long.valueOf(lastModifiedMillis));
+    int databaseDate = (int) (lastModifiedMillis / 86400000L);
+
+    /* Parse location file first and remember country codes for given
+     * locations. */
+    Map<Integer, String> locations = new HashMap<Integer, String>();
+    if (!parseGeoLiteCityLocations(locationFile, locations)) {
+      return false;
+    }
+
+    /* Parse blocks file and add ranges to the database. */
+    boolean success = this.parseGeoLiteCityBlocks(blocksFile,
+        locationFile, locations, databaseFileName, databaseDate);
+
+    /* Repair the tree like the other importers do after adding ranges,
+     * even if parsing was aborted, because ranges may have been added.
+     * NOTE(review): assumes repairTree() is safe to call when no range
+     * was added, as the ASN importer does for empty files -- confirm. */
+    this.repairTree();
+    return success;
+  }
+
+  /* Read country codes by location number from a GeoLiteCity location
+   * file into the given map.  Return false if the file cannot be read
+   * or contains a line that cannot be parsed. */
+  private static boolean parseGeoLiteCityLocations(File locationFile,
+      Map<Integer, String> locations) {
+    BufferedReader br = null;
+    String line = null;
+    try {
+      br = new BufferedReader(new FileReader(locationFile));
+      while ((line = br.readLine()) != null) {
+        if (line.length() == 0 || line.startsWith("Copyright")
+            || line.startsWith("locId")) {
+          /* Skip empty line, copyright notice, and column headers. */
+          continue;
+        }
+        String[] parts = line.split(",");
+        if (parts.length < 2) {
+          printUnparseableLine(locationFile, line);
+          return false;
+        }
+        int location = Integer.parseInt(parts[0]);
+        /* Lower-case independent of the default locale; a Turkish
+         * locale would not turn "IN" into "in". */
+        String code = parts[1].replace("\"", "")
+            .toLowerCase(java.util.Locale.US);
+        locations.put(location, code);
+      }
+      return true;
+    } catch (NumberFormatException e) {
+      printUnparseableLine(locationFile, line);
+      return false;
+    } catch (IOException e) {
+      printReadError(locationFile, e);
+      return false;
+    } finally {
+      closeQuietly(br);
+    }
+  }
+
+  /* Read address ranges from a GeoLiteCity blocks file and add them to
+   * the database, using the country codes in the given map.  Return
+   * false if the file cannot be read, contains a line that cannot be
+   * parsed, or references an unknown location. */
+  private boolean parseGeoLiteCityBlocks(File blocksFile,
+      File locationFile, Map<Integer, String> locations,
+      String databaseFileName, int databaseDate) {
+    BufferedReader br = null;
+    String line = null;
+    try {
+      br = new BufferedReader(new FileReader(blocksFile));
+      while ((line = br.readLine()) != null) {
+        if (!line.startsWith("\"")) {
+          /* Skip copyright notice and column headers. */
+          continue;
+        }
+        String[] parts = line.replace("\"", "").split(",");
+        if (parts.length < 3) {
+          printUnparseableLine(blocksFile, line);
+          return false;
+        }
+        long startAddress = Long.parseLong(parts[0]),
+            endAddress = Long.parseLong(parts[1]);
+        int location = Integer.parseInt(parts[2]);
+        String code = locations.get(location);
+        if (code == null) {
+          System.err.println(blocksFile.getAbsolutePath() + " contains "
+              + "line '" + line + "' that doesn't match any line in "
+              + locationFile.getAbsolutePath() + ". Aborting.");
+          return false;
+        }
+        this.addRange(databaseFileName, databaseDate, startAddress,
+            endAddress, code);
+      }
+      return true;
+    } catch (NumberFormatException e) {
+      printUnparseableLine(blocksFile, line);
+      return false;
+    } catch (IOException e) {
+      printReadError(blocksFile, e);
+      return false;
+    } finally {
+      closeQuietly(br);
+    }
+  }
+
+  /**
+   * Import all GeoIPASNum2 .csv files found in the given file or
+   * directory, including subdirectories.  Files are imported in reverse
+   * order of their paths.
+   *
+   * @param path Path to a .csv file or directory.
+   * @return True if all files were imported successfully, false
+   * otherwise.
+   */
+  public boolean importGeoIPASNum2FileOrDirectory(String path) {
+    boolean allImportsSuccessful = true;
+    List<File> allFiles = listCsvFilesRecursively(path);
+    Collections.sort(allFiles, Collections.reverseOrder());
+    for (File file : allFiles) {
+      if (!this.importGeoIPASNum2File(file)) {
+        allImportsSuccessful = false;
+      }
+    }
+    return allImportsSuccessful;
+  }
+
+  /* Import a single GeoIPASNum2 .csv file.  The database date is the
+   * file modification time, in days since the epoch.  Return false if
+   * the file cannot be read or contains a line that cannot be parsed. */
+  private boolean importGeoIPASNum2File(File file) {
+    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    long lastModifiedMillis = file.lastModified();
+    String databaseFileName = file.getName() + " "
+        + dateFormat.format(Long.valueOf(lastModifiedMillis));
+    int databaseDate = (int) (lastModifiedMillis / 86400000L);
+    boolean success = true;
+    BufferedReader br = null;
+    String line = null;
+    try {
+      br = new BufferedReader(new FileReader(file));
+      while ((line = br.readLine()) != null) {
+        if (line.length() == 0) {
+          /* Skip empty line. */
+          continue;
+        }
+        String[] parts = line.split(",");
+        if (parts.length < 3) {
+          printUnparseableLine(file, line);
+          success = false;
+          break;
+        }
+        long startAddress = Long.parseLong(parts[0]),
+            endAddress = Long.parseLong(parts[1]);
+        /* The code is the first word of the quoted AS description,
+         * e.g., AS9381 in "AS9381 Wharf T&T Ltd.". */
+        String description = parts[2].replace("\"", "");
+        int firstSpace = description.indexOf(' ');
+        String code = firstSpace < 0 ? description
+            : description.substring(0, firstSpace);
+        if (!code.startsWith("AS")) {
+          /* Don't import illegal range. */
+          continue;
+        }
+        this.addRange(databaseFileName, databaseDate, startAddress,
+            endAddress, code);
+      }
+    } catch (NumberFormatException e) {
+      printUnparseableLine(file, line);
+      success = false;
+    } catch (IOException e) {
+      printReadError(file, e);
+      success = false;
+    } finally {
+      closeQuietly(br);
+    }
+    /* Repair the tree even after a failure, because ranges may have
+     * been added before the failure occurred. */
+    this.repairTree();
+    return success;
+  }
+
+  /* Return all .csv files at the given path, descending into
+   * directories.  Other files are reported on stderr and skipped. */
+  private static List<File> listCsvFilesRecursively(String path) {
+    List<File> csvFiles = new ArrayList<File>();
+    Stack<File> stackedFiles = new Stack<File>();
+    stackedFiles.add(new File(path));
+    while (!stackedFiles.isEmpty()) {
+      File file = stackedFiles.pop();
+      if (file.isDirectory()) {
+        /* listFiles() returns null if the directory cannot be read. */
+        File[] children = file.listFiles();
+        if (children == null) {
+          System.err.println("Could not list contents of directory '"
+              + file.getAbsolutePath() + "'. Skipping.");
+        } else {
+          stackedFiles.addAll(Arrays.asList(children));
+        }
+      } else if (!file.getName().endsWith(".csv")) {
+        System.err.println("Parsing other files than .csv is not "
+            + "supported: '" + file.getAbsolutePath() + "'. Skipping.");
+      } else {
+        csvFiles.add(file);
+      }
+    }
+    return csvFiles;
+  }
+
+  /* Report a line that does not have the expected format. */
+  private static void printUnparseableLine(File file, String line) {
+    System.err.println(file.getAbsolutePath() + " contains line '"
+        + line + "' that cannot be parsed. Aborting.");
+  }
+
+  /* Report a file that could not be read. */
+  private static void printReadError(File file, IOException e) {
+    System.err.println("Could not read '" + file.getAbsolutePath()
+        + "': " + e.getMessage());
+  }
+
+  /* Close the given reader, if any.  A failure to close is ignored,
+   * because reading has either finished or failed already. */
+  private static void closeQuietly(BufferedReader br) {
+    if (br != null) {
+      try {
+        br.close();
+      } catch (IOException ignored) {
+        /* Nothing useful can be done here. */
+      }
+    }
+  }
/** * Internal counters for import statistics. @@ -127,15 +300,9 @@ public class DatabaseImporterImpl extends DatabaseImpl * is called prior to any lookupAddress() calls. No further checks are * performed that the tree is repaired before looking up an address. */ - void addRange(String databaseFileName, String code, - String startAddressString, long addresses) { - + void addRange(String databaseFileName, int databaseDate, + long startAddress, long endAddress, String code) { this.rangeImports++; - String databaseDateString = - databaseFileName.substring(databaseFileName.length() - 8); - int databaseDate = convertDateStringToNumber(databaseDateString); - long startAddress = convertAddressStringToNumber(startAddressString); - long endAddress = startAddress + addresses - 1L;
/* Add new database date and file name if we didn't know them yet, * and note that we need to repair the tree after importing. */ @@ -337,6 +504,9 @@ public class DatabaseImporterImpl extends DatabaseImpl * interface, because the caller needs to make sure that repairTree() * is called prior to any lookupAddress() calls. No further checks are * performed that the tree is repaired before look up an address. + * + * TODO While repairing the tree, we might also optimize it by merging + * adjacent address ranges with the same database date ranges. */ void repairTree() { if (this.addedDatabaseDate < 0) { @@ -429,7 +599,6 @@ public class DatabaseImporterImpl extends DatabaseImpl } return true; } -
/* Return a nicely formatted string summarizing database contents and * usage statistics. */ diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java index fb0f19d..236810b 100644 --- a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java +++ b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java @@ -12,7 +12,8 @@ public class DatabaseTest { @Test() public void testSingleIpRangeSingleDatebase() { DatabaseImporterImpl database = new DatabaseImporterImpl(); - database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); database.repairTree(); assertEquals(1, database.getNumberOfElements()); assertEquals(null, database.lookupIpv4AddressAndDate( @@ -60,8 +61,10 @@ public class DatabaseTest { @Test() public void testTwoAdjacentIpRangesSingleDatabase() { DatabaseImporterImpl database = new DatabaseImporterImpl(); - database.addRange("20120901", "us", "3.0.0.0", 16777216); - database.addRange("20120901", "ca", "4.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "ca", + "4.0.0.0", 16777216); database.repairTree(); assertEquals(2, database.getNumberOfElements()); assertEquals(null, database.lookupIpv4AddressAndDate( @@ -85,8 +88,10 @@ public class DatabaseTest { @Test() public void testTwoNonAdjacentIpDateRangesSingleDatabase() { DatabaseImporterImpl database = new DatabaseImporterImpl(); - database.addRange("20120901", "us", "3.0.0.0", 16777216); - database.addRange("20120901", "ca", "6.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "ca", + "6.0.0.0", 16777216); database.repairTree(); assertEquals(2, database.getNumberOfElements()); assertEquals(null, 
database.lookupIpv4AddressAndDate( @@ -104,8 +109,10 @@ public class DatabaseTest { @Test() public void testDuplicateImport() { DatabaseImporterImpl database = new DatabaseImporterImpl(); - database.addRange("20120901", "us", "3.0.0.0", 16777216); - database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); database.repairTree(); assertEquals(1, database.getNumberOfElements()); assertEquals(null, database.lookupIpv4AddressAndDate( @@ -119,8 +126,10 @@ public class DatabaseTest { @Test() public void testDuplicateImportDifferentCode() { DatabaseImporterImpl database = new DatabaseImporterImpl(); - database.addRange("20120901", "us", "3.0.0.0", 16777216); - database.addRange("20120901", "ca", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "ca", + "3.0.0.0", 16777216); database.repairTree(); assertEquals(1, database.getNumberOfElements()); assertEquals("us", database.lookupIpv4AddressAndDate( @@ -130,9 +139,11 @@ public class DatabaseTest { @Test() public void testLeaveIpChangeUnchanged() { DatabaseImporterImpl database = new DatabaseImporterImpl(); - database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); database.repairTree(); - database.addRange("20121001", "us", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20121001", "us", + "3.0.0.0", 16777216); database.repairTree(); assertEquals(1, database.getNumberOfElements()); assertEquals("us", database.lookupIpv4AddressAndDate( @@ -148,9 +159,11 @@ public class DatabaseTest { @Test() public void testLeaveIpChangeUnchangedReverseOrder() { DatabaseImporterImpl database = new DatabaseImporterImpl(); - 
database.addRange("20121001", "us", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20121001", "us", + "3.0.0.0", 16777216); database.repairTree(); - database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); database.repairTree(); assertEquals(1, database.getNumberOfElements()); assertEquals("us", database.lookupIpv4AddressAndDate( @@ -166,11 +179,14 @@ public class DatabaseTest { @Test() public void testMissingIpRange() { DatabaseImporterImpl database = new DatabaseImporterImpl(); - database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20120901", "us", + "3.0.0.0", 16777216); database.repairTree(); - database.addRange("20121101", "us", "3.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20121101", "us", + "3.0.0.0", 16777216); database.repairTree(); - database.addRange("20121001", "us", "6.0.0.0", 16777216); + database.addRegionalRegistryStatsFileRange("20121001", "us", + "6.0.0.0", 16777216); database.repairTree(); assertEquals(3, database.getNumberOfElements()); assertEquals("us", database.lookupIpv4AddressAndDate(