commit 1ad8f400c93d1178d83a91a3a33e107a652e985e Author: Karsten Loesing karsten.loesing@gmx.net Date: Wed Sep 12 18:49:43 2012 -0400
Implement a multi-GeoIP database in Java (#6471). --- task-6471/.gitignore | 2 + task-6471/README | 3 + task-6471/java/.gitignore | 5 + task-6471/java/build.xml | 42 ++ .../java/src/org/torproject/task6471/Database.java | 70 +++ .../src/org/torproject/task6471/DatabaseImpl.java | 652 ++++++++++++++++++++ .../task6471/DatabasePerformanceExample.java | 146 +++++ .../src/org/torproject/task6471/DatabaseTest.java | 141 +++++ 8 files changed, 1061 insertions(+), 0 deletions(-)
diff --git a/task-6471/.gitignore b/task-6471/.gitignore new file mode 100644 index 0000000..6dbccb1 --- /dev/null +++ b/task-6471/.gitignore @@ -0,0 +1,2 @@ +data/ + diff --git a/task-6471/README b/task-6471/README new file mode 100644 index 0000000..bb1a572 --- /dev/null +++ b/task-6471/README @@ -0,0 +1,3 @@ +Task 6471 -- Designing a file format and Python/Java library for multiple + GeoIP or AS databases + diff --git a/task-6471/java/.gitignore b/task-6471/java/.gitignore new file mode 100644 index 0000000..86b7b14 --- /dev/null +++ b/task-6471/java/.gitignore @@ -0,0 +1,5 @@ +.classpath +.project +lib/ +classes/ + diff --git a/task-6471/java/build.xml b/task-6471/java/build.xml new file mode 100644 index 0000000..9eb1223 --- /dev/null +++ b/task-6471/java/build.xml @@ -0,0 +1,42 @@ +<project default="test" name="task-6471" basedir="."> + <property name="sources" value="src"/> + <property name="classes" value="classes"/> + <property name="libs" value="lib"/> + <path id="classpath"> + <pathelement path="${classes}"/> + <fileset dir="${libs}"> + <include name="*.jar"/> + </fileset> + </path> + <target name="init"> + <mkdir dir="${classes}"/> + </target> + <target name="compile" + depends="init"> + <javac destdir="${classes}" + srcdir="${sources}" + source="1.5" + target="1.5" + debug="true" + deprecation="true" + optimize="false" + failonerror="true" + includeantruntime="false"> + <classpath refid="classpath"/> + </javac> + </target> + <target name="test" depends="compile"> + <junit fork="true" + haltonfailure="true" + maxmemory="1g" + printsummary="off"> + <classpath refid="classpath"/> + <formatter type="plain" usefile="false"/> + <batchtest> + <fileset dir="${classes}" + includes="**/*Test.class"/> + </batchtest> + </junit> + </target> +</project> + diff --git a/task-6471/java/src/org/torproject/task6471/Database.java b/task-6471/java/src/org/torproject/task6471/Database.java new file mode 100644 index 0000000..447b3c2 --- /dev/null +++ b/task-6471/java/src/org/torproject/task6471/Database.java @@ -0,0 +1,70 @@ +/* Copyright 2012 The Tor Project */ +package org.torproject.task6471; + +/** + * Database storing multiple GeoIP databases and supporting efficient + * ip-to-country lookups in the most recent of those databases for any + * given date. + * + * A typical query for this GeoIP database is: "to which country was IPv4 + * address 1.2.3.4 assigned on date 20120912?" This query is answered by + * looking at the entries from the most recent database published on or + * before 20120912. If the earliest known database was published after + * 20120912, the earliest known database is used to resolve the request. + */ +public interface Database { + + /** + * Import the contents of one or more IP address assignments files + * published by the Regional Internet Registries. The file or files + * are expected to conform to the RIR Statistics Exchange Format. + * Only IPv4 address ranges are imported, whereas ASN and IPv6 lines are + * ignored. Only the country code, start address, and address range + * length fields are imported. (TODO Extend to IPv6 and find similar + * data source for ASN.) + * + * A typical entry from a RIR file is: + * "ripencc|FR|ipv4|2.0.0.0|1048576|20100712|allocated". + * + * It is important to note that all five registry files (AfriNIC, APNIC, + * ARIN, LACNIC, and RIPE NCC) published on a given day should be + * imported, or the missing address ranges will be considered as + * unassigned from that day until the next database publication day. + * (TODO We could be smarter here by checking that less than five + * registry files have been imported for the same day, or something.) + * + * @param path Path to a stats file or directory. + * @return True if importing the file or directory was successful, + * false otherwise. + */ + public boolean importRegionalRegistryStatsFileOrDirectory(String path); + + /** + * Save the combined databases in a format that can later be loaded much + * more efficiently than importing the original RIR files again. + * + * @param path Path to the combined database file. + * @return True if saving the combined database file was successful, + * false otherwise. + */ + public boolean saveCombinedDatabases(String path); + + /** + * Load a combined databases file. + * + * @param path Path to the combined database file. + * @return True if loading the combined database file was successful, + * false otherwise. + */ + public boolean loadCombinedDatabases(String path); + + /** + * Query the database for an IPv4 address and assignment date. + * + * @param address IPv4 address in dotted-quad notation. + * @param date Assignment date in format yyyymmdd. + * @return Assigned country code, or null if no assignment could be + * found. + */ + public String lookupAddress(String address, String date); +} diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java b/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java new file mode 100644 index 0000000..43e5a95 --- /dev/null +++ b/task-6471/java/src/org/torproject/task6471/DatabaseImpl.java @@ -0,0 +1,652 @@ +/* Copyright 2012 The Tor Project */ +package org.torproject.task6471; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.Stack; +import java.util.TimeZone; +import java.util.TreeMap; +import java.util.TreeSet; + +/** + * Implementation of database holding multiple GeoIP databases with + * special focus on lookup performance, import performance, and memory + * consumption (in that order). + * + * This implementation uses a single tree to store IP address and date + * ranges. Each tree element is stored under a long integer consisting of + * the start IPv4 address in the higher bits and the first database + * publication date containing that range in the lower bits. The tree + * element itself contains the end IPv4 address, last database publication + * date, last database index number (see explanation further below), and + * country code. + * + * Lookups for a given address and random date only require iterating + * backwards over ranges with start address smaller than or equaling the + * requested address and can terminate as soon as a range with a smaller + * end address is encountered. + * + * As a requirement for lookups to work correctly, address ranges may + * never overlap for different assignment periods. Similarly, date + * assignment ranges for a given address range may not overlap. These + * requirements make the import process somewhat more complex and + * time-consuming, which is a tradeoff for faster lookup performance. + * + * The purpose of storing the last database index number is to fix ranges + * that are contained in two or more databases, but that are missing in a + * database that was published between the others but imported after them. + * The database index number defines that the range is only valid for + * databases imported until a given database, not necessarily for + * databases importer later on. A separate repair run is necessary to + * check whether later imported databases require splitting a range into + * two or more sub ranges to correctly reflect that the range was not + * contained in those databases. + */ +public class DatabaseImpl implements Database { + + /** + * Tree element containing an end IPv4 address, last database date, + * last database index, and country code. Start IPv4 address and first + * database date are encoded in the key under which the element is + * stored. + */ + private static class TreeElement { + private long endAddress; + private int lastDbDate; + private int lastKnownDbIndex; + private String countryCode; + TreeElement(long endAddress, int lastDbDate, int lastKnownDbIndex, + String countryCode) { + this.endAddress = endAddress; + this.lastDbDate = lastDbDate; + this.lastKnownDbIndex = lastKnownDbIndex; + this.countryCode = countryCode; + } + } + + /** + * IPv4 address and date ranges, ordered backwards by start address and + * first database date. + */ + private SortedMap<Long, TreeElement> ranges = + new TreeMap<Long, TreeElement>(Collections.reverseOrder()); + + /** + * Return number of contained ranges. + */ + int getNumberOfElements() { + return this.ranges.size(); + } + + /** + * Database dates ordered from oldest to youngest. + */ + private SortedSet<Integer> databaseDates = new TreeSet<Integer>(); + + /** + * Ordered list of database dates to find out their indexes. + */ + private List<Integer> databaseIndexes = new ArrayList<Integer>(); + + /** + * Parse one or more stats files. + */ + public boolean importRegionalRegistryStatsFileOrDirectory(String path) { + boolean allImportsSuccessful = true; + Stack<File> files = new Stack<File>(); + files.add(new File(path)); + while (!files.isEmpty()) { + File file = files.pop(); + if (file.isDirectory()) { + files.addAll(Arrays.asList(file.listFiles())); + } else if (file.getName().endsWith(".md5") || + file.getName().endsWith(".md5.gz") || + file.getName().endsWith(".asc") || + file.getName().endsWith(".asc.gz")) { + System.err.println("Signature and digest files are not supported " + + "yet: '" + file.getAbsolutePath() + "'. Skipping."); + /* TODO Implement checking signatures/digests. */ + } else if (file.getName().endsWith(".gz") || + file.getName().endsWith(".bz2")) { + System.err.println("Parsing compressed files is not supported " + + "yet: '" + file.getAbsolutePath() + "'. Skipping."); + /* TODO Implement parsing compressed files. */ + } else if (!this.importRegionalRegistryStatsFile(file)) { + allImportsSuccessful = false; + } + } + return allImportsSuccessful; + } + + /** + * Simple and not very robust implementation of an RIR stats file + * parser. + */ + private boolean importRegionalRegistryStatsFile(File file) { + try { + BufferedReader br = new BufferedReader(new FileReader(file)); + String line; + String databaseDateString = + file.getName().substring(file.getName().length() - 8); + while ((line = br.readLine()) != null) { + if (line.startsWith("#") || line.length() == 0) { + /* Skip comment line. */ + continue; + } + String[] parts = line.split("\|"); + if (parts[0].equals("2")) { + continue; + } + if (parts[1].equals("*")) { + /* Skip summary line. */ + continue; + } + String type = parts[2]; + if (type.equals("asn")) { + continue; + } else if (type.equals("ipv6")) { + continue; + } + String countryCode = parts[1].toLowerCase(); + String startAddressString = parts[3]; + long addresses = Long.parseLong(parts[4]); + this.addRange(databaseDateString, countryCode, startAddressString, + addresses); + } + br.close(); + this.repairIndex(); + } catch (IOException e) { + return false; + } + return true; + } + + /** + * Internal counters for import statistics. + */ + private int rangeImports = 0, rangeImportsKeyLookups = 0; + + /** + * Add a single address and date range to the database, which may + * require splitting up existing ranges. + * + * This method has default visibility and is not specified in the + * interface, because the caller needs to make sure that repairIndex() + * is called prior to any lookupAddress() calls. No further checks are + * performed that the tree is repaired before look up an address. + */ + void addRange(String databaseDateString, String countryCode, + String startAddressString, long addresses) { + + this.rangeImports++; + int databaseDate = convertDateStringToNumber(databaseDateString); + long startAddress = convertAddressStringToNumber(startAddressString); + long endAddress = startAddress + addresses - 1L; + + /* Add new database date if it's not yet contained. */ + if (!this.databaseDates.contains(databaseDate)) { + this.databaseDates.add(databaseDate); + this.databaseIndexes.add(databaseDate); + if (this.databaseIndexes.size() > 1) { + this.needToFixDatabase = true; + } + } + + /* We might have to split existing ranges or the new range before + * adding it to the database, and we might have to remove existing + * ranges. We shouldn't mess with the tree directly while iterating + * over it, so let's for now only calculate what changes we want to + * make. */ + SortedMap<Long, TreeElement> updateElements = + this.getUpdatesForAddingRange(databaseDate, countryCode, + startAddress, endAddress); + + /* Apply updates. Elements with non-null values are added, elements + * with null values are removed. */ + for (Map.Entry<Long, TreeElement> e : updateElements.entrySet()) { + if (e.getValue() == null) { + this.ranges.remove(e.getKey()); + } else { + this.ranges.put(e.getKey(), e.getValue()); + } + } + } + + /** + * Calculate necessary changes to the tree to add a range. + */ + private SortedMap<Long, TreeElement> getUpdatesForAddingRange( + int databaseDate, String countryCode, long startAddress, + long endAddress) { + + /* Keep updates in a single tree where non-null values will later be + * added, possibly replacing existing elements, and null values will + * be removed from the tree. */ + SortedMap<Long, TreeElement> updateElements = + new TreeMap<Long, TreeElement>(); + + /* Find out previous and next database, so that we can possibly merge + * ranges. */ + int previousDatabaseDate = + this.databaseDates.headSet(databaseDate).isEmpty() ? -1 : + this.databaseDates.headSet(databaseDate).last(); + int nextDatabaseDate = + this.databaseDates.tailSet(databaseDate + 1).isEmpty() ? -1 : + this.databaseDates.tailSet(databaseDate + 1).first(); + + /* Look up database index number of the range to be added. */ + int dbIndex = this.databaseIndexes.indexOf(databaseDate); + + /* Remember the address boundaries of the next (partial) range to be + * added. */ + long nextStartAddress = startAddress, nextEndAddress = endAddress; + int nextFirstDbDate = databaseDate, nextLastDbDate = databaseDate; + + /* Iterate backwards over the existing ranges, starting at the end + * address of the range to be added and at the last conceivable + * database publication date. */ + for (Map.Entry<Long, TreeElement> e : + this.ranges.tailMap(((endAddress + 1L) << 16) - 1L).entrySet()) { + this.rangeImportsKeyLookups++; + + /* Extract everything we need to know from the next existing range + * we're looking at. */ + long eStartAddress = e.getKey() >> 16; + long eEndAddress = e.getValue().endAddress; + int eFirstDbDate = (int) (e.getKey() & ((1L << 16) - 1L)); + int eLastDbDate = e.getValue().lastDbDate; + int eLastKnownDbIndex = e.getValue().lastKnownDbIndex; + String eCountryCode = e.getValue().countryCode; + + /* If the next (partial) range starts after the current element + * ends, add the new range. */ + if (nextStartAddress > eEndAddress && + nextEndAddress >= startAddress) { + updateElements.put((nextStartAddress << 16) + nextFirstDbDate, + new TreeElement(nextEndAddress, nextLastDbDate, dbIndex, + countryCode)); + nextEndAddress = nextStartAddress - 1L; + nextStartAddress = startAddress; + nextFirstDbDate = databaseDate; + nextLastDbDate = databaseDate; + } + + /* If the next (partial) range still ends after the current element + * ends, add the new range. */ + if (nextEndAddress > eEndAddress && + nextEndAddress >= startAddress) { + updateElements.put(((eEndAddress + 1L) << 16) + databaseDate, + new TreeElement(nextEndAddress, databaseDate, dbIndex, + countryCode)); + nextEndAddress = eEndAddress; + nextStartAddress = startAddress; + nextFirstDbDate = databaseDate; + nextLastDbDate = databaseDate; + } + + /* If the existing range ends before the new range starts, we don't + * have to look at any more existing ranges. */ + if (eEndAddress < startAddress) { + break; + } + + /* Cut off any address range parts of the existing element that are + * not contained in the currently added range. First check whether + * the existing range ends after the newly added range. In that + * case cut off the overlapping part and store it as a new + * element.*/ + if (eStartAddress <= endAddress && eEndAddress > endAddress) { + updateElements.put(((endAddress + 1L) << 16) + eFirstDbDate, + new TreeElement(eEndAddress, eLastDbDate, eLastKnownDbIndex, + eCountryCode)); + updateElements.put((eStartAddress << 16) + eFirstDbDate, + new TreeElement(endAddress, eLastDbDate, eLastKnownDbIndex, + eCountryCode)); + eEndAddress = endAddress; + } + + /* Similarly, check whether the existing range starts before the + * newly added one. If so, cut off the overlapping part and store + * it as new element. */ + if (eStartAddress < startAddress && eEndAddress >= startAddress) { + updateElements.put((eStartAddress << 16) + eFirstDbDate, + new TreeElement(startAddress - 1L, eLastDbDate, + eLastKnownDbIndex, eCountryCode)); + updateElements.put((startAddress << 16) + eFirstDbDate, + new TreeElement(eEndAddress, eLastDbDate, eLastKnownDbIndex, + eCountryCode)); + eStartAddress = startAddress; + } + + /* Now we're sure the existing element doesn't exceed the newly + * added element, address-wise. */ + nextStartAddress = eStartAddress; + nextEndAddress = eEndAddress; + + /* If the range is already contained, maybe even with different + * country code, ignore it. */ + if (eFirstDbDate <= databaseDate && eLastDbDate >= databaseDate) { + updateElements.clear(); + return updateElements; + } + + /* See if we can merge the new range with the previous or next + * range. If so, extend our database range and mark the existing + * element for deletion. */ + if (eCountryCode.equals(countryCode)) { + if (eLastDbDate == previousDatabaseDate) { + nextFirstDbDate = eFirstDbDate; + updateElements.put((eStartAddress << 16) + eFirstDbDate, null); + } else if (eFirstDbDate == nextDatabaseDate) { + nextLastDbDate = eLastDbDate; + updateElements.put((eStartAddress << 16) + eFirstDbDate, null); + } + } + } + + /* If there's still some part (or the whole?) address range left to + * add, add it now. */ + while (nextEndAddress >= startAddress) { + updateElements.put((nextStartAddress << 16) + nextFirstDbDate, + new TreeElement(nextEndAddress, nextLastDbDate, dbIndex, + countryCode)); + nextEndAddress = nextStartAddress - 1L; + nextStartAddress = startAddress; + nextFirstDbDate = databaseDate; + nextLastDbDate = databaseDate; + } + + /* Return the tree updates that will add the given range. */ + return updateElements; + } + + /* Do we have to repair the tree? */ + private boolean needToFixDatabase = false; + + /** + * Repair tree by making sure that any range from a given database date + * to another is still valid when considering any other database that + * was imported later. + */ + void repairIndex() { + if (!needToFixDatabase) { + return; + } + int maxDatabaseIndex = databaseIndexes.size() - 1; + if (maxDatabaseIndex < 1) { + return; + } + SortedMap<Long, TreeElement> updateElements = + new TreeMap<Long, TreeElement>(); + for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) { + if (e.getValue().lastKnownDbIndex < maxDatabaseIndex) { + int eFirstDbDate = (int) (e.getKey() & ((1L << 16) - 1L)); + int eLastDbDate = e.getValue().lastDbDate; + List<Integer> splitAtDates = new ArrayList<Integer>(); + for (int dbIndex = e.getValue().lastKnownDbIndex + 1; + dbIndex <= maxDatabaseIndex; dbIndex++) { + int dbDate = databaseIndexes.get(dbIndex); + if (eFirstDbDate < dbDate && eLastDbDate > dbDate) { + splitAtDates.add(dbDate); + } + } + if (splitAtDates.isEmpty()) { + e.getValue().lastKnownDbIndex = maxDatabaseIndex; + } else { + long eStartAddress = e.getKey() >> 16; + long eEndAddress = e.getValue().endAddress; + String eCountryCode = e.getValue().countryCode; + int start = eFirstDbDate, end = eFirstDbDate; + for (int cur : this.databaseDates.tailSet(eFirstDbDate)) { + if (cur > eLastDbDate) { + break; + } + if (splitAtDates.contains(cur)) { + if (start >= 0 && end >= 0) { + updateElements.put((eStartAddress << 16) + start, + new TreeElement(eEndAddress, end, + maxDatabaseIndex, eCountryCode)); + start = end = -1; + } + } else if (start < 0) { + start = end = cur; + } else { + end = cur; + } + } + if (start >= 0 && end >= 0) { + updateElements.put((eStartAddress << 16) + start, + new TreeElement(eEndAddress, end, + maxDatabaseIndex, eCountryCode)); + } + } + } + } + for (Map.Entry<Long, TreeElement> e : updateElements.entrySet()) { + this.ranges.put(e.getKey(), e.getValue()); + } + this.needToFixDatabase = false; + } + + /** + * Internal counters for lookup statistics. + */ + private int addressLookups = 0, addressLookupsKeyLookups = 0; + + /** + * Look up address and date by iterating backwards over possibly + * matching ranges. + */ + public String lookupAddress(String addressString, String dateString) { + this.addressLookups++; + + long address = convertAddressStringToNumber(addressString); + int date = convertDateStringToNumber(dateString); + + if (this.databaseDates.isEmpty()) { + return null; + } + + /* Look up which database we want. */ + int databaseDate = this.databaseDates.headSet(date + 1).isEmpty() ? + this.databaseDates.first() : + this.databaseDates.headSet(date + 1).last(); + + /* Iterate backwards over the existing ranges, starting at the last + * possible date of the address to be found. */ + for (Map.Entry<Long, TreeElement> e : + this.ranges.tailMap(((address + 1L) << 16) - 1L).entrySet()) { + this.addressLookupsKeyLookups++; + + /* If either the end address or end date of the range we're looking + * at is smaller than the values we're looking for, we can be sure + * not to find it anymore. */ + if (e.getValue().endAddress < address || + e.getValue().lastDbDate < databaseDate) { + return null; + } + + /* If the range starts at a later date, skip it and look at the next + * one. */ + long startDate = e.getKey() & ((1L << 16) - 1L); + if (startDate > databaseDate) { + continue; + } + + /* Both address and date ranges match, so return the assigned + * country code. */ + return e.getValue().countryCode; + } + + /* No ranges (left) to look at. We don't have what we were looking + * for. */ + return null; + } + + /* Helper: convert a dotted-quad formatted address string to its + * corresponding long integer number. */ + static long convertAddressStringToNumber(String addressString) { + long address = 0; + String[] addressParts = addressString.split("\."); + for (int i = 0; i < 4; i++) { + address += Long.parseLong(addressParts[i]) << ((3 - i) * 8); + } + return address; + } + + /* Helper: convert a long integer address number to its corresponding + * dotted-quad formatted string. */ + static String convertAddressNumberToString(long address) { + return "" + (address / 256 / 256 / 256) + "." + + ((address / 256 / 256) % 256) + "." + + ((address / 256) % 256) + "." + (address % 256); + } + + /* Helper: date format parser/formatter. */ + private static SimpleDateFormat dateFormat; + static { + dateFormat = new SimpleDateFormat("yyyyMMdd"); + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + } + + /* Helper: convert date string in format yyyymmdd to integer containing + * days passed since 1970-01-01. */ + static int convertDateStringToNumber(String dateString) + throws IllegalArgumentException { + try { + SimpleDateFormat dateFormat = new SimpleDateFormat("yyyyMMdd"); + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); + return (int) (dateFormat.parse(dateString).getTime() / 86400000); + } catch (ParseException e) { + throw new IllegalArgumentException(e); + } + } + + /* Helper: convert integer containing days passed since 1970-01-01 to + * date string in format yyyymmdd. */ + static String convertDateNumberToString(int date) { + return dateFormat.format(((long) date) * 86400000); + } + + /* Return a nicely formatted string summarizing database contents and + * usage statistics. */ + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(String.format("Database contains %d databases and %d " + + "combined address ranges.\n" + + "Performed %d address range imports requiring %d lookups.\n" + + "Performed %d address lookups requiring %d lookups.\n" + + "First 10 entries, in reverse order, are:", + this.databaseDates.size(), this.ranges.size(), rangeImports, + rangeImportsKeyLookups, addressLookups, + addressLookupsKeyLookups)); + int entries = 10; + for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) { + sb.append(String.format("%n %s %s %s %s %s %d", + convertAddressNumberToString(e.getKey() >> 16), + convertAddressNumberToString(e.getValue().endAddress), + e.getValue().countryCode, + convertDateNumberToString( + (int) (e.getKey() & ((1L << 16) - 1L))), + convertDateNumberToString(e.getValue().lastDbDate), + e.getValue().lastKnownDbIndex)); + if (--entries <= 0) { + break; + } + } + return sb.toString(); + } + + /** + * Save the combined databases to disk. + */ + public boolean saveCombinedDatabases(String path) { + try { + + /* Create parent directories if necessary. */ + File file = new File(path); + if (file.getParentFile() != null) { + file.getParentFile().mkdirs(); + } + + /* Start with writing all contained database dates to the file + * header. */ + BufferedWriter bw = new BufferedWriter(new FileWriter(file)); + for (int dbDate : this.databaseDates) { + bw.write("!" + convertDateNumberToString(dbDate) + "\n"); + } + + /* Next write all database ranges in the same order as they are + * currently contained in memory. The only information we can drop + * is the last known database index of each range, because we assume + * the tree is already in repaired state. */ + for (Map.Entry<Long, TreeElement> e : this.ranges.entrySet()) { + bw.write(String.format("%s,%s,%s,%s,%s%n", + convertAddressNumberToString(e.getKey() >> 16), + convertAddressNumberToString(e.getValue().endAddress), + e.getValue().countryCode, + convertDateNumberToString( + (int) (e.getKey() & ((1L << 16) - 1L))), + convertDateNumberToString(e.getValue().lastDbDate))); + } + bw.close(); + } catch (IOException e) { + return false; + } + return true; + } + + /** + * Load previously saved combined databases from disk. This code is not + * at all robust against external changes of the combined database file. + */ + public boolean loadCombinedDatabases(String path) { + try { + File file = new File(path); + BufferedReader br = new BufferedReader(new FileReader(file)); + String line; + int maxDbIndex = -1; + while ((line = br.readLine()) != null) { + if (line.startsWith("!")) { + + /* First read file header containing database dates. */ + int dbDate = convertDateStringToNumber(line.substring(1)); + this.databaseDates.add(dbDate); + this.databaseIndexes.add(dbDate); + maxDbIndex = this.databaseIndexes.size() - 1; + } else { + + /* Next read all ranges. Set last known database index for each + * range to the last database we read from the header, because + * the tree will immediately be in repaired state. */ + String[] parts = line.split(","); + long startAddress = convertAddressStringToNumber(parts[0]); + long endAddress = convertAddressStringToNumber(parts[1]); + String countryCode = parts[2]; + int firstDbDate = convertDateStringToNumber(parts[3]); + int lastDbDate = convertDateStringToNumber(parts[4]); + this.ranges.put((startAddress << 16) + firstDbDate, + new TreeElement(endAddress, lastDbDate, maxDbIndex, + countryCode)); + } + } + br.close(); + } catch (IOException e) { + return false; + } + return true; + } +} diff --git a/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java b/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java new file mode 100644 index 0000000..3b273bc --- /dev/null +++ b/task-6471/java/src/org/torproject/task6471/DatabasePerformanceExample.java @@ -0,0 +1,146 @@ +package org.torproject.task6471; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Random; +import java.util.SortedMap; +import java.util.SortedSet; +import java.util.Stack; +import java.util.TreeMap; +import java.util.TreeSet; + +public class DatabasePerformanceExample { + public static void main(String[] args) { + + System.out.print("Generating test cases... "); + long startMillis = System.currentTimeMillis(); + List<Long> tests = new ArrayList<Long>(); + SortedMap<Long, String> results = new TreeMap<Long, String>(); + Random rnd = new Random(1L); + int startDate = DatabaseImpl.convertDateStringToNumber("20071001"); + int endDate = DatabaseImpl.convertDateStringToNumber("20120930"); + /* Skipping Dec 1--3, 2009, because the first available database from + * December 2009 was published on the 4th, and generating test cases + * was just too confusing when taking that into account. */ + List<Integer> skipDates = new ArrayList<Integer>(); + skipDates.add(DatabaseImpl.convertDateStringToNumber("20091201")); + skipDates.add(DatabaseImpl.convertDateStringToNumber("20091202")); + skipDates.add(DatabaseImpl.convertDateStringToNumber("20091203")); + for (int i = 0; i < 100000; i++) { + long testAddress = rnd.nextLong() & ((1L << 32) - 1L); + int testDate = startDate + rnd.nextInt(endDate - startDate); + if (skipDates.contains(testDate)) { + i--; + } else { + tests.add((testAddress << 16) + testDate); + } + } + Stack<File> stackedFiles = new Stack<File>(); + stackedFiles.add(new File("../data")); + SortedSet<File> files = new TreeSet<File>(); + while (!stackedFiles.isEmpty()) { + File file = stackedFiles.pop(); + if (file.isDirectory()) { + stackedFiles.addAll(Arrays.asList(file.listFiles())); + } else if (!file.getName().endsWith(".md5") && + !file.getName().endsWith(".md5.gz") && + !file.getName().endsWith(".asc") && + !file.getName().endsWith(".asc.gz")) { + files.add(file); + } + } + for (File file : files) { + String dbMonth = file.getName().substring( + file.getName().length() - 8); + dbMonth = dbMonth.substring(0, 6); + Database temp = new DatabaseImpl(); + temp.importRegionalRegistryStatsFileOrDirectory( + file.getAbsolutePath()); + for (long test : tests) { + int testDate = (int) (test & ((1 << 16) - 1)); + String testMonth = DatabaseImpl.convertDateNumberToString( + testDate).substring(0, 6); + if (testMonth.equals(dbMonth)) { + String testAddressString = DatabaseImpl. + convertAddressNumberToString(test >> 16); + String testDateString = DatabaseImpl.convertDateNumberToString( + testDate); + String countryCode = temp.lookupAddress(testAddressString, + testDateString); + if (countryCode != null) { + results.put(test, countryCode); + } + } + } + } + long endMillis = System.currentTimeMillis(); + System.out.println((endMillis - startMillis) + " millis."); + + System.out.print("Importing files... "); + startMillis = endMillis; + Database database = new DatabaseImpl(); + database.importRegionalRegistryStatsFileOrDirectory("../data"); + endMillis = System.currentTimeMillis(); + System.out.println((endMillis - startMillis) + " millis."); + + System.out.print("Making test requests... "); + startMillis = endMillis; + int failures = 0; + for (long test : tests) { + String testAddress = DatabaseImpl.convertAddressNumberToString( + test >> 16); + String testDate = DatabaseImpl.convertDateNumberToString( + (int) (test & ((1 << 16) - 1))); + String expected = results.get(test); + String result = database.lookupAddress(testAddress, testDate); + if ((expected == null && result != null) || + (expected != null && !expected.equals(result))) { + //System.out.println("Expected " + expected + " for " + // + testAddress + " " + testDate + ", but got " + result); + failures++; + } + } + endMillis = System.currentTimeMillis(); + System.out.println((endMillis - startMillis) + " millis, " + failures + + " out of " + tests.size() + " tests failed."); + + System.out.println(database); + + System.out.print("Saving combined databases to disk... "); + startMillis = endMillis; + database.saveCombinedDatabases("geoip-2007-10-2012-09.csv"); + endMillis = System.currentTimeMillis(); + System.out.println((endMillis - startMillis) + " millis."); + startMillis = endMillis; + + System.out.print("Loading combined databases from disk... "); + startMillis = endMillis; + database = new DatabaseImpl(); + database.loadCombinedDatabases("geoip-2007-10-2012-09.csv"); + endMillis = System.currentTimeMillis(); + System.out.println((endMillis - startMillis) + " millis."); + + System.out.print("Making a second round of test requests... "); + startMillis = endMillis; + failures = 0; + for (long test : tests) { + String testAddress = DatabaseImpl.convertAddressNumberToString( + test >> 16); + String testDate = DatabaseImpl.convertDateNumberToString( + (int) (test & ((1 << 16) - 1))); + String expected = results.get(test); + String result = database.lookupAddress(testAddress, testDate); + if ((expected == null && result != null) || + (expected != null && !expected.equals(result))) { + //System.out.println("Expected " + expected + " for " + // + testAddress + " " + testDate + ", but got " + result); + failures++; + } + } + endMillis = System.currentTimeMillis(); + System.out.println((endMillis - startMillis) + " millis, " + failures + + " out of " + tests.size() + " tests failed."); + } +} diff --git a/task-6471/java/src/org/torproject/task6471/DatabaseTest.java b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java new file mode 100644 index 0000000..8f90b3d --- /dev/null +++ b/task-6471/java/src/org/torproject/task6471/DatabaseTest.java @@ -0,0 +1,141 @@ +/* Copyright 2012 The Tor Project */ +package org.torproject.task6471; + +import static org.junit.Assert.assertEquals; +import org.junit.Test; + +/** + * Test the multi-GeoIP database implementation. + */ +public class DatabaseTest { + + @Test() + public void testSingleIpRangeSingleDatebase() { + DatabaseImpl database = new DatabaseImpl(); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.repairIndex(); + assertEquals(1, ((DatabaseImpl) database).getNumberOfElements()); + assertEquals(null, database.lookupAddress("2.255.255.255", + "19920901")); + assertEquals(null, database.lookupAddress("2.255.255.255", + "20020901")); + assertEquals(null, database.lookupAddress("2.255.255.255", + "20120901")); + assertEquals(null, database.lookupAddress("2.255.255.255", + "20220901")); + assertEquals("us", database.lookupAddress("3.0.0.0", "19920901")); + assertEquals("us", database.lookupAddress("3.0.0.0", "20020901")); + assertEquals("us", database.lookupAddress("3.0.0.0", "20120901")); + assertEquals("us", database.lookupAddress("3.0.0.0", "20220901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "19920901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20020901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20220901")); + assertEquals("us", database.lookupAddress("3.255.255.255", + "19920901")); + assertEquals("us", database.lookupAddress("3.255.255.255", + "20020901")); + assertEquals("us", database.lookupAddress("3.255.255.255", + "20120901")); + assertEquals("us", database.lookupAddress("3.255.255.255", + "20220901")); + assertEquals(null, database.lookupAddress("4.0.0.0", "19920901")); + assertEquals(null, database.lookupAddress("4.0.0.0", "20020901")); + assertEquals(null, database.lookupAddress("4.0.0.0", "20120901")); + assertEquals(null, database.lookupAddress("4.0.0.0", "20220901")); + } + + @Test() + public void testTwoAdjacentIpRangesSingleDatabase() { + DatabaseImpl database = new DatabaseImpl(); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRange("20120901", "ca", "4.0.0.0", 16777216); + database.repairIndex(); + assertEquals(2, ((DatabaseImpl) database).getNumberOfElements()); + assertEquals(null, database.lookupAddress("2.255.255.255", + "20120901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901")); + assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901")); + assertEquals("ca", database.lookupAddress("4.127.0.0", "20120901")); + assertEquals(null, database.lookupAddress("5.0.0.0", "20120901")); + } + + @Test() + public void testTwoNonAdjacentIpDateRangesSingleDatabase() { + DatabaseImpl database = new DatabaseImpl(); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRange("20120901", "ca", "6.0.0.0", 16777216); + database.repairIndex(); + assertEquals(2, ((DatabaseImpl) database).getNumberOfElements()); + assertEquals(null, database.lookupAddress("2.255.255.255", "20120901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals(null, database.lookupAddress("4.255.255.255", "20120901")); + assertEquals("ca", database.lookupAddress("6.127.0.0", "20120901")); + assertEquals(null, database.lookupAddress("7.0.0.0", "20120901")); + } + + @Test() + public void testDuplicateImport() { + DatabaseImpl database = new DatabaseImpl(); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.repairIndex(); + assertEquals(1, ((DatabaseImpl) database).getNumberOfElements()); + assertEquals(null, database.lookupAddress("2.255.255.255", "20120901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals(null, database.lookupAddress("4.0.0.0", "20120901")); + } + + @Test() + public void testDuplicateImportDifferentCountryCode() { + DatabaseImpl database = new DatabaseImpl(); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRange("20120901", "ca", "3.0.0.0", 16777216); + database.repairIndex(); + assertEquals(1, ((DatabaseImpl) database).getNumberOfElements()); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + } + + @Test() + public void testLeaveIpChangeUnchanged() { + DatabaseImpl database = new DatabaseImpl(); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRange("20121001", "us", "3.0.0.0", 16777216); + database.repairIndex(); + assertEquals(1, ((DatabaseImpl) database).getNumberOfElements()); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120801")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20121001")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20121101")); + } + + @Test() + public void testLeaveIpChangeUnchangedReverseOrder() { + DatabaseImpl database = new DatabaseImpl(); + database.addRange("20121001", "us", "3.0.0.0", 16777216); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.repairIndex(); + assertEquals(1, ((DatabaseImpl) database).getNumberOfElements()); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120801")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20121001")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20121101")); + } + + @Test() + public void testMissingIpRange() { + DatabaseImpl database = new DatabaseImpl(); + database.addRange("20120901", "us", "3.0.0.0", 16777216); + database.addRange("20121101", "us", "3.0.0.0", 16777216); + database.addRange("20121001", "us", "6.0.0.0", 16777216); + database.repairIndex(); + assertEquals(3, ((DatabaseImpl) database).getNumberOfElements()); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120801")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20120901")); + assertEquals(null, database.lookupAddress("3.127.0.0", "20121001")); + assertEquals("us", database.lookupAddress("3.127.0.0", "20121101")); + } +}