commit 7b2c08bb7724614964ad5f318cc7016f558e3849
Author: Karsten Loesing <karsten.loesing@gmx.net>
Date:   Tue Nov 1 16:09:22 2016 +0100
Use metrics-lib for parsing descriptors.
We're using metrics-lib for downloading descriptors from CollecTor, but we're still using our own parsing code. Let's avoid duplicating code by using what metrics-lib provides. --- build.xml | 2 + .../exonerator/ExoneraTorDatabaseImporter.java | 291 +++++---------------- 2 files changed, 66 insertions(+), 227 deletions(-)
diff --git a/build.xml b/build.xml index 42c0ee0..1b8fecd 100644 --- a/build.xml +++ b/build.xml @@ -24,6 +24,8 @@ <include name="logback-core-1.1.2.jar" /> <include name="logback-classic-1.1.2.jar" /> <include name="slf4j-api-1.7.7.jar"/> + <include name="commons-compress-1.9.jar"/> + <include name="xz-1.5.jar"/> </fileset> </path> <path id="checkstyle.classpath" > diff --git a/src/main/java/org/torproject/exonerator/ExoneraTorDatabaseImporter.java b/src/main/java/org/torproject/exonerator/ExoneraTorDatabaseImporter.java index 3777908..68bc8cc 100644 --- a/src/main/java/org/torproject/exonerator/ExoneraTorDatabaseImporter.java +++ b/src/main/java/org/torproject/exonerator/ExoneraTorDatabaseImporter.java @@ -3,37 +3,39 @@
package org.torproject.exonerator;
+import org.torproject.descriptor.Descriptor; import org.torproject.descriptor.DescriptorCollector; +import org.torproject.descriptor.DescriptorFile; +import org.torproject.descriptor.DescriptorReader; import org.torproject.descriptor.DescriptorSourceFactory; +import org.torproject.descriptor.ExitList; +import org.torproject.descriptor.ExitList.Entry; +import org.torproject.descriptor.NetworkStatusEntry; +import org.torproject.descriptor.RelayNetworkStatusConsensus;
-import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Hex;
-import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; -import java.io.ByteArrayOutputStream; import java.io.File; -import java.io.FileInputStream; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; -import java.io.StringReader; import java.sql.CallableStatement; import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; import java.sql.Timestamp; import java.sql.Types; -import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; -import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.Map; import java.util.Set; -import java.util.Stack; +import java.util.SortedMap; import java.util.TimeZone; +import java.util.TreeMap;
/* Import Tor descriptors into the ExoneraTor database. */ public class ExoneraTorDatabaseImporter { @@ -159,11 +161,11 @@ public class ExoneraTorDatabaseImporter {
/* Last and next parse histories containing paths of parsed files and * last modified times. */ - private static Map<String, Long> - lastImportHistory = new HashMap<String, Long>(); + private static SortedMap<String, Long> + lastImportHistory = new TreeMap<String, Long>();
- private static Map<String, Long> - nextImportHistory = new HashMap<String, Long>(); + private static SortedMap<String, Long> + nextImportHistory = new TreeMap<String, Long>();
/* Read stats/exonerator-import-history file from disk and remember * locally when files were last parsed. */ @@ -201,103 +203,26 @@ public class ExoneraTorDatabaseImporter {
/* Parse descriptors in the import directory and its subdirectories. */ private static void parseDescriptors() { - File file = new File(importDirString); - if (!file.exists()) { - System.out.println("File or directory " + importDirString + " does " - + "not exist. Exiting."); - return; - } - Stack<File> files = new Stack<File>(); - files.add(file); - while (!files.isEmpty()) { - file = files.pop(); - if (file.isDirectory()) { - for (File f : file.listFiles()) { - files.add(f); - } - } else { - parseFile(file); - } - } - } - - /* Import a file if it wasn't imported before, and add it to the import - * history for the next execution. */ - private static void parseFile(File file) { - long lastModified = file.lastModified(); - String filename = file.getName(); - nextImportHistory.put(filename, lastModified); - if (!lastImportHistory.containsKey(filename) - || lastImportHistory.get(filename) < lastModified) { - try { - FileInputStream fis = new FileInputStream(file); - BufferedInputStream bis = new BufferedInputStream(fis); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - int len; - byte[] bytes = new byte[1024]; - while ((len = bis.read(bytes, 0, 1024)) >= 0) { - baos.write(bytes, 0, len); - } - bis.close(); - byte[] allBytes = baos.toByteArray(); - splitFile(file, allBytes); - } catch (IOException e) { - System.out.println("Could not read '" + file + "' to memory. " - + "Skipping."); - nextImportHistory.remove(filename); - } - } - } - - /* Detect what descriptor type is contained in a file and split it to - * parse the single descriptors. 
*/ - private static void splitFile(File file, byte[] bytes) { - try { - String asciiString = new String(bytes, "US-ASCII"); - BufferedReader br = new BufferedReader(new StringReader( - asciiString)); - String line = br.readLine(); - while (line != null && line.startsWith("@")) { - line = br.readLine(); - } - if (line == null) { - return; - } - br.close(); - String startToken = null; - if (line.equals("network-status-version 3")) { - startToken = "network-status-version 3"; - } else if (line.startsWith("Downloaded ") - || line.startsWith("ExitNode ")) { - startToken = "ExitNode "; - } else { - System.out.println("Unknown descriptor type in file '" + file - + "'. Ignoring."); - return; - } - String splitToken = "\n" + startToken; - int length = bytes.length; - int start = asciiString.indexOf(startToken); - while (start < length) { - int end = asciiString.indexOf(splitToken, start); - if (end < 0) { - end = length; - } else { - end += 1; - } - byte[] descBytes = new byte[end - start]; - System.arraycopy(bytes, start, descBytes, 0, end - start); - if (startToken.equals("network-status-version 3")) { - parseConsensus(file, descBytes); - } else if (startToken.equals("ExitNode ")) { - parseExitList(file, descBytes); + DescriptorReader descriptorReader = + DescriptorSourceFactory.createDescriptorReader(); + descriptorReader.addDirectory(new File(importDirString)); + descriptorReader.setMaxDescriptorFilesInQueue(20); + descriptorReader.setExcludedFiles(lastImportHistory); + Iterator<DescriptorFile> descriptorFiles = + descriptorReader.readDescriptors(); + while (descriptorFiles.hasNext()) { + DescriptorFile descriptorFile = descriptorFiles.next(); + for (Descriptor descriptor : descriptorFile.getDescriptors()) { + if (descriptor instanceof RelayNetworkStatusConsensus) { + parseConsensus((RelayNetworkStatusConsensus) descriptor); + } else if (descriptor instanceof ExitList) { + parseExitList((ExitList) descriptor); } - start = end; } - } catch (IOException e) { - 
System.out.println("Could not parse descriptor '" + file + "'. " - + "Skipping."); } + nextImportHistory.putAll( + descriptorReader.getExcludedFiles()); + nextImportHistory.putAll(descriptorReader.getParsedFiles()); }
/* Date format to parse UTC timestamps. */ @@ -309,72 +234,20 @@ public class ExoneraTorDatabaseImporter { }
/* Parse a consensus. */ - private static void parseConsensus(File file, byte[] bytes) { - try { - BufferedReader br = new BufferedReader(new StringReader(new String( - bytes, "US-ASCII"))); - String line; - String fingerprint = null; - String descriptor = null; - Set<String> orAddresses = new HashSet<String>(); - long validAfterMillis = -1L; - StringBuilder rawStatusentryBuilder = null; - boolean isRunning = false; - while ((line = br.readLine()) != null) { - if (line.startsWith("vote-status ") - && !line.equals("vote-status consensus")) { - System.out.println("File '" + file + "' contains network " - + "status *votes*, not network status *consensuses*. " - + "Skipping."); - return; - } else if (line.startsWith("valid-after ")) { - String validAfterTime = line.substring("valid-after ".length()); - try { - validAfterMillis = parseFormat.parse(validAfterTime) - .getTime(); - } catch (ParseException e) { - System.out.println("Could not parse valid-after timestamp in " - + "'" + file + "'. Skipping."); - return; - } - } else if (line.startsWith("r ") - || line.equals("directory-footer")) { - if (isRunning) { - byte[] rawStatusentry = rawStatusentryBuilder.toString() - .getBytes(); - importStatusentry(validAfterMillis, fingerprint, descriptor, - orAddresses, rawStatusentry); - orAddresses = new HashSet<String>(); - } - if (line.equals("directory-footer")) { - return; - } - rawStatusentryBuilder = new StringBuilder(line + "\n"); - String[] parts = line.split(" "); - if (parts.length < 9) { - System.out.println("Could not parse r line '" + line - + "'. 
Skipping."); - return; - } - fingerprint = Hex.encodeHexString(Base64.decodeBase64(parts[2] - + "=")).toLowerCase(); - descriptor = Hex.encodeHexString(Base64.decodeBase64(parts[3] - + "=")).toLowerCase(); - orAddresses.add(parts[6]); - } else if (line.startsWith("a ")) { - rawStatusentryBuilder.append(line + "\n"); - orAddresses.add(line.substring("a ".length(), - line.lastIndexOf(":"))); - } else if (line.startsWith("s ") || line.equals("s")) { - rawStatusentryBuilder.append(line + "\n"); - isRunning = line.contains(" Running"); - } else if (rawStatusentryBuilder != null) { - rawStatusentryBuilder.append(line + "\n"); + private static void parseConsensus(RelayNetworkStatusConsensus consensus) { + for (NetworkStatusEntry entry : consensus.getStatusEntries().values()) { + if (entry.getFlags().contains("Running")) { + Set<String> orAddresses = new HashSet<String>(); + orAddresses.add(entry.getAddress()); + for (String orAddressAndPort : entry.getOrAddresses()) { + orAddresses.add(orAddressAndPort.substring(0, + orAddressAndPort.lastIndexOf(":"))); } + importStatusentry(consensus.getValidAfterMillis(), + entry.getFingerprint().toLowerCase(), + entry.getDescriptor().toLowerCase(), + orAddresses, entry.getStatusEntryBytes()); } - } catch (IOException e) { - System.out.println("Could not parse consensus. Skipping."); - return; } }
@@ -453,65 +326,29 @@ public class ExoneraTorDatabaseImporter { } }
+ private static final byte[] IGNORED_RAW_EXITLIST_ENTRY = new byte[0]; + /* Parse an exit list. */ - private static void parseExitList(File file, byte[] bytes) { - try { - BufferedReader br = new BufferedReader(new StringReader(new String( - bytes, "US-ASCII"))); - String fingerprint = null; - Set<String> exitAddressLines = new HashSet<String>(); - StringBuilder rawExitlistentryBuilder = new StringBuilder(); - while (true) { - String line = br.readLine(); - if ((line == null || line.startsWith("ExitNode ")) - && fingerprint != null) { - for (String exitAddressLine : exitAddressLines) { - String[] parts = exitAddressLine.split(" "); - String exitAddress = parts[1]; - /* TODO Extend the following code for IPv6 once the exit list - * format supports it. */ - String[] exitAddressParts = exitAddress.split("\\."); - byte[] exitAddress24Bytes = new byte[3]; - exitAddress24Bytes[0] = (byte) Integer.parseInt( - exitAddressParts[0]); - exitAddress24Bytes[1] = (byte) Integer.parseInt( - exitAddressParts[1]); - exitAddress24Bytes[2] = (byte) Integer.parseInt( - exitAddressParts[2]); - String exitAddress24 = Hex.encodeHexString( - exitAddress24Bytes); - String scannedTime = parts[2] + " " + parts[3]; - long scannedMillis = -1L; - try { - scannedMillis = parseFormat.parse(scannedTime).getTime(); - } catch (ParseException e) { - System.out.println("Could not parse timestamp in " - + "'" + file + "'. Skipping.");
- return; - } - byte[] rawExitlistentry = rawExitlistentryBuilder.toString() - .getBytes(); - importExitlistentry(fingerprint, exitAddress24, exitAddress, - scannedMillis, rawExitlistentry); - } - exitAddressLines.clear(); - rawExitlistentryBuilder = new StringBuilder(); - } - if (line == null) { - break; - } - rawExitlistentryBuilder.append(line + "\n"); - if (line.startsWith("ExitNode ")) { - fingerprint = line.substring("ExitNode ".length()) - .toLowerCase(); - } else if (line.startsWith("ExitAddress ")) { - exitAddressLines.add(line); - } + private static void parseExitList(ExitList exitList) { + for (Entry entry : exitList.getEntries()) { + for (Map.Entry<String, Long> e : entry.getExitAddresses().entrySet()) { + String exitAddress = e.getKey(); + /* TODO Extend the following code for IPv6 once the exit list + * format supports it. */ + String[] exitAddressParts = exitAddress.split("\\."); + byte[] exitAddress24Bytes = new byte[3]; + exitAddress24Bytes[0] = (byte) Integer.parseInt( + exitAddressParts[0]); + exitAddress24Bytes[1] = (byte) Integer.parseInt( + exitAddressParts[1]); + exitAddress24Bytes[2] = (byte) Integer.parseInt( + exitAddressParts[2]); + String exitAddress24 = Hex.encodeHexString( + exitAddress24Bytes); + long scannedMillis = e.getValue(); + importExitlistentry(entry.getFingerprint().toLowerCase(), exitAddress24, + exitAddress, scannedMillis, IGNORED_RAW_EXITLIST_ENTRY); } - br.close(); - } catch (IOException e) { - System.out.println("Could not parse exit list. Skipping."); - return; } }
tor-commits@lists.torproject.org