commit 0044dfc04efc3553de8c5bd3375692a0444ff2b4 Author: Karsten Loesing karsten.loesing@gmx.net Date: Tue Aug 28 09:40:12 2018 +0200
Reduce database size and variance of query response times.
The changes in this commit are mainly driven by the following database changes:
- Fingerprints are no longer stored repeatedly as strings in the statusentry and exitlistentry tables but instead moved to their own fingerprint table and referenced by identifier. This is a step towards normalization that ideally reduces database size a lot by itself. - The statusentry table no longer contains the raw status entry but instead only those parts that are required for searching and displaying results. This includes nickname and exit information, where the former is not stored as string but as reference into its own nickname table. - There is a new table specifically for lookups which returns all relevant fingerprint identifiers for a date and an IP address prefix. The new search function based on this new table performs as little table joining as necessary and leaves some of the filtering and joining to the servlet.
More details can be found in the exonerator2.sql file, which also serves as migration script from the existing schema to the new one.
Implements #27356. --- .../exonerator/ExoneraTorDatabaseImporter.java | 93 ++-- .../metrics/exonerator/QueryResponse.java | 7 +- .../metrics/exonerator/QueryServlet.java | 242 ++++++---- src/main/sql/exonerator.sql | 10 + src/main/sql/exonerator2.sql | 508 +++++++++++++++++++++ .../metrics/exonerator/QueryResponseTest.java | 9 +- 6 files changed, 723 insertions(+), 146 deletions(-)
diff --git a/src/main/java/org/torproject/metrics/exonerator/ExoneraTorDatabaseImporter.java b/src/main/java/org/torproject/metrics/exonerator/ExoneraTorDatabaseImporter.java index 00a3d89..3ed7c19 100644 --- a/src/main/java/org/torproject/metrics/exonerator/ExoneraTorDatabaseImporter.java +++ b/src/main/java/org/torproject/metrics/exonerator/ExoneraTorDatabaseImporter.java @@ -12,6 +12,8 @@ import org.torproject.descriptor.ExitList.Entry; import org.torproject.descriptor.NetworkStatusEntry; import org.torproject.descriptor.RelayNetworkStatusConsensus;
+import org.apache.commons.codec.DecoderException; +import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Hex;
import org.slf4j.Logger; @@ -28,8 +30,6 @@ import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; import java.sql.Timestamp; -import java.sql.Types; -import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.HashSet; import java.util.Map; @@ -114,9 +114,9 @@ public class ExoneraTorDatabaseImporter { private static void prepareDatabaseStatements() { try { insertStatusentryStatement = connection.prepareCall( - "{call insert_statusentry(?, ?, ?, ?, ?, ?, ?)}"); + "{call insert_statusentry_oraddress(?, ?, ?, ?, ?, ?)}"); insertExitlistentryStatement = connection.prepareCall( - "{call insert_exitlistentry(?, ?, ?, ?, ?)}"); + "{call insert_exitlistentry_exitaddress(?, ?, ?, ?)}"); } catch (SQLException e) { logger.warn("Could not prepare callable statements to " + "import data into the database. Exiting.", e); @@ -221,28 +221,35 @@ public class ExoneraTorDatabaseImporter { nextImportHistory.putAll(descriptorReader.getParsedFiles()); }
- /* Date format to parse UTC timestamps. */ - private static SimpleDateFormat parseFormat; - - static { - parseFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - parseFormat.setTimeZone(TimeZone.getTimeZone("UTC")); - } - /* Parse a consensus. */ private static void parseConsensus(RelayNetworkStatusConsensus consensus) { + long validAfterMillis = consensus.getValidAfterMillis(); for (NetworkStatusEntry entry : consensus.getStatusEntries().values()) { if (entry.getFlags().contains("Running")) { + String fingerprintBase64 = null; + try { + fingerprintBase64 = Base64.encodeBase64String( + Hex.decodeHex(entry.getFingerprint().toCharArray())) + .replaceAll("=", ""); + } catch (DecoderException e) { + logger.warn("Unable to decode hex fingerprint {} to convert it back " + + "to base64. Aborting import.", entry.getFingerprint(), e); + System.exit(1); + } + final String nickname = entry.getNickname(); + Boolean exit = null; + if (null != entry.getDefaultPolicy() && null != entry.getPortList()) { + exit = "accept".equals(entry.getDefaultPolicy()) + || !"1-65535".equals(entry.getPortList()); + } Set<String> orAddresses = new HashSet<>(); orAddresses.add(entry.getAddress()); for (String orAddressAndPort : entry.getOrAddresses()) { orAddresses.add(orAddressAndPort.substring(0, orAddressAndPort.lastIndexOf(":"))); } - importStatusentry(consensus.getValidAfterMillis(), - entry.getFingerprint().toLowerCase(), - entry.getDescriptor().toLowerCase(), - orAddresses, entry.getStatusEntryBytes()); + importStatusentry(validAfterMillis, fingerprintBase64, nickname, + exit, orAddresses); } } } @@ -254,16 +261,16 @@ public class ExoneraTorDatabaseImporter { /* Import a status entry with one or more OR addresses into the * database. */ private static void importStatusentry(long validAfterMillis, - String fingerprint, String descriptor, Set<String> orAddresses, - byte[] rawStatusentry) { + String fingerprintBase64, String nickname, Boolean exit, + Set<String> orAddresses) { try { for (String orAddress : orAddresses) { insertStatusentryStatement.clearParameters(); insertStatusentryStatement.setTimestamp(1, new Timestamp(validAfterMillis), calendarUTC); - insertStatusentryStatement.setString(2, fingerprint); - insertStatusentryStatement.setString(3, descriptor); + insertStatusentryStatement.setString(2, fingerprintBase64); if (!orAddress.contains(":")) { + insertStatusentryStatement.setString(3, orAddress); String[] addressParts = orAddress.split("\."); byte[] address24Bytes = new byte[3]; address24Bytes[0] = (byte) Integer.parseInt(addressParts[0]); @@ -271,8 +278,6 @@ public class ExoneraTorDatabaseImporter { address24Bytes[2] = (byte) Integer.parseInt(addressParts[2]); String orAddress24 = Hex.encodeHexString(address24Bytes); insertStatusentryStatement.setString(4, orAddress24); - insertStatusentryStatement.setNull(5, Types.VARCHAR); - insertStatusentryStatement.setString(6, orAddress); } else { StringBuilder addressHex = new StringBuilder(); int start = orAddress.startsWith("[::") ? 2 : 1; @@ -289,7 +294,7 @@ public class ExoneraTorDatabaseImporter { break; } } - String orAddress48 = null; + String orAddress24 = null; if (addressHex != null) { String addressHexString = addressHex.toString(); addressHexString = addressHexString.replaceFirst("x", @@ -297,22 +302,22 @@ public class ExoneraTorDatabaseImporter { + "s", "0")); if (!addressHexString.contains("x") && addressHexString.length() == 32) { - orAddress48 = addressHexString.replaceAll(" ", "0") - .toLowerCase().substring(0, 12); + orAddress24 = addressHexString.replaceAll(" ", "0") + .toLowerCase().substring(0, 6); } } - if (orAddress48 != null) { - insertStatusentryStatement.setNull(4, Types.VARCHAR); - insertStatusentryStatement.setString(5, orAddress48); - insertStatusentryStatement.setString(6, + if (orAddress24 != null) { + insertStatusentryStatement.setString(3, orAddress.replaceAll("[\[\]]", "")); + insertStatusentryStatement.setString(4, orAddress24); } else { logger.error("Could not import status entry with IPv6 " + "address '{}'. Exiting.", orAddress); System.exit(1); } } - insertStatusentryStatement.setBytes(7, rawStatusentry); + insertStatusentryStatement.setString(5, nickname); + insertStatusentryStatement.setBoolean(6, exit); insertStatusentryStatement.execute(); } } catch (SQLException e) { @@ -321,12 +326,20 @@ public class ExoneraTorDatabaseImporter { } }
- private static final byte[] IGNORED_RAW_EXITLIST_ENTRY = new byte[0]; - /* Parse an exit list. */ private static void parseExitList(ExitList exitList) { for (Entry entry : exitList.getEntries()) { for (Map.Entry<String, Long> e : entry.getExitAddresses().entrySet()) { + String fingerprintBase64 = null; + try { + fingerprintBase64 = Base64.encodeBase64String( + Hex.decodeHex(entry.getFingerprint().toCharArray())) + .replaceAll("=", ""); + } catch (DecoderException ex) { + logger.warn("Unable to decode hex fingerprint {} to convert to " + + "base64. Aborting import.", entry.getFingerprint(), ex); + System.exit(1); + } String exitAddress = e.getKey(); /* TODO Extend the following code for IPv6 once the exit list * format supports it. */ @@ -341,24 +354,22 @@ public class ExoneraTorDatabaseImporter { String exitAddress24 = Hex.encodeHexString( exitAddress24Bytes); long scannedMillis = e.getValue(); - importExitlistentry(entry.getFingerprint().toLowerCase(), exitAddress24, - exitAddress, scannedMillis, IGNORED_RAW_EXITLIST_ENTRY); + importExitlistentry(fingerprintBase64, exitAddress24, exitAddress, + scannedMillis); } } }
/* Import an exit list entry into the database. */ - private static void importExitlistentry(String fingerprint, - String exitAddress24, String exitAddress, long scannedMillis, - byte[] rawExitlistentry) { + private static void importExitlistentry(String fingerprintBase64, + String exitAddress24, String exitAddress, long scannedMillis) { try { insertExitlistentryStatement.clearParameters(); - insertExitlistentryStatement.setString(1, fingerprint); - insertExitlistentryStatement.setString(2, exitAddress24); - insertExitlistentryStatement.setString(3, exitAddress); + insertExitlistentryStatement.setString(1, fingerprintBase64); + insertExitlistentryStatement.setString(2, exitAddress); + insertExitlistentryStatement.setString(3, exitAddress24); insertExitlistentryStatement.setTimestamp(4, new Timestamp(scannedMillis), calendarUTC); - insertExitlistentryStatement.setBytes(5, rawExitlistentry); insertExitlistentryStatement.execute(); } catch (SQLException e) { logger.error("Could not import exit list entry. Exiting.", e); diff --git a/src/main/java/org/torproject/metrics/exonerator/QueryResponse.java b/src/main/java/org/torproject/metrics/exonerator/QueryResponse.java index e920ecf..6a8976a 100644 --- a/src/main/java/org/torproject/metrics/exonerator/QueryResponse.java +++ b/src/main/java/org/torproject/metrics/exonerator/QueryResponse.java @@ -13,6 +13,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.Reader; +import java.util.SortedSet;
/** Query response from the ExoneraTor database. */ public class QueryResponse { @@ -119,7 +120,7 @@ public class QueryResponse { String timestamp;
/** All known IP addresses of the relay at the time. */ - String[] addresses; + SortedSet<String> addresses;
/** Relay fingerprint. */ String fingerprint; @@ -131,11 +132,11 @@ public class QueryResponse { * unknown. */ Boolean exit;
- /** Constructor for Gson. */ + /** Constructor for JSON library. */ public Match() {}
/** Constructor for tests. */ - Match(String timestamp, String[] addresses, String fingerprint, + Match(String timestamp, SortedSet<String> addresses, String fingerprint, String nickname, Boolean exit) { this.timestamp = timestamp; this.addresses = addresses; diff --git a/src/main/java/org/torproject/metrics/exonerator/QueryServlet.java b/src/main/java/org/torproject/metrics/exonerator/QueryServlet.java index d597b4e..a807ad0 100644 --- a/src/main/java/org/torproject/metrics/exonerator/QueryServlet.java +++ b/src/main/java/org/torproject/metrics/exonerator/QueryServlet.java @@ -3,6 +3,7 @@
package org.torproject.metrics.exonerator;
+import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Hex;
import org.slf4j.Logger; @@ -13,16 +14,20 @@ import java.sql.CallableStatement; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; -import java.sql.Timestamp; import java.text.ParseException; import java.text.SimpleDateFormat; import java.time.ZoneOffset; import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.Calendar; +import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; import java.util.SortedSet; import java.util.TimeZone; +import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern;
@@ -248,8 +253,7 @@ public class QueryServlet extends HttpServlet { if (addressHex == null) { return null; } - String address24Or48Hex = !relayIp.contains(":") - ? addressHex.substring(0, 6) : addressHex.substring(0, 12); + String address24Hex = addressHex.substring(0, 6);
/* Prepare formatting response items. */ SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); @@ -258,70 +262,88 @@ public class QueryServlet extends HttpServlet { "yyyy-MM-dd HH:mm:ss"); validAfterTimeFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
- /* Make the database query. */ - SortedSet<Long> allValidAfters = new TreeSet<>(); - List<QueryResponse.Match> matches = new ArrayList<>(); - SortedSet<String> allAddresses = new TreeSet<>(); + /* Store all dates contained in the query response in order to populate the + * {first|last}_date_in_database and relevant_statuses fields. */ + SortedSet<Long> allDates = new TreeSet<>(); + + /* Store all possible matches for the results table by base64-encoded + * fingerprint and valid-after time. This map is first populated by going + * through the result set and adding or updating map entries, so that + * there's one entry per fingerprint and valid-after time with one or more + * addresses. In a second step, exit addresses are added to map entries. */ + SortedMap<String, SortedMap<Long, QueryResponse.Match>> + matchesByFingerprintBase64AndValidAfter = new TreeMap<>(); + + /* Store all possible matches by address. This map has two purposes: First, + * the query returns all entries matching the first 24 bits of an address, + * which may include other addresses than the one being looked for. This map + * then has only those matches that are relevant. Second, if there are no + * matches for the given address, this map may contain nearby addresses in + * the same /24 or /48 that can be included in the nearby_addresses + * field. */ + SortedMap<String, Set<QueryResponse.Match>> + matchesByAddress = new TreeMap<>(); + + /* Store all exit addresses by base64-encoded fingerprint and scanned + * time. These addresses are added to this map while going through the + * result set and later added to the two maps above containing matches. The + * reason for separating these steps is that the result set may contain + * status entries and exit list entries in any specific order. */ + SortedMap<String, SortedMap<Long, String>> + exitAddressesByFingeprintBase64AndScanned = new TreeMap<>(); + + /* Make the database query to populate the sets and maps above. */ final long requestedConnection = System.currentTimeMillis(); Calendar utcCalendar = Calendar.getInstance(TimeZone.getTimeZone("UTC")); try (Connection conn = this.ds.getConnection()) { - try (CallableStatement cs = conn.prepareCall(String.format( - "{call search_by_address%s_date(?, ?)}", - relayIp.contains(":") ? 48 : 24))) { - cs.setString(1, address24Or48Hex); - cs.setDate(2, new java.sql.Date(timestamp), utcCalendar); + try (CallableStatement cs = conn.prepareCall( + "{call search_by_date_address24(?, ?)}")) { + cs.setDate(1, new java.sql.Date(timestamp), utcCalendar); + cs.setString(2, address24Hex); try (ResultSet rs = cs.executeQuery()) { while (rs.next()) { - Timestamp ts = rs.getTimestamp(2, utcCalendar); - if (null == ts) { - continue; - } - long validafter = ts.getTime(); - allValidAfters.add(validafter); - byte[] rawstatusentry = rs.getBytes(1); - if (null == rawstatusentry) { - continue; - } - SortedSet<String> addresses = new TreeSet<>(); - SortedSet<String> addressesHex = new TreeSet<>(); - String nickname = null; - Boolean exit = null; - for (String line : new String(rawstatusentry).split("\n")) { - if (line.startsWith("r ")) { - String[] parts = line.split(" "); - nickname = parts[1]; - addresses.add(parts[6]); - addressesHex.add(this.convertIpV4ToHex(parts[6])); - } else if (line.startsWith("a ")) { - String address = line.substring("a ".length(), - line.lastIndexOf(":")); - addresses.add(address); - String orAddressHex = !address.contains(":") - ? this.convertIpV4ToHex(address) - : this.convertIpV6ToHex(address); - addressesHex.add(orAddressHex); - } else if (line.startsWith("p ")) { - exit = !line.equals("p reject 1-65535"); + java.sql.Date date = rs.getDate(1, utcCalendar); + String fingerprintBase64 = rs.getString(2); + java.sql.Timestamp scanned = rs.getTimestamp(3, utcCalendar); + String exitAddress = rs.getString(4); + java.sql.Timestamp validAfter = rs.getTimestamp(5, utcCalendar); + String nickname = rs.getString(6); + Boolean exit = rs.getBoolean(7); + String orAddress = rs.getString(8); + if (null != date) { + allDates.add(date.getTime()); + } else if (null != scanned) { + long scannedMillis = scanned.getTime(); + exitAddressesByFingeprintBase64AndScanned.putIfAbsent( + fingerprintBase64, new TreeMap<>()); + exitAddressesByFingeprintBase64AndScanned.get(fingerprintBase64) + .put(scannedMillis, exitAddress); + } else if (null != validAfter) { + long validAfterMillis = validAfter.getTime(); + matchesByFingerprintBase64AndValidAfter.putIfAbsent( + fingerprintBase64, new TreeMap<>()); + if (!matchesByFingerprintBase64AndValidAfter + .get(fingerprintBase64).containsKey(validAfterMillis)) { + String validAfterString = validAfterTimeFormat.format( + validAfterMillis); + String fingerprint = Hex.encodeHexString(Base64.decodeBase64( + fingerprintBase64 + "=")).toUpperCase(); + matchesByFingerprintBase64AndValidAfter.get(fingerprintBase64) + .put(validAfterMillis, new QueryResponse.Match( + validAfterString, new TreeSet<>(), fingerprint, + nickname, exit)); } + QueryResponse.Match match + = matchesByFingerprintBase64AndValidAfter + .get(fingerprintBase64).get(validAfterMillis); + if (orAddress.contains(":")) { + match.addresses.add("[" + orAddress + "]"); + } else { + match.addresses.add(orAddress); + } + matchesByAddress.putIfAbsent(orAddress, new HashSet<>()); + matchesByAddress.get(orAddress).add(match); } - String exitaddress = rs.getString(4); - if (exitaddress != null && exitaddress.length() > 0) { - addresses.add(exitaddress); - addressesHex.add(this.convertIpV4ToHex(exitaddress)); - } - allAddresses.addAll(addresses); - if (!addressesHex.contains(addressHex)) { - continue; - } - String validAfterString = validAfterTimeFormat.format(validafter); - String fingerprint = rs.getString(3).toUpperCase(); - QueryResponse.Match match = new QueryResponse.Match(); - match.timestamp = validAfterString; - match.addresses = addresses.toArray(new String[0]); - match.fingerprint = fingerprint; - match.nickname = nickname; - match.exit = exit; - matches.add(match); } } catch (SQLException e) { this.logger.warn("Result set error. Returning 'null'.", e); @@ -338,49 +360,73 @@ public class QueryServlet extends HttpServlet { return null; }
- QueryResponse response = new QueryResponse(); - response.queryAddress = relayIp; - response.queryDate = dateFormat.format(timestamp); - if (!allValidAfters.isEmpty()) { - response.firstDateInDatabase = dateFormat.format(allValidAfters.first()); - response.lastDateInDatabase = dateFormat.format(allValidAfters.last()); - response.relevantStatuses = false; - long timestampFrom = timestamp - MILLISECONDS_IN_A_DAY; - long timestampTo = timestamp + 2 * MILLISECONDS_IN_A_DAY - 1L; - for (long validAfter : allValidAfters) { - if (validAfter >= timestampFrom && validAfter <= timestampTo) { - response.relevantStatuses = true; - break; + /* Go through exit addresses and update possible matches. */ + for (Map.Entry<String, SortedMap<Long, String>> e + : exitAddressesByFingeprintBase64AndScanned.entrySet()) { + String fingerprintBase64 = e.getKey(); + if (!matchesByFingerprintBase64AndValidAfter.containsKey( + fingerprintBase64)) { + /* This is a rare edge case where an exit list entry exists, but where + * that relay was not included in any consensus with a valid-after time + * of up to 24 hours after the scan time. This match is not supposed to + * show up in the results, nor should the exit address show up in + * nearby matches. We'll just skip it. */ + continue; + } + for (Map.Entry<Long, String> e1 : e.getValue().entrySet()) { + long scannedMillis = e1.getKey(); + String exitAddress = e1.getValue(); + for (QueryResponse.Match match + : matchesByFingerprintBase64AndValidAfter.get(fingerprintBase64) + .subMap(scannedMillis, scannedMillis + MILLISECONDS_IN_A_DAY) + .values()) { + match.addresses.add(exitAddress); + matchesByAddress.putIfAbsent(exitAddress, new HashSet<>()); + matchesByAddress.get(exitAddress).add(match); } } - if (!matches.isEmpty()) { - matches.sort((m1, m2) -> { - if (m1 == m2) { - return 0; - } else if (!m1.timestamp.equals(m2.timestamp)) { - return m1.timestamp.compareTo(m2.timestamp); - } else { - return m1.fingerprint.compareTo(m2.fingerprint); - } - }); - response.matches = matches.toArray(new QueryResponse.Match[0]); - } else { - List<String> nearbyAddresses = new ArrayList<>(); - for (String nearbyAddress : allAddresses) { - String nearbyAddressHex = !nearbyAddress.contains(":") - ? this.convertIpV4ToHex(nearbyAddress) - : this.convertIpV6ToHex(nearbyAddress); - String nearbyAddress24Or48Hex = !nearbyAddress.contains(":") - ? nearbyAddressHex.substring(0, 6) - : nearbyAddressHex.substring(0, 12); - if (address24Or48Hex.equals(nearbyAddress24Or48Hex)) { - nearbyAddresses.add(nearbyAddress); - } + } + + /* Write all results to a new QueryResponse object. */ + final QueryResponse response = new QueryResponse(); + response.queryAddress = relayIp; + response.queryDate = dateFormat.format(timestamp); + if (!allDates.isEmpty()) { + response.firstDateInDatabase = dateFormat.format(allDates.first()); + response.lastDateInDatabase = dateFormat.format(allDates.last()); + response.relevantStatuses = allDates.contains(timestamp) + || allDates.contains(timestamp - MILLISECONDS_IN_A_DAY) + || allDates.contains(timestamp + MILLISECONDS_IN_A_DAY); + } + if (matchesByAddress.containsKey(relayIp)) { + List<QueryResponse.Match> matchesList + = new ArrayList<>(matchesByAddress.get(relayIp)); + matchesList.sort((m1, m2) -> { + if (m1 == m2) { + return 0; + } else if (!m1.timestamp.equals(m2.timestamp)) { + return m1.timestamp.compareTo(m2.timestamp); + } else { + return m1.fingerprint.compareTo(m2.fingerprint); } - if (!nearbyAddresses.isEmpty()) { - response.nearbyAddresses = nearbyAddresses.toArray(new String[0]); + }); + response.matches = matchesList.toArray(new QueryResponse.Match[0]); + } else { + SortedSet<String> nearbyAddresses = new TreeSet<>(); + String relayIpHex24Or48 = !relayIp.contains(":") + ? this.convertIpV4ToHex(relayIp).substring(0, 6) + : this.convertIpV6ToHex(relayIp).substring(0, 12); + for (String address : matchesByAddress.keySet()) { + String nearbyAddressHex24Or48 = !address.contains(":") + ? this.convertIpV4ToHex(address).substring(0, 6) + : this.convertIpV6ToHex(address).substring(0, 12); + if (relayIpHex24Or48.equals(nearbyAddressHex24Or48)) { + nearbyAddresses.add(address); } } + if (!nearbyAddresses.isEmpty()) { + response.nearbyAddresses = nearbyAddresses.toArray(new String[0]); + } }
return response; diff --git a/src/main/sql/exonerator.sql b/src/main/sql/exonerator.sql index 129f78b..7e5ad7e 100755 --- a/src/main/sql/exonerator.sql +++ b/src/main/sql/exonerator.sql @@ -1,6 +1,16 @@ -- Copyright 2011--2018 The Tor Project -- See LICENSE for licensing information
+-------------------------------------------------------------------------------- +-- This schema has been superseded by exonerator2.sql: +-- - Existing databases can be migrated by running exonerator2.sql. See the +-- instructions in that file for details. +-- - New databases need to be initialized by running this script first and then +-- exonerator2.sql. +-- - At some point in the future this file will disappear, and exonerator2.sql +-- will be modified to create a new database from scratch. +-------------------------------------------------------------------------------- + -- The statusentry table stores network status consensus entries listing -- a relay as running at a certain point in time. Only relays with the -- Running flag shall be inserted into this table. If a relay advertises diff --git a/src/main/sql/exonerator2.sql b/src/main/sql/exonerator2.sql new file mode 100755 index 0000000..397d2bf --- /dev/null +++ b/src/main/sql/exonerator2.sql @@ -0,0 +1,508 @@ +-- Copyright 2011--2018 The Tor Project +-- See LICENSE for licensing information + +-------------------------------------------------------------------------------- +-- This schema supersedes exonerator.sql: +-- - Existing databases can be migrated by running this script. It is highly +-- recommended to read this file before migrating. +-- - New databases need to be initialized by running exonerator.sql first and +-- then this script. +-- - At some point in the future exonerator.sql will disappear, and this file +-- will be modified to create a new database from scratch. +-------------------------------------------------------------------------------- + +-- How to migrate from exonerator.sql: +-- - Migration takes a while! And it can break. Read this script first! +-- - Make sure there is at least 50% disk space left and the machine is +-- connected to power. +-- - Turn off the importer cronjob, make sure it is not currently running, and +-- otherwise wait for it to finish. +-- - Back up the database and possibly move the backup offsite, if otherwise +-- there is less than 50% disk space left to complete the migration. +-- - Run this script, either step by step or as a whole. It's going to take a +-- while. +-- - Update the Java importer and servlets to use the new functions. +-- - Turn the importer back on. + +-- Exit on first error, which prevents multiple executions of this file. +\set ON_ERROR_STOP true + +-- The fingerprint table stores fingerprint strings uniquely identifying relays +-- and assigns much shorter numeric identifiers for internal-only use. +CREATE TABLE fingerprint ( + + -- The auto-incremented numeric identifier for a unique fingerprint. + fingerprint_id SERIAL PRIMARY KEY, + + -- The 27-character base64-encoded string uniquely identifying the relay. + fingerprint_base64 CHARACTER(27) UNIQUE NOT NULL +); + +-- The nickname table stores nickname strings for display in the results table +-- and assigns much shorter numeric identifiers for internal-only use. +CREATE TABLE nickname ( + + -- The auto-incremented numeric identifier for a unique nickname. + nickname_id SERIAL PRIMARY KEY, + + -- The 1 to 19 character long alphanumeric nickname assigned to the relay by + -- its operator. + nickname CHARACTER VARYING(19) UNIQUE NOT NULL +); + +-- The date_address24 table is the main lookup table. It contains all +-- fingerprints seen in the network on a given date and with a given /24 +-- address prefix. +CREATE TABLE date_address24 ( + + -- The auto-incremented numeric identifier for an entry in this table. + date_address24_id SERIAL PRIMARY KEY, + + -- The date when the relay was listed in a network status consensus or scanned + -- by the exit scanner and contained in an exit list. + date DATE NOT NULL, + + -- The most significant 3 bytes of the relay's IPv4 or IPv6 address in + -- lower-case hex notation. The purpose of this notation is to quickly reduce + -- query results and to enable searches for relays in the same /24 network. + -- Can be NULL to store the information that there is data available for a + -- given date. + address24 CHARACTER(6), + + -- The numeric fingerprint identifier uniquely identifying the relay. Can be + -- NULL to store the information that there is data available for a given + -- date. + fingerprint_id integer REFERENCES fingerprint (fingerprint_id), + + UNIQUE (date, address24, fingerprint_id) +); + +-- The statusentry_oraddress table stores network status consensus entries +-- listing a relay as running at a certain point in time. Only relays with the +-- Running flag shall be inserted into this table. If a relay advertises more +-- than one IP address, there is a distinct entry for each address in this +-- table. If a relay advertises more than one TCP port on the same IP address, +-- there is only a single entry in this table. +CREATE TABLE statusentry_oraddress ( + + -- The auto-incremented numeric identifier for an entry in this table. + statusentry_oraddress_id BIGSERIAL PRIMARY KEY, + + -- The valid-after time of the consensus that contains this entry. + validafter TIMESTAMP WITHOUT TIME ZONE NOT NULL, + + -- The numeric fingerprint identifier uniquely identifying the relay. + fingerprint_id integer REFERENCES fingerprint (fingerprint_id) NOT NULL, + + -- The relay's onion routing address. Can be an IPv4 or an IPv6 address. If a + -- relay advertises more than one address, there are multiple entries in this + -- table for the same status entry. + oraddress INET NOT NULL, + + -- The numeric nickname identifier referencing the relay nickname. + nickname_id integer REFERENCES nickname (nickname_id) NOT NULL, + + -- Whether the relay permitted exiting, which can be TRUE, FALSE, or NULL for + -- unknown. + exit BOOLEAN, + + UNIQUE (validafter, fingerprint_id, oraddress) +); + +-- The exitlistentry_exitaddress table stores the results of the active testing, +-- DNS-based exit list for exit nodes. An entry in this table means that a relay +-- was scanned at a given time and found to be exiting to the Internet from a +-- given IP address. This IP address can be different from the relay's onion +-- routing address if the relay uses more than one IP addresses. +CREATE TABLE exitlistentry_exitaddress ( + + -- The auto-incremented numeric identifier for an entry in this table. + exitlistentry_exitaddress_id SERIAL PRIMARY KEY, + + -- The numeric fingerprint identifier uniquely identifying the relay. + fingerprint_id integer REFERENCES fingerprint (fingerprint_id) NOT NULL, + + -- The IP address that the relay uses for exiting to the Internet. If the + -- relay uses more than one IP address, there are multiple entries in this + -- table. + exitaddress INET NOT NULL, + + -- The time when the relay was scanned to find out its exit IP address(es). + scanned TIMESTAMP WITHOUT TIME ZONE NOT NULL, + + -- An exit list entry is uniquely identified by its scan time, relay + -- fingerprint ID, and exit address. + UNIQUE (scanned, fingerprint_id, exitaddress) +); + +-- Insert the given base64-encoded fingerprint into the fingerprint table, if +-- it's not yet contained, and return the fingerprint identifier. +CREATE OR REPLACE FUNCTION insert_fingerprint ( + insert_fingerprint_base64 CHARACTER(27)) + RETURNS INTEGER AS $$ +DECLARE + result INTEGER; +BEGIN + SELECT fingerprint_id + INTO result + FROM fingerprint + WHERE fingerprint_base64 = insert_fingerprint_base64; + IF result IS NULL THEN + INSERT INTO fingerprint(fingerprint_id, fingerprint_base64) + VALUES (DEFAULT, insert_fingerprint_base64) + RETURNING fingerprint_id INTO result; + END IF; + RETURN result; +END; +$$ LANGUAGE plpgsql; + +-- Insert the given nickname into the nickname table, if it's not yet contained, +-- and return the nickname identifier. +CREATE OR REPLACE FUNCTION insert_nickname ( + param_nickname CHARACTER VARYING(19)) + RETURNS INTEGER AS $$ +DECLARE + result INTEGER; +BEGIN + SELECT nickname_id + INTO result + FROM nickname + WHERE nickname = param_nickname; + IF result IS NULL THEN + INSERT INTO nickname(nickname_id, nickname) + VALUES (DEFAULT, param_nickname) + RETURNING nickname_id INTO result; + END IF; + RETURN result; +END; +$$ LANGUAGE plpgsql; + +-- Insert the given date as well as the given combination of date, hex-encoded +-- /24 IP address prefix, and fingerprint identifier into the date_address24 +-- table, if they're not yet contained. Return the number of inserted rows, +-- which is 2 if the given date did not exist in the date_address24 table yet, 1 +-- if the date existed but not in combination with the given address prefix and +-- fingerprint identifier, or 0 if both date and combination already existed. +CREATE OR REPLACE FUNCTION insert_date_address24 ( + insert_date DATE, + insert_address24 CHARACTER(6), + insert_fingerprint_id INTEGER) + RETURNS INTEGER AS $$ + DECLARE + existing_rows INTEGER; + BEGIN + SELECT COUNT(*) + INTO existing_rows + FROM date_address24 + WHERE date = insert_date + AND (address24 IS NULL OR address24 = insert_address24) + AND (fingerprint_id IS NULL + OR fingerprint_id = insert_fingerprint_id); + IF existing_rows < 2 THEN + IF existing_rows < 1 THEN + INSERT INTO date_address24 (date, address24, fingerprint_id) + VALUES (insert_date, NULL, NULL); + END IF; + INSERT INTO date_address24 (date, address24, fingerprint_id) + VALUES (insert_date, insert_address24, insert_fingerprint_id); + RETURN 2 - existing_rows; + ELSE + RETURN 0; + END IF; + END; +$$ LANGUAGE 'plpgsql'; + +-- Insert a status entry into the statusentry_oraddress table. First check that +-- this status entry isn't contained in the table yet. It's okay to insert the +-- same status entry multiple times for different IP addresses though. Return +-- the number of inserted rows as result. +CREATE OR REPLACE FUNCTION insert_statusentry_oraddress ( + insert_validafter TIMESTAMP WITHOUT TIME ZONE, + insert_fingerprint_base64 CHARACTER(27), + insert_oraddress TEXT, + insert_oraddress24 CHARACTER(6), + insert_nickname_param CHARACTER VARYING(19), + insert_exit BOOLEAN) + RETURNS INTEGER AS $$ + DECLARE + insert_fingerprint_id INTEGER; + insert_nickname_id INTEGER; + BEGIN + insert_fingerprint_id := + insert_fingerprint(insert_fingerprint_base64); + IF (SELECT COUNT(*) + FROM statusentry_oraddress + WHERE validafter = insert_validafter + AND fingerprint_id = insert_fingerprint_id + AND oraddress = insert_oraddress::INET) = 0 THEN + insert_nickname_id := insert_nickname(insert_nickname_param); + INSERT INTO statusentry_oraddress (validafter, fingerprint_id, + oraddress, nickname_id, exit) + VALUES (insert_validafter, insert_fingerprint_id, + insert_oraddress::INET, insert_nickname_id, + insert_exit); + RETURN 1 + insert_date_address24(DATE(insert_validafter), + insert_oraddress24, insert_fingerprint_id); + ELSE + RETURN 0; + END IF; + END; +$$ LANGUAGE 'plpgsql'; + +-- Insert an exit list entry into the exitlistentry_exitaddress table. Check +-- that this entry hasn't been inserted before. It's okay to insert the same +-- exit list entry multiple times for different exit addresses. Return the +-- number of inserted rows as result. +CREATE OR REPLACE FUNCTION insert_exitlistentry_exitaddress ( + insert_fingerprint_base64 CHARACTER(27), + insert_exitaddress TEXT, + insert_exitaddress24 CHARACTER(6), + insert_scanned TIMESTAMP WITHOUT TIME ZONE) + RETURNS INTEGER AS $$ + DECLARE + insert_fingerprint_id INTEGER; + BEGIN + insert_fingerprint_id := insert_fingerprint(insert_fingerprint_base64); + IF (SELECT COUNT(*) + FROM exitlistentry_exitaddress + WHERE fingerprint_id = insert_fingerprint_id + AND exitaddress = insert_exitaddress::INET + AND scanned = insert_scanned) = 0 THEN + INSERT INTO exitlistentry_exitaddress (fingerprint_id, exitaddress, + scanned) VALUES (insert_fingerprint_id, insert_exitaddress::INET, + insert_scanned); + RETURN 1 + insert_date_address24(DATE(insert_scanned), + insert_exitaddress24, insert_fingerprint_id); + ELSE + RETURN 0; + END IF; + END; +$$ LANGUAGE 'plpgsql'; + +-- Search for (1) status entries with an IPv4 or IPv6 onion routing address in +-- the same /24 network as the given hex-encoded IP address prefix and with a +-- valid-after date within a day of the given date, (2) exit list entries with +-- an IPv4 exit address in the same /24 network and with a scan time not earlier +-- than two days before and not later than one day after the given date, and (3) +-- the last and first dates in the database as well as the dates for which the +-- database contains relevant data within a day of the given date. +-- +-- This function makes heavy use of the date_address24 table in order to reduce +-- query response time by first obtaining all relevant fingerprint identifiers. +-- In the next step it runs three selects to obtain status entries, exit list +-- entries, and relevant dates. Any postprocessing, including filtering by exact +-- IP address or matching status entries and exit list entries, needs to happen +-- at the caller. +CREATE OR REPLACE FUNCTION search_by_date_address24 ( + search_date DATE, search_address24 CHARACTER(6)) + RETURNS TABLE( + date DATE, + fingerprint_base64 CHARACTER(27), + scanned TIMESTAMP WITHOUT TIME ZONE, + exitaddress INET, + validafter TIMESTAMP WITHOUT TIME ZONE, + nickname CHARACTER VARYING(19), + exit BOOLEAN, + oraddress INET) AS $$ + BEGIN + RETURN QUERY EXECUTE + 'WITH matching_fingerprint_ids AS ( + SELECT fingerprint_id FROM date_address24 + WHERE date_address24.date >= $1 - 2 + AND date_address24.date <= $1 + 1 + AND date_address24.address24 = $2) + SELECT NULL::DATE AS date, fingerprint_base64, scanned, exitaddress, + NULL AS validafter, NULL AS nickname, NULL AS exit, + NULL AS oraddress + FROM exitlistentry_exitaddress + NATURAL JOIN fingerprint + WHERE DATE(exitlistentry_exitaddress.scanned) >= $1 - 2 + AND DATE(exitlistentry_exitaddress.scanned) <= $1 + 1 + AND exitlistentry_exitaddress.fingerprint_id + IN (SELECT fingerprint_id FROM matching_fingerprint_ids) + UNION + SELECT NULL::DATE AS date, fingerprint_base64, NULL AS scanned, + NULL AS exitaddress, validafter, nickname, exit, + oraddress + FROM statusentry_oraddress + NATURAL JOIN fingerprint + NATURAL JOIN nickname + WHERE DATE(statusentry_oraddress.validafter) >= $1 - 1 + AND DATE(statusentry_oraddress.validafter) <= $1 + 1 + AND statusentry_oraddress.fingerprint_id + IN (SELECT fingerprint_id FROM matching_fingerprint_ids) + UNION + SELECT date, NULL AS fingerprint_base64, NULL AS scanned, + NULL AS exitaddress, NULL AS validafter, NULL AS nickname, + NULL AS exit, NULL AS oraddress + FROM date_address24 + WHERE date IN (SELECT MIN(date) FROM date_address24 UNION + SELECT MAX(date) FROM date_address24 UNION + SELECT date FROM date_address24 + WHERE date >= $1 - 1 AND date <= $1 + 1)' + USING search_date, search_address24; +END; +$$ LANGUAGE plpgsql; + +-- Migrate from the earlier schema in exonerator.sql to this schema. This +-- migration happens in two big loops, one over statusentry and one over +-- exitlistentry, in which entries are copied to statusentry_oraddress and +-- exitlistentry_exitaddress, respectively. Entries in both loops are ordered by +-- fingerprint and by timestamp in order to reduce insert attempts into the +-- fingerprint, nickname, and date_address24 tables as much as possible. This +-- function can easily run for days, which is why it prints out a few dozen +-- progress messages while copying rows. This function is only run once and +-- deleted further down below. +CREATE OR REPLACE FUNCTION migrate_from_exonerator_sql() + RETURNS INTEGER AS $$ +DECLARE + existing_rows BIGINT; + copied_rows BIGINT := 0; + last_printed_progress BIGINT := 0; + last_nickname TEXT := NULL; + last_nickname_id INTEGER; + last_fingerprint_base64 TEXT := NULL; + last_fingerprint_id INTEGER; + last_address24 TEXT := NULL; + last_date DATE := NULL; + encoded_rawstatusentry TEXT; + exit BOOLEAN; + rec RECORD; + matches TEXT[]; + oraddress24 CHARACTER(6); + fingerprint_base64 TEXT; +BEGIN + RAISE NOTICE '% Starting schema migration.', timeofday(); + SELECT COUNT(*) INTO existing_rows FROM statusentry; + RAISE NOTICE '% Sorting % rows in statusentry (this may take days!).', + timeofday(), existing_rows; + FOR rec IN SELECT * FROM statusentry ORDER BY fingerprint, validafter + LOOP + IF copied_rows = 0 THEN + RAISE NOTICE '% Query returned, starting to copy.', timeofday(); + END IF; + encoded_rawstatusentry := ENCODE(rec.rawstatusentry, 'escape'); + matches := regexp_matches(encoded_rawstatusentry, '^r (\S+) (\S+) '); + IF last_nickname IS NULL OR matches[1] != last_nickname THEN + last_nickname_id := insert_nickname(matches[1]); + END IF; + IF last_fingerprint_base64 IS NULL + OR matches[2] != last_fingerprint_base64 THEN + last_fingerprint_id := insert_fingerprint(matches[2]); + END IF; + IF encoded_rawstatusentry ~ 'p (\S+ [\d-,]+)' THEN + exit := encoded_rawstatusentry !~ 'p reject 1-65535'; + ELSE + exit := NULL; + END IF; + INSERT INTO statusentry_oraddress (validafter, fingerprint_id, + oraddress, nickname_id, exit) + VALUES (rec.validafter, last_fingerprint_id, rec.oraddress, + last_nickname_id, exit); + IF rec.oraddress24 IS NOT NULL THEN + oraddress24 := rec.oraddress24; + ELSIF rec.oraddress48 IS NOT NULL THEN + oraddress24 := SUBSTRING(rec.oraddress48, 1, 6); + ELSE + RAISE EXCEPTION 'Invalid statusentry row: %, %, %.', + rec.validafter, rec.fingerprint, rec.oraddress; + END IF; + IF last_fingerprint_base64 IS NULL + OR matches[2] != last_fingerprint_base64 + OR last_address24 IS NULL OR oraddress24 != last_address24 + OR last_date IS NULL OR DATE(rec.validafter) != last_date THEN + PERFORM insert_date_address24(DATE(rec.validafter), oraddress24, + last_fingerprint_id); + last_address24 := oraddress24; + last_date := DATE(rec.validafter); + END IF; + last_nickname := matches[1]; + last_fingerprint_base64 := matches[2]; + copied_rows := copied_rows + 1; + IF copied_rows - last_printed_progress > existing_rows / 50 THEN + RAISE NOTICE '% Copied % rows from statusentry.', + timeofday(), copied_rows; + last_printed_progress := copied_rows; + END IF; + END LOOP; + last_nickname := NULL; + last_fingerprint_base64 := NULL; + last_address24 := NULL; + last_date := NULL; + SELECT COUNT(*) INTO existing_rows FROM exitlistentry; + copied_rows := 0; + last_printed_progress := 0; + RAISE NOTICE '% Sorting % rows in exitlistentry (this may take hours!).', + timeofday(), existing_rows; + FOR rec IN SELECT * FROM exitlistentry + ORDER BY fingerprint, DATE(scanned) + LOOP + IF copied_rows = 0 THEN + RAISE NOTICE '% Query returned, starting to copy.', timeofday(); + END IF; + fingerprint_base64 := SUBSTRING(ENCODE(DECODE(rec.fingerprint, 'hex'), + 'base64'), 1, 27); + IF last_fingerprint_base64 IS NULL + OR fingerprint_base64 != last_fingerprint_base64 THEN + last_fingerprint_id := insert_fingerprint(fingerprint_base64); + END IF; + INSERT INTO exitlistentry_exitaddress (fingerprint_id, exitaddress, + scanned) VALUES (last_fingerprint_id, rec.exitaddress, rec.scanned); + IF last_fingerprint_base64 IS NULL + OR fingerprint_base64 != last_fingerprint_base64 + OR last_address24 IS NULL OR rec.exitaddress24 != last_address24 + OR last_date IS NULL OR DATE(rec.scanned) != last_date THEN + PERFORM insert_date_address24(DATE(rec.scanned), rec.exitaddress24, + last_fingerprint_id); + last_address24 := rec.exitaddress24; + last_date := DATE(rec.scanned); + END IF; + last_fingerprint_base64 := fingerprint_base64; + copied_rows := copied_rows + 1; + IF copied_rows - last_printed_progress > existing_rows / 5 THEN + RAISE NOTICE '% Copied % rows from exitlistentry.', + timeofday(), copied_rows; + last_printed_progress := copied_rows; + END IF; + END LOOP; + RAISE NOTICE '% Completed schema migration.', timeofday(); +RETURN 1; +END; +$$ LANGUAGE plpgsql; + +-- Run the migration script once. This is the only time this function is run +-- before it's dropped further down below. +SELECT migrate_from_exonerator_sql(); + +-- Create an index on date and first three address bytes which is supposed to be +-- the main index used for lookups. +CREATE INDEX date_address24_date_address24 + ON date_address24 (date, address24); + +-- Create an index on statusentry_oraddress for joining with date_address24 as +-- part of search_by_date_address24. +CREATE INDEX statusentry_oraddress_date_validafter_fingerprint_id + ON statusentry_oraddress (DATE(validafter), fingerprint_id); + +-- Create an index on exitlistentry_exitaddress for joining with date_address24 +-- as part of search_by_date_address24. +CREATE INDEX exitlistentry_exitaddress_date_scanned_fingerprint_id + ON exitlistentry_exitaddress (DATE(scanned), fingerprint_id); + +-- Drop the migration function. +DROP FUNCTION migrate_from_exonerator_sql(); + +-- Drop the old insert_* and search_* functions which are based on the old +-- statusentry and exitlistentry tables. +DROP FUNCTION insert_exitlistentry( + CHARACTER, CHARACTER, TEXT, TIMESTAMP WITHOUT TIME ZONE, BYTEA); +DROP FUNCTION insert_statusentry( + TIMESTAMP WITHOUT TIME ZONE, CHARACTER, CHARACTER, CHARACTER, CHARACTER, + TEXT, BYTEA); +DROP FUNCTION search_by_address24_date(TEXT, DATE); +DROP FUNCTION search_by_address48_date(TEXT, DATE); + +-- Also drop the old tables including any indexes on them. +DROP TABLE exitlistentry CASCADE; +DROP TABLE statusentry CASCADE; + diff --git a/src/test/java/org/torproject/metrics/exonerator/QueryResponseTest.java b/src/test/java/org/torproject/metrics/exonerator/QueryResponseTest.java index 5390340..97ae88d 100644 --- a/src/test/java/org/torproject/metrics/exonerator/QueryResponseTest.java +++ b/src/test/java/org/torproject/metrics/exonerator/QueryResponseTest.java @@ -15,6 +15,7 @@ import java.io.IOException; import java.io.StringReader; import java.util.Arrays; import java.util.Collection; +import java.util.TreeSet;
@RunWith(Parameterized.class) public class QueryResponseTest { @@ -46,10 +47,10 @@ public class QueryResponseTest { {new QueryResponse("1.0", "12.13.14.15", "2016-12-12", "2016-01-01", "2016-12-31", true, new QueryResponse.Match[]{new QueryResponse.Match("2016-12-03", - new String[] {"12.13.14.15", "12.13.14.16"}, + new TreeSet<>(Arrays.asList("12.13.14.15", "12.13.14.16")), "fingerprint-not-checked", "some name", true), new QueryResponse.Match("2012-12-03", - new String[] {"12.13.20.15", "12.13.20.16"}, + new TreeSet<>(Arrays.asList("12.13.20.15", "12.13.20.16")), "fingerprint2-not-checked", "some name2", false)}, new String[] {"12.13.14.15", "12.13.14.16"}), "{"version":"1.0"," @@ -71,10 +72,10 @@ public class QueryResponseTest { {new QueryResponse("1.0", "12.13.14.15", "2016-12-12", "2016-01-01", "2016-12-31", false, new QueryResponse.Match[]{new QueryResponse.Match("2016-12-03", - new String[] {"12.13.14.15", "12.13.14.16"}, + new TreeSet<>(Arrays.asList("12.13.14.15", "12.13.14.16")), "fingerprint-not-checked", "some name", null), new QueryResponse.Match("2012-12-03", - new String[] {"12.13.20.15", "12.13.20.16"}, + new TreeSet<>(Arrays.asList("12.13.20.15", "12.13.20.16")), "fingerprint2-not-checked", "some name2", true)}, new String[] {"12.13.14.15", "12.13.14.16"}), "{"version":"1.0","
tor-commits@lists.torproject.org