[metrics-web/master] Be smarter about re-importing consensuses.

commit 2b3f4b90191b5e8eadd8a72f3841789d6a9e22ba Author: Karsten Loesing <karsten.loesing@gmx.net> Date: Wed Feb 10 17:50:03 2016 +0100 Be smarter about re-importing consensuses. A recent analysis of Metrics' back-end performance has revealed that importing consensuses into the database can take between a few seconds and a few *hours*. More precisely, importing a consensus for the first time takes seconds and re-importing a consensus that was already (partially) contained in the database can take hours. The reason for the latter is that we're checking for every status entry whether it's contained in the database before we're inserting it, and these 7k queries are crazy expensive. What we should do, which is what we're doing now, is request and store a list of fingerprints of contained status entries for a given consensus and only insert a status entry if its fingerprint is not contained in that list. Now we can avoid making these 7k queries and re-import a consensus within seconds. There were two situations when we re-imported one or more consensuses which took hours or more: whenever the host was rebooted during the database import and we lost import history, and whenever CollecTor fetched an outdated consensus from a directory authority that it had already received the hour before and that Metrics had already imported in its previous run. 
--- .../cron/RelayDescriptorDatabaseImporter.java | 50 ++++------------------ 1 file changed, 8 insertions(+), 42 deletions(-) diff --git a/modules/legacy/src/org/torproject/ernie/cron/RelayDescriptorDatabaseImporter.java b/modules/legacy/src/org/torproject/ernie/cron/RelayDescriptorDatabaseImporter.java index c92baa7..19f2a0f 100644 --- a/modules/legacy/src/org/torproject/ernie/cron/RelayDescriptorDatabaseImporter.java +++ b/modules/legacy/src/org/torproject/ernie/cron/RelayDescriptorDatabaseImporter.java @@ -77,12 +77,6 @@ public final class RelayDescriptorDatabaseImporter { private PreparedStatement psSs; /** - * Prepared statement to check whether a given network status consensus - * entry has been imported into the database before. - */ - private PreparedStatement psRs; - - /** * Prepared statement to check whether a given server descriptor has * been imported into the database before. */ @@ -174,13 +168,7 @@ public final class RelayDescriptorDatabaseImporter { * Set of fingerprints that we imported for the valid-after time in * <code>lastCheckedStatusEntries</code>. */ - private Set<String> insertedStatusEntries; - - /** - * Flag that tells us whether we need to check whether a network status - * entry is already contained in the database or not. - */ - private boolean separateStatusEntryCheckNecessary; + private Set<String> insertedStatusEntries = new HashSet<String>(); private boolean importIntoDatabase; private boolean writeRawImportFiles; @@ -218,11 +206,8 @@ public final class RelayDescriptorDatabaseImporter { this.conn.setAutoCommit(false); /* Prepare statements. */ - this.psSs = conn.prepareStatement("SELECT COUNT(*) " + this.psSs = conn.prepareStatement("SELECT fingerprint " + "FROM statusentry WHERE validafter = ?"); - this.psRs = conn.prepareStatement("SELECT COUNT(*) " - + "FROM statusentry WHERE validafter = ? 
AND " - + "fingerprint = ?"); this.psDs = conn.prepareStatement("SELECT COUNT(*) " + "FROM descriptor WHERE descriptor = ?"); this.psCs = conn.prepareStatement("SELECT COUNT(*) " @@ -253,10 +238,6 @@ public final class RelayDescriptorDatabaseImporter { this.logger.log(Level.WARNING, "Could not connect to database or " + "prepare statements.", e); } - - /* Initialize set of fingerprints to remember which status entries - * we already imported. */ - this.insertedStatusEntries = new HashSet<String>(); } /* Remember where we want to write raw import files. */ @@ -305,33 +286,17 @@ public final class RelayDescriptorDatabaseImporter { Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC")); Timestamp validAfterTimestamp = new Timestamp(validAfter); if (lastCheckedStatusEntries != validAfter) { + insertedStatusEntries.clear(); this.psSs.setTimestamp(1, validAfterTimestamp, cal); ResultSet rs = psSs.executeQuery(); - rs.next(); - if (rs.getInt(1) == 0) { - separateStatusEntryCheckNecessary = false; - insertedStatusEntries.clear(); - } else { - separateStatusEntryCheckNecessary = true; + while (rs.next()) { + String insertedFingerprint = rs.getString(1); + insertedStatusEntries.add(insertedFingerprint); } rs.close(); lastCheckedStatusEntries = validAfter; } - boolean alreadyContained = false; - if (separateStatusEntryCheckNecessary || - insertedStatusEntries.contains(fingerprint)) { - this.psRs.setTimestamp(1, validAfterTimestamp, cal); - this.psRs.setString(2, fingerprint); - ResultSet rs = psRs.executeQuery(); - rs.next(); - if (rs.getInt(1) > 0) { - alreadyContained = true; - } - rs.close(); - } else { - insertedStatusEntries.add(fingerprint); - } - if (!alreadyContained) { + if (!insertedStatusEntries.contains(fingerprint)) { this.psR.clearParameters(); this.psR.setTimestamp(1, validAfterTimestamp, cal); this.psR.setString(2, nickname); @@ -364,6 +329,7 @@ public final class RelayDescriptorDatabaseImporter { if (rrsCount % autoCommitCount == 0) { 
this.conn.commit(); } + insertedStatusEntries.add(fingerprint); } } catch (SQLException e) { this.logger.log(Level.WARNING, "Could not add network status "
participants (1)
-
karsten@torproject.org