commit 3650d4e5b13d219ee03802c507569bb0e608023f Author: Karsten Loesing karsten.loesing@gmx.net Date: Tue Jan 12 19:28:17 2016 +0100
Ignore dirreq stats in legacy module.
We're not using directory-request statistics in the legacy module anymore, which only produces servers.csv and bandwidth.csv. Taking them out will make it easier to refactor the legacy module later. --- modules/legacy/db/tordir.sql | 134 ++------------------ .../cron/RelayDescriptorDatabaseImporter.java | 96 -------------- 2 files changed, 8 insertions(+), 222 deletions(-)
diff --git a/modules/legacy/db/tordir.sql b/modules/legacy/db/tordir.sql index 04d31c0..3c9e0ee 100644 --- a/modules/legacy/db/tordir.sql +++ b/modules/legacy/db/tordir.sql @@ -616,100 +616,25 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$ DELETE FROM user_stats WHERE date IN (SELECT date FROM updates); -- Now insert new user statistics. EXECUTE ' - INSERT INTO user_stats (date, country, r, dw, dr, drw, drr, bw, br, bwd, - brd, bwr, brr, bwdr, brdr, bwp, brp, bwn, brn) + INSERT INTO user_stats (date, country, dw, dr, bwd, brd, bwp, brp) SELECT - -- We want to learn about total requests by date and country. - dirreq_stats_by_country.date AS date, - dirreq_stats_by_country.country AS country, - dirreq_stats_by_country.r AS r, - -- In order to weight the reported directory requests, we are - -- counting bytes of relays (except directory authorities) - -- matching certain criteria: whether or not they are reporting - -- directory requests, whether or not they are reporting - -- directory bytes, and whether their directory port is open or - -- closed. + bwhist_by_relay.date AS date, + ''zy'' AS country, SUM(CASE WHEN authority IS NOT NULL THEN NULL ELSE dirwritten END) AS dw, SUM(CASE WHEN authority IS NOT NULL THEN NULL ELSE dirread END) AS dr, - SUM(CASE WHEN requests IS NULL OR authority IS NOT NULL - THEN NULL ELSE dirwritten END) AS dwr, - SUM(CASE WHEN requests IS NULL OR authority IS NOT NULL - THEN NULL ELSE dirread END) AS drr, - SUM(CASE WHEN authority IS NOT NULL - THEN NULL ELSE written END) AS bw, - SUM(CASE WHEN authority IS NOT NULL - THEN NULL ELSE read END) AS br, SUM(CASE WHEN dirwritten = 0 OR authority IS NOT NULL THEN NULL ELSE written END) AS bwd, SUM(CASE WHEN dirwritten = 0 OR authority IS NOT NULL THEN NULL ELSE read END) AS brd, - SUM(CASE WHEN requests IS NULL OR authority IS NOT NULL - THEN NULL ELSE written END) AS bwr, - SUM(CASE WHEN requests IS NULL OR authority IS NOT NULL - THEN NULL ELSE read END) AS brr, - SUM(CASE WHEN dirwritten = 0 OR requests IS NULL - OR authority IS NOT NULL THEN NULL ELSE written END) AS bwdr, - SUM(CASE WHEN dirwritten = 0 OR requests IS NULL - OR authority IS NOT NULL THEN NULL ELSE read END) AS brdr, SUM(CASE WHEN opendirport IS NULL OR authority IS NOT NULL THEN NULL ELSE written END) AS bwp, SUM(CASE WHEN opendirport IS NULL OR authority IS NOT NULL - THEN NULL ELSE read END) AS brp, - SUM(CASE WHEN opendirport IS NOT NULL OR authority IS NOT NULL - THEN NULL ELSE written END) AS bwn, - SUM(CASE WHEN opendirport IS NOT NULL OR authority IS NOT NULL - THEN NULL ELSE read END) AS brn + THEN NULL ELSE read END) AS brp FROM ( - -- The first sub-select tells us the total number of directory - -- requests per country reported by all directory mirrors. - SELECT dirreq_stats_by_date.date AS date, country, SUM(requests) AS r - FROM ( - SELECT fingerprint, date, country, SUM(requests) AS requests - FROM ( - -- There are two selects here, because in most cases the directory - -- request statistics cover two calendar dates. - SELECT LOWER(source) AS fingerprint, DATE(statsend) AS date, - country, FLOOR(requests * (CASE - WHEN EXTRACT(EPOCH FROM DATE(statsend)) > - EXTRACT(EPOCH FROM statsend) - seconds - THEN EXTRACT(EPOCH FROM statsend) - - EXTRACT(EPOCH FROM DATE(statsend)) - ELSE seconds END) / seconds) AS requests - FROM dirreq_stats - UNION - SELECT LOWER(source) AS fingerprint, DATE(statsend) - 1 AS date, - country, FLOOR(requests * - (EXTRACT(EPOCH FROM DATE(statsend)) - - EXTRACT(EPOCH FROM statsend) + seconds) - / seconds) AS requests - FROM dirreq_stats - WHERE EXTRACT(EPOCH FROM DATE(statsend)) - - EXTRACT(EPOCH FROM statsend) + seconds > 0 - ) dirreq_stats_split - GROUP BY 1, 2, 3 - ) dirreq_stats_by_date - -- We are only interested in requests by directory mirrors, not - -- directory authorities, so we exclude all relays with the Authority - -- flag. - RIGHT JOIN ( - SELECT fingerprint, DATE(validafter) AS date - FROM statusentry - WHERE validafter >= ''' || min_date || ''' - AND validafter < ''' || max_date || ''' - AND DATE(validafter) IN (SELECT date FROM updates) - AND isauthority IS FALSE - GROUP BY 1, 2 - ) statusentry_dirmirrors - ON dirreq_stats_by_date.fingerprint = - statusentry_dirmirrors.fingerprint - AND dirreq_stats_by_date.date = statusentry_dirmirrors.date - GROUP BY 1, 2 - ) dirreq_stats_by_country - LEFT JOIN ( - -- In the next step, we expand the result by bandwidth histories of - -- all relays. + -- Retrieve aggregate bandwidth histories of all relays in the given + -- time frame. SELECT fingerprint, date, read_sum AS read, written_sum AS written, dirread_sum AS dirread, dirwritten_sum AS dirwritten FROM bwhist @@ -717,7 +642,6 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$ AND date < ''' || max_date || ''' AND date IN (SELECT date FROM updates) ) bwhist_by_relay - ON dirreq_stats_by_country.date = bwhist_by_relay.date LEFT JOIN ( -- For each relay, tell how often it had an open directory port and -- how often it had the Authority flag assigned on a given date. @@ -732,38 +656,8 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$ ) statusentry_by_relay ON bwhist_by_relay.fingerprint = statusentry_by_relay.fingerprint AND bwhist_by_relay.date = statusentry_by_relay.date - LEFT JOIN ( - -- For each relay, tell if it has reported directory request - -- statistics on a given date. Again, we have to take into account - -- that statistics intervals cover more than one calendar date in most - -- cases. The exact number of requests is not relevant here, but only - -- whether the relay reported directory requests or not. - SELECT fingerprint, date, 1 AS requests - FROM ( - SELECT LOWER(source) AS fingerprint, DATE(statsend) AS date - FROM dirreq_stats - WHERE DATE(statsend) >= ''' || min_date || ''' - AND DATE(statsend) < ''' || max_date || ''' - AND DATE(statsend) IN (SELECT date FROM updates) - AND country = ''zy'' - UNION - SELECT LOWER(source) AS fingerprint, DATE(statsend) - 1 AS date - FROM dirreq_stats - WHERE DATE(statsend) - 1 >= ''' || min_date || ''' - AND DATE(statsend) - 1 < ''' || max_date || ''' - AND DATE(statsend) IN (SELECT date FROM updates) - AND country = ''zy'' - AND EXTRACT(EPOCH FROM DATE(statsend)) - - EXTRACT(EPOCH FROM statsend) + seconds > 0 - ) dirreq_stats_split - GROUP BY 1, 2 - ) dirreq_stats_by_relay - ON bwhist_by_relay.fingerprint = dirreq_stats_by_relay.fingerprint - AND bwhist_by_relay.date = dirreq_stats_by_relay.date - WHERE dirreq_stats_by_country.country IS NOT NULL - -- Group by date, country, and total reported directory requests, - -- summing up the bandwidth histories. - GROUP BY 1, 2, 3'; + -- Group by date and country, summing up the bandwidth histories. + GROUP BY 1, 2'; RETURN 1; END; $$ LANGUAGE plpgsql; @@ -782,18 +676,6 @@ CREATE TABLE bridge_network_size ( CONSTRAINT bridge_network_size_pkey PRIMARY KEY(date) );
--- TABLE dirreq_stats --- Contains daily users by country. -CREATE TABLE dirreq_stats ( - source CHARACTER(40) NOT NULL, - statsend TIMESTAMP WITHOUT TIME ZONE NOT NULL, - seconds INTEGER NOT NULL, - country CHARACTER(2) NOT NULL, - requests INTEGER NOT NULL, - CONSTRAINT dirreq_stats_pkey - PRIMARY KEY (source, statsend, seconds, country) -); - -- Refresh all statistics in the database. CREATE OR REPLACE FUNCTION refresh_all() RETURNS INTEGER AS $$ BEGIN diff --git a/modules/legacy/src/org/torproject/ernie/cron/RelayDescriptorDatabaseImporter.java b/modules/legacy/src/org/torproject/ernie/cron/RelayDescriptorDatabaseImporter.java index 59e6740..c92baa7 100644 --- a/modules/legacy/src/org/torproject/ernie/cron/RelayDescriptorDatabaseImporter.java +++ b/modules/legacy/src/org/torproject/ernie/cron/RelayDescriptorDatabaseImporter.java @@ -18,7 +18,6 @@ import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; -import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -96,12 +95,6 @@ public final class RelayDescriptorDatabaseImporter { private PreparedStatement psCs;
/** - * Prepared statement to check whether a given dirreq stats string has - * been imported into the database before. - */ - private PreparedStatement psQs; - - /** * Set of dates that have been inserted into the database for being * included in the next refresh run. */ @@ -137,12 +130,6 @@ public final class RelayDescriptorDatabaseImporter { private PreparedStatement psC;
/** - * Prepared statement to insert a given dirreq stats string into the - * database. - */ - private PreparedStatement psQ; - - /** * Logger for this class. */ private Logger logger; @@ -173,11 +160,6 @@ public final class RelayDescriptorDatabaseImporter { private BufferedWriter consensusOut;
/** - * Raw import file containing dirreq stats. - */ - private BufferedWriter dirReqOut; - - /** * Date format to parse timestamps. */ private SimpleDateFormat dateTimeFormat; @@ -245,8 +227,6 @@ public final class RelayDescriptorDatabaseImporter { + "FROM descriptor WHERE descriptor = ?"); this.psCs = conn.prepareStatement("SELECT COUNT(*) " + "FROM consensus WHERE validafter = ?"); - this.psQs = conn.prepareStatement("SELECT COUNT(*) " - + "FROM dirreq_stats WHERE source = ? AND statsend = ?"); this.psR = conn.prepareStatement("INSERT INTO statusentry " + "(validafter, nickname, fingerprint, descriptor, " + "published, address, orport, dirport, isauthority, " @@ -265,9 +245,6 @@ public final class RelayDescriptorDatabaseImporter { + "?)}"); this.psC = conn.prepareStatement("INSERT INTO consensus " + "(validafter) VALUES (?)"); - this.psQ = conn.prepareStatement("INSERT INTO dirreq_stats " - + "(source, statsend, seconds, country, requests) VALUES " - + "(?, ?, ?, ?, ?)"); this.psU = conn.prepareStatement("INSERT INTO scheduled_updates " + "(date) VALUES (?)"); this.scheduledUpdates = new HashSet<Long>(); @@ -861,68 +838,6 @@ public final class RelayDescriptorDatabaseImporter { } }
- /** - * Adds observations on the number of directory requests by country as - * seen on a directory at a given date to the database. - */ - public void addDirReqStats(String source, long statsEndMillis, - long seconds, Map<String, String> dirReqsPerCountry) { - String statsEnd = this.dateTimeFormat.format(statsEndMillis); - if (this.importIntoDatabase) { - try { - this.addDateToScheduledUpdates(statsEndMillis); - Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC")); - Timestamp statsEndTimestamp = new Timestamp(statsEndMillis); - this.psQs.setString(1, source); - this.psQs.setTimestamp(2, statsEndTimestamp, cal); - ResultSet rs = psQs.executeQuery(); - rs.next(); - if (rs.getInt(1) == 0) { - for (Map.Entry<String, String> e : - dirReqsPerCountry.entrySet()) { - this.psQ.clearParameters(); - this.psQ.setString(1, source); - this.psQ.setTimestamp(2, statsEndTimestamp, cal); - this.psQ.setLong(3, seconds); - this.psQ.setString(4, e.getKey()); - this.psQ.setLong(5, Long.parseLong(e.getValue())); - this.psQ.executeUpdate(); - rqsCount++; - if (rqsCount % autoCommitCount == 0) { - this.conn.commit(); - } - } - } - } catch (SQLException e) { - this.logger.log(Level.WARNING, "Could not add dirreq stats. We " - + "won't make any further SQL requests in this execution.", - e); - this.importIntoDatabase = false; - } - } - if (this.writeRawImportFiles) { - try { - if (this.dirReqOut == null) { - new File(rawFilesDirectory).mkdirs(); - this.dirReqOut = new BufferedWriter(new FileWriter( - rawFilesDirectory + "/dirreq_stats.sql")); - this.dirReqOut.write(" COPY dirreq_stats (source, statsend, " - + "seconds, country, requests) FROM stdin;\n"); - } - for (Map.Entry<String, String> e : - dirReqsPerCountry.entrySet()) { - this.dirReqOut.write(source + "\t" + statsEnd + "\t" + seconds - + "\t" + e.getKey() + "\t" + e.getValue() + "\n"); - } - } catch (IOException e) { - this.logger.log(Level.WARNING, "Could not write dirreq stats to " - + "raw database import file. We won't make any further " - + "attempts to write raw import files in this execution.", e); - this.writeRawImportFiles = false; - } - } - } - public void importRelayDescriptors() { if (archivesDirectory.exists()) { logger.fine("Importing files in directory " + archivesDirectory @@ -984,17 +899,6 @@ public final class RelayDescriptorDatabaseImporter { }
private void addExtraInfoDescriptor(ExtraInfoDescriptor descriptor) { - if (descriptor.getDirreqV3Reqs() != null) { - int allUsers = 0; - Map<String, String> obs = new HashMap<String, String>(); - for (int users : descriptor.getDirreqV3Reqs().values()) { - allUsers += users - 4; - } - obs.put("zy", "" + allUsers); - this.addDirReqStats(descriptor.getFingerprint(), - descriptor.getDirreqStatsEndMillis(), - descriptor.getDirreqStatsIntervalLength(), obs); - } List<String> bandwidthHistoryLines = new ArrayList<String>(); if (descriptor.getWriteHistory() != null) { bandwidthHistoryLines.add(descriptor.getWriteHistory().getLine());