[tor-commits] [metrics-web/master] Tweak refresh functions to use partitioned table.

karsten at torproject.org karsten at torproject.org
Thu Jan 12 15:36:05 UTC 2012


commit 13857257ac1df8c4b4d9bc00e0164b5731289762
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Thu Jan 12 14:54:16 2012 +0100

    Tweak refresh functions to use partitioned table.
---
 db/tordir.sql |  162 ++++++++++++++++++++++++++++++++++++++-------------------
 1 files changed, 109 insertions(+), 53 deletions(-)

diff --git a/db/tordir.sql b/db/tordir.sql
index fbb1341..bc93b45 100644
--- a/db/tordir.sql
+++ b/db/tordir.sql
@@ -438,11 +438,18 @@ $$ LANGUAGE plpgsql;
 
 -- FUNCTION refresh_network_size()
 CREATE OR REPLACE FUNCTION refresh_network_size() RETURNS INTEGER AS $$
+    DECLARE
+        min_date TIMESTAMP WITHOUT TIME ZONE;
+        max_date TIMESTAMP WITHOUT TIME ZONE;
     BEGIN
 
+    min_date := (SELECT MIN(date) FROM updates);
+    max_date := (SELECT MAX(date) + 1 FROM updates);
+
     DELETE FROM network_size
     WHERE date IN (SELECT date FROM updates);
 
+    EXECUTE '
         INSERT INTO network_size
         (date, avg_running, avg_exit, avg_guard, avg_fast, avg_stable)
         SELECT date,
@@ -460,12 +467,12 @@ CREATE OR REPLACE FUNCTION refresh_network_size() RETURNS INTEGER AS $$
                 COUNT(NULLIF(isstable, FALSE)) AS isstable
             FROM statusentry
             WHERE isrunning = TRUE
-              AND DATE(validafter) >= (SELECT MIN(date) FROM updates)
-              AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+              AND validafter >= ''' || min_date || '''
+              AND validafter < ''' || max_date || '''
               AND DATE(validafter) IN (SELECT date FROM updates)
             GROUP BY DATE(validafter)
             ) b
-        NATURAL JOIN relay_statuses_per_day;
+        NATURAL JOIN relay_statuses_per_day';
 
     RETURN 1;
     END;
@@ -473,11 +480,18 @@ $$ LANGUAGE plpgsql;
 
 -- FUNCTION refresh_network_size_hour()
 CREATE OR REPLACE FUNCTION refresh_network_size_hour() RETURNS INTEGER AS $$
+    DECLARE
+        min_date TIMESTAMP WITHOUT TIME ZONE;
+        max_date TIMESTAMP WITHOUT TIME ZONE;
     BEGIN
 
+    min_date := (SELECT MIN(date) FROM updates);
+    max_date := (SELECT MAX(date) + 1 FROM updates);
+
     DELETE FROM network_size_hour
     WHERE DATE(validafter) IN (SELECT date FROM updates);
 
+    EXECUTE '
     INSERT INTO network_size_hour
     (validafter, avg_running, avg_exit, avg_guard, avg_fast, avg_stable)
     SELECT validafter, COUNT(*) AS avg_running,
@@ -487,10 +501,10 @@ CREATE OR REPLACE FUNCTION refresh_network_size_hour() RETURNS INTEGER AS $$
     COUNT(NULLIF(isstable, FALSE)) AS avg_stable
     FROM statusentry
     WHERE isrunning = TRUE
-    AND DATE(validafter) >= (SELECT MIN(date) FROM updates)
-    AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+    AND validafter >= ''' || min_date || '''
+    AND validafter < ''' || max_date || '''
     AND DATE(validafter) IN (SELECT date FROM updates)
-    GROUP BY validafter;
+    GROUP BY validafter';
 
     RETURN 1;
     END;
@@ -498,27 +512,34 @@ $$ LANGUAGE plpgsql;
 
 -- FUNCTION refresh_relay_countries()
 CREATE OR REPLACE FUNCTION refresh_relay_countries() RETURNS INTEGER AS $$
+    DECLARE
+        min_date TIMESTAMP WITHOUT TIME ZONE;
+        max_date TIMESTAMP WITHOUT TIME ZONE;
     BEGIN
 
+    min_date := (SELECT MIN(date) FROM updates);
+    max_date := (SELECT MAX(date) + 1 FROM updates);
+
     DELETE FROM relay_countries
     WHERE date IN (SELECT date FROM updates);
 
+    EXECUTE '
     INSERT INTO relay_countries
     (date, country, relays)
     SELECT date, country, relays / count AS relays
     FROM (
         SELECT DATE(validafter),
-               COALESCE(lower((geoip_lookup(address)).country), 'zz')
+               COALESCE(lower((geoip_lookup(address)).country), ''zz'')
                  AS country,
                COUNT(*) AS relays
         FROM statusentry
         WHERE isrunning = TRUE
-              AND DATE(validafter) >= (SELECT MIN(date) FROM updates)
-              AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+              AND validafter >= ''' || min_date || '''
+              AND validafter < ''' || max_date || '''
               AND DATE(validafter) IN (SELECT date FROM updates)
         GROUP BY 1, 2
         ) b
-    NATURAL JOIN relay_statuses_per_day;
+    NATURAL JOIN relay_statuses_per_day';
 
     RETURN 1;
     END;
@@ -526,11 +547,18 @@ $$ LANGUAGE plpgsql;
 
 -- FUNCTION refresh_relay_platforms()
 CREATE OR REPLACE FUNCTION refresh_relay_platforms() RETURNS INTEGER AS $$
+    DECLARE
+        min_date TIMESTAMP WITHOUT TIME ZONE;
+        max_date TIMESTAMP WITHOUT TIME ZONE;
     BEGIN
 
+    min_date := (SELECT MIN(date) FROM updates);
+    max_date := (SELECT MAX(date) + 1 FROM updates);
+
     DELETE FROM relay_platforms
     WHERE date IN (SELECT date FROM updates);
 
+    EXECUTE '
     INSERT INTO relay_platforms
     (date, avg_linux, avg_darwin, avg_bsd, avg_windows, avg_other)
     SELECT date,
@@ -541,29 +569,29 @@ CREATE OR REPLACE FUNCTION refresh_relay_platforms() RETURNS INTEGER AS $$
         other / count AS avg_other
     FROM (
         SELECT DATE(validafter) AS date,
-            SUM(CASE WHEN platform LIKE '%Linux%' THEN 1 ELSE 0 END)
+            SUM(CASE WHEN platform LIKE ''%Linux%'' THEN 1 ELSE 0 END)
                 AS linux,
-            SUM(CASE WHEN platform LIKE '%Darwin%' THEN 1 ELSE 0 END)
+            SUM(CASE WHEN platform LIKE ''%Darwin%'' THEN 1 ELSE 0 END)
                 AS darwin,
-            SUM(CASE WHEN platform LIKE '%BSD%' THEN 1 ELSE 0 END)
+            SUM(CASE WHEN platform LIKE ''%BSD%'' THEN 1 ELSE 0 END)
                 AS bsd,
-            SUM(CASE WHEN platform LIKE '%Windows%' THEN 1 ELSE 0 END)
+            SUM(CASE WHEN platform LIKE ''%Windows%'' THEN 1 ELSE 0 END)
                 AS windows,
-            SUM(CASE WHEN platform NOT LIKE '%Windows%'
-                AND platform NOT LIKE '%Darwin%'
-                AND platform NOT LIKE '%BSD%'
-                AND platform NOT LIKE '%Linux%' THEN 1 ELSE 0 END)
+            SUM(CASE WHEN platform NOT LIKE ''%Windows%''
+                AND platform NOT LIKE ''%Darwin%''
+                AND platform NOT LIKE ''%BSD%''
+                AND platform NOT LIKE ''%Linux%'' THEN 1 ELSE 0 END)
                 AS other
         FROM descriptor
         RIGHT JOIN statusentry
         ON statusentry.descriptor = descriptor.descriptor
         WHERE isrunning = TRUE
-          AND DATE(validafter) >= (SELECT MIN(date) FROM updates)
-          AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+          AND validafter >= ''' || min_date || '''
+          AND validafter < ''' || max_date || '''
           AND DATE(validafter) IN (SELECT date FROM updates)
         GROUP BY DATE(validafter)
         ) b
-    NATURAL JOIN relay_statuses_per_day;
+    NATURAL JOIN relay_statuses_per_day';
 
     RETURN 1;
     END;
@@ -571,11 +599,18 @@ $$ LANGUAGE plpgsql;
 
 -- FUNCTION refresh_relay_versions()
 CREATE OR REPLACE FUNCTION refresh_relay_versions() RETURNS INTEGER AS $$
+    DECLARE
+        min_date TIMESTAMP WITHOUT TIME ZONE;
+        max_date TIMESTAMP WITHOUT TIME ZONE;
     BEGIN
 
+    min_date := (SELECT MIN(date) FROM updates);
+    max_date := (SELECT MAX(date) + 1 FROM updates);
+
     DELETE FROM relay_versions
     WHERE date IN (SELECT date FROM updates);
 
+    EXECUTE '
     INSERT INTO relay_versions
     (date, version, relays)
     SELECT date, version, relays / count AS relays
@@ -586,12 +621,12 @@ CREATE OR REPLACE FUNCTION refresh_relay_versions() RETURNS INTEGER AS $$
         ON descriptor.descriptor = statusentry.descriptor
         WHERE isrunning = TRUE
               AND platform IS NOT NULL
-              AND DATE(validafter) >= (SELECT MIN(date) FROM updates)
-              AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+              AND validafter >= ''' || min_date || '''
+              AND validafter < ''' || max_date || '''
               AND DATE(validafter) IN (SELECT date FROM updates)
         GROUP BY 1, 2
         ) b
-    NATURAL JOIN relay_statuses_per_day;
+    NATURAL JOIN relay_statuses_per_day';
 
     RETURN 1;
     END;
@@ -600,11 +635,18 @@ $$ LANGUAGE plpgsql;
 -- FUNCTION refresh_total_bandwidth()
 -- This keeps the table total_bandwidth up-to-date when necessary.
 CREATE OR REPLACE FUNCTION refresh_total_bandwidth() RETURNS INTEGER AS $$
+    DECLARE
+        min_date TIMESTAMP WITHOUT TIME ZONE;
+        max_date TIMESTAMP WITHOUT TIME ZONE;
     BEGIN
 
+    min_date := (SELECT MIN(date) FROM updates);
+    max_date := (SELECT MAX(date) + 1 FROM updates);
+
     DELETE FROM total_bandwidth
     WHERE date IN (SELECT date FROM updates);
 
+    EXECUTE '
     INSERT INTO total_bandwidth
     (bwavg, bwburst, bwobserved, bwadvertised, date)
     SELECT (SUM(bandwidthavg)
@@ -621,16 +663,14 @@ CREATE OR REPLACE FUNCTION refresh_total_bandwidth() RETURNS INTEGER AS $$
     JOIN relay_statuses_per_day
     ON DATE(validafter) = relay_statuses_per_day.date
     WHERE isrunning = TRUE
-          AND DATE(validafter) >= (SELECT MIN(date) FROM updates)
-          AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+          AND validafter >= ''' || min_date || '''
+          AND validafter < ''' || max_date || '''
           AND DATE(validafter) IN (SELECT date FROM updates)
-          AND DATE(relay_statuses_per_day.date) >=
-              (SELECT MIN(date) FROM updates)
-          AND DATE(relay_statuses_per_day.date) <=
-              (SELECT MAX(date) FROM updates)
+          AND relay_statuses_per_day.date >= ''' || min_date || '''
+          AND relay_statuses_per_day.date < ''' || max_date || '''
           AND DATE(relay_statuses_per_day.date) IN
               (SELECT date FROM updates)
-    GROUP BY DATE(validafter), relay_statuses_per_day.count;
+    GROUP BY DATE(validafter), relay_statuses_per_day.count';
 
     RETURN 1;
     END;
@@ -651,8 +691,16 @@ CREATE OR REPLACE FUNCTION refresh_total_bwhist() RETURNS INTEGER AS $$
 $$ LANGUAGE plpgsql;
 
 CREATE OR REPLACE FUNCTION refresh_bwhist_flags() RETURNS INTEGER AS $$
-  BEGIN
+    DECLARE
+        min_date TIMESTAMP WITHOUT TIME ZONE;
+        max_date TIMESTAMP WITHOUT TIME ZONE;
+    BEGIN
+
+    min_date := (SELECT MIN(date) FROM updates);
+    max_date := (SELECT MAX(date) + 1 FROM updates);
+
   DELETE FROM bwhist_flags WHERE date IN (SELECT date FROM updates);
+  EXECUTE '
   INSERT INTO bwhist_flags (date, isexit, isguard, read, written)
   SELECT a.date, isexit, isguard, SUM(read_sum) as read,
       SUM(written_sum) AS written
@@ -663,14 +711,14 @@ CREATE OR REPLACE FUNCTION refresh_bwhist_flags() RETURNS INTEGER AS $$
              BOOL_OR(isguard) AS isguard
       FROM statusentry
       WHERE isrunning = TRUE
-        AND DATE(validafter) >= (SELECT MIN(date) FROM updates)
-        AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+        AND validafter >= ''' || min_date || '''
+        AND validafter < ''' || max_date || '''
         AND DATE(validafter) IN (SELECT date FROM updates)
       GROUP BY 1, 2) a
   JOIN bwhist
   ON a.date = bwhist.date
   AND a.fingerprint = bwhist.fingerprint
-  GROUP BY 1, 2, 3;
+  GROUP BY 1, 2, 3';
   RETURN 1;
   END;
 $$ LANGUAGE plpgsql;
@@ -680,11 +728,19 @@ $$ LANGUAGE plpgsql;
 -- directory request statistics of directory mirrors with bandwidth
 -- histories.
 CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$
-  BEGIN
+    DECLARE
+        min_date TIMESTAMP WITHOUT TIME ZONE;
+        max_date TIMESTAMP WITHOUT TIME ZONE;
+    BEGIN
+
+    min_date := (SELECT MIN(date) FROM updates);
+    max_date := (SELECT MAX(date) + 1 FROM updates);
+
   -- Start by deleting user statistics of the dates we're about to
   -- regenerate.
   DELETE FROM user_stats WHERE date IN (SELECT date FROM updates);
   -- Now insert new user statistics.
+  EXECUTE '
   INSERT INTO user_stats (date, country, r, dw, dr, drw, drr, bw, br, bwd,
       brd, bwr, brr, bwdr, brdr, bwp, brp, bwn, brn)
   SELECT
@@ -692,7 +748,7 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$
          dirreq_stats_by_country.date AS date,
          dirreq_stats_by_country.country AS country,
          dirreq_stats_by_country.r AS r,
-         -- In order to weight the reported directory requests, we're
+         -- In order to weight the reported directory requests, we are
          -- counting bytes of relays (except directory authorities)
          -- matching certain criteria: whether or not they are reporting
          -- directory requests, whether or not they are reporting
@@ -759,14 +815,14 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$
       ) dirreq_stats_split
       GROUP BY 1, 2, 3
     ) dirreq_stats_by_date
-    -- We're only interested in requests by directory mirrors, not
+    -- We are only interested in requests by directory mirrors, not
     -- directory authorities, so we exclude all relays with the Authority
     -- flag.
     RIGHT JOIN (
       SELECT fingerprint, DATE(validafter) AS date
       FROM statusentry
-      WHERE DATE(validafter) >= (SELECT MIN(date) FROM updates)
-      AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+      WHERE validafter >= ''' || min_date || '''
+      AND validafter < ''' || max_date || '''
       AND DATE(validafter) IN (SELECT date FROM updates)
       AND isauthority IS FALSE
       GROUP BY 1, 2
@@ -782,8 +838,8 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$
     SELECT fingerprint, date, read_sum AS read, written_sum AS written,
            dirread_sum AS dirread, dirwritten_sum AS dirwritten
     FROM bwhist
-    WHERE date >= (SELECT MIN(date) FROM updates)
-    AND date <= (SELECT MAX(date) FROM updates)
+    WHERE date >= ''' || min_date || '''
+    AND date < ''' || max_date || '''
     AND date IN (SELECT date FROM updates)
   ) bwhist_by_relay
   ON dirreq_stats_by_country.date = bwhist_by_relay.date
@@ -794,8 +850,8 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$
       SUM(CASE WHEN dirport > 0 THEN 1 ELSE NULL END) AS opendirport,
       SUM(CASE WHEN isauthority IS TRUE THEN 1 ELSE NULL END) AS authority
     FROM statusentry
-    WHERE DATE(validafter) >= (SELECT MIN(date) FROM updates)
-    AND DATE(validafter) <= (SELECT MAX(date) FROM updates)
+    WHERE validafter >= ''' || min_date || '''
+    AND validafter < ''' || max_date || '''
     AND DATE(validafter) IN (SELECT date FROM updates)
     GROUP BY 1, 2
   ) statusentry_by_relay
@@ -805,23 +861,23 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$
     -- For each relay, tell if it has reported directory request
     -- statistics on a given date. Again, we have to take into account
     -- that statistics intervals cover more than one calendar date in most
-    -- cases. The exact number of requests isn't relevant here, but only
+    -- cases. The exact number of requests is not relevant here, but only
     -- whether the relay reported directory requests or not.
     SELECT fingerprint, date, 1 AS requests
     FROM (
       SELECT LOWER(source) AS fingerprint, DATE(statsend) AS date
       FROM dirreq_stats
-      WHERE DATE(statsend) >= (SELECT MIN(date) FROM updates)
-      AND DATE(statsend) <= (SELECT MAX(date) FROM updates)
+      WHERE DATE(statsend) >= ''' || min_date || '''
+      AND DATE(statsend) < ''' || max_date || '''
       AND DATE(statsend) IN (SELECT date FROM updates)
-      AND country = 'zy'
+      AND country = ''zy''
       UNION
       SELECT LOWER(source) AS fingerprint, DATE(statsend) - 1 AS date
       FROM dirreq_stats
-      WHERE DATE(statsend) - 1 >= (SELECT MIN(date) FROM updates)
-      AND DATE(statsend) - 1 <= (SELECT MAX(date) FROM updates)
-      AND DATE(statsend) - 1 IN (SELECT date FROM updates)
-      AND country = 'zy'
+      WHERE DATE(statsend) - 1 >= ''' || min_date || '''
+      AND DATE(statsend) - 1 < ''' || max_date || '''
+      AND DATE(statsend) IN (SELECT date FROM updates)
+      AND country = ''zy''
       AND EXTRACT(EPOCH FROM DATE(statsend)) -
       EXTRACT(EPOCH FROM statsend) + seconds > 0
     ) dirreq_stats_split
@@ -832,7 +888,7 @@ CREATE OR REPLACE FUNCTION refresh_user_stats() RETURNS INTEGER AS $$
   WHERE dirreq_stats_by_country.country IS NOT NULL
   -- Group by date, country, and total reported directory requests,
   -- summing up the bandwidth histories.
-  GROUP BY 1, 2, 3;
+  GROUP BY 1, 2, 3';
   RETURN 1;
   END;
 $$ LANGUAGE plpgsql;





More information about the tor-commits mailing list