[tor-commits] [metrics-web/master] Use a temporary table to speed up merging client stats.

karsten at torproject.org karsten at torproject.org
Thu Jan 7 16:37:19 UTC 2016


commit 44acfe50b92983e4285569bb88b997af78a56da9
Author: Karsten Loesing <karsten.loesing at gmx.net>
Date:   Wed Jan 6 14:20:40 2016 +0100

    Use a temporary table to speed up merging client stats.
---
 modules/clients/init-userstats.sql |   59 +++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/modules/clients/init-userstats.sql b/modules/clients/init-userstats.sql
index 00b7938..ea6edec 100644
--- a/modules/clients/init-userstats.sql
+++ b/modules/clients/init-userstats.sql
@@ -184,21 +184,28 @@ DECLARE
 BEGIN
   RAISE NOTICE '% Starting to merge.', timeofday();
 
-  -- TODO Maybe we'll have to materialize a merged_part table that only
-  -- contains dates IN (SELECT DISTINCT DATE(stats_start) FROM imported)
-  -- and use that in the query below.
+  RAISE NOTICE '% Creating temporary merged_part table.', timeofday();
+
+  -- Materialize a temporary merged_part table that only contains dates of
+  -- newly imported rows for the query below.
+  CREATE TEMPORARY TABLE merged_part AS
+    SELECT * FROM merged
+    WHERE DATE(merged.stats_start) IN (
+      SELECT DISTINCT DATE(stats_start) FROM imported);
+
+  RAISE NOTICE '% Joining imported and merged_part tables.', timeofday();
 
   -- Loop over results from a query that joins new entries in the imported
-  -- table with existing entries in the merged table.
+  -- table with existing entries in the merged_part table.
   FOR cur IN SELECT DISTINCT
 
     -- Select id, interval start and end, and value of the existing entry
-    -- in merged; all these fields may be null if the imported entry is
-    -- not adjacent to an existing one.
-    merged.id AS merged_id,
-    merged.stats_start AS merged_start,
-    merged.stats_end AS merged_end,
-    merged.val AS merged_val,
+    -- in merged_part; all these fields may be null if the imported entry
+    -- is not adjacent to an existing one.
+    merged_part.id AS merged_id,
+    merged_part.stats_start AS merged_start,
+    merged_part.stats_end AS merged_end,
+    merged_part.val AS merged_val,
 
     -- Select interval start and end and value of the newly imported
     -- entry.
@@ -216,33 +223,35 @@ BEGIN
     imported.version AS version
 
     -- Select these columns from all entries in the imported table, plus
-    -- do an outer join on the merged table to find adjacent entries that
-    -- we might want to merge the new entries with.  It's possible that we
-    -- handle the same imported entry twice, if it starts directly after
-    -- one existing entry and ends directly before another existing entry.
-    FROM imported LEFT JOIN merged
+    -- do an outer join on the merged_part table to find adjacent entries
+    -- that we might want to merge the new entries with.  It's possible
+    -- that we handle the same imported entry twice, if it starts directly
+    -- after one existing entry and ends directly before another existing
+    -- entry.
+    FROM imported LEFT JOIN merged_part
 
     -- First two join conditions are to find adjacent intervals.  In fact,
     -- we also include overlapping intervals here, so that we can skip the
     -- overlapping entry in the imported table.
-    ON imported.stats_end >= merged.stats_start AND
-       imported.stats_start <= merged.stats_end AND
+    ON imported.stats_end >= merged_part.stats_start AND
+       imported.stats_start <= merged_part.stats_end AND
 
        -- Further join conditions are same date, fingerprint, node, etc.,
        -- so that we don't merge entries that don't belong together.
-       DATE(imported.stats_start) = DATE(merged.stats_start) AND
-       imported.fingerprint = merged.fingerprint AND
-       imported.node = merged.node AND
-       imported.metric = merged.metric AND
-       imported.country = merged.country AND
-       imported.transport = merged.transport AND
-       imported.version = merged.version
+       DATE(imported.stats_start) = DATE(merged_part.stats_start) AND
+       imported.fingerprint = merged_part.fingerprint AND
+       imported.node = merged_part.node AND
+       imported.metric = merged_part.metric AND
+       imported.country = merged_part.country AND
+       imported.transport = merged_part.transport AND
+       imported.version = merged_part.version
 
     -- Ordering is key, or our approach to merge subsequent entries is
     -- going to break.
     ORDER BY imported.fingerprint, imported.node, imported.metric,
              imported.country, imported.transport, imported.version,
-             imported.stats_start, merged.stats_start, imported.stats_end
+             imported.stats_start, merged_part.stats_start,
+             imported.stats_end
 
   -- Now go through the results one by one.
   LOOP



More information about the tor-commits mailing list