[tor-commits] [sbws/master] chg: generate, cleanup: Use 28 days of measurements

juga at torproject.org juga at torproject.org
Fri Feb 19 17:53:52 UTC 2021


commit e416547c73d166bd085a10843c6ba4d1a4167092
Author: juga0 <juga at riseup.net>
Date:   Sat Jan 23 09:54:19 2021 +0000

    chg: generate, cleanup: Use 28 days of measurements
    
    When generating the Bandwidth File as Torflow, use 28 days of past raw
    measurements instead of 5, by default.
    Also keep the raw measurements for that long before compressing or
    deleting them.
    Also stop validating the compression and deletion periods, because
    that check did not take the defaults into account and was based on
    arbitrary values.
    
    Closes: #40017
---
 docs/source/how_works.rst    |  3 ++-
 docs/source/man_sbws.ini.rst |  4 ++--
 sbws/config.default.ini      | 11 ++++++++---
 sbws/core/cleanup.py         | 25 -------------------------
 sbws/core/generate.py        |  9 ++++++---
 sbws/globals.py              | 12 +++++++++---
 6 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/docs/source/how_works.rst b/docs/source/how_works.rst
index f7d7533..24f8689 100644
--- a/docs/source/how_works.rst
+++ b/docs/source/how_works.rst
@@ -137,7 +137,8 @@ Each relay bandwidth measurements are selected in the following way:
    If they are not, the relay MUST NOT be included in the Bandwith File.
 #. The measurements than are are older than an arbitrary number of senconds
    in the past MUST be discarded.
-   Currently this number is the same as ``data_period`` (5 days).
+   Currently this number is the same as ``data_period`` (5 days) when not
+   scaling as Torflow and 28 days when scaling as Torflow.
 
 If the number of relays to include in the Bandwidth File are less than
 a percententage (currently 60%) than the number of relays in the consensus,
diff --git a/docs/source/man_sbws.ini.rst b/docs/source/man_sbws.ini.rst
index e2127ab..31effc7 100644
--- a/docs/source/man_sbws.ini.rst
+++ b/docs/source/man_sbws.ini.rst
@@ -143,9 +143,9 @@ relayprioritizer
 
 cleanup
   data_files_compress_after_days = INT
-    After this many days, compress data files. (Default: 10)
+    After this many days, compress data files. (Default: 29)
   data_files_delete_after_days = INT
-    After this many days, delete data files. (Default: 90)
+    After this many days, delete data files. (Default: 57)
   v3bw_files_compress_after_days = INT
     After this many days, compress v3bw files. (Default: 1)
   v3bw_files_delete_after_days = INT
diff --git a/sbws/config.default.ini b/sbws/config.default.ini
index 0105527..c47e4fd 100644
--- a/sbws/config.default.ini
+++ b/sbws/config.default.ini
@@ -71,9 +71,14 @@ extra_lines =
 
 [cleanup]
 # After this many days, compress data files
-data_files_compress_after_days = 10
-# After this many days, delete data files
-data_files_delete_after_days = 90
+# #40017: To generate files as Torflow, the result files must be kept for
+# GENERATE_PERIOD seconds (28 days, currently defined in globals.py).
+# The number of days after which they are compressed or deleted could
+# also become defaults there, computed as a factor of GENERATE_PERIOD.
+data_files_compress_after_days = 29
+# After this many days, delete data files.
+# 57 == 28 * 2 + 1.
+data_files_delete_after_days = 57
 # After this many days, compress v3bw files (1d)
 v3bw_files_compress_after_days = 1
 # After this many days, delete v3bw files (7d)
diff --git a/sbws/core/cleanup.py b/sbws/core/cleanup.py
index aa16fba..1d21989 100644
--- a/sbws/core/cleanup.py
+++ b/sbws/core/cleanup.py
@@ -99,28 +99,6 @@ def _check_validity_periods_v3bw(compress_after_days, delete_after_days):
               "after a bigger number of days.")
 
 
-def _check_validity_periods_results(
-        data_period, compress_after_days, delete_after_days):
-    if compress_after_days - 2 < data_period:
-        fail_hard(
-            'For safetly, cleanup/data_files_compress_after_days (%d) must be '
-            'at least 2 days larger than general/data_period (%d)',
-            compress_after_days, data_period)
-    if delete_after_days < compress_after_days:
-        fail_hard(
-            'cleanup/data_files_delete_after_days (%d) must be the same or '
-            'larger than cleanup/data_files_compress_after_days (%d)',
-            delete_after_days, compress_after_days)
-    if compress_after_days / 2 < data_period:
-        log.warning(
-            'cleanup/data_files_compress_after_days (%d) is less than twice '
-            'general/data_period (%d). For ease of parsing older results '
-            'if necessary, it is recommended to make '
-            'data_files_compress_after_days at least twice the data_period.',
-            compress_after_days, data_period)
-    return True
-
-
 def _clean_v3bw_files(args, conf):
     v3bw_dname = conf.getpath('paths', 'v3bw_dname')
     if not os.path.isdir(v3bw_dname):
@@ -147,13 +125,10 @@ def _clean_result_files(args, conf):
     datadir = conf.getpath('paths', 'datadir')
     if not os.path.isdir(datadir):
         fail_hard('%s does not exist', datadir)
-    data_period = conf.getint('general', 'data_period')
     compress_after_days = conf.getint(
         'cleanup', 'data_files_compress_after_days')
     delete_after_days = conf.getint(
         'cleanup', 'data_files_delete_after_days')
-    _check_validity_periods_results(
-        data_period, compress_after_days, delete_after_days)
 
     # first delete so that the files to be deleted are not compressed first
     files_to_delete = _get_files_mtime_older_than(
diff --git a/sbws/core/generate.py b/sbws/core/generate.py
index 7283c25..8fa8ec4 100644
--- a/sbws/core/generate.py
+++ b/sbws/core/generate.py
@@ -2,7 +2,7 @@ from math import ceil
 
 from sbws.globals import (fail_hard, SBWS_SCALE_CONSTANT, TORFLOW_SCALING,
                           SBWS_SCALING, TORFLOW_BW_MARGIN, PROP276_ROUND_DIG,
-                          DAY_SECS, NUM_MIN_RESULTS)
+                          DAY_SECS, NUM_MIN_RESULTS, GENERATE_PERIOD)
 from sbws.lib.v3bwfile import V3BWFile
 from sbws.lib.resultdump import load_recent_results_in_datadir
 from argparse import ArgumentDefaultsHelpFormatter
@@ -60,8 +60,9 @@ def gen_parser(sub):
                    help="Number of most significant digits to round bw.")
     p.add_argument('-p', '--secs-recent', default=None, type=int,
                    help="How many secs in the past are results being "
-                        "still considered. Note this value will supersede "
-                        "data_period in the configuration.")
+                        "still considered. Default is {} secs. If not scaling "
+                        "as Torflow the default is data_period in the "
+                        "configuration.".format(GENERATE_PERIOD))
     p.add_argument('-a', '--secs-away', default=DAY_SECS, type=int,
                    help="How many secs results have to be away from each "
                         "other.")
@@ -90,6 +91,8 @@ def main(args, conf):
         scaling_method = TORFLOW_SCALING
     if args.secs_recent:
         fresh_days = ceil(args.secs_recent / 24 / 60 / 60)
+    elif scaling_method == TORFLOW_SCALING:
+        fresh_days = ceil(GENERATE_PERIOD / 24 / 60 / 60)
     else:
         fresh_days = conf.getint('general', 'data_period')
     reset_bw_ipv4_changes = conf.getboolean('general', 'reset_bw_ipv4_changes')
diff --git a/sbws/globals.py b/sbws/globals.py
index 253e2aa..2e4481c 100644
--- a/sbws/globals.py
+++ b/sbws/globals.py
@@ -101,16 +101,22 @@ MAX_BW_DIFF_PERC = 50
 # Tor already accept lines of any size, but leaving the limit anyway.
 BW_LINE_SIZE = 1022
 
-# RelayList, ResultDump, v3bwfile
+# RelayList, ResultDump
 # For how many seconds in the past the relays and measurements data is keep/
 # considered valid.
-# This is currently set by default in config.default.ini as ``date_period``,
-# and used in ResultDump and v3bwfile.
+# This is currently set by default in config.default.ini as ``data_period``,
+# and used in ResultDump.
 # In a future refactor, constants in config.default.ini should be moved here,
 # or calculated in settings, so that there's no need to pass the configuration
 # to all the functions.
 MEASUREMENTS_PERIOD = 5 * 24 * 60 * 60
 
+# #40017: To make sbws behave similarly to Torflow, the raw past measurements
+# used when generating the Bandwidth File have to span 28 days, not 5.
+# Note that this is different from the period of raw past measurements used
+# when measuring, which are used for the monitoring values and storing json.
+GENERATE_PERIOD = 28 * 24 * 60 * 60
+
 # Metadata to send in every requests, so that data servers can know which
 # scanners are using them.
 # In Requests these keys are case insensitive.





More information about the tor-commits mailing list