[tor-commits] [sbws/master] Use juga's code to cleanup result files too

pastly at torproject.org pastly at torproject.org
Thu Aug 9 14:21:19 UTC 2018


commit 3a1789bddf7aecd74a9ed784f38ebb1a9f45f2de
Author: Matt Traudt <sirmatt at ksu.edu>
Date:   Wed Aug 1 19:35:53 2018 -0400

    Use juga's code to cleanup result files too
---
 sbws/core/cleanup.py | 132 +++++++++++++++++----------------------------------
 1 file changed, 43 insertions(+), 89 deletions(-)

diff --git a/sbws/core/cleanup.py b/sbws/core/cleanup.py
index ad5b286..7dc7e78 100644
--- a/sbws/core/cleanup.py
+++ b/sbws/core/cleanup.py
@@ -7,7 +7,6 @@ from sbws.util.timestamp import unixts_to_dt_obj
 from argparse import ArgumentDefaultsHelpFormatter
 from datetime import datetime
 from datetime import timedelta
-import re
 import os
 import gzip
 import shutil
@@ -64,38 +63,6 @@ def _get_files_mtime_older_than(dname, days_delta, extensions):
                 yield fname
 
 
-def _get_older_files_than(dname, num_days_ago, extensions):
-    assert os.path.isdir(dname)
-    assert isinstance(num_days_ago, int)
-    assert isinstance(extensions, list)
-    for ext in extensions:
-        assert isinstance(ext, str)
-        assert ext[0] == '.'
-    # Determine oldest allowed date
-    today = datetime.utcfromtimestamp(time.time())
-    oldest_day = today - timedelta(days=num_days_ago)
-    # Compile a regex that can extract a date from a file name that looks like
-    # /path/to/foo/YYYY-MM-DD*.extension
-    extensions = [re.escape(e) for e in extensions]
-    day_part = '[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]'
-    regex = re.compile(r'^.*/({}).*({})$'
-                       .format(day_part, '|'.join(extensions)))
-    # Walk through all files in the given dname, find files that match the
-    # regex, and yield the ones that contain a date in the file name that is
-    # too old.
-    for root, dirs, files in os.walk(dname):
-        for f in files:
-            fname = os.path.join(root, f)
-            match = regex.match(fname)
-            if not match:
-                log.debug('Ignoring %s because it doesn\'t look like '
-                          'YYYY-MM-DD', fname)
-                continue
-            d = datetime(*[int(n) for n in match.group(1).split('-')])
-            if d < oldest_day:
-                yield fname
-
-
 def _delete_files(dname, files, dry_run=True):
     """Delete the files passed as argument."""
     assert os.path.isdir(dname)
@@ -108,20 +75,6 @@ def _delete_files(dname, files, dry_run=True):
                 os.remove(fname)
 
 
-def _remove_rotten_files(datadir, rotten_days, dry_run=True):
-    assert os.path.isdir(datadir)
-    assert isinstance(rotten_days, int)
-    # Hold the lock for basically the entire time just in case someone else
-    # moves files between when we get the list of files and when we try to
-    # delete them.
-    with DirectoryLock(datadir):
-        for fname in _get_older_files_than(datadir, rotten_days,
-                                           ['.txt', '.txt.gz']):
-            log.info('Deleting %s', fname)
-            if not dry_run:
-                os.remove(fname)
-
-
 def _compress_files(dname, files, dry_run=True):
     """Compress the files passed as argument."""
     assert os.path.isdir(dname)
@@ -139,24 +92,6 @@ def _compress_files(dname, files, dry_run=True):
             os.remove(fname)
 
 
-def _compress_stale_files(datadir, stale_days, dry_run=True):
-    assert os.path.isdir(datadir)
-    assert isinstance(stale_days, int)
-    # Hold the lock for basically the entire time just in case someone else
-    # moves files between when we get the list of files and when we try to
-    # compress them.
-    with DirectoryLock(datadir):
-        for fname in _get_older_files_than(datadir, stale_days, ['.txt']):
-            log.info('Compressing %s', fname)
-            if dry_run:
-                continue
-            with open(fname, 'rt') as in_fd:
-                out_fname = fname + '.gz'
-                with gzip.open(out_fname, 'wt') as out_fd:
-                    shutil.copyfileobj(in_fd, out_fd)
-            os.remove(fname)
-
-
 def _check_validity_periods_v3bw(compress_after_days, delete_after_days):
     if 1 <= compress_after_days and compress_after_days < delete_after_days:
         return True
@@ -164,6 +99,26 @@ def _check_validity_periods_v3bw(compress_after_days, delete_after_days):
               "after a bigger number of days.")
 
 
+def _check_validity_periods_results(
+        data_period, compress_after_days, delete_after_days):
+    if compress_after_days - 2 < data_period:
+        fail_hard(
+            'For safety, cleanup/stale_days (%d) must be at least 2 days '
+            'larger than general/data_period (%d)', compress_after_days,
+            data_period)
+    if delete_after_days < compress_after_days:
+        fail_hard(
+            'cleanup/rotten_days (%d) must be the same or larger than '
+            'cleanup/stale_days (%d)', delete_after_days, compress_after_days)
+    if compress_after_days / 2 < data_period:
+        log.warning(
+            'cleanup/stale_days (%d) is less than twice '
+            'general/data_period (%d). For ease of parsing older results '
+            'if necessary, it is recommended to make stale_days at least '
+            'twice the data_period.', compress_after_days, data_period)
+    return True
+
+
 def _clean_v3bw_files(args, conf):
     v3bw_dname = conf['paths']['v3bw_dname']
     if not os.path.isdir(v3bw_dname):
@@ -186,6 +141,28 @@ def _clean_v3bw_files(args, conf):
     _compress_files(v3bw_dname, files_to_compress, dry_run=args.dry_run)
 
 
+def _clean_result_files(args, conf):
+    datadir = conf['paths']['datadir']
+    if not os.path.isdir(datadir):
+        fail_hard('%s does not exist', datadir)
+    data_period = conf.getint('general', 'data_period')
+    compress_after_days = conf.getint('cleanup', 'stale_days')
+    delete_after_days = conf.getint('cleanup', 'rotten_days')
+    _check_validity_periods_results(
+        data_period, compress_after_days, delete_after_days)
+
+    # first delete so that the files to be deleted are not compressed first
+    files_to_delete = _get_files_mtime_older_than(
+        datadir, delete_after_days, ['.txt', '.gz'])
+    _delete_files(datadir, files_to_delete, dry_run=args.dry_run)
+
+    # when dry_run is true, compress will also show all the files that
+    # would have been deleted, since they are not really deleted
+    files_to_compress = _get_files_mtime_older_than(
+        datadir, compress_after_days, ['.txt'])
+    _compress_files(datadir, files_to_compress, dry_run=args.dry_run)
+
+
 def main(args, conf):
     '''
     Main entry point in to the cleanup command.
@@ -200,30 +177,7 @@ def main(args, conf):
         fail_hard('Nothing to clean.')
 
     if not args.no_results:
-        datadir = conf['paths']['datadir']
-        if not os.path.isdir(datadir):
-            fail_hard('%s does not exist', datadir)
-
-        fresh_days = conf.getint('general', 'data_period')
-        stale_days = conf.getint('cleanup', 'stale_days')
-        rotten_days = conf.getint('cleanup', 'rotten_days')
-        if stale_days - 2 < fresh_days:
-            fail_hard('For safetly, cleanup/stale_days (%d) must be at least '
-                      '2 days larger than general/data_period (%d)',
-                      stale_days, fresh_days)
-        if rotten_days < stale_days:
-            fail_hard('cleanup/rotten_days (%d) must be the same or larger '
-                      'than cleanup/stale_days (%d)', rotten_days, stale_days)
-
-        if stale_days / 2 < fresh_days:
-            log.warning(
-                'cleanup/stale_days (%d) is less than twice '
-                'general/data_period (%d). For ease of parsing older results '
-                'if necessary, it is recommended to make stale_days at least '
-                'twice the data_period.', stale_days, fresh_days)
-
-        _remove_rotten_files(datadir, rotten_days, dry_run=args.dry_run)
-        _compress_stale_files(datadir, stale_days, dry_run=args.dry_run)
+        _clean_result_files(args, conf)
 
     if not args.no_v3bw:
         _clean_v3bw_files(args, conf)





More information about the tor-commits mailing list