[or-cvs] [metrics-utils/master 1/3] Filtered out the dates without exit lists.

karsten at torproject.org karsten at torproject.org
Tue Jan 4 07:44:15 UTC 2011


Author: Kiyoto Tamura <owenestea at gmail.com>
Date: Mon, 27 Dec 2010 22:39:12 -0600
Subject: Filtered out the dates without exit lists.
Commit: 9c692faaa3f002e07df74a1d7888a5a041ddbc3c

---
 visitor/visitor.py |   19 +++++++++++--------
 1 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/visitor/visitor.py b/visitor/visitor.py
index b18e0ef..504971d 100644
--- a/visitor/visitor.py
+++ b/visitor/visitor.py
@@ -8,7 +8,7 @@
 import re
 import sys
 import os
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, date
 import bisect
 from time import strptime, mktime, gmtime # datetime.strptime does not exist for version < 2.5
 from cStringIO import StringIO
@@ -44,6 +44,8 @@ def get_exitlist(exitlist_filepath):
     exist address was recorded.
     """
     exitlist = {}
+    first_exit_date = date.today() + timedelta(1)
+    last_exit_date = date(1970, 1, 1) # Unix epoch. Should suffice
     for dirpath, _, filenames in os.walk(exitlist_filepath, topdown = False):
         for filename in filenames:
             fn = os.path.join(dirpath, filename)
@@ -53,6 +55,9 @@ def get_exitlist(exitlist_filepath):
                     if line.startswith('ExitAddress'):
                         _, ip, dt = line.split(' ', 2)
                         yr, mo, d, h, m, s, _, _, _ = strptime(dt.rstrip('\s\n'), '%Y-%m-%d %H:%M:%S')
+                        curr_date = date(yr, mo, d)
+                        last_exit_date = max(first_exit_date, curr_date)
+                        first_exit_date = min(first_exit_date, curr_date)
                         if not ip in exitlist:
                             exitlist[ip] = []
                         timestamp = datetime(yr, mo, d, h, m, s)
@@ -61,7 +66,7 @@ def get_exitlist(exitlist_filepath):
             except IOError:
                 print >> sys.stderr, 'could not open %s. Skipping it.'%fn
 
-    return exitlist
+    return exitlist, first_exit_date, last_exit_date
 
 def apache_time2datetime(time_str, timediff_str):
     """
@@ -107,7 +112,7 @@ def analyze(apache_log_path, exitlist_path, output = sys.stdout):
     The main script. It reads the exit list, and goes through the Apache access log line by line, and checks if
     if it is a Tor request. TODO: filter out the bots.
     """
-    exitlist = get_exitlist(exitlist_path)
+    exitlist, first_exit_date, last_exit_date = get_exitlist(exitlist_path)
 
     tor_stats = {}
     tor_ua = TOR_USERAGENTS
@@ -146,12 +151,10 @@ def analyze(apache_log_path, exitlist_path, output = sys.stdout):
     for tor_type, _ in tor_ua:
         col_list.append(tor_type)
     buffer.write(','.join(col_list) + '\n')
-    apache_dates = tor_stats.keys()
-    apache_dates.sort()
-    curr_apache_date = apache_dates[0]
-    last_apache_date = apache_dates[-1]
+    apache_dates = sorted(tor_stats.keys())
+    curr_apache_date = first_exit_date
 
-    while curr_apache_date <= last_apache_date:
+    while curr_apache_date <= last_exit_date:
         stats = tor_stats.get(curr_apache_date)
         if stats == None:
             stats = {'date': curr_apache_date}
-- 
1.7.1




More information about the tor-commits mailing list