[or-cvs] r18290: {torflow} Kill fetches that drop below 1kbyte/sec. Also fix an issue w (torflow/trunk/NetworkScanners)

mikeperry at seul.org
Wed Jan 28 09:29:38 UTC 2009


Author: mikeperry
Date: 2009-01-28 04:29:34 -0500 (Wed, 28 Jan 2009)
New Revision: 18290

Modified:
   torflow/trunk/NetworkScanners/libsoat.py
   torflow/trunk/NetworkScanners/soat.py
Log:

Kill fetches that drop below 1kbyte/sec. Also fix an
issue with tag culling. Other misc cleanups.
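
For reference, the new rate-check logic boils down to the read loop below.
This is a minimal standalone sketch of the idea rather than the exact
soat.py code: the function name and the response object are illustrative,
while min_rate, the 5-second grace period, and the 450-byte read size
mirror the values in the diff.

  import time

  min_rate = 1024      # bytes/sec, same constant the diff introduces
  grace_period = 5     # seconds before the rate check kicks in
  chunk_size = 450     # cells are 500 bytes, so read in roughly cell-sized chunks

  def rate_limited_read(response):
      """Read a urllib-style response object, aborting if the average
      transfer rate drops below min_rate. Returns "" on abort."""
      start = time.time()
      data = ""
      while True:
          chunk = response.read(chunk_size)
          elapsed = time.time() - start
          # Give the connection a grace period before enforcing the
          # rate, so slow setup alone does not kill the fetch.
          if elapsed > grace_period and len(data) / elapsed < min_rate:
              return ""   # same empty-string convention http_request uses for failures
          if not chunk:
              break
          data += chunk
      return data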




Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py	2009-01-28 09:27:52 UTC (rev 18289)
+++ torflow/trunk/NetworkScanners/libsoat.py	2009-01-28 09:29:34 UTC (rev 18290)
@@ -22,7 +22,7 @@
 ssl_certs_dir = data_dir + 'ssl/certs/'
 
 http_data_dir = data_dir + 'http/'
-http_content_dir = data_dir + 'http/content'
+http_content_dir = data_dir + 'http/content/'
 http_failed_dir = data_dir + 'http/failed/'
 http_inconclusive_dir = data_dir + 'http/inconclusive/'
 

Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py	2009-01-28 09:27:52 UTC (rev 18289)
+++ torflow/trunk/NetworkScanners/soat.py	2009-01-28 09:29:34 UTC (rev 18290)
@@ -58,7 +58,7 @@
 from OpenSSL import *
 
 sys.path.append("./libs/")
-from BeautifulSoup.BeautifulSoup import BeautifulSoup, SoupStrainer
+from BeautifulSoup.BeautifulSoup import BeautifulSoup, SoupStrainer, Tag
 from SocksiPy import socks
 import Pyssh.pyssh
 
@@ -73,8 +73,13 @@
 
 # Avoid vmware images+isos plz. Nobody could possibly have the patience
 # to download anything much larger than 30MB over Tor anyways ;)
-max_content_size = 30*1024*1024 
+# XXX: 30MB?? Who the hell am I kidding. For testing this needs to be like 1MB
+max_content_size = 1024*1024 # 30*1024*1024
 
+# Kill fetches if they drop below 1kbyte/sec
+min_rate=1024
+
+
 firefox_headers = {
   'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0',
   'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
@@ -150,29 +155,32 @@
   '111'         # multicast & experimental 224.0.0.0/3
 ]
 
-# tags and attributes to check in the http test: XXX these should be reviewed
-# See also: http://ha.ckers.org/xss.html
+# Tags and attributes to check in the http test.
+# The general idea is to grab tags with attributes known
+# to either hold script, or cause automatic network activity
+# See: http://www.w3.org/TR/REC-html40/index/attributes.html
+# http://www.w3.org/TR/REC-html40/index/elements.html  
+# and http://ha.ckers.org/xss.html
 # Note: the more we add, the greater the potential for false positives...  
 # We also only care about the ones that work for FF2/FF3. 
-tags_to_check = ['a', 'area', 'base', 'applet', 'embed', 'form', 'frame',
-                 'input', 'iframe', 'img', 'link', 'object', 'script', 'meta', 
-                 'body', 'style']
 
+tags_to_check = ['a', 'applet', 'area', 'base', 'body', 'embed', 'form',
+                 'frame', 'iframe', 'img', 'input', 'link', 'meta', 
+                 'object', 'script', 'style']
 tags_preserve_inner = ['script','style'] 
+attrs_to_check =  ['background', 'cite', 'classid', 'codebase', 'data', 
+                   'longdesc', 'onblur', 
+                   'onchange', 'onclick', 'ondblclick', 'onfocus', 'onkeydown', 
+                   'onkeypress', 'onkeyup','onload', 'onmousedown', 'onmousemove', 
+                   'onmouseout', 'onmouseover','onmouseup', 'onreset', 'onselect', 
+                   'onsubmit', 'onunload', 'profile', 'src', 'usemap']
 
-attrs_to_check = ['onblur', 'onchange', 'onclick', 'ondblclick', 'onfocus', 
-                  'onkeydown', 'onkeypress', 'onkeyup', 'onload','onmousedown', 
-                  'onmouseup', 'onmouseover', 'onmousemove', 'onmouseout', 
-                  'onreset', 'onselect', 'onsubmit', 'onunload', 'profile', 
-                  'src', 'usemap', 'background', 'data', 'classid',
-                  'codebase', 'profile']
 
-tags_to_recurse = ['applet', 'embed', 'object', 'script', 'frame', 'iframe', 
-                   'img', 'link', 'a']
-
+tags_to_recurse = ['a', 'applet', 'embed', 'frame', 'iframe', #'img',
+                   'link', 'object', 'script'] 
 recurse_html = ['frame', 'iframe']
-attrs_to_recurse = ['src', 'pluginurl', 'data', 'classid', 'codebase', 'href',
-                    'background']
+attrs_to_recurse = ['background', 'classid', 'codebase', 'data', 'href',
+                    'pluginurl', 'src']
 
 #
 # constants
@@ -206,8 +214,8 @@
     plog('WARN', 'The http-request address ' + address + ' is malformed')
     traceback.print_exc()
     return ""
-  except (IndexError, TypeError):
-    plog('WARN', 'An error occurred while negotiating socks5 with Tor')
+  except (IndexError, TypeError, socks.Socks5Error), e:
+    plog('WARN', 'An error occurred while negotiating socks5 with Tor: '+str(e))
     traceback.print_exc()
     return ""
   except KeyboardInterrupt:
@@ -369,6 +377,7 @@
 
   def run_test(self):
     # A single test should have a single cookie jar
+    # XXX: Compare these elements at the end of the test
     self.tor_cookie_jar = cookielib.LWPCookieJar()
     self.cookie_jar = cookielib.LWPCookieJar()
     self.headers = copy.copy(firefox_headers)
@@ -597,6 +606,7 @@
  
   def run_test(self):
     # A single test should have a single cookie jar
+    # XXX: Compare these elements at the end of the test
     self.tor_cookie_jar = cookielib.LWPCookieJar()
     self.cookie_jar = cookielib.LWPCookieJar()
     self.headers = copy.copy(firefox_headers)
@@ -622,7 +632,7 @@
   def get_targets(self):
     return self.get_search_urls('http', self.fetch_targets) 
 
-  def add_recursive_targets(self, soup, orig_addr):
+  def _add_recursive_targets(self, soup, orig_addr):
     # XXX: Watch for spider-traps! (ie mutually sourcing iframes)
     # Only pull at most one filetype from the list of 'a' links
     targets = []
@@ -652,17 +662,26 @@
     for i in sets.Set(targets):
       self.fetch_queue.put_nowait(i)
 
-  def recursive_strain(self, soup):
+  def _tag_not_worthy(self, tag):
+    if str(tag.name) in tags_to_check:
+      return False
+    for attr in tag.attrs:
+      if attr[0] in attrs_to_check:
+        return False
+    return True
+ 
+  def _recursive_strain(self, soup):
     """ Remove all tags that are of no interest. Also remove content """
     to_extract = []
     for tag in soup.findAll():
-      if not tag.name in tags_to_check or not tag.attr in attrs_to_check:
+      if self._tag_not_worthy(tag):
         to_extract.append(tag)
       if tag.name not in tags_preserve_inner:
         for child in tag.childGenerator():
-          to_extract.append(child)
+          if not isinstance(child, Tag) or self._tag_not_worthy(child):
+            to_extract.append(child)
     for tag in to_extract:
-      tag.extract()    
+      tag.extract()
     return soup      
  
   def check_html(self, address):
@@ -701,13 +720,13 @@
     elements = SoupStrainer(lambda name, attrs: name in tags_to_check or 
         len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_check))) > 0)
     pcontent = pcontent.decode('ascii', 'ignore')
-    psoup = self.recursive_strain(BeautifulSoup(pcontent, parseOnlyThese=elements))
+    psoup = self._recursive_strain(BeautifulSoup(pcontent, parseOnlyThese=elements))
 
     # Also find recursive urls
     recurse_elements = SoupStrainer(lambda name, attrs: 
          name in tags_to_recurse and 
             len(Set(map(lambda a: a[0], attrs)).intersection(Set(attrs_to_recurse))) > 0)
-    self.add_recursive_targets(BeautifulSoup(pcontent, recurse_elements), 
+    self._add_recursive_targets(BeautifulSoup(pcontent, recurse_elements), 
                                address) 
 
     # load the original tag structure
@@ -721,7 +740,7 @@
     except IOError:
       content = http_request(address, self.cookie_jar, self.headers)
       content = content.decode('ascii','ignore')
-      soup = BeautifulSoup(content, parseOnlyThese=elements)
+      soup = self._recursive_strain(BeautifulSoup(content, parseOnlyThese=elements))
 
       tag_file = open(content_prefix+'.tags', 'w')
       # the space is needed in case we have some page with no matching tags at all
@@ -761,7 +780,7 @@
       self.datahandler.saveResult(result)
       return TEST_INCONCLUSIVE
 
-    soup_new = self.recursive_strain(BeautifulSoup(content_new,
+    soup_new = self._recursive_strain(BeautifulSoup(content_new,
                                      parseOnlyThese=elements))
     # compare the new and old content
     # if they match, means the node has been changing the content
@@ -1726,10 +1745,29 @@
   elif (response.__class__.__name__ == "addinfourl"):
     encoding = response.info().get("Content-Encoding")
 
+  start = time.time()
+  data = ""
+  while True:
+    data_read = response.read(450) # Cells are 500 bytes
+    # XXX: if this doesn't work, check stream observer for 
+    # lack of progress.. or for a sign we should read..
+
+    len_read = len(data)
+    now = time.time()
+
+    # Wait 5 seconds before counting data
+    if (now-start) > 5 and len_read/(now-start) < min_rate:
+      plog("WARN", "Minimum xfer rate not maintained. Aborting xfer")
+      return ""
+      
+    if not data_read:
+      break
+    data += data_read 
+ 
   if encoding == 'gzip' or encoding == 'x-gzip':
-    return gzip.GzipFile('', 'rb', 9, StringIO.StringIO(response.read())).read()
+    return gzip.GzipFile('', 'rb', 9, StringIO.StringIO(data)).read()
   elif encoding == 'deflate':
-    return StringIO.StringIO(zlib.decompress(response.read())).read()
+    return StringIO.StringIO(zlib.decompress(data)).read()
   else:
     return response.read()
 
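
The tag-culling fix is easier to see outside the diff: a tag is now kept if
either its name is in tags_to_check or it carries an attribute from
attrs_to_check, and only uninteresting children (text nodes, or tags that
fail the same test) get stripped. Previously the per-tag test read tag.attr,
which is not how BeautifulSoup exposes attributes, and every child of a
non-preserved tag was queued for extraction, interesting or not. The sketch
below shows the same idea with abbreviated tag/attribute lists; it is
written against the modern bs4 package purely so it runs as-is (soat.py
bundles the older BeautifulSoup 3, whose attrs is a list of (name, value)
pairs rather than a dict).

  from bs4 import BeautifulSoup, Tag   # bs4 stands in for the bundled BeautifulSoup 3

  tags_to_check = ['a', 'img', 'script', 'style']        # abbreviated lists
  attrs_to_check = ['src', 'href', 'onclick', 'onload']
  tags_preserve_inner = ['script', 'style']

  def tag_not_worthy(tag):
      # Keep a tag if its name is interesting or it carries an
      # interesting attribute; cull it otherwise.
      if tag.name in tags_to_check:
          return False
      return not any(a in attrs_to_check for a in tag.attrs)

  def recursive_strain(soup):
      to_extract = []
      for tag in soup.find_all():
          if tag_not_worthy(tag):
              to_extract.append(tag)
          if tag.name not in tags_preserve_inner:
              # Strip text nodes and uninteresting child tags, but keep
              # interesting nested tags (the part the commit fixes).
              for child in list(tag.children):
                  if not isinstance(child, Tag) or tag_not_worthy(child):
                      to_extract.append(child)
      for tag in to_extract:
          tag.extract()
      return soup

  # The nested <img> survives; the text nodes and the <b> do not:
  html = '<a href="/x">text<img src="/y.png"></a><b>bold</b>'
  print(recursive_strain(BeautifulSoup(html, 'html.parser')))
  # -> <a href="/x"><img src="/y.png"/></a>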


