[or-cvs] r18484: {torflow} Fix a bug with the HTML false positive filter where we weren't properly tracking all changed tags (torflow/trunk/NetworkScanners)

mikeperry at seul.org
Wed Feb 11 11:27:48 UTC 2009


Author: mikeperry
Date: 2009-02-11 06:27:48 -0500 (Wed, 11 Feb 2009)
New Revision: 18484

Modified:
   torflow/trunk/NetworkScanners/README.ExitScanning
   torflow/trunk/NetworkScanners/libsoat.py
   torflow/trunk/NetworkScanners/soat.py
   torflow/trunk/NetworkScanners/soatstats.py
Log:

Fix a bug with the HTML false positive filter where we
weren't properly tracking all changed tags. Also fix rewind
behavior to properly clear accumulated URL error information.



Modified: torflow/trunk/NetworkScanners/README.ExitScanning
===================================================================
--- torflow/trunk/NetworkScanners/README.ExitScanning	2009-02-11 08:04:46 UTC (rev 18483)
+++ torflow/trunk/NetworkScanners/README.ExitScanning	2009-02-11 11:27:48 UTC (rev 18484)
@@ -54,7 +54,7 @@
 The patch to fix this bug is present in ../tordiffs/XXX.
 
 It is also strongly recommended that you have a custom Tor instance that
-it devoted only to exit scanning, and is not performing any other
+is devoted only to exit scanning, and is not performing any other
 function (including serving as a relay or a directory authority).
 
 
@@ -91,6 +91,9 @@
 urls. This can be useful if you believe it likely for an adversary to
 target only certain keywords/concepts/sites in a particular context.
 
+You can edit the contents of the wordlist files while SoaT runs. It will
+pick up the changes after it completes a full network scan with the old 
+list.
 
 
 IV. Running Tor, The Metatroller, and SoaT
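
The runtime reload described above works because SoaT now stores only
the wordlist path and re-reads the file on each rewind (see the
SearchBasedTest.rewind change in soat.py below). A minimal sketch of
that pattern, assuming a plain one-entry-per-line wordlist format;
the names here are illustrative stand-ins, not the real loader:

    # Sketch of the reload-on-rewind pattern. load_wordlist and the
    # one-entry-per-line format are assumptions for illustration.
    class NoURLsFound(Exception):
        # Stand-in for the exception of the same name in the SoaT code.
        pass

    def load_wordlist(path):
        words = [line.strip() for line in open(path) if line.strip()]
        if not words:
            raise NoURLsFound("No entries found in " + path)
        return words

    class SearchBasedTestSketch:
        def __init__(self, wordlist_file):
            # Store only the path; the contents are (re)read on rewind,
            # so on-disk edits take effect at the next full scan pass.
            self.wordlist_file = wordlist_file

        def rewind(self):
            self.wordlist = load_wordlist(self.wordlist_file)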

Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py	2009-02-11 08:04:46 UTC (rev 18483)
+++ torflow/trunk/NetworkScanners/libsoat.py	2009-02-11 11:27:48 UTC (rev 18484)
@@ -10,6 +10,7 @@
 import traceback
 import difflib
 import re
+import copy
 sys.path.append("./libs")
 from BeautifulSoup.BeautifulSoup import Tag, SoupStrainer
 
@@ -317,6 +318,7 @@
   def __str__(self):
     ret = TestResult.__str__(self)
     if self.verbose:
+      soup = old_soup = tor_soup = None
       if self.content and self.content_old:
         content = open(self.content).read().decode('ascii', 'ignore')
         content_old = open(self.content_old).read().decode('ascii', 'ignore')
@@ -339,6 +341,30 @@
                                     lineterm="")
         for line in diff:
           ret+=line+"\n"
+
+      if soup and tor_soup and old_soup:
+        old_vs_new = SoupDiffer(old_soup, soup)
+        new_vs_old = SoupDiffer(soup, old_soup)
+        new_vs_tor = SoupDiffer(soup, tor_soup)
+
+        # I'm an evil man and I'm going to CPU hell..
+        changed_tags = SoupDiffer.merge_tag_maps(
+                            old_vs_new.changed_tags_with_attrs(),
+                            new_vs_old.changed_tags_with_attrs())
+
+        changed_attributes = SoupDiffer.merge_tag_maps(
+                                old_vs_new.changed_attributes_by_tag(),
+                                new_vs_old.changed_attributes_by_tag())
+
+      changed_content = bool(old_vs_new.changed_content() or new_vs_old.changed_content())
+
+        ret += "\nTor changed tags:\n"
+        ret += new_vs_tor.more_changed_tags(changed_tags)
+        ret += "\nTor changed attrs:\n"
+        ret += new_vs_tor.more_changed_attrs(changed_attributes)
+        if not changed_content:
+          ret += "\nChanged Content:\n"
+          ret += "\n".join(new_vs_tor.changed_content())+"\n"
     else:
       if self.content:
         ret += " "+self.content+"\n"
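
The block added above is the core of the false positive filter: two
direct (non-Tor) fetches establish which tags and attributes a page
changes on its own, and only Tor-side changes beyond that baseline
get reported. A standalone sketch of the idea, using flat dicts of
(tag, attr) -> value in place of real SoupDiffer objects; all names
here are illustrative:

    # Conceptual sketch of the three-way baseline filter. The real
    # code compares BeautifulSoup parse trees; flat dicts keep this
    # short and runnable.
    def changed_attrs(a, b):
        # (tag, attr) pairs whose value in b differs from a.
        return set(k for k in b if a.get(k) != b.get(k))

    def suspicious_changes(old, new, tor):
        # Differences between two direct fetches are the page's own
        # dynamic behavior: the expected baseline.
        baseline = changed_attrs(old, new) | changed_attrs(new, old)
        # Only Tor-side changes outside that baseline are suspicious.
        return changed_attrs(new, tor) - baseline

    old = {("img", "src"): "a.png", ("a", "href"): "/x"}
    new = {("img", "src"): "b.png", ("a", "href"): "/x"}
    tor = {("img", "src"): "c.png", ("a", "href"): "/x",
           ("script", "src"): "evil.js"}
    # Prints only the ('script', 'src') entry: the rotating img src
    # falls inside the baseline and is filtered out.
    print(suspicious_changes(old, new, tor))
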
@@ -595,6 +621,7 @@
           changed_tags[t.name].add(attr[0])
     return changed_tags
 
+
   def has_more_changed_tags(self, tag_attr_map):
     """ Returns true if we have additional tags with additional
         attributes that were not present in tag_attr_map 
@@ -609,6 +636,18 @@
               return True
     return False
 
+  def more_changed_tags(self, tag_attr_map):
+    ret = ""
+    for tags in map(TheChosenSoup, self.changed_tags()):
+      for t in tags.findAll():
+        if t.name not in tag_attr_map:
+          ret += " New Tag: "+str(t)+"\n"
+        else:
+          for attr in t.attrs:
+            if attr[0] not in tag_attr_map[t.name]:
+              ret += " New Attr "+attr[0]+": "+str(t)+"\n"
+    return ret
+
   def _get_attributes(self):
     attrs_old = [(tag.name, tag.attrs) for tag in self.soup_old.findAll()]
     attrs_new = [(tag.name, tag.attrs) for tag in self.soup_new.findAll()]
@@ -640,6 +679,17 @@
       changed_attributes[tag].add(attr[0])
     return changed_attributes 
 
+  def merge_tag_maps(tag_map1, tag_map2):
+    " Merges either two tag_attr_maps or two attrs_by_tag maps "
+    ret = copy.deepcopy(tag_map1)
+    for tag in tag_map2:
+      if tag not in ret:
+        ret[tag] = copy.deepcopy(tag_map2[tag])
+      else:
+        ret[tag].union_update(tag_map2[tag])
+    return ret
+  merge_tag_maps = Callable(merge_tag_maps)
+
   def has_more_changed_attrs(self, attrs_by_tag):
     """ Returns true if we have any tags with additional
         changed attributes that were not present in attrs_by_tag
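
merge_tag_maps above is the fix named in the log message. The old
code (still visible as the removed lines in the soat.py hunk further
down) combined the two baseline maps with dict.update(), which
replaces the whole attribute set of any tag present in both maps
instead of unioning them, so some changed tags and attributes were
silently dropped from the baseline. A minimal demonstration, using
builtin sets in place of sets.Set (set.update plays the role of
union_update here):

    # Why dict.update() loses changed tags: the "a" entry from the
    # first map is overwritten, dropping "href" from the baseline.
    a_map = {"img": set(["src"]), "a": set(["href"])}
    b_map = {"a": set(["rel"]), "script": set(["src"])}

    broken = dict(a_map)
    broken.update(b_map)
    print(broken["a"])                 # only "rel"; "href" is gone

    def merge_tag_maps(m1, m2):
        # Union the per-tag attribute sets instead of overwriting.
        merged = dict((tag, set(attrs)) for tag, attrs in m1.items())
        for tag, attrs in m2.items():
            merged.setdefault(tag, set()).update(attrs)
        return merged

    print(merge_tag_maps(a_map, b_map)["a"])   # "href" and "rel"
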
@@ -652,6 +702,17 @@
         return True
     return False
 
+  def more_changed_attrs(self, attrs_by_tag):
+    ret = ""
+    for (tag, attr) in self.changed_attributes():
+      if tag in attrs_by_tag:
+        if attr[0] not in attrs_by_tag[tag]:
+          ret += " New Attr "+attr[0]+": "+tag+" "+attr[0]+'="'+attr[1]+'"\n'
+      else:
+        ret += " New Tag: "+tag+" "+attr[0]+'="'+attr[1]+'"\n'
+    return ret
+
+
   def changed_content(self):
     """ Return a list of tag contents changed in soup_new """
     tags_old = sets.Set(map(str, 

Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py	2009-02-11 08:04:46 UTC (rev 18483)
+++ torflow/trunk/NetworkScanners/soat.py	2009-02-11 11:27:48 UTC (rev 18484)
@@ -152,7 +152,6 @@
     traceback.print_exc()
     return (666, [], "", str(e))
 
-  # TODO: Consider also returning mime type here
   return (reply.code, new_cookies, mime_type, content)
 
 class Test:
@@ -163,13 +162,13 @@
     self.mt = mt
     self.datahandler = DataHandler()
     self.min_targets = min_targets
+    self.exit_limit_pct = max_exit_fail_pct
+    self.dynamic_limit = max_dynamic_failure
     self.marked_nodes = sets.Set([])
     self.exit_fails = {}
     self.successes = {}
-    self.exit_limit_pct = max_exit_fail_pct
     self.results = []
     self.dynamic_fails = {}
-    self.dynamic_limit = max_dynamic_failure
     self.banned_targets = sets.Set([])
 
   def run_test(self): 
@@ -239,7 +238,13 @@
     self.tests_run = 0
     self.nodes_marked = 0
     self.marked_nodes = sets.Set([])
+    self.exit_fails = {}
+    self.successes = {}
+    self.dynamic_fails = {}
+    # TODO: report these results as BadExit before clearing
+    self.results = []
 
+
   def register_exit_failure(self, address, exit_node):
     if address in self.exit_fails:
       self.exit_fails[address].add(exit_node)
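
The added lines above are the second fix from the log message:
rewind() used to reset only the run counters, so exit_fails,
successes, and dynamic_fails kept accumulating URL error information
across scan passes. A condensed sketch of why that matters, assuming
a node is flagged when its failure percentage exceeds exit_limit_pct
(an assumption based on the max_exit_fail_pct parameter; the real
bookkeeping lives in the Test class):

    # Sketch: stale accumulators inflate the failure ratio on later
    # passes. The percentage check is an assumed simplification.
    class ScanPassSketch:
        def __init__(self, exit_limit_pct=10):
            self.exit_limit_pct = exit_limit_pct
            self.rewind()

        def rewind(self):
            # Without these resets, errors recorded against a URL in
            # pass N still count against every node in pass N+1.
            self.exit_fails = {}     # address -> set of failing exits
            self.successes = {}      # address -> success count
            self.dynamic_fails = {}

        def over_limit(self, address):
            fails = len(self.exit_fails.get(address, set()))
            total = fails + self.successes.get(address, 0)
            return total > 0 and 100.0 * fails / total > self.exit_limit_pct
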
@@ -277,10 +282,14 @@
 
 
 class SearchBasedTest(Test):
-  def __init__(self, mt, proto, port, wordlist):
-    self.wordlist = wordlist
+  def __init__(self, mt, proto, port, wordlist_file):
+    self.wordlist_file = wordlist_file
     Test.__init__(self, mt, proto, port)
 
+  def rewind(self):
+    self.wordlist = load_wordlist(self.wordlist_file)
+    Test.rewind(self)
+
   def _is_useable_url(self, url, valid_schemes=None, filetypes=None):
     (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
     if netloc.rfind(":") != -1:
@@ -399,10 +408,14 @@
     self.httpcode_limit_pct = max_exit_httpcode_pct
     self.scan_filetypes = filetypes
 
+  def rewind(self):
+    SearchBasedTest.rewind(self)
+    self.httpcode_fails = {}
+
   def check_cookies(self):
     tor_cookies = "\n"
     plain_cookies = "\n"
-    # XXX: do we need to sort these?
+    # XXX: do we need to sort these? So far we have worse problems..
     for cookie in self.tor_cookie_jar:
       tor_cookies += "\t"+cookie.name+":"+cookie.domain+cookie.path+" discard="+str(cookie.discard)+"\n"
     for cookie in self.cookie_jar:
@@ -415,7 +428,9 @@
                             tor_cookies)
       self.results.append(result)
       self.datahandler.saveResult(result)
-      return TEST_FAILURE
+      # XXX: this test is pretty spammy with false positives.. 
+      # It should not affect if a node "passes" or not yet.
+      #return TEST_FAILURE
 
     return TEST_SUCCESS
 
@@ -811,8 +826,10 @@
               if t.name == "link":
                 for a in t.attrs:
                   if a[0] == "type" and a[1] in script_mime_types:
+                    plog("INFO", "Adding link script for: "+str(t))
                     targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
               else:
+                plog("INFO", "Adding script tag for: "+str(t))
                 targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
             elif t.name == 'a':
               if attr_name == "href":
@@ -845,7 +862,7 @@
     (mime_type, tor_js, tsha, orig_js, osha, new_js, nsha, exit_node) = http_ret
 
     if mime_type not in script_mime_types:
-      plog("WARN", "Non-script mime type "+mime_type+" fed to JS test")
+      plog("WARN", "Non-script mime type "+mime_type+" fed to JS test for "+address)
       if mime_type in html_mime_types:
         return self._check_html_worker(address, http_ret)
       else:
@@ -897,7 +914,7 @@
 
     if mime_type not in html_mime_types:
       # XXX: Keep an eye on this logline.
-      plog("INFO", "Non-html mime type "+mime_type+" fed to HTML test")
+      plog("INFO", "Non-html mime type "+mime_type+" fed to HTML test for "+address)
       if mime_type in script_mime_types:
         return self._check_js_worker(address, http_ret)
       else:
@@ -968,11 +985,13 @@
     new_vs_tor = SoupDiffer(new_soup, tor_soup)
 
     # I'm an evil man and I'm going to CPU hell..
-    changed_tags = old_vs_new.changed_tags_with_attrs()
-    changed_tags.update(new_vs_old.changed_tags_with_attrs())
+    changed_tags = SoupDiffer.merge_tag_maps(
+                        old_vs_new.changed_tags_with_attrs(),
+                        new_vs_old.changed_tags_with_attrs())
 
-    changed_attributes = old_vs_new.changed_attributes_by_tag()
-    changed_attributes.update(new_vs_old.changed_attributes_by_tag())
+    changed_attributes = SoupDiffer.merge_tag_maps(
+                            old_vs_new.changed_attributes_by_tag(),
+                            new_vs_old.changed_attributes_by_tag())
 
     changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
  
@@ -1044,7 +1063,7 @@
     c.set_connect_state()
   
     try:
-      c.connect((address, 443)) # XXX: Verify TorDNS here too..
+      c.connect((address, 443)) # DNS OK.
       c.send(crypto.dump_certificate_request(crypto.FILETYPE_PEM,request))
     except socket.error, e:
       plog('WARN','An error occured while opening an ssl connection to '+address+": "+str(e))
@@ -1177,6 +1196,8 @@
     if ssl_domain.seen_cert(cert_pem):
       result = SSLTestResult(exit_node, address, ssl_file_name, TEST_SUCCESS)
       #self.datahandler.saveResult(result)
+      if address in self.successes: self.successes[address]+=1
+      else: self.successes[address]=1
       return TEST_SUCCESS
 
     # False positive case.. Can't help it if the cert rotates AND we have a
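
Counting the common seen-cert path as a success matters for the same
failure ratio: without it, the few genuine failures for an address
get divided by a near-zero total. A toy illustration, under the same
assumed percentage arithmetic as the sketch above:

    # Assumed fail-percentage arithmetic, for illustration only.
    fails, successes = 2, 0    # seen-cert successes not counted
    print(100.0 * fails / (fails + successes))   # 100.0 -> flagged
    fails, successes = 2, 98   # after this hunk, they are counted
    print(100.0 * fails / (fails + successes))   # 2.0 -> passes
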
@@ -2145,46 +2166,28 @@
   tests = {}
 
   if do_ssl:
-    try:
-      tests["SSL"] = SSLTest(mt, load_wordlist(ssl_wordlist_file))
-    except NoURLsFound, e:
-      plog('ERROR', e.message)
+    tests["SSL"] = SSLTest(mt, ssl_wordlist_file)
 
   if do_http:
-    try:
-      tests["HTTP"] = HTTPTest(mt, load_wordlist(filetype_wordlist_file))
-    except NoURLsFound, e:
-      plog('ERROR', e.message)
+    tests["HTTP"] = HTTPTest(mt, filetype_wordlist_file)
 
   if do_html:
-    try:
-      tests["HTML"] = HTMLTest(mt, load_wordlist(html_wordlist_file))
-    except NoURLsFound, e:
-      plog('ERROR', e.message)
+    tests["HTML"] = HTMLTest(mt, html_wordlist_file)
 
   if do_smtp:
-    try:
-      tests["SMTPS"] = SMTPSTest(mt)
-    except NoURLsFound, e:
-      plog('ERROR', e.message)
+    tests["SMTPS"] = SMTPSTest(mt)
     
   if do_pop:
-    try:
-      tests["POPS"] = POP3STest(mt) 
-    except NoURLsFound, e:
-      plog('ERROR', e.message)
+    tests["POPS"] = POP3STest(mt) 
 
   if do_imap:
-    try:
-      tests["IMAPS"] = IMAPSTest(mt)
-    except NoURLsFound, e:
-      plog('ERROR', e.message)
+    tests["IMAPS"] = IMAPSTest(mt)
 
   # maybe no tests could be initialized
   if not (do_ssl or do_html or do_http or do_ssh or do_smtp or do_pop or do_imap):
     plog('INFO', 'Done.')
     sys.exit(0)
-  
+
   for test in tests.itervalues():
     test.rewind()
  
@@ -2250,7 +2253,7 @@
     for test in tests.itervalues():
       if test.finished():
         plog("NOTICE", test.proto+" test has finished all nodes.  Rewinding")
-        test.rewind() 
+        test.rewind()
     
 
 # initiate the program

Modified: torflow/trunk/NetworkScanners/soatstats.py
===================================================================
--- torflow/trunk/NetworkScanners/soatstats.py	2009-02-11 08:04:46 UTC (rev 18483)
+++ torflow/trunk/NetworkScanners/soatstats.py	2009-02-11 11:27:48 UTC (rev 18484)
@@ -99,54 +99,6 @@
       if node.counts[test].inconclusive != 0:
         print `node.idhex` + "\t" + `node.counts[test].inconclusive`
 
-
-  # False positive test left in for verifcation and tweaking
-  # TODO: Remove this bit eventually
-  for result in data:
-    if result.__class__.__name__ == "HtmlTestResult":
-      if not result.tags_old or not result.tags or not result.exit_tags:
-        continue
-      print result.exit_node
-
-      print result.tags
-      print result.tags_old
-      print result.exit_tags
-
-      new_soup = BeautifulSoup(open(result.tags, "r").read())
-      old_soup = BeautifulSoup(open(result.tags_old, "r").read())
-      tor_soup = BeautifulSoup(open(result.exit_tags, "r").read())
-
-      new_vs_old = SoupDiffer(new_soup, old_soup)
-      old_vs_new = SoupDiffer(old_soup, new_soup)
-      new_vs_tor = SoupDiffer(new_soup, tor_soup)
-
-      # I'm an evil man and I'm going to CPU hell..
-      changed_tags = old_vs_new.changed_tags_with_attrs()
-      changed_tags.update(new_vs_old.changed_tags_with_attrs())
-
-      changed_attributes = old_vs_new.changed_attributes_by_tag()
-      changed_attributes.update(new_vs_old.changed_attributes_by_tag())
-
-      changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
- 
-      # Verify all of our changed tags are present here
-      # XXX: Have this print out more info on changed tags..
-      if new_vs_tor.has_more_changed_tags(changed_tags) or \
-        new_vs_tor.has_more_changed_attrs(changed_attributes) or \
-        new_vs_tor.changed_content() and not changed_content:
-        false_positive = False
-      else:
-        false_positive = True
-
-      if false_positive:
-        # Use http://codespeak.net/pypy/dist/pypy/lang/js/ to parse
-        # links and attributes that contain javascript
-        jsdiff = JSSoupDiffer(old_soup)
-        jsdiff.prune_differences(new_soup)
-        false_positive = not jsdiff.contains_differences(tor_soup)
-  
-      print false_positive      
-
   print ""
 
 if __name__ == "__main__":


