[or-cvs] [torflow/master 45/92] TestInconclusive and remove target when request for binary content returns html mimetype

mikeperry at torproject.org mikeperry at torproject.org
Sat Aug 21 05:14:00 UTC 2010


Author: John M. Schanck <john at anomos.info>
Date: Fri, 30 Jul 2010 20:46:54 -0400
Subject: TestInconclusive and remove target when request for binary content returns html mimetype
Commit: c47a63b8595d41d17fbdc11360a14889bd4c6bbb

---
 NetworkScanners/ExitAuthority/soat.py |   52 +++++++++++++++++++-------------
 1 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index 0551626..8747cd3 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -29,6 +29,7 @@ import cookielib
 import copy
 import getopt
 import httplib
+import mimetypes
 import os
 import random
 import re
@@ -1293,6 +1294,15 @@ class BaseHTTPTest(Test):
         mime_type = "text/disk"
       plog("WARN", "Mime type change: 1st: "+mime_type+", 2nd: "+mime_type_new+", Tor: "+pmime_type)
       # TODO: If this actually happens, store a result.
+    else:
+      # Mime types match. Are they sensible?
+      guess = mimetypes.guess_type(address, strict=False)[0]
+      if guess and not is_html_mimetype(guess) and is_html_mimetype(str(pmime_type)):
+        # We're not expecting HTML, but we got (seemingly dynamic) HTML content.
+        # This causes a lot of false positives, so just remove the target.
+        plog("NOTICE", "Got HTML content for non-HTML request, removing target "+address)
+        self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
+        return TEST_INCONCLUSIVE
 
     # Dirty dirty dirty...
     return (mime_type_new, pcontent, psha1sum, content, sha1sum, content_new,
@@ -1346,6 +1356,22 @@ class BaseHTTPTest(Test):
     self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
     return TEST_FAILURE
 
+# TODO move these somewhere sensible
+def is_html_mimetype(mime_type):
+  """Report whether mime_type matches a pattern in html_mime_types.
+
+  mime_type is lowercased first, and each pattern is applied with re.match,
+  so a pattern has to match from the start of the string.
+  """
+  return any(re.match(pat, mime_type.lower()) for pat in html_mime_types)
+
+def is_script_mimetype(mime_type):
+  """Report whether mime_type matches a pattern in script_mime_types.
+
+  mime_type is lowercased first, and each pattern is applied with re.match,
+  so a pattern has to match from the start of the string.
+  """
+  return any(re.match(pat, mime_type.lower()) for pat in script_mime_types)
 
 class BaseHTMLTest(BaseHTTPTest):
   def __init__(self, recurse_filetypes=scan_filetypes):
@@ -1449,7 +1475,7 @@ class BaseHTMLTest(BaseHTTPTest):
                     plog("INFO", "Adding favicon of: "+str(t))
                     found_favicon = True
                     targets.append(("image", urlparse.urljoin(orig_addr, attr_tgt)))
-                  elif a[0] == "type" and self.is_script(a[1], ""):
+                  elif a[0] == "type" and is_script_mimetype(a[1]):
                     plog("INFO", "Adding link script of: "+str(t))
                     targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
               else:
@@ -1495,29 +1521,13 @@ class BaseHTMLTest(BaseHTTPTest):
       return ret
     return self._check_js_worker(address, ret)
 
-  def is_html(self, mime_type, content):
-    is_html = False
-    for type_match in html_mime_types:
-      if re.match(type_match, mime_type.lower()):
-        is_html = True
-        break
-    return is_html
-
-  def is_script(self, mime_type, content):
-    is_script = False
-    for type_match in script_mime_types:
-      if re.match(type_match, mime_type.lower()):
-        is_script = True
-        break
-    return is_script
-
   def _check_js_worker(self, address, http_ret):
     (mime_type, tor_js, tsha, orig_js, osha, new_js, nsha, exit_node) = http_ret
 
-    if not self.is_script(mime_type, orig_js):
+    if not is_script_mimetype(mime_type):
       plog("WARN", "Non-script mime type "+mime_type+" fed to JS test for "+address)
 
-      if self.is_html(mime_type, orig_js):
+      if is_html_mimetype(mime_type):
         return self._check_html_worker(address, http_ret)
       else:
         return self._check_http_worker(address, http_ret)
@@ -1568,10 +1578,10 @@ class BaseHTMLTest(BaseHTTPTest):
   def _check_html_worker(self, address, http_ret):
     (mime_type,tor_html,tsha,orig_html,osha,new_html,nsha,exit_node)=http_ret
 
-    if not self.is_html(mime_type, orig_html):
+    if not is_html_mimetype(mime_type):
       # XXX: Keep an eye on this logline.
       plog("WARN", "Non-html mime type "+mime_type+" fed to HTML test for "+address)
-      if self.is_script(mime_type, orig_html):
+      if is_script_mimetype(mime_type):
         return self._check_js_worker(address, http_ret)
       else:
         return self._check_http_worker(address, http_ret)
-- 
1.7.1




More information about the tor-commits mailing list