commit 06987d841646edad453c65d36196c35c7d83e331
Author: christian <christian@avtok.com>
Date:   Fri Jul 22 05:04:41 2011 -0400

    Replacing HTMLTest with customizable HTTPTest.
---
 NetworkScanners/ExitAuthority/libsoat.py     |    9 +-
 NetworkScanners/ExitAuthority/soat.py        |  610 +++++++++++++-------------
 NetworkScanners/ExitAuthority/soat_config.py |    6 +-
 3 files changed, 325 insertions(+), 300 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/libsoat.py b/NetworkScanners/ExitAuthority/libsoat.py
index 5971c9b..2a86cb4 100644
--- a/NetworkScanners/ExitAuthority/libsoat.py
+++ b/NetworkScanners/ExitAuthority/libsoat.py
@@ -42,6 +42,7 @@ __all__ = [ # Classes
   # Functions
   "FullyStrainedSoup",
   # Constants
+  "COMPARE_EQUAL", "COMPARE_NOEQUAL", "COMPARE_TRUNCATION",
   "TEST_SUCCESS", "TEST_INCONCLUSIVE", "TEST_FAILURE",
   "RESULT_STRINGS", "RESULT_CODES",
   "INCONCLUSIVE_NOLOCALCONTENT", "INCONCLUSIVE_DYNAMICSSL",
@@ -77,6 +78,12 @@ class LoggingJSLexer(JavaScriptLexer):
# constants
+# Compare results
+COMPARE_EQUAL = 0
+COMPARE_NOEQUAL = 1
+COMPARE_TRUNCATION = 2
+
+# Test results
 TEST_SUCCESS = 0
 TEST_INCONCLUSIVE = 1
 TEST_FAILURE = 2
@@ -842,7 +849,7 @@ class SnakePickler:
         pass
       raise KeyboardInterrupt
     except Exception, e:
-      plog("WARN", "Exception during pickle dump: "+e)
+      plog("WARN", "Exception during pickle dump: " + str(e))
      try:
        os.unlink(filename)
      except: pass
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index 8d557eb..dc4409a 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -2,6 +2,7 @@
# 2008 Aleksei Gorny, mentored by Mike Perry # 2009 Mike Perry +# 2011 Christian Anderson
''' Snakes on a Tor exit node scanner @@ -31,6 +32,7 @@ import getopt import httplib import mimetypes import os +import pickle import random import re import signal @@ -591,7 +593,7 @@ class Test: self.scan_nodes = 0 self.nodes_to_mark = 0 self.tests_per_node = num_tests_per_node - self._reset() #CA make this a call to rewind instead? + self._reset() self._pickle_revision = 8 # Will increment as fields are added
def run_test(self): @@ -603,7 +605,7 @@ class Test: # Yes, this is a hack, and yes, it will bias results # away from the filter, but hey, at least it will still run. self._pickle_revision = 1 - + for addr in self.successes.keys(): if type(self.successes[addr]) == int: self.successes[addr] = set(xrange(0,self.successes[addr])) @@ -664,7 +666,10 @@ class Test: self.targets.add(target)
   def select_targets(self):
-    return self.targets
+    ret = []
+    for key in self.targets.keys():
+      ret.extend(map(lambda x: (x,key), self.targets.bykey(key)))
+    return ret
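The base select_targets() now hands back (url, filetype) pairs instead of bare URLs. The Targets container itself is not part of this diff, so here is a minimal sketch of the interface select_targets() assumes (add/keys/bykey), with a stand-in class and placeholder example.com URLs:

# Stand-in for the Targets container (assumed interface, not the real class).
class TargetsSketch:
    def __init__(self):
        self._by_type = {}
    def add(self, target, keys):
        for k in keys:
            self._by_type.setdefault(k, []).append(target)
    def keys(self):
        return list(self._by_type.keys())
    def bykey(self, key):
        return self._by_type[key]

targets = TargetsSketch()
targets.add("http://example.com/a.pdf", ["pdf"])
targets.add("http://example.com/b.html", ["html"])

# Flatten into (url, filetype) pairs, as the new select_targets() does:
pairs = []
for key in targets.keys():
    pairs.extend([(url, key) for url in targets.bykey(key)])
assert ("http://example.com/a.pdf", "pdf") in pairs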
def refill_targets(self): map(self.add_target, self.get_targets()) @@ -830,6 +835,7 @@ class Test:
def _reset(self): self.results = [] + # Empty target list for new test self.targets = Targets() self.tests_run = 0 self.nodes_marked = 0 @@ -1000,10 +1006,17 @@ class BaseHTTPTest(Test): self.fetch_queue = [] Test.__init__(self, "HTTP", 80) self.save_name = "HTTPTest" + self.compare_funcs = {'html': self.compare_html, "js": self.compare_js}
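compare_funcs is the heart of the "customizable" part of this patch: comparison is dispatched on filetype, and compare() later in this diff falls back to its generic line-hash verdict when no specialized comparator is registered (the KeyError path). A minimal sketch of that dispatch, with a stub comparator in place of the real compare_html:

COMPARE_EQUAL, COMPARE_NOEQUAL = 0, 1

def compare_html_stub(new, old, context):
    return COMPARE_EQUAL if new == old else COMPARE_NOEQUAL

compare_funcs = {'html': compare_html_stub}

def dispatch(new, old, context, filetype, generic_verdict):
    try:
        return compare_funcs[filetype](new, old, context)
    except KeyError:
        return generic_verdict  # e.g. 'pdf' and 'doc' keep the hash-based verdict

assert dispatch('x', 'x', None, 'pdf', COMPARE_NOEQUAL) == COMPARE_NOEQUAL
assert dispatch('x', 'x', None, 'html', COMPARE_NOEQUAL) == COMPARE_EQUAL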
def _reset(self): self.httpcode_fails = {} self.httpcode_fails_per_exit = {} + # Default cookie jar for new test + self.tor_cookie_jar = None + self.cookie_jar = None + # Default headers for new test + self.headers = copy.copy(firefox_headers) + Test._reset(self)
def depickle_upgrade(self): @@ -1042,18 +1055,19 @@ class BaseHTTPTest(Test): # A single test should have a single cookie jar self.tor_cookie_jar = cookielib.MozillaCookieJar() self.cookie_jar = cookielib.MozillaCookieJar() - self.headers = copy.copy(firefox_headers)
self.tests_run += 1
self.fetch_queue.extend(self.select_targets())
+ plog('INFO',str(self.fetch_queue)) + n_success = n_fail = n_inconclusive = 0
while self.fetch_queue: - address = self.fetch_queue.pop(0) + address, filetype = self.fetch_queue.pop(0) # FIXME: Set referrer to random or none for each of these - result = self.check_http(address) + result = self.check_http(address,filetype) if result == TEST_INCONCLUSIVE: n_inconclusive += 1 if result == TEST_FAILURE: @@ -1106,144 +1120,148 @@ class BaseHTTPTest(Test): datahandler.saveResult(result) return TEST_FAILURE
-  def check_http_nodynamic(self, address, nocontent=False):
-    # TODO: use nocontent to cause us to not load content into memory.
-    # This will require refactoring http_response though.
-    ''' check whether a http connection to a given address is molested '''
+  def direct_load(self, orig_address, filetype):
+    """Loads a page on a direct connection. The signature is:
+       address (possibly after redirects)
+       success (T/F)
+       code
+       filetype of loaded page (should be null if we failed)"""

-    # an address representation acceptable for a filename
-    address_file = DataHandler.safeFilename(address.replace('http://',''))
-    content_prefix = http_content_dir+address_file

-    # Keep a copy of the cookie jar before mods for refetch or
-    # to restore on errors that cancel a fetch
-    orig_cookie_jar = cookielib.MozillaCookieJar()
-    for cookie in self.cookie_jar:
-      orig_cookie_jar.set_cookie(cookie)
-    orig_tor_cookie_jar = cookielib.MozillaCookieJar()
-    for cookie in self.tor_cookie_jar:
-      orig_tor_cookie_jar.set_cookie(cookie)
+    # This is the address that this function will return:
+    address = orig_address

-    try:
-      # Load content from disk, md5
-      content_file = open(content_prefix+'.content', 'r')
-      sha1sum = sha()
-      buf = content_file.read(4096)
-      while buf:
-        sha1sum.update(buf)
-        buf = content_file.read(4096)
-      content_file.close()
+    # Request the content using a direct connection
+    (code, resp_headers, new_cookies, mime_type, content) = http_request(orig_address, self.cookie_jar, self.headers)
-      added_cookie_jar = cookielib.MozillaCookieJar()
-      added_cookie_jar.load(content_prefix+'.cookies', ignore_discard=True)
-      self.cookie_jar.load(content_prefix+'.cookies', ignore_discard=True)
+    # Make a good faith effort to follow redirects
+    count = 0
+    trail = set([])
+    while (300 <= code < 400):
+      plog("NOTICE", "Non-Tor HTTP "+str(code)+" redirect from "+str(orig_address)+" to "+str(content))
+      address = content
+      if address in trail: break
+      trail.add(address)
+      (code, resp_headers, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)

-      headerdiffer = SnakePickler.load(content_prefix+'.headerdiff')
+      count += 1
+      if count > 4: break

-      content = None
-      mime_type = None
+    # Couldn't get past the redirects
+    if (300 <= code < 400):
+      return (address,False,code,'')

-    except IOError:
-      (code, resp_headers, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
+    # If there was a fatal error, return failure
+    if not (200 <= code < 300) or not content:
+      plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
+      return (address, False, code,'')

-      if 300 <= code < 400: # Redirects
-        plog("NOTICE", "Non-Tor HTTP "+str(code)+" redirect from "+str(address)+" to "+str(content))
-        # Remove the original target and add the redirected location
-        self.remove_target(address, INCONCLUSIVE_REDIRECT)
-        self.add_target(content)
-        # Restore cookie jar
-        self.cookie_jar = orig_cookie_jar
-        self.tor_cookie_jar = orig_cookie_jar
-        return TEST_INCONCLUSIVE
+    loaded_filetype = mime_to_filetype(mime_type)

-      if code - (code % 100) != 200:
-        plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
-        # Just remove it
-        self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
-        # Restore cookie jars
-        self.cookie_jar = orig_cookie_jar
-        self.tor_cookie_jar = orig_tor_cookie_jar
-        return TEST_INCONCLUSIVE
+    if filetype and filetype != loaded_filetype:
+      plog('DEBUG', 'Wrong filetype: ' + filetype + ' ' + loaded_filetype)
+      return (address, False, code, '')

-      if not content:
-        plog("WARN", "Failed to direct load "+address)
-        # Just remove it
-        self.remove_target(address, INCONCLUSIVE_NOLOCALCONTENT)
-        # Restore cookie jar
-        self.cookie_jar = orig_cookie_jar
-        self.tor_cookie_jar = orig_tor_cookie_jar
-        return TEST_INCONCLUSIVE
-      sha1sum = sha(content)
+    # Fetch again with different cookies and see if we get the same content
+    # Use a different IP address if possible

-      content_file = open(content_prefix+'.content', 'w')
-      content_file.write(content)
-      content_file.close()
+    empty_cookie_jar = cookielib.MozillaCookieJar()

-      headerdiffer = HeaderDiffer(resp_headers)
-      SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
+    BindingSocket.bind_to = refetch_ip
+    (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, empty_cookie_jar, self.headers)
+    BindingSocket.bind_to = None

-      # Need to do set subtraction and only save new cookies..
-      # or extract/make_cookies
-      added_cookie_jar = cookielib.MozillaCookieJar()
-      for cookie in new_cookies:
-        added_cookie_jar.set_cookie(cookie)
-      try:
-        added_cookie_jar.save(content_prefix+'.cookies', ignore_discard=True)
-      except:
-        traceback.print_exc()
-        plog("WARN", "Error saving cookies in "+str(added_cookie_jar)+" to "+content_prefix+".cookies")
+    # If there was a fatal error on the refetch, return failure
+    if not (200 <= code_new < 300) or not content_new:
+      plog("NOTICE", "Non-tor HTTP error "+str(code_new)+" fetching content for "+address)
+      return (address, False, code_new, '')
-    except TypeError, e:
-      plog('ERROR', 'Failed obtaining the shasum for ' + address)
-      plog('ERROR', e)
-      # Restore cookie jars
-      self.cookie_jar = orig_cookie_jar
-      self.tor_cookie_jar = orig_tor_cookie_jar
-      return TEST_INCONCLUSIVE
+    # The context for writing out the files used to make repeated comparisons
+    address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
+    content_prefix = http_content_dir + address_file
+
+    # If the page is different on the second load, then it is probably dynamic and useless to us
+    if self.compare(content,content_new,content_prefix,loaded_filetype) != COMPARE_EQUAL:
+      return (address, False, code, '')
+
+    f = open(content_prefix + '.content', 'w')
+    f.write(content)
+    f.close()
+
+    # Save the cookies in case we want them for a later test
+    empty_cookie_jar.save(content_prefix + '.cookies', ignore_discard=True)
+
+    # Save the response headers in case we want them for a later test
+    headerdiffer = HeaderDiffer(resp_headers)
+    SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
+
+    return (address, True, code, loaded_filetype)
+
+  def check_http(self, address, filetype, dynamic = False):
+    ''' check whether a http connection to a given address is molested '''
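direct_load's redirect handling above follows at most four hops and keeps a trail set so a redirect loop cannot spin forever. The same policy in isolation, with fetch() standing in for http_request() and a canned response table:

def follow_redirects(address, fetch, max_hops=4):
    trail = set()
    code, content = fetch(address)
    count = 0
    while 300 <= code < 400:
        address = content          # redirect target
        if address in trail:
            break                  # loop detected
        trail.add(address)
        code, content = fetch(address)
        count += 1
        if count > max_hops:
            break
    return address, code, content

responses = {'a': (301, 'b'), 'b': (302, 'c'), 'c': (200, 'body')}
assert follow_redirects('a', lambda u: responses[u]) == ('c', 200, 'body')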
-    (pcode, presp_headers, pnew_cookies, pmime_type, pcontent) = torify(http_request, address, self.tor_cookie_jar, self.headers)
+    # The "dynamic" option controls whether we dare grapple with dynamic
+    # pages. Currently only False is supported.
+
+    plog('INFO', 'Conducting an http test with destination ' + address)
+
+    # Keep a copy of the cookie jar before mods for refetch or
+    # to restore on errors that cancel a fetch
+    my_tor_cookie_jar = cookielib.MozillaCookieJar()
+    for cookie in self.tor_cookie_jar:
+      my_tor_cookie_jar.set_cookie(cookie)
+
+    my_cookie_jar = cookielib.MozillaCookieJar()
+    for cookie in self.cookie_jar:
+      my_cookie_jar.set_cookie(cookie)
+
+    # CA we should modify our headers for maximum magic
+
+    # pfoobar means that foobar was acquired over a _p_roxy
+    (pcode, presp_headers, pnew_cookies, pmime_type, pcontent) = torify(http_request, address, my_tor_cookie_jar, self.headers)
     psha1sum = sha(pcontent)

     exit_node = scanhdlr.get_exit_node()
     if not exit_node:
+      # CA: how can this happen?
       plog('NOTICE', 'We had no exit node to test, skipping to the next test.')
       result = HttpTestResult(None, address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOEXIT)
       if self.rescan_nodes:
+        # CA: we shouldn't need to do this
         result.from_rescan = True
       self.results.append(result)
-
-      # Restore cookie jars
-      self.cookie_jar = orig_cookie_jar
-      self.tor_cookie_jar = orig_tor_cookie_jar
+      # CA: when do we use datahandler?
       return TEST_INCONCLUSIVE

     exit_node = "$"+exit_node.idhex
-    if pcode - (pcode % 100) != 200:
-      plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
-
-      if pcode not in SOCKS_ERRS: # Do a refetch for non-SOCKS errors only
-        # Restore cookie jars
-        # XXX: This is odd and possibly wrong for the refetch
-        self.cookie_jar = orig_cookie_jar
-        self.tor_cookie_jar = orig_tor_cookie_jar
-        BindingSocket.bind_to = refetch_ip
-        (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_tor_cookie_jar, self.headers)
-        BindingSocket.bind_to = None
-
-        if code_new == pcode and 300 <= pcode < 400: # Target introduced a redirect
-          plog("NOTICE", "Non-Tor HTTP "+str(code_new)+" redirect from "+address+" to "+str(content_new))
-          # Remove the original URL and add the redirect to our targets (if it's of the right type)
+
+    # If there is an error loading the page over Tor:
+    if not (200 <= pcode < 300) or not pcontent:
+      # And if it doesn't have to do with our SOCKS connection:
+      if pcode not in SOCKS_ERRS:
+        plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+
+        (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+
+        # If a direct load is failing, remove this target from future consideration
+        if (300 <= code_direct < 400):
           self.remove_target(address, INCONCLUSIVE_REDIRECT)
-          self.add_target(content_new)
-          return TEST_INCONCLUSIVE
-        elif code_new == pcode: # Target introduced some other change
-          plog("NOTICE", "Non-tor HTTP error "+str(code_new)+" fetching content for "+address)
-          # Just remove it
+        elif not (200 <= code_direct < 300):
           self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
+
+        # If Tor and direct are failing for the same reason, Tor is off the hook
+        if (code_direct == pcode):
+          result = HttpTestResult(self.node_map[exit_node[1:]],
+                                  address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOLOCALCONTENT)
+          if self.rescan_nodes:
+            # CA: we shouldn't need to do this
+            result.from_rescan = True
+          self.results.append(result)
           return TEST_INCONCLUSIVE

-      # Error => behavior lookup table
+      # Error => behavior lookup table
       # Error code (Failure reason, Register method, Set extra_info to pcontent?)
       err_lookup = \
         {E_SOCKS: (FAILURE_CONNERROR, self.register_connect_failure, True), # "General socks error"
@@ -1257,6 +1275,7 @@ class BaseHTTPTest(Test):
          E_URL: (FAILURE_URLERROR, self.register_connect_failure, True),
          E_MISC: (FAILURE_MISCEXCEPTION, self.register_connect_failure, True)
         }
+
       if pcode in err_lookup:
         fail_reason, register, extra_info = err_lookup[pcode]
       elif 300 <= pcode < 400: # Exit node introduced a redirect
@@ -1265,209 +1284,193 @@ class BaseHTTPTest(Test):
         register = self.register_http_failure
         extra_info = True
       else: # Exit node introduced some other change
-        fail_reason = FAILURE_BADHTTPCODE+str(pcode)
+        fail_reason = FAILURE_BADHTTPCODE + str(pcode) #CA don't think this is good
         register = self.register_exit_failure
         extra_info = True

+      # the [1:] gets rid of dollar sign. CA ugly
       result = HttpTestResult(self.node_map[exit_node[1:]],
-                            address, TEST_FAILURE, fail_reason)
+                              address, TEST_FAILURE, fail_reason)
       if extra_info:
         result.extra_info = str(pcontent)
-      return register(result)
-
-    # if we have no content, we had a connection error
+      register(result)
+      return TEST_FAILURE
+
+    # If we have no content, we had a connection error
     if pcontent == "":
       result = HttpTestResult(self.node_map[exit_node[1:]],
                               address, TEST_FAILURE, FAILURE_NOEXITCONTENT)
       self.register_exit_failure(result)
       # Restore cookie jars
-      self.cookie_jar = orig_cookie_jar
-      self.tor_cookie_jar = orig_tor_cookie_jar
       return TEST_FAILURE
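The err_lookup table keeps the failure handling declarative: each error code maps to a (failure reason, registration method, attach-page-content?) tuple. A self-contained sketch of the same pattern, with stand-in constants and a list in place of the real register methods:

failures = []

def register_connect_failure(result):
    failures.append(result)
    return 'TEST_FAILURE'

# code -> (failure reason, register method, set extra_info to page content?)
err_lookup = {
    'E_SOCKS': ('FAILURE_CONNERROR', register_connect_failure, True),
}

def classify(pcode, pcontent):
    fail_reason, register, extra_info = err_lookup.get(
        pcode, ('FAILURE_BADHTTPCODE' + str(pcode), register_connect_failure, True))
    result = {'reason': fail_reason}
    if extra_info:
        result['extra_info'] = str(pcontent)
    return register(result)

assert classify('E_SOCKS', 'socks failure body') == 'TEST_FAILURE'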
-    hdiffs = headerdiffer.show_differences(presp_headers)
-    if hdiffs:
-      plog("NOTICE", "Header differences for "+address+": \n"+hdiffs)
-
-    # compare the content
-    # if content matches, everything is ok
-    if not hdiffs and psha1sum.hexdigest() == sha1sum.hexdigest():
-      result = HttpTestResult(self.node_map[exit_node[1:]],
-                              address, TEST_SUCCESS)
-      self.register_success(result)
-      return TEST_SUCCESS
+    #
+    # Tor was able to connect, so now it's time to make the comparison
+    #
+
+    # An address representation acceptable for a filename:
+    address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
+    content_prefix = http_content_dir + address_file
+    failed_prefix = http_failed_dir + address_file
+
+    # Load content from disk
+    content_file = open(content_prefix+'.content', 'r')
+    content = ''.join(content_file.readlines())
+    content_file.close()
+    sha1sum = sha(content)
+
+    # If we need to write out the content handed to us by the exit node
+    exit_content_file_name = DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content')
+
+    # TODO we might want to check headers and cookies
+
+    # Compare the content
+    # TODO should we check if mimetype agrees with filetype?
+    result = self.compare(pcontent,content,content_prefix,filetype)
+    if result == COMPARE_NOEQUAL:
+      # Reload direct content and try again
+      (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+
+      # If a new direct load somehow fails, then we're out of luck
+      if not (200 <= code_direct < 300):
+        plog("WARN", "Failed to re-fetch "+address+" outside of Tor. Did our network fail?")
+        self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
+        result = HttpTestResult(self.node_map[exit_node[1:]],
+                                address, TEST_INCONCLUSIVE,
+                                INCONCLUSIVE_NOLOCALCONTENT)
+        if self.rescan_nodes:
+          result.from_rescan = True
+        self.results.append(result)
+        return TEST_INCONCLUSIVE

-    # Check for a simple truncation failure, which seems
-    # common with many nodes
-    if not content and not nocontent:
-      load_file = content_prefix+'.content'
-      content_file = open(load_file, 'r')
-      content = content_file.read()
-      content_file.close()
-
-    if content and len(pcontent) < len(content):
-      if content[0:len(pcontent)] == pcontent[0:len(pcontent)]:
-        failed_prefix = http_failed_dir+address_file
-        exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content'), 'w')
-        exit_content_file.write(pcontent)
-        exit_content_file.close()
+      # Try our comparison again
+      dynamic = self.compare(content_direct,content,content_prefix,filetype)
+      sha1sum_new = sha(content_direct)
+
+      if dynamic == COMPARE_EQUAL:
+        # The content is static, so our exit node is screwing with us.
         result = HttpTestResult(self.node_map[exit_node[1:]],
-                                address, TEST_FAILURE, FAILURE_EXITTRUNCATION,
+                                address, TEST_FAILURE, FAILURE_EXITONLY,
                                 sha1sum.hexdigest(), psha1sum.hexdigest(),
-                                content_prefix+".content",
-                                exit_content_file.name)
+                                content_prefix+".content", exit_content_file_name)
         self.register_exit_failure(result)
-        # Restore cookie jars
-        self.cookie_jar = orig_cookie_jar
-        self.tor_cookie_jar = orig_tor_cookie_jar
-        return TEST_FAILURE
-
-    # if content doesnt match, update the direct content and use new cookies
-    # If we have alternate IPs to bind to on this box, use them?
-    # Sometimes pages have the client IP encoded in them..
-    # Also, use the Tor cookies, since those identifiers are
-    # probably embeded in the Tor page as well.
-    BindingSocket.bind_to = refetch_ip
-    (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_tor_cookie_jar, self.headers)
-    BindingSocket.bind_to = None
-
-    if not content_new:
-      plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
-      result = HttpTestResult(self.node_map[exit_node[1:]],
-                              address, TEST_INCONCLUSIVE,
-                              INCONCLUSIVE_NOLOCALCONTENT)
-      if self.rescan_nodes:
-        result.from_rescan = True
-      self.results.append(result)
-      datahandler.saveResult(result)
-      return TEST_INCONCLUSIVE
-
-    headerdiffer.prune_differences(resp_headers_new)
-    hdiffs = headerdiffer.show_differences(presp_headers)
-
-    SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
-
-    sha1sum_new = sha(content_new)
-
-    if sha1sum.hexdigest() != sha1sum_new.hexdigest():
-      # if content has changed outside of tor, update the saved file
-      os.rename(content_prefix+'.content', content_prefix+'.content-old')
-      new_content_file = open(content_prefix+'.content', 'w')
-      new_content_file.write(content_new)
-      new_content_file.close()
-
-      # Need to do set subtraction and only save new cookies..
-      # or extract/make_cookies
-
-      self.cookie_jar = orig_cookie_jar
-      new_cookie_jar = cookielib.MozillaCookieJar()
-      for cookie in new_cookies_new:
-        new_cookie_jar.set_cookie(cookie)
-        self.cookie_jar.set_cookie(cookie) # Update..
-      os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
-      try:
-        new_cookie_jar.save(content_prefix+'.cookies', ignore_discard=True)
-      except:
-        traceback.print_exc()
-        plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
-
-    if hdiffs:
-      # XXX: We probably should store the header differ + exit headers
-      # for later comparison (ie if the header differ picks up more diffs)
-      plog("NOTICE", "Post-refetch header changes for "+address+": \n"+hdiffs)
-      result = HttpTestResult(self.node_map[exit_node[1:]],
-                              address, TEST_FAILURE, FAILURE_HEADERCHANGE)
-      result.extra_info = hdiffs
-      self.register_dynamic_failure(result)
-      # Lets let the rest of the tests run too actually
-      #return TEST_FAILURE
-
-    # compare the node content and the new content
-    # if it matches, everything is ok
-    if psha1sum.hexdigest() == sha1sum_new.hexdigest():
+        retval = TEST_FAILURE
+      else:
+        # The content is dynamic.
+        # Here's where "no dynamic" comes in.
+        # We reject this target and mark the test inconclusive.
+        plog("WARN", "HTTP Test is removing dynamic URL "+address)
+        self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
+        result = HttpTestResult(self.node_map[exit_node[1:]],
+                                address, TEST_INCONCLUSIVE, INCONCLUSIVE_DYNAMIC,
+                                sha1sum_new.hexdigest(), psha1sum.hexdigest(),
+                                content_prefix+".content", exit_content_file_name,
+                                content_prefix+'.content-old',
+                                sha1sum.hexdigest())
+        self.results.append(result)
+        retval = TEST_INCONCLUSIVE
+    elif result == COMPARE_EQUAL:
       result = HttpTestResult(self.node_map[exit_node[1:]],
                               address, TEST_SUCCESS)
       self.register_success(result)
       return TEST_SUCCESS
-
-    if not content and not nocontent:
-      if sha1sum.hexdigest() != sha1sum_new.hexdigest():
-        load_file = content_prefix+'.content-old'
-      else:
-        load_file = content_prefix+'.content'
-      content_file = open(load_file, 'r')
-      content = content_file.read()
-      content_file.close()
-
-    if not ((mime_type == mime_type_new or not mime_type) \
-             and mime_type_new == pmime_type):
-      if not mime_type:
-        mime_type = "text/disk"
-      plog("WARN", "Mime type change: 1st: "+mime_type+", 2nd: "+mime_type_new+", Tor: "+pmime_type)
-      # TODO: If this actually happens, store a result.
-    else:
-      # Mime types match.. Are they sensible?
-      guess = mimetypes.guess_type(address, strict=False)[0]
-      if guess and not is_html_mimetype(guess) and is_html_mimetype(str(pmime_type)):
-        # We're not expecting html and we got (seemingly dynamic) html content
-        # This causes a lot of false positives, let's just remove the target
-        plog("NOTICE", "Got HTML content for non-HTML request, removing target "+address)
-        self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
-        return TEST_INCONCLUSIVE
-
-    # Dirty dirty dirty...
-    return (mime_type_new, pcontent, psha1sum, content, sha1sum, content_new,
-            sha1sum_new, exit_node)
-
-  def check_http(self, address):
-    plog('INFO', 'Conducting an http test with destination ' + address)
-    ret = self.check_http_nodynamic(address)
-    if type(ret) == int:
-      return ret
-    return self._check_http_worker(address, ret)
-
-  def _check_http_worker(self, address, http_ret):
-    (mime_type,pcontent,psha1sum,content,sha1sum,content_new,sha1sum_new,exit_node) = http_ret
-
-    address_file = DataHandler.safeFilename(address.replace('http://',''))
-    content_prefix = http_content_dir+address_file
-    failed_prefix = http_failed_dir+address_file
-
-    # compare the new and old content
-    # if they match, means the node has been changing the content
-    if sha1sum.hexdigest() == sha1sum_new.hexdigest():
+    elif result == COMPARE_TRUNCATION:
       exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content'), 'w')
       exit_content_file.write(pcontent)
       exit_content_file.close()
-
       result = HttpTestResult(self.node_map[exit_node[1:]],
-                              address, TEST_FAILURE, FAILURE_EXITONLY,
+                              address, TEST_FAILURE, FAILURE_EXITTRUNCATION,
                               sha1sum.hexdigest(), psha1sum.hexdigest(),
-                              content_prefix+".content", exit_content_file.name)
+                              content_prefix+".content",
+                              exit_content_file_name)
       self.register_exit_failure(result)
       return TEST_FAILURE
-    exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.dyn-content'),'w')
-    exit_content_file.write(pcontent)
-    exit_content_file.close()
+    # If we failed, then store what the exit node handed us
+    if retval == TEST_FAILURE:
+      exit_content_file = open(exit_content_file_name, 'w')
+      exit_content_file.write(pcontent)
+      exit_content_file.close()

-    result = HttpTestResult(self.node_map[exit_node[1:]],
-                            address, TEST_FAILURE, FAILURE_DYNAMIC,
-                            sha1sum_new.hexdigest(), psha1sum.hexdigest(),
-                            content_prefix+".content", exit_content_file.name,
-                            content_prefix+'.content-old',
-                            sha1sum.hexdigest())
-    if self.rescan_nodes:
-      result.from_rescan = True
-    self.results.append(result)
-    datahandler.saveResult(result)
+    return retval
-    # The HTTP Test should remove address immediately...
-    plog("WARN", "HTTP Test is removing dynamic URL "+address)
-    self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
-    return TEST_FAILURE
+  def compare(self,new_content,old_content,context,filetype):
+    """The generic function for comparing web content."""
+
+    plog('DEBUG', "Beginning Compare")
+
+    new_linelist = new_content.split('\n')
+    old_linelist = old_content.split('\n')
+
+    old_hashes = pickled_content(context,'.hashes')
+    if not old_hashes:
+      old_hashes = []
+      old_hash = sha()
+      for l in old_linelist:
+        old_hash.update(l)
+        old_hashes.append(old_hash.hexdigest())
+      f = open(context + '.hashes','w')
+      pickle.dump(old_hashes,f)
+      f.close()
+
+    if len(new_linelist) > len(old_linelist):
+      retval = COMPARE_NOEQUAL
+    else:
+      new_hash = sha()
+      for i in range(0,min(len(old_linelist),len(new_linelist))):
+        new_hash.update(new_linelist[i])
+      new_hash = new_hash.hexdigest()
+
+      if new_hash != old_hashes[len(new_linelist) - 1]:
+        retval = COMPARE_NOEQUAL
+      elif len(new_linelist) == len(old_linelist):
+        retval = COMPARE_EQUAL
+      else:
+        retval = COMPARE_TRUNCATION
+
+    if retval == COMPARE_NOEQUAL:
+      try:
+        retval = self.compare_funcs[filetype](new_content,old_content,context)
+      except KeyError:
+        pass
+
+    plog('DEBUG', "Compare got the result: " + str(retval))
+
+    return retval
+
+  def compare_js(self,new_content,old_content,context):
+    # TODO check for truncation? Store differ?
+    jsdiff = JSDiffer(old_content)
+    has_changes = jsdiff.contains_differences(new_content)
+    if not has_changes:
+      return COMPARE_EQUAL
+    else:
+      return COMPARE_NOEQUAL
+
+  def compare_html(self,new_content,old_content,context):
+    # TODO check for truncation? Store differ?
+    old_soup = FullyStrainedSoup(old_content.decode('ascii', 'ignore'))
+    new_soup = FullyStrainedSoup(new_content.decode('ascii', 'ignore'))
+    htmldiff = SoupDiffer(old_soup,new_soup)
+    html_has_changes = htmldiff.content_changed
+    # TODO do we need to separately check JS?
+    if not html_has_changes:
+      return COMPARE_EQUAL
+    else:
+      return COMPARE_NOEQUAL
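compare() caches cumulative per-line SHA-1 digests of the stored copy, so a shorter Tor-side fetch can be checked against the digest at the matching prefix length: a prefix match means truncation, anything else means modification. The trick in isolation (hashlib.sha1 here stands in for soat's sha import; the caching to disk is omitted):

from hashlib import sha1

COMPARE_EQUAL, COMPARE_NOEQUAL, COMPARE_TRUNCATION = 0, 1, 2

def prefix_hashes(lines):
    h, out = sha1(), []
    for line in lines:
        h.update(line.encode())
        out.append(h.hexdigest())   # digest of lines[0..i]
    return out

def verdict(new_content, old_content):
    new_lines = new_content.split('\n')
    old_lines = old_content.split('\n')
    if len(new_lines) > len(old_lines):
        return COMPARE_NOEQUAL
    old_hashes = prefix_hashes(old_lines)
    h = sha1()
    for line in new_lines:
        h.update(line.encode())
    if h.hexdigest() != old_hashes[len(new_lines) - 1]:
        return COMPARE_NOEQUAL
    return COMPARE_EQUAL if len(new_lines) == len(old_lines) else COMPARE_TRUNCATION

doc = "line1\nline2\nline3"
assert verdict(doc, doc) == COMPARE_EQUAL
assert verdict("line1\nline2", doc) == COMPARE_TRUNCATION
assert verdict("line1\nlineX\nline3", doc) == COMPARE_NOEQUAL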
 # TODO move these somewhere sensible
+def pickled_content(context,extension):
+  try:
+    f = open(context + extension, 'r')
+    ret = pickle.load(f)
+    f.close()
+  except IOError:
+    ret = False
+  return ret
+
+def mime_to_filetype(mime_type):
+  # guess_extension() returns None for unknown MIME types; treat that as "no filetype"
+  ext = mimetypes.guess_extension(mime_type)
+  return ext[1:] if ext else ''
+
 def is_html_mimetype(mime_type):
   is_html = False
   for type_match in html_mime_types:
@@ -2030,6 +2033,7 @@ class BaseSSLTest(Test):
 class FixedTargetTest:
   """ Mixin class. Must be mixed with a subclass of Test """
   def __init__(self, targets):
+    plog('INFO', "You requested the fixed targets: " + str(targets))
     self.fixed_targets = targets
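mime_to_filetype leans on the standard library's mimetypes.guess_extension, which returns a dotted extension, or None for unknown types; that None is why the helper above guards before stripping the dot:

import mimetypes

assert mimetypes.guess_extension('application/pdf') == '.pdf'
assert mimetypes.guess_extension('application/x-no-such-type') is None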
def get_targets(self): @@ -2041,7 +2045,6 @@ class FixedTargetTest:
def finished(self): """FixedTargetTests are done if they test all nodes or run out of targets""" - # CA do we properly handle possibility that self.targets can run out return not (self.nodes and self.targets)
class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest): @@ -2050,6 +2053,16 @@ class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest): utargets = [t for t in targets if self._is_useable_url(t, ['http'])] FixedTargetTest.__init__(self, utargets)
+ def get_targets(self): + ret = [] + for targ in self.fixed_targets: + addr, succ, code, ftype = self.direct_load(targ, False) + if succ: ret.append([addr,ftype]) + return ret + + def add_target(self, target): + self.targets.add(target[0],[target[1]]) + class FixedTargetHTMLTest(FixedTargetTest, BaseHTMLTest): def __init__(self, targets): BaseHTMLTest.__init__(self) @@ -2077,10 +2090,11 @@ class SearchBasedTest: def rewind(self): self.wordlist = load_wordlist(self.wordlist_file)
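FixedTargetHTTPTest.get_targets() now vets every fixed URL with a direct load before the test proper. The same filtering with a stubbed direct_load (no network needed; the URLs are placeholders), mirroring the [addr, ftype] pairs it accumulates:

def direct_load_stub(url, filetype):
    ok = url.endswith('.pdf')                      # pretend only the pdf resolves
    return (url, ok, 200 if ok else 404, 'pdf' if ok else '')

fixed_targets = ['http://example.com/a.pdf', 'http://example.com/b.doc']
ret = []
for targ in fixed_targets:
    addr, succ, code, ftype = direct_load_stub(targ, False)
    if succ:
        ret.append([addr, ftype])
assert ret == [['http://example.com/a.pdf', 'pdf']]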
- def get_targets(self): - return self.get_search_urls() + def add_target(self, target): + self.targets.add(target[0],[target[1]]) + return True
- def get_search_urls(self): + def get_targets(self): ''' construct a list of urls based on the wordlist, filetypes and protocol. ''' @@ -2088,11 +2102,15 @@ class SearchBasedTest:
urllist = set([]) for filetype in self.scan_filetypes: - urllist.update(self.get_search_urls_for_filetype(filetype)) + urllist.update(map(lambda x: (x, filetype), self.get_search_urls_for_filetype(filetype)))
return list(urllist)
- def get_search_urls_for_filetype(self, filetype,number = 0): + def get_search_urls_for_filetype(self, filetype, number=0): + # CA. I don't want to support 'any' any more. We must specify a filetype + assert(filetype != 'any') + assert(filetype) + if not number: number = self.results_per_type
@@ -2178,12 +2196,18 @@ class SearchBasedTest: file_list = self.scan_filetypes
if self._is_useable_url(url, prot_list, file_list): + plog('DEBUG', "Found a useable url: " + url) + url, success, code, cur_filetype = self.direct_load(url,filetype) + if not success: + plog('DEBUG',"Url was not useable after all: " + url) + continue if self.host_only: # FIXME: %-encoding, @'s, etc? plog("INFO", url) url = urlparse.urlparse(url)[1] # Have to check again here after parsing the url: if host in self.banned_targets: + plog('DEBUG',"Url was not useable after all (banned): " + url) continue type_urls.add(url) plog("INFO", "Have "+str(len(type_urls))+"/"+str(number)+" urls from search so far..") @@ -2195,6 +2219,8 @@ class SearchBasedTest: self.url_reserve[filetype].extend(list(type_urls - set(chosen))) type_urls = chosen
+ plog("INFO","Got urls for filetype!") + return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest): @@ -2215,21 +2241,13 @@ class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest): SearchBasedTest.rewind(self) BaseHTTPTest.rewind(self)
- def add_target(self, target): - # Keys targets by filetype. One filetype per target - split = target.rsplit('.',1) - if len(split) > 1 and split[-1] in self.scan_filetypes: - self.targets.add(target,[split[-1]]) - return True - return False - def select_targets(self): retval = [] n_tests = random.randrange(1,len(self.targets.keys())+1) filetypes = random.sample(self.targets.keys(), n_tests) plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes)) for ftype in filetypes: - retval.append(random.choice(self.targets.bykey(ftype))) + retval.append((random.choice(self.targets.bykey(ftype)),ftype)) return retval
def refill_targets(self): diff --git a/NetworkScanners/ExitAuthority/soat_config.py b/NetworkScanners/ExitAuthority/soat_config.py index 39f8165..99cd4ff 100644 --- a/NetworkScanners/ExitAuthority/soat_config.py +++ b/NetworkScanners/ExitAuthority/soat_config.py @@ -28,7 +28,7 @@ num_html_urls = 10 max_search_retry = 3
# Hrmm.. Too many of these and Google really h8s us.. -scan_filetypes = ['pdf','exe'] +scan_filetypes = ['pdf','doc','html']
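Note that of the new scan_filetypes, only 'html' has a specialized comparator registered in compare_funcs ('js' is registered but not searched for here); 'pdf' and 'doc' take compare()'s KeyError fallback and are checked by the generic line-hash comparison alone.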
# Urls to scan for each filetype urls_per_filetype = 2 @@ -150,8 +150,8 @@ ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_ter "extra":[("prfh","disable_family_filterEEE1N1Nnum_of_resultsEEE50N1Ndisable_video_family_filterEEE1N1N")]}
-#default_search_mode = google_search_mode -default_search_mode = ixquick_search_mode +default_search_mode = google_search_mode +#default_search_mode = ixquick_search_mode
# Regex of characters we consider unsafe to write to the filesystem unsafe_filechars = "[^a-zA-Z0-9-.+]"
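For illustration, the substitution this regex implies, assuming DataHandler.safeFilename applies it with a '-' replacement (safeFilename itself is not in this diff, so the replacement character is a guess); the URL scheme is stripped with re.sub('[a-z]+://','',...) before safeFilename is called in soat.py above:

import re

unsafe_filechars = "[^a-zA-Z0-9-.+]"

def safe_filename_sketch(address):
    # Hypothetical: squash anything outside the safe set to '-'.
    return re.sub(unsafe_filechars, '-', re.sub('[a-z]+://', '', address))

assert safe_filename_sketch('http://example.com/a file.pdf') == 'example.com-a-file.pdf'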