commit b9f5348a00ceae526cbf2d8496ba568869993f85
Author: christian <christian@avtok.com>
Date: Mon Aug 22 16:20:34 2011 -0400
Better mechanism for remembering directly loaded content.
---
NetworkScanners/ExitAuthority/soat.py | 220 +++++++++++++++++----------------
1 file changed, 112 insertions(+), 108 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index dc4409a..e8a17dc 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -358,6 +358,10 @@ class ExitScanHandler(ScanSupport.ScanHandler):
self.__dnshandler = DNSRebindScanner(self, c)
+class Http_Return:
+ def __init__(self, rt):
+ (self.code, self.headers, self.new_cookies, self.mime_type, self.content) = rt
+
# HTTP request handling
def http_request(address, cookie_jar=None, headers=firefox_headers):
''' perform an HTTP GET request and return the content received '''
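
Http_Return simply wraps the positional five-tuple that http_request already builds, so call sites can use named attributes instead of unpacking fields they do not need. A minimal standalone sketch of the idea, with hypothetical reply values:

    class Http_Return:
        def __init__(self, rt):
            (self.code, self.headers, self.new_cookies,
             self.mime_type, self.content) = rt

    req = Http_Return((200, [], [], "text/html", "<html/>"))  # hypothetical values
    assert req.code == 200 and req.mime_type == "text/html"
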
@@ -384,7 +388,7 @@ def http_request(address, cookie_jar=None, headers=firefox_headers):
length = reply.info().get("Content-Length")
if length and int(length) > max_content_size:
plog("WARN", "Max content size exceeded for "+address+": "+length)
- return (reply.code, None, [], "", "")
+ return Http_Return((reply.code, None, [], "", ""))
mime_type = reply.info().type.lower()
reply_headers = HeaderDiffer.filter_headers(reply.info().items())
reply_headers.add(("mime-type", mime_type))
@@ -433,7 +437,7 @@ def http_request(address, cookie_jar=None, headers=firefox_headers):
traceback.print_exc()
rval = (E_MISC, None, [], "", e.__class__.__name__+str(e))
plog("INFO", "Completed HTTP Reqest for: "+address)
- return rval
+ return Http_Return(rval)
# SSL request handling
@@ -1120,7 +1124,7 @@ class BaseHTTPTest(Test):
datahandler.saveResult(result)
return TEST_FAILURE
- def direct_load(self, orig_address, filetype):
+ def first_load(self, orig_address, filetype):
"""Loads a page on a direct connection. The signtuare is:
address (posibly after redirects)
success (T/F)
@@ -1132,36 +1136,37 @@ class BaseHTTPTest(Test):
address = orig_address
# Request the content using a direct connection
- (code, resp_headers, new_cookies, mime_type, content) = http_request(orig_address,self.cookie_jar, self.headers)
+ req = http_request(orig_address,self.cookie_jar, self.headers)
# Make a good faith effort to follow redirects
count = 0
trail = set([])
- while (300 <= code < 400):
- plog("NOTICE", "Non-Tor HTTP "+str(code)+" redirect from "+str(orig_address)+" to "+str(content))
- address = content
+ while (300 <= req.code < 400):
+ plog("NOTICE", "Non-Tor HTTP "+str(req.code)+" redirect from "+str(orig_address)+" to "+str(req.content))
+ address = req.content
if address in trail: break
trail.add(address)
- (code, resp_headers, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
+ req = http_request(address, self.cookie_jar, self.headers)
count += 1
if count > 4: break
# Couldn't get past the redirects
- if (300 <= code < 400):
- return (address,False,code,'')
+ if (300 <= req.code < 400):
+ return (address,False,req.code,'')
# If there was a fatal error, return failure
- if not (200 <= code < 300) or not content:
- plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
- return (address, False, code,'')
+ if not (200 <= req.code < 300) or not req.content:
+ plog("NOTICE", "Non-tor HTTP error "+str(req.code)+" fetching content for "+address)
+ return (address, False, req.code,'')
- loaded_filetype = mime_to_filetype(mime_type)
+ loaded_filetype = mime_to_filetype(req.mime_type)
if filetype and filetype != loaded_filetype:
-
- plog('DEBUG', 'Wrong filetype: ' + filetype + ' ' + loaded_filetype)
- return (address, False, code, '')
+ plog('DEBUG', 'Wrong filetype: ' + loaded_filetype + ' instead of ' + filetype)
+ return (address, False, req.code, '')
+
+ self.save_compare_data(address,filetype,req)
# Fetch again with different cookies and see if we get the same content
# Use a different IP address if possible
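
first_load follows redirects by hand: a 3xx reply from http_request carries the target address in req.content, and the loop stops once it leaves the 3xx range, sees an address already in the trail set (a redirect loop), or exceeds four hops. A minimal sketch of the same loop shape, where fetch() is a hypothetical stand-in for http_request():

    # Sketch: bounded redirect following with loop detection.
    def follow_redirects(address, fetch, max_hops=4):
        trail = set()
        code, body = fetch(address)
        hops = 0
        while 300 <= code < 400:
            address = body            # on 3xx, body carries the new location
            if address in trail:      # redirect loop
                break
            trail.add(address)
            code, body = fetch(address)
            hops += 1
            if hops > max_hops:
                break
        return address, code, body
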
@@ -1169,34 +1174,18 @@ class BaseHTTPTest(Test):
empty_cookie_jar = cookielib.MozillaCookieJar()
BindingSocket.bind_to = refetch_ip
- (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, empty_cookie_jar, self.headers)
+ second_req = http_request(address, empty_cookie_jar, self.headers)
BindingSocket.bind_to = None
# If there was a fatal error, return failure
- if not (code <= 200 < 300) or not content:
- plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
- return (address, False, code, '')
-
- # The context for writing out the files used to make repeated comparisons
- address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
- content_prefix = http_content_dir + address_file
-
- # If the page is different on the second load, then it is probably dynamic and useless to us
- if self.compare(content,content_new,content_prefix,loaded_filetype) != COMPARE_EQUAL:
- return (address, False, code, '')
-
- f = open(content_prefix + '.content', 'w')
- f.write(content)
- f.close()
-
- # Save the cookies in case we want them for a later test
- empty_cookie_jar.save(content_prefix + '.cookies',ignore_discard=True)
+ if not (200 <= second_req.code < 300) or not second_req.content:
+ plog("NOTICE", "Non-tor HTTP error "+str(second_req.code)+" fetching content for "+address)
+ return (address, False, second_req.code, '')
- # Save the response headers in case we want them for a later test
- headerdiffer = HeaderDiffer(resp_headers)
- SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
+ if self.compare(address,filetype,second_req) != COMPARE_EQUAL:
+ return (address, False, second_req.code, '')
- return (address, True, code, loaded_filetype)
+ return (address, True, req.code, loaded_filetype)
def check_http(self, address, filetype, dynamic = False):
''' check whether an HTTP connection to a given address is molested '''
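
The refetch above binds to refetch_ip so the target server sees a second, unrelated client address; BindingSocket is soat's own socket wrapper driven by the module-level bind_to. For illustration only, the standard library gives the same effect per connection via source_address (the source IP below is hypothetical):

    # Sketch: make an outbound connection from a chosen local IP.
    import socket

    def connect_from(host, port, source_ip):
        # source_address picks the local IP; port 0 means any free port
        return socket.create_connection((host, port),
                                        source_address=(source_ip, 0))

    # sock = connect_from("example.com", 80, "10.0.0.2")  # hypothetical IP
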
@@ -1219,8 +1208,8 @@ class BaseHTTPTest(Test):
# CA we should modify our headers for maximum magic
# pfoobar means that foobar was acquired over a _p_roxy
- (pcode, presp_headers, pnew_cookies, pmime_type, pcontent) = torify(http_request, address, my_tor_cookie_jar, self.headers)
- psha1sum = sha(pcontent)
+ preq = torify(http_request, address, my_tor_cookie_jar, self.headers)
+ psha1sum = sha(preq.content)
exit_node = scanhdlr.get_exit_node()
if not exit_node:
@@ -1238,21 +1227,21 @@ class BaseHTTPTest(Test):
exit_node = "$"+exit_node.idhex
# If there is an error loading the page over Tor:
- if not (200 <= pcode < 300) or not pcontent:
+ if not (200 <= preq.code < 300) or not preq.content:
# And if it doesn't have to do with our SOCKS connection:
- if pcode not in SOCKS_ERRS:
- plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+ if preq.code not in SOCKS_ERRS:
+ plog("NOTICE", exit_node+" had error "+str(preq.code)+" fetching content for "+address)
- (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+ direct_req = http_request(address, my_cookie_jar, self.headers)
# If a direct load is failing, remove this target from future consideration
- if (300 <= code_direct < 400):
+ if (300 <= direct_req.code < 400):
self.remove_target(address, INCONCLUSIVE_REDIRECT)
- elif not (200 <= code_direct < 300):
+ elif not (200 <= direct_req.code < 300):
self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
# If Tor and direct are failing for the same reason, Tor is off the hook
- if (code_direct == pcode):
+ if (direct_req.code == preq.code):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOLOCALCONTENT)
if self.rescan_nodes:
@@ -1276,15 +1265,15 @@ class BaseHTTPTest(Test):
E_MISC: (FAILURE_MISCEXCEPTION, self.register_connect_failure, True)
}
- if pcode in err_lookup:
- fail_reason, register, extra_info = err_lookup[pcode]
- elif 300 <= pcode < 400: # Exit node introduced a redirect
- plog("NOTICE", "Tor only HTTP "+str(pcode)+" redirect from "+address+" to "+str(pcontent))
+ if preq.code in err_lookup:
+ fail_reason, register, extra_info = err_lookup[preq.code]
+ elif 300 <= preq.code < 400: # Exit node introduced a redirect
+ plog("NOTICE", "Tor only HTTP "+str(preq.code)+" redirect from "+address+" to "+str(preq.content))
fail_reason = FAILURE_REDIRECT
register = self.register_http_failure
extra_info = True
else: # Exit node introduced some other change
- fail_reason = FAILURE_BADHTTPCODE + str(pcode) #CA don't think this is good
+ fail_reason = FAILURE_BADHTTPCODE + str(preq.code) #CA don't think this is good
register = self.register_exit_failure
extra_info = True
@@ -1298,7 +1287,7 @@ class BaseHTTPTest(Test):
return TEST_FAILURE
# If we have no content, we had a connection error
- if pcontent == "":
+ if not preq.content:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_NOEXITCONTENT)
self.register_exit_failure(result)
@@ -1309,30 +1298,15 @@ class BaseHTTPTest(Test):
# Tor was able to connect, so now it's time to make the comparison
#
- # An address representation acceptable for a filename:
- address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
- content_prefix = http_content_dir + address_file
- failed_prefix = http_failed_dir + address_file
-
- # Load content from disk
- content_file = open(content_prefix+'.content', 'r')
- content = ''.join(content_file.readlines())
- content_file.close()
-
- # If we need to write out the content handed to us by the exit node
- exit_content_file_name = DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content')
-
- # TODO we might want to check headers and cookies
-
# Compare the content
# TODO should we check if mimetype agrees with filetype?
- result = self.compare(pcontent,content,content_prefix,filetype)
+ result = self.compare(address,filetype,preq)
if result == COMPARE_NOEQUAL:
# Reload direct content and try again
- (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+ new_req = http_request(address, my_cookie_jar, self.headers)
# If a new direct load somehow fails, then we're out of luck
- if not (200 <= code_direct < 300):
+ if not (200 <= new_req.code < 300):
plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
result = HttpTestResult(self.node_map[exit_node[1:]],
@@ -1344,14 +1318,14 @@ class BaseHTTPTest(Test):
return TEST_INCONCLUSIVE
# Try our comparison again
- dynamic = self.compare(content_direct,content,content_prefix,filetype)
+ dynamic = self.compare(address,filetype,new_req)
if dynamic == COMPARE_EQUAL:
- # The content has changed, so our exit node is screwing with us.
+ # The content has not actually changed, so our exit node is screwing with us.
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_EXITONLY,
sha1sum.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content", exit_content_file_name)
+ self.address_to_context(address)+".content")
self.register_exit_failure(result)
retval = TEST_FAILURE
else:
@@ -1363,73 +1337,100 @@ class BaseHTTPTest(Test):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_INCONCLUSIVE, INCONCLUSIVE_DYNAMIC,
sha1sum_new.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content", exit_content_file_name,
- content_prefix+'.content-old',
- sha1sum.hexdigest())
+ self.address_to_context(address)+".content")
self.results.append(result)
retval = TEST_INCONCLUSIVE
+
elif result == COMPARE_EQUAL:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_SUCCESS)
self.register_success(result)
return TEST_SUCCESS
elif result == COMPARE_TRUNCATION:
- exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content'), 'w')
- exit_content_file.write(pcontent)
- exit_content_file.close()
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_EXITTRUNCATION,
sha1sum.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content",
- exit_content_file_name)
+ self.address_to_context(address)+".content",
+ self.address_to_failed_prefix(address)+'.'+exit_node[1:]+'.content')
self.register_exit_failure(result)
- return TEST_FAILURE
+ retval = TEST_FAILURE
# If we failed, then store what the exit node handed us
if retval == TEST_FAILURE:
- exit_content_file = open(exit_content_file_name, 'w')
- exit_content_file.write(pcontent)
+ exit_content_file = open(self.address_to_failed_prefix(address)+'.'+exit_node[1:]+'.content', 'w')
+ exit_content_file.write(preq.content)
exit_content_file.close()
return retval
- def compare(self,new_content,old_content,context,filetype):
+ def _address_to_filename(self, address):
+ return DataHandler.safeFilename(re.sub('[a-z]+://','',address))
+
+ def address_to_context(self,address):
+ return http_content_dir + self._address_to_filename(address)
+
+ def address_to_failed_prefix(self, address):
+ return http_failed_dir + self._address_to_filename(address)
+
+ def save_compare_data(self, address, filetype, req):
+ context = self.address_to_context(address)
+
+ f = open(context + '.content', 'w')
+ f.write(req.content)
+ f.close()
+
+ lines = req.content.split('\n')
+
+ hashes = []
+ working_hash = sha()
+ for l in lines:
+ working_hash.update(l)
+ hashes.append(working_hash.hexdigest())
+
+ f = open(context + '.hashes','w')
+ pickle.dump(hashes,f)
+ f.close()
+
+ # Save the response headers in case we want them for a later test
+ headerdiffer = HeaderDiffer(req.headers)
+ SnakePickler.dump(headerdiffer, context + '.headerdiff')
+
+ # Save the new cookies in case we need them for a later test
+ SnakePickler.dump(req.new_cookies,context + '.cookies')
+
+ def compare(self,address,filetype,req):
"""The generic function for comparing webcontent."""
plog('DEBUG', "Beginning Compare")
- new_linelist = new_content.split('\n')
- old_linelist = old_content.split('\n')
+ context = self.address_to_context(address)
+
+ new_linelist = req.content.split('\n')
+ f = open(context + '.content')
+ old_content = f.read()
+ f.close()
+
old_hashes = pickled_content(context,'.hashes')
- if not old_hashes:
- old_hashes = []
- old_hash = sha()
- for l in old_linelist:
- old_hash.update(l)
- old_hashes.append(old_hash.hexdigest())
- f = open(context + '.hashes','w')
- pickle.dump(old_hashes,f)
- f.close()
-
- if len(new_linelist) > len(old_linelist):
+
+ if len(new_linelist) > len(old_hashes):
retval = COMPARE_NOEQUAL
else:
new_hash = sha()
- for i in range(0,min(len(old_linelist),len(new_linelist))):
+ for i in range(0,min(len(old_hashes),len(new_linelist))):
new_hash.update(new_linelist[i])
new_hash = new_hash.hexdigest()
if new_hash != old_hashes[len(new_linelist) - 1]:
retval = COMPARE_NOEQUAL
- elif len(new_linelist) == len(old_linelist):
+ elif len(new_linelist) == len(old_hashes):
retval = COMPARE_EQUAL
else:
retval = COMPARE_TRUNCATION
if retval == COMPARE_NOEQUAL:
try:
- retval = self.compare_funcs[filetype](new_content,old_content,context)
+ retval = self.compare_funcs[filetype](req.content,old_content,context)
except KeyError:
pass
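
The save_compare_data/compare pair is the "better mechanism" of the commit title: save_compare_data records a running SHA-1 over line prefixes, one digest per line of the directly loaded content, and compare can then classify a later fetch as equal, truncated, or changed by hashing only the new content's lines. A self-contained sketch, assuming Python 2 string semantics and hashlib's sha1 standing in for soat's sha:

    # Sketch: prefix hashes allow equal/truncated/changed classification.
    from hashlib import sha1 as sha

    def prefix_hashes(content):
        hashes, h = [], sha()
        for line in content.split('\n'):
            h.update(line)                 # running hash over lines 0..i
            hashes.append(h.hexdigest())
        return hashes

    def classify(new_content, old_hashes):
        new_lines = new_content.split('\n')
        if len(new_lines) > len(old_hashes):
            return "changed"               # new page has extra lines
        h = sha()
        for line in new_lines:
            h.update(line)
        if h.hexdigest() != old_hashes[len(new_lines) - 1]:
            return "changed"               # prefix differs
        return "equal" if len(new_lines) == len(old_hashes) else "truncated"
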
@@ -2056,7 +2057,7 @@ class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest):
def get_targets(self):
ret = []
for targ in self.fixed_targets:
- addr, succ, code, ftype = self.direct_load(targ, False)
+ addr, succ, code, ftype = self.first_load(targ, False)
if succ: ret.append([addr,ftype])
return ret
@@ -2146,11 +2147,12 @@ class SearchBasedTest:
plog("INFO", "Search url: "+search_url)
try:
if self.search_mode["useragent"]:
- (code, resp_headers, new_cookies, mime_type, content) = http_request(search_url, search_cookies)
+ search_req = http_request(search_url, search_cookies)
else:
headers = filter(lambda h: h[0] != "User-Agent",
copy.copy(firefox_headers))
- (code, resp_headers, new_cookies, mime_type, content) = http_request(search_url, search_cookies, headers)
+ search_req = http_request(search_url, search_cookies, headers)
+
except socket.gaierror:
plog('ERROR', 'Scraping of http://'+host+search_path+" failed")
traceback.print_exc()
@@ -2161,10 +2163,12 @@ class SearchBasedTest:
# Bloody hack just to run some tests overnight
break
- if (400 <= code < 500):
+ if (400 <= search_req.code < 500):
- plog('ERROR', 'Scraping of http://'+host+search_path+' failed. HTTP '+str(code))
+ plog('ERROR', 'Scraping of http://'+host+search_path+' failed. HTTP '+str(search_req.code))
break
+ content = search_req.content
+
links = SoupStrainer('a')
try:
soup = TheChosenSoup(content, parseOnlyThese=links)
@@ -2197,7 +2201,7 @@ class SearchBasedTest:
if self._is_useable_url(url, prot_list, file_list):
plog('DEBUG', "Found a useable url: " + url)
- url, success, code, cur_filetype = self.direct_load(url,filetype)
+ url, success, code, cur_filetype = self.first_load(url,filetype)
if not success:
plog('DEBUG',"Url was not useable after all: " + url)
continue