commit b9f5348a00ceae526cbf2d8496ba568869993f85
Author: christian <christian@avtok.com>
Date: Mon Aug 22 16:20:34 2011 -0400
Better mechanism for remembering directly loaded content.
---
NetworkScanners/ExitAuthority/soat.py | 220 +++++++++++++++++----------------
1 file changed, 112 insertions(+), 108 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index dc4409a..e8a17dc 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -358,6 +358,10 @@ class ExitScanHandler(ScanSupport.ScanHandler):
self.__dnshandler = DNSRebindScanner(self, c)
+class Http_Return:
+ def __init__(self, rt):
+ (self.code, self.headers, self.new_cookies, self.mime_type, self.content) = rt
+
# HTTP request handling
def http_request(address, cookie_jar=None, headers=firefox_headers):
''' perform an HTTP GET request and return the content received '''
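
Http_Return simply wraps the positional five-tuple that http_request already builds, so call sites can use named attributes instead of unpacking fields they do not need. A minimal standalone sketch of the idea, with hypothetical reply values:

    class Http_Return:
        def __init__(self, rt):
            (self.code, self.headers, self.new_cookies,
             self.mime_type, self.content) = rt

    req = Http_Return((200, [], [], "text/html", "<html/>"))  # hypothetical values
    assert req.code == 200 and req.mime_type == "text/html"
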
@@ -384,7 +388,7 @@ def http_request(address, cookie_jar=None, headers=firefox_headers):
length = reply.info().get("Content-Length")
if length and int(length) > max_content_size:
plog("WARN", "Max content size exceeded for "+address+": "+length)
- return (reply.code, None, [], "", "")
+ return Http_Return((reply.code, None, [], "", ""))
mime_type = reply.info().type.lower()
reply_headers = HeaderDiffer.filter_headers(reply.info().items())
reply_headers.add(("mime-type", mime_type))
@@ -433,7 +437,7 @@ def http_request(address, cookie_jar=None, headers=firefox_headers):
traceback.print_exc()
rval = (E_MISC, None, [], "", e.__class__.__name__+str(e))
plog("INFO", "Completed HTTP Reqest for: "+address)
- return rval
+ return Http_Return(rval)
# SSL request handling
@@ -1120,7 +1124,7 @@ class BaseHTTPTest(Test):
datahandler.saveResult(result)
return TEST_FAILURE
- def direct_load(self, orig_address, filetype):
+ def first_load(self, orig_address, filetype):
"""Loads a page on a direct connection. The signtuare is:
address (posibly after redirects)
success (T/F)
@@ -1132,36 +1136,37 @@ class BaseHTTPTest(Test):
address = orig_address
# Request the content using a direct connection
- (code, resp_headers, new_cookies, mime_type, content) = http_request(orig_address,self.cookie_jar, self.headers)
+ req = http_request(orig_address,self.cookie_jar, self.headers)
# Make a good faith effort to follow redirects
count = 0
trail = set([])
- while (300 <= code < 400):
- plog("NOTICE", "Non-Tor HTTP "+str(code)+" redirect from "+str(orig_address)+" to "+str(content))
- address = content
+ while (300 <= req.code < 400):
+ plog("NOTICE", "Non-Tor HTTP "+str(req.code)+" redirect from "+str(orig_address)+" to "+str(req.content))
+ address = req.content
if address in trail: break
trail.add(address)
- (code, resp_headers, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
+ req = http_request(address, self.cookie_jar, self.headers)
count += 1
if count > 4: break
# Couldn't get past the redirects
- if (300 <= code < 400):
- return (address,False,code,'')
+ if (300 <= req.code < 400):
+ return (address,False,req.code,'')
# If there was a fatal error, return failure
- if not (200 <= code < 300) or not content:
- plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
- return (address, False, code,'')
+ if not (200 <= req.code < 300) or not req.content:
+ plog("NOTICE", "Non-tor HTTP error "+str(req.code)+" fetching content for "+address)
+ return (address, False, req.code,'')
- loaded_filetype = mime_to_filetype(mime_type)
+ loaded_filetype = mime_to_filetype(req.mime_type)
if filetype and filetype != loaded_filetype:
-
- plog('DEBUG', 'Wrong filetype: ' + filetype + ' ' + loaded_filetype)
- return (address, False, code, '')
+ plog('DEBUG', 'Wrong filetype: ' + loaded_filetype + ' instead of ' + filetype)
+ return (address, False, req.code, '')
+
+ self.save_compare_data(address,filetype,req)
# Fetch again with different cookies and see if we get the same content
# Use a different IP address if possible
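
first_load follows redirects by hand: a 3xx reply from http_request carries the target address in req.content, and the loop stops once it leaves the 3xx range, sees an address already in the trail set (a redirect loop), or exceeds four hops. A minimal sketch of the same loop shape, where fetch() is a hypothetical stand-in for http_request():

    # Sketch: bounded redirect following with loop detection.
    def follow_redirects(address, fetch, max_hops=4):
        trail = set()
        code, body = fetch(address)
        hops = 0
        while 300 <= code < 400:
            address = body            # on 3xx, body carries the new location
            if address in trail:      # redirect loop
                break
            trail.add(address)
            code, body = fetch(address)
            hops += 1
            if hops > max_hops:
                break
        return address, code, body
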
@@ -1169,34 +1174,18 @@ class BaseHTTPTest(Test):
empty_cookie_jar = cookielib.MozillaCookieJar()
BindingSocket.bind_to = refetch_ip
- (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, empty_cookie_jar, self.headers)
+ second_req = http_request(address, empty_cookie_jar, self.headers)
BindingSocket.bind_to = None
# If there was a fatal error, return failure
- if not (code <= 200 < 300) or not content:
- plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
- return (address, False, code, '')
-
- # The context for writing out the files used to make repeated comparisons
- address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
- content_prefix = http_content_dir + address_file
-
- # If the page is different on the second load, then it is probably dynamic and useless to us
- if self.compare(content,content_new,content_prefix,loaded_filetype) != COMPARE_EQUAL:
- return (address, False, code, '')
-
- f = open(content_prefix + '.content', 'w')
- f.write(content)
- f.close()
-
- # Save the cookies in case we want them for a later test
- empty_cookie_jar.save(content_prefix + '.cookies',ignore_discard=True)
+ if not (200 <= second_req.code < 300) or not second_req.content:
+ plog("NOTICE", "Non-tor HTTP error "+str(second_req.code)+" fetching content for "+address)
+ return (address, False, second_req.code, '')
- # Save the response headers in case we want them for a later test
- headerdiffer = HeaderDiffer(resp_headers)
- SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
+ if self.compare(address,filetype,second_req) != COMPARE_EQUAL:
+ return (address, False, second_req.code, '')
- return (address, True, code, loaded_filetype)
+ return (address, True, req.code, loaded_filetype)
def check_http(self, address, filetype, dynamic = False):
''' check whether an HTTP connection to a given address is molested '''
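
The refetch above binds to refetch_ip so the target server sees a second, unrelated client address; BindingSocket is soat's own socket wrapper driven by the module-level bind_to. For illustration only, the standard library gives the same effect per connection via source_address (the source IP below is hypothetical):

    # Sketch: make an outbound connection from a chosen local IP.
    import socket

    def connect_from(host, port, source_ip):
        # source_address picks the local IP; port 0 means any free port
        return socket.create_connection((host, port),
                                        source_address=(source_ip, 0))

    # sock = connect_from("example.com", 80, "10.0.0.2")  # hypothetical IP
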
@@ -1219,8 +1208,8 @@ class BaseHTTPTest(Test):
# CA we should modify our headers for maximum magic
# pfoobar means that foobar was acquired over a _p_roxy
- (pcode, presp_headers, pnew_cookies, pmime_type, pcontent) = torify(http_request, address, my_tor_cookie_jar, self.headers)
- psha1sum = sha(pcontent)
+ preq = torify(http_request, address, my_tor_cookie_jar, self.headers)
+ psha1sum = sha(preq.content)
exit_node = scanhdlr.get_exit_node()
if not exit_node:
@@ -1238,21 +1227,21 @@ class BaseHTTPTest(Test):
exit_node = "$"+exit_node.idhex
# If there is an error loading the page over Tor:
- if not (200 <= pcode < 300) or not pcontent:
+ if not (200 <= preq.code < 300) or not preq.content:
# And if it doesn't have to do with our SOCKS connection:
- if pcode not in SOCKS_ERRS:
- plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+ if preq.code not in SOCKS_ERRS:
+ plog("NOTICE", exit_node+" had error "+str(preq.code)+" fetching content for "+address)
- (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+ direct_req = http_request(address, my_cookie_jar, self.headers)
# If a direct load is failing, remove this target from future consideration
- if (300 <= code_direct < 400):
+ if (300 <= direct_req.code < 400):
self.remove_target(address, INCONCLUSIVE_REDIRECT)
- elif not (200 <= code_direct < 300):
+ elif not (200 <= direct_req.code < 300):
self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
# If Tor and direct are failing for the same reason, Tor is off the hook
- if (code_direct == pcode):
+ if (direct_req.code == preq.code):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOLOCALCONTENT)
if self.rescan_nodes:
@@ -1276,15 +1265,15 @@ class BaseHTTPTest(Test):
E_MISC: (FAILURE_MISCEXCEPTION, self.register_connect_failure, True)
}
- if pcode in err_lookup:
- fail_reason, register, extra_info = err_lookup[pcode]
- elif 300 <= pcode < 400: # Exit node introduced a redirect
- plog("NOTICE", "Tor only HTTP "+str(pcode)+" redirect from "+address+" to "+str(pcontent))
+ if preq.code in err_lookup:
+ fail_reason, register, extra_info = err_lookup[preq.code]
+ elif 300 <= preq.code < 400: # Exit node introduced a redirect
+ plog("NOTICE", "Tor only HTTP "+str(preq.code)+" redirect from "+address+" to "+str(preq.content))
fail_reason = FAILURE_REDIRECT
register = self.register_http_failure
extra_info = True
else: # Exit node introduced some other change
- fail_reason = FAILURE_BADHTTPCODE + str(pcode) #CA don't think this is good
+ fail_reason = FAILURE_BADHTTPCODE + str(preq.code) #CA don't think this is good
register = self.register_exit_failure
extra_info = True
@@ -1298,7 +1287,7 @@ class BaseHTTPTest(Test):
return TEST_FAILURE
# If we have no content, we had a connection error
- if pcontent == "":
+ if not preq.content:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_NOEXITCONTENT)
self.register_exit_failure(result)
@@ -1309,30 +1298,15 @@ class BaseHTTPTest(Test):
# Tor was able to connect, so now it's time to make the comparison
#
- # An address representation acceptable for a filename:
- address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
- content_prefix = http_content_dir + address_file
- failed_prefix = http_failed_dir + address_file
-
- # Load content from disk
- content_file = open(content_prefix+'.content', 'r')
- content = ''.join(content_file.readlines())
- content_file.close()
-
- # If we need to write out the content handed to us by the exit node
- exit_content_file_name = DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content')
-
- # TODO we might want to check headers and cookies
-
# Compare the content
# TODO should we check if mimetype agrees with filetype?
- result = self.compare(pcontent,content,content_prefix,filetype)
+ result = self.compare(address,filetype,preq)
if result == COMPARE_NOEQUAL:
# Reload direct content and try again
- (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+ new_req = http_request(address, my_cookie_jar, self.headers)
# If a new direct load somehow fails, then we're out of luck
- if not (200 <= code_direct < 300):
+ if not (200 <= new_req.code < 300):
plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
result = HttpTestResult(self.node_map[exit_node[1:]],
@@ -1344,14 +1318,14 @@ class BaseHTTPTest(Test):
return TEST_INCONCLUSIVE
# Try our comparison again
- dynamic = self.compare(content_direct,content,content_prefix,filetype)
+ dynamic = self.compare(address,filetype,new_req)
if dynamic == COMPARE_EQUAL:
- # The content has changed, so our exit node is screwing with us.
+ # The content has not actually changed, so our exit node is screwing with us.
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_EXITONLY,
sha1sum.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content", exit_content_file_name)
+ self.address_to_context(address)+".content")
self.register_exit_failure(result)
retval = TEST_FAILURE
else:
@@ -1363,73 +1337,100 @@ class BaseHTTPTest(Test):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_INCONCLUSIVE, INCONCLUSIVE_DYNAMIC,
sha1sum_new.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content", exit_content_file_name,
- content_prefix+'.content-old',
- sha1sum.hexdigest())
+ self.address_to_context(address)+".content")
self.results.append(result)
retval = TEST_INCONCLUSIVE
+
elif result == COMPARE_EQUAL:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_SUCCESS)
self.register_success(result)
return TEST_SUCCESS
elif result == COMPARE_TRUNCATION:
- exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content'), 'w')
- exit_content_file.write(pcontent)
- exit_content_file.close()
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_EXITTRUNCATION,
sha1sum.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content",
- exit_content_file_name)
+ self.address_to_context(address)+".content",
+ self.address_to_failed_prefix(address)+'.'+exit_node[1:]+'.content')
self.register_exit_failure(result)
- return TEST_FAILURE
+ retval = TEST_FAILURE
# If we failed, then store what the exit node handed us
if retval == TEST_FAILURE:
- exit_content_file = open(exit_content_file_name, 'w')
- exit_content_file.write(pcontent)
+ exit_content_file = open(self.address_to_failed_prefix(address)+'.'+exit_node[1:]+'.content', 'w')
+ exit_content_file.write(preq.content)
exit_content_file.close()
return retval
- def compare(self,new_content,old_content,context,filetype):
+ def _address_to_filename(self, address):
+ return DataHandler.safeFilename(re.sub('[a-z]+://','',address))
+
+ def address_to_context(self,address):
+ return http_content_dir + self._address_to_filename(address)
+
+ def address_to_failed_prefix(self, address):
+ return http_failed_dir + self._address_to_filename(address)
+
+ def save_compare_data(self, address, filetype, req):
+ context = self.address_to_context(address)
+
+ f = open(context + '.content', 'w')
+ f.write(req.content)
+ f.close()
+
+ lines = req.content.split('\n')
+
+ hashes = []
+ working_hash = sha()
+ for l in lines:
+ working_hash.update(l)
+ hashes.append(working_hash.hexdigest())
+
+ f = open(context + '.hashes','w')
+ pickle.dump(hashes,f)
+ f.close()
+
+ # Save the response headers in case we want them for a later test
+ headerdiffer = HeaderDiffer(req.headers)
+ SnakePickler.dump(headerdiffer, context + '.headerdiff')
+
+ # Save the new cookies in case we need them for a later test
+ SnakePickler.dump(req.new_cookies,context + '.cookies')
+
+ def compare(self,address,filetype,req):
"""The generic function for comparing webcontent."""
plog('DEBUG', "Beginning Compare")
- new_linelist = new_content.split('\n')
- old_linelist = old_content.split('\n')
+ context = self.address_to_context(address)
+
+ new_linelist = req.content.split('\n')
+ f = open(context + '.content')
+ old_content = f.read()
+ f.close()
+
old_hashes = pickled_content(context,'.hashes')
- if not old_hashes:
- old_hashes = []
- old_hash = sha()
- for l in old_linelist:
- old_hash.update(l)
- old_hashes.append(old_hash.hexdigest())
- f = open(context + '.hashes','w')
- pickle.dump(old_hashes,f)
- f.close()
-
- if len(new_linelist) > len(old_linelist):
+
+ if len(new_linelist) > len(old_hashes):
retval = COMPARE_NOEQUAL
else:
new_hash = sha()
- for i in range(0,min(len(old_linelist),len(new_linelist))):
+ for i in range(0,min(len(old_hashes),len(new_linelist))):
new_hash.update(new_linelist[i])
new_hash = new_hash.hexdigest()
if new_hash != old_hashes[len(new_linelist) - 1]:
retval = COMPARE_NOEQUAL
- elif len(new_linelist) == len(old_linelist):
+ elif len(new_linelist) == len(old_hashes):
retval = COMPARE_EQUAL
else:
retval = COMPARE_TRUNCATION
if retval == COMPARE_NOEQUAL:
try:
- retval = self.compare_funcs[filetype](new_content,old_content,context)
+ retval = self.compare_funcs[filetype](req.content,old_content,context)
except KeyError:
pass
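
The save_compare_data/compare pair is the "better mechanism" of the commit title: save_compare_data records a running SHA-1 over line prefixes, one digest per line of the directly loaded content, and compare can then classify a later fetch as equal, truncated, or changed by hashing only the new content's lines. A self-contained sketch, assuming Python 2 string semantics and hashlib's sha1 standing in for soat's sha:

    # Sketch: prefix hashes allow equal/truncated/changed classification.
    from hashlib import sha1 as sha

    def prefix_hashes(content):
        hashes, h = [], sha()
        for line in content.split('\n'):
            h.update(line)                 # running hash over lines 0..i
            hashes.append(h.hexdigest())
        return hashes

    def classify(new_content, old_hashes):
        new_lines = new_content.split('\n')
        if len(new_lines) > len(old_hashes):
            return "changed"               # new page has extra lines
        h = sha()
        for line in new_lines:
            h.update(line)
        if h.hexdigest() != old_hashes[len(new_lines) - 1]:
            return "changed"               # prefix differs
        return "equal" if len(new_lines) == len(old_hashes) else "truncated"
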
@@ -2056,7 +2057,7 @@ class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest):
def get_targets(self):
ret = []
for targ in self.fixed_targets:
- addr, succ, code, ftype = self.direct_load(targ, False)
+ addr, succ, code, ftype = self.first_load(targ, False)
if succ: ret.append([addr,ftype])
return ret
@@ -2146,11 +2147,12 @@ class SearchBasedTest:
plog("INFO", "Search url: "+search_url)
try:
if self.search_mode["useragent"]:
- (code, resp_headers, new_cookies, mime_type, content) = http_request(search_url, search_cookies)
+ search_req = http_request(search_url, search_cookies)
else:
headers = filter(lambda h: h[0] != "User-Agent",
copy.copy(firefox_headers))
- (code, resp_headers, new_cookies, mime_type, content) = http_request(search_url, search_cookies, headers)
+ search_req = http_request(search_url, search_cookies, headers)
+
except socket.gaierror:
plog('ERROR', 'Scraping of http://'+host+search_path+" failed")
traceback.print_exc()
@@ -2161,10 +2163,12 @@ class SearchBasedTest:
# Bloody hack just to run some tests overnight
break
- if (400 <= code < 500):
+ if (400 <= search_req.code < 500):
- plog('ERROR', 'Scraping of http://'+host+search_path+' failed. HTTP '+str(code))
+ plog('ERROR', 'Scraping of http://'+host+search_path+' failed. HTTP '+str(search_req.code))
break
+ content = search_req.content
+
links = SoupStrainer('a')
try:
soup = TheChosenSoup(content, parseOnlyThese=links)
@@ -2197,7 +2201,7 @@ class SearchBasedTest:
if self._is_useable_url(url, prot_list, file_list):
plog('DEBUG', "Found a useable url: " + url)
- url, success, code, cur_filetype = self.direct_load(url,filetype)
+ url, success, code, cur_filetype = self.first_load(url,filetype)
if not success:
plog('DEBUG',"Url was not useable after all: " + url)
continue