tor-commits
September 2011
- 19 participants
- 865 discussions
commit 6a799c10eed145005b1755c49ea0d9c787df44b7
Author: Roger Dingledine <arma@torproject.org>
Date: Tue Sep 13 22:04:47 2011 -0400
bump to 0.2.3.4-alpha-dev
---
configure.in | 2 +-
contrib/tor-mingw.nsi.in | 2 +-
src/win32/orconfig.h | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/configure.in b/configure.in
index 3659cd7..201eee0 100644
--- a/configure.in
+++ b/configure.in
@@ -4,7 +4,7 @@ dnl Copyright (c) 2007-2008, The Tor Project, Inc.
dnl See LICENSE for licensing information
AC_INIT
-AM_INIT_AUTOMAKE(tor, 0.2.3.4-alpha)
+AM_INIT_AUTOMAKE(tor, 0.2.3.4-alpha-dev)
AM_CONFIG_HEADER(orconfig.h)
AC_CANONICAL_HOST
diff --git a/contrib/tor-mingw.nsi.in b/contrib/tor-mingw.nsi.in
index e6b2585..f57e71d 100644
--- a/contrib/tor-mingw.nsi.in
+++ b/contrib/tor-mingw.nsi.in
@@ -8,7 +8,7 @@
!include "LogicLib.nsh"
!include "FileFunc.nsh"
!insertmacro GetParameters
-!define VERSION "0.2.3.4-alpha"
+!define VERSION "0.2.3.4-alpha-dev"
!define INSTALLER "tor-${VERSION}-win32.exe"
!define WEBSITE "https://www.torproject.org/"
!define LICENSE "LICENSE"
diff --git a/src/win32/orconfig.h b/src/win32/orconfig.h
index e5c195b..f9d8d51 100644
--- a/src/win32/orconfig.h
+++ b/src/win32/orconfig.h
@@ -234,7 +234,7 @@
#define USING_TWOS_COMPLEMENT
/* Version number of package */
-#define VERSION "0.2.3.4-alpha"
+#define VERSION "0.2.3.4-alpha-dev"

[torflow/master] SearchBasedHTTPTest.get_targets duplicated the effort of BaseHTTP.add_target. BaseHTTPTest.remove_target became confused
by mikeperry@torproject.org 14 Sep '11
commit 1248e73fd061257bee960df54b53b6c4d8f7e648
Author: Christian Anderson <christian@avtok.com>
Date: Tue May 24 14:14:44 2011 -0400
SearchBasedHTTPTest.get_targets duplicated the effort of BaseHTTP.add_target. BaseHTTPTest.remove_target became confused
---
NetworkScanners/ExitAuthority/soat.py | 23 +++++++++++++----------
1 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index 97d310b..f2552e2 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -2188,16 +2188,19 @@ class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
plog("NOTICE", self.proto+" scanner short on "+ftype+" targets. Adding more")
map(self.add_target, self.get_search_urls_for_filetype(ftype))
- def get_targets(self):
- raw_urls = self.get_search_urls()
- new = {}
- for url in raw_urls:
- split = url.rsplit('.',1) # Try to get filetype
- if len(split) > 1 and split[-1] in self.scan_filetypes:
- new.setdefault(split[-1],[]).append(url)
- for k,v in new.items():
- self.targets_by_type.setdefault(k, []).extend(v)
- return raw_urls
+# This duplicated the effort of BaseHTTPTest.add_target which is invoked by
+# SearchBasedHTTPTest.rewind -> BaseHTTPTest.rewind = Test.rewind
+# Instead we should fall back on SearchBasedTest.get_targets
+# def get_targets(self):
+# raw_urls = self.get_search_urls()
+# new = {}
+# for url in raw_urls:
+# split = url.rsplit('.',1) # Try to get filetype
+# if len(split) > 1 and split[-1] in self.scan_filetypes:
+# new.setdefault(split[-1],[]).append(url)
+# for k,v in new.items():
+# self.targets_by_type.setdefault(k, []).extend(v)
+# return raw_urls
HTTPTest = SearchBasedHTTPTest # For resuming from old HTTPTest.*.test files
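
The comment block above notes that get_targets() was re-grouping URLs by extension even though BaseHTTPTest.add_target already keys targets that way. A minimal sketch of that keying idea, using made-up URLs and filetypes rather than the actual soat.py code:

scan_filetypes = ['pdf', 'exe']   # illustrative filetypes, not the real config

def key_by_filetype(urls, scan_filetypes):
    # Group URLs by extension, keeping only the filetypes we scan for.
    targets_by_type = {}
    for url in urls:
        split = url.rsplit('.', 1)          # try to read the extension
        if len(split) > 1 and split[-1] in scan_filetypes:
            targets_by_type.setdefault(split[-1], []).append(url)
    return targets_by_type

print(key_by_filetype(['http://example.com/a.pdf',
                       'http://example.com/setup.exe',
                       'http://example.com/index.html'],
                      scan_filetypes))
# -> {'pdf': ['http://example.com/a.pdf'], 'exe': ['http://example.com/setup.exe']}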

[torflow/master] Modernizing search modes. Adding support to soat.py
by mikeperry@torproject.org 14 Sep '11
commit b4be3a63400e658eb26436d97fca766907ecf91c
Author: Christian Anderson <christian@avtok.com>
Date: Tue May 24 12:46:04 2011 -0400
Modernizing search modes. Adding support to soat.py
---
.gitignore | 3 ++
NetworkScanners/ExitAuthority/soat.py | 30 +++++++++++++------------
NetworkScanners/ExitAuthority/soat_config.py | 28 +++++++++++++-----------
3 files changed, 34 insertions(+), 27 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b4e8d7b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+NetworkScanners/ExitAuthority/data/
+NetworkScanners/ExitAuthority/search_cookies.lwp
\ No newline at end of file
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index 162ff0e..97d310b 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -2076,12 +2076,14 @@ class SearchBasedTest:
count = 0
while len(type_urls) < self.results_per_type and count < max_search_retry:
count += 1
+
+ #Try to filter based on filetype/protocol. Unreliable. We will re-filter.
query = random.choice(self.wordlist)
if filetype != 'any':
query += " "+self.search_mode["filetype"]+filetype
plog("WARN", "RESULTPROTOCOL IS:" + self.result_protocol)
- if self.result_protocol != 'any' and self.search_mode["inurl"]:
- query += " "+self.search_mode["inurl"]+self.result_protocol # this isn't too reliable, but we'll re-filter results later
+ if self.result_protocol == 'https' and self.search_mode["inurl"]:
+ query += " " + self.search_mode["inurl"] + "https"
#query += '&num=' + `g_results_per_page`
# search google for relevant pages
@@ -2124,19 +2126,19 @@ class SearchBasedTest:
traceback.print_exc()
print "Content is: "+str(content)
break
+
# get the links and do some additional filtering
+ assert(self.search_mode["class"])
for link in soup.findAll('a'):
- skip = True
- for a in link.attrs:
- if a[0] == "class" and self.search_mode["class"] in a[1]:
- skip = False
- break
- if skip:
- continue
- if link.has_key(self.search_mode['realtgt']):
- url = link[self.search_mode['realtgt']]
- else:
- url = link['href']
+ #Filter based on class of link
+ try:
+ if self.search_mode["class"] != link["class"]:
+ continue
+ except KeyError: continue
+
+ #Get real target
+ url = link[self.search_mode['realtgt']]
+
if self.result_protocol == 'any':
prot_list = None
else:
@@ -2158,7 +2160,7 @@ class SearchBasedTest:
type_urls.add(url)
else:
pass
- plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..")
+ plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..")
return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
diff --git a/NetworkScanners/ExitAuthority/soat_config.py b/NetworkScanners/ExitAuthority/soat_config.py
index 3e13463..39f8165 100644
--- a/NetworkScanners/ExitAuthority/soat_config.py
+++ b/NetworkScanners/ExitAuthority/soat_config.py
@@ -40,8 +40,8 @@ max_content_size = 256*1024
# Bind refetches of docuements to a specific source IP.
# Useful for eliminating false positives that arise
# from IP-based identifiers encoded in content
-#refetch_ip = None
-refetch_ip = "4.4.4.4"
+refetch_ip = None
+#refetch_ip = "4.4.4.4"
# Email settings for email scans.
from_email = "Tor Exit Scanner <noreply(a)torproject.org>"
@@ -134,21 +134,23 @@ search_cookie_file="./search_cookies.lwp"
# Search mode.
# Leave these maps alone. Change the default_search_mode variable
# to what you want.
-# XXX: Make a bing search mode.
-yahoo_search_mode = {"host" : "search.yahoo.com", "query":"p", "filetype": "originurlextension:", \
- "inurl":None, "class":"yschttl", "realtgt":"ourl", "useragent":False, \
- "extra":[]}
-google_search_mode = {"host" : "www.google.com", "query":"q", "filetype":"filetype:", \
- "inurl":"inurl:", "class" : "l", "realtgt":"href", "useragent":True, \
- "extra":[]}
-ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_terms", "filetype":"title:", \
+# XXX: Make a bing search mode and a DuckDuckGo search mode
+
+#Yahoo is no longer supported because they make it difficult to scrape their results
+#yahoo_search_mode = {"host" : "search.yahoo.com/search", "query":"p", "filetype": "vf:", \
+# "inurl":None, "class":"yschttl", "realtgt":"ourl", "useragent":False, \
+# "extra":[]}
+
+google_search_mode = {"host" : "www.google.com/search", "query":"q", "filetype":"filetype:", \
+ "inurl":"inurl:", "class" : "l", "realtgt":"href", "useragent":True, \
+ "extra":[]}
+
+ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_terms", "filetype":"url:.", \
"inurl":"url:", "class" : "title2", "realtgt":"href", "useragent":False, \
"extra":[("prfh","disable_family_filterEEE1N1Nnum_of_resultsEEE50N1Ndisable_video_family_filterEEE1N1N")]}
-# FIXME: This does not affect the ssl search.. Only Google has
-# a working "inurl:" that allows you to pick the scheme to be https
+
#default_search_mode = google_search_mode
-#default_search_mode = yahoo_search_mode
default_search_mode = ixquick_search_mode
# Regex of characters we consider unsafe to write to the filesystem
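
The search_mode dictionaries above drive how a search query is assembled from the wordlist (filetype: and inurl: operators, result link class, and so on). A hedged sketch of the query-building step, using assumed wordlist contents and only the keys shown in the diff:

import random

# Hypothetical inputs; only the "filetype" and "inurl" keys from the
# google_search_mode mapping above are used here.
search_mode = {"filetype": "filetype:", "inurl": "inurl:"}
wordlist = ["tor", "relay", "exit"]

def build_query(search_mode, wordlist, filetype, result_protocol):
    query = random.choice(wordlist)
    if filetype != 'any':
        query += " " + search_mode["filetype"] + filetype
    # Per the change above, only request an https scheme via inurl:.
    if result_protocol == 'https' and search_mode["inurl"]:
        query += " " + search_mode["inurl"] + "https"
    return query

print(build_query(search_mode, wordlist, "pdf", "https"))
# e.g. "relay filetype:pdf inurl:https"
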
commit 5898b393aba321e70c170c70c3d49c0087310906
Author: christian <christian@avtok.com>
Date: Thu Jun 2 00:41:56 2011 -0400
New Targets class
---
NetworkScanners/ExitAuthority/soat.py | 48 +++++++++++++++++++++++++++++++++
1 files changed, 48 insertions(+), 0 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index f2552e2..c96468f 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -520,6 +520,54 @@ def _ssl_request(address, method='TLSv1_METHOD'):
plog("INFO", "SSL Request done for addrress: "+str(address))
return rval
+class Targets:
+ """
+ The class used to store the targets of a Test.
+
+ Supports iteration over all targets and labelling a target with one or more "keys".
+ MUST support these methods:
+ add -- Add a target. Optional second argument is list of keys. Idempotent.
+ remove -- Remove a target. Returns True iff the target was found.
+ bykey -- Get an iterator whose elements match the supplied key.
+ __iter__
+ __len__
+
+ """
+ def __init__(self):
+ self.list = []
+ self.lookup = {}
+ def add(self, target, keys=[]):
+ if not target:
+ return
+ for pos,entry in enumerate(self.list):
+ if entry[0] == target:
+ newkeys = set.difference(set(keys),self.list[pos][1])
+ self.list[pos][1].update(newkeys)
+ break
+ else:
+ newkeys = set(keys)
+ self.list.append((target,newkeys))
+ for key in newkeys:
+ try:
+ self.lookup[key].append(target)
+ except KeyError:
+ self.lookup[key] = [target]
+ def remove(self,target):
+ retval = False
+ for pos,entry in enumerate(self.list):
+ if entry[0] == target:
+ for key in self.list[pos][1]:
+ self.lookup[key].remove(target)
+ self.list.pop(pos)
+ retval = True
+ break
+ return retval
+ def bykey(self,key):
+ return self.lookup.get(key,[])
+ def __iter__(self):
+ return map(lambda x: x[0], self.list).__iter__()
+ def __len__(self):
+ return len(self.list)
# Base Test Classes
class Test:
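
The commit above introduces the Targets container with add/remove/bykey/__iter__/__len__. A small usage sketch, assuming the class is importable from soat.py (the import path and URLs are illustrative only):

# Assumes the Targets class from the diff above is on the path.
from soat import Targets

targets = Targets()
targets.add("http://example.com/a.pdf", ["pdf"])
targets.add("http://example.com/setup.exe", ["exe"])
targets.add("http://example.com/a.pdf", ["pdf"])   # idempotent: no duplicate entry

assert len(targets) == 2
assert list(targets.bykey("pdf")) == ["http://example.com/a.pdf"]
assert targets.remove("http://example.com/setup.exe") is True
assert len(targets) == 1

for target in targets:        # iteration yields the bare targets
    print(target)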

[torflow/master] Incorporating new container for targets. SearchBasedHTTPTest and FixedTargetHTTPTest now seem to run.
by mikeperry@torproject.org 14 Sep '11
commit 0c10b5be192f96e75db45c4a32c48248da8ee513
Author: christian <christian@avtok.com>
Date: Thu Jun 2 15:05:40 2011 -0400
Incorporating new container for targets. SearchBasedHTTPTest and FixedTargetHTTPTest now seem to run.
---
NetworkScanners/ExitAuthority/soat.py | 204 +++++++++++++++-----------------
1 files changed, 96 insertions(+), 108 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index c96468f..8d557eb 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -528,9 +528,10 @@ class Targets:
MUST support these methods:
add -- Add a target. Optional second argument is list of keys. Idempotent.
remove -- Remove a target. Returns True iff the target was found.
- bykey -- Get an iterator whose elements match the supplied key.
+ bykey -- Get a list whose elements match the supplied key.
__iter__
__len__
+ __getitem__
"""
def __init__(self):
@@ -564,15 +565,20 @@ class Targets:
return retval
def bykey(self,key):
return self.lookup.get(key,[])
+ def keys(self):
+ return self.lookup.keys()
def __iter__(self):
return map(lambda x: x[0], self.list).__iter__()
def __len__(self):
return len(self.list)
+ def __getitem__(self,index):
+ return self.list[index]
# Base Test Classes
class Test:
""" Base class for our tests """
def __init__(self, proto, port):
+ """Sets the variables that are static for the lifetime of the test and calls self._reset() which sets the variables that are not."""
self.proto = proto
self.port = port
self.min_targets = min_targets
@@ -585,8 +591,7 @@ class Test:
self.scan_nodes = 0
self.nodes_to_mark = 0
self.tests_per_node = num_tests_per_node
- self.url_reserve = {}
- self._reset()
+ self._reset() #CA make this a call to rewind instead?
self._pickle_revision = 8 # Will increment as fields are added
def run_test(self):
@@ -656,13 +661,19 @@ class Test:
return True
def add_target(self, target):
- self.targets.append(target)
+ self.targets.add(target)
+
+ def select_targets(self):
+ return self.targets
+
+ def refill_targets(self):
+ map(self.add_target, self.get_targets())
+ if not self.targets:
+ raise NoURLsFound("No URLS found for protocol "+self.proto)
def remove_target(self, target, reason="None"):
self.banned_targets.add(target)
- self.refill_targets()
- if target in self.targets:
- self.targets.remove(target)
+ self.targets.remove(target)
if target in self.dynamic_fails:
del self.dynamic_fails[target]
if target in self.successes:
@@ -693,6 +704,8 @@ class Test:
datahandler.saveResult(r)
self.results.remove(r)
+ self.refill_targets()
+
def load_rescan(self, type, since=None):
self.rescan_nodes = set([])
results = datahandler.getAll()
@@ -817,7 +830,7 @@ class Test:
def _reset(self):
self.results = []
- self.targets = []
+ self.targets = Targets()
self.tests_run = 0
self.nodes_marked = 0
self.run_start = time.time()
@@ -827,7 +840,7 @@ class Test:
self.dns_fails_per_exit = {}
self.exit_fails_per_exit = {}
self.node_results = {}
- # These are indexed by site url:
+ # These are indexed by target URI:
self.connect_fails = {}
self.timeout_fails = {}
self.dns_fails = {}
@@ -842,8 +855,8 @@ class Test:
if not self.targets:
raise NoURLsFound("No URLS found for protocol "+self.proto)
- targets = "\n\t".join(self.targets)
- plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets)
+ targets_str = "\n\t".join(map(str,self.targets))
+ plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets_str)
def site_tests(self, site):
tot_cnt = 0
@@ -981,17 +994,16 @@ class Test:
class BaseHTTPTest(Test):
- def __init__(self, filetypes=scan_filetypes):
+ def __init__(self, scan_filetypes=scan_filetypes):
# FIXME: Handle http urls w/ non-80 ports..
- self.scan_filetypes = filetypes
+ self.scan_filetypes = scan_filetypes
+ self.fetch_queue = []
Test.__init__(self, "HTTP", 80)
self.save_name = "HTTPTest"
- self.fetch_targets = urls_per_filetype
def _reset(self):
self.httpcode_fails = {}
self.httpcode_fails_per_exit = {}
- self.targets_by_type = {}
Test._reset(self)
def depickle_upgrade(self):
@@ -1034,15 +1046,13 @@ class BaseHTTPTest(Test):
self.tests_run += 1
- n_tests = random.choice(xrange(1,len(self.targets_by_type)+1))
- filetypes = random.sample(self.targets_by_type.keys(), n_tests)
-
- plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes))
+ self.fetch_queue.extend(self.select_targets())
n_success = n_fail = n_inconclusive = 0
- for ftype in filetypes:
+
+ while self.fetch_queue:
+ address = self.fetch_queue.pop(0)
# FIXME: Set referrer to random or none for each of these
- address = random.choice(self.targets_by_type[ftype])
result = self.check_http(address)
if result == TEST_INCONCLUSIVE:
n_inconclusive += 1
@@ -1062,22 +1072,9 @@ class BaseHTTPTest(Test):
else:
return TEST_SUCCESS
- def add_target(self, target):
- # HTTP Tests keep an additional dictionary of targets keyed by filetype
- split = target.rsplit('.',1)
- if len(split) > 1 and split[-1] in self.scan_filetypes:
- self.targets.append(target)
- self.targets_by_type.setdefault(split[-1], []).append(target)
-
def remove_target(self, target, reason="None"):
# Remove from targets list and targets by type dictionary
- if target in self.targets:
- self.targets.remove(target)
- for k,v in self.targets_by_type.items():
- if target in v:
- v.remove(target)
- if not v:
- del self.targets_by_type[k]
+ self.targets.remove(target)
# Delete results in httpcode_fails
if target in self.httpcode_fails:
del self.httpcode_fails[target]
@@ -1488,18 +1485,10 @@ def is_script_mimetype(mime_type):
return is_script
class BaseHTMLTest(BaseHTTPTest):
- def __init__(self, recurse_filetypes=scan_filetypes):
- BaseHTTPTest.__init__(self, recurse_filetypes)
+ def __init__(self, scan_filetypes=scan_filetypes):
+ BaseHTTPTest.__init__(self, scan_filetypes)
self.save_name = "HTMLTest"
- self.fetch_targets = num_html_urls
- self.proto = "HTML"
- self.recurse_filetypes = recurse_filetypes
- self.fetch_queue = []
-
- def _reset(self):
- self.httpcode_fails = {}
- self.httpcode_fails_per_exit = {}
- Test._reset(self)
+ self.proto = "HTML" #CA .. ?
def depickle_upgrade(self):
if self._pickle_revision < 7:
@@ -1507,11 +1496,9 @@ class BaseHTMLTest(BaseHTTPTest):
Test.depickle_upgrade(self)
def add_target(self, target):
+ """Avoid BaseHTTP.add_target which keys entries"""
Test.add_target(self, target)
- def remove_target(self, target, reason="None"):
- Test.remove_target(self, target, reason)
-
def run_test(self):
# A single test should have a single cookie jar
self.tor_cookie_jar = cookielib.MozillaCookieJar()
@@ -1616,7 +1603,7 @@ class BaseHTMLTest(BaseHTTPTest):
targets.append(("image", urlparse.urljoin(orig_addr, attr_tgt)))
elif t.name == 'a':
if attr_name == "href":
- for f in self.recurse_filetypes:
+ for f in self.scan_filetypes:
if f not in got_type and attr_tgt[-len(f):] == f:
got_type[f] = 1
targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt)))
@@ -2045,14 +2032,16 @@ class FixedTargetTest:
def __init__(self, targets):
self.fixed_targets = targets
- def refill_targets(self):
- pass
-
def get_targets(self):
return self.fixed_targets[:]
+ def refill_targets(self):
+ """Can't refill FixedTargetTest"""
+ pass
+
def finished(self):
- # FixedTargetTests are done if they test all nodes or run out of targets
+ """FixedTargetTests are done if they test all nodes or run out of targets"""
+ # CA do we properly handle possibility that self.targets can run out
return not (self.nodes and self.targets)
class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest):
@@ -2081,17 +2070,12 @@ class SearchBasedTest:
""" Mixin class. Must be mixed with a subclass of Test """
def __init__(self, wordlist_file):
self.wordlist_file = wordlist_file
-
self.host_only = False
- self.result_filetypes = ['any']
- self.result_protocol = 'any'
- self.results_per_type = 10
self.search_mode = default_search_mode
+ self.url_reserve = {}
- def refill_targets(self):
- if len(self.targets) < self.min_targets:
- plog("NOTICE", self.proto+" scanner short on targets. Adding more")
- map(self.add_target, self.get_targets())
+ def rewind(self):
+ self.wordlist = load_wordlist(self.wordlist_file)
def get_targets(self):
return self.get_search_urls()
@@ -2103,26 +2087,23 @@ class SearchBasedTest:
plog('INFO', 'Searching for relevant sites...')
urllist = set([])
- for filetype in self.result_filetypes:
- type_urls = self.get_search_urls_for_filetype(filetype)
- # make sure we don't get more urls than needed
- if len(type_urls) > self.results_per_type:
- chosen_urls = set(random.sample(type_urls, self.results_per_type))
- if filetype in self.url_reserve:
- self.url_reserve[filetype].extend(list(type_urls - chosen_urls))
- else:
- self.url_reserve[filetype] = list(type_urls - chosen_urls)
- type_urls = chosen_urls
- urllist.update(type_urls)
+ for filetype in self.scan_filetypes:
+ urllist.update(self.get_search_urls_for_filetype(filetype))
return list(urllist)
- def get_search_urls_for_filetype(self, filetype):
- type_urls = set(self.url_reserve.get(filetype, []))
- if type_urls: # Clear urls from the reserve
- self.url_reserve[filetype] = []
+ def get_search_urls_for_filetype(self, filetype,number = 0):
+ if not number:
+ number = self.results_per_type
+
+ self.url_reserve.setdefault(filetype,[])
+
+ type_urls = set(self.url_reserve[filetype][:number])
+ self.url_reserve[filetype] = self.url_reserve[filetype][number:]
+
count = 0
- while len(type_urls) < self.results_per_type and count < max_search_retry:
+
+ while len(type_urls) < number and count < max_search_retry:
count += 1
#Try to filter based on filetype/protocol. Unreliable. We will re-filter.
@@ -2194,30 +2175,34 @@ class SearchBasedTest:
if filetype == 'any':
file_list = None
else:
- file_list = self.result_filetypes
+ file_list = self.scan_filetypes
if self._is_useable_url(url, prot_list, file_list):
if self.host_only:
# FIXME: %-encoding, @'s, etc?
plog("INFO", url)
- host = urlparse.urlparse(url)[1]
+ url = urlparse.urlparse(url)[1]
# Have to check again here after parsing the url:
- if host not in self.banned_targets:
- type_urls.add(host)
- else:
- type_urls.add(url)
+ if host in self.banned_targets:
+ continue
+ type_urls.add(url)
+ plog("INFO", "Have "+str(len(type_urls))+"/"+str(number)+" urls from search so far..")
else:
pass
- plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..")
+
+ if len(type_urls) > number:
+ chosen = random.sample(type_urls,number)
+ self.url_reserve[filetype].extend(list(type_urls - set(chosen)))
+ type_urls = chosen
+
return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
def __init__(self, wordlist):
BaseHTTPTest.__init__(self)
SearchBasedTest.__init__(self, wordlist)
- self.result_filetypes = self.scan_filetypes
- self.result_protocol = "http"
- self.results_per_type = self.fetch_targets
+ self.results_per_type = urls_per_filetype
+ self.result_protocol = 'http'
def depickle_upgrade(self):
if self._pickle_revision < 7:
@@ -2227,28 +2212,32 @@ class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
BaseHTTPTest.depickle_upgrade(self)
def rewind(self):
- self.wordlist = load_wordlist(self.wordlist_file)
+ SearchBasedTest.rewind(self)
BaseHTTPTest.rewind(self)
+ def add_target(self, target):
+ # Keys targets by filetype. One filetype per target
+ split = target.rsplit('.',1)
+ if len(split) > 1 and split[-1] in self.scan_filetypes:
+ self.targets.add(target,[split[-1]])
+ return True
+ return False
+
+ def select_targets(self):
+ retval = []
+ n_tests = random.randrange(1,len(self.targets.keys())+1)
+ filetypes = random.sample(self.targets.keys(), n_tests)
+ plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes))
+ for ftype in filetypes:
+ retval.append(random.choice(self.targets.bykey(ftype)))
+ return retval
+
def refill_targets(self):
for ftype in self.scan_filetypes:
- if not ftype in self.targets_by_type or len(self.targets_by_type[ftype]) < self.fetch_targets:
+ targets_needed = self.results_per_type - len(self.targets.bykey(ftype))
+ if targets_needed > 0:
plog("NOTICE", self.proto+" scanner short on "+ftype+" targets. Adding more")
- map(self.add_target, self.get_search_urls_for_filetype(ftype))
-
-# This duplicated the effort of BaseHTTPTest.add_target which is invoked by
-# SearchBasedHTTPTest.rewind -> BaseHTTPTest.rewind = Test.rewind
-# Instead we should fall back on SearchBasedTest.get_targets
-# def get_targets(self):
-# raw_urls = self.get_search_urls()
-# new = {}
-# for url in raw_urls:
-# split = url.rsplit('.',1) # Try to get filetype
-# if len(split) > 1 and split[-1] in self.scan_filetypes:
-# new.setdefault(split[-1],[]).append(url)
-# for k,v in new.items():
-# self.targets_by_type.setdefault(k, []).extend(v)
-# return raw_urls
+ map(self.add_target, self.get_search_urls_for_filetype(ftype,targets_needed))
HTTPTest = SearchBasedHTTPTest # For resuming from old HTTPTest.*.test files
@@ -2257,7 +2246,6 @@ class SearchBasedHTMLTest(SearchBasedTest, BaseHTMLTest):
BaseHTMLTest.__init__(self)
SearchBasedTest.__init__(self, wordlist)
self.result_filetypes = ["any"]
- self.result_protocol = "http"
self.results_per_type = self.fetch_targets
def depickle_upgrade(self):
@@ -2268,7 +2256,7 @@ class SearchBasedHTMLTest(SearchBasedTest, BaseHTMLTest):
BaseHTMLTest.depickle_upgrade(self)
def rewind(self):
- self.wordlist = load_wordlist(self.wordlist_file)
+ SearchBasedTest.rewind(self)
BaseHTMLTest.rewind(self)
HTMLTest = SearchBasedHTMLTest # For resuming from old HTMLTest.*.test files
@@ -2908,7 +2896,7 @@ def decompress_response_data(response):
len_read = len(data)
now = time.time()
- plog("DEBUG", "Read "+str(len_read)+"/"+str(tot_len))
+ #plog("DEBUG", "Read "+str(len_read)+"/"+str(tot_len)) #Very verbose
# Wait 5 seconds before counting data
if (now-start) > 5:
rate = (float(len_read)/(now-start)) #B/s
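
SearchBasedHTTPTest.select_targets in the commit above picks a random number of filetypes and then one random URL per chosen type. A simplified, self-contained sketch of that strategy, where a plain dict stands in for the Targets container's bykey() view and the URLs are made up:

import random

def select_targets(targets_by_key):
    # Decide how many filetypes to test this round, then take one URL per type.
    n_tests = random.randrange(1, len(targets_by_key) + 1)
    filetypes = random.sample(list(targets_by_key), n_tests)
    return [random.choice(targets_by_key[ftype]) for ftype in filetypes]

targets_by_key = {
    "pdf": ["http://example.com/a.pdf", "http://example.com/b.pdf"],
    "doc": ["http://example.com/c.doc"],
}
print(select_targets(targets_by_key))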

[torflow/master] Replacing HTMLTest with customizable HTTPTest.
by mikeperry@torproject.org 14 Sep '11
commit 06987d841646edad453c65d36196c35c7d83e331
Author: christian <christian@avtok.com>
Date: Fri Jul 22 05:04:41 2011 -0400
Replacing HTMLTest with customizable HTTPTest.
---
NetworkScanners/ExitAuthority/libsoat.py | 9 +-
NetworkScanners/ExitAuthority/soat.py | 610 +++++++++++++-------------
NetworkScanners/ExitAuthority/soat_config.py | 6 +-
3 files changed, 325 insertions(+), 300 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/libsoat.py b/NetworkScanners/ExitAuthority/libsoat.py
index 5971c9b..2a86cb4 100644
--- a/NetworkScanners/ExitAuthority/libsoat.py
+++ b/NetworkScanners/ExitAuthority/libsoat.py
@@ -42,6 +42,7 @@ __all__ = [ # Classes
# Functions
"FullyStrainedSoup",
# Constants
+ "COMPARE_EQUAL", "COMPARE_NOEQUAL", "COMPARE_TRUNCATION",
"TEST_SUCCESS", "TEST_INCONCLUSIVE", "TEST_FAILURE",
"RESULT_STRINGS", "RESULT_CODES",
"INCONCLUSIVE_NOLOCALCONTENT", "INCONCLUSIVE_DYNAMICSSL",
@@ -77,6 +78,12 @@ class LoggingJSLexer(JavaScriptLexer):
# constants
+# Compare results
+COMPARE_EQUAL = 0
+COMPARE_NOEQUAL = 1
+COMPARE_TRUNCATION = 2
+
+# Test results
TEST_SUCCESS = 0
TEST_INCONCLUSIVE = 1
TEST_FAILURE = 2
@@ -842,7 +849,7 @@ class SnakePickler:
pass
raise KeyboardInterrupt
except Exception, e:
- plog("WARN", "Exception during pickle dump: "+e)
+ plog("WARN", "Exception during pickle dump: " + str(e))
try:
os.unlink(filename)
except: pass
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index 8d557eb..dc4409a 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -2,6 +2,7 @@
# 2008 Aleksei Gorny, mentored by Mike Perry
# 2009 Mike Perry
+# 2011 Christian Anderson
'''
Snakes on a Tor exit node scanner
@@ -31,6 +32,7 @@ import getopt
import httplib
import mimetypes
import os
+import pickle
import random
import re
import signal
@@ -591,7 +593,7 @@ class Test:
self.scan_nodes = 0
self.nodes_to_mark = 0
self.tests_per_node = num_tests_per_node
- self._reset() #CA make this a call to rewind instead?
+ self._reset()
self._pickle_revision = 8 # Will increment as fields are added
def run_test(self):
@@ -603,7 +605,7 @@ class Test:
# Yes, this is a hack, and yes, it will bias results
# away from the filter, but hey, at least it will still run.
self._pickle_revision = 1
-
+
for addr in self.successes.keys():
if type(self.successes[addr]) == int:
self.successes[addr] = set(xrange(0,self.successes[addr]))
@@ -664,7 +666,10 @@ class Test:
self.targets.add(target)
def select_targets(self):
- return self.targets
+ ret = []
+ for key in self.targets.keys():
+ ret.extend(map(lambda x: (x,key), self.targets.bykey(key)))
+ return ret
def refill_targets(self):
map(self.add_target, self.get_targets())
@@ -830,6 +835,7 @@ class Test:
def _reset(self):
self.results = []
+ # Empty target list for new test
self.targets = Targets()
self.tests_run = 0
self.nodes_marked = 0
@@ -1000,10 +1006,17 @@ class BaseHTTPTest(Test):
self.fetch_queue = []
Test.__init__(self, "HTTP", 80)
self.save_name = "HTTPTest"
+ self.compare_funcs = {'html': self.compare_html, "js": self.compare_js}
def _reset(self):
self.httpcode_fails = {}
self.httpcode_fails_per_exit = {}
+ # Default cookie jar for new test
+ self.tor_cookie_jar = None
+ self.cookie_jar = None
+ # Default headers for new test
+ self.headers = copy.copy(firefox_headers)
+
Test._reset(self)
def depickle_upgrade(self):
@@ -1042,18 +1055,19 @@ class BaseHTTPTest(Test):
# A single test should have a single cookie jar
self.tor_cookie_jar = cookielib.MozillaCookieJar()
self.cookie_jar = cookielib.MozillaCookieJar()
- self.headers = copy.copy(firefox_headers)
self.tests_run += 1
self.fetch_queue.extend(self.select_targets())
+ plog('INFO',str(self.fetch_queue))
+
n_success = n_fail = n_inconclusive = 0
while self.fetch_queue:
- address = self.fetch_queue.pop(0)
+ address, filetype = self.fetch_queue.pop(0)
# FIXME: Set referrer to random or none for each of these
- result = self.check_http(address)
+ result = self.check_http(address,filetype)
if result == TEST_INCONCLUSIVE:
n_inconclusive += 1
if result == TEST_FAILURE:
@@ -1106,144 +1120,148 @@ class BaseHTTPTest(Test):
datahandler.saveResult(result)
return TEST_FAILURE
- def check_http_nodynamic(self, address, nocontent=False):
- # TODO: use nocontent to cause us to not load content into memory.
- # This will require refactoring http_response though.
- ''' check whether a http connection to a given address is molested '''
+ def direct_load(self, orig_address, filetype):
+ """Loads a page on a direct connection. The signtuare is:
+ address (posibly after redirects)
+ success (T/F)
+ code
+ filetype of loaded page (should be null if we failed)"""
- # an address representation acceptable for a filename
- address_file = DataHandler.safeFilename(address.replace('http://',''))
- content_prefix = http_content_dir+address_file
- # Keep a copy of the cookie jar before mods for refetch or
- # to restore on errors that cancel a fetch
- orig_cookie_jar = cookielib.MozillaCookieJar()
- for cookie in self.cookie_jar:
- orig_cookie_jar.set_cookie(cookie)
- orig_tor_cookie_jar = cookielib.MozillaCookieJar()
- for cookie in self.tor_cookie_jar:
- orig_tor_cookie_jar.set_cookie(cookie)
+ # This is the address that this function will return:
+ address = orig_address
- try:
- # Load content from disk, md5
- content_file = open(content_prefix+'.content', 'r')
- sha1sum = sha()
- buf = content_file.read(4096)
- while buf:
- sha1sum.update(buf)
- buf = content_file.read(4096)
- content_file.close()
+ # Reqest the content using a direct connection
+ (code, resp_headers, new_cookies, mime_type, content) = http_request(orig_address,self.cookie_jar, self.headers)
- added_cookie_jar = cookielib.MozillaCookieJar()
- added_cookie_jar.load(content_prefix+'.cookies', ignore_discard=True)
- self.cookie_jar.load(content_prefix+'.cookies', ignore_discard=True)
+ # Make a good faith effort to follow redirects
+ count = 0
+ trail = set([])
+ while (300 <= code < 400):
+ plog("NOTICE", "Non-Tor HTTP "+str(code)+" redirect from "+str(orig_address)+" to "+str(content))
+ address = content
+ if address in trail: break
+ trail.add(address)
+ (code, resp_headers, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
- headerdiffer = SnakePickler.load(content_prefix+'.headerdiff')
+ count += 1
+ if count > 4: break
- content = None
- mime_type = None
+ # Couldn't get past the redirects
+ if (300 <= code < 400):
+ return (address,False,code,'')
- except IOError:
- (code, resp_headers, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
+ # If there was a fatal error, return failure
+ if not (200 <= code < 300) or not content:
+ plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
+ return (address, False, code,'')
- if 300 <= code < 400: # Redirects
- plog("NOTICE", "Non-Tor HTTP "+str(code)+" redirect from "+str(address)+" to "+str(content))
- # Remove the original target and add the redirected location
- self.remove_target(address, INCONCLUSIVE_REDIRECT)
- self.add_target(content)
- # Restore cookie jar
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_cookie_jar
- return TEST_INCONCLUSIVE
+ loaded_filetype = mime_to_filetype(mime_type)
- if code - (code % 100) != 200:
- plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
- # Just remove it
- self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_INCONCLUSIVE
+ if filetype and filetype != loaded_filetype:
+
+ plog('DEBUG', 'Wrong filetype: ' + filetype + ' ' + loaded_filetype)
+ return (address, False, code, '')
- if not content:
- plog("WARN", "Failed to direct load "+address)
- # Just remove it
- self.remove_target(address, INCONCLUSIVE_NOLOCALCONTENT)
- # Restore cookie jar
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_INCONCLUSIVE
- sha1sum = sha(content)
+ # Fetch again with different cookies and see if we get the same content
+ # Use a different IP address if possible
- content_file = open(content_prefix+'.content', 'w')
- content_file.write(content)
- content_file.close()
+ empty_cookie_jar = cookielib.MozillaCookieJar()
- headerdiffer = HeaderDiffer(resp_headers)
- SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
+ BindingSocket.bind_to = refetch_ip
+ (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, empty_cookie_jar, self.headers)
+ BindingSocket.bind_to = None
- # Need to do set subtraction and only save new cookies..
- # or extract/make_cookies
- added_cookie_jar = cookielib.MozillaCookieJar()
- for cookie in new_cookies:
- added_cookie_jar.set_cookie(cookie)
- try:
- added_cookie_jar.save(content_prefix+'.cookies', ignore_discard=True)
- except:
- traceback.print_exc()
- plog("WARN", "Error saving cookies in "+str(added_cookie_jar)+" to "+content_prefix+".cookies")
+ # If there was a fatal error, return failure
+ if not (code <= 200 < 300) or not content:
+ plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
+ return (address, False, code, '')
- except TypeError, e:
- plog('ERROR', 'Failed obtaining the shasum for ' + address)
- plog('ERROR', e)
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_INCONCLUSIVE
+ # The context for writing out the files used to make repeated comparisons
+ address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
+ content_prefix = http_content_dir + address_file
+
+ # If the page is different on the second load, then it is probably dynamic and useless to us
+ if self.compare(content,content_new,content_prefix,loaded_filetype) != COMPARE_EQUAL:
+ return (address, False, code, '')
+
+ f = open(content_prefix + '.content', 'w')
+ f.write(content)
+ f.close()
+
+ # Save the cookies in case we want them for a later test
+ empty_cookie_jar.save(content_prefix + '.cookies',ignore_discard=True)
+
+ # Save the response headers in case we want them for a later test
+ headerdiffer = HeaderDiffer(resp_headers)
+ SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
+
+ return (address, True, code, loaded_filetype)
+
+ def check_http(self, address, filetype, dynamic = False):
+ ''' check whether a http connection to a given address is molested '''
- (pcode, presp_headers, pnew_cookies, pmime_type, pcontent) = torify(http_request, address, self.tor_cookie_jar, self.headers)
+ # The "dynamic" option controls whether we dare grapple with dynamic
+ # pages. Currently only False is supported.
+
+ plog('INFO', 'Conducting an http test with destination ' + address)
+
+ # Keep a copy of the cookie jar before mods for refetch or
+ # to restore on errors that cancel a fetch
+ my_tor_cookie_jar = cookielib.MozillaCookieJar()
+ for cookie in self.tor_cookie_jar:
+ my_tor_cookie_jar.set_cookie(cookie)
+
+ my_cookie_jar = cookielib.MozillaCookieJar()
+ for cookie in self.cookie_jar:
+ my_cookie_jar.set_cookie(cookie)
+
+ # CA we should modify our headers for maximum magic
+
+ # pfoobar means that foobar was acquired over a _p_roxy
+ (pcode, presp_headers, pnew_cookies, pmime_type, pcontent) = torify(http_request, address, my_tor_cookie_jar, self.headers)
psha1sum = sha(pcontent)
exit_node = scanhdlr.get_exit_node()
if not exit_node:
+ # CA: how can this happen?
plog('NOTICE', 'We had no exit node to test, skipping to the next test.')
result = HttpTestResult(None,
address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOEXIT)
if self.rescan_nodes:
+ # CA: we shouldn't need to do this
result.from_rescan = True
self.results.append(result)
-
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
+ # CA: when do we use datahandler?
return TEST_INCONCLUSIVE
exit_node = "$"+exit_node.idhex
- if pcode - (pcode % 100) != 200:
- plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
-
- if pcode not in SOCKS_ERRS: # Do a refetch for non-SOCKS errors only
- # Restore cookie jars
- # XXX: This is odd and possibly wrong for the refetch
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- BindingSocket.bind_to = refetch_ip
- (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_tor_cookie_jar, self.headers)
- BindingSocket.bind_to = None
-
- if code_new == pcode and 300 <= pcode < 400: # Target introduced a redirect
- plog("NOTICE", "Non-Tor HTTP "+str(code_new)+" redirect from "+address+" to "+str(content_new))
- # Remove the original URL and add the redirect to our targets (if it's of the right type)
+
+ # If there is an error loading the page over Tor:
+ if not (200 <= pcode < 300) or not pcontent:
+ # And if it doesn't have to do with our SOCKS connection:
+ if pcode not in SOCKS_ERRS:
+ plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+
+ (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+
+ # If a direct load is failing, remove this target from future consideration
+ if (300 <= code_direct < 400):
self.remove_target(address, INCONCLUSIVE_REDIRECT)
- self.add_target(content_new)
- return TEST_INCONCLUSIVE
- elif code_new == pcode: # Target introduced some other change
- plog("NOTICE", "Non-tor HTTP error "+str(code_new)+" fetching content for "+address)
- # Just remove it
+ elif not (200 <= code_direct < 300):
self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
+
+ # If Tor and direct are failing for the same reason, Tor is off the hook
+ if (code_direct == pcode):
+ result = HttpTestResult(self.node_map[exit_node[1:]],
+ address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOLOCALCONTENT)
+ if self.rescan_nodes:
+ # CA: we shouldn't need to do this
+ result.from_rescan = True
+ self.results.append(result)
return TEST_INCONCLUSIVE
- # Error => behavior lookup table
+ # Error => behavior lookup table
# Error code (Failure reason, Register method, Set extra_info to pcontent?)
err_lookup = \
{E_SOCKS: (FAILURE_CONNERROR, self.register_connect_failure, True), # "General socks error"
@@ -1257,6 +1275,7 @@ class BaseHTTPTest(Test):
E_URL: (FAILURE_URLERROR, self.register_connect_failure, True),
E_MISC: (FAILURE_MISCEXCEPTION, self.register_connect_failure, True)
}
+
if pcode in err_lookup:
fail_reason, register, extra_info = err_lookup[pcode]
elif 300 <= pcode < 400: # Exit node introduced a redirect
@@ -1265,209 +1284,193 @@ class BaseHTTPTest(Test):
register = self.register_http_failure
extra_info = True
else: # Exit node introduced some other change
- fail_reason = FAILURE_BADHTTPCODE+str(pcode)
+ fail_reason = FAILURE_BADHTTPCODE + str(pcode) #CA don't think this is good
register = self.register_exit_failure
extra_info = True
+ # the [1:] gets rid of dollar sign. CA ugly
result = HttpTestResult(self.node_map[exit_node[1:]],
- address, TEST_FAILURE, fail_reason)
+ address, TEST_FAILURE, fail_reason)
if extra_info:
result.extra_info = str(pcontent)
- return register(result)
- # if we have no content, we had a connection error
+ register(result)
+ return TEST_FAILURE
+
+ # If we have no content, we had a connection error
if pcontent == "":
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_NOEXITCONTENT)
self.register_exit_failure(result)
# Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
return TEST_FAILURE
- hdiffs = headerdiffer.show_differences(presp_headers)
- if hdiffs:
- plog("NOTICE", "Header differences for "+address+": \n"+hdiffs)
-
- # compare the content
- # if content matches, everything is ok
- if not hdiffs and psha1sum.hexdigest() == sha1sum.hexdigest():
- result = HttpTestResult(self.node_map[exit_node[1:]],
- address, TEST_SUCCESS)
- self.register_success(result)
- return TEST_SUCCESS
+ #
+ # Tor was able to connect, so now it's time to make the comparison
+ #
+
+ # An address representation acceptable for a filename:
+ address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
+ content_prefix = http_content_dir + address_file
+ failed_prefix = http_failed_dir + address_file
+
+ # Load content from disk
+ content_file = open(content_prefix+'.content', 'r')
+ content = ''.join(content_file.readlines())
+ content_file.close()
+
+ # If we need to write out the content handed to us by the exit node
+ exit_content_file_name = DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content')
+
+ # TODO we might want to check headers and cookies
+
+ # Compare the content
+ # TODO should we check if mimetype agrees with filetype?
+ result = self.compare(pcontent,content,content_prefix,filetype)
+ if result == COMPARE_NOEQUAL:
+ # Reload direct content and try again
+ (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+
+ # If a new direct load somehow fails, then we're out of luck
+ if not (200 <= code_direct < 300):
+ plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
+ self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
+ result = HttpTestResult(self.node_map[exit_node[1:]],
+ address, TEST_INCONCLUSIVE,
+ INCONCLUSIVE_NOLOCALCONTENT)
+ if self.rescan_nodes:
+ result.from_rescan = True
+ self.results.append(result)
+ return TEST_INCONCLUSIVE
- # Check for a simple truncation failure, which seems
- # common with many nodes
- if not content and not nocontent:
- load_file = content_prefix+'.content'
- content_file = open(load_file, 'r')
- content = content_file.read()
- content_file.close()
-
- if content and len(pcontent) < len(content):
- if content[0:len(pcontent)] == pcontent[0:len(pcontent)]:
- failed_prefix = http_failed_dir+address_file
- exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content'), 'w')
- exit_content_file.write(pcontent)
- exit_content_file.close()
+ # Try our comparison again
+ dynamic = self.compare(content_direct,content,content_prefix,filetype)
+
+ if dynamic == COMPARE_EQUAL:
+ # The content has changed, so our exit node is screwing with us.
result = HttpTestResult(self.node_map[exit_node[1:]],
- address, TEST_FAILURE, FAILURE_EXITTRUNCATION,
+ address, TEST_FAILURE, FAILURE_EXITONLY,
sha1sum.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content",
- exit_content_file.name)
+ content_prefix+".content", exit_content_file_name)
self.register_exit_failure(result)
- # Restore cookie jars
- self.cookie_jar = orig_cookie_jar
- self.tor_cookie_jar = orig_tor_cookie_jar
- return TEST_FAILURE
-
- # if content doesnt match, update the direct content and use new cookies
- # If we have alternate IPs to bind to on this box, use them?
- # Sometimes pages have the client IP encoded in them..
- # Also, use the Tor cookies, since those identifiers are
- # probably embeded in the Tor page as well.
- BindingSocket.bind_to = refetch_ip
- (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, orig_tor_cookie_jar, self.headers)
- BindingSocket.bind_to = None
-
- if not content_new:
- plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
- result = HttpTestResult(self.node_map[exit_node[1:]],
- address, TEST_INCONCLUSIVE,
- INCONCLUSIVE_NOLOCALCONTENT)
- if self.rescan_nodes:
- result.from_rescan = True
- self.results.append(result)
- datahandler.saveResult(result)
- return TEST_INCONCLUSIVE
-
- headerdiffer.prune_differences(resp_headers_new)
- hdiffs = headerdiffer.show_differences(presp_headers)
-
- SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
-
- sha1sum_new = sha(content_new)
-
- if sha1sum.hexdigest() != sha1sum_new.hexdigest():
- # if content has changed outside of tor, update the saved file
- os.rename(content_prefix+'.content', content_prefix+'.content-old')
- new_content_file = open(content_prefix+'.content', 'w')
- new_content_file.write(content_new)
- new_content_file.close()
-
- # Need to do set subtraction and only save new cookies..
- # or extract/make_cookies
-
- self.cookie_jar = orig_cookie_jar
- new_cookie_jar = cookielib.MozillaCookieJar()
- for cookie in new_cookies_new:
- new_cookie_jar.set_cookie(cookie)
- self.cookie_jar.set_cookie(cookie) # Update..
- os.rename(content_prefix+'.cookies', content_prefix+'.cookies-old')
- try:
- new_cookie_jar.save(content_prefix+'.cookies', ignore_discard=True)
- except:
- traceback.print_exc()
- plog("WARN", "Error saving cookies in "+str(new_cookie_jar)+" to "+content_prefix+".cookies")
-
- if hdiffs:
- # XXX: We probably should store the header differ + exit headers
- # for later comparison (ie if the header differ picks up more diffs)
- plog("NOTICE", "Post-refetch header changes for "+address+": \n"+hdiffs)
- result = HttpTestResult(self.node_map[exit_node[1:]],
- address, TEST_FAILURE, FAILURE_HEADERCHANGE)
- result.extra_info = hdiffs
- self.register_dynamic_failure(result)
- # Lets let the rest of the tests run too actually
- #return TEST_FAILURE
-
- # compare the node content and the new content
- # if it matches, everything is ok
- if psha1sum.hexdigest() == sha1sum_new.hexdigest():
+ retval = TEST_FAILURE
+ else:
+ # The content is dynamic.
+ # Here's where "no dynamic" comes in.
+ # We reject this target and mark the test inconclusive.
+ plog("WARN", "HTTP Test is removing dynamic URL "+address)
+ self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
+ result = HttpTestResult(self.node_map[exit_node[1:]],
+ address, TEST_INCONCLUSIVE, INCONCLUSIVE_DYNAMIC,
+ sha1sum_new.hexdigest(), psha1sum.hexdigest(),
+ content_prefix+".content", exit_content_file_name,
+ content_prefix+'.content-old',
+ sha1sum.hexdigest())
+ self.results.append(result)
+ retval = TEST_INCONCLUSIVE
+ elif result == COMPARE_EQUAL:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_SUCCESS)
self.register_success(result)
return TEST_SUCCESS
-
- if not content and not nocontent:
- if sha1sum.hexdigest() != sha1sum_new.hexdigest():
- load_file = content_prefix+'.content-old'
- else:
- load_file = content_prefix+'.content'
- content_file = open(load_file, 'r')
- content = content_file.read()
- content_file.close()
-
- if not ((mime_type == mime_type_new or not mime_type) \
- and mime_type_new == pmime_type):
- if not mime_type:
- mime_type = "text/disk"
- plog("WARN", "Mime type change: 1st: "+mime_type+", 2nd: "+mime_type_new+", Tor: "+pmime_type)
- # TODO: If this actually happens, store a result.
- else:
- # Mime types match.. Are they sensible?
- guess = mimetypes.guess_type(address, strict=False)[0]
- if guess and not is_html_mimetype(guess) and is_html_mimetype(str(pmime_type)):
- # We're not expecting html and we got (seemingly dynamic) html content
- # This causes a lot of false positives, let's just remove the target
- plog("NOTICE", "Got HTML content for non-HTML request, removing target "+address)
- self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
- return TEST_INCONCLUSIVE
-
- # Dirty dirty dirty...
- return (mime_type_new, pcontent, psha1sum, content, sha1sum, content_new,
- sha1sum_new, exit_node)
-
- def check_http(self, address):
- plog('INFO', 'Conducting an http test with destination ' + address)
- ret = self.check_http_nodynamic(address)
- if type(ret) == int:
- return ret
- return self._check_http_worker(address, ret)
-
- def _check_http_worker(self, address, http_ret):
- (mime_type,pcontent,psha1sum,content,sha1sum,content_new,sha1sum_new,exit_node) = http_ret
-
- address_file = DataHandler.safeFilename(address.replace('http://',''))
- content_prefix = http_content_dir+address_file
- failed_prefix = http_failed_dir+address_file
-
- # compare the new and old content
- # if they match, means the node has been changing the content
- if sha1sum.hexdigest() == sha1sum_new.hexdigest():
+ elif result == COMPARE_TRUNCATION:
exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content'), 'w')
exit_content_file.write(pcontent)
exit_content_file.close()
-
result = HttpTestResult(self.node_map[exit_node[1:]],
- address, TEST_FAILURE, FAILURE_EXITONLY,
+ address, TEST_FAILURE, FAILURE_EXITTRUNCATION,
sha1sum.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content", exit_content_file.name)
+ content_prefix+".content",
+ exit_content_file_name)
self.register_exit_failure(result)
return TEST_FAILURE
- exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.dyn-content'),'w')
- exit_content_file.write(pcontent)
- exit_content_file.close()
+ # If we failed, then store what the exit node handed us
+ if retval == TEST_FAILURE:
+ exit_content_file = open(exit_content_file_name, 'w')
+ exit_content_file.write(pcontent)
+ exit_content_file.close()
- result = HttpTestResult(self.node_map[exit_node[1:]],
- address, TEST_FAILURE, FAILURE_DYNAMIC,
- sha1sum_new.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content", exit_content_file.name,
- content_prefix+'.content-old',
- sha1sum.hexdigest())
- if self.rescan_nodes:
- result.from_rescan = True
- self.results.append(result)
- datahandler.saveResult(result)
+ return retval
- # The HTTP Test should remove address immediately...
- plog("WARN", "HTTP Test is removing dynamic URL "+address)
- self.remove_target(address, FALSEPOSITIVE_DYNAMIC)
- return TEST_FAILURE
+ def compare(self,new_content,old_content,context,filetype):
+ """The generic function for comparing webcontent."""
+
+ plog('DEBUG', "Beginning Compare")
+
+ new_linelist = new_content.split('\n')
+ old_linelist = old_content.split('\n')
+
+ old_hashes = pickled_content(context,'.hashes')
+ if not old_hashes:
+ old_hashes = []
+ old_hash = sha()
+ for l in old_linelist:
+ old_hash.update(l)
+ old_hashes.append(old_hash.hexdigest())
+ f = open(context + '.hashes','w')
+ pickle.dump(old_hashes,f)
+ f.close()
+
+ if len(new_linelist) > len(old_linelist):
+ retval = COMPARE_NOEQUAL
+ else:
+ new_hash = sha()
+ for i in range(0,min(len(old_linelist),len(new_linelist))):
+ new_hash.update(new_linelist[i])
+ new_hash = new_hash.hexdigest()
+
+ if new_hash != old_hashes[len(new_linelist) - 1]:
+ retval = COMPARE_NOEQUAL
+ elif len(new_linelist) == len(old_linelist):
+ retval = COMPARE_EQUAL
+ else:
+ retval = COMPARE_TRUNCATION
+
+ if retval == COMPARE_NOEQUAL:
+ try:
+ retval = self.compare_funcs[filetype](new_content,old_content,context)
+ except KeyError:
+ pass
+
+ plog('DEBUG', "Compare got the result: " + str(retval))
+
+ return retval
+
+ def compare_js(self,new_content,old_content,context):
+ # TODO check for truncation? Store differ?
+ jsdiff = JSDiffer(old_content)
+ has_changes = jsdiff.contains_differences(new_content)
+ if not has_changes:
+ return COMPARE_EQUAL
+ else:
+ return COMPARE_NOEQUAL
+
+ def compare_html(self,new_content,old_content,context):
+ # TODO check for truncation? Store differ?
+ old_soup = FullyStrainedSoup(old_content.decode('ascii', 'ignore'))
+ new_soup = FullyStrainedSoup(new_content.decode('ascii', 'ignore'))
+ htmldiff = SoupDiffer(old_soup,new_soup)
+ html_has_changes = htmldiff.content_changed
+ # TODO do we need to seperately check JS?
+ if not html_has_changes:
+ return COMPARE_EQUAL
+ else:
+ return COMPARE_NOEQUAL
# TODO move these somewhere sensible
+def pickled_content(context,extension):
+ try:
+ f = open(context + extension, 'r')
+ ret = pickle.load(f)
+ f.close()
+ except IOError:
+ ret = False
+ return ret
+
+def mime_to_filetype(mime_type):
+ return mimetypes.guess_extension(mime_type)[1:]
+
def is_html_mimetype(mime_type):
is_html = False
for type_match in html_mime_types:
@@ -2030,6 +2033,7 @@ class BaseSSLTest(Test):
class FixedTargetTest:
""" Mixin class. Must be mixed with a subclass of Test """
def __init__(self, targets):
+ plog('INFO', "You requested the fixed targets: " + str(targets))
self.fixed_targets = targets
def get_targets(self):
@@ -2041,7 +2045,6 @@ class FixedTargetTest:
def finished(self):
"""FixedTargetTests are done if they test all nodes or run out of targets"""
- # CA do we properly handle possibility that self.targets can run out
return not (self.nodes and self.targets)
class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest):
@@ -2050,6 +2053,16 @@ class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest):
utargets = [t for t in targets if self._is_useable_url(t, ['http'])]
FixedTargetTest.__init__(self, utargets)
+ def get_targets(self):
+ ret = []
+ for targ in self.fixed_targets:
+ addr, succ, code, ftype = self.direct_load(targ, False)
+ if succ: ret.append([addr,ftype])
+ return ret
+
+ def add_target(self, target):
+ self.targets.add(target[0],[target[1]])
+
class FixedTargetHTMLTest(FixedTargetTest, BaseHTMLTest):
def __init__(self, targets):
BaseHTMLTest.__init__(self)
@@ -2077,10 +2090,11 @@ class SearchBasedTest:
def rewind(self):
self.wordlist = load_wordlist(self.wordlist_file)
- def get_targets(self):
- return self.get_search_urls()
+ def add_target(self, target):
+ self.targets.add(target[0],[target[1]])
+ return True
- def get_search_urls(self):
+ def get_targets(self):
'''
construct a list of urls based on the wordlist, filetypes and protocol.
'''
@@ -2088,11 +2102,15 @@ class SearchBasedTest:
urllist = set([])
for filetype in self.scan_filetypes:
- urllist.update(self.get_search_urls_for_filetype(filetype))
+ urllist.update(map(lambda x: (x, filetype), self.get_search_urls_for_filetype(filetype)))
return list(urllist)
- def get_search_urls_for_filetype(self, filetype,number = 0):
+ def get_search_urls_for_filetype(self, filetype, number=0):
+ # CA. I don't want to support 'any' any more. We must specify a filetype
+ assert(filetype != 'any')
+ assert(filetype)
+
if not number:
number = self.results_per_type
@@ -2178,12 +2196,18 @@ class SearchBasedTest:
file_list = self.scan_filetypes
if self._is_useable_url(url, prot_list, file_list):
+ plog('DEBUG', "Found a useable url: " + url)
+ url, success, code, cur_filetype = self.direct_load(url,filetype)
+ if not success:
+ plog('DEBUG',"Url was not useable after all: " + url)
+ continue
if self.host_only:
# FIXME: %-encoding, @'s, etc?
plog("INFO", url)
url = urlparse.urlparse(url)[1]
# Have to check again here after parsing the url:
if host in self.banned_targets:
+ plog('DEBUG',"Url was not useable after all (banned): " + url)
continue
type_urls.add(url)
plog("INFO", "Have "+str(len(type_urls))+"/"+str(number)+" urls from search so far..")
@@ -2195,6 +2219,8 @@ class SearchBasedTest:
self.url_reserve[filetype].extend(list(type_urls - set(chosen)))
type_urls = chosen
+ plog("INFO","Got urls for filetype!")
+
return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
@@ -2215,21 +2241,13 @@ class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
SearchBasedTest.rewind(self)
BaseHTTPTest.rewind(self)
- def add_target(self, target):
- # Keys targets by filetype. One filetype per target
- split = target.rsplit('.',1)
- if len(split) > 1 and split[-1] in self.scan_filetypes:
- self.targets.add(target,[split[-1]])
- return True
- return False
-
def select_targets(self):
retval = []
n_tests = random.randrange(1,len(self.targets.keys())+1)
filetypes = random.sample(self.targets.keys(), n_tests)
plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes))
for ftype in filetypes:
- retval.append(random.choice(self.targets.bykey(ftype)))
+ retval.append((random.choice(self.targets.bykey(ftype)),ftype))
return retval
def refill_targets(self):
diff --git a/NetworkScanners/ExitAuthority/soat_config.py b/NetworkScanners/ExitAuthority/soat_config.py
index 39f8165..99cd4ff 100644
--- a/NetworkScanners/ExitAuthority/soat_config.py
+++ b/NetworkScanners/ExitAuthority/soat_config.py
@@ -28,7 +28,7 @@ num_html_urls = 10
max_search_retry = 3
# Hrmm.. Too many of these and Google really h8s us..
-scan_filetypes = ['pdf','exe']
+scan_filetypes = ['pdf','doc','html']
# Urls to scan for each filetype
urls_per_filetype = 2
@@ -150,8 +150,8 @@ ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_ter
"extra":[("prfh","disable_family_filterEEE1N1Nnum_of_resultsEEE50N1Ndisable_video_family_filterEEE1N1N")]}
-#default_search_mode = google_search_mode
-default_search_mode = ixquick_search_mode
+default_search_mode = google_search_mode
+#default_search_mode = ixquick_search_mode
# Regex of characters we consider unsafe to write to the filesystem
unsafe_filechars = "[^a-zA-Z0-9-\.+]"
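The target bookkeeping in the hunks above now passes (url, filetype) pairs around and keys stored targets by filetype, which is what lets select_targets sample one URL per type via targets.bykey(ftype). Below is a minimal sketch of such a keyed container; the class name TargetMap and its internals are illustrative assumptions — only the add/keys/bykey call pattern comes from the diff.

import random

class TargetMap:
    """Illustrative container keyed by filetype (not the actual soat.py class)."""
    def __init__(self):
        self._by_key = {}                      # filetype -> list of urls
    def add(self, url, keys):
        for key in keys:
            self._by_key.setdefault(key, []).append(url)
    def keys(self):
        return list(self._by_key.keys())
    def bykey(self, key):
        return self._by_key.get(key, [])

targets = TargetMap()
targets.add("http://example.com/a.pdf", ["pdf"])
targets.add("http://example.com/b.doc", ["doc"])

# select_targets-style sampling: one random url per chosen filetype
for ftype in random.sample(targets.keys(), 2):
    print(ftype, random.choice(targets.bykey(ftype)))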
[torflow/master] Better mechanism for remembering directly loaded content.
by mikeperry@torproject.org 14 Sep '11
commit b9f5348a00ceae526cbf2d8496ba568869993f85
Author: christian <christian(a)avtok.com>
Date: Mon Aug 22 16:20:34 2011 -0400
Better mechanism for remembering directly loaded content.
---
NetworkScanners/ExitAuthority/soat.py | 220 +++++++++++++++++----------------
1 files changed, 112 insertions(+), 108 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index dc4409a..e8a17dc 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -358,6 +358,10 @@ class ExitScanHandler(ScanSupport.ScanHandler):
self.__dnshandler = DNSRebindScanner(self, c)
+class Http_Return:
+ def __init__(self, rt):
+ (self.code, self.headers, self.new_cookies, self.mime_type, self.content) = rt
+
# HTTP request handling
def http_request(address, cookie_jar=None, headers=firefox_headers):
''' perform a http GET-request and return the content received '''
@@ -384,7 +388,7 @@ def http_request(address, cookie_jar=None, headers=firefox_headers):
length = reply.info().get("Content-Length")
if length and int(length) > max_content_size:
plog("WARN", "Max content size exceeded for "+address+": "+length)
- return (reply.code, None, [], "", "")
+ return Http_Return((reply.code, None, [], "", ""))
mime_type = reply.info().type.lower()
reply_headers = HeaderDiffer.filter_headers(reply.info().items())
reply_headers.add(("mime-type", mime_type))
@@ -433,7 +437,7 @@ def http_request(address, cookie_jar=None, headers=firefox_headers):
traceback.print_exc()
rval = (E_MISC, None, [], "", e.__class__.__name__+str(e))
plog("INFO", "Completed HTTP Reqest for: "+address)
- return rval
+ return Http_Return(rval)
# SSL request handling
@@ -1120,7 +1124,7 @@ class BaseHTTPTest(Test):
datahandler.saveResult(result)
return TEST_FAILURE
- def direct_load(self, orig_address, filetype):
+ def first_load(self, orig_address, filetype):
"""Loads a page on a direct connection. The signtuare is:
address (posibly after redirects)
success (T/F)
@@ -1132,36 +1136,37 @@ class BaseHTTPTest(Test):
address = orig_address
# Reqest the content using a direct connection
- (code, resp_headers, new_cookies, mime_type, content) = http_request(orig_address,self.cookie_jar, self.headers)
+ req = http_request(orig_address,self.cookie_jar, self.headers)
# Make a good faith effort to follow redirects
count = 0
trail = set([])
- while (300 <= code < 400):
- plog("NOTICE", "Non-Tor HTTP "+str(code)+" redirect from "+str(orig_address)+" to "+str(content))
- address = content
+ while (300 <= req.code < 400):
+ plog("NOTICE", "Non-Tor HTTP "+str(req.code)+" redirect from "+str(orig_address)+" to "+str(req.content))
+ address = req.content
if address in trail: break
trail.add(address)
- (code, resp_headers, new_cookies, mime_type, content) = http_request(address, self.cookie_jar, self.headers)
+ req = http_request(address, self.cookie_jar, self.headers)
count += 1
if count > 4: break
# Couldn't get past the redirects
- if (300 <= code < 400):
- return (address,False,code,'')
+ if (300 <= req.code < 400):
+ return (address,False,req.code,'')
# If there was a fatal error, return failure
- if not (200 <= code < 300) or not content:
- plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
- return (address, False, code,'')
+ if not (200 <= req.code < 300) or not req.content:
+ plog("NOTICE", "Non-tor HTTP error "+str(req.code)+" fetching content for "+address)
+ return (address, False, req.code,'')
- loaded_filetype = mime_to_filetype(mime_type)
+ loaded_filetype = mime_to_filetype(req.mime_type)
if filetype and filetype != loaded_filetype:
-
- plog('DEBUG', 'Wrong filetype: ' + filetype + ' ' + loaded_filetype)
- return (address, False, code, '')
+ plog('DEBUG', 'Wrong filetype: ' + loaded_filetype + ' instead of ' + filetype)
+ return (address, False, req.code, '')
+
+ self.save_compare_data(address,filetype,req)
# Fetch again with different cookies and see if we get the same content
# Use a different IP address if possible
@@ -1169,34 +1174,18 @@ class BaseHTTPTest(Test):
empty_cookie_jar = cookielib.MozillaCookieJar()
BindingSocket.bind_to = refetch_ip
- (code_new, resp_headers_new, new_cookies_new, mime_type_new, content_new) = http_request(address, empty_cookie_jar, self.headers)
+ second_req = http_request(address, empty_cookie_jar, self.headers)
BindingSocket.bind_to = None
# If there was a fatal error, return failure
- if not (code <= 200 < 300) or not content:
- plog("NOTICE", "Non-tor HTTP error "+str(code)+" fetching content for "+address)
- return (address, False, code, '')
-
- # The context for writing out the files used to make repeated comparisons
- address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
- content_prefix = http_content_dir + address_file
-
- # If the page is different on the second load, then it is probably dynamic and useless to us
- if self.compare(content,content_new,content_prefix,loaded_filetype) != COMPARE_EQUAL:
- return (address, False, code, '')
-
- f = open(content_prefix + '.content', 'w')
- f.write(content)
- f.close()
-
- # Save the cookies in case we want them for a later test
- empty_cookie_jar.save(content_prefix + '.cookies',ignore_discard=True)
+ if not (second_req.code <= 200 < 300) or not second_req.content:
+ plog("NOTICE", "Non-tor HTTP error "+str(second_req.code)+" fetching content for "+address)
+ return (address, False, second_req.code, '')
- # Save the response headers in case we want them for a later test
- headerdiffer = HeaderDiffer(resp_headers)
- SnakePickler.dump(headerdiffer, content_prefix+'.headerdiff')
+ if self.compare(address,filetype,second_req) != COMPARE_EQUAL:
+ return (address, False, second_req.code, '')
- return (address, True, code, loaded_filetype)
+ return (address, True, req.code, loaded_filetype)
def check_http(self, address, filetype, dynamic = False):
''' check whether a http connection to a given address is molested '''
@@ -1219,8 +1208,8 @@ class BaseHTTPTest(Test):
# CA we should modify our headers for maximum magic
# pfoobar means that foobar was acquired over a _p_roxy
- (pcode, presp_headers, pnew_cookies, pmime_type, pcontent) = torify(http_request, address, my_tor_cookie_jar, self.headers)
- psha1sum = sha(pcontent)
+ preq = torify(http_request, address, my_tor_cookie_jar, self.headers)
+ psha1sum = sha(preq.content)
exit_node = scanhdlr.get_exit_node()
if not exit_node:
@@ -1238,21 +1227,21 @@ class BaseHTTPTest(Test):
exit_node = "$"+exit_node.idhex
# If there is an error loading the page over Tor:
- if not (200 <= pcode < 300) or not pcontent:
+ if not (200 <= preq.code < 300) or not preq.content:
# And if it doesn't have to do with our SOCKS connection:
- if pcode not in SOCKS_ERRS:
- plog("NOTICE", exit_node+" had error "+str(pcode)+" fetching content for "+address)
+ if preq.code not in SOCKS_ERRS:
+ plog("NOTICE", exit_node+" had error "+str(preq.code)+" fetching content for "+address)
- (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+ direct_req = http_request(address, my_cookie_jar, self.headers)
# If a direct load is failing, remove this target from future consideration
- if (300 <= code_direct < 400):
+ if (300 <= direct_req.code < 400):
self.remove_target(address, INCONCLUSIVE_REDIRECT)
- elif not (200 <= code_direct < 300):
+ elif not (200 <= direct_req.code < 300):
self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
# If Tor and direct are failing for the same reason, Tor is off the hook
- if (code_direct == pcode):
+ if (direct_req.code == preq.code):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOLOCALCONTENT)
if self.rescan_nodes:
@@ -1276,15 +1265,15 @@ class BaseHTTPTest(Test):
E_MISC: (FAILURE_MISCEXCEPTION, self.register_connect_failure, True)
}
- if pcode in err_lookup:
- fail_reason, register, extra_info = err_lookup[pcode]
- elif 300 <= pcode < 400: # Exit node introduced a redirect
- plog("NOTICE", "Tor only HTTP "+str(pcode)+" redirect from "+address+" to "+str(pcontent))
+ if preq.code in err_lookup:
+ fail_reason, register, extra_info = err_lookup[preq.code]
+ elif 300 <= preq.code < 400: # Exit node introduced a redirect
+ plog("NOTICE", "Tor only HTTP "+str(preq.code)+" redirect from "+address+" to "+str(preq.content))
fail_reason = FAILURE_REDIRECT
register = self.register_http_failure
extra_info = True
else: # Exit node introduced some other change
- fail_reason = FAILURE_BADHTTPCODE + str(pcode) #CA don't think this is good
+ fail_reason = FAILURE_BADHTTPCODE + str(preq.code) #CA don't think this is good
register = self.register_exit_failure
extra_info = True
@@ -1298,7 +1287,7 @@ class BaseHTTPTest(Test):
return TEST_FAILURE
# If we have no content, we had a connection error
- if pcontent == "":
+ if not preq.content:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_NOEXITCONTENT)
self.register_exit_failure(result)
@@ -1309,30 +1298,15 @@ class BaseHTTPTest(Test):
# Tor was able to connect, so now it's time to make the comparison
#
- # An address representation acceptable for a filename:
- address_file = DataHandler.safeFilename(re.sub('[a-z]+://','',address))
- content_prefix = http_content_dir + address_file
- failed_prefix = http_failed_dir + address_file
-
- # Load content from disk
- content_file = open(content_prefix+'.content', 'r')
- content = ''.join(content_file.readlines())
- content_file.close()
-
- # If we need to write out the content handed to us by the exit node
- exit_content_file_name = DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content')
-
- # TODO we might want to check headers and cookies
-
# Compare the content
# TODO should we check if mimetype agrees with filetype?
- result = self.compare(pcontent,content,content_prefix,filetype)
+ result = self.compare(address,filetype,preq)
if result == COMPARE_NOEQUAL:
# Reload direct content and try again
- (code_direct, resp_headers_direct, direct_cookies_direct, mime_type_direct, content_direct) = http_request(address, my_cookie_jar, self.headers)
+ new_req = http_request(address, my_cookie_jar, self.headers)
# If a new direct load somehow fails, then we're out of luck
- if not (200 <= code_direct < 300):
+ if not (200 <= new_req.code < 300):
plog("WARN", "Failed to re-frech "+address+" outside of Tor. Did our network fail?")
self.remove_target(address, FALSEPOSITIVE_HTTPERRORS)
result = HttpTestResult(self.node_map[exit_node[1:]],
@@ -1344,14 +1318,14 @@ class BaseHTTPTest(Test):
return TEST_INCONCLUSIVE
# Try our comparison again
- dynamic = self.compare(content_direct,content,content_prefix,filetype)
+ dynamic = self.compare(address,filetype,new_req)
if dynamic == COMPARE_EQUAL:
- # The content has changed, so our exit node is screwing with us.
+ # The content has not actually changed, so our exit node is screwing with us.
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_EXITONLY,
sha1sum.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content", exit_content_file_name)
+ address_to_context(address)+".content")
self.register_exit_failure(result)
retval = TEST_FAILURE
else:
@@ -1363,73 +1337,100 @@ class BaseHTTPTest(Test):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_INCONCLUSIVE, INCONCLUSIVE_DYNAMIC,
sha1sum_new.hexdigest(), psha1sum.hexdigest(),
- content_prefix+".content", exit_content_file_name,
- content_prefix+'.content-old',
- sha1sum.hexdigest())
+ address_to_context(address)+".content")
self.results.append(result)
retval = TEST_INCONCLUSIVE
+
elif result == COMPARE_EQUAL:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_SUCCESS)
self.register_success(result)
return TEST_SUCCESS
elif result == COMPARE_TRUNCATION:
- exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.content'), 'w')
- exit_content_file.write(pcontent)
- exit_content_file.close()
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_EXITTRUNCATION,
sha1sum.hexdigest(), psha1sum.hexdigest(),
content_prefix+".content",
- exit_content_file_name)
+ exit_content_file)
self.register_exit_failure(result)
- return TEST_FAILURE
+ retval = TEST_FAILURE
# If we failed, then store what the exit node handed us
if retval == TEST_FAILURE:
- exit_content_file = open(exit_content_file_name, 'w')
- exit_content_file.write(pcontent)
+ exit_content_file = open(address_to_failed_prefix(address)+'.'+exit_node[1:]+'.content', 'w')
+ exit_content_file.write(preq.contet)
exit_content_file.close()
return retval
- def compare(self,new_content,old_content,context,filetype):
+ def _address_to_filename(self, address):
+ return DataHandler.safeFilename(re.sub('[a-z]+://','',address))
+
+ def address_to_context(self,address):
+ return http_content_dir + self._address_to_filename(address)
+
+ def address_to_failed_prefix(self, address):
+ return http_failed_dir + self._address_to_filename(address)
+
+ def save_compare_data(self, address, filetype, req):
+ context = self. address_to_context(address)
+
+ f = open(context + '.content', 'w')
+ f.write(req.content)
+ f.close()
+
+ lines = req.content.split('\n')
+
+ hashes = []
+ working_hash = sha()
+ for l in lines:
+ working_hash.update(l)
+ hashes.append(working_hash.hexdigest())
+
+ f = open(context + '.hashes','w')
+ pickle.dump(hashes,f)
+ f.close()
+
+ # Save the response headers in case we want them for a later test
+ headerdiffer = HeaderDiffer(req.headers)
+ SnakePickler.dump(headerdiffer, context + '.headerdiff')
+
+ # Save the new cookies in case we need them for a later test
+ SnakePickler.dump(req.new_cookies,context + '.cookies')
+
+ def compare(self,address,filetype,req):
"""The generic function for comparing webcontent."""
plog('DEBUG', "Beginning Compare")
- new_linelist = new_content.split('\n')
- old_linelist = old_content.split('\n')
+ context = self. address_to_context(address)
+
+ new_linelist = req.content.split('\n')
+ f = open(context + '.content')
+ old_content = f.read()
+ f.close()
+
old_hashes = pickled_content(context,'.hashes')
- if not old_hashes:
- old_hashes = []
- old_hash = sha()
- for l in old_linelist:
- old_hash.update(l)
- old_hashes.append(old_hash.hexdigest())
- f = open(context + '.hashes','w')
- pickle.dump(old_hashes,f)
- f.close()
-
- if len(new_linelist) > len(old_linelist):
+
+ if len(new_linelist) > len(old_hashes):
retval = COMPARE_NOEQUAL
else:
new_hash = sha()
- for i in range(0,min(len(old_linelist),len(new_linelist))):
+ for i in range(0,min(len(old_hashes),len(new_linelist))):
new_hash.update(new_linelist[i])
new_hash = new_hash.hexdigest()
if new_hash != old_hashes[len(new_linelist) - 1]:
retval = COMPARE_NOEQUAL
- elif len(new_linelist) == len(old_linelist):
+ elif len(new_linelist) == len(old_hashes):
retval = COMPARE_EQUAL
else:
retval = COMPARE_TRUNCATION
if retval == COMPARE_NOEQUAL:
try:
- retval = self.compare_funcs[filetype](new_content,old_content,context)
+ retval = self.compare_funcs[filetype](req.content,old_content,context)
except KeyError:
pass
@@ -2056,7 +2057,7 @@ class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest):
def get_targets(self):
ret = []
for targ in self.fixed_targets:
- addr, succ, code, ftype = self.direct_load(targ, False)
+ addr, succ, code, ftype = self.first_load(targ, False)
if succ: ret.append([addr,ftype])
return ret
@@ -2146,11 +2147,12 @@ class SearchBasedTest:
plog("INFO", "Search url: "+search_url)
try:
if self.search_mode["useragent"]:
- (code, resp_headers, new_cookies, mime_type, content) = http_request(search_url, search_cookies)
+ search_req = http_request(search_url, search_cookies)
else:
headers = filter(lambda h: h[0] != "User-Agent",
copy.copy(firefox_headers))
- (code, resp_headers, new_cookies, mime_type, content) = http_request(search_url, search_cookies, headers)
+ search_req = http_request(search_url, search_cookies, headers)
+
except socket.gaierror:
plog('ERROR', 'Scraping of http://'+host+search_path+" failed")
traceback.print_exc()
@@ -2161,10 +2163,12 @@ class SearchBasedTest:
# Bloody hack just to run some tests overnight
break
- if (400 <= code < 500):
+ if (400 <= search_req.code < 500):
plog('ERROR', 'Scraping of http://'+host+search_path+' failed. HTTP '+str(code))
break
+ content = search_req.content
+
links = SoupStrainer('a')
try:
soup = TheChosenSoup(content, parseOnlyThese=links)
@@ -2197,7 +2201,7 @@ class SearchBasedTest:
if self._is_useable_url(url, prot_list, file_list):
plog('DEBUG', "Found a useable url: " + url)
- url, success, code, cur_filetype = self.direct_load(url,filetype)
+ url, success, code, cur_filetype = self.first_load(url,filetype)
if not success:
plog('DEBUG',"Url was not useable after all: " + url)
continue
[torflow/master] Better registering test results for HttpTest
by mikeperry@torproject.org 14 Sep '11
commit a8a11da002d9ff867f3441b8c0114a9d57b22a5c
Author: christian <christian(a)avtok.com>
Date: Mon Aug 22 17:52:26 2011 -0400
Better registering test results for HttpTest
---
NetworkScanners/ExitAuthority/soat.py | 51 +++++++++++++--------------------
1 files changed, 20 insertions(+), 31 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index e8a17dc..373dc00 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -1002,6 +1002,12 @@ class Test:
datahandler.saveResult(result)
return TEST_FAILURE
+ def register_inconclusive(self, result):
+ if self.rescan_nodes:
+ result.from_rescan = True
+ self.results.append(result)
+ datahandler.saveResult(result)
+ return TEST_INCONCLUSIVE
class BaseHTTPTest(Test):
def __init__(self, scan_filetypes=scan_filetypes):
@@ -1205,7 +1211,7 @@ class BaseHTTPTest(Test):
for cookie in self.cookie_jar:
my_cookie_jar.set_cookie(cookie)
- # CA we should modify our headers for maximum magic
+ # CA we should modify our headers so we look like a browser
# pfoobar means that foobar was acquired over a _p_roxy
preq = torify(http_request, address, my_tor_cookie_jar, self.headers)
@@ -1217,12 +1223,7 @@ class BaseHTTPTest(Test):
plog('NOTICE', 'We had no exit node to test, skipping to the next test.')
result = HttpTestResult(None,
address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOEXIT)
- if self.rescan_nodes:
- # CA: we shouldn't need to do this
- result.from_rescan = True
- self.results.append(result)
- # CA: when do we use datahandler?
- return TEST_INCONCLUSIVE
+ return self.register_inconclusive(result)
exit_node = "$"+exit_node.idhex
@@ -1244,11 +1245,7 @@ class BaseHTTPTest(Test):
if (direct_req.code == preq.code):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_INCONCLUSIVE, INCONCLUSIVE_NOLOCALCONTENT)
- if self.rescan_nodes:
- # CA: we shouldn't need to do this
- result.from_rescan = True
- self.results.append(result)
- return TEST_INCONCLUSIVE
+ return self.register_inconclusive(result)
# Error => behavior lookup table
# Error code (Failure reason, Register method, Set extra_info to pcontent?)
@@ -1283,23 +1280,20 @@ class BaseHTTPTest(Test):
if extra_info:
result.extra_info = str(pcontent)
- register(result)
- return TEST_FAILURE
+ return register(result)
# If we have no content, we had a connection error
if not preq.content:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_NOEXITCONTENT)
- self.register_exit_failure(result)
- # Restore cookie jars
- return TEST_FAILURE
+ return self.register_exit_failure(result)
#
# Tor was able to connect, so now it's time to make the comparison
#
# Compare the content
- # TODO should we check if mimetype agrees with filetype?
+
result = self.compare(address,filetype,preq)
if result == COMPARE_NOEQUAL:
# Reload direct content and try again
@@ -1312,10 +1306,7 @@ class BaseHTTPTest(Test):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_INCONCLUSIVE,
INCONCLUSIVE_NOLOCALCONTENT)
- if self.rescan_nodes:
- result.from_rescan = True
- self.results.append(result)
- return TEST_INCONCLUSIVE
+ return self.register_inconclusive(result)
# Try our comparison again
dynamic = self.compare(address,filetype,new_req)
@@ -1326,8 +1317,8 @@ class BaseHTTPTest(Test):
address, TEST_FAILURE, FAILURE_EXITONLY,
sha1sum.hexdigest(), psha1sum.hexdigest(),
address_to_context(address)+".content")
- self.register_exit_failure(result)
- retval = TEST_FAILURE
+ retval = self.register_exit_failure(result)
+
else:
# The content is dynamic.
# Here's where "no dynamic" comes in.
@@ -1338,22 +1329,20 @@ class BaseHTTPTest(Test):
address, TEST_INCONCLUSIVE, INCONCLUSIVE_DYNAMIC,
sha1sum_new.hexdigest(), psha1sum.hexdigest(),
address_to_context(address)+".content")
- self.results.append(result)
- retval = TEST_INCONCLUSIVE
+ retval = self.register_inconclusive(result)
elif result == COMPARE_EQUAL:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_SUCCESS)
- self.register_success(result)
- return TEST_SUCCESS
+ retval = self.register_success(result)
+
elif result == COMPARE_TRUNCATION:
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, FAILURE_EXITTRUNCATION,
sha1sum.hexdigest(), psha1sum.hexdigest(),
content_prefix+".content",
exit_content_file)
- self.register_exit_failure(result)
- retval = TEST_FAILURE
+ retval = self.register_exit_failure(result)
# If we failed, then store what the exit node handed us
if retval == TEST_FAILURE:
@@ -2108,7 +2097,7 @@ class SearchBasedTest:
return list(urllist)
def get_search_urls_for_filetype(self, filetype, number=0):
- # CA. I don't want to support 'any' any more. We must specify a filetype
+ # We don't want to support 'any' any more. We must specify a filetype
assert(filetype != 'any')
assert(filetype)
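The point of register_inconclusive (and of having the other register_* helpers return a status) is that call sites can record a result and return its verdict in one statement, instead of repeating the from_rescan / append / saveResult boilerplate. Below is a stripped-down sketch of that pattern; the Result and DemoTest classes are stand-ins, not the real soat.py classes, and save_result stands in for datahandler.saveResult.

TEST_SUCCESS, TEST_INCONCLUSIVE, TEST_FAILURE = range(3)

class Result(object):
    def __init__(self, status, reason=None):
        self.status = status
        self.reason = reason
        self.from_rescan = False

class DemoTest(object):
    def __init__(self, rescan_nodes=False):
        self.rescan_nodes = rescan_nodes
        self.results = []
    def save_result(self, result):
        pass                                   # stand-in for datahandler.saveResult(result)
    def register_inconclusive(self, result):
        if self.rescan_nodes:
            result.from_rescan = True
        self.results.append(result)
        self.save_result(result)
        return TEST_INCONCLUSIVE
    def check_something(self):
        # record and return the verdict in one statement, as check_http now does
        return self.register_inconclusive(Result(TEST_INCONCLUSIVE, "no exit node"))

print(DemoTest().check_something())            # 1 == TEST_INCONCLUSIVE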
14 Sep '11
commit 67b4547a1edbdbbef283ac35709c40808619840f
Merge: 5736529 f3fefed
Author: Mike Perry <mikeperry-git(a)fscked.org>
Date: Tue Sep 13 18:43:53 2011 -0700
Merge remote branch 'canderson/master'
.gitignore | 3 +
NetworkScanners/ExitAuthority/libsoat.py | 9 +-
NetworkScanners/ExitAuthority/soat.py | 1308 ++++++++++----------------
NetworkScanners/ExitAuthority/soat_config.py | 34 +-
4 files changed, 515 insertions(+), 839 deletions(-)
[torflow/master] Incorporating ssl test into new infrastructure
by mikeperry@torproject.org 14 Sep '11
commit 5ae4e364f809169c19add390e64053bd834e216e
Author: christian <christian(a)avtok.com>
Date: Tue Aug 30 14:03:28 2011 -0400
Incorporating ssl test into new infrastructure
---
NetworkScanners/ExitAuthority/soat.py | 142 ++++++++++++++++----------------
1 files changed, 71 insertions(+), 71 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index 373dc00..4633dad 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -32,7 +32,6 @@ import getopt
import httplib
import mimetypes
import os
-import pickle
import random
import re
import signal
@@ -447,7 +446,7 @@ def ssl_request(address):
try:
return _ssl_request(address)
except (ReadTimeout, socket.timeout), e:
- plog("INFO", "SSL Request done with timoeut for addrress: "+str(address))
+ plog("INFO", "SSL Request done with timoeut for address: "+str(address))
return (E_TIMEOUT, None, "Socket timeout")
def _ssl_request(address, method='TLSv1_METHOD'):
@@ -639,7 +638,7 @@ class Test:
self.url_reserve = {}
self._pickle_revision = 8
- def _is_useable_url(self, url, valid_schemes=None, filetypes=None):
+ def _is_useable_url(self, url, valid_schemes=None, filetype=None):
(scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
if netloc.rfind(":") != -1:
# FIXME: %-encoding?
@@ -658,12 +657,10 @@ class Test:
if url in self.banned_targets:
plog("DEBUG", "Banned url "+url)
return False
- if filetypes: # Must be checked last
- for filetype in filetypes:
- if url[-len(filetype):] == filetype:
- return True
- plog("DEBUG", "Bad filetype for "+url)
- return False
+ if filetype:
+ if url[-len(filetype):] != filetype:
+ plog("DEBUG", "Bad filetype for "+url)
+ return False
return True
def add_target(self, target):
@@ -1010,9 +1007,8 @@ class Test:
return TEST_INCONCLUSIVE
class BaseHTTPTest(Test):
- def __init__(self, scan_filetypes=scan_filetypes):
+ def __init__(self):
# FIXME: Handle http urls w/ non-80 ports..
- self.scan_filetypes = scan_filetypes
self.fetch_queue = []
Test.__init__(self, "HTTP", 80)
self.save_name = "HTTPTest"
@@ -1172,7 +1168,7 @@ class BaseHTTPTest(Test):
plog('DEBUG', 'Wrong filetype: ' + loaded_filetype + ' instead of ' + filetype)
return (address, False, req.code, '')
- self.save_compare_data(address,filetype,req)
+ self.save_compare_data(address,loaded_filetype,req)
# Fetch again with different cookies and see if we get the same content
# Use a different IP address if possible
@@ -1188,7 +1184,7 @@ class BaseHTTPTest(Test):
plog("NOTICE", "Non-tor HTTP error "+str(second_req.code)+" fetching content for "+address)
return (address, False, second_req.code, '')
- if self.compare(address,filetype,second_req) != COMPARE_EQUAL:
+ if self.compare(address,loaded_filetype,second_req) != COMPARE_EQUAL:
return (address, False, second_req.code, '')
return (address, True, req.code, loaded_filetype)
@@ -1278,7 +1274,7 @@ class BaseHTTPTest(Test):
result = HttpTestResult(self.node_map[exit_node[1:]],
address, TEST_FAILURE, fail_reason)
if extra_info:
- result.extra_info = str(pcontent)
+ result.extra_info = str(preq.content)
return register(result)
@@ -1376,9 +1372,8 @@ class BaseHTTPTest(Test):
working_hash.update(l)
hashes.append(working_hash.hexdigest())
- f = open(context + '.hashes','w')
- pickle.dump(hashes,f)
- f.close()
+ # Save these line-by-line hashes for later use
+ SnakePickler.dump(hashes, context + '.hashes')
# Save the response headers in case we want them for a later test
headerdiffer = HeaderDiffer(req.headers)
@@ -1400,7 +1395,8 @@ class BaseHTTPTest(Test):
old_content = f.read()
f.close()
- old_hashes = pickled_content(context,'.hashes')
+
+ old_hashes = SnakePickler.load(context + '.hashes')
if len(new_linelist) > len(old_hashes):
retval = COMPARE_NOEQUAL
@@ -1449,15 +1445,6 @@ class BaseHTTPTest(Test):
return COMPARE_NOEQUAL
# TODO move these somewhere sensible
-def pickled_content(context,extension):
- try:
- f = open(context + extension, 'r')
- ret = pickle.load(f)
- f.close()
- except IOError:
- ret = False
- return ret
-
def mime_to_filetype(mime_type):
return mimetypes.guess_extension(mime_type)[1:]
@@ -1478,8 +1465,8 @@ def is_script_mimetype(mime_type):
return is_script
class BaseHTMLTest(BaseHTTPTest):
- def __init__(self, scan_filetypes=scan_filetypes):
- BaseHTTPTest.__init__(self, scan_filetypes)
+ def __init__(self):
+ BaseHTTPTest.__init__(self)
self.save_name = "HTMLTest"
self.proto = "HTML" #CA .. ?
@@ -1833,7 +1820,7 @@ class BaseSSLTest(Test):
def run_test(self):
self.tests_run += 1
- return self.check_ssl(random.choice(self.targets))
+ return self.check_ssl(random.choice(self.targets)[0])
def get_resolved_ip(self, hostname):
# XXX: This is some extreme GIL abuse.. It may have race conditions
@@ -2043,11 +2030,12 @@ class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest):
utargets = [t for t in targets if self._is_useable_url(t, ['http'])]
FixedTargetTest.__init__(self, utargets)
- def get_targets(self):
+ def get_targets(self):
ret = []
for targ in self.fixed_targets:
addr, succ, code, ftype = self.first_load(targ, False)
- if succ: ret.append([addr,ftype])
+ if succ:
+ ret.append([addr,ftype])
return ret
def add_target(self, target):
@@ -2065,6 +2053,7 @@ class FixedTargetHTMLTest(FixedTargetTest, BaseHTMLTest):
class FixedTargetSSLTest(FixedTargetTest, BaseSSLTest):
def __init__(self, targets):
BaseSSLTest.__init__(self)
+ # We ask for hostnames only, please
utargets = [t for t in targets if self._is_useable_url(t, [''])]
FixedTargetTest.__init__(self, utargets)
@@ -2080,30 +2069,7 @@ class SearchBasedTest:
def rewind(self):
self.wordlist = load_wordlist(self.wordlist_file)
- def add_target(self, target):
- self.targets.add(target[0],[target[1]])
- return True
-
- def get_targets(self):
- '''
- construct a list of urls based on the wordlist, filetypes and protocol.
- '''
- plog('INFO', 'Searching for relevant sites...')
-
- urllist = set([])
- for filetype in self.scan_filetypes:
- urllist.update(map(lambda x: (x, filetype), self.get_search_urls_for_filetype(filetype)))
-
- return list(urllist)
-
- def get_search_urls_for_filetype(self, filetype, number=0):
- # We don't want to support 'any' any more. We must specify a filetype
- assert(filetype != 'any')
- assert(filetype)
-
- if not number:
- number = self.results_per_type
-
+ def get_search_urls_for_filetype(self, filetype, number):
self.url_reserve.setdefault(filetype,[])
type_urls = set(self.url_reserve[filetype][:number])
@@ -2116,7 +2082,7 @@ class SearchBasedTest:
#Try to filter based on filetype/protocol. Unreliable. We will re-filter.
query = random.choice(self.wordlist)
- if filetype != 'any':
+ if filetype:
query += " "+self.search_mode["filetype"]+filetype
plog("WARN", "RESULTPROTOCOL IS:" + self.result_protocol)
if self.result_protocol == 'https' and self.search_mode["inurl"]:
@@ -2183,17 +2149,18 @@ class SearchBasedTest:
prot_list = None
else:
prot_list = [self.result_protocol]
- if filetype == 'any':
- file_list = None
- else:
- file_list = self.scan_filetypes
- if self._is_useable_url(url, prot_list, file_list):
+ if self._is_useable_url(url, prot_list, filetype):
+ try:
+ self.first_load
+ except AttributeError:
+ pass
+ else:
+ url, success, code, cur_filetype = self.first_load(url,filetype)
+ if not success:
+ continue
+
plog('DEBUG', "Found a useable url: " + url)
- url, success, code, cur_filetype = self.first_load(url,filetype)
- if not success:
- plog('DEBUG',"Url was not useable after all: " + url)
- continue
if self.host_only:
# FIXME: %-encoding, @'s, etc?
plog("INFO", url)
@@ -2217,9 +2184,10 @@ class SearchBasedTest:
return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
- def __init__(self, wordlist):
+ def __init__(self, wordlist, scan_filetypes=scan_filetypes):
BaseHTTPTest.__init__(self)
SearchBasedTest.__init__(self, wordlist)
+ self.scan_filetypes = scan_filetypes
self.results_per_type = urls_per_filetype
self.result_protocol = 'http'
@@ -2250,6 +2218,22 @@ class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
plog("NOTICE", self.proto+" scanner short on "+ftype+" targets. Adding more")
map(self.add_target, self.get_search_urls_for_filetype(ftype,targets_needed))
+ def add_target(self, target):
+ self.targets.add(target[0],[target[1]])
+ return True
+
+ def get_targets(self):
+ '''
+ construct a list of urls based on the wordlist, filetypes and protocol.
+ '''
+ plog('INFO', 'Searching for relevant sites...')
+
+ urllist = set([])
+ for filetype in self.scan_filetypes:
+ urllist.update(map(lambda x: (x, filetype), self.get_search_urls_for_filetype(filetype, self.results_per_type)))
+
+ return list(urllist)
+
HTTPTest = SearchBasedHTTPTest # For resuming from old HTTPTest.*.test files
class SearchBasedHTMLTest(SearchBasedTest, BaseHTMLTest):
@@ -2257,7 +2241,6 @@ class SearchBasedHTMLTest(SearchBasedTest, BaseHTMLTest):
BaseHTMLTest.__init__(self)
SearchBasedTest.__init__(self, wordlist)
self.result_filetypes = ["any"]
- self.results_per_type = self.fetch_targets
def depickle_upgrade(self):
if self._pickle_revision < 7:
@@ -2278,8 +2261,11 @@ class SearchBasedSSLTest(SearchBasedTest, BaseSSLTest):
SearchBasedTest.__init__(self, wordlist)
self.host_only = True
self.result_protocol = 'https'
- if default_search_mode == yahoo_search_mode:
- plog('WARN', 'Yahoo search mode is not suitable for SSLTests. Continuing anyway.')
+ try:
+ if default_search_mode == yahoo_search_mode:
+ plog('WARN', 'Yahoo search mode is not suitable for SSLTests. Continuing anyway.')
+ except NameError:
+ pass
self.search_mode=default_search_mode
def depickle_upgrade(self):
@@ -2289,8 +2275,19 @@ class SearchBasedSSLTest(SearchBasedTest, BaseSSLTest):
self.search_mode=google_search_mode
BaseSSLTest.depickle_upgrade(self)
+ def get_targets(self):
+ '''
+ construct a list of urls based on the wordlist, filetypes and protocol.
+ '''
+ plog('INFO', 'Searching for relevant sites...')
+
+ urllist = set([])
+ urllist.update(self.get_search_urls_for_filetype(None, num_ssl_hosts))
+
+ return list(urllist)
+
def rewind(self):
- self.wordlist = load_wordlist(self.wordlist_file)
+ SearchBasedTest.rewind(self)
BaseSSLTest.rewind(self)
SSLTest = SearchBasedSSLTest # For resuming from old SSLTest.*.test files
@@ -3207,6 +3204,9 @@ def main(argv):
common_nodes &= test.nodes
scanhdlr._sanity_check(map(lambda id: test.node_map[id],
test.nodes))
+ print "COMMON NODES"
+ print common_nodes
+
if common_nodes is None:
common_nodes = set([])
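This last set of hunks narrows _is_useable_url to a single optional filetype, matched as a plain URL suffix, while scheme filtering stays list-based; passing [''] as the scheme list, as FixedTargetSSLTest does, accepts bare hostnames. A standalone sketch of that filter follows — is_useable_url here is an illustrative rewrite under those assumptions, not the method itself.

try:
    from urlparse import urlparse              # Python 2, as in soat.py
except ImportError:
    from urllib.parse import urlparse          # Python 3

def is_useable_url(url, valid_schemes=None, filetype=None, banned=()):
    # Illustrative rewrite of the scheme/suffix filtering in _is_useable_url
    scheme = urlparse(url)[0]
    if valid_schemes and scheme not in valid_schemes:
        return False                           # [''] accepts bare hostnames only
    if url in banned:
        return False
    if filetype and not url.endswith(filetype):
        return False                           # suffix match, e.g. 'pdf'
    return True

print(is_useable_url("http://example.com/paper.pdf", ["http"], "pdf"))   # True
print(is_useable_url("example.com", [""]))                               # True
print(is_useable_url("https://example.com/x.exe", ["http"], "pdf"))      # False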