commit 0c10b5be192f96e75db45c4a32c48248da8ee513 Author: christian christian@avtok.com Date: Thu Jun 2 15:05:40 2011 -0400
Incorporating new container for targets. SearchBasedHTTPTest and FixedTargetHTTPTest now seem to run. --- NetworkScanners/ExitAuthority/soat.py | 204 +++++++++++++++----------------- 1 files changed, 96 insertions(+), 108 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py index c96468f..8d557eb 100755 --- a/NetworkScanners/ExitAuthority/soat.py +++ b/NetworkScanners/ExitAuthority/soat.py @@ -528,9 +528,10 @@ class Targets: MUST support these methods: add -- Add a target. Optional second argument is list of keys. Idempotent. remove -- Remove a target. Returns True iff the target was found. - bykey -- Get an iterator whose elements match the supplied key. + bykey -- Get a list whose elements match the supplied key. __iter__ __len__ + __getitem__
""" def __init__(self): @@ -564,15 +565,20 @@ class Targets: return retval def bykey(self,key): return self.lookup.get(key,[]) + def keys(self): + return self.lookup.keys() def __iter__(self): return map(lambda x: x[0], self.list).__iter__() def __len__(self): return len(self.list) + def __getitem__(self,index): + return self.list[index]
# Base Test Classes class Test: """ Base class for our tests """ def __init__(self, proto, port): + """Sets the variables that are static for the lifetime of the test and calls self._reset() which sets the variables that are not.""" self.proto = proto self.port = port self.min_targets = min_targets @@ -585,8 +591,7 @@ class Test: self.scan_nodes = 0 self.nodes_to_mark = 0 self.tests_per_node = num_tests_per_node - self.url_reserve = {} - self._reset() + self._reset() #CA make this a call to rewind instead? self._pickle_revision = 8 # Will increment as fields are added
def run_test(self): @@ -656,13 +661,19 @@ class Test: return True
def add_target(self, target): - self.targets.append(target) + self.targets.add(target) + + def select_targets(self): + return self.targets + + def refill_targets(self): + map(self.add_target, self.get_targets()) + if not self.targets: + raise NoURLsFound("No URLS found for protocol "+self.proto)
def remove_target(self, target, reason="None"): self.banned_targets.add(target) - self.refill_targets() - if target in self.targets: - self.targets.remove(target) + self.targets.remove(target) if target in self.dynamic_fails: del self.dynamic_fails[target] if target in self.successes: @@ -693,6 +704,8 @@ class Test: datahandler.saveResult(r) self.results.remove(r)
+ self.refill_targets() + def load_rescan(self, type, since=None): self.rescan_nodes = set([]) results = datahandler.getAll() @@ -817,7 +830,7 @@ class Test:
def _reset(self): self.results = [] - self.targets = [] + self.targets = Targets() self.tests_run = 0 self.nodes_marked = 0 self.run_start = time.time() @@ -827,7 +840,7 @@ class Test: self.dns_fails_per_exit = {} self.exit_fails_per_exit = {} self.node_results = {} - # These are indexed by site url: + # These are indexed by target URI: self.connect_fails = {} self.timeout_fails = {} self.dns_fails = {} @@ -842,8 +855,8 @@ class Test: if not self.targets: raise NoURLsFound("No URLS found for protocol "+self.proto)
- targets = "\n\t".join(self.targets) - plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets) + targets_str = "\n\t".join(map(str,self.targets)) + plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets_str)
def site_tests(self, site): tot_cnt = 0 @@ -981,17 +994,16 @@ class Test:
class BaseHTTPTest(Test): - def __init__(self, filetypes=scan_filetypes): + def __init__(self, scan_filetypes=scan_filetypes): # FIXME: Handle http urls w/ non-80 ports.. - self.scan_filetypes = filetypes + self.scan_filetypes = scan_filetypes + self.fetch_queue = [] Test.__init__(self, "HTTP", 80) self.save_name = "HTTPTest" - self.fetch_targets = urls_per_filetype
def _reset(self): self.httpcode_fails = {} self.httpcode_fails_per_exit = {} - self.targets_by_type = {} Test._reset(self)
def depickle_upgrade(self): @@ -1034,15 +1046,13 @@ class BaseHTTPTest(Test):
self.tests_run += 1
- n_tests = random.choice(xrange(1,len(self.targets_by_type)+1)) - filetypes = random.sample(self.targets_by_type.keys(), n_tests) - - plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes)) + self.fetch_queue.extend(self.select_targets())
n_success = n_fail = n_inconclusive = 0 - for ftype in filetypes: + + while self.fetch_queue: + address = self.fetch_queue.pop(0) # FIXME: Set referrer to random or none for each of these - address = random.choice(self.targets_by_type[ftype]) result = self.check_http(address) if result == TEST_INCONCLUSIVE: n_inconclusive += 1 @@ -1062,22 +1072,9 @@ class BaseHTTPTest(Test): else: return TEST_SUCCESS
- def add_target(self, target): - # HTTP Tests keep an additional dictionary of targets keyed by filetype - split = target.rsplit('.',1) - if len(split) > 1 and split[-1] in self.scan_filetypes: - self.targets.append(target) - self.targets_by_type.setdefault(split[-1], []).append(target) - def remove_target(self, target, reason="None"): # Remove from targets list and targets by type dictionary - if target in self.targets: - self.targets.remove(target) - for k,v in self.targets_by_type.items(): - if target in v: - v.remove(target) - if not v: - del self.targets_by_type[k] + self.targets.remove(target) # Delete results in httpcode_fails if target in self.httpcode_fails: del self.httpcode_fails[target] @@ -1488,18 +1485,10 @@ def is_script_mimetype(mime_type): return is_script
class BaseHTMLTest(BaseHTTPTest): - def __init__(self, recurse_filetypes=scan_filetypes): - BaseHTTPTest.__init__(self, recurse_filetypes) + def __init__(self, scan_filetypes=scan_filetypes): + BaseHTTPTest.__init__(self, scan_filetypes) self.save_name = "HTMLTest" - self.fetch_targets = num_html_urls - self.proto = "HTML" - self.recurse_filetypes = recurse_filetypes - self.fetch_queue = [] - - def _reset(self): - self.httpcode_fails = {} - self.httpcode_fails_per_exit = {} - Test._reset(self) + self.proto = "HTML" #CA .. ?
def depickle_upgrade(self): if self._pickle_revision < 7: @@ -1507,11 +1496,9 @@ class BaseHTMLTest(BaseHTTPTest): Test.depickle_upgrade(self)
def add_target(self, target): + """Avoid BaseHTTP.add_target which keys entries""" Test.add_target(self, target)
- def remove_target(self, target, reason="None"): - Test.remove_target(self, target, reason) - def run_test(self): # A single test should have a single cookie jar self.tor_cookie_jar = cookielib.MozillaCookieJar() @@ -1616,7 +1603,7 @@ class BaseHTMLTest(BaseHTTPTest): targets.append(("image", urlparse.urljoin(orig_addr, attr_tgt))) elif t.name == 'a': if attr_name == "href": - for f in self.recurse_filetypes: + for f in self.scan_filetypes: if f not in got_type and attr_tgt[-len(f):] == f: got_type[f] = 1 targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt))) @@ -2045,14 +2032,16 @@ class FixedTargetTest: def __init__(self, targets): self.fixed_targets = targets
- def refill_targets(self): - pass - def get_targets(self): return self.fixed_targets[:]
+ def refill_targets(self): + """Can't refill FixedTargetTest""" + pass + def finished(self): - # FixedTargetTests are done if they test all nodes or run out of targets + """FixedTargetTests are done if they test all nodes or run out of targets""" + # CA do we properly handle possibility that self.targets can run out return not (self.nodes and self.targets)
class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest): @@ -2081,17 +2070,12 @@ class SearchBasedTest: """ Mixin class. Must be mixed with a subclass of Test """ def __init__(self, wordlist_file): self.wordlist_file = wordlist_file - self.host_only = False - self.result_filetypes = ['any'] - self.result_protocol = 'any' - self.results_per_type = 10 self.search_mode = default_search_mode + self.url_reserve = {}
- def refill_targets(self): - if len(self.targets) < self.min_targets: - plog("NOTICE", self.proto+" scanner short on targets. Adding more") - map(self.add_target, self.get_targets()) + def rewind(self): + self.wordlist = load_wordlist(self.wordlist_file)
def get_targets(self): return self.get_search_urls() @@ -2103,26 +2087,23 @@ class SearchBasedTest: plog('INFO', 'Searching for relevant sites...')
urllist = set([]) - for filetype in self.result_filetypes: - type_urls = self.get_search_urls_for_filetype(filetype) - # make sure we don't get more urls than needed - if len(type_urls) > self.results_per_type: - chosen_urls = set(random.sample(type_urls, self.results_per_type)) - if filetype in self.url_reserve: - self.url_reserve[filetype].extend(list(type_urls - chosen_urls)) - else: - self.url_reserve[filetype] = list(type_urls - chosen_urls) - type_urls = chosen_urls - urllist.update(type_urls) + for filetype in self.scan_filetypes: + urllist.update(self.get_search_urls_for_filetype(filetype))
return list(urllist)
- def get_search_urls_for_filetype(self, filetype): - type_urls = set(self.url_reserve.get(filetype, [])) - if type_urls: # Clear urls from the reserve - self.url_reserve[filetype] = [] + def get_search_urls_for_filetype(self, filetype,number = 0): + if not number: + number = self.results_per_type + + self.url_reserve.setdefault(filetype,[]) + + type_urls = set(self.url_reserve[filetype][:number]) + self.url_reserve[filetype] = self.url_reserve[filetype][number:] + count = 0 - while len(type_urls) < self.results_per_type and count < max_search_retry: + + while len(type_urls) < number and count < max_search_retry: count += 1
#Try to filter based on filetype/protocol. Unreliable. We will re-filter. @@ -2194,30 +2175,34 @@ class SearchBasedTest: if filetype == 'any': file_list = None else: - file_list = self.result_filetypes + file_list = self.scan_filetypes
if self._is_useable_url(url, prot_list, file_list): if self.host_only: # FIXME: %-encoding, @'s, etc? plog("INFO", url) - host = urlparse.urlparse(url)[1] + url = urlparse.urlparse(url)[1] # Have to check again here after parsing the url: - if host not in self.banned_targets: - type_urls.add(host) - else: - type_urls.add(url) + if host in self.banned_targets: + continue + type_urls.add(url) + plog("INFO", "Have "+str(len(type_urls))+"/"+str(number)+" urls from search so far..") else: pass - plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..") + + if len(type_urls) > number: + chosen = random.sample(type_urls,number) + self.url_reserve[filetype].extend(list(type_urls - set(chosen))) + type_urls = chosen + return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest): def __init__(self, wordlist): BaseHTTPTest.__init__(self) SearchBasedTest.__init__(self, wordlist) - self.result_filetypes = self.scan_filetypes - self.result_protocol = "http" - self.results_per_type = self.fetch_targets + self.results_per_type = urls_per_filetype + self.result_protocol = 'http'
def depickle_upgrade(self): if self._pickle_revision < 7: @@ -2227,28 +2212,32 @@ class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest): BaseHTTPTest.depickle_upgrade(self)
def rewind(self): - self.wordlist = load_wordlist(self.wordlist_file) + SearchBasedTest.rewind(self) BaseHTTPTest.rewind(self)
+ def add_target(self, target): + # Keys targets by filetype. One filetype per target + split = target.rsplit('.',1) + if len(split) > 1 and split[-1] in self.scan_filetypes: + self.targets.add(target,[split[-1]]) + return True + return False + + def select_targets(self): + retval = [] + n_tests = random.randrange(1,len(self.targets.keys())+1) + filetypes = random.sample(self.targets.keys(), n_tests) + plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes)) + for ftype in filetypes: + retval.append(random.choice(self.targets.bykey(ftype))) + return retval + def refill_targets(self): for ftype in self.scan_filetypes: - if not ftype in self.targets_by_type or len(self.targets_by_type[ftype]) < self.fetch_targets: + targets_needed = self.results_per_type - len(self.targets.bykey(ftype)) + if targets_needed > 0: plog("NOTICE", self.proto+" scanner short on "+ftype+" targets. Adding more") - map(self.add_target, self.get_search_urls_for_filetype(ftype)) - -# This duplicated the effort of BaseHTTPTest.add_target which is invoked by -# SearchBasedHTTPTest.rewind -> BaseHTTPTest.rewind = Test.rewind -# Instead we should fall back on SearchBasedTest.get_targets -# def get_targets(self): -# raw_urls = self.get_search_urls() -# new = {} -# for url in raw_urls: -# split = url.rsplit('.',1) # Try to get filetype -# if len(split) > 1 and split[-1] in self.scan_filetypes: -# new.setdefault(split[-1],[]).append(url) -# for k,v in new.items(): -# self.targets_by_type.setdefault(k, []).extend(v) -# return raw_urls + map(self.add_target, self.get_search_urls_for_filetype(ftype,targets_needed))
HTTPTest = SearchBasedHTTPTest # For resuming from old HTTPTest.*.test files
@@ -2257,7 +2246,6 @@ class SearchBasedHTMLTest(SearchBasedTest, BaseHTMLTest): BaseHTMLTest.__init__(self) SearchBasedTest.__init__(self, wordlist) self.result_filetypes = ["any"] - self.result_protocol = "http" self.results_per_type = self.fetch_targets
def depickle_upgrade(self): @@ -2268,7 +2256,7 @@ class SearchBasedHTMLTest(SearchBasedTest, BaseHTMLTest): BaseHTMLTest.depickle_upgrade(self)
def rewind(self): - self.wordlist = load_wordlist(self.wordlist_file) + SearchBasedTest.rewind(self) BaseHTMLTest.rewind(self)
HTMLTest = SearchBasedHTMLTest # For resuming from old HTMLTest.*.test files @@ -2908,7 +2896,7 @@ def decompress_response_data(response): len_read = len(data) now = time.time()
- plog("DEBUG", "Read "+str(len_read)+"/"+str(tot_len)) + #plog("DEBUG", "Read "+str(len_read)+"/"+str(tot_len)) #Very verbose # Wait 5 seconds before counting data if (now-start) > 5: rate = (float(len_read)/(now-start)) #B/s
tor-commits@lists.torproject.org