commit b4be3a63400e658eb26436d97fca766907ecf91c Author: Christian Anderson <christian@avtok.com> Date: Tue May 24 12:46:04 2011 -0400
Modernizing search modes. Adding support to soat.py --- .gitignore | 3 ++ NetworkScanners/ExitAuthority/soat.py | 30 +++++++++++++------------ NetworkScanners/ExitAuthority/soat_config.py | 28 +++++++++++++----------- 3 files changed, 34 insertions(+), 27 deletions(-)
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b4e8d7b --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.pyc +NetworkScanners/ExitAuthority/data/ +NetworkScanners/ExitAuthority/search_cookies.lwp \ No newline at end of file diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py index 162ff0e..97d310b 100755 --- a/NetworkScanners/ExitAuthority/soat.py +++ b/NetworkScanners/ExitAuthority/soat.py @@ -2076,12 +2076,14 @@ class SearchBasedTest: count = 0 while len(type_urls) < self.results_per_type and count < max_search_retry: count += 1 + + #Try to filter based on filetype/protocol. Unreliable. We will re-filter. query = random.choice(self.wordlist) if filetype != 'any': query += " "+self.search_mode["filetype"]+filetype plog("WARN", "RESULTPROTOCOL IS:" + self.result_protocol) - if self.result_protocol != 'any' and self.search_mode["inurl"]: - query += " "+self.search_mode["inurl"]+self.result_protocol # this isn't too reliable, but we'll re-filter results later + if self.result_protocol == 'https' and self.search_mode["inurl"]: + query += " " + self.search_mode["inurl"] + "https" #query += '&num=' + `g_results_per_page`
# search google for relevant pages @@ -2124,19 +2126,19 @@ class SearchBasedTest: traceback.print_exc() print "Content is: "+str(content) break + # get the links and do some additional filtering + assert(self.search_mode["class"]) for link in soup.findAll('a'): - skip = True - for a in link.attrs: - if a[0] == "class" and self.search_mode["class"] in a[1]: - skip = False - break - if skip: - continue - if link.has_key(self.search_mode['realtgt']): - url = link[self.search_mode['realtgt']] - else: - url = link['href'] + #Filter based on class of link + try: + if self.search_mode["class"] != link["class"]: + continue + except KeyError: continue + + #Get real target + url = link[self.search_mode['realtgt']] + if self.result_protocol == 'any': prot_list = None else: @@ -2158,7 +2160,7 @@ class SearchBasedTest: type_urls.add(url) else: pass - plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..") + plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..") return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest): diff --git a/NetworkScanners/ExitAuthority/soat_config.py b/NetworkScanners/ExitAuthority/soat_config.py index 3e13463..39f8165 100644 --- a/NetworkScanners/ExitAuthority/soat_config.py +++ b/NetworkScanners/ExitAuthority/soat_config.py @@ -40,8 +40,8 @@ max_content_size = 256*1024 # Bind refetches of docuements to a specific source IP. # Useful for eliminating false positives that arise # from IP-based identifiers encoded in content -#refetch_ip = None -refetch_ip = "4.4.4.4" +refetch_ip = None +#refetch_ip = "4.4.4.4"
# Email settings for email scans. from_email = "Tor Exit Scanner noreply@torproject.org" @@ -134,21 +134,23 @@ search_cookie_file="./search_cookies.lwp" # Search mode. # Leave these maps alone. Change the default_search_mode variable # to what you want. -# XXX: Make a bing search mode. -yahoo_search_mode = {"host" : "search.yahoo.com", "query":"p", "filetype": "originurlextension:", \ - "inurl":None, "class":"yschttl", "realtgt":"ourl", "useragent":False, \ - "extra":[]} -google_search_mode = {"host" : "www.google.com", "query":"q", "filetype":"filetype:", \ - "inurl":"inurl:", "class" : "l", "realtgt":"href", "useragent":True, \ - "extra":[]} -ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_terms", "filetype":"title:", \ +# XXX: Make a bing search mode and a DuckDuckGo search mode + +#Yahoo is no longer supported because they make it difficult to scrape their results +#yahoo_search_mode = {"host" : "search.yahoo.com/search", "query":"p", "filetype": "vf:", \ +# "inurl":None, "class":"yschttl", "realtgt":"ourl", "useragent":False, \ +# "extra":[]} + +google_search_mode = {"host" : "www.google.com/search", "query":"q", "filetype":"filetype:", \ + "inurl":"inurl:", "class" : "l", "realtgt":"href", "useragent":True, \ + "extra":[]} + +ixquick_search_mode = {"host" : "ixquick.com/do/metasearch.pl", "query":"all_terms", "filetype":"url:.", \ "inurl":"url:", "class" : "title2", "realtgt":"href", "useragent":False, \ "extra":[("prfh","disable_family_filterEEE1N1Nnum_of_resultsEEE50N1Ndisable_video_family_filterEEE1N1N")]}
-# FIXME: This does not affect the ssl search.. Only Google has -# a working "inurl:" that allows you to pick the scheme to be https + #default_search_mode = google_search_mode -#default_search_mode = yahoo_search_mode default_search_mode = ixquick_search_mode
# Regex of characters we consider unsafe to write to the filesystem