[or-cvs] r15819: SOAT: Added TLS tests. Started choosing semi-random urls for (torflow/branches/gsoc2008)

aleksei at seul.org aleksei at seul.org
Thu Jul 10 15:29:14 UTC 2008


Author: aleksei
Date: 2008-07-10 11:29:14 -0400 (Thu, 10 Jul 2008)
New Revision: 15819

Modified:
   torflow/branches/gsoc2008/soat.py
Log:
SOAT: Added TLS tests. Started choosing semi-random URLs for testing. Added pydoc docstrings.

Modified: torflow/branches/gsoc2008/soat.py
===================================================================
--- torflow/branches/gsoc2008/soat.py	2008-07-10 15:25:49 UTC (rev 15818)
+++ torflow/branches/gsoc2008/soat.py	2008-07-10 15:29:14 UTC (rev 15819)
@@ -1,10 +1,24 @@
 #!/usr/bin/python
+#
+# 2008 Aleksei Gorny, mentored by Mike Perry
 
+'''
+Snakes on a Tor exit node scanner
+
+The SOAT scanner checks whether exit nodes behave by initiating connections
+to semi-randomly chosen targets using several protocols (http, https, ssh, smtp, imap, etc)
+and comparing content received directly and via tor.
+
+It interacts with metatroller and the control port to be aware of the tor network status.
+'''
+
+__all__ = ["ExitNodeScanner", "load_wordlist", "get_urls"]
+
 import httplib
 import os
+import pickle
 import random
 import re
-import pickle
 from sets import Set
 import smtplib
 import socket
@@ -23,45 +37,28 @@
 from TorCtl.PathSupport import *
 from TorCtl.TorCtl import Connection
 
-sys.path.append("./tools/BeautifulSoup/")
-from BeautifulSoup import BeautifulSoup, SoupStrainer
-
-sys.path.append("./tools/SocksiPy/")
-import socks
-
 # Try to use system openssl first
 try:
   from OpenSSL import *
+  sys.path.append("./tools")
 except:
   sys.path.append("./tools/")
   from OpenSSL import *
 
-sys.path.append("./tools/pyssh")
-import pyssh
+from BeautifulSoup.BeautifulSoup import BeautifulSoup, SoupStrainer
+from SocksiPy import socks
+import Pyssh.pyssh
 
 #
 # config stuff
 #
 
-user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0' 
+# these are used when searching for 'random' urls for testing
+wordlist_file = './wordlist.txt'; 
+allowed_filetypes = ['all','pdf'] 
+result_per_type = 5 
 
-wordlist_file = './wordlist.txt';
-allowed_filetypes = ['all','pdf']
-result_per_type = 1
-
-same_origin_policy = True
-
 #
-# links of interest
-#
-
-# FIXME: Turn these into a keyword list that causes us to get semi-random
-# google results in a few different languages
-docs_http = ['http://www.torproject.org','http://www.math.ut.ee','http://www.mozilla.com']
-docs_https = ['mail.google.com','addons.mozilla.org','www.paypal.com','www.fastmail.fm']
-docs_ssh = []
-
-#
 # ports to test in the consistency test
 #
 
@@ -73,7 +70,7 @@
     ["http", ExitPolicyRestriction('255.255.255.255', 80), "https", ExitPolicyRestriction('255.255.255.255', 443)]
 ]
 
-# tags and attributes to check in the http test
+# tags and attributes to check in the http test: XXX these should be reviewed
 
 tags_to_check = ['a', 'area', 'base', 'applet', 'embed', 'form', 'frame', 
                  'iframe', 'img', 'link', 'object', 'script']
@@ -85,43 +82,45 @@
 
 linebreak = '\r\n'
 
-# a simple interface to handle a socket connection 
-# with readline and writeline capability
+# a simple interface to handle a socket connection
 class Client:
 
     def __init__(self, host, port):
-        self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-        self.s.connect((host, port))
-        self.buffer = self.s.makefile('rb')
+        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        self.sock.connect((host, port))
+        self.buffer = self.sock.makefile('rb')
 
     def writeline(self, line):
-        self.s.send(line + linebreak)
+        self.sock.send(line + linebreak)
 
     def readline(self):
-        s = self.buffer.readline()
-        if not s:
+        response = self.buffer.readline()
+        if not response:
             raise EOFError
-        if s[-2:] == linebreak:
-            s = s[:-2]
-        elif s[-1:] in linebreak:
-            s = s[:-1]
-        return s
+        elif response[-2:] == linebreak:
+            response = response[:-2]
+        elif response[-1:] in linebreak:
+            response = response[:-1]
+        return response 
 
-# The scanner class
 class ExitNodeScanner:
-
+    ''' The scanner class '''
     def __init__(self):
+        ''' 
+        Establish a connection to metatroller & control port, 
+        configure metatroller, load the number of previously tested nodes 
+        '''
         # establish a metatroller connection
         plog('INFO', 'ExitNodeScanner starting up...')
         try:
-            self.__client = Client(meta_host, meta_port)
+            self.__meta = Client(meta_host, meta_port)
         except socket.error:
             plog('ERROR', 'Couldn\'t connect to metatroller. Is it on?')
             exit()
     
         # skip two lines of metatroller introduction
-        data = self.__client.readline()
-        data = self.__client.readline()
+        data = self.__meta.readline()
+        data = self.__meta.readline()
         
         # configure metatroller
         commands = [
@@ -135,10 +134,10 @@
             'RESETSTATS']
         plog('INFO', 'Executing preliminary configuration commands')
         for c in commands:
-            self.__client.writeline(c)
-            reply = self.__client.readline()
+            self.__meta.writeline(c)
+            reply = self.__meta.readline()
             if reply[:3] != '250': # first three chars indicate the reply code
-                reply += self.__client.readline()
+                reply += self.__meta.readline()
                 plog('ERROR', 'Error configuring metatroller (' + command + ' failed)')
                 plog('ERROR', reply)
                 exit()
@@ -166,11 +165,12 @@
         plog('INFO', 'ExitNodeScanner up and ready')
 
     def get_exit_node(self):
-        self.__client.writeline("GETLASTEXIT")
-        reply = self.__client.readline()
+        ''' ask metatroller for the last exit used '''
+        self.__meta.writeline("GETLASTEXIT")
+        reply = self.__meta.readline()
         
         if reply[:3] != '250':
-            reply += self.__client.readline()
+            reply += self.__meta.readline()
             plog('ERROR', reply)
             return 0
         
@@ -181,36 +181,47 @@
         return self.__exit
 
     def get_new_circuit(self):
+        ''' tell metatroller to close the current circuit and open a new one '''
         plog('NOTICE', 'Trying to construct a new circuit')
-        self.__client.writeline("NEWEXIT")
-        reply = self.__client.readline()
+        self.__meta.writeline("NEWEXIT")
+        reply = self.__meta.readline()
 
         if reply[:3] != '250':
             plog('ERROR', 'Choosing a new exit failed')
             plog('ERROR', reply)
 
     def set_new_exit(self, exit):
+        ''' 
+        tell metatroller to set the given node as the exit in the next circuit 
+        Note: currently not used
+        '''
         plog('NOTICE', 'Trying to set ' + exit + ' as the exit for the next circuit')
-        self.__client.writeline("SETEXIT " + exit)
-        reply = self.__client.readline()
+        self.__meta.writeline("SETEXIT " + exit)
+        reply = self.__meta.readline()
     
         if reply[:3] != '250':
             plog('ERROR', 'Setting ' + exit + ' as the new exit failed')
             plog('ERROR', reply)
 
-    def report_bad_exit(self, exit):        
+    def report_bad_exit(self, exit):
+        ''' 
+        report an evil exit to the control port using AuthDirBadExit 
+        Note: currently not used    
+        '''
         # self__contol.set_option('AuthDirBadExit', exit) ?
         pass
 
-    # get the list of nodes that allow to exit to a port
     def get_nodes_for_port(self, port):
+        ''' ask control port for a list of nodes that allow exiting to a given port '''
         routers = self.__control.read_routers(self.__control.get_network_status())
         restriction = ExitPolicyRestriction('255.255.255.255', port)
         return [x for x in routers if restriction.r_is_ok(x)]
 
-    # finds nodes that allow connections over a common protocol
-    # while disallowing connections over its secure version
     def check_all_exits_port_consistency(self):
+        ''' 
+        an independent test that finds nodes that allow connections over a common protocol
+        while disallowing connections over its secure version (for instance http/https)
+        '''
 
         # get the structure
         routers = self.__control.read_routers(self.__control.get_network_status())
@@ -236,6 +247,7 @@
         plog('INFO', 'Total bad exits: ' + `len(bad_exits)` + ' (~' + `(len(bad_exits) * 100 / len(routers))` + '%)')
 
     def check_http(self, address):
+        ''' check whether a http connection to a given address is molested '''
         plog('INFO', 'Conducting an http test with destination ' + address)
 
         defaultsocket = socket.socket
@@ -248,15 +260,16 @@
         socket.socket = defaultsocket
 
         exit_node = self.get_exit_node()
-        if exit_node == 0 or exit_node == '0':
+        if exit_node == 0 or exit_node == '0' or not exit_node:
             plog('INFO', 'We had no exit node to test, skipping to the next test.')
             return 0
 
+        address_file = address[7:].replace('/','_') # an address representation acceptable for a filename (leave out the http:// and replace slashes)
+
         # if we have no content, we had a connection error
-        # address[7:] means we're leaving out the 'http://' from the address part
         if pcontent == 0:
             result = HttpTestResult(exit_node, address, 0, TEST_INCONCLUSIVE)
-            result_file = open(http_i_dir + `exit_node` + '_' + address[7:] + '.result','w')
+            result_file = open(http_i_dir + `exit_node` + '_' + address_file + '.result','w')
             pickle.dump(result, result_file)
             result_file.close()
             return TEST_INCONCLUSIVE
@@ -270,15 +283,15 @@
         # if we don't have any yet, get it
         soup = 0
         try:
-            tag_file = open(http_tags_dir + address[7:] + '.tags', 'r')
+            tag_file = open(http_tags_dir + address_file + '.tags', 'r')
             soup = BeautifulSoup(tag_file.read())
             tag_file.close()
         except IOError:
             content = self.http_request(address)
             content = content.decode('ascii','ignore')
             soup = BeautifulSoup(content, parseOnlyThese=elements)
-            tag_file = open(http_tags_dir + '_' + address[7:] + '.tags', 'w')
-            tag_file.write(soup.__str__())
+            tag_file = open(http_tags_dir + address_file + '.tags', 'w')
+            tag_file.write(soup.__str__() +  ' ') # the space is needed in case we have some page with no matching tags at all
             tag_file.close()
         except Exception, e:
             plog('ERROR', 'Failed to get the correct tag structure for ' + address)
@@ -291,25 +304,55 @@
         # compare the content
         # if content matches, everything is ok
         if psoup == soup:
-            plog('INFO', 'Content matches')
+            result = HttpTestResult(exit_node, address, 0, TEST_SUCCESS)
+            result_file = open(http_s_dir + `exit_node` + '_' + address_file + '.result','w')
+            pickle.dump(result, result_file)
+            result_file.close()
+            return TEST_SUCCESS
 
         # if content doesnt match, update the direct content
-    
+        content_new = self.http_request(address)
+        content_new = content_new.decode('ascii', 'ignore')
+        if content_new == 0:
+            result = HttpTestResult(exit_node, address, 0, TEST_INCONCLUSIVE)
+            result_file = open(http_i_dir + `exit_node` + '_' + address_file + '.result','w')
+            pickle.dump(result, result_file)
+            result_file.close()
+            return TEST_INCONCLUSIVE
+
+        soup_new = BeautifulSoup(content_new, parseOnlyThese=elements)
         # compare the new and old content
         # if they match, means the node has been changing the content
+        if soup == soup_new:
+            result = HttpTestResult(exit_node, address, 0, TEST_FAILURE)
+            result_file = open(http_f_dir + `exit_node` + '_' + address_file + '.result','w')
+            pickle.dump(result, result_file)
+            result_file.close()
+            return TEST_FAILURE
 
-
         # if content has changed outside of tor, update the saved file
+        tag_file = open(http_tags_dir + '_' + address_file + '.tags', 'w')
+        tag_file.write(soup_new.__str__())
+        tag_file.close()
 
         # compare the node content and the new content
         # if it matches, everything is ok
+        if psoup == soup_new:
+            result = HttpTestResult(exit_node, address, 0, TEST_SUCCESS)
+            result_file = open(http_s_dir + `exit_node` + '_' + address_file + '.result','w')
+            pickle.dump(result, result_file)
+            result_file.close()
+            return TEST_SUCCESS
 
+        # if it doesn't match, means the node has been changing the content
+        result = HttpTestResult(exit_node, address, 0, TEST_FAILURE)
+        result_file = open(http_f_dir + `exit_node` + '_' + address_file + '.result','w')
+        pickle.dump(result, result_file)
+        result_file.close()
+        return TEST_FAILURE
 
-        # if it doesn't match, means the node has been changing the content 
-
-        return TEST_SUCCESS
-
     def check_openssh(self, address):
+        ''' check whether an openssh connection to a given address is molested '''
         ssh = pyssh.Ssh('username', 'host', 22)
         ssh.set_sshpath(pyssh.SSH_PATH)
 
@@ -319,6 +362,7 @@
         return 0 
 
     def check_openssl(self, address):
+        ''' check whether an https connection to a given address is molested '''
         plog('INFO', 'Conducting an ssl test with destination ' + address)
 
         # get the cert via tor
@@ -333,7 +377,7 @@
         socket.socket = defaultsocket
 
         exit_node = self.get_exit_node()
-        if exit_node == 0 or exit_node == '0':
+        if exit_node == 0 or exit_node == '0' or not exit_node:
             plog('INFO', 'We had no exit node to test, skipping to the next test.')
             return TEST_FAILURE
 
@@ -447,35 +491,360 @@
 
         return TEST_FAILURE
 
-    # stub for checking whether smtp & tls function properly
     def check_smtp(self, address):
+        ''' 
+        check whether smtp + tls connection to a given address is molested
+        this is done by going through the STARTTLS sequence and comparing server
+        responses for the direct and tor connections
+        '''
+
+        plog('INFO', 'Conducting an stmp test with destination ' + address)
+
+        defaultsocket = socket.socket
+        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
+        socket.socket = socks.socksocket
+
+        ehlo1_reply = 0
+        has_starttls = 0
+        ehlo2_reply = 0
+
         try:
             s = smtplib.SMTP(address)
-            c = s.ehlo()[0]
-            if not c>= 200 or c <= 299:
-                return 0
-            if not s.has_extn('starttls'):
-                return 0
-            c = s.ehlo()[0]
-            if not c>= 200 or c <= 299:
-                return 0
-        except:
-            pass
+            ehlo1_reply = s.ehlo()[0]
+            if ehlo1_reply != 250:
+                raise smtplib.SMTPException('First ehlo failed')
+            has_starttls = s.has_extn('starttls')
+            if not has_starttls:
+                raise smtplib.SMTPException('It seems the server doesn\'t support starttls')
+            s.starttls()
+            # TODO check certs?
+            ehlo2_reply = s.ehlo()[0]
+            if ehlo2_reply != 250:
+                raise smtplib.SMTPException('Second ehlo failed')
+        except socket.gaierror, e:
+            plog('ERROR', 'A connection error occured while testing smtp at ' + address)
+            plog('ERROR', e)
+            return TEST_INCONCLUSIVE
+        except Exception, e:
+            plog('ERROR','An error occured while testing smtp at ' + address)
+            plog('ERROR', e)
+        finally:
+            # reset the connection method back to direct
+            socket.socket = defaultsocket 
 
-    # stub for checking whether pop & tls function properly
+        # check whether the test was valid at all
+        exit_node = self.get_exit_node()
+        if exit_node == 0 or exit_node == '0':
+            plog('INFO', 'We had no exit node to test, skipping to the next test.')
+            return 0
+
+        # now directly
+
+        ehlo1_reply_d = 0
+        has_starttls_d = 0
+        ehlo2_reply_d = 0
+
+        try:
+            s = smtplib.SMTP(address)
+            ehlo1_reply_d = s.ehlo()[0]
+            if ehlo1_reply != 250:
+                raise smtplib.SMTPException('First ehlo failed')
+            has_starttls_d = s.has_extn('starttls')
+            if not has_starttls_d:
+                raise smtplib.SMTPException('It seems that the server doesn\'t support starttls')
+            s.starttls()
+            ehlo2_reply = s.ehlo()[0]
+            if ehlo2_reply != 250:
+                raise smtplib.SMTPException('Second ehlo failed')
+        except Exception, e:
+            plog('ERROR', 'An error occurred while testing smtp at ' + address)
+            plog('ERROR', e)
+
+        # compare
+        if ehlo1_reply != ehlo1_reply_d or has_starttls != has_starttls_d or ehlo2_reply != ehlo2_reply_d:
+            return TEST_FAILURE
+
+        return TEST_SUCCESS
+
     def check_pop(self, address):
-        pass
+        ''' 
+        check whether a pop + tls connection to a given address is molested 
+        it is implied that the server reads/sends messages compliant with RFC1939 & RFC2449
+        '''
 
-    # stub for checking whether imap & tls function properly
+        plog('INFO', 'Conducting a pop test with destination ' + address)
+
+        defaultsocket = socket.socket
+        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
+        socket.socket = socks.socksocket
+
+        capabilities_ok = False
+        starttls_present = False
+        tls_started = None
+        tls_succeeded = None
+
+        try:
+            pop = Client(address, 110)
+        
+            # read the server greeting
+            server_greeting = pop.readline()
+
+            # get the server capabilities
+            pop.writeline('CAPA')
+            capabilities = ''
+            while 1:
+                curr = pop.readline()
+                if '+OK' in curr:
+                    capabilities_ok = True
+                elif curr == '.':
+                    break
+                elif 'STLS' in curr:
+                    starttls_present = True
+            
+            if not capabilities_ok:
+                return TEST_INCONCLUSIVE
+
+            # try to start tls negotiation
+            if starttls_present:
+                pop.writeline('STLS')
+
+            starttls_started = '+OK' in starttls_response
+
+            # negotiate TLS and issue some request to feel good about it
+            # TODO check certs? 
+            ctx = SSL.Context(SSL.SSLv23_METHOD)
+            c = SSL.Connection(ctx, pop.sock)
+            c.set_connect_state()
+            c.do_handshake()
+            c.send('CAPA' + linebreak)
+            
+            while tls_succeeded == None:
+                line = ''
+                char = None
+                while char != '\n':
+                    char = c.read(1)
+                    if not char:
+                        break
+                    elif char == '.':
+                        tls_succeeded = False
+                    line += char
+
+                if '-ERR' in line:
+                    tls_succeeded = False
+                elif '+OK' in line:
+                    tls_succeeded = True
+                elif not line:
+                    tls_succeeded = False
+
+        except Exception, e:
+            plog('ERROR', e)
+            return TEST_INCONCLUSIVE
+        finally:
+            # reset the connection to default
+            socket.socket = defaultsocket
+
+        # do the same for the direct connection
+
+        capabilities_ok_d = False
+        starttls_present_d = False
+        tls_started_d = None
+        tls_succeeded_d = None
+
+        try:
+            pop = Client(address, 110)
+        
+            # read the server greeting
+            server_greeting = pop.readline()
+
+            # get the server capabilities
+            pop.writeline('CAPA')
+            capabilities = ''
+            while 1:
+                curr = pop.readline()
+                if '+OK' in curr:
+                    capabilities_ok_d = True
+                elif curr == '.':
+                    break
+                elif 'STLS' in curr:
+                    starttls_present_d = True
+            
+            if not capabilities_ok_d:
+                return TEST_INCONCLUSIVE
+
+            # try to start tls negotiation
+            if starttls_present_d:
+                pop.writeline('STLS')
+
+            starttls_started_d = '+OK' in starttls_response
+
+            # negotiate TLS, issue some request to feel good about it
+            ctx = SSL.Context(SSL.SSLv23_METHOD)
+            c = SSL.Connection(ctx, pop.sock)
+            c.set_connect_state()
+            c.do_handshake()
+            c.send('CAPA' + linebreak)
+            
+            while tls_succeeded_d == None:
+                line = ''
+                char = None
+                while char != '\n':
+                    char = c.read(1)
+                    if not char:
+                        break
+                    elif char == '.':
+                        tls_succeeded_d = False
+                    line += char
+
+                if '-ERR' in line:
+                    tls_succeeded_d = False
+                elif '+OK' in line:
+                    tls_succeeded_d = True
+                elif not line:
+                    tls_succeeded_d = False
+
+        except Exception, e:
+            plog('ERROR', e)
+            return TEST_INCONCLUSIVE
+
+        # compare
+        if (capabilities_ok != capabilities_ok_d or starttls_present != starttls_present_d or 
+                tls_started != tls_started_d or tls_suceeded != tls_succeeded_d):
+            return TEST_FAILURE
+        
+        return TEST_SUCCESS
+
     def check_imap(self, address):
-        pass
+        ''' 
+        check whether an imap + tls connection to a given address is molested 
+        it is implied that the server reads/sends messages compliant with RFC3501
+        ''' 
+        plog('INFO', 'Conducting an imap test with destination ' + address)
 
+        defaultsocket = socket.socket
+        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, tor_host, tor_port)
+        socket.socket = socks.socksocket
+        
+        capabilities_ok = None
+        starttls_present = None
+        tls_started = None
+        tls_succeeded = None
+        try:
+            imap = Client(address, 143)
+
+            # read server greeting
+            server_greeting = imap.readline()
+
+            # get server capabilities
+            imap.writeline('a001 CAPABILITY')
+            capabilities = imap.readline() # first line - list of capabilities
+            capabilities_ok = 'OK' in imap.readline() # second line - the request status
+        
+            if not capabilities_ok:
+               return TEST_INCONCLUSIVE
+
+            # check if starttls is present
+            starttls_present = 'STARTTLS' in capabilities
+
+            if starttls_present:
+                imap.writeline('a002 STARTTLS')
+                tls_started = 'OK' in imap.readline()
+
+            # negotiate TLS, issue a request to feel good about it
+            # TODO check the cert aswell ?
+            ctx = SSL.Context(SSL.SSLv23_METHOD)
+            c = SSL.Connection(ctx, imap.sock)
+            c.set_connect_state()
+            c.do_handshake()
+            c.send('a003 CAPABILITY' + linebreak)
+            
+            while tls_succeeded == None:
+                line = ''
+                char = None
+                while char != '\n':
+                    char = c.read(1)
+                    if not char:
+                        break
+                    line += char
+
+                if 'Error' in line or 'error' in line:
+                    tls_succeeded = False
+                elif 'OK' in line:
+                    tls_succeeded = True
+                elif not line:
+                    tls_succeeded = False
+
+        except Exception, e:
+            plog('ERROR', e)
+            return TEST_INCONCLUSIVE
+        finally:
+            socket.socket = defaultsocket 
+
+        # do the same for the direct connection
+        capabilities_ok_d = None
+        starttls_present_d = None
+        tls_started_d = None
+        tls_succeeded_d = None
+        try:
+            imap = Client(address, 143)
+
+            # read server greeting
+            server_greeting = imap.readline()
+
+            # get server capabilities
+            imap.writeline('a001 CAPABILITY')
+            capabilities = imap.readline() # first line - list of capabilities
+            capabilities_ok_d = 'OK' in imap.readline() # second line - the request status
+
+            if not capabilities_ok_d:
+                return TEST_INCONCLUSIVE
+
+            # check if starttls is present
+            starttls_present_d = 'STARTTLS' in capabilities
+
+            if starttls_present_d:
+                imap.writeline('a002 STARTTLS')
+                tls_started = 'OK' in imap.readline()
+
+            # negotiate TLS, issue some request to feel good about it
+            ctx = SSL.Context(SSL.SSLv23_METHOD)
+            c = SSL.Connection(ctx, imap.sock)
+            c.set_connect_state()
+            c.do_handshake()
+            c.send('a003 CAPABILITY' + linebreak)
+
+            while tls_succeeded_d == None:
+                line = ''
+                char = None
+                while char != '\n':
+                    char = c.read(1)
+                    if not char:
+                        break
+                    line += char
+
+                if 'Error' in line or 'error' in line:
+                    tls_succeeded_d = False
+                elif 'OK' in line:
+                    tls_succeeded_d = True
+                elif not line:
+                    tls_succeeded_d = False
+
+        except Exception, e:
+            plog('ERROR', e)
+            return TEST_INCONCLUSIVE
+
+        # compare
+        if (capabilities_ok != capabilities_ok_d or starttls_present != starttls_present_d or 
+            tls_started != tls_started_d or tls_succeeded != tls_succeeded_d):
+            return TEST_FAILURE
+
+        return TEST_SUCCESS
+
     def http_request(self, address):
-        
+        ''' perform a http GET-request and return the content received '''
         request = 0
         try:
             request = urllib2.Request(address)
-            request.add_header('User-Agent', user_agent)
+            request.add_header('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0')
         except Exception, e:
             plog('ERROR', 'Forming a http request to ' + address + ' failed.')
             plog('ERROR', e)
@@ -496,7 +865,7 @@
         pass
 
     def ssl_request(self, address):
-
+        ''' initiate an ssl connection and return the server certificate '''
         # specify the context
         ctx = SSL.Context(SSL.SSLv23_METHOD)
         ctx.set_verify_depth(1)
@@ -523,132 +892,150 @@
 
 # some helpful methods
 
-'''
-construct a list of urls based on the wordlist and filetypes of interest
-'''
-def load_urls():
-    plog('INFO', 'Loading url list')
-        
+def load_wordlist(file):
+    ''' load a list of strings from a file (which contains words separated by newlines) '''
+    plog('INFO', 'Loading the wordlist')
+    
     wordlist = []
-    fh = open(wordlist_file, 'r')
+    fh = open(file, 'r')
     try:
         for line in fh:
             wordlist.append(line[:-1]) # get rid of the linebreaks
     finally:
         fh.close()
 
+    return wordlist
+
+def get_urls(wordlist, filetypes=['any'], results_per_type=5, protocol='any', g_results_per_page=10):
+    ''' 
+    construct a list of urls based on the wordlist, filetypes and protocol. 
+    
+    Note: since we currently use google, which doesn't index by protocol,
+    searches for anything but 'any' could be rather slow
+    '''
+    plog('INFO', 'Searching google for relevant sites...')
+
     urllist = []
-    for ft in allowed_filetypes:
+    for filetype in filetypes:
         type_urls = []
 
-        while len(type_urls) < result_per_type:
-            # probably the discover_urls method should consider moving along the search result pages
-            type_urls.extend(discover_urls(ft, 
-                wordlist[int(random.random() * len(wordlist))]))
-            type_urls = list(Set(type_urls))
-        
-            plog('INFO', 'URL list for ' + ft + ': ' + '\n'.join(type_urls) + '\n')
-            urllist.extend(type_urls)
-         
-    return urllist
+        while len(type_urls) < results_per_type:
+            query = random.choice(wordlist)
+            if filetype != 'any':
+                query += ' filetype:' + filetype
+            if protocol != 'any':
+                query += ' allinurl:' + protocol # this isn't too reliable, but we'll re-filter results later
+            #query += '&num=' + `g_results_per_page` 
 
-'''
-Find links to files related to a query
-'''
-def discover_urls(self, filetype, query):
-    # search google for relevant pages
-    # note: google only accepts requests from idenitified browsers
-    if filetype != 'all':
-        query += ':' + filetype
-        
-    host = 'www.google.com'
-    params = urllib.urlencode({'q' : query})
-    headers = {'User-Agent' : user_agent}
-    search_url = '/search' + '?' + params
-   
-    connection = httplib.HTTPConnection(host)
-    connection.request("GET", search_url, {}, headers) # can't add params here for some reason
-    
-    response = connection.getresponse()
-    if response.status == 200:
-        # if everything went well, start parsing
-        urls = []
+            # search google for relevant pages
+            # note: google only accepts requests from identified browsers
+            host = 'www.google.com'
+            params = urllib.urlencode({'q' : query})
+            headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0'}
+            search_path = '/search' + '?' + params
 
-        content = response.read()
+            connection = None
+            response = None
+            try:
+                connection = httplib.HTTPConnection(host)
+                connection.request("GET", search_path, {}, headers)
+                response = connection.getresponse()
+                if response.status != 200:
+                    raise Exception(response.status, response.reason)
+            except Exception, e:
+                plog('ERROR', 'Connection to google.com failed')
+                plog('ERROR', e)
+                continue
 
-        soup = BeautifulSoup(content)
+            content = response.read()
+            links = SoupStrainer('a')
+            soup = BeautifulSoup(content, parseOnlyThese=links)
+            
+            # get the links and do some additional filtering
+            for link in soup.findAll('a', {'class' : 'l'}):
+                url = link['href']
+                if (protocol != 'any' and url[:len(protocol)] != protocol or 
+                        filetype != 'any' and url[-len(filetype):] != filetype):
+                    pass
+                else:
+                    type_urls.append(link['href'])
         
-        # l is the class for relevant reply links
-        # probably not the best criterion to rely on, so maybe some other solution needed
-        for link in soup.findAll('a', {'class' : 'l'}): 
-            urls.append(link['href'])
+        if type_urls > results_per_type:
+            type_urls = random.sample(type_urls, results_per_type) # make sure we don't get more urls than needed
+        urllist.extend(type_urls)
+         
+    return list(Set(urllist))
 
-        # filter for filetypes if needed
-        if filetype != 'all':
-            urls = [u for u in urls if u[-len(filetype):] == filetype]
-
-        return urls
-    else:
-        plog('ERROR', 'Google search failed: ' + 
-                response.status + ' ' + response.reason)
-        return []
-
 #
 # main logic
 #
 def main(argv):
     scanner = ExitNodeScanner()
     
-    # consistency test
+    # 
+    # 1) consistency test
+    #
+
     # scanner.check_all_exits_port_consistency()
     
-    # find sites for http testing if necessary
     #
-    # global doc_urls
-    # doc_urls.extend(load_url_list())
-    # doc_urls = list(Set(doc_urls))
-    # plog('NOTICE', 'Final URL list: ' + '\n'.join(doc_urls) + '\n')
+    # 2) test for checking yet unchecked nodes
+    # XXX use SETEXIT systematically, after 'all nodes' have been tested, just continue with NEWEXIT
+    #
 
-    # get the number of nodes that need to be tested
-    # XXX: Need to update this periodically for this to work.. But 
-    # it probably shouldn't be used for a termination condition anyways..
-    # It is probably good to ballpark if we've done all exits for 
-    # informational purposes, but then we should just restart the scan
+    # load the wordlist to search for sites later on
+    wordlist = load_wordlist(wordlist_file)
+    
+    # get the total number of nodes for ports
     ssl_nodes = len(scanner.get_nodes_for_port(443))
     http_nodes = len(scanner.get_nodes_for_port(80))
-    ssh_nodes = len(scanner.get_nodes_for_port(22))
+    #ssh_nodes = len(scanner.get_nodes_for_port(22)) 
 
+    # lists of addresses (generated later with get_urls)
+    ssl_urls = []
+    http_urls = []
+    ssh_urls = []
+
+    # test terminating conditions for somewhat ok network coverage
     ssl_done = False
     http_done = False
     ssh_done = True
-    while 1:
-    
-        # https test
+
+    # get some semi-random urls, try to test the exit node for each protocol needed, get a new node
+    while 1: 
         
+        http_urls = get_urls(wordlist, protocol='http')
+        ssl_urls = ['mail.google.com', 'addons.mozilla.org', 'www.fastmail.fm'] # the search for https stuff is yet too slow
+        
+        # https test  
         if not ssl_done:
-            for ssl_site in docs_https:
-                scanner.check_openssl(ssl_site)
+            ssl_site = random.choice(ssl_urls)
+            scanner.check_openssl(ssl_site)
             ssl_tested_n = len(scanner.ssl_tested)
             plog('INFO', 'Nodes ssl-tested: ' + `ssl_tested_n` + '/' + `ssl_nodes`
                 + ' (~' + `((ssl_tested_n * 100) / ssl_nodes)` + '%)')
-            if ssl_tested_n == ssl_nodes:
+            if ssl_tested_n >= ssl_nodes:
                 ssl_done = True
         
         # http test
-        
         if not http_done:
-            for http_site in docs_http:
-                scanner.check_http(http_site)
+            http_site = random.choice(http_urls)
+            scanner.check_http(http_site)
             http_tested_n = len(scanner.http_tested)
             plog('INFO', 'Nodes http-tested: ' + `http_tested_n` + '/' + `http_nodes`
                 + ' (~' + `((http_tested_n * 100) / http_nodes)` + '%)')
-            if http_tested_n == http_nodes:
+            if http_tested_n >= http_nodes:
                 http_done = True
-
+        '''
         # ssh test
-        '''
         if not ssh_done:
-            pass
+            ssh_site = random.choice(ssh_urls)
+            scanner.check_openssh(ssh_site)
+            ssh_tested_n = len(scanner.ssh_tested)
+            plog('INFO', 'Nodes ssh-tested: ' + `ssh_tested_n` + '/' + `ssh_nodes`
+                + '(~' + `((ssh_tested_n * 100) / ssh_nodes)` + '%')')
+            if ssh_tested_n >= ssh_nodes:
+                ssh_done = True
         '''
 
         # check whether we're done, otherwise get a new circuit
@@ -658,7 +1045,6 @@
         else:
             scanner.get_new_circuit()
             time.sleep(1)
-
 #
 # initiate the program
 #



More information about the tor-commits mailing list