commit 2bf195d0ce38dbc0ad25f10288f22ed352230296 Author: Karsten Loesing karsten.loesing@gmx.net Date: Tue Nov 27 21:22:58 2012 -0500
Add script to fix "A1" entries in geoip file.
Fixes #6266. --- changes/task-6266 | 7 ++ src/config/README.geoip | 90 +++++++++++++++++++++ src/config/deanonymind.py | 194 +++++++++++++++++++++++++++++++++++++++++++++ src/config/geoip-manual | 114 ++++++++++++++++++++++++++ 4 files changed, 405 insertions(+), 0 deletions(-)
diff --git a/changes/task-6266 b/changes/task-6266 new file mode 100644 index 0000000..e7f0509 --- /dev/null +++ b/changes/task-6266 @@ -0,0 +1,7 @@ + o Minor features: + - Use a script to replace "A1" ("Anonymous Proxy") entries in our + geoip file with real country codes. This script fixes about 90% of + "A1" entries automatically and uses manual country code assignments + to fix the remaining 10%. See src/config/README.geoip for details. + Fixes #6266. + diff --git a/src/config/README.geoip b/src/config/README.geoip new file mode 100644 index 0000000..8520501 --- /dev/null +++ b/src/config/README.geoip @@ -0,0 +1,90 @@ +README.geoip -- information on the IP-to-country-code file shipped with tor +=========================================================================== + +The IP-to-country-code file in src/config/geoip is based on MaxMind's +GeoLite Country database with the following modifications: + + - Those "A1" ("Anonymous Proxy") entries lying in between two entries with + the same country code are automatically changed to that country code. + These changes can be overridden by specifying a different country code + in src/config/geoip-manual. + + - Other "A1" entries are replaced with country codes specified in + src/config/geoip-manual, or are left as is if there is no corresponding + entry in that file. Even non-"A1" entries can be modified by adding a + replacement entry to src/config/geoip-manual. Handle with care. + + +1. Updating the geoip file from a MaxMind database file +------------------------------------------------------- + +Download the most recent MaxMind GeoLite Country database: +http://geolite.maxmind.com/download/geoip/database/GeoIPCountryCSV.zip + +Run `python deanonymind.py` in the local directory. Review the output to +learn about applied automatic/manual changes and watch out for any +warnings. + +Possibly edit geoip-manual to make more/fewer/different manual changes and +re-run `python deanonymind.py`. 
+ +When done, prepend the new geoip file with a comment like this: + + # Last updated based on $DATE Maxmind GeoLite Country + # See README.geoip for details on the conversion. + + +2. Verifying automatic and manual changes using diff +---------------------------------------------------- + +To unzip the original MaxMind file and look at the automatic changes, run: + + unzip GeoIPCountryCSV.zip + diff -U1 GeoIPCountryWhois.csv AutomaticGeoIPCountryWhois.csv + +To look at subsequent manual changes, run: + + diff -U1 AutomaticGeoIPCountryWhois.csv ManualGeoIPCountryWhois.csv + +To manually generate the geoip file and compare it to the automatically +created one, run: + + cut -d, -f3-5 < ManualGeoIPCountryWhois.csv | sed 's/"//g' > mygeoip + diff -U1 geoip mygeoip + + +3. Verifying automatic and manual changes using blockfinder +----------------------------------------------------------- + +Blockfinder is a powerful tool to handle multiple IP-to-country data +sources. Blockfinder has a function to specify a country code and compare +conflicting country code assignments in different data sources. + +We can use blockfinder to compare A1 entries in the original MaxMind file +with the same or overlapping blocks in the file generated above and in the +RIR delegation files: + + git clone https://github.com/ioerror/blockfinder + cd blockfinder/ + python blockfinder -i + python blockfinder -r ../GeoIPCountryWhois.csv + python blockfinder -r ../ManualGeoIPCountryWhois.csv + python blockfinder -p A1 > A1-comparison.txt + +The output marks conflicts between assignments using either '*' in case of +two different opinions or '#' for three or more different opinions about +the country code for a given block. + +The '*' conflicts are most likely harmless, because there will always be +at least two opinions with the original MaxMind file saying A1 and the +other two sources saying something more meaningful. + +However, watch out for '#' conflicts. 
In these cases, the original +MaxMind file ("A1"), the updated MaxMind file (hopefully the correct +country code), and the RIR delegation files (some other country code) all +disagree. + +There are perfectly valid cases where the updated MaxMind file and the RIR +delegation files don't agree. But each of those cases must be verified +manually. + diff --git a/src/config/deanonymind.py b/src/config/deanonymind.py new file mode 100755 index 0000000..c86dadc --- /dev/null +++ b/src/config/deanonymind.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python +import optparse +import os +import sys +import zipfile + +""" +Take a MaxMind GeoLite Country database as input and replace A1 entries +with the country code and name of the preceding entry iff the preceding +(subsequent) entry ends (starts) directly before (after) the A1 entry and +both preceding and subsequent entries contain the same country code. + +Then apply manual changes, either replacing A1 entries that could not be +replaced automatically or overriding previously made automatic changes. 
+""" + +def main(): + options = parse_options() + assignments = read_file(options.in_maxmind) + assignments = apply_automatic_changes(assignments) + write_file(options.out_automatic, assignments) + manual_assignments = read_file(options.in_manual, must_exist=False) + assignments = apply_manual_changes(assignments, manual_assignments) + write_file(options.out_manual, assignments) + write_file(options.out_geoip, assignments, long_format=False) + +def parse_options(): + parser = optparse.OptionParser() + parser.add_option('-i', action='store', dest='in_maxmind', + default='GeoIPCountryCSV.zip', metavar='FILE', + help='use the specified MaxMind GeoLite Country .zip or .csv ' + 'file as input [default: %default]') + parser.add_option('-g', action='store', dest='in_manual', + default='geoip-manual', metavar='FILE', + help='use the specified .csv file for manual changes or to ' + 'override automatic changes [default: %default]') + parser.add_option('-a', action='store', dest='out_automatic', + default="AutomaticGeoIPCountryWhois.csv", metavar='FILE', + help='write full input file plus automatic changes to the ' + 'specified .csv file [default: %default]') + parser.add_option('-m', action='store', dest='out_manual', + default='ManualGeoIPCountryWhois.csv', metavar='FILE', + help='write full input file plus automatic and manual ' + 'changes to the specified .csv file [default: %default]') + parser.add_option('-o', action='store', dest='out_geoip', + default='geoip', metavar='FILE', + help='write full input file plus automatic and manual ' + 'changes to the specified .csv file that can be shipped ' + 'with tor [default: %default]') + (options, args) = parser.parse_args() + return options + +def read_file(path, must_exist=True): + if not os.path.exists(path): + if must_exist: + print 'File %s does not exist. Exiting.' 
% (path, ) + sys.exit(1) + else: + return + if path.endswith('.zip'): + zip_file = zipfile.ZipFile(path) + csv_content = zip_file.read('GeoIPCountryWhois.csv') + zip_file.close() + else: + csv_file = open(path) + csv_content = csv_file.read() + csv_file.close() + assignments = [] + for line in csv_content.split('\n'): + stripped_line = line.strip() + if len(stripped_line) > 0 and not stripped_line.startswith('#'): + assignments.append(stripped_line) + return assignments + +def apply_automatic_changes(assignments): + print '\nApplying automatic changes...' + result_lines = [] + prev_line = None + a1_lines = [] + for line in assignments: + if '"A1"' in line: + a1_lines.append(line) + else: + if len(a1_lines) > 0: + new_a1_lines = process_a1_lines(prev_line, a1_lines, line) + for new_a1_line in new_a1_lines: + result_lines.append(new_a1_line) + a1_lines = [] + result_lines.append(line) + prev_line = line + if len(a1_lines) > 0: + new_a1_lines = process_a1_lines(prev_line, a1_lines, None) + for new_a1_line in new_a1_lines: + result_lines.append(new_a1_line) + return result_lines + +def process_a1_lines(prev_line, a1_lines, next_line): + if not prev_line or not next_line: + return a1_lines # Can't merge first or last line in file. + if len(a1_lines) > 1: + return a1_lines # Can't merge more than 1 line at once. 
+ a1_line = a1_lines[0].strip() + prev_entry = parse_line(prev_line) + a1_entry = parse_line(a1_line) + next_entry = parse_line(next_line) + touches_prev_entry = int(prev_entry['end_num']) + 1 == \ + int(a1_entry['start_num']) + touches_next_entry = int(a1_entry['end_num']) + 1 == \ + int(next_entry['start_num']) + same_country_code = prev_entry['country_code'] == \ + next_entry['country_code'] + if touches_prev_entry and touches_next_entry and same_country_code: + new_line = format_line_with_other_country(a1_entry, prev_entry) + print '-%s\n+%s' % (a1_line, new_line, ) + return [new_line] + else: + return a1_lines + +def parse_line(line): + if not line: + return None + keys = ['start_str', 'end_str', 'start_num', 'end_num', + 'country_code', 'country_name'] + stripped_line = line.replace('"', '').strip() + parts = stripped_line.split(',') + entry = dict((k, v) for k, v in zip(keys, parts)) + return entry + +def format_line_with_other_country(original_entry, other_entry): + return '"%s","%s","%s","%s","%s","%s"' % (original_entry['start_str'], + original_entry['end_str'], original_entry['start_num'], + original_entry['end_num'], other_entry['country_code'], + other_entry['country_name'], ) + +def apply_manual_changes(assignments, manual_assignments): + if not manual_assignments: + return assignments + print '\nApplying manual changes...' + manual_dict = {} + for line in manual_assignments: + start_num = parse_line(line)['start_num'] + if start_num in manual_dict: + print ('Warning: duplicate start number in manual ' + 'assignments:\n %s\n %s\nDiscarding first entry.' 
% + (manual_dict[start_num], line, )) + manual_dict[start_num] = line + result = [] + for line in assignments: + entry = parse_line(line) + start_num = entry['start_num'] + if start_num in manual_dict: + manual_line = manual_dict[start_num] + manual_entry = parse_line(manual_line) + if entry['start_str'] == manual_entry['start_str'] and \ + entry['end_str'] == manual_entry['end_str'] and \ + entry['end_num'] == manual_entry['end_num']: + if len(manual_entry['country_code']) != 2: + print '-%s' % (line, ) # only remove, don't replace + else: + new_line = format_line_with_other_country(entry, + manual_entry) + print '-%s\n+%s' % (line, new_line, ) + result.append(new_line) + del manual_dict[start_num] + else: + print ('Warning: only partial match between ' + 'original/automatically replaced assignment and ' + 'manual assignment:\n %s\n %s\nNot applying ' + 'manual change.' % (line, manual_line, )) + result.append(line) + else: + result.append(line) + if len(manual_dict) > 0: + print ('Warning: could not apply all manual assignments: %s' % + ('\n '.join(manual_dict.values())), ) + return result + +def write_file(path, assignments, long_format=True): + if long_format: + output_lines = assignments + else: + output_lines = [] + for long_line in assignments: + entry = parse_line(long_line) + short_line = "%s,%s,%s" % (entry['start_num'], + entry['end_num'], entry['country_code'], ) + output_lines.append(short_line) + out_file = open(path, 'w') + out_file.write('\n'.join(output_lines)) + out_file.close() + +if __name__ == '__main__': + main() + diff --git a/src/config/geoip-manual b/src/config/geoip-manual new file mode 100644 index 0000000..3811c75 --- /dev/null +++ b/src/config/geoip-manual @@ -0,0 +1,114 @@ +# This file contains manual overrides of A1 entries (and possibly others) +# in MaxMind's GeoLite Country database. Use deanonymind.py in the same +# directory to process this file when producing a new geoip file. 
See +# README.geoip in the same directory for details. + +# Remove MaxMind entry 0.116.0.0-0.119.255.255 which MaxMind says is AT, +# but which is part of reserved range 0.0.0.0/8. -KL 2012-06-13 +"0.116.0.0","0.119.255.255","7602176","7864319","","" + +# NL, because previous MaxMind entry 31.171.128.0-31.171.133.255 is NL, +# and RIR delegation files say 31.171.128.0-31.171.135.255 is NL. +# -KL 2012-11-27 +"31.171.134.0","31.171.135.255","531334656","531335167","NL","Netherlands" + +# EU, because next MaxMind entry 37.139.64.1-37.139.64.9 is EU, because +# RIR delegation files say 37.139.64.0-37.139.71.255 is EU, and because it +# just makes more sense for the next entry to start at .0 and not .1. +# -KL 2012-11-27 +"37.139.64.0","37.139.64.0","629882880","629882880","EU","Europe" + +# CH, because previous MaxMind entry 46.19.141.0-46.19.142.255 is CH, and +# RIR delegation files say 46.19.136.0-46.19.143.255 is CH. +# -KL 2012-11-27 +"46.19.143.0","46.19.143.255","773033728","773033983","CH","Switzerland" + +# GB, because next MaxMind entry 46.166.129.0-46.166.134.255 is GB, and +# RIR delegation files say 46.166.128.0-46.166.191.255 is GB. +# -KL 2012-11-27 +"46.166.128.0","46.166.128.255","782663680","782663935","GB","United Kingdom" + +# US, though could as well be CA. Previous MaxMind entry +# 64.237.32.52-64.237.34.127 is US, next MaxMind entry +# 64.237.34.144-64.237.34.151 is CA, and RIR delegation files say the +# entire block 64.237.32.0-64.237.63.255 is US. -KL 2012-11-27 +"64.237.34.128","64.237.34.143","1089282688","1089282703","US","United States" + +# US, though could as well be UY. Previous MaxMind entry +# 67.15.170.0-67.15.182.255 is US, next MaxMind entry +# 67.15.183.128-67.15.183.159 is UY, and RIR delegation files say the +# entire block 67.15.0.0-67.15.255.255 is US. 
-KL 2012-11-27 +"67.15.183.0","67.15.183.127","1125103360","1125103487","US","United States" + +# US, because next MaxMind entry 67.43.145.0-67.43.155.255 is US, and RIR +# delegation files say 67.43.144.0-67.43.159.255 is US. +# -KL 2012-11-27 +"67.43.144.0","67.43.144.255","1126928384","1126928639","US","United States" + +# US, because previous MaxMind entry 70.159.21.51-70.232.244.255 is US, +# because next MaxMind entry 70.232.245.58-70.232.245.59 is A2 ("Satellite +# Provider") which is country information about as useless as A1, and +# because RIR delegation files say 70.224.0.0-70.239.255.255 is US. +# -KL 2012-11-27 +"70.232.245.0","70.232.245.57","1189672192","1189672249","US","United States" + +# US, because next MaxMind entry 70.232.246.0-70.240.141.255 is US, +# because previous MaxMind entry 70.232.245.58-70.232.245.59 is A2 +# ("Satellite Provider") which is country information about as useless +# as A1, and because RIR delegation files say 70.224.0.0-70.239.255.255 is +# US. -KL 2012-11-27 +"70.232.245.60","70.232.245.255","1189672252","1189672447","US","United States" + +# GB, despite neither previous (GE) nor next (LV) MaxMind entry being GB, +# but because RIR delegation files agree with both previous and next +# MaxMind entry and say GB for 91.228.0.0-91.228.3.255. -KL 2012-11-27 +"91.228.0.0","91.228.3.255","1541668864","1541669887","GB","United Kingdom" + +# GB, because next MaxMind entry 91.232.125.0-91.232.125.255 is GB, and +# RIR delegation files say 91.232.124.0-91.232.125.255 is GB. +# -KL 2012-11-27 +"91.232.124.0","91.232.124.255","1541962752","1541963007","GB","United Kingdom" + +# GB, despite neither previous (RU) nor next (PL) MaxMind entry being GB, +# but because RIR delegation files agree with both previous and next +# MaxMind entry and say GB for 91.238.214.0-91.238.215.255. 
+# -KL 2012-11-27 +"91.238.214.0","91.238.215.255","1542379008","1542379519","GB","United Kingdom" + +# US, because next MaxMind entry 173.0.16.0-173.0.65.255 is US, and RIR +# delegation files say 173.0.0.0-173.0.15.255 is US. -KL 2012-11-27 +"173.0.0.0","173.0.15.255","2902458368","2902462463","US","United States" + +# US, because next MaxMind entry 176.67.84.0-176.67.84.79 is US, and RIR +# delegation files say 176.67.80.0-176.67.87.255 is US. -KL 2012-11-27 +"176.67.80.0","176.67.83.255","2957201408","2957202431","US","United States" + +# US, because previous MaxMind entry 176.67.84.192-176.67.85.255 is US, +# and RIR delegation files say 176.67.80.0-176.67.87.255 is US. +# -KL 2012-11-27 +"176.67.86.0","176.67.87.255","2957202944","2957203455","US","United States" + +# EU, despite neither previous (RU) nor next (UA) MaxMind entry being EU, +# but because RIR delegation files agree with both previous and next +# MaxMind entry and say EU for 193.200.150.0-193.200.150.255. +# -KL 2012-11-27 +"193.200.150.0","193.200.150.255","3251148288","3251148543","EU","Europe" + +# US, because previous MaxMind entry 199.96.68.0-199.96.87.127 is US, and +# RIR delegation files say 199.96.80.0-199.96.87.255 is US. +# -KL 2012-11-27 +"199.96.87.128","199.96.87.255","3344979840","3344979967","US","United States" + +# US, because previous MaxMind entry 209.58.176.144-209.59.31.255 is US, +# and RIR delegation files say 209.59.32.0-209.59.63.255 is US. +# -KL 2012-11-27 +"209.59.32.0","209.59.63.255","3510312960","3510321151","US","United States" + +# FR, because previous MaxMind entry 217.15.166.0-217.15.166.255 is FR, +# and RIR delegation files contain a block 217.15.160.0-217.15.175.255 +# which, however, is EU, not FR. But merging with next MaxMind entry +# 217.15.176.0-217.15.191.255 which is KZ and which fully matches what +# the RIR delegation files say seems unlikely to be correct. +# -KL 2012-11-27 +"217.15.167.0","217.15.175.255","3641681664","3641683967","FR","France" +
tor-commits@lists.torproject.org