commit 0d4237839b21b466526a01147538d09c117cc884 Author: Nick Mathewson <nickm@torproject.org> Date: Mon Feb 22 08:30:11 2021 -0500
Rust tool to convert IPFire Location dump into CSV format.
The IPFire people provide a tool that collects data from several top-level sources, combines it into a single database, and annotates it with optional overrides. This tool transforms the "dump" format of their database into the form Tor expects. --- scripts/maint/geoip/README.geoip | 25 ++++ scripts/maint/geoip/geoip-db-tool/.gitignore | 1 + scripts/maint/geoip/geoip-db-tool/Cargo.toml | 16 +++ scripts/maint/geoip/geoip-db-tool/src/db.rs | 126 ++++++++++++++++++++ scripts/maint/geoip/geoip-db-tool/src/main.rs | 165 ++++++++++++++++++++++++++ scripts/maint/geoip/update_geoip.sh | 16 +++ 6 files changed, 349 insertions(+)
diff --git a/scripts/maint/geoip/README.geoip b/scripts/maint/geoip/README.geoip new file mode 100644 index 0000000000..0ed94b2276 --- /dev/null +++ b/scripts/maint/geoip/README.geoip @@ -0,0 +1,25 @@ +To generate new geoip files, you'll need to install the +libloc/"location" tool provided by https://location.ipfire.org/. +I personally build it with: + + ./configure CFLAGS='-g -O2' --disable-perl --without-systemd --prefix=/opt/libloc + make + make install + +Then (after adjusting PATH and PYTHONPATH) you can get the latest +dump with: + + location update + location dump geoip-dump.txt + +And transform it into geoip files with + + cargo run --release -- -i geoip-dump.txt + + +============================== + +Note that the current version "0.1.9" of rangemap has a performance +bug, making this tool quite slow. Previous versions had a +correctness bug that made the output needlessly long. With luck, +there will soon be a fast correct rangemap version. \ No newline at end of file diff --git a/scripts/maint/geoip/geoip-db-tool/.gitignore b/scripts/maint/geoip/geoip-db-tool/.gitignore new file mode 100644 index 0000000000..eb5a316cbd --- /dev/null +++ b/scripts/maint/geoip/geoip-db-tool/.gitignore @@ -0,0 +1 @@ +target diff --git a/scripts/maint/geoip/geoip-db-tool/Cargo.toml b/scripts/maint/geoip/geoip-db-tool/Cargo.toml new file mode 100644 index 0000000000..b08863924a --- /dev/null +++ b/scripts/maint/geoip/geoip-db-tool/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "geoip-db-tool" +version = "0.1.0" +authors = ["Nick Mathewson <nickm@torproject.org>"] +edition = "2018" +license = "MIT OR Apache-2.0" +publish = false + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +ipnetwork= "0.17.0" +rangemap= "0.1.9" +# I use this for now to avoid a performance hit due to a bug on 0.1.9 +# rangemap = {version = "*", path = "/home/nickm/src/rangemap/" } +argh = "0.1.4" diff --git 
a/scripts/maint/geoip/geoip-db-tool/src/db.rs b/scripts/maint/geoip/geoip-db-tool/src/db.rs new file mode 100644 index 0000000000..eaadd4c612 --- /dev/null +++ b/scripts/maint/geoip/geoip-db-tool/src/db.rs @@ -0,0 +1,126 @@ +/// Code to parse a dump file +use std::collections::HashMap; +use std::convert::TryInto; +use std::iter::Peekable; + +use super::NetBlock; + +pub struct BlockReader<I> +where + I: Iterator<Item = std::io::Result<String>>, +{ + iter: Peekable<I>, +} + +enum AnyBlock { + NotNet, + NetBlock(NetBlock), +} + +impl<I> BlockReader<I> +where + I: Iterator<Item = std::io::Result<String>>, +{ + pub fn new(iter: I) -> Self { + BlockReader { + iter: iter.peekable(), + } + } + + /// Extract the initial header from the file. + pub fn extract_header(&mut self) -> String { + let mut res: String = "".to_string(); + + while let Some(Ok(line)) = self.iter.peek() { + if !line.starts_with('#') { + break; + } + res.push_str(line.as_str()); + res.push('\n'); + let _ = self.iter.next(); + } + + res + } + + /// Extract the next empty-line-delimited block from the file. + /// + /// This isn't terribly efficient, but it's "fast enough". + fn get_block(&mut self) -> Option<std::io::Result<AnyBlock>> { + let mut kv = HashMap::new(); + + while let Some(line) = self.iter.next() { + //dbg!(&line); + if let Err(e) = line { + return Some(Err(e)); + } + let line_orig = line.unwrap(); + let line = line_orig.splitn(2, '#').next().unwrap().trim(); + if line.is_empty() { + if kv.is_empty() { + continue; + } else { + break; + } + } + let kwds: Vec<_> = line.splitn(2, ':').collect(); + if kwds.len() != 2 { + return None; // XXXX handle the error better. + } + kv.insert(kwds[0].trim().to_string(), kwds[1].trim().to_string()); + } + + if kv.is_empty() { + return None; + } + + let net = if let Some(net) = kv.get("net") { + net.parse().unwrap() //XXXX handle the error better. 
+ } else { + return Some(Ok(AnyBlock::NotNet)); + }; + + let cc = if let Some(country) = kv.get("country") { + assert!(country.as_bytes().len() == 2); + country.as_bytes()[0..2].try_into().unwrap() + } else { + return Some(Ok(AnyBlock::NotNet)); + }; + + fn is_true(v: Option<&String>) -> bool { + match v { + Some(s) => s == "true", + None => false, + } + } + + let is_anon_proxy = is_true(kv.get("is-anonymous-proxy")); + let is_anycast = is_true(kv.get("is-anycast-proxy")); + let is_satellite = is_true(kv.get("is-satellite-provider")); + + Some(Ok(AnyBlock::NetBlock(NetBlock { + net, + cc, + is_anon_proxy, + is_anycast, + is_satellite, + }))) + } +} + +impl<I> Iterator for BlockReader<I> +where + I: Iterator<Item = std::io::Result<String>>, +{ + type Item = NetBlock; + fn next(&mut self) -> Option<Self::Item> { + loop { + match self.get_block() { + None => return None, + Some(Err(_)) => return None, + Some(Ok(AnyBlock::NotNet)) => continue, + Some(Ok(AnyBlock::NetBlock(n))) => return Some(n), + } + } + } +} diff --git a/scripts/maint/geoip/geoip-db-tool/src/main.rs b/scripts/maint/geoip/geoip-db-tool/src/main.rs new file mode 100644 index 0000000000..38d70f7e1b --- /dev/null +++ b/scripts/maint/geoip/geoip-db-tool/src/main.rs @@ -0,0 +1,165 @@ +/// A basic tool to convert IPFire Location dumps into the CSV formats that Tor +/// expects. +mod db; + +use argh::FromArgs; +use ipnetwork::IpNetwork; +use rangemap::RangeInclusiveMap; + +use std::fs::File; +use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::net::{IpAddr, Ipv6Addr}; +use std::path::{Path, PathBuf}; + +fn default_ipv4_path() -> PathBuf { + "./geoip".into() +} +fn default_ipv6_path() -> PathBuf { + "./geoip6".into() +} + +#[derive(FromArgs)] +/// Convert an IPFire Location dump into CSV geoip files. 
+struct Args { + /// where to store the IPv4 geoip output + #[argh(option, default = "default_ipv4_path()", short = '4')] + output_ipv4: PathBuf, + + /// where to store the IPv6 geoip6 output + #[argh(option, default = "default_ipv6_path()", short = '6')] + output_ipv6: PathBuf, + + /// where to find the dump file + #[argh(option, short = 'i')] + input: PathBuf, +} + +/// Represents a network block from running `location dump`. +#[derive(Debug, Clone)] +pub struct NetBlock { + pub net: IpNetwork, + pub cc: [u8; 2], + pub is_anon_proxy: bool, + pub is_anycast: bool, + pub is_satellite: bool, +} + +impl PartialEq for NetBlock { + fn eq(&self, other: &Self) -> bool { + self.net == other.net + } +} + +/// We define network blocks as being sorted first from largest to smallest, +/// then by address. +impl Ord for NetBlock { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.net + .prefix() + .cmp(&other.net.prefix()) + .then_with(|| self.net.network().cmp(&other.net.network())) + } +} + +impl PartialOrd for NetBlock { + fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { + Some(self.cmp(other)) + } +} + +impl Eq for NetBlock {} + +const PROLOGUE: &str = "\ +# This file has been converted from the IPFire Location database +# using Tor's geoip-db-tool. For more information on the data, see +# https://location.ipfire.org/. +# +# Below is the header from the original export: +# +"; + +/// Read an input file in the `location dump` format, and write CSV ipv4 and ipv6 files. +/// +/// This code tries to be "efficient enough"; most of the logic is handled by +/// using the rangemap crate. +fn convert(input: &Path, output_v4: &Path, output_v6: &Path) -> std::io::Result<()> { + let f = File::open(input)?; + let f = BufReader::new(f); + let mut blocks = Vec::new(); + + let mut reader = db::BlockReader::new(f.lines()); + let hdr = reader.extract_header(); + // Read blocks, and then sort them by specificity and address. 
+ for nb in reader { + blocks.push(nb); + } + blocks.sort(); + + // Convert the sorted blocks into a map from address ranges into + // country codes. + // + // Note that since we have sorted the blocks from least to most specific, + // we will be putting them into the maps in the right order, so that the + // most specific rule "wins". + // + // We use u32 and u128 as the index types for these RangeInclusiveMaps, + // so that we don't need to implement a step function for IpAddr. + let mut v4map: RangeInclusiveMap<u32, [u8; 2], _> = RangeInclusiveMap::new(); + let mut v6map: RangeInclusiveMap<u128, [u8; 2], _> = RangeInclusiveMap::new(); + + let mut n = 0usize; + let num_blocks = blocks.len(); + for nb in blocks { + n += 1; + if n % 100000 == 0 { + println!("{}/{}", n, num_blocks); + } + let start = nb.net.network(); + let end = nb.net.broadcast(); + match (start, end) { + (IpAddr::V4(a), IpAddr::V4(b)) => { + v4map.insert(a.into()..=b.into(), nb.cc); + } + (IpAddr::V6(a), IpAddr::V6(b)) => { + v6map.insert(a.into()..=b.into(), nb.cc); + } + (_, _) => panic!("network started and ended in different families!?"), + } + } + + // Write the ranges out to the appropriate files, in order. + let mut v4 = BufWriter::new(File::create(output_v4)?); + let mut v6 = BufWriter::new(File::create(output_v6)?); + + v4.write_all(PROLOGUE.as_bytes())?; + v4.write_all(hdr.as_bytes())?; + for (r, cc) in v4map.iter() { + let a: u32 = *r.start(); + let b: u32 = *r.end(); + writeln!(&mut v4, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?; + } + + v6.write_all(PROLOGUE.as_bytes())?; + v6.write_all(hdr.as_bytes())?; + for (r, cc) in v6map.iter() { + let a: Ipv6Addr = (*r.start()).into(); + let b: Ipv6Addr = (*r.end()).into(); + writeln!(&mut v6, "{},{},{}", a, b, std::str::from_utf8(cc).unwrap())?; + } + + // The documentation says you should always flush a BufWriter. 
+ v4.flush()?; + v6.flush()?; + + Ok(()) +} + +fn main() -> std::io::Result<()> { + let args: Args = argh::from_env(); + + convert( + args.input.as_path(), + args.output_ipv4.as_path(), + args.output_ipv6.as_path(), + ) +} diff --git a/scripts/maint/geoip/update_geoip.sh b/scripts/maint/geoip/update_geoip.sh new file mode 100755 index 0000000000..9289e7a969 --- /dev/null +++ b/scripts/maint/geoip/update_geoip.sh @@ -0,0 +1,16 @@ +#!/bin/sh + +set -e + +DIR=$(cd "$(dirname "$0")" && pwd) +TMP=$(mktemp -d) + +location update +location dump "$TMP/geoip-dump.txt" + +OLDDIR=$(pwd) +cd "$DIR/geoip-db-tool/" +cargo build --release +cd "$OLDDIR" + +"$DIR/geoip-db-tool/target/release/geoip-db-tool" -i "$TMP/geoip-dump.txt"