commit cbba7ec70bf5cd6896cab57088164d4d90977e71 Author: Karsten Loesing karsten.loesing@gmx.net Date: Wed Aug 14 18:44:28 2013 +0200
MaxMind's GeoIP files use ISO-8859-1, not UTF-8.
Without specifying a file encoding, Java uses the system default, which may be UTF-8 on Linux systems. But MaxMind's files all use ISO-8859-1. The effect is that our details files may contain incorrect lines like:
"as_name":"Servi\uFFFDos de Comunica\uFFFD\uFFFDo S.A.",
which should be:
"as_name":"Servi\u00E7os de Comunica\u00E7\u00E3o S.A.", --- src/org/torproject/onionoo/LookupService.java | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/src/org/torproject/onionoo/LookupService.java b/src/org/torproject/onionoo/LookupService.java index 3791443..6fc2bf6 100644 --- a/src/org/torproject/onionoo/LookupService.java +++ b/src/org/torproject/onionoo/LookupService.java @@ -4,8 +4,9 @@ package org.torproject.onionoo;
import java.io.BufferedReader; import java.io.File; -import java.io.FileReader; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStreamReader; import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -118,8 +119,8 @@ public class LookupService { SortedSet<Long> sortedAddressNumbers = new TreeSet<Long>( addressStringNumbers.values()); long firstAddressNumber = sortedAddressNumbers.first(); - BufferedReader br = new BufferedReader(new FileReader( - geoLiteCityBlocksCsvFile)); + BufferedReader br = new BufferedReader(new InputStreamReader( + new FileInputStream(geoLiteCityBlocksCsvFile), "ISO-8859-1")); String line; long previousStartIpNum = -1L; while ((line = br.readLine()) != null) { @@ -187,8 +188,8 @@ public class LookupService { try { Set<Long> blockNumbers = new HashSet<Long>( addressNumberBlocks.values()); - BufferedReader br = new BufferedReader(new FileReader( - geoLiteCityLocationCsvFile)); + BufferedReader br = new BufferedReader(new InputStreamReader( + new FileInputStream(geoLiteCityLocationCsvFile), "ISO-8859-1")); String line; while ((line = br.readLine()) != null) { if (line.startsWith("C") || line.startsWith("l")) { @@ -225,8 +226,8 @@ public class LookupService { /* Read country names to memory. */ Map<String, String> countryNames = new HashMap<String, String>(); try { - BufferedReader br = new BufferedReader(new FileReader( - iso3166CsvFile)); + BufferedReader br = new BufferedReader(new InputStreamReader( + new FileInputStream(iso3166CsvFile), "ISO-8859-1")); String line; while ((line = br.readLine()) != null) { String[] parts = line.replaceAll(""", "").split(",", 2); @@ -248,8 +249,8 @@ public class LookupService { /* Read region names to memory. 
*/ Map<String, String> regionNames = new HashMap<String, String>(); try { - BufferedReader br = new BufferedReader(new FileReader( - regionCsvFile)); + BufferedReader br = new BufferedReader(new InputStreamReader( + new FileInputStream(regionCsvFile), "ISO-8859-1")); String line; while ((line = br.readLine()) != null) { String[] parts = line.replaceAll(""", "").split(",", 3); @@ -275,8 +276,8 @@ public class LookupService { SortedSet<Long> sortedAddressNumbers = new TreeSet<Long>( addressStringNumbers.values()); long firstAddressNumber = sortedAddressNumbers.first(); - BufferedReader br = new BufferedReader(new FileReader( - geoIPASNum2CsvFile)); + BufferedReader br = new BufferedReader(new InputStreamReader( + new FileInputStream(geoIPASNum2CsvFile), "ISO-8859-1")); String line; long previousStartIpNum = -1L; while ((line = br.readLine()) != null) {