commit 869036cceac9468db6daa7e3c9d878b41432a6a7 Author: Karsten Loesing karsten.loesing@gmx.net Date: Mon Jul 1 08:33:39 2013 +0200
Add scripts written by nickm to analyze consensus diff sizes (#7009). --- task-7009/README | 2 + task-7009/build_md_table_2.py | 20 +++++++ task-7009/cmds5 | 65 +++++++++++++++++++++++ task-7009/condiff.py | 91 ++++++++++++++++++++++++++++++++ task-7009/condiff2.py | 115 +++++++++++++++++++++++++++++++++++++++++ task-7009/cy.py | 104 +++++++++++++++++++++++++++++++++++++ task-7009/individually.py | 43 +++++++++++++++ task-7009/md-kludge.py | 56 ++++++++++++++++++++ task-7009/pairwise.py | 86 ++++++++++++++++++++++++++++++ task-7009/wround.py | 39 ++++++++++++++ 10 files changed, 621 insertions(+)
diff --git a/task-7009/README b/task-7009/README new file mode 100644 index 0000000..a5caa39 --- /dev/null +++ b/task-7009/README @@ -0,0 +1,2 @@ +Scripts written by nickm to analyze consensus diff sizes. + diff --git a/task-7009/build_md_table_2.py b/task-7009/build_md_table_2.py new file mode 100755 index 0000000..9ab4051 --- /dev/null +++ b/task-7009/build_md_table_2.py @@ -0,0 +1,20 @@ +#!/usr/bin/python +import sys + +result = {} + +for fname in sys.argv[1:]: + f = open(fname, 'r') + for line in f: + if line.startswith('r '): + lastR = line + if line.startswith('m 8') or (line.startswith('m ') and ",8" in line): + sha = line.split()[2] + assert sha.startswith("sha256=") + descID = lastR.split()[3] + result[descID] = sha + f.close() + +for k,v in result.iteritems(): + print k,v + diff --git a/task-7009/cmds5 b/task-7009/cmds5 new file mode 100644 index 0000000..e2df77a --- /dev/null +++ b/task-7009/cmds5 @@ -0,0 +1,65 @@ + ./individually.py gz */* >> RESULTS5 + ./individually.py bz2 */* >> RESULTS5 + ./individually.py xz */* >> RESULTS5 + ./individually.py uncompressed */* >> RESULTS5 + + ./pairwise.py diff_gz 1 */* >> RESULTS5 + ./pairwise.py diff_gz 2 */* >> RESULTS5 + ./pairwise.py diff_gz 4 */* >> RESULTS5 + ./pairwise.py diff_gz 6 */* >> RESULTS5 + ./pairwise.py diff_gz 8 */* >> RESULTS5 + + ./pairwise.py diff_bz2 1 */* >> RESULTS5 + ./pairwise.py diff_bz2 2 */* >> RESULTS5 + ./pairwise.py diff_bz2 4 */* >> RESULTS5 + ./pairwise.py diff_bz2 6 */* >> RESULTS5 + ./pairwise.py diff_bz2 8 */* >> RESULTS5 + + ./pairwise.py diff_u_gz 1 */* >> RESULTS5 + ./pairwise.py diff_u_gz 2 */* >> RESULTS5 + ./pairwise.py diff_u_gz 4 */* >> RESULTS5 + ./pairwise.py diff_u_gz 6 */* >> RESULTS5 + ./pairwise.py diff_u_gz 8 */* >> RESULTS5 + + ./pairwise.py diff_u_bz2 1 */* >> RESULTS5 + ./pairwise.py diff_u_bz2 2 */* >> RESULTS5 + ./pairwise.py diff_u_bz2 4 */* >> RESULTS5 + ./pairwise.py diff_u_bz2 6 */* >> RESULTS5 + ./pairwise.py diff_u_bz2 8 */* >> RESULTS5 + + ./pairwise.py diff_e_gz 1 */* >> RESULTS5 + ./pairwise.py diff_e_gz 2 */* >> RESULTS5 + ./pairwise.py diff_e_gz 4 */* >> RESULTS5 + ./pairwise.py diff_e_gz 6 */* >> RESULTS5 + ./pairwise.py diff_e_gz 8 */* >> RESULTS5 + + ./pairwise.py diff_e_bz2 1 */* >> RESULTS5 + ./pairwise.py diff_e_bz2 2 */* >> RESULTS5 + ./pairwise.py diff_e_bz2 4 */* >> RESULTS5 + ./pairwise.py diff_e_bz2 6 */* >> RESULTS5 + ./pairwise.py diff_e_bz2 8 */* >> RESULTS5 + + ./pairwise.py condiff_gz 1 */* >> RESULTS5 + ./pairwise.py condiff_gz 2 */* >> RESULTS5 + ./pairwise.py condiff_gz 4 */* >> RESULTS5 + ./pairwise.py condiff_gz 6 */* >> RESULTS5 + ./pairwise.py condiff_gz 8 */* >> RESULTS5 + + ./pairwise.py condiff_bz2 1 */* >> RESULTS5 + ./pairwise.py condiff_bz2 2 */* >> RESULTS5 + ./pairwise.py condiff_bz2 4 */* >> RESULTS5 + ./pairwise.py condiff_bz2 6 */* >> RESULTS5 + ./pairwise.py condiff_bz2 8 */* >> RESULTS5 + + ./pairwise.py condiff2_gz 1 */* >> RESULTS5 + ./pairwise.py condiff2_gz 2 */* >> RESULTS5 + ./pairwise.py condiff2_gz 4 */* >> RESULTS5 + ./pairwise.py condiff2_gz 6 */* >> RESULTS5 + ./pairwise.py condiff2_gz 8 */* >> RESULTS5 + + ./pairwise.py condiff2_bz2 1 */* >> RESULTS5 + ./pairwise.py condiff2_bz2 2 */* >> RESULTS5 + ./pairwise.py condiff2_bz2 4 */* >> RESULTS5 + ./pairwise.py condiff2_bz2 6 */* >> RESULTS5 + ./pairwise.py condiff2_bz2 8 */* >> RESULTS5 + diff --git a/task-7009/condiff.py b/task-7009/condiff.py new file mode 100755 index 0000000..23c8a75 --- /dev/null +++ b/task-7009/condiff.py @@ -0,0 +1,91 @@ +#!/usr/bin/python +import sys + +class Router: + def __init__(self, r_line): + assert r_line.startswith("r ") + self.ident = r_line.split()[2] + self.lines = [ r_line ] + def append(self, line): + self.lines.append(line) + +def splitfile(f): + header, body, footer = [], [], [] + inHeader = True + inBody = False + inFooter = False + for line in f.readlines(): + if inHeader and line.startswith("r "): + inBody = True + inHeader = False + curRouter = None + if inBody and line.startswith("directory-footer"): + inFooter = True + inBody = False + + if inHeader: + header.append(line) + + if inBody: + if line.startswith("r "): + curRouter = Router(line) + body.append(curRouter) + else: + curRouter.append(line) + + if inFooter: + footer.append(line) + + assert inFooter + return header, body, footer + + +def main(f1, f2): + _, body1, _ = splitfile(f1) + header2, body2, footer2 = splitfile(f2) + assert footer2 + + for h in header2: + sys.stdout.write(h) + + while body1 and body2: + if body1[0].ident < body2[0].ident: + print "-" + del body1[0] + elif body1[0].ident > body2[0].ident: + sys.stdout.write("* ") + for b in body2[0].lines: + sys.stdout.write(b) + del body2[0] + else: # same router + if body1[0].lines != body2[0].lines: + if len(body1[0].lines) != len(body2[0].lines): + print >>sys.stderr, "<<%s>><<%s>>"%(body1[0].lines, body2[0].lines) + sys.stdout.write("** ") + for b in body2[0].lines: + sys.stdout.write(b) + del body2[0] + del body1[0] + continue + + if body1[0].lines[0] == body2[0].lines[0]: + print "." + else: + sys.stdout.write(body2[0].lines[0]) + for l1,l2 in zip(body1[0].lines[1:], body2[0].lines[1:]): + if l1 != l2: sys.stdout.write(l2) + del body1[0] + del body2[0] + + for r in body2: + sys.stdout.write("* ") + for l in r.lines: + sys.stdout.write(l) + + for f in footer2: + sys.stdout.write(f) + +f1 = open(sys.argv[1]) +f2 = open(sys.argv[2]) + +main(f1,f2) diff --git a/task-7009/condiff2.py b/task-7009/condiff2.py new file mode 100755 index 0000000..c93ab9f --- /dev/null +++ b/task-7009/condiff2.py @@ -0,0 +1,115 @@ +#!/usr/bin/python +import sys + +class Router: + def __init__(self, r_line): + assert r_line.startswith("r ") + self.ident = r_line.split()[2] + self.lines = [ r_line ] + def append(self, line): + self.lines.append(line) + + +def rdiff(r1,r2): + r1 = r1.split() + r2 = r2.split() + p1 = ["r"] + p2 = [] + for i in xrange(len(r1)): + if r1[i] == r2[i]: + continue + p1.append(str(i)) + p2.append(r2[i]) + return "".join(p1) + " " + " ".join(p2) + +def sdiff(s1, s2): + s1 = set(s1.split()[1:]) + s2 = set(s2.split()[1:]) + minus = sorted(("-%s"%item) for item in s1 if item not in s2) + plus = sorted(("+%s"%item) for item in s2 if item not in s1) + return " ".join(["s"] + minus + plus) + +def splitfile(f): + header, body, footer = [], [], [] + inHeader = True + inBody = False + inFooter = False + for line in f.readlines(): + if inHeader and line.startswith("r "): + inBody = True + inHeader = False + curRouter = None + if inBody and line.startswith("directory-footer"): + inFooter = True + inBody = False + + if inHeader: + header.append(line) + + if inBody: + if line.startswith("r "): + curRouter = Router(line) + body.append(curRouter) + else: + curRouter.append(line) + + if inFooter: + footer.append(line) + + assert inFooter + return header, body, footer + + +def main(f1, f2): + _, body1, _ = splitfile(f1) + header2, body2, footer2 = splitfile(f2) + assert footer2 + + for h in header2: + sys.stdout.write(h) + + while body1 and body2: + if body1[0].ident < body2[0].ident: + print "-" + del body1[0] + elif body1[0].ident > body2[0].ident: + sys.stdout.write("* ") + for b in body2[0].lines: + sys.stdout.write(b) + del body2[0] + else: # same router + if body1[0].lines != body2[0].lines: + if len(body1[0].lines) != len(body2[0].lines): + print >>sys.stderr, "<<%s>><<%s>>"%(body1[0].lines, body2[0].lines) + sys.stdout.write("** ") + for b in body2[0].lines: + sys.stdout.write(b) + del body2[0] + del body1[0] + continue + + if body1[0].lines[0] == body2[0].lines[0]: + print "." + else: + print rdiff(body1[0].lines[0],body2[0].lines[0]) + for l1,l2 in zip(body1[0].lines[1:], body2[0].lines[1:]): + if l1 != l2: + if l2.startswith('s ') and l1.startswith('s '): + print sdiff(l1, l2) + else: + sys.stdout.write(l2) + del body1[0] + del body2[0] + + for r in body2: + sys.stdout.write("* ") + for l in r.lines: + sys.stdout.write(l) + + for f in footer2: + sys.stdout.write(f) + +f1 = open(sys.argv[1]) +f2 = open(sys.argv[2]) + +main(f1,f2) diff --git a/task-7009/cy.py b/task-7009/cy.py new file mode 100755 index 0000000..940d917 --- /dev/null +++ b/task-7009/cy.py @@ -0,0 +1,104 @@ +#!/usr/bin/python +import sys + +class Router: + def __init__(self, r_line): + assert r_line.startswith("r ") + self.ident = r_line.split()[2] + self.lines = [ r_line ] + def append(self, line): + self.lines.append(line) + + +def rdiff(r1,r2): + r1 = r1.split() + r2 = r2.split() + p1 = ["r"] + p2 = [] + for i in xrange(len(r1)): + if r1[i] == r2[i]: + continue + p1.append(str(i)) + p2.append(r2[i]) + return "".join(p1) + " " + " ".join(p2) + +def splitfile(f): + header, body, footer = [], [], [] + inHeader = True + inBody = False + inFooter = False + for line in f.readlines(): + if inHeader and line.startswith("r "): + inBody = True + inHeader = False + curRouter = None + if inBody and line.startswith("directory-footer"): + inFooter = True + inBody = False + + if inHeader: + header.append(line) + + if inBody: + if line.startswith("r "): + curRouter = Router(line) + body.append(curRouter) + else: + curRouter.append(line) + + if inFooter: + footer.append(line) + + assert inFooter + return header, body, footer + + +def main(f1, f2): + _, body1, _ = splitfile(f1) + header2, body2, footer2 = splitfile(f2) + assert footer2 + + for h in header2: + sys.stdout.write(h) + + while body1 and body2: + if body1[0].ident < body2[0].ident: + print "-" + del body1[0] + elif body1[0].ident > body2[0].ident: + sys.stdout.write("* ") + for b in body2[0].lines: + sys.stdout.write(b) + del body2[0] + else: # same router + if body1[0].lines != body2[0].lines: + if len(body1[0].lines) != len(body2[0].lines): + print >>sys.stderr, "<<%s>><<%s>>"%(body1[0].lines, body2[0].lines) + sys.stdout.write("** ") + for b in body2[0].lines: + sys.stdout.write(b) + del body2[0] + del body1[0] + continue + + if body1[0].lines[0] == body2[0].lines[0]: + print "." + else: + print rdiff(body1[0].lines[0],body2[0].lines[0]) + for l1,l2 in zip(body1[0].lines[1:], body2[0].lines[1:]): + if l1 != l2: sys.stdout.write(l2) + del body1[0] + del body2[0] + + for r in body2: + sys.stdout.write("* ") + for l in r.lines: + sys.stdout.write(l) + + for f in footer2: + sys.stdout.write(f) + +f1 = open(sys.argv[1]) +f2 = open(sys.argv[2]) + +main(f1,f2) diff --git a/task-7009/individually.py b/task-7009/individually.py new file mode 100755 index 0000000..0868ef6 --- /dev/null +++ b/task-7009/individually.py @@ -0,0 +1,43 @@ +#!/usr/bin/python + +import os +import sys +import subprocess + +def uncompressed(fname): + return os.stat(fname).st_size + +def gz(fname): + try: + os.unlink("out.tmp") + except OSError, e: + pass + os.system("gzip -c -9 %s > out.tmp" % fname) + return os.stat("out.tmp").st_size + +def bz2(fname): + os.unlink("out.tmp") + os.system("bzip2 -c -9 %s > out.tmp" % fname) + return os.stat("out.tmp").st_size + +def xz(fname): + os.unlink("out.tmp") + os.system("xz -c -9 %s > out.tmp" %fname) + return os.stat("out.tmp").st_size + +func = globals()[sys.argv[1]] + +allvals = [] +total = 0L +N = 0 + +for fname in sys.argv[2:]: + n = func(fname) + N += 1 + total += n + allvals.append(n) + +allvals.sort() +print "%s: mean %s. median %s"%(sys.argv[1], total//N, allvals[N//2]) + + diff --git a/task-7009/md-kludge.py b/task-7009/md-kludge.py new file mode 100755 index 0000000..6f18c8b --- /dev/null +++ b/task-7009/md-kludge.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import sys +import re +import os + +def read_table(fn): + p = re.compile(r'^([^ ]*) sha256=([^ \n]*)') + bad = 0 + t = {} + for line in open(fn): + m = p.match(line) + if not m: + bad += 1 + continue + t[m.group(1)] = m.group(2) + print bad, "bad entries in", fn + return t + + +def process(fn, t): + tmp = fn+".tmp" + inp = open(fn, 'r') + out = open(tmp, 'w') + h = m = 0 + for line in inp: + if line.startswith('r '): + r = line.split() + desc_id = r[3] + del r[3] + print >>out, " ".join(r) + try: + md_id = t[desc_id] + h += 1 + except KeyError: + md_id = desc_id #kluuuuuuuuudge!!!!! + m += 1 + print >>out, "m",md_id + else: + out.write(line) + inp.close() + out.close() + os.rename(tmp, fn) + return h, m + +table = read_table("table.txt") + +hit = 0 +miss = 0 + +for fn in sys.argv[1:]: + h,m = process(fn, table) + hit += h + miss += m + +print hit, miss diff --git a/task-7009/pairwise.py b/task-7009/pairwise.py new file mode 100755 index 0000000..30a498d --- /dev/null +++ b/task-7009/pairwise.py @@ -0,0 +1,86 @@ +#!/usr/bin/python + +import os +import sys +import subprocess + +def echo(a,b): + print a, b + return 0 + +def diff_gz(fn, fn2): + os.unlink("out.tmp") + os.system("diff %s %s | gzip -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size +def diff_u_gz(fn, fn2): + os.unlink("out.tmp") + os.system("diff -u %s %s | gzip -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size +def diff_e_gz(fn, fn2): + os.unlink("out.tmp") + os.system("diff -e %s %s | gzip -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size + +def diff_bz2(fn, fn2): + os.unlink("out.tmp") + os.system("diff %s %s | bzip2 -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size +def diff_u_bz2(fn, fn2): + os.unlink("out.tmp") + os.system("diff -u %s %s | bzip2 -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size +def diff_e_bz2(fn, fn2): + os.unlink("out.tmp") + os.system("diff -e %s %s | bzip2 -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size + +def condiff_gz(fn, fn2): + os.unlink("out.tmp") + os.system("./condiff.py %s %s | gzip -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size +def condiff_bz2(fn, fn2): + os.unlink("out.tmp") + os.system("./condiff.py %s %s | bzip2 -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size + +def condiff2_gz(fn, fn2): + os.unlink("out.tmp") + os.system("./condiff2.py %s %s | gzip -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size +def condiff2_bz2(fn, fn2): + os.unlink("out.tmp") + os.system("./condiff2.py %s %s | bzip2 -9 -c > out.tmp" % (fn, fn2)) + return os.stat("out.tmp").st_size + +f = open("out.tmp", 'w') +f.write("xyz") +f.close() + +func = globals()[sys.argv[1]] + +allvals = [] +total = 0L +N = 0 + +lag = int(sys.argv[2]) + +def pairwise(it): + it = iter(it) + lastv = [] + for i in xrange(lag): + lastv.append(it.next()) + for v in it: + yield lastv[0], v + lastv.append(v) + del lastv[0] + +for fname, fname2 in pairwise(sys.argv[3:]): + n = func(fname, fname2) + N += 1 + total += n + allvals.append(n) + +allvals.sort() +print "%s: lag %s: mean %s. median %s"%(sys.argv[1], lag, total//N, allvals[N//2]) + + diff --git a/task-7009/wround.py b/task-7009/wround.py new file mode 100755 index 0000000..3444950 --- /dev/null +++ b/task-7009/wround.py @@ -0,0 +1,39 @@ +#!/usr/bin/python +import sys +import re +import os +def wround(v): + if v < 8: + return 8 + elif v < 128: + downto = 1 + elif v < 1024: + downto = 3 + else: + downto = 7 + v_orig = v + shift = 0 + while v > downto: + v >>= 1 + shift += 1 + return v << shift + +p = re.compile(r'w Bandwidth=(\d+)(.*)') + +def rewrite(fname): + inp = open(fname) + out = open(fname+".tmp", 'w') + for line in inp: + m = p.match(line) + if not m: + out.write(line) + continue + v = wround(int(m.group(1))) + out.write("w Bandwidth=%s%s\n"%(v,m.group(2).rstrip())) + out.close() + inp.close() + os.rename(fname+".tmp", fname) + +for fn in sys.argv[1:]: + rewrite(fn) +