commit 704c053fbae93353976e3f7abf585b6283edb3a9
Author: Zack Weinberg <zackw@panix.com>
Date:   Sun Feb 12 20:36:56 2012 -0800
    Rough draft benchmarking tools.
---
 scripts/benchmark-plot.R   |    7 +
 scripts/benchmark.py       |  410 +++++++++++++++++++++++++++++++++++
 scripts/bm-fixedrate-cgi.c |  158 ++++++++++++++
 scripts/bm-genfiles.py     |  162 ++++++++++++++
 scripts/bm-mcurl.c         |  196 +++++++++++++++++
 scripts/tool_urlglob.c     |  516 ++++++++++++++++++++++++++++++++++++++++++++
 scripts/tool_urlglob.h     |   69 ++++++
 7 files changed, 1518 insertions(+), 0 deletions(-)
diff --git a/scripts/benchmark-plot.R b/scripts/benchmark-plot.R new file mode 100644 index 0000000..a58ae0a --- /dev/null +++ b/scripts/benchmark-plot.R @@ -0,0 +1,7 @@ +#! /usr/bin/Rscript + +suppressPackageStartupMessages({ + library(ggplot2) +}) + +lf.direct <- read.csv("bench-lf-direct.tab", header=TRUE) diff --git a/scripts/benchmark.py b/scripts/benchmark.py new file mode 100755 index 0000000..c6487b6 --- /dev/null +++ b/scripts/benchmark.py @@ -0,0 +1,410 @@ +#! /usr/bin/python + +# Stegotorus benchmarking script. +# Several different computers are involved: +# +# - the "client" is the machine you run this script on; the workload +# generator will run there, as will the StegoTorus and Tor clients. +# +# - the "proxy" is a machine that you can ssh to with no password. +# It will run the StegoTorus and Tor bridge servers. +# +# - the "target" is the HTTP server that will be contacted in various ways. +# +# bm-genfiles.py must have been run on this server to create file +# trees named 'fixed' and 'pareto' which appear as direct children +# of the root URL. bm-fixedrate-cgi.c must have been compiled for +# that server and appear as /bm-fixedrate.cgi. +# +# Software you need on the client machine: +# +# bwm-ng: http://www.gropp.org/?id=projects&sub=bwm-ng +# curl: http://curl.haxx.se/ +# httperf: http://www.hpl.hp.com/research/linux/httperf/ +# tsocks: http://tsocks.sourceforge.net/about.php +# tor: https://torproject.org/ +# stegotorus: you already have it :) +# +# Software you need on the proxy machine: +# +# nylon: http://monkey.org/~marius/pages/?page=nylon +# tor, stegotorus +# +# You configure this script by setting variables below. + +# Client host + +CLIENT_IP = "99.113.33.155" +CLIENT_IFACE = "eth0" + +# Proxy host + +PROXY = "sandbox03.sv.cmu.edu" +PROXY_IP = "209.129.244.30" # some things won't do DNS for this +PROXY_PORT = "1080" +PROXY_SSH_CMD = ("ssh", PROXY) + +# Target + +TARGET = "storustest.nfshost.com" + +# Fudge factors. For some reason, bm-fixedrate generates data a +# linear factor slower than it was meant to; this is the quick fix. + +FUDGE_FIXEDRATE = 2.5 + +# Programs we need to run. Change these if any binary is not in the +# default path or hasn't got the default name. +# C_ - for the client. P_ - for the proxy. +# You can NOT specify arguments here - if you need to do any +# setup, write a wrapper script. 
+ +C_bwm = "bwm-ng" +C_curl = "curl" +C_httperf = "httperf" +C_storus = "stegotorus-wrapper" +C_tor = "/usr/sbin/tor" +C_tsocks = "/usr/lib/libtsocks.so" + +P_nylon = "nylon" +P_storus = "stegotorus-wrapper" +P_tor = "tor" +P_python = "/usr/local/bin/python" # this must be an absolute path, + # it goes on a shebang line + +# ACTUAL PROGRAM STARTS HERE + +from types import MethodType +import os +import os.path +import pickle +import subprocess +import sys +import time + +def monitor(report, label, period): + """Monitor network utilization (bytes/sec up and down) for a + period of PERIOD seconds, writing the report to REPORT, labeling + each line with LABEL.""" + + bwm = subprocess.Popen((C_bwm, "-o", "csv", "-c", str(period), "-t", "1000", + "-u", "bytes", "-T", "rate", "-I", CLIENT_IFACE), + stdout=subprocess.PIPE, + universal_newlines=True) + try: + n = 1 + for line in bwm.stdout: + (stamp, iface, upbytes, dnbytes, rest) = line.split(';', 4) + if iface == 'total': continue + + # convert to most compact possible form + upbytes = str(float(upbytes)) + dnbytes = str(float(dnbytes)) + + report.write("%s,%d,%s,%s\n" % (label,n,upbytes,dnbytes)) + n += 1 + except: + bwm.terminate() + raise + finally: + bwm.wait() + +class ProxyProcess(object): + """A process running on the proxy host. It has a command line and + an optional config file. It is not expected to produce any output + (if it does, it will get dumped to this script's stdout/stderr) or + require any input (input is redirected from /dev/null). It is + expected to run until it is killed.""" + + @staticmethod + def prepare_remote(): + remote_driver=r"""#! %s +import pickle +import signal +import subprocess +import sys +import traceback + +wrote_rpid = False + +# Remote driver for proxy processes. +try: + data = pickle.load(sys.stdin) + sys.stdin.close() + if data['cfgname']: + f = open(data['cfgname'], "w") + f.write(data['cfgdata']) + f.close() + proc = subprocess.Popen(data['args'], stdin=open("/dev/null", "r"), + stdout=2) # redirect child stdout to our stderr + sys.stdout.write(str(proc.pid) + "\n") + wrote_rpid = True + sys.stdout.close() + proc.wait() + + # the process being killed by SIGTERM is normal + if proc.returncode != 0 and proc.returncode != -signal.SIGTERM: + raise subprocess.CalledProcessError(proc.returncode, data['args'][0]) +except: + traceback.print_exc() + if not wrote_rpid: sys.stdout.write("X\n") + sys.exit(1) + +sys.exit(0) +""" % P_python + remote_setup=r"""newdriver=`mktemp ./driver.py.XXXXXX` || exit 1 +cat > "$newdriver" +if cmp -s "$newdriver" driver.py +then rm -f "$newdriver" +else set -e; mv -f "$newdriver" driver.py; chmod +x driver.py +fi +""" + prep_worker = subprocess.Popen(PROXY_SSH_CMD + (remote_setup,), + stdin=subprocess.PIPE, + stdout=2) + prep_worker.communicate(remote_driver) + if prep_worker.returncode != 0: + raise subprocess.CalledProcessError(prep_worker.returncode, + 'remote_setup script') + + def __init__(self, args, cfgname=None, cfgdata=None): + if ((cfgname is None or cfgdata is None) and + (cfgname is not None or cfgdata is not None)): + raise TypeError("either both or neither of cfgname and cfgdata" + " must be specified") + + self._rpid = "X" + + ProxyProcess.prepare_remote() + self._proc = subprocess.Popen(PROXY_SSH_CMD + ("./driver.py",), + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + close_fds=True) + pickle.dump({ 'args' : args, + 'cfgname' : cfgname, + 'cfgdata' : cfgdata }, + self._proc.stdin, 2) + self._proc.stdin.close() + self._rpid = 
self._proc.stdout.readline().strip() + if self._rpid == "X" or self._rpid == "": + self._rpid = "X" + self._proc.wait() + raise RuntimeError("failed to execute '%s' on proxy host" + % " ".join(args)) + + def terminate(self): + if self._rpid == "X": return + subprocess.check_call(PROXY_SSH_CMD + ("kill", self._rpid)) + + def kill(self): + if self._rpid == "X": return + subprocess.check_call(PROXY_SSH_CMD + ("kill", "-9", self._rpid)) + + # forward everything else to _proc; logic copied verbatim from + # http://code.activestate.com/recipes/519639- + # true-lieberman-style-delegation-in-python/ + def __getattr__(self, aname): + target = self._proc + f = getattr(target, aname) + if isinstance(f, MethodType): + return MethodType(f.im_func, self, target.__class__) + else: + return f + +# Individual proxy-side test runners. +def p_nylon(): + return ProxyProcess((P_nylon, "-f", "-c", "nylon.conf"), + "nylon.conf", + """\ +[General] +No-Simultaneous-Conn=10 +Log=0 +Verbose=0 +PIDfile=nylon.pid + +[Server] +Port=%s +Allow-IP=%s/32 +""" % (PROXY_PORT, CLIENT_IP)) + +def p_tor_direct(): + return ProxyProcess((P_tor, "--quiet", "-f", "tor-direct.conf"), + "tor-direct.conf", + """\ +ORPort %s +SocksPort 0 +BridgeRelay 1 +PublishServerDescriptor 0 +ExitPolicy reject *:* +DataDirectory . +Log err stderr +# unfortunately there doesn't seem to be any way to tell Tor to accept +# OR connections from specific IP addresses only. +""" % PROXY_PORT) + +class ClientProcess(subprocess.Popen): + """A process running on the local machine. This is probably doing + the meat of the work of some benchmark. Basically a shim around + subprocess.Popen to fix constructor arguments.""" + + def __init__(self, argv, envp=None): + if envp is not None: + env = os.environ.copy() + env.update(envp) + subprocess.Popen.__init__(self, argv, + stdin=open("/dev/null", "r"), + stdout=open("/dev/null", "w"), + stderr=subprocess.STDOUT, env=env) + else: + subprocess.Popen.__init__(self, argv, + stdin=open("/dev/null", "r"), + stdout=2) + +def c_tor_direct(): + fp = open("tor-direct-client.conf", "w") + fp.write("""\ +ORPort 0 +SocksPort %s +DataDirectory . +Log err stderr +Bridge %s:%s +UseBridges 1 +SafeSocks 0 +""" % (PROXY_PORT, PROXY_IP, PROXY_PORT)) + fp.close() + return ClientProcess((C_tor, "--quiet", "-f", "tor-direct-client.conf")) + +def c_curl(url, proxyhost): + return ClientProcess((C_curl, "-s", "--socks5-hostname", + proxyhost + ":" + PROXY_PORT, + url, "-o", "/dev/null")) + +def c_httperf(prefix, rate, proxyhost): + fp = open("tsocks.conf", "w") + fp.write("""\ +server = %s +local = %s/255.255.255.255 +server_port = %s +server_type = 5 +""" % (proxyhost, proxyhost, PROXY_PORT)) + fp.close() + return ClientProcess((C_httperf, "--hog", + "--server=" + TARGET, + "--uri=" + prefix, + "--period=" + str(rate), + "--num-calls=5", "--num-conns=2000", + "--wset=10000,1"), + { 'LD_PRELOAD' : C_tsocks, + 'TSOCKS_CONF_FILE' : + os.path.join(os.getcwd(), "tsocks.conf") }) + +# Benchmarks. 
+ +def bench_fixedrate_direct(report): + client = None + proxy = None + try: + proxy = p_nylon() + + for cap in range(10, 810, 10): + sys.stderr.write("fixedrate,direct,%d\n" % (cap * 1000)) + try: + client = c_curl('http://' + TARGET + '/bm-fixedrate.cgi/' + + str(int(cap * 1000 * FUDGE_FIXEDRATE)), + PROXY) + monitor(report, "fixedrate,direct,%d" % (cap * 1000), 60) + finally: + if client is not None: + client.terminate() + client.wait() + client = None + finally: + if proxy is not None: + proxy.terminate() + proxy.wait() + +def bench_fixedrate_tor(report): + client = None + proxy = None + proxyl = None + try: + proxy = p_tor_direct() + proxyl = c_tor_direct() + time.sleep(5) # tor startup is slow + + for cap in range(10,810,10): + sys.stderr.write("fixedrate,tor,%d\n" % (cap * 1000)) + try: + client = c_curl('http://' + TARGET + '/bm-fixedrate.cgi/' + + str(int(cap * 1000 * FUDGE_FIXEDRATE)), + '127.0.0.1') + monitor(report, "fixedrate,tor,%d" % (cap * 1000), 60) + finally: + if client is not None: + client.terminate() + client.wait() + client = None + finally: + if proxy is not None: + proxy.terminate() + proxy.wait() + if proxyl is not None: + proxyl.terminate() + proxyl.wait() + +def bench_files_direct(report, prefix): + client = None + proxy = None + try: + proxy = p_nylon() + + for cps in range(1,81): + sys.stderr.write("files.%s,direct,%d\n" % (prefix, cps)) + try: + client = c_httperf(prefix, 1./cps, PROXY_IP) + monitor(report, "files.%s,direct,%d" % (prefix, cps), 60) + finally: + if client is not None: + client.terminate() + client.wait() + client = None + finally: + if proxy is not None: + proxy.terminate() + proxy.wait() + +def bench_files_tor(report, prefix): + client = None + proxy = None + proxyl = None + try: + proxy = p_tor_direct() + proxyl = c_tor_direct() + time.sleep(5) # tor startup is slow + + for cps in range(1,81): + sys.stderr.write("files.%s,tor,%d\n" % (prefix, cps)) + try: + client = c_httperf(prefix, 1./cps, '127.0.0.1') + monitor(report, "files.%s,tor,%d" % (prefix, cps), 60) + finally: + if client is not None: + client.terminate() + client.wait() + client = None + finally: + if proxy is not None: + proxy.terminate() + proxy.wait() + if proxyl is not None: + proxyl.terminate() + proxyl.wait() + +if __name__ == '__main__': + sys.stdout.write("benchmark,relay,cap,obs,up,down\n") + bench_fixedrate_direct(sys.stdout) + bench_fixedrate_tor(sys.stdout) + bench_files_direct(sys.stdout, "fixed") + bench_files_tor(sys.stdout, "fixed") + bench_files_direct(sys.stdout, "pareto") + bench_files_tor(sys.stdout, "pareto") diff --git a/scripts/bm-fixedrate-cgi.c b/scripts/bm-fixedrate-cgi.c new file mode 100644 index 0000000..2b48f98 --- /dev/null +++ b/scripts/bm-fixedrate-cgi.c @@ -0,0 +1,158 @@ +#define _XOPEN_SOURCE 600 +#define _POSIX_C_SOURCE 200112 + +#include <stdbool.h> +#include <stddef.h> + +#include <errno.h> +#include <math.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +/* 1400 bytes is a safe figure for per-packet transmissible payload. 
*/ +#define BLOCKSZ 1400 + + +#if __GNUC__ >= 3 +#define NORETURN void __attribute__((noreturn)) +#else +#define NORETURN void +#endif + +extern char **environ; + +static NORETURN +error_400(const char *msg) +{ + char **p; + printf("Status: 400 Bad Request\nContent-Type: text/plain\n\n" + "400 Bad Request (%s)\nCGI environment dump follows:\n\n", msg); + for (p = environ; *p; p++) + puts(*p); + exit(0); +} + +static NORETURN +error_500(const char *syscall) +{ + printf("Status: 500 Internal Server Error\nContent-Type:text/plain\n\n" + "500 Internal Server Error: %s: %s\n", + syscall, strerror(errno)); + exit(0); +} + +static void +generate(unsigned long rate, bool dryrun) +{ + double interval; + timer_t timerid; + struct sigevent sev; + struct itimerspec its; + sigset_t mask; + int sig; + char *data; + size_t bufsz = BLOCKSZ; + + /* You send data at R bytes per second in 1400-byte blocks by + calling write() every 1/(R/1400) second. However, despite our + use of the high-resolution interval timers, we cannot count on + being scheduled more often than every 1/CLOCKS_PER_SEC seconds, + so if we need to send data faster than that, bump up the block + size instead. */ + interval = 1./(rate/(double)BLOCKSZ); + + if (interval < 1./CLOCKS_PER_SEC) { + interval = 1./CLOCKS_PER_SEC; + bufsz = rate / CLOCKS_PER_SEC; + } + + its.it_value.tv_sec = lrint(floor(interval)); + its.it_value.tv_nsec = lrint((interval - its.it_value.tv_sec) * 1e9); + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + if (dryrun) { + printf("Content-Type: text/plain\n\n" + "Goal %lu bytes per second:\n" + "would send %lu bytes every %f seconds\n" + " " " " " " %lu sec + %lu nsec\n", + rate, bufsz, interval, + (unsigned long)its.it_value.tv_sec, + (unsigned long)its.it_value.tv_nsec); + return; + } + + data = malloc(bufsz); + if (!data) + error_500("malloc"); + memset(data, 0, bufsz); + + fflush(stdout); + setvbuf(stdout, 0, _IONBF, 0); + fputs("Content-Type: application/octet-stream\n" + "Cache-Control: no-store,no-cache\n\n", stdout); + + sigemptyset(&mask); + sigaddset(&mask, SIGRTMIN); + if (sigprocmask(SIG_SETMASK, &mask, 0)) + error_500("sigprocmask"); + + memset(&sev, 0, sizeof sev); + sev.sigev_notify = SIGEV_SIGNAL; + sev.sigev_signo = SIGRTMIN; + sev.sigev_value.sival_ptr = &timerid; + if (timer_create(CLOCK_MONOTONIC, &sev, &timerid)) + error_500("timer_create"); + + if (timer_settime(timerid, 0, &its, 0)) + error_500("timer_settime"); + + do { + size_t r, n = bufsz; + char *p = data; + do { + r = fwrite(p, 1, n, stdout); + if (r == 0) + exit(1); + n -= r; + p += r; + } while (n > 0); + } while (sigwait(&mask, &sig) == 0); +} + +int +main(void) +{ + unsigned long rate; + char *endp; + bool dryrun; + char *request_method = getenv("REQUEST_METHOD"); + char *query_string = getenv("QUERY_STRING"); + char *path_info = getenv("PATH_INFO"); + + if (!request_method || strcmp(request_method, "GET")) + error_400("method not supported"); + if (query_string && strcmp(query_string, "")) + error_400("no query parameters accepted"); + + if (!path_info || path_info[0] != '/') + error_400("malformed or missing PATH_INFO"); + + rate = strtoul(path_info+1, &endp, 10); + if (endp == path_info+1) + error_400("missing rate (specify bytes per second)"); + + if (endp[0] == '\0') + dryrun = false; + else if (endp[0] == ';' && endp[1] == 'd' && endp[2] == '\0') + dryrun = true; + else + error_400("unrecognized extra arguments"); + + generate(rate, dryrun); + return 0; +} diff --git 
a/scripts/bm-genfiles.py b/scripts/bm-genfiles.py new file mode 100755 index 0000000..dcd1030 --- /dev/null +++ b/scripts/bm-genfiles.py @@ -0,0 +1,162 @@ +#! /usr/bin/python + +"""Generate files for network performance testing. + +The default behavior is to generate 10,000 files all of which are +exactly 3584 bytes long, because that is approximately how big +Flickr's 75x75px JPEG thumbnails are. You can request a different +size, or you can request that the file sizes instead follow a bounded +Pareto distribution with tunable alpha. + +The files have names compatible with httperf's --wset mode. Since +it insists on .html as a file suffix, the files are syntactically +valid HTML. Their contents are word salad. + +There is one mandatory command line argument: the path to the root +of the tree of files to generate. It is created if it doesn't +already exist. If it already exists, its contents will be erased! +(so don't use '.')""" + +from __future__ import division + +import argparse +import errno +import math +import os +import os.path +import random +import shutil +import sys +import textwrap + +def ensure_empty_dir(dpath): + todelete = [] + try: + todelete = os.listdir(dpath) + except OSError, e: + # Don't delete a _file_ that's in the way. + # Don't try to create parent directories that are missing. + if e.errno != errno.ENOENT: + raise + os.mkdir(dpath) + return + for f in todelete: + p = os.path.join(dpath, f) + try: + os.remove(p) + except OSError, e: + if e.errno != errno.EISDIR and e.errno != errno.EPERM: + raise + shutil.rmtree(p) + +def ensure_parent_directories(path): + try: + os.makedirs(os.path.dirname(path)) + except OSError, e: + if e.errno != errno.EEXIST: + raise + +def word_salad(f, words, seed, maxlen): + rng = random.Random(seed) + salad = [] + slen = 0 + while slen < maxlen - 1: + nl = rng.randint(1, min((maxlen - 1) - slen, len(words))) - 1 + w = rng.choice(words[nl]) + salad.append(w) + slen += len(w) + 1 + salad = textwrap.fill(" ".join(salad), 78) + while len(salad) < maxlen-1: + salad += '.' 
+ salad += '\n' + f.write(salad) + +def load_words(): + words = [ [] for _ in xrange(15) ] + for w in open('/usr/share/dict/words'): + w = w.strip() + if w.endswith("'s"): continue + if len(w) > 15 or len(w) < 2: continue + words[len(w)-1].append(w) + # special case words[0] as dictfiles often have every single single letter + words[0].extend(('a','I')) + return words + +FILE_PREFIX = '<!doctype html>\n<title>{0}</title>\n<p>\n' +FILE_SUFFIX = '</p>\n' + +def create_one(parent, ctr, digits, words, filesize, seed, resume, progress): + label = format(ctr, '0'+str(digits)+'d') + fname = os.path.join(parent, *label) + '.html' + ensure_parent_directories(fname) + + if os.path.exists(fname): + if not resume: raise RuntimeError('{0} already exists'.format(fname)) + return + + prefix = FILE_PREFIX.format(label) + suffix = FILE_SUFFIX + limit = filesize - (len(prefix) + len(suffix)) + if limit <= 0: + raise TypeError("{0} bytes is too small to generate (minimum {1})" + .format(filesize, len(prefix)+len(suffix))) + + if progress: + sys.stderr.write(fname + '\n') + + f = open(fname, "w") + f.write(prefix) + word_salad(f, words, ctr+seed, limit) + f.write(suffix) + +def bounded_pareto(rng, alpha, L, H): + while True: + U = rng.random() + if U < 1: break + Ha = H**alpha + La = L**alpha + return int(round((-(U*Ha - U*La - Ha)/(Ha * La)) ** (-1/alpha))) + +if __name__ == '__main__': + + default_filesize = 3584 + default_filecount = 10000 # 0/0/0/0.html through 9/9/9/9.html + + parser = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('directory', + help='directory to populate with files') + parser.add_argument('-c', '--count', type=int, default=default_filecount, + help='number of files to generate') + sg = parser.add_mutually_exclusive_group() + sg.add_argument('-s', '--size', type=int, default=default_filesize, + help='all files will be exactly SIZE bytes long') + sg.add_argument('-p', '--pareto', type=float, + metavar='ALPHA', + help='file sizes will follow a bounded Pareto distribution' + ' with parameter ALPHA') + parser.add_argument('-m', '--minsize', type=int, default=512, + help='minimum file size (only useful with -p)') + parser.add_argument('-M', '--maxsize', type=int, default=2*1024*1024, + help='maximum file size (only useful with -p)') + parser.add_argument('-S', '--seed', type=int, default=719, + help='seed for random number generator') + parser.add_argument('--resume', action='store_true', + help='resume an interrupted run where it left off') + parser.add_argument('--progress', action='store_true', + help='report progress') + + args = parser.parse_args() + digits = len(str(args.count - 1)) + rng = random.Random(args.seed) + + words = load_words() + if not args.resume: + ensure_empty_dir(args.directory) + + size = args.size + for i in xrange(args.count): + if args.pareto is not None: + size = bounded_pareto(rng, args.pareto, args.minsize, args.maxsize) + create_one(args.directory, i, digits, words, size, args.seed, + args.resume, args.progress) diff --git a/scripts/bm-mcurl.c b/scripts/bm-mcurl.c new file mode 100644 index 0000000..ac24f3a --- /dev/null +++ b/scripts/bm-mcurl.c @@ -0,0 +1,196 @@ +/* Use libcurl to retrieve many URLs, according to a wildcard pattern, + starting new connections at a constant rate until we hit a limit. + + Command line arguments -- all are required, but 'proxy' may be an + empty string if you want direct connections: + + bm-mcurl [-v] rate limit proxy url-pattern [url-pattern ...] 
+ + There is no output; it is assumed that you are monitoring traffic + externally. Passing -v turns on CURLOPT_VERBOSE debugging spew. + */ + +#define _XOPEN_SOURCE 600 + +#include <stdbool.h> +#include <stddef.h> + +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#include <curl/curl.h> +#include "tool_urlglob.h" + +#define NORETURN __attribute__((noreturn)) + +static bool verbose = false; + +static size_t +discard_data(char *ptr, size_t size, size_t nmemb, void *userdata) +{ + return size * nmemb; +} + +static size_t +read_abort(void *ptr, size_t size, size_t nmemb, void *userdata) +{ + /* we don't do anything that should require this to be called, + so if it does get called, something is wrong */ + return CURL_READFUNC_ABORT; +} + +static CURL * +setup_curl_easy_handle(char *proxy) +{ + CURL *h = curl_easy_init(); + if (!h) abort(); + +#define SET_OR_CRASH(h, opt, param) \ + do { if (curl_easy_setopt(h, opt, param)) abort(); } while (0) + + SET_OR_CRASH(h, CURLOPT_VERBOSE, (unsigned long)verbose); + SET_OR_CRASH(h, CURLOPT_NOPROGRESS, 1L); + SET_OR_CRASH(h, CURLOPT_FAILONERROR, 1L); + SET_OR_CRASH(h, CURLOPT_USERAGENT, "bm-mcurl/0.1"); + SET_OR_CRASH(h, CURLOPT_ACCEPT_ENCODING, ""); + SET_OR_CRASH(h, CURLOPT_AUTOREFERER, 1L); + SET_OR_CRASH(h, CURLOPT_FOLLOWLOCATION, 1L); + SET_OR_CRASH(h, CURLOPT_MAXREDIRS, 30L); + + SET_OR_CRASH(h, CURLOPT_WRITEFUNCTION, discard_data); + SET_OR_CRASH(h, CURLOPT_WRITEDATA, NULL); + SET_OR_CRASH(h, CURLOPT_READFUNCTION, read_abort); + SET_OR_CRASH(h, CURLOPT_READDATA, NULL); + + if (proxy && proxy[0]) { + SET_OR_CRASH(h, CURLOPT_PROXY, proxy); + SET_OR_CRASH(h, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME); + } +#undef SET_OR_CRASH +} + +static bool +process_events_once(CURLM *multi, unsigned long timeout_max) +{ + struct timeval tv; + int rc; /* select() return code */ + + fd_set fdread; + fd_set fdwrite; + fd_set fdexcept; + int maxfd = -1; + + unsigned long timeout = 1000000; /* one second - ultimate default */ + long curl_tout_ms = -1; + + /* get fd sets for all pending transfers */ + FD_ZERO(&fdread); + FD_ZERO(&fdwrite); + FD_ZERO(&fdexcept); + curl_multi_fdset(multi_handle, &fdread, &fdwrite, &fdexcept, &maxfd); + + /* timeout */ + if (timeout_max > 0 && timeout_max < timeout) + timeout = timeout_max; + + curl_multi_timeout(multi_handle, &curl_tout_ms); + + if (curl_tout_ms >= 0) { + unsigned long curl_tout_us = ((unsigned long)curl_tout_ms) * 1000; + if (timeout > curl_tout_us) + timeout = curl_tout_us; + } + + tv.tv_sec = timeout / 1000000; + if(tv.tv_sec >= 1) + tv.tv_sec = 1; + else + tv.tv_usec = timeout % 1000000; + + do { + rc = select(maxfd+1, &fdread, &fdwrite, &fdexcept, &tv); + } while (rc == -1 && errno == EINTR); + + if (rc > 0) { + int still_running; + curl_multi_perform(multi_handle, &still_running); + return !!still_running; + } else + abort(); +} + +/* Note: this function must not return until we are ready to start + another connection. 
*/ +static void +queue_one(CURLM *multi, unsigned long rate, unsigned long limit, + char *proxy, char *url) +{ + +} + +static void +run(unsigned long rate, unsigned long limit, char *proxy, char **urls) +{ + CURLM *multi; + curl_global_init(); + multi = curl_multi_init(); + if (!multi) abort(); + + for (char **upat = urls; *upat; url++) { + URLGlob *uglob; + int *n; + if (glob_url(&uglob, *upat, &n, stderr)) + continue; + do { + char *url; + if (glob_next_url(&url, uglob)) abort(); + queue_one(multi, rate, limit, proxy, url); /* takes ownership */ + } while (--n); + glob_cleanup(uglob); + } + + /* spin the event loop until all outstanding transfers complete */ + while (process_events_once(multi, 0)); + + curl_multi_cleanup(multi); +} + +static NORETURN +usage(const char *av0, const char *complaint) +{ + fprintf(stderr, + "%s\nusage: %s [-v] rate limit proxy url [url...]\n", + complaint, av0); + exit(2); +} + +int +main(int argc, char **argv) +{ + unsigned long rate; + unsigned long limit; + char *endp; + + if (argv[1] && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--verbose"))) { + verbose = true; + argc--; + argv[1] = argv[0]; + argv++; + } + + if (argc < 5) + usage("not enough arguments"); + + rate = strtoul(argv[1], &endp, 10); + if (endp == argv[1] || *endp) + usage("rate must be a positive integer (connections per second)"); + + limit = strtoul(argv[2], &endp, 10); + if (endp == argv[2] || *endp) + usage("limit must be a positive integer (max outstanding requests)"); + + run(rate, limit, argv[3], argv+4); + return 0; +} diff --git a/scripts/tool_urlglob.c b/scripts/tool_urlglob.c new file mode 100644 index 0000000..d714971 --- /dev/null +++ b/scripts/tool_urlglob.c @@ -0,0 +1,516 @@ +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ | | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * ___|___/|_| ______| + * + * Copyright (C) 1998 - 2011, Daniel Stenberg, daniel@haxx.se, et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <curl/curl.h> + +#include "tool_urlglob.h" + +typedef enum { + GLOB_OK, + GLOB_NO_MEM, + GLOB_ERROR +} GlobCode; + +/* + * glob_word() + * + * Input a full globbed string, set the forth argument to the amount of + * strings we get out of this. Return GlobCode. + */ +static GlobCode glob_word(URLGlob *, /* object anchor */ + char *, /* globbed string */ + size_t, /* position */ + int *); /* returned number of strings */ + +static GlobCode glob_set(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a set expression with the point behind the opening '{' + ','-separated elements are collected until the next closing '}' + */ + URLPattern *pat; + GlobCode res; + bool done = false; + char* buf = glob->glob_buffer; + + pat = &glob->pattern[glob->size / 2]; + /* patterns 0,1,2,... correspond to size=1,3,5,... 
*/ + pat->type = UPTSet; + pat->content.Set.size = 0; + pat->content.Set.ptr_s = 0; + pat->content.Set.elements = NULL; + + ++glob->size; + + while(!done) { + switch (*pattern) { + case '\0': /* URL ended while set was still open */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "unmatched brace at pos %zu\n", pos); + return GLOB_ERROR; + + case '{': + case '[': /* no nested expressions at this time */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "nested braces not supported at pos %zu\n", pos); + return GLOB_ERROR; + + case ',': + case '}': /* set element completed */ + *buf = '\0'; + if(pat->content.Set.elements) { + char **new_arr = realloc(pat->content.Set.elements, + (pat->content.Set.size + 1) * sizeof(char*)); + if(!new_arr) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + } + pat->content.Set.elements = new_arr; + } + else + pat->content.Set.elements = malloc(sizeof(char*)); + if(!pat->content.Set.elements) { + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + pat->content.Set.elements[pat->content.Set.size] = + strdup(glob->glob_buffer); + if(!pat->content.Set.elements[pat->content.Set.size]) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + ++pat->content.Set.size; + + if(*pattern == '}') { + /* entire set pattern completed */ + int wordamount; + + /* always check for a literal (may be "") between patterns */ + res = glob_word(glob, ++pattern, ++pos, &wordamount); + if(res) { + short elem; + for(elem = 0; elem < pat->content.Set.size; elem++) + Curl_safefree(pat->content.Set.elements[elem]); + Curl_safefree(pat->content.Set.elements); + pat->content.Set.ptr_s = 0; + pat->content.Set.size = 0; + return res; + } + + *amount = pat->content.Set.size * wordamount; + + done = true; + continue; + } + + buf = glob->glob_buffer; + ++pattern; + ++pos; + break; + + case ']': /* illegal closing bracket */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "illegal pattern at pos %zu\n", pos); + return GLOB_ERROR; + + case '\': /* escaped character, skip '' */ + if(pattern[1]) { + ++pattern; + ++pos; + } + /* intentional fallthrough */ + default: + *buf++ = *pattern++; /* copy character to set element */ + ++pos; + } + } + return GLOB_OK; +} + +static GlobCode glob_range(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a range expression with the point behind the opening '[' + - char range: e.g. "a-z]", "B-Q]" + - num range: e.g. "0-9]", "17-2000]" + - num range with leading zeros: e.g. "001-999]" + expression is checked for well-formedness and collected until the next ']' + */ + URLPattern *pat; + char *c; + char sep; + char sep2; + int step; + int rc; + GlobCode res; + int wordamount = 1; + + pat = &glob->pattern[glob->size / 2]; + /* patterns 0,1,2,... correspond to size=1,3,5,... 
*/ + ++glob->size; + + if(ISALPHA(*pattern)) { + /* character range detected */ + char min_c; + char max_c; + + pat->type = UPTCharRange; + + rc = sscanf(pattern, "%c-%c%c%d%c", &min_c, &max_c, &sep, &step, &sep2); + + if((rc < 3) || (min_c >= max_c) || ((max_c - min_c) > ('z' - 'a'))) { + /* the pattern is not well-formed */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: bad range specification after pos %zu\n", pos); + return GLOB_ERROR; + } + + /* check the (first) separating character */ + if((sep != ']') && (sep != ':')) { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: unsupported character (%c) after range at pos %zu\n", + sep, pos); + return GLOB_ERROR; + } + + /* if there was a ":[num]" thing, use that as step or else use 1 */ + pat->content.CharRange.step = + ((sep == ':') && (rc == 5) && (sep2 == ']')) ? step : 1; + + pat->content.CharRange.ptr_c = pat->content.CharRange.min_c = min_c; + pat->content.CharRange.max_c = max_c; + } + else if(ISDIGIT(*pattern)) { + /* numeric range detected */ + int min_n; + int max_n; + + pat->type = UPTNumRange; + pat->content.NumRange.padlength = 0; + + rc = sscanf(pattern, "%d-%d%c%d%c", &min_n, &max_n, &sep, &step, &sep2); + + if((rc < 2) || (min_n > max_n)) { + /* the pattern is not well-formed */ + snprintf(glob->errormsg, sizeof(glob->errormsg), + "error: bad range specification after pos %zu\n", pos); + return GLOB_ERROR; + } + pat->content.NumRange.ptr_n = pat->content.NumRange.min_n = min_n; + pat->content.NumRange.max_n = max_n; + + /* if there was a ":[num]" thing, use that as step or else use 1 */ + pat->content.NumRange.step = + ((sep == ':') && (rc == 5) && (sep2 == ']')) ? step : 1; + + if(*pattern == '0') { + /* leading zero specified */ + c = pattern; + while(ISDIGIT(*c)) { + c++; + ++pat->content.NumRange.padlength; /* padding length is set for all + instances of this pattern */ + } + } + } + else { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "illegal character in range specification at pos %zu\n", pos); + return GLOB_ERROR; + } + + c = (char*)strchr(pattern, ']'); /* continue after next ']' */ + if(c) + c++; + else { + snprintf(glob->errormsg, sizeof(glob->errormsg), "missing ']'"); + return GLOB_ERROR; /* missing ']' */ + } + + /* always check for a literal (may be "") between patterns */ + + res = glob_word(glob, c, pos + (c - pattern), &wordamount); + if(res == GLOB_ERROR) { + wordamount = 1; + res = GLOB_OK; + } + + if(!res) { + if(pat->type == UPTCharRange) + *amount = wordamount * (pat->content.CharRange.max_c - + pat->content.CharRange.min_c + 1); + else + *amount = wordamount * (pat->content.NumRange.max_n - + pat->content.NumRange.min_n + 1); + } + + return res; /* GLOB_OK or GLOB_NO_MEM */ +} + +static GlobCode glob_word(URLGlob *glob, char *pattern, + size_t pos, int *amount) +{ + /* processes a literal string component of a URL + special characters '{' and '[' branch to set/range processing functions + */ + char* buf = glob->glob_buffer; + size_t litindex; + GlobCode res = GLOB_OK; + + *amount = 1; /* default is one single string */ + + while(*pattern != '\0' && *pattern != '{' && *pattern != '[') { + if(*pattern == '}' || *pattern == ']') { + snprintf(glob->errormsg, sizeof(glob->errormsg), + "unmatched close brace/bracket at pos %zu\n", pos); + return GLOB_ERROR; + } + + /* only allow \ to escape known "special letters" */ + if(*pattern == '\' && + (*(pattern+1) == '{' || *(pattern+1) == '[' || + *(pattern+1) == '}' || *(pattern+1) == ']') ) { + + /* escape character, skip '' */ 
+ ++pattern; + ++pos; + } + *buf++ = *pattern++; /* copy character to literal */ + ++pos; + } + *buf = '\0'; + litindex = glob->size / 2; + /* literals 0,1,2,... correspond to size=0,2,4,... */ + glob->literal[litindex] = strdup(glob->glob_buffer); + if(!glob->literal[litindex]) { + snprintf(glob->errormsg, sizeof(glob->errormsg), "out of memory\n"); + return GLOB_NO_MEM; + } + ++glob->size; + + switch (*pattern) { + case '\0': + /* singular URL processed */ + break; + + case '{': + /* process set pattern */ + res = glob_set(glob, ++pattern, ++pos, amount); + break; + + case '[': + /* process range pattern */ + res = glob_range(glob, ++pattern, ++pos, amount); + break; + } + + if(res) + Curl_safefree(glob->literal[litindex]); + + return res; +} + +int glob_url(URLGlob** glob, char* url, int *urlnum, FILE *error) +{ + /* + * We can deal with any-size, just make a buffer with the same length + * as the specified URL! + */ + URLGlob *glob_expand; + int amount; + char *glob_buffer; + GlobCode res; + + *glob = NULL; + + glob_buffer = malloc(strlen(url) + 1); + if(!glob_buffer) + return CURLE_OUT_OF_MEMORY; + + glob_expand = calloc(1, sizeof(URLGlob)); + if(!glob_expand) { + Curl_safefree(glob_buffer); + return CURLE_OUT_OF_MEMORY; + } + glob_expand->size = 0; + glob_expand->urllen = strlen(url); + glob_expand->glob_buffer = glob_buffer; + glob_expand->beenhere = 0; + + res = glob_word(glob_expand, url, 1, &amount); + if(!res) + *urlnum = amount; + else { + if(error && glob_expand->errormsg[0]) { + /* send error description to the error-stream */ + fprintf(error, "curl: (%d) [globbing] %s", + (res == GLOB_NO_MEM) ? CURLE_OUT_OF_MEMORY : CURLE_URL_MALFORMAT, + glob_expand->errormsg); + } + /* it failed, we cleanup */ + Curl_safefree(glob_buffer); + Curl_safefree(glob_expand); + *urlnum = 1; + return (res == GLOB_NO_MEM) ? 
CURLE_OUT_OF_MEMORY : CURLE_URL_MALFORMAT; + } + + *glob = glob_expand; + return CURLE_OK; +} + +void glob_cleanup(URLGlob* glob) +{ + size_t i; + int elem; + + for(i = glob->size - 1; i < glob->size; --i) { + if(!(i & 1)) { /* even indexes contain literals */ + Curl_safefree(glob->literal[i/2]); + } + else { /* odd indexes contain sets or ranges */ + if((glob->pattern[i/2].type == UPTSet) && + (glob->pattern[i/2].content.Set.elements)) { + for(elem = glob->pattern[i/2].content.Set.size - 1; + elem >= 0; + --elem) { + Curl_safefree(glob->pattern[i/2].content.Set.elements[elem]); + } + Curl_safefree(glob->pattern[i/2].content.Set.elements); + } + } + } + Curl_safefree(glob->glob_buffer); + Curl_safefree(glob); +} + +int glob_next_url(char **globbed, URLGlob *glob) +{ + URLPattern *pat; + char *lit; + size_t i; + size_t j; + size_t len; + size_t buflen = glob->urllen + 1; + char *buf = glob->glob_buffer; + + *globbed = NULL; + + if(!glob->beenhere) + glob->beenhere = 1; + else { + bool carry = true; + + /* implement a counter over the index ranges of all patterns, + starting with the rightmost pattern */ + for(i = glob->size / 2 - 1; carry && (i < glob->size); --i) { + carry = false; + pat = &glob->pattern[i]; + switch (pat->type) { + case UPTSet: + if((pat->content.Set.elements) && + (++pat->content.Set.ptr_s == pat->content.Set.size)) { + pat->content.Set.ptr_s = 0; + carry = true; + } + break; + case UPTCharRange: + pat->content.CharRange.ptr_c = (char)(pat->content.CharRange.step + + (int)((unsigned char)pat->content.CharRange.ptr_c)); + if(pat->content.CharRange.ptr_c > pat->content.CharRange.max_c) { + pat->content.CharRange.ptr_c = pat->content.CharRange.min_c; + carry = true; + } + break; + case UPTNumRange: + pat->content.NumRange.ptr_n += pat->content.NumRange.step; + if(pat->content.NumRange.ptr_n > pat->content.NumRange.max_n) { + pat->content.NumRange.ptr_n = pat->content.NumRange.min_n; + carry = true; + } + break; + default: + printf("internal error: invalid pattern type (%d)\n", (int)pat->type); + return CURLE_FAILED_INIT; + } + } + if(carry) { /* first pattern ptr has run into overflow, done! */ + /* TODO: verify if this should actally return CURLE_OK. 
*/ + return CURLE_OK; /* CURLE_OK to match previous behavior */ + } + } + + for(j = 0; j < glob->size; ++j) { + if(!(j&1)) { /* every other term (j even) is a literal */ + lit = glob->literal[j/2]; + len = snprintf(buf, buflen, "%s", lit); + buf += len; + buflen -= len; + } + else { /* the rest (i odd) are patterns */ + pat = &glob->pattern[j/2]; + switch(pat->type) { + case UPTSet: + if(pat->content.Set.elements) { + len = strlen(pat->content.Set.elements[pat->content.Set.ptr_s]); + snprintf(buf, buflen, "%s", + pat->content.Set.elements[pat->content.Set.ptr_s]); + buf += len; + buflen -= len; + } + break; + case UPTCharRange: + *buf++ = pat->content.CharRange.ptr_c; + break; + case UPTNumRange: + len = snprintf(buf, buflen, "%0*d", + pat->content.NumRange.padlength, + pat->content.NumRange.ptr_n); + buf += len; + buflen -= len; + break; + default: + printf("internal error: invalid pattern type (%d)\n", (int)pat->type); + return CURLE_FAILED_INIT; + } + } + } + *buf = '\0'; + + *globbed = strdup(glob->glob_buffer); + if(!*globbed) + return CURLE_OUT_OF_MEMORY; + + return CURLE_OK; +} diff --git a/scripts/tool_urlglob.h b/scripts/tool_urlglob.h new file mode 100644 index 0000000..562b08e --- /dev/null +++ b/scripts/tool_urlglob.h @@ -0,0 +1,69 @@ +#ifndef HEADER_CURL_TOOL_URLGLOB_H +#define HEADER_CURL_TOOL_URLGLOB_H +/*************************************************************************** + * _ _ ____ _ + * Project ___| | | | _ | | + * / __| | | | |_) | | + * | (__| |_| | _ <| |___ + * ___|___/|_| ______| + * + * Copyright (C) 1998 - 2011, Daniel Stenberg, daniel@haxx.se, et al. + * + * This software is licensed as described in the file COPYING, which + * you should have received as part of this distribution. The terms + * are also available at http://curl.haxx.se/docs/copyright.html. + * + * You may opt to use, copy, modify, merge, publish, distribute and/or sell + * copies of the Software, and permit persons to whom the Software is + * furnished to do so, under the terms of the COPYING file. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ***************************************************************************/ + +typedef enum { + UPTSet = 1, + UPTCharRange, + UPTNumRange +} URLPatternType; + +typedef struct { + URLPatternType type; + union { + struct { + char **elements; + short size; + short ptr_s; + } Set; + struct { + char min_c; + char max_c; + char ptr_c; + int step; + } CharRange; + struct { + int min_n; + int max_n; + short padlength; + int ptr_n; + int step; + } NumRange ; + } content; +} URLPattern; + +typedef struct { + char *literal[10]; + URLPattern pattern[9]; + size_t size; + size_t urllen; + char *glob_buffer; + char beenhere; + char errormsg[80]; /* error message buffer */ +} URLGlob; + +int glob_url(URLGlob**, char*, int *, FILE *); +int glob_next_url(char **, URLGlob *); +void glob_cleanup(URLGlob* glob); + +#endif /* HEADER_CURL_TOOL_URLGLOB_H */
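For context on how bm-genfiles.py draws its file sizes when the -p/--pareto option is used: the patch's bounded_pareto helper is an inverse-CDF sampler for the bounded Pareto distribution. Below is a minimal, hedged standalone sketch of the same transform, not part of the patch; the function name bounded_pareto_sample and the alpha value 1.2 are illustrative choices, while the 512-byte minimum, 2 MiB maximum, and seed 719 mirror the script's documented defaults.

```python
import random

def bounded_pareto_sample(rng, alpha, low, high):
    """Draw one value from a bounded Pareto(alpha) distribution on [low, high]
    by inverting its CDF, following the same formula as bm-genfiles.py."""
    u = rng.random()                # uniform in [0, 1)
    ha = high ** alpha
    la = low ** alpha
    # inverse CDF of the bounded Pareto distribution:
    # x = ( -(u*H^a - u*L^a - H^a) / (H^a * L^a) )^(-1/a)
    x = (-(u * ha - u * la - ha) / (ha * la)) ** (-1.0 / alpha)
    return int(round(x))

if __name__ == "__main__":
    rng = random.Random(719)        # same default seed as the script
    # five sample file sizes between 512 bytes and 2 MiB, alpha chosen for illustration
    sizes = [bounded_pareto_sample(rng, 1.2, 512, 2 * 1024 * 1024) for _ in range(5)]
    print(sizes)
```

With u = 0 the transform returns low and as u approaches 1 it approaches high, so every generated size stays within the -m/-M bounds the script exposes.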