Merge pull request #8 from tilezen/zerebubuth/tile-percentiles
Tool to calculate tile size percentiles
zerebubuth authored Oct 5, 2018
2 parents f026ead + ebff8d6 commit 31daaae
Showing 3 changed files with 330 additions and 0 deletions.
30 changes: 30 additions & 0 deletions README.md
@@ -4,6 +4,7 @@ Current scoville commands:

* `info`: Prints size and item count information about an MVT tile.
* `proxy`: Serves a treemap visualisation of tiles on a local HTTP server.
* `percentiles`: Calculate the percentile tile sizes for a set of MVT tiles.

### Info command ###

@@ -66,6 +67,35 @@ This will run a server on [localhost:8000](http://localhost:8000) by default (us

![Screenshot of the proxy server](doc/proxy_screenshot.png)

### Percentiles command ###

Downloads a set of tiles and calculates percentile tile sizes, both the total for each tile and per-layer within the tile. This can be useful for measuring changes in tile size across different versions of tiles, and for indicating which layers contribute the most to outlier sizes.

For example:

```
echo "0/0/0" > tiles.txt
scoville percentiles --cache tiles.txt https://tile.nextzen.org/tilezen/vector/v1/512/all/{z}/{x}/{y}.mvt?api_key=YOUR_API_KEY
```

This will output something like:

```
               TOTAL      p50      p90      p99    p99.9
          boundaries    68731    68731    68731    68731
           buildings       19       19       19       19
               earth   110378   110378   110378   110378
             landuse       17       17       17       17
              places   471086   471086   471086   471086
                pois       14       14       14       14
               roads       15       15       15       15
             transit       17       17       17       17
               water   404996   404996   404996   404996
              ~total  1055273  1055273  1055273  1055273
```
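
The same numbers can be written as CSV for further processing: `-f csv` selects CSV output, and `-p` can be given (multiple times) to choose which percentiles to calculate. For example:

```
scoville percentiles -f csv -p 50 -p 99.9 tiles.txt https://tile.nextzen.org/tilezen/vector/v1/512/all/{z}/{x}/{y}.mvt?api_key=YOUR_API_KEY
```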

Note that the `~total` entry is **not** the sum of the column above it; it's the percentile of the total tile size. In other words, if we had three tiles, each with three layers, where each tile had a single, different layer taking up 1000 bytes and the other two layers taking up 10 bytes each, then each tile would be 1020 bytes, and that would be the p50 `~total`. However, the p50 of each individual layer would be only 10 bytes.
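
To make that concrete, here's a minimal sketch in plain Python (the sizes are made up, not real tiles) showing how the per-layer p50 and the `~total` p50 diverge in the three-tile scenario above:

```
# three hypothetical tiles, each with one large layer and two small ones
tiles = [
    {'water': 1000, 'roads': 10, 'pois': 10},
    {'water': 10, 'roads': 1000, 'pois': 10},
    {'water': 10, 'roads': 10, 'pois': 1000},
]

# per-layer p50: the median of each layer's sizes across all tiles
for layer in ('water', 'roads', 'pois'):
    sizes = sorted(t[layer] for t in tiles)
    print('%s %d' % (layer, sizes[len(sizes) // 2]))   # 10 for every layer

# ~total p50: the median of the whole-tile sizes
totals = sorted(sum(t.values()) for t in tiles)
print('~total %d' % totals[len(totals) // 2])          # 1020
```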

## Install on Ubuntu:

99 changes: 99 additions & 0 deletions scoville/command.py
@@ -139,5 +139,104 @@ def proxy(url, port):
serve_http(url, port)


def read_urls(file_name, url_pattern):
    """
    Read tile coordinates, one per line as 'z/x/y', from file_name and yield
    a URL for each by substituting into the {z}, {x} and {y} placeholders in
    url_pattern.
    """

    with open(file_name, 'r') as fh:
        for line in fh:
            zxy = line.split(' ', 1)[0]
            z, x, y = map(int, zxy.split('/', 2))

            u = url_pattern \
                .replace('{z}', str(z)) \
                .replace('{x}', str(x)) \
                .replace('{y}', str(y))

            yield u
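
# illustrative example (made-up URL pattern): given a tiles file containing
# the single line "0/0/0" and the pattern
# 'https://example.com/{z}/{x}/{y}.mvt', read_urls would yield
# 'https://example.com/0/0/0.mvt'.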


def _percentiles_output_text(percentiles, result):
    """
    Output results to the console as columns of text, using ANSI colours
    where available.
    """

    fmt = '%20s' + ' %8d' * len(percentiles)
    header = '%20s' % ('TOTAL',)
    for percentile in percentiles:
        pct_header = 'p%r' % (percentile,)
        header += ' %8s' % (pct_header,)
    click.secho(header, fg='green', bold=True)
    for name in sorted(result.keys()):
        # use a fresh name here, rather than reusing (and shadowing) the
        # percentiles parameter above.
        values = result[name]
        line = fmt % tuple([name] + values)
        click.secho(line, bold=name.startswith('~'))


def _percentiles_output_csv(percentiles, result):
    """
    Output results to the console in CSV format.
    """

    import csv
    from sys import stdout

    writer = csv.writer(stdout)

    headers = ['Layer']
    for percentile in percentiles:
        headers.append('p%r' % (percentile,))
    writer.writerow(headers)

    for name in sorted(result.keys()):
        line = [name]
        for pct in result[name]:
            line.append(str(pct))
        writer.writerow(line)


@cli.command()
@click.argument('tiles_file', required=1)
@click.argument('url', required=1)
@click.option('--percentiles', '-p', multiple=True, type=float,
              help='Percentiles to display. Use decimal floats, e.g: 99.9, '
              'not 99_9. Can be used multiple times.')
@click.option('--cache/--no-cache', default=False, help='Use a cache for '
              'tiles. Can speed up multiple runs considerably.')
@click.option('--nprocs', '-j', default=1, type=int, help='Number of '
              'processes to use to download and do tile size aggregation.')
@click.option('--output-format', '-f', type=click.Choice(['text', 'csv']),
              default='text', help='Format to use when writing results to '
              'the console.')
def percentiles(tiles_file, url, percentiles, cache, nprocs, output_format):
    """
    Download a bunch of tiles and display the percentiles of their sizes,
    broken down by layer.

    The tiles to download should be listed in TILES_FILE, one per line as
    'z/x/y'. The URL to fetch them from should contain {z}, {x} and {y}
    replacements.
    """

    from scoville.percentiles import calculate_percentiles

    if not percentiles:
        percentiles = [50, 90, 99, 99.9]

    tiles = read_urls(tiles_file, url)
    result = calculate_percentiles(tiles, percentiles, cache, nprocs)

    if output_format == 'text':
        _percentiles_output_text(percentiles, result)

    elif output_format == 'csv':
        _percentiles_output_csv(percentiles, result)

    else:
        raise ValueError('Unknown output format %r' % (output_format,))


def scoville_main():
    cli()


if __name__ == '__main__':
    scoville_main()
201 changes: 201 additions & 0 deletions scoville/percentiles.py
@@ -0,0 +1,201 @@
import requests
from collections import defaultdict
from scoville.mvt import Tile


def _fetch_http(url):
    """
    Fetch a tile over HTTP.
    """

    res = requests.get(url)

    # TODO: retry? better error handling!
    if res.status_code != requests.codes.ok:
        raise IOError("Got tile response %d" % (res.status_code,))

    return res.content


def _fetch_cache(url):
    """
    If a tile is present on disk, then use it. Otherwise fetch over HTTP.
    """

    from base64 import urlsafe_b64encode
    from os.path import join, isfile, isdir
    from os import makedirs
    from hashlib import sha224

    # we use the non-query part to store on disk. (tile won't depend on API
    # key, right?) partly because the API key can be very long and overflow
    # the max 255 chars for a filename when base64 encoded.
    no_query = url.split('?', 1)[0]
    encoded = urlsafe_b64encode(no_query)
    assert len(encoded) < 256

    # we use a 2-level hash-based fanout to avoid having so many inodes in
    # a directory that file lookup slows to a crawl.
    hashed = sha224(no_query).hexdigest()
    dir_name = join('.cache', hashed[0:3], hashed[3:6])
    file_name = join(dir_name, encoded)

    data = None
    if isfile(file_name):
        with open(file_name, 'r') as fh:
            data = fh.read()

    else:
        data = _fetch_http(url)
        if not isdir(dir_name):
            makedirs(dir_name)
        with open(file_name, 'w') as fh:
            fh.write(data)

    return data
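
# illustrative example (hash and URL made up): a tile fetched from
# 'https://example.com/0/0/0.mvt?api_key=SECRET' would be cached at a path
# like .cache/AAA/BBB/CCCC, where AAABBB are the first six hex digits of
# sha224('https://example.com/0/0/0.mvt') and CCCC is the urlsafe base64
# encoding of that same query-less URL.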


class Aggregator(object):
    """
    Core of the algorithm. Fetches tiles and aggregates their total and
    per-layer sizes into a set of lists.
    """

    def __init__(self, cache=False):
        self.fetch_fn = _fetch_http
        if cache:
            self.fetch_fn = _fetch_cache

        self.results = defaultdict(list)

    def add(self, tile_url):
        data = self.fetch_fn(tile_url)
        self.results['~total'].append(len(data))

        tile = Tile(data)
        for layer in tile:
            self.results[layer.name].append(layer.size)
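        # at this point, self.results maps layer names (plus the pseudo-layer
        # '~total') to growing lists of sizes in bytes, e.g. (illustrative
        # values): {'~total': [1055273, ...], 'water': [404996, ...], ...}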


# special object to tell worker threads to exit
class Sentinel(object):
    pass


# encode a message to be sent over the "wire" from a worker to the parent
# process. we use msgpack encoding rather than pickle, as pickle was producing
# some very large messages.
def mp_encode(data):
    from msgpack import packb
    return packb(data)


def mp_decode(data):
    from msgpack import unpackb
    return unpackb(data)
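
# round-trip sketch: mp_decode(mp_encode({'water': [1, 2, 3]})) gives back an
# equivalent dict, which is how each worker ships its aggregated results to
# the parent process.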


def worker(input_queue, output_queue, cache):
    """
    Worker for multi-processing. Reads tasks from a queue and feeds them into
    the Aggregator. When all tasks are done it reads a Sentinel and sends the
    aggregated result back on the output queue.
    """

    agg = Aggregator(cache)

    while True:
        obj = input_queue.get()
        if isinstance(obj, Sentinel):
            break

        assert isinstance(obj, (str, unicode))
        agg.add(obj)
        input_queue.task_done()

    output_queue.put(mp_encode(agg.results))


def parallel(tile_urls, cache, nprocs):
    """
    Fetch percentile data in parallel, using nprocs processes.

    This uses two queues; one for input to the workers and one for output
    from the workers. A pool of workers of size nprocs is started, fed with
    jobs from tile_urls, and the results are aggregated at the end and
    returned.
    """

    from multiprocessing import Queue, JoinableQueue, Process

    input_queue = JoinableQueue(nprocs)
    output_queue = Queue(nprocs)

    workers = []
    for i in xrange(0, nprocs):
        w = Process(target=worker, args=(input_queue, output_queue, cache))
        w.start()
        workers.append(w)

    for tile_url in tile_urls:
        input_queue.put(tile_url)

    # join waits for all the tasks to be marked as done. this way we know
    # that enqueuing the Sentinel isn't going to "jump the queue" in front of
    # a task and mean we don't get the full result set back.
    input_queue.join()
    for i in xrange(0, nprocs):
        input_queue.put(Sentinel())

    # after we've queued the Sentinels, each worker should output an
    # aggregated result on the output queue.
    result = defaultdict(list)
    for i in xrange(0, nprocs):
        worker_result = mp_decode(output_queue.get())
        for k, v in worker_result.iteritems():
            result[k].extend(v)

    # and the workers should have exited, so we can clean up the processes.
    for w in workers:
        w.join()

    return result


def sequential(tile_urls, cache):
    agg = Aggregator(cache)
    for tile_url in tile_urls:
        agg.add(tile_url)
    return agg.results


def calculate_percentiles(tile_urls, percentiles, cache, nprocs):
    """
    Fetch tiles and calculate the percentile sizes in total and per-layer.

    Percentiles should be given as a list of decimal numbers between 0 and
    100, e.g: [50, 90, 99].

    Cache, if true, uses a local disk cache for the tiles. This can be very
    useful when re-running percentile calculations.

    Nprocs is the number of processes to use for both fetching and
    aggregation. Even on a system with a single CPU, it can be worth setting
    this to a larger number to make concurrent network requests for tiles.
    """

    if nprocs > 1:
        results = parallel(tile_urls, cache, nprocs)
    else:
        results = sequential(tile_urls, cache)

    pct = {}
    for label, values in results.iteritems():
        values.sort()
        pcts = []
        for p in percentiles:
            # nearest-rank percentile. clamp the index so that p=100 can't
            # run off the end of the list.
            i = min(int(len(values) * p / 100.0), len(values) - 1)
            pcts.append(values[i])

        pct[label] = pcts

    return pct
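
# illustrative usage (made-up URLs):
#
#   urls = ['https://example.com/0/0/0.mvt', 'https://example.com/1/0/0.mvt']
#   result = calculate_percentiles(urls, [50, 90], cache=False, nprocs=1)
#
# result then maps each layer name (plus '~total') to a list of [p50, p90]
# sizes in bytes.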
