Merge pull request #8 from tilezen/zerebubuth/tile-percentiles
Tool to calculate tile size percentiles
zerebubuth authored Oct 5, 2018
2 parents f026ead + ebff8d6 commit 31daaae
Showing 3 changed files with 330 additions and 0 deletions.
30 changes: 30 additions & 0 deletions README.md
@@ -4,6 +4,7 @@ Current scoville commands:

* `info`: Prints size and item count information about an MVT tile.
* `proxy`: Serves a treemap visualisation of tiles on a local HTTP server.
* `percentiles`: Calculate the percentile tile sizes for a set of MVT tiles.

### Info command ###

@@ -66,6 +67,35 @@ This will run a server on [localhost:8000](http://localhost:8000) by default (us

![Screenshot of the proxy server](doc/proxy_screenshot.png)

### Percentiles command ###

Downloads a set of tiles and calculates percentile tile sizes, both the total for each tile and per-layer within the tile. This can be useful for measuring changes in tile size across different versions of tiles, and for indicating which layers contribute the most to outlier sizes.

For example:

```
echo "0/0/0" > tiles.txt
scoville percentiles --cache tiles.txt https://tile.nextzen.org/tilezen/vector/v1/512/all/{z}/{x}/{y}.mvt?api_key=YOUR_API_KEY
```

This will output something like:

```
               TOTAL      p50      p90      p99    p99.9
          boundaries    68731    68731    68731    68731
           buildings       19       19       19       19
               earth   110378   110378   110378   110378
             landuse       17       17       17       17
              places   471086   471086   471086   471086
                pois       14       14       14       14
               roads       15       15       15       15
             transit       17       17       17       17
               water   404996   404996   404996   404996
              ~total  1055273  1055273  1055273  1055273
```
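
The same numbers can be written as CSV for further processing: `-f csv` selects CSV output, and `-p` can be given (multiple times) to choose which percentiles to calculate. For example:

```
scoville percentiles -f csv -p 50 -p 99.9 tiles.txt https://tile.nextzen.org/tilezen/vector/v1/512/all/{z}/{x}/{y}.mvt?api_key=YOUR_API_KEY
```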

Note that the `~total` entry is **not** the sum of the column above it; it's the percentile of the total tile size. In other words, if we had three tiles, each with three layers, where each tile had a single, different layer taking up 1000 bytes and the other two layers taking up 10 bytes each, then each tile would be 1020 bytes, and that would be the p50 `~total`. However, the p50 of each individual layer would be only 10 bytes.
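
To make that concrete, here's a minimal sketch in plain Python (the sizes are made up, not real tiles) showing how the per-layer p50 and the `~total` p50 diverge in the three-tile scenario above:

```
# three hypothetical tiles, each with one large layer and two small ones
tiles = [
    {'water': 1000, 'roads': 10, 'pois': 10},
    {'water': 10, 'roads': 1000, 'pois': 10},
    {'water': 10, 'roads': 10, 'pois': 1000},
]

# per-layer p50: the median of each layer's sizes across all tiles
for layer in ('water', 'roads', 'pois'):
    sizes = sorted(t[layer] for t in tiles)
    print('%s %d' % (layer, sizes[len(sizes) // 2]))   # 10 for every layer

# ~total p50: the median of the whole-tile sizes
totals = sorted(sum(t.values()) for t in tiles)
print('~total %d' % totals[len(totals) // 2])          # 1020
```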

## Install on Ubuntu:

99 changes: 99 additions & 0 deletions scoville/command.py
@@ -139,5 +139,104 @@ def proxy(url, port):
serve_http(url, port)


def read_urls(file_name, url_pattern):
    """
    Read tile coordinates, one per line as 'z/x/y', from file_name and yield
    a URL for each by substituting into the {z}, {x} and {y} placeholders in
    url_pattern.
    """

    with open(file_name, 'r') as fh:
        for line in fh:
            zxy = line.split(' ', 1)[0]
            z, x, y = map(int, zxy.split('/', 2))

            u = url_pattern \
                .replace('{z}', str(z)) \
                .replace('{x}', str(x)) \
                .replace('{y}', str(y))

            yield u
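
# illustrative example (made-up URL pattern): given a tiles file containing
# the single line "0/0/0" and the pattern
# 'https://example.com/{z}/{x}/{y}.mvt', read_urls would yield
# 'https://example.com/0/0/0.mvt'.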


def _percentiles_output_text(percentiles, result):
    """
    Output results to the console as columns of text, using ANSI colours
    where available.
    """

    fmt = '%20s' + ' %8d' * len(percentiles)
    header = '%20s' % ('TOTAL',)
    for percentile in percentiles:
        pct_header = 'p%r' % (percentile,)
        header += ' %8s' % (pct_header,)
    click.secho(header, fg='green', bold=True)
    for name in sorted(result.keys()):
        # use a fresh name here, rather than reusing (and shadowing) the
        # percentiles parameter above.
        values = result[name]
        line = fmt % tuple([name] + values)
        click.secho(line, bold=name.startswith('~'))


def _percentiles_output_csv(percentiles, result):
    """
    Output results to the console in CSV format.
    """

    import csv
    from sys import stdout

    writer = csv.writer(stdout)

    headers = ['Layer']
    for percentile in percentiles:
        headers.append('p%r' % (percentile,))
    writer.writerow(headers)

    for name in sorted(result.keys()):
        line = [name]
        for pct in result[name]:
            line.append(str(pct))
        writer.writerow(line)


@cli.command()
@click.argument('tiles_file', required=1)
@click.argument('url', required=1)
@click.option('--percentiles', '-p', multiple=True, type=float,
              help='Percentiles to display. Use decimal floats, e.g: 99.9, '
              'not 99_9. Can be used multiple times.')
@click.option('--cache/--no-cache', default=False, help='Use a cache for '
              'tiles. Can speed up multiple runs considerably.')
@click.option('--nprocs', '-j', default=1, type=int, help='Number of '
              'processes to use to download and do tile size aggregation.')
@click.option('--output-format', '-f', type=click.Choice(['text', 'csv']),
              default='text', help='Format to use when writing results to '
              'the console.')
def percentiles(tiles_file, url, percentiles, cache, nprocs, output_format):
    """
    Download a bunch of tiles and display the percentiles of their sizes,
    broken down by layer.

    The tiles to download should be listed in TILES_FILE, one per line as
    'z/x/y'. The URL to fetch them from should contain {z}, {x} and {y}
    replacements.
    """

    from scoville.percentiles import calculate_percentiles

    if not percentiles:
        percentiles = [50, 90, 99, 99.9]

    tiles = read_urls(tiles_file, url)
    result = calculate_percentiles(tiles, percentiles, cache, nprocs)

    if output_format == 'text':
        _percentiles_output_text(percentiles, result)

    elif output_format == 'csv':
        _percentiles_output_csv(percentiles, result)

    else:
        raise ValueError('Unknown output format %r' % (output_format,))


def scoville_main():
    cli()


if __name__ == '__main__':
    scoville_main()
201 changes: 201 additions & 0 deletions scoville/percentiles.py
@@ -0,0 +1,201 @@
import requests
from collections import defaultdict
from scoville.mvt import Tile


def _fetch_http(url):
    """
    Fetch a tile over HTTP.
    """

    res = requests.get(url)

    # TODO: retry? better error handling!
    if res.status_code != requests.codes.ok:
        raise IOError("Got tile response %d" % (res.status_code,))

    return res.content


def _fetch_cache(url):
    """
    If a tile is present on disk, then use it. Otherwise fetch over HTTP.
    """

    from base64 import urlsafe_b64encode
    from os.path import join, isfile, isdir
    from os import makedirs
    from hashlib import sha224

    # we use the non-query part to store on disk. (tile won't depend on API
    # key, right?) partly because the API key can be very long and overflow
    # the max 255 chars for a filename when base64 encoded.
    no_query = url.split('?', 1)[0]
    encoded = urlsafe_b64encode(no_query)
    assert len(encoded) < 256

    # we use a 2-level hash-based fanout to avoid having so many inodes in
    # a directory that file lookup slows to a crawl.
    hashed = sha224(no_query).hexdigest()
    dir_name = join('.cache', hashed[0:3], hashed[3:6])
    file_name = join(dir_name, encoded)

    data = None
    if isfile(file_name):
        with open(file_name, 'r') as fh:
            data = fh.read()

    else:
        data = _fetch_http(url)
        if not isdir(dir_name):
            makedirs(dir_name)
        with open(file_name, 'w') as fh:
            fh.write(data)

    return data
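
# illustrative example (hash and URL made up): a tile fetched from
# 'https://example.com/0/0/0.mvt?api_key=SECRET' would be cached at a path
# like .cache/AAA/BBB/CCCC, where AAABBB are the first six hex digits of
# sha224('https://example.com/0/0/0.mvt') and CCCC is the urlsafe base64
# encoding of that same query-less URL.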


class Aggregator(object):
    """
    Core of the algorithm. Fetches tiles and aggregates their total and
    per-layer sizes into a set of lists.
    """

    def __init__(self, cache=False):
        self.fetch_fn = _fetch_http
        if cache:
            self.fetch_fn = _fetch_cache

        self.results = defaultdict(list)

    def add(self, tile_url):
        data = self.fetch_fn(tile_url)
        self.results['~total'].append(len(data))

        tile = Tile(data)
        for layer in tile:
            self.results[layer.name].append(layer.size)
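        # at this point, self.results maps layer names (plus the pseudo-layer
        # '~total') to growing lists of sizes in bytes, e.g. (illustrative
        # values): {'~total': [1055273, ...], 'water': [404996, ...], ...}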


# special object to tell worker threads to exit
class Sentinel(object):
    pass


# encode a message to be sent over the "wire" from a worker to the parent
# process. we use msgpack encoding rather than pickle, as pickle was producing
# some very large messages.
def mp_encode(data):
    from msgpack import packb
    return packb(data)


def mp_decode(data):
    from msgpack import unpackb
    return unpackb(data)
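
# round-trip sketch: mp_decode(mp_encode({'water': [1, 2, 3]})) gives back an
# equivalent dict, which is how each worker ships its aggregated results to
# the parent process.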


def worker(input_queue, output_queue, cache):
    """
    Worker for multi-processing. Reads tasks from a queue and feeds them into
    the Aggregator. When all tasks are done it reads a Sentinel and sends the
    aggregated result back on the output queue.
    """

    agg = Aggregator(cache)

    while True:
        obj = input_queue.get()
        if isinstance(obj, Sentinel):
            break

        assert isinstance(obj, (str, unicode))
        agg.add(obj)
        input_queue.task_done()

    output_queue.put(mp_encode(agg.results))


def parallel(tile_urls, cache, nprocs):
    """
    Fetch percentile data in parallel, using nprocs processes.

    This uses two queues; one for input to the workers and one for output
    from the workers. A pool of workers of size nprocs is started, fed with
    jobs from tile_urls, and the results are aggregated at the end and
    returned.
    """

    from multiprocessing import Queue, JoinableQueue, Process

    input_queue = JoinableQueue(nprocs)
    output_queue = Queue(nprocs)

    workers = []
    for i in xrange(0, nprocs):
        w = Process(target=worker, args=(input_queue, output_queue, cache))
        w.start()
        workers.append(w)

    for tile_url in tile_urls:
        input_queue.put(tile_url)

    # join waits for all the tasks to be marked as done. this way we know
    # that enqueuing the Sentinel isn't going to "jump the queue" in front of
    # a task and mean we don't get the full result set back.
    input_queue.join()
    for i in xrange(0, nprocs):
        input_queue.put(Sentinel())

    # after we've queued the Sentinels, each worker should output an
    # aggregated result on the output queue.
    result = defaultdict(list)
    for i in xrange(0, nprocs):
        worker_result = mp_decode(output_queue.get())
        for k, v in worker_result.iteritems():
            result[k].extend(v)

    # and the workers should have exited, so we can clean up the processes.
    for w in workers:
        w.join()

    return result


def sequential(tile_urls, cache):
    agg = Aggregator(cache)
    for tile_url in tile_urls:
        agg.add(tile_url)
    return agg.results


def calculate_percentiles(tile_urls, percentiles, cache, nprocs):
    """
    Fetch tiles and calculate the percentile sizes in total and per-layer.

    Percentiles should be given as a list of decimal numbers between 0 and
    100, e.g: [50, 90, 99].

    Cache, if true, uses a local disk cache for the tiles. This can be very
    useful when re-running percentile calculations.

    Nprocs is the number of processes to use for both fetching and
    aggregation. Even on a system with a single CPU, it can be worth setting
    this to a larger number to make concurrent network requests for tiles.
    """

    if nprocs > 1:
        results = parallel(tile_urls, cache, nprocs)
    else:
        results = sequential(tile_urls, cache)

    pct = {}
    for label, values in results.iteritems():
        values.sort()
        pcts = []
        for p in percentiles:
            # nearest-rank percentile. clamp the index so that p=100 can't
            # run off the end of the list.
            i = min(int(len(values) * p / 100.0), len(values) - 1)
            pcts.append(values[i])

        pct[label] = pcts

    return pct
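
# illustrative usage (made-up URLs):
#
#   urls = ['https://example.com/0/0/0.mvt', 'https://example.com/1/0/0.mvt']
#   result = calculate_percentiles(urls, [50, 90], cache=False, nprocs=1)
#
# result then maps each layer name (plus '~total') to a list of [p50, p90]
# sizes in bytes.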
