#! /usr/bin/env python3
'''This is a utility to run a benchmark against a backend.'''
import argparse
import os
import random
import time

import psutil

from simhash_db import Client, GeneralException


parser = argparse.ArgumentParser(description='Run benchmarks on simhash_db')
parser.add_argument('--count', type=int, default=1000,
    help='How many thousands of keys should be inserted per process')
parser.add_argument('--name', type=str, default='testing',
    help='The name of the set of simhashes to use')
parser.add_argument('--processes', type=int, default=(2 * psutil.cpu_count()),
    help='How many processes should be forked (defaults to 2 x CPU count)')
parser.add_argument('--num-blocks', dest='num_blocks', type=int, default=6,
    help='How many blocks to configure the client to use')
parser.add_argument('--num-bits', dest='num_bits', type=int, default=3,
    help='How many bits to configure the client to use')
parser.add_argument('--backend', type=str, required=True,
    help='Which backend to use')
parser.add_argument('--config', type=str, required=False,
    help='Path to a YAML file with the host configuration')
args = parser.parse_args()
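
# An example invocation (the backend name and config path here are purely
# illustrative; use whatever your simhash_db installation provides):
#
#   ./bench.py --backend mongo --count 100 --processes 4 --config config.yml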

# If a configuration file was provided, we should use it
if args.config:
    import yaml
    with open(args.config) as fin:
        kwargs = yaml.safe_load(fin)
else:
    kwargs = {}
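
# Whatever mapping the YAML file holds is passed straight through to Client
# as keyword arguments. A hypothetical configuration (the exact key names
# depend on the chosen backend):
#
#   host: localhost
#   port: 27017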


def make_seeds():
    '''
    Generate all the hashes that we'll be using. We'd like to be able to get
    a large number of hashes for insertion, but we also don't want to:

        1) use a lot of memory holding that set
        2) spend a lot of time generating that set

    So instead, we generate a number of random seed values, and then insert
    1000 hashes beginning at each seed and stepping by another random number.
    These (seed, skip) pairs are what gets returned.'''
    return [(
        random.randint(0, 2 ** 64 - 1),
        random.randint(1, 1000)
    ) for _ in range(args.count)]
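
# For example, a (seed, skip) pair of (17, 3) contributes the hash 17 + 3 * i
# to batch i, so across the 1000 batches it covers 17, 20, 23, ..., 3014
# without that sequence ever being held in memory all at once.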


def insert():
    '''Run the timing numbers for each of the provided seeds for insertion'''
    seeds = make_seeds()
    client = Client(args.backend, args.name, args.num_blocks, args.num_bits,
        **kwargs)
    for i in range(1000):
        if i % 25 == 0:
            print('Inserting batch %i' % i)
        # We want to get a relatively good mix each time we insert data
        hashes = [(start + i * interval) for start, interval in seeds]
        try:
            client.insert(hashes)
        except GeneralException as exc:
            print('---> Client exception: %s' % repr(exc))
    # Since this is meant to be run in a forked subprocess, exit explicitly
    exit(0)


def query():
    '''Run the timing numbers for each of the provided seeds for find_all'''
    seeds = make_seeds()
    client = Client(args.backend, args.name, args.num_blocks, args.num_bits,
        **kwargs)
    for i in range(1000):
        if i % 25 == 0:
            print('Querying batch %i' % i)
        # We want to get a relatively good mix each time we query data
        hashes = [(start + i * interval) for start, interval in seeds]
        try:
            client.find_all(hashes)
        except GeneralException as exc:
            print('---> Client exception: %s' % repr(exc))
    # Like insert(), this runs in a forked subprocess, so exit explicitly
    # rather than fall through into the parent's remaining code
    exit(0)


def time_children(processes, function, *args, **kwargs):
    '''Run `processes` forked processes, each running `function` with the
    provided arguments, and return the timing information'''
    # Negated so that adding time.time() later yields the elapsed seconds
    start = -time.time()
    times = []
    for _ in range(processes):
        pid = os.fork()
        if pid == 0:
            function(*args, **kwargs)
            # A safety net in case the function returns instead of exiting
            os._exit(0)
        else:
            print('---> Started %i' % pid)
    # And now wait for them, and collect the timing information
    for _ in range(processes):
        pid, status = os.wait()
        times.append(start + time.time())
        print('---> %i finished in %fs' % (pid, times[-1]))
    return times
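
# Each entry of the returned list is measured from just before the first fork
# until os.wait() reaped that child, so the last entry is the overall
# wall-clock time for the batch; the report below summarizes the per-child
# min / avg / max.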


# Now run the benchmark itself and print out the results
insert_times = time_children(args.processes, insert)
query_times = time_children(args.processes, query)

# This is how many hashes each child process actually tried to insert, so
# the rates reported below are per-process figures
count = 1000 * args.count

print('Insertion:')
print('    Times (min | avg | max): %10.5f s | %10.5f s | %10.5f s' % (
    min(insert_times), sum(insert_times) / len(insert_times),
    max(insert_times)))
print('    Rate  (min | avg | max): %10i / s | %10i / s | %10i / s' % (
    count / max(insert_times),
    count * len(insert_times) / sum(insert_times),
    count / min(insert_times)))
print('Query:')
print('    Times (min | avg | max): %10.5f s | %10.5f s | %10.5f s' % (
    min(query_times), sum(query_times) / len(query_times),
    max(query_times)))
print('    Rate  (min | avg | max): %10i / s | %10i / s | %10i / s' % (
    count / max(query_times),
    count * len(query_times) / sum(query_times),
    count / min(query_times)))