Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

mrmatrix more flexible #71

Merged
merged 26 commits into from
May 21, 2019
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
in progress

- Make tsv_to_mrmatrix more flexible and add it to the exported scripts.

v0.10.7

- Changed bins_per_dimension in npvector.tileset_info to match the value in
Expand Down
136 changes: 81 additions & 55 deletions scripts/tsv_to_mrmatrix.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
#!/usr/bin/python

import csv
import dask.array as da
import h5py
import math
import numpy as np
import os
import os.path as op
import sys
import argparse
import time

Expand All @@ -17,13 +15,11 @@ def coarsen(f, tile_size=256):
'''
grid = f['resolutions']['1']['values']
top_n = grid.shape[0]

max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
max_width = tile_size * 2 ** max_zoom

chunk_size=tile_size * 16
chunk_size = tile_size * 16
curr_size = grid.shape
dask_dset = da.from_array(grid, chunks=(chunk_size,chunk_size))
dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size))

r = f['resolutions']
curr_resolution = 1
Expand All @@ -36,92 +32,122 @@ def coarsen(f, tile_size=256):
print("curr_size:", curr_size)
g = r.create_group(str(curr_resolution))
values = g.require_dataset('values', curr_size, dtype='f4',
compression='lzf', fillvalue=np.nan)
compression='lzf', fillvalue=np.nan)

dask_dset = dask_dset.rechunk((chunk_size, chunk_size))
dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2})
da.store(dask_dset, values)


def parse(input_handle, output_hdf5, height, width,
          delimiter, first_n, is_square, is_labelled):
    '''
    Read a delimited matrix from input_handle and write a multi-resolution
    HDF5 structure to output_hdf5 (which is closed on completion).

    Parameters:
        input_handle: open text handle over the delimited matrix
        output_hdf5: writable h5py.File; receives "resolutions" (and,
            for labelled square matrices, a "labels" dataset)
        height, width: dimensions of the data (excluding any labels)
        delimiter: field separator passed to csv.reader
        first_n: if given, only the first N rows are read, and only the
            first N column labels are stored. NOTE(review): data columns
            themselves are not truncated to N — confirm that is intended.
        is_square: row labels are assumed to match column labels
        is_labelled: first row and first column are labels, not data
    '''
    reader = csv.reader(input_handle, delimiter=delimiter)
    if is_labelled:
        first_row = next(reader)
        labels = first_row[1:(first_n + 1) if first_n else None]
        if is_square:
            output_hdf5.create_dataset(
                'labels',
                data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
                compression='lzf')
        # TODO: Handle non-square labels

    # The base resolution is a square padded out to tile_size * 2 ** max_zoom,
    # so every zoom level tiles evenly.
    tile_size = 256
    limit = max(height, width)
    max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
    max_width = tile_size * 2 ** max_zoom

    g = output_hdf5.create_group('resolutions')
    g1 = g.create_group('1')
    ds = g1.create_dataset('values', (max_width, max_width),
                           dtype='f4', compression='lzf', fillvalue=np.nan)
    g1.create_dataset('nan_values', (max_width, max_width),
                      dtype='f4', compression='lzf', fillvalue=0)
    # TODO: We don't write to this... Is it necessary?

    start_time = time.time()
    counter = 0
    # Skip the leading label field on each row when the matrix is labelled.
    start_col = 1 if is_labelled else 0
    for row in reader:
        x = np.array([float(p) for p in row[start_col:]])
        ds[counter, :len(x)] = x

        counter += 1
        if counter == first_n:
            break

        time_elapsed = time.time() - start_time
        time_per_entry = time_elapsed / counter
        time_remaining = time_per_entry * (height - counter)
        print("counter:", counter, "sum(x):", sum(x),
              "time remaining: {:d} seconds".format(int(time_remaining)))

    coarsen(output_hdf5)
    output_hdf5.close()


def main():
parser = argparse.ArgumentParser(description="""
def get_height(input_path, is_labelled=True):
    '''
    Return the number of data rows in the file at input_path.

    We need to scan the file once just to see how many lines it contains.
    If it is tall and narrow, the first tile will need to be larger than just
    looking at the width of the first row would suggest.

    If is_labelled, the first line is a header and is not counted.
    '''
    with open(input_path) as f:
        line_count = sum(1 for _ in f)
    # The original enumerate-based scan raised NameError on an empty file;
    # report zero data rows instead.
    if is_labelled:
        return max(line_count - 1, 0)
    return line_count

python tsv-dense-to-sparse
""")

parser.add_argument('input_file')
parser.add_argument('output_file')
#parser.add_argument('-o', '--options', default='yo',
# help="Some option", type='str')
#parser.add_argument('-u', '--useless', action='store_true',
# help='Another useless option')
parser.add_argument('-n', '--first-n', type=int, default=None,
help="Only use the first n entries in the matrix")
def get_width(input_path, is_labelled, delimiter='\t'):
    '''
    Return the matrix width, taken from the number of fields in the first
    row of the file (minus one for the row label, when is_labelled).

    Assume the number of elements in the first row is the total width.
    '''
    with open(input_path, 'r', newline='') as handle:
        first_row = next(csv.reader(handle, delimiter=delimiter))
    return len(first_row) - (1 if is_labelled else 0)

def main():
    '''
    Command-line entry point: scan the input file for its dimensions,
    then convert it to a multi-resolution HDF5 matrix and print a
    checksum of the first row of the base resolution.
    '''
    parser = argparse.ArgumentParser(description='''
    Given a tab-delimited file, produces an HDF5 file with mrmatrix
    ("multi-resolution matrix") structure: Under the "resolutions"
    group are datasets, named with successive powers of 2,
    which represent successively higher aggregations of the input.
    ''')
    parser.add_argument('input_file', help='TSV file path')
    parser.add_argument('output_file', help='HDF5 file')
    parser.add_argument('-d', '--delimiter', type=str, default='\t',
                        metavar='D', help='Delimiter; defaults to tab')
    parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N',
                        help='Only read first N columns from first N rows')
    parser.add_argument('-s', '--square', action='store_true',
                        help='Row labels are assumed to match column labels')
    parser.add_argument('-l', '--labelled', action='store_true',
                        help='TSV Matrix has column and row labels')
    args = parser.parse_args()

    # The file is scanned once up front: a tall, narrow matrix needs a
    # base tile sized by its height, not just the width of the first row.
    height = get_height(args.input_file, is_labelled=args.labelled)
    width = get_width(args.input_file, is_labelled=args.labelled,
                      delimiter=args.delimiter)

    # parse() closes the HDF5 file; the context manager closes the input,
    # which the original left open.
    with open(args.input_file, 'r', newline='') as f_in:
        parse(f_in,
              h5py.File(args.output_file, 'w'),
              height=height, width=width,
              delimiter=args.delimiter,
              first_n=args.first_n,
              is_square=args.square,
              is_labelled=args.labelled)

    f = h5py.File(args.output_file, 'r')
    print("sum1:", np.nansum(f['resolutions']['1']['values'][0]))


if __name__ == '__main__':
    main()
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


print("packages:", find_packages())

setup_requires = [
'numpy',
]
Expand Down Expand Up @@ -38,9 +38,12 @@
packages=['clodius', 'clodius.cli', 'clodius.tiles'],
setup_requires=setup_requires,
install_requires=install_requires,
scripts=[
'scripts/tsv_to_mrmatrix.py'
],
entry_points={
'console_scripts': [
'clodius = clodius.cli.aggregate:cli',
'clodius = clodius.cli.aggregate:cli'
]
}
)
Loading