
Commit

mrmatrix more flexible (#71)
* Use csv module instead of parsing by hand; Fix #63 (see the sketch after this list)

* better description

* Aspirational command-line options

* Checkpoint: Need to update test

* checkpoint: still buggy [skip ci]

* Runs, but we get one less resolution

* Fix test

* tsv_to_mrmatrix_test.py whitespace

* autopep8

* flake8 clean

* changelog [skip ci]

* start test unlabelled

* unlabelled square

* Make test more generic

* Test aggregation of tall and wide datasets

* Test tall and wide aggregation

* If source data not power of 2...

* Handle first n

* click-ify

* Un-click-ify: Argparse feels good-enough for now

* update changelog [skip ci]

* Fix whitespace; one failing test locally

* Comments and logs

* missing paren
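
A minimal sketch (not the committed code) of the idea behind the first bullet: csv.reader replaces hand-rolled splitting and makes the delimiter configurable. The input file name is a placeholder.

import csv

# Before: hand-rolled parsing, hard-wired to tabs:
#     parts = line.strip().split('\t')
# After: the csv module, with a configurable delimiter.
with open('matrix.tsv', newline='') as handle:  # 'matrix.tsv' is hypothetical
    for row in csv.reader(handle, delimiter='\t'):
        values = [float(v) for v in row[1:]]  # row[0] holds the row label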
mccalluc authored May 21, 2019
1 parent 1e275f4 commit dafc422
Showing 6 changed files with 219 additions and 59 deletions.
File renamed without changes.
4 changes: 4 additions & 0 deletions CHANGELOG
@@ -1,3 +1,7 @@
in progress

- Make tsv_to_mrmatrix more flexible and add it to the exported scripts.

v0.10.7

- Changed bins_per_dimension in npvector.tileset_info to match the value in
131 changes: 79 additions & 52 deletions scripts/tsv_to_mrmatrix.py
@@ -1,12 +1,10 @@
#!/usr/bin/python

import csv
import dask.array as da
import h5py
import math
import numpy as np
import os
import os.path as op
import sys
import argparse
import time

@@ -17,9 +15,7 @@ def coarsen(f, tile_size=256):
'''
grid = f['resolutions']['1']['values']
top_n = grid.shape[0]

max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
max_width = tile_size * 2 ** max_zoom

chunk_size = tile_size * 16
curr_size = grid.shape
@@ -43,84 +39,115 @@ def coarsen(f, tile_size=256):
da.store(dask_dset, values)


def parse(input_handle, output_hdf5, top_n=None):
input_handle
first_line = next(input_handle)
parts = first_line.strip().split('\t')
# TODO: Use the python built-in csv module, instead of parsing by hand.
# https://github.com/higlass/clodius/issues/63
def parse(input_handle, output_hdf5, height, width,
delimiter, first_n, is_square, is_labelled):
reader = csv.reader(input_handle, delimiter=delimiter)
if is_labelled:
first_row = next(reader)
labels = first_row[1:(first_n + 1) if first_n else None]
if is_square:
output_hdf5.create_dataset(
'labels',
data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
compression='lzf')
# TODO: Handle non-square labels
# https://github.com/higlass/clodius/issues/68

if top_n is None:
top_n = len(parts) - 1
# TODO: If it's taller than it is wide, it will be truncated to a square,
# unless an explicit top_n is provided.
# https://github.com/higlass/clodius/issues/64

labels = parts[1:top_n + 1]
tile_size = 256
max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
limit = max(height, width)
max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
max_width = tile_size * 2 ** max_zoom
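# For example, the tall test fixture has height=1000, width=4, so limit=1000,
# max_zoom = ceil(log2(1000 / 256)) = 2, and max_width = 256 * 2**2 = 1024.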

labels_dset = output_hdf5.create_dataset('labels', data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
compression='lzf')

g = output_hdf5.create_group('resolutions')
g1 = g.create_group('1')
ds = g1.create_dataset('values', (max_width, max_width),
dtype='f4', compression='lzf', fillvalue=np.nan)
ds1 = g1.create_dataset('nan_values', (max_width, max_width),
dtype='f4', compression='lzf', fillvalue=0)
# TODO: We don't write to this, and I think we want the nan_values for every resolution.
# https://github.com/higlass/clodius/issues/62
g1.create_dataset('nan_values', (max_width, max_width),
dtype='f4', compression='lzf', fillvalue=0)
# TODO: We don't write to this... Is it necessary?

start_time = time.time()
counter = 0
for line in input_handle:
parts = line.strip().split('\t')[1:top_n + 1]
x = np.array([float(p) for p in parts])
for row in reader:
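# When labelled, row[1:] drops the leading row label; row[None:] keeps the whole row.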
x = np.array([float(p) for p in row[1 if is_labelled else None:]])
ds[counter, :len(x)] = x

counter += 1
if counter == top_n:
if counter == first_n:
break

time_elapsed = time.time() - start_time
time_per_entry = time_elapsed / counter

time_remaining = time_per_entry * (top_n - counter)
time_remaining = time_per_entry * (height - counter)
print("counter:", counter, "sum(x):", sum(x),
"time remaining: {:d} seconds".format(int(time_remaining)))

coarsen(output_hdf5)
output_hdf5.close()


def main():
parser = argparse.ArgumentParser(description="""
python tsv-dense-to-sparse
""")
def get_height(input_path, is_labelled=True):
'''
We need to scan the file once just to see how many lines it contains.
If it is tall and narrow, the first tile will need to be larger than just
looking at the width of the first row would suggest.
'''
with open(input_path) as f:
for i, l in enumerate(f):
pass
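# After the loop, i is the index of the last line, so i + 1 is the line count;
# the labelled case subtracts one for the header row.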
if is_labelled:
return i
else:
return i + 1

parser.add_argument('input_file')
parser.add_argument('output_file')
# parser.add_argument('-o', '--options', default='yo',
# help="Some option", type='str')
# parser.add_argument('-u', '--useless', action='store_true',
# help='Another useless option')
parser.add_argument('-n', '--first-n', type=int, default=None,
help="Only use the first n entries in the matrix")

args = parser.parse_args()
def get_width(input_path, is_labelled, delimiter='\t'):
'''
Assume the number of elements in the first row is the total width.
'''
with open(input_path, 'r', newline='') as input_handle:
reader = csv.reader(input_handle, delimiter=delimiter)
len_row = len(next(reader))
if is_labelled:
return len_row - 1
return len_row

count = 0
top_n = args.first_n

if args.input_file == '-':
f_in = sys.stdin
else:
f_in = open(args.input_file, 'r')
def main():
parser = argparse.ArgumentParser(description='''
Given a tab-delimited file, produces an HDF5 file with mrmatrix
("multi-resolution matrix") structure: Under the "resolutions"
group are datasets, named with successive powers of 2,
which represent successively higher aggregations of the input.
''')
parser.add_argument('input_file', help='TSV file path')
parser.add_argument('output_file', help='HDF5 file')
parser.add_argument('-d', '--delimiter', type=str, default='\t',
metavar='D', help='Delimiter; defaults to tab')
parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N',
help='Only read first N columns from first N rows')
parser.add_argument('-s', '--square', action='store_true',
help='Row labels are assumed to match column labels')
parser.add_argument('-l', '--labelled', action='store_true',
help='TSV Matrix has column and row labels')
args = parser.parse_args()

parse(f_in, h5py.File(args.output_file, 'w'), top_n)
height = get_height(args.input_file, is_labelled=args.labelled)
width = get_width(args.input_file, is_labelled=args.labelled,
delimiter=args.delimiter)
print('height:', height)
print('width:', width)

f_in = open(args.input_file, 'r', newline='')

parse(f_in,
h5py.File(args.output_file, 'w'),
height=height, width=width,
delimiter=args.delimiter,
first_n=args.first_n,
is_square=args.square,
is_labelled=args.labelled)

f = h5py.File(args.output_file, 'r')
print("sum1:", np.nansum(f['resolutions']['1']['values'][0]))
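The new helpers compose end-to-end as below. This is a sketch mirroring the tall-matrix test further down, and it assumes the repository root is on the Python path so that scripts.tsv_to_mrmatrix is importable:

import csv
import h5py
from tempfile import TemporaryDirectory

from scripts.tsv_to_mrmatrix import parse, get_height, get_width

with TemporaryDirectory() as tmp_dir:
    tsv_path = tmp_dir + '/tmp.tsv'
    with open(tsv_path, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        for _ in range(1000):  # tall, unlabelled: 1000 rows x 4 columns
            writer.writerow([1] * 4)

    height = get_height(tsv_path, is_labelled=False)  # 1000
    width = get_width(tsv_path, is_labelled=False)    # 4

    hdf5_path = tmp_dir + '/tmp.hdf5'
    parse(open(tsv_path, 'r', newline=''),
          h5py.File(hdf5_path, 'w'),
          height=height, width=width,
          delimiter='\t', first_n=None,
          is_square=False, is_labelled=False)

    with h5py.File(hdf5_path, 'r') as f:
        print(list(f['resolutions'].keys()))  # ['1', '2', '4']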
5 changes: 4 additions & 1 deletion setup.py
@@ -37,9 +37,12 @@
packages=['clodius', 'clodius.cli', 'clodius.tiles'],
setup_requires=setup_requires,
install_requires=install_requires,
scripts=[
'scripts/tsv_to_mrmatrix.py'
],
entry_points={
'console_scripts': [
'clodius = clodius.cli.aggregate:cli',
'clodius = clodius.cli.aggregate:cli'
]
}
)
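
With the new scripts entry, pip installs tsv_to_mrmatrix.py onto the PATH alongside the clodius console script. A hypothetical invocation exercising the new options (file names are placeholders):

    tsv_to_mrmatrix.py matrix.csv matrix.hdf5 --delimiter ',' --first-n 100 --labelled --square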
131 changes: 126 additions & 5 deletions test/tsv_to_mrmatrix_test.py
@@ -7,7 +7,7 @@
from numpy.testing import assert_array_equal
import h5py

from scripts.tsv_to_mrmatrix import coarsen, parse
from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width


class CoarsenTest(unittest.TestCase):
@@ -21,7 +21,8 @@ def test_5_layer_pyramid(self):
g = hdf5.create_group('resolutions')
g1 = g.create_group('1')
ds = g1.create_dataset('values', (max_width, max_width),
dtype='f4', compression='lzf', fillvalue=np.nan)
dtype='f4', compression='lzf',
fillvalue=np.nan)
for y in range(max_width):
a = np.array([float(x) for x in range(max_width)])
ds[y, :max_width] = a
@@ -70,7 +71,8 @@ def test_math(self):
g = hdf5.create_group('resolutions')
g1 = g.create_group('1')
ds = g1.create_dataset('values', (max_width, max_width),
dtype='f4', compression='lzf', fillvalue=np.nan)
dtype='f4', compression='lzf',
fillvalue=np.nan)
for y in range(max_width):
a = np.array([float(x) for x in range(max_width)])
ds[y, :max_width] = a
@@ -107,7 +109,7 @@ def test_math(self):


class ParseTest(unittest.TestCase):
def test_parse(self):
def test_wide_labelled_square(self):
with TemporaryDirectory() as tmp_dir:
csv_path = tmp_dir + '/tmp.csv'
with open(csv_path, 'w', newline='') as csv_file:
@@ -127,7 +129,11 @@ def test_parse(self):
hdf5_path = tmp_dir + 'tmp.hdf5'
hdf5_write_handle = h5py.File(hdf5_path, 'w')

parse(csv_handle, hdf5_write_handle)
height = get_height(csv_path)
width = get_width(csv_path, is_labelled=True)
parse(csv_handle, hdf5_write_handle, height, width,
delimiter='\t', first_n=None, is_square=True,
is_labelled=True)

hdf5 = h5py.File(hdf5_path, 'r')
self.assertEqual(list(hdf5.keys()), ['labels', 'resolutions'])
@@ -158,3 +164,118 @@ def test_parse(self):
assert_array_equal(res_2[4], [0] * 256)
assert_array_equal(res_2[5], [0] * 256)
assert_array_equal(res_2[6], [0] * 256)
# TODO: We lose nan at higher aggregations.
# https://github.com/higlass/clodius/issues/62

def _assert_unlabelled_roundtrip_lt_256(
self, matrix, delimiter, is_square):
with TemporaryDirectory() as tmp_dir:
csv_path = tmp_dir + '/tmp.csv'
with open(csv_path, 'w', newline='') as csv_file:
writer = csv.writer(csv_file, delimiter=delimiter)
# body:
for row in matrix:
writer.writerow(row)

csv_handle = open(csv_path, 'r')
hdf5_path = tmp_dir + 'tmp.hdf5'
hdf5_write_handle = h5py.File(hdf5_path, 'w')

is_labelled = False
height = get_height(csv_path, is_labelled=is_labelled)
width = get_width(csv_path, is_labelled=is_labelled)
parse(csv_handle, hdf5_write_handle, height, width,
first_n=None, is_labelled=is_labelled,
delimiter=delimiter, is_square=is_square)

hdf5 = h5py.File(hdf5_path, 'r')
self.assertEqual(list(hdf5.keys()), ['resolutions'])
self.assertEqual(list(hdf5['resolutions'].keys()), ['1'])
self.assertEqual(list(hdf5['resolutions']['1'].keys()),
['nan_values', 'values'])
assert_array_equal(
hdf5['resolutions']['1']['nan_values'],
[[0] * len(matrix[0])] * len(matrix)
)
assert_array_equal(
hdf5['resolutions']['1']['values'],
matrix
)

def test_unlabelled_csv_is_square_true(self):
self._assert_unlabelled_roundtrip_lt_256(
matrix=[[x + y for x in range(4)] for y in range(4)],
delimiter=',',
is_square=True
)

def test_unlabelled_tsv_is_square_false(self):
self._assert_unlabelled_roundtrip_lt_256(
matrix=[[x + y for x in range(4)] for y in range(4)],
delimiter='\t',
is_square=False
)

def _assert_unlabelled_roundtrip_1024(
self, matrix, first_row=None, first_col=None, first_n=None):
delimiter = '\t'
is_square = False
with TemporaryDirectory() as tmp_dir:
csv_path = tmp_dir + '/tmp.csv'
with open(csv_path, 'w', newline='') as csv_file:
writer = csv.writer(csv_file, delimiter=delimiter)
# body:
for row in matrix:
writer.writerow(row)

csv_handle = open(csv_path, 'r')
hdf5_path = tmp_dir + 'tmp.hdf5'
hdf5_write_handle = h5py.File(hdf5_path, 'w')

is_labelled = False
height = get_height(csv_path, is_labelled=is_labelled)
width = get_width(csv_path, is_labelled=is_labelled)
parse(csv_handle, hdf5_write_handle, height, width,
first_n=first_n, is_labelled=is_labelled,
delimiter=delimiter, is_square=is_square)

hdf5 = h5py.File(hdf5_path, 'r')
self.assertEqual(list(hdf5.keys()), ['resolutions'])
self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '2', '4'])
self.assertEqual(list(hdf5['resolutions']['1'].keys()),
['nan_values', 'values'])
self.assertEqual(list(hdf5['resolutions']['4'].keys()),
['values'])
res_4 = hdf5['resolutions']['4']['values']
if first_row:
assert_array_equal(res_4[0], first_row)
if first_col:
assert_array_equal(
[res_4[y][0] for y in range(len(first_col))],
first_col)

def test_unlabelled_tsv_tall(self):
self._assert_unlabelled_roundtrip_1024(
matrix=[[1 for x in range(4)] for y in range(1000)],
first_col=[16] * 250 + [0] * 6
)

def test_unlabelled_tsv_wide(self):
self._assert_unlabelled_roundtrip_1024(
matrix=[[1 for x in range(1000)] for y in range(4)],
first_row=[16] * 250 + [0] * 6
)

def test_unlabelled_tsv_tall_first_n(self):
self._assert_unlabelled_roundtrip_1024(
matrix=[[1 for x in range(4)] for y in range(1000)],
first_col=[8] + [0] * 255,
first_n=2
)

def test_unlabelled_tsv_wide_first_n(self):
self._assert_unlabelled_roundtrip_1024(
matrix=[[1 for x in range(1000)] for y in range(4)],
first_row=[8] * 250 + [0] * 6,
first_n=2
)
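
The expected arrays in the tall, wide, and first-n tests follow from the pyramid arithmetic; a quick check of the numbers, using the same formula as the script:

import math

height, width, tile_size = 1000, 4, 256  # the test_unlabelled_tsv_tall fixture
limit = max(height, width)
max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
assert max_zoom == 2                      # hence resolutions '1', '2', '4'
assert tile_size * 2 ** max_zoom == 1024  # grid padded to 1024 x 1024
# At resolution 4, each cell sums a 4x4 block of ones, giving 16;
# 1000 rows / 4 = 250 aggregated rows, then 256 - 250 = 6 trailing zeros.
# With first_n=2 only two rows are read, so a block sums to 2 * 4 = 8.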
7 changes: 6 additions & 1 deletion travis_test.sh
@@ -8,7 +8,12 @@ die() { set +v; echo "$*" 1>&2 ; sleep 1; exit 1; }
# https://github.com/travis-ci/travis-ci/issues/6018

start flake8
flake8
# TODO:
# - Get more files to lint cleanly.
# - Reduce the number of errors which are ignored everywhere else.
flake8 --config=.flake8-ignore
flake8 test/tsv_to_mrmatrix_test.py \
scripts/tsv_to_mrmatrix.py
end flake8

start download
