From dafc422715e3f22b1494196dea0cb87aea2b62e7 Mon Sep 17 00:00:00 2001
From: Chuck McCallum
Date: Tue, 21 May 2019 12:17:57 -0400
Subject: [PATCH] mrmatrix more flexible (#71)

* Use csv module instead of parsing by hand; Fix #63
* better description
* Aspirational command-line options
* Checkpoint: Need to update test
* checkpoint: still buggy [skip ci]
* Runs, but we get one less resolution
* Fix test
* tsv_to_mrmatrix_test.py whitespace
* autopep8
* flake8 clean
* changelog [skip ci]
* start test unlabelled
* unlabelled square
* Make test more generic
* Test aggregation of tall and wide datasets
* Test tall and wide aggregation
* If source data not power of 2...
* Handle first n
* click-ify
* Un-click-ify: Argparse feels good-enough for now
* update changelog [skip ci]
* Fix whitespace; one failing test locally
* Comments and logs
* missing paren
---
 .flake8 => .flake8-ignore    |   0
 CHANGELOG                    |   4 ++
 scripts/tsv_to_mrmatrix.py   | 131 +++++++++++++++++++++--------------
 setup.py                     |   5 +-
 test/tsv_to_mrmatrix_test.py | 131 +++++++++++++++++++++++++++++++++--
 travis_test.sh               |   7 +-
 6 files changed, 219 insertions(+), 59 deletions(-)
 rename .flake8 => .flake8-ignore (100%)

diff --git a/.flake8 b/.flake8-ignore
similarity index 100%
rename from .flake8
rename to .flake8-ignore
diff --git a/CHANGELOG b/CHANGELOG
index 581af515..bace25b4 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+in progress
+
+- Make tsv_to_mrmatrix more flexible and add it to the exported scripts.
+
 v0.10.7
 
 - Changed bins_per_dimension in npvector.tileset_info to match the value in
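
Before the code itself, the shape of what this patch writes. A minimal read-back sketch of the mrmatrix layout, assuming a hypothetical output file 'matrix.hdf5' produced from a labelled, square input; the key names come from the diff below:

    import h5py

    f = h5py.File('matrix.hdf5', 'r')             # hypothetical path
    print(list(f))                                # ['labels', 'resolutions']
    print(list(f['resolutions']))                 # e.g. ['1', '2', '4']: powers of 2
    print(f['resolutions']['1']['values'].shape)  # (max_width, max_width), NaN-padded

'resolutions/1' holds the raw values; each subsequent resolution halves the previous one by summing 2 x 2 blocks, and 'labels' is only written for labelled, square inputs.
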
diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 12f2e057..e4977122 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -1,12 +1,10 @@
 #!/usr/bin/python
+import csv
 import dask.array as da
 import h5py
 import math
 import numpy as np
-import os
-import os.path as op
-import sys
 import argparse
 import time
 
 
@@ -17,9 +15,7 @@ def coarsen(f, tile_size=256):
     '''
     grid = f['resolutions']['1']['values']
     top_n = grid.shape[0]
-    max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
-    max_width = tile_size * 2 ** max_zoom
 
     chunk_size = tile_size * 16
     curr_size = grid.shape
 
@@ -43,51 +39,47 @@ def coarsen(f, tile_size=256):
     da.store(dask_dset, values)
 
 
-def parse(input_handle, output_hdf5, top_n=None):
-    input_handle
-    first_line = next(input_handle)
-    parts = first_line.strip().split('\t')
-    # TODO: Use the python built-in csv module, instead of parsing by handself.
-    # https://github.com/higlass/clodius/issues/63
+def parse(input_handle, output_hdf5, height, width,
+          delimiter, first_n, is_square, is_labelled):
+    reader = csv.reader(input_handle, delimiter=delimiter)
+    if is_labelled:
+        first_row = next(reader)
+        labels = first_row[1:(first_n + 1) if first_n else None]
+        if is_square:
+            output_hdf5.create_dataset(
+                'labels',
+                data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
+                compression='lzf')
+        # TODO: Handle non-square labels
+        # https://github.com/higlass/clodius/issues/68
 
-    if top_n is None:
-        top_n = len(parts) - 1
-        # TODO: If it's taller than it is wide, it will be truncated to a square,
-        # unless an explicit top_n is provided.
-        # https://github.com/higlass/clodius/issues/64
-
-    labels = parts[1:top_n + 1]
     tile_size = 256
-    max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
+    limit = max(height, width)
+    max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
 
-    labels_dset = output_hdf5.create_dataset('labels', data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
-                                             compression='lzf')
-
     g = output_hdf5.create_group('resolutions')
     g1 = g.create_group('1')
     ds = g1.create_dataset('values', (max_width, max_width),
                            dtype='f4', compression='lzf', fillvalue=np.nan)
-    ds1 = g1.create_dataset('nan_values', (max_width, max_width),
-                            dtype='f4', compression='lzf', fillvalue=0)
-    # TODO: We don't write to this, and I think we want the nan_values for every resolution.
-    # https://github.com/higlass/clodius/issues/62
+    g1.create_dataset('nan_values', (max_width, max_width),
+                      dtype='f4', compression='lzf', fillvalue=0)
+    # TODO: We don't write to this... Is it necessary?
 
     start_time = time.time()
     counter = 0
-    for line in input_handle:
-        parts = line.strip().split('\t')[1:top_n + 1]
-        x = np.array([float(p) for p in parts])
+    for row in reader:
+        x = np.array([float(p) for p in row[1 if is_labelled else None:]])
         ds[counter, :len(x)] = x
         counter += 1
-        if counter == top_n:
+        if counter == first_n:
             break
 
         time_elapsed = time.time() - start_time
        time_per_entry = time_elapsed / counter
-        time_remaining = time_per_entry * (top_n - counter)
+        time_remaining = time_per_entry * (height - counter)
         print("counter:", counter, "sum(x):", sum(x),
               "time remaining: {:d} seconds".format(int(time_remaining)))
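
The zoom math above now keys off limit = max(height, width) instead of the label row's length, which is what lets a tall-and-narrow input survive untruncated. Worked through with illustrative numbers (chosen to match the 1000-row test fixtures later in this patch):

    import math

    height, width, tile_size = 1000, 4, 256
    limit = max(height, width)                                       # 1000
    max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))  # ceil(log2(3.906)) = 2
    max_width = tile_size * 2 ** max_zoom                            # 1024

So a 1000 x 4 input is padded out to a 1024 x 1024 grid and aggregates through resolutions 1, 2, and 4.
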
@@ -95,32 +87,67 @@ def parse(input_handle, output_hdf5, top_n=None):
     output_hdf5.close()
 
 
-def main():
-    parser = argparse.ArgumentParser(description="""
-
-    python tsv-dense-to-sparse
-""")
+def get_height(input_path, is_labelled=True):
+    '''
+    We need to scan the file once just to see how many lines it contains.
+    If it is tall and narrow, the first tile will need to be larger than just
+    looking at the width of the first row would suggest.
+    '''
+    with open(input_path) as f:
+        for i, l in enumerate(f):
+            pass
+    if is_labelled:
+        return i
+    else:
+        return i + 1
 
-    parser.add_argument('input_file')
-    parser.add_argument('output_file')
-    # parser.add_argument('-o', '--options', default='yo',
-    #                     help="Some option", type='str')
-    # parser.add_argument('-u', '--useless', action='store_true',
-    #                     help='Another useless option')
-    parser.add_argument('-n', '--first-n', type=int, default=None,
-                        help="Only use the first n entries in the matrix")
-    args = parser.parse_args()
 
+def get_width(input_path, is_labelled, delimiter='\t'):
+    '''
+    Assume the number of elements in the first row is the total width.
+    '''
+    with open(input_path, 'r', newline='') as input_handle:
+        reader = csv.reader(input_handle, delimiter=delimiter)
+        len_row = len(next(reader))
+        if is_labelled:
+            return len_row - 1
+        return len_row
 
-    count = 0
-    top_n = args.first_n
-    if args.input_file == '-':
-        f_in = sys.stdin
-    else:
-        f_in = open(args.input_file, 'r')
 
+def main():
+    parser = argparse.ArgumentParser(description='''
+        Given a tab-delimited file, produces an HDF5 file with mrmatrix
+        ("multi-resolution matrix") structure: Under the "resolutions"
+        group are datasets, named with successive powers of 2,
+        which represent successively higher aggregations of the input.
+    ''')
+    parser.add_argument('input_file', help='TSV file path')
+    parser.add_argument('output_file', help='HDF5 file')
+    parser.add_argument('-d', '--delimiter', type=str, default='\t',
+                        metavar='D', help='Delimiter; defaults to tab')
+    parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N',
+                        help='Only read first N columns from first N rows')
+    parser.add_argument('-s', '--square', action='store_true',
+                        help='Row labels are assumed to match column labels')
+    parser.add_argument('-l', '--labelled', action='store_true',
+                        help='TSV Matrix has column and row labels')
+    args = parser.parse_args()
 
-    parse(f_in, h5py.File(args.output_file, 'w'), top_n)
+    height = get_height(args.input_file, is_labelled=args.labelled)
+    width = get_width(args.input_file, is_labelled=args.labelled,
+                      delimiter=args.delimiter)
+    print('height:', height)
+    print('width:', width)
+
+    f_in = open(args.input_file, 'r', newline='')
+
+    parse(f_in,
+          h5py.File(args.output_file, 'w'),
+          height=height, width=width,
+          delimiter=args.delimiter,
+          first_n=args.first_n,
+          is_square=args.square,
+          is_labelled=args.labelled)
 
     f = h5py.File(args.output_file, 'r')
     print("sum1:", np.nansum(f['resolutions']['1']['values'][0]))
diff --git a/setup.py b/setup.py
index e7a70911..10088011 100644
--- a/setup.py
+++ b/setup.py
@@ -37,9 +37,12 @@
     packages=['clodius', 'clodius.cli', 'clodius.tiles'],
     setup_requires=setup_requires,
     install_requires=install_requires,
+    scripts=[
+        'scripts/tsv_to_mrmatrix.py'
+    ],
     entry_points={
         'console_scripts': [
-            'clodius = clodius.cli.aggregate:cli',
+            'clodius = clodius.cli.aggregate:cli'
         ]
     }
 )
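
With setup.py now shipping scripts/tsv_to_mrmatrix.py, the command lands on the PATH at install time; the same plumbing can also be driven directly from Python, which is what the tests below do. A minimal sketch, assuming a labelled, square, tab-delimited input (the file names 'matrix.tsv' and 'matrix.hdf5' are hypothetical):

    import h5py
    from scripts.tsv_to_mrmatrix import get_height, get_width, parse

    height = get_height('matrix.tsv', is_labelled=True)
    width = get_width('matrix.tsv', is_labelled=True, delimiter='\t')
    with open('matrix.tsv', 'r', newline='') as f_in:
        # parse() writes the pyramid and closes the HDF5 handle itself.
        parse(f_in, h5py.File('matrix.hdf5', 'w'),
              height=height, width=width, delimiter='\t',
              first_n=None, is_square=True, is_labelled=True)
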
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 350243e1..36878cfd 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -7,7 +7,7 @@
 from numpy.testing import assert_array_equal
 import h5py
 
-from scripts.tsv_to_mrmatrix import coarsen, parse
+from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width
 
 
 class CoarsenTest(unittest.TestCase):
@@ -21,7 +21,8 @@ def test_5_layer_pyramid(self):
         g = hdf5.create_group('resolutions')
         g1 = g.create_group('1')
         ds = g1.create_dataset('values', (max_width, max_width),
-                               dtype='f4', compression='lzf', fillvalue=np.nan)
+                               dtype='f4', compression='lzf',
+                               fillvalue=np.nan)
         for y in range(max_width):
             a = np.array([float(x) for x in range(max_width)])
             ds[y, :max_width] = a
@@ -70,7 +71,8 @@ def test_math(self):
         g = hdf5.create_group('resolutions')
         g1 = g.create_group('1')
         ds = g1.create_dataset('values', (max_width, max_width),
-                               dtype='f4', compression='lzf', fillvalue=np.nan)
+                               dtype='f4', compression='lzf',
+                               fillvalue=np.nan)
         for y in range(max_width):
             a = np.array([float(x) for x in range(max_width)])
             ds[y, :max_width] = a
@@ -107,7 +109,7 @@
 
 
 class ParseTest(unittest.TestCase):
-    def test_parse(self):
+    def test_wide_labelled_square(self):
         with TemporaryDirectory() as tmp_dir:
             csv_path = tmp_dir + '/tmp.csv'
             with open(csv_path, 'w', newline='') as csv_file:
@@ -127,7 +129,11 @@
             csv_handle = open(csv_path, 'r')
             hdf5_path = tmp_dir + 'tmp.hdf5'
             hdf5_write_handle = h5py.File(hdf5_path, 'w')
 
-            parse(csv_handle, hdf5_write_handle)
+            height = get_height(csv_path)
+            width = get_width(csv_path, is_labelled=True)
+            parse(csv_handle, hdf5_write_handle, height, width,
+                  delimiter='\t', first_n=None, is_square=True,
+                  is_labelled=True)
 
             hdf5 = h5py.File(hdf5_path, 'r')
             self.assertEqual(list(hdf5.keys()), ['labels', 'resolutions'])
@@ -158,3 +164,118 @@
             assert_array_equal(res_2[4], [0] * 256)
             assert_array_equal(res_2[5], [0] * 256)
             assert_array_equal(res_2[6], [0] * 256)
+            # TODO: We lose nan at higher aggregations.
+            # https://github.com/higlass/clodius/issues/62
+
+    def _assert_unlabelled_roundtrip_lt_256(
+            self, matrix, delimiter, is_square):
+        with TemporaryDirectory() as tmp_dir:
+            csv_path = tmp_dir + '/tmp.csv'
+            with open(csv_path, 'w', newline='') as csv_file:
+                writer = csv.writer(csv_file, delimiter=delimiter)
+                # body:
+                for row in matrix:
+                    writer.writerow(row)
+
+            csv_handle = open(csv_path, 'r')
+            hdf5_path = tmp_dir + 'tmp.hdf5'
+            hdf5_write_handle = h5py.File(hdf5_path, 'w')
+
+            is_labelled = False
+            height = get_height(csv_path, is_labelled=is_labelled)
+            width = get_width(csv_path, is_labelled=is_labelled)
+            parse(csv_handle, hdf5_write_handle, height, width,
+                  first_n=None, is_labelled=is_labelled,
+                  delimiter=delimiter, is_square=is_square)
+
+            hdf5 = h5py.File(hdf5_path, 'r')
+            self.assertEqual(list(hdf5.keys()), ['resolutions'])
+            self.assertEqual(list(hdf5['resolutions'].keys()), ['1'])
+            self.assertEqual(list(hdf5['resolutions']['1'].keys()),
+                             ['nan_values', 'values'])
+            assert_array_equal(
+                hdf5['resolutions']['1']['nan_values'],
+                [[0] * len(matrix[0])] * len(matrix)
+            )
+            assert_array_equal(
+                hdf5['resolutions']['1']['values'],
+                matrix
+            )
+
+    def test_unlabelled_csv_is_square_true(self):
+        self._assert_unlabelled_roundtrip_lt_256(
+            matrix=[[x + y for x in range(4)] for y in range(4)],
+            delimiter=',',
+            is_square=True
+        )
+
+    def test_unlabelled_tsv_is_square_false(self):
+        self._assert_unlabelled_roundtrip_lt_256(
+            matrix=[[x + y for x in range(4)] for y in range(4)],
+            delimiter='\t',
+            is_square=False
+        )
+
+    def _assert_unlabelled_roundtrip_1024(
+            self, matrix, first_row=None, first_col=None, first_n=None):
+        delimiter = '\t'
+        is_square = False
+        with TemporaryDirectory() as tmp_dir:
+            csv_path = tmp_dir + '/tmp.csv'
+            with open(csv_path, 'w', newline='') as csv_file:
+                writer = csv.writer(csv_file, delimiter=delimiter)
+                # body:
+                for row in matrix:
+                    writer.writerow(row)
+
+            csv_handle = open(csv_path, 'r')
+            hdf5_path = tmp_dir + 'tmp.hdf5'
+            hdf5_write_handle = h5py.File(hdf5_path, 'w')
+
+            is_labelled = False
+            height = get_height(csv_path, is_labelled=is_labelled)
+            width = get_width(csv_path, is_labelled=is_labelled)
+            parse(csv_handle, hdf5_write_handle, height, width,
+                  first_n=first_n, is_labelled=is_labelled,
+                  delimiter=delimiter, is_square=is_square)
+
+            hdf5 = h5py.File(hdf5_path, 'r')
+            self.assertEqual(list(hdf5.keys()), ['resolutions'])
+            self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '2', '4'])
+            self.assertEqual(list(hdf5['resolutions']['1'].keys()),
+                             ['nan_values', 'values'])
+            self.assertEqual(list(hdf5['resolutions']['4'].keys()),
+                             ['values'])
+            res_4 = hdf5['resolutions']['4']['values']
+            if first_row:
+                assert_array_equal(res_4[0], first_row)
+            if first_col:
+                assert_array_equal(
+                    [res_4[y][0] for y in range(len(first_col))],
+                    first_col)
+
+    def test_unlabelled_tsv_tall(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(4)] for y in range(1000)],
+            first_col=[16] * 250 + [0] * 6
+        )
+
+    def test_unlabelled_tsv_wide(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(1000)] for y in range(4)],
+            first_row=[16] * 250 + [0] * 6
+        )
+
+    def test_unlabelled_tsv_tall_first_n(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(4)] for y in range(1000)],
+            first_col=[8] + [0] * 255,
+            first_n=2
+        )
+
+    def test_unlabelled_tsv_wide_first_n(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(1000)] for y in range(4)],
+            first_row=[8] * 250 + [0] * 6,
+            first_n=2
+        )
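
The expected arrays in the tall and wide tests fall out of the 2 x 2-sum aggregation: at resolution 4, each cell covers a 4 x 4 block of the input, and the 1000-cell side is padded out to 1024 before coarsening. The arithmetic, spelled out (variable names here are illustrative):

    # 4 rows x 1000 columns of ones, read in full:
    full_block = 4 * 4 * 1                       # 16 per resolution-4 cell
    cells_with_data = 1000 // 4                  # 250
    padding_cells = 1024 // 4 - cells_with_data  # 6 cells over the NaN padding
    expected_row = [full_block] * cells_with_data + [0] * padding_cells

    # With first_n=2 only two rows are read, so each block sums 4 * 2 * 1 = 8,
    # which is why the first_n tests expect 8s in place of 16s.
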
diff --git a/travis_test.sh b/travis_test.sh
index d3db68d4..e146eb81 100755
--- a/travis_test.sh
+++ b/travis_test.sh
@@ -8,7 +8,12 @@ die() { set +v; echo "$*" 1>&2 ; sleep 1; exit 1; }
 # https://github.com/travis-ci/travis-ci/issues/6018
 
 start flake8
-flake8
+# TODO:
+# - Get more files to lint cleanly.
+# - Reduce the number of errors which are ignored everywhere else.
+flake8 --config=.flake8-ignore
+flake8 test/tsv_to_mrmatrix_test.py \
+       scripts/tsv_to_mrmatrix.py
 end flake8
 
 start download
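
Once the script has run, a single 256 x 256 tile can be sliced straight out of the pyramid, echoing the "sum1" sanity print in main(). A sketch, with a hypothetical path and illustrative tile coordinates, assuming the input was large enough for coarsen() to have produced a '4' resolution:

    import h5py
    import numpy as np

    f = h5py.File('matrix.hdf5', 'r')         # hypothetical output path
    values = f['resolutions']['4']['values']  # each cell sums a 4 x 4 block
    tile_size, tile_x, tile_y = 256, 0, 0     # tile coordinates, illustrative
    tile = values[tile_y * tile_size:(tile_y + 1) * tile_size,
                  tile_x * tile_size:(tile_x + 1) * tile_size]
    print(np.nansum(tile))
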