From 6f9333e18ddb0b72035920100d2666e76e58e1ee Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 10:11:38 -0400 Subject: [PATCH 01/24] Use csv module instead of parsing by hand; Fix #63 --- scripts/tsv_to_mrmatrix.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index a0214934..04c88131 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -1,5 +1,6 @@ #!/usr/bin/python +import csv import dask.array as da import h5py import math @@ -44,10 +45,8 @@ def coarsen(f, tile_size=256): def parse(input_handle, output_hdf5, top_n=None): - input_handle - first_line = next(input_handle) - parts = first_line.strip().split('\t') - # TODO: Use the python built-in csv module, instead of parsing by hand? + reader = csv.reader(input_handle, delimiter='\t') + parts = next(reader) if top_n is None: top_n = len(parts) - 1 @@ -59,8 +58,10 @@ def parse(input_handle, output_hdf5, top_n=None): max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2)) max_width = tile_size * 2 ** max_zoom - labels_dset = output_hdf5.create_dataset('labels', data=np.array(labels, dtype=h5py.special_dtype(vlen=str)), - compression='lzf') + labels_dset = output_hdf5.create_dataset( + 'labels', + data=np.array(labels, dtype=h5py.special_dtype(vlen=str)), + compression='lzf') g = output_hdf5.create_group('resolutions') g1 = g.create_group('1') @@ -72,9 +73,8 @@ def parse(input_handle, output_hdf5, top_n=None): start_time = time.time() counter = 0 - for line in input_handle: - parts = line.strip().split('\t')[1:top_n+1] - x = np.array([float(p) for p in parts]) + for row in reader: + x = np.array([float(p) for p in row[1:]]) ds[counter,:len(x)] = x counter += 1 @@ -114,7 +114,7 @@ def main(): if args.input_file == '-': f_in = sys.stdin else: - f_in = open(args.input_file, 'r') + f_in = open(args.input_file, 'r', newline='') parse(f_in, h5py.File(args.output_file, 'w'), top_n) From 15b786ebe456eebad6ffefd871edc03b3668d260 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 10:21:30 -0400 Subject: [PATCH 02/24] better description --- scripts/tsv_to_mrmatrix.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 04c88131..092566ef 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -92,17 +92,14 @@ def parse(input_handle, output_hdf5, top_n=None): def main(): - parser = argparse.ArgumentParser(description=""" - - python tsv-dense-to-sparse -""") + parser = argparse.ArgumentParser(description=''' + Given a tab-delimited file, produces an HDF5 file with mrmatrix ("multi-resolution matrix") + structure: Under the "resolutions" group are datasets, named with successive powers of 2, + which represent successively higher aggregations of the input. + ''') parser.add_argument('input_file') parser.add_argument('output_file') - #parser.add_argument('-o', '--options', default='yo', - # help="Some option", type='str') - #parser.add_argument('-u', '--useless', action='store_true', - # help='Another useless option') parser.add_argument('-n', '--first-n', type=int, default=None, help="Only use the first n entries in the matrix") From afde554d76affa34111a905c904b2b0abee94485 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 10:40:00 -0400 Subject: [PATCH 03/24] Aspirational command-line options --- scripts/tsv_to_mrmatrix.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 092566ef..5b0c593d 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -97,11 +97,16 @@ def main(): structure: Under the "resolutions" group are datasets, named with successive powers of 2, which represent successively higher aggregations of the input. ''') - - parser.add_argument('input_file') - parser.add_argument('output_file') - parser.add_argument('-n', '--first-n', type=int, default=None, - help="Only use the first n entries in the matrix") + parser.add_argument('input_file', help='TSV file path, or "-" for STDIN') + parser.add_argument('output_file', help='HDF5 file') + parser.add_argument('-d', '--delimiter', type=str, default='\t', metavar='D', + help='Delimiter; defaults to tab') + parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N', + help='Only read the first n columns from the first n rows') + parser.add_argument('-s', '--square', action='store_true', + help='Row labels are assumed to match column labels') + parser.add_argument('-u', '--unlabelled', action='store_true', + help='TSV Matrix contains only numbers: no column or row labels') args = parser.parse_args() From d37958d9c7669443a897172317129c376ce0b859 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 11:35:17 -0400 Subject: [PATCH 04/24] Checkpoint: Need to update test --- scripts/tsv_to_mrmatrix.py | 69 ++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 5b0c593d..732c7790 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -44,25 +44,23 @@ def coarsen(f, tile_size=256): da.store(dask_dset, values) -def parse(input_handle, output_hdf5, top_n=None): - reader = csv.reader(input_handle, delimiter='\t') - parts = next(reader) +def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_labelled): + reader = csv.reader(input_handle, delimiter=delimiter) + if is_labelled: + first_row = next(reader) + labels = first_row[1:(first_n + 1) if first_n else None] + if is_square: + output_hdf5.create_dataset( + 'labels', + data=np.array(labels, dtype=h5py.special_dtype(vlen=str)), + compression='lzf') + # TODO: Handle non-square labels - if top_n is None: - top_n = len(parts) - 1 - # TODO: So if it's taller than it is wide, it will be truncated to a square, - # unless an explicit top_n is provided? That doesn't seem right. - - labels = parts[1:top_n+1] tile_size = 256 - max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2)) + limit = min(first_n, height) if first_n else height + max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2)) max_width = tile_size * 2 ** max_zoom - labels_dset = output_hdf5.create_dataset( - 'labels', - data=np.array(labels, dtype=h5py.special_dtype(vlen=str)), - compression='lzf') - g = output_hdf5.create_group('resolutions') g1 = g.create_group('1') ds = g1.create_dataset('values', (max_width, max_width), @@ -78,7 +76,7 @@ def parse(input_handle, output_hdf5, top_n=None): ds[counter,:len(x)] = x counter += 1 - if counter == top_n: + if counter == first_n: break time_elapsed = time.time() - start_time @@ -91,13 +89,28 @@ def parse(input_handle, output_hdf5, top_n=None): output_hdf5.close() +def get_height(input_path, is_labelled=False): + ''' + We need to scan the file once just to see how many lines it contains. + If it is tall and narrow, the first tile will need to be larger than just + looking at the width of the first row would suggest. + ''' + with open(fname) as f: + for i, l in enumerate(f): + pass + if is_labelled: + return i + else: + return i + 1 + + def main(): parser = argparse.ArgumentParser(description=''' Given a tab-delimited file, produces an HDF5 file with mrmatrix ("multi-resolution matrix") structure: Under the "resolutions" group are datasets, named with successive powers of 2, which represent successively higher aggregations of the input. ''') - parser.add_argument('input_file', help='TSV file path, or "-" for STDIN') + parser.add_argument('input_file', help='TSV file path') parser.add_argument('output_file', help='HDF5 file') parser.add_argument('-d', '--delimiter', type=str, default='\t', metavar='D', help='Delimiter; defaults to tab') @@ -105,20 +118,20 @@ def main(): help='Only read the first n columns from the first n rows') parser.add_argument('-s', '--square', action='store_true', help='Row labels are assumed to match column labels') - parser.add_argument('-u', '--unlabelled', action='store_true', - help='TSV Matrix contains only numbers: no column or row labels') - + parser.add_argument('-l', '--labelled', action='store_true', + help='TSV Matrix has column and row labels') args = parser.parse_args() - count = 0 - top_n = args.first_n - - if args.input_file == '-': - f_in = sys.stdin - else: - f_in = open(args.input_file, 'r', newline='') + height = get_height(args.input_file, is_labelled=args.labelled) + f_in = open(args.input_file, 'r', newline='') - parse(f_in, h5py.File(args.output_file, 'w'), top_n) + parse(f_in, + h5py.File(args.output_file, 'w'), + height, + delimiter=args.delimiter, + first_n=args.first_n, + is_square=args.square, + is_labelled=args.labelled) f = h5py.File(args.output_file, 'r') print("sum1:", np.nansum(f['resolutions']['1']['values'][0])) From 549ce5ed1bf3ce3e413a6b95d81048a3fb5e0376 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 12:18:59 -0400 Subject: [PATCH 05/24] checkpoint: still buggy [skip ci] --- scripts/tsv_to_mrmatrix.py | 6 ++++-- test/tsv_to_mrmatrix_test.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 732c7790..eaad0b75 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -10,7 +10,7 @@ import sys import argparse import time - +import logging def coarsen(f, tile_size=256): ''' @@ -60,6 +60,8 @@ def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_l limit = min(first_n, height) if first_n else height max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2)) max_width = tile_size * 2 ** max_zoom + logging.info('max_zoom: %s' % max_zoom) + logging.info('max_width: %s' % max_width) g = output_hdf5.create_group('resolutions') g1 = g.create_group('1') @@ -95,7 +97,7 @@ def get_height(input_path, is_labelled=False): If it is tall and narrow, the first tile will need to be larger than just looking at the width of the first row would suggest. ''' - with open(fname) as f: + with open(input_path) as f: for i, l in enumerate(f): pass if is_labelled: diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index 0830737e..7b3310b7 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -7,7 +7,7 @@ from numpy.testing import assert_array_equal import h5py -from scripts.tsv_to_mrmatrix import coarsen, parse +from scripts.tsv_to_mrmatrix import coarsen, parse, get_height class CoarsenTest(unittest.TestCase): def test_5_layer_pyramid(self): @@ -121,7 +121,9 @@ def test_parse(self): hdf5_path = tmp_dir + 'tmp.hdf5' hdf5_write_handle = h5py.File(hdf5_path, 'w') - parse(csv_handle, hdf5_write_handle) + height = get_height(csv_path) + parse(csv_handle, hdf5_write_handle, height, + delimiter='\t', first_n=None, is_square=True, is_labelled=True) hdf5 = h5py.File(hdf5_path, 'r') self.assertEqual(list(hdf5.keys()), ['labels', 'resolutions']) From 4f41d1d603998590bf78a794378714ef5f323a85 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 13:12:04 -0400 Subject: [PATCH 06/24] Runs, but we get one less resolution --- scripts/tsv_to_mrmatrix.py | 23 +++++++++++++++-------- test/tsv_to_mrmatrix_test.py | 5 +++-- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index eaad0b75..a1540471 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -44,7 +44,7 @@ def coarsen(f, tile_size=256): da.store(dask_dset, values) -def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_labelled): +def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_square, is_labelled): reader = csv.reader(input_handle, delimiter=delimiter) if is_labelled: first_row = next(reader) @@ -54,14 +54,12 @@ def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_l 'labels', data=np.array(labels, dtype=h5py.special_dtype(vlen=str)), compression='lzf') - # TODO: Handle non-square labels + # TODO: Handle non-square labels tile_size = 256 - limit = min(first_n, height) if first_n else height + limit = first_n if first_n else max(height, width) max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2)) max_width = tile_size * 2 ** max_zoom - logging.info('max_zoom: %s' % max_zoom) - logging.info('max_width: %s' % max_width) g = output_hdf5.create_group('resolutions') g1 = g.create_group('1') @@ -84,14 +82,14 @@ def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_l time_elapsed = time.time() - start_time time_per_entry = time_elapsed / counter - time_remaining = time_per_entry * (top_n - counter) + time_remaining = time_per_entry * (height - counter) print("counter:", counter, "sum(x):", sum(x), "time remaining: {:d} seconds".format(int(time_remaining))) coarsen(output_hdf5) output_hdf5.close() -def get_height(input_path, is_labelled=False): +def get_height(input_path, is_labelled=True): ''' We need to scan the file once just to see how many lines it contains. If it is tall and narrow, the first tile will need to be larger than just @@ -105,6 +103,14 @@ def get_height(input_path, is_labelled=False): else: return i + 1 +def get_width(input_path, delimiter='\t'): + ''' + Assume the number of elements in the first row is the total width. + ''' + with open(input_path, 'r', newline='') as input_handle: + reader = csv.reader(input_handle, delimiter=delimiter) + return len(next(reader)) + def main(): parser = argparse.ArgumentParser(description=''' @@ -125,11 +131,12 @@ def main(): args = parser.parse_args() height = get_height(args.input_file, is_labelled=args.labelled) + width = get_width(args.input_file, delimiter=args.delimiter) f_in = open(args.input_file, 'r', newline='') parse(f_in, h5py.File(args.output_file, 'w'), - height, + height, width, delimiter=args.delimiter, first_n=args.first_n, is_square=args.square, diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index 7b3310b7..b567be86 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -7,7 +7,7 @@ from numpy.testing import assert_array_equal import h5py -from scripts.tsv_to_mrmatrix import coarsen, parse, get_height +from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width class CoarsenTest(unittest.TestCase): def test_5_layer_pyramid(self): @@ -122,7 +122,8 @@ def test_parse(self): hdf5_write_handle = h5py.File(hdf5_path, 'w') height = get_height(csv_path) - parse(csv_handle, hdf5_write_handle, height, + width = get_width(csv_path) + parse(csv_handle, hdf5_write_handle, height, width, delimiter='\t', first_n=None, is_square=True, is_labelled=True) hdf5 = h5py.File(hdf5_path, 'r') From 1d148c857aad91e439ba53e73777619279d6f238 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 14:06:41 -0400 Subject: [PATCH 07/24] Fix test --- scripts/tsv_to_mrmatrix.py | 12 +++++++----- test/tsv_to_mrmatrix_test.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index a1540471..2bdbcdb0 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -18,7 +18,6 @@ def coarsen(f, tile_size=256): ''' grid = f['resolutions']['1']['values'] top_n = grid.shape[0] - max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2)) max_width = tile_size * 2 ** max_zoom @@ -103,13 +102,16 @@ def get_height(input_path, is_labelled=True): else: return i + 1 -def get_width(input_path, delimiter='\t'): +def get_width(input_path, is_labelled, delimiter='\t'): ''' Assume the number of elements in the first row is the total width. ''' with open(input_path, 'r', newline='') as input_handle: reader = csv.reader(input_handle, delimiter=delimiter) - return len(next(reader)) + len_row = len(next(reader)) + if is_labelled: + return len_row - 1 + return len_row def main(): @@ -131,12 +133,12 @@ def main(): args = parser.parse_args() height = get_height(args.input_file, is_labelled=args.labelled) - width = get_width(args.input_file, delimiter=args.delimiter) + width = get_width(args.input_file, is_labelled=args.labelled, delimiter=args.delimiter) f_in = open(args.input_file, 'r', newline='') parse(f_in, h5py.File(args.output_file, 'w'), - height, width, + height=height, width=width, delimiter=args.delimiter, first_n=args.first_n, is_square=args.square, diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index b567be86..c7ca7f19 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -122,7 +122,7 @@ def test_parse(self): hdf5_write_handle = h5py.File(hdf5_path, 'w') height = get_height(csv_path) - width = get_width(csv_path) + width = get_width(csv_path, is_labelled=True) parse(csv_handle, hdf5_write_handle, height, width, delimiter='\t', first_n=None, is_square=True, is_labelled=True) From 6209b4c99cee8cb93e35878857e758aabcf8b98e Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 14:47:11 -0400 Subject: [PATCH 08/24] tsv_to_mrmatrix_test.py whitespace --- .flake8 => .flake8-ignore | 0 test/tsv_to_mrmatrix_test.py | 38 ++++++++++++++++++++++++------------ travis_test.sh | 7 ++++++- 3 files changed, 32 insertions(+), 13 deletions(-) rename .flake8 => .flake8-ignore (100%) diff --git a/.flake8 b/.flake8-ignore similarity index 100% rename from .flake8 rename to .flake8-ignore diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index c7ca7f19..bde998bc 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -9,6 +9,7 @@ from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width + class CoarsenTest(unittest.TestCase): def test_5_layer_pyramid(self): tile_size = 4 @@ -20,7 +21,8 @@ def test_5_layer_pyramid(self): g = hdf5.create_group('resolutions') g1 = g.create_group('1') ds = g1.create_dataset('values', (max_width, max_width), - dtype='f4', compression='lzf', fillvalue=np.nan) + dtype='f4', compression='lzf', + fillvalue=np.nan) for y in range(max_width): a = np.array([float(x) for x in range(max_width)]) ds[y, :max_width] = a @@ -28,8 +30,10 @@ def test_5_layer_pyramid(self): # before coarsen() self.assertEqual(list(hdf5.keys()), ['resolutions']) self.assertEqual(list(hdf5['resolutions'].keys()), ['1']) - self.assertEqual(list(hdf5['resolutions']['1'].keys()), ['values']) - self.assertEqual(list(hdf5['resolutions']['1']['values'].shape), [64, 64]) + self.assertEqual(list(hdf5['resolutions']['1'].keys()), + ['values']) + self.assertEqual(list(hdf5['resolutions']['1']['values'].shape), + [64, 64]) self.assertEqual( hdf5['resolutions']['1']['values'][:].tolist()[0], [float(x) for x in range(64)] @@ -39,8 +43,10 @@ def test_5_layer_pyramid(self): # after coarsen() self.assertEqual(list(hdf5.keys()), ['resolutions']) - self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '16', '2', '4', '8']) - self.assertEqual(list(hdf5['resolutions']['16'].keys()), ['values']) + self.assertEqual(list(hdf5['resolutions'].keys()), + ['1', '16', '2', '4', '8']) + self.assertEqual(list(hdf5['resolutions']['16'].keys()), + ['values']) shapes = { '1': 64, '2': 32, @@ -49,7 +55,8 @@ def test_5_layer_pyramid(self): '16': 4 } for (k, v) in shapes.items(): - self.assertEqual(hdf5['resolutions'][k]['values'].shape, (v, v)) + self.assertEqual(hdf5['resolutions'][k]['values'].shape, + (v, v)) row = [1920, 6016, 10112, 14208] self.assertEqual( hdf5['resolutions']['16']['values'][:].tolist(), @@ -66,7 +73,8 @@ def test_math(self): g = hdf5.create_group('resolutions') g1 = g.create_group('1') ds = g1.create_dataset('values', (max_width, max_width), - dtype='f4', compression='lzf', fillvalue=np.nan) + dtype='f4', compression='lzf', + fillvalue=np.nan) for y in range(max_width): a = np.array([float(x) for x in range(max_width)]) ds[y, :max_width] = a @@ -83,7 +91,8 @@ def test_math(self): '4': 2 } for (k, v) in shapes.items(): - self.assertEqual(hdf5['resolutions'][k]['values'].shape, (v, v)) + self.assertEqual(hdf5['resolutions'][k]['values'].shape, + (v, v)) row8 = list(range(8)) assert_array_equal( @@ -100,6 +109,7 @@ def test_math(self): hdf5['resolutions']['4']['values'], [row2 for _ in range(2)]) + class ParseTest(unittest.TestCase): def test_parse(self): with TemporaryDirectory() as tmp_dir: @@ -124,7 +134,8 @@ def test_parse(self): height = get_height(csv_path) width = get_width(csv_path, is_labelled=True) parse(csv_handle, hdf5_write_handle, height, width, - delimiter='\t', first_n=None, is_square=True, is_labelled=True) + delimiter='\t', first_n=None, is_square=True, + is_labelled=True) hdf5 = h5py.File(hdf5_path, 'r') self.assertEqual(list(hdf5.keys()), ['labels', 'resolutions']) @@ -132,7 +143,8 @@ def test_parse(self): self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '2']) - self.assertEqual(list(hdf5['resolutions']['1'].keys()), ['nan_values', 'values']) + self.assertEqual(list(hdf5['resolutions']['1'].keys()), + ['nan_values', 'values']) assert_array_equal( hdf5['resolutions']['1']['nan_values'], [[0] * 512] * 512 ) @@ -145,9 +157,11 @@ def test_parse(self): self.assertEqual(list(hdf5['resolutions']['2'].keys()), ['values']) res_2 = hdf5['resolutions']['2']['values'] assert_array_equal(res_2[0], [0] * 256) - assert_array_equal(res_2[1], [2] * 256) # Stradles the 0 and 1 rows + assert_array_equal(res_2[1], [2] * 256) + # Stradles the 0 and 1 rows assert_array_equal(res_2[2], [4] * 256) - assert_array_equal(res_2[3], [0] * 256) # -1 and +1 cancel out + assert_array_equal(res_2[3], [0] * 256) + # -1 and +1 cancel out assert_array_equal(res_2[4], [0] * 256) assert_array_equal(res_2[5], [0] * 256) assert_array_equal(res_2[6], [0] * 256) diff --git a/travis_test.sh b/travis_test.sh index d3db68d4..14ac781f 100755 --- a/travis_test.sh +++ b/travis_test.sh @@ -8,7 +8,12 @@ die() { set +v; echo "$*" 1>&2 ; sleep 1; exit 1; } # https://github.com/travis-ci/travis-ci/issues/6018 start flake8 -flake8 +# TODO: +# - Get more files to lint cleanly. +# - Reduce the number of errors which are ignored everywhere else. +flake8 --config=.flake8-ignore +flake8 test/tsv_to_mrmatrix_test.py +flake8 scripts/tsv_to_mrmatrix.py end flake8 start download From b4facc100006e55f6115db800e77ee2935327d2e Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 14:53:17 -0400 Subject: [PATCH 09/24] autopep8 --- scripts/tsv_to_mrmatrix.py | 43 ++++++++++++++++++++------------------ travis_test.sh | 4 ++-- 2 files changed, 25 insertions(+), 22 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 2bdbcdb0..dfd8c22d 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -12,6 +12,7 @@ import time import logging + def coarsen(f, tile_size=256): ''' Create data pyramid. @@ -21,9 +22,9 @@ def coarsen(f, tile_size=256): max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2)) max_width = tile_size * 2 ** max_zoom - chunk_size=tile_size * 16 + chunk_size = tile_size * 16 curr_size = grid.shape - dask_dset = da.from_array(grid, chunks=(chunk_size,chunk_size)) + dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size)) r = f['resolutions'] curr_resolution = 1 @@ -36,7 +37,7 @@ def coarsen(f, tile_size=256): print("curr_size:", curr_size) g = r.create_group(str(curr_resolution)) values = g.require_dataset('values', curr_size, dtype='f4', - compression='lzf', fillvalue=np.nan) + compression='lzf', fillvalue=np.nan) dask_dset = dask_dset.rechunk((chunk_size, chunk_size)) dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2}) @@ -63,16 +64,16 @@ def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_squar g = output_hdf5.create_group('resolutions') g1 = g.create_group('1') ds = g1.create_dataset('values', (max_width, max_width), - dtype='f4', compression='lzf', fillvalue=np.nan) + dtype='f4', compression='lzf', fillvalue=np.nan) ds1 = g1.create_dataset('nan_values', (max_width, max_width), - dtype='f4', compression='lzf', fillvalue=0) - # TODO: We don't write to this... Is it necessary? + dtype='f4', compression='lzf', fillvalue=0) + # TODO: We don't write to this... Is it necessary? start_time = time.time() counter = 0 for row in reader: x = np.array([float(p) for p in row[1:]]) - ds[counter,:len(x)] = x + ds[counter, :len(x)] = x counter += 1 if counter == first_n: @@ -82,7 +83,8 @@ def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_squar time_per_entry = time_elapsed / counter time_remaining = time_per_entry * (height - counter) - print("counter:", counter, "sum(x):", sum(x), "time remaining: {:d} seconds".format(int(time_remaining))) + print("counter:", counter, "sum(x):", sum(x), + "time remaining: {:d} seconds".format(int(time_remaining))) coarsen(output_hdf5) output_hdf5.close() @@ -102,6 +104,7 @@ def get_height(input_path, is_labelled=True): else: return i + 1 + def get_width(input_path, is_labelled, delimiter='\t'): ''' Assume the number of elements in the first row is the total width. @@ -123,31 +126,31 @@ def main(): parser.add_argument('input_file', help='TSV file path') parser.add_argument('output_file', help='HDF5 file') parser.add_argument('-d', '--delimiter', type=str, default='\t', metavar='D', - help='Delimiter; defaults to tab') + help='Delimiter; defaults to tab') parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N', - help='Only read the first n columns from the first n rows') + help='Only read the first n columns from the first n rows') parser.add_argument('-s', '--square', action='store_true', - help='Row labels are assumed to match column labels') + help='Row labels are assumed to match column labels') parser.add_argument('-l', '--labelled', action='store_true', - help='TSV Matrix has column and row labels') + help='TSV Matrix has column and row labels') args = parser.parse_args() height = get_height(args.input_file, is_labelled=args.labelled) - width = get_width(args.input_file, is_labelled=args.labelled, delimiter=args.delimiter) + width = get_width(args.input_file, is_labelled=args.labelled, + delimiter=args.delimiter) f_in = open(args.input_file, 'r', newline='') parse(f_in, - h5py.File(args.output_file, 'w'), - height=height, width=width, - delimiter=args.delimiter, - first_n=args.first_n, - is_square=args.square, - is_labelled=args.labelled) + h5py.File(args.output_file, 'w'), + height=height, width=width, + delimiter=args.delimiter, + first_n=args.first_n, + is_square=args.square, + is_labelled=args.labelled) f = h5py.File(args.output_file, 'r') print("sum1:", np.nansum(f['resolutions']['1']['values'][0])) - if __name__ == '__main__': main() diff --git a/travis_test.sh b/travis_test.sh index 14ac781f..e146eb81 100755 --- a/travis_test.sh +++ b/travis_test.sh @@ -12,8 +12,8 @@ start flake8 # - Get more files to lint cleanly. # - Reduce the number of errors which are ignored everywhere else. flake8 --config=.flake8-ignore -flake8 test/tsv_to_mrmatrix_test.py -flake8 scripts/tsv_to_mrmatrix.py +flake8 test/tsv_to_mrmatrix_test.py \ + scripts/tsv_to_mrmatrix.py end flake8 start download From 37f5b60451f8abdf7725107d4a426da9eaf800b1 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 15:00:26 -0400 Subject: [PATCH 10/24] flake8 clean --- scripts/tsv_to_mrmatrix.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index dfd8c22d..1e41c57c 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -5,12 +5,8 @@ import h5py import math import numpy as np -import os -import os.path as op -import sys import argparse import time -import logging def coarsen(f, tile_size=256): @@ -20,7 +16,6 @@ def coarsen(f, tile_size=256): grid = f['resolutions']['1']['values'] top_n = grid.shape[0] max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2)) - max_width = tile_size * 2 ** max_zoom chunk_size = tile_size * 16 curr_size = grid.shape @@ -44,7 +39,8 @@ def coarsen(f, tile_size=256): da.store(dask_dset, values) -def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_square, is_labelled): +def parse(input_handle, output_hdf5, height, width, + delimiter, first_n, is_square, is_labelled): reader = csv.reader(input_handle, delimiter=delimiter) if is_labelled: first_row = next(reader) @@ -65,8 +61,8 @@ def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_squar g1 = g.create_group('1') ds = g1.create_dataset('values', (max_width, max_width), dtype='f4', compression='lzf', fillvalue=np.nan) - ds1 = g1.create_dataset('nan_values', (max_width, max_width), - dtype='f4', compression='lzf', fillvalue=0) + g1.create_dataset('nan_values', (max_width, max_width), + dtype='f4', compression='lzf', fillvalue=0) # TODO: We don't write to this... Is it necessary? start_time = time.time() @@ -119,16 +115,17 @@ def get_width(input_path, is_labelled, delimiter='\t'): def main(): parser = argparse.ArgumentParser(description=''' - Given a tab-delimited file, produces an HDF5 file with mrmatrix ("multi-resolution matrix") - structure: Under the "resolutions" group are datasets, named with successive powers of 2, + Given a tab-delimited file, produces an HDF5 file with mrmatrix + ("multi-resolution matrix") structure: Under the "resolutions" + group are datasets, named with successive powers of 2, which represent successively higher aggregations of the input. ''') parser.add_argument('input_file', help='TSV file path') parser.add_argument('output_file', help='HDF5 file') - parser.add_argument('-d', '--delimiter', type=str, default='\t', metavar='D', - help='Delimiter; defaults to tab') + parser.add_argument('-d', '--delimiter', type=str, default='\t', + metavar='D', help='Delimiter; defaults to tab') parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N', - help='Only read the first n columns from the first n rows') + help='Only read first N columns from first N rows') parser.add_argument('-s', '--square', action='store_true', help='Row labels are assumed to match column labels') parser.add_argument('-l', '--labelled', action='store_true', From f6b8e3cc10a1a4fd8a584aa25835b1c9815bd1cc Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 15:08:53 -0400 Subject: [PATCH 11/24] changelog [skip ci] --- CHANGELOG | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 581af515..4c8131a1 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,7 @@ +in progress + +- Make tsv_to_mrmatrix more flexible. + v0.10.7 - Changed bins_per_dimension in npvector.tileset_info to match the value in From 5171bfb4c92ea10ac438692919f778c34aee0fba Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 16:15:12 -0400 Subject: [PATCH 12/24] start test unlabelled --- test/tsv_to_mrmatrix_test.py | 51 +++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index bde998bc..72569aac 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -111,7 +111,7 @@ def test_math(self): class ParseTest(unittest.TestCase): - def test_parse(self): + def test_labelled_square(self): with TemporaryDirectory() as tmp_dir: csv_path = tmp_dir + '/tmp.csv' with open(csv_path, 'w', newline='') as csv_file: @@ -167,3 +167,52 @@ def test_parse(self): assert_array_equal(res_2[6], [0] * 256) # TODO: We lose nan at higher aggregations: # Maybe regular mean/sum instead of treating missing values as 0? + + def test_unlabelled(self): + with TemporaryDirectory() as tmp_dir: + csv_path = tmp_dir + '/tmp.csv' + with open(csv_path, 'w', newline='') as csv_file: + writer = csv.writer(csv_file, delimiter='\t') + # body: + for y in range(4): + writer.writerow([x + y for x in range(4)]) + + csv_handle = open(csv_path, 'r') + hdf5_path = tmp_dir + 'tmp.hdf5' + hdf5_write_handle = h5py.File(hdf5_path, 'w') + + is_labelled = False + height = get_height(csv_path, is_labelled=is_labelled) + width = get_width(csv_path, is_labelled=is_labelled) + self.assertEqual([4, 4], [height, width]) + parse(csv_handle, hdf5_write_handle, height, width, + is_labelled=is_labelled, + delimiter='\t', first_n=None, is_square=True) + + hdf5 = h5py.File(hdf5_path, 'r') + self.assertEqual(list(hdf5.keys()), ['resolutions']) + self.assertEqual(list(hdf5['resolutions'].keys()), ['1']) + self.assertEqual(list(hdf5['resolutions']['1'].keys()), + ['nan_values', 'values']) + # assert_array_equal( + # hdf5['resolutions']['1']['nan_values'], [[0] * 512] * 512 + # ) + # res_1 = hdf5['resolutions']['1']['values'] + # assert_array_equal(res_1[0], [0] * 512) + # assert_array_equal(res_1[3], [1] * 512) + # assert_array_equal(res_1[6], [1, -1] * 256) + # assert_array_equal(res_1[9], [nan] * 512) + # + # self.assertEqual(list(hdf5['resolutions']['2'].keys()), ['values']) + # res_2 = hdf5['resolutions']['2']['values'] + # assert_array_equal(res_2[0], [0] * 256) + # assert_array_equal(res_2[1], [2] * 256) + # # Stradles the 0 and 1 rows + # assert_array_equal(res_2[2], [4] * 256) + # assert_array_equal(res_2[3], [0] * 256) + # # -1 and +1 cancel out + # assert_array_equal(res_2[4], [0] * 256) + # assert_array_equal(res_2[5], [0] * 256) + # assert_array_equal(res_2[6], [0] * 256) + # # TODO: We lose nan at higher aggregations: + # # Maybe regular mean/sum instead of treating missing values as 0? From dbee7fd243f389dd56b7fe881570150cb0515ec6 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 16:36:11 -0400 Subject: [PATCH 13/24] unlabelled square --- scripts/tsv_to_mrmatrix.py | 2 +- test/tsv_to_mrmatrix_test.py | 46 ++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 1e41c57c..a04ab87d 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -68,7 +68,7 @@ def parse(input_handle, output_hdf5, height, width, start_time = time.time() counter = 0 for row in reader: - x = np.array([float(p) for p in row[1:]]) + x = np.array([float(p) for p in row[1 if is_labelled else None:]]) ds[counter, :len(x)] = x counter += 1 diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index 72569aac..810e9a10 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -111,7 +111,7 @@ def test_math(self): class ParseTest(unittest.TestCase): - def test_labelled_square(self): + def test_wide_labelled_square(self): with TemporaryDirectory() as tmp_dir: csv_path = tmp_dir + '/tmp.csv' with open(csv_path, 'w', newline='') as csv_file: @@ -168,7 +168,7 @@ def test_labelled_square(self): # TODO: We lose nan at higher aggregations: # Maybe regular mean/sum instead of treating missing values as 0? - def test_unlabelled(self): + def _assert_unlabelled_4x4(self, is_square): with TemporaryDirectory() as tmp_dir: csv_path = tmp_dir + '/tmp.csv' with open(csv_path, 'w', newline='') as csv_file: @@ -187,32 +187,28 @@ def test_unlabelled(self): self.assertEqual([4, 4], [height, width]) parse(csv_handle, hdf5_write_handle, height, width, is_labelled=is_labelled, - delimiter='\t', first_n=None, is_square=True) + delimiter='\t', first_n=None, is_square=is_square) hdf5 = h5py.File(hdf5_path, 'r') self.assertEqual(list(hdf5.keys()), ['resolutions']) self.assertEqual(list(hdf5['resolutions'].keys()), ['1']) self.assertEqual(list(hdf5['resolutions']['1'].keys()), ['nan_values', 'values']) - # assert_array_equal( - # hdf5['resolutions']['1']['nan_values'], [[0] * 512] * 512 - # ) - # res_1 = hdf5['resolutions']['1']['values'] - # assert_array_equal(res_1[0], [0] * 512) - # assert_array_equal(res_1[3], [1] * 512) - # assert_array_equal(res_1[6], [1, -1] * 256) - # assert_array_equal(res_1[9], [nan] * 512) - # - # self.assertEqual(list(hdf5['resolutions']['2'].keys()), ['values']) - # res_2 = hdf5['resolutions']['2']['values'] - # assert_array_equal(res_2[0], [0] * 256) - # assert_array_equal(res_2[1], [2] * 256) - # # Stradles the 0 and 1 rows - # assert_array_equal(res_2[2], [4] * 256) - # assert_array_equal(res_2[3], [0] * 256) - # # -1 and +1 cancel out - # assert_array_equal(res_2[4], [0] * 256) - # assert_array_equal(res_2[5], [0] * 256) - # assert_array_equal(res_2[6], [0] * 256) - # # TODO: We lose nan at higher aggregations: - # # Maybe regular mean/sum instead of treating missing values as 0? + assert_array_equal( + hdf5['resolutions']['1']['nan_values'], [[0] * 4] * 4 + ) + assert_array_equal( + hdf5['resolutions']['1']['values'], + [ + [0, 1, 2, 3], + [1, 2, 3, 4], + [2, 3, 4, 5], + [3, 4, 5, 6] + ] + ) + + def test_unlabelled_is_square_true(self): + self._assert_unlabelled_4x4(is_square=True) + + def test_unlabelled_is_square_false(self): + self._assert_unlabelled_4x4(is_square=False) From 5d7afe8b35e0d166b9c0eee34070a3bba7de6fd0 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 17:08:50 -0400 Subject: [PATCH 14/24] Make test more generic --- test/tsv_to_mrmatrix_test.py | 84 ++++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index 810e9a10..2482e8a5 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -168,14 +168,14 @@ def test_wide_labelled_square(self): # TODO: We lose nan at higher aggregations: # Maybe regular mean/sum instead of treating missing values as 0? - def _assert_unlabelled_4x4(self, is_square): + def _assert_unlabelled_roundtrip_lt_256(self, matrix, delimiter, is_square): with TemporaryDirectory() as tmp_dir: csv_path = tmp_dir + '/tmp.csv' with open(csv_path, 'w', newline='') as csv_file: - writer = csv.writer(csv_file, delimiter='\t') + writer = csv.writer(csv_file, delimiter=delimiter) # body: - for y in range(4): - writer.writerow([x + y for x in range(4)]) + for row in matrix: + writer.writerow(row) csv_handle = open(csv_path, 'r') hdf5_path = tmp_dir + 'tmp.hdf5' @@ -184,10 +184,9 @@ def _assert_unlabelled_4x4(self, is_square): is_labelled = False height = get_height(csv_path, is_labelled=is_labelled) width = get_width(csv_path, is_labelled=is_labelled) - self.assertEqual([4, 4], [height, width]) parse(csv_handle, hdf5_write_handle, height, width, - is_labelled=is_labelled, - delimiter='\t', first_n=None, is_square=is_square) + first_n=None, is_labelled=is_labelled, + delimiter=delimiter, is_square=is_square) hdf5 = h5py.File(hdf5_path, 'r') self.assertEqual(list(hdf5.keys()), ['resolutions']) @@ -195,20 +194,69 @@ def _assert_unlabelled_4x4(self, is_square): self.assertEqual(list(hdf5['resolutions']['1'].keys()), ['nan_values', 'values']) assert_array_equal( - hdf5['resolutions']['1']['nan_values'], [[0] * 4] * 4 + hdf5['resolutions']['1']['nan_values'], + [[0] * len(matrix[0])] * len(matrix) ) assert_array_equal( hdf5['resolutions']['1']['values'], - [ - [0, 1, 2, 3], - [1, 2, 3, 4], - [2, 3, 4, 5], - [3, 4, 5, 6] - ] + matrix ) - def test_unlabelled_is_square_true(self): - self._assert_unlabelled_4x4(is_square=True) + def test_unlabelled_csv_is_square_true(self): + self._assert_unlabelled_roundtrip_lt_256( + matrix=[[x + y for x in range(4)] for y in range(4)], + delimiter=',', + is_square=True + ) + + def test_unlabelled_tsv_is_square_false(self): + self._assert_unlabelled_roundtrip_lt_256( + matrix=[[x + y for x in range(4)] for y in range(4)], + delimiter='\t', + is_square=False + ) + + def _assert_unlabelled_roundtrip_1024(self, matrix, first_row=None, first_col=None): + delimiter = '\t' + is_square = False + with TemporaryDirectory() as tmp_dir: + csv_path = tmp_dir + '/tmp.csv' + with open(csv_path, 'w', newline='') as csv_file: + writer = csv.writer(csv_file, delimiter=delimiter) + # body: + for row in matrix: + writer.writerow(row) + + csv_handle = open(csv_path, 'r') + hdf5_path = tmp_dir + 'tmp.hdf5' + hdf5_write_handle = h5py.File(hdf5_path, 'w') + + is_labelled = False + height = get_height(csv_path, is_labelled=is_labelled) + width = get_width(csv_path, is_labelled=is_labelled) + parse(csv_handle, hdf5_write_handle, height, width, + first_n=None, is_labelled=is_labelled, + delimiter=delimiter, is_square=is_square) + + hdf5 = h5py.File(hdf5_path, 'r') + self.assertEqual(list(hdf5.keys()), ['resolutions']) + self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '2', '4']) + self.assertEqual(list(hdf5['resolutions']['1'].keys()), + ['nan_values', 'values']) + self.assertEqual(list(hdf5['resolutions']['4'].keys()), + ['values']) + res_4 = hdf5['resolutions']['4']['values'] + if first_row: + assert_array_equal(res_4[0], first_row) + if first_col: + assert_array_equal([res_4[y][0] for y in range(len(first_col))], first_col) + + def test_unlabelled_tsv_tall(self): + self._assert_unlabelled_roundtrip_1024( + matrix=[[x + y for x in range(4)] for y in range(1024)] + ) - def test_unlabelled_is_square_false(self): - self._assert_unlabelled_4x4(is_square=False) + def test_unlabelled_tsv_wide(self): + self._assert_unlabelled_roundtrip_1024( + matrix=[[x + y for x in range(1024)] for y in range(4)] + ) From e82eb36543c67a05bb75c4129079f5bffb12889c Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 17:13:21 -0400 Subject: [PATCH 15/24] Test aggregation of tall and wide datasets --- test/tsv_to_mrmatrix_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index 2482e8a5..626ca79d 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -253,10 +253,12 @@ def _assert_unlabelled_roundtrip_1024(self, matrix, first_row=None, first_col=No def test_unlabelled_tsv_tall(self): self._assert_unlabelled_roundtrip_1024( - matrix=[[x + y for x in range(4)] for y in range(1024)] + matrix=[[1 for x in range(4)] for y in range(1024)], + first_col=[16]*256 ) def test_unlabelled_tsv_wide(self): self._assert_unlabelled_roundtrip_1024( - matrix=[[x + y for x in range(1024)] for y in range(4)] + matrix=[[1 for x in range(1024)] for y in range(4)], + first_row=[16]*256 ) From 0f940f2aa31f109e919ee3aba6628d28553c83da Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 17:15:10 -0400 Subject: [PATCH 16/24] Test tall and wide aggregation --- test/tsv_to_mrmatrix_test.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index 626ca79d..2c2511e1 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -168,7 +168,8 @@ def test_wide_labelled_square(self): # TODO: We lose nan at higher aggregations: # Maybe regular mean/sum instead of treating missing values as 0? - def _assert_unlabelled_roundtrip_lt_256(self, matrix, delimiter, is_square): + def _assert_unlabelled_roundtrip_lt_256( + self, matrix, delimiter, is_square): with TemporaryDirectory() as tmp_dir: csv_path = tmp_dir + '/tmp.csv' with open(csv_path, 'w', newline='') as csv_file: @@ -216,7 +217,8 @@ def test_unlabelled_tsv_is_square_false(self): is_square=False ) - def _assert_unlabelled_roundtrip_1024(self, matrix, first_row=None, first_col=None): + def _assert_unlabelled_roundtrip_1024( + self, matrix, first_row=None, first_col=None): delimiter = '\t' is_square = False with TemporaryDirectory() as tmp_dir: @@ -249,7 +251,9 @@ def _assert_unlabelled_roundtrip_1024(self, matrix, first_row=None, first_col=No if first_row: assert_array_equal(res_4[0], first_row) if first_col: - assert_array_equal([res_4[y][0] for y in range(len(first_col))], first_col) + assert_array_equal( + [res_4[y][0] for y in range(len(first_col))], + first_col) def test_unlabelled_tsv_tall(self): self._assert_unlabelled_roundtrip_1024( From 64115cc595d7258d7f1cfe2c5c6183a6b1e11502 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 17:40:18 -0400 Subject: [PATCH 17/24] If source data not power of 2... --- test/tsv_to_mrmatrix_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index 2c2511e1..7978bdac 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -257,12 +257,12 @@ def _assert_unlabelled_roundtrip_1024( def test_unlabelled_tsv_tall(self): self._assert_unlabelled_roundtrip_1024( - matrix=[[1 for x in range(4)] for y in range(1024)], - first_col=[16]*256 + matrix=[[1 for x in range(4)] for y in range(1000)], + first_col=[16]*250 + [0]*6 ) def test_unlabelled_tsv_wide(self): self._assert_unlabelled_roundtrip_1024( - matrix=[[1 for x in range(1024)] for y in range(4)], - first_row=[16]*256 + matrix=[[1 for x in range(1000)] for y in range(4)], + first_row=[16]*250 + [0]*6 ) From 84e60ca2254bd86dadec40832135f9c919d5d3dc Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 17:54:19 -0400 Subject: [PATCH 18/24] Handle first n --- scripts/tsv_to_mrmatrix.py | 2 +- test/tsv_to_mrmatrix_test.py | 22 ++++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index a04ab87d..0a76c8b9 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -53,7 +53,7 @@ def parse(input_handle, output_hdf5, height, width, # TODO: Handle non-square labels tile_size = 256 - limit = first_n if first_n else max(height, width) + limit = max(height, width) max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2)) max_width = tile_size * 2 ** max_zoom diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index 7978bdac..69157f66 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -218,7 +218,7 @@ def test_unlabelled_tsv_is_square_false(self): ) def _assert_unlabelled_roundtrip_1024( - self, matrix, first_row=None, first_col=None): + self, matrix, first_row=None, first_col=None, first_n=None): delimiter = '\t' is_square = False with TemporaryDirectory() as tmp_dir: @@ -237,7 +237,7 @@ def _assert_unlabelled_roundtrip_1024( height = get_height(csv_path, is_labelled=is_labelled) width = get_width(csv_path, is_labelled=is_labelled) parse(csv_handle, hdf5_write_handle, height, width, - first_n=None, is_labelled=is_labelled, + first_n=first_n, is_labelled=is_labelled, delimiter=delimiter, is_square=is_square) hdf5 = h5py.File(hdf5_path, 'r') @@ -258,11 +258,25 @@ def _assert_unlabelled_roundtrip_1024( def test_unlabelled_tsv_tall(self): self._assert_unlabelled_roundtrip_1024( matrix=[[1 for x in range(4)] for y in range(1000)], - first_col=[16]*250 + [0]*6 + first_col=[16] * 250 + [0] * 6 ) def test_unlabelled_tsv_wide(self): self._assert_unlabelled_roundtrip_1024( matrix=[[1 for x in range(1000)] for y in range(4)], - first_row=[16]*250 + [0]*6 + first_row=[16] * 250 + [0] * 6 + ) + + def test_unlabelled_tsv_tall_first_n(self): + self._assert_unlabelled_roundtrip_1024( + matrix=[[1 for x in range(4)] for y in range(1000)], + first_col=[8] + [0] * 255, + first_n=2 + ) + + def test_unlabelled_tsv_wide_first_n(self): + self._assert_unlabelled_roundtrip_1024( + matrix=[[1 for x in range(1000)] for y in range(4)], + first_row=[8] * 250 + [0] * 6, + first_n=2 ) From 2532a7b1295a310278b9b4bc30c5044195c78b77 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 18:08:38 -0400 Subject: [PATCH 19/24] click-ify --- scripts/tsv_to_mrmatrix.py | 2 ++ setup.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 0a76c8b9..2ecabb1f 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -3,6 +3,7 @@ import csv import dask.array as da import h5py +import click import math import numpy as np import argparse @@ -113,6 +114,7 @@ def get_width(input_path, is_labelled, delimiter='\t'): return len_row +@click.command() def main(): parser = argparse.ArgumentParser(description=''' Given a tab-delimited file, produces an HDF5 file with mrmatrix diff --git a/setup.py b/setup.py index 4447da73..ab0d1c2d 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ print("packages:", find_packages()) - + setup_requires = [ 'numpy', ] @@ -40,7 +40,8 @@ install_requires=install_requires, entry_points={ 'console_scripts': [ - 'clodius = clodius.cli.aggregate:cli', + 'clodius = clodius.cli.aggregate:cli', + 'tsv_to_mrmatrix = scripts.tsv_to_mrmatrix:main' ] } ) From 3fadfb0f2b0bb0b9ca23f0d8d5b7f6f85dbdcc7c Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 18:40:16 -0400 Subject: [PATCH 20/24] Un-click-ify: Argparse feels good-enough for now --- scripts/tsv_to_mrmatrix.py | 2 -- setup.py | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 2ecabb1f..0a76c8b9 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -3,7 +3,6 @@ import csv import dask.array as da import h5py -import click import math import numpy as np import argparse @@ -114,7 +113,6 @@ def get_width(input_path, is_labelled, delimiter='\t'): return len_row -@click.command() def main(): parser = argparse.ArgumentParser(description=''' Given a tab-delimited file, produces an HDF5 file with mrmatrix diff --git a/setup.py b/setup.py index ab0d1c2d..1e8c9884 100644 --- a/setup.py +++ b/setup.py @@ -38,10 +38,12 @@ packages=['clodius', 'clodius.cli', 'clodius.tiles'], setup_requires=setup_requires, install_requires=install_requires, + scripts=[ + 'scripts/tsv_to_mrmatrix.py' + ], entry_points={ 'console_scripts': [ - 'clodius = clodius.cli.aggregate:cli', - 'tsv_to_mrmatrix = scripts.tsv_to_mrmatrix:main' + 'clodius = clodius.cli.aggregate:cli' ] } ) From e595bd61a8e13b5ae4eeaf10ba8ff491ff7339b3 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Mon, 22 Apr 2019 18:41:12 -0400 Subject: [PATCH 21/24] update changelog [skip ci] --- CHANGELOG | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 4c8131a1..bace25b4 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ in progress -- Make tsv_to_mrmatrix more flexible. +- Make tsv_to_mrmatrix more flexible and add it to the exported scripts. v0.10.7 From 3d1aca970fb2833e639100b91bf0682b8b0a39c2 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 21 May 2019 11:03:59 -0400 Subject: [PATCH 22/24] Fix whitespace; one failing test locally --- test/tsv_to_mrmatrix_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index cc097498..f6b013dc 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -10,7 +10,6 @@ from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width - class CoarsenTest(unittest.TestCase): def test_5_layer_pyramid(self): tile_size = 4 @@ -22,7 +21,8 @@ def test_5_layer_pyramid(self): g = hdf5.create_group('resolutions') g1 = g.create_group('1') ds = g1.create_dataset('values', (max_width, max_width), - dtype='f4', compression='lzf', fillvalue=np.nan) + dtype='f4', compression='lzf', + fillvalue=np.nan) for y in range(max_width): a = np.array([float(x) for x in range(max_width)]) ds[y, :max_width] = a @@ -71,7 +71,8 @@ def test_math(self): g = hdf5.create_group('resolutions') g1 = g.create_group('1') ds = g1.create_dataset('values', (max_width, max_width), - dtype='f4', compression='lzf', fillvalue=np.nan) + dtype='f4', compression='lzf', + fillvalue=np.nan) for y in range(max_width): a = np.array([float(x) for x in range(max_width)]) ds[y, :max_width] = a From beb5a0c044e8397f0dac7ec477606e346abd72cb Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 21 May 2019 11:23:39 -0400 Subject: [PATCH 23/24] Comments and logs --- scripts/tsv_to_mrmatrix.py | 6 +++++- test/tsv_to_mrmatrix_test.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 0a76c8b9..2d0b9146 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -51,6 +51,7 @@ def parse(input_handle, output_hdf5, height, width, data=np.array(labels, dtype=h5py.special_dtype(vlen=str)), compression='lzf') # TODO: Handle non-square labels + # https://github.com/higlass/clodius/issues/68 tile_size = 256 limit = max(height, width) @@ -134,7 +135,10 @@ def main(): height = get_height(args.input_file, is_labelled=args.labelled) width = get_width(args.input_file, is_labelled=args.labelled, - delimiter=args.delimiter) + delimiter=args.delimiter + print('height:', height) + print('width:', width) + f_in = open(args.input_file, 'r', newline='') parse(f_in, diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py index f6b013dc..36878cfd 100644 --- a/test/tsv_to_mrmatrix_test.py +++ b/test/tsv_to_mrmatrix_test.py @@ -164,8 +164,8 @@ def test_wide_labelled_square(self): assert_array_equal(res_2[4], [0] * 256) assert_array_equal(res_2[5], [0] * 256) assert_array_equal(res_2[6], [0] * 256) - # TODO: We lose nan at higher aggregations: - # Maybe regular mean/sum instead of treating missing values as 0? + # TODO: We lose nan at higher aggregations. + # https://github.com/higlass/clodius/issues/62 def _assert_unlabelled_roundtrip_lt_256( self, matrix, delimiter, is_square): From d168503bb7b57ee05282067e7231365b59d341d5 Mon Sep 17 00:00:00 2001 From: Chuck McCallum Date: Tue, 21 May 2019 11:42:48 -0400 Subject: [PATCH 24/24] missing paren --- scripts/tsv_to_mrmatrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py index 2d0b9146..e4977122 100644 --- a/scripts/tsv_to_mrmatrix.py +++ b/scripts/tsv_to_mrmatrix.py @@ -135,7 +135,7 @@ def main(): height = get_height(args.input_file, is_labelled=args.labelled) width = get_width(args.input_file, is_labelled=args.labelled, - delimiter=args.delimiter + delimiter=args.delimiter) print('height:', height) print('width:', width)