From 6f9333e18ddb0b72035920100d2666e76e58e1ee Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 10:11:38 -0400
Subject: [PATCH 01/24] Use csv module instead of parsing by hand; Fix #63

---
 scripts/tsv_to_mrmatrix.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index a0214934..04c88131 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import csv
 import dask.array as da
 import h5py
 import math
@@ -44,10 +45,8 @@ def coarsen(f, tile_size=256):
 
 
 def parse(input_handle, output_hdf5, top_n=None):
-    input_handle
-    first_line = next(input_handle)
-    parts = first_line.strip().split('\t')
-    # TODO: Use the python built-in csv module, instead of parsing by hand?
+    reader = csv.reader(input_handle, delimiter='\t')
+    parts = next(reader)
 
     if top_n is None:
         top_n = len(parts) - 1
@@ -59,8 +58,10 @@ def parse(input_handle, output_hdf5, top_n=None):
     max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
 
-    labels_dset = output_hdf5.create_dataset('labels', data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
-            compression='lzf')
+    labels_dset = output_hdf5.create_dataset(
+        'labels',
+        data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
+        compression='lzf')
 
     g = output_hdf5.create_group('resolutions')
     g1 = g.create_group('1')
@@ -72,9 +73,8 @@ def parse(input_handle, output_hdf5, top_n=None):
 
     start_time = time.time()
     counter = 0
-    for line in input_handle:
-        parts = line.strip().split('\t')[1:top_n+1]
-        x = np.array([float(p) for p in parts])
+    for row in reader:
+        x = np.array([float(p) for p in row[1:]])
         ds[counter,:len(x)] = x
 
         counter += 1
@@ -114,7 +114,7 @@ def main():
     if args.input_file == '-':
         f_in = sys.stdin
     else:
-        f_in = open(args.input_file, 'r')
+        f_in = open(args.input_file, 'r', newline='')
 
     parse(f_in, h5py.File(args.output_file, 'w'), top_n)
 

From 15b786ebe456eebad6ffefd871edc03b3668d260 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 10:21:30 -0400
Subject: [PATCH 02/24] better description

---
 scripts/tsv_to_mrmatrix.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 04c88131..092566ef 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -92,17 +92,14 @@ def parse(input_handle, output_hdf5, top_n=None):
 
 
 def main():
-    parser = argparse.ArgumentParser(description="""
-
-    python tsv-dense-to-sparse
-""")
+    parser = argparse.ArgumentParser(description='''
+        Given a tab-delimited file, produces an HDF5 file with mrmatrix ("multi-resolution matrix")
+        structure: Under the "resolutions" group are datasets, named with successive powers of 2,
+        which represent successively higher aggregations of the input.
+    ''')
 
     parser.add_argument('input_file')
     parser.add_argument('output_file')
-    #parser.add_argument('-o', '--options', default='yo',
-    #					 help="Some option", type='str')
-    #parser.add_argument('-u', '--useless', action='store_true',
-    #					 help='Another useless option')
     parser.add_argument('-n', '--first-n', type=int, default=None,
             help="Only use the first n entries in the matrix")
 

From afde554d76affa34111a905c904b2b0abee94485 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 10:40:00 -0400
Subject: [PATCH 03/24] Aspirational command-line options

---
 scripts/tsv_to_mrmatrix.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 092566ef..5b0c593d 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -97,11 +97,16 @@ def main():
         structure: Under the "resolutions" group are datasets, named with successive powers of 2,
         which represent successively higher aggregations of the input.
     ''')
-
-    parser.add_argument('input_file')
-    parser.add_argument('output_file')
-    parser.add_argument('-n', '--first-n', type=int, default=None,
-            help="Only use the first n entries in the matrix")
+    parser.add_argument('input_file', help='TSV file path, or "-" for STDIN')
+    parser.add_argument('output_file', help='HDF5 file')
+    parser.add_argument('-d', '--delimiter', type=str, default='\t', metavar='D',
+            help='Delimiter; defaults to tab')
+    parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N',
+            help='Only read the first n columns from the first n rows')
+    parser.add_argument('-s', '--square', action='store_true',
+            help='Row labels are assumed to match column labels')
+    parser.add_argument('-u', '--unlabelled', action='store_true',
+            help='TSV Matrix contains only numbers: no column or row labels')
 
     args = parser.parse_args()
 

From d37958d9c7669443a897172317129c376ce0b859 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 11:35:17 -0400
Subject: [PATCH 04/24] Checkpoint: Need to update test

---
 scripts/tsv_to_mrmatrix.py | 69 ++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 28 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 5b0c593d..732c7790 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -44,25 +44,23 @@ def coarsen(f, tile_size=256):
         da.store(dask_dset, values)
 
 
-def parse(input_handle, output_hdf5, top_n=None):
-    reader = csv.reader(input_handle, delimiter='\t')
-    parts = next(reader)
+def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_labelled):
+    reader = csv.reader(input_handle, delimiter=delimiter)
+    if is_labelled:
+        first_row = next(reader)
+        labels = first_row[1:(first_n + 1) if first_n else None]
+        if is_square:
+            output_hdf5.create_dataset(
+                'labels',
+                data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
+                compression='lzf')
+    # TODO: Handle non-square labels
 
-    if top_n is None:
-        top_n = len(parts) - 1
-        # TODO: So if it's taller than it is wide, it will be truncated to a square,
-        # unless an explicit top_n is provided? That doesn't seem right.
-
-    labels = parts[1:top_n+1]
     tile_size = 256
-    max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
+    limit = min(first_n, height) if first_n else height
+    max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
 
-    labels_dset = output_hdf5.create_dataset(
-        'labels',
-        data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
-        compression='lzf')
-
     g = output_hdf5.create_group('resolutions')
     g1 = g.create_group('1')
     ds = g1.create_dataset('values', (max_width, max_width),
@@ -78,7 +76,7 @@ def parse(input_handle, output_hdf5, top_n=None):
         ds[counter,:len(x)] = x
 
         counter += 1
-        if counter == top_n:
+        if counter == first_n:
             break
 
         time_elapsed = time.time() - start_time
@@ -91,13 +89,28 @@ def parse(input_handle, output_hdf5, top_n=None):
     output_hdf5.close()
 
 
+def get_height(input_path, is_labelled=False):
+    '''
+    We need to scan the file once just to see how many lines it contains.
+    If it is tall and narrow, the first tile will need to be larger than just
+    looking at the width of the first row would suggest.
+    '''
+    with open(fname) as f:
+        for i, l in enumerate(f):
+            pass
+    if is_labelled:
+        return i
+    else:
+        return i + 1
+
+
 def main():
     parser = argparse.ArgumentParser(description='''
         Given a tab-delimited file, produces an HDF5 file with mrmatrix ("multi-resolution matrix")
         structure: Under the "resolutions" group are datasets, named with successive powers of 2,
         which represent successively higher aggregations of the input.
     ''')
-    parser.add_argument('input_file', help='TSV file path, or "-" for STDIN')
+    parser.add_argument('input_file', help='TSV file path')
     parser.add_argument('output_file', help='HDF5 file')
     parser.add_argument('-d', '--delimiter', type=str, default='\t', metavar='D',
             help='Delimiter; defaults to tab')
@@ -105,20 +118,20 @@ def main():
             help='Only read the first n columns from the first n rows')
     parser.add_argument('-s', '--square', action='store_true',
             help='Row labels are assumed to match column labels')
-    parser.add_argument('-u', '--unlabelled', action='store_true',
-            help='TSV Matrix contains only numbers: no column or row labels')
-
+    parser.add_argument('-l', '--labelled', action='store_true',
+            help='TSV Matrix has column and row labels')
     args = parser.parse_args()
 
-    count = 0
-    top_n = args.first_n
-
-    if args.input_file == '-':
-        f_in = sys.stdin
-    else:
-        f_in = open(args.input_file, 'r', newline='')
+    height = get_height(args.input_file, is_labelled=args.labelled)
+    f_in = open(args.input_file, 'r', newline='')
 
-    parse(f_in, h5py.File(args.output_file, 'w'), top_n)
+    parse(f_in,
+        h5py.File(args.output_file, 'w'),
+        height,
+        delimiter=args.delimiter,
+        first_n=args.first_n,
+        is_square=args.square,
+        is_labelled=args.labelled)
 
     f = h5py.File(args.output_file, 'r')
     print("sum1:", np.nansum(f['resolutions']['1']['values'][0]))

From 549ce5ed1bf3ce3e413a6b95d81048a3fb5e0376 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 12:18:59 -0400
Subject: [PATCH 05/24] checkpoint: still buggy [skip ci]

---
 scripts/tsv_to_mrmatrix.py   | 6 ++++--
 test/tsv_to_mrmatrix_test.py | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 732c7790..eaad0b75 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -10,7 +10,7 @@
 import sys
 import argparse
 import time
-
+import logging
 
 def coarsen(f, tile_size=256):
     '''
@@ -60,6 +60,8 @@ def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_l
     limit = min(first_n, height) if first_n else height
     max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
+    logging.info('max_zoom: %s' % max_zoom)
+    logging.info('max_width: %s' % max_width)
 
     g = output_hdf5.create_group('resolutions')
     g1 = g.create_group('1')
@@ -95,7 +97,7 @@ def get_height(input_path, is_labelled=False):
     If it is tall and narrow, the first tile will need to be larger than just
     looking at the width of the first row would suggest.
     '''
-    with open(fname) as f:
+    with open(input_path) as f:
         for i, l in enumerate(f):
             pass
     if is_labelled:
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 0830737e..7b3310b7 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -7,7 +7,7 @@
 from numpy.testing import assert_array_equal
 import h5py
 
-from scripts.tsv_to_mrmatrix import coarsen, parse
+from scripts.tsv_to_mrmatrix import coarsen, parse, get_height
 
 class CoarsenTest(unittest.TestCase):
     def test_5_layer_pyramid(self):
@@ -121,7 +121,9 @@ def test_parse(self):
             hdf5_path = tmp_dir + 'tmp.hdf5'
             hdf5_write_handle = h5py.File(hdf5_path, 'w')
 
-            parse(csv_handle, hdf5_write_handle)
+            height = get_height(csv_path)
+            parse(csv_handle, hdf5_write_handle, height,
+                delimiter='\t', first_n=None, is_square=True, is_labelled=True)
 
             hdf5 = h5py.File(hdf5_path, 'r')
             self.assertEqual(list(hdf5.keys()), ['labels', 'resolutions'])

From 4f41d1d603998590bf78a794378714ef5f323a85 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 13:12:04 -0400
Subject: [PATCH 06/24] Runs, but we get one less resolution

---
 scripts/tsv_to_mrmatrix.py   | 23 +++++++++++++++--------
 test/tsv_to_mrmatrix_test.py |  5 +++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index eaad0b75..a1540471 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -44,7 +44,7 @@ def coarsen(f, tile_size=256):
         da.store(dask_dset, values)
 
 
-def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_labelled):
+def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_square, is_labelled):
     reader = csv.reader(input_handle, delimiter=delimiter)
     if is_labelled:
         first_row = next(reader)
@@ -54,14 +54,12 @@ def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_l
                 'labels',
                 data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
                 compression='lzf')
-    # TODO: Handle non-square labels
+        # TODO: Handle non-square labels
 
     tile_size = 256
-    limit = min(first_n, height) if first_n else height
+    limit = first_n if first_n else max(height, width)
     max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
-    logging.info('max_zoom: %s' % max_zoom)
-    logging.info('max_width: %s' % max_width)
 
     g = output_hdf5.create_group('resolutions')
     g1 = g.create_group('1')
@@ -84,14 +82,14 @@ def parse(input_handle, output_hdf5, height, delimiter, first_n, is_square, is_l
         time_elapsed = time.time() - start_time
         time_per_entry = time_elapsed / counter
 
-        time_remaining = time_per_entry * (top_n - counter)
+        time_remaining = time_per_entry * (height - counter)
         print("counter:", counter, "sum(x):", sum(x), "time remaining: {:d} seconds".format(int(time_remaining)))
 
     coarsen(output_hdf5)
     output_hdf5.close()
 
 
-def get_height(input_path, is_labelled=False):
+def get_height(input_path, is_labelled=True):
     '''
     We need to scan the file once just to see how many lines it contains.
     If it is tall and narrow, the first tile will need to be larger than just
@@ -105,6 +103,14 @@ def get_height(input_path, is_labelled=False):
     else:
         return i + 1
 
+def get_width(input_path, delimiter='\t'):
+    '''
+    Assume the number of elements in the first row is the total width.
+    '''
+    with open(input_path, 'r', newline='') as input_handle:
+        reader = csv.reader(input_handle, delimiter=delimiter)
+        return len(next(reader))
+
 
 def main():
     parser = argparse.ArgumentParser(description='''
@@ -125,11 +131,12 @@ def main():
     args = parser.parse_args()
 
     height = get_height(args.input_file, is_labelled=args.labelled)
+    width = get_width(args.input_file, delimiter=args.delimiter)
     f_in = open(args.input_file, 'r', newline='')
 
     parse(f_in,
         h5py.File(args.output_file, 'w'),
-        height,
+        height, width,
         delimiter=args.delimiter,
         first_n=args.first_n,
         is_square=args.square,
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 7b3310b7..b567be86 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -7,7 +7,7 @@
 from numpy.testing import assert_array_equal
 import h5py
 
-from scripts.tsv_to_mrmatrix import coarsen, parse, get_height
+from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width
 
 class CoarsenTest(unittest.TestCase):
     def test_5_layer_pyramid(self):
@@ -122,7 +122,8 @@ def test_parse(self):
             hdf5_write_handle = h5py.File(hdf5_path, 'w')
 
             height = get_height(csv_path)
-            parse(csv_handle, hdf5_write_handle, height,
+            width = get_width(csv_path)
+            parse(csv_handle, hdf5_write_handle, height, width,
                 delimiter='\t', first_n=None, is_square=True, is_labelled=True)
 
             hdf5 = h5py.File(hdf5_path, 'r')

From 1d148c857aad91e439ba53e73777619279d6f238 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 14:06:41 -0400
Subject: [PATCH 07/24] Fix test

---
 scripts/tsv_to_mrmatrix.py   | 12 +++++++-----
 test/tsv_to_mrmatrix_test.py |  2 +-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index a1540471..2bdbcdb0 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -18,7 +18,6 @@ def coarsen(f, tile_size=256):
     '''
     grid = f['resolutions']['1']['values']
     top_n = grid.shape[0]
-
     max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
 
@@ -103,13 +102,16 @@ def get_height(input_path, is_labelled=True):
     else:
         return i + 1
 
-def get_width(input_path, delimiter='\t'):
+def get_width(input_path, is_labelled, delimiter='\t'):
     '''
     Assume the number of elements in the first row is the total width.
     '''
     with open(input_path, 'r', newline='') as input_handle:
         reader = csv.reader(input_handle, delimiter=delimiter)
-        return len(next(reader))
+        len_row = len(next(reader))
+        if is_labelled:
+            return len_row - 1
+        return len_row
 
 
 def main():
@@ -131,12 +133,12 @@ def main():
     args = parser.parse_args()
 
     height = get_height(args.input_file, is_labelled=args.labelled)
-    width = get_width(args.input_file, delimiter=args.delimiter)
+    width = get_width(args.input_file, is_labelled=args.labelled, delimiter=args.delimiter)
     f_in = open(args.input_file, 'r', newline='')
 
     parse(f_in,
         h5py.File(args.output_file, 'w'),
-        height, width,
+        height=height, width=width,
         delimiter=args.delimiter,
         first_n=args.first_n,
         is_square=args.square,
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index b567be86..c7ca7f19 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -122,7 +122,7 @@ def test_parse(self):
             hdf5_write_handle = h5py.File(hdf5_path, 'w')
 
             height = get_height(csv_path)
-            width = get_width(csv_path)
+            width = get_width(csv_path, is_labelled=True)
             parse(csv_handle, hdf5_write_handle, height, width,
                 delimiter='\t', first_n=None, is_square=True, is_labelled=True)
 

From 6209b4c99cee8cb93e35878857e758aabcf8b98e Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 14:47:11 -0400
Subject: [PATCH 08/24] tsv_to_mrmatrix_test.py whitespace

---
 .flake8 => .flake8-ignore    |  0
 test/tsv_to_mrmatrix_test.py | 38 ++++++++++++++++++++++++------------
 travis_test.sh               |  7 ++++++-
 3 files changed, 32 insertions(+), 13 deletions(-)
 rename .flake8 => .flake8-ignore (100%)

diff --git a/.flake8 b/.flake8-ignore
similarity index 100%
rename from .flake8
rename to .flake8-ignore
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index c7ca7f19..bde998bc 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -9,6 +9,7 @@
 
 from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width
 
+
 class CoarsenTest(unittest.TestCase):
     def test_5_layer_pyramid(self):
         tile_size = 4
@@ -20,7 +21,8 @@ def test_5_layer_pyramid(self):
             g = hdf5.create_group('resolutions')
             g1 = g.create_group('1')
             ds = g1.create_dataset('values', (max_width, max_width),
-                    dtype='f4', compression='lzf', fillvalue=np.nan)
+                                   dtype='f4', compression='lzf',
+                                   fillvalue=np.nan)
             for y in range(max_width):
                 a = np.array([float(x) for x in range(max_width)])
                 ds[y, :max_width] = a
@@ -28,8 +30,10 @@ def test_5_layer_pyramid(self):
             # before coarsen()
             self.assertEqual(list(hdf5.keys()), ['resolutions'])
             self.assertEqual(list(hdf5['resolutions'].keys()), ['1'])
-            self.assertEqual(list(hdf5['resolutions']['1'].keys()), ['values'])
-            self.assertEqual(list(hdf5['resolutions']['1']['values'].shape), [64, 64])
+            self.assertEqual(list(hdf5['resolutions']['1'].keys()),
+                             ['values'])
+            self.assertEqual(list(hdf5['resolutions']['1']['values'].shape),
+                             [64, 64])
             self.assertEqual(
                 hdf5['resolutions']['1']['values'][:].tolist()[0],
                 [float(x) for x in range(64)]
@@ -39,8 +43,10 @@ def test_5_layer_pyramid(self):
 
             # after coarsen()
             self.assertEqual(list(hdf5.keys()), ['resolutions'])
-            self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '16', '2', '4', '8'])
-            self.assertEqual(list(hdf5['resolutions']['16'].keys()), ['values'])
+            self.assertEqual(list(hdf5['resolutions'].keys()),
+                             ['1', '16', '2', '4', '8'])
+            self.assertEqual(list(hdf5['resolutions']['16'].keys()),
+                             ['values'])
             shapes = {
                 '1': 64,
                 '2': 32,
@@ -49,7 +55,8 @@ def test_5_layer_pyramid(self):
                 '16': 4
             }
             for (k, v) in shapes.items():
-                self.assertEqual(hdf5['resolutions'][k]['values'].shape, (v, v))
+                self.assertEqual(hdf5['resolutions'][k]['values'].shape,
+                                 (v, v))
             row = [1920,  6016, 10112, 14208]
             self.assertEqual(
                 hdf5['resolutions']['16']['values'][:].tolist(),
@@ -66,7 +73,8 @@ def test_math(self):
             g = hdf5.create_group('resolutions')
             g1 = g.create_group('1')
             ds = g1.create_dataset('values', (max_width, max_width),
-                    dtype='f4', compression='lzf', fillvalue=np.nan)
+                                   dtype='f4', compression='lzf',
+                                   fillvalue=np.nan)
             for y in range(max_width):
                 a = np.array([float(x) for x in range(max_width)])
                 ds[y, :max_width] = a
@@ -83,7 +91,8 @@ def test_math(self):
                 '4': 2
             }
             for (k, v) in shapes.items():
-                self.assertEqual(hdf5['resolutions'][k]['values'].shape, (v, v))
+                self.assertEqual(hdf5['resolutions'][k]['values'].shape,
+                                 (v, v))
 
             row8 = list(range(8))
             assert_array_equal(
@@ -100,6 +109,7 @@ def test_math(self):
                 hdf5['resolutions']['4']['values'],
                 [row2 for _ in range(2)])
 
+
 class ParseTest(unittest.TestCase):
     def test_parse(self):
         with TemporaryDirectory() as tmp_dir:
@@ -124,7 +134,8 @@ def test_parse(self):
             height = get_height(csv_path)
             width = get_width(csv_path, is_labelled=True)
             parse(csv_handle, hdf5_write_handle, height, width,
-                delimiter='\t', first_n=None, is_square=True, is_labelled=True)
+                  delimiter='\t', first_n=None, is_square=True,
+                  is_labelled=True)
 
             hdf5 = h5py.File(hdf5_path, 'r')
             self.assertEqual(list(hdf5.keys()), ['labels', 'resolutions'])
@@ -132,7 +143,8 @@ def test_parse(self):
 
             self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '2'])
 
-            self.assertEqual(list(hdf5['resolutions']['1'].keys()), ['nan_values', 'values'])
+            self.assertEqual(list(hdf5['resolutions']['1'].keys()),
+                             ['nan_values', 'values'])
             assert_array_equal(
                 hdf5['resolutions']['1']['nan_values'], [[0] * 512] * 512
             )
@@ -145,9 +157,11 @@ def test_parse(self):
             self.assertEqual(list(hdf5['resolutions']['2'].keys()), ['values'])
             res_2 = hdf5['resolutions']['2']['values']
             assert_array_equal(res_2[0], [0] * 256)
-            assert_array_equal(res_2[1], [2] * 256) # Stradles the 0 and 1 rows
+            assert_array_equal(res_2[1], [2] * 256)
+            # Stradles the 0 and 1 rows
             assert_array_equal(res_2[2], [4] * 256)
-            assert_array_equal(res_2[3], [0] * 256) # -1 and +1 cancel out
+            assert_array_equal(res_2[3], [0] * 256)
+            # -1 and +1 cancel out
             assert_array_equal(res_2[4], [0] * 256)
             assert_array_equal(res_2[5], [0] * 256)
             assert_array_equal(res_2[6], [0] * 256)
diff --git a/travis_test.sh b/travis_test.sh
index d3db68d4..14ac781f 100755
--- a/travis_test.sh
+++ b/travis_test.sh
@@ -8,7 +8,12 @@ die() { set +v; echo "$*" 1>&2 ; sleep 1; exit 1; }
 # https://github.com/travis-ci/travis-ci/issues/6018
 
 start flake8
-flake8
+# TODO:
+# - Get more files to lint cleanly.
+# - Reduce the number of errors which are ignored everywhere else.
+flake8 --config=.flake8-ignore
+flake8 test/tsv_to_mrmatrix_test.py
+flake8 scripts/tsv_to_mrmatrix.py
 end flake8
 
 start download

From b4facc100006e55f6115db800e77ee2935327d2e Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 14:53:17 -0400
Subject: [PATCH 09/24] autopep8

---
 scripts/tsv_to_mrmatrix.py | 43 ++++++++++++++++++++------------------
 travis_test.sh             |  4 ++--
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 2bdbcdb0..dfd8c22d 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -12,6 +12,7 @@
 import time
 import logging
 
+
 def coarsen(f, tile_size=256):
     '''
     Create data pyramid.
@@ -21,9 +22,9 @@ def coarsen(f, tile_size=256):
     max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
 
-    chunk_size=tile_size * 16
+    chunk_size = tile_size * 16
     curr_size = grid.shape
-    dask_dset = da.from_array(grid, chunks=(chunk_size,chunk_size))
+    dask_dset = da.from_array(grid, chunks=(chunk_size, chunk_size))
 
     r = f['resolutions']
     curr_resolution = 1
@@ -36,7 +37,7 @@ def coarsen(f, tile_size=256):
         print("curr_size:", curr_size)
         g = r.create_group(str(curr_resolution))
         values = g.require_dataset('values', curr_size, dtype='f4',
-            compression='lzf', fillvalue=np.nan)
+                                   compression='lzf', fillvalue=np.nan)
 
         dask_dset = dask_dset.rechunk((chunk_size, chunk_size))
         dask_dset = da.coarsen(np.nansum, dask_dset, {0: 2, 1: 2})
@@ -63,16 +64,16 @@ def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_squar
     g = output_hdf5.create_group('resolutions')
     g1 = g.create_group('1')
     ds = g1.create_dataset('values', (max_width, max_width),
-            dtype='f4', compression='lzf', fillvalue=np.nan)
+                           dtype='f4', compression='lzf', fillvalue=np.nan)
     ds1 = g1.create_dataset('nan_values', (max_width, max_width),
-            dtype='f4', compression='lzf', fillvalue=0)
-            # TODO: We don't write to this... Is it necessary?
+                            dtype='f4', compression='lzf', fillvalue=0)
+    # TODO: We don't write to this... Is it necessary?
 
     start_time = time.time()
     counter = 0
     for row in reader:
         x = np.array([float(p) for p in row[1:]])
-        ds[counter,:len(x)] = x
+        ds[counter, :len(x)] = x
 
         counter += 1
         if counter == first_n:
@@ -82,7 +83,8 @@ def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_squar
         time_per_entry = time_elapsed / counter
 
         time_remaining = time_per_entry * (height - counter)
-        print("counter:", counter, "sum(x):", sum(x), "time remaining: {:d} seconds".format(int(time_remaining)))
+        print("counter:", counter, "sum(x):", sum(x),
+              "time remaining: {:d} seconds".format(int(time_remaining)))
 
     coarsen(output_hdf5)
     output_hdf5.close()
@@ -102,6 +104,7 @@ def get_height(input_path, is_labelled=True):
     else:
         return i + 1
 
+
 def get_width(input_path, is_labelled, delimiter='\t'):
     '''
     Assume the number of elements in the first row is the total width.
@@ -123,31 +126,31 @@ def main():
     parser.add_argument('input_file', help='TSV file path')
     parser.add_argument('output_file', help='HDF5 file')
     parser.add_argument('-d', '--delimiter', type=str, default='\t', metavar='D',
-            help='Delimiter; defaults to tab')
+                        help='Delimiter; defaults to tab')
     parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N',
-            help='Only read the first n columns from the first n rows')
+                        help='Only read the first n columns from the first n rows')
     parser.add_argument('-s', '--square', action='store_true',
-            help='Row labels are assumed to match column labels')
+                        help='Row labels are assumed to match column labels')
     parser.add_argument('-l', '--labelled', action='store_true',
-            help='TSV Matrix has column and row labels')
+                        help='TSV Matrix has column and row labels')
     args = parser.parse_args()
 
     height = get_height(args.input_file, is_labelled=args.labelled)
-    width = get_width(args.input_file, is_labelled=args.labelled, delimiter=args.delimiter)
+    width = get_width(args.input_file, is_labelled=args.labelled,
+                      delimiter=args.delimiter)
     f_in = open(args.input_file, 'r', newline='')
 
     parse(f_in,
-        h5py.File(args.output_file, 'w'),
-        height=height, width=width,
-        delimiter=args.delimiter,
-        first_n=args.first_n,
-        is_square=args.square,
-        is_labelled=args.labelled)
+          h5py.File(args.output_file, 'w'),
+          height=height, width=width,
+          delimiter=args.delimiter,
+          first_n=args.first_n,
+          is_square=args.square,
+          is_labelled=args.labelled)
 
     f = h5py.File(args.output_file, 'r')
     print("sum1:", np.nansum(f['resolutions']['1']['values'][0]))
 
 
-
 if __name__ == '__main__':
     main()
diff --git a/travis_test.sh b/travis_test.sh
index 14ac781f..e146eb81 100755
--- a/travis_test.sh
+++ b/travis_test.sh
@@ -12,8 +12,8 @@ start flake8
 # - Get more files to lint cleanly.
 # - Reduce the number of errors which are ignored everywhere else.
 flake8 --config=.flake8-ignore
-flake8 test/tsv_to_mrmatrix_test.py
-flake8 scripts/tsv_to_mrmatrix.py
+flake8 test/tsv_to_mrmatrix_test.py \
+       scripts/tsv_to_mrmatrix.py
 end flake8
 
 start download

From 37f5b60451f8abdf7725107d4a426da9eaf800b1 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 15:00:26 -0400
Subject: [PATCH 10/24] flake8 clean

---
 scripts/tsv_to_mrmatrix.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index dfd8c22d..1e41c57c 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -5,12 +5,8 @@
 import h5py
 import math
 import numpy as np
-import os
-import os.path as op
-import sys
 import argparse
 import time
-import logging
 
 
 def coarsen(f, tile_size=256):
@@ -20,7 +16,6 @@ def coarsen(f, tile_size=256):
     grid = f['resolutions']['1']['values']
     top_n = grid.shape[0]
     max_zoom = math.ceil(math.log(top_n / tile_size) / math.log(2))
-    max_width = tile_size * 2 ** max_zoom
 
     chunk_size = tile_size * 16
     curr_size = grid.shape
@@ -44,7 +39,8 @@ def coarsen(f, tile_size=256):
         da.store(dask_dset, values)
 
 
-def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_square, is_labelled):
+def parse(input_handle, output_hdf5, height, width,
+          delimiter, first_n, is_square, is_labelled):
     reader = csv.reader(input_handle, delimiter=delimiter)
     if is_labelled:
         first_row = next(reader)
@@ -65,8 +61,8 @@ def parse(input_handle, output_hdf5, height, width, delimiter, first_n, is_squar
     g1 = g.create_group('1')
     ds = g1.create_dataset('values', (max_width, max_width),
                            dtype='f4', compression='lzf', fillvalue=np.nan)
-    ds1 = g1.create_dataset('nan_values', (max_width, max_width),
-                            dtype='f4', compression='lzf', fillvalue=0)
+    g1.create_dataset('nan_values', (max_width, max_width),
+                      dtype='f4', compression='lzf', fillvalue=0)
     # TODO: We don't write to this... Is it necessary?
 
     start_time = time.time()
@@ -119,16 +115,17 @@ def get_width(input_path, is_labelled, delimiter='\t'):
 
 def main():
     parser = argparse.ArgumentParser(description='''
-        Given a tab-delimited file, produces an HDF5 file with mrmatrix ("multi-resolution matrix")
-        structure: Under the "resolutions" group are datasets, named with successive powers of 2,
+        Given a tab-delimited file, produces an HDF5 file with mrmatrix
+        ("multi-resolution matrix") structure: Under the "resolutions"
+        group are datasets, named with successive powers of 2,
         which represent successively higher aggregations of the input.
     ''')
     parser.add_argument('input_file', help='TSV file path')
     parser.add_argument('output_file', help='HDF5 file')
-    parser.add_argument('-d', '--delimiter', type=str, default='\t', metavar='D',
-                        help='Delimiter; defaults to tab')
+    parser.add_argument('-d', '--delimiter', type=str, default='\t',
+                        metavar='D', help='Delimiter; defaults to tab')
     parser.add_argument('-n', '--first-n', type=int, default=None, metavar='N',
-                        help='Only read the first n columns from the first n rows')
+                        help='Only read first N columns from first N rows')
     parser.add_argument('-s', '--square', action='store_true',
                         help='Row labels are assumed to match column labels')
     parser.add_argument('-l', '--labelled', action='store_true',

From f6b8e3cc10a1a4fd8a584aa25835b1c9815bd1cc Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 15:08:53 -0400
Subject: [PATCH 11/24] changelog [skip ci]

---
 CHANGELOG | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG b/CHANGELOG
index 581af515..4c8131a1 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,7 @@
+in progress
+
+- Make tsv_to_mrmatrix more flexible.
+
 v0.10.7
 
 - Changed bins_per_dimension in npvector.tileset_info to match the value in

From 5171bfb4c92ea10ac438692919f778c34aee0fba Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 16:15:12 -0400
Subject: [PATCH 12/24] start test unlabelled

---
 test/tsv_to_mrmatrix_test.py | 51 +++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index bde998bc..72569aac 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -111,7 +111,7 @@ def test_math(self):
 
 
 class ParseTest(unittest.TestCase):
-    def test_parse(self):
+    def test_labelled_square(self):
         with TemporaryDirectory() as tmp_dir:
             csv_path = tmp_dir + '/tmp.csv'
             with open(csv_path, 'w', newline='') as csv_file:
@@ -167,3 +167,52 @@ def test_parse(self):
             assert_array_equal(res_2[6], [0] * 256)
             # TODO: We lose nan at higher aggregations:
             # Maybe regular mean/sum instead of treating missing values as 0?
+
+    def test_unlabelled(self):
+        with TemporaryDirectory() as tmp_dir:
+            csv_path = tmp_dir + '/tmp.csv'
+            with open(csv_path, 'w', newline='') as csv_file:
+                writer = csv.writer(csv_file, delimiter='\t')
+                # body:
+                for y in range(4):
+                    writer.writerow([x + y for x in range(4)])
+
+            csv_handle = open(csv_path, 'r')
+            hdf5_path = tmp_dir + 'tmp.hdf5'
+            hdf5_write_handle = h5py.File(hdf5_path, 'w')
+
+            is_labelled = False
+            height = get_height(csv_path, is_labelled=is_labelled)
+            width = get_width(csv_path, is_labelled=is_labelled)
+            self.assertEqual([4, 4], [height, width])
+            parse(csv_handle, hdf5_write_handle, height, width,
+                  is_labelled=is_labelled,
+                  delimiter='\t', first_n=None, is_square=True)
+
+            hdf5 = h5py.File(hdf5_path, 'r')
+            self.assertEqual(list(hdf5.keys()), ['resolutions'])
+            self.assertEqual(list(hdf5['resolutions'].keys()), ['1'])
+            self.assertEqual(list(hdf5['resolutions']['1'].keys()),
+                             ['nan_values', 'values'])
+            # assert_array_equal(
+            #     hdf5['resolutions']['1']['nan_values'], [[0] * 512] * 512
+            # )
+            # res_1 = hdf5['resolutions']['1']['values']
+            # assert_array_equal(res_1[0], [0] * 512)
+            # assert_array_equal(res_1[3], [1] * 512)
+            # assert_array_equal(res_1[6], [1, -1] * 256)
+            # assert_array_equal(res_1[9], [nan] * 512)
+            #
+            # self.assertEqual(list(hdf5['resolutions']['2'].keys()), ['values'])
+            # res_2 = hdf5['resolutions']['2']['values']
+            # assert_array_equal(res_2[0], [0] * 256)
+            # assert_array_equal(res_2[1], [2] * 256)
+            # # Stradles the 0 and 1 rows
+            # assert_array_equal(res_2[2], [4] * 256)
+            # assert_array_equal(res_2[3], [0] * 256)
+            # # -1 and +1 cancel out
+            # assert_array_equal(res_2[4], [0] * 256)
+            # assert_array_equal(res_2[5], [0] * 256)
+            # assert_array_equal(res_2[6], [0] * 256)
+            # # TODO: We lose nan at higher aggregations:
+            # # Maybe regular mean/sum instead of treating missing values as 0?

From dbee7fd243f389dd56b7fe881570150cb0515ec6 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 16:36:11 -0400
Subject: [PATCH 13/24] unlabelled square

---
 scripts/tsv_to_mrmatrix.py   |  2 +-
 test/tsv_to_mrmatrix_test.py | 46 ++++++++++++++++--------------------
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 1e41c57c..a04ab87d 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -68,7 +68,7 @@ def parse(input_handle, output_hdf5, height, width,
     start_time = time.time()
     counter = 0
     for row in reader:
-        x = np.array([float(p) for p in row[1:]])
+        x = np.array([float(p) for p in row[1 if is_labelled else None:]])
         ds[counter, :len(x)] = x
 
         counter += 1
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 72569aac..810e9a10 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -111,7 +111,7 @@ def test_math(self):
 
 
 class ParseTest(unittest.TestCase):
-    def test_labelled_square(self):
+    def test_wide_labelled_square(self):
         with TemporaryDirectory() as tmp_dir:
             csv_path = tmp_dir + '/tmp.csv'
             with open(csv_path, 'w', newline='') as csv_file:
@@ -168,7 +168,7 @@ def test_labelled_square(self):
             # TODO: We lose nan at higher aggregations:
             # Maybe regular mean/sum instead of treating missing values as 0?
 
-    def test_unlabelled(self):
+    def _assert_unlabelled_4x4(self, is_square):
         with TemporaryDirectory() as tmp_dir:
             csv_path = tmp_dir + '/tmp.csv'
             with open(csv_path, 'w', newline='') as csv_file:
@@ -187,32 +187,28 @@ def test_unlabelled(self):
             self.assertEqual([4, 4], [height, width])
             parse(csv_handle, hdf5_write_handle, height, width,
                   is_labelled=is_labelled,
-                  delimiter='\t', first_n=None, is_square=True)
+                  delimiter='\t', first_n=None, is_square=is_square)
 
             hdf5 = h5py.File(hdf5_path, 'r')
             self.assertEqual(list(hdf5.keys()), ['resolutions'])
             self.assertEqual(list(hdf5['resolutions'].keys()), ['1'])
             self.assertEqual(list(hdf5['resolutions']['1'].keys()),
                              ['nan_values', 'values'])
-            # assert_array_equal(
-            #     hdf5['resolutions']['1']['nan_values'], [[0] * 512] * 512
-            # )
-            # res_1 = hdf5['resolutions']['1']['values']
-            # assert_array_equal(res_1[0], [0] * 512)
-            # assert_array_equal(res_1[3], [1] * 512)
-            # assert_array_equal(res_1[6], [1, -1] * 256)
-            # assert_array_equal(res_1[9], [nan] * 512)
-            #
-            # self.assertEqual(list(hdf5['resolutions']['2'].keys()), ['values'])
-            # res_2 = hdf5['resolutions']['2']['values']
-            # assert_array_equal(res_2[0], [0] * 256)
-            # assert_array_equal(res_2[1], [2] * 256)
-            # # Stradles the 0 and 1 rows
-            # assert_array_equal(res_2[2], [4] * 256)
-            # assert_array_equal(res_2[3], [0] * 256)
-            # # -1 and +1 cancel out
-            # assert_array_equal(res_2[4], [0] * 256)
-            # assert_array_equal(res_2[5], [0] * 256)
-            # assert_array_equal(res_2[6], [0] * 256)
-            # # TODO: We lose nan at higher aggregations:
-            # # Maybe regular mean/sum instead of treating missing values as 0?
+            assert_array_equal(
+                hdf5['resolutions']['1']['nan_values'], [[0] * 4] * 4
+            )
+            assert_array_equal(
+                hdf5['resolutions']['1']['values'],
+                [
+                    [0, 1, 2, 3],
+                    [1, 2, 3, 4],
+                    [2, 3, 4, 5],
+                    [3, 4, 5, 6]
+                ]
+            )
+
+    def test_unlabelled_is_square_true(self):
+        self._assert_unlabelled_4x4(is_square=True)
+
+    def test_unlabelled_is_square_false(self):
+        self._assert_unlabelled_4x4(is_square=False)

From 5d7afe8b35e0d166b9c0eee34070a3bba7de6fd0 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 17:08:50 -0400
Subject: [PATCH 14/24] Make test more generic

---
 test/tsv_to_mrmatrix_test.py | 84 ++++++++++++++++++++++++++++--------
 1 file changed, 66 insertions(+), 18 deletions(-)

diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 810e9a10..2482e8a5 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -168,14 +168,14 @@ def test_wide_labelled_square(self):
             # TODO: We lose nan at higher aggregations:
             # Maybe regular mean/sum instead of treating missing values as 0?
 
-    def _assert_unlabelled_4x4(self, is_square):
+    def _assert_unlabelled_roundtrip_lt_256(self, matrix, delimiter, is_square):
         with TemporaryDirectory() as tmp_dir:
             csv_path = tmp_dir + '/tmp.csv'
             with open(csv_path, 'w', newline='') as csv_file:
-                writer = csv.writer(csv_file, delimiter='\t')
+                writer = csv.writer(csv_file, delimiter=delimiter)
                 # body:
-                for y in range(4):
-                    writer.writerow([x + y for x in range(4)])
+                for row in matrix:
+                    writer.writerow(row)
 
             csv_handle = open(csv_path, 'r')
             hdf5_path = tmp_dir + 'tmp.hdf5'
@@ -184,10 +184,9 @@ def _assert_unlabelled_4x4(self, is_square):
             is_labelled = False
             height = get_height(csv_path, is_labelled=is_labelled)
             width = get_width(csv_path, is_labelled=is_labelled)
-            self.assertEqual([4, 4], [height, width])
             parse(csv_handle, hdf5_write_handle, height, width,
-                  is_labelled=is_labelled,
-                  delimiter='\t', first_n=None, is_square=is_square)
+                  first_n=None, is_labelled=is_labelled,
+                  delimiter=delimiter, is_square=is_square)
 
             hdf5 = h5py.File(hdf5_path, 'r')
             self.assertEqual(list(hdf5.keys()), ['resolutions'])
@@ -195,20 +194,69 @@ def _assert_unlabelled_4x4(self, is_square):
             self.assertEqual(list(hdf5['resolutions']['1'].keys()),
                              ['nan_values', 'values'])
             assert_array_equal(
-                hdf5['resolutions']['1']['nan_values'], [[0] * 4] * 4
+                hdf5['resolutions']['1']['nan_values'],
+                [[0] * len(matrix[0])] * len(matrix)
             )
             assert_array_equal(
                 hdf5['resolutions']['1']['values'],
-                [
-                    [0, 1, 2, 3],
-                    [1, 2, 3, 4],
-                    [2, 3, 4, 5],
-                    [3, 4, 5, 6]
-                ]
+                matrix
             )
 
-    def test_unlabelled_is_square_true(self):
-        self._assert_unlabelled_4x4(is_square=True)
+    def test_unlabelled_csv_is_square_true(self):
+        self._assert_unlabelled_roundtrip_lt_256(
+            matrix=[[x + y for x in range(4)] for y in range(4)],
+            delimiter=',',
+            is_square=True
+        )
+
+    def test_unlabelled_tsv_is_square_false(self):
+        self._assert_unlabelled_roundtrip_lt_256(
+            matrix=[[x + y for x in range(4)] for y in range(4)],
+            delimiter='\t',
+            is_square=False
+        )
+
+    def _assert_unlabelled_roundtrip_1024(self, matrix, first_row=None, first_col=None):
+        delimiter = '\t'
+        is_square = False
+        with TemporaryDirectory() as tmp_dir:
+            csv_path = tmp_dir + '/tmp.csv'
+            with open(csv_path, 'w', newline='') as csv_file:
+                writer = csv.writer(csv_file, delimiter=delimiter)
+                # Body rows only; unlabelled input has no header row.
+                for row in matrix:
+                    writer.writerow(row)
+
+            csv_handle = open(csv_path, 'r')
+            hdf5_path = tmp_dir + '/tmp.hdf5'  # inside tmp_dir: auto-removed
+            hdf5_write_handle = h5py.File(hdf5_path, 'w')
+
+            is_labelled = False
+            height = get_height(csv_path, is_labelled=is_labelled)
+            width = get_width(csv_path, is_labelled=is_labelled)
+            parse(csv_handle, hdf5_write_handle, height, width,
+                  first_n=None, is_labelled=is_labelled,
+                  delimiter=delimiter, is_square=is_square)
+
+            hdf5 = h5py.File(hdf5_path, 'r')
+            self.assertEqual(list(hdf5.keys()), ['resolutions'])
+            self.assertEqual(list(hdf5['resolutions'].keys()), ['1', '2', '4'])
+            self.assertEqual(list(hdf5['resolutions']['1'].keys()),
+                             ['nan_values', 'values'])
+            self.assertEqual(list(hdf5['resolutions']['4'].keys()),
+                             ['values'])
+            res_4 = hdf5['resolutions']['4']['values']
+            if first_row:
+                assert_array_equal(res_4[0], first_row)
+            if first_col:
+                assert_array_equal([res_4[y][0] for y in range(len(first_col))], first_col)
+
+    def test_unlabelled_tsv_tall(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[x + y for x in range(4)] for y in range(1024)]
+        )
 
-    def test_unlabelled_is_square_false(self):
-        self._assert_unlabelled_4x4(is_square=False)
+    def test_unlabelled_tsv_wide(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[x + y for x in range(1024)] for y in range(4)]
+        )

From e82eb36543c67a05bb75c4129079f5bffb12889c Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 17:13:21 -0400
Subject: [PATCH 15/24] Test aggregation of tall and wide datasets

---
 test/tsv_to_mrmatrix_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 2482e8a5..626ca79d 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -253,10 +253,12 @@ def _assert_unlabelled_roundtrip_1024(self, matrix, first_row=None, first_col=No
 
     def test_unlabelled_tsv_tall(self):
         self._assert_unlabelled_roundtrip_1024(
-            matrix=[[x + y for x in range(4)] for y in range(1024)]
+            matrix=[[1 for x in range(4)] for y in range(1024)],
+            first_col=[16]*256
         )
 
     def test_unlabelled_tsv_wide(self):
         self._assert_unlabelled_roundtrip_1024(
-            matrix=[[x + y for x in range(1024)] for y in range(4)]
+            matrix=[[1 for x in range(1024)] for y in range(4)],
+            first_row=[16]*256
         )

From 0f940f2aa31f109e919ee3aba6628d28553c83da Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 17:15:10 -0400
Subject: [PATCH 16/24] Test tall and wide aggregation

---
 test/tsv_to_mrmatrix_test.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 626ca79d..2c2511e1 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -168,7 +168,8 @@ def test_wide_labelled_square(self):
             # TODO: We lose nan at higher aggregations:
             # Maybe regular mean/sum instead of treating missing values as 0?
 
-    def _assert_unlabelled_roundtrip_lt_256(self, matrix, delimiter, is_square):
+    def _assert_unlabelled_roundtrip_lt_256(
+            self, matrix, delimiter, is_square):
         with TemporaryDirectory() as tmp_dir:
             csv_path = tmp_dir + '/tmp.csv'
             with open(csv_path, 'w', newline='') as csv_file:
@@ -216,7 +217,8 @@ def test_unlabelled_tsv_is_square_false(self):
             is_square=False
         )
 
-    def _assert_unlabelled_roundtrip_1024(self, matrix, first_row=None, first_col=None):
+    def _assert_unlabelled_roundtrip_1024(
+            self, matrix, first_row=None, first_col=None):
         delimiter = '\t'
         is_square = False
         with TemporaryDirectory() as tmp_dir:
@@ -249,7 +251,9 @@ def _assert_unlabelled_roundtrip_1024(self, matrix, first_row=None, first_col=No
             if first_row:
                 assert_array_equal(res_4[0], first_row)
             if first_col:
-                assert_array_equal([res_4[y][0] for y in range(len(first_col))], first_col)
+                assert_array_equal(
+                    [res_4[y][0] for y in range(len(first_col))],
+                    first_col)
 
     def test_unlabelled_tsv_tall(self):
         self._assert_unlabelled_roundtrip_1024(

From 64115cc595d7258d7f1cfe2c5c6183a6b1e11502 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 17:40:18 -0400
Subject: [PATCH 17/24] If source data not power of 2...

---
 test/tsv_to_mrmatrix_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 2c2511e1..7978bdac 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -257,12 +257,12 @@ def _assert_unlabelled_roundtrip_1024(
 
     def test_unlabelled_tsv_tall(self):
         self._assert_unlabelled_roundtrip_1024(
-            matrix=[[1 for x in range(4)] for y in range(1024)],
-            first_col=[16]*256
+            matrix=[[1 for x in range(4)] for y in range(1000)],
+            first_col=[16]*250 + [0]*6
         )
 
     def test_unlabelled_tsv_wide(self):
         self._assert_unlabelled_roundtrip_1024(
-            matrix=[[1 for x in range(1024)] for y in range(4)],
-            first_row=[16]*256
+            matrix=[[1 for x in range(1000)] for y in range(4)],
+            first_row=[16]*250 + [0]*6
         )

From 84e60ca2254bd86dadec40832135f9c919d5d3dc Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 17:54:19 -0400
Subject: [PATCH 18/24] Handle first n

---
 scripts/tsv_to_mrmatrix.py   |  2 +-
 test/tsv_to_mrmatrix_test.py | 22 ++++++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index a04ab87d..0a76c8b9 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -53,7 +53,7 @@ def parse(input_handle, output_hdf5, height, width,
         # TODO: Handle non-square labels
 
     tile_size = 256
-    limit = first_n if first_n else max(height, width)
+    limit = max(height, width)
     max_zoom = math.ceil(math.log(limit / tile_size) / math.log(2))
     max_width = tile_size * 2 ** max_zoom
 
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index 7978bdac..69157f66 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -218,7 +218,7 @@ def test_unlabelled_tsv_is_square_false(self):
         )
 
     def _assert_unlabelled_roundtrip_1024(
-            self, matrix, first_row=None, first_col=None):
+            self, matrix, first_row=None, first_col=None, first_n=None):
         delimiter = '\t'
         is_square = False
         with TemporaryDirectory() as tmp_dir:
@@ -237,7 +237,7 @@ def _assert_unlabelled_roundtrip_1024(
             height = get_height(csv_path, is_labelled=is_labelled)
             width = get_width(csv_path, is_labelled=is_labelled)
             parse(csv_handle, hdf5_write_handle, height, width,
-                  first_n=None, is_labelled=is_labelled,
+                  first_n=first_n, is_labelled=is_labelled,
                   delimiter=delimiter, is_square=is_square)
 
             hdf5 = h5py.File(hdf5_path, 'r')
@@ -258,11 +258,25 @@ def _assert_unlabelled_roundtrip_1024(
     def test_unlabelled_tsv_tall(self):
         self._assert_unlabelled_roundtrip_1024(
             matrix=[[1 for x in range(4)] for y in range(1000)],
-            first_col=[16]*250 + [0]*6
+            first_col=[16] * 250 + [0] * 6
         )
 
     def test_unlabelled_tsv_wide(self):
         self._assert_unlabelled_roundtrip_1024(
             matrix=[[1 for x in range(1000)] for y in range(4)],
-            first_row=[16]*250 + [0]*6
+            first_row=[16] * 250 + [0] * 6
+        )
+
+    def test_unlabelled_tsv_tall_first_n(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(4)] for y in range(1000)],
+            first_col=[8] + [0] * 255,
+            first_n=2
+        )
+
+    def test_unlabelled_tsv_wide_first_n(self):
+        self._assert_unlabelled_roundtrip_1024(
+            matrix=[[1 for x in range(1000)] for y in range(4)],
+            first_row=[8] * 250 + [0] * 6,
+            first_n=2
         )

From 2532a7b1295a310278b9b4bc30c5044195c78b77 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 18:08:38 -0400
Subject: [PATCH 19/24] click-ify

---
 scripts/tsv_to_mrmatrix.py | 2 ++
 setup.py                   | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 0a76c8b9..2ecabb1f 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -3,6 +3,7 @@
 import csv
 import dask.array as da
 import h5py
+import click
 import math
 import numpy as np
 import argparse
@@ -113,6 +114,7 @@ def get_width(input_path, is_labelled, delimiter='\t'):
         return len_row
 
 
+@click.command()
 def main():
     parser = argparse.ArgumentParser(description='''
         Given a tab-delimited file, produces an HDF5 file with mrmatrix
diff --git a/setup.py b/setup.py
index 4447da73..ab0d1c2d 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 
 print("packages:", find_packages())
- 
+
 setup_requires = [
         'numpy',
         ]
@@ -40,7 +40,8 @@
     install_requires=install_requires,
     entry_points={
         'console_scripts': [
-            'clodius = clodius.cli.aggregate:cli',
+                'clodius = clodius.cli.aggregate:cli',
+                'tsv_to_mrmatrix = scripts.tsv_to_mrmatrix:main'
             ]
         }
 )

From 3fadfb0f2b0bb0b9ca23f0d8d5b7f6f85dbdcc7c Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 18:40:16 -0400
Subject: [PATCH 20/24] Un-click-ify: Argparse feels good-enough for now

---
 scripts/tsv_to_mrmatrix.py | 2 --
 setup.py                   | 6 ++++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 2ecabb1f..0a76c8b9 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -3,7 +3,6 @@
 import csv
 import dask.array as da
 import h5py
-import click
 import math
 import numpy as np
 import argparse
@@ -114,7 +113,6 @@ def get_width(input_path, is_labelled, delimiter='\t'):
         return len_row
 
 
-@click.command()
 def main():
     parser = argparse.ArgumentParser(description='''
         Given a tab-delimited file, produces an HDF5 file with mrmatrix
diff --git a/setup.py b/setup.py
index ab0d1c2d..1e8c9884 100644
--- a/setup.py
+++ b/setup.py
@@ -38,10 +38,12 @@
     packages=['clodius', 'clodius.cli', 'clodius.tiles'],
     setup_requires=setup_requires,
     install_requires=install_requires,
+    scripts=[
+        'scripts/tsv_to_mrmatrix.py'
+    ],
     entry_points={
         'console_scripts': [
-                'clodius = clodius.cli.aggregate:cli',
-                'tsv_to_mrmatrix = scripts.tsv_to_mrmatrix:main'
+                'clodius = clodius.cli.aggregate:cli'
             ]
         }
 )

From e595bd61a8e13b5ae4eeaf10ba8ff491ff7339b3 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Mon, 22 Apr 2019 18:41:12 -0400
Subject: [PATCH 21/24] update changelog [skip ci]

---
 CHANGELOG | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG b/CHANGELOG
index 4c8131a1..bace25b4 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,6 +1,6 @@
 in progress
 
-- Make tsv_to_mrmatrix more flexible.
+- Make tsv_to_mrmatrix more flexible and add it to the exported scripts.
 
 v0.10.7
 

From 3d1aca970fb2833e639100b91bf0682b8b0a39c2 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Tue, 21 May 2019 11:03:59 -0400
Subject: [PATCH 22/24] Fix whitespace; one failing test locally

---
 test/tsv_to_mrmatrix_test.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index cc097498..f6b013dc 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -10,7 +10,6 @@
 from scripts.tsv_to_mrmatrix import coarsen, parse, get_height, get_width
 
 
-
 class CoarsenTest(unittest.TestCase):
     def test_5_layer_pyramid(self):
         tile_size = 4
@@ -22,7 +21,8 @@ def test_5_layer_pyramid(self):
             g = hdf5.create_group('resolutions')
             g1 = g.create_group('1')
             ds = g1.create_dataset('values', (max_width, max_width),
-                                   dtype='f4', compression='lzf', fillvalue=np.nan)
+                                   dtype='f4', compression='lzf',
+                                   fillvalue=np.nan)
             for y in range(max_width):
                 a = np.array([float(x) for x in range(max_width)])
                 ds[y, :max_width] = a
@@ -71,7 +71,8 @@ def test_math(self):
             g = hdf5.create_group('resolutions')
             g1 = g.create_group('1')
             ds = g1.create_dataset('values', (max_width, max_width),
-                                   dtype='f4', compression='lzf', fillvalue=np.nan)
+                                   dtype='f4', compression='lzf',
+                                   fillvalue=np.nan)
             for y in range(max_width):
                 a = np.array([float(x) for x in range(max_width)])
                 ds[y, :max_width] = a

From beb5a0c044e8397f0dac7ec477606e346abd72cb Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Tue, 21 May 2019 11:23:39 -0400
Subject: [PATCH 23/24] Comments and logs

---
 scripts/tsv_to_mrmatrix.py   | 6 +++++-
 test/tsv_to_mrmatrix_test.py | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 0a76c8b9..2d0b9146 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -51,6 +51,7 @@ def parse(input_handle, output_hdf5, height, width,
                 data=np.array(labels, dtype=h5py.special_dtype(vlen=str)),
                 compression='lzf')
         # TODO: Handle non-square labels
+        # https://github.com/higlass/clodius/issues/68
 
     tile_size = 256
     limit = max(height, width)
@@ -134,7 +135,10 @@ def main():
 
     height = get_height(args.input_file, is_labelled=args.labelled)
     width = get_width(args.input_file, is_labelled=args.labelled,
-                      delimiter=args.delimiter)
+                      delimiter=args.delimiter
+    print('height:', height)
+    print('width:', width)
+
     f_in = open(args.input_file, 'r', newline='')
 
     parse(f_in,
diff --git a/test/tsv_to_mrmatrix_test.py b/test/tsv_to_mrmatrix_test.py
index f6b013dc..36878cfd 100644
--- a/test/tsv_to_mrmatrix_test.py
+++ b/test/tsv_to_mrmatrix_test.py
@@ -164,8 +164,8 @@ def test_wide_labelled_square(self):
             assert_array_equal(res_2[4], [0] * 256)
             assert_array_equal(res_2[5], [0] * 256)
             assert_array_equal(res_2[6], [0] * 256)
-            # TODO: We lose nan at higher aggregations:
-            # Maybe regular mean/sum instead of treating missing values as 0?
+            # TODO: We lose nan at higher aggregations.
+            # https://github.com/higlass/clodius/issues/62
 
     def _assert_unlabelled_roundtrip_lt_256(
             self, matrix, delimiter, is_square):

From d168503bb7b57ee05282067e7231365b59d341d5 Mon Sep 17 00:00:00 2001
From: Chuck McCallum <chuck_mccallum@hms.harvard.edu>
Date: Tue, 21 May 2019 11:42:48 -0400
Subject: [PATCH 24/24] missing paren

---
 scripts/tsv_to_mrmatrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/tsv_to_mrmatrix.py b/scripts/tsv_to_mrmatrix.py
index 2d0b9146..e4977122 100644
--- a/scripts/tsv_to_mrmatrix.py
+++ b/scripts/tsv_to_mrmatrix.py
@@ -135,7 +135,7 @@ def main():
 
     height = get_height(args.input_file, is_labelled=args.labelled)
     width = get_width(args.input_file, is_labelled=args.labelled,
-                      delimiter=args.delimiter
+                      delimiter=args.delimiter)
     print('height:', height)
     print('width:', width)