From c02d4cc3052467893c327f728c0da8e4c5d930e4 Mon Sep 17 00:00:00 2001
From: Fritz Lekschas <932103+flekschas@users.noreply.github.com>
Date: Wed, 9 Dec 2020 21:43:19 -0500
Subject: [PATCH 1/5] Add a new tile-based indexing strategy for beddb files

Genome-wide, this can lead to 20x faster queries, but it increases the file size by a factor of 2.5.
---
 clodius/cli/aggregate.py | 351 ++++++++++++++++++++++++++++++++++-----
 clodius/tiles/beddb.py   |  18 +-
 2 files changed, 327 insertions(+), 42 deletions(-)

diff --git a/clodius/cli/aggregate.py b/clodius/cli/aggregate.py
index e086babd..33004353 100644
--- a/clodius/cli/aggregate.py
+++ b/clodius/cli/aggregate.py
@@ -465,9 +465,16 @@ def _bedfile(
     delimiter,
     chromsizes_filename,
     offset,
+    sqlite_cache_size=500,  # 500 MB
+    sqlite_batch_size=100000,
+    index_strategy='range-index',
+    verbose=False,
 ):
     BEDDB_VERSION = 3
 
+    if verbose:
+        print(f'BEDDB VERSION: {BEDDB_VERSION}')
+
     if output_file is None:
         output_file = filepath + ".beddb"
     else:
@@ -550,7 +557,9 @@ def line_to_np_array(line):
 
     dset = []
 
-    print("delimiter:", delimiter)
+    if verbose:
+        print("delimiter:", delimiter)
+
     if has_header:
         line = bed_file.readline()
         header = line.strip().split(delimiter)
@@ -610,7 +619,9 @@ def line_to_np_array(line):
     import sqlite3
 
     sqlite3.register_adapter(np.int64, lambda val: int(val))
-    print("output_file:", output_file, "header:", header)
+    if verbose:
+        print("output_file:", output_file, "header:", header)
+
     conn = sqlite3.connect(output_file)
 
     # store some meta data
@@ -629,19 +640,21 @@ def line_to_np_array(line):
     )
 
     # max_width = tile_size * 2 ** max_zoom
-    uid_to_entry = {}
+    uid_to_interval = {}
 
     intervals = []
 
     # store each bed file entry as an interval
     for d in dset:
         uid = d["uid"]
-        uid_to_entry[uid] = d
+        uid_to_interval[uid] = d
         intervals += [(d["startPos"], d["endPos"], uid)]
 
-    tile_width = tile_size
-
     c = conn.cursor()
+    c.execute("PRAGMA synchronous = OFF;")
+    c.execute("PRAGMA journal_mode = OFF;")
+    # A negative cache_size is in KiB; a positive one counts pages.
+    c.execute(f"PRAGMA cache_size = {-int(sqlite_cache_size * 1000)};")
     c.execute(
         """
         CREATE TABLE intervals
@@ -659,6 +672,74 @@ def line_to_np_array(line):
         """
     )
 
+    sorted_intervals = sorted(
+        intervals, key=lambda x: -uid_to_interval[x[-1]]["importance"]
+    )
+
+    if verbose:
+        print("max_per_tile:", max_per_tile)
+
+    tile_counts = col.defaultdict(int)
+
+    if index_strategy == 'tile-index':
+        _bedfile_tile_index(
+            conn,
+            c,
+            sorted_intervals,
+            uid_to_interval,
+            max_zoom,
+            tile_size,
+            tile_counts,
+            max_per_tile,
+            sqlite_cache_size,
+            sqlite_batch_size,
+            verbose,
+        )
+    else:
+        _bedfile_range_index(
+            conn,
+            c,
+            sorted_intervals,
+            uid_to_interval,
+            max_zoom,
+            tile_size,
+            tile_counts,
+            max_per_tile,
+            sqlite_cache_size,
+            sqlite_batch_size,
+            verbose,
+        )
+
+    conn.commit()
+
+    c.execute("ANALYZE;")
+
+    conn.commit()
+
+    c.close()
+
+    return True
+
+
+def _bedfile_range_index(
+    conn,
+    c,
+    sorted_intervals,
+    uid_to_interval,
+    max_zoom,
+    tile_size,
+    tile_counts,
+    max_per_tile,
+    sqlite_cache_size=500,  # 500 MB
+    sqlite_batch_size=100000,
+    verbose=False,
+):
+    """Traditional beddb format
+    """
+
+    if verbose:
+        print('Indexing strategy: range-based (default)')
+
     c.execute(
         """
         CREATE VIRTUAL TABLE position_index USING rtree(
@@ -668,25 +749,36 @@ def line_to_np_array(line):
         """
     )
 
-    curr_zoom = 0
     counter = 0
 
-    max_viewable_zoom = max_zoom
+    if verbose:
+        print("max_per_tile:", max_per_tile)
 
-    if max_zoom is not None and max_zoom < max_zoom:
-        max_viewable_zoom = max_zoom
+    tile_counts = col.defaultdict(int)
 
-    sorted_intervals = sorted(
-        intervals, key=lambda x: -uid_to_entry[x[-1]]["importance"]
-    )
-    # print('si:', sorted_intervals[:10])
-    print("max_per_tile:", max_per_tile)
+    interval_inserts = []
+    position_index_inserts = []
 
-    tile_counts = col.defaultdict(int)
+    def batch_insert(conn, c, interval_inserts, position_index_inserts):
+        if verbose > 0:
+            print(f"Insert batch ({counter})")
+
+        with transaction(conn):
+            c.executemany(
+                "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?)", interval_inserts
+            )
+            c.executemany(
+                "INSERT INTO position_index VALUES (?,?,?,?,?)", position_index_inserts
+            )
+
+        interval_inserts.clear()
+        position_index_inserts.clear()
 
     for interval in sorted_intervals:
+        curr_zoom = 0
+
         # go through each interval from most important to least
-        while curr_zoom <= max_viewable_zoom:
+        while curr_zoom <= max_zoom:
             # try to place it in the highest zoom level and go down from there
             tile_width = tile_size * 2 ** (max_zoom - curr_zoom)
 
@@ -735,14 +827,9 @@ def line_to_np_array(line):
 
             if space_available:
                 # there's available space
-                value = uid_to_entry[interval[-1]]
+                value = uid_to_interval[interval[-1]]
 
-                # one extra question mark for the primary key
-                exec_statement = "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?)"
-
-                c.execute(
-                    exec_statement,
-                    # primary key, zoomLevel, startPos, endPos, chrOffset, line
+                interval_inserts.append(
                     (
                         counter,
                         curr_zoom,
@@ -756,14 +843,14 @@ def line_to_np_array(line):
                     ),
                 )
 
-                if counter % 1000 == 0:
-                    print("counter:", counter, value["endPos"] - value["startPos"])
-
-                exec_statement = "INSERT INTO position_index VALUES (?,?,?,?,?)"
-                c.execute(
-                    exec_statement,
-                    # add counter as a primary key
-                    (counter, curr_zoom, curr_zoom, value["startPos"], value["endPos"]),
+                position_index_inserts.append(
+                    (
+                        counter,
+                        curr_zoom,
+                        curr_zoom,
+                        value["startPos"],
+                        value["endPos"]
+                    ),
                 )
 
                 counter += 1
@@ -771,9 +858,149 @@ def line_to_np_array(line):
 
             curr_zoom += 1
 
-        curr_zoom = 0
+        if len(interval_inserts) >= sqlite_batch_size:
+            batch_insert(conn, c, interval_inserts, position_index_inserts)
+
+    batch_insert(conn, c, interval_inserts, position_index_inserts)
+
+
+def _bedfile_tile_index(
+    conn,
+    c,
+    sorted_intervals,
+    uid_to_interval,
+    max_zoom,
+    tile_size,
+    tile_counts,
+    max_per_tile,
+    sqlite_cache_size=500,  # 500 MB
+    sqlite_batch_size=100000,
+    verbose=False,
+):
+    if verbose:
+        print('Indexing strategy: tile-based')
+
+    row = c.execute("SELECT * from tileset_info").fetchone()
+    version = row[next(zip(*c.description)).index("version")]
+    c.execute(
+        f"""
+        UPDATE tileset_info
+        SET version = '{version}t'
+        WHERE version = '{version}'
+        """
+    )
     conn.commit()
-    return True
+
+    c.execute(
+        """
+        CREATE TABLE tiles
+        (
+            id int,
+            intervalId int,
+            PRIMARY KEY (id, intervalId)
+        )
+        """
+    )
+
+    # I.e., tiles_cumsum[3] is the number of tiles with zoomlevels lower than 3
+    tiles_cumsum = np.cumsum([0] + [2**x for x in range(max_zoom + 1)])
+
+    interval_inserts = []
+    tile_inserts = []
+
+    def batch_insert(conn, c, interval_inserts, tile_inserts, counter):
+        if verbose > 0:
+            print(f"Insert batch ({counter})")
+
+        with transaction(conn):
+            c.executemany(
+                "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?)", interval_inserts
+            )
+            c.executemany(
+                "INSERT INTO tiles VALUES (?,?)", tile_inserts
+            )
+
+        interval_inserts.clear()
+        tile_inserts.clear()
+
+    for interval_idx, interval in enumerate(sorted_intervals):
+        curr_zoom = 0
+        inserted = False
+        # go through each interval from most important to least
+        while curr_zoom <= max_zoom:
+            # try to place it in the highest zoom level and go down from there
+            tile_width = tile_size * 2 ** (max_zoom - curr_zoom)
+
+            curr_pos = interval[0]
+            space_available = True
+
+            # if we have not yet inserted the interval somewhere
+            if not inserted:
+                # check if there's space at this zoom level
+                while curr_pos < interval[1]:
+                    curr_tile = math.floor(curr_pos / tile_width)
+                    tile_id = f"{curr_zoom}.{curr_tile}"
+
+                    # if any of the overlapping tiles is already filled up,
+                    # lets go to the next zoom level by setting `space_available`
+                    # to false
+                    if tile_counts[tile_id] >= max_per_tile:
+                        space_available = False
+                        break
+
+                    curr_pos += tile_width
+
+            # if there is, then increment the tile counters
+            if not inserted and space_available:
+                curr_pos = interval[0]
+                while curr_pos < interval[1]:
+                    curr_tile = math.floor(curr_pos / tile_width)
+                    tile_id = "{}.{}".format(curr_zoom, curr_tile)
+
+                    tile_counts[tile_id] += 1
+
+                    curr_pos += tile_width
+
+            if inserted or space_available:
+                # there's available space
+                if not inserted:
+                    value = uid_to_interval[interval[-1]]
+                    interval_inserts.append(
+                        (
+                            interval_idx,
+                            curr_zoom,
+                            value["importance"],
+                            value["startPos"],
+                            value["endPos"],
+                            value["chrOffset"],
+                            value["uid"],
+                            value["name"],
+                            value["fields"],
+                        ),
+                    )
+                    if verbose and interval_idx == 0:
+                        print(f'Interval 0 first appears at zoom level {curr_zoom}')
+
+                curr_pos = interval[0]
+                while curr_pos < interval[1]:
+                    curr_tile_x = math.floor(curr_pos / tile_width)
+                    tile_idx = tiles_cumsum[curr_zoom] + curr_tile_x
+
+                    tile_inserts.append((tile_idx, interval_idx))
+
+                    if verbose and interval_idx == 0:
+                        print(f'Interval 0 is added to {curr_zoom}.{curr_tile_x}')
+
+                    curr_pos += tile_width
+
+                inserted = True
+
+            curr_zoom += 1
+
+        if len(interval_inserts) >= sqlite_batch_size:
+            batch_insert(conn, c, interval_inserts, tile_inserts, interval_idx)
+
+    batch_insert(conn, c, interval_inserts, tile_inserts, len(sorted_intervals))
 
 
 ###############################################################################
@@ -1431,59 +1658,98 @@ def bedgraph(
 @click.option(
     "--output-file",
     "-o",
-    default=None,
     help="The default output file name to use. If this isn't "
     "specified, clodius will replace the current extension "
     "with .multires.bed",
+    default=None,
+    show_default=True,
 )
 @click.option(
     "--assembly",
     "-a",
     help="The genome assembly that this file was created against",
     default="hg19",
+    show_default=True,
 )
 @click.option(
     "--importance-column",
+    "-i",
     help="The column (1-based) containing information about how important "
     "that row is. If it's absent, then use the length of the region. "
     "If the value is equal to `random`, then a random value will be "
     "used for the importance (effectively leading to random sampling)",
+    default="random",
+    show_default=True,
 )
 @click.option(
     "--has-header/--no-header",
     help="Does this file have a header that we should ignore",
     default=False,
+    show_default=True,
 )
 @click.option(
     "--chromosome",
-    default=None,
+    "-c",
     help="Only extract values for a particular chromosome. "
     "Use all chromosomes if not set.",
+    default=None,
+    show_default=True,
 )
 @click.option(
     "--max-per-tile",
-    default=100,
+    "-m",
     type=int,
+    default=100,
+    show_default=True,
     help="The maximum number of entries to store per tile",
 )
 @click.option(
     "--tile-size",
-    default=1024,
+    "-s",
     help="The number of nucleotides that the highest resolution tiles "
     "should span. This determines the maximum zoom level",
+    default=1024,
+    show_default=True,
 )
 @click.option("--delimiter", default=None, type=str)
 @click.option(
     "--chromsizes-filename",
     help="A file containing chromosome sizes and order",
     default=None,
+    show_default=True,
 )
 @click.option(
     "--offset",
     help="Apply an offset to all the coordinates in this file",
     type=int,
     default=0,
+    show_default=True,
 )
+@click.option(
+    "--sqlite-cache-size",
+    help="The SQLite cache size in MB. The higher "
+    + "the faster the aggregation gets but more memory will be required",
+    type=int,
+    default=500,
+    show_default=True,
+)
+@click.option(
+    "--sqlite-batch-size",
+    help="The number of entries inserted into SQLite at once. The higher "
+    + "the faster the aggregation gets but more memory will be required",
+    type=int,
+    default=100000,
+    show_default=True,
+)
+@click.option(
+    "--tile-index/--no-tile-index",
+    help="Tile-based indexing speeds up interval queries by up to 20x at "
+    +"the expensive of a 2.5x larger filesize",
+    type=bool,
+    default=False,
+    show_default=True,
+)
+@click.option("-v", "--verbose", count=True, help="Increase log statements")
 def bedfile(
     filepath,
     output_file,
@@ -1496,7 +1762,12 @@ def bedfile(
     delimiter,
     chromsizes_filename,
     offset,
+    sqlite_cache_size,
+    sqlite_batch_size,
+    tile_index,
+    verbose
 ):
+    index_strategy = 'tile-index' if tile_index else 'range-index'
     _bedfile(
         filepath,
         output_file,
@@ -1509,6 +1780,10 @@ def bedfile(
         delimiter,
         chromsizes_filename,
         offset,
+        sqlite_cache_size,
+        sqlite_batch_size,
+        index_strategy,
+        verbose
     )
 
 
diff --git a/clodius/tiles/beddb.py b/clodius/tiles/beddb.py
index 160d8a56..11b6dd37 100644
--- a/clodius/tiles/beddb.py
+++ b/clodius/tiles/beddb.py
@@ -12,7 +12,10 @@ def tileset_info(db_file):
     if "version" not in colnames:
         version = 1
     else:
-        version = int(row[colnames.index("version")])
+        try:
+            version = int(row[colnames.index("version")])
+        except ValueError:
+            version = row[colnames.index("version")]
 
     if "header" not in colnames:
         header = ""
@@ -153,10 +156,17 @@ def get_1D_tiles(db_file, zoom, tile_x_pos, num_tiles=1):
             zoom, tile_start_pos, tile_end_pos
         )
 
-    # import time
-    # t1 = time.time()
+    if version == '3t':
+        tile_id = sum([2**x for x in range(zoom)]) + tile_x_pos
+        query = f"""
+        SELECT startPos, endPos, chrOffset, importance, fields, uid, name
+        FROM intervals, tiles
+        WHERE
+            tiles.id = {tile_id} AND
+            tiles.intervalId = intervals.id
+        """
+
     rows = c.execute(query).fetchall()
-    # t2 = time.time()
 
     new_rows = []
 

From eb50e3812d8bcf9a68f85467d2b8ce0f3b3beef0 Mon Sep 17 00:00:00 2001
From: Fritz Lekschas <932103+flekschas@users.noreply.github.com>
Date: Wed, 9 Dec 2020 21:50:52 -0500
Subject: [PATCH 2/5] Blackification

---
 clodius/cli/aggregate.py | 40 ++++++++++++++++------------------------
 clodius/tiles/beddb.py   |  4 ++--
 2 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/clodius/cli/aggregate.py b/clodius/cli/aggregate.py
index 75d00618..8b882b8a 100644
--- a/clodius/cli/aggregate.py
+++ b/clodius/cli/aggregate.py
@@ -468,13 +468,13 @@ def _bedfile(
     offset,
     sqlite_cache_size=500,  # 500 MB
     sqlite_batch_size=100000,
-    index_strategy='range-index',
+    index_strategy="range-index",
     verbose=False,
 ):
     BEDDB_VERSION = 3
 
     if verbose:
-        print(f'BEDDB VERSION: {BEDDB_VERSION}')
+        print(f"BEDDB VERSION: {BEDDB_VERSION}")
 
     if output_file is None:
         output_file = filepath + ".beddb"
@@ -682,7 +682,7 @@ def line_to_np_array(line):
 
     tile_counts = col.defaultdict(int)
 
-    if index_strategy == 'tile-index':
+    if index_strategy == "tile-index":
         _bedfile_tile_index(
             conn,
             c,
@@ -739,7 +739,7 @@ def _bedfile_range_index(
     """
 
     if verbose:
-        print('Indexing strategy: range-based (default)')
+        print("Indexing strategy: range-based (default)")
 
     c.execute(
         """
@@ -841,17 +841,11 @@ def batch_insert(conn, c, interval_inserts, position_index_inserts):
                         value["uid"],
                         value["name"],
                         value["fields"],
-                    ),
+                    )
                 )
 
                 position_index_inserts.append(
-                    (
-                        counter,
-                        curr_zoom,
-                        curr_zoom,
-                        value["startPos"],
-                        value["endPos"]
-                    ),
+                    (counter, curr_zoom, curr_zoom, value["startPos"], value["endPos"])
                 )
 
                 counter += 1
@@ -879,7 +873,7 @@ def _bedfile_tile_index(
     verbose=False,
 ):
     if verbose:
-        print('Indexing strategy: tile-based')
+        print("Indexing strategy: tile-based")
 
     row = c.execute("SELECT * from tileset_info").fetchone()
     version = row[next(zip(*c.description)).index("version")]
@@ -904,7 +898,7 @@ def _bedfile_tile_index(
     )
 
     # I.e., tiles_cumsum[3] is the number of tiles with zoomlevels lower than 3
-    tiles_cumsum = np.cumsum([0] + [2**x for x in range(max_zoom + 1)])
+    tiles_cumsum = np.cumsum([0] + [2 ** x for x in range(max_zoom + 1)])
 
     interval_inserts = []
     tile_inserts = []
@@ -917,9 +911,7 @@ def batch_insert(conn, c, interval_inserts, tile_inserts, counter):
             c.executemany(
                 "INSERT INTO intervals VALUES (?,?,?,?,?,?,?,?,?)", interval_inserts
             )
-            c.executemany(
-                "INSERT INTO tiles VALUES (?,?)", tile_inserts
-            )
+            c.executemany("INSERT INTO tiles VALUES (?,?)", tile_inserts)
 
         interval_inserts.clear()
         tile_inserts.clear()
@@ -977,10 +969,10 @@ def batch_insert(conn, c, interval_inserts, tile_inserts, counter):
                             value["uid"],
                             value["name"],
                             value["fields"],
-                        ),
+                        )
                     )
                     if verbose and interval_idx == 0:
-                        print(f'Interval 0 first appears at zoom level {curr_zoom}')
+                        print(f"Interval 0 first appears at zoom level {curr_zoom}")
 
                 curr_pos = interval[0]
                 while curr_pos < interval[1]:
@@ -990,7 +982,7 @@ def batch_insert(conn, c, interval_inserts, tile_inserts, counter):
                     tile_inserts.append((tile_idx, interval_idx))
 
                     if verbose and interval_idx == 0:
-                        print(f'Interval 0 is added to {curr_zoom}.{curr_tile_x}')
+                        print(f"Interval 0 is added to {curr_zoom}.{curr_tile_x}")
 
                     curr_pos += tile_width
 
@@ -1745,7 +1737,7 @@ def bedgraph(
 @click.option(
     "--tile-index/--no-tile-index",
     help="Tile-based indexing speeds up interval queries by up to 20x at "
-    +"the expensive of a 2.5x larger filesize",
+    + "the expense of a 2.5x larger file size",
     type=bool,
     default=False,
     show_default=True,
@@ -1766,9 +1758,9 @@ def bedfile(
     sqlite_cache_size,
     sqlite_batch_size,
     tile_index,
-    verbose
+    verbose,
 ):
-    index_strategy = 'tile-index' if tile_index else 'range-index'
+    index_strategy = "tile-index" if tile_index else "range-index"
     _bedfile(
         filepath,
         output_file,
@@ -1784,7 +1776,7 @@ def bedfile(
         sqlite_cache_size,
         sqlite_batch_size,
         index_strategy,
-        verbose
+        verbose,
     )
 
 
diff --git a/clodius/tiles/beddb.py b/clodius/tiles/beddb.py
index 11b6dd37..43e7ea9b 100644
--- a/clodius/tiles/beddb.py
+++ b/clodius/tiles/beddb.py
@@ -156,8 +156,8 @@ def get_1D_tiles(db_file, zoom, tile_x_pos, num_tiles=1):
             zoom, tile_start_pos, tile_end_pos
         )
 
-    if version == '3t':
-        tile_id = sum([2**x for x in range(zoom)]) + tile_x_pos
+    if version == "3t":
+        tile_id = sum([2 ** x for x in range(zoom)]) + tile_x_pos
         query = f"""
         SELECT startPos, endPos, chrOffset, importance, fields, uid, name
         FROM intervals, tiles

From a0523f5d5ee0d1277025bba0777749010367d75f Mon Sep 17 00:00:00 2001
From: Fritz Lekschas <932103+flekschas@users.noreply.github.com>
Date: Wed, 9 Dec 2020 21:51:44 -0500
Subject: [PATCH 3/5] Update CHANGELOG

---
 CHANGELOG.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f3c497f7..a3bc70c2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
 v0.16.0
 
+- Added a tile-based indexing strategy for `.beddb` files, which can speed up queries by up to 20x at the expense of increasing the file size by a factor of 2.5x
+
+v0.16.0
+
 - No default assembly
 
 v0.15.2

From 8fc7cad6b7616e68a51b053585c1ae1c3913ace0 Mon Sep 17 00:00:00 2001
From: Fritz Lekschas <932103+flekschas@users.noreply.github.com>
Date: Mon, 14 Dec 2020 11:22:55 -0500
Subject: [PATCH 4/5] Improve code comments for documentation and remove debug
 logs

---
 clodius/cli/aggregate.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/clodius/cli/aggregate.py b/clodius/cli/aggregate.py
index 8b882b8a..39bde5b8 100644
--- a/clodius/cli/aggregate.py
+++ b/clodius/cli/aggregate.py
@@ -943,7 +943,11 @@ def batch_insert(conn, c, interval_inserts, tile_inserts, counter):
 
                     curr_pos += tile_width
 
-            # if there is, then increment the tile counters
+            # If there is, then increment the tile counters
+            # Note, the tile count should only be incremented when space is
+            # available and we have not yet inserted the interval. In other
+            # words, only the first instance of where an interval is inserted
+            # counts!
             if not inserted and space_available:
                 curr_pos = interval[0]
                 while curr_pos < interval[1]:
@@ -955,7 +959,10 @@ def batch_insert(conn, c, interval_inserts, tile_inserts, counter):
                     curr_pos += tile_width
 
             if inserted or space_available:
-                # there's available space
+                # If there's available space, we will insert the interval
+                # Note, that we only want to insert the interval exactly once
+                # and skip subsequent inserts by checking if `inserted` is
+                # false
                 if not inserted:
                     value = uid_to_interval[interval[-1]]
                     interval_inserts.append(
@@ -971,9 +978,9 @@ def batch_insert(conn, c, interval_inserts, tile_inserts, counter):
                             value["fields"],
                         )
                     )
-                    if verbose and interval_idx == 0:
-                        print(f"Interval 0 first appears at zoom level {curr_zoom}")
 
+                # The following while-loop is necessary to ensure that tiles at
+                # higher zoom level also contain the interval
                 curr_pos = interval[0]
                 while curr_pos < interval[1]:
                     curr_tile_x = math.floor(curr_pos / tile_width)
@@ -981,9 +988,6 @@ def batch_insert(conn, c, interval_inserts, tile_inserts, counter):
 
                     tile_inserts.append((tile_idx, interval_idx))
 
-                    if verbose and interval_idx == 0:
-                        print(f"Interval 0 is added to {curr_zoom}.{curr_tile_x}")
-
                     curr_pos += tile_width
 
                 inserted = True

From 74c21043dfbf46fd9cbf55622629ce35668e25c9 Mon Sep 17 00:00:00 2001
From: Fritz Lekschas <932103+flekschas@users.noreply.github.com>
Date: Mon, 14 Dec 2020 14:58:09 -0500
Subject: [PATCH 5/5] Added a test for tile index beddb files

---
 test/bedfile_test.py | 100 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/test/bedfile_test.py b/test/bedfile_test.py
index 4bba8dba..a320c6ac 100644
--- a/test/bedfile_test.py
+++ b/test/bedfile_test.py
@@ -108,6 +108,8 @@ def test_gene_annotations():
     rows = ctb.tiles(f.name, ["x.11.112"])[0][1]
     assert rows[0]["fields"][3] == "Lrp1b"
 
+    os.remove(f.name)
+
 
 def test_random_importance():
     # check that when aggregating using random importance, all values that
@@ -165,6 +167,7 @@ def test_random_importance():
     for key, value in found.items():
         assert value
 
+    os.remove(f.name)
     pass
 
 
@@ -272,3 +275,100 @@ def test_float_importance():
         ],
     )
     # TODO: Make assertions about result
+
+    os.remove(f.name)
+
+
+def test_tile_index():
+    """Build a tile-indexed beddb and check its tiles and intervals tables."""
+    f = tempfile.NamedTemporaryFile(delete=False)
+    runner = clt.CliRunner()
+    input_file = op.join(testdir, "sample_data", "test_float_importance.bed")
+
+    runner.invoke(
+        cca.bedfile,
+        [
+            input_file,
+            "--max-per-tile",
+            "2",
+            "--importance-column",
+            "4",
+            "--assembly",
+            "hg38",
+            "--no-header",
+            "--tile-index",
+            "--output-file",
+            f.name,
+        ],
+    )
+
+    rows = ctb.tiles(f.name, ["x.0.0"])[0][1]
+    for row in rows:
+        assert row["fields"][0] == "chr20"
+
+    conn = sqlite3.connect(f.name)
+    try:
+        c = conn.cursor()
+        rows = c.execute("SELECT * from tiles;").fetchall()
+        assert len(rows) == 2 * 39 + 2 * 38 + 37
+        rows = c.execute("SELECT * from intervals;").fetchall()
+        assert len(rows) == 5
+    finally:
+        conn.close()
+
+    os.remove(f.name)
+
+
+
+def test_compare_tile_vs_range_index():
+    """Check that tile- and range-indexed files yield the same tile sizes."""
+    f_tile = tempfile.NamedTemporaryFile(delete=False)
+    f_range = tempfile.NamedTemporaryFile(delete=False)
+    runner = clt.CliRunner()
+    input_file = op.join(testdir, "sample_data", "test_float_importance.bed")
+
+    runner.invoke(
+        cca.bedfile,
+        [
+            input_file,
+            "--max-per-tile",
+            "2",
+            "--importance-column",
+            "4",
+            "--assembly",
+            "hg38",
+            "--no-header",
+            "--tile-index",
+            "--output-file",
+            f_tile.name,
+        ],
+    )
+
+    runner.invoke(
+        cca.bedfile,
+        [
+            input_file,
+            "--max-per-tile",
+            "2",
+            "--importance-column",
+            "4",
+            "--assembly",
+            "hg38",
+            "--no-header",
+            "--output-file",
+            f_range.name,
+        ],
+    )
+
+    rows_tile = ctb.tiles(f_tile.name, ["x.1.1"])[0][1]
+    assert len(rows_tile) == 4
+    rows_range = ctb.tiles(f_range.name, ["x.1.1"])[0][1]
+    assert len(rows_range) == 4
+
+    rows_tile = ctb.tiles(f_tile.name, ["x.2.2"])[0][1]
+    assert len(rows_tile) == 5
+    rows_range = ctb.tiles(f_range.name, ["x.2.2"])[0][1]
+    assert len(rows_range) == 5
+
+    os.remove(f_tile.name)
+    os.remove(f_range.name)
\ No newline at end of file