Skip to content

Commit

Permalink
Handling bedfile headers (#50)
Browse files Browse the repository at this point in the history
* modified the docstring of the states_bedline_to_vector function

- Added more examples and descriptions to make it more informative

* Added test for the states format option

* Fixed syntax error

* fixed syntax error 2

* Allows row_infos multiple columns

* Added code to identify and ignore bedfile headers

* Removed --has-header option

* clean up the encode/decode conversions

* Fix line that handles bedfile headers

* Added test for ignoring bedfile headers

* Added bedfile with headers testfile

* Revert "Removed --has-header option"

This reverts commit 51a1af6. Brings back the --has-header option
  • Loading branch information
Lmercadom authored and pkerpedjiev committed Apr 6, 2019
1 parent 7e4a0d5 commit 37b304a
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 4 deletions.
9 changes: 5 additions & 4 deletions clodius/cli/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def states_bedline_to_vector(bedlines,states_dic):
chrom=parts[0]
start=int(parts[1])
end=int(parts[2])
state= states_dic[parts[3].encode('utf8')]
state= states_dic[parts[3]]

states_vector = [ 1 if index == state else 0 for index in range(len(states_dic))]

Expand Down Expand Up @@ -135,13 +135,13 @@ def bedline_to_chrom_start_end_vector(bedlines, row_infos=None):
start_set = set()
end_set = set()
all_vector = []

for bedline in bedlines:
parts = bedline.strip().split()
chrom = parts[chrom_col-1]
start = int(parts[from_pos_col-1])
end = int(parts[to_pos_col-1])
vector = [float(f) if not f == 'NA' else np.nan
vector = [float(f) if not f == 'NA' else np.nan
for f in parts[value_col-1:value_col-1+num_rows]]
chrom_set.add(chrom)
start_set.add(start)
Expand All @@ -165,7 +165,8 @@ def bedline_to_chrom_start_end_vector(bedlines, row_infos=None):
starting_resolution, has_header, chunk_size)
elif format == 'states':
assert(row_infos != None), "A row_infos file must be provided for --format = 'states' "
states_dic = {row_infos[x]:x for x in range(len(row_infos))}
states_names = [lne.decode('utf8').split('\t')[0] for lne in row_infos]
states_dic = {states_names[x]:x for x in range(len(row_infos))}

cmv.bedfile_to_multivec(filepaths, f_out, states_bedline_to_vector,
starting_resolution, has_header, chunk_size, states_dic)
Expand Down
5 changes: 5 additions & 0 deletions clodius/multivec.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ def bedfile_to_multivec(input_filenames, f_out,
'''
Convert an epilogos bedfile to multivec format.
'''

files = []
for input_filename in input_filenames:
if op.splitext(input_filename)[1] == '.gz':
Expand All @@ -40,6 +41,10 @@ def bedfile_to_multivec(input_filenames, f_out,
warned = False

for lines in zip(*files):
# Identify bedfile header lines and ignore them
if "browser" == lines[0].decode('utf8')[0:7] or "track" in lines[0].decode('utf8')[0:6]:
continue

chrom, start, end, vector = bedline_to_chrom_start_end_vector(lines, row_infos)
# if vector[0] > 0 or vector[1] > 0:
# print("c,s,e,v", chrom, start, end, vector)
Expand Down
19 changes: 19 additions & 0 deletions test/multivec_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,22 @@ def test_states_format_befile_to_multivec():
print("result.error", traceback.print_tb(tb))
print("Exception:", a,b)
'''


def test_ignore_bedfile_headers():
    """Invoke the bedfile-to-multivec command on a bedfile that has headers.

    Runs ``ccc.bedfile_to_multivec`` through ``clt.CliRunner`` with
    ``--format states`` against the ``3_header_100_testfile.bed.gz``
    sample and its matching row-infos file.

    The test makes no explicit assertion: it only unpacks
    ``result.exc_info`` into three values, which requires that attribute
    to be a 3-item iterable.
    """
    cli_runner = clt.CliRunner()

    # Both fixtures live in the shared sample_data directory.
    sample_dir = op.join(testdir, 'sample_data')
    bed_path = op.join(sample_dir, '3_header_100_testfile.bed.gz')
    row_infos_path = op.join(sample_dir, '3_header_100_row_infos.txt')

    # Created but not passed to the command; delete=False leaves it on disk.
    scratch_file = tempfile.NamedTemporaryFile(delete=False)

    cli_args = [bed_path]
    cli_args += ['--format', 'states']
    cli_args += ['--row-infos-filename', row_infos_path]
    cli_args += ['--assembly', 'hg19']
    cli_args += ['--starting-resolution', '200']
    cli_args += ['--num-rows', '15']

    result = cli_runner.invoke(ccc.bedfile_to_multivec, cli_args)

    import traceback
    exc_type, exc_value, exc_tb = result.exc_info
15 changes: 15 additions & 0 deletions test/sample_data/3_header_100_row_infos.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
15_Quies
1_TssA
14_ReprPCWk
9_Het
7_Enh
5_TxWk
2_TssAFlnk
10_TssBiv
4_Tx
13_ReprPC
12_EnhBiv
11_BivFlnk
8_ZNF/Rpts
6_EnhG
3_TxFlnk
Binary file added test/sample_data/3_header_100_testfile.bed.gz
Binary file not shown.

0 comments on commit 37b304a

Please sign in to comment.