Skip to content

Commit

Permalink
Merge pull request #33 from genomewalker/fix-header
Browse files Browse the repository at this point in the history
Fix header
  • Loading branch information
genomewalker authored May 1, 2024
2 parents 369e503 + 3951108 commit b327819
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 52 deletions.
30 changes: 16 additions & 14 deletions bam_filter/reassign.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,9 @@ def write_reassigned_bam(
write_threads = threads

new_header = header.to_dict()
new_header["SQ"] = [x for x in new_header["SQ"] if x["SN"] in ref_names]
new_header["SQ"] = [x for x in new_header["SQ"] if x["SN"] in list(ref_names)]
new_header["SQ"].sort(key=lambda x: list(ref_names).index(x["SN"]))
new_header["HD"]["SO"] = "unsorted"

out_bam_file = pysam.AlignmentFile(
out_files["bam_reassigned_tmp"],
Expand Down Expand Up @@ -415,19 +417,19 @@ def write_reassigned_bam(
samfile.close()

if max_chr_length > 536870912:
logging.info("A reference is longer than 2^29, indexing with csi")
pysam.index(
"-c",
"-@",
str(threads),
out_bam,
)
else:
pysam.index(
"-@",
str(threads),
out_bam,
)
logging.info("A reference is longer than 2^29")
pysam.index(
"-c",
"-@",
str(threads),
out_bam,
)
# else:
# pysam.index(
# "-@",
# str(threads),
# out_bam,
# )

os.remove(out_files["bam_reassigned_tmp"])
else:
Expand Down
68 changes: 30 additions & 38 deletions bam_filter/sam_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ def write_bam(bam, references, output_files, threads=1, sort_memory="1G"):
write_threads = threads

new_header = header.to_dict()
new_header["SQ"] = [x for x in new_header["SQ"] if x["SN"] in ref_names]
new_header["SQ"] = [x for x in new_header["SQ"] if x["SN"] in list(ref_names)]
new_header["SQ"].sort(key=lambda x: list(ref_names).index(x["SN"]))
# change it to unsorted
new_header["HD"]["SO"] = "unsorted"

out_bam_file = pysam.AlignmentFile(
output_files["bam_tmp"],
Expand All @@ -102,6 +105,8 @@ def write_bam(bam, references, output_files, threads=1, sort_memory="1G"):
# out_bam_file.set(pg)
references = [x for x in samfile.references if x in refs_idx.keys()]

# sort new_header["SQ"] using the references order

logging.info(f"::: ::: Filtering {len(references):,} references sequentially...")
for reference in tqdm.tqdm(
references,
Expand Down Expand Up @@ -153,19 +158,13 @@ def write_bam(bam, references, output_files, threads=1, sort_memory="1G"):
samfile.close()

if max_chr_length > 536870912:
logging.info("::: ::: A reference is longer than 2^29, indexing with csi")
pysam.index(
"-c",
"-@",
str(s_threads),
output_files["bam_tmp_sorted"],
)
else:
pysam.index(
"-@",
str(s_threads),
output_files["bam_tmp_sorted"],
)
logging.info("::: ::: A reference is longer than 2^29")
pysam.index(
"-c",
"-@",
str(s_threads),
output_files["bam_tmp_sorted"],
)

os.remove(output_files["bam_tmp"])
return output_files["bam_tmp_sorted"]
Expand Down Expand Up @@ -829,14 +828,9 @@ def check_bam_file(
if not samfile.has_index():
logging.info("BAM index not found. Indexing...")
if max_chr_length > 536870912:
logging.info("A reference is longer than 2^29, indexing with csi")
pysam.index("-c", "-@", str(threads), bam)
else:
pysam.index(
"-@",
str(threads),
bam,
)
logging.info("A reference is longer than 2^29")
pysam.index("-c", "-@", str(threads), bam)

logging.info("::: BAM file looks good.")

return bam # No need to reload the samfile after creating index, thanks to the with statement
Expand Down Expand Up @@ -1187,7 +1181,13 @@ def filter_reference_BAM(
else:
write_threads = threads
new_header = header.to_dict()
new_header["SQ"] = [x for x in new_header["SQ"] if x["SN"] in ref_names]
new_header["SQ"] = [
x for x in new_header["SQ"] if x["SN"] in list(ref_names)
]

new_header["SQ"].sort(key=lambda x: list(ref_names).index(x["SN"]))
new_header["HD"]["SO"] = "unsorted"

out_bam_file = pysam.AlignmentFile(
out_files["bam_filtered_tmp"],
"wb",
Expand Down Expand Up @@ -1326,21 +1326,13 @@ def filter_reference_BAM(
samfile.close()

if max_chr_length > 536870912:
logging.info(
"A reference is longer than 2^29, indexing with csi"
)
pysam.index(
"-c",
"-@",
str(threads),
out_files["bam_filtered"],
)
else:
pysam.index(
"-@",
str(threads),
out_files["bam_filtered"],
)
logging.info("A reference is longer than 2^29")
pysam.index(
"-c",
"-@",
str(threads),
out_files["bam_filtered"],
)

os.remove(out_files["bam_filtered_tmp"])
else:
Expand Down

0 comments on commit b327819

Please sign in to comment.