Skip to content

Commit

Permalink
Merge pull request #10 from PacificBiosciences/v041
Browse files Browse the repository at this point in the history
V041
  • Loading branch information
velociroger-pb authored Mar 22, 2024
2 parents 572f85d + 6ccc5cb commit 87d1ed3
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 27 deletions.
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,14 @@ python3 extract_tag.py \
### Changelog
Changelog - PacBio Fusion Detection - pbfusion

## v0.4.1 3/22/24
### Changes
- Bug fix: consistent ordering between gene ids and gene names when dealing with ambiguity.
- Bug fix: rely on breakpoint ordering in visualization script.
- Improvement: gene names are assigned automatically in visualization script.
- Improvement: visualization script can now take SAM files as input.
- Improvement: visualization script can now take a breakpoint number (these aren't always unique, so it's best to still give a single entry).

## v0.4.0 11/28/23
### Changes
- Improvement: Improved ambiguous gene resolution.
Expand Down
82 changes: 55 additions & 27 deletions scripts/visualize_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def parse_args():
)
parser.add_argument(
'--fusion', '-f', type=str,
help='A single line from the breakpoints groups file with the query fusion'
help=('A single line from the breakpoints '
'groups file with the query fusion')
)
parser.add_argument(
'--bam', '-b', type=str,
Expand All @@ -44,10 +45,14 @@ def parse_args():
'--pickle', '-p', type=str, default='pbfusion_vis.pickle',
help='Pickle filename [pbfusion_vis.pickle]'
)
parser.add_argument(
'--breakpoint', '-bp', type=str, default='',
help='Specify a breakpoint like BP600 to select from bed'
)
return parser.parse_args()


def read_bedpe(bedpe):
def read_bedpe(bedpe, bp):
'''
Takes the bedpe file and gives back:
reads
Expand All @@ -59,12 +64,19 @@ def read_bedpe(bedpe):
if line.startswith('#'):
continue
sl = line.rstrip().split('\t')
breakpoints = ((sl[0], sl[1]), (sl[3], sl[4]))
reads = set(sl[11][6:].split(','))
if bp and bp != sl[6]:
continue
# chr, pos, gene
breakpoints = [[sl[0], sl[1]], [sl[3], sl[4]]]
reads = set(sl[11].split(';')[0].split('=')[1].split(','))
aux = sl[10].split(';')
for field in aux:
if field.startswith('GI='):
gene_ids = field[9:].split(',')
gene_ids = field.split('=')[1].split(',')
if field.startswith('GN='):
gene_names = field.split('=')[1].split(',')
breakpoints[0].extend([gene_names[0], gene_ids[0]])
breakpoints[1].extend([gene_names[1], gene_ids[1]])
return reads, gene_ids, breakpoints


Expand All @@ -87,7 +99,7 @@ def read_gtf(gtf, gene_ids):
continue
if transcript not in gtf_dict:
gtf_dict[transcript] = []
gtf_dict[transcript].append([chromosome, start, end, type1])
gtf_dict[transcript].append([chromosome, start, end, type1, gene])

for transcript, parts in gtf_dict.items():
starts, ends, blockstarts, blockwidths, types = [], [], [], [], []
Expand All @@ -100,33 +112,45 @@ def read_gtf(gtf, gene_ids):
types.append(part[3])
transcript_list.append([
chromosome, min(starts), max(ends),
blockstarts, blockwidths, False, types])
blockstarts, blockwidths, False, types, parts[0][4]])
return transcript_list


def read_bam(bam, reads):
def read_bam(bam, reads, primary_chrom):
pysam.set_verbosity(0)
read_type = 'rb'
if bam.endswith('.sam'):
read_type = 'r'

read_dict = {}
with pysam.AlignmentFile(bam, 'rb', check_sq=False, require_index=False) as bam_file:
with pysam.AlignmentFile(bam, read_type,
check_sq=False, require_index=False) as bam_file:
for read in bam_file.fetch(until_eof=True):
if read.query_name in reads:
if len(read.get_tag('SA').split(';')) > 2:
continue
if read.query_name not in read_dict:
read_dict[read.query_name] = []
read_dict[read.query_name] = [[], []]

# check if this alignment is the 5p end of the molecule
lidx = 0 if read.query_alignment_start < 50 else 1

# make blockstarts and blockwidths
blocks = []
for b in read.get_blocks():
blocks.append((b[0], b[1]-b[0]))
read_dict[read.query_name].append([read.reference_name, blocks])
read_dict[read.query_name][lidx].append(read.reference_name)
read_dict[read.query_name][lidx].append(blocks)
return read_dict


def plot_fusion(ref_genes, alignments, breakpoints, output):
# alignments { molecule: [[chr, [(block start, block width), ...]], [[chr, ...]]] }
alignments = {k: v for (k, v) in alignments.items() if len(v) == 2}
# look at the first blockstart
gene1_start = min([x[0][1][0][0] for x in list(alignments.values())])

gene1_start = min([x[0][1][0][0] for x in list(alignments.values()) if x[0][0] == breakpoints[0][0]])
gene2_end = max([x[2] for x in ref_genes if x[0] == breakpoints[1][0]])
gene2_end = max([x[1][1][-1][0] + x[1][1][-1][1] for x in list(alignments.values())])
gene1_span = abs(int(breakpoints[0][1]) - gene1_start)
gene2_span = abs(int(breakpoints[1][1]) - gene2_end)
total_span = gene1_span + gene2_span
Expand All @@ -145,49 +169,52 @@ def plot_fusion(ref_genes, alignments, breakpoints, output):
# plot the gene annotations
g1_y, g2_y = 0, 0
for transcript in ref_genes:
if transcript[0] == breakpoints[0][0]:
if transcript[-1] == breakpoints[0][-1]:
panel = gene1
g1_y += 1
ypos = g1_y
color = orange
elif transcript[0] == breakpoints[1][0]:
elif transcript[-1] == breakpoints[1][-1]:
panel = gene2
g2_y += 1
ypos = g2_y
color = blue
start, stop = transcript[1], transcript[2]
panel.plot([start, stop], [ypos]*2, lw=0.2, c=color, zorder=10)
for i in range(len(transcript[3])):
left, width, ftype = transcript[3][i], transcript[4][i], transcript[6][i]
left, width, ftype = (transcript[x][i] for x in (3, 4, 6))
if ftype == 'exon':
offset = 0.125
else:
offset = 0.25
feature = Rect((left, ypos-offset), width, offset*2, lw=0, fc=color, zorder=12)
feature = Rect((left, ypos-offset), width,
offset*2, lw=0, fc=color, zorder=12)
panel.add_patch(feature)

y = 0
for read in list(alignments.values()):
for read in sorted(list(alignments.values()), key=lambda x: x[0][1][0][0], reverse=True):
y += 1
for locus in read:
if locus[0] == breakpoints[0][0]:
for i, locus in enumerate(read):
if i == 0:
panel = reads1
color = orange
start, stop = locus[1][0][0], int(breakpoints[0][1])
elif locus[0] == breakpoints[1][0]:
elif i == 1:
panel = reads2
color = blue
start, stop = locus[1][0][0], locus[1][-1][0] + locus[1][-1][1]
for block in locus[1]:
left, width = block
exon = Rect((left, y-0.25), width, 0.5, lw=0, fc=color, zorder=12)
exon = Rect((left, y-0.25), width, 0.5,
lw=0, fc=color, zorder=12)
panel.add_patch(exon)
panel.plot([start, stop], [y]*2, lw=0.2, c=color, zorder=10)

gene1.set_ylabel('ASPSCR1', fontsize=5)
gene2.set_ylabel('TFE3', fontsize=5)
gene1.set_ylabel(breakpoints[0][2], fontsize=5)
gene2.set_ylabel(breakpoints[1][2], fontsize=5)
reads1.set_ylabel('Reads')

pad = 50
gene1.set_xlim(gene1_start, int(breakpoints[0][1])+gene2_span)
gene2.set_xlim(int(breakpoints[1][1])-gene1_span, gene2_end)
reads1.set_ylim(0, len(alignments)+1)
Expand All @@ -211,21 +238,22 @@ def plot_fusion(ref_genes, alignments, breakpoints, output):


def main(args):
reads, gene_ids, breakpoints = read_bedpe(args.fusion)
reads, gene_ids, breakpoints = read_bedpe(args.fusion, args.breakpoint)
if os.path.exists(args.pickle):
with open(args.pickle, 'rb') as f:
ref_genes = pickle.load(f)
alignments = pickle.load(f)
else:
reads, gene_ids, breakpoints = read_bedpe(args.fusion)
# reads, gene_ids, breakpoints = read_bedpe(args.fusion)
ref_genes = read_gtf(args.annotation, gene_ids)
alignments = read_bam(args.bam, reads)
alignments = read_bam(args.bam, reads, breakpoints[0][0])
with open(args.pickle, 'wb') as f:
pickle.dump(ref_genes, f)
pickle.dump(alignments, f)

plot_fusion(ref_genes, alignments, breakpoints, args.output)


if __name__ == '__main__':
args = parse_args()
main(args)

0 comments on commit 87d1ed3

Please sign in to comment.