Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed small bugs #101

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# Welcome to Circle-Map official repository!
# This is NOT the Circle-Map official repository!
# This gld_debug branch fixes several small bugs in Circle-Map — enjoy!

[![PyPI](https://img.shields.io/pypi/v/Circle-Map.svg)](https://pypi.python.org/pypi/Circle-Map)
[![Anaconda-Server Badge](https://anaconda.org/bioconda/circle-map/badges/version.svg)](https://anaconda.org/bioconda/circle-map)
[![Bioconda Downloads](https://anaconda.org/bioconda/circle-map/badges/downloads.svg)](https://anaconda.org/bioconda/circle-map)
Expand All @@ -7,7 +9,7 @@

Circle-Map is an easy-to-install Python package that implements all the steps required to detect extrachromosomal DNA circles. The package contains easy-to-run algorithms to accurately detect circular DNA formed from mappable and non-mappable regions of a genome.


## Why should I use Circle-Map?

Circle-Map takes as input an alignment of reads to a reference genome (e.g. a *BWA-MEM* generated *BAM* file) and, like other methods, it will use those alignments to detect cases where the read has been split into two segments (e.g. split reads) to detect genomic rearrangements supporting a circular DNA structure.
Expand Down
57 changes: 29 additions & 28 deletions circlemap/realigner.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def realign(self,peaks):

for index,interval in peaks_pd.iterrows():


iteration += 1

if check_size_and_write(results,only_discordants,self.output,self.lock,self.directory,self.overlap_fraction,self.pid) == True:
results = []
Expand Down Expand Up @@ -207,14 +207,13 @@ def realign(self,peaks):
disorcordants_per_it = 0
for index,mate_interval in realignment_interval_extended.iterrows():

iteration += 1




#sample realignment intervals
#fasta file fetch is 1 based that why I do +1

plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])+1),int(int(mate_interval['end'])+1)).upper()
plus_coding_interval = genome_fa.fetch(str(mate_interval['chrom']),int(int(mate_interval['start'])),int(int(mate_interval['end']))).upper()
interval_length = len(plus_coding_interval)
minus_coding_interval = str(Seq(plus_coding_interval).complement())

Expand Down Expand Up @@ -282,11 +281,11 @@ def realign(self,peaks):
else:
#sc length
sc_len = len(get_longest_soft_clipped_bases(read)['seq'])
if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]),
int(read.cigar[0][1]),int(read.cigar[-1][1]),
int(read.pos),int(mate_interval.start),int(mate_interval.end)) == True:


if non_colinearity(int(read.cigar[0][0]),int(read.cigar[-1][0]),int(read.pos),
int(mate_interval.start),int(mate_interval.end)) == True:


if sc_len >= self.min_sc_length:
edits_allowed = adaptative_myers_k(sc_len, self.edit_distance_frac)
Expand Down Expand Up @@ -324,7 +323,7 @@ def realign(self,peaks):

iteration_results.append([interval['chrom'], read.reference_start, soft_clip_end+1, read.qname,iteration,float(round(score,2))])

elif read.reference_start + int(mate_interval['start']) + int(
elif read.reference_start > int(mate_interval['start']) + int(
realignment_dict['alignments'][1][0][0]):

iteration_results.append([interval['chrom'], soft_clip_start, read_end, read.qname,iteration,float(round(score,2))])
Expand All @@ -346,40 +345,42 @@ def realign(self,peaks):
else:
#discordant reads
#R2F1 oriented when iterating trough R2
if read.is_reverse == True and read.mate_is_reverse == False:
if read.is_read2:
if read.reference_start < read.next_reference_start:
# discordant read
disorcordants_per_it +=1
iteration_discordants.append([interval['chrom'],read.reference_start,read.next_reference_start + read.infer_query_length(),read.qname])

if read.mapq >= self.mapq_cutoff:
if read.is_reverse == True and read.mate_is_reverse == False:
if read.is_read2:
if read.reference_id == read.next_reference_id:
if read.reference_start < read.next_reference_start:
# discordant read
disorcordants_per_it +=1
iteration_discordants.append([interval['chrom'],read.reference_start,read.next_reference_start + read.infer_query_length(),read.qname])



#R2F1 when iterating trough F1
elif read.is_reverse == False and read.mate_is_reverse == True:
if read.is_read2 == False:
if read.next_reference_start < read.reference_start:
disorcordants_per_it +=1
iteration_discordants.append([interval['chrom'], read.next_reference_start,read.reference_start+read.infer_query_length(), read.qname])

#R2F1 when iterating trough F1
elif read.is_reverse == False and read.mate_is_reverse == True:
if read.is_read2 == False:
if read.reference_id == read.next_reference_id:
if read.next_reference_start < read.reference_start:
disorcordants_per_it +=1
iteration_discordants.append([interval['chrom'], read.next_reference_start,read.reference_start+read.infer_query_length(), read.qname])

#second pass to add discordant read info
if len(iteration_results) > 0:


results = results + assign_discordants(iteration_results,iteration_discordants,insert_metrics[0],insert_metrics[1])


elif len(iteration_discordants) > 0:
discordant_bed = pd.DataFrame.from_records(iteration_discordants,columns=['chrom','start','end','read']).sort_values(['chrom','start','end'])

discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg(
{'chrom': 'first', 'start': 'first', 'end': 'last', 'read': 'count'})
discordant_bed = pd.DataFrame.from_records(iteration_discordants,columns=['chrom','start','end','read']).sort_values(['chrom','start','end'])

discordant_bed = discordant_bed.groupby(merge_bed(discordant_bed)).agg(
{'chrom': 'first', 'start': 'first', 'end': 'max', 'read': 'nunique'})


for index,disc_interval in discordant_bed.iterrows():
only_discordants.append([disc_interval['chrom'],disc_interval['start'],disc_interval['end'],disc_interval['read'],0])
for index,disc_interval in discordant_bed.iterrows():
only_discordants.append([disc_interval['chrom'],disc_interval['start'],disc_interval['end'],disc_interval['read'],0])



Expand Down Expand Up @@ -414,4 +415,4 @@ def realign(self,peaks):
ecc_dna.close()


return([0,0])
return([0,0])
71 changes: 43 additions & 28 deletions circlemap/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -607,7 +607,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver
candidate_mates_dataframe = candidate_mates_dataframe.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True])
candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float)

candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'})
candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'sum'})

sum = np.sum(float(x[3]) for index, x in candidate_mates.iterrows())
candidate_mates['probability'] = candidate_mates['probability'] / sum
Expand All @@ -621,7 +621,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver
candidate_mates_dataframe['probability'] = candidate_mates_dataframe.probability.astype(float)


candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'last','probability':'sum'})
candidate_mates = candidate_mates_dataframe.groupby((candidate_mates_dataframe.end.shift()-candidate_mates_dataframe.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'sum'})


sum = np.sum(float(x[3]) for index,x in candidate_mates.iterrows())
Expand All @@ -647,7 +647,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver

for item,row in candidate_mates.iterrows():

if ('LR' in orientation) or ('L' and 'R' in orientation):
if ('LR' in orientation) or (('L' in orientation) and ('R' in orientation)):


start = row['start'] - interval_extension
Expand Down Expand Up @@ -675,8 +675,12 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver
end = row['end'] + interval_extension

extended.append([row['chrom'], row['start'], int(round(end)),float(row['probability'])])

return (pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability']))
ret_data = pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability'])
ret_data['start'] = ret_data.start.astype(int)
ret_data['end'] = ret_data.end.astype(int)
ret_data = ret_data.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True])
ret_data = ret_data.groupby((ret_data.end.shift()-ret_data.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'first'})
return (ret_data)

else:

Expand All @@ -687,7 +691,7 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver

if interval['probability'] >= interval_p_cutoff:

if ('LR' in orientation) or ('L' and 'R' in orientation):
if ('LR' in orientation) or (('L' in orientation) and ('R' in orientation)):


start = interval['start'] - interval_extension
Expand Down Expand Up @@ -716,9 +720,12 @@ def get_realignment_intervals(bed_prior,interval_extension,interval_p_cutoff,ver

extended.append([interval['chrom'], interval['start'], int(round(end)),float(interval['probability'])])


return(pd.DataFrame.from_records(extended,columns=['chrom','start','end','probability']).sort_values(by=['probability'],ascending=[False]))

ret_data = pd.DataFrame.from_records(extended, columns=['chrom', 'start', 'end','probability'])
ret_data = ret_data.sort_values(by=['chrom', 'start','end'],ascending=[True,True,True])
ret_data['start'] = ret_data.start.astype(int)
ret_data['end'] = ret_data.end.astype(int)
ret_data = ret_data.groupby((ret_data.end.shift()-ret_data.start).lt(0).cumsum()).agg({'chrom':'first','start':'first','end':'max','probability':'first'})
return (ret_data)

except BaseException as e:

Expand Down Expand Up @@ -889,8 +896,8 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq
#get soft-clipped read
soft_clipped_read = get_longest_soft_clipped_bases(read)

#encoding of DNA and operations A,T,C,G,=,X,DI. THis is done for Numba
nuc_and_ops = np.array([1,2,3,4,5,6,7])
#encoding of DNA and operations A,T,C,G,=,X,D,I. THis is done for Numba
nuc_and_ops = np.array([1,2,3,4,5,6,7,8])
encoded_nucs = number_encoding(soft_clipped_read['seq'])

hits = 0
Expand All @@ -905,7 +912,7 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq

while hits < n_hits and min_score >= -10:

alignment = edlib.align(soft_clipped_read['seq'], minus_strand, mode='HW', task='path')
alignment = edlib.align(soft_clipped_read['seq'], plus_strand , mode='HW', task='path')
if hits ==0:
if alignment['editDistance'] > max_edit:
return(None)
Expand All @@ -919,13 +926,13 @@ def realign(read,n_hits,plus_strand,minus_strand,plus_base_freqs,minus_base_freq
mask_bases = 'X' * ( location[1] - location[0])


minus_strand = minus_strand[:location[0]] + mask_bases + minus_strand[location[1]:]
plus_strand = plus_strand[:location[0]] + mask_bases + plus_strand[location[1]:]

hits += 1


score = pssm(phred_to_prob(np.array(soft_clipped_read['qual'],dtype=np.float64)), encoded_nucs,
edlib_cigar_to_iterable(alignment['cigar']),minus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose)
edlib_cigar_to_iterable(alignment['cigar']),plus_base_freqs,gap_open,gap_extend,nuc_and_ops,verbose)

if score < min_score:
min_score = score
Expand Down Expand Up @@ -986,13 +993,15 @@ def edlib_cigar_to_iterable(edlib_cigar):
operations = []

for i in re.findall(r'\d+[IDX=]',edlib_cigar):
length.append(int(i[0]))
if i[1] == '=':
length.append(int(i[:-1]))
if i[-1] == '=':
operations.append(5)
elif i[1] == 'X':
elif i[-1] == 'X':
operations.append(6)
elif i[1] == 'I' or 'D':
elif i[-1] == 'D':
operations.append(7)
elif i[-1] == 'I':
operations.append(8)


return(np.array(length),np.array(operations))
Expand Down Expand Up @@ -1091,10 +1100,12 @@ def pssm(seq_prob,seq_nucl,iterable_cigar,base_freqs,gap_open,gap_extend,nuc_and
seq_pos += operation_length


elif operation == nuc_and_ops[6]:
elif operation == nuc_and_ops[6] or operation == nuc_and_ops[7]:

#affine gap scoring model
indel_penalty += gap_open + gap_extend*(operation_length-1)
if operation == nuc_and_ops[7]: # insertion cost query
seq_pos += operation_length


return(np.sum(seq_prob)-indel_penalty)
Expand Down Expand Up @@ -1282,8 +1293,8 @@ def merge_final_output(bam,results,begin,splits,dir,fraction,pid):
for interval in unfiltered_output:

if (int(interval[4])+int(interval[3])) >= splits:
if int(interval[1]) != 0:
interval[1] = int(interval[1])+1
#if int(interval[1]) != 0:
# interval[1] = int(interval[1])+1
filtered.append(interval)

filtered_output = bt.BedTool(filtered)
Expand Down Expand Up @@ -1491,10 +1502,12 @@ def assign_discordants(split_bed,discordant_bed,insert_mean,insert_std):
start_filt = chrom_filt[
(chrom_filt['start'] > row['start']) & ((chrom_filt['start'] - row['start']) < max_dist)]
end_filt = start_filt[(start_filt['end'] < row['end']) & ((row['end'] - start_filt['end']) < max_dist)]

assigned_splits.append(
[row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(end_filt)])

if len(end_filt)>1:
assigned_splits.append(
[row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(np.unique(end_filt['read']))])
else:
assigned_splits.append(
[row['chrom'], row['start'], row['end'], row['read'], row['iteration'], float(row['score']), len(end_filt)])
return (assigned_splits)

else:
Expand All @@ -1510,14 +1523,15 @@ def adaptative_myers_k(sc_len,edit_frac):
"""Calculate the edit distance allowed as a function of the read length"""
return(float(sc_len*edit_frac))
@jit(nopython=True)
def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_start,mate_interval_end):
def non_colinearity(read_start_cigar,read_end_cigar,read_start_cigar_l,read_end_cigar_l,
aln_start,mate_interval_start,mate_interval_end):
"""Input a read and the mate interval in the graph. The function checks whether the alignment would be linear (splicing)
or colinear. Will return false, in order to not attemp realignment. This is mainly thought for skipping deletions and
RNA splicing"""


#check left soft-clipped
if read_start_cigar == 4:
if (read_start_cigar == 4 and read_end_cigar != 4 )or (read_start_cigar == 4 and read_end_cigar ==4 and (read_start_cigar_l>read_end_cigar_l)):
#graph needs to be upstream or looping to itself
if int(mate_interval_start) > aln_start:
return (True)
Expand All @@ -1527,7 +1541,7 @@ def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_star
else:
return (False)
#check right softclipped
if read_end_cigar == 4:
if (read_end_cigar == 4 and read_start_cigar != 4 ) or (read_start_cigar == 4 and read_end_cigar ==4 and read_start_cigar_l<read_end_cigar_l):
# graph needs to be downstream or looping to itself
if int(mate_interval_end) < aln_start:
return (True)
Expand All @@ -1536,6 +1550,7 @@ def non_colinearity(read_start_cigar,read_end_cigar,aln_start,mate_interval_star
return (True)
else:
return (False)
return (False)

@jit(nopython=True)
def prob_to_phred(prob):
Expand Down