Skip to content

Commit

Permalink
some prelim methods for ONT data
Browse files Browse the repository at this point in the history
  • Loading branch information
Jon Palmer committed Dec 6, 2021
1 parent 70a63f9 commit 94ea653
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 22 deletions.
70 changes: 70 additions & 0 deletions amptk/ont_cluster.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from bisect import bisect_left
import numpy as np
from scipy.signal import find_peaks
import pyfastx
import subprocess


def primary_clustering(fastq, outfolder, cpus=1, min_num=3):
cmd = ['isONclust', '--ont', '--fastq', fastq, '--t', str(cpus),
'--outfolder', outfolder]
subprocess.call(cmd)
cmd2 = ['isONclust', 'write_fastq', '--clusters', os.path.join(outfolder, 'final_clusters.tsv'),
'--fastq', fastq, '--outfolder', os.path.join(outfolder, 'clust_0'), '--N', str(min_num)]
subprocess.call(cmd2)


def subcluster(folder, outfolder, n=15, w=4, d=20):
if not os.path.isdir(outfolder):
os.makedirs(outfolder)
for f in os.listdir(folder):
if f.endswith('.fastq'):
fastq_file = os.path.join(folder, f)
cluster_num = int(f.rstrip('.fastq'))
reads = []
fqindex = pyfastx.Fastq(fastq_file)
for r in fqindex:
reads.append((r.name, len(r)))
lengths = [x[1] for x in reads]
centers = get_centroids_by_length(lengths, n=n, w=w, d=d)
clusters = {}
if len(centers) > 0:
for c in centers:
clusters[c] = []
for read in reads:
closest = take_closest(centers, read[1])
clusters[closest].append(read[0])
else:
clusters[0] = [x[0] for x in reads]
for k, v in clusters.items():
fqoutfile = os.path.join(outfolder, '{}_{}.fq'.format(cluster_num, k))
with open(fqoutfile, 'w') as outfile:
for x in v:
outfile.write('{}'.format(fqindex[x].raw))


def take_closest(myList, myNumber):
pos = bisect_left(myList, myNumber)
if pos == 0:
return myList[0]
if pos == len(myList):
return myList[-1]
before = myList[pos - 1]
after = myList[pos]
if after - myNumber < myNumber - before:
return after
else:
return before

def get_centroids_by_length(lengths, n=15, w=4, d=20):
s_lengths = sorted(lengths)
r = []
for i in range(s_lengths[0], s_lengths[-1], 1):
r.append(s_lengths.count(i))
x = np.array(r)
peaks, prop = find_peaks(x, height=n, width=w, distance=d)
len_centers = []
if len(peaks) > 0:
for z in peaks:
len_centers.append(z+s_lengths[0])
return len_centers
46 changes: 24 additions & 22 deletions amptk/process_ont.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,28 +368,30 @@ def main(args):
os.makedirs(tmpdir)

# parse the input, if directories then traverse
data = parseinfiles(args.fastq, mapping=args.barcode_map)
step1Stats = []
for k, v in data.items():
barcode = v['label']
run, guppybarcode = k.split('::')
lib.log.info('Working on {:,} FASTQ files from {} {} --> {}'.format(
len(v['files']), run, guppybarcode, barcode))
for reads in v['files']:
oriented = os.path.join(tmpdir, '{}.{}.oriented.fq'.format(
run, guppybarcode))
lib.log.debug('Parsing: {}'.format(reads))
orientStats = orient(reads,
oriented,
fwdprimer=args.fwd_primer,
revprimer=args.rev_primer,
internalprimer=intPrimer,
mismatch=args.primer_mismatch,
minlen=args.min_len,
minqual=args.min_qual,
barcode=barcode)
lib.log.debug('Demuxed: {}'.format(orientStats))
step1Stats.append(orientStats)
if os.path.isdir(args.fastq):
data = parseinfiles(args.fastq, mapping=args.barcode_map)
step1Stats = []
for k, v in data.items():
barcode = v['label']
run, guppybarcode = k.split('::')
lib.log.info('Working on {:,} FASTQ files from {} {} --> {}'.format(
len(v['files']), run, guppybarcode, barcode))
for reads in v['files']:
oriented = os.path.join(tmpdir, '{}.{}.oriented.fq'.format(
run, guppybarcode))
lib.log.debug('Parsing: {}'.format(reads))
orientStats = orient(reads,
oriented,
fwdprimer=args.fwd_primer,
revprimer=args.rev_primer,
internalprimer=intPrimer,
mismatch=args.primer_mismatch,
minlen=args.min_len,
minqual=args.min_qual,
barcode=barcode)
lib.log.debug('Demuxed: {}'.format(orientStats))
step1Stats.append(orientStats)
elif os.path.isfile(args.fastq): # here passed single file so need to orient and find barcodes

combinedStats = [sum(i) for i in zip(*step1Stats)]
lib.log.info('{:,} of {:,} reads passed'.format(
Expand Down

0 comments on commit 94ea653

Please sign in to comment.