Skip to content

Commit

Permalink
Merge pull request #34 from sanger-tol/dp24_barcodes
Browse files Browse the repository at this point in the history
Dp24 barcodes
  • Loading branch information
DLBPointon authored Oct 17, 2023
2 parents 06db313 + b35f732 commit 35cda05
Show file tree
Hide file tree
Showing 15 changed files with 427 additions and 80 deletions.
3 changes: 2 additions & 1 deletion assets/github_testing/test.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
assembly_path: /home/runner/work/ascc/ascc/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
assembly_title: asccTinyTest
pacbio_multiplexing_barcode_names: ""
pacbio_barcodes: /home/runner/work/ascc/ascc/assets/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc1008,bc1009"
pacbio_reads_path: /home/runner/work/ascc/ascc/asccTinyTest/pacbio/
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
Expand Down
224 changes: 208 additions & 16 deletions assets/pacbio_adaptors.fa
Original file line number Diff line number Diff line change
@@ -1,32 +1,224 @@
>bc1001_BAK8A_OA
>bc1001
CACATATCAGAGTGCGT
>bc1002_BAK8A_OA
>bc1002
ACACACAGACTGTGAGT
>bc1003_BAK8A_OA
>bc1003
ACACATCTCGTGAGAGT
>bc1008_BAK8A_OA
>bc1008
ACAGTCGAGCGCTGCGT
>bc1009_BAK8A_OA
>bc1009
ACACACGCGAGACAGAT
>bc1010_BAK8A_OA
>bc1010
ACGCGCTATCTCAGAGT
>bc1011_BAK8A_OA
>bc1011
CTATACGTATATCTATT
>bc1012_BAK8A_OA
>bc1012
ACACTAGATCGCGTGTT
>bc1015_BAK8B_OA
>bc1015
CGCATGACACGTGTGTT
>bc1016_BAK8B_OA
>bc1016
CATAGAGAGATAGTATT
>bc1017_BAK8B_OA
>bc1017
CACACGCGCGCTATATT
>bc1018_BAK8B_OA
>bc1018
TCACGTGCTCACTGTGT
>bc1019_BAK8B_OA
>bc1019
ACACACTCTATCAGATT
>bc1020_BAK8B_OA
>bc1020
CACGACACGACGATGTT
>bc1021_BAK8B_OA
>bc1021
CTATACATAGTGATGTT
>bc1022_BAK8B_OA
>bc1022
CACTCACGTGTGATATT
>bc2001
ATCGTGCGACGAGTAT
>bc2002
TGCATGTCATGAGTAT
>bc2003
ACGAGTGCTCGAGTAT
>bc2004
TGCAGTGCTCGAGTAT
>bc2005
TGACTCGATCGAGTAT
>bc2006
CATGCGATCTGAGTAT
>bc2007
ACTAGCATCTGAGTAT
>bc2008
ACGCTAGTCTGAGTAT
>bc2009
CGATCGCACTGAGTAT
>bc2010
TACGTAGTATGAGTAT
>bc2011
CTGACAGTACGAGTAT
>bc2012
TCGTACTACTGAGTAT
>bc2013
CTGCGTAGACGAGTAT
>bc2014
ATACATGCACGAGTAT
>bc2015
CGACATAGATGAGTAT
>bc2016
ATCTGCACGTGAGTAT
>bc2017
CTATGATAGCGAGTAT
>bc2018
CGATCAGTGCGAGTAT
>bc2019
CGTCATAGTCGAGTAT
>bc2020
ACTATGCGTCGAGTAT
>bc2021
CGTACATGCTGAGTAT
>bc2022
TCATCGACGTGAGTAT
>bc2023
TCGCATGACTGAGTAT
>bc2024
CATGATCGACGAGTAT
>bc2025
ACGCACGTACGAGTAT
>bc2026
CAGTAGCGTCGAGTAT
>bc2027
TGACTGTAGCGAGTAT
>bc2028
ACTGCAGCACGAGTAT
>bc2029
TAGCAGTATCGAGTAT
>bc2030
CATACAGCATGAGTAT
>bc2031
ATAGCGTACTGAGTAT
>bc2032
ATAGACGAGTGAGTAT
>bc2033
CGACTCGTATGAGTAT
>bc2034
TACTAGTGACGAGTAT
>bc2035
CAGCTGACATGAGTAT
>bc2036
ACGTCGCTGCGAGTAT
>bc2037
CAGTATGAGCGAGTAT
>bc2038
TCACGACGACGAGTAT
>bc2039
CATGTATGTCGAGTAT
>bc2040
TGCTGCGACTGAGTAT
>bc2041
TATGATCACTGAGTAT
>bc2042
TCTGCACTGCGAGTAT
>bc2043
ACGATGACGTGAGTAT
>bc2044
CGATGATGCTGAGTAT
>bc2045
TACGACAGTCGAGTAT
>bc2046
ATAGCATGTCGAGTAT
>bc2047
CATAGTACTCGAGTAT
>bc2048
TGATGCTAGTGAGTAT
>bc2049
TAGTCTGCGTGAGTAT
>bc2050
CTCATCTATCGAGTAT
>bc2051
TGCATACTGCGAGTAT
>bc2052
CAGACTAGTCGAGTAT
>bc2053
ATCGTGATCTGAGTAT
>bc2054
CTGCGATCACGAGTAT
>bc2055
CTCAGCATACGAGTAT
>bc2056
TCGCAGCGTCGAGTAT
>bc2057
TAGCACGCATGAGTAT
>bc2058
TACTGACGCTGAGTAT
>bc2059
ATCTGACTATGAGTAT
>bc2060
ATACGAGCTCGAGTAT
>bc2061
CGAGCACGCTGAGTAT
>bc2062
TCTGCGTATCGAGTAT
>bc2063
TCTGCATCATGAGTAT
>bc2064
TGCGTGATGCGAGTAT
>bc2065
TGAGCTATGCGAGTAT
>bc2066
CTGTCGTAGTGAGTAT
>bc2067
ATCGATGCATGAGTAT
>bc2068
ACTACGTGATGAGTAT
>bc2069
TCTATGACATGAGTAT
>bc2070
TACTGCTCACGAGTAT
>bc2071
CGAGTCTAGCGAGTAT
>bc2072
TATCAGTAGTGAGTAT
>bc2073
ATCACTAGTCGAGTAT
>bc2074
TATCACGACTGAGTAT
>bc2075
CTCGTCAGATGAGTAT
>bc2076
CAGCAGTGACGAGTAT
>bc2077
TGCGACGTGCGAGTAT
>bc2078
CTCACTGAGTGAGTAT
>bc2079
CACTGAGCGTGAGTAT
>bc2080
CAGCGTCTACGAGTAT
>bc2081
CTACTATGTCGAGTAT
>bc2082
ATGTACAGACGAGTAT
>bc2083
ACTCATCAGTGAGTAT
>bc2084
CTGAGCACTCGAGTAT
>bc2085
ATCATCTACTGAGTAT
>bc2086
TACATGCGATGAGTAT
>bc2087
TCGCTGTCACGAGTAT
>bc2088
ACGCTCATGCGAGTAT
>bc2089
TACTAGCAGCGAGTAT
>bc2090
CGTAGCAGATGAGTAT
>bc2091
CGTGCTCGTCGAGTAT
>bc2092
ACAGCTGTACGAGTAT
>bc2093
TCGATGCTACGAGTAT
>bc2094
TAGATACAGCGAGTAT
>bc2095
CTACTCATACGAGTAT
>bc2096
ATGTACTAGTGAGTAT
3 changes: 2 additions & 1 deletion assets/test.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
assembly_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/assembly/Pyoeliiyoelii17XNL_assembly.fa
assembly_title: asccTinyTest
pacbio_multiplexing_barcode_names: something
pacbio_barcodes: /nfs/treeoflife-01/teams/tola/users/dp24/ascc/assets/pacbio_adaptors.fa
pacbio_multiplexing_barcode_names: "bc1008,bc1009"
pacbio_reads_path: /lustre/scratch123/tol/resources/treeval/treeval-testdata/asccTinyTest/pacbio/
sci_name: "Plasmodium yoelii yoelii 17XNL"
taxid: 352914
Expand Down
67 changes: 43 additions & 24 deletions bin/pacbio_barcode_check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
#!/usr/bin/env python3

"""
Notes: Forces sys.exit(1) to kill pipeline
Pacbio Barcode Check
------------------------
Looks for Pacbio barcodes in ref and data.
If any user-supplied barcodes aren't in the data then the pipeline dies.
If no barcodes are found then the pipeline carries on.
Based on a standard operating procedure developed by James Torrance
Originally written by Eerik Aunin @eeaunin
Expand All @@ -21,7 +30,9 @@ def detect_barcodes_from_read_file_names(barcodes_fasta_path, pacbio_read_files)
barcodes_fasta_data = gpf.l(barcodes_fasta_path)
barcode_names = [n.split(">")[1] for n in barcodes_fasta_data if n.startswith(">")]
if len(barcode_names) == 0:
print("NO BARCODES, KILL PIPELINE")
sys.stderr.write(
f"Failed to read PacBio multiplexing barcode names from the specified file {barcodes_fasta_path}\n"
)
sys.exit(1)
detected_barcodes = list()
for barcode_name in barcode_names:
Expand All @@ -40,44 +51,52 @@ def check_if_barcodes_exist_in_barcodes_fasta(barcodes_list, barcodes_fasta_path
barcode_names_in_fasta = [n.split(">")[1] for n in barcodes_fasta_data if n.startswith(">")]
for barcode in barcodes_list:
if barcode not in barcode_names_in_fasta:
# sys.stderr.write(f"The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})\n")
print("NO BARCODES, KILL PIPELINE")
sys.stderr.write(
f"The PacBio multiplexing barcode ({barcode}) was not found in the barcode sequences file ({barcodes_fasta_path})\n"
)
sys.exit(1)

# If this print statement is reached, all user-supplied codes are present.
print("The query barcodes exist in the barcodes database file")


def main(barcodes_fasta_path, pacbio_read_files, pacbio_multiplexing_barcode_names):
pacbio_read_files = pacbio_read_files.split(",")

barcodes_list = []
if pacbio_multiplexing_barcode_names != "NA":
barcodes_list = pacbio_multiplexing_barcode_names.split(",")
if len(pacbio_multiplexing_barcode_names) > 0:
barcodes_list = pacbio_multiplexing_barcode_names.strip("[").strip("]").split(",")

current_script_dir = os.path.dirname(sys.argv[0])

if barcodes_fasta_path is None:
barcodes_fasta_path = f"{current_script_dir}/third_party_files/pacbio_barcode_screen/pacbio_adaptors.fa"
else:
if os.path.isfile(barcodes_fasta_path) is False:
print("NO BARCODES, KILL PIPELINE")
sys.exit(1)
if os.path.isfile(barcodes_fasta_path) is False:
sys.stderr.write(
f"FASTA file with PacBio multiplexing barcode sequences ({barcodes_fasta_path}) was not found\n"
)
sys.exit(1)

if barcodes_list == []:
if len(barcodes_list) == 0:
barcodes_list = detect_barcodes_from_read_file_names(barcodes_fasta_path, pacbio_read_files)

# Here script should break successfully
if len(barcodes_list) == 0:
print("NO BARCODES, KILL PIPELINE")
sys.exit(1)
sys.stderr.write(
"Skipping the PacBio barcodes check, as no barcodes were specified by the user and no barcodes were found in PacBio read file names\n"
)
print(
"Skipping the PacBio barcodes check, as no barcodes were specified by the user and no barcodes were found in PacBio read file names\n"
)
sys.exit(0)

check_if_barcodes_exist_in_barcodes_fasta(
barcodes_list, barcodes_fasta_path
) # This is a TRUE | FALSE check, if FALSE kill pipeline.
print("BARCODES FOUND!")
check_if_barcodes_exist_in_barcodes_fasta(barcodes_list, barcodes_fasta_path)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("barcode_fasta", type=str, help="Pacbio Barcode FASTA file")
parser.add_argument("pacbio_reads", type=str, help="Pacbio Read FASTA.gz files")
parser.add_argument("multiplex_name", type=str, help="Pacbio Multiplex Barcode Name")
parser.add_argument("-b", "--barcode_fasta", type=str, help="Pacbio Barcode FASTA file")

parser.add_argument("-p", "--pacbio_reads", type=str, help="Pacbio Read FASTA.gz files")

parser.add_argument("-m", "--multiplex_name", type=str, help="Pacbio Multiplex Barcode Name")

parser.add_argument("-v", action="version", version="1.0")
args = parser.parse_args()
main(args.barcode_fasta, args.pacbio_reads, args.multiplex_name)
Expand Down
4 changes: 4 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ process {
ext.args = 'nucleotide'
}

withName: BLAST_MAKEBLASTDB {
ext.args = { "-dbtype nucl" }
}

withName: BLAST_BLASTN {
ext.args = { "-outfmt '6 qseqid staxids bitscore std' -max_target_seqs 10 -max_hsps 1 -evalue 1e-25 -dust yes -lcase_masking" }
}
Expand Down
Loading

0 comments on commit 35cda05

Please sign in to comment.