-
Notifications
You must be signed in to change notification settings - Fork 28
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: new SC2 dataset with better coverage
- Loading branch information
1 parent
b22c9a8
commit a2c274d
Showing
9 changed files
with
195,213 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
16 changes: 16 additions & 0 deletions
16
data/datasets/sars-cov-2/references/MN908947/versions/2022-02-07T12:00:00Z/files/genemap.gff
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Gene map (genome annotation) of SARS-CoV-2 in GFF format. | ||
# For gene map purposes we only need some of the columns. We substitute unused values with "." as per GFF spec. | ||
# See GFF format reference at https://www.ensembl.org/info/website/upload/gff.html | ||
# seqname source feature start end score strand frame attribute | ||
. . gene 26245 26472 . + . gene_name=E | ||
. . gene 26523 27191 . + . gene_name=M | ||
. . gene 28274 29533 . + . gene_name=N | ||
. . gene 266 13468 . + . gene_name=ORF1a | ||
. . gene 13468 21555 . + . gene_name=ORF1b | ||
. . gene 25393 26220 . + . gene_name=ORF3a | ||
. . gene 27202 27387 . + . gene_name=ORF6 | ||
. . gene 27394 27759 . + . gene_name=ORF7a | ||
. . gene 27756 27887 . + . gene_name=ORF7b | ||
. . gene 27894 28259 . + . gene_name=ORF8 | ||
. . gene 28284 28577 . + . gene_name=ORF9b | ||
. . gene 21563 25384 . + . gene_name=S |
37 changes: 37 additions & 0 deletions
37
data/datasets/sars-cov-2/references/MN908947/versions/2022-02-07T12:00:00Z/files/primers.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
Country (Institute),Target,Oligonucleotide,Sequence | ||
Charité (Germany),RdRp,Charité_RdRp_F,GTGARATGGTCATGTGTGGCGG | ||
Charité (Germany),RdRp,Charité_S_RdRp_P,CAGGTGGAACCTCATCAGGAGATGC | ||
Charité (Germany),RdRp,Charité_RdRp_R,CARATGTTAAASACACTATTAGCATA | ||
Charité (Germany),E,Charité_E_F,ACAGGTACGTTAATAGTTAATAGCGT | ||
Charité (Germany),E,Charité_E_P,ACACTAGCCATCCTTACTGCGCTTCG | ||
Charité (Germany),E,Charité_E_R,ATATTGCAGCAGTACGCACACA | ||
Charité (Germany),N,Charité_N_F,CACATTGGCACCCGCAATC | ||
Charité (Germany),N,Charité_N_P,ACTTCCTCAAGGAACAACATTGCCA | ||
Charité (Germany),N,Charité_N_R,GAGGAACGAGAAGAGGCTTG | ||
HKU (Hong Kong),ORF1b-nsp14,HKU_ORF_F,TGGGGYTTTACRGGTAACCT | ||
HKU (Hong Kong),ORF1b-nsp14,HKU_ORF_P,TAGTTGTGATGCWATCATGACTAG | ||
HKU (Hong Kong),ORF1b-nsp14,HKU_ORF_R,AACRCGCTTAACAAAGCACTC | ||
HKU (Hong Kong),N,HKU_N_F,TAATCAGACAAGGAACTGATTA | ||
HKU (Hong Kong),N,HKU_N_P,GCAAATTGTGCAATTTGCGG | ||
HKU (Hong Kong),N,HKU_N_R,CGAAGGTGTGACTTCCATG | ||
China CDC (China),N,ChinaCDC_N_F,GGGGAACTTCTCCTGCTAGAAT | ||
China CDC (China),N,ChinaCDC_N_P,TTGCTGCTGCTTGACAGATT | ||
China CDC (China),N,ChinaCDC_N_R,CAGACATTTTGCTCTCAAGCTG | ||
China CDC (China),ORF1ab-nsp10,ChinaCDC_ORF_F,CCCTGTGGGTTTTACACTTAA | ||
China CDC (China),ORF1ab-nsp10,ChinaCDC_ORF_P,CCGTCTGCGGTATGTGGAAAGGTTATGG | ||
China CDC (China),ORF1ab-nsp10,ChinaCDC_ORF_R,ACGATTGTGCATCAGCTGA | ||
US CDC (United States),N1,USCDC_N1_F,GACCCCAAAATCAGCGAAAT | ||
US CDC (United States),N1,USCDC_N1_P,ACCCCGCATTACGTTTGGTGGACC | ||
US CDC (United States),N1,USCDC_N1_R,TCTGGTTACTGCCAGTTGAATCTG | ||
US CDC (United States),N2,USCDC_N2_F,TTACAAACATTGGCCGCAAA | ||
US CDC (United States),N2,USCDC_N2_P,ACAATTTGCCCCCAGCGCTTCAG | ||
US CDC (United States),N2,USCDC_N2_R,GCGCGACATTCCGAAGAA | ||
US CDC (United States),N3,USCDC_N3_F,GGGAGCCTTGAATACACCAAAA | ||
US CDC (United States),N3,USCDC_N3_P,AYCACATTGGCACCCGCAATCCTG | ||
US CDC (United States),N3,USCDC_N3_R,TGTAGCACGATTGCAGCATTG | ||
"Institut Pasteur, Paris (France)",RdRp,Pasteur_IP2_F,ATGAGCTTAGTCCTGTTG | ||
"Institut Pasteur, Paris (France)",RdRp,Pasteur_IP2_P,AGATGTCTTGTGCTGCCGGTA | ||
"Institut Pasteur, Paris (France)",RdRp,Pasteur_IP2_R,CTCCCTTTGTTGTGTTGT | ||
"Institut Pasteur, Paris (France)",RdRp,Pasteur_IP4_F,GGTAACTGGTATGATTTCG | ||
"Institut Pasteur, Paris (France)",RdRp,Pasteur_IP4_P,TCATACAAACCACGCCAGG | ||
"Institut Pasteur, Paris (France)",RdRp,Pasteur_IP4_R,CTGGTCAAGGTTAATATAGG |
94 changes: 94 additions & 0 deletions
94
data/datasets/sars-cov-2/references/MN908947/versions/2022-02-07T12:00:00Z/files/qc.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
{ | ||
"schemaVersion": "1.2.0", | ||
"privateMutations": { | ||
"enabled": true, | ||
"typical": 8, | ||
"cutoff": 24, | ||
"weightLabeledSubstitutions": 4, | ||
"weightReversionSubstitutions": 6, | ||
"weightUnlabeledSubstitutions": 1 | ||
}, | ||
"missingData": { | ||
"enabled": true, | ||
"missingDataThreshold": 2700, | ||
"scoreBias": 300 | ||
}, | ||
"snpClusters": { | ||
"enabled": true, | ||
"windowSize": 100, | ||
"clusterCutOff": 6, | ||
"scoreWeight": 50 | ||
}, | ||
"mixedSites": { | ||
"enabled": true, | ||
"mixedSitesThreshold": 10 | ||
}, | ||
"frameShifts": { | ||
"enabled": true, | ||
"ignoredFrameShifts": [ | ||
{ "geneName": "ORF3a", "codonRange": {"begin": 256, "end": 276 } }, | ||
{ "geneName": "ORF3a", "codonRange": {"begin": 258, "end": 276 } }, | ||
{ "geneName": "ORF6", "codonRange": {"begin": 21, "end": 62 } }, | ||
{ "geneName": "ORF6", "codonRange": {"begin": 29, "end": 62 } }, | ||
{ "geneName": "ORF6", "codonRange": {"begin": 30, "end": 62 } }, | ||
{ "geneName": "ORF6", "codonRange": {"begin": 37, "end": 62 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 53, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 61, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 62, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 63, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 64, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 65, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 68, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 71, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 74, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 75, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 76, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 77, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 78, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 80, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 81, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 82, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 84, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 100, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 101, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 102, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 103, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 109, "end": 122 } }, | ||
{ "geneName": "ORF7a", "codonRange": {"begin": 116, "end": 122 } }, | ||
{ "geneName": "ORF7b", "codonRange": {"begin": 41, "end": 44 } }, | ||
{ "geneName": "ORF7b", "codonRange": {"begin": 42, "end": 44 } }, | ||
{ "geneName": "ORF8", "codonRange": {"begin": 45, "end": 122 } }, | ||
{ "geneName": "ORF8", "codonRange": {"begin": 67, "end": 122 } }, | ||
{ "geneName": "ORF8", "codonRange": {"begin": 107, "end": 122 } }, | ||
{ "geneName": "ORF8", "codonRange": {"begin": 118, "end": 120 } }, | ||
{ "geneName": "ORF8", "codonRange": {"begin": 119, "end": 122 } }, | ||
{ "geneName": "ORF8", "codonRange": {"begin": 120, "end": 122 } } | ||
] | ||
}, | ||
"stopCodons": { | ||
"enabled": true, | ||
"ignoredStopCodons": [ | ||
{ "geneName": "ORF3a", "codon": 253 }, | ||
{ "geneName": "ORF7a", "codon": 37 }, | ||
{ "geneName": "ORF7a", "codon": 40 }, | ||
{ "geneName": "ORF7a", "codon": 61 }, | ||
{ "geneName": "ORF7a", "codon": 75 }, | ||
{ "geneName": "ORF7a", "codon": 76 }, | ||
{ "geneName": "ORF7a", "codon": 89 }, | ||
{ "geneName": "ORF7a", "codon": 93 }, | ||
{ "geneName": "ORF7a", "codon": 94 }, | ||
{ "geneName": "ORF7a", "codon": 120 }, | ||
{ "geneName": "ORF7b", "codon": 2 }, | ||
{ "geneName": "ORF7b", "codon": 32 }, | ||
{ "geneName": "ORF7b", "codon": 38 }, | ||
{ "geneName": "ORF8", "codon": 17 }, | ||
{ "geneName": "ORF8", "codon": 18 }, | ||
{ "geneName": "ORF8", "codon": 26 }, | ||
{ "geneName": "ORF8", "codon": 58 }, | ||
{ "geneName": "ORF8", "codon": 63 }, | ||
{ "geneName": "ORF8", "codon": 67 }, | ||
{ "geneName": "ORF8", "codon": 105 }, | ||
{ "geneName": "ORF8", "codon": 109 } | ||
] | ||
} | ||
} |
Oops, something went wrong.