From 69ee8aae8c3262a146b9d612d6601b11e02b80f8 Mon Sep 17 00:00:00 2001 From: Nobuaki Karasawa Date: Tue, 10 Sep 2024 10:41:11 +0900 Subject: [PATCH] fix: add frameshift and extension conditional branch to indel and related fns --- src/varity/vcf_to_hgvs/protein.clj | 105 +++++++++++++++++------ test/varity/vcf_to_hgvs/protein_test.clj | 53 ++++++++++++ test/varity/vcf_to_hgvs_test.clj | 32 ++++++- 3 files changed, 163 insertions(+), 27 deletions(-) diff --git a/src/varity/vcf_to_hgvs/protein.clj b/src/varity/vcf_to_hgvs/protein.clj index 7791b8b..d920ce0 100644 --- a/src/varity/vcf_to_hgvs/protein.clj +++ b/src/varity/vcf_to_hgvs/protein.clj @@ -157,6 +157,22 @@ alt-positions (set (range pos (inc end-pos)))] (boolean (seq (s/intersection ter-site-positions alt-positions))))) +(defn- ref-include-from-ter-start-and-over-ter-end? + [{:keys [strand cds-start cds-end]} pos ref alt] + (let [[del _ offset _] (diff-bases ref alt) + pos (+ pos offset) + ndel (count del) + ter-start-pos (if (= strand :forward) + (- cds-end 2) + (+ cds-start 2)) + ter-end-pos (if (= strand :forward) + cds-end + cds-start) + pos-end (+ pos (if (= ndel 0) 0 (dec ndel)))] + (if (= strand :forward) + (and (= pos ter-start-pos) (<= ter-end-pos pos-end)) + (and (= pos-end ter-start-pos) (<= pos ter-end-pos))))) + (defn- ter-site-same-pos? [ref-prot-seq alt-prot-seq] (let [ter-site-pos (dec (count ref-prot-seq))] @@ -182,11 +198,29 @@ (< cds-end pos) (<= (inc cds-end) pos pos-end)))) +(defn- cds-variant? + [cds-start cds-end pos ref alt] + (let [[del _ offset _] (diff-bases ref alt) + ndel (count del) + pos (+ pos offset) + pos-end (+ pos (if (= ndel 0) 0 (dec ndel)))] + (if (is-insertion-variant? ref alt) + (and (< cds-start pos) (<= pos cds-end)) + (<= cds-start pos pos-end cds-end)))) + (defn- utr-variant? [cds-start cds-end pos ref alt] (or (cds-start-upstream? cds-start pos ref alt) (cds-end-downstream? cds-end pos ref alt))) +(defn- frameshift-within-cds? + [{:keys [cds-start cds-end]} pos ref alt] + (let [[del ins _ _] (diff-bases ref alt) + ndel (count del) + nins (count ins)] + (and (cds-variant? cds-start cds-end pos ref alt) + (not= 0 (rem (abs (- ndel nins)) 3))))) + (defn- get-alt-cds-start-pos [cds-start pos-start pos-end exon-ranges pos*] (let [[_ exon-end] (first (filter (fn [[s e]] (<= s pos* e)) exon-ranges)) @@ -246,6 +280,8 @@ ref-cds-exon-seq (exon-sequence ref-cds-seq cds-start cds-end exon-ranges) ref-include-utr-ini-site-boundary (include-utr-ini-site-boundary? rg pos ref alt) ref-include-ter-site (include-ter-site? rg pos ref alt) + ref-include-from-ter-start-and-over-ter-end (ref-include-from-ter-start-and-over-ter-end? rg pos ref alt) + frameshift-within-cds (frameshift-within-cds? rg pos ref alt) alt-seq (common/alt-sequence ref-seq tx-start pos ref alt) alt-exon-ranges* (alt-exon-ranges exon-ranges pos ref alt) apply-offset* (partial apply-offset pos ref alt cds-start cds-end exon-ranges) @@ -283,6 +319,8 @@ :tx-end alt-tx-end)) :ref-include-utr-ini-site-boundary ref-include-utr-ini-site-boundary :ref-include-ter-site ref-include-ter-site + :ref-include-from-ter-start-and-over-ter-end ref-include-from-ter-start-and-over-ter-end + :frameshift-within-cds frameshift-within-cds :utr-variant (utr-variant? cds-start cds-end pos ref alt)}))) (defn- protein-position @@ -335,7 +373,8 @@ (defn- ->protein-variant [{:keys [strand] :as rg} pos ref alt - {:keys [ref-exon-seq ref-prot-seq alt-exon-seq alt-rg ref-include-ter-site utr-variant] :as seq-info} + {:keys [ref-exon-seq ref-prot-seq alt-exon-seq alt-rg ref-include-ter-site + ref-include-from-ter-start-and-over-ter-end utr-variant] :as seq-info} {:keys [prefer-deletion? prefer-insertion? prefer-extension-for-initial-codon-alt?]}] (cond (:overlap-exon-intron-boundary seq-info) @@ -376,7 +415,9 @@ pref-only palt-only) t (cond - (= (+ base-ppos offset) (count ref-prot-seq)) (if (= "" pref-only palt-only) + ref-include-from-ter-start-and-over-ter-end :frame-shift + (= (+ base-ppos offset) (count ref-prot-seq)) (if (and (= "" pref-only palt-only) + (ter-site-same-pos? ref-prot-seq alt-prot-seq*)) :no-effect :extension) (= (+ base-ppos offset) 1) (if (or (= ref-prot-rest alt-prot-rest) @@ -391,7 +432,7 @@ (= palt (subs pref 0 (count palt)))) (= (first palt-only) \*)) :fs-ter-substitution ref-include-ter-site :indel - (first-diff-aa-is-ter-site? ppos + (first-diff-aa-is-ter-site? base-ppos ref-prot-seq alt-prot-seq*) :extension :else :frame-shift) @@ -480,12 +521,15 @@ alt-repeat))) (defn- protein-frame-shift - [ppos {:keys [ref-prot-seq alt-prot-seq ref-include-utr-ini-site-boundary] :as seq-info}] - (let [{:keys [ppos pref palt]} (get-first-diff-aa-info ppos ref-prot-seq alt-prot-seq) - [_ _ offset _] (diff-bases pref palt) + [ppos {:keys [ref-prot-seq ref-include-utr-ini-site-boundary + ref-include-from-ter-start-and-over-ter-end] :as seq-info}] + (let [alt-prot-seq* (format-alt-prot-seq seq-info) + {:keys [ppos pref palt]} (get-first-diff-aa-info ppos ref-prot-seq alt-prot-seq*) + [_ _ offset _] (diff-bases (or pref "") (or palt "")) + ppos (if ref-include-from-ter-start-and-over-ter-end (count ref-prot-seq) ppos) alt-prot-seq* (format-alt-prot-seq seq-info) ref (nth ref-prot-seq (dec (+ ppos offset))) - alt (nth alt-prot-seq (dec (+ ppos offset))) + alt (nth alt-prot-seq* (dec (+ ppos offset))) ter-site (-> seq-info format-alt-prot-seq (subs (dec (+ ppos offset))) @@ -511,7 +555,7 @@ (if (and (not= ppos 1) (ter-site-same-pos? ref-prot-seq c-ter-adjusted-alt-prot-seq)) (mut/protein-no-effect) - (let [[_ ins offset _] (diff-bases pref palt) + (let [[_ ins offset _] (diff-bases (or pref "") (or palt "")) alt-prot-seq* (format-alt-prot-seq seq-info) ini-site ((comp str first) ref-prot-seq) first-diff-aa-info (if (= ppos 1) @@ -538,40 +582,49 @@ (coord/unknown-coordinate)))))) (defn- protein-indel - [ppos pref palt {:keys [ref-prot-seq c-ter-adjusted-alt-prot-seq ref-include-ter-site] :as seq-info}] - (let [[pref palt ppos] (if ref-include-ter-site - (let [{adjusted-ppos :ppos} (get-first-diff-aa-info ppos ref-prot-seq c-ter-adjusted-alt-prot-seq) - ppos (or adjusted-ppos ppos) - get-seq-between-pos-ter-site (fn [seq pos] - (-> (subs seq (dec pos)) - (string/split #"\*") - first)) - pref (get-seq-between-pos-ter-site ref-prot-seq ppos) - palt (get-seq-between-pos-ter-site c-ter-adjusted-alt-prot-seq ppos)] - [pref palt ppos]) - [pref palt ppos]) - [del ins offset _] (diff-bases pref palt) + [ppos pref palt {:keys [ref-prot-seq c-ter-adjusted-alt-prot-seq + ref-include-ter-site frameshift-within-cds] :as seq-info}] + (let [[pref* palt* ppos*] (if ref-include-ter-site + (let [{adjusted-ppos :ppos} (get-first-diff-aa-info ppos ref-prot-seq c-ter-adjusted-alt-prot-seq) + ppos (or adjusted-ppos ppos) + get-seq-between-pos-ter-site (fn [seq pos] + (-> (subs seq (dec pos)) + (string/split #"\*") + first)) + pref (get-seq-between-pos-ter-site ref-prot-seq ppos) + palt (get-seq-between-pos-ter-site c-ter-adjusted-alt-prot-seq ppos)] + [pref palt ppos]) + [pref palt ppos]) + [del ins offset _] (diff-bases (or pref* "") (or palt* "")) ndel (count del) alt-retain-ter-site? (if ref-include-ter-site - (string/includes? (subs c-ter-adjusted-alt-prot-seq (dec ppos)) "*") + (string/includes? (subs c-ter-adjusted-alt-prot-seq (dec ppos*)) "*") true)] (cond + (first-diff-aa-is-ter-site? ppos* + ref-prot-seq + c-ter-adjusted-alt-prot-seq) + (protein-extension ppos* pref* palt* seq-info) + + frameshift-within-cds + (protein-frame-shift ppos* seq-info) + (every? empty? [del ins]) (mut/protein-no-effect) (empty? del) - (protein-insertion ppos pref palt seq-info) + (protein-insertion ppos* pref* palt* seq-info) (empty? ins) - (protein-deletion ppos pref palt) + (protein-deletion ppos* pref* palt*) alt-retain-ter-site? (mut/protein-indel (mut/->long-amino-acid (first del)) - (coord/protein-coordinate (+ ppos offset)) + (coord/protein-coordinate (+ ppos* offset)) (when (> ndel 1) (mut/->long-amino-acid (last del))) (when (> ndel 1) - (coord/protein-coordinate (+ ppos offset ndel -1))) + (coord/protein-coordinate (+ ppos* offset ndel -1))) (->> (seq ins) (map mut/->long-amino-acid))) :else (mut/protein-unknown-mutation)))) diff --git a/test/varity/vcf_to_hgvs/protein_test.clj b/test/varity/vcf_to_hgvs/protein_test.clj index a747af8..723835e 100644 --- a/test/varity/vcf_to_hgvs/protein_test.clj +++ b/test/varity/vcf_to_hgvs/protein_test.clj @@ -150,6 +150,26 @@ true? reverse-rg 9 "AACT" "AG" false? reverse-rg 8 "AG" "A"))) +(deftest ref-include-from-ter-start-and-over-ter-end?-test + (let [cds-start 10 + cds-end 21] + (are [p strand pos ref alt] (p (#'prot/ref-include-from-ter-start-and-over-ter-end? {:strand strand + :cds-start cds-start + :cds-end cds-end} + pos + ref + alt)) + true? :forward 18 "ATAA" "A" + true? :forward 18 "ATAAG" "AGC" + true? :reverse 9 "GTCA" "G" + true? :reverse 8 "CGTCA" "CTG" + false? :forward 18 "A" "T" + false? :forward 19 "T" "TCCCT" + false? :forward 18 "ATA" "A" + false? :reverse 10 "T" "A" + false? :reverse 11 "C" "CATG" + false? :reverse 8 "CGTC" "C"))) + (deftest ter-site-same-pos?-test (are [p ref alt] (p (#'prot/ter-site-same-pos? ref alt)) true? "MTGA*" "MTGA*" @@ -157,6 +177,22 @@ false? "MTGA*" "MTGAQCT*" false? "MTGA*" "MTGA")) +(deftest cds-variant?-test + (let [cds-start 10 + cds-end 21] + (are [p pos ref alt] (p (#'prot/cds-variant? cds-start cds-end pos ref alt)) + true? 10 "A" "A" + true? 10 "A" "ACT" + true? 9 "GATG" "G" + true? 10 "ATG" "AGC" + true? 21 "A" "T" + true? 20 "A" "ATC" + true? 20 "AA" "A" + false? 8 "CGA" "CTT" + false? 9 "C" "CGGT" + false? 20 "AAG" "ATC" + false? 21 "A" "AGC"))) + (deftest utr-variant?-test (let [cds-start 10 cds-end 21] @@ -185,6 +221,23 @@ false? 20 "GA" "GGCT" false? 20 "TATA" "TCG"))) +(deftest frameshift-within-cds?-test + (let [cds-start 10 + cds-end 21] + (are [p pos ref alt] (p (#'prot/frameshift-within-cds? {:cds-start cds-start + :cds-end cds-end} + pos + ref + alt)) + true? 13 "T" "TC" + true? 15 "AGCTC" "A" + true? 18 "GC" "GGA" + false? 15 "A" "T" + false? 11 "T" "TGCT" + false? 10 "ATGGCTC" "A" + false? 8 "GTATG" "G" + false? 20 "GTAAC" "GCTTA"))) + (deftest get-alt-cds-start-pos-test (let [cds-start 40 exon-ranges [[10 50] [80 120] [150 200]] diff --git a/test/varity/vcf_to_hgvs_test.clj b/test/varity/vcf_to_hgvs_test.clj index 0315506..d04341a 100644 --- a/test/varity/vcf_to_hgvs_test.clj +++ b/test/varity/vcf_to_hgvs_test.clj @@ -221,8 +221,20 @@ "chr2" 47445589 "CTTACTGAT" "CCC" '("p.L440_D442delinsP" "p.L374_D376delinsP") ; cf. rs63749931 (+) "chr1" 152111364 "TGC" "TCG" '("p.E617_Q618delinsDE") ; cf. rs35444647 (-) - ;; indel include stop codon deletion + ;; indel includes stop codon deletion "chr8" 116847497 "TCCTTATATAATATGGAACCTTGGTCCAGGTGTTGCGATGATGTCACTGTA" "T" '("p.Y617_I631delinsS") + "chr17" 43045678 "TCAGTAG" "T" '("p.H758delinsQLQPATGTEPQDPKNELTKWPFQALGAPLTLQSFYCPG" + "p.H1815delinsQLQPATGTEPQDPKNELTKWPFQALGAPLTLQSFYCPG" + "p.H1862delinsQLQPATGTEPQDPKNELTKWPFQALGAPLTLQSFYCPG" + "p.H1883delinsQLQPATGTEPQDPKNELTKWPFQALGAPLTLQSFYCPG") ; not actual example (-) + "chr17" 43045679 "CAGTAGT" "C" '("p.H758delinsRLQPATGTEPQDPKNELTKWPFQALGAPLTLQSFYCPG" + "p.H1815delinsRLQPATGTEPQDPKNELTKWPFQALGAPLTLQSFYCPG" + "p.H1862delinsRLQPATGTEPQDPKNELTKWPFQALGAPLTLQSFYCPG" + "p.H1883delinsRLQPATGTEPQDPKNELTKWPFQALGAPLTLQSFYCPG") ; not actual example (-) + "chr10" 87965467 "GTCT" "G" '("p.V403delinsGIFFYQEG" "p.V576delinsGIFFYQEG" "p.V206delinsGIFFYQEG") ; not actual example (+) + "chr10" 87965464 "AAAGTCT" "A" '("p.K402_V403delinsRIFFYQEG" "p.K575_V576delinsRIFFYQEG" "p.K205_V206delinsRIFFYQEG") ; not actual example (+) + "chr13" 24421120 "TAGC" "T" '("p.G1724delinsEVK") ; not actual example (-) + "chr13" 24421120 "TAGCCTT" "T" '("p.G1724delinsVK") ; not actual example (-) ;; repeated sequences "chr1" 47438996 "T" "TCCGCAC" '("p.P286_H287[5]") ; cf. rs3046924 (+) @@ -236,6 +248,8 @@ "p.S1415Ifs*2") ; https://github.com/chrovis/varity/issues/58 "chr17" 31261816 "CC" "C" '("p.N1542Tfs*11" "p.N1563Tfs*11") ; cf. rs1555619041 (+) "chr1" 16138274 "CG" "C" '("p.R327Dfs*66") + "chr3" 149520807 "ACAGC" "A" '("p.W399Cfs*62") ; not actual example (-) + ;; initiation site is affected in DNA level but initiation site is not changed in protein level "chr7" 55019279 "TGC" "T" '("p.R2Tfs*34") ; not actual example (+) "chr11" 32396363 "C" "CGACCGTACAA" '("p.A170Cfs*6" "p.A153Cfs*6" "p.A365Cfs*6" "p.A382Cfs*6") ; not actual example (-) @@ -244,17 +258,33 @@ "chr17" 81537070 "G" "GTA" '("p.W514Cfs*?" "p.W490Cfs*?") ; not actual example (+) "chr17" 9771493 "CCT" "C" '("p.E310Gfs*?" "p.E311Gfs*?") ; not actual example (-) + ;; Frame shift includes termination site deletion + "chr17" 43045679 "CAGTAG" "C" '("p.H758Qfs*16" "p.H1815Qfs*16" "p.H1862Qfs*16" "p.H1883Qfs*16") ;; not actual example (-) + "chr13" 24421120 "TAGCC" "T" '("p.G1724Kfs*8") ; not actual example (-) + "chr13" 24421120 "TAGCCT" "T" '("p.G1724Sfs*36") ; not actual example (-) + "chr13" 24421120 "TAGCCTTG" "T" '("p.Q1723Kfs*8") ; not actual example (-) + "chr10" 87965466 "AGTCT" "A" '("p.V403Efs*12" "p.V576Efs*12" "p.V206Efs*12") ; not actual example (+) + "chr10" 87965465 "AAGTCT" "A" '("p.V403Nfs*17" "p.V576Nfs*17" "p.V206Nfs*17") ; not actual example (+) + "chr10" 87965463 "AAAAGTCT" "A" '("p.K402Efs*12" "p.K575Efs*12" "p.K205Efs*12") ; not actual example (+) + ;; Extension "chr2" 188974490 "A" "C" '("p.M1Lext-23") "chr2" 189011772 "T" "C" '("p.*1467Qext*45") ; cf. ClinVar 101338 "chr11" 125655318 "TGA" "TAT" '("p.*477Yext*17" "p.*443Yext*17" "p.*477Yext*24") "chr10" 8074014 "C" "CATGGGTT" '("p.*445Yext*64" "p.*444Yext*64") ; not actual example (+) + "chr10" 87965468 "TC" "T" '("p.*404Eext*11" "p.*577Eext*11" "p.*207Eext*11") ; not actual example (+) ;; NOTE: There are very few correct examples... ;; Extension without termination site "chr17" 81537077 "CT" "C" '("p.*517Eext*?" "p.*493Eext*?") ; not actual example (+) "chr17" 9771487 "GT" "G" '("p.*312Yext*?" "p.*313Yext*?") ; not actual example (-) + ;; Extension includes termination site deletion + "chr13" 24421117 "ACTT" "A" '("p.*1725Fext*2") ; not actual example (-) + "chr13" 24421116 "GACTT" "G" '("p.*1725Sext*6") ; not actual example (-) + "chr13" 24421117 "ACT" "A" '("p.*1725Yext*35") ; not actual example (-) + "chr13" 24421116 "GACT" "G" '("p.*1725Yext*2") ; not actual example (-) + ;; no effect "chr7" 55181876 "A" "T" '("p.=") ; not actual example (+) "chr7" 55181874 "TGAT" "T" '("p.=") ; not actual example (+)