Skip to content

Commit

Permalink
fix: add three-prime-rule option to vcf->hgvs
Browse files Browse the repository at this point in the history
  • Loading branch information
nokara26 committed Feb 5, 2025
1 parent 2f5fc0d commit 125e86f
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 24 deletions.
65 changes: 43 additions & 22 deletions src/varity/vcf_to_hgvs.clj
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,16 @@
(some? (re-matches #"(NM_|ENST)\d+(\.\d+)?" (:name rg))))

(defn select-variant
[var seq-rdr rg]
[var seq-rdr rg & {:keys [three-prime-rule]}]
(if-let [nvar (normalize-variant var seq-rdr rg)]
(let [var-start-cds-coord (rg/cds-coord (:pos var) rg)
var-end-cds-coord (rg/cds-coord (+ (:pos var) (max (count (:ref var)) (count (:alt var)))) rg)
nvar-start-cds-coord (rg/cds-coord (:pos nvar) rg)
nvar-end-cds-coord (rg/cds-coord (+ (:pos nvar) (max (count (:ref nvar)) (count (:alt nvar)))) rg)]
(if (= (:region var-start-cds-coord) (:region nvar-start-cds-coord)
(:region var-end-cds-coord) (:region nvar-end-cds-coord))
nvar-end-cds-coord (rg/cds-coord (+ (:pos nvar) (max (count (:ref nvar)) (count (:alt nvar)))) rg)
restrict-cds (get three-prime-rule :restrict-cds true)]
(if (or (= (:region var-start-cds-coord) (:region nvar-start-cds-coord)
(:region var-end-cds-coord) (:region nvar-end-cds-coord))
(not restrict-cds))
nvar
var))
var))
Expand Down Expand Up @@ -119,7 +121,8 @@
(filter coding-dna-ref-gene?)
(map (fn [rg]
(assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg)
seq-rdr rg
:three-prime-rule (:three-prime-rule options))
:rg rg)))
(map (fn [{:keys [rg] :as m}]
(when (:verbose? options)
Expand All @@ -135,7 +138,8 @@
(let [options (merge default-options options)]
(if (valid-ref? seq-rdr chr pos ref)
(let [nv (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg)]
seq-rdr rg
:three-prime-rule (:three-prime-rule options))]
(when (:verbose? options)
(print-debug-info nv seq-rdr rg))
(coding-dna/->hgvs (assoc nv :rg rg) seq-rdr rg options))
Expand Down Expand Up @@ -189,7 +193,8 @@
(filter coding-dna-ref-gene?)
(map (fn [rg]
(assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg)
seq-rdr rg
:three-prime-rule (:three-prime-rule options))
:rg rg)))
(filter #(cds-affected? % (:rg %)))
(keep (fn [{:keys [rg] :as m}]
Expand All @@ -206,7 +211,8 @@
(let [options (merge default-options options)]
(if (valid-ref? seq-rdr chr pos ref)
(let [nv (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg)]
seq-rdr rg
:three-prime-rule (:three-prime-rule options))]
(when (cds-affected? nv rg)
(when (:verbose? options)
(print-debug-info nv seq-rdr rg))
Expand Down Expand Up @@ -264,15 +270,23 @@
(->> (rg/ref-genes chr pos rgidx (:tx-margin options))
(filter coding-dna-ref-gene?)
(map (fn [rg]
(assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg)
:rg rg)))
(map (fn [{:keys [rg] :as m}]
{:coding-dna (assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg
:three-prime-rule (get-in options [:three-prime-rule :coding-dna]))
:rg rg
:type :coding-dna)
:protein (assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg
:three-prime-rule (get-in options [:three-prime-rule :protein]))
:rg rg
:type :protein)}))
(map (fn [{:keys [coding-dna protein]}]
(when (:verbose? options)
(print-debug-info m seq-rdr rg))
{:coding-dna (coding-dna/->hgvs m seq-rdr rg options)
:protein (if (cds-affected? m rg)
(prot/->hgvs m seq-rdr rg options))}))
(print-debug-info coding-dna seq-rdr (:rg coding-dna))
(print-debug-info protein seq-rdr (:rg protein)))
{:coding-dna (coding-dna/->hgvs coding-dna seq-rdr (:rg coding-dna) options)
:protein (when (cds-affected? coding-dna (:rg coding-dna))
(prot/->hgvs protein seq-rdr (:rg protein) options))}))
distinct)
(throw (ex-info "ref is not found on the position."
{:type ::invalid-ref
Expand All @@ -282,14 +296,21 @@
[{:keys [pos ref alt]} seq-rdr {:keys [chr] :as rg} & [options]]
(let [options (merge default-options options)]
(if (valid-ref? seq-rdr chr pos ref)
(let [nv (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg)]
(let [nv (assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg
:three-prime-rule (get-in options [:three-prime-rule :coding-dna]))
:type :coding-dna)
pnv (assoc (select-variant {:chr chr, :pos pos, :ref ref, :alt alt}
seq-rdr rg
:three-prime-rule (get-in options [:three-prime-rule :protein]))
:type :protein)]
(when (:verbose? options)
(print-debug-info nv seq-rdr rg))

(print-debug-info nv seq-rdr rg)
(print-debug-info pnv seq-rdr rg))
(println (:three-prime-rule options))
{:coding-dna (coding-dna/->hgvs nv seq-rdr rg options)
:protein (if (cds-affected? nv rg)
(prot/->hgvs nv seq-rdr rg options))})
:protein (when (cds-affected? nv rg)
(prot/->hgvs pnv seq-rdr rg options))})
(throw (ex-info "ref is not found on the position."
{:type ::invalid-ref
:variant {:chr chr, :pos pos, :ref ref, :alt alt}})))))
41 changes: 39 additions & 2 deletions test/varity/vcf_to_hgvs_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,17 @@
;; tx-margin
"chr5" 1295113 "G" "A" {:tx-margin 5000} '("NM_001193376:c.-124C>T"
"NM_198253:c.-124C>T")
"chr5" 1295113 "G" "A" {:tx-margin 0} '())))
"chr5" 1295113 "G" "A" {:tx-margin 0} '()

;; three-prime-rule
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:restrict-cds true}} '("NM_006437:c.5169_*3delAGGCTAAGTC") ;; not actual example (-)
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:restrict-cds false}} '("NM_006437:c.5170_*4delGGCTAAGTCA") ;; not actual example (-)
"chr3" 53495165 "GGAT" "G" {:three-prime-rule {:restrict-cds true} :prefer-insertion? true :prefer-deletion? true} '("NM_000720:c.-1_2delGAT"
"NM_001128839:c.-1_2delGAT"
"NM_001128840:c.-1_2delGAT") ;; not actual example (+)
"chr3" 53495165 "GGAT" "G" {:three-prime-rule {:restrict-cds false} :prefer-insertion? true :prefer-deletion? true} '("NM_000720:c.20_22delTGA"
"NM_001128839:c.20_22delTGA"
"NM_001128840:c.20_22delTGA")))) ;; not actual example (+)
(cavia-testing "throws Exception if inputs are illegal"
(let [rgidx (rg/index (rg/load-ref-genes test-ref-gene-file))]
(is (thrown? Exception
Expand Down Expand Up @@ -350,10 +360,16 @@

;; prefer-extension-for-initial-codon-alt?
"chr10" 121593814 "CCATGGT" "C" {:prefer-extension-for-initial-codon-alt? true} '("p.M1Vext-17") ;; not actual example (-)
"chr2" 197434979 "AGTCTTGGCGATCTTCGCCATTTT" "A" {:prefer-extension-for-initial-codon-alt? true} '("p.M1Sext-?")))) ;; not actual example (-)
"chr2" 197434979 "AGTCTTGGCGATCTTCGCCATTTT" "A" {:prefer-extension-for-initial-codon-alt? true} '("p.M1Sext-?") ;; not actual example (-)
;; prefer-extension-for-initial-codon-alt?, initiation codon is altered to termination codon
"chr9" 27109592 "T" "TTTA" {:prefer-extension-for-initial-codon-alt? true} '("p.?") ;; not actual example (+)

;; three-prime-rule
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:restrict-cds true}} '("p.G1724Nfs*6") ;; not actual example (-)
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:restrict-cds false}} '("p.G1724delinsNETEF") ;; not actual example (-)
"chr3" 53495165 "GGAT" "G" {:three-prime-rule {:restrict-cds true} :prefer-insertion? true :prefer-deletion? true} '("p.?") ;; not actual example (+)
"chr3" 53495165 "GGAT" "G" {:three-prime-rule {:restrict-cds false} :prefer-insertion? true :prefer-deletion? true} '("p.M7del")))) ;; not actual example (+)

(cavia-testing "throws Exception if inputs are illegal"
(let [rgidx (rg/index (rg/load-ref-genes test-ref-gene-file))]
(is (thrown? Exception
Expand Down Expand Up @@ -403,3 +419,24 @@
"NR_024540"
"ENSP00000496776.1"
"ENSP00000496776")))

(defn- vcf-variant->hgvs-texts
[variant seq-rdr rgidx & [options]]
(map (fn [{:keys [coding-dna protein]}]
{:coding-dna (hgvs/format coding-dna {:show-bases? true
:range-format :coord})
:protein (hgvs/format protein {:amino-acid-format :short
:show-ter-site? true
:ter-format :short})})
(vcf-variant->hgvs variant seq-rdr rgidx options)))

(defslowtest vcf-variant->hgvs-test
(cavia-testing "options"
(let [rgidx (rg/index (rg/load-ref-genes test-ref-gene-file))]
(are [chr pos ref alt opts e]
(= (vcf-variant->hgvs-texts {:chr chr, :pos pos, :ref ref, :alt alt}
test-ref-seq-file rgidx (merge {:prefer-insertion? true
:prefer-deletion? true}
opts)) e)
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:coding-dna {:restrict-cds true} :protein {:restrict-cds true}}} '({:coding-dna "NM_006437:c.5169_*3delAGGCTAAGTC", :protein "p.G1724Nfs*6"}) ;; not actual example (-)
"chr13" 24421115 "TGACTTAGCCT" "T" {:three-prime-rule {:coding-dna {:restrict-cds false} :protein {:restrict-cds false}}} '({:coding-dna "NM_006437:c.5170_*4delGGCTAAGTCA", :protein "p.G1724delinsNETEF"}))))) ;; not actual example (-)

0 comments on commit 125e86f

Please sign in to comment.