Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix upstream and downstream sequence of sequence-info and tweak frameshift process #80

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 88 additions & 20 deletions src/varity/vcf_to_hgvs/protein.clj
Original file line number Diff line number Diff line change
Expand Up @@ -79,38 +79,106 @@
(exon-sequence (cseq/read-sequence seq-rdr {:chr chr, :start start, :end end})
start end exon-ranges))

(defn- is-deletion-variant?
[ref alt]
(or (and (= 1 (count alt))
(= (first ref) (first alt)))
(and (not (= 1 (count ref) (count alt)))
(not= (first ref) (first alt)))))

(defn- cds-start-upstream-to-cds-variant?
[cds-start pos ref]
(and (< pos cds-start)
(<= cds-start (dec (+ pos (count ref))))))

(defn- cds-to-cds-end-downstream-variant?
[cds-end pos ref]
(and (<= pos cds-end)
(< cds-end (dec (+ pos (count ref))))))

(defn- make-alt-up-exon-seq
[ref-up-exon-seq cds-start pos ref alt]
(let [is-deletion (is-deletion-variant? ref alt)
alt-up-exon-seq (if (cds-start-upstream-to-cds-variant? cds-start pos ref)
(let [offset (if is-deletion
(- cds-start pos)
0)]
(string/join "" (drop-last offset ref-up-exon-seq)))
ref-up-exon-seq)]
(subs alt-up-exon-seq (mod (count alt-up-exon-seq) 3))))

(defn- make-alt-down-exon-seq
[ref-down-exon-seq cds-end pos ref alt]
(let [is-deletion (is-deletion-variant? ref alt)
ref-end (dec (+ pos (count ref)))
alt-down-exon-seq (if (cds-to-cds-end-downstream-variant? cds-end pos ref)
(let [offset (if is-deletion
(- ref-end cds-end)
0)]
(string/join "" (drop offset ref-down-exon-seq)))
ref-down-exon-seq)
nalt-down-exon-seq (count alt-down-exon-seq)]
(subs alt-down-exon-seq 0 (- nalt-down-exon-seq (mod nalt-down-exon-seq 3)))))

(defn- make-stop-codon-adjusted-alt-seq
[alt-exon-seq alt-up-exon-seq alt-down-exon-seq strand cds-start cds-end pos ref]
(cond
(and (= strand :reverse)
(cds-start-upstream-to-cds-variant? cds-start pos ref))
(str alt-up-exon-seq alt-exon-seq)

(and (= strand :forward)
(cds-to-cds-end-downstream-variant? cds-end pos ref))
(str alt-exon-seq alt-down-exon-seq)

:else
alt-exon-seq))

(defn- get-pos-exon-end-tuple
[pos exon-ranges]
(let [[_ exon-end] (first (filter (fn [[s e]] (<= s pos e)) exon-ranges))]
[pos exon-end]))

(defn- read-sequence-info
[seq-rdr rg pos ref alt]
(let [{:keys [chr tx-start tx-end cds-start cds-end exon-ranges strand]} rg
ref-seq (cseq/read-sequence seq-rdr {:chr chr, :start cds-start, :end cds-end})
alt-seq (common/alt-sequence ref-seq cds-start pos ref alt)
alt-exon-ranges* (alt-exon-ranges exon-ranges pos ref alt)
ref-exon-seq1 (exon-sequence ref-seq cds-start exon-ranges)
ref-up-exon-seq1 (read-exon-sequence seq-rdr chr tx-start (dec cds-start) exon-ranges)
ref-up-exon-seq1 (subs ref-up-exon-seq1 (mod (count ref-up-exon-seq1) 3))
ref-down-exon-seq1 (read-exon-sequence seq-rdr chr (inc cds-end) tx-end exon-ranges)
nref-down-exon-seq1 (count ref-down-exon-seq1)
ref-down-exon-seq1 (subs ref-down-exon-seq1 0 (- nref-down-exon-seq1 (mod nref-down-exon-seq1 3)))
alt-exon-seq1 (exon-sequence alt-seq cds-start alt-exon-ranges*)
ref-exon-seq (exon-sequence ref-seq cds-start exon-ranges)
ref-up-exon-seq (read-exon-sequence seq-rdr chr tx-start (dec cds-start) exon-ranges)
alt-up-exon-seq (make-alt-up-exon-seq ref-up-exon-seq cds-start pos ref alt)
ref-down-exon-seq (read-exon-sequence seq-rdr chr (inc cds-end) tx-end exon-ranges)
alt-down-exon-seq (make-alt-down-exon-seq ref-down-exon-seq cds-start pos ref alt)
alt-exon-seq (exon-sequence alt-seq cds-start alt-exon-ranges*)
stop-codon-adjusted-alt-seq (make-stop-codon-adjusted-alt-seq alt-exon-seq alt-up-exon-seq alt-down-exon-seq
strand cds-start cds-end pos ref)
apply-offset #(or (ffirst (alt-exon-ranges [[% %]] pos ref alt))
(some (fn [[_ e]] (when (<= e %) e)) (reverse alt-exon-ranges*)))]
{:ref-exon-seq ref-exon-seq1
:ref-prot-seq (codon/amino-acid-sequence (cond-> ref-exon-seq1
(some (fn [[_ e]] (when (<= e %) e)) (reverse alt-exon-ranges*))
(-> [(get-pos-exon-end-tuple % alt-exon-ranges*)]
(alt-exon-ranges pos ref alt)
ffirst))]
{:ref-exon-seq ref-exon-seq
:ref-prot-seq (codon/amino-acid-sequence (cond-> ref-exon-seq
(= strand :reverse) util-seq/revcomp))
:alt-exon-seq alt-exon-seq1
:alt-prot-seq (codon/amino-acid-sequence (cond-> alt-exon-seq1
:alt-exon-seq alt-exon-seq
:alt-prot-seq (codon/amino-acid-sequence (cond-> alt-exon-seq
(= strand :reverse) util-seq/revcomp))
:alt-tx-prot-seq (codon/amino-acid-sequence
(cond-> (str ref-up-exon-seq1 alt-exon-seq1 ref-down-exon-seq1)
(cond-> (str alt-up-exon-seq alt-exon-seq alt-down-exon-seq)
(= strand :reverse) util-seq/revcomp))
:ini-offset (quot (:position (rg/cds-coord (case strand
:forward tx-start
:reverse tx-end) rg))
:ini-offset (quot (count (case strand
:forward alt-up-exon-seq
:reverse alt-down-exon-seq))
3)
:c-ter-adjusted-alt-prot-seq (codon/amino-acid-sequence
(cond-> stop-codon-adjusted-alt-seq
(= strand :reverse) util-seq/revcomp))
:alt-rg (-> rg
(assoc :exon-ranges alt-exon-ranges*)
(update :cds-start apply-offset)
(update :cds-end apply-offset))}))
(update :cds-end apply-offset)
(update :tx-end apply-offset))}))

(defn- protein-position
"Converts genomic position to protein position. If pos is outside of CDS,
Expand Down Expand Up @@ -294,8 +362,8 @@
alt-repeat)))

(defn- protein-frame-shift
[ppos seq-info]
(let [[ppos pref palt] (->> (map vector (:ref-prot-seq seq-info) (:alt-prot-seq seq-info))
[ppos {:keys [ref-prot-seq c-ter-adjusted-alt-prot-seq] :as seq-info}]
(let [[ppos pref palt] (->> (map vector ref-prot-seq c-ter-adjusted-alt-prot-seq)
(drop (dec ppos))
(map-indexed vector)
(drop-while (fn [[_ [r a]]] (= r a)))
Expand All @@ -304,7 +372,7 @@
[(+ ppos i) (str r) (str a)])))
[_ _ offset _] (diff-bases pref palt)
alt-prot-seq (format-alt-prot-seq seq-info)
ref (nth (:ref-prot-seq seq-info) (dec (+ ppos offset)))
ref (nth ref-prot-seq (dec (+ ppos offset)))
alt (nth alt-prot-seq (dec (+ ppos offset)))
ter-site (-> seq-info
format-alt-prot-seq
Expand Down
52 changes: 52 additions & 0 deletions test/varity/vcf_to_hgvs/protein_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,58 @@
;; 1 [2 3 4] 5 6 7 [8 9 10 11] 12 13 14 15
(is (= (#'prot/exon-sequence "ACGTACGTACGTACG" 1 [[2 4] [8 11]]) "CGTTACG")))

(deftest make-alt-up-exon-seq-test
(let [ref-up-exon-seq "AATGCTTCTAGCTCC"
cds-start 100]
(are [p pos ref alt] (= p (#'prot/make-alt-up-exon-seq ref-up-exon-seq
cds-start
pos
ref
alt))
"AATGCTTCTAGCTCC" 102 "ATGTC" "A"
"ATGCTTCTAGCT" 98 "CCTT" "C")))

(deftest make-alt-down-exon-seq-test
(let [ref-up-exon-seq "CTTATAATAATAA"
cds-end 1000]
(are [p pos ref alt] (= p (#'prot/make-alt-down-exon-seq ref-up-exon-seq
cds-end
pos
ref
alt))
"CTTATAATAATA" 1002 "TTATAA" "T"
"TATAATAAT" 998 "GGCCT" "G")))

(deftest make-stop-codon-adjusted-alt-seq-test
(let [alt-seq "XXXXXX"
upstream-seq "YYYYYY"
downstream-seq "ZZZZZZ"
[cds-start cds-end] [7 12]]
(are [p strand pos ref] (#'prot/make-stop-codon-adjusted-alt-seq alt-seq
upstream-seq
downstream-seq
strand
cds-start
cds-end
pos
ref)
"XXXXXX" :forward 8 "XX"
"XXXXXX" :forward 5 "YY"
"XXXXXX" :forward 5 "YYX"
"XXXXXX" :forward 13 "ZZ"
"XXXXXXZZZZZZ" :forward 12 "XZZ"
"XXXXXX" :reverse 8 "XX"
"XXXXXX" :reverse 5 "YY"
"YYYYYYXXXXXX" :reverse 5 "YYX"
"XXXXXX" :reverse 13 "ZZ"
"XXXXXX" :reverse 12 "XZZ")))

(deftest get-pos-exon-end-tuple-test
(let [exon-ranges [[1 10] [15 20] [25 40]]]
(are [p pos] (= (#'prot/get-pos-exon-end-tuple pos exon-ranges) p)
[15 20] 15
[5 10] 5)))

(def ref-gene-EGFR
{:bin 125
:name "NM_005228"
Expand Down
3 changes: 3 additions & 0 deletions test/varity/vcf_to_hgvs_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,9 @@
"chr17" 43124016 "CCAGATGGGACACTCTAAGATTTTCTGCATAGCATTAATGACATTTTGTACTTCTTCAACGCGAAGAGCAGATAAATCCATTTCTTTCTGTTCCAATGAA" "C"
'("p.M1Sfs*13")

;; frame shift with termination codon change
"chr8" 116847497 "TCCTTATATAATATGGAACCTTGGTCCAGGTGTTGCGATGATGTCACTGTA" "T" '("p.Y617Sfs*2")

;; Extension
"chr2" 188974490 "A" "C" '("p.M1Lext-23")
"chr2" 189011772 "T" "C" '("p.*1467Qext*45") ; cf. ClinVar 101338
Expand Down