-
Notifications
You must be signed in to change notification settings - Fork 0
/
fromCCDS.Rmd
87 lines (65 loc) · 3.06 KB
/
fromCCDS.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
---
title: "Create Gene Coordinate Table from CCDS table"
output: pdf_document
---
```{r setup, echo=FALSE, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(readr)
```
## Load data downloaded from FTP site: ftp://ftp.ncbi.nih.gov/pub/CCDS
```{r load, include=FALSE}
setwd('~/Projects/lookup_coordinates/')
CCDS_20180614 <- read_delim("CCDS.20180614.txt", "\t", escape_double = FALSE, trim_ws = TRUE,
col_types = cols(`#chromosome` = 'c',
nc_accession = 'c',
gene = 'c',
gene_id = 'i',
ccds_id = 'c',
ccds_status = 'c',
cds_strand = 'c',
cds_from = 'i',
cds_to = 'i',
cds_locations = 'c',
match_type = 'c'))
public <- filter(CCDS_20180614,ccds_status == 'Public')
```
## Check if the first entry is the longest
You can also embed plots, for example:
```{r check}
check_largest <- function(x){if(x[1]==max(x)) {return(1)} else {return(0)}}
check_res <- group_by(public,gene) %>%
summarise(check = check_largest(cds_to))
length(which(check_res$check != 1)) ## not always first
#uniq <- group_by(public,gene) %>% summarise_all(funs(first))
```
```{r chromosome duplicate}
uniq <- group_by(public,gene) %>%
summarise(cds_start = min(cds_from),cds_end = max(cds_to),
`#chromosome`=paste(unique(`#chromosome`),collapse='_'))
unique(uniq$`#chromosome`) ## there is "X_Y"
filter(uniq,`#chromosome`=='X_Y')
XYgenes <- filter(uniq,`#chromosome`=='X_Y') $gene
```
## Apparently, some genes are only on X, but entry is wrong; and some on both X and Y.
```{r non-uniform annotation for the 18 genes}
## apparently some genes are only on X, but entry is wrong
filter(public,gene=='ZBED1')
## and some on both X and Y.
filter(public,gene=='SPRY3')
```
## For 3 genes, need to create entry on both X and Y. Therefore, group by both gene and chromosome and summarize.
```{r some have annotations on both X and Y}
uniq <- group_by(public,gene,`#chromosome`) %>%
summarise(cds_start = min(cds_from),cds_end = max(cds_to))
```
## Manually remove the genes which are using X coordinates but have chromosome Y entry. Then write table.
```{r removal and write table}
retain <- c('IL9R','SPRY3','VAMP7')
XYgenes <- XYgenes [!XYgenes %in% retain]
uniq <- filter(uniq, !(gene %in% XYgenes & `#chromosome` == 'Y'))
final <- data.frame(uniq, gene_id = public$gene_id[match(uniq$gene, public$gene)])
colnames(final) <- c('gene_symbol','reference_name','cds_start_min','cds_end_max','gene_entrez_id')
final <- final[,c('gene_symbol','gene_entrez_id','reference_name','cds_start_min','cds_end_max')]
#write.table(final,'table.tsv',quote = F, sep='\t',row.names = F)
```