---
title: "xcms tutorial"
author: "Chuyao Wang"
date: "`r Sys.Date()`"
output:
  html_document:
    number_sections: true # automatic section numbering
    toc: yes # table of contents
    toc_depth: 3
    toc_float: true
    smooth_scroll: true
    toc_collapsed: true
    fig_caption: yes
    df_print: kable
knit: (function(inputFile, encoding) {
  rmarkdown::render(inputFile, encoding = encoding, output_dir = "./html") })
---
```{r setup, include=FALSE}
# Default chunk option for the whole document: echo each chunk's source code
# in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(stringr)
library(openxlsx)
```
Load data and clean up colnames.
```{r}
# Read the result workbook, then discard every column whose name contains
# "QC" and, after that, every column whose name contains "methanol".
data <- read.xlsx("./data/cellstudy20221202_CD33_result.xlsx")
data <- data[, !str_detect(colnames(data), "QC")]
data <- data[, !str_detect(colnames(data), "methanol")]

# Clean the column names step by step. Each str_remove() drops only the first
# match of its pattern in a name; the final step turns the first ":" into "_".
# NOTE(review): the dots in ".raw." and "._" are unescaped regex wildcards, so
# they match any character, not just a literal "." -- confirm this is intended.
col_names <- colnames(data)
col_names <- str_remove(col_names, "768O")
col_names <- str_remove(col_names, "_Waters_Zhu_ddMS2_[:digit:]+_[:digit:]+")
col_names <- str_remove(col_names, ".raw.")
col_names <- str_remove(col_names, "\\(F[:digit:]+\\)")
col_names <- str_remove(col_names, "._")
col_names <- str_replace(col_names, ":", "_")
colnames(data) <- col_names
```
```{r}
# Get max area
# Split `data` into the non-measurement columns (names containing neither
# "Area" nor "Peak") and the "Area" columns, then collapse every run of `n_rep`
# consecutive "Area" columns into one column holding the row-wise maximum.
n_rep <- 4L # number of consecutive "Area" columns merged into one

is_area <- str_detect(colnames(data), "Area")
is_peak <- str_detect(colnames(data), "Peak")
featureData <- select(data, colnames(data)[!is_area & !is_peak])
areas <- select(data, colnames(data)[is_area])

# Fail early with a clear message instead of silently ignoring trailing
# columns (or indexing past the end) when the column count does not fit.
if (ncol(areas) == 0 || ncol(areas) %% n_rep != 0) {
  stop(
    "Expected a non-zero multiple of ", n_rep,
    " 'Area' columns, found ", ncol(areas), ".",
    call. = FALSE
  )
}
n_groups <- ncol(areas) %/% n_rep

areas.new <- lapply(seq_len(n_groups), function(i) {
  idx <- (i - 1L) * n_rep + seq_len(n_rep)
  # max(..., na.rm = TRUE) returns -Inf (with a warning) for a row in which
  # every value of the group is NA.
  apply(areas[, idx, drop = FALSE], MARGIN = 1, max, na.rm = TRUE)
}) %>%
  data.frame()

# One name per collapsed column: strip the first "_<letters>_<digit>" match
# from each original name and keep the distinct results in order of appearance.
group_names <- colnames(areas) %>%
  str_remove("_[:letter:]+_[:digit:]") %>%
  unique()
if (length(group_names) != n_groups) {
  stop(
    "Derived ", length(group_names), " distinct column names for ",
    n_groups, " collapsed 'Area' columns.",
    call. = FALSE
  )
}
colnames(areas.new) <- group_names

data.new <- cbind(featureData, areas.new)
# Remove duplicate metabolites
# Rows sharing the same Formula are merged: the first occurrence receives the
# column-wise maximum of every "Area" column over the whole group, and the
# later occurrences are dropped. Rows with a missing Formula are never merged.
area_cols <- which(str_detect(colnames(data.new), "Area"))

# duplicated() flags every occurrence after the first one. Tracking the rows to
# drop by position avoids writing a sentinel value into `Name`, which would
# also delete a compound whose real name equals the sentinel.
is_dup <- duplicated(data.new$Formula) & !is.na(data.new$Formula)

for (f in unique(as.character(data.new$Formula[is_dup]))) {
  idx <- which(data.new$Formula %in% f)
  # lapply() keeps one maximum per column as a list, which `[<-` on a data
  # frame assigns column-wise into the single retained row.
  data.new[idx[1], area_cols] <-
    lapply(data.new[idx, area_cols, drop = FALSE], max)
}

data.new <- data.new[!is_dup, , drop = FALSE]
rownames(data.new) <- NULL # renumber rows after dropping the duplicates
# Give NA names an ID
# Rows without a name are labelled "Unknown_1", "Unknown_2", ... in row order.
unnamed_rows <- which(is.na(data.new$Name))
data.new$Name[unnamed_rows] <- paste("Unknown", seq_along(unnamed_rows), sep = "_")
```
```{r}
# Transpose and process
# data.new <- data.new[which(!str_detect(data.new$Name,"Unknown")),] # Removes unknown compounds
# Keep only the "Area" columns and transpose them, so that each former column
# becomes a row and each compound becomes a column named after `Name`.
area_names <- colnames(data.new)[str_detect(colnames(data.new), "Area")]
areas.out <- as.data.frame(t(select(data.new, area_names)))
colnames(areas.out) <- data.new$Name

# The group label is the third piece of each row name when split on "_" into
# at most three pieces; it is stored in `grp` and moved to the first column.
areas.out <- areas.out %>%
  mutate(grp = str_split_fixed(rownames(areas.out), "_", 3)[, 3]) %>%
  select(grp, everything())
```