Lat_Clean_Data_with_chloroplasts.Rmd

---
title: "This code is for the analysis of 16S Sequencing Data from a latitudinal study along the Eastern Coast of Australia"
author: "Nathan Williams"
date: "22/10/2020"
---

```{r Libraries}
#0.2 Load dependencies
library("gridExtra")
library("vegan")
library("metacoder")
library("taxa")
library("phyloseq")
library("ggplot2")
library("dplyr")
library("readr")
library("stringr")
library("agricolae")
library("ape")
library("tidyverse")
library("tidyr")
library("ggpubr")
library("metagMisc")
library("microbiome")
```

```{r Aesthetics}
# Shared plot styling: a list of ggplot2 theme() layers that can be added to a
# plot with `+`. The commented-out entry is an optional x-axis ordering.
NathanTheme <- list(
  # scale_x_discrete(limits=(locations)),
  # Axis titles: centred, size 16
  theme(axis.title.x = element_text(size = 16, hjust = 0.5, vjust = 0)),
  theme(axis.title.y = element_text(size = 16, hjust = 0.5, vjust = 1)),
  # Tick labels: generic size first, then per-axis overrides
  theme(axis.text = element_text(size = 10)),
  theme(axis.text.x = element_text(size = 14, angle = 90, hjust = 1)),
  theme(axis.text.y = element_text(size = 14, angle = 0, hjust = 1)),
  # Blank panel (no border, grid or background) with black axis lines
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.background = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )
)


```

```{r Load and prepare files}
# 0.1 Work from the folder that holds the 16S input tables
setwd("~/Dropbox/UTS/PhD/Projects/Chapter1_Vibrio_latitude_study/Data/16S/Data")

# 1.1 Read the ASV table and the sample metadata
asv_data <- read_csv("ASV_table.csv")
sample_data <- read_csv("SMD.csv")

# 1.2 Keep ASV / sample / count (renaming the last two on the way), then
#     spread to one column per SampleID, filling absent combinations with 0
asv_data <- asv_data %>%
  select(ASV, SampleID = code2, Abundance = abund_chim1) %>%
  spread(key = "SampleID", value = "Abundance", fill = 0)

# Remove every "p1;" occurrence from the column names
names(asv_data) <- gsub("p1;", "", names(asv_data))
```

```{r Load in the taxa data}
# Read the three taxonomy tables from the 16S data folder
setwd("~/Dropbox/UTS/PhD/Projects/Chapter1_Vibrio_latitude_study/Data/16S/Data")
tax.silva <- read_csv("Tax_silva.csv")
tax.species <- read_csv("Tax_species.csv")
Chloroplast_tax_data <- read_csv("PR2tax.csv")

# Attach the species-level table to the SILVA table, matching on ASV
tax_data <- left_join(tax.silva, tax.species, by = "ASV")
```

```{r Set your threshold and remove your bootstrap of 20 and replace it with 49, this is the first step towards cleaning your data}
# Bootstrap cut-off: an assignment whose boot.* value is <= mythreshold is
# treated as missing.
mythreshold <- 49

tax_data <- tax_data %>%
  # Step 1: blank out low-confidence assignments, rank by rank
  mutate(
    tax.Kingdom = ifelse(boot.Kingdom <= mythreshold, NA, tax.Kingdom),
    tax.Phylum = ifelse(boot.Phylum <= mythreshold, NA, tax.Phylum),
    tax.Class = ifelse(boot.Class <= mythreshold, NA, tax.Class),
    tax.Order = ifelse(boot.Order <= mythreshold, NA, tax.Order),
    tax.Family = ifelse(boot.Family <= mythreshold, NA, tax.Family),
    tax.Genus = ifelse(boot.Genus <= mythreshold, NA, tax.Genus)
  ) %>%
  # Step 2: turn NA into an explicit, rank-specific "unassigned" factor level
  # NOTE(review): Genus and Species are separate columns from tax.Genus, so the
  # genus-level bootstrap filter above does not affect them - confirm intended.
  mutate(
    tax.Kingdom = forcats::fct_explicit_na(tax.Kingdom, "k_unassigned"),
    tax.Phylum = forcats::fct_explicit_na(tax.Phylum, "p_unassigned"),
    tax.Class = forcats::fct_explicit_na(tax.Class, "c_unassigned"),
    tax.Order = forcats::fct_explicit_na(tax.Order, "o_unassigned"),
    tax.Family = forcats::fct_explicit_na(tax.Family, "f_unassigned"),
    Genus = forcats::fct_explicit_na(Genus, "g_unassigned"),
    Species = forcats::fct_explicit_na(Species, "sp.")
  )
```

```{r do the same for chloroplast data}
# Apply the same bootstrap cut-off (mythreshold) to the PR2 taxonomy table and
# label missing assignments explicitly.
Chloroplast_tax_data <- Chloroplast_tax_data %>%
  # Step 1: blank out assignments whose bootstrap support is <= mythreshold
  mutate(
    tax.Kingdom = ifelse(boot.Kingdom <= mythreshold, NA, tax.Kingdom),
    tax.Phylum = ifelse(boot.Phylum <= mythreshold, NA, tax.Phylum),
    tax.Class = ifelse(boot.Class <= mythreshold, NA, tax.Class),
    tax.Order = ifelse(boot.Order <= mythreshold, NA, tax.Order),
    tax.Family = ifelse(boot.Family <= mythreshold, NA, tax.Family),
    tax.Genus = ifelse(boot.Genus <= mythreshold, NA, tax.Genus)
  ) %>%
  # Step 2: replace NA with an explicit, rank-specific "unassigned" level.
  # Genus and Species are labelled in place (tax.Genus / tax.Species) because
  # those are the columns selected and renamed further down; writing the labels
  # to new Genus / Species columns left the retained columns with raw NAs.
  mutate(
    tax.Kingdom = forcats::fct_explicit_na(tax.Kingdom, "k_unassigned"),
    tax.Phylum = forcats::fct_explicit_na(tax.Phylum, "p_unassigned"),
    tax.Class = forcats::fct_explicit_na(tax.Class, "c_unassigned"),
    tax.Order = forcats::fct_explicit_na(tax.Order, "o_unassigned"),
    tax.Family = forcats::fct_explicit_na(tax.Family, "f_unassigned"),
    tax.Genus = forcats::fct_explicit_na(tax.Genus, "g_unassigned"),
    tax.Species = forcats::fct_explicit_na(tax.Species, "sp.")
  )
```

```{r Select the columns you want and ditch the rest}
# Keep only the ASV id and the taxonomic ranks in the SILVA-based table
tax_data <- select(
  tax_data,
  ASV, tax.Kingdom, tax.Phylum, tax.Order, tax.Family, tax.Class,
  Genus, Species
)

# Same for the PR2 table; its id column `asv` is renamed to `ASV` on the way
Chloroplast_tax_data <- select(
  Chloroplast_tax_data,
  ASV = asv, tax.Kingdom, tax.Phylum, tax.Order, tax.Family, tax.Class,
  tax.Genus, tax.Species
)
```

```{r Rename columns (I do this to be neat but you don't have to)}
# Drop the "tax." prefix from the rank columns of the SILVA-based table
tax_data <- rename(
  tax_data,
  Kingdom = tax.Kingdom,
  Phylum = tax.Phylum,
  Class = tax.Class,
  Order = tax.Order,
  Family = tax.Family
)

# Same for the PR2 table, which also carries prefixed Genus / Species columns
Chloroplast_tax_data <- rename(
  Chloroplast_tax_data,
  Kingdom = tax.Kingdom,
  Phylum = tax.Phylum,
  Class = tax.Class,
  Order = tax.Order,
  Family = tax.Family,
  Genus = tax.Genus,
  Species = tax.Species
)
```

```{r Remove Chloroplasts from Silva aka tax_data}
# Drop every row whose Order is "Chloroplast" from the SILVA-based table
tax_data <- filter(tax_data, !(Order %in% "Chloroplast"))
```

```{r rbind the tax_data to Chloroplast_tax_data}
# Append the PR2-based rows to the SILVA-based table (chloroplast rows were
# removed from the latter in the previous chunk).
# NOTE(review): assumes Chloroplast_tax_data holds only ASVs that are no longer
# in tax_data; any overlap duplicates ASV ids and multiplies rows in the later
# join on ASV - confirm against PR2tax.csv.
tax_data <- rbind(tax_data,Chloroplast_tax_data)
```

```{r Left join taxonomy data and ASV data to keep things simple}
# The join key must have the same type on both sides, so force it to character
tax_data <- mutate(tax_data, ASV = as.character(ASV))
asv_data <- mutate(asv_data, ASV = as.character(ASV))

# Attach taxonomy to every ASV row of the wide abundance table
asv_data <- left_join(asv_data, tax_data, by = "ASV")
#print(asv_data)
```

```{r Make a long version of your data}
# Reshape to long format: every sample column becomes a (SampleID, Abundance)
# pair. The id / taxonomy columns are excluded by name rather than by position,
# so the reshape stays correct when the number of sample columns changes.
asv_data_long <- gather(
  data = asv_data,
  key = SampleID,
  value = Abundance,
  -c(ASV, Kingdom, Phylum, Order, Family, Class, Genus, Species)
)
```

```{r Left join your sample_data}
# Attach the sample metadata (joined on whatever columns the two tables share),
# then swap the original SampleID for the metadata column `code`
asv_data_long <- asv_data_long %>%
  left_join(sample_data) %>%
  select(!SampleID) %>%
  rename(SampleID = code)
```

```{r Remove unwanted ASVs - we will then store them in "Discarded_ASVs" in the case you want them later on.For this step we are going to rename our asv_data_long to Remove_1, Remove_2 and so on.. We will then label the final long ASV table as asv_cleaned_long.}
# Row counts per Kingdom before any removal
table(asv_data_long[["Kingdom"]])
```

```{r Remove Archaea... 29602 ASVs total in the dataset (present >1 time ever)}
# Split on Kingdom: everything except Archaea continues, Archaea rows are kept aside
Remove_1 <- filter(asv_data_long, !(Kingdom %in% "Archaea"))
Archaea <- filter(asv_data_long, Kingdom %in% "Archaea")
```

```{r Remove Unassigned ASVs} 
# Split on Kingdom again: rows labelled "k_unassigned" are set aside
Remove_2 <- filter(Remove_1, !(Kingdom %in% "k_unassigned"))
Unassigned_ASVs <- filter(Remove_1, Kingdom %in% "k_unassigned")
```

```{r Remove Mitochondria - this is the last step of removals so rather than labelling "Remove_5" we are going for asv_clean_long}
# Final removal step: drop rows whose Family is "Mitochondria" and keep them
# aside. Both filters test the same column (Family) so the stored table is
# exactly the set of rows removed from asv_cleaned_long.
asv_cleaned_long <- Remove_2 %>% filter(!Family %in% c("Mitochondria"))
Mitochondria <- Remove_2 %>% filter(Family %in% c("Mitochondria"))
```

```{r Make a table to check everything has been removed}
# Row counts per Kingdom after the removals, as a sanity check
table(asv_cleaned_long[["Kingdom"]])
```

```{r Make a dataframe for your stored Discarded ASVs "Discarded_ASVS"}
# Collect every row removed so far into one table. Only the three discard
# tables created in the removal chunks are bound together; the previously
# referenced `Eukaryota` and `Chloroplast` objects are never created in this
# script, so the call could not run.
Discarded_ASVS <- rbind(Archaea, Unassigned_ASVs, Mitochondria)
```

```{r Remove any unwanted samples. In this case I'm removing some weird samples that aren't mine}
# Drop samples L25 to L30
asv_cleaned_long <- filter(
  asv_cleaned_long,
  !(SampleID %in% paste0("L", 25:30))
)
```

```{r Quality filtration - there's a nature paper - basically you are removing anything that has a relative abundance of below 0.005%}
# Sort rows from highest to lowest Abundance
asv_cleaned_long <- arrange(asv_cleaned_long, desc(Abundance))
```

```{r Make your No_Filter table}
# Keep only rows with a positive count
No_Filter <- filter(asv_cleaned_long, Abundance > 0)
```

```{r Calculate sample total column and left join back to No_Filter, this will now be called No_Filter_Total}
# Total count per sample
Sample_Total <- summarise(group_by(No_Filter, SampleID), Total = sum(Abundance))

# Put each sample's total next to every one of its rows
No_Filter_Total <- left_join(No_Filter, Sample_Total, by = "SampleID")
```

```{r We will then calculate the relative abundance and remove the bottom 0.005%. Once this is done we will relabel it asv_cleaned_long as we are done}

# Relative abundance of each row within its sample, as a percentage, sorted
# from highest to lowest
No_Filter_Total <- No_Filter_Total %>%
  mutate(PercT = Abundance / Total * 100) %>%
  arrange(desc(PercT))

# Keep rows above the cut-off; PercT is a percentage, so this is 0.005 %
asv_cleaned_long <- filter(No_Filter_Total, PercT > 0.005)
```

```{r Store your filtered out ASVs in case you want to peek at them later on}
# Rows rejected by the relative-abundance filter. `<=` makes this the exact
# complement of the rows kept with `PercT > 0.005`, so a row sitting exactly on
# the cut-off is stored here instead of vanishing from both tables.
# PercT is a percentage, i.e. the cut-off is 0.005 %.
Filtered_ASVs_0.05 <- No_Filter_Total %>% filter(PercT <= 0.005)
```

```{r Select the columns we want or aka delete those columns we created before. We cannot use that old relative abundance column, we must calculate a new RA based off of our now cleaned data set.} 
# Keep id, taxonomy, count and sample metadata; this drops the helper columns
# (Total, PercT) added for the relative-abundance filter
asv_cleaned_long <- select(
  asv_cleaned_long,
  ASV, Kingdom, Phylum, Order, Family, Class, Genus, Species,
  SampleID, Abundance,
  Location, Koppen, `Lat DD`, Location_Rep, Type
)
```

```{r Now we will name the ASVs}
# Drop zero-count rows and sort by decreasing Abundance; ASV names below are
# numbered in order of first appearance in this sorted table.
asv_cleaned_long_name <- asv_cleaned_long %>% filter(!(Abundance %in% 0))
asv_cleaned_long <- asv_cleaned_long_name %>% arrange(desc(Abundance))
asv_cleaned_by_asv <- asv_cleaned_long %>% group_by(ASV)

# One row per ASV with a short name "B1000000001", "B1000000002", ...
# - built from the ungrouped table so no grouping leaks into later steps
# - seq_len() copes with an empty table (seq(a, b, 1) errors when b < a)
# - integer arithmetic avoids any scientific-notation formatting of the id
ASV_list <- asv_cleaned_long %>% distinct(ASV)
ASV_list$ASV_name <- paste0("B", 1000000000L + seq_len(nrow(ASV_list)))

# Attach the short names and build the combined label columns. Each unite()
# keeps its inputs (remove = FALSE) and joins them with ";".
asv_cleaned_long <- asv_cleaned_long %>%
  left_join(ASV_list, by = "ASV") %>%
  unite("FG", c(Family, Genus), remove = FALSE, sep = ";") %>%
  unite("FGSp", c(FG, Species), remove = FALSE, sep = ";") %>%
  unite("FGID", c(FGSp, ASV_name), remove = FALSE, sep = ";") %>%
  unite("GSID", c(Genus, Species, ASV_name), remove = FALSE, sep = ";") %>%
  unite("FGID.Location", c(FGID, Location), remove = FALSE, sep = ";") %>%
  unite(
    "taxonomy",
    c(Kingdom, Phylum, Class, Order, Family, Genus, Species),
    remove = FALSE, sep = ";"
  )
```

```{r write file}
# Save the cleaned long table into the 16S data folder
setwd("~/Dropbox/UTS/PhD/Projects/Chapter1_Vibrio_latitude_study/Data/16S/Data")
asv_cleaned_long %>% write_csv("asv_cleaned_long_Chloroplasts.csv")
```