-
Notifications
You must be signed in to change notification settings - Fork 0
/
lcube.r
62 lines (52 loc) · 2.26 KB
/
lcube.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# script that provides a list of IDs for an LCUBE subsample of sequences
# Input: a CSV file with a unique identifier ID column, date column (YYYY-MM-DD format), and any number of numeric columns; a fasta file with sequences and ids that match the CSV file
# Output: a CSV file for the LCUBE subsample; a fasta file with sequences and ids that match the subsample CSV file
# Load necessary libraries
library(BalancedSampling)
library(phangorn)
library(dplyr)
library(stringr)
# Load command line arguments into variables
args <- commandArgs(trailingOnly = TRUE)
in_csv <- args[1]
id_col <- args[2]
date_col <- args[3]
in_fasta <- args[4]
out_csv <- args[5]
out_fasta <- args[6]
n <- as.integer(args[7])
seed <- as.integer(args[8])
# Read in the CSV and FASTA files
metadata <- read.csv(in_csv, header = TRUE, stringsAsFactors = FALSE)
fasta <- read.dna(in_fasta, as.character=TRUE, format="fasta", as.matrix=TRUE)
fa <- as.phyDat(fasta)
distance <- dist.hamming(fa)
metadata <- metadata[match(names(fa), metadata[,id_col]),]
# Set the seed for reproducibility
set.seed(seed)
# Get the number of sequences in the FASTA file
N <- length(fa)
# get temporal distance matrix and collect lat long vals
temporal<- c()
for (i in 1:nrow(metadata)) {
temporal<- c(temporal, as.numeric(difftime(as.Date(metadata[i, date_col],
tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")),
as.Date("2020-01-01"), unit="days")) / 365)
}
metadata$timediff <- temporal
# bind distance matrices together
matr = as.matrix(distance)
# selected_columns <- sample(ncol(matr), 4) # This chunk is commented out, but
# matr[, selected_columns] # left in case we need to speed things up. I recommend a more useful selection of relevant distance measures.
matr<-cbind(select(metadata, -c(date_col, id_col)), matr)
matr<- scale(matr)
# scream if there is an NA
if(any(is.na(matr))) {
stop("Error: The matrix contains NA values.")
}
# get the LCUBE subsample
p = rep(n/N, N)
s = lcube(p, matr, cbind(p))
# write out the metadata from the subsample and the corresponding FASTA sequences
write.csv(metadata[s,], out_csv, row.names = FALSE)
write.dna(fasta[s,], file = str_replace(in_fasta, ".fasta", ".lcube.fasta"), format = "fasta")