Skip to content

Commit

Permalink
Duplicate lcube.r in main for preliminary publishing
Browse files Browse the repository at this point in the history
  • Loading branch information
glstott committed May 15, 2024
1 parent 11b5456 commit 395e322
Showing 1 changed file with 74 additions and 0 deletions.
74 changes: 74 additions & 0 deletions lcube.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# script that provides a list of IDs for an LCUBE subsample of sequences
# Input: a CSV file with columns "id", "lat", "lng", and "date"; a fasta file with sequences and ids that match the CSV file
# Output: a CSV file with columns "id", "lat", "lng", and "date" for the LCUBE subsample; a fasta file with sequences and ids that match the subsample CSV file

# Load necessary libraries
library(BalancedSampling)
library(phangorn)
library(dplyr)
# library(geonames)
library(stringr)

# Load command line arguments into variables
args <- commandArgs(trailingOnly = TRUE)
in_csv <- args[1]
id_col <- args[2]
date_col <- args[3]
in_fasta <- args[4]
out_csv <- args[5]
n <- as.integer(args[6])
seed <- as.integer(args[7])

# Adding some hard-coded testing comments because as an academic, I can get away with it
# in_csv<-"../test_files/HA_NorthAmerica_202401-20240507.csv"
# id_col <- "id"
# date_col <- "Collection_Date"
# in_fasta <- "../test_files/HA_NorthAmerica_202401-20240507.aligned.fasta"
# out_csv <- "../test_files/HA_NorthAmerica_202401-20240507_lcube.csv"
# n <- 100
# seed <- 12

# Read in the CSV and FASTA files
metadata <- read.csv(in_csv, header = TRUE, stringsAsFactors = FALSE)
fasta <- read.dna(in_fasta, as.character=TRUE, format="fasta", as.matrix=TRUE)
fa <- as.phyDat(fasta)
distance <- dist.hamming(fa)
metadata <- metadata[match(names(fa), metadata[,id_col]),]

# Set the seed for reproducibility
set.seed(seed)

# Get the number of sequences in the FASTA file
N <- length(fa)

# get temporal distance matrix and collect lat long vals
temporal<- c()
for (i in 1:nrow(metadata)) {
temporal<- c(temporal, as.numeric(difftime(as.Date(metadata$Collection_Date[i],
tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")),
as.Date("2020-01-01"), unit="days")) / 365)
}
metadata$timediff <- temporal
# metadata$lat<-as.numeric(metadata$lat)
# metadata$lng<-as.numeric(metadata$lng)

# bind distance matrices together
matr = as.matrix(distance)
# selected_columns <- sample(ncol(matr), 4) # This chunk is commented out, but
# matr[, selected_columns] # left in case we need to speed things up.
matr<-cbind(matr, select(metadata, -c(date_col, id_col)))
matr<- scale(matr)

# scream if there is an NA
if(any(is.na(matr))) {
stop("Error: The matrix contains NA values.")
}

# get the LCUBE subsample
p = rep(n/N, N)
s = lcube(p, matr, cbind(p))
# print("outputting results...")

# write out the metadata from the subsample and the corresponding FASTA sequences
write.csv(metadata[s,], out_csv, row.names = FALSE)
write.dna(fasta[s,], file = str_replace(in_fasta, ".fasta", ".lcube.fasta"), format = "fasta")

0 comments on commit 395e322

Please sign in to comment.