From 395e3226933cff958e3f9b589ec15ef6653d24b1 Mon Sep 17 00:00:00 2001
From: Guppy <30294975+glstott@users.noreply.github.com>
Date: Wed, 15 May 2024 10:37:26 -0400
Subject: [PATCH] Duplicate lcube.r in main for preliminary publishing

---
 lcube.r | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 lcube.r

diff --git a/lcube.r b/lcube.r
new file mode 100644
index 0000000..1e5cbed
--- /dev/null
+++ b/lcube.r
@@ -0,0 +1,82 @@
+# Script that produces an LCUBE subsample of sequences, balanced over genetic,
+# spatial, and temporal distance.
+# Input:  a CSV file with an ID column, a date column, and "lat"/"lng" columns;
+#         a FASTA file whose sequence names match the ID column
+# Output: a CSV file with the metadata rows of the LCUBE subsample;
+#         a FASTA file with the corresponding sequences
+# Usage:  Rscript lcube.r <in_csv> <id_col> <date_col> <in_fasta> <out_csv> <n> <seed>
+
+# Load necessary libraries
+library(BalancedSampling)
+library(ape)        # read.dna(), write.dna()
+library(phangorn)   # as.phyDat(), dist.hamming()
+library(dplyr)
+library(stringr)
+
+# Load command line arguments into variables
+args <- commandArgs(trailingOnly = TRUE)
+in_csv <- args[1]
+id_col <- args[2]
+date_col <- args[3]
+in_fasta <- args[4]
+out_csv <- args[5]
+n <- as.integer(args[6])
+seed <- as.integer(args[7])
+
+# Hard-coded test values (because as an academic, I can get away with it);
+# uncomment for interactive runs
+# in_csv <- "../test_files/HA_NorthAmerica_202401-20240507.csv"
+# id_col <- "id"
+# date_col <- "Collection_Date"
+# in_fasta <- "../test_files/HA_NorthAmerica_202401-20240507.aligned.fasta"
+# out_csv <- "../test_files/HA_NorthAmerica_202401-20240507_lcube.csv"
+# n <- 100
+# seed <- 12
+
+# Read the metadata and the alignment, compute pairwise Hamming distances,
+# and reorder the metadata to match the sequence order in the FASTA file
+metadata <- read.csv(in_csv, header = TRUE, stringsAsFactors = FALSE)
+fasta <- read.dna(in_fasta, as.character = TRUE, format = "fasta", as.matrix = TRUE)
+fa <- as.phyDat(fasta)
+distance <- dist.hamming(fa)
+metadata <- metadata[match(names(fa), metadata[, id_col]), ]
+
+# Set the seed for reproducibility
+set.seed(seed)
+
+# Number of sequences in the FASTA file
+N <- length(fa)
+
+# Compute the time offset (in years) of each sequence relative to 2020-01-01
+temporal <- numeric(nrow(metadata))
+for (i in seq_len(nrow(metadata))) {
+  temporal[i] <- as.numeric(difftime(
+    as.Date(metadata[[date_col]][i], tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")),
+    as.Date("2020-01-01"), units = "days")) / 365
+}
+metadata$timediff <- temporal
+# metadata$lat <- as.numeric(metadata$lat)
+# metadata$lng <- as.numeric(metadata$lng)
+
+# Combine the genetic distance matrix with the remaining metadata columns
+# (lat, lng, timediff) and standardise everything to a common scale
+matr <- as.matrix(distance)
+# selected_columns <- sample(ncol(matr), 4)   # This chunk is commented out, but
+# matr <- matr[, selected_columns]            # left in case we need to speed things up.
+matr <- cbind(matr, select(metadata, -all_of(c(date_col, id_col))))
+matr <- scale(matr)
+
+# Scream if there is an NA (unparsed dates, missing coordinates, etc.)
+if (any(is.na(matr))) {
+  stop("Error: The matrix contains NA values.")
+}
+
+# Draw the LCUBE subsample: equal inclusion probabilities summing to n;
+# lcube() returns the row indices of the selected sequences
+p <- rep(n / N, N)
+s <- lcube(p, matr, cbind(p))
+
+# Write out the metadata from the subsample and the corresponding FASTA sequences
+write.csv(metadata[s, ], out_csv, row.names = FALSE)
+write.dna(fasta[s, ], file = str_replace(in_fasta, fixed(".fasta"), ".lcube.fasta"),
+          format = "fasta")
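
Note on the core sampling call (not part of the patch): below is a minimal, self-contained sketch of how BalancedSampling::lcube() is used in lcube.r, run on toy data. It assumes a package version in which lcube() takes (inclusion probabilities, spreading matrix, balancing matrix), which is the argument order the script itself uses; the sizes and variables are illustrative stand-ins, not the script's real inputs.

    # Minimal sketch of doubly balanced sampling with lcube() on toy data
    library(BalancedSampling)

    set.seed(12)
    N <- 50                      # population size (number of sequences)
    n <- 10                      # desired subsample size
    p <- rep(n / N, N)           # equal inclusion probabilities summing to n

    # Spreading variables: stand-ins for genetic distance columns, lat, lng, timediff
    Xspread <- scale(matrix(rnorm(N * 4), ncol = 4))

    # Same argument order as in lcube.r: probabilities, spreading matrix, balancing matrix
    s <- lcube(p, Xspread, cbind(p))

    length(s)    # number of selected units; fixed at n because p is in the balancing matrix
    head(s)      # row indices into the original data, as used for metadata[s, ]

Because the balancing matrix contains the inclusion probabilities themselves, the draw has a fixed sample size while the spreading matrix keeps the selected units well spread in genetic, spatial, and temporal space.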