From df3a1da82a4baa49493aabd2c372c41d90729312 Mon Sep 17 00:00:00 2001
From: Guppy <30294975+glstott@users.noreply.github.com>
Date: Wed, 15 May 2024 11:07:04 -0400
Subject: [PATCH] Clean up pretty LCUBE. Reincorporate a change that keeps
 going missing because I have multiple editors open.

---
 lcube.r                  | 30 +++++++++---------------------
 nextflow/scripts/lcube.r |  7 ++++---
 2 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/lcube.r b/lcube.r
index 1e5cbed..60148d8 100644
--- a/lcube.r
+++ b/lcube.r
@@ -1,12 +1,11 @@
 # script that provides a list of IDs for an LCUBE subsample of sequences
-# Input: a CSV file with columns "id", "lat", "lng", and "date"; a fasta file with sequences and ids that match the CSV file
-# Output: a CSV file with columns "id", "lat", "lng", and "date" for the LCUBE subsample; a fasta file with sequences and ids that match the subsample CSV file
+# Input: a CSV file with a unique identifier ID column, date column (YYYY-MM-DD format), and any number of numeric columns; a fasta file with sequences and ids that match the CSV file
+# Output: a CSV file for the LCUBE subsample; a fasta file with sequences and ids that match the subsample CSV file
 
 # Load necessary libraries
 library(BalancedSampling)
 library(phangorn)
 library(dplyr)
-# library(geonames)
 library(stringr)
 
 # Load command line arguments into variables
@@ -16,17 +15,9 @@ id_col <- args[2]
 date_col <- args[3]
 in_fasta <- args[4]
 out_csv <- args[5]
-n <- as.integer(args[6])
-seed <- as.integer(args[7])
-
-# Adding some hard-coded testing comments because as an academic, I can get away with it
-# in_csv<-"../test_files/HA_NorthAmerica_202401-20240507.csv"
-# id_col <- "id"
-# date_col <- "Collection_Date"
-# in_fasta <- "../test_files/HA_NorthAmerica_202401-20240507.aligned.fasta"
-# out_csv <- "../test_files/HA_NorthAmerica_202401-20240507_lcube.csv"
-# n <- 100
-# seed <- 12
+out_fasta <- args[6]
+n <- as.integer(args[7])
+seed <- as.integer(args[8])
 
 # Read in the CSV and FASTA files
 metadata <- read.csv(in_csv, header = TRUE, stringsAsFactors = FALSE)
@@ -44,19 +35,17 @@ N <- length(fa)
 # get temporal distance matrix and collect lat long vals
 temporal<- c()
 for (i in 1:nrow(metadata)) {
-  temporal<- c(temporal, as.numeric(difftime(as.Date(metadata$Collection_Date[i], 
+  temporal<- c(temporal, as.numeric(difftime(as.Date(metadata[i, date_col], 
                                                      tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")), 
                                              as.Date("2020-01-01"), unit="days")) / 365)
 }
 metadata$timediff <- temporal
-# metadata$lat<-as.numeric(metadata$lat)
-# metadata$lng<-as.numeric(metadata$lng)
 
 # bind distance matrices together
 matr = as.matrix(distance)
 # selected_columns <- sample(ncol(matr), 4) # This chunk is commented out, but 
-# matr[, selected_columns]                  # left in case we need to speed things up.
-matr<-cbind(matr, select(metadata, -c(date_col, id_col)))
+# matr[, selected_columns]                  # left in case we need to speed things up. I recommend a more useful selection of relevant distance measures.
+matr<-cbind(select(metadata, -c(date_col, id_col)), matr)
 matr<- scale(matr)
 
 # scream if there is an NA
@@ -67,8 +56,7 @@ if(any(is.na(matr))) {
 # get the LCUBE subsample
 p = rep(n/N, N)
 s = lcube(p, matr, cbind(p))
-# print("outputting results...")
 
 # write out the metadata from the subsample and the corresponding FASTA sequences
 write.csv(metadata[s,], out_csv, row.names = FALSE)
-write.dna(fasta[s,], file = str_replace(in_fasta, ".fasta", ".lcube.fasta"), format = "fasta")
+write.dna(fasta[s,], file = str_replace(in_fasta, ".fasta", ".lcube.fasta"), format = "fasta")
\ No newline at end of file
diff --git a/nextflow/scripts/lcube.r b/nextflow/scripts/lcube.r
index 1e5cbed..c739f27 100644
--- a/nextflow/scripts/lcube.r
+++ b/nextflow/scripts/lcube.r
@@ -16,8 +16,9 @@ id_col <- args[2]
 date_col <- args[3]
 in_fasta <- args[4]
 out_csv <- args[5]
-n <- as.integer(args[6])
-seed <- as.integer(args[7])
+out_fasta <- args[6]
+n <- as.integer(args[7])
+seed <- as.integer(args[8])
 
 # Adding some hard-coded testing comments because as an academic, I can get away with it
 # in_csv<-"../test_files/HA_NorthAmerica_202401-20240507.csv"
@@ -44,7 +45,7 @@ N <- length(fa)
 # get temporal distance matrix and collect lat long vals
 temporal<- c()
 for (i in 1:nrow(metadata)) {
-  temporal<- c(temporal, as.numeric(difftime(as.Date(metadata$Collection_Date[i], 
+  temporal<- c(temporal, as.numeric(difftime(as.Date(metadata[i, date_col], 
                                                      tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")), 
                                              as.Date("2020-01-01"), unit="days")) / 365)
 }