From df3a1da82a4baa49493aabd2c372c41d90729312 Mon Sep 17 00:00:00 2001 From: Guppy <30294975+glstott@users.noreply.github.com> Date: Wed, 15 May 2024 11:07:04 -0400 Subject: [PATCH] Clean up pretty LCUBE. Reincorporate a change that keeps going missing because I have multiple editors open. --- lcube.r | 30 +++++++++--------------------- nextflow/scripts/lcube.r | 7 ++++--- 2 files changed, 13 insertions(+), 24 deletions(-) diff --git a/lcube.r b/lcube.r index 1e5cbed..60148d8 100644 --- a/lcube.r +++ b/lcube.r @@ -1,12 +1,11 @@ # script that provides a list of IDs for an LCUBE subsample of sequences -# Input: a CSV file with columns "id", "lat", "lng", and "date"; a fasta file with sequences and ids that match the CSV file -# Output: a CSV file with columns "id", "lat", "lng", and "date" for the LCUBE subsample; a fasta file with sequences and ids that match the subsample CSV file +# Input: a CSV file with a unique identifier ID column, date column (YYYY-MM-DD format), and any number of numeric columns; a fasta file with sequences and ids that match the CSV file +# Output: a CSV file for the LCUBE subsample; a fasta file with sequences and ids that match the subsample CSV file # Load necessary libraries library(BalancedSampling) library(phangorn) library(dplyr) -# library(geonames) library(stringr) # Load command line arguments into variables @@ -16,17 +15,9 @@ id_col <- args[2] date_col <- args[3] in_fasta <- args[4] out_csv <- args[5] -n <- as.integer(args[6]) -seed <- as.integer(args[7]) - -# Adding some hard-coded testing comments because as an academic, I can get away with it -# in_csv<-"../test_files/HA_NorthAmerica_202401-20240507.csv" -# id_col <- "id" -# date_col <- "Collection_Date" -# in_fasta <- "../test_files/HA_NorthAmerica_202401-20240507.aligned.fasta" -# out_csv <- "../test_files/HA_NorthAmerica_202401-20240507_lcube.csv" -# n <- 100 -# seed <- 12 +out_fasta <- args[6] +n <- as.integer(args[7]) +seed <- as.integer(args[8]) # Read in the CSV and FASTA files metadata <- read.csv(in_csv, header = TRUE, stringsAsFactors = FALSE) @@ -44,19 +35,17 @@ N <- length(fa) # get temporal distance matrix and collect lat long vals temporal<- c() for (i in 1:nrow(metadata)) { - temporal<- c(temporal, as.numeric(difftime(as.Date(metadata$Collection_Date[i], + temporal<- c(temporal, as.numeric(difftime(as.Date(metadata[i, date_col], tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")), as.Date("2020-01-01"), unit="days")) / 365) } metadata$timediff <- temporal -# metadata$lat<-as.numeric(metadata$lat) -# metadata$lng<-as.numeric(metadata$lng) # bind distance matrices together matr = as.matrix(distance) # selected_columns <- sample(ncol(matr), 4) # This chunk is commented out, but -# matr[, selected_columns] # left in case we need to speed things up. -matr<-cbind(matr, select(metadata, -c(date_col, id_col))) +# matr[, selected_columns] # left in case we need to speed things up. I recommend a more useful selection of relevant distance measures. +matr<-cbind(select(metadata, -c(date_col, id_col)), matr) matr<- scale(matr) # scream if there is an NA @@ -67,8 +56,7 @@ if(any(is.na(matr))) { # get the LCUBE subsample p = rep(n/N, N) s = lcube(p, matr, cbind(p)) -# print("outputting results...") # write out the metadata from the subsample and the corresponding FASTA sequences write.csv(metadata[s,], out_csv, row.names = FALSE) -write.dna(fasta[s,], file = str_replace(in_fasta, ".fasta", ".lcube.fasta"), format = "fasta") +write.dna(fasta[s,], file = str_replace(in_fasta, ".fasta", ".lcube.fasta"), format = "fasta") \ No newline at end of file diff --git a/nextflow/scripts/lcube.r b/nextflow/scripts/lcube.r index 1e5cbed..c739f27 100644 --- a/nextflow/scripts/lcube.r +++ b/nextflow/scripts/lcube.r @@ -16,8 +16,9 @@ id_col <- args[2] date_col <- args[3] in_fasta <- args[4] out_csv <- args[5] -n <- as.integer(args[6]) -seed <- as.integer(args[7]) +out_fasta <- args[6] +n <- as.integer(args[7]) +seed <- as.integer(args[8]) # Adding some hard-coded testing comments because as an academic, I can get away with it # in_csv<-"../test_files/HA_NorthAmerica_202401-20240507.csv" @@ -44,7 +45,7 @@ N <- length(fa) # get temporal distance matrix and collect lat long vals temporal<- c() for (i in 1:nrow(metadata)) { - temporal<- c(temporal, as.numeric(difftime(as.Date(metadata$Collection_Date[i], + temporal<- c(temporal, as.numeric(difftime(as.Date(metadata[i, date_col], tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")), as.Date("2020-01-01"), unit="days")) / 365) }