Skip to content

Commit

Permalink
Clean up pretty LCUBE. Reincorporate a change that keeps going missing because I have multiple editors open.
Browse files Browse the repository at this point in the history
  • Loading branch information
glstott committed May 15, 2024
1 parent 395e322 commit df3a1da
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 24 deletions.
30 changes: 9 additions & 21 deletions lcube.r
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
# script that provides a list of IDs for an LCUBE subsample of sequences
# Input: a CSV file with columns "id", "lat", "lng", and "date"; a fasta file with sequences and ids that match the CSV file
# Output: a CSV file with columns "id", "lat", "lng", and "date" for the LCUBE subsample; a fasta file with sequences and ids that match the subsample CSV file
# Input: a CSV file with a unique identifier ID column, date column (YYYY-MM-DD format), and any number of numeric columns; a fasta file with sequences and ids that match the CSV file
# Output: a CSV file for the LCUBE subsample; a fasta file with sequences and ids that match the subsample CSV file

# Load necessary libraries
library(BalancedSampling)
library(phangorn)
library(dplyr)
# library(geonames)
library(stringr)

# Load command line arguments into variables
Expand All @@ -16,17 +15,9 @@ id_col <- args[2]
date_col <- args[3]
in_fasta <- args[4]
out_csv <- args[5]
n <- as.integer(args[6])
seed <- as.integer(args[7])

# Adding some hard-coded testing comments because as an academic, I can get away with it
# in_csv<-"../test_files/HA_NorthAmerica_202401-20240507.csv"
# id_col <- "id"
# date_col <- "Collection_Date"
# in_fasta <- "../test_files/HA_NorthAmerica_202401-20240507.aligned.fasta"
# out_csv <- "../test_files/HA_NorthAmerica_202401-20240507_lcube.csv"
# n <- 100
# seed <- 12
out_fasta <- args[6]
n <- as.integer(args[7])
seed <- as.integer(args[8])

# Read in the CSV and FASTA files
metadata <- read.csv(in_csv, header = TRUE, stringsAsFactors = FALSE)
Expand All @@ -44,19 +35,17 @@ N <- length(fa)
# get temporal distance matrix and collect lat long vals
temporal<- c()
for (i in 1:nrow(metadata)) {
temporal<- c(temporal, as.numeric(difftime(as.Date(metadata$Collection_Date[i],
temporal<- c(temporal, as.numeric(difftime(as.Date(metadata[i, date_col],
tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")),
as.Date("2020-01-01"), unit="days")) / 365)
}
metadata$timediff <- temporal
# metadata$lat<-as.numeric(metadata$lat)
# metadata$lng<-as.numeric(metadata$lng)

# bind distance matrices together
matr = as.matrix(distance)
# selected_columns <- sample(ncol(matr), 4) # This chunk is commented out, but
# matr[, selected_columns] # left in case we need to speed things up.
matr<-cbind(matr, select(metadata, -c(date_col, id_col)))
# matr[, selected_columns] # left in case we need to speed things up. I recommend a more useful selection of relevant distance measures.
matr<-cbind(select(metadata, -c(date_col, id_col)), matr)
matr<- scale(matr)

# scream if there is an NA
Expand All @@ -67,8 +56,7 @@ if(any(is.na(matr))) {
# get the LCUBE subsample
p = rep(n/N, N)
s = lcube(p, matr, cbind(p))
# print("outputting results...")

# write out the metadata from the subsample and the corresponding FASTA sequences
write.csv(metadata[s,], out_csv, row.names = FALSE)
write.dna(fasta[s,], file = str_replace(in_fasta, ".fasta", ".lcube.fasta"), format = "fasta")
write.dna(fasta[s,], file = str_replace(in_fasta, ".fasta", ".lcube.fasta"), format = "fasta")
7 changes: 4 additions & 3 deletions nextflow/scripts/lcube.r
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ id_col <- args[2]
date_col <- args[3]
in_fasta <- args[4]
out_csv <- args[5]
n <- as.integer(args[6])
seed <- as.integer(args[7])
out_fasta <- args[6]
n <- as.integer(args[7])
seed <- as.integer(args[8])

# Adding some hard-coded testing comments because as an academic, I can get away with it
# in_csv<-"../test_files/HA_NorthAmerica_202401-20240507.csv"
Expand All @@ -44,7 +45,7 @@ N <- length(fa)
# get temporal distance matrix and collect lat long vals
temporal<- c()
for (i in 1:nrow(metadata)) {
temporal<- c(temporal, as.numeric(difftime(as.Date(metadata$Collection_Date[i],
temporal<- c(temporal, as.numeric(difftime(as.Date(metadata[i, date_col],
tryFormats = c("%Y-%m-%d", "%Y-%m", "%Y")),
as.Date("2020-01-01"), unit="days")) / 365)
}
Expand Down

0 comments on commit df3a1da

Please sign in to comment.