From bebe6737e7eb7ff3b0edf37b5d8f5bb1ec784dbd Mon Sep 17 00:00:00 2001 From: Maciej Banas Date: Fri, 20 Dec 2024 15:20:41 +0000 Subject: [PATCH] Some small adjustments with repo date. --- R/add_metadata.R | 4 +-- R/process_content.R | 14 +++++++---- R/process_repos.R | 35 +++++++++++++------------- tests/testthat/test-Pinecone.R | 36 +++++++++++++-------------- tests/testthat/test-add_metadata.R | 21 +++++++--------- tests/testthat/test-process_content.R | 9 ++++--- 6 files changed, 61 insertions(+), 58 deletions(-) diff --git a/R/add_metadata.R b/R/add_metadata.R index 7441a47..7557e72 100644 --- a/R/add_metadata.R +++ b/R/add_metadata.R @@ -1,5 +1,5 @@ #' @noRd -add_metadata <- function(result, content) { +add_metadata <- function(result, content, timestamp) { web_url <- content$repo_url[1] api_url <- content$api_url[1] if (grepl("github", api_url)) { @@ -10,7 +10,7 @@ add_metadata <- function(result, content) { result[["metadata"]] <- list( repo_url = web_url, files = paste0(content$file_path, collapse = ", "), - timestamp = get_repo_date(api_url) + timestamp = timestamp ) result } diff --git a/R/process_content.R b/R/process_content.R index 070850e..e396b58 100644 --- a/R/process_content.R +++ b/R/process_content.R @@ -1,14 +1,18 @@ -process_content <- function(gitai, content, max_words = 80000) { +process_content <- function(gitai, content, max_words = 80000, verbose) { words <- strsplit(content, "\\s+")[[1]] num_words <- length(words) - cli::cli_alert_info("Repo content has {num_words} words") + if (verbose) cli::cli_alert_info("Repo content has {num_words} words") if (num_words > max_words) { - cli::cli_alert_warning("Repo content is probably too long, triming...") - trimmed_words <- words[1:min(length(words), max_words)] + if (verbose) { + cli::cli_alert_warning("Repo content is probably too long, triming...") + } + trimmed_words <- words[seq_len(min(length(words), max_words))] content <- paste(trimmed_words, collapse = " ") - cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.") + if (verbose) { + cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.") + } } llm_clone <- gitai$llm$clone(deep = TRUE) diff --git a/R/process_repos.R b/R/process_repos.R index 21fea85..376f54e 100644 --- a/R/process_repos.R +++ b/R/process_repos.R @@ -22,24 +22,24 @@ process_repos <- function( verbose = verbose ) GitStats::get_files_structure( - gitstats_object = gitstats, + gitstats, pattern = paste0(gitai$files, collapse = "|"), depth = depth, verbose = verbose ) files_content <- GitStats::get_files_content(gitstats, verbose = verbose) - distinct_repos <- files_content |> - dplyr::distinct(repo_name, api_url) + distinct_repos <- files_content |> + dplyr::distinct(repo_name, api_url) repositories <- distinct_repos$repo_name api_urls <- distinct_repos$api_url results <- purrr::map2(repositories, api_urls, function(repo_name, api_url) { - + current_repo_number <- which(repositories == repo_name) - + if (verbose) { cli::cli_alert(paste0( "Processing repository ", @@ -56,25 +56,24 @@ process_repos <- function( dplyr::pull(file_content) |> paste(collapse = "\n\n") + if (grepl("github", api_url)) { + api_url <- github_repo(api_url) + } else { + api_url <- gitlab_repo(api_url) + } + repo_timestamp <- get_repo_date(api_url) + if (!is.null(gitai$db)) { if (verbose) { cli::cli_alert_info("Checking repo timestamp...") } record <- gitai$db$read_record(id = repo_name) - + if (NROW(record) > 0) { record <- record[[1]] record_timestamp <- as.POSIXct(record$metadata$timestamp, tz = "UTC") - - if (grepl("github", api_url)) { - api_url <- github_repo(api_url) - } else { - api_url <- gitlab_repo(api_url) - } - - repo_timestamp <- get_repo_date(api_url) - + if (repo_timestamp <= record_timestamp) { if (verbose) { cli::cli_alert_info("Repo has not been updated. Skipping...") @@ -87,11 +86,13 @@ process_repos <- function( if (verbose) { cli::cli_alert_info("Processing content with LLM...") } + result <- process_content( gitai = gitai, - content = content_to_process + content = content_to_process, + verbose = verbose ) |> - add_metadata(content = filtered_content) + add_metadata(content = filtered_content, timestamp = repo_timestamp) if (!is.null(gitai$db)) { if (verbose) { diff --git a/tests/testthat/test-Pinecone.R b/tests/testthat/test-Pinecone.R index df3341d..f18b741 100644 --- a/tests/testthat/test-Pinecone.R +++ b/tests/testthat/test-Pinecone.R @@ -1,10 +1,10 @@ test_that("getting index metadata", { db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + index <- db$get_index_metadata() index$host |> is.character() |> expect_true() }) @@ -12,10 +12,10 @@ test_that("getting index metadata", { test_that("getting embeddings", { db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + test_text <- "Apple is a popular fruit known for its sweetness and crisp texture." embeddings <- db$.__enclos_env__$private$.get_embeddings(text = test_text) @@ -23,12 +23,12 @@ test_that("getting embeddings", { }) test_that("writting records", { - + db <- Pinecone$new( - namespace = "test_project_id", + namespace = "test_project_id", index = "gitai" ) - + test_texts <- c( "Apple is a popular fruit known for its sweetness and crisp texture.", "The tech company Apple is known for its innovative products like the iPhone.", @@ -39,40 +39,40 @@ test_that("writting records", { ) for (i in seq_along(test_texts)) { - + result <- db$write_record( id = paste0("id_", i), text = test_texts[i] - ) + ) result$upsertedCount |> expect_equal(1) } }) test_that("finding records", { - + Sys.sleep(3) - + db <- Pinecone$new( namespace = "test_project_id", index = "gitai" ) - + result <- db$find_records( - query = "Tell me about Apple Tech computer company.", + query = "Tell me about Apple Tech computer company.", top_k = 1 ) - + length(result) |> expect_equal(1) result[[1]]$id |> expect_equal("id_2") result[[1]]$metadata$text |> is.character() |> expect_true() result[[1]]$score |> is.numeric() |> expect_true() - + result_2 <- db$find_records( - query = "Tell me about apple fruit.", + query = "Tell me about apple fruit.", top_k = 1 ) - + expect_false(result_2[[1]]$id == result[[1]]$id) }) @@ -85,7 +85,7 @@ test_that("reading records", { result <- db$read_record(id = "id_1") - result[[1]]$metadata$text |> + result[[1]]$metadata$text |> is.character() |> expect_true() }) diff --git a/tests/testthat/test-add_metadata.R b/tests/testthat/test-add_metadata.R index ea2ea09..35b4541 100644 --- a/tests/testthat/test-add_metadata.R +++ b/tests/testthat/test-add_metadata.R @@ -9,16 +9,13 @@ test_that("metadata is added to content", { repo_url = c("test_URL", "test_URL"), api_url = c("test_URL", "test_URL") ) - testthat::with_mocked_bindings({ - result_with_metadata <- "result" |> - test_mocker$use() |> - add_metadata( - content = mocked_files_content - ) - expect_true("metadata" %in% names(result_with_metadata)) - expect_type(result_with_metadata[["metadata"]], "list") - expect_equal(names(result_with_metadata[["metadata"]]), c("repo_url", "files", "timestamp")) - }, - get_repo_date = function(api_url) Sys.time() - ) + result_with_metadata <- "result" |> + test_mocker$use() |> + add_metadata( + content = mocked_files_content, + timestamp = Sys.Date() + ) + expect_true("metadata" %in% names(result_with_metadata)) + expect_type(result_with_metadata[["metadata"]], "list") + expect_equal(names(result_with_metadata[["metadata"]]), c("repo_url", "files", "timestamp")) }) diff --git a/tests/testthat/test-process_content.R b/tests/testthat/test-process_content.R index a5828f3..a7884ca 100644 --- a/tests/testthat/test-process_content.R +++ b/tests/testthat/test-process_content.R @@ -3,7 +3,7 @@ test_that("processing content have proper output structure", { set_llm() |> set_prompt(system_prompt = "Say 'Hi there!' only and nothing else.") - result <- process_content(gitai = my_project, content = "") + result <- process_content(gitai = my_project, content = "", verbose = FALSE) expect_equal(result$text, "Hi there!") expect_true(is.numeric(result$tokens)) expect_true(is.list(result$output)) @@ -26,17 +26,18 @@ test_that("processing a single file content with deterministic output", { httr2::with_verbosity(verbosity = 0, { result <- process_content( gitai = my_project, - content = test_content + content = test_content, + verbose = FALSE ) }) expect_length(gregexpr("\\.", result$text)[[1]], 1) expect_equal( result$text, - process_content(gitai = my_project, content = test_content)$text + process_content(gitai = my_project, content = test_content, verbose = FALSE)$text ) expect_equal( result$text, - process_content(gitai = my_project, content = test_content)$text + process_content(gitai = my_project, content = test_content, verbose = FALSE)$text ) test_mocker$cache(result)