Skip to content

Commit

Permalink
Some small adjustments with repo date.
Browse files (browse the repository at this point in the history)
  • Loading branch information
maciekbanas committed Dec 20, 2024
1 parent 30edc03 commit bebe673
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 58 deletions.
4 changes: 2 additions & 2 deletions R/add_metadata.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#' @noRd
add_metadata <- function(result, content) {
add_metadata <- function(result, content, timestamp) {
web_url <- content$repo_url[1]
api_url <- content$api_url[1]
if (grepl("github", api_url)) {
Expand All @@ -10,7 +10,7 @@ add_metadata <- function(result, content) {
result[["metadata"]] <- list(
repo_url = web_url,
files = paste0(content$file_path, collapse = ", "),
timestamp = get_repo_date(api_url)
timestamp = timestamp
)
result
}
Expand Down
14 changes: 9 additions & 5 deletions R/process_content.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
process_content <- function(gitai, content, max_words = 80000) {
process_content <- function(gitai, content, max_words = 80000, verbose) {

words <- strsplit(content, "\\s+")[[1]]
num_words <- length(words)
cli::cli_alert_info("Repo content has {num_words} words")
if (verbose) cli::cli_alert_info("Repo content has {num_words} words")

if (num_words > max_words) {
cli::cli_alert_warning("Repo content is probably too long, triming...")
trimmed_words <- words[1:min(length(words), max_words)]
if (verbose) {
cli::cli_alert_warning("Repo content is probably too long, triming...")
}
trimmed_words <- words[seq_len(min(length(words), max_words))]
content <- paste(trimmed_words, collapse = " ")
cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.")
if (verbose) {
cli::cli_alert_info("Repo content has now {length(trimmed_words)} words.")
}
}

llm_clone <- gitai$llm$clone(deep = TRUE)
Expand Down
35 changes: 18 additions & 17 deletions R/process_repos.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,24 +22,24 @@ process_repos <- function(
verbose = verbose
)
GitStats::get_files_structure(
gitstats_object = gitstats,
gitstats,
pattern = paste0(gitai$files, collapse = "|"),
depth = depth,
verbose = verbose
)
files_content <- GitStats::get_files_content(gitstats, verbose = verbose)

distinct_repos <- files_content |>
dplyr::distinct(repo_name, api_url)
distinct_repos <- files_content |>
dplyr::distinct(repo_name, api_url)

repositories <- distinct_repos$repo_name
api_urls <- distinct_repos$api_url

results <-
purrr::map2(repositories, api_urls, function(repo_name, api_url) {

current_repo_number <- which(repositories == repo_name)

if (verbose) {
cli::cli_alert(paste0(
"Processing repository ",
Expand All @@ -56,25 +56,24 @@ process_repos <- function(
dplyr::pull(file_content) |>
paste(collapse = "\n\n")

if (grepl("github", api_url)) {
api_url <- github_repo(api_url)
} else {
api_url <- gitlab_repo(api_url)
}
repo_timestamp <- get_repo_date(api_url)

if (!is.null(gitai$db)) {
if (verbose) {
cli::cli_alert_info("Checking repo timestamp...")
}
record <- gitai$db$read_record(id = repo_name)

if (NROW(record) > 0) {

record <- record[[1]]
record_timestamp <- as.POSIXct(record$metadata$timestamp, tz = "UTC")

if (grepl("github", api_url)) {
api_url <- github_repo(api_url)
} else {
api_url <- gitlab_repo(api_url)
}

repo_timestamp <- get_repo_date(api_url)


if (repo_timestamp <= record_timestamp) {
if (verbose) {
cli::cli_alert_info("Repo has not been updated. Skipping...")
Expand All @@ -87,11 +86,13 @@ process_repos <- function(
if (verbose) {
cli::cli_alert_info("Processing content with LLM...")
}

result <- process_content(
gitai = gitai,
content = content_to_process
content = content_to_process,
verbose = verbose
) |>
add_metadata(content = filtered_content)
add_metadata(content = filtered_content, timestamp = repo_timestamp)

if (!is.null(gitai$db)) {
if (verbose) {
Expand Down
36 changes: 18 additions & 18 deletions tests/testthat/test-Pinecone.R
Original file line number Diff line number Diff line change
@@ -1,34 +1,34 @@
test_that("getting index metadata", {

db <- Pinecone$new(
namespace = "test_project_id",
namespace = "test_project_id",
index = "gitai"
)

index <- db$get_index_metadata()
index$host |> is.character() |> expect_true()
})

test_that("getting embeddings", {

db <- Pinecone$new(
namespace = "test_project_id",
namespace = "test_project_id",
index = "gitai"
)

test_text <- "Apple is a popular fruit known for its sweetness and crisp texture."
embeddings <- db$.__enclos_env__$private$.get_embeddings(text = test_text)

length(embeddings) |> expect_equal(1024)
})

test_that("writting records", {

db <- Pinecone$new(
namespace = "test_project_id",
namespace = "test_project_id",
index = "gitai"
)

test_texts <- c(
"Apple is a popular fruit known for its sweetness and crisp texture.",
"The tech company Apple is known for its innovative products like the iPhone.",
Expand All @@ -39,40 +39,40 @@ test_that("writting records", {
)

for (i in seq_along(test_texts)) {

result <- db$write_record(
id = paste0("id_", i),
text = test_texts[i]
)
)

result$upsertedCount |> expect_equal(1)
}
})

test_that("finding records", {

Sys.sleep(3)

db <- Pinecone$new(
namespace = "test_project_id",
index = "gitai"
)

result <- db$find_records(
query = "Tell me about Apple Tech computer company.",
query = "Tell me about Apple Tech computer company.",
top_k = 1
)

length(result) |> expect_equal(1)
result[[1]]$id |> expect_equal("id_2")
result[[1]]$metadata$text |> is.character() |> expect_true()
result[[1]]$score |> is.numeric() |> expect_true()

result_2 <- db$find_records(
query = "Tell me about apple fruit.",
query = "Tell me about apple fruit.",
top_k = 1
)

expect_false(result_2[[1]]$id == result[[1]]$id)
})

Expand All @@ -85,7 +85,7 @@ test_that("reading records", {

result <- db$read_record(id = "id_1")

result[[1]]$metadata$text |>
result[[1]]$metadata$text |>
is.character() |>
expect_true()
})
21 changes: 9 additions & 12 deletions tests/testthat/test-add_metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,13 @@ test_that("metadata is added to content", {
repo_url = c("test_URL", "test_URL"),
api_url = c("test_URL", "test_URL")
)
testthat::with_mocked_bindings({
result_with_metadata <- "result" |>
test_mocker$use() |>
add_metadata(
content = mocked_files_content
)
expect_true("metadata" %in% names(result_with_metadata))
expect_type(result_with_metadata[["metadata"]], "list")
expect_equal(names(result_with_metadata[["metadata"]]), c("repo_url", "files", "timestamp"))
},
get_repo_date = function(api_url) Sys.time()
)
result_with_metadata <- "result" |>
test_mocker$use() |>
add_metadata(
content = mocked_files_content,
timestamp = Sys.Date()
)
expect_true("metadata" %in% names(result_with_metadata))
expect_type(result_with_metadata[["metadata"]], "list")
expect_equal(names(result_with_metadata[["metadata"]]), c("repo_url", "files", "timestamp"))
})
9 changes: 5 additions & 4 deletions tests/testthat/test-process_content.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ test_that("processing content have proper output structure", {
set_llm() |>
set_prompt(system_prompt = "Say 'Hi there!' only and nothing else.")

result <- process_content(gitai = my_project, content = "")
result <- process_content(gitai = my_project, content = "", verbose = FALSE)
expect_equal(result$text, "Hi there!")
expect_true(is.numeric(result$tokens))
expect_true(is.list(result$output))
Expand All @@ -26,17 +26,18 @@ test_that("processing a single file content with deterministic output", {
httr2::with_verbosity(verbosity = 0, {
result <- process_content(
gitai = my_project,
content = test_content
content = test_content,
verbose = FALSE
)
})
expect_length(gregexpr("\\.", result$text)[[1]], 1)
expect_equal(
result$text,
process_content(gitai = my_project, content = test_content)$text
process_content(gitai = my_project, content = test_content, verbose = FALSE)$text
)
expect_equal(
result$text,
process_content(gitai = my_project, content = test_content)$text
process_content(gitai = my_project, content = test_content, verbose = FALSE)$text
)

test_mocker$cache(result)
Expand Down

0 comments on commit bebe673

Please sign in to comment.