Skip to content

Commit

Permalink
Fix build_index (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
svilupp authored May 27, 2024
1 parent 7ecb0d9 commit 191f28c
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 10 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ concurrency:
# Cancel intermediate builds: only if it is a pull request build.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
OPENAI_API_KEY: "invalid-key-just-for-testing"
jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed

## [0.1.1]

### Fixed
- Fixed a bug in `build_index` where imports were missing and keywords were not passed properly in all scenarios.

## [0.1.0]

### Added
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "AIHelpMe"
uuid = "01402e1f-dc83-4213-a98b-42887d758baa"
authors = ["J S <[email protected]> and contributors"]
version = "0.1.0"
version = "0.1.1"

[deps]
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
Expand Down
24 changes: 16 additions & 8 deletions src/preparation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -97,16 +97,20 @@ function RT.build_index(mod::Module; verbose::Int = 1, kwargs...)
## Extract current configuration
chunker_kwargs_ = (; sources = all_sources)
chunker_kwargs = haskey(kwargs, :chunker_kwargs) ?
merge(kwargs.chunker_kwargs, chunker_kwargs_) : chunker_kwargs_
merge(kwargs[:chunker_kwargs], chunker_kwargs_) : chunker_kwargs_

embedder_kwargs_ = RT.getpropertynested(
RAG_KWARGS[], [:retriever_kwargs], :embedder_kwargs, nothing)
# Note: force Matrix{Bool} structure for now, switch to Int8-based binary embeddings with the latest PT
embedder_kwargs = haskey(kwargs, :embedder_kwargs) ?
merge(kwargs.embedder_kwargs, embedder_kwargs_) : embedder_kwargs_
merge(
(; return_type = Matrix{Bool}), embedder_kwargs_, kwargs[:embedder_kwargs]) :
merge((; return_type = Matrix{Bool}), embedder_kwargs_)

new_index = RT.build_index(RAG_CONFIG[].indexer, all_docs;
embedder_kwargs, chunker = TextChunker(), chunker_kwargs,
verbose, index_id = nameof(mod), kwargs...)
kwargs...,
embedder_kwargs, chunker = RT.TextChunker(), chunker_kwargs,
verbose, index_id = nameof(mod))
end

"""
Expand All @@ -124,14 +128,18 @@ function RT.build_index(modules::Vector{Module} = Base.Docs.modules; verbose::In
## Extract current configuration
chunker_kwargs_ = (; sources = all_sources)
chunker_kwargs = haskey(kwargs, :chunker_kwargs) ?
merge(kwargs.chunker_kwargs, chunker_kwargs_) : chunker_kwargs_
merge(kwargs[:chunker_kwargs], chunker_kwargs_) : chunker_kwargs_

# Note: force Matrix{Bool} structure for now, switch to Int8-based binary embeddings with the latest PT
embedder_kwargs_ = RT.getpropertynested(
RAG_KWARGS[], [:retriever_kwargs], :embedder_kwargs, nothing)
embedder_kwargs = haskey(kwargs, :embedder_kwargs) ?
merge(kwargs.embedder_kwargs, embedder_kwargs_) : embedder_kwargs_
merge(
(; return_type = Matrix{Bool}), embedder_kwargs_, kwargs[:embedder_kwargs]) :
merge((; return_type = Matrix{Bool}), embedder_kwargs_)

new_index = RT.build_index(RAG_CONFIG[].indexer, all_docs;
embedder_kwargs, chunker = TextChunker(), chunker_kwargs,
verbose, index_id = nameof(mod), kwargs...)
kwargs...,
embedder_kwargs, chunker = RT.TextChunker(), chunker_kwargs,
verbose, index_id = :all_modules)
end
71 changes: 70 additions & 1 deletion test/preparation.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
using AIHelpMe: docextract
using AIHelpMe: docextract, build_index

# create an empty module
module ABC123
Expand All @@ -24,3 +24,72 @@ end
@test length(all_sources) == 2
@test occursin("ABC1234", all_sources[2])
end

@testset "build_index" begin
    # Exercise both `build_index` methods (single module / vector of modules) against a
    # local mock of an OpenAI-compatible API, so no real network calls are made.
    PORT = rand(9000:31000)
    PT.register_model!(; name = "mock-emb", schema = PT.CustomOpenAISchema())
    PT.register_model!(; name = "mock-meta", schema = PT.CustomOpenAISchema())
    PT.register_model!(; name = "mock-gen", schema = PT.CustomOpenAISchema())

    # The handler dispatches on the requested model name and fabricates a response
    # payload for each of the three registered mock models.
    echo_server = HTTP.serve!(PORT; verbose = -1) do req
        content = JSON3.read(req.body)

        if content[:model] == "mock-gen"
            # Generation: echo the last user message back as the reply
            user_msg = last(content[:messages])
            response = Dict(
                :choices => [
                    Dict(:message => user_msg, :finish_reason => "stop")
                ],
                :model => content[:model],
                :usage => Dict(:total_tokens => length(user_msg[:content]),
                    :prompt_tokens => length(user_msg[:content]),
                    :completion_tokens => 0))
        elseif content[:model] == "mock-emb"
            # Embedding: one all-ones vector of length 1536 per input item
            response = Dict(
                :data => [Dict(:embedding => ones(Float32, 1536))
                          for _ in 1:length(content[:input])],
                :usage => Dict(:total_tokens => length(content[:input]),
                    :prompt_tokens => length(content[:input]),
                    :completion_tokens => 0))
        elseif content[:model] == "mock-meta"
            # Metadata/tagging: always answer with a single fixed tag
            user_msg = last(content[:messages])
            response = Dict(
                :choices => [
                    Dict(:finish_reason => "stop",
                    :message => Dict(:tool_calls => [
                        Dict(:function => Dict(:arguments => JSON3.write(MaybeTags([
                        Tag("yes", "category")
                    ]))))]))],
                :model => content[:model],
                :usage => Dict(:total_tokens => length(user_msg[:content]),
                    :prompt_tokens => length(user_msg[:content]),
                    :completion_tokens => 0))
        else
            # Unknown model: `response` would be undefined below, so log the payload
            # and reject the request explicitly instead of hitting an UndefVarError.
            @info content
            return HTTP.Response(400, "Unsupported mock model")
        end
        return HTTP.Response(200, JSON3.write(response))
    end

    # Always shut the server down, even if `build_index` throws, so the port and the
    # listener task do not leak into subsequent test sets.
    try
        # One module
        index = build_index(
            AIHelpMe; verbose = 2, embedder_kwargs = (; model = "mock-emb"),
            tagger_kwargs = (; model = "mock-meta"), api_kwargs = (;
                url = "http://localhost:$(PORT)"))
        # Embeddings are expected to be a Bool matrix with 1024 rows, one column per chunk
        @test index.embeddings == ones(Bool, 1024, length(index.chunks))
        @test all(x -> occursin("AIHelpMe", x), index.sources)
        # Tags and their vocabulary are expected to be unset
        @test index.tags === nothing
        @test index.tags_vocab === nothing

        # Many modules
        index = build_index(
            [AIHelpMe, Test]; verbose = 2, embedder_kwargs = (; model = "mock-emb"),
            tagger_kwargs = (; model = "mock-meta"), api_kwargs = (;
                url = "http://localhost:$(PORT)"))
        @test index.embeddings == ones(Bool, 1024, length(index.chunks))
        @test all(x -> occursin("AIHelpMe", x) || occursin("Test", x), index.sources)
        @test index.tags === nothing
        @test index.tags_vocab === nothing
    finally
        # clean up
        close(echo_server)
    end
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ using PromptingTools
using PromptingTools.Experimental.RAGTools
const PT = PromptingTools
const RT = PromptingTools.Experimental.RAGTools
using PromptingTools: HTTP, JSON3
using HDF5, Serialization
using Test
using Aqua
Expand Down

2 comments on commit 191f28c

@svilupp
Copy link
Owner Author

@svilupp svilupp commented on 191f28c May 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register

Release notes:

Fixed

  • Fixed a bug in build_index where imports were missing and keywords were not passed properly in all scenarios.

Commits

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/107760

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via:

git tag -a v0.1.1 -m "<description of version>" 191f28c6d25447b4be23b691e4416f586b56af4d
git push origin v0.1.1

Please sign in to comment.