Skip to content

Commit

Permalink
Use max-segment-size in tokens in step06+ to be consistent with step05
Browse files Browse the repository at this point in the history
  • Loading branch information
jmartisk committed Feb 4, 2025
1 parent 01f3670 commit 65375b9
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 6 deletions.
2 changes: 1 addition & 1 deletion docs/docs/step-06.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ The `FileSystemDocumentLoader.loadDocumentsRecursively(documents)` method loads
The `EmbeddingStoreIngestor` class is used to ingest the documents into the vector store.
This is the cornerstone of the ingestion process.
Configuring it correctly is crucial to the accuracy of the RAG pattern.
- Here, we use a recursive document splitter with a segment size of 100 and an overlap size of 25 (like we had in the previous step).
+ Here, we use a recursive document splitter with a segment size of 100 tokens and an overlap size of 25 tokens (like we had in the previous step).
!!! important
The splitter, the segment size, and the overlap size are crucial to the accuracy of the RAG pattern.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import java.nio.file.Path;
import java.util.List;

+ import dev.langchain4j.model.embedding.onnx.HuggingFaceTokenizer;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.enterprise.event.Observes;

Expand Down Expand Up @@ -38,7 +39,8 @@ public void ingest(@Observes StartupEvent ev,
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
.embeddingStore(store)
.embeddingModel(embeddingModel)
- .documentSplitter(recursive(100, 25))
+ .documentSplitter(recursive(100, 25,
+ new HuggingFaceTokenizer()))
.build();
ingestor.ingest(list);
Log.info("Documents ingested successfully");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import java.nio.file.Path;
import java.util.List;

+ import dev.langchain4j.model.embedding.onnx.HuggingFaceTokenizer;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.enterprise.event.Observes;

Expand Down Expand Up @@ -38,7 +39,8 @@ public void ingest(@Observes StartupEvent ev,
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
.embeddingStore(store)
.embeddingModel(embeddingModel)
- .documentSplitter(recursive(100, 25))
+ .documentSplitter(recursive(100, 25,
+ new HuggingFaceTokenizer()))
.build();
ingestor.ingest(list);
Log.info("Documents ingested successfully");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import java.nio.file.Path;
import java.util.List;

+ import dev.langchain4j.model.embedding.onnx.HuggingFaceTokenizer;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.enterprise.event.Observes;

Expand Down Expand Up @@ -38,7 +39,8 @@ public void ingest(@Observes StartupEvent ev,
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
.embeddingStore(store)
.embeddingModel(embeddingModel)
- .documentSplitter(recursive(100, 25))
+ .documentSplitter(recursive(100, 25,
+ new HuggingFaceTokenizer()))
.build();
ingestor.ingest(list);
Log.info("Documents ingested successfully");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import java.nio.file.Path;
import java.util.List;

+ import dev.langchain4j.model.embedding.onnx.HuggingFaceTokenizer;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.enterprise.event.Observes;

Expand Down Expand Up @@ -38,7 +39,8 @@ public void ingest(@Observes StartupEvent ev,
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
.embeddingStore(store)
.embeddingModel(embeddingModel)
- .documentSplitter(recursive(100, 25))
+ .documentSplitter(recursive(100, 25,
+ new HuggingFaceTokenizer()))
.build();
ingestor.ingest(list);
Log.info("Documents ingested successfully");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import java.nio.file.Path;
import java.util.List;

+ import dev.langchain4j.model.embedding.onnx.HuggingFaceTokenizer;
import jakarta.enterprise.context.ApplicationScoped;
import jakarta.enterprise.event.Observes;

Expand Down Expand Up @@ -38,7 +39,8 @@ public void ingest(@Observes StartupEvent ev,
EmbeddingStoreIngestor ingestor = EmbeddingStoreIngestor.builder()
.embeddingStore(store)
.embeddingModel(embeddingModel)
- .documentSplitter(recursive(100, 25))
+ .documentSplitter(recursive(100, 25,
+ new HuggingFaceTokenizer()))
.build();
ingestor.ingest(list);
Log.info("Documents ingested successfully");
Expand Down

0 comments on commit 65375b9

Please sign in to comment.