Skip to content

Commit

Permalink
feat: Measure indexing time separately (#107)
Browse files Browse the repository at this point in the history
* Measure indexing time separately

* Measure time for answer prompt API

* README update on development with SDK, minor PR comment addressed
  • Loading branch information
chandrasekharan-zipstack authored Oct 1, 2024
1 parent 4b44c02 commit c48e21f
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 74 deletions.
44 changes: 44 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,50 @@ Supported commands:
Unstract SDK 0.3.2 uses Llama Index version **0.9.28** as of January 14th, 2024

### Developing with the SDK

Ensure that you have all the required dependencies and pre-commit hooks installed
```shell
pdm install
pre-commit install
```

Once the changes have been made, they can be tested with [Unstract](https://github.com/Zipstack/unstract) in the following ways.

#### With PDM
Specify the SDK as a dependency of a project managed with a tool like `pdm` by adding the following to your `pyproject.toml`

```toml
[tool.pdm.dev-dependencies]
local_copies = [
"-e unstract-adapters @ file:///${UNSTRACT_ADAPTERS_PATH}",
"-e unstract-sdk @ file:///${UNSTRACT_SDK_PATH}",
]
```
Or by running the below command
```shell
pdm add -e /path/to/unstract-sdk --dev
```

#### With pip
- If the project uses `pip`, the SDK can be added as an editable dependency in `requirements.txt`
```
-e /path/to/unstract-sdk
```
NOTE: Building locally might require the below section to be replaced in the `unstract-sdk`'s build system configuration
```
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
```
- Another option is to provide a git URL in `requirements.txt`; this can come in handy while building tool
docker images. Don't forget to run `apt install git` within the `Dockerfile` for this
```shell
unstract-sdk @ git+https://github.com/Zipstack/unstract-sdk@feature-branch
```

- Or try installing a [local PyPI server](https://pypi.org/project/pypiserver/) and upload your package to / install it from this server

### Environment variables required for various LLMs (deprecated)

- Azure OpenAI
Expand Down
159 changes: 89 additions & 70 deletions src/unstract/sdk/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def extract_text(
logger.error(f"Error occured inside function 'process_text': {e}")
return extracted_text

@log_elapsed(operation="INDEXING(might include EXTRACTION)")
@log_elapsed(operation="CHECK_AND_INDEX(overall)")
def index(
self,
tool_id: str,
Expand Down Expand Up @@ -293,82 +293,101 @@ def index(
if not extracted_text:
raise IndexingError("No text available to index")

full_text = [
{
"section": "full",
"text_contents": extracted_text,
}
]

# Check if chunking is required
documents = []
for item in full_text:
text = item["text_contents"]
self.tool.stream_log("Indexing file...")
document = Document(
text=text,
doc_id=doc_id,
metadata={"section": item["section"]},
)
document.id_ = doc_id
documents.append(document)
self.tool.stream_log(f"Number of documents: {len(documents)}")

if doc_id_found:
# Delete the nodes for the doc_id
try:
vector_db.delete(ref_doc_id=doc_id)
self.tool.stream_log(f"Deleted nodes for {doc_id}")
except Exception as e:
self.tool.stream_log(
f"Error deleting nodes for {doc_id}: {e}",
level=LogLevel.ERROR,
)
raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e
self.index_to_vector_db(
vector_db=vector_db,
embedding=embedding,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
doc_id=doc_id,
text_to_idx=extracted_text,
doc_id_found=doc_id_found,
)
return doc_id
finally:
vector_db.close()

@log_elapsed(operation="INDEXING")
def index_to_vector_db(
self,
vector_db: VectorDB,
embedding: Embedding,
chunk_size: int,
chunk_overlap: int,
text_to_idx: str,
doc_id: str,
doc_id_found: bool,
):
self.tool.stream_log("Indexing file...")
full_text = [
{
"section": "full",
"text_contents": text_to_idx,
}
]
# Check if chunking is required
documents = []
for item in full_text:
text = item["text_contents"]
document = Document(
text=text,
doc_id=doc_id,
metadata={"section": item["section"]},
)
document.id_ = doc_id
documents.append(document)
self.tool.stream_log(f"Number of documents: {len(documents)}")

if doc_id_found:
# Delete the nodes for the doc_id
try:
if chunk_size == 0:
parser = SentenceSplitter.from_defaults(
chunk_size=len(documents[0].text) + 10,
chunk_overlap=0,
callback_manager=embedding.get_callback_manager(),
)
nodes = parser.get_nodes_from_documents(
documents, show_progress=True
)
node = nodes[0]
node.embedding = embedding.get_query_embedding(" ")
vector_db.add(doc_id, nodes=[node])
self.tool.stream_log("Added node to vector db")
else:
self.tool.stream_log("Adding nodes to vector db...")
# TODO: Phase 2:
# Post insertion to VDB, use query using doc_id and
# store all the VDB ids to a table against the doc_id
# During deletion for cases where metadata filtering
# does not work, these ids can be used for direct deletion
# This new table will also act like an audit trail for
# all nodes that were added to the VDB by Unstract
# Once this is in place, the overridden implementation
# of prefixing ids with doc_id before adding to VDB
# can be removed
vector_db.index_document(
documents,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
show_progress=True,
)
vector_db.delete(ref_doc_id=doc_id)
self.tool.stream_log(f"Deleted nodes for {doc_id}")
except Exception as e:
self.tool.stream_log(
f"Error adding nodes to vector db: {e}",
f"Error deleting nodes for {doc_id}: {e}",
level=LogLevel.ERROR,
)
raise IndexingError(str(e)) from e
raise SdkError(f"Error deleting nodes for {doc_id}: {e}") from e

self.tool.stream_log("File has been indexed successfully")
return doc_id
finally:
vector_db.close()
try:
if chunk_size == 0:
parser = SentenceSplitter.from_defaults(
chunk_size=len(documents[0].text) + 10,
chunk_overlap=0,
callback_manager=embedding.get_callback_manager(),
)
nodes = parser.get_nodes_from_documents(documents, show_progress=True)
node = nodes[0]
node.embedding = embedding.get_query_embedding(" ")
vector_db.add(doc_id, nodes=[node])
self.tool.stream_log("Added node to vector db")
else:
self.tool.stream_log("Adding nodes to vector db...")
# TODO: Phase 2:
# Post insertion to VDB, use query using doc_id and
# store all the VDB ids to a table against the doc_id
# During deletion for cases where metadata filtering
# does not work, these ids can be used for direct deletion
# This new table will also act like an audit trail for
# all nodes that were added to the VDB by Unstract
# Once this is in place, the overridden implementation
# of prefixing ids with doc_id before adding to VDB
# can be removed
vector_db.index_document(
documents,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
show_progress=True,
)
except Exception as e:
self.tool.stream_log(
f"Error adding nodes to vector db: {e}",
level=LogLevel.ERROR,
)
raise IndexingError(str(e)) from e

self.tool.stream_log("File has been indexed successfully")
return

def generate_index_key(
self,
Expand Down
7 changes: 3 additions & 4 deletions src/unstract/sdk/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from unstract.sdk.constants import LogLevel, PromptStudioKeys, ToolEnv
from unstract.sdk.helper import SdkHelper
from unstract.sdk.tool.base import BaseTool
from unstract.sdk.utils.common_utils import log_elapsed

logger = logging.getLogger(__name__)

Expand All @@ -33,6 +34,7 @@ def __init__(
if not is_public_call:
self.bearer_token = tool.get_env_or_die(ToolEnv.PLATFORM_API_KEY)

@log_elapsed(operation="ANSWER_PROMPTS")
def answer_prompt(
self, payload: dict[str, Any], params: Optional[dict[str, str]] = None
) -> dict[str, Any]:
Expand Down Expand Up @@ -97,10 +99,7 @@ def _post_call(
response: Response = Response()
try:
response = requests.post(
url=url,
json=payload,
params=params,
headers=headers
url=url, json=payload, params=params, headers=headers
)
response.raise_for_status()
result["status"] = "OK"
Expand Down

0 comments on commit c48e21f

Please sign in to comment.