Allow a list of texts as input to pure free-text queries

stanford-oval · Jun 10, 2024 · e8cd09b
1 parent bae7e09
commit e8cd09b
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 6 deletions.
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 # Package metadata
 name = "suql"
-version = "1.1.7a6"
+version = "1.1.7a7"
 description = "Structured and Unstructured Query Language (SUQL) Python API"
 author = "Shicheng Liu"
 author_email = "[email protected]"

diff --git a/src/suql/free_text_fcns_server.py b/src/suql/free_text_fcns_server.py
@@ -19,7 +19,14 @@
 # engine = "gpt-3.5-turbo-0613"
 
 
-def _answer(source, query, type_prompt = None, k=5, max_input_token=3800, engine="gpt-3.5-turbo-0613"):
+def _answer(
+    source,
+    query,
+    type_prompt = None,
+    k=5,
+    max_input_token=10000,
+    engine="gpt-3.5-turbo-0125"
+):
     from suql.prompt_continuation import llm_generate
     if not source:
         return {"result": "no information"}
@@ -52,9 +59,9 @@ def _answer(source, query, type_prompt = None, k=5, max_input_token=3800, engine
             "type_prompt": type_prompt,
         },
         engine=engine,
-        max_tokens=200,
+        max_tokens=1000,
         temperature=0.0,
-        stop_tokens=["\n"],
+        stop_tokens=[],
         postprocess=False,
     )
     return {"result": continuation}

diff --git a/src/suql/loaders/utils.py b/src/suql/loaders/utils.py
@@ -0,0 +1,23 @@
+import json
+
+def chunk_store_documents(data, output_file): 
+    from llama_index.core.schema import Document
+    data = [Document(text=data)] # llama index expects a list
+
+    from llama_index.embeddings.fastembed import FastEmbedEmbedding
+    embed_model = FastEmbedEmbedding(model_name="BAAI/bge-large-en-v1.5")
+
+    from llama_index.core.node_parser import SemanticSplitterNodeParser
+    splitter = SemanticSplitterNodeParser(
+        embed_model=embed_model
+    )
+    nodes = splitter.get_nodes_from_documents(data)
+
+    chunked_documents = [node.text for node in nodes]
+
+    with open(output_file, "w") as fd:
+        json.dump(
+            chunked_documents, 
+            fd,
+            indent=2
+        )
diff --git a/src/suql/sql_free_text_support/execute_free_text_sql.py b/src/suql/sql_free_text_support/execute_free_text_sql.py
@@ -9,6 +9,7 @@
 from collections import defaultdict
 from copy import deepcopy
 from typing import List, Union
+from functools import lru_cache
 
 import pglast
 import requests
@@ -1656,13 +1657,24 @@ def _parse_standalone_answer(suql):
     else:
         return None
 
+
+@lru_cache(maxsize=16)
+def _read_source_file(filename):
+    try:
+        with open(filename, "r") as fd:
+            content = json.load(fd)
+        return content
+    except json.JSONDecodeError:
+        with open(filename, "r") as fd:
+            return fd.read()
+
+
 def _execute_standalone_answer(suql, source_file_mapping):
     source, query = _parse_standalone_answer(suql)
     if source not in source_file_mapping:
         return None
 
-    with open(source_file_mapping[source], "r") as fd:
-        source_content = fd.read()
+    source_content = _read_source_file(source_file_mapping[source])
 
     return _answer(source_content, query)