Skip to content

Commit

Permalink
Merge pull request #230 from amadeus4dev/updatelangchain
Browse files: browse the repository at this point in the history
clean up db upsert
  • Loading branch information
minjikarin authored Jan 25, 2024
2 parents 065015d + 0415063 commit 54fcf81
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 77 deletions.
66 changes: 3 additions & 63 deletions .github/requirements.txt
Original file line number | Diff line number | Diff line change
@@ -1,64 +1,4 @@
aiohttp==3.8.4
aiosignal==1.3.1
anyio==3.6.2
argilla==1.6.0
async-timeout==4.0.2
attrs==22.2.0
backoff==2.2.1
certifi==2022.12.7
charset-normalizer==3.1.0
click==8.1.3
commonmark==0.9.1
dataclasses-json==0.5.7
Deprecated==1.2.13
et-xmlfile==1.1.0
frozenlist==1.3.3
greenlet==2.0.2
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
idna==3.4
joblib==1.2.0
tiktoken==0.3.3
tqdm==4.65.0
langchain==0.0.136
lxml==4.9.2
Markdown==3.4.3
marshmallow==3.19.0
marshmallow-enum==1.5.1
monotonic==1.6
msg-parser==1.2.0
multidict==6.0.4
mypy-extensions==1.0.0
nltk==3.8.1
numpy==1.23.5
olefile==0.46
openapi-schema-pydantic==1.2.4
openpyxl==3.1.2
packaging==23.0
pandas==1.5.3
Pillow==9.5.0
pydantic==1.10.7
Pygments==2.15.0
pypandoc==1.11
python-dateutil==2.8.2
python-docx==0.8.11
python-magic==0.4.27
python-pptx==0.6.21
pytz==2023.3
PyYAML==6.0
regex==2023.3.23
requests==2.28.2
rfc3986==1.5.0
rich==13.0.1
six==1.16.0
sniffio==1.3.0
SQLAlchemy==1.4.47
tenacity==8.2.2
tiktoken==0.3.3
tqdm==4.65.0
typing-inspect==0.8.0
typing_extensions==4.5.0
unstructured==0.5.11
urllib3==1.26.15
wrapt==1.14.1
XlsxWriter==3.0.9
yarl==1.8.2
unstructured==0.5.11
19 changes: 5 additions & 14 deletions .github/update_db.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,32 +26,25 @@
file_path = os.path.join(root, file)
loader = UnstructuredMarkdownLoader(file_path)
docs = loader.load()
logging.info(f'Loaded {len(docs)} document(s) from {file_path}')

# logging.info(f'Loaded {len(docs)} document(s) from {file_path}')
# Append the loaded documents to the all_docs list
all_docs.extend(docs)

logging.info(f'Total documents loaded: {len(all_docs)}')

# Chunking
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
tokens = tokenizer.encode(
text,
disallowed_special=()
)
#.encode() method converts a text string into a list of token integers.
tokens = tokenizer.encode(text,disallowed_special=())
return len(tokens)


text_splitter = RecursiveCharacterTextSplitter(
chunk_size=400,
chunk_overlap=20, # number of tokens overlap between chunks
length_function=tiktoken_len,
separators=['\n\n', '\n', ' ', '']
)
chunks = text_splitter.split_text(all_docs[5].page_content)

m = hashlib.md5() # this will convert URL into unique ID

Expand All @@ -69,7 +62,7 @@ def tiktoken_len(text):

for doc in tqdm(all_docs):
url = doc.metadata['source'].replace(
'./docs/', 'https://amadeus4dev.github.io/developer-guides/').replace('.md', '')
'./docs/', 'https://developers.amadeus.com/self-service/apis-docs/guides/developer-guides/').replace('.md', '')
m.update(url.encode('utf-8'))
uid = m.hexdigest()[:12]
chunks = text_splitter.split_text(doc.page_content)
Expand All @@ -96,8 +89,6 @@ def tiktoken_len(text):
res = s.post(
f'{endpoint_url}/upsert',
headers=headers,
json={
'documents': documents[i:i_end]
}
json={'documents': documents[i:i_end]}
)
logging.info(res.status_code)

0 comments on commit 54fcf81

Please sign in to comment.