diff --git a/.github/requirements.txt b/.github/requirements.txt index d7657d44..cd9d7b7a 100644 --- a/.github/requirements.txt +++ b/.github/requirements.txt @@ -1,64 +1,4 @@ -aiohttp==3.8.4 -aiosignal==1.3.1 -anyio==3.6.2 -argilla==1.6.0 -async-timeout==4.0.2 -attrs==22.2.0 -backoff==2.2.1 -certifi==2022.12.7 -charset-normalizer==3.1.0 -click==8.1.3 -commonmark==0.9.1 -dataclasses-json==0.5.7 -Deprecated==1.2.13 -et-xmlfile==1.1.0 -frozenlist==1.3.3 -greenlet==2.0.2 -h11==0.14.0 -httpcore==0.16.3 -httpx==0.23.3 -idna==3.4 -joblib==1.2.0 +tiktoken==0.3.3 +tqdm==4.65.0 langchain==0.0.136 -lxml==4.9.2 -Markdown==3.4.3 -marshmallow==3.19.0 -marshmallow-enum==1.5.1 -monotonic==1.6 -msg-parser==1.2.0 -multidict==6.0.4 -mypy-extensions==1.0.0 -nltk==3.8.1 -numpy==1.23.5 -olefile==0.46 -openapi-schema-pydantic==1.2.4 -openpyxl==3.1.2 -packaging==23.0 -pandas==1.5.3 -Pillow==9.5.0 -pydantic==1.10.7 -Pygments==2.15.0 -pypandoc==1.11 -python-dateutil==2.8.2 -python-docx==0.8.11 -python-magic==0.4.27 -python-pptx==0.6.21 -pytz==2023.3 -PyYAML==6.0 -regex==2023.3.23 -requests==2.28.2 -rfc3986==1.5.0 -rich==13.0.1 -six==1.16.0 -sniffio==1.3.0 -SQLAlchemy==1.4.47 -tenacity==8.2.2 -tiktoken==0.3.3 -tqdm==4.65.0 -typing-inspect==0.8.0 -typing_extensions==4.5.0 -unstructured==0.5.11 -urllib3==1.26.15 -wrapt==1.14.1 -XlsxWriter==3.0.9 -yarl==1.8.2 \ No newline at end of file +unstructured==0.5.11 \ No newline at end of file diff --git a/.github/update_db.py b/.github/update_db.py index 932f55be..ed457b8b 100644 --- a/.github/update_db.py +++ b/.github/update_db.py @@ -26,32 +26,25 @@ file_path = os.path.join(root, file) loader = UnstructuredMarkdownLoader(file_path) docs = loader.load() - logging.info(f'Loaded {len(docs)} document(s) from {file_path}') - + # logging.info(f'Loaded {len(docs)} document(s) from {file_path}') # Append the loaded documents to the all_docs list all_docs.extend(docs) - logging.info(f'Total documents loaded: {len(all_docs)}') # Chunking tokenizer = tiktoken.get_encoding('cl100k_base') - def tiktoken_len(text): - tokens = tokenizer.encode( - text, - disallowed_special=() - ) + #.encode() method converts a text string into a list of token integers. + tokens = tokenizer.encode(text,disallowed_special=()) return len(tokens) - text_splitter = RecursiveCharacterTextSplitter( chunk_size=400, chunk_overlap=20, # number of tokens overlap between chunks length_function=tiktoken_len, separators=['\n\n', '\n', ' ', ''] ) -chunks = text_splitter.split_text(all_docs[5].page_content) m = hashlib.md5() # this will convert URL into unique ID @@ -69,7 +62,7 @@ def tiktoken_len(text): for doc in tqdm(all_docs): url = doc.metadata['source'].replace( - './docs/', 'https://amadeus4dev.github.io/developer-guides/').replace('.md', '') + './docs/', 'https://developers.amadeus.com/self-service/apis-docs/guides/developer-guides/').replace('.md', '') m.update(url.encode('utf-8')) uid = m.hexdigest()[:12] chunks = text_splitter.split_text(doc.page_content) @@ -96,8 +89,6 @@ def tiktoken_len(text): res = s.post( f'{endpoint_url}/upsert', headers=headers, - json={ - 'documents': documents[i:i_end] - } + json={'documents': documents[i:i_end]} ) logging.info(res.status_code)