-
Notifications
You must be signed in to change notification settings - Fork 279
/
pyproject.toml
154 lines (128 loc) · 5.66 KB
/
pyproject.toml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
[tool.poetry]
name = "llm-engineering"
version = "0.1.0"
description = ""
authors = ["iusztinpaul <[email protected]>"]
license = "MIT"
readme = "README.md"
[tool.poetry.dependencies]
python = "~3.11"
zenml = { version = "0.67.0", extras = ["server"] }
pymongo = "^4.6.2"
click = "^8.0.1"
loguru = "^0.7.2"
rich = "^13.7.1"
numpy = "^1.26.4"
poethepoet = "0.29.0"
datasets = "^3.0.1"
# Digital data ETL
selenium = "^4.21.0"
webdriver-manager = "^4.0.1"
beautifulsoup4 = "^4.12.3"
html2text = "^2024.2.26"
jmespath = "^1.0.1"
chromedriver-autoinstaller = "^0.6.4"
# Feature engineering
qdrant-client = "^1.8.0"
langchain = "^0.2.11"
sentence-transformers = "^3.0.0"
# RAG
langchain-openai = "^0.1.3"
jinja2 = "^3.1.4"
tiktoken = "^0.7.0"
fake-useragent = "^1.5.1"
langchain-community = "^0.2.11"
# Inference
fastapi = ">=0.100,<=0.110"
uvicorn = "^0.30.6"
opik = "^0.2.2"
[tool.poetry.group.dev.dependencies]
ruff = "^0.4.9"
pre-commit = "^3.7.1"
pytest = "^8.2.2"
[tool.poetry.group.aws.dependencies]
sagemaker = ">=2.232.2"
s3fs = ">2022.3.0"
aws-profile-manager = "^0.7.3"
kubernetes = "^30.1.0"
sagemaker-huggingface-inference-toolkit = "^2.4.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
# ----------------------------------
# --- Poe the Poet Configuration ---
# ----------------------------------
[tool.poe.tasks]
# Data pipelines
run-digital-data-etl-alex = "echo 'It is not supported anymore.'"
run-digital-data-etl-maxime = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_maxime_labonne.yaml"
run-digital-data-etl-paul = "poetry run python -m tools.run --run-etl --no-cache --etl-config-filename digital_data_etl_paul_iusztin.yaml"
run-digital-data-etl = [
"run-digital-data-etl-maxime",
"run-digital-data-etl-paul",
]
run-feature-engineering-pipeline = "poetry run python -m tools.run --no-cache --run-feature-engineering"
run-generate-instruct-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-instruct-datasets"
run-generate-preference-datasets-pipeline = "poetry run python -m tools.run --no-cache --run-generate-preference-datasets"
run-end-to-end-data-pipeline = "poetry run python -m tools.run --no-cache --run-end-to-end-data"
# Utility pipelines
run-export-artifact-to-json-pipeline = "poetry run python -m tools.run --no-cache --run-export-artifact-to-json"
run-export-data-warehouse-to-json = "poetry run python -m tools.data_warehouse --export-raw-data"
run-import-data-warehouse-from-json = "poetry run python -m tools.data_warehouse --import-raw-data"
# Training pipelines
run-training-pipeline = "poetry run python -m tools.run --no-cache --run-training"
run-evaluation-pipeline = "poetry run python -m tools.run --no-cache --run-evaluation"
# Inference
call-rag-retrieval-module = "poetry run python -m tools.rag"
run-inference-ml-service = "poetry run uvicorn tools.ml_service:app --host 0.0.0.0 --port 8000 --reload"
call-inference-ml-service = "curl -X POST 'http://127.0.0.1:8000/rag' -H 'Content-Type: application/json' -d '{\"query\": \"My name is Paul Iusztin. Could you draft a LinkedIn post discussing RAG systems? I am particularly interested in how RAG works and how it is integrated with vector DBs and LLMs.\"}'"
# Infrastructure
## Local infrastructure
local-docker-infrastructure-up = "docker compose up -d"
local-docker-infrastructure-down = "docker compose stop"
local-zenml-server-down = "poetry run zenml down"
local-infrastructure-up = [
"local-docker-infrastructure-up",
"local-zenml-server-down",
"local-zenml-server-up",
]
local-infrastructure-down = [
"local-docker-infrastructure-down",
"local-zenml-server-down",
]
set-local-stack = "poetry run zenml stack set default"
set-aws-stack = "poetry run zenml stack set aws-stack"
set-asynchronous-runs = "poetry run zenml orchestrator update aws-stack --synchronous=False"
zenml-server-disconnect = "poetry run zenml disconnect"
## Settings
export-settings-to-zenml = "poetry run python -m tools.run --export-settings"
delete-settings-zenml = "poetry run zenml secret delete settings"
## SageMaker
create-sagemaker-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_sagemaker_role"
create-sagemaker-execution-role = "poetry run python -m llm_engineering.infrastructure.aws.roles.create_execution_role"
deploy-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.huggingface.run"
test-sagemaker-endpoint = "poetry run python -m llm_engineering.model.inference.test"
delete-inference-endpoint = "poetry run python -m llm_engineering.infrastructure.aws.deploy.delete_sagemaker_endpoint"
## Docker
build-docker-image = "docker buildx build --platform linux/amd64 -t llmtwin -f Dockerfile ."
run-docker-end-to-end-data-pipeline = "docker run --rm --network host --shm-size=2g --env-file .env llmtwin poetry poe --no-cache --run-end-to-end-data"
bash-docker-container = "docker run --rm -it --network host --env-file .env llmtwin bash"
# QA
lint-check = "poetry run ruff check ."
format-check = "poetry run ruff format --check ."
lint-check-docker = "sh -c 'docker run --rm -i hadolint/hadolint < Dockerfile'"
gitleaks-check = "docker run -v .:/src zricethezav/gitleaks:latest dir /src/llm_engineering"
lint-fix = "poetry run ruff check --fix ."
format-fix = "poetry run ruff format ."
[tool.poe.tasks.local-zenml-server-up]
control.expr = "sys.platform"
[[tool.poe.tasks.local-zenml-server-up.switch]]
case = "darwin"
env = { OBJC_DISABLE_INITIALIZE_FORK_SAFETY = "YES" }
cmd = "poetry run zenml up"
[[tool.poe.tasks.local-zenml-server-up.switch]]
cmd = "poetry run zenml up"
# Tests
[tool.poe.tasks.test]
cmd = "poetry run pytest tests/"
env = { ENV_FILE = ".env.testing" }