Adapt Gaudi to TGI 2.3.0

Signed-off-by: yuanwu <[email protected]>
huggingface · Sep 26, 2024 · bab529c · bab529c
1 parent 14fdc4a
commit bab529c
Show file tree

Hide file tree

Showing 25 changed files with 3,116 additions and 2,348 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -61,6 +61,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
         make \
         curl \
         git \
+        python3.11-dev \
         && rm -rf /var/lib/apt/lists/*
 
 # Install server
@@ -96,5 +97,5 @@ FROM base
 COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh
 RUN chmod +x /tgi-entrypoint.sh
 
-ENTRYPOINT ["/tgi-entrypoint.sh"]
+#ENTRYPOINT ["/tgi-entrypoint.sh"]
 # CMD ["--json-output"]
diff --git a/server/Makefile b/server/Makefile
@@ -4,10 +4,6 @@ include Makefile-vllm
 include Makefile-awq
 include Makefile-eetq
 include Makefile-selective-scan
-include Makefile-lorax-punica
-include Makefile-fbgemm
-include Makefile-exllamav2
-include Makefile-flashinfer
 
 unit-tests:
 	pytest -s -vv -m "not private" tests
@@ -21,25 +17,20 @@ gen-server:
 	find text_generation_server/pb/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
 	touch text_generation_server/pb/__init__.py
 
-install-server: gen-server
+install: gen-server
 	pip install pip --upgrade
-	pip install -r requirements_cuda.txt
-	pip install -e ".[accelerate, quantize, peft, outlines]"
-
-
-install: install-cuda
-	echo "Installed server"
-
-install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm
-	pip install -e ".[bnb]"
-	pip install nvidia-nccl-cu12==2.22.3
-
-install-rocm: install-server install-flash-attention-v2-rocm  install-vllm-rocm
+	pip install -r requirements.txt
+	pip install -e "."
 
 run-dev:
 	SAFETENSORS_FAST_GPU=1 python -m torch.distributed.run --nproc_per_node=2 text_generation_server/cli.py serve bigscience/bloom-560m --sharded
 
+install-poetry: ## Fetch https://install.python-poetry.org with curl and pipe the response to python3
+	curl -sSL https://install.python-poetry.org | python3 -
+
+update-lock: ## Discard poetry.lock (tolerating its absence) and regenerate it with `poetry lock --no-update`
+	rm -f poetry.lock
+	poetry lock --no-update
+
 export-requirements:
-	poetry export -o requirements_cuda.txt --without-hashes
-	poetry export -o requirements_rocm.txt --without-hashes
-	poetry export -o requirements_intel.txt --without-hashes
+	poetry export -o requirements.txt --without-hashes
diff --git a/server/dill-0.3.7-patch.sh b/server/dill-0.3.7-patch.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+git clone -b dill-0.3.7 https://github.com/uqfoundation/dill.git
+pushd dill || exit 1  # no checkout to enter: abort, otherwise the later "pip install ." would run in the caller's directory
+cat <<EOF > dill-0.3.7.patch
+diff --git a/dill/_dill.py b/dill/_dill.py
+index d0cf543..f6eb662 100644
+--- a/dill/_dill.py
++++ b/dill/_dill.py
+@@ -69,7 +69,15 @@ TypeType = type # 'new-style' classes #XXX: unregistered
+ XRangeType = range
+ from types import MappingProxyType as DictProxyType, new_class
+ from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, PickleError, PicklingError, UnpicklingError
+-import __main__ as _main_module
++class _LazyMainModule(object):
++    _module = None
++    @property
++    def module(self):
++        if self._module is None:
++            import __main__ as _m_module
++            self._module = _m_module
++        return self._module
++_main_module = _LazyMainModule()
+ import marshal
+ import gc
+ # import zlib
+@@ -353,7 +361,7 @@ class Pickler(StockPickler):
+         _fmode = kwds.pop('fmode', None)
+         _recurse = kwds.pop('recurse', None)
+         StockPickler.__init__(self, file, *args, **kwds)
+-        self._main = _main_module
++        self._main = _main_module.module
+         self._diff_cache = {}
+         self._byref = settings['byref'] if _byref is None else _byref
+         self._strictio = False #_strictio
+@@ -435,12 +443,12 @@ class Unpickler(StockUnpickler):
+         settings = Pickler.settings
+         _ignore = kwds.pop('ignore', None)
+         StockUnpickler.__init__(self, *args, **kwds)
+-        self._main = _main_module
++        self._main = _main_module.module
+         self._ignore = settings['ignore'] if _ignore is None else _ignore
+ 
+     def load(self): #NOTE: if settings change, need to update attributes
+         obj = StockUnpickler.load(self)
+-        if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
++        if type(obj).__module__ == getattr(self._main, '__name__', '__main__'):
+             if not self._ignore:
+                 # point obj class to main
+                 try: obj.__class__ = getattr(self._main, type(obj).__name__)
+@@ -1194,11 +1202,11 @@ def save_module_dict(pickler, obj):
+         logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj
+         pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8'))
+         logger.trace(pickler, "# D1")
+-    elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__):
++    elif (not is_dill(pickler, child=False)) and (obj == _main_module.module.__dict__):
+         logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj
+         pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8'))  #XXX: works in general?
+         logger.trace(pickler, "# D3")
+-    elif '__name__' in obj and obj != _main_module.__dict__ \\
++    elif '__name__' in obj and obj != _main_module.module.__dict__ \\
+             and type(obj['__name__']) is str \\
+             and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None):
+         logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj
+diff --git a/dill/session.py b/dill/session.py
+index 74234ab..1be8d89 100644
+--- a/dill/session.py
++++ b/dill/session.py
+@@ -233,7 +233,7 @@ def dump_module(
+     protocol = settings['protocol']
+     main = module
+     if main is None:
+-        main = _main_module
++        main = _main_module.module
+     elif isinstance(main, str):
+         main = _import_module(main)
+     if not isinstance(main, ModuleType):
+@@ -501,7 +501,7 @@ def load_module(
+             pass
+     assert loaded is main
+     _restore_modules(unpickler, main)
+-    if main is _main_module or main is module:
++    if main is _main_module.module or main is module:
+         return None
+     else:
+         return main
+
+EOF
+git apply dill-0.3.7.patch || exit 1  # never install an unpatched dill when the patch fails to apply
+python -m pip install . || exit 1  # report install failures instead of masking them behind the cleanup's exit status
+popd
+rm -fr dill
diff --git a/server/dill-0.3.8-patch.sh b/server/dill-0.3.8-patch.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+git clone -b 0.3.8 https://github.com/uqfoundation/dill.git
+pushd dill || exit 1  # no checkout to enter: abort, otherwise the later "pip install ." would run in the caller's directory
+cat <<EOF > dill-0.3.8.patch
+diff --git a/dill/_dill.py b/dill/_dill.py
+index d42432f..1d251e6 100644
+--- a/dill/_dill.py
++++ b/dill/_dill.py
+@@ -69,7 +69,15 @@ TypeType = type # 'new-style' classes #XXX: unregistered
+ XRangeType = range
+ from types import MappingProxyType as DictProxyType, new_class
+ from pickle import DEFAULT_PROTOCOL, HIGHEST_PROTOCOL, PickleError, PicklingError, UnpicklingError
+-import __main__ as _main_module
++class _LazyMainModule(object):
++    _module = None
++    @property
++    def module(self):
++        if self._module is None:
++            import __main__ as _m_module
++            self._module = _m_module
++        return self._module
++_main_module = _LazyMainModule()
+ import marshal
+ import gc
+ # import zlib
+@@ -355,7 +363,7 @@ class Pickler(StockPickler):
+         _fmode = kwds.pop('fmode', None)
+         _recurse = kwds.pop('recurse', None)
+         StockPickler.__init__(self, file, *args, **kwds)
+-        self._main = _main_module
++        self._main = _main_module.module
+         self._diff_cache = {}
+         self._byref = settings['byref'] if _byref is None else _byref
+         self._strictio = False #_strictio
+@@ -437,12 +445,12 @@ class Unpickler(StockUnpickler):
+         settings = Pickler.settings
+         _ignore = kwds.pop('ignore', None)
+         StockUnpickler.__init__(self, *args, **kwds)
+-        self._main = _main_module
++        self._main = _main_module.module
+         self._ignore = settings['ignore'] if _ignore is None else _ignore
+ 
+     def load(self): #NOTE: if settings change, need to update attributes
+         obj = StockUnpickler.load(self)
+-        if type(obj).__module__ == getattr(_main_module, '__name__', '__main__'):
++        if type(obj).__module__ == getattr(self._main, '__name__', '__main__'):
+             if not self._ignore:
+                 # point obj class to main
+                 try: obj.__class__ = getattr(self._main, type(obj).__name__)
+@@ -1199,11 +1207,11 @@ def save_module_dict(pickler, obj):
+         logger.trace(pickler, "D1: %s", _repr_dict(obj)) # obj
+         pickler.write(bytes('c__builtin__\n__main__\n', 'UTF-8'))
+         logger.trace(pickler, "# D1")
+-    elif (not is_dill(pickler, child=False)) and (obj == _main_module.__dict__):
++    elif (not is_dill(pickler, child=False)) and (obj == _main_module.module.__dict__):
+         logger.trace(pickler, "D3: %s", _repr_dict(obj)) # obj
+         pickler.write(bytes('c__main__\n__dict__\n', 'UTF-8'))  #XXX: works in general?
+         logger.trace(pickler, "# D3")
+-    elif '__name__' in obj and obj != _main_module.__dict__ \\
++    elif '__name__' in obj and obj != _main_module.module.__dict__ \\
+             and type(obj['__name__']) is str \\
+             and obj is getattr(_import_module(obj['__name__'],True), '__dict__', None):
+         logger.trace(pickler, "D4: %s", _repr_dict(obj)) # obj
+diff --git a/dill/session.py b/dill/session.py
+index e91068a..a921b43 100644
+--- a/dill/session.py
++++ b/dill/session.py
+@@ -233,7 +233,7 @@ def dump_module(
+     protocol = settings['protocol']
+     main = module
+     if main is None:
+-        main = _main_module
++        main = _main_module.module
+     elif isinstance(main, str):
+         main = _import_module(main)
+     if not isinstance(main, ModuleType):
+@@ -501,7 +501,7 @@ def load_module(
+             pass
+     assert loaded is main
+     _restore_modules(unpickler, main)
+-    if main is _main_module or main is module:
++    if main is _main_module.module or main is module:
+         return None
+     else:
+         return main
+
+EOF
+git apply dill-0.3.8.patch || exit 1  # never install an unpatched dill when the patch fails to apply
+python -m pip install . || exit 1  # report install failures instead of masking them behind the cleanup's exit status
+popd
+rm -fr dill
diff --git a/server/pyproject.toml b/server/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "text-generation-server"
-version = "2.0.5-dev0"
+version = "2.0.4"
 description = "Text Generation Inference Python gRPC Server"
 authors = ["Olivier Dehaene <[email protected]>"]
 
@@ -9,76 +9,34 @@ text-generation-server = 'text_generation_server.cli:app'
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
-protobuf = "^4.25.3"
+protobuf = "^3.20.3"
 grpcio = "^1.51.1"
-grpcio-status = "^1.51.1"
-grpcio-reflection = "^1.51.1"
+grpcio-status = "*"
+grpcio-reflection = "*"
 grpc-interceptor = "^0.15.0"
-typer = "^0.6.1"
-accelerate = { version = "^0.29.1", optional = true }
-bitsandbytes = { version = "^0.43.0", optional = true }
-safetensors = "^0.4"
+typer = "^0.7.0"
 loguru = "^0.6.0"
-opentelemetry-api = "^1.25.0"
-opentelemetry-exporter-otlp = "^1.25.0"
-opentelemetry-instrumentation-grpc = "^0.46b0"
+opentelemetry-api = "^1.15.0"
+opentelemetry-exporter-otlp = "^1.15.0"
+opentelemetry-instrumentation-grpc = "^0.36b0"
 hf-transfer = "^0.1.2"
 sentencepiece = "^0.1.97"
-tokenizers = "^0.19.1"
-huggingface-hub = "^0.23"
-transformers = "^4.43"
-einops = "^0.6.1"
-texttable = { version = "^1.6.7", optional = true }
-datasets = { version = "^2.14.0", optional = true }
-peft = { version = "^0.10", optional = true }
-torch = { version = "^2.4.0", optional = true }
-scipy = "^1.11.1"
-pillow = "^10.0.0"
-outlines= { version = "^0.0.34", optional = true }
+peft = "^0.10"
+optimum-habana = "1.13.2"
+transformers = "4.43.4"
+numpy = "1.26.4"
+accelerate = "0.33.0"
+outlines= { version = "^0.0.36", optional = true }
 prometheus-client = "^0.20.0"
 py-cpuinfo = "^9.0.0"
-# Remove later, temporary workaround for outlines.
-numpy = "^1.26"
-
-marlin-kernels = [
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-  { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
-]
-moe-kernels = [
-  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true },
-  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true },
-  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true },
-  { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.3.1/moe_kernels-0.3.1+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true },
-]
-rich = "^13.7.1"
-
-[tool.poetry.extras]
-torch = ["torch"]
-accelerate = ["accelerate"]
-bnb = ["bitsandbytes"]
-marlin = ["marlin-kernels"]
-moe = ["moe-kernels"]
-peft = ["peft"]
-quantize = ["texttable", "datasets", "accelerate"]
-outlines = ["outlines"]
 
 [tool.poetry.group.dev.dependencies]
-grpcio-tools = "^1.51.1"
+grpcio-tools = "*"
 pytest = "^7.3.0"
 
-
-[[tool.poetry.source]]
-name = "pytorch-gpu-src"
-url = "https://download.pytorch.org/whl/cu121"
-priority = "explicit"
-
 [tool.pytest.ini_options]
 markers = ["private: marks tests as requiring an admin hf token (deselect with '-m \"not private\"')"]
 
 [build-system]
-requires = [
-    "poetry-core>=1.0.0",
-]
+requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"