This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Merge branch 'main' into wangchang/inc3.x
XuehaoSun authored Jun 14, 2024
2 parents 6b429c0 + 8ab25cf commit ce93c14
Showing 19 changed files with 194 additions and 40 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -11,6 +11,7 @@ Intel® Extension for Transformers
</div>

## 🚀Latest News
* [2024/06] Supported Qwen2; please find the details in this [Blog](https://medium.com/intel-analytics-software/accelerating-qwen2-models-with-intel-extension-for-transformers-99403de82f68).
* [2024/04] Support the launch of **[Meta Llama 3](https://llama.meta.com/llama3/)**, the next generation of Llama models. Check out [Accelerate Meta* Llama 3 with Intel AI Solutions](https://www.intel.com/content/www/us/en/developer/articles/technical/accelerate-meta-llama3-with-intel-ai-solutions.html).
* [2024/04] Demonstrated the chatbot in 4th, 5th, and 6th Gen Xeon Scalable Processors in [**Intel Vision Pat's Keynote**](https://youtu.be/QB7FoIpx8os?t=2280).
* [2024/04] Supported **INT4 inference on Intel Meteor Lake**.
3 changes: 2 additions & 1 deletion docs/publication.md
@@ -1,7 +1,8 @@
Full Publications/Events (50)
==========

## 2024 (10)
## 2024 (11)
* Blog published on Medium: [Accelerating Qwen2 Models with Intel Extension for Transformers](https://medium.com/intel-analytics-software/accelerating-qwen2-models-with-intel-extension-for-transformers-99403de82f68) (June 2024)
* Blog published on Huggingface: [Building Cost-Efficient Enterprise RAG applications with Intel Gaudi 2 and Intel Xeon](https://huggingface.co/blog/cost-efficient-rag-applications-with-intel) (May 2024)
* Blog published on Intel Developer News: [Efficient Natural Language Embedding Models with Intel® Extension for Transformers](https://www.intel.com/content/www/us/en/developer/articles/technical/efficient-natural-language-embedding-models.html) (May 2024)
* Blog published on Intel NewsRoom: [Intel Welcomes Open Platform for Enterprise AI](https://www.intel.com/content/www/us/en/newsroom/news/intel-welcomes-open-platform-enterprise-ai.html#gs.89490l) (April 2024)
5 changes: 5 additions & 0 deletions docs/qbits.md
@@ -74,3 +74,8 @@ If user wants to use QBits, the Pytorch version must meet ITREX requirements, he
| v1.4 | 2.2.0+cpu |
| v1.4.1 | 2.2.0+cpu |
| v1.4.2 | 2.3.0+cpu |

Users can also check whether the current torch version is compatible with QBits by using the `check_torch_compatibility` function provided by QBits.
```python
import torch
from intel_extension_for_transformers import qbits  # assumed import path; adjust to how QBits is exposed in your ITREX install

assert qbits.check_torch_compatibility(str(torch.__version__))
```
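
On a version mismatch the underlying binding raises an error via `TORCH_CHECK` (see the `qbits.cpp` change later in this commit) rather than returning `False`. A minimal sketch that reports the failure, assuming the error surfaces as a Python `RuntimeError` and the same import path as above:
```python
import torch
from intel_extension_for_transformers import qbits  # assumed import path, as above

try:
    qbits.check_torch_compatibility(str(torch.__version__))
    print("QBits was compiled against this Torch version.")
except RuntimeError as err:
    # The error message reports the Torch version QBits was compiled with.
    print(f"Torch/QBits version mismatch: {err}")
```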
24 changes: 24 additions & 0 deletions examples/modelscope/README.md
@@ -0,0 +1,24 @@
# ModelScope with ITREX

Intel® Extension for Transformers (ITREX) supports almost all LLMs available in PyTorch format on ModelScope, such as Phi, Qwen, ChatGLM, Baichuan, Gemma, etc.

## Usage Example

ITREX provides a script that demonstrates how to use ModelScope models. Use `numactl` to improve performance, and run the script with the following command (replace `num_cores` with the number of physical cores to use):
```bash
OMP_NUM_THREADS=num_cores numactl -l -C 0-num_cores-1 python run_modelscope_example.py --model=qwen/Qwen-7B --prompt=你好
```
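
The script itself (added in this commit as `run_modelscope_example.py`) boils down to loading the model through ITREX with `model_hub="modelscope"`; a condensed sketch of what it does:
```python
from modelscope import AutoTokenizer
from transformers import TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "qwen/Qwen-7B"  # ModelScope model id or a local path
prompt = "你好"

# load_in_4bit applies INT4 weight-only quantization; model_hub switches the download source to ModelScope
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, model_hub="modelscope")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```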

## Supported and Validated Models
We have validated the majority of existing models using modelscope==1.13.1:
* [qwen/Qwen-7B](https://www.modelscope.cn/models/qwen/Qwen-7B/summary)
* [ZhipuAI/ChatGLM-6B](https://www.modelscope.cn/models/ZhipuAI/ChatGLM-6B/summary)(transformers=4.33.1)
* [ZhipuAI/chatglm2-6b](https://www.modelscope.cn/models/ZhipuAI/chatglm2-6b/summary)(transformers=4.33.1)
* [ZhipuAI/chatglm3-6b](https://www.modelscope.cn/models/ZhipuAI/chatglm3-6b/summary)(transformers=4.33.1)
* [baichuan-inc/Baichuan2-7B-Chat](https://www.modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat/summary)(transformers=4.33.1)
* [baichuan-inc/Baichuan2-13B-Chat](https://www.modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary)(transformers=4.33.1)
* [LLM-Research/Phi-3-mini-4k-instruct](https://www.modelscope.cn/models/LLM-Research/Phi-3-mini-4k-instruct/summary)
* [LLM-Research/Phi-3-mini-128k-instruct](https://www.modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary)
* [AI-ModelScope/gemma-2b](https://www.modelscope.cn/models/AI-ModelScope/gemma-2b/summary)

If you encounter any problems, please let us know.
13 changes: 13 additions & 0 deletions examples/modelscope/requirements.txt
@@ -0,0 +1,13 @@
intel_extension_for_transformers
neural-speed
lm-eval
sentencepiece
gguf
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.3.0+cpu
transformers
intel_extension_for_pytorch==2.3.0
tiktoken
transformers_stream_generator
zipfile38
modelscope
30 changes: 30 additions & 0 deletions examples/modelscope/run_modelscope_example.py
@@ -0,0 +1,30 @@
from transformers import TextStreamer
from modelscope import AutoTokenizer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM
from typing import List, Optional
import argparse

def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="Model name: String", required=True, default="qwen/Qwen-7B")
parser.add_argument(
"-p",
"--prompt",
type=str,
help="Prompt to start generation with: String (default: empty)",
default="你好,你可以做点什么?",
)
parser.add_argument("--benchmark", action="store_true")
parser.add_argument("--use_neural_speed", action="store_true")
args = parser.parse_args(args_in)
print(args)
model_name = args.model # Modelscope model_id or local model
prompt = args.prompt
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True, model_hub="modelscope")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)

if __name__ == "__main__":
main()
Binary file not shown.
@@ -110,7 +110,35 @@ Open a new linux console, run following command

`curl -vv -X POST http://127.0.0.1:8000/v1/chat/completions`

Check the output. Make sure there is no network connection and proxy setting issue at Client side
Check the output. Make sure there is no network connection or proxy setting issue on the client side.

If there is no network connection issue, you will get a response like the following in the console:
```
curl -vv -X POST http://127.0.0.1:8000/v1/chat/completions
* Uses proxy env variable no_proxy == 'localhost,127.0.0.1'
* Trying 127.0.0.1:8000...
* Connected to 127.0.0.1 (127.0.0.1) port 8000 (#0)
> POST /v1/chat/completions HTTP/1.1
> Host: 127.0.0.1:8000
> User-Agent: curl/7.81.0
> Accept: */*
>
* Mark bundle as not supporting multiuse
< HTTP/1.1 422 Unprocessable Entity
< date: Wed, 12 Jun 2024 01:56:44 GMT
< server: uvicorn
< content-length: 81
< content-type: application/json
<
* Connection #0 to host 127.0.0.1 left intact
{"detail":[{"loc":["body"],"msg":"field required","type":"value_error.missing"}]}
```

On the server side you will see an incoming connection (with invalid input data, hence the 422 response); the output is:
```
INFO: 127.0.0.1:59902 - "POST /v1/chat/completions HTTP/1.1" 422 Unprocessable Entity
```
Otherwise, check your network settings.

### 3.1.2 Test request command at client side

@@ -19,6 +19,7 @@
import pandas as pd
import re, json
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from docx import Document as DDocument
from bs4 import BeautifulSoup
import fitz
@@ -112,6 +113,13 @@ def read_md(md_path):
return text


def read_pptx(pptx_path):
"""Read pptx file."""
loader = UnstructuredPowerPointLoader(pptx_path)
text = loader.load()[0].page_content
return text


def load_json(input, process, max_length, min_length):
"""Load and process json file."""
data = []
@@ -226,6 +234,8 @@ def load_unstructured_data(input):
text = read_txt(input)
elif input.endswith("md"):
text = read_md(input)
elif input.endswith("pptx"):
text = read_pptx(input)

text = text.replace('\n', ' ')
text = text.replace('\n\n', ' ')
@@ -70,7 +70,7 @@ def load(self, input, **kwargs):
def parse_document(self, input):
"""Parse the uploaded file."""
if input.endswith("pdf") or input.endswith("docx") or input.endswith("html") \
or input.endswith("txt") or input.endswith("md"):
or input.endswith("txt") or input.endswith("md") or input.endswith("pptx"):
content = load_unstructured_data(input)
if self.process:
chuck = get_chuck_data(content, self.max_chuck_size, self.min_chuck_size, input)
@@ -110,7 +110,7 @@ def batch_parse_document(self, input):
for dirpath, dirnames, filenames in os.walk(input):
for filename in filenames:
if filename.endswith("pdf") or filename.endswith("docx") or filename.endswith("html") \
or filename.endswith("txt") or filename.endswith("md"):
or filename.endswith("txt") or filename.endswith("md") or filename.endswith("pptx"):
content = load_unstructured_data(os.path.join(dirpath, filename))
if self.process:
chuck = get_chuck_data(content, self.max_chuck_size, self.min_chuck_size, input)
@@ -11,7 +11,7 @@ markdown
openpyxl
PyMuPDF
python-docx
qdrant-client==1.8.2
qdrant-client==1.9.0
rank_bm25
scikit-learn
sentence-transformers==2.3.1
@@ -189,9 +189,6 @@ def __init__(self,
knowledge_base = self.database.build(documents=langchain_documents, embedding=self.embeddings, **kwargs)
self.retriever = RetrieverAdapter(retrieval_type=self.retrieval_type, document_store=knowledge_base, \
**kwargs)
if self.vector_database == "Qdrant" and knowledge_base.is_local():
# one local storage folder cannot be accessed by multiple instances of Qdrant client simultaneously.
knowledge_base.client.close()
elif self.retrieval_type == "child_parent": # Using child-parent store retriever
child_documents = self.splitter.split_documents(langchain_documents)
langchain_documents = document_append_id(langchain_documents)
@@ -206,12 +203,6 @@
sign='child', **kwargs)
self.retriever = RetrieverAdapter(retrieval_type=self.retrieval_type, document_store=knowledge_base, \
child_document_store=child_knowledge_base, **kwargs)
if self.vector_database == "Qdrant" :
# one local storage folder cannot be accessed by multiple instances of Qdrant client simultaneously.
if knowledge_base.is_local():
knowledge_base.client.close()
if child_knowledge_base.is_local():
child_knowledge_base.client.close()
elif self.retrieval_type == "bm25":
self.docs = document_append_id(langchain_documents)
self.retriever = RetrieverAdapter(retrieval_type=self.retrieval_type, docs=self.docs, **kwargs)
@@ -341,4 +332,23 @@ def pre_llm_inference_actions(self, model_name, query):
prompt = generate_qa_prompt(query, context)
else:
logging.error("The selected generation mode is invalid!")

# qdrant local vector db need to be closed
# one local storage folder cannot be accessed by multiple instances of Qdrant client simultaneously.
if self.vector_database == "Qdrant":
to_close = []
if self.retrieval_type == "default":
knowledge_base = self.retriever.retriever.vectorstore
if knowledge_base.is_local():
to_close.append(knowledge_base)
if self.retrieval_type == "child_parent":
knowledge_base = self.retriever.retriever.parentstore
child_knowledge_base = self.retriever.retriever.vectorstore
if knowledge_base.is_local():
to_close.append(knowledge_base)
if child_knowledge_base.is_local():
to_close.append(child_knowledge_base)
for kb in to_close:
kb.client.close()

return prompt, links
@@ -98,6 +98,31 @@ def test_retrieval_docx(self):
self.assertIsNotNone(response)
plugins.retrieval.enable = False

class TestChatbotBuilder_pptx(unittest.TestCase):
def setUp(self):
if os.path.exists("test_pptx"):
shutil.rmtree("test_pptx", ignore_errors=True)
return super().setUp()

def tearDown(self) -> None:
if os.path.exists("test_pptx"):
shutil.rmtree("test_pptx", ignore_errors=True)
return super().tearDown()

def test_retrieval_pptx(self):
plugins.retrieval.enable = True
plugins.retrieval.args["input_path"] = "../assets/docs/sample.pptx"
plugins.retrieval.args["persist_directory"] = "./test_pptx"
plugins.retrieval.args["retrieval_type"] = 'default'
config = PipelineConfig(model_name_or_path="facebook/opt-125m",
plugins=plugins)
chatbot = build_chatbot(config)
response = chatbot.predict("How many cores does the Intel Xeon Platinum 8480+ Processor have in total?")
print(response)
plugins.retrieval.args["persist_directory"] = "./output"
self.assertIsNotNone(response)
plugins.retrieval.enable = False

class TestChatbotBuilder_xlsx(unittest.TestCase):
def setUp(self):
if os.path.exists("test_xlsx"):
@@ -63,7 +63,7 @@ pypinyin
python-docx
python-multipart
pyyaml
qdrant-client==1.8.2
qdrant-client==1.9.0
rank_bm25
resampy==0.3.1
rouge_score
@@ -85,7 +85,7 @@ torchvision==0.18.0
tqdm
transformers==4.38.0
transformers_stream_generator
unstructured
unstructured[all-docs]
urllib3
uvicorn
vector_quantize_pytorch
21 changes: 18 additions & 3 deletions intel_extension_for_transformers/qbits/CMakeLists.txt
@@ -16,16 +16,27 @@ project(qbits_py LANGUAGES C CXX)


set(QBITS_TORCH_PATH "" CACHE STRING "Torch install path")
set(torch_info "")

function(get_torch_info python_command)
set(import_torch "import torch:")
string(REPLACE ":" ";" import_torch ${import_torch})
string(CONCAT fin_command "${import_torch}" "${python_command}")
execute_process(COMMAND python -c "${fin_command}"
OUTPUT_VARIABLE torch_info
OUTPUT_STRIP_TRAILING_WHITESPACE)
set(torch_info "${torch_info}" PARENT_SCOPE)
endfunction()


if(QBITS_TORCH_PATH)
set(torch_path ${QBITS_TORCH_PATH})
unset(TORCH_LIBRARY CACHE) # force find_package torch
unset(c10_LIBRARY CACHE)
unset(TORCH_DIR CACHE)
else()
execute_process(COMMAND python -c "import torch; print(torch.__path__[0])"
OUTPUT_VARIABLE torch_path
OUTPUT_STRIP_TRAILING_WHITESPACE)
get_torch_info("print(torch.__path__[0])")
set(torch_path "${torch_info}")
endif()

find_package(Torch REQUIRED
@@ -48,6 +59,10 @@ add_compile_options(-flto=auto)

# Link against LibTorch
pybind11_add_module(qbits_py ${qbits_src})
get_torch_info("print(torch.__version__)")
set(torch_version "${torch_info}")
target_compile_features(qbits_py PRIVATE cxx_std_14)
set(TORCH_VERSION_MACRO COMPATIBLE_TORCH_VERSION="${torch_version}")
target_compile_definitions(qbits_py PUBLIC ${TORCH_VERSION_MACRO})
target_link_directories(qbits_py PRIVATE ${torch_path}/lib)
target_link_libraries(qbits_py PRIVATE bestla_dispatcher torch_python)
12 changes: 11 additions & 1 deletion intel_extension_for_transformers/qbits/qbits.cpp
@@ -114,7 +114,6 @@ static void woq_linear(const torch::Tensor& activation, const torch::Tensor& wei
torch::Tensor& output, const std::string& compute_type, const std::string& weight_type,
const std::string& scale_type, bool asym) {
woq::woq_config_param p;

torch::Tensor bias_fp32;
torch::Tensor* rt_bias = bias.numel() == 0 ? &output : const_cast<torch::Tensor*>(&bias);
if (bias.scalar_type() != torch::kFloat32 && bias.numel() != 0) {
@@ -180,6 +179,16 @@ static bool check_isa_supported(std::string isa) {
return false;
}

static bool check_torch_compatibility(std::string version) {
static std::string expected_version = COMPATIBLE_TORCH_VERSION;
if (version == expected_version) {
return true;
}
  TORCH_CHECK(false,
              "QBits: detected a Torch version QBits was not compiled with, expected " + expected_version + ", but got " + version);
return false;
}

PYBIND11_MODULE(qbits_py, m) {
m.def("quantize_to_packed_weight", &quantize_to_packed_weight);
m.def("woq_linear", &woq_linear);
@@ -193,4 +202,5 @@ PYBIND11_MODULE(qbits_py, m) {
m.def("dropout_fwd", &qbits_dropout_fwd);
m.def("dropout_bwd", &qbits_dropout_bwd);
m.def("check_isa_supported", &check_isa_supported);
m.def("check_torch_compatibility", &check_torch_compatibility);
}
@@ -41,6 +41,7 @@
@pytest.mark.parametrize("src_dt", ["fp32", "bf16"])
@pytest.mark.parametrize("dst_dt", ["fp32", "bf16"])
def test(m, n, k, blocksize, compute_type, weight_type, scale_type, asym, transpose, add_bias, src_dt, dst_dt, dump_tensor_info=True):
assert qbits.check_torch_compatibility(str(torch.__version__))
if compute_type == "int8" and weight_type == "int8" and (not qbits.check_isa_supported("AVX_VNNI")):
pytest.skip()
if compute_type not in cmpt_configs[weight_type] or scale_type not in scale_configs[weight_type]:
20 changes: 0 additions & 20 deletions intel_extension_for_transformers/qbits/run_build.sh

This file was deleted.

@@ -344,6 +344,7 @@ class _BaseQBitsAutoModelClass:
"whisper",
"qwen2",
"gemma",
"phi3",
"tinyllama",
]

