Commit

fix model export
eaidova committed Oct 7, 2024
1 parent 689a757 commit b543d94
Showing 6 changed files with 67 additions and 67 deletions.
8 changes: 2 additions & 6 deletions notebooks/ipex/text_generation.ipynb
@@ -62,13 +62,9 @@
"source": [
"model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
"input_sentence = [\n",
" \"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"\n",
"]\n",
"input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
"model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",
"generation_kwargs = dict(\n",
" max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True\n",
")\n",
"generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True)\n",
"\n",
"generated_ids = model.generate(**model_inputs, **generation_kwargs)\n",
"output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
4 changes: 1 addition & 3 deletions notebooks/openvino/optimum_openvino_inference.ipynb
@@ -466,9 +466,7 @@
"source": [
"# Set the device directly with `.from_pretrained()`\n",
"if \"GPU\" in Core().available_devices:\n",
" model = OVModelForQuestionAnswering.from_pretrained(\n",
" \"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\"\n",
" )"
" model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\")"
]
},
{
86 changes: 47 additions & 39 deletions notebooks/openvino/quantized_generation_demo.ipynb
@@ -121,7 +121,7 @@
" \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
" },\n",
" \"compile\": False,\n",
" \"quantization_config\": quantization_config,\n",
" \"quantization_config\": quantization_config\n",
"}\n",
"\n",
"# Check whether the model was already exported\n",
@@ -143,8 +143,8 @@
"\n",
"# TODO Optional: export to huggingface/hub\n",
"\n",
"model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024**3\n",
"print(f\"Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB\")"
"model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024 ** 3\n",
"print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')"
]
},
{
@@ -212,7 +212,7 @@
"from transformers import TextStreamer\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors=\"pt\")\n",
"inputs = tokenizer([sample], return_tensors='pt')\n",
"\n",
"# Call generate on the inputs\n",
"out = model.generate(\n",
@@ -294,15 +294,15 @@
"\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors=\"pt\")\n",
"inputs = tokenizer([sample], return_tensors='pt') \n",
"\n",
"out = stateless_model.generate(\n",
" **inputs,\n",
" max_new_tokens=128,\n",
" streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n",
" pad_token_id=tokenizer.eos_token_id,\n",
" prompt_lookup_num_tokens=3,\n",
")"
") "
]
},
{
@@ -358,7 +358,7 @@
" \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
" },\n",
" \"compile\": False,\n",
" \"quantization_config\": quantization_config,\n",
" \"quantization_config\": quantization_config\n",
"}\n",
"\n",
"# Check whether the model was already exported\n",
@@ -458,15 +458,15 @@
" if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n",
" raise RuntimeError(\"Always use a new instance, don't reuse!\")\n",
" self.model_forward = self.model.forward\n",
"\n",
" \n",
" @wraps(self.model_forward)\n",
" def forward_wrapper(**kwargs):\n",
" self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n",
" self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n",
" return self.model_forward(**kwargs)\n",
"\n",
" \n",
" self.model.forward = forward_wrapper\n",
"\n",
" \n",
" # wrap generate method\n",
" self.model_generate = self.model.generate\n",
"\n",
@@ -479,11 +479,10 @@
" out = self.model_generate(*args, **kwargs)\n",
" self.seq_lens[-1].append(out.shape[-1])\n",
" return out\n",
"\n",
" self.model.generate = generate_wrapper\n",
" return self\n",
"\n",
" def __exit__(self, type, value, traceback):\n",
" def __exit__(self, type, value, traceback):\n",
" self.model.forward = self.model_forward\n",
" self.model.generate = self.model_generate\n",
" self.model_forward = None\n",
@@ -495,7 +494,7 @@
" self.seq_lens = [sl[1:] for sl in self.seq_lens]\n",
" # Add window size for output to ease calculation later\n",
" for ws, sl in zip(self.win_sizes, self.seq_lens):\n",
" ws.append(0)\n",
" ws.append(0) \n",
"\n",
" def acceptance_rate(self, return_mean=True, normalize=False):\n",
" # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n",
@@ -504,8 +503,9 @@
" sl = np.array(sl, dtype=np.float64)\n",
" ws = np.array(ws, dtype=np.float64)\n",
" out_lens = sl - ws\n",
" accepted = out_lens[1:] - out_lens[:-1] - 1\n",
" ar_per_win.append(np.divide(accepted, ws[:-1], out=np.zeros_like(accepted), where=ws[:-1] != 0))\n",
" accepted = (out_lens[1:] - out_lens[:-1] - 1)\n",
" ar_per_win.append(np.divide(accepted, ws[:-1],\n",
" out=np.zeros_like(accepted),where=ws[:-1] != 0))\n",
" ar_per_win = np.hstack(ar_per_win)\n",
" # Normalized AR doesn't take into account windows with size 0\n",
" if normalize:\n",
@@ -544,7 +544,7 @@
"samples_number = 30\n",
"with AcceptanceRateRecorder(stateless_model) as ar_recorder:\n",
" for text in tqdm(dataset[:samples_number]):\n",
" tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors=\"pt\")\n",
" tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors='pt')\n",
" stateless_model.generate(\n",
" **tokenized_prompt,\n",
" max_new_tokens=128,\n",
@@ -623,6 +623,7 @@
" return False\n",
"\n",
"\n",
"\n",
"# Set the chat template to the tokenizer. The chat template implements the simple template of\n",
"# User: content\n",
"# Assistant: content\n",
@@ -650,7 +651,11 @@
" if model_msg:\n",
" messages.append({\"role\": \"Assistant\", \"content\": model_msg})\n",
" input_token = tokenizer.apply_chat_template(\n",
" messages, add_generation_prompt=True, tokenize=True, return_tensors=\"pt\", return_dict=True\n",
" messages,\n",
" add_generation_prompt=True,\n",
" tokenize=True,\n",
" return_tensors=\"pt\",\n",
" return_dict=True\n",
" )\n",
" return input_token\n",
"\n",
@@ -674,18 +679,18 @@
" # Construct the input message string for the model by concatenating the current system message and conversation history\n",
" # Tokenize the messages string\n",
" inputs = prepare_history_for_model(history)\n",
" input_length = inputs[\"input_ids\"].shape[1]\n",
" input_length = inputs['input_ids'].shape[1]\n",
" # truncate input in case it is too long.\n",
" # TODO improve this\n",
" if input_length > 2000:\n",
" history = [history[-1]]\n",
" inputs = prepare_history_for_model(history)\n",
" input_length = inputs[\"input_ids\"].shape[1]\n",
" input_length = inputs['input_ids'].shape[1]\n",
"\n",
" prompt_char = \"\"\n",
" history[-1][1] = prompt_char\n",
" yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n",
"\n",
" \n",
" streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"\n",
" # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n",
@@ -701,14 +706,11 @@
" eos_token_id=[tokenizer.eos_token_id],\n",
" pad_token_id=tokenizer.eos_token_id,\n",
" )\n",
" generate_kwargs = (\n",
" dict(\n",
" streamer=streamer,\n",
" generation_config=generation_config,\n",
" stopping_criteria=stopping_criteria,\n",
" )\n",
" | inputs\n",
" )\n",
" generate_kwargs = dict(\n",
" streamer=streamer,\n",
" generation_config=generation_config,\n",
" stopping_criteria=stopping_criteria,\n",
" ) | inputs\n",
"\n",
" if assisted:\n",
" target_generate = stateless_model.generate\n",
@@ -735,7 +737,7 @@
" yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n",
" history[-1][1] = partial_text\n",
" generation_time = time.perf_counter() - start\n",
" yield history, f\"Generation time: {generation_time:.2f} sec\", *([gr.update(interactive=True)] * 4)"
" yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)"
]
},
{
@@ -779,9 +781,7 @@
" [\"Can you explain to me briefly what is Python programming language?\"],\n",
" [\"Explain the plot of Cinderella in a sentence.\"],\n",
" [\"Write a Python function to perform binary search over a sorted list. Use markdown to write code\"],\n",
" [\n",
" \"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"\n",
" ],\n",
" [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"],\n",
"]\n",
"\n",
"\n",
@@ -797,7 +797,7 @@
" \"\"\"\n",
" # Append current user message to history with a blank assistant message which will be generated by the model\n",
" history.append([message, None])\n",
" return (\"\", history)\n",
" return ('', history)\n",
"\n",
"\n",
"def prepare_for_regenerate(history):\n",
@@ -808,7 +808,7 @@
" history: conversation history\n",
" Returns:\n",
" updated history\n",
" \"\"\"\n",
" \"\"\" \n",
" history[-1][1] = None\n",
" return history\n",
"\n",
@@ -821,7 +821,7 @@
" msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n",
" status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=15)\n",
" with gr.Row():\n",
" submit = gr.Button(\"Submit\", variant=\"primary\")\n",
" submit = gr.Button(\"Submit\", variant='primary')\n",
" regenerate = gr.Button(\"Regenerate\")\n",
" clear = gr.Button(\"Clear\")\n",
" with gr.Accordion(\"Advanced Options:\", open=False):\n",
@@ -860,7 +860,9 @@
" step=0.1,\n",
" interactive=True,\n",
" )\n",
" gr.Examples(EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\")\n",
" gr.Examples(\n",
" EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n",
" )\n",
"\n",
" # Sets generate function to be triggered when the user submit a new message\n",
" gr.on(\n",
@@ -874,14 +876,20 @@
" inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n",
" outputs=[chatbot, status, msg, submit, regenerate, clear],\n",
" concurrency_limit=1,\n",
" queue=True,\n",
" queue=True\n",
" )\n",
" regenerate.click(fn=prepare_for_regenerate, inputs=chatbot, outputs=chatbot, queue=True, concurrency_limit=1).then(\n",
" regenerate.click(\n",
" fn=prepare_for_regenerate,\n",
" inputs=chatbot,\n",
" outputs=chatbot,\n",
" queue=True,\n",
" concurrency_limit=1\n",
" ).then(\n",
" fn=generate,\n",
" inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n",
" outputs=[chatbot, status, msg, submit, regenerate, clear],\n",
" concurrency_limit=1,\n",
" queue=True,\n",
" queue=True\n",
" )\n",
" clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)"
]
25 changes: 11 additions & 14 deletions notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb
@@ -167,7 +167,6 @@
"def preprocess_fn(example):\n",
" return {\"prompt\": example[\"caption\"]}\n",
"\n",
"\n",
"NUM_SAMPLES = 200\n",
"dataset = dataset.take(NUM_SAMPLES)\n",
"calibration_dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)"
@@ -1067,14 +1066,12 @@
],
"source": [
"int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n",
"quantization_config = OVWeightQuantizationConfig(\n",
" bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID\n",
")\n",
"quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID)\n",
"quantizer = OVQuantizer(int8_pipe)\n",
"quantizer.quantize(\n",
" ov_config=OVConfig(quantization_config=quantization_config),\n",
" calibration_dataset=calibration_dataset,\n",
" save_directory=int8_model_path,\n",
" save_directory=int8_model_path\n",
")"
]
},
@@ -1205,10 +1202,8 @@
" im_w, im_h = fp32_img.size\n",
" is_horizontal = im_h <= im_w\n",
" figsize = (20, 30) if is_horizontal else (30, 20)\n",
" fig, axs = plt.subplots(\n",
" 1 if is_horizontal else 2, 2 if is_horizontal else 1, figsize=figsize, sharex=\"all\", sharey=\"all\"\n",
" )\n",
" fig.patch.set_facecolor(\"white\")\n",
" fig, axs = plt.subplots(1 if is_horizontal else 2, 2 if is_horizontal else 1, figsize=figsize, sharex='all', sharey='all')\n",
" fig.patch.set_facecolor('white')\n",
" list_axes = list(axs.flat)\n",
" for a in list_axes:\n",
" a.set_xticklabels([])\n",
@@ -1222,7 +1217,7 @@
" img2_title = \"INT8 result\"\n",
" list_axes[0].set_title(img1_title, fontsize=20)\n",
" list_axes[1].set_title(img2_title, fontsize=20)\n",
" fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01, hspace=0.01 if is_horizontal else 0.0)\n",
" fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01 , hspace=0.01 if is_horizontal else 0.0)\n",
" fig.tight_layout()"
]
},
@@ -1235,10 +1230,13 @@
"source": [
"prompt = \"Self-portrait oil painting, a beautiful cyborg with golden hair, 8k\"\n",
"\n",
"\n",
"def generate_image(pipeline, prompt):\n",
" transformers.set_seed(1)\n",
" return pipeline(prompt=prompt, guidance_scale=8.0, output_type=\"pil\").images[0]"
" return pipeline(\n",
" prompt=prompt,\n",
" guidance_scale=8.0,\n",
" output_type=\"pil\"\n",
" ).images[0]"
]
},
{
@@ -1331,7 +1329,7 @@
"def get_model_size(model_folder, framework):\n",
" \"\"\"\n",
" Return OpenVINO or PyTorch model size in Mb.\n",
"\n",
" \n",
" Arguments:\n",
" model_folder:\n",
" Directory containing a model.\n",
@@ -1533,7 +1531,6 @@
"def get_val_dataset(num_items=3):\n",
" return [item[\"caption\"] for item in dataset.take(num_items)]\n",
"\n",
"\n",
"def benchmark(pipeline, dataset):\n",
" \"\"\"\n",
" Benchmark PyTorch or OpenVINO model. This function does inference on `num_items`\n",
2 changes: 1 addition & 1 deletion optimum/exporters/openvino/convert.py
@@ -908,7 +908,7 @@ def get_diffusion_models_for_export_ext(
is_sd3 = False

if not is_sd3:
return get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)
return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)

models_for_export = {}

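For context on the convert.py hunk above: before this commit the non-SD3 branch of get_diffusion_models_for_export_ext returned a single value (the models dictionary), and the fix changes it to return a two-element tuple, None plus that dictionary, presumably so the branch matches the tuple shape produced by the SD3 path and expected by callers that unpack two values. The short sketch below only illustrates that calling pattern; the names and toy values are stand-ins, not the actual optimum-intel code.

# Illustrative sketch only: stand-in names and toy values, not the real
# optimum-intel implementation or API.
def get_models_for_export_ext(pipeline, is_sd3=False):
    if not is_sd3:
        # Old behaviour: return a bare dict. Fixed behaviour: return a pair,
        # so both branches of the function have the same shape.
        return None, {"unet": pipeline}
    models_for_export = {"transformer": pipeline}
    sd3_specific = {"is_sd3": True}  # placeholder for whatever the SD3 path returns first
    return sd3_specific, models_for_export

# With both branches returning a pair, callers can unpack uniformly:
extra, models = get_models_for_export_ext(pipeline=object())
assert extra is None and "unet" in models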