Commit

fix model export
eaidova committed Oct 7, 2024
1 parent 689a757 commit b543d94
Showing 6 changed files with 67 additions and 67 deletions.
8 changes: 2 additions & 6 deletions notebooks/ipex/text_generation.ipynb
@@ -62,13 +62,9 @@
"source": [
"model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
"input_sentence = [\n",
" \"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"\n",
"]\n",
"input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
"model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",
"generation_kwargs = dict(\n",
" max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True\n",
")\n",
"generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True)\n",
"\n",
"generated_ids = model.generate(**model_inputs, **generation_kwargs)\n",
"output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]\n",
4 changes: 1 addition & 3 deletions notebooks/openvino/optimum_openvino_inference.ipynb
@@ -466,9 +466,7 @@
"source": [
"# Set the device directly with `.from_pretrained()`\n",
"if \"GPU\" in Core().available_devices:\n",
" model = OVModelForQuestionAnswering.from_pretrained(\n",
" \"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\"\n",
" )"
" model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp16\", device=\"GPU\")"
]
},
{
86 changes: 47 additions & 39 deletions notebooks/openvino/quantized_generation_demo.ipynb
@@ -121,7 +121,7 @@
" \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
" },\n",
" \"compile\": False,\n",
" \"quantization_config\": quantization_config,\n",
" \"quantization_config\": quantization_config\n",
"}\n",
"\n",
"# Check whether the model was already exported\n",
@@ -143,8 +143,8 @@
"\n",
"# TODO Optional: export to huggingface/hub\n",
"\n",
"model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024**3\n",
"print(f\"Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB\")"
"model_size = os.stat(os.path.join(save_name, \"openvino_model.bin\")).st_size / 1024 ** 3\n",
"print(f'Model size in FP32: ~5.4GB, current model size in 4bit: {model_size:.2f}GB')"
]
},
{
@@ -212,7 +212,7 @@
"from transformers import TextStreamer\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors=\"pt\")\n",
"inputs = tokenizer([sample], return_tensors='pt')\n",
"\n",
"# Call generate on the inputs\n",
"out = model.generate(\n",
@@ -294,15 +294,15 @@
"\n",
"\n",
"# Tokenize the sample\n",
"inputs = tokenizer([sample], return_tensors=\"pt\")\n",
"inputs = tokenizer([sample], return_tensors='pt') \n",
"\n",
"out = stateless_model.generate(\n",
" **inputs,\n",
" max_new_tokens=128,\n",
" streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n",
" pad_token_id=tokenizer.eos_token_id,\n",
" prompt_lookup_num_tokens=3,\n",
")"
") "
]
},
{
@@ -358,7 +358,7 @@
" \"CACHE_DIR\": os.path.join(save_name, \"model_cache\"), # OpenVINO will use this directory as cache\n",
" },\n",
" \"compile\": False,\n",
" \"quantization_config\": quantization_config,\n",
" \"quantization_config\": quantization_config\n",
"}\n",
"\n",
"# Check whether the model was already exported\n",
@@ -458,15 +458,15 @@
" if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n",
" raise RuntimeError(\"Always use a new instance, don't reuse!\")\n",
" self.model_forward = self.model.forward\n",
"\n",
" \n",
" @wraps(self.model_forward)\n",
" def forward_wrapper(**kwargs):\n",
" self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n",
" self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n",
" return self.model_forward(**kwargs)\n",
"\n",
" \n",
" self.model.forward = forward_wrapper\n",
"\n",
" \n",
" # wrap generate method\n",
" self.model_generate = self.model.generate\n",
"\n",
@@ -479,11 +479,10 @@
" out = self.model_generate(*args, **kwargs)\n",
" self.seq_lens[-1].append(out.shape[-1])\n",
" return out\n",
"\n",
" self.model.generate = generate_wrapper\n",
" return self\n",
"\n",
" def __exit__(self, type, value, traceback):\n",
" def __exit__(self, type, value, traceback):\n",
" self.model.forward = self.model_forward\n",
" self.model.generate = self.model_generate\n",
" self.model_forward = None\n",
@@ -495,7 +494,7 @@
" self.seq_lens = [sl[1:] for sl in self.seq_lens]\n",
" # Add window size for output to ease calculation later\n",
" for ws, sl in zip(self.win_sizes, self.seq_lens):\n",
" ws.append(0)\n",
" ws.append(0) \n",
"\n",
" def acceptance_rate(self, return_mean=True, normalize=False):\n",
" # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n",
@@ -504,8 +503,9 @@
" sl = np.array(sl, dtype=np.float64)\n",
" ws = np.array(ws, dtype=np.float64)\n",
" out_lens = sl - ws\n",
" accepted = out_lens[1:] - out_lens[:-1] - 1\n",
" ar_per_win.append(np.divide(accepted, ws[:-1], out=np.zeros_like(accepted), where=ws[:-1] != 0))\n",
" accepted = (out_lens[1:] - out_lens[:-1] - 1)\n",
" ar_per_win.append(np.divide(accepted, ws[:-1],\n",
" out=np.zeros_like(accepted),where=ws[:-1] != 0))\n",
" ar_per_win = np.hstack(ar_per_win)\n",
" # Normalized AR doesn't take into account windows with size 0\n",
" if normalize:\n",
@@ -544,7 +544,7 @@
"samples_number = 30\n",
"with AcceptanceRateRecorder(stateless_model) as ar_recorder:\n",
" for text in tqdm(dataset[:samples_number]):\n",
" tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors=\"pt\")\n",
" tokenized_prompt = tokenizer([prompt_template.format(text=text)], return_tensors='pt')\n",
" stateless_model.generate(\n",
" **tokenized_prompt,\n",
" max_new_tokens=128,\n",
@@ -623,6 +623,7 @@
" return False\n",
"\n",
"\n",
"\n",
"# Set the chat template to the tokenizer. The chat template implements the simple template of\n",
"# User: content\n",
"# Assistant: content\n",
@@ -650,7 +651,11 @@
" if model_msg:\n",
" messages.append({\"role\": \"Assistant\", \"content\": model_msg})\n",
" input_token = tokenizer.apply_chat_template(\n",
" messages, add_generation_prompt=True, tokenize=True, return_tensors=\"pt\", return_dict=True\n",
" messages,\n",
" add_generation_prompt=True,\n",
" tokenize=True,\n",
" return_tensors=\"pt\",\n",
" return_dict=True\n",
" )\n",
" return input_token\n",
"\n",
@@ -674,18 +679,18 @@
" # Construct the input message string for the model by concatenating the current system message and conversation history\n",
" # Tokenize the messages string\n",
" inputs = prepare_history_for_model(history)\n",
" input_length = inputs[\"input_ids\"].shape[1]\n",
" input_length = inputs['input_ids'].shape[1]\n",
" # truncate input in case it is too long.\n",
" # TODO improve this\n",
" if input_length > 2000:\n",
" history = [history[-1]]\n",
" inputs = prepare_history_for_model(history)\n",
" input_length = inputs[\"input_ids\"].shape[1]\n",
" input_length = inputs['input_ids'].shape[1]\n",
"\n",
" prompt_char = \"\"\n",
" history[-1][1] = prompt_char\n",
" yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n",
"\n",
" \n",
" streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
"\n",
" # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n",
@@ -701,14 +706,11 @@
" eos_token_id=[tokenizer.eos_token_id],\n",
" pad_token_id=tokenizer.eos_token_id,\n",
" )\n",
" generate_kwargs = (\n",
" dict(\n",
" streamer=streamer,\n",
" generation_config=generation_config,\n",
" stopping_criteria=stopping_criteria,\n",
" )\n",
" | inputs\n",
" )\n",
" generate_kwargs = dict(\n",
" streamer=streamer,\n",
" generation_config=generation_config,\n",
" stopping_criteria=stopping_criteria,\n",
" ) | inputs\n",
"\n",
" if assisted:\n",
" target_generate = stateless_model.generate\n",
@@ -735,7 +737,7 @@
" yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n",
" history[-1][1] = partial_text\n",
" generation_time = time.perf_counter() - start\n",
" yield history, f\"Generation time: {generation_time:.2f} sec\", *([gr.update(interactive=True)] * 4)"
" yield history, f'Generation time: {generation_time:.2f} sec', *([gr.update(interactive=True)] * 4)"
]
},
{
@@ -779,9 +781,7 @@
" [\"Can you explain to me briefly what is Python programming language?\"],\n",
" [\"Explain the plot of Cinderella in a sentence.\"],\n",
" [\"Write a Python function to perform binary search over a sorted list. Use markdown to write code\"],\n",
" [\n",
" \"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"\n",
" ],\n",
" [\"Lily has a rubber ball that she drops from the top of a wall. The wall is 2 meters tall. How long will it take for the ball to reach the ground?\"],\n",
"]\n",
"\n",
"\n",
@@ -797,7 +797,7 @@
" \"\"\"\n",
" # Append current user message to history with a blank assistant message which will be generated by the model\n",
" history.append([message, None])\n",
" return (\"\", history)\n",
" return ('', history)\n",
"\n",
"\n",
"def prepare_for_regenerate(history):\n",
@@ -808,7 +808,7 @@
" history: conversation history\n",
" Returns:\n",
" updated history\n",
" \"\"\"\n",
" \"\"\" \n",
" history[-1][1] = None\n",
" return history\n",
"\n",
@@ -821,7 +821,7 @@
" msg = gr.Textbox(placeholder=\"Enter message here...\", show_label=False, autofocus=True, scale=75)\n",
" status = gr.Textbox(\"Status: Idle\", show_label=False, max_lines=1, scale=15)\n",
" with gr.Row():\n",
" submit = gr.Button(\"Submit\", variant=\"primary\")\n",
" submit = gr.Button(\"Submit\", variant='primary')\n",
" regenerate = gr.Button(\"Regenerate\")\n",
" clear = gr.Button(\"Clear\")\n",
" with gr.Accordion(\"Advanced Options:\", open=False):\n",
@@ -860,7 +860,9 @@
" step=0.1,\n",
" interactive=True,\n",
" )\n",
" gr.Examples(EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\")\n",
" gr.Examples(\n",
" EXAMPLES, inputs=msg, label=\"Click on any example and press the 'Submit' button\"\n",
" )\n",
"\n",
" # Sets generate function to be triggered when the user submit a new message\n",
" gr.on(\n",
@@ -874,14 +876,20 @@
" inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n",
" outputs=[chatbot, status, msg, submit, regenerate, clear],\n",
" concurrency_limit=1,\n",
" queue=True,\n",
" queue=True\n",
" )\n",
" regenerate.click(fn=prepare_for_regenerate, inputs=chatbot, outputs=chatbot, queue=True, concurrency_limit=1).then(\n",
" regenerate.click(\n",
" fn=prepare_for_regenerate,\n",
" inputs=chatbot,\n",
" outputs=chatbot,\n",
" queue=True,\n",
" concurrency_limit=1\n",
" ).then(\n",
" fn=generate,\n",
" inputs=[chatbot, temperature, max_new_tokens, top_p, repetition_penalty, assisted],\n",
" outputs=[chatbot, status, msg, submit, regenerate, clear],\n",
" concurrency_limit=1,\n",
" queue=True,\n",
" queue=True\n",
" )\n",
" clear.click(fn=lambda: (None, \"Status: Idle\"), inputs=None, outputs=[chatbot, status], queue=False)"
]
25 changes: 11 additions & 14 deletions notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb
@@ -167,7 +167,6 @@
"def preprocess_fn(example):\n",
" return {\"prompt\": example[\"caption\"]}\n",
"\n",
"\n",
"NUM_SAMPLES = 200\n",
"dataset = dataset.take(NUM_SAMPLES)\n",
"calibration_dataset = dataset.map(lambda x: preprocess_fn(x), remove_columns=dataset.column_names)"
@@ -1067,14 +1066,12 @@
],
"source": [
"int8_pipe = OVStableDiffusionPipeline.from_pretrained(model_id=MODEL_ID, export=True)\n",
"quantization_config = OVWeightQuantizationConfig(\n",
" bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID\n",
")\n",
"quantization_config = OVWeightQuantizationConfig(bits=8, num_samples=NUM_SAMPLES, quant_method=OVQuantizationMethod.HYBRID)\n",
"quantizer = OVQuantizer(int8_pipe)\n",
"quantizer.quantize(\n",
" ov_config=OVConfig(quantization_config=quantization_config),\n",
" calibration_dataset=calibration_dataset,\n",
" save_directory=int8_model_path,\n",
" save_directory=int8_model_path\n",
")"
]
},
@@ -1205,10 +1202,8 @@
" im_w, im_h = fp32_img.size\n",
" is_horizontal = im_h <= im_w\n",
" figsize = (20, 30) if is_horizontal else (30, 20)\n",
" fig, axs = plt.subplots(\n",
" 1 if is_horizontal else 2, 2 if is_horizontal else 1, figsize=figsize, sharex=\"all\", sharey=\"all\"\n",
" )\n",
" fig.patch.set_facecolor(\"white\")\n",
" fig, axs = plt.subplots(1 if is_horizontal else 2, 2 if is_horizontal else 1, figsize=figsize, sharex='all', sharey='all')\n",
" fig.patch.set_facecolor('white')\n",
" list_axes = list(axs.flat)\n",
" for a in list_axes:\n",
" a.set_xticklabels([])\n",
@@ -1222,7 +1217,7 @@
" img2_title = \"INT8 result\"\n",
" list_axes[0].set_title(img1_title, fontsize=20)\n",
" list_axes[1].set_title(img2_title, fontsize=20)\n",
" fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01, hspace=0.01 if is_horizontal else 0.0)\n",
" fig.subplots_adjust(wspace=0.0 if is_horizontal else 0.01 , hspace=0.01 if is_horizontal else 0.0)\n",
" fig.tight_layout()"
]
},
@@ -1235,10 +1230,13 @@
"source": [
"prompt = \"Self-portrait oil painting, a beautiful cyborg with golden hair, 8k\"\n",
"\n",
"\n",
"def generate_image(pipeline, prompt):\n",
" transformers.set_seed(1)\n",
" return pipeline(prompt=prompt, guidance_scale=8.0, output_type=\"pil\").images[0]"
" return pipeline(\n",
" prompt=prompt,\n",
" guidance_scale=8.0,\n",
" output_type=\"pil\"\n",
" ).images[0]"
]
},
{
@@ -1331,7 +1329,7 @@
"def get_model_size(model_folder, framework):\n",
" \"\"\"\n",
" Return OpenVINO or PyTorch model size in Mb.\n",
"\n",
" \n",
" Arguments:\n",
" model_folder:\n",
" Directory containing a model.\n",
@@ -1533,7 +1531,6 @@
"def get_val_dataset(num_items=3):\n",
" return [item[\"caption\"] for item in dataset.take(num_items)]\n",
"\n",
"\n",
"def benchmark(pipeline, dataset):\n",
" \"\"\"\n",
" Benchmark PyTorch or OpenVINO model. This function does inference on `num_items`\n",
2 changes: 1 addition & 1 deletion optimum/exporters/openvino/convert.py
@@ -908,7 +908,7 @@ def get_diffusion_models_for_export_ext(
is_sd3 = False

if not is_sd3:
return get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)
return None, get_diffusion_models_for_export(pipeline, int_dtype, float_dtype, exporter)

models_for_export = {}

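For context on the convert.py hunk above: before this commit the non-SD3 branch of get_diffusion_models_for_export_ext returned a single value (the models dictionary), and the fix changes it to return a two-element tuple, None plus that dictionary, presumably so the branch matches the tuple shape produced by the SD3 path and expected by callers that unpack two values. The short sketch below only illustrates that calling pattern; the names and toy values are stand-ins, not the actual optimum-intel code.

# Illustrative sketch only: stand-in names and toy values, not the real
# optimum-intel implementation or API.
def get_models_for_export_ext(pipeline, is_sd3=False):
    if not is_sd3:
        # Old behaviour: return a bare dict. Fixed behaviour: return a pair,
        # so both branches of the function have the same shape.
        return None, {"unet": pipeline}
    models_for_export = {"transformer": pipeline}
    sd3_specific = {"is_sd3": True}  # placeholder for whatever the SD3 path returns first
    return sd3_specific, models_for_export

# With both branches returning a pair, callers can unpack uniformly:
extra, models = get_models_for_export_ext(pipeline=object())
assert extra is None and "unet" in models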