Skip to content

Commit

Permalink
Disable speedup test for CausalLM with pkv
Browse files Browse the repository at this point in the history
Speedup is small on the GitHub Actions runner hardware, so this test regularly
fails even with a speedup threshold of only 1.1.
  • Loading branch information
helena-intel committed Oct 25, 2023
1 parent fe658be commit ecad239
Showing 1 changed file with 7 additions and 20 deletions.
27 changes: 7 additions & 20 deletions tests/openvino/test_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase):
"pegasus",
)
GENERATION_LENGTH = 100
SPEEDUP_CACHE = 1.1

@parameterized.expand(SUPPORTED_ARCHITECTURES)
def test_compare_to_transformers(self, model_arch):
Expand Down Expand Up @@ -557,29 +556,17 @@ def test_compare_with_and_without_past_key_values(self):
tokens = tokenizer("This is a sample input", return_tensors="pt")

model_with_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True)
# Warmup
_ = model_with_pkv.generate(**tokens)
with Timer() as with_pkv_timer:
outputs_model_with_pkv = model_with_pkv.generate(
**tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
)

outputs_model_with_pkv = model_with_pkv.generate(
**tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
)
model_without_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False)

# Warmup
_ = model_without_pkv.generate(**tokens)
with Timer() as without_pkv_timer:
outputs_model_without_pkv = model_without_pkv.generate(
**tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
)
outputs_model_without_pkv = model_without_pkv.generate(
**tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1
)
self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv))
self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH)
self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH)
self.assertTrue(
without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE,
f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms,"
f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}",
)

del model_with_pkv
del model_without_pkv
gc.collect()
Expand Down

0 comments on commit ecad239

Please sign in to comment.