From a716949d1c28341b2fb9af75f43b541ddeac9335 Mon Sep 17 00:00:00 2001
From: Drew Robbins <drew@drewby.com>
Date: Thu, 16 Jan 2025 07:24:35 +0900
Subject: [PATCH] Add metrics to the Python OpenAI instrumentation (#3180)

---
 .../CHANGELOG.md                              |   1 +
 .../README.rst                                |  45 ++++-
 .../instrumentation/openai_v2/__init__.py     |  18 +-
 .../instrumentation/openai_v2/instruments.py  |  11 +
 .../instrumentation/openai_v2/patch.py        | 107 +++++++++-
 .../test_async_chat_completion_metrics.yaml   | 133 ++++++++++++
 .../test_chat_completion_metrics.yaml         | 135 +++++++++++++
 .../tests/conftest.py                         |  83 +++++++-
 .../tests/test_chat_completions.py            |  49 ++++-
 .../tests/test_chat_metrics.py                | 190 ++++++++++++++++++
 10 files changed, 763 insertions(+), 9 deletions(-)
 create mode 100644 instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/instruments.py
 create mode 100644 instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/cassettes/test_async_chat_completion_metrics.yaml
 create mode 100644 instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/cassettes/test_chat_completion_metrics.yaml
 create mode 100644 instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_metrics.py

diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/CHANGELOG.md b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/CHANGELOG.md
index 4644ee3dc5..ed27904e63 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/CHANGELOG.md
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add example to `opentelemetry-instrumentation-openai-v2`
   ([#3006](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/3006))
 - Support for `AsyncOpenAI/AsyncCompletions` ([#2984](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/2984))
+- Add metrics ([#3180](https://github.com/open-telemetry/opentelemetry-python-contrib/pull/3180))
 
 ## Version 2.0b0 (2024-11-08)
 
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/README.rst b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/README.rst
index d2cb0b5724..c402b30bc0 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/README.rst
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/README.rst
@@ -7,7 +7,8 @@ OpenTelemetry OpenAI Instrumentation
    :target: https://pypi.org/project/opentelemetry-instrumentation-openai-v2/
 
 This library allows tracing LLM requests and logging of messages made by the
-`OpenAI Python API library <https://pypi.org/project/openai/>`_.
+`OpenAI Python API library <https://pypi.org/project/openai/>`_. It also
+records the duration of each operation and the number of tokens used as
+metrics.
 
 
 Installation
@@ -74,6 +75,48 @@ To uninstrument clients, call the uninstrument method:
     # Uninstrument all clients
     OpenAIInstrumentor().uninstrument()
 
+Bucket Boundaries
+-----------------
+
+This section describes the explicit bucket boundaries recommended by the semantic conventions for the token usage and operation duration metrics, and shows how to apply them with Views.
+
+The bucket boundaries are defined as follows:
+
+- For `gen_ai.client.token.usage`: [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864]
+- For `gen_ai.client.operation.duration`: [0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92]
+
+To apply these bucket boundaries, create Views in your OpenTelemetry SDK setup. For example:
+
+.. code-block:: python
+
+    from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
+    from opentelemetry.metrics import set_meter_provider
+    from opentelemetry.sdk.metrics import MeterProvider
+    from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
+    from opentelemetry.sdk.metrics.view import (
+        ExplicitBucketHistogramAggregation,
+        View,
+    )
+
+    views = [
+        View(
+            instrument_name="gen_ai.client.token.usage",
+            aggregation=ExplicitBucketHistogramAggregation(
+                boundaries=[
+                    1, 4, 16, 64, 256, 1024, 4096, 16384, 65536,
+                    262144, 1048576, 4194304, 16777216, 67108864,
+                ]
+            ),
+        ),
+        View(
+            instrument_name="gen_ai.client.operation.duration",
+            aggregation=ExplicitBucketHistogramAggregation(
+                boundaries=[
+                    0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64,
+                    1.28, 2.56, 5.12, 10.24, 20.48, 40.96, 81.92,
+                ]
+            ),
+        ),
+    ]
+
+    metric_exporter = OTLPMetricExporter(endpoint="http://localhost:4317")
+    metric_reader = PeriodicExportingMetricReader(metric_exporter)
+    provider = MeterProvider(
+        metric_readers=[metric_reader],
+        views=views
+    )
+
+    set_meter_provider(provider)
+
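+If you only want to verify locally that the metrics are emitted, a console
+exporter can stand in for the OTLP exporter. This is a minimal sketch that
+reuses the ``views`` defined above:
+
+.. code-block:: python
+
+    from opentelemetry.metrics import set_meter_provider
+    from opentelemetry.sdk.metrics import MeterProvider
+    from opentelemetry.sdk.metrics.export import (
+        ConsoleMetricExporter,
+        PeriodicExportingMetricReader,
+    )
+
+    # Print metrics to stdout instead of sending them over OTLP.
+    reader = PeriodicExportingMetricReader(ConsoleMetricExporter())
+    provider = MeterProvider(metric_readers=[reader], views=views)
+    set_meter_provider(provider)
+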
+For more details, refer to the `OpenTelemetry GenAI Metrics documentation <https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-metrics/>`_.
+
 References
 ----------
 * `OpenTelemetry OpenAI Instrumentation <https://opentelemetry-python-contrib.readthedocs.io/en/latest/instrumentation/openai/openai.html>`_
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/__init__.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/__init__.py
index ee3bbfdb73..ab4b6f9d7b 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/__init__.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/__init__.py
@@ -49,13 +49,18 @@
 from opentelemetry.instrumentation.openai_v2.package import _instruments
 from opentelemetry.instrumentation.openai_v2.utils import is_content_enabled
 from opentelemetry.instrumentation.utils import unwrap
+from opentelemetry.metrics import get_meter
 from opentelemetry.semconv.schemas import Schemas
 from opentelemetry.trace import get_tracer
 
+from .instruments import Instruments
 from .patch import async_chat_completions_create, chat_completions_create
 
 
 class OpenAIInstrumentor(BaseInstrumentor):
+    def __init__(self):
+        self._meter = None
+
     def instrumentation_dependencies(self) -> Collection[str]:
         return _instruments
 
@@ -75,12 +80,21 @@ def _instrument(self, **kwargs):
             schema_url=Schemas.V1_28_0.value,
             event_logger_provider=event_logger_provider,
         )
+        meter_provider = kwargs.get("meter_provider")
+        self._meter = get_meter(
+            __name__,
+            "",
+            meter_provider,
+            schema_url=Schemas.V1_28_0.value,
+        )
+
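+        # Create the instruments once; the sync and async wrappers share them.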
+        instruments = Instruments(self._meter)
 
         wrap_function_wrapper(
             module="openai.resources.chat.completions",
             name="Completions.create",
             wrapper=chat_completions_create(
-                tracer, event_logger, is_content_enabled()
+                tracer, event_logger, instruments, is_content_enabled()
             ),
         )
 
@@ -88,7 +102,7 @@ def _instrument(self, **kwargs):
             module="openai.resources.chat.completions",
             name="AsyncCompletions.create",
             wrapper=async_chat_completions_create(
-                tracer, event_logger, is_content_enabled()
+                tracer, event_logger, instruments, is_content_enabled()
             ),
         )
 
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/instruments.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/instruments.py
new file mode 100644
index 0000000000..d1e184ac84
--- /dev/null
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/instruments.py
@@ -0,0 +1,11 @@
+from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
+
+
+class Instruments:
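+    """Histogram instruments defined by the GenAI client semantic conventions."""
+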
+    def __init__(self, meter):
+        self.operation_duration_histogram = (
+            gen_ai_metrics.create_gen_ai_client_operation_duration(meter)
+        )
+        self.token_usage_histogram = (
+            gen_ai_metrics.create_gen_ai_client_token_usage(meter)
+        )
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py
index cd284473ce..307b312fca 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 
+from timeit import default_timer
 from typing import Optional
 
 from openai import Stream
@@ -21,8 +22,12 @@
 from opentelemetry.semconv._incubating.attributes import (
     gen_ai_attributes as GenAIAttributes,
 )
+from opentelemetry.semconv._incubating.attributes import (
+    server_attributes as ServerAttributes,
+)
 from opentelemetry.trace import Span, SpanKind, Tracer
 
+from .instruments import Instruments
 from .utils import (
     choice_to_event,
     get_llm_request_attributes,
@@ -34,7 +39,10 @@
 
 
 def chat_completions_create(
-    tracer: Tracer, event_logger: EventLogger, capture_content: bool
+    tracer: Tracer,
+    event_logger: EventLogger,
+    instruments: Instruments,
+    capture_content: bool,
 ):
     """Wrap the `create` method of the `ChatCompletion` class to trace it."""
 
@@ -54,6 +62,9 @@ def traced_method(wrapped, instance, args, kwargs):
                         message_to_event(message, capture_content)
                     )
 
+            start = default_timer()
+            result = None
+            error_type = None
             try:
                 result = wrapped(*args, **kwargs)
                 if is_streaming(kwargs):
@@ -69,14 +80,27 @@ def traced_method(wrapped, instance, args, kwargs):
                 return result
 
             except Exception as error:
+                error_type = type(error).__qualname__
                 handle_span_exception(span, error)
                 raise
+            finally:
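+                # Record metrics whether the call succeeded or raised; clamp the
+                # duration so clock adjustments never yield a negative value.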
+                duration = max((default_timer() - start), 0)
+                _record_metrics(
+                    instruments,
+                    duration,
+                    result,
+                    span_attributes,
+                    error_type,
+                )
 
     return traced_method
 
 
 def async_chat_completions_create(
-    tracer: Tracer, event_logger: EventLogger, capture_content: bool
+    tracer: Tracer,
+    event_logger: EventLogger,
+    instruments: Instruments,
+    capture_content: bool,
 ):
     """Wrap the `create` method of the `AsyncChatCompletion` class to trace it."""
 
@@ -96,6 +120,9 @@ async def traced_method(wrapped, instance, args, kwargs):
                         message_to_event(message, capture_content)
                     )
 
+            start = default_timer()
+            result = None
+            error_type = None
             try:
                 result = await wrapped(*args, **kwargs)
                 if is_streaming(kwargs):
@@ -111,12 +138,88 @@ async def traced_method(wrapped, instance, args, kwargs):
                 return result
 
             except Exception as error:
+                error_type = type(error).__qualname__
                 handle_span_exception(span, error)
                 raise
+            finally:
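+                # Record metrics whether the call succeeded or raised; clamp the
+                # duration so clock adjustments never yield a negative value.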
+                duration = max((default_timer() - start), 0)
+                _record_metrics(
+                    instruments,
+                    duration,
+                    result,
+                    span_attributes,
+                    error_type,
+                )
 
     return traced_method
 
 
+def _record_metrics(
+    instruments: Instruments,
+    duration: float,
+    result,
+    span_attributes: dict,
+    error_type: Optional[str],
+):
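+    # Attributes shared by the duration and token-usage histogram data points.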
+    common_attributes = {
+        GenAIAttributes.GEN_AI_OPERATION_NAME: GenAIAttributes.GenAiOperationNameValues.CHAT.value,
+        GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value,
+        GenAIAttributes.GEN_AI_REQUEST_MODEL: span_attributes[
+            GenAIAttributes.GEN_AI_REQUEST_MODEL
+        ],
+    }
+
+    if error_type:
+        common_attributes["error.type"] = error_type
+
+    if result and getattr(result, "model", None):
+        common_attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL] = result.model
+
+    if result and getattr(result, "service_tier", None):
+        common_attributes[
+            GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER
+        ] = result.service_tier
+
+    if result and getattr(result, "system_fingerprint", None):
+        common_attributes["gen_ai.openai.response.system_fingerprint"] = (
+            result.system_fingerprint
+        )
+
+    if ServerAttributes.SERVER_ADDRESS in span_attributes:
+        common_attributes[ServerAttributes.SERVER_ADDRESS] = span_attributes[
+            ServerAttributes.SERVER_ADDRESS
+        ]
+
+    if ServerAttributes.SERVER_PORT in span_attributes:
+        common_attributes[ServerAttributes.SERVER_PORT] = span_attributes[
+            ServerAttributes.SERVER_PORT
+        ]
+
+    instruments.operation_duration_histogram.record(
+        duration,
+        attributes=common_attributes,
+    )
+
+    if result and getattr(result, "usage", None):
+        input_attributes = {
+            **common_attributes,
+            GenAIAttributes.GEN_AI_TOKEN_TYPE: GenAIAttributes.GenAiTokenTypeValues.INPUT.value,
+        }
+        instruments.token_usage_histogram.record(
+            result.usage.prompt_tokens,
+            attributes=input_attributes,
+        )
+
+        completion_attributes = {
+            **common_attributes,
+            GenAIAttributes.GEN_AI_TOKEN_TYPE: GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value,
+        }
+        instruments.token_usage_histogram.record(
+            result.usage.completion_tokens,
+            attributes=completion_attributes,
+        )
+
+
 def _set_response_attributes(
     span, result, event_logger: EventLogger, capture_content: bool
 ):
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/cassettes/test_async_chat_completion_metrics.yaml b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/cassettes/test_async_chat_completion_metrics.yaml
new file mode 100644
index 0000000000..e771e93cbe
--- /dev/null
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/cassettes/test_async_chat_completion_metrics.yaml
@@ -0,0 +1,133 @@
+interactions:
+- request:
+    body: |-
+      {
+        "messages": [
+          {
+            "role": "user",
+            "content": "Say this is a test"
+          }
+        ],
+        "model": "gpt-4o-mini",
+        "stream": false
+      }
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate
+      authorization:
+      - Bearer test_openai_api_key
+      connection:
+      - keep-alive
+      content-length:
+      - '106'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      user-agent:
+      - AsyncOpenAI/Python 1.26.0
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - async:asyncio
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.26.0
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.12.5
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: |-
+        {
+          "id": "chatcmpl-ASv9R2E7Yhb2e7bj4Xl0qm9s3J42Y",
+          "object": "chat.completion",
+          "created": 1731456237,
+          "model": "gpt-4o-mini-2024-07-18",
+          "choices": [
+            {
+              "index": 0,
+              "message": {
+                "role": "assistant",
+                "content": "This is a test. How can I assist you further?",
+                "refusal": null
+              },
+              "logprobs": null,
+              "finish_reason": "stop"
+            }
+          ],
+          "service_tier": "default",
+          "usage": {
+            "prompt_tokens": 12,
+            "completion_tokens": 12,
+            "total_tokens": 24,
+            "prompt_tokens_details": {
+              "cached_tokens": 0,
+              "audio_tokens": 0
+            },
+            "completion_tokens_details": {
+              "reasoning_tokens": 0,
+              "audio_tokens": 0,
+              "accepted_prediction_tokens": 0,
+              "rejected_prediction_tokens": 0
+            }
+          },
+          "system_fingerprint": "fp_0ba0d124f1"
+        }
+    headers:
+      CF-Cache-Status:
+      - DYNAMIC
+      CF-RAY:
+      - 8e1a80679a8311a6-MRS
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Wed, 13 Nov 2024 00:03:58 GMT
+      Server:
+      - cloudflare
+      Set-Cookie: test_set_cookie
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      content-length:
+      - '796'
+      openai-organization: test_openai_org_id
+      openai-processing-ms:
+      - '359'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-ratelimit-limit-requests:
+      - '30000'
+      x-ratelimit-limit-tokens:
+      - '150000000'
+      x-ratelimit-remaining-requests:
+      - '29999'
+      x-ratelimit-remaining-tokens:
+      - '149999978'
+      x-ratelimit-reset-requests:
+      - 2ms
+      x-ratelimit-reset-tokens:
+      - 0s
+      x-request-id:
+      - req_41ea134c1fc450d4ca4cf8d0c6a7c53a
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/cassettes/test_chat_completion_metrics.yaml b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/cassettes/test_chat_completion_metrics.yaml
new file mode 100644
index 0000000000..1c6c11c858
--- /dev/null
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/cassettes/test_chat_completion_metrics.yaml
@@ -0,0 +1,135 @@
+interactions:
+- request:
+    body: |-
+      {
+        "messages": [
+          {
+            "role": "user",
+            "content": "Say this is a test"
+          }
+        ],
+        "model": "gpt-4o-mini",
+        "stream": false
+      }
+    headers:
+      accept:
+      - application/json
+      accept-encoding:
+      - gzip, deflate
+      authorization:
+      - Bearer test_openai_api_key
+      connection:
+      - keep-alive
+      content-length:
+      - '106'
+      content-type:
+      - application/json
+      host:
+      - api.openai.com
+      user-agent:
+      - OpenAI/Python 1.54.3
+      x-stainless-arch:
+      - arm64
+      x-stainless-async:
+      - 'false'
+      x-stainless-lang:
+      - python
+      x-stainless-os:
+      - MacOS
+      x-stainless-package-version:
+      - 1.54.3
+      x-stainless-retry-count:
+      - '0'
+      x-stainless-runtime:
+      - CPython
+      x-stainless-runtime-version:
+      - 3.12.6
+    method: POST
+    uri: https://api.openai.com/v1/chat/completions
+  response:
+    body:
+      string: |-
+        {
+          "id": "chatcmpl-ASYMQRl3A3DXL9FWCK9tnGRcKIO7q",
+          "object": "chat.completion",
+          "created": 1731368630,
+          "model": "gpt-4o-mini-2024-07-18",
+          "choices": [
+            {
+              "index": 0,
+              "message": {
+                "role": "assistant",
+                "content": "This is a test.",
+                "refusal": null
+              },
+              "logprobs": null,
+              "finish_reason": "stop"
+            }
+          ],
+          "service_tier": "default",
+          "usage": {
+            "prompt_tokens": 12,
+            "completion_tokens": 5,
+            "total_tokens": 17,
+            "prompt_tokens_details": {
+              "cached_tokens": 0,
+              "audio_tokens": 0
+            },
+            "completion_tokens_details": {
+              "reasoning_tokens": 0,
+              "audio_tokens": 0,
+              "accepted_prediction_tokens": 0,
+              "rejected_prediction_tokens": 0
+            }
+          },
+          "system_fingerprint": "fp_0ba0d124f1"
+        }
+    headers:
+      CF-Cache-Status:
+      - DYNAMIC
+      CF-RAY:
+      - 8e122593ff368bc8-SIN
+      Connection:
+      - keep-alive
+      Content-Type:
+      - application/json
+      Date:
+      - Mon, 11 Nov 2024 23:43:50 GMT
+      Server:
+      - cloudflare
+      Set-Cookie: test_set_cookie
+      Transfer-Encoding:
+      - chunked
+      X-Content-Type-Options:
+      - nosniff
+      access-control-expose-headers:
+      - X-Request-ID
+      alt-svc:
+      - h3=":443"; ma=86400
+      content-length:
+      - '765'
+      openai-organization: test_openai_org_id
+      openai-processing-ms:
+      - '287'
+      openai-version:
+      - '2020-10-01'
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      x-ratelimit-limit-requests:
+      - '10000'
+      x-ratelimit-limit-tokens:
+      - '200000'
+      x-ratelimit-remaining-requests:
+      - '9999'
+      x-ratelimit-remaining-tokens:
+      - '199977'
+      x-ratelimit-reset-requests:
+      - 8.64s
+      x-ratelimit-reset-tokens:
+      - 6ms
+      x-request-id:
+      - req_58cff97afd0e7c0bba910ccf0b044a6f
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/conftest.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/conftest.py
index 18e6582dff..51521dbadd 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/conftest.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/conftest.py
@@ -17,6 +17,17 @@
     InMemoryLogExporter,
     SimpleLogRecordProcessor,
 )
+from opentelemetry.sdk.metrics import (
+    Histogram,
+    MeterProvider,
+)
+from opentelemetry.sdk.metrics.export import (
+    InMemoryMetricReader,
+)
+from opentelemetry.sdk.metrics.view import (
+    ExplicitBucketHistogramAggregation,
+    View,
+)
 from opentelemetry.sdk.trace import TracerProvider
 from opentelemetry.sdk.trace.export import SimpleSpanProcessor
 from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
@@ -36,6 +47,12 @@ def fixture_log_exporter():
     yield exporter
 
 
+@pytest.fixture(scope="function", name="metric_reader")
+def fixture_metric_reader():
+    reader = InMemoryMetricReader()
+    yield reader
+
+
 @pytest.fixture(scope="function", name="tracer_provider")
 def fixture_tracer_provider(span_exporter):
     provider = TracerProvider()
@@ -52,6 +69,62 @@ def fixture_event_logger_provider(log_exporter):
     return event_logger_provider
 
 
+@pytest.fixture(scope="function", name="meter_provider")
+def fixture_meter_provider(metric_reader):
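+    # Apply the explicit bucket boundaries recommended by the GenAI semantic
+    # conventions, matching the Views shown in the README.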
+    token_usage_histogram_view = View(
+        instrument_type=Histogram,
+        instrument_name="gen_ai.client.token.usage",
+        aggregation=ExplicitBucketHistogramAggregation(
+            boundaries=[
+                1,
+                4,
+                16,
+                64,
+                256,
+                1024,
+                4096,
+                16384,
+                65536,
+                262144,
+                1048576,
+                4194304,
+                16777216,
+                67108864,
+            ]
+        ),
+    )
+
+    duration_histogram_view = View(
+        instrument_type=Histogram,
+        instrument_name="gen_ai.client.operation.duration",
+        aggregation=ExplicitBucketHistogramAggregation(
+            boundaries=[
+                0.01,
+                0.02,
+                0.04,
+                0.08,
+                0.16,
+                0.32,
+                0.64,
+                1.28,
+                2.56,
+                5.12,
+                10.24,
+                20.48,
+                40.96,
+                81.92,
+            ]
+        ),
+    )
+
+    meter_provider = MeterProvider(
+        metric_readers=[metric_reader],
+        views=[token_usage_histogram_view, duration_histogram_view],
+    )
+
+    return meter_provider
+
+
 @pytest.fixture(autouse=True)
 def environment():
     if not os.getenv("OPENAI_API_KEY"):
@@ -83,7 +156,9 @@ def vcr_config():
 
 
 @pytest.fixture(scope="function")
-def instrument_no_content(tracer_provider, event_logger_provider):
+def instrument_no_content(
+    tracer_provider, event_logger_provider, meter_provider
+):
     os.environ.update(
         {OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "False"}
     )
@@ -92,6 +167,7 @@ def instrument_no_content(tracer_provider, event_logger_provider):
     instrumentor.instrument(
         tracer_provider=tracer_provider,
         event_logger_provider=event_logger_provider,
+        meter_provider=meter_provider,
     )
 
     yield instrumentor
@@ -100,7 +176,9 @@ def instrument_no_content(tracer_provider, event_logger_provider):
 
 
 @pytest.fixture(scope="function")
-def instrument_with_content(tracer_provider, event_logger_provider):
+def instrument_with_content(
+    tracer_provider, event_logger_provider, meter_provider
+):
     os.environ.update(
         {OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "True"}
     )
@@ -108,6 +186,7 @@ def instrument_with_content(tracer_provider, event_logger_provider):
     instrumentor.instrument(
         tracer_provider=tracer_provider,
         event_logger_provider=event_logger_provider,
+        meter_provider=meter_provider,
     )
 
     yield instrumentor
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_completions.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_completions.py
index 4677b7cb95..9685903603 100644
--- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_completions.py
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_completions.py
@@ -32,6 +32,7 @@
 from opentelemetry.semconv._incubating.attributes import (
     server_attributes as ServerAttributes,
 )
+from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
 
 
 @pytest.mark.vcr()
@@ -94,7 +95,9 @@ def test_chat_completion_no_content(
     assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0])
 
 
-def test_chat_completion_bad_endpoint(span_exporter, instrument_no_content):
+def test_chat_completion_bad_endpoint(
+    span_exporter, metric_reader, instrument_no_content
+):
     llm_model_value = "gpt-4o-mini"
     messages_value = [{"role": "user", "content": "Say this is a test"}]
 
@@ -116,10 +119,31 @@ def test_chat_completion_bad_endpoint(span_exporter, instrument_no_content):
         "APIConnectionError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE]
     )
 
+    metrics = metric_reader.get_metrics_data().resource_metrics
+    assert len(metrics) == 1
+
+    metric_data = metrics[0].scope_metrics[0].metrics
+    duration_metric = next(
+        (
+            m
+            for m in metric_data
+            if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION
+        ),
+        None,
+    )
+    assert duration_metric is not None
+    assert duration_metric.data.data_points[0].sum > 0
+    assert (
+        duration_metric.data.data_points[0].attributes[
+            ErrorAttributes.ERROR_TYPE
+        ]
+        == "APIConnectionError"
+    )
+
 
 @pytest.mark.vcr()
 def test_chat_completion_404(
-    span_exporter, openai_client, instrument_no_content
+    span_exporter, openai_client, metric_reader, instrument_no_content
 ):
     llm_model_value = "this-model-does-not-exist"
     messages_value = [{"role": "user", "content": "Say this is a test"}]
@@ -135,6 +159,27 @@ def test_chat_completion_404(
     assert_all_attributes(spans[0], llm_model_value)
     assert "NotFoundError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE]
 
+    metrics = metric_reader.get_metrics_data().resource_metrics
+    assert len(metrics) == 1
+
+    metric_data = metrics[0].scope_metrics[0].metrics
+    duration_metric = next(
+        (
+            m
+            for m in metric_data
+            if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION
+        ),
+        None,
+    )
+    assert duration_metric is not None
+    assert duration_metric.data.data_points[0].sum > 0
+    assert (
+        duration_metric.data.data_points[0].attributes[
+            ErrorAttributes.ERROR_TYPE
+        ]
+        == "NotFoundError"
+    )
+
 
 @pytest.mark.vcr()
 def test_chat_completion_extra_params(
diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_metrics.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_metrics.py
new file mode 100644
index 0000000000..d0f7c5a596
--- /dev/null
+++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_metrics.py
@@ -0,0 +1,190 @@
+import pytest
+
+from opentelemetry.semconv._incubating.attributes import (
+    gen_ai_attributes as GenAIAttributes,
+)
+from opentelemetry.semconv._incubating.attributes import (
+    server_attributes as ServerAttributes,
+)
+from opentelemetry.semconv._incubating.metrics import gen_ai_metrics
+
+
+def assert_all_metric_attributes(data_point):
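+    """Assert the attributes shared by every metric data point in these tests."""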
+    assert GenAIAttributes.GEN_AI_OPERATION_NAME in data_point.attributes
+    assert (
+        data_point.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME]
+        == GenAIAttributes.GenAiOperationNameValues.CHAT.value
+    )
+    assert GenAIAttributes.GEN_AI_SYSTEM in data_point.attributes
+    assert (
+        data_point.attributes[GenAIAttributes.GEN_AI_SYSTEM]
+        == GenAIAttributes.GenAiSystemValues.OPENAI.value
+    )
+    assert GenAIAttributes.GEN_AI_REQUEST_MODEL in data_point.attributes
+    assert (
+        data_point.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
+        == "gpt-4o-mini"
+    )
+    assert GenAIAttributes.GEN_AI_RESPONSE_MODEL in data_point.attributes
+    assert (
+        data_point.attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL]
+        == "gpt-4o-mini-2024-07-18"
+    )
+    assert "gen_ai.openai.response.system_fingerprint" in data_point.attributes
+    assert (
+        data_point.attributes["gen_ai.openai.response.system_fingerprint"]
+        == "fp_0ba0d124f1"
+    )
+    assert (
+        GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER
+        in data_point.attributes
+    )
+    assert (
+        data_point.attributes[
+            GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER
+        ]
+        == "default"
+    )
+    assert (
+        data_point.attributes[ServerAttributes.SERVER_ADDRESS]
+        == "api.openai.com"
+    )
+
+
+@pytest.mark.vcr()
+def test_chat_completion_metrics(
+    metric_reader, openai_client, instrument_with_content
+):
+    llm_model_value = "gpt-4o-mini"
+    messages_value = [{"role": "user", "content": "Say this is a test"}]
+
+    openai_client.chat.completions.create(
+        messages=messages_value, model=llm_model_value, stream=False
+    )
+
+    metrics = metric_reader.get_metrics_data().resource_metrics
+    assert len(metrics) == 1
+
+    metric_data = metrics[0].scope_metrics[0].metrics
+    assert len(metric_data) == 2
+
+    duration_metric = next(
+        (
+            m
+            for m in metric_data
+            if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION
+        ),
+        None,
+    )
+    assert duration_metric is not None
+    assert duration_metric.data.data_points[0].sum > 0
+    assert_all_metric_attributes(duration_metric.data.data_points[0])
+
+    token_usage_metric = next(
+        (
+            m
+            for m in metric_data
+            if m.name == gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE
+        ),
+        None,
+    )
+    assert token_usage_metric is not None
+
+    input_token_usage = next(
+        (
+            d
+            for d in token_usage_metric.data.data_points
+            if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE]
+            == GenAIAttributes.GenAiTokenTypeValues.INPUT.value
+        ),
+        None,
+    )
+    assert input_token_usage is not None
+    assert input_token_usage.sum == 12
+    # assert against buckets [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864]
+    assert input_token_usage.bucket_counts[2] == 1
+    assert_all_metric_attributes(input_token_usage)
+
+    output_token_usage = next(
+        (
+            d
+            for d in token_usage_metric.data.data_points
+            if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE]
+            == GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value
+        ),
+        None,
+    )
+    assert output_token_usage is not None
+    assert output_token_usage.sum == 5
+    # assert against buckets [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864]
+    assert output_token_usage.bucket_counts[2] == 1
+    assert_all_metric_attributes(output_token_usage)
+
+
+@pytest.mark.vcr()
+@pytest.mark.asyncio()
+async def test_async_chat_completion_metrics(
+    metric_reader, async_openai_client, instrument_with_content
+):
+    llm_model_value = "gpt-4o-mini"
+    messages_value = [{"role": "user", "content": "Say this is a test"}]
+
+    await async_openai_client.chat.completions.create(
+        messages=messages_value, model=llm_model_value, stream=False
+    )
+
+    metrics = metric_reader.get_metrics_data().resource_metrics
+    assert len(metrics) == 1
+
+    metric_data = metrics[0].scope_metrics[0].metrics
+    assert len(metric_data) == 2
+
+    duration_metric = next(
+        (
+            m
+            for m in metric_data
+            if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION
+        ),
+        None,
+    )
+    assert duration_metric is not None
+    assert duration_metric.data.data_points[0].sum > 0
+    assert_all_metric_attributes(duration_metric.data.data_points[0])
+
+    token_usage_metric = next(
+        (
+            m
+            for m in metric_data
+            if m.name == gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE
+        ),
+        None,
+    )
+    assert token_usage_metric is not None
+
+    input_token_usage = next(
+        (
+            d
+            for d in token_usage_metric.data.data_points
+            if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE]
+            == GenAIAttributes.GenAiTokenTypeValues.INPUT.value
+        ),
+        None,
+    )
+
+    assert input_token_usage is not None
+    assert input_token_usage.sum == 12
+    assert_all_metric_attributes(input_token_usage)
+
+    output_token_usage = next(
+        (
+            d
+            for d in token_usage_metric.data.data_points
+            if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE]
+            == GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value
+        ),
+        None,
+    )
+
+    assert output_token_usage is not None
+    assert output_token_usage.sum == 12
+    assert_all_metric_attributes(output_token_usage)