From a5ee5990f47903a81d252f996d923758c02252f4 Mon Sep 17 00:00:00 2001 From: isaac hershenson Date: Mon, 9 Dec 2024 08:46:18 -0800 Subject: [PATCH] refactor --- python/langsmith/client.py | 29 ++++++------ python/langsmith/evaluation/_arunner.py | 6 ++- python/langsmith/evaluation/_runner.py | 8 ++-- python/langsmith/schemas.py | 30 +++++++++---- python/tests/integration_tests/test_client.py | 44 +++++-------------- 5 files changed, 56 insertions(+), 61 deletions(-) diff --git a/python/langsmith/client.py b/python/langsmith/client.py index 9fe393125..778911808 100644 --- a/python/langsmith/client.py +++ b/python/langsmith/client.py @@ -3580,6 +3580,7 @@ def _prepate_multipart_data( def upload_examples_multipart( self, *, + dataset_id: ID_TYPE, uploads: List[ls_schemas.ExampleUploadWithAttachments] = None, ) -> ls_schemas.UpsertExamplesResponse: """Upload examples.""" @@ -3592,10 +3593,6 @@ def upload_examples_multipart( if uploads is None: uploads = [] encoder, data = self._prepate_multipart_data(uploads, include_dataset_id=False) - dataset_ids = set([example.dataset_id for example in uploads]) - if len(dataset_ids) > 1: - raise ValueError("All examples must be in the same dataset.") - dataset_id = list(dataset_ids)[0] response = self.request_with_retries( "POST", @@ -3823,21 +3820,21 @@ def read_example( ) example = response.json() - attachment_urls = {} + attachments_info = {} if example["attachment_urls"]: for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() reader = io.BytesIO(response.content) - attachment_urls[key.split(".")[1]] = ( - value["presigned_url"], - reader, - ) + attachments_info[key.split(".")[1]] = { + "presigned_url": value["presigned_url"], + "reader": reader, + } del example["attachment_urls"] return ls_schemas.Example( **example, - attachment_urls=attachment_urls, + attachments_info=attachments_info, _host_url=self._host_url, _tenant_id=self._get_optional_tenant_id(), ) @@ -3910,21 +3907,21 @@ def list_examples( for i, example in enumerate( self._get_paginated_list("/examples", params=params) ): - attachment_urls = {} + attachments_info = {} if example["attachment_urls"]: for key, value in example["attachment_urls"].items(): response = requests.get(value["presigned_url"], stream=True) response.raise_for_status() reader = io.BytesIO(response.content) - attachment_urls[key.split(".")[1]] = ( - value["presigned_url"], - reader, - ) + attachments_info[key.split(".")[1]] = { + "presigned_url": value["presigned_url"], + "reader": reader, + } del example["attachment_urls"] yield ls_schemas.Example( **example, - attachment_urls=attachment_urls, + attachments_info=attachments_info, _host_url=self._host_url, _tenant_id=self._get_optional_tenant_id(), ) diff --git a/python/langsmith/evaluation/_arunner.py b/python/langsmith/evaluation/_arunner.py index 9412bf5f3..5a22ba305 100644 --- a/python/langsmith/evaluation/_arunner.py +++ b/python/langsmith/evaluation/_arunner.py @@ -1023,7 +1023,7 @@ def _get_run(r: run_trees.RunTree) -> None: with rh.tracing_context(enabled=True): try: args = ( - (example.inputs, example.attachment_urls) + (example.inputs, example.attachments_info) if include_attachments else (example.inputs,) ) @@ -1044,6 +1044,10 @@ def _get_run(r: run_trees.RunTree) -> None: client=client, ), ) + if include_attachments and example.attachments_info is not None: + for attachment in example.attachments_info: + reader = example.attachments_info[attachment]["reader"] + reader.seek(0) except Exception as e: logger.error( f"Error running target function: {e}", exc_info=True, stacklevel=1 diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index ebd259e14..fbb096484 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -1834,7 +1834,7 @@ def _get_run(r: rt.RunTree) -> None: ) try: args = ( - (example.inputs, example.attachment_urls) + (example.inputs, example.attachments_info) if include_attachments else (example.inputs,) ) @@ -1842,9 +1842,9 @@ def _get_run(r: rt.RunTree) -> None: *args, langsmith_extra=langsmith_extra, ) - if include_attachments and example.attachment_urls is not None: - for attachment in example.attachment_urls: - _, reader = example.attachment_urls[attachment] + if include_attachments and example.attachments_info is not None: + for attachment in example.attachments_info: + reader = example.attachments_info[attachment]["reader"] reader.seek(0) except Exception as e: logger.error( diff --git a/python/langsmith/schemas.py b/python/langsmith/schemas.py index 34de09aaa..b06552dcd 100644 --- a/python/langsmith/schemas.py +++ b/python/langsmith/schemas.py @@ -93,9 +93,6 @@ class ExampleBase(BaseModel): inputs: Dict[str, Any] = Field(default_factory=dict) outputs: Optional[Dict[str, Any]] = Field(default=None) metadata: Optional[Dict[str, Any]] = Field(default=None) - attachment_urls: Optional[Dict[str, Tuple[str, BinaryIOLike]]] = Field(default=None) - """Dictionary with attachment names as keys and a tuple of the S3 url - and a reader of the data for the file.""" class Config: """Configuration class for the schema.""" @@ -112,16 +109,30 @@ class ExampleCreate(ExampleBase): split: Optional[Union[str, List[str]]] = None -class ExampleUpsertWithAttachments(ExampleCreate): - """Example create with attachments.""" +class ExampleUploadWithAttachments(BaseModel): + """Example upload with attachments.""" + id: Optional[UUID] + created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + inputs: Dict[str, Any] = Field(default_factory=dict) + outputs: Optional[Dict[str, Any]] = Field(default=None) + metadata: Optional[Dict[str, Any]] = Field(default=None) + split: Optional[Union[str, List[str]]] = None attachments: Optional[Attachments] = None -class ExampleUploadWithAttachments(ExampleUpsertWithAttachments): - """Example upload with attachments.""" +class ExampleUpsertWithAttachments(ExampleUploadWithAttachments): + """Example create with attachments.""" + + dataset_id: UUID - pass + +class AttachmentInfo(TypedDict): + """Info for an attachment.""" + + presigned_url: str + reader: BinaryIOLike + # TODO: add mime type class Example(ExampleBase): @@ -135,6 +146,9 @@ class Example(ExampleBase): modified_at: Optional[datetime] = Field(default=None) runs: List[Run] = Field(default_factory=list) source_run_id: Optional[UUID] = None + attachments_info: Optional[Dict[str, AttachmentInfo]] = Field(default=None) + """Dictionary with attachment names as keys and a tuple of the S3 url + and a reader of the data for the file.""" _host_url: Optional[str] = PrivateAttr(default=None) _tenant_id: Optional[UUID] = PrivateAttr(default=None) diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py index bc3f6f33f..e175efc9e 100644 --- a/python/tests/integration_tests/test_client.py +++ b/python/tests/integration_tests/test_client.py @@ -393,7 +393,6 @@ def test_upload_examples_multipart(langchain_client: Client): example_id = uuid4() example_1 = ExampleUploadWithAttachments( id=example_id, - dataset_id=dataset.id, inputs={"text": "hello world"}, attachments={ "test_file": ("text/plain", b"test content"), @@ -402,13 +401,11 @@ def test_upload_examples_multipart(langchain_client: Client): # Test example with minimum required fields example_2 = ExampleUploadWithAttachments( - dataset_id=dataset.id, inputs={"text": "minimal example"}, ) # Test example with outputs and multiple attachments example_3 = ExampleUploadWithAttachments( - dataset_id=dataset.id, inputs={"text": "example with outputs"}, outputs={"response": "test response"}, attachments={ @@ -419,7 +416,7 @@ def test_upload_examples_multipart(langchain_client: Client): # Test uploading multiple examples at once created_examples = langchain_client.upload_examples_multipart( - uploads=[example_1, example_2, example_3] + dataset_id=dataset.id, uploads=[example_1, example_2, example_3] ) assert created_examples["count"] == 3 @@ -439,7 +436,7 @@ def test_upload_examples_multipart(langchain_client: Client): # Verify example with ID was created with correct ID example_with_id = [ex for ex in examples if ex.id == example_id][0] assert example_with_id.inputs["text"] == "hello world" - assert "test_file" in example_with_id.attachment_urls + assert "test_file" in example_with_id.attachments_info # Verify example with outputs and multiple attachments example_with_outputs = next( @@ -447,35 +444,20 @@ def test_upload_examples_multipart(langchain_client: Client): for ex in examples if ex.outputs and ex.outputs.get("response") == "test response" ) - assert len(example_with_outputs.attachment_urls) == 2 - assert "file1" in example_with_outputs.attachment_urls - assert "file2" in example_with_outputs.attachment_urls + assert len(example_with_outputs.attachments_info) == 2 + assert "file1" in example_with_outputs.attachments_info + assert "file2" in example_with_outputs.attachments_info # Test uploading to non-existent dataset fails fake_id = uuid4() with pytest.raises(LangSmithNotFoundError): langchain_client.upload_examples_multipart( + dataset_id=fake_id, uploads=[ ExampleUploadWithAttachments( - dataset_id=fake_id, inputs={"text": "should fail"}, ) - ] - ) - - # Test uploading examples to different datasets fails - with pytest.raises(ValueError, match="All examples must be in the same dataset"): - langchain_client.upload_examples_multipart( - uploads=[ - ExampleUploadWithAttachments( - dataset_id=dataset.id, - inputs={"text": "example 1"}, - ), - ExampleUploadWithAttachments( - dataset_id=uuid4(), - inputs={"text": "example 2"}, - ), - ] + ], ) # Clean up @@ -1283,8 +1265,7 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: ) # 2. Create example with attachments - example = ExampleUpsertWithAttachments( - dataset_id=dataset.id, + example = ExampleUploadWithAttachments( inputs={"question": "What is shown in the image?"}, outputs={"answer": "test image"}, attachments={ @@ -1292,13 +1273,13 @@ def test_evaluate_with_attachments(langchain_client: Client) -> None: }, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) # 3. Define target function that uses attachments def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive the attachment data assert "image" in attachments - image_url, image_data = attachments["image"] + image_data = attachments["image"]["reader"] assert image_data.read() == b"fake image data for testing" return {"answer": "test image"} @@ -1345,12 +1326,11 @@ def test_evaluate_with_no_attachments(langchain_client: Client) -> None: ) # Verify we can create example the new way without attachments - example = ExampleUpsertWithAttachments( - dataset_id=dataset.id, + example = ExampleUploadWithAttachments( inputs={"question": "What is 3+1?"}, outputs={"answer": "4"}, ) - langchain_client.upsert_examples_multipart(upserts=[example]) + langchain_client.upload_examples_multipart(dataset_id=dataset.id, uploads=[example]) def target(inputs: Dict[str, Any], attachments: Dict[str, Any]) -> Dict[str, Any]: # Verify we receive an empty attachments dict