From b6f102cbdb112666c42b2f5f8c574954ac560bce Mon Sep 17 00:00:00 2001
From: staru09
Date: Wed, 13 Nov 2024 19:03:24 +0530
Subject: [PATCH] Revert "outlines added"

This reverts commit 16426247c37190be4320f465108354c555b860a3.
---
 docetl/operations/cpp_outlines.py | 114 --------------------------
 docetl/operations/hf_outlines.py  |  60 -------------
 tests/test_cpp_outlines.py        | 135 ------------------------------
 tests/test_hf_outlines.py         | 134 -----------------------------
 4 files changed, 443 deletions(-)
 delete mode 100644 docetl/operations/cpp_outlines.py
 delete mode 100644 docetl/operations/hf_outlines.py
 delete mode 100644 tests/test_cpp_outlines.py
 delete mode 100644 tests/test_hf_outlines.py

diff --git a/docetl/operations/cpp_outlines.py b/docetl/operations/cpp_outlines.py
deleted file mode 100644
index dfbb8902..00000000
--- a/docetl/operations/cpp_outlines.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple
-from pydantic import BaseModel
-from docetl.operations.base import BaseOperation
-from outlines import generate, models
-import llama_cpp
-import json
-
-class LlamaCppMapOperation(BaseOperation):
-    class schema(BaseOperation.schema):
-        type: str = "llama_cpp_map"
-        model_path: str
-        model_file: str
-        output_schema: Dict[str, Any]
-        prompt_template: str
-        batch_size: Optional[int] = 10
-        n_gpu_layers: int = -1
-        flash_attn: bool = True
-        n_ctx: int = 8192
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        self.tokenizer = llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-            self.config["model_path"]
-        )
-        self.model = models.llamacpp(
-            self.config["model_path"],
-            self.config["model_file"],
-            tokenizer=self.tokenizer,
-            n_gpu_layers=self.config["n_gpu_layers"],
-            flash_attn=self.config["flash_attn"],
-            n_ctx=self.config["n_ctx"],
-            verbose=False
-        )
-
-        output_model = BaseModel.model_validate(self.config["output_schema"])
-        self.processor = generate.json(
-            self.model,
-            output_model,
-            max_tokens=4096
-        )
-
-    def syntax_check(self) -> None:
-        """Validate the operation configuration."""
-        config = self.schema(**self.config)
-
-        if not config.model_path:
-            raise ValueError("model_path is required")
-
-        if not config.model_file:
-            raise ValueError("model_file is required for llama_cpp models")
-
-        if not config.output_schema:
-            raise ValueError("output_schema is required")
-
-        if not config.prompt_template:
-            raise ValueError("prompt_template is required")
-
-    def create_prompt(self, item: Dict[str, Any]) -> str:
-        """Create a prompt from the template and input data."""
-        messages = [
-            {
-                'role': 'user',
-                'content': self.config["prompt_template"]
-            },
-            {
-                'role': 'assistant',
-                'content': "I understand and will process the input as requested."
-            },
-            {
-                'role': 'user',
-                'content': str(item)
-            }
-        ]
-        return self.tokenizer.hf_tokenizer.apply_chat_template(
-            messages,
-            tokenize=False
-        )
-
-    def process_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
-        """Process a single item through the Outlines model."""
-        prompt = self.create_prompt(item)
-        try:
-            result = self.processor(prompt)
-            result_dict = result.model_dump()
-            final_dict = {**item, **result_dict}
-            return json.loads(json.dumps(final_dict, indent=2))
-        except Exception as e:
-            self.console.print(f"Error processing item: {e}")
-            return json.loads(json.dumps(item, indent=2))
-
-    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
-        """Execute the operation on the input data."""
-        if self.status:
-            self.status.stop()
-
-        results = []
-        batch_size = self.config.get("batch_size", 10)
-
-        for i in range(0, len(input_data), batch_size):
-            batch = input_data[i:i + batch_size]
-            batch_results = [self.process_item(item) for item in batch]
-            results.extend(batch_results)
-
-        if self.status:
-            self.status.start()
-
-        return results, 0.0
-
-
-
-
-
-
diff --git a/docetl/operations/hf_outlines.py b/docetl/operations/hf_outlines.py
deleted file mode 100644
index b2d3d241..00000000
--- a/docetl/operations/hf_outlines.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple
-from pydantic import BaseModel, create_model
-from docetl.operations.base import BaseOperation
-from outlines import generate, models
-import json
-
-class HuggingFaceMapOperation(BaseOperation):
-    class schema(BaseOperation.schema):
-        name: str
-        type: str = "hf_map"
-        model_path: str
-        output_schema: Dict[str, Any]
-        prompt_template: str
-        max_tokens: int = 4096
-
-    def __init__(self, config: Dict[str, Any], runner=None, *args, **kwargs):
-        super().__init__(
-            config=config,
-            default_model=config.get('default_model', config['model_path']),
-            max_threads=config.get('max_threads', 1),
-            runner=runner
-        )
-
-        self.model = models.transformers(
-            self.config["model_path"]
-        )
-
-        # Create a dynamic Pydantic model from the output schema
-        field_definitions = {
-            k: (eval(v) if isinstance(v, str) else v, ...)
-            for k, v in self.config["output_schema"].items()
-        }
-        output_model = create_model('OutputModel', **field_definitions)
-
-        self.processor = generate.json(
-            self.model,
-            output_model
-        )
-
-    def syntax_check(self) -> None:
-        """Validate the operation configuration."""
-        self.schema(**self.config)
-
-    def process_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
-        """Process a single item through the model."""
-        try:
-            result = self.processor(self.config["prompt_template"] + "\n" + str(item))
-            result_dict = result.model_dump()
-            final_dict = {**item, **result_dict}
-            return final_dict
-        except Exception as e:
-            self.console.print(f"Error processing item: {e}")
-            return item
-
-    @classmethod
-    def execute(cls, config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], float]:
-        """Execute the operation on the input data."""
-        instance = cls(config)
-        results = [instance.process_item(item) for item in input_data]
-        return results, 0.0
\ No newline at end of file
diff --git a/tests/test_cpp_outlines.py b/tests/test_cpp_outlines.py
deleted file mode 100644
index 19fa92b9..00000000
--- a/tests/test_cpp_outlines.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import pytest
-from unittest.mock import Mock, patch
-from docetl.operations.cpp_outlines import LlamaCppMapOperation
-
-@pytest.fixture
-def sample_config():
-    return {
-        "type": "llama_cpp_map",
-        "model_path": "/path/to/local/model",
-        "model_file": "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
-        "output_schema": {
-            "first_name": "str",
-            "last_name": "str",
-            "order_number": "str",
-            "department": "str"
-        },
-        "prompt_template": "Extract customer information from this text",
-        "batch_size": 2,
-        "n_gpu_layers": -1,
-        "flash_attn": True,
-        "n_ctx": 8192
-    }
-
-@pytest.fixture
-def mock_processor_output():
-    class MockOutput:
-        def model_dump(self):
-            return {
-                "first_name": "John",
-                "last_name": "Doe",
-                "order_number": "12345",
-                "department": "Sales"
-            }
-    return MockOutput()
-
-@pytest.fixture
-def sample_input_data():
-    return [
-        {"message": "Customer John Doe ordered item #12345"},
-        {"message": "Customer Jane Smith from Sales department"}
-    ]
-
-def test_initialization(sample_config):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        assert operation.config == sample_config
-        assert operation.config["n_gpu_layers"] == -1
-        assert operation.config["flash_attn"] is True
-
-def test_syntax_check(sample_config):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.syntax_check()
-
-@pytest.mark.parametrize("missing_field", [
-    "model_path",
-    "model_file",
-    "output_schema",
-    "prompt_template"
-])
-def test_syntax_check_missing_fields(sample_config, missing_field):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        invalid_config = sample_config.copy()
-        invalid_config[missing_field] = ""
-        operation = LlamaCppMapOperation(invalid_config)
-        with pytest.raises(ValueError):
-            operation.syntax_check()
-
-def test_create_prompt(sample_config):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained') as mock_tokenizer_class:
-        mock_tokenizer = Mock()
-        mock_tokenizer.hf_tokenizer.apply_chat_template.return_value = "mocked prompt"
-        mock_tokenizer_class.return_value = mock_tokenizer
-
-        with patch('outlines.models.llamacpp'):
-            operation = LlamaCppMapOperation(sample_config)
-            test_item = {"message": "test message"}
-            prompt = operation.create_prompt(test_item)
-
-            assert isinstance(prompt, str)
-            assert mock_tokenizer.hf_tokenizer.apply_chat_template.called
-
-def test_process_item(sample_config, mock_processor_output):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.processor = Mock(return_value=mock_processor_output)
-
-        test_item = {"message": "test message"}
-        result = operation.process_item(test_item)
-
-        assert isinstance(result, dict)
-        assert "first_name" in result
-        assert "last_name" in result
-        assert "message" in result
-
-def test_process_item_error_handling(sample_config):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.processor = Mock(side_effect=Exception("Test error"))
-
-        test_item = {"message": "test message"}
-        result = operation.process_item(test_item)
-
-        assert isinstance(result, dict)
-        assert "message" in result
-
-def test_execute(sample_config, sample_input_data):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.process_item = Mock(return_value={"processed": True})
-
-        results, timing = operation.execute(sample_input_data)
-
-        assert len(results) == len(sample_input_data)
-        assert isinstance(timing, float)
-
-def test_batch_processing(sample_config, sample_input_data):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.process_item = Mock(return_value={"processed": True})
-
-        sample_config["batch_size"] = 1
-        results1, _ = operation.execute(sample_input_data)
-        assert len(results1) == len(sample_input_data)
-
-        sample_config["batch_size"] = 2
-        results2, _ = operation.execute(sample_input_data)
-        assert len(results2) == len(sample_input_data)
\ No newline at end of file
diff --git a/tests/test_hf_outlines.py b/tests/test_hf_outlines.py
deleted file mode 100644
index ef9fc943..00000000
--- a/tests/test_hf_outlines.py
+++ /dev/null
@@ -1,134 +0,0 @@
-import pytest
-from unittest.mock import Mock, patch, MagicMock
-from docetl.operations.hf_outlines import HuggingFaceMapOperation
-
-@pytest.fixture
-def mock_runner():
-    return Mock()
-
-@pytest.fixture
-def sample_config():
-    return {
-        "name": "test_hf_operation",
-        "type": "hf_map",
-        "model_path": "meta-llama/Llama-3.2-1B-Instruct",
-        "output_schema": {
-            "first_name": "str",
-            "last_name": "str"
-        },
-        "prompt_template": "Extract customer information from this text",
-        "max_tokens": 4096
-    }
-
-@pytest.fixture
-def research_config():
-    return {
-        "name": "research_analyzer",
-        "type": "hf_map",
-        "model_path": "meta-llama/Llama-3.2-1B-Instruct",
-        "output_schema": {
-            "title": "str",
-            "authors": "list",
-            "methodology": "str",
-            "findings": "list",
-            "limitations": "list",
-            "future_work": "list"
-        },
-        "prompt_template": "Analyze the following research paper abstract.\nExtract key components and summarize findings.",
-        "max_tokens": 4096
-    }
-
-@pytest.fixture
-def mock_research_output():
-    class MockOutput:
-        def model_dump(self):
-            return {
-                "title": "Deep Learning in Natural Language Processing",
-                "authors": ["John Smith", "Jane Doe"],
-                "methodology": "Comparative analysis of transformer architectures",
-                "findings": [
-                    "Improved accuracy by 15%",
-                    "Reduced training time by 30%"
-                ],
-                "limitations": [
-                    "Limited dataset size",
-                    "Computational constraints"
-                ],
-                "future_work": [
-                    "Extend to multilingual models",
-                    "Optimize for edge devices"
-                ]
-            }
-    return MockOutput()
-
-def test_process_item(sample_config, mock_runner):
-    mock_model = MagicMock()
-
-    class MockOutput:
-        def model_dump(self):
-            return {
-                "first_name": "John",
-                "last_name": "Doe"
-            }
-
-    mock_processor = Mock(return_value=MockOutput())
-
-    with patch('outlines.models.transformers', return_value=mock_model) as mock_transformers, \
-         patch('outlines.generate.json', return_value=mock_processor):
-
-        operation = HuggingFaceMapOperation(sample_config, runner=mock_runner)
-        test_item = {"message": "test message"}
-        result = operation.process_item(test_item)
-
-        assert isinstance(result, dict)
-        assert "first_name" in result
-        assert "last_name" in result
-        assert "message" in result
-
-def test_research_paper_analysis(research_config, mock_research_output, mock_runner):
-    mock_model = MagicMock()
-    mock_processor = Mock(return_value=mock_research_output)
-
-    with patch('outlines.models.transformers', return_value=mock_model) as mock_transformers, \
-         patch('outlines.generate.json', return_value=mock_processor):
-
-        operation = HuggingFaceMapOperation(research_config, runner=mock_runner)
-        test_item = {
-            "abstract": """
-            This paper presents a comprehensive analysis of deep learning approaches
-            in natural language processing. We compare various transformer architectures
-            and their performance on standard NLP tasks.
-            """
-        }
-        result = operation.process_item(test_item)
-
-        # Verify structure and types
-        assert isinstance(result, dict)
-        assert "title" in result
-        assert isinstance(result["title"], str)
-        assert "authors" in result
-        assert isinstance(result["authors"], list)
-        assert "methodology" in result
-        assert isinstance(result["methodology"], str)
-        assert "findings" in result
-        assert isinstance(result["findings"], list)
-        assert len(result["findings"]) > 0
-        assert "limitations" in result
-        assert isinstance(result["limitations"], list)
-        assert "future_work" in result
-        assert isinstance(result["future_work"], list)
-
-        # Verify original input is preserved
-        assert "abstract" in result
-
-def test_execute(sample_config, mock_runner):
-    mock_model = MagicMock()
-    mock_processor = Mock(return_value={"first_name": "John", "last_name": "Doe"})
-
-    with patch('outlines.models.transformers', return_value=mock_model) as mock_transformers, \
-         patch('outlines.generate.json', return_value=mock_processor):
-
-        input_data = [{"message": "test message"}]
-        results, timing = HuggingFaceMapOperation.execute(sample_config, input_data)
-        assert len(results) == 1
-        assert isinstance(timing, float)
\ No newline at end of file