From b6f102cbdb112666c42b2f5f8c574954ac560bce Mon Sep 17 00:00:00 2001
From: staru09
Date: Wed, 13 Nov 2024 19:03:24 +0530
Subject: [PATCH] Revert "outlines added"

This reverts commit 16426247c37190be4320f465108354c555b860a3.
---
 docetl/operations/cpp_outlines.py | 114 --------------------------
 docetl/operations/hf_outlines.py  |  60 -------------
 tests/test_cpp_outlines.py        | 135 ------------------------------
 tests/test_hf_outlines.py         | 134 -----------------------------
 4 files changed, 443 deletions(-)
 delete mode 100644 docetl/operations/cpp_outlines.py
 delete mode 100644 docetl/operations/hf_outlines.py
 delete mode 100644 tests/test_cpp_outlines.py
 delete mode 100644 tests/test_hf_outlines.py

diff --git a/docetl/operations/cpp_outlines.py b/docetl/operations/cpp_outlines.py
deleted file mode 100644
index dfbb8902..00000000
--- a/docetl/operations/cpp_outlines.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple
-from pydantic import BaseModel
-from docetl.operations.base import BaseOperation
-from outlines import generate, models
-import llama_cpp
-import json
-
-class LlamaCppMapOperation(BaseOperation):
-    class schema(BaseOperation.schema):
-        type: str = "llama_cpp_map"
-        model_path: str
-        model_file: str
-        output_schema: Dict[str, Any]
-        prompt_template: str
-        batch_size: Optional[int] = 10
-        n_gpu_layers: int = -1
-        flash_attn: bool = True
-        n_ctx: int = 8192
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-        self.tokenizer = llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-            self.config["model_path"]
-        )
-        self.model = models.llamacpp(
-            self.config["model_path"],
-            self.config["model_file"],
-            tokenizer=self.tokenizer,
-            n_gpu_layers=self.config["n_gpu_layers"],
-            flash_attn=self.config["flash_attn"],
-            n_ctx=self.config["n_ctx"],
-            verbose=False
-        )
-
-        output_model = BaseModel.model_validate(self.config["output_schema"])
-        self.processor = generate.json(
-            self.model,
-            output_model,
-            max_tokens=4096
-        )
-
-    def syntax_check(self) -> None:
-        """Validate the operation configuration."""
-        config = self.schema(**self.config)
-
-        if not config.model_path:
-            raise ValueError("model_path is required")
-
-        if not config.model_file:
-            raise ValueError("model_file is required for llama_cpp models")
-
-        if not config.output_schema:
-            raise ValueError("output_schema is required")
-
-        if not config.prompt_template:
-            raise ValueError("prompt_template is required")
-
-    def create_prompt(self, item: Dict[str, Any]) -> str:
-        """Create a prompt from the template and input data."""
-        messages = [
-            {
-                'role': 'user',
-                'content': self.config["prompt_template"]
-            },
-            {
-                'role': 'assistant',
-                'content': "I understand and will process the input as requested."
-            },
-            {
-                'role': 'user',
-                'content': str(item)
-            }
-        ]
-        return self.tokenizer.hf_tokenizer.apply_chat_template(
-            messages,
-            tokenize=False
-        )
-
-    def process_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
-        """Process a single item through the Outlines model."""
-        prompt = self.create_prompt(item)
-        try:
-            result = self.processor(prompt)
-            result_dict = result.model_dump()
-            final_dict = {**item, **result_dict}
-            return json.loads(json.dumps(final_dict, indent=2))
-        except Exception as e:
-            self.console.print(f"Error processing item: {e}")
-            return json.loads(json.dumps(item, indent=2))
-
-    def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
-        """Execute the operation on the input data."""
-        if self.status:
-            self.status.stop()
-
-        results = []
-        batch_size = self.config.get("batch_size", 10)
-
-        for i in range(0, len(input_data), batch_size):
-            batch = input_data[i:i + batch_size]
-            batch_results = [self.process_item(item) for item in batch]
-            results.extend(batch_results)
-
-        if self.status:
-            self.status.start()
-
-        return results, 0.0
-
-
-
-
-
-
diff --git a/docetl/operations/hf_outlines.py b/docetl/operations/hf_outlines.py
deleted file mode 100644
index b2d3d241..00000000
--- a/docetl/operations/hf_outlines.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from typing import Any, Dict, List, Optional, Tuple
-from pydantic import BaseModel, create_model
-from docetl.operations.base import BaseOperation
-from outlines import generate, models
-import json
-
-class HuggingFaceMapOperation(BaseOperation):
-    class schema(BaseOperation.schema):
-        name: str
-        type: str = "hf_map"
-        model_path: str
-        output_schema: Dict[str, Any]
-        prompt_template: str
-        max_tokens: int = 4096
-
-    def __init__(self, config: Dict[str, Any], runner=None, *args, **kwargs):
-        super().__init__(
-            config=config,
-            default_model=config.get('default_model', config['model_path']),
-            max_threads=config.get('max_threads', 1),
-            runner=runner
-        )
-
-        self.model = models.transformers(
-            self.config["model_path"]
-        )
-
-        # Create a dynamic Pydantic model from the output schema
-        field_definitions = {
-            k: (eval(v) if isinstance(v, str) else v, ...)
-            for k, v in self.config["output_schema"].items()
-        }
-        output_model = create_model('OutputModel', **field_definitions)
-
-        self.processor = generate.json(
-            self.model,
-            output_model
-        )
-
-    def syntax_check(self) -> None:
-        """Validate the operation configuration."""
-        self.schema(**self.config)
-
-    def process_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
-        """Process a single item through the model."""
-        try:
-            result = self.processor(self.config["prompt_template"] + "\n" + str(item))
-            result_dict = result.model_dump()
-            final_dict = {**item, **result_dict}
-            return final_dict
-        except Exception as e:
-            self.console.print(f"Error processing item: {e}")
-            return item
-
-    @classmethod
-    def execute(cls, config: Dict[str, Any], input_data: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], float]:
-        """Execute the operation on the input data."""
-        instance = cls(config)
-        results = [instance.process_item(item) for item in input_data]
-        return results, 0.0
\ No newline at end of file
diff --git a/tests/test_cpp_outlines.py b/tests/test_cpp_outlines.py
deleted file mode 100644
index 19fa92b9..00000000
--- a/tests/test_cpp_outlines.py
+++ /dev/null
@@ -1,135 +0,0 @@
-import pytest
-from unittest.mock import Mock, patch
-from docetl.operations.cpp_outlines import LlamaCppMapOperation
-
-@pytest.fixture
-def sample_config():
-    return {
-        "type": "llama_cpp_map",
-        "model_path": "/path/to/local/model",
-        "model_file": "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
-        "output_schema": {
-            "first_name": "str",
-            "last_name": "str",
-            "order_number": "str",
-            "department": "str"
-        },
-        "prompt_template": "Extract customer information from this text",
-        "batch_size": 2,
-        "n_gpu_layers": -1,
-        "flash_attn": True,
-        "n_ctx": 8192
-    }
-
-@pytest.fixture
-def mock_processor_output():
-    class MockOutput:
-        def model_dump(self):
-            return {
-                "first_name": "John",
-                "last_name": "Doe",
-                "order_number": "12345",
-                "department": "Sales"
-            }
-    return MockOutput()
-
-@pytest.fixture
-def sample_input_data():
-    return [
-        {"message": "Customer John Doe ordered item #12345"},
-        {"message": "Customer Jane Smith from Sales department"}
-    ]
-
-def test_initialization(sample_config):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        assert operation.config == sample_config
-        assert operation.config["n_gpu_layers"] == -1
-        assert operation.config["flash_attn"] is True
-
-def test_syntax_check(sample_config):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.syntax_check()
-
-@pytest.mark.parametrize("missing_field", [
-    "model_path",
-    "model_file",
-    "output_schema",
-    "prompt_template"
-])
-def test_syntax_check_missing_fields(sample_config, missing_field):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        invalid_config = sample_config.copy()
-        invalid_config[missing_field] = ""
-        operation = LlamaCppMapOperation(invalid_config)
-        with pytest.raises(ValueError):
-            operation.syntax_check()
-
-def test_create_prompt(sample_config):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained') as mock_tokenizer_class:
-        mock_tokenizer = Mock()
-        mock_tokenizer.hf_tokenizer.apply_chat_template.return_value = "mocked prompt"
-        mock_tokenizer_class.return_value = mock_tokenizer
-
-        with patch('outlines.models.llamacpp'):
-            operation = LlamaCppMapOperation(sample_config)
-            test_item = {"message": "test message"}
-            prompt = operation.create_prompt(test_item)
-
-            assert isinstance(prompt, str)
-            assert mock_tokenizer.hf_tokenizer.apply_chat_template.called
-
-def test_process_item(sample_config, mock_processor_output):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.processor = Mock(return_value=mock_processor_output)
-
-        test_item = {"message": "test message"}
-        result = operation.process_item(test_item)
-
-        assert isinstance(result, dict)
-        assert "first_name" in result
-        assert "last_name" in result
-        assert "message" in result
-
-def test_process_item_error_handling(sample_config):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.processor = Mock(side_effect=Exception("Test error"))
-
-        test_item = {"message": "test message"}
-        result = operation.process_item(test_item)
-
-        assert isinstance(result, dict)
-        assert "message" in result
-
-def test_execute(sample_config, sample_input_data):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.process_item = Mock(return_value={"processed": True})
-
-        results, timing = operation.execute(sample_input_data)
-
-        assert len(results) == len(sample_input_data)
-        assert isinstance(timing, float)
-
-def test_batch_processing(sample_config, sample_input_data):
-    with patch('llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained'), \
-         patch('outlines.models.llamacpp'):
-        operation = LlamaCppMapOperation(sample_config)
-        operation.process_item = Mock(return_value={"processed": True})
-
-        sample_config["batch_size"] = 1
-        results1, _ = operation.execute(sample_input_data)
-        assert len(results1) == len(sample_input_data)
-
-        sample_config["batch_size"] = 2
-        results2, _ = operation.execute(sample_input_data)
-        assert len(results2) == len(sample_input_data)
\ No newline at end of file
diff --git a/tests/test_hf_outlines.py b/tests/test_hf_outlines.py
deleted file mode 100644
index ef9fc943..00000000
--- a/tests/test_hf_outlines.py
+++ /dev/null
@@ -1,134 +0,0 @@
-import pytest
-from unittest.mock import Mock, patch, MagicMock
-from docetl.operations.hf_outlines import HuggingFaceMapOperation
-
-@pytest.fixture
-def mock_runner():
-    return Mock()
-
-@pytest.fixture
-def sample_config():
-    return {
-        "name": "test_hf_operation",
-        "type": "hf_map",
-        "model_path": "meta-llama/Llama-3.2-1B-Instruct",
-        "output_schema": {
-            "first_name": "str",
-            "last_name": "str"
-        },
-        "prompt_template": "Extract customer information from this text",
-        "max_tokens": 4096
-    }
-
-@pytest.fixture
-def research_config():
-    return {
-        "name": "research_analyzer",
-        "type": "hf_map",
-        "model_path": "meta-llama/Llama-3.2-1B-Instruct",
-        "output_schema": {
-            "title": "str",
-            "authors": "list",
-            "methodology": "str",
-            "findings": "list",
-            "limitations": "list",
-            "future_work": "list"
-        },
-        "prompt_template": "Analyze the following research paper abstract.\nExtract key components and summarize findings.",
-        "max_tokens": 4096
-    }
-
-@pytest.fixture
-def mock_research_output():
-    class MockOutput:
-        def model_dump(self):
-            return {
-                "title": "Deep Learning in Natural Language Processing",
-                "authors": ["John Smith", "Jane Doe"],
-                "methodology": "Comparative analysis of transformer architectures",
-                "findings": [
-                    "Improved accuracy by 15%",
-                    "Reduced training time by 30%"
-                ],
-                "limitations": [
-                    "Limited dataset size",
-                    "Computational constraints"
-                ],
-                "future_work": [
-                    "Extend to multilingual models",
-                    "Optimize for edge devices"
-                ]
-            }
-    return MockOutput()
-
-def test_process_item(sample_config, mock_runner):
-    mock_model = MagicMock()
-
-    class MockOutput:
-        def model_dump(self):
-            return {
-                "first_name": "John",
-                "last_name": "Doe"
-            }
-
-    mock_processor = Mock(return_value=MockOutput())
-
-    with patch('outlines.models.transformers', return_value=mock_model) as mock_transformers, \
-         patch('outlines.generate.json', return_value=mock_processor):
-
-        operation = HuggingFaceMapOperation(sample_config, runner=mock_runner)
-        test_item = {"message": "test message"}
-        result = operation.process_item(test_item)
-
-        assert isinstance(result, dict)
-        assert "first_name" in result
-        assert "last_name" in result
-        assert "message" in result
-
-def test_research_paper_analysis(research_config, mock_research_output, mock_runner):
-    mock_model = MagicMock()
-    mock_processor = Mock(return_value=mock_research_output)
-
-    with patch('outlines.models.transformers', return_value=mock_model) as mock_transformers, \
-         patch('outlines.generate.json', return_value=mock_processor):
-
-        operation = HuggingFaceMapOperation(research_config, runner=mock_runner)
-        test_item = {
-            "abstract": """
-            This paper presents a comprehensive analysis of deep learning approaches
-            in natural language processing. We compare various transformer architectures
-            and their performance on standard NLP tasks.
-            """
-        }
-        result = operation.process_item(test_item)
-
-        # Verify structure and types
-        assert isinstance(result, dict)
-        assert "title" in result
-        assert isinstance(result["title"], str)
-        assert "authors" in result
-        assert isinstance(result["authors"], list)
-        assert "methodology" in result
-        assert isinstance(result["methodology"], str)
-        assert "findings" in result
-        assert isinstance(result["findings"], list)
-        assert len(result["findings"]) > 0
-        assert "limitations" in result
-        assert isinstance(result["limitations"], list)
-        assert "future_work" in result
-        assert isinstance(result["future_work"], list)
-
-        # Verify original input is preserved
-        assert "abstract" in result
-
-def test_execute(sample_config, mock_runner):
-    mock_model = MagicMock()
-    mock_processor = Mock(return_value={"first_name": "John", "last_name": "Doe"})
-
-    with patch('outlines.models.transformers', return_value=mock_model) as mock_transformers, \
-         patch('outlines.generate.json', return_value=mock_processor):
-
-        input_data = [{"message": "test message"}]
-        results, timing = HuggingFaceMapOperation.execute(sample_config, input_data)
-        assert len(results) == 1
-        assert isinstance(timing, float)
\ No newline at end of file