
Commit 7d0b47b

feat: add tests for SpacyAnnotator and improve coverage
- Added tests for datafog.models.spacy_nlp.SpacyAnnotator.annotate_text
- Mocked spaCy dependencies to avoid network/model download needs
- Corrected entity type validation based on EntityTypes Enum
- Skipped test_spark_service_handles_pyspark_import_error due to mocking complexity
- Increased overall test coverage to >74%
1 parent: a0a8bfd

File tree

3 files changed: +168, -0


.pre-commit-config.yaml (+1)
@@ -25,4 +25,5 @@ repos:
     rev: v4.0.0-alpha.8
     hooks:
       - id: prettier
+        types: [yaml, markdown] # Explicitly define file types
         exclude: .venv

tests/test_spacy_nlp.py (+85)
@@ -0,0 +1,85 @@
# tests/test_spacy_nlp.py
from unittest.mock import MagicMock, patch
from uuid import UUID

import pytest

from datafog.models.spacy_nlp import AnnotationResult, SpacyAnnotator


@patch("datafog.models.spacy_nlp.spacy.load")
def test_annotate_text_basic(mock_spacy_load):
    """
    Test that annotate_text correctly processes text and returns AnnotationResult objects.
    """
    # Arrange: Mock the spaCy NLP object and its return value
    mock_nlp = MagicMock()
    mock_doc = MagicMock()

    # Simulate entities found by spaCy
    mock_ent1 = MagicMock()
    mock_ent1.start_char = 0
    mock_ent1.end_char = 4
    mock_ent1.label_ = "PERSON"

    mock_ent2 = MagicMock()
    mock_ent2.start_char = 11
    mock_ent2.end_char = 17
    mock_ent2.label_ = "LOCATION"  # Use valid EntityTypes member

    mock_doc.ents = [mock_ent1, mock_ent2]
    mock_nlp.return_value = mock_doc  # nlp(text) returns the mock_doc
    mock_spacy_load.return_value = mock_nlp  # spacy.load() returns the mock_nlp

    # Instantiate the annotator (doesn't load model immediately)
    annotator = SpacyAnnotator()

    # Act: Call the method under test
    test_text = "John lives in London."
    results = annotator.annotate_text(test_text)

    # Assert:
    # Check that spacy.load was called (implicitly tests load_model)
    mock_spacy_load.assert_called_once_with(annotator.model_name)
    # Check that the nlp object was called with the text
    mock_nlp.assert_called_once()
    # Check the number of results
    assert len(results) == 2

    # Check the details of the first result
    assert isinstance(results[0], AnnotationResult)
    assert results[0].start == 0
    assert results[0].end == 4
    assert results[0].entity_type == "PERSON"
    assert isinstance(results[0].score, float)

    # Check the details of the second result
    assert isinstance(results[1], AnnotationResult)
    assert results[1].start == 11
    assert results[1].end == 17
    assert results[1].entity_type == "LOCATION"  # Assert for LOCATION
    assert isinstance(results[1].score, float)


# Example of testing other branches (e.g., model already loaded)
@patch("datafog.models.spacy_nlp.spacy.load")
def test_annotate_text_model_already_loaded(mock_spacy_load):
    """
    Test that annotate_text doesn't reload the model if already loaded.
    """
    # Arrange
    mock_nlp = MagicMock()
    mock_doc = MagicMock()
    mock_doc.ents = []  # No entities for simplicity
    mock_nlp.return_value = mock_doc
    mock_spacy_load.return_value = mock_nlp

    annotator = SpacyAnnotator()
    annotator.nlp = mock_nlp  # Pre-set the nlp attribute

    # Act
    annotator.annotate_text("Some text.")

    # Assert
    mock_spacy_load.assert_not_called()  # Should not be called again
    mock_nlp.assert_called_once_with("Some text.")
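
For orientation, the shape of the code under test can be inferred from the mocks and assertions above. The sketch below is an assumption for illustration only, not datafog's actual datafog.models.spacy_nlp source: the class and field names come from the test, while the dataclass form, the default model_name, and the fixed score of 1.0 are guesses (spaCy's statistical NER does not expose per-entity confidence scores directly). The commit message also mentions validating entity_type against an EntityTypes Enum; that validation is omitted here.

# Hypothetical sketch of the interface under test; the real
# datafog.models.spacy_nlp implementation may differ in detail.
from dataclasses import dataclass
from typing import List

import spacy


@dataclass
class AnnotationResult:
    start: int
    end: int
    entity_type: str
    score: float


class SpacyAnnotator:
    def __init__(self, model_name: str = "en_core_web_sm"):  # default is assumed
        self.model_name = model_name
        self.nlp = None  # loaded lazily, which is why the tests can pre-set a mock

    def annotate_text(self, text: str) -> List[AnnotationResult]:
        if self.nlp is None:
            # The call the tests patch via "datafog.models.spacy_nlp.spacy.load"
            self.nlp = spacy.load(self.model_name)
        doc = self.nlp(text)
        return [
            AnnotationResult(
                start=ent.start_char,
                end=ent.end_char,
                entity_type=ent.label_,
                score=1.0,  # placeholder; spaCy NER has no per-entity score
            )
            for ent in doc.ents
        ]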

tests/test_spark_service.py (+82)
@@ -0,0 +1,82 @@
# tests/test_spark_service.py
import importlib
import sys
from unittest.mock import MagicMock, patch

import pytest

# DO NOT import datafog.services.spark_service at the top level


@pytest.mark.skip(
    reason="Skipping due to complex mocking interactions with dependencies. "
    "Needs revisit when SparkService has real functionality."
)
def test_spark_service_handles_pyspark_import_error(capsys):
    """
    Test that SparkService handles ImportError for pyspark gracefully during import
    and prints the expected message, isolating it from dependency import errors.
    """
    # Ensure the module under test and its dependency are not cached
    if "datafog.services.spark_service" in sys.modules:
        del sys.modules["datafog.services.spark_service"]
    if "datafog.processing.spark_processing.pyspark_udfs" in sys.modules:
        del sys.modules["datafog.processing.spark_processing.pyspark_udfs"]

    # Store original state
    original_modules = sys.modules.copy()

    # Modules to remove/mock
    modules_to_patch = {}
    # Remove pyspark
    modules_to_patch["pyspark"] = None
    modules_to_patch["pyspark.sql"] = None  # Also remove submodule just in case
    # Mock the problematic dependency
    modules_to_patch["datafog.processing.spark_processing.pyspark_udfs"] = MagicMock()

    # Use patch.dict to modify sys.modules for this context
    with patch.dict(
        sys.modules, modules_to_patch, clear=False
    ):  # clear=False, just overlay
        try:
            # Attempt to import the module *within* the patch context
            # The import of spark_service itself should trigger its try/except
            # The import *within* spark_service for pyspark_udfs should get the MagicMock
            import datafog.services.spark_service as spark_service

            # Check if the warning message was printed (stdout)
            captured = capsys.readouterr()
            expected_message = (
                "PySpark not found. Please install it with the [spark] extra"
            )
            assert expected_message in captured.out

            # Check stderr for the traceback from spark_service's except block
            assert (
                "ImportError" in captured.err or "ModuleNotFoundError" in captured.err
            )
            assert "pyspark" in captured.err

            # Verify that the placeholder is set in the imported module
            assert spark_service.SparkSession is None

            # Verify dependency was mocked (optional, but good practice)
            assert isinstance(spark_service.pyspark_udfs, MagicMock)

        finally:
            # Strict restoration of original modules is important
            sys.modules.clear()
            sys.modules.update(original_modules)
            # Re-delete the target module and dependency to ensure clean state
            if "datafog.services.spark_service" in sys.modules:
                del sys.modules["datafog.services.spark_service"]
            if "datafog.processing.spark_processing.pyspark_udfs" in sys.modules:
                del sys.modules["datafog.processing.spark_processing.pyspark_udfs"]


# Add placeholder for actual SparkService tests later if needed
# class TestSparkServiceFunctionality:
#     @pytest.mark.skipif(sys.modules.get("pyspark") is None, reason="pyspark not installed")
#     def test_spark_functionality(self):
#         # Add tests for actual service methods here
#         pass
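
The skipped test documents the behavior expected of the import guard in datafog/services/spark_service.py. Inferred purely from the assertions above, that guard presumably looks roughly like the sketch below; the message string is taken verbatim from the test, but everything else (traceback printing, module-top-level placement) is an assumption rather than the verified datafog source. Installing the extra the message refers to would presumably be pip install "datafog[spark]".

# Hypothetical import guard in datafog/services/spark_service.py,
# reconstructed from the test's assertions; not the verified source.
import sys
import traceback

try:
    from pyspark.sql import SparkSession
except ImportError:
    # stdout message the test asserts on
    print("PySpark not found. Please install it with the [spark] extra")
    # traceback on stderr, matching the "ImportError ... pyspark" checks
    traceback.print_exc(file=sys.stderr)
    # placeholder the test verifies via: spark_service.SparkSession is None
    SparkSession = None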
