Skip to content

Commit

Permalink
feat: Add document page number of ExtractedAnswer to meta (#7572)
Browse files Browse the repository at this point in the history
* calculate page number of answer and add to meta

* fix mypy, add reno

* add test

* simplify unit test

* update release note

* undo @patch updates

* extend tests, check page_number type
  • Loading branch information
julian-risch authored May 2, 2024
1 parent 2e35f13 commit b028497
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 1 deletion.
29 changes: 29 additions & 0 deletions haystack/components/readers/extractive.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,31 @@ def _postprocess(

return start_candidates_tokens_to_chars, end_candidates_tokens_to_chars, candidates_values

def _add_answer_page_number(self, answer: ExtractedAnswer) -> ExtractedAnswer:
    """
    Store the page on which the answer starts under ``answer.meta["answer_page_number"]``.

    The page is the source document's ``meta["page_number"]`` plus the number of form feed
    characters (``"\\f"``) found in the document content before the answer's start offset.
    The answer is modified in place and returned.

    Nothing is added when the answer has no document offset, has no document, the document
    meta has no ``"page_number"`` key, the page number is not an ``int`` (a warning is logged
    in that case), or the document content is empty.

    :param answer: The answer to annotate. A ``None`` meta is replaced by an empty dict.
    :returns: The same ``answer`` object that was passed in.
    """
    if answer.meta is None:
        answer.meta = {}

    source_doc = answer.document
    if answer.document_offset is None or not source_doc or "page_number" not in source_doc.meta:
        return answer

    first_page = source_doc.meta["page_number"]
    if not isinstance(first_page, int):
        logger.warning(
            f"Document's page_number must be int but is {type(first_page)}. "
            "No page number will be added to the answer."
        )
        return answer

    text = source_doc.content
    if text:
        # Every form feed before the answer start marks one page break that was crossed.
        crossed_page_breaks = text[: answer.document_offset.start].count("\f")
        answer.meta["answer_page_number"] = first_page + crossed_page_breaks

    return answer

def _nest_answers(
self,
start: List[List[int]],
Expand Down Expand Up @@ -358,6 +383,10 @@ def _nest_answers(
current_answers = sorted(current_answers, key=lambda ans: ans.score, reverse=True)
current_answers = self.deduplicate_by_overlap(current_answers, overlap_threshold=overlap_threshold)
current_answers = current_answers[:top_k]

# Calculate the answer page number and add it to meta
current_answers = [self._add_answer_page_number(answer=answer) for answer in current_answers]

if no_answer:
no_answer_score = math.prod(1 - answer.score for answer in current_answers)
answer_ = ExtractedAnswer(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
enhancements:
- |
The ExtractiveReader can now add page numbers to the meta of ExtractedAnswers.
This happens automatically if the source document of the ExtractedAnswer has an integer "page_number" in its meta.
The meta of the ExtractedAnswer will then contain the key "answer_page_number".
74 changes: 73 additions & 1 deletion test/components/readers/test_extractive.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def forward(self, input_ids, attention_mask, *args, **kwargs):
[
Document(content="Angela Merkel was the chancellor of Germany."),
Document(content="Olaf Scholz is the chancellor of Germany"),
Document(content="Jerry is the head of the department."),
Document(content="Jerry is the head of the department.", meta={"page_number": 3}),
]
] * 2

Expand Down Expand Up @@ -386,12 +386,84 @@ def test_nest_answers(mock_reader: ExtractiveReader):
assert answer.query == query
assert answer.document == doc
assert answer.score == pytest.approx(score)
if "page_number" in doc.meta:
assert answer.meta["answer_page_number"] == doc.meta["page_number"]
no_answer = answers[-1]
assert no_answer.query == query
assert no_answer.document is None
assert no_answer.score == pytest.approx(expected_no_answer)


def test_add_answer_page_number_returns_same_answer(mock_reader: ExtractiveReader, caplog):
    """
    Check that no page number is added when it cannot be derived from the answer.

    ``_add_answer_page_number`` returns the object it was given, so comparing the result
    with the input alone can never fail. Each case therefore also checks that the meta
    still holds exactly the original entries, i.e. no "answer_page_number" was added.
    """
    expected_meta = {"meta_key": "meta_value"}

    # answer.document_offset is None
    document = Document(content="I thought a lot about this. The answer is 42.", meta={"page_number": 5})
    answer = ExtractedAnswer(
        data="42",
        query="What is the answer?",
        document=document,
        score=1.0,
        document_offset=None,
        meta={"meta_key": "meta_value"},
    )
    result = mock_reader._add_answer_page_number(answer=answer)
    assert result == answer
    assert result.meta == expected_meta

    # answer.document is None
    answer = ExtractedAnswer(
        data="42",
        query="What is the answer?",
        document=None,
        score=1.0,
        document_offset=ExtractedAnswer.Span(42, 44),
        meta={"meta_key": "meta_value"},
    )
    result = mock_reader._add_answer_page_number(answer=answer)
    assert result == answer
    assert result.meta == expected_meta

    # answer.document.meta has no "page_number" key
    document = Document(content="I thought a lot about this. The answer is 42.")
    answer = ExtractedAnswer(
        data="42",
        query="What is the answer?",
        document=document,
        score=1.0,
        document_offset=ExtractedAnswer.Span(42, 44),
        meta={"meta_key": "meta_value"},
    )
    result = mock_reader._add_answer_page_number(answer=answer)
    assert result == answer
    assert result.meta == expected_meta

    # answer.document.meta["page_number"] is not int
    document = Document(content="I thought a lot about this. The answer is 42.", meta={"page_number": "5"})
    answer = ExtractedAnswer(
        data="42",
        query="What is the answer?",
        document=document,
        score=1.0,
        document_offset=ExtractedAnswer.Span(42, 44),
        meta={"meta_key": "meta_value"},
    )
    with caplog.at_level(logging.WARNING):
        result = mock_reader._add_answer_page_number(answer=answer)
        assert result == answer
        assert result.meta == expected_meta
        assert "page_number must be int" in caplog.text


def test_add_answer_page_number_with_form_feed(mock_reader: ExtractiveReader):
    """
    Check that form feeds located before the answer increase the reported page number.

    The document starts on page 5 and contains two form feed characters before the
    answer span, so the answer is expected on page 7.
    """
    doc = Document(
        content="I thought a lot about this. \f And this document is long. \f The answer is 42.",
        meta={"page_number": 5},
    )
    answer_span_in_document = ExtractedAnswer.Span(73, 75)
    answer_span_in_context = ExtractedAnswer.Span(14, 16)
    extracted = ExtractedAnswer(
        data="42",
        query="What is the answer?",
        document=doc,
        context="The answer is 42.",
        score=1.0,
        document_offset=answer_span_in_document,
        context_offset=answer_span_in_context,
        meta={"meta_key": "meta_value"},
    )

    result = mock_reader._add_answer_page_number(answer=extracted)

    assert result.meta["answer_page_number"] == 7


@patch("haystack.components.readers.extractive.AutoTokenizer.from_pretrained")
@patch("haystack.components.readers.extractive.AutoModelForQuestionAnswering.from_pretrained")
def test_warm_up_use_hf_token(mocked_automodel, mocked_autotokenizer, initialized_token: Secret):
Expand Down

0 comments on commit b028497

Please sign in to comment.