Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add XLSXToDocument converter #8522

Merged
merged 19 commits into from
Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/pydoc/config/converters_api.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ loaders:
"pypdf",
"tika",
"txt",
"xlsx",
]
ignore_when_discovered: ["__init__"]
processors:
Expand Down
2 changes: 2 additions & 0 deletions haystack/components/converters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from haystack.components.converters.pypdf import PyPDFToDocument
from haystack.components.converters.tika import TikaDocumentConverter
from haystack.components.converters.txt import TextFileToDocument
from haystack.components.converters.xlsx import XLSXToDocument

__all__ = [
"TextFileToDocument",
Expand All @@ -31,4 +32,5 @@
"PPTXToDocument",
"CSVToDocument",
"JSONConverter",
"XLSXToDocument",
]
180 changes: 180 additions & 0 deletions haystack/components/converters/xlsx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

import io
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import pandas as pd

from haystack import Document, component, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport

logger = logging.getLogger(__name__)

with LazyImport("Run 'pip install openpyxl'") as xlsx_import:
import openpyxl # pylint: disable=unused-import # the library is used but not directly referenced

with LazyImport("Run 'pip install tabulate'") as tabulate_import:
from tabulate import tabulate # pylint: disable=unused-import # the library is used but not directly referenced


@component
class XLSXToDocument:
    """
    Converts XLSX (Excel) files into Documents.

    Either selected sheets or every sheet of a workbook can be read. One Document is produced for each sheet that
    is read; its content is the sheet rendered as a table in CSV or Markdown format.

    ### Usage example

    ```python
    from datetime import datetime

    from haystack.components.converters.xlsx import XLSXToDocument

    converter = XLSXToDocument()
    results = converter.run(sources=["sample.xlsx"], meta={"date_added": datetime.now().isoformat()})
    documents = results["documents"]
    print(documents[0].content)
    # ",A,B\n1,col_a,col_b\n2,1.5,test\n"
    ```
    """

    def __init__(
        self,
        table_format: Literal["csv", "markdown"] = "csv",
        sheet_name: Union[str, int, List[Union[str, int]], None] = None,
        read_excel_kwargs: Optional[Dict[str, Any]] = None,
        table_format_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates a XLSXToDocument component.

        :param table_format: The format to convert the Excel file to. Either "csv" or "markdown".
        :param sheet_name: The name of the sheet to read. If None, all sheets are read.
        :param read_excel_kwargs: Additional arguments to pass to `pandas.read_excel`.
            See https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html#pandas-read-excel
            The `sheet_name`, `header` and `engine` arguments are always set by this component.
        :param table_format_kwargs: Additional keyword arguments to pass to the table format function.
            - If `table_format` is "csv", these arguments are passed to `pandas.DataFrame.to_csv`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html#pandas-dataframe-to-csv
            - If `table_format` is "markdown", these arguments are passed to `pandas.DataFrame.to_markdown`.
              See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_markdown.html#pandas-dataframe-to-markdown
        :raises ValueError: If `table_format` is neither "csv" nor "markdown".
        """
        xlsx_import.check()
        self.table_format = table_format
        if table_format not in ("csv", "markdown"):
            raise ValueError(f"Unsupported export format: {table_format}. Choose either 'csv' or 'markdown'.")
        if table_format == "markdown":
            # Only the Markdown export needs tabulate.
            tabulate_import.check()
        self.sheet_name = sheet_name
        self.read_excel_kwargs = read_excel_kwargs if read_excel_kwargs else {}
        self.table_format_kwargs = table_format_kwargs if table_format_kwargs else {}

    @component.output_types(documents=List[Document])
    def run(
        self,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Converts XLSX files to Documents, one Document per sheet that is read.

        Sources that cannot be read or converted are skipped and a warning is logged.

        :param sources:
            List of file paths or ByteStream objects.
        :param meta:
            Optional metadata to attach to the documents.
            This value can be either a list of dictionaries or a single dictionary.
            If it's a single dictionary, its content is added to the metadata of all produced documents.
            If it's a list, the length of the list must match the number of sources, because the two lists will
            be zipped.
            If `sources` contains ByteStream objects, their `meta` will be added to the output documents.
        :returns:
            A dictionary with the following keys:
            - `documents`: Created documents
        """
        documents: List[Document] = []
        per_source_meta = normalize_metadata(meta, sources_count=len(sources))

        for source, source_meta in zip(sources, per_source_meta):
            try:
                bytestream = get_bytestream_from_source(source)
            except Exception as error:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=error)
                continue

            try:
                tables, tables_metadata = self._extract_tables(bytestream)
            except Exception as error:
                logger.warning(
                    "Could not read {source} and convert it to a Document, skipping. Error: {error}",
                    source=source,
                    error=error,
                )
                continue

            # One Document per extracted table. On key clashes the sheet metadata wins over the
            # user-supplied metadata, which in turn wins over the ByteStream metadata.
            documents.extend(
                Document(content=table, meta={**bytestream.meta, **source_meta, **sheet_meta})
                for table, sheet_meta in zip(tables, tables_metadata)
            )

        return {"documents": documents}

    @staticmethod
    def _generate_excel_column_names(n_cols: int) -> List[str]:
        """
        Returns the first `n_cols` Excel-style column labels: A, B, ..., Z, AA, AB, ...
        """
        labels = []
        for position in range(n_cols):
            letters = []
            remaining = position
            # The least significant letter is produced first; the label is reversed afterwards.
            while remaining >= 0:
                quotient, offset = divmod(remaining, 26)
                letters.append(chr(offset + 65))
                remaining = quotient - 1
            labels.append("".join(reversed(letters)))
        return labels

    def _extract_tables(self, bytestream: ByteStream) -> Tuple[List[str], List[Dict]]:
        """
        Extracts the sheets of an Excel file as rendered tables.

        :param bytestream: The ByteStream holding the raw content of the Excel file.
        :returns:
            A tuple of two parallel lists: the rendered tables (CSV or Markdown strings) and, for each table,
            a metadata dictionary of the form `{"xlsx": {"sheet_name": ...}}`.
        """
        read_options = dict(self.read_excel_kwargs)
        # These three settings always take precedence over user-supplied values.
        read_options.update(
            sheet_name=self.sheet_name,
            header=None,  # Don't assign any pandas column labels
            engine="openpyxl",  # Use openpyxl as the engine to read the Excel file
        )
        frames = pd.read_excel(io=io.BytesIO(bytestream.data), **read_options)
        if isinstance(frames, pd.DataFrame):
            # A single sheet was requested: normalise to the mapping shape used for several sheets.
            frames = {self.sheet_name: frames}

        # First pass: relabel every sheet the way Excel shows it.
        for frame in frames.values():
            # Row numbering starts at 1 in Excel
            frame.index = frame.index + 1
            # Excel columns are labelled with letters
            frame.columns = self._generate_excel_column_names(frame.shape[1])

        # Second pass: render each sheet and record which sheet it came from.
        tables = []
        metadata = []
        for sheet, frame in frames.items():
            if self.table_format == "csv":
                csv_options = {"index": True, "header": True, "lineterminator": "\n"}
                csv_options.update(self.table_format_kwargs)
                rendered = frame.to_csv(**csv_options)
            else:
                markdown_options = {"index": True, "headers": frame.columns, "tablefmt": "pipe"}
                markdown_options.update(self.table_format_kwargs)
                # to_markdown uses tabulate
                rendered = frame.to_markdown(**markdown_options)
            tables.append(rendered)
            metadata.append({"xlsx": {"sheet_name": sheet}})
        return tables, metadata
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,9 @@ extra-dependencies = [
"trafilatura", # HTMLToDocument
"python-pptx", # PPTXToDocument
"python-docx", # DocxToDocument
"jq", #JSONConverter
"jq", # JSONConverter
"openpyxl", # XLSXToDocument
"tabulate", # XLSXToDocument

"nltk", # NLTKDocumentSplitter

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Add the `XLSXToDocument` converter, which loads an Excel file using pandas and openpyxl and, by default, converts each sheet into a separate Document in CSV format.
139 changes: 139 additions & 0 deletions test/components/converters/test_xlsx_to_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import logging
from typing import Union

import pytest

from haystack.components.converters.xlsx import XLSXToDocument


class TestXLSXToDocument:
    """Tests for the XLSXToDocument converter."""

    def test_init(self) -> None:
        converter = XLSXToDocument()
        assert converter.sheet_name is None
        assert converter.read_excel_kwargs == {}
        assert converter.table_format == "csv"
        assert converter.table_format_kwargs == {}

    def test_init_invalid_table_format(self) -> None:
        # Anything other than "csv" or "markdown" must be rejected at construction time.
        with pytest.raises(ValueError, match="Unsupported export format"):
            XLSXToDocument(table_format="xml")

    def test_generate_excel_column_names(self) -> None:
        # Labels must roll over from single to double letters after Z.
        names = XLSXToDocument._generate_excel_column_names(28)
        assert names[:3] == ["A", "B", "C"]
        assert names[25:] == ["Z", "AA", "AB"]
        assert XLSXToDocument._generate_excel_column_names(0) == []

    def test_run_basic_tables(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 2
        assert documents[0].content == ",A,B\n1,col_a,col_b\n2,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }
        assert documents[1].content == ",A,B\n1,col_c,col_d\n2,True,\n"
        assert documents[1].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Table Missing Value"},
        }

    def test_run_table_empty_rows_and_columns(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == ",A,B,C\n1,,,\n2,,,\n3,,,\n4,,col_a,col_b\n5,,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "table_empty_rows_and_columns.xlsx"),
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_multiple_tables_in_one_sheet(self, test_files_path) -> None:
        converter = XLSXToDocument()
        paths = [test_files_path / "xlsx" / "multiple_tables.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert (
            documents[0].content
            == ",A,B,C,D,E,F\n1,,,,,,\n2,,,,,,\n3,,col_a,col_b,,,\n4,,1.5,test,,col_c,col_d\n5,,,,,3,True\n"
        )
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "multiple_tables.xlsx"),
            "xlsx": {"sheet_name": "Sheet1"},
        }

    def test_run_markdown(self, test_files_path) -> None:
        converter = XLSXToDocument(table_format="markdown")
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 2
        # Pipe tables pad every cell to the width fixed by the separator row (4, 7 and 7 characters here).
        assert (
            documents[0].content
            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_a | col_b |\n|  2 | 1.5   | test  |"
        )
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }
        assert (
            documents[1].content
            == "|    | A     | B     |\n|---:|:------|:------|\n|  1 | col_c | col_d |\n|  2 | True  | nan   |"
        )
        assert documents[1].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Table Missing Value"},
        }

    @pytest.mark.parametrize(
        "sheet_name, expected_sheet_name, expected_content",
        [
            ("Basic Table", "Basic Table", ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
            ("Table Missing Value", "Table Missing Value", ",A,B\n1,col_c,col_d\n2,True,\n"),
            (0, 0, ",A,B\n1,col_a,col_b\n2,1.5,test\n"),
            (1, 1, ",A,B\n1,col_c,col_d\n2,True,\n"),
        ],
    )
    def test_run_sheet_name(
        self,
        sheet_name: Union[int, str],
        expected_sheet_name: Union[int, str],
        expected_content: str,
        test_files_path,
    ) -> None:
        # The sheet selector is echoed back unchanged in the metadata, so an integer index stays an integer.
        converter = XLSXToDocument(sheet_name=sheet_name)
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths)
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == expected_content
        assert documents[0].meta == {
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": expected_sheet_name},
        }

    def test_run_with_read_excel_kwargs(self, test_files_path) -> None:
        converter = XLSXToDocument(sheet_name="Basic Table", read_excel_kwargs={"skiprows": 1})
        paths = [test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"]
        results = converter.run(sources=paths, meta={"date_added": "2022-01-01T00:00:00"})
        documents = results["documents"]
        assert len(documents) == 1
        assert documents[0].content == ",A,B\n1,1.5,test\n"
        assert documents[0].meta == {
            "date_added": "2022-01-01T00:00:00",
            "file_path": str(test_files_path / "xlsx" / "basic_tables_two_sheets.xlsx"),
            "xlsx": {"sheet_name": "Basic Table"},
        }

    def test_run_error_wrong_file_type(self, caplog: pytest.LogCaptureFixture, test_files_path) -> None:
        converter = XLSXToDocument()
        sources = [test_files_path / "pdf" / "sample_pdf_1.pdf"]
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=sources)
        assert "sample_pdf_1.pdf and convert it" in caplog.text
        assert results["documents"] == []

    def test_run_error_non_existent_file(self, caplog: pytest.LogCaptureFixture) -> None:
        converter = XLSXToDocument()
        paths = ["non_existing_file.xlsx"]
        with caplog.at_level(logging.WARNING):
            results = converter.run(sources=paths)
        assert "Could not read non_existing_file.xlsx" in caplog.text
        # An unreadable source is skipped, so nothing is produced.
        assert results["documents"] == []
Binary file not shown.
Binary file added test/test_files/xlsx/multiple_tables.xlsx
Binary file not shown.
Binary file not shown.
Loading