Skip to content

Commit

Permalink
Merge pull request #27 from ks6088ts-labs/feature/issue-26_extract-da…
Browse files Browse the repository at this point in the history
…ta-from-pdf

add a CLI to extract data from PDF
  • Loading branch information
ks6088ts authored Nov 8, 2024
2 parents f3800c1 + 3a2db7a commit acb8f54
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 0 deletions.
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,29 @@ To publish the docker image to Docker Hub, you need to set the following secrets
gh secret set DOCKERHUB_USERNAME --body $DOCKERHUB_USERNAME
gh secret set DOCKERHUB_TOKEN --body $DOCKERHUB_TOKEN
```

## scripts

### pdf_cli.py

```shell
# help
poetry run python scripts/pdf_cli.py --help

# Convert a PDF file to a Markdown file.
poetry run python scripts/pdf_cli.py pdf2md \
--in-pdf "./datasets/sample.pdf" \
--out-md "./datasets/sample.md" \
--verbose

# Dump the table of contents (TOC) of a PDF file.
poetry run python scripts/pdf_cli.py toc \
--in-pdf "./datasets/sample.pdf" \
--verbose

# Dump the tables found on the specified page (1-based page number) of a PDF file.
poetry run python scripts/pdf_cli.py tables \
--in-pdf "./datasets/sample.pdf" \
--page-number 123 \
--verbose
```
76 changes: 76 additions & 0 deletions scripts/pdf_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import logging
import pathlib

import pymupdf
import pymupdf4llm
import typer

# Typer application object that the commands in this file register on;
# the shell-completion options are turned off.
app = typer.Typer(add_completion=False)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@app.command(
    help="Convert a PDF file to a Markdown file.",
)
def pdf2md(
    in_pdf: str = typer.Option(..., help="Path to the input PDF file."),
    out_md: str = typer.Option(..., help="Path to the output Markdown file."),
    verbose: bool = typer.Option(True, help="Verbose mode."),
):
    """Convert the PDF at ``in_pdf`` to Markdown and write it to ``out_md``.

    Args:
        in_pdf: Path to the input PDF file.
        out_md: Path of the Markdown file to write (UTF-8 encoded).
        verbose: When true, configure logging at DEBUG level.

    Raises:
        typer.Exit: With exit code 1 when the conversion or the write fails,
            so that calling scripts can detect the failure.
    """
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    try:
        logger.info("Converting %s to %s", in_pdf, out_md)
        md_text = pymupdf4llm.to_markdown(
            doc=in_pdf,
        )
        # write_bytes stores the encoded text as-is (no newline translation).
        pathlib.Path(out_md).write_bytes(md_text.encode("utf-8"))
    except Exception as e:
        # Log the traceback and report the failure through the exit status
        # instead of finishing "successfully" with no output file.
        logger.exception("Failed to convert %s to %s", in_pdf, out_md)
        raise typer.Exit(code=1) from e


@app.command(
    help="Dump the table of contents (TOC) of a PDF file.",
)
def toc(
    in_pdf: str = typer.Option(..., help="Path to the input PDF file."),
    verbose: bool = typer.Option(True, help="Verbose mode."),
):
    """Log every table-of-contents entry of the PDF at ``in_pdf``.

    Each entry is logged at INFO level as ``index: level, title, page``.

    Args:
        in_pdf: Path to the input PDF file.
        verbose: When true, log at DEBUG level; otherwise at INFO level so
            the TOC entries are still shown.

    Raises:
        typer.Exit: With exit code 1 when the PDF cannot be opened or read.
    """
    # The entries are emitted via logger.info, so logging must be configured
    # even in non-verbose mode or the command would print nothing.
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    try:
        # The context manager closes the document even if reading fails.
        with pymupdf.open(in_pdf) as doc:
            # https://pymupdf.readthedocs.io/en/latest/document.html#Document.get_toc
            entries = doc.get_toc()
            for i, (level, title, page) in enumerate(entries):
                logger.info("%s: %s, %s, %s", i, level, title, page)
    except Exception as e:
        logger.exception("Failed to dump the TOC of %s", in_pdf)
        raise typer.Exit(code=1) from e


@app.command(
    help="Dump tables of the specified page of a PDF file.",
)
def tables(
    in_pdf: str = typer.Option(..., help="Path to the input PDF file."),
    page_number: int = typer.Option(0, help="Page number (1-based)."),
    verbose: bool = typer.Option(True, help="Verbose mode."),
):
    """Log, as Markdown, every table found on one page of the PDF.

    Args:
        in_pdf: Path to the input PDF file.
        page_number: 1-based number of the page to inspect. Values outside
            ``1..page_count`` (including the default 0) are rejected rather
            than being interpreted as a negative index from the end.
        verbose: When true, log at DEBUG level; otherwise at INFO level so
            the tables are still shown.

    Raises:
        typer.Exit: With exit code 1 when the page number is out of range or
            the PDF cannot be opened or parsed.
    """
    # The tables are emitted via logger.info, so logging must be configured
    # even in non-verbose mode or the command would print nothing.
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)
    try:
        # The context manager closes the document even if extraction fails.
        with pymupdf.open(in_pdf) as doc:
            if not 1 <= page_number <= doc.page_count:
                # Without this check, 0 or a negative number would index
                # from the end and silently dump the wrong page.
                raise ValueError(
                    f"--page-number must be between 1 and {doc.page_count}, "
                    f"got {page_number}"
                )
            page = doc[page_number - 1]  # convert to 0-based index
            # https://pymupdf.readthedocs.io/en/latest/page.html#Page.find_tables
            found_tables = page.find_tables()
            for i, table in enumerate(found_tables):
                logger.info("Table %s: %s", i, table.to_markdown())
    except Exception as e:
        logger.exception(
            "Failed to dump tables of page %s of %s", page_number, in_pdf
        )
        raise typer.Exit(code=1) from e


# Run the Typer application only when this file is executed as a script.
if __name__ == "__main__":
    app()

0 comments on commit acb8f54

Please sign in to comment.