Skip to content

Commit

Permalink
✨ Improve metadata removal
Browse files Browse the repository at this point in the history
  • Loading branch information
nymann committed Aug 28, 2022
1 parent 5d409d0 commit 1c60394
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 101 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ coverage.xml
htmlcov
.idea
.cache/
*.pdf
9 changes: 6 additions & 3 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ setup_requires =
pytest-runner
install_requires =
typer >= 0.4.1
block-pruner
block-pruner >= 0.0.4
pdf-information

[options.extras_require]
Expand Down Expand Up @@ -72,16 +72,19 @@ convention=google
[flake8]
docstring-style = google
format = wemake
ignore = WPS305,D100,D101,D102,D103,D104,D107,H601,WPS306
ignore =
WPS305 # allow f-string
WPS306 # allow class without base class (implicit object)
D100,D101,D102,D103,D104,D107 # docstrings are a code smell and should not be required.
max-complexity = 6
max-line-length = 120
max-module-members = 8
show-source = True
strictness = long
inline-quotes = double
per-file-ignores =
tests/**.py:WPS218,WPS432,WPS442,S101,
src/pdf_scrub/version.py:WPS410
src/pdf_scrub/**.py:S603,S607,S108,WPS239,WPS237,WPS210,C901,WPS110,S404,WPS214

[isort]
combine_as_imports = True
Expand Down
17 changes: 14 additions & 3 deletions src/pdf_scrub/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,25 @@
from pdf_scrub.pdf import PDF

# Typer application object; CLI commands are registered on it with ``@app.command()``.
app = typer.Typer()
# Declaration of the ``compress`` CLI option (enabled by default). It is built once at module level
# and used as the default value of the ``compress`` parameter of the ``scrub`` command.
CompressOption = typer.Option(default=True, help="Compress the final pdf to reduce file size greatly")


def save(index: int, potential: bytes, name: str) -> None:
    """Write one scrubbed candidate into the current working directory.

    Args:
        index: Position of the candidate; becomes the file-name prefix.
        potential: Raw bytes of the candidate PDF.
        name: Original file name, appended after ``"{index}-"``.
    """
    # An existing file with the same name is truncated and overwritten.
    destination = Path.cwd() / f"{index}-{name}"
    destination.write_bytes(potential)


@app.command()
def scrub(files: list[Path]) -> None:
def scrub(
files: list[Path],
compress: bool = CompressOption,
) -> None:
for pdf_file in files:
typer.echo(f"Scrubbing {pdf_file.name}")
pdf = PDF(pdf_file=pdf_file)
for potential in pdf.scrub():
typer.echo(potential)
for index, potential in enumerate(pdf.scrub(compress=compress)):
save(index=index, potential=potential, name=pdf_file.name)


if __name__ == "__main__":
Expand Down
120 changes: 25 additions & 95 deletions src/pdf_scrub/pdf.py
Original file line number Diff line number Diff line change
@@ -1,104 +1,34 @@
from pathlib import Path
import subprocess
from typing import Iterable

from block_pruner import BlockPruner
from pdf_information import PDFInfo

from pdf_scrub.toolkit import compress_pdf
from pdf_scrub.toolkit import decrypt_pdf
from pdf_scrub.toolkit import find_possible_watermark_needles
from pdf_scrub.toolkit import remove_metadata
from pdf_scrub.toolkit import remove_watermark
from pdf_scrub.toolkit import uncompress_pdf


class PDF:
def __init__(self, pdf_file: Path) -> None:
self.pdf_file = pdf_file
self.scrubbed = pdf_file
self.info = PDFInfo.from_cmd(pdf_file=pdf_file)

def scrub(self) -> Iterable[Path]:
if self.info.encrypted:
self.scrubbed = self._decrypt()
self.scrubbed = self._uncompress()
self.scrubbed = self._remove_metadata()
for index, possibility in enumerate(self._find_possible_watermarks()):
needle: str = possibility.strip("\n")
potential = self._remove_potential_watermark(needle=needle, attempt=index)
yield self._compress(potential=potential)

def _remove_metadata(self) -> Path:
block_pruner = BlockPruner(start=r"[0-9]+\ [0-9]+\ obj", end="endobj", needle="DocumentID")
remove_metadata = block_pruner.prune_file(self.scrubbed)
out = self._file_path("no_metadata")
with open(out, "w+b") as output_file:
output_file.write(remove_metadata)
return out

def _file_path(self, name: str | int) -> Path:
return self.scrubbed.parent.joinpath(f"{self.scrubbed.name}.{name}")

def _remove_potential_watermark(self, needle: str, attempt: int) -> Path:
block_pruner = BlockPruner(start=r"[0-9]+\ [0-9]+\ obj", end="endobj", needle=needle)
out = self._file_path(attempt)
potential = block_pruner.prune_file(self.scrubbed)
with open(out, "w+b") as output_file:
output_file.write(potential)
return out

def _decrypt(self) -> Path:
out = Path(f"/tmp/{self.scrubbed.name}.decrypted")
subprocess.check_call(
[
"qpdf",
"--decrypt",
self.scrubbed,
out,
],
)
return out

def _compress(self, potential: Path) -> Path:
out = Path(f"/tmp/{potential.name}.compressed")
subprocess.check_call(
[
"pdftk",
potential,
"output",
out,
"compress",
],
)
return out

def _uncompress(self) -> Path:
out = self._file_path("uncompressed")
subprocess.check_call(
[
"pdftk",
self.scrubbed,
"output",
out,
"uncompress",
],
)
return out

def _find_possible_watermarks(self) -> Iterable[str]:
if self.info.pages is None:
raise ValueError("Cannot find watermark if pages is none")
possibilities: dict[str, int] = {}
with open(file=self.scrubbed, mode="rb") as decrypted_file:
for raw_line in decrypted_file:
line = utf8_or_space(raw_line)
if "/Length" not in line:
continue
try:
possibilities[line] += 1
except KeyError:
possibilities[line] = 1
for key, val in possibilities.items():
if val == self.info.pages:
yield key


def utf8_or_space(line: bytes) -> str:
try:
return line.decode()
except UnicodeDecodeError:
return ""
self.pdf_info = PDFInfo.from_cmd(pdf_file=pdf_file)

def scrub(self, compress: bool) -> Iterable[bytes]:
    """Yield one candidate PDF per possible watermark needle.

    The uncompressed document is passed through ``remove_metadata`` once; each needle
    reported by ``find_possible_watermark_needles`` is then removed from that same
    metadata-free baseline, so candidates are independent of one another.

    Args:
        compress: When true, each candidate is passed through ``compress_pdf`` before
            being yielded; otherwise it is yielded as produced by ``remove_watermark``.

    Yields:
        The bytes of each candidate PDF.
    """
    baseline: bytes = remove_metadata(pdf_bytes=self._get_uncompressed())
    needles = find_possible_watermark_needles(pdf_bytes=baseline, pdf_info=self.pdf_info)
    for needle in needles:
        candidate = remove_watermark(pdf_bytes=baseline, needle=needle)
        yield compress_pdf(pdf_bytes=candidate) if compress else candidate

def _get_uncompressed(self) -> bytes:
    """Return the source PDF as bytes after ``uncompress_pdf`` has processed it.

    Encrypted documents (per ``self.pdf_info.encrypted``) are obtained through
    ``decrypt_pdf``; all others are read straight from ``self.pdf_file``.

    Returns:
        The output of ``uncompress_pdf`` for the (possibly decrypted) document.
    """
    if self.pdf_info.encrypted:
        source_bytes: bytes = decrypt_pdf(pdf_file=self.pdf_file)
        return uncompress_pdf(pdf_file_buffer=source_bytes)
    with open(file=self.pdf_file, mode="rb") as handle:
        source_bytes = handle.read()
    return uncompress_pdf(pdf_file_buffer=source_bytes)
63 changes: 63 additions & 0 deletions src/pdf_scrub/toolkit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from collections import Counter
from pathlib import Path
import subprocess  # noqa: S404
from typing import Iterable

from block_pruner import BlockPruner
from pdf_information import PDFInfo


def _pdftk(stdin: bytes, command: str) -> bytes:
    """Pipe a PDF through ``/usr/bin/pdftk`` and return what it prints to stdout.

    Args:
        stdin: Raw PDF bytes fed to the process on standard input.
        command: The pdftk output option placed after ``output -``.

    Returns:
        The bytes captured from the process's standard output.

    Raises:
        subprocess.CalledProcessError: If pdftk exits with a non-zero status.
    """
    # "-" is passed both as the input and as the output argument.
    argv = ["/usr/bin/pdftk", "-", "output", "-", command]
    return subprocess.check_output(argv, input=stdin)  # noqa: S603


def uncompress_pdf(pdf_file_buffer: bytes) -> bytes:
    """Run a PDF through pdftk with the ``uncompress`` option.

    Args:
        pdf_file_buffer: Raw bytes of the PDF to process.

    Returns:
        The bytes returned by ``_pdftk`` for the ``uncompress`` command.
    """
    uncompressed = _pdftk(pdf_file_buffer, "uncompress")
    return uncompressed


def decrypt_pdf(pdf_file: Path) -> bytes:
    """Run ``/usr/bin/qpdf --decrypt`` on a file and capture the result.

    Args:
        pdf_file: Path of the PDF handed to qpdf as its input file.

    Returns:
        The bytes qpdf wrote to standard output (``-`` is given as the output argument).

    Raises:
        subprocess.CalledProcessError: If qpdf exits with a non-zero status.
    """
    qpdf_argv = ["/usr/bin/qpdf", "--decrypt", pdf_file, "-"]
    return subprocess.check_output(qpdf_argv)  # noqa: S603


def compress_pdf(pdf_bytes: bytes) -> bytes:
    """Run a PDF through pdftk with the ``compress`` option.

    Args:
        pdf_bytes: Raw bytes of the PDF to process.

    Returns:
        The bytes returned by ``_pdftk`` for the ``compress`` command.
    """
    compressed = _pdftk(pdf_bytes, "compress")
    return compressed


def remove_metadata(pdf_bytes: bytes) -> bytes:
    """Prune ``xpacket`` blocks mentioning ``DocumentID`` from a PDF.

    A ``BlockPruner`` is configured with the ``<?xpacket begin`` / ``<?xpacket end``
    markers as block delimiters and ``DocumentID`` as the needle.
    NOTE(review): presumably only blocks containing the needle are dropped; confirm
    against the block-pruner documentation.

    Args:
        pdf_bytes: Raw bytes of the PDF to prune.

    Returns:
        The bytes returned by ``BlockPruner.prune_bytes``.
    """
    xpacket_pruner = BlockPruner(
        start=r"<\?xpacket\ begin",
        end=r"<\?xpacket\ end",
        needle="DocumentID",
    )
    return xpacket_pruner.prune_bytes(input_data=pdf_bytes)


def remove_watermark(pdf_bytes: bytes, needle: str) -> bytes:
    """Prune PDF objects that mention ``needle``.

    A ``BlockPruner`` is configured with the ``N M obj`` header pattern and ``endobj``
    as block delimiters.
    NOTE(review): presumably only objects containing the needle are dropped; confirm
    against the block-pruner documentation.

    Args:
        pdf_bytes: Raw bytes of the PDF to prune.
        needle: Text identifying the objects to remove.

    Returns:
        The bytes returned by ``BlockPruner.prune_bytes``.
    """
    object_pruner = BlockPruner(
        start=r"[0-9]+\ [0-9]+\ obj",
        end="endobj",
        needle=needle,
    )
    return object_pruner.prune_bytes(input_data=pdf_bytes)


def find_possible_watermark_needles(pdf_bytes: bytes, pdf_info: PDFInfo) -> Iterable[str]:
    """Yield every ``/Length`` line that occurs exactly once per page.

    The document is split into lines, the lines containing ``/Length`` are counted,
    and those whose count equals ``pdf_info.pages`` are yielded in order of first
    appearance.

    Args:
        pdf_bytes: Raw bytes of the PDF to inspect.
        pdf_info: Document information; only ``pages`` is read.

    Yields:
        Each decoded line (without its line ending) that qualifies as a needle.

    Raises:
        ValueError: If ``pdf_info.pages`` is ``None``. Because this function is a
            generator, the error surfaces when iteration starts, not at call time.
    """
    if pdf_info.pages is None:
        raise ValueError("Cannot find watermark if pages is none")
    occurrences: Counter[str] = Counter()
    for raw_line in pdf_bytes.splitlines():
        # Check the raw bytes first: "/Length" is pure ASCII, so the byte-level test
        # agrees with the text-level one and spares decoding every non-candidate line.
        if b"/Length" not in raw_line:
            continue
        try:
            line = raw_line.decode()
        except UnicodeDecodeError:
            # Lines that are not valid UTF-8 are never treated as candidates.
            continue
        occurrences[line] += 1
    for needle, count in occurrences.items():
        if count == pdf_info.pages:
            yield needle


def utf8_or_space(line: bytes) -> str:
    """Decode ``line`` as UTF-8, falling back to an empty string.

    Args:
        line: Raw bytes to decode.

    Returns:
        The decoded text, or ``""`` when the bytes are not valid UTF-8.
    """
    try:
        decoded = line.decode()
    except UnicodeDecodeError:
        # Despite the function's name, the fallback is the empty string, not a space.
        decoded = ""
    return decoded

0 comments on commit 1c60394

Please sign in to comment.