Skip to content

Commit

Permalink
Support multiple paths (closes #26)
Browse files Browse the repository at this point in the history
  • Loading branch information
AliOsm committed Jul 26, 2024
1 parent 5e58839 commit 2b94116
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 43 deletions.
10 changes: 4 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
<h3 dir="rtl">الخيارات المتوفرة</h3>

<ul dir="rtl">
<li>مسار ملف PDF أو مجلد يحتوي على أكثر من ملف PDF: يجب تمرير مسار الملف أو المجلد بعد اسم أداة تحويل بشكل مباشر. على سبيل المثال: <code dir="ltr">tahweel "./pdfs"</code></li>
<li>مسارات ملفات PDF أو مجلدات تحتوي على أكثر من ملف PDF: يجب تمرير مسارات الملفات أو المجلدات بعد اسم أداة تحويل بشكل مباشر. على سبيل المثال: <code dir="ltr">tahweel "./pdfs" "./book.pdf"</code></li>
<li>ملف Service Account Credentials: يجب تمرير مسار ملف <code>JSON</code> الخاص بك من Google Cloud Platform إلى الاختيار <code dir="ltr">--service-account-credentials</code></li>
<li>عدد عمليات تحويل ملف PDF إلى صور: يمكن تحديد العدد من خلال الاختيار <code dir="ltr">--pdf2image-thread-count</code>. حسب قوة حاسبك يمكن تقليل أو زيادة هذه القيمة. القيمة الافتراضية هي <code dir="ltr">8</code></li>
<li>عدد عمليات تحويل الصور إلى نص: يمكن تحديد العدد من خلال الاختيار <code dir="ltr">--processor-max-workers</code>. حسب جودة اتصال الانترنت لديك يمكن تقليل أو زيادة هذه القيمة. القيمة الافتراضية هي <code dir="ltr">8</code></li>
Expand All @@ -62,11 +62,11 @@
```
➜ tahweel --help
usage: tahweel --service-account-credentials SERVICE_ACCOUNT_CREDENTIALS [--pdf2image-thread-count PDF2IMAGE_THREAD_COUNT] [--processor-max-workers PROCESSOR_MAX_WORKERS]
[--dir-output-type {tree_to_tree,side_by_side}] [--txt-page-separator TXT_PAGE_SEPARATOR] [--docx-remove-newlines] [--skip-output-check] [--tahweel-type {file,dir}] [-h] [--version]
file_or_dir_path
[--dir-output-type {tree_to_tree,side_by_side}] [--txt-page-separator TXT_PAGE_SEPARATOR] [--docx-remove-newlines] [--skip-output-check] [-h] [--version]
files_or_dirs_paths [files_or_dirs_paths ...]
positional arguments:
file_or_dir_path Path to the file or directory to be processed.
files_or_dirs_paths Path to the file or directory to be processed.
options:
--service-account-credentials SERVICE_ACCOUNT_CREDENTIALS
Expand All @@ -83,8 +83,6 @@ options:
--docx-remove-newlines
(bool, default=False) Remove newlines from the output DOCX file. Useful if you want DOCX and PDF to have the same page count.
--skip-output-check (bool, default=False) Use this flag in development only to skip the output check.
--tahweel-type {file,dir}
Don't use this argument, it will be auto-set based on `file_or_dir_path`.
-h, --help show this help message and exit
--version show program's version number and exit
```
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "tahweel"
version = "0.0.10"
version = "0.0.11"
description = "تحويل ملفات PDF إلى Word و TXT"
authors = ["EasyBooks <[email protected]>"]
license = "MIT"
Expand Down
58 changes: 38 additions & 20 deletions tahweel/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,30 +37,48 @@ def main() -> None:

prepare_package_dirs()

match args.tahweel_type:
case TahweelType.FILE:
pdf_file_paths = [args.file_or_dir_path]
case TahweelType.DIR:
pdf_file_paths = list(args.file_or_dir_path.rglob('*.pdf'))

for pdf_file_path in tqdm(pdf_file_paths, desc='Files'):
pdf_file_manager = PdfFileManager(pdf_file_path, args.pdf2image_thread_count)

try:
process_file(args, processor, pdf_file_manager)
except Exception as e:
logging.error(f'Failed to process "{pdf_file_manager.file_path}" due to {e}, continuing...', exc_info=True)
continue
for file_or_dir_path in tqdm(args.files_or_dirs_paths, desc='Paths'):
process_path(args, processor, file_or_dir_path)


def prepare_package_dirs() -> None:
    """Make sure the per-user cache directory for Tahweel exists.

    The location is whatever ``platformdirs.user_cache_dir('Tahweel')`` returns. Missing
    parent directories are created along the way, and an already existing directory is
    not treated as an error.
    """
    cache_dir = Path(platformdirs.user_cache_dir('Tahweel'))
    cache_dir.mkdir(parents=True, exist_ok=True)


def process_file(args: TahweelArgumentParser, processor: BaseOcrProcessor, file_manager: PdfFileManager) -> None:
if not args.skip_output_check and file_manager.output_exists(
args.tahweel_type, args.dir_output_type, args.file_or_dir_path
):
def process_path(args: TahweelArgumentParser, processor: BaseOcrProcessor, file_or_dir_path: Path) -> None:
    """Dispatch one command-line path to the file handler or the directory handler.

    Args:
        args: Parsed command-line arguments, forwarded unchanged to the handler.
        processor: OCR processor, forwarded unchanged to the handler.
        file_or_dir_path: A path supplied by the user on the command line.

    A path that is neither an existing file nor an existing directory is reported through
    ``logging.error`` and skipped; the function then returns normally so the caller can
    carry on with the remaining paths.
    """
    if file_or_dir_path.is_file():
        process_single_file(args, processor, file_or_dir_path)
    elif file_or_dir_path.is_dir():
        process_directory(args, processor, file_or_dir_path)
    else:
        # Anything that was not a file used to be handed to the directory handler, so a
        # mistyped or missing path produced no error message at all.
        logging.error(f'Skipping "{file_or_dir_path}": it is neither an existing file nor a directory.')


def process_single_file(args: TahweelArgumentParser, processor: BaseOcrProcessor, file_path: Path) -> None:
    """Process one PDF file that was passed directly on the command line.

    Args:
        args: Parsed command-line arguments; ``pdf2image_thread_count`` is read here and
            the whole object is forwarded to ``process_file``.
        processor: OCR processor forwarded to ``process_file``.
        file_path: Path of the file to process; also passed on as the base path together
            with ``TahweelType.FILE``.

    Any exception raised while creating the file manager or while processing is logged with
    its traceback and swallowed, so a single failing file does not abort the run.
    """
    try:
        pdf_file_manager = PdfFileManager(file_path, args.pdf2image_thread_count)
        process_file(args, processor, pdf_file_manager, file_path, TahweelType.FILE)
    except Exception as error:
        logging.error(f'Failed to process "{file_path}" due to {error}, continuing...', exc_info=True)


def process_directory(args: TahweelArgumentParser, processor: BaseOcrProcessor, dir_path: Path) -> None:
    """Process every ``*.pdf`` file found recursively under ``dir_path``.

    Args:
        args: Parsed command-line arguments; ``pdf2image_thread_count`` is read here and
            the whole object is forwarded to ``process_file``.
        processor: OCR processor forwarded to ``process_file``.
        dir_path: Directory to search; also passed on as the base path together with
            ``TahweelType.DIR``.

    Each file is handled inside its own try/except: a failure is logged with its traceback
    and the loop moves on to the next file.
    """
    # Materialised as a list before being handed to the progress bar.
    found_pdfs = list(dir_path.rglob('*.pdf'))
    # The directory is shown in the bar label, shortened by the `truncate` helper.
    progress_label = f'Files ({truncate(str(dir_path), 50, from_end=True)})'

    for current_pdf in tqdm(found_pdfs, desc=progress_label):
        try:
            manager = PdfFileManager(current_pdf, args.pdf2image_thread_count)
            process_file(args, processor, manager, dir_path, TahweelType.DIR)
        except Exception as error:
            logging.error(f'Failed to process "{current_pdf}" due to {error}, continuing...', exc_info=True)


def process_file(
args: TahweelArgumentParser,
processor: BaseOcrProcessor,
file_manager: PdfFileManager,
file_or_dir_path: Path,
tahweel_type: TahweelType,
) -> None:
if not args.skip_output_check and file_manager.output_exists(tahweel_type, args.dir_output_type, file_or_dir_path):
return

file_manager.to_images()
Expand All @@ -76,12 +94,12 @@ def process_file(args: TahweelArgumentParser, processor: BaseOcrProcessor, file_

content = list(map(lambda text: apply_transformations(text, TRANSFORMATIONS), content))

TxtWriter(file_manager.txt_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(
TxtWriter(file_manager.txt_file_path(tahweel_type, args.dir_output_type, file_or_dir_path)).write(
content,
args.txt_page_separator,
)

DocxWriter(file_manager.docx_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(
DocxWriter(file_manager.docx_file_path(tahweel_type, args.dir_output_type, file_or_dir_path)).write(
content,
args.docx_remove_newlines,
)
19 changes: 3 additions & 16 deletions tahweel/tahweel_argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

from tap import Tap

from tahweel.enums import DirOutputType, TahweelType
from tahweel.enums import DirOutputType


class TahweelArgumentParser(Tap):
file_or_dir_path: Path
files_or_dirs_paths: list[Path]

service_account_credentials: Path
"""Path to the service account credentials JSON file."""
Expand All @@ -30,10 +30,8 @@ class TahweelArgumentParser(Tap):
skip_output_check: bool = False
"""Use this flag in development only to skip the output check."""

tahweel_type: TahweelType = TahweelType.FILE

def configure(self):
self.add_argument('file_or_dir_path', type=Path, help='Path to the file or directory to be processed.')
self.add_argument('files_or_dirs_paths', nargs='+', help='Path to the file or directory to be processed.')

self.add_argument(
'--dir-output-type',
Expand All @@ -45,20 +43,9 @@ def configure(self):
'while `side_by_side` means the output will be in the same input directory beside each file.',
)

self.add_argument(
'--tahweel-type',
type=TahweelType,
default=TahweelType.FILE,
choices=list(TahweelType),
help="Don't use this argument, it will be auto-set based on `file_or_dir_path`.",
)

self.add_argument(
'--version',
action='version',
version=importlib.metadata.version('tahweel'),
help="show program's version number and exit",
)

def process_args(self):
self.tahweel_type = TahweelType.FILE if self.file_or_dir_path.is_file() else TahweelType.DIR

0 comments on commit 2b94116

Please sign in to comment.