Support multiple paths (closes #26)

ieasybooks · Jul 26, 2024 · 2b94116
1 parent 5e58839
commit 2b94116
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@
 <h3 dir="rtl">الخيارات المتوفرة</h3>
 
 <ul dir="rtl">
-  <li>مسار ملف PDF أو مجلد يحتوي على أكثر من ملف PDF: يجب تمرير مسار الملف أو المجلد بعد اسم أداة تحويل بشكل مباشر. على سبيل المثال: <code dir="ltr">tahweel "./pdfs"</code></li>
+  <li>مسارات ملفات PDF أو مجلدات تحتوي على أكثر من ملف PDF: يجب تمرير مسارات الملفات أو المجلدات بعد اسم أداة تحويل بشكل مباشر. على سبيل المثال: <code dir="ltr">tahweel "./pdfs"</code></li>
   <li>ملف Service Account Credentials: يجب تمرير مسار ملف <code>JSON</code> الخاص بك من Google Cloud Platform إلى الاختيار <code dir="ltr">--service-account-credentials</code></li>
   <li>عدد عمليات تحويل ملف PDF إلى صور: يمكن تحديد العدد من خلال الاختيار <code dir="ltr">--pdf2image-thread-count</code>. حسب قوة حاسبك يمكن تقليل أو زيادة هذه القيمة. القيمة الافتراضية هي <code dir="ltr">8</code></li>
   <li>عدد عمليات تحويل الصور إلى نص: يمكن تحديد العدد من خلال الاختيار <code dir="ltr">--processor-max-workers</code>. حسب جودة اتصال الانترنت لديك يمكن تقليل أو زيادة هذه القيمة. القيمة الافتراضية هي <code dir="ltr">8</code></li>
@@ -62,11 +62,11 @@
 ```
 ➜ tahweel --help
 usage: tahweel --service-account-credentials SERVICE_ACCOUNT_CREDENTIALS [--pdf2image-thread-count PDF2IMAGE_THREAD_COUNT] [--processor-max-workers PROCESSOR_MAX_WORKERS]
-               [--dir-output-type {tree_to_tree,side_by_side}] [--txt-page-separator TXT_PAGE_SEPARATOR] [--docx-remove-newlines] [--skip-output-check] [--tahweel-type {file,dir}] [-h] [--version]
-               file_or_dir_path
+               [--dir-output-type {tree_to_tree,side_by_side}] [--txt-page-separator TXT_PAGE_SEPARATOR] [--docx-remove-newlines] [--skip-output-check] [-h] [--version]
+               files_or_dirs_paths [files_or_dirs_paths ...]
 
 positional arguments:
-  file_or_dir_path      Path to the file or directory to be processed.
+  files_or_dirs_paths   Path to the file or directory to be processed.
 
 options:
   --service-account-credentials SERVICE_ACCOUNT_CREDENTIALS
@@ -83,8 +83,6 @@ options:
   --docx-remove-newlines
                         (bool, default=False) Remove newlines from the output DOCX file. Useful if you want DOCX and PDF to have the same page count.
   --skip-output-check   (bool, default=False) Use this flag in development only to skip the output check.
-  --tahweel-type {file,dir}
-                        Don't use this argument, it will be auto-set based on `file_or_dir_path`.
   -h, --help            show this help message and exit
   --version             show program's version number and exit
 ```

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tahweel"
-version = "0.0.10"
+version = "0.0.11"
 description = "تحويل ملفات PDF إلى Word و TXT"
 authors = ["EasyBooks <[email protected]>"]
 license = "MIT"

diff --git a/tahweel/cli.py b/tahweel/cli.py
@@ -37,30 +37,48 @@ def main() -> None:
 
   prepare_package_dirs()
 
-  match args.tahweel_type:
-    case TahweelType.FILE:
-      pdf_file_paths = [args.file_or_dir_path]
-    case TahweelType.DIR:
-      pdf_file_paths = list(args.file_or_dir_path.rglob('*.pdf'))
-
-  for pdf_file_path in tqdm(pdf_file_paths, desc='Files'):
-    pdf_file_manager = PdfFileManager(pdf_file_path, args.pdf2image_thread_count)
-
-    try:
-      process_file(args, processor, pdf_file_manager)
-    except Exception as e:
-      logging.error(f'Failed to process "{pdf_file_manager.file_path}" due to {e}, continuing...', exc_info=True)
-      continue
+  for file_or_dir_path in tqdm(args.files_or_dirs_paths, desc='Paths'):
+    process_path(args, processor, file_or_dir_path)
 
 
 def prepare_package_dirs() -> None:
   Path(platformdirs.user_cache_dir('Tahweel')).mkdir(parents=True, exist_ok=True)
 
 
-def process_file(args: TahweelArgumentParser, processor: BaseOcrProcessor, file_manager: PdfFileManager) -> None:
-  if not args.skip_output_check and file_manager.output_exists(
-    args.tahweel_type, args.dir_output_type, args.file_or_dir_path
-  ):
+def process_path(args: TahweelArgumentParser, processor: BaseOcrProcessor, file_or_dir_path: Path) -> None:
+  if file_or_dir_path.is_file():
+    process_single_file(args, processor, file_or_dir_path)
+  else:
+    process_directory(args, processor, file_or_dir_path)
+
+
+def process_single_file(args: TahweelArgumentParser, processor: BaseOcrProcessor, file_path: Path) -> None:
+  try:
+    process_file(args, processor, PdfFileManager(file_path, args.pdf2image_thread_count), file_path, TahweelType.FILE)
+  except Exception as e:
+    logging.error(f'Failed to process "{file_path}" due to {e}, continuing...', exc_info=True)
+
+
+def process_directory(args: TahweelArgumentParser, processor: BaseOcrProcessor, dir_path: Path) -> None:
+  pdf_file_paths = list(dir_path.rglob('*.pdf'))
+
+  for pdf_file_path in tqdm(pdf_file_paths, desc=f'Files ({truncate(str(dir_path), 50, from_end=True)})'):
+    try:
+      pdf_file_manager = PdfFileManager(pdf_file_path, args.pdf2image_thread_count)
+
+      process_file(args, processor, pdf_file_manager, dir_path, TahweelType.DIR)
+    except Exception as e:
+      logging.error(f'Failed to process "{pdf_file_path}" due to {e}, continuing...', exc_info=True)
+
+
+def process_file(
+  args: TahweelArgumentParser,
+  processor: BaseOcrProcessor,
+  file_manager: PdfFileManager,
+  file_or_dir_path: Path,
+  tahweel_type: TahweelType,
+) -> None:
+  if not args.skip_output_check and file_manager.output_exists(tahweel_type, args.dir_output_type, file_or_dir_path):
     return
 
   file_manager.to_images()
@@ -76,12 +94,12 @@ def process_file(args: TahweelArgumentParser, processor: BaseOcrProcessor, file_
 
   content = list(map(lambda text: apply_transformations(text, TRANSFORMATIONS), content))
 
-  TxtWriter(file_manager.txt_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(
+  TxtWriter(file_manager.txt_file_path(tahweel_type, args.dir_output_type, file_or_dir_path)).write(
     content,
     args.txt_page_separator,
   )
 
-  DocxWriter(file_manager.docx_file_path(args.tahweel_type, args.dir_output_type, args.file_or_dir_path)).write(
+  DocxWriter(file_manager.docx_file_path(tahweel_type, args.dir_output_type, file_or_dir_path)).write(
     content,
     args.docx_remove_newlines,
   )
diff --git a/tahweel/tahweel_argument_parser.py b/tahweel/tahweel_argument_parser.py
@@ -4,11 +4,11 @@
 
 from tap import Tap
 
-from tahweel.enums import DirOutputType, TahweelType
+from tahweel.enums import DirOutputType
 
 
 class TahweelArgumentParser(Tap):
-  file_or_dir_path: Path
+  files_or_dirs_paths: list[Path]
 
   service_account_credentials: Path
   """Path to the service account credentials JSON file."""
@@ -30,10 +30,8 @@ class TahweelArgumentParser(Tap):
   skip_output_check: bool = False
   """Use this flag in development only to skip the output check."""
 
-  tahweel_type: TahweelType = TahweelType.FILE
-
   def configure(self):
-    self.add_argument('file_or_dir_path', type=Path, help='Path to the file or directory to be processed.')
+    self.add_argument('files_or_dirs_paths', nargs='+', help='Path to the file or directory to be processed.')
 
     self.add_argument(
       '--dir-output-type',
@@ -45,20 +43,9 @@ def configure(self):
       'while `side_by_side` means the output will be in the same input directory beside each file.',
     )
 
-    self.add_argument(
-      '--tahweel-type',
-      type=TahweelType,
-      default=TahweelType.FILE,
-      choices=list(TahweelType),
-      help="Don't use this argument, it will be auto-set based on `file_or_dir_path`.",
-    )
-
     self.add_argument(
       '--version',
       action='version',
       version=importlib.metadata.version('tahweel'),
       help="show program's version number and exit",
     )
-
-  def process_args(self):
-    self.tahweel_type = TahweelType.FILE if self.file_or_dir_path.is_file() else TahweelType.DIR