Skip to content

Commit 502b1f4

Browse files
authored
Merge pull request #14 from Zipstack/feat/recursive-processing-of-input-files
feat: Added option to recurse sub-directories
2 parents adc6aeb + 90d783c commit 502b1f4

File tree

2 files changed

+37
-22
lines changed

2 files changed

+37
-22
lines changed

Diff for: README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# File Processing Script
22

3-
This script processes files in a specified directory using an API, logs results in a local SQLite database, and provides options for retrying failed or pending files. It includes features for skipping specific files, generating reports, and running multiple API calls in parallel.
3+
This script processes files in a specified directory (optionally recursing into its sub-directories with `--recursive`) using an API, logs results in a local SQLite database, and provides options for retrying failed or pending files. It includes features for skipping specific files, generating reports, and running multiple API calls in parallel.
44

55
## Features
66

@@ -61,6 +61,7 @@ This will display detailed usage information.
6161
- `-p`, `--parallel_call_count`: Number of parallel API calls (default: 10).
6262
- `--csv_report`: Path to export the detailed report as a CSV file.
6363
- `--db_path`: Path where the SQLite DB file is stored (default: './file_processing.db').
64+
- `--recursive`: Recursively identify and process files from the input folder path (default: False)
6465
- `--retry_failed`: Retry processing of failed files.
6566
- `--retry_pending`: Retry processing of pending files by making new requests.
6667
- `--skip_pending`: Skip processing of pending files.

Diff for: main.py

+35-21
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ class Arguments:
2929
input_folder_path: str = ""
3030
db_path: str = ""
3131
parallel_call_count: int = 5
32+
recurse_input_folder: bool = False
3233
retry_failed: bool = False
3334
retry_pending: bool = False
3435
skip_pending: bool = False
@@ -463,11 +464,15 @@ def process_file(
463464

464465

465466
def load_folder(args: Arguments):
466-
files = [
467-
os.path.join(args.input_folder_path, f)
468-
for f in os.listdir(args.input_folder_path)
469-
if os.path.isfile(os.path.join(args.input_folder_path, f))
470-
]
467+
files = []
468+
for root, _, filenames in os.walk(args.input_folder_path):
469+
for f in filenames:
470+
file_path = os.path.join(root, f)
471+
if os.path.isfile(file_path):
472+
files.append(file_path)
473+
if not args.recurse_input_folder:
474+
break
475+
logger.debug(f"Loaded '{len(files)}' files from '{args.input_folder_path}': {files}")
471476

472477
with Manager() as manager, Pool(args.parallel_call_count) as executor:
473478
success_count = manager.Value("i", 0) # Shared integer for success count
@@ -501,6 +506,24 @@ def load_folder(args: Arguments):
501506
pbar.close()
502507

503508

509+
def api_deployment_batch_run(args: Arguments):
510+
logger.warning(f"Running with params: {args}")
511+
init_db(args=args) # Initialize DB
512+
513+
load_folder(args=args)
514+
515+
print_summary(args=args) # Print summary at the end
516+
if args.print_report:
517+
print_report(args=args)
518+
logger.warning(
519+
"Elapsed time calculation of a file which was resumed"
520+
" from pending state will not be correct"
521+
)
522+
523+
if args.csv_report:
524+
export_report_to_csv(args=args)
525+
526+
504527
def main():
505528
parser = argparse.ArgumentParser(description="Process files using Unstract's API deployment")
506529
parser.add_argument(
@@ -564,6 +587,12 @@ def main():
564587
type=str,
565588
help='Path to export the detailed report as a CSV file',
566589
)
590+
parser.add_argument(
591+
"--recursive",
592+
dest="recurse_input_folder",
593+
action="store_true",
594+
help="Recursively identify and process files from the input folder path (default: False)",
595+
)
567596
parser.add_argument(
568597
"--retry_failed",
569598
dest="retry_failed",
@@ -625,22 +654,7 @@ def main():
625654
ch.setFormatter(formatter)
626655
logging.basicConfig(level=args.log_level, handlers=[ch])
627656

628-
logger.warning(f"Running with params: {args}")
629-
630-
init_db(args=args) # Initialize DB
631-
632-
load_folder(args=args)
633-
634-
print_summary(args=args) # Print summary at the end
635-
if args.print_report:
636-
print_report(args=args)
637-
logger.warning(
638-
"Elapsed time calculation of a file which was resumed"
639-
" from pending state will not be correct"
640-
)
641-
642-
if args.csv_report:
643-
export_report_to_csv(args=args)
657+
api_deployment_batch_run(args=args)
644658

645659

646660
if __name__ == "__main__":

0 commit comments

Comments
 (0)