From 568b37d11123b2450b11851bbb633454f755773a Mon Sep 17 00:00:00 2001
From: Ars Vladimirov
Date: Tue, 10 Oct 2023 22:27:02 +0600
Subject: [PATCH] fix max image size

---
 src/main.py | 134 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 92 insertions(+), 42 deletions(-)

diff --git a/src/main.py b/src/main.py
index f232c0d..95c8735 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,4 +1,6 @@
 import os
+import tarfile
+from tqdm import tqdm
 import supervisely as sly
 from supervisely.api.module_api import ApiField
 from supervisely.io.fs import get_file_ext
@@ -6,6 +8,10 @@
 from distutils import util
 from dotenv import load_dotenv
 
+from PIL import Image
+
+Image.MAX_IMAGE_PIXELS = 1000000000
+
 from dataset_tools import ProjectRepo
 
 if sly.is_development():
@@ -17,19 +23,30 @@
 
 
 my_app: AppService = AppService()
 
-TEAM_ID = int(os.environ['context.teamId'])
-WORKSPACE_ID = int(os.environ['context.workspaceId'])
-PROJECT_ID = int(os.environ['modal.state.slyProjectId'])
-DATASET_ID = os.environ.get('modal.state.slyDatasetId', None)
+TEAM_ID = int(os.environ["context.teamId"])
+WORKSPACE_ID = int(os.environ["context.workspaceId"])
+PROJECT_ID = int(os.environ["modal.state.slyProjectId"])
+DATASET_ID = os.environ.get("modal.state.slyDatasetId", None)
 if DATASET_ID is not None:
     DATASET_ID = int(DATASET_ID)
 task_id = int(os.environ["TASK_ID"])
-mode = os.environ['modal.state.download']
-replace_method = bool(util.strtobool(os.environ['modal.state.fixExtension']))
+mode = os.environ["modal.state.download"]
+replace_method = bool(util.strtobool(os.environ["modal.state.fixExtension"]))
 batch_size = 10
 
 
+def pack_directory_to_tar(source_dir, output_tar):
+    if not os.path.exists(source_dir):
+        raise FileNotFoundError(f"Source directory '{source_dir}' does not exist.")
+
+    with tarfile.open(output_tar, "w") as tar:
+        for root, _, files in os.walk(source_dir):
+            for file in tqdm(files, desc=f"Packing to '{output_tar}'", unit="file"):
+                file_path = os.path.join(root, file)
+                tar.add(file_path, arcname=os.path.relpath(file_path, source_dir))
+
+
 def ours_convert_json_info(self, info: dict, skip_missing=True):
     if info is None:
         return None
@@ -44,7 +61,7 @@ def ours_convert_json_info(self, info: dict, skip_missing=True):
         val = info[field_name]
         field_values.append(val)
         if field_name == ApiField.MIME:
-            temp_ext = val.split('/')[1]
+            temp_ext = val.split("/")[1]
             field_values.append(temp_ext)
     for idx, field_name in enumerate(self.info_sequence()):
         if field_name == ApiField.NAME:
@@ -52,7 +69,7 @@ def ours_convert_json_info(self, info: dict, skip_missing=True):
             if not cur_ext:
                 field_values[idx] = "{}.{}".format(field_values[idx], temp_ext)
                 break
-            if temp_ext == 'jpeg' and cur_ext in ['jpg', 'jpeg', 'mpo']:
+            if temp_ext == "jpeg" and cur_ext in ["jpg", "jpeg", "mpo"]:
                 break
             if temp_ext != cur_ext and cur_ext is not None:
                 pass
@@ -61,12 +78,12 @@ def ours_convert_json_info(self, info: dict, skip_missing=True):
 
 
 def init(data, state):
-    state['download'] = mode
-    state['fixExtension'] = replace_method
+    state["download"] = mode
+    state["fixExtension"] = replace_method
 
 
 if replace_method:
-    sly.logger.debug('change SDK method')
+    sly.logger.debug("change SDK method")
     sly.api.image_api.ImageApi._convert_json_info = ours_convert_json_info
 
 
@@ -86,45 +103,68 @@ def download_as_sly(api: sly.Api, task_id, context, state, app_logger):
     try:
         datasets = api.dataset.get_list(project.id)
     except Exception as e:
-        raise Exception(f"Failed to get list of datasets from project ID:{project.id}. {e}")
{e}") + raise Exception( + f"Failed to get list of datasets from project ID:{project.id}. {e}" + ) dataset_ids = [dataset.id for dataset in datasets] - if mode == 'all': + if mode == "all": download_json_plus_images(api, project, dataset_ids) else: download_only_json(api, project, dataset_ids) - download_dir = os.path.join(my_app.data_dir, f'{project.id}_{project.name}') - full_archive_name = str(project.id) + '_' + project.name + '.tar' + download_dir = os.path.join(my_app.data_dir, f"{project.id}_{project.name}") + full_archive_name = str(project.id) + "_" + project.name + ".tar" result_archive = os.path.join(my_app.data_dir, full_archive_name) - sly.fs.archive_directory(download_dir, result_archive) + pack_directory_to_tar(download_dir, result_archive) + # sly.fs.archive_directory(download_dir, result_archive) app_logger.info("Result directory is archived") upload_progress = [] remote_archive_path = os.path.join( - sly.team_files.RECOMMENDED_EXPORT_PATH, "export-to-supervisely-format/{}_{}".format(task_id, full_archive_name)) + sly.team_files.RECOMMENDED_EXPORT_PATH, + "export-to-supervisely-format/{}_{}".format(task_id, full_archive_name), + ) def _print_progress(monitor, upload_progress): if len(upload_progress) == 0: - upload_progress.append(sly.Progress(message="Upload {!r}".format(full_archive_name), - total_cnt=monitor.len, - ext_logger=app_logger, - is_size=True)) + upload_progress.append( + sly.Progress( + message="Upload {!r}".format(full_archive_name), + total_cnt=monitor.len, + ext_logger=app_logger, + is_size=True, + ) + ) upload_progress[0].set_current_value(monitor.bytes_read) - file_info = api.file.upload(TEAM_ID, result_archive, remote_archive_path, - lambda m: _print_progress(m, upload_progress)) + file_info = api.file.upload( + TEAM_ID, + result_archive, + remote_archive_path, + lambda m: _print_progress(m, upload_progress), + ) app_logger.info("Uploaded to Team-Files: {!r}".format(file_info.path)) - api.task.set_output_archive(task_id, file_info.id, full_archive_name, file_url=file_info.storage_path) + api.task.set_output_archive( + task_id, file_info.id, full_archive_name, file_url=file_info.storage_path + ) my_app.stop() def download_json_plus_images(api, project, dataset_ids): - sly.logger.info('###########DOWNLOAD_PROJECT', extra={'title': project.name}) - download_dir = os.path.join(my_app.data_dir, f'{project.id}_{project.name}') + sly.logger.info("DOWNLOAD_PROJECT", extra={"title": project.name}) + download_dir = os.path.join(my_app.data_dir, f"{project.id}_{project.name}") if os.path.exists(download_dir): sly.fs.clean_dir(download_dir) - sly.download_project(api, project.id, download_dir, dataset_ids=dataset_ids, - log_progress=True, batch_size=batch_size) - sly.logger.info('Project {!r} has been successfully downloaded.'.format(project.name)) + sly.download_project( + api, + project.id, + download_dir, + dataset_ids=dataset_ids, + log_progress=True, + batch_size=batch_size, + ) + sly.logger.info( + "Project {!r} has been successfully downloaded.".format(project.name) + ) sly.logger.info("Start building files...") print(project.custom_data) @@ -134,24 +174,28 @@ def download_json_plus_images(api, project, dataset_ids): def download_only_json(api, project, dataset_ids): - sly.logger.info('DOWNLOAD_PROJECT', extra={'title': project.name}) - download_dir = os.path.join(my_app.data_dir, f'{project.id}_{project.name}') + sly.logger.info("DOWNLOAD_PROJECT", extra={"title": project.name}) + download_dir = os.path.join(my_app.data_dir, f"{project.id}_{project.name}") 
     sly.fs.mkdir(download_dir)
 
     meta_json = api.project.get_meta(project.id)
-    sly.io.json.dump_json_file(meta_json, os.path.join(download_dir, 'meta.json'))
+    sly.io.json.dump_json_file(meta_json, os.path.join(download_dir, "meta.json"))
 
     total_images = 0
     dataset_info = (
         [api.dataset.get_info_by_id(ds_id) for ds_id in dataset_ids]
-        if (dataset_ids is not None) else api.dataset.get_list(project.id))
+        if (dataset_ids is not None)
+        else api.dataset.get_list(project.id)
+    )
 
     for dataset in dataset_info:
-        ann_dir = os.path.join(download_dir, dataset.name, 'ann')
+        ann_dir = os.path.join(download_dir, dataset.name, "ann")
         sly.fs.mkdir(ann_dir)
 
         images = api.image.get_list(dataset.id)
         ds_progress = sly.Progress(
-            'Downloading annotations for: {!r}/{!r}'.format(project.name, dataset.name), total_cnt=len(images))
+            "Downloading annotations for: {!r}/{!r}".format(project.name, dataset.name),
+            total_cnt=len(images),
+        )
         for batch in sly.batched(images, batch_size=10):
             image_ids = [image_info.id for image_info in batch]
             image_names = [image_info.name for image_info in batch]
@@ -160,14 +204,19 @@ def download_only_json(api, project, dataset_ids):
 
             ann_infos = api.annotation.download_batch(dataset.id, image_ids)
             for image_name, ann_info in zip(image_names, ann_infos):
-                sly.io.json.dump_json_file(ann_info.annotation, os.path.join(ann_dir, image_name + '.json'))
+                sly.io.json.dump_json_file(
+                    ann_info.annotation, os.path.join(ann_dir, image_name + ".json")
+                )
             ds_progress.iters_done_report(len(batch))
             total_images += len(batch)
 
-    sly.logger.info('Project {!r} has been successfully downloaded'.format(project.name))
-    sly.logger.info('Total number of images: {!r}'.format(total_images))
+    sly.logger.info(
+        "Project {!r} has been successfully downloaded".format(project.name)
+    )
+    sly.logger.info("Total number of images: {!r}".format(total_images))
 
-def build_license(license_content:str, download_dir:str):
+
+def build_license(license_content: str, download_dir: str):
     license_path = os.path.join(download_dir, "LICENSE.md")
     print(license_content)
     with open(license_path, "w") as license_file:
@@ -179,14 +228,15 @@ def build_readme(readme_content, download_dir):
     with open(readme_path, "w") as license_file:
         license_file.write(readme_content)
 
+
 def main():
     sly.logger.info(
         "Script arguments",
         extra={
-            "TEAM_ID": TEAM_ID,
+            "TEAM_ID": TEAM_ID,
             "WORKSPACE_ID": WORKSPACE_ID,
-            "PROJECT_ID": PROJECT_ID
-        }
+            "PROJECT_ID": PROJECT_ID,
+        },
     )
 
     data = {}
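
Note on the Image.MAX_IMAGE_PIXELS override: Pillow ships a decompression-bomb
guard, and the check runs when an image is opened. With the default limit of
about 89 million pixels, Image.open() emits a DecompressionBombWarning above
the limit and raises DecompressionBombError above roughly twice the limit;
raising the limit to 1e9 pixels is what lets this export handle very large
images. A minimal sketch of the behavior, with a hypothetical file name:

    from PIL import Image

    # Default guard: ~89 Mpx warns on open, ~179 Mpx raises
    # PIL.Image.DecompressionBombError.
    Image.MAX_IMAGE_PIXELS = 1000000000  # as in this patch; None disables the check

    img = Image.open("huge_orthophoto.png")  # hypothetical very large image
    print(img.size)  # opens without the bomb error for images up to 1e9 pixels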
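Note on pack_directory_to_tar: it swaps in for sly.fs.archive_directory so that
packing reports per-file tqdm progress and writes an uncompressed tar (mode
"w"). Because it walks regular files only, empty subdirectories are not added
to the archive. A usage sketch with hypothetical paths:

    import tarfile

    # Pack the downloaded project, then sanity-check the archive contents.
    pack_directory_to_tar("/sly_task_data/123_project", "/sly_task_data/123_project.tar")
    with tarfile.open("/sly_task_data/123_project.tar") as tar:
        print(len(tar.getnames()), "entries packed")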