Merge pull request #36 from ThatOneGoat/33_rescale_panos
Implemented pano cleaning v1.0: panos whose imagery fills only the smaller GSV dimensions (13312 x 6656) of a larger, black-padded canvas are cropped to the content region and resized back up to the full canvas size (see the sketch below).
shokiami authored Dec 1, 2021
2 parents f7ec7b6 + 18ac5f8 commit ecc942e
Showing 5 changed files with 84 additions and 19 deletions.
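In essence, the new cleaning pass detects panos whose imagery fills only the top-left GSV_IMAGE_WIDTH x GSV_IMAGE_HEIGHT (13312 x 6656) region of a larger, black-padded canvas, crops away the padding, and scales the content back up to the original canvas size. A minimal single-image sketch of that logic (assuming, as the PanoScraper.py implementation below does, that the stored canvas is strictly larger than the content region):

    from PIL import Image

    GSV_IMAGE_WIDTH = 13312
    GSV_IMAGE_HEIGHT = 6656

    def clean_pano(pano_path):
        with Image.open(pano_path) as p:
            # a black pixel just past the GSV content area means the imagery
            # fills only the top-left region of the canvas
            if p.load()[GSV_IMAGE_WIDTH, GSV_IMAGE_HEIGHT] == (0, 0, 0):
                content = p.crop((0, 0, GSV_IMAGE_WIDTH, GSV_IMAGE_HEIGHT))
                content.resize(p.size).save(pano_path)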
2 changes: 1 addition & 1 deletion .gitignore
@@ -5,7 +5,7 @@ alphie-sftp
 rawdata
 
 # Ignore generated batch sftp commands files
-batch.txt
+batch*.txt
 
 # Ignore pano-downloads folder
 pano-downloads
28 changes: 16 additions & 12 deletions CropRunner.py
@@ -42,10 +42,10 @@
 mark_center = True
 
 # The number of crops per multicrop
-MULTICROP_COUNT = 3
+MULTICROP_COUNT = 2
 
 # The scale factor for each multicrop
-MULTICROP_SCALE_FACTOR = 1.25
+MULTICROP_SCALE_FACTOR = 1.5
 
 logging.basicConfig(filename='crop.log', level=logging.DEBUG)

@@ -87,8 +87,9 @@ def make_crop(pano_img_path, sv_image_x, sv_image_y, pano_yaw_deg, destination_dir
     :param label_name: label name
     :param multicrop: whether or not to make multiple crops for the label
     :param draw_mark: if a dot should be drawn in the centre of the object/image
-    :return: none
+    :return: crop_names: a list of names of the generated crops
     """
+    crop_names = []
     try:
         im = Image.open(pano_img_path)
         # draw = ImageDraw.Draw(im)
@@ -130,6 +131,7 @@ def make_crop(pano_img_path, sv_image_x, sv_image_y, pano_yaw_deg, destination_dir
             print("Successfully extracted crop to " + crop_name)
             logging.info(label_name + " " + pano_img_path + " " + str(sv_image_x) + " " + str(sv_image_y) + " " + str(pano_yaw_deg))
             logging.info("---------------------------------------------------")
+            crop_names.append(crop_name)
             if not multicrop:
                 break
             crop_width *= MULTICROP_SCALE_FACTOR
@@ -139,7 +141,7 @@ def make_crop(pano_img_path, sv_image_x, sv_image_y, pano_yaw_deg, destination_dir
         print(e)
         print("Error for {}".format(pano_img_path))
 
-    return
+    return crop_names

 def bulk_extract_crops(path_to_db_export, path_to_gsv_scrapes, destination_dir, mark_label=False):
     t_start = perf_counter()
@@ -149,6 +151,10 @@ def bulk_extract_crops(path_to_db_export, path_to_gsv_scrapes, destination_dir, mark_label=False):
         label_list = list(csv_f)
         row_count = len(label_list)
 
+    # make the output directory if needed
+    if not os.path.isdir(destination_dir):
+        os.makedirs(destination_dir)
+
     with mp.Manager() as manager:
         # get cpu core count
         cpu_count = mp.cpu_count()
@@ -186,7 +192,7 @@ def bulk_extract_crops(path_to_db_export, path_to_gsv_scrapes, destination_dir, mark_label=False):
         successful_crop_count = len(output_rows)
         # no_metadata_fail = 0
         # don't count header row as a failed crop
-        no_pano_fail = row_count - successful_crop_count - 1
+        no_pano_fail = ((row_count - 1) * MULTICROP_COUNT) - successful_crop_count
 
         for row in output_rows:
             csv_w.writerow(row)
@@ -216,20 +222,18 @@ def crop_label_subset(input_rows, output_rows, path_to_gsv_scrapes, destination_dir
 
         # Extract the crop
         if os.path.exists(pano_img_path):
-            destination_folder = os.path.join(destination_dir)
-            if not os.path.isdir(destination_folder):
-                os.makedirs(destination_folder)
-
+            crop_names = []
             if not label_type == 0:
                 label_name = str(row[7])
-                make_crop(pano_img_path, sv_image_x, sv_image_y, pano_yaw_deg, destination_dir, label_name, True)
+                crop_names = make_crop(pano_img_path, sv_image_x, sv_image_y, pano_yaw_deg, destination_dir, label_name, True)
             else:
                 # In order to uniquely identify null crops, we concatenate the pid of the process they
                 # were generated on and the counter within the process to the name of the null crop.
                 label_name = "null_" + str(process_pid) + "_" + str(counter)
-                make_crop(pano_img_path, sv_image_x, sv_image_y, pano_yaw_deg, destination_dir, label_name, False)
+                crop_names = make_crop(pano_img_path, sv_image_x, sv_image_y, pano_yaw_deg, destination_dir, label_name, False)
 
-            output_rows.append([label_name, label_type])
+            for crop_name in crop_names:
+                output_rows.append([crop_name, label_type])
         else:
             print("Panorama image not found.")
             try:
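Two of the changes above interact: each label now produces MULTICROP_COUNT = 2 crops (the second 1.5x the width of the first, replacing three crops at 1.25x steps), and make_crop now returns the names of the crops it actually wrote, so bulk_extract_crops counts failures in expected crops rather than labels. A small worked example of the new accounting, using hypothetical counts and assuming every label takes the multicrop path (which holds in this commit, since NULLS_PER_PANO is set to 0 in PanoScraper.py):

    MULTICROP_COUNT = 2

    row_count = 101              # hypothetical: 100 labels plus the CSV header row
    successful_crop_count = 188  # hypothetical: crops actually written

    expected_crops = (row_count - 1) * MULTICROP_COUNT     # 200
    no_pano_fail = expected_crops - successful_crop_count  # 12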
59 changes: 56 additions & 3 deletions PanoScraper.py
@@ -1,19 +1,22 @@
 import csv
+import glob
+import multiprocessing as mp
 import os
+import random
 import subprocess
-import multiprocessing as mp
 from time import perf_counter
+from itertools import islice
 from datatypes.label import Label
 from datatypes.panorama import Panorama
 from datatypes.point import Point
-import random
+from PIL import Image, ImageFile
+ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 GSV_IMAGE_WIDTH = 13312
 GSV_IMAGE_HEIGHT = 6656
 
 # null crops per pano
-NULLS_PER_PANO = 2
+NULLS_PER_PANO = 0

 def bulk_scrape_panos(n, start_row, path_to_labeldata_csv, local_dir, remote_dir, output_csv_name):
     # TODO: find a way to clear the pano-downloads folder and batch.txt file
@@ -123,3 +126,53 @@ def acquire_n_panos(remote_dir, local_dir, pano_ids, thread_id):
     print(result)
     if sftp.returncode != 0:
         print("sftp failed on one or more commands: {0}".format(sftp_command_list))

+def clean_panos(path_to_panos):
+    t_start = perf_counter()
+
+    # get list of pano paths
+    panos = glob.glob(path_to_panos + "/*.jpg")
+
+    # get available cpu core count, capped at 8
+    cpu_count = min(mp.cpu_count(), 8)
+
+    # split pano set into chunks for multiprocessing
+    pano_set_size = len(panos)
+    i = 0
+    processes = []
+    while i < pano_set_size:
+        # never let the chunk size hit zero when there are fewer panos than workers
+        chunk_size = max(1, (pano_set_size - i) // cpu_count)
+        pano_ids = set(islice(panos, i, i + chunk_size))
+        process = mp.Process(target=clean_n_panos, args=(pano_ids,))
+        processes.append(process)
+        cpu_count -= 1
+        i += chunk_size
+
+    # start processes
+    for p in processes:
+        p.start()
+
+    # join processes once finished
+    for p in processes:
+        p.join()
+
+    t_stop = perf_counter()
+    execution_time = t_stop - t_start
+    return execution_time
+
+def clean_n_panos(panos):
+    for pano_path in panos:
+        with Image.open(pano_path) as p:
+            # check whether the pano needs cleaning by looking for black space just
+            # past the GSV content area; guard the pixel access so panos already at
+            # GSV_IMAGE_WIDTH x GSV_IMAGE_HEIGHT don't raise an IndexError
+            pix = p.load()
+            if p.size[0] > GSV_IMAGE_WIDTH and p.size[1] > GSV_IMAGE_HEIGHT \
+                    and pix[GSV_IMAGE_WIDTH, GSV_IMAGE_HEIGHT] == (0, 0, 0):
+                print("resizing", pano_path)
+                original_size = p.size
+                im = p.crop((0, 0, GSV_IMAGE_WIDTH, GSV_IMAGE_HEIGHT))
+                im = im.resize(original_size)
+                im.save(pano_path)
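As a design note, the chunk/Process/join bookkeeping in clean_panos could also be expressed with multiprocessing.Pool, which handles the fan-out and joining itself. A minimal sketch of that alternative (not what this commit does), reusing clean_n_panos from PanoScraper.py:

    import glob
    import multiprocessing as mp

    from PanoScraper import clean_n_panos

    if __name__ == '__main__':
        panos = glob.glob('pano-downloads/*.jpg')
        n_procs = min(mp.cpu_count(), 8)
        # stride-slice the paths into one roughly even chunk per worker
        chunks = [panos[i::n_procs] for i in range(n_procs)]
        with mp.Pool(processes=n_procs) as pool:
            pool.map(clean_n_panos, chunks)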
1 change: 1 addition & 0 deletions requirements.txt
@@ -11,3 +11,4 @@ backoff>=1.10.0
 torch
 scikit-learn
 torchvision
+# note: glob is part of the Python standard library, so no pip package is needed
13 changes: 10 additions & 3 deletions scrape_and_crop_labels.py
@@ -1,12 +1,12 @@
-from PanoScraper import bulk_scrape_panos
+from PanoScraper import bulk_scrape_panos, clean_panos
 from CropRunner import bulk_extract_crops
 
 import multiprocessing as mp
 import os
 
 if __name__ == '__main__':
     # scrape panos from SFTP server
-    n = 50
+    n = 20
     start_row = 1
     path_to_labeldata_csv = "rawdata/seattle-labels-cv-10-29-2021.csv"

@@ -21,9 +21,12 @@
     output_csv_name = 'gathered_panos.csv'
     pano_set_size, scraper_exec_time = bulk_scrape_panos(n, start_row, path_to_labeldata_csv, local_dir, remote_dir, output_csv_name)
 
+    # clean panos
+    gsv_pano_path = 'pano-downloads'
+    clean_time = clean_panos(gsv_pano_path)
+
     # crop labels with scraped panos
     csv_export_path = 'pano-downloads/gathered_panos.csv'
-    gsv_pano_path = 'pano-downloads'
     destination_path = 'crops'
     metrics = bulk_extract_crops(csv_export_path, gsv_pano_path, destination_path, mark_label=False)

@@ -33,6 +36,10 @@
     print("Elapsed time scraping {} panos for {} labels in seconds:".format(pano_set_size, n),
           scraper_exec_time)
     print()
+    print("Pano Cleaning metrics:")
+    print("Elapsed time cleaning {} panos in seconds:".format(pano_set_size),
+          clean_time)
+    print()
     print("Label Cropping metrics:")
     print(str(metrics[1]) + " successful crop extractions")
     print(str(metrics[2]) + " extractions failed because panorama image was not found.")
