-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
83 lines (70 loc) · 2.67 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image
def common_mistake(unit):
    """Map a unit string onto an allowed unit by fixing known misspellings.

    The unit is returned unchanged when it is already present in
    ``constants.allowed_units``. Otherwise each known substitution is tried
    in order ('ter' -> 'tre', then 'feet' -> 'foot') and the first rewritten
    spelling found in ``constants.allowed_units`` is returned. When nothing
    matches, the original string is returned as-is so the caller can decide
    how to report it.
    """
    allowed = constants.allowed_units
    if unit in allowed:
        return unit

    # Order matters: the first substitution producing an allowed unit wins.
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        candidate = unit.replace(wrong, right)
        if candidate in allowed:
            return candidate

    return unit
# "<number> <unit>": optional minus sign, digits, optional decimal part,
# at least one whitespace character, then letters/whitespace only.
# Compiled once at import time instead of on every call.
_VALUE_UNIT_PATTERN = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')


def parse_string(s):
    """Split a "<number> <unit>" string into a float and a validated unit.

    Args:
        s: The text to parse, e.g. ``"12.5 gram"``. ``None``, a NaN float,
            the literal text ``"nan"`` and blank strings are treated as
            "no value".

    Returns:
        A ``(number, unit)`` tuple where ``number`` is a float and ``unit``
        is a member of ``constants.allowed_units``, or ``(None, None)`` when
        there is no value.

    Raises:
        ValueError: If ``s`` is not a string, does not match the
            "<number> <unit>" format, or its unit (after ``common_mistake``
            correction) is not in ``constants.allowed_units``.
    """
    # str(s) == 'nan' covers float NaN (e.g. missing cells) as well as the
    # literal text "nan".
    if s is None or str(s) == 'nan':
        return None, None

    # Anything else that is not text used to crash on .strip() with an
    # AttributeError; report it as the same format error as malformed text.
    if not isinstance(s, str):
        raise ValueError("Invalid format in {}".format(s))

    s_stripped = s.strip()
    if s_stripped == "":
        return None, None

    if not _VALUE_UNIT_PATTERN.match(s_stripped):
        raise ValueError("Invalid format in {}".format(s))

    # Splitting on the first whitespace run leaves the number in parts[0]
    # and the (possibly multi-word) unit in parts[1].
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    if unit not in constants.allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, constants.allowed_units))
    return number, unit
def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    This is best effort: any exception raised while creating or saving the
    image is swallowed and the function simply returns ``None``.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception:
        # Deliberately ignored: a missing placeholder must not abort the caller.
        pass
def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder``, with retries and a fallback.

    The target file name is the last path component of ``image_link``. If a
    file with that name already exists in ``save_folder`` nothing is
    downloaded. When every attempt fails, ``create_placeholder_image`` is
    called for the target path so that a file still exists for this link.

    Args:
        image_link: URL of the image. Non-string values (e.g. NaN from a
            missing cell) are ignored.
        save_folder: Directory the image is written to.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        None.
    """
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``request`` submodule is loaded; relying on another package having
    # imported it is fragile.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)
    if os.path.exists(image_save_path):
        return

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt/SystemExit still stop the worker.
            # Only wait when another attempt will follow.
            if attempt < retries - 1:
                time.sleep(delay)

    # Create a placeholder image for invalid links/images.
    create_placeholder_image(image_save_path)
def download_images(image_links, download_folder, allow_multiprocessing=True, num_processes=64):
    """Download every link in ``image_links`` into ``download_folder``.

    The folder is created if it does not exist. Each link is handled by
    ``download_image`` with 3 retries and a 3 second delay, and a tqdm
    progress bar tracks completion.

    Args:
        image_links: Sized collection of image URLs (``len()`` is used for
            the progress bar total).
        download_folder: Directory the images are saved to.
        allow_multiprocessing: When true, downloads are spread over a
            ``multiprocessing.Pool``; otherwise they run one by one in the
            current process.
        num_processes: Number of worker processes in the pool. Only used
            when ``allow_multiprocessing`` is true.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race when several runs start
    # at the same time.
    os.makedirs(download_folder, exist_ok=True)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)
        with multiprocessing.Pool(num_processes) as pool:
            # imap is lazy: iterating drives the downloads and advances the
            # progress bar; the (None) results themselves are not needed.
            for _ in tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)):
                pass
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)