Skip to content

Commit

Permalink
Merge branch 'develop' of github.com:yoyonel/image_downloader_multiprocessing_python into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
yoyonel committed Feb 22, 2020
2 parents 00f96f0 + 3da9e3c commit 28eec77
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 32 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ argparse
pillow
requests
tqdm
uvloop
uvloop; sys_platform != 'win32'
96 changes: 65 additions & 31 deletions src/multiprocessing/image_downloader.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,45 @@
# -*- coding: utf-8 -*-
import sys

import argparse
import collections
import io
import logging
import pathlib
import random
from functools import partial
from multiprocessing.pool import ThreadPool
from typing import List

import requests
from PIL import Image
from multiprocessing.pool import ThreadPool

# Configure the root logger at import time with the INFO level.
logging.basicConfig(level=logging.INFO)
# Named logger shared by the functions of this module.
logger = logging.getLogger("image_downloader::multi_processing")

def get_download_location(url_input: str) -> str:
    """
    Create (if needed) and return the directory the images are saved into.

    The directory is the path of the url list file with the extension of its
    last component removed, e.g. 'cats.txt' -> 'cats'.

    Input:
        param: url_input str (path of the txt file listing the image urls)
    Returns:
        str: path of the download directory (it exists when this returns)
    """
    # Only the suffix of the final path component is removed. Cutting the whole
    # string at its first '.' turned './cats.txt' into '' and
    # 'data.v2/cats.txt' into 'data', so images were written to the wrong place.
    location = pathlib.Path(url_input).with_suffix('')
    location.mkdir(parents=True, exist_ok=True)
    # Callers build the file path by string concatenation, so return a str.
    return str(location)


def get_urls(url_input: str) -> List[str]:
    """
    Returns the list of image urls read from the txt file `url_input`.

    The file is expected to hold one url per line. Surrounding whitespace is
    removed from every line and blank lines are skipped.

    Input:
        param: url_input str (path of the txt file listing the image urls)
    Returns:
        List[str]: the urls, in file order
    """
    with open(url_input, 'r') as f:
        lines = f.read().splitlines()

    # A blank or whitespace-only line is not a url; keeping it would hand an
    # empty string to the downloader.
    images_url = [url for url in (line.strip() for line in lines) if url]

    logger.info('%s Images detected', len(images_url))
    return images_url


def image_downloader(img_url: str):
def image_downloader(img_url: str, url_input: str) -> bool:
"""
Input:
param: img_url str (Image url)
Tries to download the image url and use name provided in headers. Else it randomly picks a name
"""
print(f'Downloading: {img_url}')
logger.info(f'Downloading: {img_url}')
res = requests.get(img_url, stream=True)
# count = 1
# while res.status_code != 200 and count <= 5:
Expand All @@ -50,7 +48,7 @@ def image_downloader(img_url: str):
# count += 1
# checking the type for image
if 'image' not in res.headers.get("content-type", ''):
print("ERROR: URL doesn't appear to be an image")
logger.error("ERROR: URL doesn't appear to be an image")
return False
# Trying to read image name from response headers
try:
Expand All @@ -61,27 +59,63 @@ def image_downloader(img_url: str):
image_name = str(random.randint(11111, 99999)) + '.jpg'

i = Image.open(io.BytesIO(res.content))
download_location = get_download_location()
download_location = get_download_location(url_input)
i.save(download_location + '/' + image_name)
return f'Download complete: {img_url}'

logger.info('Download complete: %s', img_url)
return True


def run_downloader(process: int, images_url: list, url_input: str):
    """
    Download every url of `images_url` with a pool of worker threads.

    Inputs:
        process: (int) number of workers in the thread pool
        images_url:(list) list of images url
        url_input: (str) path of the txt file the urls were read from;
            forwarded to image_downloader for every url
    """
    logger.info('MESSAGE: Running %s process', process)
    download = partial(image_downloader, url_input=url_input)
    # Use the pool as a context manager so its worker threads are released
    # when the work is done (or when a download raises).
    with ThreadPool(process) as pool:
        # The individual results are not needed: a zero-length deque just
        # drains the iterator. This must happen inside the `with` block,
        # because leaving the block terminates the pool.
        # https://github.com/python/cpython/blob/v3.6.5/Modules/_collectionsmodule.c#L356
        collections.deque(pool.imap_unordered(download, images_url), maxlen=0)


def _positive_int(value: str) -> int:
    """argparse `type` callable that only accepts integers >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(
            f"must be at least 1, got {number}")
    return number


def build_parser() -> argparse.ArgumentParser:
    """
    Build the command line parser of the downloader.

    Arguments defined:
        url_input: (positional, str) txt file listing the image urls
        num_process: (optional positional, int >= 1, default 10)
        --export_dir: (str, default "cats")

    Returns:
        argparse.ArgumentParser: the configured parser
    """
    parser = argparse.ArgumentParser(
        description='MultiProcess Image downloader',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required
    parser.add_argument("url_input",
                        type=str,
                        help="txt file (example: cats.txt)")

    # Optional positional. Validated here because the value is used as the
    # size of the thread pool, which rejects anything below 1.
    parser.add_argument("num_process",
                        nargs='?',
                        type=_positive_int,
                        default=10,
                        help="Number of process")

    parser.add_argument("--export_dir",
                        type=str,
                        default="cats",
                        help="Export directory")

    return parser


def main():
    """
    Script entry point: parse the command line, read the url list file and
    download every url it contains.
    """
    args = build_parser().parse_args()

    # NOTE(review): args.export_dir is parsed but not used here; the download
    # directory is derived from args.url_input instead.
    images_url = get_urls(args.url_input)
    run_downloader(args.num_process, images_url, args.url_input)


if __name__ == "__main__":
    main()

0 comments on commit 28eec77

Please sign in to comment.