Skip to content
This repository has been archived by the owner on Oct 4, 2024. It is now read-only.

Commit

Permalink
Merge pull request #232 from gilesknap/restore-bad-ids
Browse files Browse the repository at this point in the history
Restore bad ids
  • Loading branch information
gilesknap authored May 12, 2020
2 parents 239a5b5 + 066c1d3 commit 10be8f0
Show file tree
Hide file tree
Showing 13 changed files with 184 additions and 31 deletions.
59 changes: 59 additions & 0 deletions gphotos/BadIds.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from pathlib import Path
from yaml import safe_load, safe_dump, YAMLError
from typing import Dict
import logging

log = logging.getLogger(__name__)


class BadIds:
    """Keeps a record of media items with bad IDs in a YAML file.

    The YAML file allows a user to easily investigate their list of media
    items that have failed to download. Entries persist across runs; the
    user can delete the file to retry the listed items.

    Attributes:
        items: Dict[str, dict] bad ids found, keyed by media-item id, each
            mapped to identifying attributes (path, product_url)
        bad_ids_filename: Path: file where ids are stored/read
        bad_ids_found: count of bad ids looked up via check_id_ok since
            instantiation
    """

    def __init__(self, root_folder: Path):
        self.items: Dict[str, dict] = {}
        self.bad_ids_filename: Path = root_folder / "gphotos.bad_ids.yaml"
        self.bad_ids_found: int = 0
        self.load_ids()

    def __enter__(self) -> "BadIds":
        # __exit__ alone does not satisfy the context-manager protocol;
        # provide __enter__ so `with BadIds(...)` works as intended
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # persist the list even when the context is left via an exception
        self.store_ids()

    def load_ids(self) -> None:
        """Read ids from the YAML file; absent/corrupt files yield an empty list."""
        try:
            with self.bad_ids_filename.open("r") as stream:
                # safe_load returns None for an empty file - coerce to {}
                # so self.items is always a dict
                self.items = safe_load(stream) or {}
            log.debug("bad_ids file, loaded %d bad ids", len(self.items))
        except (YAMLError, IOError):
            log.debug("no bad_ids file, bad ids list is empty")

    def store_ids(self) -> None:
        """Write the current bad-ids mapping back to the YAML file."""
        with self.bad_ids_filename.open("w") as stream:
            safe_dump(self.items, stream, default_flow_style=False)

    def add_id(self, path: str, gid: str, product_url: str, e: Exception) -> None:
        """Record a bad id with enough detail for a user to investigate it.

        Args:
            path: local relative path of the failed item
            gid: the Google media-item id that failed
            product_url: link to the item in Google Photos
            e: the exception that marked this id as bad (logged at debug level)
        """
        item = dict(path=str(path), product_url=product_url)
        self.items[gid] = item
        log.debug("BAD ID %s for %s", gid, path, exc_info=e)

    def check_id_ok(self, gid: str) -> bool:
        """Return True if gid is not on the bad list; count lookups that hit."""
        if gid in self.items:
            self.bad_ids_found += 1
            return False
        else:
            return True

    def report(self) -> None:
        """Log a summary of how many known-bad files were skipped this run."""
        if self.bad_ids_found > 0:
            log.error(
                "WARNING: skipped %d files listed in %s",
                self.bad_ids_found,
                self.bad_ids_filename,
            )
3 changes: 3 additions & 0 deletions gphotos/BaseMedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ def set_path_by_date(self, root: Path, use_flat_path: bool = False):
self._relative_folder = root / y / m

def is_video(self) -> bool:
    """True when this item's MIME type marks it as a video.

    A missing/empty mime_type is treated as not-a-video (issue #231).
    """
    mime = self.mime_type
    return bool(mime) and mime.startswith("video")

@property
Expand Down
2 changes: 1 addition & 1 deletion gphotos/Checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
class Checks:
# regex for illegal characters in file names and database queries
fix_linux = re.compile(r"[/]|[\x00-\x1f]|\x7f|\x00")
fix_windows = re.compile(r'[<>:"/\\|?*]|[\x00-\x1f]|\x7f|\x00')
fix_windows = re.compile(r'[,<>:"/\\|?*]|[\x00-\x1f]|\x7f|\x00')
fix_windows_ending = re.compile("([ .]+$)")
fix_whitespace_ending = re.compile("([ \t]+$)")
fix_unicode = re.compile(r"[^\x00-\x7F]")
Expand Down
4 changes: 2 additions & 2 deletions gphotos/GoogleAlbumsSync.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def create_album_content_links(self):
created_date = Utils.string_to_date(created)
if full_file_name.exists():
if self._use_hardlinks:
os.link(full_file_name, link_file)
os.link(full_file_name, link_file)
else:
link_file.symlink_to(relative_filename)
else:
Expand All @@ -323,7 +323,7 @@ def create_album_content_links(self):
follow_symlinks=False,
)
except PermissionError:
log.warning("cant set date on {link_file}")
log.debug(f"cant set date on {link_file}")

except (FileExistsError, UnicodeEncodeError):
log.error("bad link to %s", full_file_name)
Expand Down
51 changes: 44 additions & 7 deletions gphotos/GooglePhotosDownload.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from gphotos.restclient import RestClient
from gphotos.DatabaseMedia import DatabaseMedia
from gphotos.GooglePhotosRow import GooglePhotosRow
from gphotos.BadIds import BadIds

from itertools import zip_longest
from typing import Iterable, Mapping, Union, List
Expand All @@ -23,8 +24,8 @@
from urllib3.util.retry import Retry

try:
import win32file
import win32con
import win32file # noqa
import win32con # noqa

_use_win_32 = True
except ImportError:
Expand Down Expand Up @@ -72,6 +73,7 @@ def __init__(
# attributes related to multi-threaded download
self.download_pool = futures.ThreadPoolExecutor(max_workers=self.max_threads)
self.pool_future_to_media = {}
self.bad_ids = BadIds(self._root_folder)

self.current_umask = os.umask(7)
os.umask(self.current_umask)
Expand Down Expand Up @@ -140,7 +142,7 @@ def grouper(
)
self._db.put_downloaded(media_item.id)

else:
elif self.bad_ids.check_id_ok(media_item.id):
batch[media_item.id] = media_item
if not local_folder.is_dir():
local_folder.mkdir(parents=True)
Expand All @@ -157,6 +159,8 @@ def grouper(
self.files_download_failed,
self.files_download_skipped,
)
self.bad_ids.store_ids()
self.bad_ids.report()
return self.files_downloaded

def download_batch(self, batch: Mapping[str, DatabaseMedia]):
Expand Down Expand Up @@ -186,6 +190,8 @@ def download_batch(self, batch: Mapping[str, DatabaseMedia]):
else:
media_item = batch.get(media_item_json["id"])
self.download_file(media_item, media_item_json)
except RequestException:
self.find_bad_items(batch)

except KeyboardInterrupt:
log.warning("Cancelling download threads ...")
Expand Down Expand Up @@ -306,10 +312,19 @@ def do_download_complete(
media_item.relative_path,
e,
)
# do not raise any errors in invididual downloads
# carry on and try to download the rest
# NOTE: this may cause errors to spew out when
# there is some fatal issue, like network disconnected
# treat API errors as possibly transient. Report them above in
# log.error but do not raise them. Other exceptions will raise
# up to the root handler and abort. Note that all retry logic is
# already handled in urllib3

# Items that cause API errors go in a BadIds file which must
# be deleted to retry these items.
if isinstance(e, RequestException):
self.bad_ids.add_id(
media_item.relative_path, media_item.id, media_item.url, e
)
else:
raise e
else:
self._db.put_downloaded(media_item.id)
self.files_downloaded += 1
Expand All @@ -321,3 +336,25 @@ def do_download_complete(
if self.settings.progress and self.files_downloaded % 10 == 0:
log.warning(f"Downloaded {self.files_downloaded} items ...\033[F")
del self.pool_future_to_media[future]

def find_bad_items(self, batch: Mapping[str, DatabaseMedia]):
    """Retry every member of a failed batch get individually.

    A batch get failed; fetching each of its contents one at a time lets
    us work out which ID(s) caused the failure. Items that still fail are
    recorded in the bad-ids list and counted as download failures.
    """
    for gid, media in batch.items():
        try:
            log.debug("BAD ID Retry on %s (%s)", gid, media.relative_path)
            item_json = self._api.mediaItems.get.execute(mediaItemId=gid).json()
            self.download_file(media, item_json)
        except RequestException as err:
            # it fails on its own too - this id is genuinely bad
            self.bad_ids.add_id(str(media.relative_path), media.id, media.url, err)
            self.files_download_failed += 1
            log.error(
                "FAILURE %d in get of %s BAD ID",
                self.files_download_failed,
                media.relative_path,
            )
2 changes: 1 addition & 1 deletion gphotos/authorize.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def authorize(self):
# set up the retry behaviour for the authorized session
retries = Retry(
total=self.max_retries,
backoff_factor=0.2,
backoff_factor=0.5,
status_forcelist=[500, 502, 503, 504],
method_whitelist=frozenset(["GET", "POST"]),
raise_on_status=False,
Expand Down
5 changes: 2 additions & 3 deletions test/test-units/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,9 @@ def test_base_media(self):

s = ts.SetupDbAndCredentials()
args = [
"--archived",
"--skip-albums",
"--start-date",
"2019-10-01",
"2020-01-01",
"--use-flat-path",
]
s.test_setup("test_base_media", args=args, trash_files=True, trash_db=True)
Expand All @@ -114,7 +113,7 @@ def test_base_media(self):
count = db.cur.fetchone()
self.assertEqual(1, count[0])

pat = str(photos_root / "2019-11" / "*.*")
pat = str(photos_root / "2020-04" / "*.*")
files = sorted(s.root.glob(pat))
self.assertEqual(1, len(files))

Expand Down
5 changes: 3 additions & 2 deletions test/test-units/test_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ def test_http_500_retries(self):

start = datetime.now()

result = a.session.get("https://httpbin.org/status/500", timeout=10)
result = a.session.get("https://httpbin.org/status/500", timeout=30)
self.assertEqual(result.status_code, 500)
elapsed = datetime.now() - start
# timeout should not affect the 5 retries
self.assertLess(elapsed.seconds, 10)
# but backoff_factor=0.5 should
self.assertLess(elapsed.seconds, 30)

def test_download_timeout(self):
a = auth.Authorize(scope, token_file, secrets_file)
Expand Down
21 changes: 12 additions & 9 deletions test/test_account.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,20 @@


class TestAccount:
latest_date = date(2017, 9, 26)
latest_date = date(2020, 4, 26)

image_years = [2019, 2017, 2016, 2015, 2014, 2001, 2000, 1998, 1965]
image_years = [2020, 2019, 2017, 2016, 2015, 2014, 2001, 2000, 1998, 1965]
# 10 images in each of the years in the test data
# plus 5 shared items in the 2017 shared album
images_per_year = [0, 10, 10, 10, 10, 10, 10, 10, 10]
shared_images_per_year = [0, 5, 0, 0, 0, 0, 0, 0, 0]
shared_album_images_per_year = [6, 0, 0, 0, 0, 0, 0, 0, 0]
videos_per_year = [0, 10, 0, 0, 0, 0, 0, 0, 0, 0]
images_per_year = [1, 0, 10, 10, 10, 10, 10, 10, 10, 10]
shared_images_per_year = [0, 0, 5, 0, 0, 0, 0, 0, 0, 0]
shared_album_images_per_year = [0, 6, 0, 0, 0, 0, 0, 0, 0, 0]
videos_per_year = [0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0]

image_count = sum(images_per_year)
shared_image_count = sum(shared_images_per_year)
video_count = sum(videos_per_year)
total_count = image_count + video_count

# shared test album has 'show in albums' so does appear in our albums list
# 5 of its files are ours and 5 shared by the real giles knap
Expand All @@ -27,10 +28,11 @@ class TestAccount:
r"0923?Clones😀",
r"0926?Album?2016",
r"1207?Same?Names",
r"0426?Name?with?Comma",
]
album_years = [2019, 2001, 2017, 2017, 2016, 2014]
album_images = [5, 10, 10, 4, 16, 10]
album_shared_images = [5, 0, 0, 0, 0, 0]
album_years = [2019, 2001, 2017, 2017, 2016, 2014, 2020]
album_images = [5, 10, 10, 4, 16, 10, 1]
album_shared_images = [5, 0, 0, 0, 0, 0, 0]
album_count = len(album_names)
album_image_count = sum(album_images)
album_shared_image_count = sum(album_shared_images)
Expand All @@ -47,3 +49,4 @@ class TestAccount:
end = "2017-01-01"
image_count_2016 = 10
item_count_2017 = 20
item_count_2020 = 1
2 changes: 1 addition & 1 deletion test/test_credentials/.gphotos.token
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"access_token": "ya29.a0Ae4lvC1ZBRSQCX8BWo5pWXFmLuxo50RbV3jqzHAcC6_uNeuNJh5gZI3fyoDbYOZkZg5S5Xjup5XuLFTOj7318Lxl7sZidUI6ECpyiqawDqyjzqI-0oRoXLklZYUTA3A6NJu12JXc1crIBbGkybJz0FCGnhFOF3FBXknR", "expires_in": 3599, "scope": ["https://www.googleapis.com/auth/photoslibrary.sharing", "https://www.googleapis.com/auth/photoslibrary.readonly"], "token_type": "Bearer", "expires_at": 1587844377.5515726, "refresh_token": "1//03Yu7zu9SJvECCgYIARAAGAMSNwF-L9IrQJfYtnhVFpOHX_0Qq2-O2si6CtUvk9MDJ3mF7CH5bUqqFiSoeXxvnQE4mfD_k01wBtM"}
{"access_token": "ya29.a0AfH6SMD35EBX1lN2t6VhxFeqhrnjc90Z28Go4RY2V7EYVH2biHQtpidr2-dC6VPvY5C6tbQmLAUGuI1Ba6ArQWv0tLfQlYenNGNyNuNwMlsTBqi97ozirEYV86Bgegdrav592HIk0Azf7q1TIzh2_gqDBBRjWQc4cUYT", "expires_in": 3599, "scope": ["https://www.googleapis.com/auth/photoslibrary.sharing", "https://www.googleapis.com/auth/photoslibrary.readonly"], "token_type": "Bearer", "expires_at": 1589309491.7933416, "refresh_token": "1//03Yu7zu9SJvECCgYIARAAGAMSNwF-L9IrQJfYtnhVFpOHX_0Qq2-O2si6CtUvk9MDJ3mF7CH5bUqqFiSoeXxvnQE4mfD_k01wBtM"}
3 changes: 2 additions & 1 deletion test/test_system/test_network.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from unittest.mock import patch, Mock
from requests.exceptions import HTTPError
from requests import Session
from test.test_account import TestAccount

from gphotos.LocalData import LocalData
import test.test_setup as ts
Expand Down Expand Up @@ -36,7 +37,7 @@ def test_max_retries_hit(self):

db.cur.execute("SELECT COUNT() FROM SyncFiles")
count = db.cur.fetchone()
self.assertEqual(90, count[0])
self.assertEqual(TestAccount.total_count, count[0])

pat = str(photos_root / "*" / "*" / "*")
self.assertEqual(
Expand Down
2 changes: 2 additions & 0 deletions test/test_system/test_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def test_retries_500(self):
status_forcelist=[500, 502, 503, 504],
method_whitelist=frozenset(["GET", "POST"]),
raise_on_status=False,
respect_retry_after_header=True,
)

session.mount("https://", HTTPAdapter(max_retries=retry))
Expand All @@ -54,6 +55,7 @@ def test_retries_timeout(self):
status_forcelist=[500, 502, 503, 504],
method_whitelist=frozenset(["GET", "POST"]),
raise_on_status=False,
respect_retry_after_header=True,
)

session.mount("https://", HTTPAdapter(max_retries=retry))
Expand Down
Loading

0 comments on commit 10be8f0

Please sign in to comment.