diff --git a/tests/importers/test_lux_importer.py b/tests/importers/test_lux_importer.py index 24ae1954..a06242e4 100644 --- a/tests/importers/test_lux_importer.py +++ b/tests/importers/test_lux_importer.py @@ -24,7 +24,8 @@ def test_import_issues(): issues = lux_detect_issues(inp_dir) assert issues is not None - import_issues(issues, out_dir, s3_bucket=output_bucket, issue_class=LuxNewspaperIssue, image_dirs=None, temp_dir=None) + import_issues(issues, out_dir, s3_bucket=output_bucket, issue_class=LuxNewspaperIssue, + image_dirs=None, temp_dir=None, chunk_size=None) def test_selective_import(): @@ -60,7 +61,8 @@ def test_selective_import(): assert issues is not None and len(issues) > 0 logger.info(f'There are {len(issues)} to ingest') - import_issues(issues, out_dir, s3_bucket=None, issue_class=LuxNewspaperIssue, image_dirs=None, temp_dir=None) + import_issues(issues, out_dir, s3_bucket=None, issue_class=LuxNewspaperIssue, + image_dirs=None, temp_dir=None, chunk_size=None) # # TODO: adapt it to Lux data # def test_verify_imported_issues(): diff --git a/tests/importers/test_olive_importer.py b/tests/importers/test_olive_importer.py index 431134eb..364352f5 100644 --- a/tests/importers/test_olive_importer.py +++ b/tests/importers/test_olive_importer.py @@ -40,7 +40,8 @@ def test_import_issues(): s3_bucket=None, issue_class=OliveNewspaperIssue, image_dirs="/mnt/project_impresso/images/", - temp_dir=pkg_resources.resource_filename('text_importer', 'data/temp/') + temp_dir=pkg_resources.resource_filename('text_importer', 'data/temp/'), + chunk_size=None ) print(result) diff --git a/tests/importers/test_rero_importer.py b/tests/importers/test_rero_importer.py index b15fcf55..bced587b 100644 --- a/tests/importers/test_rero_importer.py +++ b/tests/importers/test_rero_importer.py @@ -36,7 +36,8 @@ def test_import_issues(): s3_bucket=None, issue_class=ReroNewspaperIssue, temp_dir=None, - image_dirs=None + image_dirs=None, + chunk_size=None ) print(result) diff --git a/text_importer/__init__.py b/text_importer/__init__.py index d69d16e9..a2fecb45 100644 --- a/text_importer/__init__.py +++ b/text_importer/__init__.py @@ -1 +1 @@ -__version__ = "0.9.1" +__version__ = "0.9.2" diff --git a/text_importer/importers/core.py b/text_importer/importers/core.py index dd04e24a..6d10e1a7 100644 --- a/text_importer/importers/core.py +++ b/text_importer/importers/core.py @@ -37,6 +37,17 @@ logger = logging.getLogger(__name__) +def write_error(issue, error, failed_log): + logger.error(f'Error when processing {issue}: {error}') + logger.exception(error) + note = ( + f"{canonical_path(issue, path_type='dir').replace('/', '-')}: " + f"{error}" + ) + with open(failed_log, "a+") as f: + f.write(note + "\n") + + def dir2issue( issue: IssueDir, issue_class: Type[NewspaperIssue], @@ -54,14 +65,7 @@ def dir2issue( np_issue = issue_class(issue) return np_issue except Exception as e: - logger.error(f'Error when processing issue {issue}: {e}') - logger.exception(e) - note = ( - f"{canonical_path(issue, path_type='dir').replace('/', '-')}: " - f"{e}" - ) - with open(failed_log, "a+") as f: - f.write(note + "\n") + write_error(issue, e, failed_log) return None @@ -139,10 +143,11 @@ def serialize_pages( return result -def process_pages(pages: List[NewspaperPage]) -> List[NewspaperPage]: +def process_pages(pages: List[NewspaperPage], failed_log: str) -> List[NewspaperPage]: """Given a list of pages, trigger the ``.parse()`` method of each page. :param List[NewspaperPage] pages: Input newspaper pages. + :param str failed_log: File path of failed log :return: A list of processed pages. :rtype: List[NewspaperPage] @@ -154,8 +159,7 @@ def process_pages(pages: List[NewspaperPage]) -> List[NewspaperPage]: page.parse() result.append(page) except Exception as e: - logger.error(f'Error when processing page {page.id}: {e}') - # logger.exception(e) + write_error(page, e, failed_log) return result @@ -164,8 +168,8 @@ def import_issues( out_dir: str, s3_bucket: Optional[str], issue_class: Type[NewspaperIssue], - image_dirs: str, - temp_dir: str, + image_dirs: Optional[str], + temp_dir: Optional[str], chunk_size: Optional[int]): """Import a bunch of newspaper issues. @@ -226,7 +230,7 @@ def import_issues( pages_bag = db.from_sequence(chunk_of_issues, partition_size=2) \ .map(issue2pages) \ .flatten() \ - .map_partitions(process_pages) \ + .map_partitions(process_pages, failed_log=failed_log_path) \ .map_partitions(serialize_pages, output_dir=out_dir) pages_out_dir = os.path.join(out_dir, 'pages') diff --git a/text_importer/importers/lux/classes.py b/text_importer/importers/lux/classes.py index 9fa0b8e1..3807c3f5 100644 --- a/text_importer/importers/lux/classes.py +++ b/text_importer/importers/lux/classes.py @@ -15,7 +15,7 @@ CONTENTITEM_TYPE_TABLE, CONTENTITEM_TYPE_WEATHER) from text_importer.importers.lux.helpers import convert_coordinates, encode_ark -from text_importer.importers.mets_alto import (MetsAltoNewPaperIssue, +from text_importer.importers.mets_alto import (MetsAltoNewspaperIssue, MetsAltoNewspaperPage, parse_mets_amdsec) from text_importer.utils import get_issue_schema, get_page_schema @@ -30,7 +30,7 @@ class LuxNewspaperPage(MetsAltoNewspaperPage): """Class representing a page in BNL data.""" - def add_issue(self, issue: MetsAltoNewPaperIssue): + def add_issue(self, issue: MetsAltoNewspaperIssue): self.issue = issue encoded_ark_id = encode_ark(self.issue.ark_id) iiif_base_link = f'{IIIF_ENDPOINT_URL}/{encoded_ark_id}' @@ -76,7 +76,7 @@ def _convert_coordinates(self, page_data: List[dict]) -> Tuple[bool, List[dict]] return success, page_data -class LuxNewspaperIssue(MetsAltoNewPaperIssue): +class LuxNewspaperIssue(MetsAltoNewspaperIssue): """Class representing an issue in BNL data. All functions defined in this child class are specific to parsing BNL Mets/Alto format """ diff --git a/text_importer/importers/mets_alto/__init__.py b/text_importer/importers/mets_alto/__init__.py index 7ebc576b..c143adaf 100644 --- a/text_importer/importers/mets_alto/__init__.py +++ b/text_importer/importers/mets_alto/__init__.py @@ -1,2 +1,2 @@ -from text_importer.importers.mets_alto.classes import MetsAltoNewspaperPage, MetsAltoNewPaperIssue +from text_importer.importers.mets_alto.classes import MetsAltoNewspaperPage, MetsAltoNewspaperIssue from text_importer.importers.mets_alto.mets import parse_mets_amdsec diff --git a/text_importer/importers/mets_alto/classes.py b/text_importer/importers/mets_alto/classes.py index d65d2ab4..1211eef9 100644 --- a/text_importer/importers/mets_alto/classes.py +++ b/text_importer/importers/mets_alto/classes.py @@ -78,7 +78,7 @@ def parse(self): ) -class MetsAltoNewPaperIssue(NewspaperIssue): +class MetsAltoNewspaperIssue(NewspaperIssue): """Generic class representing a newspaper issue in Mets/Alto format. .. note :: diff --git a/text_importer/importers/rero/classes.py b/text_importer/importers/rero/classes.py index 6e14e0ae..03ec5ed5 100644 --- a/text_importer/importers/rero/classes.py +++ b/text_importer/importers/rero/classes.py @@ -7,7 +7,7 @@ from bs4.element import NavigableString, Tag from text_importer.importers import CONTENTITEM_TYPE_IMAGE, CONTENTITEM_TYPES -from text_importer.importers.mets_alto import (MetsAltoNewPaperIssue, +from text_importer.importers.mets_alto import (MetsAltoNewspaperIssue, MetsAltoNewspaperPage, parse_mets_amdsec) from text_importer.utils import get_issue_schema, get_page_schema @@ -37,7 +37,7 @@ def _convert_coordinates(self, page_data): return False, page_data -class ReroNewspaperIssue(MetsAltoNewPaperIssue): +class ReroNewspaperIssue(MetsAltoNewspaperIssue): """Class representing an issue in RERO (Mets/Alto) data. All functions defined in this child class are specific to parsing RERO Mets/Alto format """