
Merge pull request #65 from impresso/Bugfixes
Matteo Romanello authored Sep 13, 2019
2 parents cfd2cbe + 425f07a commit 5cda20e
Showing 9 changed files with 34 additions and 26 deletions.
6 changes: 4 additions & 2 deletions tests/importers/test_lux_importer.py
@@ -24,7 +24,8 @@ def test_import_issues():

issues = lux_detect_issues(inp_dir)
assert issues is not None
-import_issues(issues, out_dir, s3_bucket=output_bucket, issue_class=LuxNewspaperIssue, image_dirs=None, temp_dir=None)
+import_issues(issues, out_dir, s3_bucket=output_bucket, issue_class=LuxNewspaperIssue,
+              image_dirs=None, temp_dir=None, chunk_size=None)


def test_selective_import():
@@ -60,7 +61,8 @@ def test_selective_import():
assert issues is not None and len(issues) > 0

logger.info(f'There are {len(issues)} to ingest')
-import_issues(issues, out_dir, s3_bucket=None, issue_class=LuxNewspaperIssue, image_dirs=None, temp_dir=None)
+import_issues(issues, out_dir, s3_bucket=None, issue_class=LuxNewspaperIssue,
+              image_dirs=None, temp_dir=None, chunk_size=None)

# # TODO: adapt it to Lux data
# def test_verify_imported_issues():
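For reference, a minimal sketch of the updated import_issues call these tests exercise, including the new chunk_size keyword. The directories are placeholders and the lux_detect_issues import path is an assumption rather than something shown in this diff:

from text_importer.importers.core import import_issues
from text_importer.importers.lux.classes import LuxNewspaperIssue
from text_importer.importers.lux.detect import detect_issues as lux_detect_issues  # assumed module path

# Placeholder directories; point these at a local BNL sample and an output folder.
inp_dir = "/path/to/bnl/sample"
out_dir = "/path/to/canonical/out"

issues = lux_detect_issues(inp_dir)

# chunk_size is now part of the call; None presumably disables chunking.
import_issues(
    issues,
    out_dir,
    s3_bucket=None,
    issue_class=LuxNewspaperIssue,
    image_dirs=None,
    temp_dir=None,
    chunk_size=None,
)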
3 changes: 2 additions & 1 deletion tests/importers/test_olive_importer.py
@@ -40,7 +40,8 @@ def test_import_issues():
s3_bucket=None,
issue_class=OliveNewspaperIssue,
image_dirs="/mnt/project_impresso/images/",
-temp_dir=pkg_resources.resource_filename('text_importer', 'data/temp/')
+temp_dir=pkg_resources.resource_filename('text_importer', 'data/temp/'),
+chunk_size=None
)
print(result)

3 changes: 2 additions & 1 deletion tests/importers/test_rero_importer.py
@@ -36,7 +36,8 @@ def test_import_issues():
s3_bucket=None,
issue_class=ReroNewspaperIssue,
temp_dir=None,
-image_dirs=None
+image_dirs=None,
+chunk_size=None
)
print(result)

2 changes: 1 addition & 1 deletion text_importer/__init__.py
@@ -1 +1 @@
-__version__ = "0.9.1"
+__version__ = "0.9.2"
32 changes: 18 additions & 14 deletions text_importer/importers/core.py
@@ -37,6 +37,17 @@
logger = logging.getLogger(__name__)


+def write_error(issue, error, failed_log):
+logger.error(f'Error when processing {issue}: {error}')
+logger.exception(error)
+note = (
+f"{canonical_path(issue, path_type='dir').replace('/', '-')}: "
+f"{error}"
+)
+with open(failed_log, "a+") as f:
+f.write(note + "\n")


def dir2issue(
issue: IssueDir,
issue_class: Type[NewspaperIssue],
@@ -54,14 +65,7 @@ def dir2issue(
np_issue = issue_class(issue)
return np_issue
except Exception as e:
-logger.error(f'Error when processing issue {issue}: {e}')
-logger.exception(e)
-note = (
-f"{canonical_path(issue, path_type='dir').replace('/', '-')}: "
-f"{e}"
-)
-with open(failed_log, "a+") as f:
-f.write(note + "\n")
+write_error(issue, e, failed_log)
return None


@@ -139,10 +143,11 @@ def serialize_pages(
return result


-def process_pages(pages: List[NewspaperPage]) -> List[NewspaperPage]:
+def process_pages(pages: List[NewspaperPage], failed_log: str) -> List[NewspaperPage]:
"""Given a list of pages, trigger the ``.parse()`` method of each page.
:param List[NewspaperPage] pages: Input newspaper pages.
+:param str failed_log: File path of failed log
:return: A list of processed pages.
:rtype: List[NewspaperPage]
@@ -154,8 +159,7 @@ def process_pages(pages: List[NewspaperPage]) -> List[NewspaperPage]:
page.parse()
result.append(page)
except Exception as e:
-logger.error(f'Error when processing page {page.id}: {e}')
-# logger.exception(e)
+write_error(page, e, failed_log)
return result


@@ -164,8 +168,8 @@ def import_issues(
out_dir: str,
s3_bucket: Optional[str],
issue_class: Type[NewspaperIssue],
-image_dirs: str,
-temp_dir: str,
+image_dirs: Optional[str],
+temp_dir: Optional[str],
chunk_size: Optional[int]):
"""Import a bunch of newspaper issues.
@@ -226,7 +230,7 @@ def import_issues(
pages_bag = db.from_sequence(chunk_of_issues, partition_size=2) \
.map(issue2pages) \
.flatten() \
-.map_partitions(process_pages) \
+.map_partitions(process_pages, failed_log=failed_log_path) \
.map_partitions(serialize_pages, output_dir=out_dir)

pages_out_dir = os.path.join(out_dir, 'pages')
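The pipeline change above works because dask.bag's map_partitions forwards extra keyword arguments to the mapped function; that is how failed_log reaches process_pages, just as output_dir reaches serialize_pages. A standalone sketch of this forwarding pattern on toy data (the function and label are illustrative, not part of the importer):

import dask.bag as db


def tag_partition(items, label):
    # Extra keyword arguments passed to map_partitions arrive here unchanged.
    return [f"{label}:{item}" for item in items]


bag = db.from_sequence(range(6), partition_size=2)
print(bag.map_partitions(tag_partition, label="chunk").compute())
# Expected output: the six tagged items, chunk:0 through chunk:5.

This mirrors .map_partitions(process_pages, failed_log=failed_log_path) in the hunk above.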
6 changes: 3 additions & 3 deletions text_importer/importers/lux/classes.py
@@ -15,7 +15,7 @@
CONTENTITEM_TYPE_TABLE,
CONTENTITEM_TYPE_WEATHER)
from text_importer.importers.lux.helpers import convert_coordinates, encode_ark
-from text_importer.importers.mets_alto import (MetsAltoNewPaperIssue,
+from text_importer.importers.mets_alto import (MetsAltoNewspaperIssue,
MetsAltoNewspaperPage,
parse_mets_amdsec)
from text_importer.utils import get_issue_schema, get_page_schema
@@ -30,7 +30,7 @@
class LuxNewspaperPage(MetsAltoNewspaperPage):
"""Class representing a page in BNL data."""

-def add_issue(self, issue: MetsAltoNewPaperIssue):
+def add_issue(self, issue: MetsAltoNewspaperIssue):
self.issue = issue
encoded_ark_id = encode_ark(self.issue.ark_id)
iiif_base_link = f'{IIIF_ENDPOINT_URL}/{encoded_ark_id}'
@@ -76,7 +76,7 @@ def _convert_coordinates(self, page_data: List[dict]) -> Tuple[bool, List[dict]]
return success, page_data


-class LuxNewspaperIssue(MetsAltoNewPaperIssue):
+class LuxNewspaperIssue(MetsAltoNewspaperIssue):
"""Class representing an issue in BNL data.
All functions defined in this child class are specific to parsing BNL Mets/Alto format
"""
2 changes: 1 addition & 1 deletion text_importer/importers/mets_alto/__init__.py
@@ -1,2 +1,2 @@
-from text_importer.importers.mets_alto.classes import MetsAltoNewspaperPage, MetsAltoNewPaperIssue
+from text_importer.importers.mets_alto.classes import MetsAltoNewspaperPage, MetsAltoNewspaperIssue
from text_importer.importers.mets_alto.mets import parse_mets_amdsec
2 changes: 1 addition & 1 deletion text_importer/importers/mets_alto/classes.py
@@ -78,7 +78,7 @@ def parse(self):
)


-class MetsAltoNewPaperIssue(NewspaperIssue):
+class MetsAltoNewspaperIssue(NewspaperIssue):
"""Generic class representing a newspaper issue in Mets/Alto format.
.. note ::
4 changes: 2 additions & 2 deletions text_importer/importers/rero/classes.py
@@ -7,7 +7,7 @@
from bs4.element import NavigableString, Tag

from text_importer.importers import CONTENTITEM_TYPE_IMAGE, CONTENTITEM_TYPES
-from text_importer.importers.mets_alto import (MetsAltoNewPaperIssue,
+from text_importer.importers.mets_alto import (MetsAltoNewspaperIssue,
MetsAltoNewspaperPage,
parse_mets_amdsec)
from text_importer.utils import get_issue_schema, get_page_schema
@@ -37,7 +37,7 @@ def _convert_coordinates(self, page_data):
return False, page_data


-class ReroNewspaperIssue(MetsAltoNewPaperIssue):
+class ReroNewspaperIssue(MetsAltoNewspaperIssue):
"""Class representing an issue in RERO (Mets/Alto) data.
All functions defined in this child class are specific to parsing RERO Mets/Alto format
"""