Skip to content

Commit

Permalink
Merge pull request #62 from impresso/Lux-Bugfix
Browse files Browse the repository at this point in the history
Lux bugfix

- notably fixes the problem of non-unique canonical IDs being created (for `luxwort`)
  • Loading branch information
Matteo Romanello authored Sep 13, 2019
2 parents 5cda20e + 6529d15 commit 1628e1d
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
3 changes: 2 additions & 1 deletion text_importer/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
__version__ = "0.9.2"
__version__ = "0.9.3"

11 changes: 6 additions & 5 deletions text_importer/importers/lux/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,19 +126,18 @@ def _parse_mets_sections(self, mets_doc):
sections,
key=lambda elem: elem.get('ID').split("_")[1]
)

for item_counter, section in enumerate(sections):
counter = 1
for section in sections:

section_id = section.get('ID')

if 'ARTICLE' in section_id:
item_counter += 1
lang = section.find_all('languageTerm')[0].getText()
title_elements = section.find_all('titleInfo')
item_title = title_elements[0].getText().replace('\n', ' ') \
.strip() if len(title_elements) > 0 else None
metadata = {
'id': "{}-i{}".format(self.id, str(item_counter).zfill(4)),
'id': "{}-i{}".format(self.id, str(counter).zfill(4)),
'l': lang,
'tp': CONTENTITEM_TYPE_ARTICLE,
'pp': []
Expand All @@ -154,6 +153,7 @@ def _parse_mets_sections(self, mets_doc):
}
}
content_items.append(item)
counter += 1
elif 'PICT' in section_id:
# TODO: keep language (there may be more than one)
title_elements = section.find_all('titleInfo')
Expand All @@ -162,7 +162,7 @@ def _parse_mets_sections(self, mets_doc):

# TODO: how to get language information for these CIs ?
metadata = {
'id': "{}-i{}".format(self.id, str(item_counter).zfill(4)),
'id': "{}-i{}".format(self.id, str(counter).zfill(4)),
'tp': CONTENTITEM_TYPE_IMAGE,
'pp': []
}
Expand All @@ -176,6 +176,7 @@ def _parse_mets_sections(self, mets_doc):
}
}
content_items.append(item)
counter += 1
return content_items

def _parse_structmap_divs(self, mets_doc, start_counter):
Expand Down
37 changes: 26 additions & 11 deletions text_importer/importers/lux/detect.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import os
from collections import namedtuple
from datetime import date
from typing import List
from typing import List, Optional

from dask import bag as db
from impresso_commons.path.path_fs import _apply_datefilter

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -62,13 +63,13 @@ def dir2issue(path: str) -> LuxIssueDir:
issue_date = issue_dir.split('_')[3]
year, month, day = issue_date.split('-')
rights = 'open_public' if 'public_domain' in path else 'closed'

if len(issue_dir.split('_')) == 4:
edition = 'a'
elif len(issue_dir.split('_')) == 5:
edition = issue_dir.split('_')[4]
edition = EDITIONS_MAPPINGS[int(edition)]

return LuxIssueDir(
local_id,
date(int(year), int(month), int(day)),
Expand Down Expand Up @@ -102,7 +103,7 @@ def detect_issues(base_dir: str, access_rights: str = None) -> List[LuxIssueDir]
]


def select_issues(base_dir: str, config: dict, access_rights: str) -> List[LuxIssueDir]:
def select_issues(base_dir: str, config: dict, access_rights: str) -> Optional[List[LuxIssueDir]]:
"""Detect selectively newspaper issues to import.
The behavior is very similar to :func:`detect_issues` with the only
Expand All @@ -115,16 +116,30 @@ def select_issues(base_dir: str, config: dict, access_rights: str) -> List[LuxIs
:param str access_rights: Not used for this imported, but argument is kept for normality
:return: List of `LuxIssueDir` instances, to be imported.
"""
issues = detect_issues(base_dir)

try:
filter_dict = config["newspapers"]
exclude_list = config["exclude_newspapers"]
year_flag = config["year_only"]

except KeyError:
logger.critical(f"The key [newspapers|exclude_newspapers|year_only] is missing in the config file.")
return

issues = detect_issues(base_dir, access_rights)
issue_bag = db.from_sequence(issues)
selected_issues = issue_bag \
.filter(lambda i: i.journal in config['newspapers'].keys()) \
.filter(lambda i: (len(filter_dict) == 0 or i.journal in filter_dict.keys()) and i.journal not in exclude_list) \
.compute()


exclude_flag = False if not exclude_list else True
filtered_issues = _apply_datefilter(filter_dict, selected_issues,
year_only=year_flag) if not exclude_flag else selected_issues
logger.info(
"{} newspaper issues remained after applying filter: {}".format(
len(selected_issues),
selected_issues
)
len(filtered_issues),
filtered_issues
)
return selected_issues
)
return filtered_issues

0 comments on commit 1628e1d

Please sign in to comment.