Merge branch 'develop'
gjost committed Oct 24, 2023
2 parents 3d9a76f + b2693a7 commit e56a2fe
Showing 7 changed files with 142 additions and 39 deletions.
58 changes: 58 additions & 0 deletions bin/ddrfar.md
@@ -0,0 +1,58 @@
# Make CSV linking FAR facility codes to FAR document scan files

This script produces a CSV file linking the FAR facility codes used in `namesdb-editor` and elsewhere to individual `File` IDs.

The Final Accountability Rosters Collection (ddr-densho-305) contains scans of
the original FAR documents.
Each `Entity` represents one camp's set of documents.
Each `File` represents a single scanned page.

``` python
FAR_FACILITIES = [
    '1-topaz',
    '2-poston',
    '3-gilariver',
    '4-amache',
    '5-heartmountain',
    '6-jerome',
    '7-manzanar',
    '8-minidoka',
    '9-rohwer',
    '10-tulelake',
]

FACILITY_DDRCOLLECTION = {
    '1-topaz': 'ddr-densho-305-1',
    '2-poston': 'ddr-densho-305-2',
    '3-gilariver': 'ddr-densho-305-3',
    '4-amache': 'ddr-densho-305-4',
    '5-heartmountain': 'ddr-densho-305-5',
    '6-jerome': 'ddr-densho-305-6',
    '7-manzanar': 'ddr-densho-305-7',
    '8-minidoka': 'ddr-densho-305-8',
    '9-rohwer': 'ddr-densho-305-9',
    '10-tulelake': 'ddr-densho-305-10',
}

BASEDIR = '/var/www/media/ddr'
OUTPUT_FILE = '/tmp/ddr-far.csv'

from pathlib import Path
from DDR import config
from DDR import fileio
from DDR import identifier

headers = ['facility', 'page', 'file_id', 'file_label']
lines = [fileio.write_csv_str(headers)]
for facility in FAR_FACILITIES:
    eid = FACILITY_DDRCOLLECTION[facility]
    entity = identifier.Identifier(eid, BASEDIR).object()
    for n, file_ in enumerate(entity.children()):
        # skip the first child; n then doubles as the 1-based page number
        if n == 0:
            continue
        row = [facility, n, file_.identifier.id, file_.label]
        line = fileio.write_csv_str(row)
        lines.append(line)

with Path(OUTPUT_FILE).open('w') as f:
    f.write('\n'.join(lines))
```
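
As a quick sanity check (a minimal sketch, assuming the snippet above has already written `/tmp/ddr-far.csv` on a machine with the DDR media store mounted), the output can be read back with the standard library and summarized per facility:

``` python
import csv
from collections import Counter

with open('/tmp/ddr-far.csv', newline='') as f:
    rows = list(csv.DictReader(f))

# one row per scanned FAR page; count the pages recorded for each facility
print(Counter(row['facility'] for row in rows))
```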
2 changes: 1 addition & 1 deletion ddr/DDR/cli/ddrindex.py
@@ -270,7 +270,7 @@ def destroy(confirm, host):
+ "will be DESTROYED!"
)
for index in identifier.ELASTICSEARCH_CLASSES['all']:
click.echo(f"- {index['doc_type']}")
click.echo(f"- {index}")
else:
click.echo(
f"Add '--confirm' to destroy the {cluster} cluster ({ds.host})."
12 changes: 10 additions & 2 deletions ddr/DDR/commands.py
@@ -167,6 +167,12 @@ def clone(user_name, user_mail, identifier, dest_path):
@return: message ('ok' if successful)
"""
git_url = '{}:{}.git'.format(config.GITOLITE, identifier.id)
# Raise exception if repository does not exist
gitolite = dvcs.Gitolite(config.GITOLITE)
gitolite.initialize()
if not identifier.id in gitolite.repos():
raise Exception(f'{git_url} must be created before it can be cloned.')
# clone from Gitolite
repo = git.Repo.clone_from(git_url, dest_path)
logging.debug(' git clone {}'.format(git_url))
if repo:
@@ -220,11 +226,13 @@ def create(user_name, user_mail, identifier, agent=''):
@param agent: (optional) Name of software making the change.
@return: message ('ok' if successful)
"""
git_url = '{}:{}.git'.format(config.GITOLITE, identifier.id)
# Raise exception if repository already exists
gitolite = dvcs.Gitolite(config.GITOLITE)
gitolite.initialize()
if identifier.id in gitolite.collections():
raise Exception("'%s' already exists -- clone instead." % identifier.id)
git_url = '{}:{}.git'.format(config.GITOLITE, identifier.id)
raise Exception(f'{git_url} cannot be created when it already exists.')
# clone from Gitolite (Gitolite creates the repository on the server)
repo = git.Repo.clone_from(git_url, identifier.path_abs())
logging.debug(' git clone {}'.format(git_url))
if repo:
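
Both hunks above add the same fail-fast guard before any git traffic: `clone()` refuses to run when the repository is missing from Gitolite, and `create()` refuses when it already exists. A minimal sketch of that pattern as a standalone helper (hypothetical — `check_gitolite` is not part of `DDR.commands`; it only assumes the `dvcs.Gitolite` calls used in the diff, `initialize()`, `repos()`, and `collections()`):

``` python
from DDR import config, dvcs

def check_gitolite(object_id: str, must_exist: bool) -> None:
    """Hypothetical helper: raise before cloning/creating if Gitolite disagrees."""
    git_url = '{}:{}.git'.format(config.GITOLITE, object_id)
    gitolite = dvcs.Gitolite(config.GITOLITE)
    gitolite.initialize()
    if must_exist and object_id not in gitolite.repos():
        raise Exception(f'{git_url} must be created before it can be cloned.')
    if (not must_exist) and object_id in gitolite.collections():
        raise Exception(f'{git_url} cannot be created when it already exists.')
```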
61 changes: 47 additions & 14 deletions ddr/DDR/converters.py
@@ -739,7 +739,7 @@ def text_to_bracketids(text: str,
#
# text = "Masuda, Kikuye [42]:narrator"
# data = [
# {'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42}
# {'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42}
# ]
#
# text = "Watanabe, Joe:author; Masuda, Kikuye:narrator"
@@ -749,38 +749,48 @@
# ]
# text = [
# {'namepart': 'Watanabe, Joe', 'role': 'author'}
# {'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42}
# {'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42}
# ]
# data = [
# {'namepart': 'Watanabe, Joe', 'role': 'author'}
# {'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42}
# {'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42}
# ]
#

def _filter_rolepeople(data: List[Dict[str,str]]) -> List[Dict[str,str]]:
"""filters out items with empty nameparts
"""Processes persons/creators data already in list-of-dicts format
filters out items with empty nameparts
prevents this: [{'namepart': '', 'role': 'author'}]
"""
return [
data = [
item for item in data
if item.get('namepart') # and item.get('role')
]
# convert old 'id' to oral history id
for item in data:
if item.get('id'):
item['oh_id'] = item.pop('id')
# make sure oh_id is int
if item.get('oh_id'):
item['oh_id'] = int(item['oh_id'])
return data

# TODO add type hints
def _parse_rolepeople_text(texts, default):
"""Parses rolepeople format used to represent persons/creators in CSV
"""
data = []
for text in _unroll_gloppy_list(texts):
txt = text.strip()
if txt:
item = copy.deepcopy(default)
# TODO clean this up

if ('|' in txt) and (':' in txt):
# ex: "namepart:Sadako Kashiwagi|role:narrator|id:856"
# ex: "namepart:Sadako Kashiwagi|role:narrator|oh_id:856"
for chunk in txt.split('|'):
key,val = chunk.split(':')
item[key.strip()] = val.strip()
if item.get('name') and not item.get('namepart'):
item['namepart'] = item.pop('name')

elif ':' in txt:
# ex: "Sadako Kashiwagi:narrator"
@@ -798,13 +808,36 @@ def _parse_rolepeople_text(texts, default):
# ex: "Sadako Kashiwagi"
item['namepart'] = txt

# extract person ID if present
# obsolete/original narrator ID format: 'Masuda, Kikuye [42]'
if item.get('role') and item['role'] == 'narrator' and not item.get('oh_id'):
# "namepart: Masuda, Kikuye [42] | role: narrator"
m = re.search('([\w\s,-]+) \[(\d+)]', item['namepart'])
if m and m.groups() and len(m.groups()) == 2:
item['namepart'] = m.groups()[0]
item['oh_id'] = m.groups()[1]
# "Masuda, Kikuye [42]:narrator"
m1 = re.search('([\w\s,-]+) \[(\d+)]:narrator', txt)
if m1 and m1.groups() and len(m1.groups()) == 2:
item['namepart'] = m1.groups()[0]
item['oh_id'] = m1.groups()[1]

# convert old 'name' to 'namepart'
if item.get('name') and not item.get('namepart'):
item['namepart'] = item.pop('name')

# convert old 'id' to oral history id
if item.get('id'):
item['oh_id'] = item.pop('id')

# old bracketid format
match = _is_text_bracketid(item.get('namepart',''))
if match:
item['namepart'] = match.groupdict()['term'].strip()
item['id'] = match.groupdict()['id'].strip()
if item.get('id') and item['id'].isdigit():
item['id'] = int(item['id'])
#item['oh_id'] = match.groupdict()['oh_id'].strip()

# make sure oh_id is int
if item.get('oh_id'):
item['oh_id'] = int(item['oh_id'])

data.append(item)
return data
@@ -834,9 +867,9 @@ def text_to_rolepeople(text: str, default: dict) -> List[Dict[str,str]]:

# might already be listofdicts or listofstrs
if isinstance(text, list):
if _is_listofdicts(text):
if _is_listofdicts(text): # from JSON
return _filter_rolepeople(text)
elif _is_listofstrs(text):
elif _is_listofstrs(text): # from CSV
data = _parse_rolepeople_text(text, default)
return _filter_rolepeople(data)

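
The net effect of this hunk: legacy `id` values and the old bracketed-narrator form (`Masuda, Kikuye [42]`) both normalize to an integer `oh_id` (oral history ID), whether the data arrives as JSON lists or CSV text. A minimal sketch of that behavior (assuming a DDR checkout is importable and that `text_to_rolepeople` keeps the two-argument signature shown above):

``` python
from DDR import converters

# legacy CSV spelling with the old bracketed narrator ID
legacy = 'namepart: Masuda, Kikuye [42] | role: narrator'
print(converters.text_to_rolepeople(legacy, {}))
# expected, per the tests below:
# [{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42}]
```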
1 change: 1 addition & 0 deletions ddr/DDR/docstore.py
@@ -309,6 +309,7 @@ def _repo_org(self, path, doctype, remove=False):
d = ES_Class(id=oi.id)
d.meta.id = oi.id
d.model = oi.model
d.sort = data.get('sort')
d.parent_id = oi.parent_id(stubs=1)
# links
d.links_html = oi.id
45 changes: 24 additions & 21 deletions ddr/tests/test_converters.py
@@ -379,37 +379,37 @@ def test_text_to_bracketids():
]
TEXTROLEPEOPLE_NAME_OUT_ROLE = 'namepart: Watanabe, Joe | role: author'

TEXTROLEPEOPLE_SINGLE_TEXT = 'namepart: Masuda, Kikuye | role: narrator | id: 42'
TEXTROLEPEOPLE_SINGLE_TEXT = 'namepart: Masuda, Kikuye | role: narrator | oh_id: 42'
TEXTROLEPEOPLE_SINGLE_DATA = [
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42},
]

TEXTROLEPEOPLE_SINGLE_ID_TEXT = 'namepart: Masuda, Kikuye | role: narrator | id: 42'
TEXTROLEPEOPLE_SINGLE_ID_TEXT = 'namepart: Masuda, Kikuye | role: narrator | oh_id: 42'
TEXTROLEPEOPLE_SINGLE_ID_DATA = [
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42},
]

TEXTROLEPEOPLE_SINGLE_NRID_TEXT = 'namepart: Masuda, Kikuye | nr_id: 88922/nr014m435 | role: narrator'
TEXTROLEPEOPLE_SINGLE_NRID_DATA = [
{'namepart': 'Masuda, Kikuye', 'nr_id': '88922/nr014m435', 'role': 'narrator'},
]

TEXTROLEPEOPLE_SINGLE_NRIDID_TEXT = 'namepart: Masuda, Kikuye | nr_id: 88922/nr014m435 | role: narrator | id: 42'
TEXTROLEPEOPLE_SINGLE_NRIDID_TEXT = 'namepart: Masuda, Kikuye | nr_id: 88922/nr014m435 | role: narrator | oh_id: 42'
TEXTROLEPEOPLE_SINGLE_NRIDID_DATA = [
{'namepart': 'Masuda, Kikuye', 'nr_id': '88922/nr014m435', 'role': 'narrator', 'id': 42},
{'namepart': 'Masuda, Kikuye', 'nr_id': '88922/nr014m435', 'role': 'narrator', 'oh_id': 42},
]

TEXTROLEPEOPLE_MULTI_TEXT = 'namepart: Watanabe, Joe | role: author; namepart: Masuda, Kikuye | role: narrator | id: 42'
TEXTROLEPEOPLE_MULTI_TEXT = 'namepart: Watanabe, Joe | role: author; namepart: Masuda, Kikuye | role: narrator | oh_id: 42'
TEXTROLEPEOPLE_MULTI_DATA = [
{'namepart': 'Watanabe, Joe', 'role': 'author'},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42},
]

TEXTROLEPEOPLE_MULTI_NRID_TEXT = 'namepart: Ito, Jo | role: author; namepart: Ban, Shig | nr_id: 88922/nr014m437 | role: arch; namepart: Aso, San | role: narrator | id: 42'
TEXTROLEPEOPLE_MULTI_NRID_TEXT = 'namepart: Ito, Jo | role: author; namepart: Ban, Shig | nr_id: 88922/nr014m437 | role: arch; namepart: Aso, San | role: narrator | oh_id: 42'
TEXTROLEPEOPLE_MULTI_NRID_DATA = [
{'namepart': 'Ito, Jo', 'role': 'author'},
{'namepart': 'Ban, Shig', 'nr_id': '88922/nr014m437', 'role': 'arch'},
{'namepart': 'Aso, San', 'role': 'narrator', 'id': 42},
{'namepart': 'Aso, San', 'role': 'narrator', 'oh_id': 42},
]

TEXTROLEPEOPLE_LISTSTRSNAME_TEXT = [
@@ -425,33 +425,35 @@ def test_text_to_bracketids():
]
TEXTROLEPEOPLE_LISTSTRS_DATA = [
{'namepart': 'Watanabe, Joe', 'role': 'author'},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42},
]

TEXTROLEPEOPLE_MULTI_TEXT = 'namepart: Watanabe, Joe | role: author; namepart: Masuda, Kikuye | role: narrator | id: 42'
TEXTROLEPEOPLE_MULTI_TEXT_OLD = 'namepart: Watanabe, Joe | role: author; namepart: Tanaka, Cherry | role: narrator | id: 41; namepart: Masuda, Kikuye | role: narrator | oh_id: 42'
TEXTROLEPEOPLE_MULTI_TEXT_NEW = 'namepart: Watanabe, Joe | role: author; namepart: Tanaka, Cherry | role: narrator | oh_id: 41; namepart: Masuda, Kikuye | role: narrator | oh_id: 42'
TEXTROLEPEOPLE_MULTI_DATA = [
{'namepart': 'Watanabe, Joe', 'role': 'author'},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42},
{'namepart': 'Tanaka, Cherry', 'role': 'narrator', 'oh_id': 41},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42},
]

TEXTROLEPEOPLE_MULTI_NRID_TEXT = 'namepart: Ito, Jo | role: author; namepart: Ban, Shig | nr_id: 88922/nr014m437 | matching: match | role: arch; namepart: Aso, San | role: narrator | id: 42'
TEXTROLEPEOPLE_MULTI_NRID_TEXT = 'namepart: Ito, Jo | role: author; namepart: Ban, Shig | nr_id: 88922/nr014m437 | matching: match | role: arch; namepart: Aso, San | role: narrator | oh_id: 42'
TEXTROLEPEOPLE_MULTI_NRID_DATA = [
{'namepart': 'Ito, Jo', 'role': 'author'},
{'namepart': 'Ban, Shig', 'nr_id': '88922/nr014m437', 'matching': 'match', 'role': 'arch'},
{'namepart': 'Aso, San', 'role': 'narrator', 'id': 42},
{'namepart': 'Aso, San', 'role': 'narrator', 'oh_id': 42},
]

TEXTROLEPEOPLE_PIPES_TEXT = 'namepart: Watanabe, Joe | role: author; namepart: Masuda, Kikuye [42] | role: narrator; namepart: Joi Ito | role: techie | id:123'
TEXTROLEPEOPLE_PIPES_TEXT = 'namepart: Watanabe, Joe | role: author; namepart: Masuda, Kikuye [42] | role: narrator; namepart: Joi Ito | role: techie | oh_id:123'
TEXTROLEPEOPLE_PIPES_DATA = [
{'namepart': 'Watanabe, Joe', 'role': 'author'},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42},
{'namepart': 'Joi Ito', 'role': 'techie', 'id': 123},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42},
{'namepart': 'Joi Ito', 'role': 'techie', 'oh_id': 123},
]

TEXTROLEPEOPLE_NOSPACES_TEXT = 'namepart:Watanabe, Joe|role:author; namepart:Masuda, Kikuye [42]|role:narrator;'
TEXTROLEPEOPLE_NOSPACES_DATA = [
{'namepart': 'Watanabe, Joe', 'role': 'author'},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'id': 42},
{'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42},
]

# many legacy files have this pattern
Expand Down Expand Up @@ -482,7 +484,8 @@ def test_text_to_rolepeople():
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_SINGLE_ID_TEXT, defaults) == TEXTROLEPEOPLE_SINGLE_ID_DATA
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_SINGLE_NRID_TEXT, defaults) == TEXTROLEPEOPLE_SINGLE_NRID_DATA
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_SINGLE_NRIDID_TEXT, defaults) == TEXTROLEPEOPLE_SINGLE_NRIDID_DATA
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_MULTI_TEXT, defaults) == TEXTROLEPEOPLE_MULTI_DATA
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_MULTI_TEXT_NEW, defaults) == TEXTROLEPEOPLE_MULTI_DATA
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_MULTI_TEXT_OLD, defaults) == TEXTROLEPEOPLE_MULTI_DATA
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_MULTI_NRID_TEXT, defaults) == TEXTROLEPEOPLE_MULTI_NRID_DATA
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_LISTSTRSNAME_TEXT, defaults) == TEXTROLEPEOPLE_LISTSTRSNAME_DATA
assert converters.text_to_rolepeople(TEXTROLEPEOPLE_LISTSTRS_TEXT, defaults) == TEXTROLEPEOPLE_LISTSTRS_DATA
@@ -496,6 +499,6 @@ def test_rolepeople_to_text():
assert converters.rolepeople_to_text([]) == ''
assert converters.rolepeople_to_text(TEXTROLEPEOPLE_NAME_DATA) == TEXTROLEPEOPLE_NAME_OUT
assert converters.rolepeople_to_text(TEXTROLEPEOPLE_SINGLE_DATA) == TEXTROLEPEOPLE_SINGLE_TEXT
assert converters.rolepeople_to_text(TEXTROLEPEOPLE_MULTI_DATA) == TEXTROLEPEOPLE_MULTI_TEXT
assert converters.rolepeople_to_text(TEXTROLEPEOPLE_MULTI_DATA) == TEXTROLEPEOPLE_MULTI_TEXT_NEW
assert converters.rolepeople_to_text(TEXTROLEPEOPLE_SINGLE_NRID_DATA) == TEXTROLEPEOPLE_SINGLE_NRID_TEXT
assert converters.rolepeople_to_text(TEXTROLEPEOPLE_MULTI_NRID_DATA) == TEXTROLEPEOPLE_MULTI_NRID_TEXT
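
Together with the parsing tests above, these asserts pin a lossless round trip between the list-of-dicts form and the canonical pipe-delimited text, now spelled with `oh_id`. A sketch of the serialization direction on its own (assuming a DDR checkout on the path; the expected string follows the `TEXTROLEPEOPLE_*` fixtures above):

``` python
from DDR import converters

data = [
    {'namepart': 'Watanabe, Joe', 'role': 'author'},
    {'namepart': 'Masuda, Kikuye', 'role': 'narrator', 'oh_id': 42},
]
print(converters.rolepeople_to_text(data))
# expected:
# namepart: Watanabe, Joe | role: author; namepart: Masuda, Kikuye | role: narrator | oh_id: 42
```
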
2 changes: 1 addition & 1 deletion ddr/tests/test_identifier.py
@@ -172,7 +172,7 @@ def test_definitions_valid_components():
'repo': 'ddr',
'org': [
'densho', 'ajah', 'chi', 'csujad', 'fom', 'hmwf', 'jamsj', 'janm',
'jcch', 'manz', 'njpa', 'one', 'pc', 'sbbt', 'sjacl', 'dev',
'jcch', 'manz', 'njpa', 'one', 'pc', 'phljacl', 'sbbt', 'sjacl', 'dev',
'qumulo', 'test', 'testing'
],
'role': [
