Skip to content

Commit aebe985

Browse files
committed
Adding some datatype recognition via Regex
As well as identifying_factor annotation, cardinality and template keyword conversion
1 parent 80d13af commit aebe985

File tree

4 files changed

+138
-43
lines changed

4 files changed

+138
-43
lines changed

lib/DataHarmonizer.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -1990,7 +1990,7 @@ class DataHarmonizer {
19901990
}
19911991
if (field.pattern) {
19921992
guidance.push(
1993-
i18next.t('reference_guide_msg_pattern_regex') + ' ' + field.pattern
1993+
i18next.t('reference_guide_msg_pattern_regex') + '<br>' + field.pattern
19941994
);
19951995
}
19961996
if (field.structured_pattern) {

script/oca_to_linkml.py

+101-31
Original file line numberDiff line numberDiff line change
@@ -39,21 +39,7 @@
3939
# integer or decimal number, may begin with + or - /^[-+]?\d*\.?\d+$
4040
# integer /^-?[0-9]+$
4141
#
42-
# Textual:
43-
# Capital or lower case letters only, at least 1 character, and 50 characters max ^[A-Za-z]{1,50}$
44-
# Capital or lower case letters only, 50 characters max ^[A-Za-z]{0,50}$
45-
# Short text, 50 characters max ^.{0,50}$
46-
# Short text, 250 characters max ^.{0,250}$
47-
# long text, 800 characters max ^.{0,800}$
48-
# long text, 4000 characters max ^.{0,4000}$
49-
# Canadian postal codes (A1A 1A1) ^[A-Z][0-9][A-Z]\s[0-9][A-Z][0-9]$
50-
# Zip code ^\d{5,6}(?:[-\s]\d{4})?$
51-
# Email address [a-zA-Z0-9_\.\+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-\.]+
52-
# URL https?\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,}
53-
# Phone number \+?\(?\d{2,4}\)?[\d\s-]{3,}
54-
# Latitude in formats S30°15'45.678" or N12°30.999" ^[NS]-?(?:[0-8]?\d|90)°(?:\d+(?:\.\d+)?)(?:'(\d+(?:\.\d+)?)")?$
55-
# Longitude in formats E30°15'45.678" or W90°00.000" ^[WE]-?(?:[0-8]?\d|90)°(?:\d+(?:\.\d+)?)(?:'(\d+(?:\.\d+)?)")?$
56-
#
42+
# See also: https://github.com/agrifooddatacanada/OCA_package_standard
5743

5844

5945
import json
@@ -270,6 +256,8 @@
270256
"range_2",
271257
"identifier",
272258
"multivalued",
259+
"minimum_cardinality",
260+
"maximum_cardinality",
273261
"required",
274262
"recommended",
275263
"minimum_value",
@@ -278,7 +266,8 @@
278266
"structured_pattern",
279267
"description",
280268
"comments",
281-
"examples"
269+
"examples",
270+
"annotations"
282271
];
283272

284273
SCHEMA_ENUMS = [
@@ -385,9 +374,12 @@ def writeSchemaCore():
385374
'name': SCHEMA["name"],
386375
'title': SCHEMA["title"] or SCHEMA["name"],
387376
'description': SCHEMA["description"],
388-
# 'is_a': 'dh_interface'
389377
}
390378

379+
# Associate classification keywords with this class (Rather than LinkML schema as a whole)
380+
if len(oca_classification):
381+
SCHEMA["classes"][SCHEMA["name"]]["keywords"] = oca_classification;
382+
391383
# Set up Container class to hold given schema class's data
392384
SCHEMA["classes"]['Container']['attributes'] = {
393385
'name': SCHEMA["name"] + 'Data',
@@ -409,7 +401,7 @@ def writeSlots():
409401
# Ensure SCHEMA_SLOTS has language variation
410402
addLocaleHeaders(SCHEMA_SLOTS, ["slot_group","title","description","comments","examples"]);
411403

412-
# start slots as an ordered dictionary {slot name:,...} of DH slot attributes.
404+
# Start slots as an ordered dictionary {slot name:,...} of DH slot attributes.
413405
slots = OrderedDict([i, OrderedDict([i,""] for i in SCHEMA_SLOTS) ] for i in oca_attributes)
414406

415407
for slot_name in oca_attributes:
@@ -418,45 +410,110 @@ def writeSlots():
418410
slot['class_name'] = SCHEMA["name"];
419411
slot['name'] = slot_name;
420412
slot['title'] = oca_labels[slot_name];
421-
slot['range'] = oca_attributes[slot_name]; # ISSUE: Numeric
413+
slot['range'] = oca_attributes[slot_name]; # Yeilds Type
422414
slot['pattern'] = oca_formats[slot_name];
423415
slot['description'] = oca_informations[slot_name];
424416

417+
# Minimum and maximum number of values in the array of a multivalued field.
418+
# See https://oca.colossi.network/specification/#cardinality-overlay
419+
if slot_name in oca_cardinality: # Format: n, n-, n-m, -m
420+
card = oca_cardinality[slot_name];
421+
if '-' in card:
422+
if '-' == card[0]:
423+
slot['maximum_cardinality'] = int(card[1:]);
424+
if (slot['maximum_cardinality'] > 1):
425+
slot['multivalued'] = True;
426+
elif '-' == card[-1]:
427+
slot['minimum_cardinality'] = int(card[0:-1]);
428+
slot['multivalued'] = True;
429+
else:
430+
(min, max) = card.split('-');
431+
slot['minimum_cardinality'] = int(min);
432+
slot['maximum_cardinality'] = int(max);
433+
if (int(max) < int(min)):
434+
warnings.append("Field " + slot_name + " has maximum_cardinality less than the minimum_cardinality.")
435+
if int(max) > 1:
436+
slot['multivalued'] = True;
437+
else: # A single value so both min and max
438+
slot['minimum_cardinality'] = slot['maximum_cardinality'] = int(card);
439+
if int(card) > 1:
440+
slot['multivalued'] = True;
441+
442+
# If slot range is "Array[some datatype]", mark the slot as multivalued and use the inner datatype as its range.
443+
if slot['range'][0:5] == "Array":
444+
slot['multivalued'] = True;
445+
slot['range'] = re.search('\[(.+)\]', slot['range']).group(1);
446+
425447
# Range 2 gets any picklist for now.
426448
if slot_name in oca_entry_codes:
427449
slots[slot_name]['range_2'] = slot_name;
428450

451+
if slot_name in oca_conformance:
452+
match oca_conformance[slot_name]:
453+
case "M": # Mandatory
454+
slot['required'] = True;
455+
case "O": # Optional -> Recommended?!
456+
slot['recommended'] = True;
457+
458+
# Flag that this field may have confidentiality compromising content.
459+
# Field confidentiality https://kantarainitiative.org/download/blinding-identity-taxonomy-pdf/
460+
# https://lf-toip.atlassian.net/wiki/spaces/HOME/pages/22974595/Blinding+Identity+Taxonomy
461+
# Currently the only use of slot.annotations:
462+
if slot_name in oca_identifying_factors:
463+
slot['annotations'] = 'identifying_factor:True';
464+
429465
# Conversion of range field from OCA to LinkML data types.
430466
# See https://github.com/ClimateSmartAgCollab/JSON-Form-Generator/blob/main/src/JsonFormGenerator.js
467+
# See also: https://oca.colossi.network/specification/#attribute-type
468+
# There's also a list of file types: https://github.com/agrifooddatacanada/format_options/blob/main/format/binary.md
469+
# Data types: Text | Numeric | Reference (crypto hash) | Boolean | Binary | DateTime | Array[data type]
431470
match slot['range']: # case sensitive?
471+
432472
case "Text":
473+
# https://github.com/agrifooddatacanada/format_options/blob/main/format/text.md
433474
slot['range'] = "WhitespaceMinimizedString" # or "string"
475+
434476
case "Numeric":
477+
# https://github.com/agrifooddatacanada/format_options/blob/main/format/numeric.md
435478
# ISSUE: if field is marked as an integer or decimal, then even
436479
# if regular expression validates, a test against integer or
437480
# decimal format will INVALIDATE this slot.
438-
# Sniff whether it is integer or decimal. FUTURE: allow negatives?
481+
# Sniff whether it is integer or decimal.
439482
if re.search("^-?\[0-9\]\{\d+\}$", slot['pattern']):
440483
slot['range'] = "integer";
441484
else:
442485
slot['range'] = "decimal";
443-
case "DateTime":
444-
case "Boolean":
486+
487+
case "DateTime":
488+
# There are many datatypes that might be matched via the OCA regex expression used to define them.
489+
pass
490+
case "Boolean":
491+
pass
492+
493+
# Now convert any slot datatypes where pattern matches OCA-specific data type
494+
for type_name in SCHEMA["types"]:
495+
if "pattern" in SCHEMA["types"][type_name]:
496+
if SCHEMA["types"][type_name]["pattern"] == slot['pattern']:
497+
#print("PATTERN", type_name, )
498+
slot['range'] = type_name;
499+
slot['pattern'] = ''; # Redundant
445500

446501

447-
case ""
448502
# Need access to original oca language parameter, e.g. "eng"
449503
if len(locale_mapping) > 1:
450504
for locale in list(locale_mapping)[1:]:
451505
oca_locale = locale_mapping[locale];
452506
slot['slot_group_'+locale] = "Generic";
453507
slot['title_'+locale] = getLookup("label", oca_locale, slot_name)
454508
slot['description_'+locale] = getLookup("information", oca_locale, slot_name)
455-
#slot['comments_'+locale]
456-
#slot['examples_'+locale]
509+
#slot['comments_'+locale] # No OCA equivalent
510+
#slot['examples_'+locale] # No OCA equivalent
457511

512+
513+
458514
save_tsv("schema_slots.tsv", SCHEMA_SLOTS, slots);
459515

516+
460517
def writeEnums():
461518
addLocaleHeaders(SCHEMA_ENUMS, ["title", "menu_1"]);
462519
enums = [];
@@ -500,26 +557,39 @@ def writeEnums():
500557
# ALSO, it is assumed that language variant objects all have the "default"
501558
# and consistent primary language as first entry.
502559

560+
# oca_attributes contains slot.name and slot.Type (datatype, e.g. Numeric, ...)
561+
oca_attributes = oca_obj["bundle"]["capture_base"]["attributes"];
562+
563+
# Keywords about this schema (class's) subject categorization.
564+
oca_classification = oca_obj["bundle"]["capture_base"]["classification"];
565+
566+
# Fields which likely have personal or institutional confidentiality content:
567+
oca_identifying_factors = oca_obj["bundle"]["capture_base"]["flagged_attributes"];
568+
569+
############################# Overlays #################################
503570
oca_overlays = oca_obj["bundle"]["overlays"];
504571

505-
# Contains {schema.name,.description,.language} in array
506-
# Optional?
572+
# Contains {schema.name,.description,.language} in array. Optional?
507573
oca_metas = oca_overlays["meta"][0];
508574

509-
# oca_attributes contains slot.name and slot.datatype
510-
oca_attributes = oca_obj["bundle"]["capture_base"]["attributes"];
511-
512575
# Contains slot.name and slot.pattern
513576
oca_formats = oca_overlays["format"]["attribute_formats"];
514577

578+
# Minimum and maximum number of values in the array of a multivalued field.
579+
if "cardinality" in oca_overlays:
580+
oca_cardinality = oca_overlays["cardinality"]["attr_cardinality"];
581+
else:
582+
oca_cardinality = {};
583+
515584
# Contains {slot.title,.language} in array
516585
oca_labels = oca_overlays["label"][0]["attribute_labels"];
517586

518587
# Contains {slot.name,.description,.language} in array
519588
# Optional?
520589
oca_informations = oca_overlays["information"][0]["attribute_information"];
521590

522-
# Contains {"d": "M", "i": "M", "passed": "M"} # "M" ?
591+
# A dictionary for each field indicating required/recommended status:
592+
# M is mandatory and O is optional.
523593
oca_conformance = oca_overlays["conformance"]["attribute_conformance"];
524594

525595
# Contains [enumeration name]:[code,...]

script/tabular_to_schema.py

+33-8
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,20 @@ def set_examples (slot, example_string):
113113

114114
slot['examples'] = examples;
115115

116+
# Parse annotation_string into slot['annotations']. Works for multilingual slot locale
def set_annotations(slot, annotation_string):
    """Parse a ``key:value;key:value`` string into ``slot['annotations']``.

    Args:
        slot: Slot dictionary to update in place.
        annotation_string: Semicolon-separated ``key:value`` pairs, e.g.
            ``"identifying_factor:True"``. Empty or ``None`` leaves ``slot``
            untouched.

    Raises:
        ValueError: If a non-blank segment has no ``:`` separator.

    Keys and values are stripped of surrounding whitespace. The values
    ``true`` / ``false`` (any letter case) are stored as booleans; every
    other value is stored as a string.
    """
    if not annotation_string:
        return

    annotations = {}
    for item in annotation_string.split(';'):
        if not item.strip():
            # Tolerate a trailing ';' or blank segments between separators.
            continue

        # Split on the first ':' only, so values such as URLs keep their colons.
        key, separator, value = item.partition(':')
        if not separator:
            raise ValueError(
                "Annotation '" + item.strip() + "' is not in key:value form."
            )

        value = value.strip()
        # Store textual booleans as real booleans; a bare 'false' string
        # would otherwise be truthy for any consumer of the annotation.
        lowered = value.lower()
        if lowered == 'true':
            value = True
        elif lowered == 'false':
            value = False

        annotations[key.strip()] = value

    # Only attach the key when at least one annotation was parsed.
    if annotations:
        slot['annotations'] = annotations
129+
116130

117131
# A slot's or enum's exact_mappings array gets populated with all the
118132
# EXPORT_XYZ column cell values.
@@ -249,17 +263,22 @@ def set_classes(schema_slot_path, schema, locale_schemas, export_format, warning
249263
slot_description = row.get('description','');
250264
slot_comments = row.get('comments','');
251265
slot_examples = row.get('examples','');
266+
slot_annotations = row.get('annotations','');
252267
slot_uri = row.get('slot_uri','');
253-
slot_identifier = row.get('identifier','');
254-
slot_multivalued = row.get('multivalued','');
255-
slot_required = row.get('required','');
256-
slot_recommended = row.get('recommended', '');
268+
269+
slot_identifier = bool(row.get('identifier',''));
270+
slot_multivalued = bool(row.get('multivalued',''));
271+
slot_required = bool(row.get('required',''));
272+
slot_recommended = bool(row.get('recommended', ''));
273+
257274
slot_range = row.get('range','');
258275
slot_range_2 = row.get('range_2','');
259276
slot_pattern = row.get('pattern','');
260277
slot_structured_pattern = row.get('structured_pattern','');
261278
slot_minimum_value = row.get('minimum_value','');
262279
slot_maximum_value = row.get('maximum_value','');
280+
slot_minimum_cardinality = row.get('minimum_cardinality','');
281+
slot_maximum_cardinality = row.get('maximum_cardinality','');
263282

264283
slot = {'name': slot_name};
265284

@@ -269,12 +288,17 @@ def set_classes(schema_slot_path, schema, locale_schemas, export_format, warning
269288
if slot_description > '': slot['description'] = slot_description;
270289
if slot_comments > '': slot['comments'] = slot_comments;
271290
if slot_uri > '': slot['slot_uri'] = slot_uri;
272-
if slot_identifier == 'TRUE': slot['identifier'] = True;
291+
292+
if slot_identifier == True: slot['identifier'] = True;
293+
if slot_multivalued == True: slot['multivalued'] = True;
294+
if slot_required == True: slot['required'] = True;
295+
if slot_recommended == True: slot['recommended'] = True;
296+
297+
if slot_minimum_cardinality > '': slot['minimum_cardinality'] = int(slot_minimum_cardinality);
298+
if slot_maximum_cardinality > '': slot['maximum_cardinality'] = int(slot_maximum_cardinality);
299+
273300
set_range(slot, slot_range, slot_range_2);
274301
set_min_max(slot, slot_minimum_value, slot_maximum_value);
275-
if slot_multivalued == 'TRUE': slot['multivalued'] = True;
276-
if slot_required == 'TRUE': slot['required'] = True;
277-
if slot_recommended == 'TRUE': slot['recommended'] = True;
278302
if slot_pattern > '': slot['pattern'] = slot_pattern;
279303
if slot_structured_pattern > '':
280304
slot['structured_pattern'] = {
@@ -284,6 +308,7 @@ def set_classes(schema_slot_path, schema, locale_schemas, export_format, warning
284308
}
285309

286310
set_examples(slot, slot_examples);
311+
set_annotations(slot, slot_annotations);
287312
set_mappings(slot, row, export_format);
288313

289314
# If slot has already been set up in schema['slots'] then compare

web/translations/translations.json

+3-3
Original file line numberDiff line numberDiff line change
@@ -252,9 +252,9 @@
252252
"fr": "Guide de référence pour"
253253
},
254254
"reference_guide_msg_pattern_regex": {
255-
"en": "Pattern as regular expression:",
256-
"fr": "Motif en tant qu'expression régulière :"
257-
},
255+
"en": "Regular expression validator:",
256+
"fr": "Motif en tant qu'expression régulière :"
257+
},
258258
"reference_guide_msg_pattern_hint": {
259259
"en": "Pattern hint:",
260260
"fr": "Indice du motif :"

0 commit comments

Comments
 (0)