Adding some datatype recognition via Regex

ddooley · ddooley · commit aebe985f19f4 · 2025-02-28T08:57:19.000-08:00
As well as identifying_factor annotation, cardinality and template keyword conversion
diff --git a/lib/DataHarmonizer.js b/lib/DataHarmonizer.js
@@ -1990,7 +1990,7 @@ class DataHarmonizer {
     }
     if (field.pattern) {
       guidance.push(
-        i18next.t('reference_guide_msg_pattern_regex') + ' ' + field.pattern
+        i18next.t('reference_guide_msg_pattern_regex') + '<br>' + field.pattern
       );
     }
     if (field.structured_pattern) {
diff --git a/script/oca_to_linkml.py b/script/oca_to_linkml.py
@@ -39,21 +39,7 @@
 # integer or decimal number, may begin with + or -	/^[-+]?\d*\.?\d+$
 # integer		/^-?[0-9]+$
 # 
-# Textual:
-# Capital or lower case letters only, at least 1 character, and 50 characters max		^[A-Za-z]{1,50}$
-# Capital or lower case letters only, 50 characters max		^[A-Za-z]{0,50}$
-# Short text, 50 characters max		^.{0,50}$
-# Short text, 250 characters max		^.{0,250}$
-# long text, 800 characters max		^.{0,800}$
-# long text, 4000 characters max		^.{0,4000}$
-# Canadian postal codes (A1A 1A1)		^[A-Z][0-9][A-Z]\s[0-9][A-Z][0-9]$
-# Zip code		^\d{5,6}(?:[-\s]\d{4})?$
-# Email address		[a-zA-Z0-9_\.\+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-\.]+
-# URL	https?\:\/\/[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,}
-# Phone number		\+?\(?\d{2,4}\)?[\d\s-]{3,}
-# Latitude in formats S30°15'45.678" or N12°30.999"		^[NS]-?(?:[0-8]?\d|90)°(?:\d+(?:\.\d+)?)(?:'(\d+(?:\.\d+)?)")?$
-# Longitude in formats E30°15'45.678" or W90°00.000"		^[WE]-?(?:[0-8]?\d|90)°(?:\d+(?:\.\d+)?)(?:'(\d+(?:\.\d+)?)")?$
-#
+# See also: https://github.com/agrifooddatacanada/OCA_package_standard
 
 
 import json
@@ -270,6 +256,8 @@
 	"range_2",
 	"identifier",
 	"multivalued",
+	"minimum_cardinality",
+	"maximum_cardinality",
 	"required",
 	"recommended",
 	"minimum_value",
@@ -278,7 +266,8 @@
 	"structured_pattern",
 	"description",
 	"comments",
-	"examples"
+	"examples",
+  "annotations"
 ];
 
 SCHEMA_ENUMS = [
@@ -385,9 +374,12 @@ def writeSchemaCore():
 	    'name': SCHEMA["name"],
     	'title': SCHEMA["title"] or SCHEMA["name"],
     	'description': SCHEMA["description"],
-			#	'is_a': 'dh_interface'
 	}
 
+	# Associate classification keywords with this class (Rather than LinkML schema as a whole)
+	if len(oca_classification):
+		SCHEMA["classes"][SCHEMA["name"]]["keywords"] = oca_classification;
+
 	# Set up Container class to hold given schema class's data
 	SCHEMA["classes"]['Container']['attributes'] = {
 			'name': SCHEMA["name"] + 'Data',
@@ -409,7 +401,7 @@ def writeSlots():
 	# Ensure SCHEMA_SLOTS has language variation
 	addLocaleHeaders(SCHEMA_SLOTS, ["slot_group","title","description","comments","examples"]);
 
-	# start slots as an ordered dictionary {slot name:,...} of DH slot attributes.
+	# Start slots as an ordered dictionary {slot name:,...} of DH slot attributes.
 	slots = OrderedDict([i, OrderedDict([i,""] for i in SCHEMA_SLOTS) ] for i in oca_attributes)
 
 	for slot_name in oca_attributes:
@@ -418,45 +410,110 @@ def writeSlots():
 		slot['class_name'] = SCHEMA["name"];
 		slot['name'] = slot_name;
 		slot['title'] = oca_labels[slot_name];
-		slot['range'] = oca_attributes[slot_name]; # ISSUE: Numeric
+		slot['range'] = oca_attributes[slot_name]; # Yeilds Type
 		slot['pattern'] = oca_formats[slot_name];
 		slot['description'] = oca_informations[slot_name];
 
+		# Minimum and maximum number of values in array of a multivalued field.
+		# See https://oca.colossi.network/specification/#cardinality-overlay
+		if slot_name in oca_cardinality:  # Format: n, n-, n-m, -m
+			card = oca_cardinality[slot_name];
+			if '-' in card:
+				if '-' == card[0]:
+					slot['maximum_cardinality'] = int(card[1:]);
+					if (slot['maximum_cardinality'] > 1):
+						slot['multivalued'] = True;
+				elif '-' == card[-1]:
+					slot['minimum_cardinality'] = int(card[0:-1]);
+					slot['multivalued'] = True;
+				else:
+					(min, max) = card.split('-');
+					slot['minimum_cardinality'] = int(min);
+					slot['maximum_cardinality'] = int(max);
+					if (int(max) < int(min)):
+						warnings.append("Field " + slot_name + " has maximum_cardinality less than the minimum_cardinality.")
+					if int(max) > 1:
+						slot['multivalued'] = True;
+			else: # A single value so both min and max
+				slot['minimum_cardinality'] = slot['maximum_cardinality'] = int(card);
+				if int(card) > 1:
+					slot['multivalued'] = True;
+
+		# If slot range is "Array[some datatype]", mark the slot as multivalued and use the inner datatype as its range.
+		if slot['range'][0:5] == "Array":
+			slot['multivalued'] = True;
+			slot['range'] = re.search('\[(.+)\]', slot['range']).group(1);
+
 		# Range 2 gets any picklist for now.
 		if slot_name in oca_entry_codes:
 			slots[slot_name]['range_2'] = slot_name;
 
+		if slot_name in oca_conformance:
+			match oca_conformance[slot_name]:
+				case "M": # Mandatory
+					slot['required'] = True;
+				case "O": # Optional -> Recommended?!
+					slot['recommended'] = True;
+
+		# Flag that this field may have confidentiality compromising content.
+		# Field confidentiality https://kantarainitiative.org/download/blinding-identity-taxonomy-pdf/
+		# https://lf-toip.atlassian.net/wiki/spaces/HOME/pages/22974595/Blinding+Identity+Taxonomy
+		# Currently the only use of slot.annotations:
+		if slot_name in oca_identifying_factors:
+			slot['annotations'] = 'identifying_factor:True';
+
 		# Conversion of range field from OCA to LinkML data types.
     # See https://github.com/ClimateSmartAgCollab/JSON-Form-Generator/blob/main/src/JsonFormGenerator.js
+    # See also: https://oca.colossi.network/specification/#attribute-type
+    # There's also a list of file types: https://github.com/agrifooddatacanada/format_options/blob/main/format/binary.md
+    # Data types: Text | Numeric | Reference (crypto hash) | Boolean | Binary | DateTime | Array[data type]
 		match slot['range']: # case sensitive?
+
 			case "Text":
+				# https://github.com/agrifooddatacanada/format_options/blob/main/format/text.md
 				slot['range'] = "WhitespaceMinimizedString" # or "string"
+
 			case "Numeric":
+        # https://github.com/agrifooddatacanada/format_options/blob/main/format/numeric.md
 				# ISSUE: if field is marked as an integer or decimal, then even
 				# if regular expression validates, a test against integer or 
 				# decimal format will INVALIDATE this slot.
-				# Sniff whether it is integer or decimal. FUTURE: allow negatives?
+				# Sniff whether it is integer or decimal.
 				if re.search("^-?\[0-9\]\{\d+\}$", slot['pattern']):
 					slot['range'] = "integer";
 				else:
 					slot['range'] = "decimal";
-      case "DateTime":
-      case "Boolean":
+
+			case "DateTime":
+				# There are many datatypes that might be matched via the OCA regex expression used to define them.
+				pass
+			case "Boolean":
+				pass
+
+    # Now convert any slot datatypes where pattern matches OCA-specific data type
+		for type_name in SCHEMA["types"]:
+			if "pattern" in SCHEMA["types"][type_name]:
+				if SCHEMA["types"][type_name]["pattern"] == slot['pattern']:
+					#print("PATTERN", type_name, )
+					slot['range'] = type_name;
+					slot['pattern'] = ''; # Redundant
 
 
-      case ""
 		# Need access to original oca language parameter, e.g. "eng"
 		if len(locale_mapping) > 1:
 			for locale in list(locale_mapping)[1:]:
 				oca_locale = locale_mapping[locale];
 				slot['slot_group_'+locale] = "Generic";
 				slot['title_'+locale] = getLookup("label", oca_locale, slot_name)
 				slot['description_'+locale] = getLookup("information", oca_locale, slot_name)
-				#slot['comments_'+locale]
-				#slot['examples_'+locale]
+				#slot['comments_'+locale] # No OCA equivalent
+				#slot['examples_'+locale] # No OCA equivalent
 	
+
+
 	save_tsv("schema_slots.tsv", SCHEMA_SLOTS, slots);
 
+
 def writeEnums():
 	addLocaleHeaders(SCHEMA_ENUMS, ["title", "menu_1"]);
 	enums = [];
@@ -500,26 +557,39 @@ def writeEnums():
 # ALSO, it is assumed that language variant objects all have the "default" 
 # and consistent primary language as first entry.
 
+# oca_attributes contains slot.name and slot.Type (datatype, e.g. Numeric, ...)
+oca_attributes = oca_obj["bundle"]["capture_base"]["attributes"];
+
+# Keywords about this schema (class's) subject categorization.
+oca_classification = oca_obj["bundle"]["capture_base"]["classification"];
+
+# Fields which likely have personal or institutional confidentiality content:
+oca_identifying_factors = oca_obj["bundle"]["capture_base"]["flagged_attributes"];
+
+############################# Overlays #################################
 oca_overlays = oca_obj["bundle"]["overlays"];
 
-# Contains {schema.name,.description,.language} in array 
-# Optional?
+# Contains {schema.name,.description,.language} in array.  Optional?
 oca_metas = oca_overlays["meta"][0];
 
-# oca_attributes contains slot.name and slot.datatype
-oca_attributes = oca_obj["bundle"]["capture_base"]["attributes"];
-
 # Contains slot.name and slot.pattern
 oca_formats = oca_overlays["format"]["attribute_formats"];
 
+# Minimum and maximum number of values in array of a multivalued field.
+if "cardinality" in oca_overlays:
+	oca_cardinality = oca_overlays["cardinality"]["attr_cardinality"];
+else:
+	oca_cardinality = {};
+
 # Contains {slot.title,.language} in array
 oca_labels = oca_overlays["label"][0]["attribute_labels"];
 
 # Contains {slot.name,.description,.language} in array 
 # Optional?
 oca_informations = oca_overlays["information"][0]["attribute_information"];
 
-# Contains {"d": "M", "i": "M", "passed": "M"}  # "M" ?
+# A dictionary for each field indicating required/recommended status:  
+# M is mandatory and O is optional.
 oca_conformance = oca_overlays["conformance"]["attribute_conformance"];
 
 # Contains [enumeration name]:[code,...]
diff --git a/script/tabular_to_schema.py b/script/tabular_to_schema.py
@@ -113,6 +113,20 @@ def set_examples (slot, example_string):
 
 		slot['examples'] = examples;
 
+# Parse annotation_string into slot.annotations. Works for multilingual slot locale
+def set_annotations (slot, annotation_string):
+				
+	if annotation_string > '':
+		annotations = {};
+		for v in annotation_string.split(';'):
+			(key, value) = v.split(':');
+			value = value.strip();
+			if value.lower() == 'true':
+				value = bool(value);
+			annotations[key.strip()] = value;
+
+		slot['annotations'] = annotations;
+
 
 # A slot's or enum's exact_mappings array gets populated with all the 
 # EXPORT_XYZ column cell values.
@@ -249,17 +263,22 @@ def set_classes(schema_slot_path, schema, locale_schemas, export_format, warning
 					slot_description =				row.get('description','');
 					slot_comments =						row.get('comments','');
 					slot_examples = 					row.get('examples','');
+					slot_annotations = 					row.get('annotations','');
 					slot_uri =								row.get('slot_uri','');
-					slot_identifier =					row.get('identifier','');
-					slot_multivalued =				row.get('multivalued','');
-					slot_required =						row.get('required','');
-					slot_recommended =				row.get('recommended', '');
+
+					slot_identifier =					bool(row.get('identifier',''));
+					slot_multivalued =				bool(row.get('multivalued',''));
+					slot_required =						bool(row.get('required',''));
+					slot_recommended =				bool(row.get('recommended', ''));
+
 					slot_range =							row.get('range','');
 					slot_range_2 =						row.get('range_2','');
 					slot_pattern = 						row.get('pattern','');
 					slot_structured_pattern = row.get('structured_pattern','');
 					slot_minimum_value =			row.get('minimum_value','');
 					slot_maximum_value =			row.get('maximum_value','');
+					slot_minimum_cardinality =			row.get('minimum_cardinality','');
+					slot_maximum_cardinality =			row.get('maximum_cardinality','');
 
 					slot = {'name': slot_name};
 
@@ -269,12 +288,17 @@ def set_classes(schema_slot_path, schema, locale_schemas, export_format, warning
 					if slot_description > '':				slot['description'] = slot_description;
 					if slot_comments > '':					slot['comments'] = slot_comments;
 					if slot_uri > '':								slot['slot_uri'] = slot_uri;
-					if slot_identifier == 'TRUE':		slot['identifier'] = True;
+
+					if slot_identifier == True:		slot['identifier'] = True;
+					if slot_multivalued == True:	slot['multivalued'] = True;
+					if slot_required == True:			slot['required'] = True;
+					if slot_recommended == True:	slot['recommended'] = True;
+
+					if slot_minimum_cardinality > '':	slot['minimum_cardinality'] = int(slot_minimum_cardinality);
+					if slot_maximum_cardinality > '':	slot['maximum_cardinality'] = int(slot_maximum_cardinality);
+					
 					set_range(slot, slot_range, slot_range_2);
 					set_min_max(slot, slot_minimum_value, slot_maximum_value);
-					if slot_multivalued == 'TRUE':	slot['multivalued'] = True;
-					if slot_required == 'TRUE':			slot['required'] = True;
-					if slot_recommended == 'TRUE':	slot['recommended'] = True;
 					if slot_pattern > '':						slot['pattern'] = slot_pattern;		
 					if slot_structured_pattern > '':
 																					slot['structured_pattern'] = {
@@ -284,6 +308,7 @@ def set_classes(schema_slot_path, schema, locale_schemas, export_format, warning
 																					}
 
 					set_examples(slot, slot_examples);
+					set_annotations(slot, slot_annotations);
 					set_mappings(slot, row, export_format);
 
 					# If slot has already been set up in schema['slots'] then compare 
diff --git a/web/translations/translations.json b/web/translations/translations.json
@@ -252,9 +252,9 @@
     "fr": "Guide de référence pour"
   },
   "reference_guide_msg_pattern_regex": {
-    "en": "Pattern as regular expression:",
-    "fr": "Motif en tant qu'expression régulière :"
-  },
+    "en": "Regular expression validator:",
+    "fr": "Motif en tant qu'expression régulière:" 
+  }, 
   "reference_guide_msg_pattern_hint": {
     "en": "Pattern hint:",
     "fr": "Indice du motif :"

Original file line number	Diff line number	Diff line change
`@@ -1990,7 +1990,7 @@ class DataHarmonizer {`
`1990`	`1990`	`}`
`1991`	`1991`	`if (field.pattern) {`
`1992`	`1992`	`guidance.push(`
`1993`		`- i18next.t('reference_guide_msg_pattern_regex') + ' ' + field.pattern`
	`1993`	`+ i18next.t('reference_guide_msg_pattern_regex') + '<br>' + field.pattern`
`1994`	`1994`	`);`
`1995`	`1995`	`}`
`1996`	`1996`	`if (field.structured_pattern) {`