Skip to content

Commit

Permalink
Merge pull request #1350 from Sage-Bionetworks/rs-fix-manifest-required-cols
Browse files Browse the repository at this point in the history

Merge Refactor Schemas with Develop
  • Loading branch information
mialy-defelice authored Jan 18, 2024
2 parents 30b3177 + 47c3d6c commit 2ade117
Show file tree
Hide file tree
Showing 43 changed files with 6,445 additions and 6,067 deletions.
257 changes: 246 additions & 11 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ pandarallel = "^1.6.4"
schematic-db = {version = "0.0.dev33", extras = ["synapse"]}
pyopenssl = "^23.0.0"
typing-extensions = "<4.6.0"
dataclasses-json = "^0.6.1"

[tool.poetry.group.dev.dependencies]
pytest = "^7.0.0"
Expand Down
3 changes: 0 additions & 3 deletions schematic/help.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,6 @@
"short_help": (
"Convert specification from CSV data model to JSON-LD data model."
),
"base_schema": (
"Path to base data model. BioThings data model is loaded by default."
),
"output_jsonld": (
"Path to where the generated JSON-LD file needs to be outputted."
),
Expand Down
42 changes: 30 additions & 12 deletions schematic/manifest/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
import click
import click_log

from schematic.schemas.data_model_parser import DataModelParser
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.manifest.generator import ManifestGenerator

from schematic.utils.cli_utils import log_value_from_config, query_dict, parse_synIDs
from schematic.help import manifest_commands
from schematic.schemas.generator import SchemaGenerator
from schematic.utils.google_api_utils import export_manifest_csv
from schematic.help import manifest_commands

from schematic.store.synapse import SynapseStorage
from schematic.configuration.configuration import CONFIG

Expand Down Expand Up @@ -59,7 +62,7 @@ def manifest(ctx, config): # use as `schematic manifest ...`
help=query_dict(manifest_commands, ("manifest", "get", "data_type")),
)
@click.option(
"-p", "--jsonld", help=query_dict(manifest_commands, ("manifest", "get", "jsonld"))
"-p", "--path_to_data_model", help=query_dict(manifest_commands, ("manifest", "get", "path_to_data_model"))
)
@click.option(
"-d",
Expand Down Expand Up @@ -104,7 +107,7 @@ def get_manifest(
ctx,
title,
data_type,
jsonld,
path_to_data_model,
dataset_id,
sheet_url,
output_csv,
Expand All @@ -121,17 +124,31 @@ def get_manifest(
if data_type is None:
data_type = CONFIG.manifest_data_type
log_value_from_config("data_type", data_type)
if jsonld is None:
jsonld = CONFIG.model_location
log_value_from_config("jsonld", jsonld)
if path_to_data_model is None:
path_to_data_model = CONFIG.model_location
log_value_from_config("path_to_data_model", path_to_data_model)
if title is None:
title = CONFIG.manifest_title
log_value_from_config("title", title)

data_model_parser = DataModelParser(path_to_data_model = path_to_data_model)

#Parse Model
logger.info("Parsing data model.")
parsed_data_model = data_model_parser.parse_model()

# Instantiate DataModelGraph
data_model_grapher = DataModelGraph(parsed_data_model)

# Generate graph
logger.info("Generating data model graph.")
graph_data_model = data_model_grapher.generate_data_model_graph()

def create_single_manifest(data_type, output_csv=None, output_xlsx=None):
# create object of type ManifestGenerator
manifest_generator = ManifestGenerator(
path_to_json_ld=jsonld,
path_to_data_model=path_to_data_model,
graph = graph_data_model,
title=t,
root=data_type,
use_annotations=use_annotations,
Expand Down Expand Up @@ -174,7 +191,7 @@ def create_single_manifest(data_type, output_csv=None, output_xlsx=None):
logger.info("Find the manifest template using this Google Sheet URL:")
click.echo(result)
if output_csv is None and output_xlsx is None:
prefix, _ = os.path.splitext(jsonld)
prefix, _ = os.path.splitext(path_to_data_model)
prefix_root, prefix_ext = os.path.splitext(prefix)
if prefix_ext == ".model":
prefix = prefix_root
Expand All @@ -194,9 +211,10 @@ def create_single_manifest(data_type, output_csv=None, output_xlsx=None):
if type(data_type) is str:
data_type = [data_type]

if data_type[0] == 'all manifests':
sg = SchemaGenerator(path_to_json_ld=jsonld)
component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent')
if data_type[0] == 'all manifests':
# Feed graph into the data model graph explorer
dmge = DataModelGraphExplorer(graph_data_model)
component_digraph = dmge.get_digraph_by_edge_type('requiresComponent')
components = component_digraph.nodes()
for component in components:
t = f'{title}.{component}.manifest'
Expand Down
86 changes: 54 additions & 32 deletions schematic/manifest/generator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import OrderedDict
import json
import logging
import networkx as nx
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl import load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows
Expand All @@ -12,7 +13,10 @@
from typing import Dict, List, Optional, Tuple, Union, BinaryIO, Literal
from flask import send_from_directory

from schematic.schemas.generator import SchemaGenerator
from schematic.schemas.data_model_graph import DataModelGraph, DataModelGraphExplorer
from schematic.schemas.data_model_parser import DataModelParser
from schematic.schemas.data_model_json_schema import DataModelJSONSchema

from schematic.utils.google_api_utils import (
execute_google_api_requests,
build_service_account_creds,
Expand All @@ -35,7 +39,8 @@
class ManifestGenerator(object):
def __init__(
self,
path_to_json_ld: str, # JSON-LD file to be used for generating the manifest
path_to_data_model: str, # JSON-LD file to be used for generating the manifest
graph: nx.MultiDiGraph, # At this point, the graph is fully formed.
alphabetize_valid_values: str = 'ascending',
title: str = None, # manifest sheet title
root: str = None,
Expand All @@ -54,6 +59,12 @@ def __init__(
# google service credentials object
self.creds = services_creds["creds"]

# Path to jsonld
self.model_path = path_to_data_model

# Graph
self.graph = graph

# schema root
if root:
self.root = root
Expand All @@ -79,14 +90,14 @@ def __init__(
"when there is no manifest file for the dataset in question."
)

# SchemaGenerator() object
self.sg = SchemaGenerator(path_to_json_ld)
# Instantiate Data Model Explorer object
self.dmge = DataModelGraphExplorer(self.graph)

# additional metadata to add to manifest
self.additional_metadata = additional_metadata

# Check if the class is in the schema
root_in_schema = self.sg.se.is_class_in_schema(self.root)
root_in_schema = self.dmge.is_class_in_schema(self.root)

# If the class could not be found, give a notification
if not root_in_schema:
Expand All @@ -95,8 +106,7 @@ def __init__(
raise LookupError(exception_message)

# Determine whether current data type is file-based
self.is_file_based = "Filename" in self.sg.get_node_dependencies(self.root)

self.is_file_based = "Filename" in self.dmge.get_node_dependencies(self.root)

def _attribute_to_letter(self, attribute, manifest_fields):
"""Map attribute to column letter in a google sheet"""
Expand Down Expand Up @@ -364,13 +374,12 @@ def _get_json_schema(self, json_schema_filepath: str) -> Dict:
json_schema_filepath(str): path to json schema file
Returns:
Dictionary, containing portions of the json schema
        TODO: Do we even allow people to provide a json_schema_filepath anymore?
"""
if not json_schema_filepath:
# if no json schema is provided; there must be
# schema explorer defined for schema.org schema
# o.w. this will throw an error
# TODO: catch error
json_schema = self.sg.get_json_schema_requirements(self.root, self.title)
# TODO Catch error if no JSONLD or JSON path provided.
data_model_js = DataModelJSONSchema(jsonld_path=self.model_path, graph=self.graph)
json_schema = data_model_js.get_json_validation_schema(source_node=self.root, schema_name=self.title)
else:
with open(json_schema_filepath) as jsonfile:
json_schema = json.load(jsonfile)
Expand Down Expand Up @@ -813,9 +822,9 @@ def _request_row_format(self, i, req):
notes_body["requests"] (dict): with information on note
to add to the column header. This notes body will be added to a request.
"""
if self.sg.se:
if self.dmge:
# get node definition
note = self.sg.get_node_definition(req)
note = self.dmge.get_node_comment(node_display_name = req)

notes_body = {
"requests": [
Expand Down Expand Up @@ -1014,8 +1023,7 @@ def _dependency_formatting(
dependency_formatting_body = {"requests": []}
for j, val_dep in enumerate(val_dependencies):
is_required = False

if self.sg.is_node_required(val_dep):
if self.dmge.get_node_required(node_display_name=val_dep):
is_required = True
else:
is_required = False
Expand Down Expand Up @@ -1058,13 +1066,13 @@ def _request_dependency_formatting(
for req_val in req_vals:
# get this required/valid value's node label in schema, based on display name (i.e. shown to the user in a dropdown to fill in)
req_val = req_val["userEnteredValue"]
req_val_node_label = self.sg.get_node_label(req_val)
req_val_node_label = self.dmge.get_node_label(req_val)
if not req_val_node_label:
# if this node is not in the graph
# continue - there are no dependencies for it
continue
# check if this required/valid value has additional dependency attributes
val_dependencies = self.sg.get_node_dependencies(
val_dependencies = self.dmge.get_node_dependencies(
req_val_node_label, schema_ordered=False
)

Expand Down Expand Up @@ -1117,7 +1125,7 @@ def _create_requests_body(
requests_body["requests"] = []
for i, req in enumerate(ordered_metadata_fields[0]):
# Gather validation rules and valid values for attribute.
validation_rules = self.sg.get_node_validation_rules(req)
validation_rules = self.dmge.get_node_validation_rules(node_display_name=req)

            # Add regex match validation rule to Google Sheets.
if validation_rules and sheet_url:
Expand Down Expand Up @@ -1364,7 +1372,7 @@ def map_annotation_names_to_display_names(
pd.DataFrame: Annotations table with updated column headers.
"""
# Get list of attribute nodes from data model
model_nodes = self.sg.se.get_nx_schema().nodes
model_nodes = self.graph.nodes

# Subset annotations to those appearing as a label in the model
labels = filter(lambda x: x in model_nodes, annotations.columns)
Expand Down Expand Up @@ -1492,7 +1500,7 @@ def _handle_output_format_logic(self, output_format: str = None, output_path: st
return dataframe

@staticmethod
def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[str]=None, dataset_id:Optional[str]=None, strict:Optional[bool]=True, title:Optional[str]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", use_annotations:Optional[bool]=False) -> Union[str, pd.DataFrame, BinaryIO]:
def create_single_manifest(path_to_data_model: str, graph_data_model: nx.MultiDiGraph, data_type: str, access_token:Optional[str]=None, dataset_id:Optional[str]=None, strict:Optional[bool]=True, title:Optional[str]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", use_annotations:Optional[bool]=False) -> Union[str, pd.DataFrame, BinaryIO]:
"""Create a single manifest
Args:
Expand All @@ -1510,7 +1518,8 @@ def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[st
"""
# create object of type ManifestGenerator
manifest_generator = ManifestGenerator(
path_to_json_ld=jsonld,
path_to_data_model=path_to_data_model,
graph=graph_data_model,
title=title,
root=data_type,
use_annotations=use_annotations,
Expand All @@ -1536,11 +1545,11 @@ def create_single_manifest(jsonld: str, data_type: str, access_token:Optional[st
return result

@staticmethod
def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=None, dataset_ids:Optional[list]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", title:Optional[str]=None, strict:Optional[bool]=True, use_annotations:Optional[bool]=False) -> Union[List[str], List[pd.DataFrame], BinaryIO]:
def create_manifests(path_to_data_model:str, data_types:list, access_token:Optional[str]=None, dataset_ids:Optional[list]=None, output_format:Literal["google_sheet", "excel", "dataframe"]="google_sheet", title:Optional[str]=None, strict:Optional[bool]=True, use_annotations:Optional[bool]=False) -> Union[List[str], List[pd.DataFrame], BinaryIO]:
"""Create multiple manifests
Args:
jsonld (str): jsonld schema
path_to_data_model (str): str path to data model
data_type (list): a list of data types
access_token (str, optional): synapse access token. Required when getting an existing manifest. Defaults to None.
dataset_id (list, optional): a list of dataset ids when generating an existing manifest. Defaults to None.
Expand All @@ -1552,18 +1561,30 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non
Returns:
Union[List[str], List[pd.DataFrame], BinaryIO]: a list of Googlesheet URLs, a list of pandas dataframes or an Excel file.
"""
data_model_parser = DataModelParser(path_to_data_model = path_to_data_model)

#Parse Model
parsed_data_model = data_model_parser.parse_model()

# Instantiate DataModelGraph
data_model_grapher = DataModelGraph(parsed_data_model)

# Generate graph
graph_data_model = data_model_grapher.generate_data_model_graph()

# Gather all returned result urls
all_results = []
if data_types[0] == 'all manifests':
sg = SchemaGenerator(path_to_json_ld=jsonld)
component_digraph = sg.se.get_digraph_by_edge_type('requiresComponent')
dmge = DataModelGraphExplorer(graph_data_model)
component_digraph = dmge.get_digraph_by_edge_type('requiresComponent')
components = component_digraph.nodes()
for component in components:
if title:
t = f'{title}.{component}.manifest'
else:
t = f'Example.{component}.manifest'
if output_format != "excel":
result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=component, output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=component, graph_data_model=graph_data_model, output_format=output_format, title=t, access_token=access_token)
all_results.append(result)
else:
logger.error('Currently we do not support returning multiple files as Excel format at once. Please choose a different output format. ')
Expand All @@ -1578,9 +1599,9 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non
t = title
if dataset_ids:
# if a dataset_id is provided add this to the function call.
result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=dt, dataset_id=dataset_ids[i], output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=dt, graph_data_model=graph_data_model, dataset_id=dataset_ids[i], output_format=output_format, title=t, access_token=access_token, use_annotations=use_annotations)
else:
result = ManifestGenerator.create_single_manifest(jsonld=jsonld, data_type=dt, output_format=output_format, title=t, access_token=access_token, strict=strict, use_annotations=use_annotations)
result = ManifestGenerator.create_single_manifest(path_to_data_model=path_to_data_model, data_type=dt, graph_data_model=graph_data_model, output_format=output_format, title=t, access_token=access_token, use_annotations=use_annotations)

# if output is pandas dataframe or google sheet url
if isinstance(result, str) or isinstance(result, pd.DataFrame):
Expand All @@ -1589,6 +1610,7 @@ def create_manifests(jsonld:str, data_types:list, access_token:Optional[str]=Non
if len(data_types) > 1:
logger.warning(f'Currently we do not support returning multiple files as Excel format at once. Only {t} would get returned. ')
return result

return all_results


Expand Down Expand Up @@ -1632,7 +1654,7 @@ def get_manifest(

# Get manifest file associated with given dataset (if applicable)
# populate manifest with set of new files (if applicable)
manifest_record = store.updateDatasetManifestFiles(self.sg, datasetId = dataset_id, store = False)
manifest_record = store.updateDatasetManifestFiles(self.dmge, datasetId = dataset_id, store = False)

# get URL of an empty manifest file created based on schema component
empty_manifest_url = self.get_empty_manifest(strict=strict, sheet_url=True)
Expand Down Expand Up @@ -1869,9 +1891,9 @@ def sort_manifest_fields(self, manifest_fields, order="schema"):

# order manifest fields based on data-model schema
if order == "schema":
if self.sg and self.root:
if self.dmge and self.root:
# get display names of dependencies
dependencies_display_names = self.sg.get_node_dependencies(self.root)
dependencies_display_names = self.dmge.get_node_dependencies(self.root)

# reorder manifest fields so that root dependencies are first and follow schema order
manifest_fields = sorted(
Expand Down
Loading

0 comments on commit 2ade117

Please sign in to comment.