Fixes #12

opencitations · Apr 29, 2021 · 4afe7a5 · 4afe7a5
1 parent b1b3ae1
commit 4afe7a5
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 14 deletions.
diff --git a/oc_ocdm/storer.py b/oc_ocdm/storer.py
@@ -25,7 +25,7 @@
 from oc_ocdm.support.query_utils import get_update_query
 
 if TYPE_CHECKING:
-    from typing import Dict, List, Tuple, Any, Optional
+    from typing import Dict, List, Tuple, Any, Optional, Set
     from rdflib import URIRef
     from oc_ocdm.abstract_entity import AbstractEntity
     from oc_ocdm.abstract_set import AbstractSet
@@ -44,7 +44,18 @@ class Storer(object):
     def __init__(self, abstract_set: AbstractSet, repok: Reporter = None, reperr: Reporter = None,
                  context_map: Dict[str, Any] = None, default_dir: str = "_", dir_split: int = 0,
                  n_file_item: int = 1, output_format: str = "json-ld") -> None:
-        self.output_format: str = output_format
+        # We only accept format strings that:
+        # 1. are supported by rdflib
+        # 2. correspond to an output format which is effectively either NT or NQ
+        # The only exception to this rule is the 'json-ld' format, which is the default value of 'output_format'.
+        supported_formats: Set[str] = {'application/n-triples', 'ntriples', 'nt', 'nt11',
+                                       'application/n-quads', 'nquads', 'json-ld'}
+        if output_format not in supported_formats:  # 'self.output_format' is not assigned yet: report the argument
+            raise ValueError(f"Given output_format '{output_format}' is not supported."
+                             f" Available formats: {supported_formats}.")
+        else:
+            self.output_format: str = output_format
+
         self.dir_split: int = dir_split
         self.n_file_item: int = n_file_item
         self.default_dir: str = default_dir if default_dir != "" else "_"

diff --git a/oc_ocdm/support/support.py b/oc_ocdm/support/support.py
@@ -269,11 +269,6 @@ def find_paths(res: URIRef, base_dir: str, base_iri: str, default_dir: str, dir_
     """
     string_iri: str = str(res)
 
-    if is_json:
-        format_string: str = ".json"
-    else:
-        format_string: str = ".ttl"
-
     if is_dataset(res):
         cur_dir_path: str = (base_dir + re.sub(r"^%s(.*)$" % base_iri, r"\1", string_iri))[:-1]
         # In case of dataset, the file path is different from regular files, e.g.
@@ -305,49 +300,61 @@ def find_paths(res: URIRef, base_dir: str, base_iri: str, default_dir: str, dir_
                 subj_short_name: str = get_prov_subject_short_name(res)
                 short_name: str = get_short_name(res)
                 sub_folder: str = get_prov_subject_prefix(res)
+                file_extension: str = '.json' if is_json else '.nq'
                 if sub_folder == "":
                     sub_folder = default_dir
+                if sub_folder == "":
+                    sub_folder = "_"  # enforce default value
 
                 cur_dir_path: str = base_dir + subj_short_name + os.sep + sub_folder + \
                     os.sep + str(cur_split) + os.sep + str(cur_file_split) + os.sep + "prov"
-                cur_file_path: str = cur_dir_path + os.sep + short_name + format_string
+                cur_file_path: str = cur_dir_path + os.sep + short_name + file_extension
             else:  # regular bibliographic entity
                 short_name: str = get_short_name(res)
                 sub_folder: str = get_prefix(res)
+                file_extension: str = '.json' if is_json else '.nt'
                 if sub_folder == "":
                     sub_folder = default_dir
+                if sub_folder == "":
+                    sub_folder = "_"  # enforce default value
 
-                cur_dir_path: str = base_dir + short_name + os.sep + sub_folder + \
-                    os.sep + str(cur_split)
-                cur_file_path: str = cur_dir_path + os.sep + str(cur_file_split) + format_string
+                cur_dir_path: str = base_dir + short_name + os.sep + sub_folder + os.sep + str(cur_split)
+                cur_file_path: str = cur_dir_path + os.sep + str(cur_file_split) + file_extension
         # Enter here if no split is needed
         elif dir_split == 0:
             if "/prov/" in string_iri:
                 subj_short_name: str = get_prov_subject_short_name(res)
                 short_name: str = get_short_name(res)
                 sub_folder: str = get_prov_subject_prefix(res)
+                file_extension: str = '.json' if is_json else '.nq'
                 if sub_folder == "":
                     sub_folder = default_dir
+                if sub_folder == "":
+                    sub_folder = "_"  # enforce default value
 
                 cur_dir_path: str = base_dir + subj_short_name + os.sep + sub_folder + \
                     os.sep + str(cur_file_split) + os.sep + "prov"
-                cur_file_path: str = cur_dir_path + os.sep + short_name + format_string
+                cur_file_path: str = cur_dir_path + os.sep + short_name + file_extension
             else:
                 short_name: str = get_short_name(res)
                 sub_folder: str = get_prefix(res)
+                file_extension: str = '.json' if is_json else '.nt'
                 if sub_folder == "":
                     sub_folder = default_dir
+                if sub_folder == "":
+                    sub_folder = "_"  # enforce default value
 
                 cur_dir_path: str = base_dir + short_name + os.sep + sub_folder
-                cur_file_path: str = cur_dir_path + os.sep + str(cur_file_split) + format_string
+                cur_file_path: str = cur_dir_path + os.sep + str(cur_file_split) + file_extension
         # Enter here if the data is about a provenance agent, e.g. /corpus/prov/
         else:
             short_name: str = get_short_name(res)
             prefix: str = get_prefix(res)
             count: str = get_count(res)
+            file_extension: str = '.json' if is_json else '.nq'
 
             cur_dir_path: str = base_dir + short_name
-            cur_file_path: str = cur_dir_path + os.sep + prefix + count + format_string
+            cur_file_path: str = cur_dir_path + os.sep + prefix + count + file_extension
 
     return cur_dir_path, cur_file_path