From dd3a607d37eb70b185a3c5653bf7bb802afbdb0c Mon Sep 17 00:00:00 2001 From: Jonathan Yu <4723726+jyucsiro@users.noreply.github.com> Date: Wed, 20 Sep 2017 17:11:15 +1000 Subject: [PATCH 01/13] adding handling for outputting to file and default uris --- nc2rdf/nc2rdf.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nc2rdf/nc2rdf.py b/nc2rdf/nc2rdf.py index 120edac..57d69f8 100644 --- a/nc2rdf/nc2rdf.py +++ b/nc2rdf/nc2rdf.py @@ -6,12 +6,17 @@ import numpy as np import bald -def nc2rdf(ncfilename, outformat): +def nc2rdf(ncfilename, outformat, outputfile=None, uri=None): #print("nc2rdf test") #print(ncfile) - root_container = bald.load_netcdf(ncfilename) + root_container = bald.load_netcdf(ncfilename, uri=uri) ttl = root_container.rdfgraph().serialize(format=outformat).decode("utf-8") - print(ttl) + if(outputfile is None): + ttl = root_container.rdfgraph().serialize(format=outformat).decode("utf-8") + print(ttl) + else: + ttl = root_container.rdfgraph().serialize(destination=outputfile, format=outformat).decode("utf-8") + def cdl2rdf(cdl_file, outformat): #print("cdl2rdf test") From 61eb27983cdca69485e25fa57372ca925fab79cc Mon Sep 17 00:00:00 2001 From: Jonathan Yu <4723726+jyucsiro@users.noreply.github.com> Date: Wed, 20 Sep 2017 17:11:41 +1000 Subject: [PATCH 02/13] adding default prefixes handling --- lib/bald/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lib/bald/__init__.py b/lib/bald/__init__.py index 120300d..f1f07f4 100644 --- a/lib/bald/__init__.py +++ b/lib/bald/__init__.py @@ -568,6 +568,20 @@ def load_netcdf(afilepath, uri=None): identity = uri else: identity = 'root' + + # Ensure some well-known prefixes are loaded + default_prefixes = { + "bald__" : "http://binary-array-ld.net/latest/", + "rdf__" : "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs__" : "http://www.w3.org/2000/01/rdf-schema#", + "xml__" : "http://www.w3.org/XML/1998/namespace#", + "xsd__" : "http://www.w3.org/2001/XMLSchema#" + } + for p in default_prefixes: + if p not in prefixes: + prefixes[p] = default_prefixes[p] + + root_container = Container(identity, attrs, prefixes=prefixes, aliases=aliases) root_container.attrs['bald__contains'] = [] From dc1f020799968b95646b19fdb5816b8819e22b9c Mon Sep 17 00:00:00 2001 From: Jonathan Yu <4723726+jyucsiro@users.noreply.github.com> Date: Wed, 20 Sep 2017 17:12:21 +1000 Subject: [PATCH 03/13] adding requirements.txt and script to harvest thredds nc files and turn them into rdf --- nc2rdf/requirements.txt | 5 + nc2rdf/threddsnc2rdf.py | 256 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 nc2rdf/requirements.txt create mode 100755 nc2rdf/threddsnc2rdf.py diff --git a/nc2rdf/requirements.txt b/nc2rdf/requirements.txt new file mode 100644 index 0000000..ff9123f --- /dev/null +++ b/nc2rdf/requirements.txt @@ -0,0 +1,5 @@ +lxml +pydap +requests-futures +owslib +urllib3 diff --git a/nc2rdf/threddsnc2rdf.py b/nc2rdf/threddsnc2rdf.py new file mode 100755 index 0000000..a8fba1e --- /dev/null +++ b/nc2rdf/threddsnc2rdf.py @@ -0,0 +1,256 @@ +import nc2rdf +import re +import sys +import datetime +import argparse +from urlparse import urlparse +import uuid +try: + from urlparse import urljoin # Python2 + from urlparse import urlsplit, urlunsplit +except ImportError: + from urllib.parse import urljoin # Python3 + from urllib.parse import urlsplit, urlunsplit +import lxml +import json +import requests +from dateutil import parser +from pydap.client import open_url +import 
pydap.lib +from owslib.wms import WebMapService +from owslib.iso import * +import urllib +from timeit import default_timer as timer +import code, traceback, signal +import os + +pydap.lib.TIMEOUT = 5 + +OUTDIR = 'rdf' + +#Utility to allow debugger to attach to this program +def debug(sig, frame): + """Interrupt running process, and provide a python prompt for + interactive debugging.""" + d={'_frame':frame} # Allow access to frame object. + d.update(frame.f_globals) # Unless shadowed by global + d.update(frame.f_locals) + + i = code.InteractiveConsole(d) + message = "Signal received : entering python shell.\nTraceback:\n" + message += ''.join(traceback.format_stack(frame)) + i.interact(message) + +#Utility to allow debugger to attach to this program +def listen(): + signal.signal(signal.SIGUSR1, debug) # Register handler + +class ThreddsHarvester: + """Harvests metadata from a Thredds service""" + def lookup_datasets_in_catalog(self, base_url, catalog_url, list_of_netcdf_files): + """loads the catalog xml and extracts dataset access information""" + xml = lxml.etree.parse(catalog_url) + namespaces = {"xlink": "http://www.w3.org/1999/xlink", 'c':'http://www.unidata.ucar.edu/namespaces/thredds/InvCatalog/v1.0'} + access_infos = [] + used_types = [] + for node in xml.xpath('/c:catalog/c:service/c:service', namespaces=namespaces): + access_type = node.get('serviceType') + if access_type not in used_types: + used_types.append(access_type) + access_info = { "type" : access_type, "access" : node.get('base') } + access_infos.append(access_info) + + #print "b: " + base_url + #print "c: " + catalog_url + + open_dap_prefix = xml.xpath('/c:catalog/c:service/c:service[@serviceType="OPENDAP"]', namespaces=namespaces)[0].get('base') + iso_prefix_result = xml.xpath('/c:catalog/c:service/c:service[@serviceType="ISO"]', namespaces=namespaces) + wms_prefix_result = xml.xpath('/c:catalog/c:service/c:service[@serviceType="WMS"]', namespaces=namespaces) + + if len(wms_prefix_result) > 0: + wms_prefix = wms_prefix_result[0].get('base') + else: + wms_prefix = None + + if len(iso_prefix_result) > 0: + iso_prefix = iso_prefix_result[0].get('base') + else: + iso_prefix = None + + res = xml.xpath('/c:catalog/c:dataset/c:dataset|/c:catalog/c:dataset[@urlPath]|//c:catalogRef', namespaces=namespaces) + + for item in res: + if 'urlPath' in item.keys(): + url_path = item.attrib['urlPath'] + iso_path = base_url + iso_prefix + url_path if iso_prefix != None else None + wms_path = base_url + wms_prefix + url_path if wms_prefix != None else None + dataset_access_infos = [] + print "urlPath: " + url_path + for access_info in access_infos: + dataset_access_info = { "type" : access_info["type"], "access" : base_url + access_info["access"] + url_path } + dataset_access_infos.append(dataset_access_info) + datasetEntry = { 'name' : item.attrib['name'], 'open_dap_path': base_url + open_dap_prefix + url_path, 'iso_path': iso_path, 'wms_path': wms_path, 'access_infos' : dataset_access_infos } + list_of_netcdf_files.append(datasetEntry) + if '{http://www.w3.org/1999/xlink}href' in item.keys(): + newCatalogPath = item.attrib["{http://www.w3.org/1999/xlink}href"] + #print item + #print "baseUrl " + base_url + #print "href " + newCatalogPath + #print "catalogUrl " + catalog_url + newCatalogUrl = urljoin(catalog_url, newCatalogPath) + print "newCatalogUrl " + newCatalogUrl + if (newCatalogPath.endswith('catalog.xml')): + self.lookup_datasets_in_catalog(base_url, newCatalogUrl, list_of_netcdf_files) + elif (newCatalogPath.endswith('.xml')): 
+ self.lookup_datasets_in_catalog(base_url, catalog_url.replace('catalog.xml', newCatalogPath), list_of_netcdf_files) + return list_of_netcdf_files + +def get_opendap_record(dataset_url): + """Get the open dap record from the thredds service and look for the eReefs observed properties, build a record of these and return""" + data = {} + print dataset_url + datasetInformation = open_url(dataset_url) + for variable in datasetInformation.keys(): + variable_properties = datasetInformation[variable] + data[variable] = {} + list_attributes = variable_properties.attributes.keys() + for variable_attribute in list_attributes : + value = variable_properties.attributes[variable_attribute] + data[variable][variable_attribute] = value + return data + +def assign_url_later(assign_url_map, property_to_assign, url_to_assign): + """build a record of a url and which properties in the dataset record it exists on, also update a list of unique urls, assign_url_map holds this information for later use when url is resolved""" + if not url_to_assign in assign_url_map["unique_urls"]: + assign_url_map["url_property_map"][url_to_assign] = [] + assign_url_map["unique_urls"].append(url_to_assign) + assign_url_map["url_property_map"][url_to_assign].append(property_to_assign) + +class RecordManager: + """Abstract class for managing dataset record persistence""" + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + pass + + def should_update(self, dataset_address): + return True + + def assign_urls(self, assign_url_map): + """Resolve urls in the assign_url_map and, if redirected, update the dataset records to use the redirected url""" + failed_urls = [] + redirected_urls = [] + original_to_resolved_url = {} + url_contents = {} + + print assign_url_map + + +def process_dataset(assign_url_map, dataset_address, thredds_url, thredds_catalog_url): + """Extract information about the specific at dataset_address""" + print "processing dataset address: " + dataset_address['name'] + + + #Use multiple endpoints to get information about this dataset with redundancy + opendap_url = dataset_address['open_dap_path'] + iso_url = dataset_address['iso_path'] + wms_url = dataset_address['wms_path'] + "?service=WMS" + + #Common information across all variables in this dataset + common_info = {} + + opendap_information = {} + try: + opendap_information = get_opendap_record(opendap_url) + except Exception as e: + print "Exception caught in perform_harvest - get_opendap_ereefs(" + opendap_url + "): ", e.message + + common_info["access"] = dataset_address["access_infos"] + common_info["dataset_id"] = opendap_url + + #print opendap_information + #print json.dumps(opendap_information, check_circular=False, sort_keys=True, indent=4, separators=(',', ': '), default=datetime_handler) + + unique_dataset_id = uuid.uuid4().hex + outputpath = OUTDIR + "/" + unique_dataset_id + ".ttl" + print "emitting to " + outputpath + nc2rdf.nc2rdf(opendap_url, 'turtle', outputfile=outputpath, uri=opendap_url) + + +def perform_harvest(thredds_url, thredds_catalog_url): + """Perform harvest on the thredds_catalog_url""" + #Get dataset information + #print thredds_url + #print thredds_catalog_url + list_datasets_address = harvester.lookup_datasets_in_catalog(thredds_url, thredds_catalog_url, []) + #print list_datasets_address + dataset_uri = '' + + #Prepare a map for delayed resolution of redirected urls + assign_url_map = { "unique_urls" : [], "url_property_map": {} } + + #process other dataset information + for dataset_address in 
list_datasets_address: + try: + process_dataset(assign_url_map, dataset_address, thredds_url, thredds_catalog_url) + except requests.exceptions.HTTPError as e: + print "HTTPError caught in perform_harvest: ", e.message , " ", thredds_catalog_url + except AttributeError as e: + print "AttributeError caught in perform_harvest: ", e.message , " ", thredds_catalog_url + + +def process_thredds(thredds_url): + """harvest thredds endpoint""" + #assemble catalog url + + if (thredds_url.endswith('catalog.xml')): + thredds_catalog_url = thredds_url + thredds_url = get_base_url(thredds_url) + else: + thredds_catalog_url = thredds_url + '/catalog/catalog.xml' + print 'thredds_base_url:' + thredds_url + print 'thredds_catalog_url:' + thredds_catalog_url + perform_harvest(thredds_url, thredds_catalog_url) + + +def get_base_url(url): + split_url = urlsplit(url) + # You now have: + # split_url.scheme "http" + # split_url.netloc "127.0.0.1" + # split_url.path "/asdf/login.php" + # split_url.query "" + # split_url.fragment "" + + # urlunsplit takes and joins a five item iterable, we "" the last two items to remove the query string and fragment. + clean_url = urlunsplit((split_url.scheme, split_url.netloc, "", "", "")) + return clean_url + +def datetime_handler(x): + if isinstance(x, datetime.datetime): + return x.isoformat() + raise TypeError("Unknown type") + +def checkOrCreateDir(directory): + if not os.path.exists(directory): + os.makedirs(directory) + +if __name__ == '__main__': + start = timer() + + harvester = ThreddsHarvester() + + argparser = argparse.ArgumentParser() + argparser.add_argument('threddsUrlOrCatalog', help='THREDDS endpoint url or catalog.xml') + args = argparser.parse_args() + + #make sure outdir is created + checkOrCreateDir(OUTDIR) + + process_thredds(args.threddsUrlOrCatalog) + #process_dpn(dpn_url.strip(), DPN_ELDA_RES_ENDPOINT) + + end = timer() + elapsed = end - start + print "Execution took ", elapsed, " seconds" From 1b0168e1f8672831d57d8c5e904516727021c25e Mon Sep 17 00:00:00 2001 From: Jonathan Yu <4723726+jyucsiro@users.noreply.github.com> Date: Wed, 27 Sep 2017 15:23:24 +1000 Subject: [PATCH 04/13] Adding threddsnc2rdf cmd line tool and readme doc --- nc2rdf/README.md | 13 +++++++++++++ nc2rdf/threddsnc2rdf.py | 31 ++++++++++++++++++++++++++----- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/nc2rdf/README.md b/nc2rdf/README.md index 43db903..cb732e8 100644 --- a/nc2rdf/README.md +++ b/nc2rdf/README.md @@ -32,3 +32,16 @@ $ python nc2rdf.py -o xml myfile.nc ``` Note: This command-line tool is experimental and is subject to changes, however serves as a prototype for accessing bald functions for netCDF related files to RDF. + + +# thredds2rdf + +This tool allows users to input a THREDDS endpoint or THREDDS catalog.xml and get a set of RDF graphs returned for every nc file found. 
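+The harvester walks the catalog recursively, following nested `catalogRef`
+entries, resolves each dataset to its OPeNDAP access URL, and runs `nc2rdf`
+over it, emitting one RDF graph per netCDF file found, as shown below.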
+
+Example:
+```
+$ python threddsnc2rdf.py http://example.org/thredds
+$ python threddsnc2rdf.py http://example.org/thredds/catalog.xml
+```
+
+Output will be emitted to the `rdf` directory.
diff --git a/nc2rdf/threddsnc2rdf.py b/nc2rdf/threddsnc2rdf.py
index a8fba1e..347aee2 100755
--- a/nc2rdf/threddsnc2rdf.py
+++ b/nc2rdf/threddsnc2rdf.py
@@ -63,7 +63,12 @@ def lookup_datasets_in_catalog(self, base_url, catalog_url, list_of_netcdf_files
         #print "b: " + base_url
         #print "c: " + catalog_url
 
-        open_dap_prefix = xml.xpath('/c:catalog/c:service/c:service[@serviceType="OPENDAP"]', namespaces=namespaces)[0].get('base')
+        open_dap_result = xml.xpath('/c:catalog/c:service/c:service[@serviceType="opendap"]', namespaces=namespaces)
+        if len(open_dap_result) > 0:
+            open_dap_prefix = open_dap_result[0].get('base')
+        else:
+            open_dap_prefix = xml.xpath('/c:catalog/c:service[@serviceType="OPeNDAP"]', namespaces=namespaces)[0].get('base')
+
         iso_prefix_result = xml.xpath('/c:catalog/c:service/c:service[@serviceType="ISO"]', namespaces=namespaces)
         wms_prefix_result = xml.xpath('/c:catalog/c:service/c:service[@serviceType="WMS"]', namespaces=namespaces)
 
@@ -77,10 +82,25 @@ def lookup_datasets_in_catalog(self, base_url, catalog_url, list_of_netcdf_files
         else:
             iso_prefix = None
 
-        res = xml.xpath('/c:catalog/c:dataset/c:dataset|/c:catalog/c:dataset[@urlPath]|//c:catalogRef', namespaces=namespaces)
+        res = xml.xpath('/c:catalog/c:dataset/c:dataset|/c:catalog/c:dataset[@urlPath]|/c:catalog/c:dataset/c:dataset/c:access[@urlPath and @serviceName="dap"]|//c:catalogRef', namespaces=namespaces)
 
         for item in res:
-            if 'urlPath' in item.keys():
+            if 'urlPath' in item.keys() and 'serviceName' in item.keys():
+                # get the name from parent elem
+                parent = item.getparent()
+                name = parent.attrib['name']
+
+                url_path = item.attrib['urlPath']
+                iso_path = base_url + iso_prefix + url_path if iso_prefix != None else None
+                wms_path = base_url + wms_prefix + url_path if wms_prefix != None else None
+                dataset_access_infos = []
+                print "urlPath: " + url_path
+                for access_info in access_infos:
+                    dataset_access_info = { "type" : access_info["type"], "access" : base_url + access_info["access"] + url_path }
+                    dataset_access_infos.append(dataset_access_info)
+                datasetEntry = { 'name' : name, 'open_dap_path': base_url + open_dap_prefix + url_path, 'iso_path': iso_path, 'wms_path': wms_path, 'access_infos' : dataset_access_infos }
+                list_of_netcdf_files.append(datasetEntry)
+            elif 'urlPath' in item.keys():
                 url_path = item.attrib['urlPath']
                 iso_path = base_url + iso_prefix + url_path if iso_prefix != None else None
                 wms_path = base_url + wms_prefix + url_path if wms_prefix != None else None
@@ -155,7 +175,8 @@ def process_dataset(assign_url_map, dataset_address, thredds_url, thredds_catalo
     #Use multiple endpoints to get information about this dataset with redundancy
     opendap_url = dataset_address['open_dap_path']
     iso_url = dataset_address['iso_path']
-    wms_url = dataset_address['wms_path'] + "?service=WMS"
+    if 'wms_path' in dataset_address and dataset_address['wms_path'] != None:
+        wms_url = dataset_address['wms_path'] + "?service=WMS"
 
     #Common information across all variables in this dataset
     common_info = {}
@@ -164,7 +185,7 @@ def process_dataset(assign_url_map, dataset_address, thredds_url, thredds_catalo
     try:
         opendap_information = get_opendap_record(opendap_url)
     except Exception as e:
-        print "Exception caught in perform_harvest - get_opendap_ereefs(" + opendap_url + "): ", e.message
+        print "Exception caught in perform_harvest - 
get_opendap_record(" + opendap_url + "): ", e.message common_info["access"] = dataset_address["access_infos"] common_info["dataset_id"] = opendap_url From 930191549bf54dffcada15564f4d5da4ef2e1cfe Mon Sep 17 00:00:00 2001 From: Jonathan Yu Date: Mon, 19 Nov 2018 13:19:12 +1100 Subject: [PATCH 05/13] adding nc2schemaorg function --- nc2rdf/bald2schemaorg_mappings.json | 8 +++ nc2rdf/nc2rdf.py | 80 +++++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 nc2rdf/bald2schemaorg_mappings.json diff --git a/nc2rdf/bald2schemaorg_mappings.json b/nc2rdf/bald2schemaorg_mappings.json new file mode 100644 index 0000000..9eadd25 --- /dev/null +++ b/nc2rdf/bald2schemaorg_mappings.json @@ -0,0 +1,8 @@ +[ + { "bald" : "summary", "schemaorg": "description" }, + { "bald" : "title", "schemaorg": "name" }, + { "bald" : "id", "schemaorg": "identifier" }, + { "bald" : "keywords", "schemaorg": "keywords" }, + { "bald" : "license", "schemaorg": "license" }, + { "bald" : "standard_name", "schemaorg": "variableMeasured" } +] diff --git a/nc2rdf/nc2rdf.py b/nc2rdf/nc2rdf.py index 4a91e32..14848cd 100644 --- a/nc2rdf/nc2rdf.py +++ b/nc2rdf/nc2rdf.py @@ -5,14 +5,79 @@ import netCDF4 import numpy as np import bald +import rdflib +import json +from rdflib import Namespace, BNode, URIRef, Literal +from rdflib.namespace import RDF + + + +def getBasename(urlstr): + return os.path.basename(urlstr) + +def baldgraph2schemaorg(graph): + """ + Input: netCDF file + Transforms to a rdflib.Graph bald style + Returns a new graph in schema.org profile + """ + #load mappings + mapping_idx = {} + mapping_data = [] + with open('bald2schemaorg_mappings.json' , 'r') as f: + mapping_data = json.load(f) + + for item in mapping_data: + mapping_idx[item['bald']] = item['schemaorg'] + + qres = graph.query( + """PREFIX bald: + SELECT DISTINCT ?pred ?value + WHERE { + ?c a bald:Container . 
+ ?c ?pred ?value + }""") + + schema_g = rdflib.Graph() + container = BNode() + so = Namespace("http://schema.org/") + schema_g.add( (container, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), so.Dataset) ) + + for row in qres: + currField = getBasename(str(row[0])).strip() + #print(getBasename(str(row[0])) + ' (type: ' + str(type(row[0])) + ")" + " :: " + row[1] + ' (type: ' + str(type(row[1])) + ")") + if(currField in mapping_idx.keys()): + print('schemaorg:' + mapping_idx[currField], "\t", row[1]) + predUri = URIRef("http://schema.org/" + mapping_idx[currField]) + lit = Literal(row[1]) + schema_g.add( (container, predUri, lit) ) + return schema_g + +def nc2schemaorg(ncfilename, outformat, baseuri=None): + root_container = bald.load_netcdf(ncfilename, baseuri=baseuri) + graph = root_container.rdfgraph() + schema_g = baldgraph2schemaorg(graph) + + if(outformat == 'json-ld'): + context = "http://schema.org/" + s = schema_g.serialize(format=outformat, context=context, indent=4).decode("utf-8") + else: + s = schema_g.serialize(format=outformat).decode("utf-8") + print(s) def nc2rdf(ncfilename, outformat, baseuri=None): - #print("nc2rdf test") - #print(ncfile) root_container = bald.load_netcdf(ncfilename, baseuri=baseuri) ttl = root_container.rdfgraph().serialize(format=outformat).decode("utf-8") print(ttl) +def cdl2schemaorg(cdl_file, outformat, baseuri=None): + tfile, tfilename = tempfile.mkstemp('.nc') + subprocess.check_call(['ncgen', '-o', tfilename, cdl_file]) + schema_g = nc2schemaorg(tfilename, outformat, baseuri=baseuri) + os.close(tfile) + os.remove(tfilename) + return schema_g + def cdl2rdf(cdl_file, outformat, baseuri=None): #print("cdl2rdf test") #print(cdl_file) @@ -32,13 +97,20 @@ def cdl2rdf(cdl_file, outformat, baseuri=None): parser.add_argument('--baseuri', action="store", dest="baseuri", help="Base URI for the graph") parser.add_argument('--cdl', action="store_true", dest="isCDL", default=False, help="Flag to indicate file is CDL") parser.add_argument('--nc', action="store_true", dest="isNC", default=False, help="Flag to indicate file is netCDF") + parser.add_argument('--schema-org', action="store_true", dest="isSchemaOrgOutput", default=False, help="Flag to indicate if schema.org output activated") parser.add_argument("ncfile", help="Path for the netCDF file") args = parser.parse_args() if(args.isCDL or args.ncfile.endswith(".cdl") or args.ncfile.endswith('.CDL')): - cdl2rdf(args.ncfile, args.format, baseuri=args.baseuri) + if(args.isSchemaOrgOutput): + cdl2schemaorg(args.ncfile, args.format, baseuri=args.baseuri) + else: + cdl2rdf(args.ncfile, args.format, baseuri=args.baseuri) elif(args.isNC or args.ncfile.endswith(".nc") or args.ncfile.endswith('.NC')): - nc2rdf(args.ncfile, args.format, baseuri=args.baseuri) + if(args.isSchemaOrgOutput): + nc2schemaorg(args.ncfile, args.format, baseuri=args.baseuri) + else: + nc2rdf(args.ncfile, args.format, baseuri=args.baseuri) else: print("Unrecognised file suffix. 
Please indicate if CDL or NC via --cdl or --nc");

From 2678a9bc0b70d19546fe4c70d56f3ccab7c57270 Mon Sep 17 00:00:00 2001
From: Jonathan Yu
Date: Mon, 19 Nov 2018 13:23:19 +1100
Subject: [PATCH 06/13] add readme

---
 nc2rdf/README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/nc2rdf/README.md b/nc2rdf/README.md
index 43db903..40b54cd 100644
--- a/nc2rdf/README.md
+++ b/nc2rdf/README.md
@@ -31,4 +31,19 @@ $ python nc2rdf.py -o ttl myfile.nc
 $ python nc2rdf.py -o xml myfile.nc
 ```
 
+## nc2schemaorg
+
+This feature provides users with a way to create schema.org descriptions from
+ACDD/CF/NUG-conformant values in a netCDF file.
+
+```
+$ python nc2rdf.py -o json-ld --schema-org [cdl or nc file]
+```
+
+Example:
+```
+$ python nc2rdf.py -o json-ld --schema-org ../lib/bald/tests/integration/CDL/trajectoryProfile_template.cdl
+```
+
+
 Note: This command-line tool is experimental and is subject to changes, however serves as a prototype for accessing bald functions for netCDF related files to RDF.

From 974d7790ad22ebd8d5eff34fa56e6e9fc235d107 Mon Sep 17 00:00:00 2001
From: Jonathan Yu
Date: Mon, 19 Nov 2018 13:36:55 +1100
Subject: [PATCH 07/13] adding comments that this is a hack

---
 nc2rdf/nc2rdf.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/nc2rdf/nc2rdf.py b/nc2rdf/nc2rdf.py
index 14848cd..2f91ce0 100644
--- a/nc2rdf/nc2rdf.py
+++ b/nc2rdf/nc2rdf.py
@@ -21,6 +21,9 @@ def baldgraph2schemaorg(graph):
     Transforms to a rdflib.Graph bald style
     Returns a new graph in schema.org profile
     """
+    # HACK: The following mappings ignore prefixes as well as prefixes in nc file
+    # TODO: Fix references to prefixes/aliases proper
+
     #load mappings
     mapping_idx = {}
     mapping_data = []

From 80347e48284cf4de9cb00a29c900d6e66992a4d7 Mon Sep 17 00:00:00 2001
From: Jonathan Yu
Date: Tue, 20 Nov 2018 15:30:45 +1100
Subject: [PATCH 08/13] Commenting print

---
 nc2rdf/nc2rdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nc2rdf/nc2rdf.py b/nc2rdf/nc2rdf.py
index 2f91ce0..eae8c33 100644
--- a/nc2rdf/nc2rdf.py
+++ b/nc2rdf/nc2rdf.py
@@ -50,7 +50,7 @@ def baldgraph2schemaorg(graph):
         currField = getBasename(str(row[0])).strip()
         #print(getBasename(str(row[0])) + ' (type: ' + str(type(row[0])) + ")" + " :: " + row[1] + ' (type: ' + str(type(row[1])) + ")")
         if(currField in mapping_idx.keys()):
-            print('schemaorg:' + mapping_idx[currField], "\t", row[1])
+            #print('schemaorg:' + mapping_idx[currField], "\t", row[1])
             predUri = URIRef("http://schema.org/" + mapping_idx[currField])
             lit = Literal(row[1])
             schema_g.add( (container, predUri, lit) )

From 616e4716ee763d5da7073ccc05f7c4eeb754b05e Mon Sep 17 00:00:00 2001
From: Jonathan Yu
Date: Tue, 20 Nov 2018 17:15:23 +1100
Subject: [PATCH 09/13] enable nc2rdf for urls

---
 lib/bald/__init__.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/bald/__init__.py b/lib/bald/__init__.py
index 5f81270..f6c62f6 100644
--- a/lib/bald/__init__.py
+++ b/lib/bald/__init__.py
@@ -670,8 +670,10 @@ def load(afilepath):
         loader = netCDF4.Dataset
     else:
         raise ValueError('filepath suffix not supported: {}'.format(afilepath))
-    if not os.path.exists(afilepath):
-        raise IOError('{} not found'.format(afilepath))
+    #Disable this check for now to allow URL input
+    #TODO: Add feature to check both local files and files on the web, e.g. 
URLs + #if not os.path.exists(afilepath): + # raise IOError('{} not found'.format(afilepath)) try: f = loader(afilepath, "r") yield f From cd7b1db7a2adeee1e6dd480e0ed97debe7546c8d Mon Sep 17 00:00:00 2001 From: Jonathan Yu Date: Wed, 21 Nov 2018 10:20:57 +1100 Subject: [PATCH 10/13] adding feature to add url if input is a http/https url --- nc2rdf/nc2rdf.py | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/nc2rdf/nc2rdf.py b/nc2rdf/nc2rdf.py index eae8c33..96081dd 100644 --- a/nc2rdf/nc2rdf.py +++ b/nc2rdf/nc2rdf.py @@ -9,13 +9,24 @@ import json from rdflib import Namespace, BNode, URIRef, Literal from rdflib.namespace import RDF - - +try: + # python 3 + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +def isUrl(url): + try: + result = urlparse(url) + if all([result.scheme, result.netloc, result.path]) and (result.scheme == 'https' or result.scheme == 'http'): + return True + except: + return False def getBasename(urlstr): return os.path.basename(urlstr) -def baldgraph2schemaorg(graph): +def baldgraph2schemaorg(graph, path=None, baseuri=None): """ Input: netCDF file Transforms to a rdflib.Graph bald style @@ -42,16 +53,34 @@ def baldgraph2schemaorg(graph): }""") schema_g = rdflib.Graph() - container = BNode() + + if baseuri is not None: + container = URIRef(baseuri) + else: + container = BNode() + so = Namespace("http://schema.org/") schema_g.add( (container, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), so.Dataset) ) + if path is not None and isUrl(path): + predUri = URIRef("http://schema.org/url") + schema_g.add( (container, predUri, URIRef(path)) ) + for row in qres: currField = getBasename(str(row[0])).strip() #print(getBasename(str(row[0])) + ' (type: ' + str(type(row[0])) + ")" + " :: " + row[1] + ' (type: ' + str(type(row[1])) + ")") if(currField in mapping_idx.keys()): - #print('schemaorg:' + mapping_idx[currField], "\t", row[1]) predUri = URIRef("http://schema.org/" + mapping_idx[currField]) + if currField == 'keywords': + for x in row[1].split(','): + kw = x.strip() + if len(kw) == 0: + continue + lit = Literal(kw) + schema_g.add( (container, predUri, lit) ) + continue + + #print('schemaorg:' + mapping_idx[currField], "\t", row[1]) lit = Literal(row[1]) schema_g.add( (container, predUri, lit) ) return schema_g @@ -59,7 +88,7 @@ def baldgraph2schemaorg(graph): def nc2schemaorg(ncfilename, outformat, baseuri=None): root_container = bald.load_netcdf(ncfilename, baseuri=baseuri) graph = root_container.rdfgraph() - schema_g = baldgraph2schemaorg(graph) + schema_g = baldgraph2schemaorg(graph, path=ncfilename, baseuri=baseuri) if(outformat == 'json-ld'): context = "http://schema.org/" From b85819a3e82999fec4bb3a0735b67e756cf422a5 Mon Sep 17 00:00:00 2001 From: Jonathan Yu Date: Wed, 21 Nov 2018 14:15:50 +1100 Subject: [PATCH 11/13] using bnode as id --- nc2rdf/threddsnc2rdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nc2rdf/threddsnc2rdf.py b/nc2rdf/threddsnc2rdf.py index 1ad23d5..22ea58f 100755 --- a/nc2rdf/threddsnc2rdf.py +++ b/nc2rdf/threddsnc2rdf.py @@ -213,7 +213,7 @@ def process_dataset(assign_url_map, dataset_address, thredds_url, thredds_catalo unique_dataset_id = uuid.uuid4().hex outputpath = OUTDIR + "/" + unique_dataset_id + ".json" print("emitting to " + outputpath) - nc2rdf.nc2schemaorg(opendap_url, outputformat, outputfile=outputpath, baseuri=opendap_url) + nc2rdf.nc2schemaorg(opendap_url, outputformat, outputfile=outputpath, 
baseuri=None) elif outputformat == 'turtle': unique_dataset_id = uuid.uuid4().hex outputpath = OUTDIR + "/" + unique_dataset_id + ".ttl" From 40960b429e0615a1fc70c1faeb21bf18a9b5b68e Mon Sep 17 00:00:00 2001 From: Jonathan Yu Date: Wed, 21 Nov 2018 14:19:05 +1100 Subject: [PATCH 12/13] using bnode instead --- nc2rdf/threddsnc2rdf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nc2rdf/threddsnc2rdf.py b/nc2rdf/threddsnc2rdf.py index 22ea58f..d16a2a1 100755 --- a/nc2rdf/threddsnc2rdf.py +++ b/nc2rdf/threddsnc2rdf.py @@ -227,6 +227,9 @@ def perform_harvest(thredds_url, thredds_catalog_url, outputformat='turtle', isS #print thredds_url #print thredds_catalog_url list_datasets_address = harvester.lookup_datasets_in_catalog(thredds_url, thredds_catalog_url, []) + if list_datasets_address is None: + return + #print list_datasets_address dataset_uri = '' From 17eb6ea7ee518ecae605abafee0cc74e83c4a3bb Mon Sep 17 00:00:00 2001 From: Jonathan Yu Date: Tue, 18 Dec 2018 08:18:45 +1100 Subject: [PATCH 13/13] removing unused dependencies --- nc2rdf/requirements.txt | 3 --- nc2rdf/threddsnc2rdf.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/nc2rdf/requirements.txt b/nc2rdf/requirements.txt index a0c75c8..fc66813 100644 --- a/nc2rdf/requirements.txt +++ b/nc2rdf/requirements.txt @@ -1,6 +1,3 @@ lxml pydap -requests-futures -owslib urllib3 -python-dateutil diff --git a/nc2rdf/threddsnc2rdf.py b/nc2rdf/threddsnc2rdf.py index d16a2a1..2076d62 100755 --- a/nc2rdf/threddsnc2rdf.py +++ b/nc2rdf/threddsnc2rdf.py @@ -15,11 +15,8 @@ import lxml import json import requests -from dateutil import parser from pydap.client import open_url import pydap.lib -from owslib.wms import WebMapService -from owslib.iso import * import urllib from timeit import default_timer as timer import code, traceback, signal
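
For reference, a minimal sketch of driving the entry points added in this series directly from Python rather than via the command line. It assumes the patched `nc2rdf.py` is importable and is run from the `nc2rdf/` directory (so `bald2schemaorg_mappings.json` resolves); `example.nc` and the base URI are placeholder names:

```
# Sketch only: exercises the functions added in patches 01, 05 and 10.
# Run from the nc2rdf/ directory; 'example.nc' is a hypothetical local file.
import nc2rdf

# Serialize the bald-style RDF graph for a netCDF file as Turtle on stdout.
nc2rdf.nc2rdf('example.nc', 'turtle', baseuri='http://example.org/data/example.nc')

# Map ACDD/CF/NUG attributes (summary, title, keywords, ...) onto a
# schema.org Dataset description and print it as JSON-LD, driven by
# bald2schemaorg_mappings.json.
nc2rdf.nc2schemaorg('example.nc', 'json-ld', baseuri='http://example.org/data/example.nc')
```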