From 524a62d9d7a0fd901061eeaf6e93ec20424e8eb2 Mon Sep 17 00:00:00 2001 From: Douglas Fils Date: Fri, 25 Oct 2024 13:33:08 -0500 Subject: [PATCH 1/5] Mostly simple path and configuration edits to test OIH indexing with. gleanerio.py and utils.py have small edits to address a version change and also path check for a deployment edge case --- dagster/dagster_home/.gitkeep | 1 - dagster/dagster_home/dagster.yaml | 2 +- .../implnets/configs/oihv2/gleanerconfig.yaml | 60 +++++++++++ .../implnets/configs/oihv2/nabuconfig.yaml | 77 ++++++++++++++ dagster/implnets/configs/oihv2/tenant.yaml | 27 +++++ dagster/implnets/configs/oihv2/workspace.yaml | 28 +++++ .../deployment/compose_no_routing.yaml | 100 ++++++++++++++++++ .../ingest/ingest/resources/gleanerio.py | 2 +- .../implnets/workflows/ingest/ingest/utils.py | 4 +- 9 files changed, 297 insertions(+), 4 deletions(-) delete mode 100644 dagster/dagster_home/.gitkeep create mode 100644 dagster/implnets/configs/oihv2/gleanerconfig.yaml create mode 100644 dagster/implnets/configs/oihv2/nabuconfig.yaml create mode 100644 dagster/implnets/configs/oihv2/tenant.yaml create mode 100644 dagster/implnets/configs/oihv2/workspace.yaml create mode 100644 dagster/implnets/deployment/compose_no_routing.yaml diff --git a/dagster/dagster_home/.gitkeep b/dagster/dagster_home/.gitkeep deleted file mode 100644 index 79083c9a..00000000 --- a/dagster/dagster_home/.gitkeep +++ /dev/null @@ -1 +0,0 @@ -This is a place where dagster.yamls can be kept for runs diff --git a/dagster/dagster_home/dagster.yaml b/dagster/dagster_home/dagster.yaml index 35033656..48499f9a 100644 --- a/dagster/dagster_home/dagster.yaml +++ b/dagster/dagster_home/dagster.yaml @@ -2,7 +2,7 @@ local_artifact_storage: module: dagster.core.storage.root class: LocalArtifactStorage config: - base_dir: /Users/valentin/development/dev_earthcube/scheduler/dagster/dagster_home/ + base_dir: /home/fils/src/Projects/earthcube/scheduler/dagster/dagster_home/ run_coordinator: module: 
dagster.core.run_coordinator class: QueuedRunCoordinator diff --git a/dagster/implnets/configs/oihv2/gleanerconfig.yaml b/dagster/implnets/configs/oihv2/gleanerconfig.yaml new file mode 100644 index 00000000..a825c6eb --- /dev/null +++ b/dagster/implnets/configs/oihv2/gleanerconfig.yaml @@ -0,0 +1,60 @@ +--- +minio: + address: + port: + accessKey: + secretKey: + ssl: + bucket: +gleaner: + runid: oih # this will be the bucket the output is placed in... + summon: true # do we want to visit the web sites and pull down the files + mill: false +context: + cache: true +contextmaps: + - prefix: "https://schema.org/" + file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld + - prefix: "http://schema.org/" + file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld +summoner: + after: "" # "21 May 20 10:00 UTC" + mode: full # full || diff: If diff compare what we have currently in gleaner to sitemap, get only new, delete missing + threads: 5 + delay: # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1) + headless: http://workstation.lan:9222 # URL for headless see docs/headless +millers: + graph: true +sources: + - sourcetype: sitemap + name: cioosatlantic + logo: https://cioosatlantic.ca/wp-content/themes/cioos-siooc-wordpress-theme/img/atlantic/cioos-atlantic_EN.svg?x79655 + url: https://catalogue.cioosatlantic.ca/sitemap/sitemap.xml + headless: false + pid: "" + propername: CIOOS Atlantic + domain: https://cioosatlantic.ca + active: true + credentialsfile: "" + other: { } + headlesswait: 0 + delay: 0 + identifierpath: "" + apipagelimit: 0 + identifiertype: identifiersha + fixcontextoption: 1 + acceptcontenttype: application/ld+json, text/html + jsonprofile: application/ld+json + - name: euroceanexperts + propername: EurOcean Experts + domain: https://infohub.eurocean.net/ + catalogue: https://infohub.eurocean.net/data/experts + logo: 
https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png + #ODISCat entry missing ODIS-arch url & type + pid: https://catalogue.odis.org/view/2993 + sourcetype: sitemap + url: https://www.oceanexpert.org/assets/sitemaps/sitemapExperts.xml + backend: Unknown + headless: false + dateadded: 2022-03-26 + active: true \ No newline at end of file diff --git a/dagster/implnets/configs/oihv2/nabuconfig.yaml b/dagster/implnets/configs/oihv2/nabuconfig.yaml new file mode 100644 index 00000000..3218d9a3 --- /dev/null +++ b/dagster/implnets/configs/oihv2/nabuconfig.yaml @@ -0,0 +1,77 @@ +minio: + bucket: + address: + port: + accesskey: + secretkey: + ssl: true +context: + cache: true + strict: false +contextmaps: + - prefix: "https://schema.org/" + file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld + - prefix: "http://schema.org/" + file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld +implementation_network: + orgname: eco +endpoints: + - service: ec_blazegraph + baseurl: https://graph.geocodes-aws-dev.earthcube.org/blazegraph/namespace/test + type: blazegraph + authenticate: false + username: + password: + modes: + - action: sparql + suffix: /sparql + accept: application/sparql-results+json + method: GET + - action: update + suffix: /sparql + accept: application/sparql-update + method: POST + - action: bulk + suffix: /sparql + accept: text/x-nquads + method: POST +objects: + domain: us-east-1 + prefix: + - summoned/aquadocs + - summoned/bcodmo + - summoned/cchdo + - summoned/earthchem + - summoned/edi + - summoned/hydroshare + - summoned/linkedearth + - summoned/magic + - summoned/opentopography + - summoned/r2r + - summoned/ssdbiodp + - summoned/unavco + - summoned/glim + - summoned/gpp + - summoned/nitrogen + - summoned/nitrogen2 + - summoned/hydrography90m + - summoned/neon4cast + - summoned/usgsrc4cast + - summoned/vera4cast + - summoned/osmc + - summoned/obis 
+ - summoned/geochemistry_custom + - prov/aquadocs + - prov/bcodmo + - prov/cchdo + - prov/earthchem + - prov/edi + - prov/hydroshare + - prov/linkedearth + - prov/magic + - prov/opentopography + - prov/r2r + - prov/ssdbiodp + - prov/unavco + + diff --git a/dagster/implnets/configs/oihv2/tenant.yaml b/dagster/implnets/configs/oihv2/tenant.yaml new file mode 100644 index 00000000..ec5e2ca5 --- /dev/null +++ b/dagster/implnets/configs/oihv2/tenant.yaml @@ -0,0 +1,27 @@ +# prototype tennants file + +tenant: + - community: dev + hostname: geocodes-dev + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: test + summary_namespace: test_summary + sources: + - cioosatlantic +###### + - community: geocodesall + hostname: geocodes-all + description: GeoCodes is... + name: Geocodes Science on Schema + url: https://www.earthcube.org + logo: https://unsplash.com/random + graph: + main_namespace: geocodes_test + summary_namespace: geocodes_test_summary + sources: + - all + diff --git a/dagster/implnets/configs/oihv2/workspace.yaml b/dagster/implnets/configs/oihv2/workspace.yaml new file mode 100644 index 00000000..747fd3ef --- /dev/null +++ b/dagster/implnets/configs/oihv2/workspace.yaml @@ -0,0 +1,28 @@ +load_from: +# - python_file: +# relative_path: "project/eco/repositories/repository.py" +# location_name: project +# working_directory: "./project/eco/" +# - python_file: +# relative_path: "workflows/ecrr/repositories/repository.py" +# working_directory: "./workflows/ecrr/" + # module starting out with the definitions api + # - python_module: "workflows.tasks.tasks" + - + + - grpc_server: + host: dagster-code-tasks + port: 4000 + location_name: "tasks" + - grpc_server: + host: dagster-code-ingest + port: 4000 + location_name: "ingest" +# - grpc_server: +# host: dagster-code-project +# port: 4000 +# location_name: "project_grpc" + - grpc_server: + host: 
dagster-code-eco-ecrr + port: 4000 + location_name: "ecrr" diff --git a/dagster/implnets/deployment/compose_no_routing.yaml b/dagster/implnets/deployment/compose_no_routing.yaml new file mode 100644 index 00000000..d8983849 --- /dev/null +++ b/dagster/implnets/deployment/compose_no_routing.yaml @@ -0,0 +1,100 @@ +version: "3.9" + +# ########### +# This is for a single dagster instance, that does not use an externally defined network +######## + +# this NEEDS +# $HOST +# $PROJECT default eco +# CONTAINER_TAG default latest + +networks: + traefik_proxy: + driver: overlay + name: traefik-${PROJECT:-eco} + attachable: true +volumes: + dagster-postgres: + driver: local +secrets: + MINIO_ROOT_ACCESS_KEY: + external: true + MINIO_ROOT_SECRET_KEY: + external: true + +services: + dagster-dagit: + image: docker.io/nsfearthcube/dagster-${PROJECT:-eco}:${CONTAINER_TAG:-latest} + + secrets: + - MINIO_ROOT_ACCESS_KEY + - MINIO_ROOT_SECRET_KEY + environment: &env + - DEBUG=${DEBUG:-false} + - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + - PORTAINER_URL=${PORTAINER_URL} + - PORTAINER_KEY=${PORTAINER_KEY} + - GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE} + - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} + - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX} + - GLEANER_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS} + - GLEANER_MINIO_PORT=${GLEANERIO_MINIO_PORT} + - GLEANER_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} + - GLEANER_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET} + - GLEANER_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} + - GLEANER_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY} + - GLEANER_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} + - GLEANER_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK} + - GLEANER_GRAPH_URL=${GLEANERIO_GRAPH_URL} + - GLEANER_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml} + - 
GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} + - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} + - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} + ports: + - 3000:3000 + networks: + - traefik_proxy + - dagster_host + depends_on: + - dagster-postgres + + dagster-daemon: + image: docker.io/nsfearthcube/dagster-${PROJECT:-eco}:${CONTAINER_TAG:-latest} + secrets: + - MINIO_ROOT_ACCESS_KEY + - MINIO_ROOT_SECRET_KEY + environment: *env + + command: "dagster-daemon run" + depends_on: + - dagster-postgres + networks: + - dagster_host + + + dagster-postgres: + image: postgres:13.3 + ports: + - 5432:5432 + environment: + - POSTGRES_PASSWORD=secret + volumes: + - dagster-postgres:/var/lib/postgresql/data + networks: + - traefik_proxy + + + headless: + # image: chromedp/headless-shell:stable + # stable after 105 causes "devtool: CreateURL: Using unsafe HTTP verb GET to invoke /json/new. This action supports only PUT verb.", + image: chromedp/headless-shell:105.0.5195.127 + restart: unless-stopped + shm_size: "2gb" + ports: + - 9222:9222 + environment: + - SERVICE_PORTS=9222 + networks: + - traefik_proxy diff --git a/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py b/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py index 937d3c0d..bda3276e 100644 --- a/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py +++ b/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py @@ -132,7 +132,7 @@ class GleanerioResource(ConfigurableResource): def _get_client(self, docker_container_context: DockerContainerContext): headers = {'X-API-Key': self.GLEANERIO_PORTAINER_APIKEY} - client = docker.DockerClient(base_url=self.GLEANERIO_DOCKER_URL, version="1.43") + client = docker.DockerClient(base_url=self.GLEANERIO_DOCKER_URL, version="1.47") # client = docker.APIClient(base_url=URL, version="1.35") get_dagster_logger().info(f"create docker 
client") if (client.api._general_configs): diff --git a/dagster/implnets/workflows/ingest/ingest/utils.py b/dagster/implnets/workflows/ingest/ingest/utils.py index df35ee78..e63209e4 100644 --- a/dagster/implnets/workflows/ingest/ingest/utils.py +++ b/dagster/implnets/workflows/ingest/ingest/utils.py @@ -5,6 +5,8 @@ def PythonMinioAddress(url, port=None): PYTHON_MINIO_URL = "s3.amazonaws.com" else: PYTHON_MINIO_URL = url - if port is not None: + + if port is not None and port != "": PYTHON_MINIO_URL = f"{PYTHON_MINIO_URL}:{port}" + return PYTHON_MINIO_URL From 70a7281a8190cd8bbb55d3a0e4c48bc762748a04 Mon Sep 17 00:00:00 2001 From: Douglas Fils Date: Mon, 28 Oct 2024 11:12:55 -0500 Subject: [PATCH 2/5] Remove obsolete oihv2 configuration files Deleted the outdated gleanerconfig.yaml and workspace.yaml files from the oihv2 directory. These files are no longer needed and their removal helps to clean up the project structure. --- .../implnets/configs/oihv2/gleanerconfig.yaml | 60 ------------------- dagster/implnets/configs/oihv2/workspace.yaml | 28 --------- 2 files changed, 88 deletions(-) delete mode 100644 dagster/implnets/configs/oihv2/gleanerconfig.yaml delete mode 100644 dagster/implnets/configs/oihv2/workspace.yaml diff --git a/dagster/implnets/configs/oihv2/gleanerconfig.yaml b/dagster/implnets/configs/oihv2/gleanerconfig.yaml deleted file mode 100644 index a825c6eb..00000000 --- a/dagster/implnets/configs/oihv2/gleanerconfig.yaml +++ /dev/null @@ -1,60 +0,0 @@ ---- -minio: - address: - port: - accessKey: - secretKey: - ssl: - bucket: -gleaner: - runid: oih # this will be the bucket the output is placed in... 
- summon: true # do we want to visit the web sites and pull down the files - mill: false -context: - cache: true -contextmaps: - - prefix: "https://schema.org/" - file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld - - prefix: "http://schema.org/" - file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld -summoner: - after: "" # "21 May 20 10:00 UTC" - mode: full # full || diff: If diff compare what we have currently in gleaner to sitemap, get only new, delete missing - threads: 5 - delay: # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1) - headless: http://workstation.lan:9222 # URL for headless see docs/headless -millers: - graph: true -sources: - - sourcetype: sitemap - name: cioosatlantic - logo: https://cioosatlantic.ca/wp-content/themes/cioos-siooc-wordpress-theme/img/atlantic/cioos-atlantic_EN.svg?x79655 - url: https://catalogue.cioosatlantic.ca/sitemap/sitemap.xml - headless: false - pid: "" - propername: CIOOS Atlantic - domain: https://cioosatlantic.ca - active: true - credentialsfile: "" - other: { } - headlesswait: 0 - delay: 0 - identifierpath: "" - apipagelimit: 0 - identifiertype: identifiersha - fixcontextoption: 1 - acceptcontenttype: application/ld+json, text/html - jsonprofile: application/ld+json - - name: euroceanexperts - propername: EurOcean Experts - domain: https://infohub.eurocean.net/ - catalogue: https://infohub.eurocean.net/data/experts - logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png - #ODISCat entry missing ODIS-arch url & type - pid: https://catalogue.odis.org/view/2993 - sourcetype: sitemap - url: https://www.oceanexpert.org/assets/sitemaps/sitemapExperts.xml - backend: Unknown - headless: false - dateadded: 2022-03-26 - active: true \ No newline at end of file diff --git a/dagster/implnets/configs/oihv2/workspace.yaml b/dagster/implnets/configs/oihv2/workspace.yaml deleted file 
mode 100644 index 747fd3ef..00000000 --- a/dagster/implnets/configs/oihv2/workspace.yaml +++ /dev/null @@ -1,28 +0,0 @@ -load_from: -# - python_file: -# relative_path: "project/eco/repositories/repository.py" -# location_name: project -# working_directory: "./project/eco/" -# - python_file: -# relative_path: "workflows/ecrr/repositories/repository.py" -# working_directory: "./workflows/ecrr/" - # module starting out with the definitions api - # - python_module: "workflows.tasks.tasks" - - - - - grpc_server: - host: dagster-code-tasks - port: 4000 - location_name: "tasks" - - grpc_server: - host: dagster-code-ingest - port: 4000 - location_name: "ingest" -# - grpc_server: -# host: dagster-code-project -# port: 4000 -# location_name: "project_grpc" - - grpc_server: - host: dagster-code-eco-ecrr - port: 4000 - location_name: "ecrr" From ff8df0a204657ce4b22edb65c27479bfc7c93fd3 Mon Sep 17 00:00:00 2001 From: Douglas Fils Date: Sat, 2 Nov 2024 17:33:14 -0500 Subject: [PATCH 3/5] Remove tenant.yaml and refactor YAML configurations. This commit deletes the unused tenant.yaml file, updates Docker Compose configurations to streamline networking and environment variables, and adjusts several python files for error handling. Additionally, multiple YAML configurations have been cleaned up, optimizing schema and source handling settings. 
--- .../implnets/configs/oih/gleanerconfig.yaml | 528 ++-------- .../configs/oih/gleanerconfig_full.yaml | 899 ++++++++++++++++++ .../configs/{oihv2 => oih}/nabuconfig.yaml | 4 +- dagster/implnets/configs/oih/tenant.yaml | 27 + dagster/implnets/configs/oih/workspace.yaml | 34 +- dagster/implnets/configs/oihv2/tenant.yaml | 27 - .../deployment/compose_no_routing.yaml | 179 ++-- .../ingest/ingest/assets/gleaner_sources.py | 23 +- .../ingest/assets/gleaner_summon_assets.py | 26 +- .../ingest/ingest/jobs/summon_assets.py | 6 +- .../ingest/ingest/resources/gleanerio.py | 2 +- ...n.xml => dagster_ingest_debug_oih.run.xml} | 7 +- 12 files changed, 1140 insertions(+), 622 deletions(-) create mode 100644 dagster/implnets/configs/oih/gleanerconfig_full.yaml rename dagster/implnets/configs/{oihv2 => oih}/nabuconfig.yaml (94%) create mode 100644 dagster/implnets/configs/oih/tenant.yaml delete mode 100644 dagster/implnets/configs/oihv2/tenant.yaml rename runConfigurations/{dagster_ingest_debug (1).run.xml => dagster_ingest_debug_oih.run.xml} (84%) diff --git a/dagster/implnets/configs/oih/gleanerconfig.yaml b/dagster/implnets/configs/oih/gleanerconfig.yaml index 84524db6..19e00e36 100644 --- a/dagster/implnets/configs/oih/gleanerconfig.yaml +++ b/dagster/implnets/configs/oih/gleanerconfig.yaml @@ -1,11 +1,11 @@ --- minio: - address: - port: + address: + port: accessKey: secretKey: - ssl: - bucket: oih + ssl: + bucket: gleaner: runid: oih # this will be the bucket the output is placed in... 
summon: true # do we want to visit the web sites and pull down the files @@ -13,474 +13,64 @@ gleaner: context: cache: true contextmaps: -- prefix: "https://schema.org/" - file: "./jsonldcontext.json" # wget http://schema.org/docs/jsonldcontext.jsonld -- prefix: "http://schema.org/" - file: "./jsonldcontext.json" # wget http://schema.org/docs/jsonldcontext.jsonld + - prefix: "https://schema.org/" + file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld + - prefix: "http://schema.org/" + file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld summoner: - after: "" # "21 May 20 10:00 UTC" + after: "" # "21 May 20 10:00 UTC" mode: full # full || diff: If diff compare what we have currently in gleaner to sitemap, get only new, delete missing threads: 5 - delay: # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1) - headless: http://0.0.0.0:9222 # URL for headless see docs/headless + delay: # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1) + headless: http://workstation.lan:9222 # URL for headless see docs/headless millers: graph: true sources: -# -# AquaDocs -# -- name: aquadocs - propername: AquaDocs - catalogue: https://aquadocs.org/discover - domain: https://aquadocs.org - logo: https://aquadocs.org/themes/OR//images/repo-logo.png - pid: https://catalogue.odis.org/view/3215 - sourcetype: sitegraph - url: https://oih.aquadocs.org/aquadocs.json - backend: OAI - headless: false - dateadded: 2021-02-26 - active: true -# -# Benguela Current Convention (BCC) GeoData Portal -# -- name: benguelacc - propername: Benguela Current Convention (BCC) GeoData Portal - catalogue: https://geodata.benguelacc.org/ - domain: https://www.benguelacc.org/ - logo: https://static.tildacdn.com/tild3263-3132-4937-a166-373963646533/Skjermbilde_2020-12-.png - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/benguelacc - 
sourcetype: sitegraph - url: https://raw.githubusercontent.com/iodepo/odis-arch/schema-dev-jm/code/notebooks/Exploration/data-benguelacc/benguelacc-simple-graph.json - backend: GeoNode - headless: false - dateadded: 2022-04-13 - active: true -# -# Caribbean Marine Atlas catalogue -# -- name: caribbeanmarineatlas - propername: Caribbean Marine Atlas catalogue - catalogue: https://www.caribbeanmarineatlas.net/home/ - domain: https://www.caribbeanmarineatlas.net/ - logo: https://www.caribbeanmarineatlas.net/static/cma2/landing_page/images/logotipo_CMA_2019.svg - pid: https://catalogue.odis.org/view/616 - sourcetype: sitegraph - url: https://raw.githubusercontent.com/iodepo/odis-arch/schema-dev-jm/code/notebooks/Exploration/data-caribbeanmarineatlas/caribbeanmarineatlas-simple-graph.json - backend: GeoNode - headless: false - dateadded: 2022-04-13 - active: true -# -# CIOOS Atlantic -# -- name: cioosatlantic - propername: CIOOS Atlantic - catalogue: https://catalogue.cioosatlantic.ca/ - domain: https://cioosatlantic.ca - logo: https://cioosatlantic.ca/wp-content/themes/cioos-siooc-wordpress-theme/img/atlantic/cioos-atlantic_EN.svg?x79655 - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/cioosatlantic - sourcetype: sitemap - url: https://catalogue.cioosatlantic.ca/sitemap/sitemap.xml - #url: https://catalogue.cioosatlantic.ca/sitemap/sitemap-1.xml - backend: CKAN - headless: true - dateadded: 2022-06-04 - active: true -# -# EDMERP SeaDataNet -# -- name: edmerp - propername: European Directory of Marine Environmental Research Projects (EDMERP) - SeaDataNet - catalogue: https://edmerp.seadatanet.org/search - domain: https://edmerp.seadatanet.org - logo: https://edmerp.seadatanet.org/grfx/edmerp/logo_big.png - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/edmerp - sourcetype: sitemap - url: https://edmerp.seadatanet.org/sitemap.xml - backend: SeaDataCloud - headless: false - dateadded: 2021-07-26 - active: true -# -# EDMO 
SeaDataNet -# -- name: edmo - propername: European Directory of Marine Organisations (EDMO) SeaDataNet - catalogue: https://edmo.seadatanet.org/search - domain: https://edmo.seadatanet.org - logo: https://edmo.seadatanet.org/grfx/edmo/logo.png - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/edmo - sourcetype: sitemap - url: https://edmo.seadatanet.org/sitemap.xml - backend: SeaDataCloud - dateadded: 2021-07-26 - headless: false - active: true -# -# EurOcean Events -# -- name: euroceanevents - propername: EurOcean Events - domain: https://infohub.eurocean.net/ - catalogue: https://infohub.eurocean.net/data/events - logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png - #ODISCat entry missing ODIS-arch url & type - pid: https://catalogue.odis.org/view/2993 - sourcetype: sitemap - url: https://www.oceanexpert.org/assets/sitemaps/sitemapEvents.xml - backend: Unknown - headless: false - dateadded: 2022-03-26 - active: true -# -# EurOcean Experts -# -- name: euroceanexperts - propername: EurOcean Experts - domain: https://infohub.eurocean.net/ - catalogue: https://infohub.eurocean.net/data/experts - logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png - #ODISCat entry missing ODIS-arch url & type - pid: https://catalogue.odis.org/view/2993 - sourcetype: sitemap - url: https://www.oceanexpert.org/assets/sitemaps/sitemapExperts.xml - backend: Unknown - headless: false - dateadded: 2022-03-26 - active: true -# -# EurOcean Institutions -# -- name: euroceaninstitutions - propername: EurOcean Institutions - domain: https://infohub.eurocean.net/ - catalogue: https://infohub.eurocean.net/data/institutions - logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png - #ODISCat entry missing ODIS-arch url & type - pid: https://catalogue.odis.org/view/2993 - sourcetype: sitemap - url: https://www.oceanexpert.org/assets/sitemaps/sitemapInstitutions.xml - backend: Unknown - 
headless: false - dateadded: 2022-03-26 - active: true -# -# EurOcean Organizations -# -- name: euroceanorgs - propername: EurOcean Organizations - domain: https://infohub.eurocean.net/ - catalogue: https://infohub.eurocean.net/data/organizations - logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png - #ODISCat entry missing ODIS-arch url & type - pid: https://catalogue.odis.org/view/2993 - sourcetype: sitemap - #sitemap entries link to login page - url: https://infohub.eurocean.net/sitemap/organizations - backend: Unknown - headless: false - dateadded: 2022-03-26 - active: true -# -# EurOcean Projects -# -- name: euroceanprojects - propername: EurOcean Projects - domain: https://infohub.eurocean.net/ - catalogue: https://infohub.eurocean.net/data/projects - logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png - #ODISCat entry missing ODIS-arch url & type - pid: https://catalogue.odis.org/view/2993 - sourcetype: sitemap - #sitemap entries link to login page - url: https://infohub.eurocean.net/sitemap/projects - backend: Unknown - headless: false - dateadded: 2022-03-26 - active: true -# -# EurOcean Training -# -- name: euroceantraining - propername: EurOcean Training - domain: https://infohub.eurocean.net/ - catalogue: https://infohub.eurocean.net/data/training - logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png - #ODISCat entry missing ODIS-arch url & type - pid: https://catalogue.odis.org/view/2993 - sourcetype: sitemap - url: https://www.oceanexpert.org/assets/sitemaps/sitemapTraining.xml - backend: Unknown - headless: false - dateadded: 2022-03-26 - active: true -# -# EurOcean Vessels -# -- name: euroceanvessels - propername: EurOcean Vessels - domain: https://infohub.eurocean.net/ - catalogue: https://infohub.eurocean.net/data/vessels - logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png - #ODISCat entry missing ODIS-arch url & type - 
pid: https://catalogue.odis.org/view/2993 - sourcetype: sitemap - #sitemap entries link to login page - url: https://infohub.eurocean.net/sitemap/vessels - backend: Unknown - headless: false - dateadded: 2022-03-26 - active: true -# -# European Marine Observation and Data Network catalogue -# -- name: emodnet - propername: European Marine Observation and Data Network catalogue - domain: https://emodnet.ec.europa.eu/ - catalogue: https://emodnet.ec.europa.eu/geonetwork/srv/eng/catalog.search#/home - logo: https://emodnet.ec.europa.eu/sites/emodnet.ec.europa.eu/themes/emodnet/component-library/ec/static/media/logo--en.5055ef4f.svg - pid: https://catalogue.odis.org/view/364 - sourcetype: sitemap - #sitemaps are password protected? - url: https://emodnet.ec.europa.eu/geonetwork/srv/eng/portal.sitemap - #url: https://raw.githubusercontent.com/iodepo/odis-arch/schema-dev-jm/code/notebooks/Exploration/data-emodnet/emodnet-simple-graph.json - backend: GeoNetwork - headless: true - dateadded: 2022-08-24 - active: false -# -# Indonesia National Oceanic Data Center -# -- name: inanodc - propername: Indonesia National Oceanic Data Center - catalogue: https://geonode.nodc.id - domain: https://nodc.id - logo: https://nodc.id/dist/img/brin.png - pid: https://catalogue.odis.org/view/3246 - sourcetype: sitemap - url: https://geonode.nodc.id/sitemap.xml - #url: https://raw.githubusercontent.com/iodepo/odis-arch/schema-dev-jm/code/notebooks/Exploration/data-ina-nodc/ina-nodc-simple-graph.json - backend: GeoNode - headless: false - dateadded: 2022-04-05 - active: true -# -# INVEMAR Documents -# -- name: invemardocuments - propername: CHM LAC - Documents - catalogue: http://portete.invemar.org.co/chm/ - domain: http://portete.invemar.org.co - logo: http://portete.invemar.org.co/static/images/logo-web.svg - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/invemardocuments - sourcetype: sitegraph - url: http://portete.invemar.org.co/chm/api/oih/documents?format=json 
- backend: INVEMAR CHM - headless: false - dateadded: 2022-01-26 - active: true -# -# INVEMAR Experts -# -- name: invemarexperts - propername: CHM LAC - Experts - catalogue: http://portete.invemar.org.co/chm/ - domain: http://portete.invemar.org.co - logo: http://portete.invemar.org.co/static/images/logo-web.svg - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/invemarexpert - sourcetype: sitegraph - url: http://portete.invemar.org.co/chm/api/oih/expert?format=json - backend: INVEMAR CHM - headless: false - dateadded: 2022-01-26 - active: true -# -# INVEMAR Institutions -# -- name: invemarinstitutions - propername: CHM LAC - Institutions - catalogue: http://portete.invemar.org.co/chm/ - domain: http://portete.invemar.org.co - logo: http://portete.invemar.org.co/static/images/logo-web.svg - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/invemarinstitutions - sourcetype: sitegraph - url: http://portete.invemar.org.co/chm/api/oih/institution?format=json - backend: INVEMAR CHM - headless: false - dateadded: 2022-01-26 - active: true -# -# INVEMAR Training -# -- name: invemartraining - propername: CHM LAC - Training - catalogue: http://portete.invemar.org.co/chm/ - domain: http://portete.invemar.org.co - logo: http://portete.invemar.org.co/static/images/logo-web.svg - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/invemartraining - sourcetype: sitegraph - url: http://portete.invemar.org.co/chm/api/oih/training?format=json - backend: INVEMAR CHM - headless: false - dateadded: 2022-01-26 - active: true -# -# INVEMAR Vessels -# -- name: invemarvessels - propername: CHM LAC - Vessels - catalogue: http://portete.invemar.org.co/chm/ - domain: http://portete.invemar.org.co - logo: http://portete.invemar.org.co/static/images/logo-web.svg - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/invemarvessels - sourcetype: sitegraph - url: 
http://portete.invemar.org.co/chm/api/oih/vessel?format=json - backend: INVEMAR CHM - headless: false - dateadded: 2022-01-26 - active: true -# -# Marine Training EU -# -- name: marinetraining - propername: Marine Training EU - catalogue: https://marinetraining.eu/search - domain: https://marinetraining.eu/ - logo: https://marinetraining.eu/sites/default/files/M2.0.png - #ODISCat entry missing ODIS-arch url & type - pid: https://catalogue.odis.org/view/1972 - sourcetype: sitemap - url: https://www.marinetraining.eu/sitemap.xml - backend: Drupal - headless: false - dateadded: 2021-05-26 - active: true -# -# MASPAWIO: Marine Spatial Atlas for the Western Indian Ocean -# -- name: maspawio - propername: MASPAWIO - Marine Spatial Atlas for the Western Indian Ocean - catalogue: http://maspawio.net/ - domain: https://cordioea.net/ - logo: https://cordioea.net/wp-content/uploads/2017/09/Geonde.png - pid: https://catalogue.odis.org/view/351 - sourcetype: sitegraph - url: https://raw.githubusercontent.com/iodepo/odis-arch/schema-dev-jm/code/notebooks/Exploration/data-maspawio/maspawio-simple-graph.json - backend: GeoNode - headless: false - dateadded: 2022-03-30 - active: true -# -# OBIS: Ocean Biodiversity Information System -# -- name: obis - propername: Ocean Biodiversity Information System - catalogue: https://obis.org - domain: https://obis.org - logo: https://obis.org/images/logo.png - #ODISCat entry missing ODIS-arch url - pid: https://catalogue.odis.org/view/343 - sourcetype: sitemap - url: https://obis.org/sitemap_datasets.xml - backend: Unknown - headless: false - dateadded: 2021-02-26 - active: true -# -# Ocean Best Practices -# -- name: obps - propername: Ocean Best Practices System - catalogue: https://www.oceanbestpractices.org/repository/ - domain: https://www.oceanbestpractices.org/ - logo: https://search.oceanbestpractices.org/static/media/unesco-ioc-ocb-lockup-2x.713a7ec7.png - pid: https://catalogue.odis.org/view/292 - sourcetype: sitegraph - url: 
https://oih.oceanbestpractices.org/obps.json - backend: OAI - headless: false - dateadded: 2021-01-26 - active: true -# -# OceanExpert -# -- name: oceanexperts - propername: OceanExpert UNESCO/IOC Project Office for IODE - catalogue: https://oceanexpert.org/advancedSearch - domain: https://oceanexpert.org/ - logo: https://oceanexpert.org/img/logoimageindex.jpg - #ODISCat arch url doesn't point to sitemap - pid: https://catalogue.odis.org/view/4 - sourcetype: sitemap - url: https://oceanexpert.org/assets/sitemaps/sitemapIndex.xml - backend: Unknown - headless: false - dateadded: 2021-08-26 - active: true -# -# Oceanscape Project -# -- name: oceanscape - propername: Oceanscape Project - catalogue: https://oceanscape.org - domain: https://geoblueplanet.org/ - logo: https://oceanscape.org/wp-content/uploads/2019/06/Oceanscape-banner-1024x183.png - pid: https://catalogue.odis.org/view/1098 - sourcetype: sitemap - url: https://oceanscape.org/organisation-sitemap.xml - #url: https://oceanscape.org/sitemap_index.xml - backend: WordPress - headless: true - dateadded: 2022-05-18 - active: true -# -# Pacific Data Hub -# -- name: pdh - propername: Pacific Data Hub - catalogue: https://pacificdata.org/data/dataset - domain: https://pacificdata.org - logo: https://pacificdata.org/themes/custom/spc/logo.png - pid: https://catalogue.odis.org/view/689 - sourcetype: sitemap - url: https://pacificdata.org/organization/sitemap.xml - #url: https://raw.githubusercontent.com/iodepo/odis-arch/schema-dev-jm/code/notebooks/Exploration/data-pacificdatahub/sitemap.xml - backend: CKAN - headless: true - dateadded: 2022-07-25 - active: true -# -# VLIZ: Flanders Marine Institute catalogue -# -- name: vliz - propername: VLIZ - Flanders Marine Institute catalogue - catalogue: https://emodnet.ec.europa.eu/geonetwork/srv/eng/catalog.search#/home - domain: http://www.vliz.be/en - logo: '' - #missing ODISCat entry - pid: https://oceaninfohub.org/.well-known/org/vliz - sourcetype: sitegraph - url: 
https://raw.githubusercontent.com/iodepo/odis-arch/schema-dev-jm/code/notebooks/Exploration/data-vliz/vliz-simple-graph.json - backend: GeoNetwork - headless: false - dateadded: 2022-08-03 - active: false -#- sourcetype: sitemap - #name: marineie - #url: http://data.marine.ie/geonetwork/srv/eng/portal.sitemap - #headless: true - #pid: https://www.re3data.org/repository/marineie - #propername: Marine Institute Data Catalogue - #domain: http://data.marine.ie - #active: true + - sourcetype: sitemap + name: acma + propername: African Coastal and Marine Atlas catalogue (ACMA) + catalogue: https://acma.africanmarineatlas.org + domain: https://africanmarineatlas.org + logo: https://acma.africanmarineatlas.org/static/mapstore/img/geonode-logo.svg + pid: https://catalogue.odis.org/view/3125 + url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-acma/sitemap.xml + changefreq: + backend: GeoNode + headless: false + dateadded: 2024-04-23 + cron: 0 6 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - sourcetype: sitemap + name: africaioc + propername: IOC Africa Data Portal + catalogue: https://ioc-africa.org/dbs/displayData.php + domain: https://ioc-africa.org + logo: https://ioc-africa.org/dbs/images/unesco_ioc_logo.png + pid: https://oceaninfohub.org/.well-known/org/africaioc + url: https://ioc-africa.org/sitemap.xml + changefreq: daily + backend: Custom + headless: false + dateadded: 2023-02-09 + cron: 0 6 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - sourcetype: sitegraph + name: aquadocs + propername: AquaDocs + catalogue: https://aquadocs.org/discover + domain: https://aquadocs.org + logo: https://aquadocs.org/themes/OR/images/repo-logo.png + pid: https://catalogue.odis.org/view/3215 + url: https://oih.aquadocs.org/aquadocs.json + changefreq: + backend: OAI + headless: false + dateadded: 2021-02-26 + cron: 0 7 * * 0 + active: true + identifiertype: identifiersha + 
identifierpath: "" diff --git a/dagster/implnets/configs/oih/gleanerconfig_full.yaml b/dagster/implnets/configs/oih/gleanerconfig_full.yaml new file mode 100644 index 00000000..877c3adb --- /dev/null +++ b/dagster/implnets/configs/oih/gleanerconfig_full.yaml @@ -0,0 +1,899 @@ +--- +minio: + address: + port: + accessKey: + secretKey: + ssl: + bucket: +gleaner: + runid: oih # this will be the bucket the output is placed in... + summon: true # do we want to visit the web sites and pull down the files + mill: false +context: + cache: true +contextmaps: + - prefix: "https://schema.org/" + file: "/assets/schemaorg-current-https.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld + - prefix: "http://schema.org/" + file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld +summoner: + after: "" # "21 May 20 10:00 UTC" + mode: full # full || diff: If diff compare what we have currently in gleaner to sitemap, get only new, delete missing + threads: 5 + delay: # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1) + headless: http://workstation.lan:9222 # URL for headless see docs/headless +millers: + graph: true +sources: + - name: acma + propername: African Coastal and Marine Atlas catalogue (ACMA) + catalogue: https://acma.africanmarineatlas.org + domain: https://africanmarineatlas.org + logo: https://acma.africanmarineatlas.org/static/mapstore/img/geonode-logo.svg + pid: https://catalogue.odis.org/view/3125 + sourcetype: sitemap + url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-acma/sitemap.xml + changefreq: + backend: GeoNode + headless: false + dateadded: 2024-04-23 + cron: 0 6 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: africaioc + propername: IOC Africa Data Portal + catalogue: https://ioc-africa.org/dbs/displayData.php + domain: https://ioc-africa.org + logo: https://ioc-africa.org/dbs/images/unesco_ioc_logo.png + 
#missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/africaioc + sourcetype: sitemap + url: https://ioc-africa.org/sitemap.xml + changefreq: daily + backend: Custom + headless: false + dateadded: 2023-02-09 + cron: 0 6 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: aquadocs + propername: AquaDocs + catalogue: https://aquadocs.org/discover + domain: https://aquadocs.org + logo: https://aquadocs.org/themes/OR/images/repo-logo.png + pid: https://catalogue.odis.org/view/3215 + sourcetype: sitegraph + url: https://oih.aquadocs.org/aquadocs.json + changefreq: + backend: OAI + headless: false + dateadded: 2021-02-26 + cron: 0 7 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: argovis + propername: Argovis ARGO Collection + catalogue: https://argovis.colorado.edu/argoURLhelper + domain: https://argovis.colorado.edu + logo: https://argovis.colorado.edu/fulllogo.png + pid: https://catalogue.odis.org/view/3304 + sourcetype: sitemap + url: https://argovis.colorado.edu/argo_sitemap.xml + changefreq: + backend: custom + headless: false + dateadded: 2024-06-14 + cron: 1 7 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: bebop + propername: Better Biomolecular Ocean Practices (BeBOP) as part of Ocean Biomolecular Observing Network (OBON) + catalogue: https://github.com/BeBOP-OBON/odis-interface + domain: https://oceandecade.org/actions/better-biomolecular-ocean-practices/ + logo: https://en.unesco.org/sites/default/files/styles/banner_sec_col_234x100/public/logo_decade_ocean_science_en.jpg + pid: https://catalogue.odis.org/view/3294 + sourcetype: sitemap + url: https://raw.githubusercontent.com/BeBOP-OBON/odis-interface/main/sitemap.xml + changefreq: as needed + backend: GitHub + headless: false + dateadded: 2023-03-08 + cron: 0 8 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: benguelacc + propername: Benguela Current 
Convention (BCC) GeoData Portal + catalogue: https://geodata.benguelacc.org/ + domain: https://www.benguelacc.org/ + logo: https://static.tildacdn.com/tild3263-3132-4937-a166-373963646533/Skjermbilde_2020-12-.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/benguelacc + sourcetype: sitegraph + url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-benguelacc/benguelacc-simple-graph.json + changefreq: + backend: GeoNode + headless: false + dateadded: 2022-04-13 + cron: 0 1 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: bmdc + propername: Belgian Marine Data Centre (BMDC) + catalogue: https://metadata.naturalsciences.be/geonetwork/srv/eng/catalog.search#/home + domain: https://www.bmdc.be/ + logo: https://www.bmdc.be/NODC/images/museum.png + pid: https://catalogue.odis.org/view/3271 + sourcetype: sitemap + url: https://metadata.naturalsciences.be/geonetwork/srv/api/sitemap + changefreq: daily + backend: GeoNetwork + headless: false + dateadded: 2023-06-20 + cron: 0 6 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: bodc + propername: British Oceanographic Data Centre (BODC) + catalogue: https://www.bodc.ac.uk/data/bodc_database/nodb/search/ + domain: https://www.bodc.ac.uk/ + logo: https://www.bodc.ac.uk/assets/img/bodc-logo-colour-white.png + pid: https://catalogue.odis.org/view/29 + sourcetype: sitemap + url: https://api.linked-systems.uk/sitemap_pap_api.xml + changefreq: monthly + backend: Custom + headless: false + dateadded: 2023-07-10 + cron: 0 7 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: caribbeanmarineatlas + propername: Caribbean Marine Atlas catalogue + catalogue: https://www.caribbeanmarineatlas.net/home/ + domain: https://www.caribbeanmarineatlas.net/ + logo: https://www.caribbeanmarineatlas.net/static/cma2/landing_page/images/logotipo_CMA_2019.svg + pid: 
https://catalogue.odis.org/view/616 + sourcetype: sitegraph + url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-caribbeanmarineatlas/caribbeanmarineatlas-simple-graph.json + changefreq: + backend: GeoNode + headless: false + dateadded: 2022-04-13 + cron: 0 2 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: cchdo + propername: CLIVAR and Carbon Hydrographic Data Office (CCHDO) + catalogue: https://cchdo.ucsd.edu/ + domain: https://ucsd.edu/ + logo: https://cchdo.ucsd.edu/static/svg/logo_cchdo.svg + pid: https://catalogue.odis.org/view/3291 + sourcetype: sitemap + url: https://cchdo.ucsd.edu/sitemap.xml + changefreq: + backend: ERDDAP + headless: false + dateadded: 2024-05-16 + cron: 0 9 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: cclme + propername: Canary Current Large Marine Ecosystem (CCLME) + catalogue: https://www.fao.org/in-action/canary-current-lme/en + domain: https://www.fao.org/in-action/canary-current-lme/en + logo: https://www.fao.org/images/corporatelibraries/fao-logo/fao-logo-en.svg?sfvrsn=f64522b4_36 + pid: https://catalogue.odis.org/view/3276 + sourcetype: sitemap + url: http://www.ideo-cclme.ieo.es/odis/odis_sitemap.xml + changefreq: monthly + backend: Custom + headless: false + dateadded: 2023-10-30 + cron: 0 2 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: cioos + propername: Canadian Integrated Ocean Observing System (CIOOS) + catalogue: https://catalogue.cioos.ca/ + domain: https://cioos.ca/ + logo: https://cioos.ca/wp-content/themes/cioos-siooc-wordpress-theme/img/national/cioos-national_EN.svg + pid: https://catalogue.odis.org/view/3263 + sourcetype: sitemap + url: https://catalogue.cioos.ca/sitemap/sitemap.xml + changefreq: daily + backend: CKAN + headless: false + dateadded: 2023-01-31 + cron: 0 3 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: edmerp + 
propername: European Directory of Marine Environmental Research Projects (EDMERP) SeaDataNet + catalogue: https://edmerp.seadatanet.org/search + domain: https://edmerp.seadatanet.org + logo: https://edmerp.seadatanet.org/grfx/edmerp/logo_big.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/edmerp + sourcetype: sitemap + url: https://edmerp.seadatanet.org/sitemap.xml + changefreq: as needed + backend: SeaDataCloud + headless: false + dateadded: 2021-07-26 + cron: 0 4 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: edmo + propername: European Directory of Marine Organisations (EDMO) SeaDataNet + catalogue: https://edmo.seadatanet.org/search + domain: https://edmo.seadatanet.org + logo: https://edmo.seadatanet.org/grfx/edmo/logo.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/edmo + sourcetype: sitemap + url: https://edmo.seadatanet.org/sitemap.xml + changefreq: as needed + backend: SeaDataCloud + dateadded: 2021-07-26 + headless: false + cron: 0 5 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: emodnet + propername: European Marine Observation and Data Network (EMODnet) + domain: https://emodnet.ec.europa.eu/ + catalogue: https://emodnet.ec.europa.eu/geonetwork/srv/eng/catalog.search#/home + logo: https://emodnet.ec.europa.eu/sites/emodnet.ec.europa.eu/files/public/emodnet_logos/print/EMODnet_coll_all.png + pid: https://catalogue.odis.org/view/364 + sourcetype: sitemap + url: https://emodnet.ec.europa.eu/geonetwork/srv/eng/portal.sitemap + changefreq: as needed + backend: GeoNetwork + headless: false + dateadded: 2022-08-24 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 9 * * 0 + - name: euroceanorgs + propername: EurOcean Organizations + domain: https://infohub.eurocean.net/ + catalogue: https://infohub.eurocean.net/data/organizations + logo: 
https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png + #ODISCat entry missing ODIS-arch url & type + pid: https://catalogue.odis.org/view/2993 + sourcetype: sitemap + #sitemap entries link to login page + url: https://infohub.eurocean.net/sitemap/organizations + changefreq: daily + backend: unknown + headless: false + dateadded: 2022-03-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 6 * * 0 + - name: euroceanprojects + propername: EurOcean Projects + domain: https://infohub.eurocean.net/ + catalogue: https://infohub.eurocean.net/data/projects + logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png + #ODISCat entry missing ODIS-arch url & type + pid: https://catalogue.odis.org/view/2993 + sourcetype: sitemap + #sitemap entries link to login page + url: https://infohub.eurocean.net/sitemap/projects + changefreq: daily + backend: unknown + headless: false + dateadded: 2022-03-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 7 * * 0 + - name: euroceanvessels + propername: EurOcean Vessels + domain: https://infohub.eurocean.net/ + catalogue: https://infohub.eurocean.net/data/vessels + logo: https://infohub.eurocean.net/images/under_constrution/eurOcean-logo-color.png + #ODISCat entry missing ODIS-arch url & type + pid: https://catalogue.odis.org/view/2993 + sourcetype: sitemap + #sitemap entries link to login page + url: https://infohub.eurocean.net/sitemap/vessels + changefreq: daily + backend: unknown + headless: false + dateadded: 2022-03-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 8 * * 0 + - name: gbif + propername: Global Biodiversity Information Facility (GBIF) + catalogue: https://www.gbif.org/dataset/search + domain: https://www.gbif.org/ + logo: https://upload.wikimedia.org/wikipedia/commons/1/1e/GBIF-2015-full-stacked.png + pid: https://catalogue.odis.org/view/3297 + sourcetype: sitemap + url: 
https://www.gbif.org/sitemap-dataset.xml + changefreq: weekly + backend: Custom + headless: false + dateadded: 2024-05-06 + cron: 0 1 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: incois + propername: Indian National Centre for Ocean Information Services (INCOIS) + catalogue: https://incois.gov.in/essdp/ + domain: https://incois.gov.in/ + logo: https://incois.gov.in/essdp/images/logo.png + pid: https://catalogue.odis.org/view/3300 + sourcetype: sitemap + url: https://incois.gov.in/essdp/xml/sitemap.xml + changefreq: + backend: Custom + headless: false + dateadded: 2024-04-15 + cron: 0 9 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: invemardocuments + propername: Latin America and the Caribbean Region (LAC) Documents + catalogue: https://portete.invemar.org.co/chm/ + domain: https://portete.invemar.org.co + logo: https://www.invemar.org.co/o/invemar-actual-theme/images/logoWebINVEMAR.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/invemardocuments + sourcetype: sitegraph + url: https://portete.invemar.org.co/chm/api/oih/documents?format=json + changefreq: + backend: INVEMAR CHM + headless: false + dateadded: 2022-01-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 11 * * 0 + - name: invemarexperts + propername: Latin America and the Caribbean Region (LAC) Experts + catalogue: https://portete.invemar.org.co/chm/ + domain: https://portete.invemar.org.co + logo: https://www.invemar.org.co/o/invemar-actual-theme/images/logoWebINVEMAR.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/invemarexpert + sourcetype: sitegraph + url: https://portete.invemar.org.co/chm/api/oih/expert?format=json + changefreq: + backend: INVEMAR CHM + headless: false + dateadded: 2022-01-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 12 * * 0 + - name: invemargeo + propername: Latin America and the 
Caribbean Region (LAC) Geospatial + catalogue: https://portete.invemar.org.co/chm/ + domain: https://portete.invemar.org.co + logo: https://www.invemar.org.co/o/invemar-actual-theme/images/logoWebINVEMAR.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/invemargeo + sourcetype: sitegraph + url: https://portete.invemar.org.co/chm/api/oih/platformgeo?format=json + changefreq: + backend: INVEMAR CHM + headless: false + dateadded: 2023-08-22 + cron: 0 8 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: invemarinstitutions + propername: Latin America and the Caribbean Region (LAC) Institutions + catalogue: https://portete.invemar.org.co/chm/ + domain: https://portete.invemar.org.co + logo: https://www.invemar.org.co/o/invemar-actual-theme/images/logoWebINVEMAR.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/invemarinstitutions + sourcetype: sitegraph + url: https://portete.invemar.org.co/chm/api/oih/institution?format=json + changefreq: + backend: INVEMAR CHM + headless: false + dateadded: 2022-01-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 13 * * 0 + - name: invemartraining + propername: Latin America and the Caribbean Region (LAC) Training + catalogue: https://portete.invemar.org.co/chm/ + domain: https://portete.invemar.org.co + logo: https://www.invemar.org.co/o/invemar-actual-theme/images/logoWebINVEMAR.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/invemartraining + sourcetype: sitegraph + url: https://portete.invemar.org.co/chm/api/oih/training?format=json + changefreq: + backend: INVEMAR CHM + headless: false + dateadded: 2022-01-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 14 * * 0 + - name: invemarvessels + propername: Latin America and the Caribbean Region (LAC) Vessels + catalogue: https://portete.invemar.org.co/chm/ + domain: https://portete.invemar.org.co + logo: 
https://www.invemar.org.co/o/invemar-actual-theme/images/logoWebINVEMAR.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/invemarvessels + sourcetype: sitegraph + url: https://portete.invemar.org.co/chm/api/oih/vessel?format=json + changefreq: + backend: INVEMAR CHM + headless: false + dateadded: 2022-01-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 15 * * 0 + - name: marcobolo + propername: MARine COastal BiOdiversity Long-term Observations (MARCO-BOLO) + catalogue: https://github.com/marco-bolo + domain: https://marcobolo-project.eu/ + logo: https://marcobolo-project.eu/wp-content/uploads/2020/09/MARCO-BOLO_logo_col-1.png + pid: https://catalogue.odis.org/view/3305 + sourcetype: sitemap + url: https://lab.marcobolo-project.eu/dataset-catalogue/sitemap.xml + changefreq: + backend: custom + headless: false + dateadded: 2024-06-12 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 15 * * 0 + - name: marineie + propername: Marine Institute Data Catalogue (Ireland) + catalogue: https://data.marine.ie/geonetwork/srv/eng/catalog.search + domain: https://data.marine.ie + logo: https://data.marine.ie/geonetwork/images/logos/494b5d66-a774-4a7e-86f0-c860cc21331b.png + pid: https://catalogue.odis.org/view/193 + sourcetype: sitemap + url: https://data.marine.ie/geonetwork/srv/api/sitemap + changefreq: daily + backend: GeoNetwork + headless: false + dateadded: 2023-08-22 + cron: 0 9 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: marinetraining + propername: Marine Training EU + catalogue: https://marinetraining.eu/search + domain: https://marinetraining.eu/ + logo: https://marinetraining.eu/sites/default/files/M2.0.png + #ODISCat entry missing ODIS-arch url & type + pid: https://catalogue.odis.org/view/1972 + sourcetype: sitemap + url: https://www.marinetraining.eu/sitemap.xml + changefreq: daily + backend: Drupal + headless: false + dateadded: 
2021-05-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 16 * * 0 + - name: maspawio + propername: Marine Spatial Atlas for the Western Indian Ocean (MASPAWIO) + catalogue: http://maspawio.net/ + domain: https://cordioea.net/ + logo: https://cordioea.net/wp-content/uploads/2022/11/CORDIO-Pastel-Logo-Defringed.png + pid: https://catalogue.odis.org/view/351 + sourcetype: sitegraph + url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-maspawio/maspawio-simple-graph.json + changefreq: + backend: GeoNode + headless: false + dateadded: 2022-03-30 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 17 * * 0 + - name: medin + propername: Marine Environmental Data and Information Network (MEDIN) + catalogue: https://portal.medin.org.uk/portal/start.php + domain: https://medin.org.uk/ + logo: https://medin.org.uk/sites/medin/files/images/medin-portal-logo.png + pid: https://catalogue.odis.org/view/40 + sourcetype: sitemap + url: https://portal.medin.org.uk/portal/sitemap.php + changefreq: daily + backend: MARIS + headless: false + dateadded: 2023-05-11 + cron: 0 10 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: metsrcn + propername: Research Coordination Network for Marine Ecological Time Series (METS-RCN) + catalogue: https://github.com/NicoGEOMAR/METS-RCN + domain: https://www2.whoi.edu/site/mets-rcn/ + logo: https://www.us-ocb.org/wp-content/uploads/sites/43/2022/05/METS-RCN-slider.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/metsrcn + sourcetype: sitemap + url: https://raw.githubusercontent.com/NicoGEOMAR/METS-RCN/main/sitemap.xml + changefreq: as needed + backend: GitHub + headless: false + dateadded: 2023-04-13 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 18 * * 0 + - name: mims + propername: Marine Information Management System (MIMS) + catalogue: 
https://data.ocean.gov.za/mims/catalog/ + domain: https://data.ocean.gov.za + logo: https://data.ocean.gov.za/mims/static/images/mims-logo.png + pid: https://catalogue.odis.org/view/3303 + sourcetype: sitemap + url: https://data.ocean.gov.za/mims/catalog/sitemap.xml + changefreq: as needed + backend: SAEON + headless: false + dateadded: 2024-06-11 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 11 * * 0 + - name: ncei + propername: NCEI Marine Microplastics Catalogue + catalogue: https://experience.arcgis.com/experience/b296879cc1984fda833a8acc93e31476 + domain: https://www.ncei.noaa.gov/products/microplastics + logo: https://www.ncei.noaa.gov/themes/custom/ncei/logo.svg + pid: https://catalogue.odis.org/view/3295 + sourcetype: sitemap + url: https://www.ncei.noaa.gov/sites/g/files/anmtlf171/files/2024-04/microplastics-sitemap.xml + changefreq: monthly + backend: ArcGIS + headless: false + dateadded: 2024-01-24 + cron: 0 2 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: nmdis + propername: National Marine Data and Information Service (NMDIS) + catalogue: http://222.186.3.18:8888/erddap/index.html + domain: https://www.cmoc-china.cn/ + logo: https://www.cmoc-china.cn/assets/imgs/ab555e05d0f33e67c199a84fbb502547.png + pid: https://catalogue.odis.org/view/3274 + sourcetype: sitemap + url: http://222.186.3.18:8888/erddap/sitemap.xml + changefreq: monthly + backend: ERDDAP + headless: false + dateadded: 2023-08-22 + cron: 0 11 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: obis + propername: Ocean Biodiversity Information System (OBIS) + catalogue: https://obis.org + domain: https://obis.org + logo: https://obis.org/images/logo.png + #ODISCat entry missing ODIS-arch url + pid: https://catalogue.odis.org/view/343 + sourcetype: sitemap + url: https://obis-sitemaps.s3.amazonaws.com/sitemap_datasets.xml + changefreq: weekly + backend: unknown + headless: false + dateadded: 
2021-02-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 19 * * 0 + - name: obps + propername: Ocean Best Practices System (OBPS) + catalogue: https://www.oceanbestpractices.org/repository/ + domain: https://www.oceanbestpractices.org/ + logo: https://www.oceanbestpractices.org/wp-content/uploads/2023/09/Ocean_Best_Practices_LOGO_WAVES_TEXT_INLINE_Final_outlines_RGB1.jpg + pid: https://catalogue.odis.org/view/292 + sourcetype: sitegraph + url: https://oih.oceanbestpractices.org/obps.json + changefreq: + backend: OAI + headless: false + dateadded: 2021-01-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 20 * * 0 + - name: oceanexpert + propername: OceanExpert UNESCO/IOC Project Office for IODE + catalogue: https://oceanexpert.org/advancedSearch + domain: https://oceanexpert.org/ + logo: https://oceanexpert.org/img/logoimageindex.jpg + #ODISCat arch url doesn't point to sitemap + pid: https://catalogue.odis.org/view/4 + sourcetype: sitemap + url: https://oceanexpert.org/assets/sitemaps/sitemapIndex.xml + changefreq: unknown + backend: unknown + headless: false + dateadded: 2021-08-26 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 21 * * 0 + - name: oceanscape + propername: Oceanscape Project + catalogue: https://oceanscape.org + domain: https://geoblueplanet.org/ + logo: https://oceanscape.org/wp-content/uploads/2019/06/Oceanscape-banner-1024x183.png + pid: https://catalogue.odis.org/view/1098 + sourcetype: sitemap + url: https://oceanscape.org/organisation-sitemap.xml + #url: https://oceanscape.org/sitemap_index.xml + changefreq: as needed + backend: WordPress + headless: false + dateadded: 2022-05-18 + cron: 0 12 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: odiscat + propername: ODIS Catalogue (ODISCat) + catalogue: https://catalogue.odis.org/ + domain: https://odis.org/ + logo: 
https://gatewaygeomatics.com/dl/odis/odis-catalogue-logo.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/odiscat + sourcetype: sitemap + url: https://stag.catalogue.odis.org/sitemapIndex.xml + changefreq: daily + backend: Custom + headless: false + dateadded: 2023-10-19 + cron: 0 11 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: openasfa + propername: Aquatic Sciences and Fisheries Abstracts (OpenASFA) + catalogue: https://www.fao.org/fishery/en/openasfa + domain: https://www.fao.org/fishery/en + logo: https://upload.wikimedia.org/wikipedia/commons/thumb/d/db/FAO_logo.svg/180px-FAO_logo.svg.png + pid: https://catalogue.odis.org/view/3277 + sourcetype: sitemap + url: https://www.fao.org/fishery/sitemap/fishery_openasfa_en.xml + #url: https://www.fao.org/fishery/sitemap.xml + changefreq: monthly + backend: Custom + headless: false + dateadded: 2024-02-02 + cron: 0 11 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: osmc + propername: Observing System Monitoring Center (OSMC) + catalogue: https://osmc.noaa.gov/erddap/info/index.html + domain: https://www.osmc.noaa.gov/ + logo: https://www.osmc.noaa.gov/OSMC_logo.png + #missing ODISCat entry + pid: https://oceaninfohub.org/.well-known/org/osmc + sourcetype: sitemap + url: https://osmc.noaa.gov/erddap/sitemap.xml + changefreq: monthly + backend: ERDDAP + headless: false + dateadded: 2023-10-12 + cron: 0 10 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: pdh + propername: Pacific Data Hub (PDH) + catalogue: https://pacificdata.org/data/dataset + domain: https://pacificdata.org + logo: https://www.gitbook.com/cdn-cgi/image/width=256,dpr=2,height=40,fit=contain,format=auto/https%3A%2F%2F1509476088-files.gitbook.io%2F~%2Ffiles%2Fv0%2Fb%2Fgitbook-legacy-files%2Fo%2Fspaces%252F-MDSpD50SdXxsYnU7P9D%252Favatar-rectangle-1596147212769.png%3Fgeneration%3D1596147213229230%26alt%3Dmedia + pid: 
https://catalogue.odis.org/view/689 + sourcetype: sitemap + url: https://pacificdata.org/organization/sitemap.xml + changefreq: as needed + backend: CKAN + headless: false + dateadded: 2022-07-25 + active: true + identifiertype: identifiersha + identifierpath: "" + cron: 0 22 * * 0 + - name: pedp + propername: Pacific Environment Data Portal (PEDP) + catalogue: https://pacific-data.sprep.org/index.php/search + domain: https://pacific-data.sprep.org/ + logo: https://pacific-data.sprep.org/themes/custom/inform_regional/logo.png + pid: https://catalogue.odis.org/view/3293 + sourcetype: sitemap + url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-pedp/sitemap.xml + #url: https://pacific-data.sprep.org/sitemap.xml + changefreq: as needed + backend: Drupal + headless: false + dateadded: 2023-08-08 + cron: 0 13 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: r2r + propername: Rolling Deck to Repository (R2R) + catalogue: https://www.rvdata.us/ + domain: https://lamont.columbia.edu/ + logo: https://www.rvdata.us/images/Logo.4b1519be.png + pid: https://catalogue.odis.org/view/3292 + sourcetype: sitemap + url: https://service.rvdata.us/api/sitemap/ + changefreq: yearly + backend: unknown + headless: false + dateadded: 2024-06-04 + cron: 0 14 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: rda + propername: Research Data Australia (RDA) + catalogue: https://researchdata.edu.au/ + domain: https://ardc.edu.au/ + logo: https://researchdata.edu.au/assets/img/ARDC_Research_Data_RGB_FA_Reverse_sml.png + pid: https://catalogue.odis.org/view/3267 + sourcetype: sitemap + url: https://researchdata.edu.au/home/sitemap/?ds=384,393,16 + #url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-rda/sitemap.xml + changefreq: daily + backend: GeoNetwork + headless: false + dateadded: 2023-03-06 + cron: 0 14 * * 0 + active: true + identifiertype: 
identifiersha + identifierpath: "" + - name: unep + propername: UNEP Data Catalogue + catalogue: https://data.unep.org/app + domain: https://www.unep.org/ + logo: https://upload.wikimedia.org/wikipedia/en/thumb/9/9b/UNEP_logo.svg/195px-UNEP_logo.svg.png + pid: https://catalogue.odis.org/view/3288 + sourcetype: sitemap + url: https://data.unep.org/oceans_sitemap2.xml + changefreq: monthly + backend: CKAN + headless: false + dateadded: 2024-01-22 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: wiosymphony + propername: WIO (Western Indian Ocean) Symphony + catalogue: https://symphony.nairobiconvention.org/login + domain: https://www.nairobiconvention.org/wio-symphony/ + logo: https://www.nairobiconvention.org/wp-content/uploads/2019/05/nairobic_logo.png + pid: https://catalogue.odis.org/view/3290 + sourcetype: sitemap + url: https://raw.githubusercontent.com/WIOSymphony/wiosym/main/metadata/sitemap.xml + changefreq: monthly + backend: Custom + headless: false + dateadded: 2024-01-24 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: wod + propername: World Ocean Database (WOD) + catalogue: https://noaa-wod-pds.s3.amazonaws.com/index.html + domain: https://www.ncei.noaa.gov/products/world-ocean-database + logo: https://www.ncei.noaa.gov/themes/custom/ncei/logo.svg + pid: https://catalogue.odis.org/view/182 + sourcetype: sitemap + url: https://raw.githubusercontent.com/iodepo/odis-arch/master/collection/tempHosting/data-wod/sitemap.xml + changefreq: + backend: AmazonS3 + headless: false + dateadded: 2023-09-13 + cron: 0 15 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: zmt + propername: Leibniz Center for Tropical Marine Research (ZMT) + catalogue: https://dataportal.leibniz-zmt.de/ + domain: https://www.leibniz-zmt.de/de/ + logo: https://www.leibniz-zmt.de/templates/hm_teekit_zmt_v1/images/logo/de-DE/zmt_logo_full_edge.png + pid: https://catalogue.odis.org/view/3289 + 
sourcetype: sitemap + url: https://dataportal.leibniz-zmt.de/sitemap.xml + changefreq: monthly + backend: Pangaea + headless: false + dateadded: 2023-11-02 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: calcofi + propername: California Cooperative Oceanic Fisheries Investigations (CalCOFI) + catalogue: https://calcofi.org/data/ + domain: https://calcofi.org/ + logo: https://calcofi.org/wp-content/uploads/2021/03/cropped-calcofirose_512_favicon.png + pid: https://oceaninfohub.org/.well-known/org/calcofi + sourcetype: sitemap + url: https://calcofi.io/workflows/datasets/sitemap.xml + changefreq: + backend: Custom + headless: false + dateadded: 2024-10-22 + cron: 0 2 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" + - name: isa + propername: International Seabed Authority (ISA) + catalogue: https://data.isa.org.jm/ + domain: https://www.isa.org.jm/ + logo: https://data.isa.org.jm/static/img/logo.png + pid: https://catalogue.odis.org/view/893 + sourcetype: sitemap + url: https://data.isa.org.jm/static/oih/site_map_oih_deepdata.xml + changefreq: + backend: Custom + headless: false + dateadded: 2024-10-22 + active: true + cron: 0 15 * * 0 + identifiertype: identifiersha + identifierpath: "" + - name: spase + propername: Space Physics Archive Search and Extract (SPASE) + catalogue: https://spase-group.org/ + domain: https://spase-group.org/ + logo: https://hdrl.gsfc.nasa.gov/SPASE.png + pid: https://catalogue.odis.org/view/3310 + sourcetype: sitemap + url: https://raw.githubusercontent.com/lechatpito/NASA-ODIS-Examples/main/sitemap.xml + changefreq: as needed + backend: custom + headless: false + dateadded: 2024-09-12 + cron: 0 14 * * 0 + active: true + identifiertype: identifiersha + identifierpath: "" diff --git a/dagster/implnets/configs/oihv2/nabuconfig.yaml b/dagster/implnets/configs/oih/nabuconfig.yaml similarity index 94% rename from dagster/implnets/configs/oihv2/nabuconfig.yaml rename to 
dagster/implnets/configs/oih/nabuconfig.yaml index 3218d9a3..b6308fa4 100644 --- a/dagster/implnets/configs/oihv2/nabuconfig.yaml +++ b/dagster/implnets/configs/oih/nabuconfig.yaml @@ -14,10 +14,10 @@ contextmaps: - prefix: "http://schema.org/" file: "/assets/schemaorg-current-http.jsonld" # wget http://schema.org/docs/jsonldcontext.jsonld implementation_network: - orgname: eco + orgname: oih endpoints: - service: ec_blazegraph - baseurl: https://graph.geocodes-aws-dev.earthcube.org/blazegraph/namespace/test + baseurl: http://nas.lan:49153/blazegraph/namespace/oih_prod type: blazegraph authenticate: false username: diff --git a/dagster/implnets/configs/oih/tenant.yaml b/dagster/implnets/configs/oih/tenant.yaml new file mode 100644 index 00000000..7372dc24 --- /dev/null +++ b/dagster/implnets/configs/oih/tenant.yaml @@ -0,0 +1,27 @@ +# prototype tennants file + +tenant: + - community: dev + hostname: oih-dev + description: oih in progress set + name: Ocean InfoHub Dev + url: https://www.oceaninfohub.org + logo: https://raw.githubusercontent.com/iodepo/odis-arch/refs/heads/master/book/logo.png + graph: + main_namespace: oih_dev + summary_namespace: oih_dev_summary + sources: + - africaioc +###### + - community: oihall + hostname: oih-all + description: oih production set + name: Ocean InfoHub + url: https://www.oceaninfohub.org + logo: https://raw.githubusercontent.com/iodepo/odis-arch/refs/heads/master/book/logo.png + graph: + main_namespace: oih_prod + summary_namespace: oih_prod_summary + sources: + - all + diff --git a/dagster/implnets/configs/oih/workspace.yaml b/dagster/implnets/configs/oih/workspace.yaml index ded83cee..747fd3ef 100644 --- a/dagster/implnets/configs/oih/workspace.yaml +++ b/dagster/implnets/configs/oih/workspace.yaml @@ -1,18 +1,28 @@ load_from: - - python_file: - relative_path: "project/oih/repositories/repository.py" - location_name: project - working_directory: "./project/oih/" +# - python_file: +# relative_path: 
"project/eco/repositories/repository.py" +# location_name: project +# working_directory: "./project/eco/" # - python_file: # relative_path: "workflows/ecrr/repositories/repository.py" # working_directory: "./workflows/ecrr/" # module starting out with the definitions api - - grpc_server: - host: dagster-code-ingest - port: 4000 - location_name: "ingest" - # - grpc_server: - # host: dagster-code-project - # port: 4000 - # location_name: "project_grpc" + # - python_module: "workflows.tasks.tasks" + - + - grpc_server: + host: dagster-code-tasks + port: 4000 + location_name: "tasks" + - grpc_server: + host: dagster-code-ingest + port: 4000 + location_name: "ingest" +# - grpc_server: +# host: dagster-code-project +# port: 4000 +# location_name: "project_grpc" + - grpc_server: + host: dagster-code-eco-ecrr + port: 4000 + location_name: "ecrr" diff --git a/dagster/implnets/configs/oihv2/tenant.yaml b/dagster/implnets/configs/oihv2/tenant.yaml deleted file mode 100644 index ec5e2ca5..00000000 --- a/dagster/implnets/configs/oihv2/tenant.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# prototype tennants file - -tenant: - - community: dev - hostname: geocodes-dev - description: GeoCodes is... - name: Geocodes Science on Schema - url: https://www.earthcube.org - logo: https://unsplash.com/random - graph: - main_namespace: test - summary_namespace: test_summary - sources: - - cioosatlantic -###### - - community: geocodesall - hostname: geocodes-all - description: GeoCodes is... 
- name: Geocodes Science on Schema - url: https://www.earthcube.org - logo: https://unsplash.com/random - graph: - main_namespace: geocodes_test - summary_namespace: geocodes_test_summary - sources: - - all - diff --git a/dagster/implnets/deployment/compose_no_routing.yaml b/dagster/implnets/deployment/compose_no_routing.yaml index d8983849..c3333c20 100644 --- a/dagster/implnets/deployment/compose_no_routing.yaml +++ b/dagster/implnets/deployment/compose_no_routing.yaml @@ -1,5 +1,3 @@ -version: "3.9" - # ########### # This is for a single dagster instance, that does not use an externally defined network ######## @@ -10,91 +8,104 @@ version: "3.9" # CONTAINER_TAG default latest networks: - traefik_proxy: - driver: overlay - name: traefik-${PROJECT:-eco} - attachable: true + dagster_host: + # traefik_proxy: + # driver: overlay + # name: traefik-${PROJECT:-eco} + # attachable: true + # volumes: - dagster-postgres: - driver: local -secrets: - MINIO_ROOT_ACCESS_KEY: - external: true - MINIO_ROOT_SECRET_KEY: - external: true - -services: - dagster-dagit: - image: docker.io/nsfearthcube/dagster-${PROJECT:-eco}:${CONTAINER_TAG:-latest} + dagster-postgres: + driver: local +# secrets: +# MINIO_ROOT_ACCESS_KEY: +# external: true +# MINIO_ROOT_SECRET_KEY: +# external: true - secrets: - - MINIO_ROOT_ACCESS_KEY - - MINIO_ROOT_SECRET_KEY - environment: &env - - DEBUG=${DEBUG:-false} - - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - - PORTAINER_URL=${PORTAINER_URL} - - PORTAINER_KEY=${PORTAINER_KEY} - - GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE} - - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} - - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX} - - GLEANER_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS} - - GLEANER_MINIO_PORT=${GLEANERIO_MINIO_PORT} - - GLEANER_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} - - GLEANER_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET} - - GLEANER_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} - - GLEANER_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY} - - 
GLEANER_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} - - GLEANER_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK} - - GLEANER_GRAPH_URL=${GLEANERIO_GRAPH_URL} - - GLEANER_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} - - GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml} - - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} - - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} - - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} - ports: - - 3000:3000 - networks: - - traefik_proxy - - dagster_host - depends_on: - - dagster-postgres - - dagster-daemon: - image: docker.io/nsfearthcube/dagster-${PROJECT:-eco}:${CONTAINER_TAG:-latest} - secrets: - - MINIO_ROOT_ACCESS_KEY - - MINIO_ROOT_SECRET_KEY - environment: *env - - command: "dagster-daemon run" - depends_on: - - dagster-postgres - networks: - - dagster_host +configs: + # gleaner: + # name: ${GLEANERIO_DOCKER_GLEANER_CONFIG:-gleaner} + # file: ../configs/${PROJECT:-eco}/gleanerconfig.yaml + # nabu: + # name: ${GLEANERIO_DOCKER_NABU_CONFIG:-nabu} + # file: ../configs/${PROJECT:-eco}/nabuconfig.yaml + workspace: + name: ${GLEANERIO_WORKSPACE_DOCKER_CONFIG:-workspace} + file: ../configs/${PROJECT:-eco}/workspace.yaml +services: + dagster-dagit: + image: earthcube/dagster_oih:0.0.2 + # image: docker.io/nsfearthcube/dagster-${PROJECT:-eco}:${CONTAINER_TAG:-latest} + # secrets: + # - MINIO_ROOT_ACCESS_KEY + # - MINIO_ROOT_SECRET_KEY + environment: &env + - DEBUG=${DEBUG:-false} + - PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + - PORTAINER_URL=${PORTAINER_URL} + - PORTAINER_KEY=${PORTAINER_KEY} + - GLEANERIO_GLEANER_IMAGE=${GLEANERIO_GLEANER_IMAGE} + - GLEANERIO_NABU_IMAGE=${GLEANERIO_NABU_IMAGE} + - GLEANERIO_LOG_PREFIX=${GLEANERIO_LOG_PREFIX} + - GLEANER_MINIO_ADDRESS=${GLEANERIO_MINIO_ADDRESS} + - GLEANER_MINIO_PORT=${GLEANERIO_MINIO_PORT} + - 
GLEANER_MINIO_USE_SSL=${GLEANERIO_MINIO_USE_SSL} + - GLEANER_MINIO_BUCKET=${GLEANERIO_MINIO_BUCKET} + - GLEANER_MINIO_ACCESS_KEY=${GLEANERIO_MINIO_ACCESS_KEY} + - GLEANER_MINIO_SECRET_KEY=${GLEANERIO_MINIO_SECRET_KEY} + - GLEANER_HEADLESS_ENDPOINT=${GLEANERIO_HEADLESS_ENDPOINT} + - GLEANER_HEADLESS_NETWORK=${GLEANERIO_HEADLESS_NETWORK} + - GLEANER_GRAPH_URL=${GLEANERIO_GRAPH_URL} + - GLEANER_GRAPH_NAMESPACE=${GLEANERIO_GRAPH_NAMESPACE} + - GLEANERIO_NABU_CONFIG_PATH=${GLEANERIO_NABU_CONFIG_PATH:-/configs/gleaner/nabuconfig.yaml} + - GLEANERIO_GLEANER_CONFIG_PATH=${GLEANERIO_GLEANER_CONFIG_PATH:-/configs/gleaner/gleanerconfig.yaml} + - GLEANERIO_NABU_DOCKER_CONFIG=${GLEANERIO_NABU_DOCKER_CONFIG:-nabu} + - GLEANERIO_GLEANER_DOCKER_CONFIG=${GLEANERIO_GLEANER_DOCKER_CONFIG:-gleaner} + ports: + - 3000:3000 + networks: + # - traefik_proxy + - dagster_host + depends_on: + - dagster-postgres - dagster-postgres: - image: postgres:13.3 - ports: - - 5432:5432 - environment: - - POSTGRES_PASSWORD=secret - volumes: - - dagster-postgres:/var/lib/postgresql/data - networks: - - traefik_proxy + dagster-daemon: + image: earthcube/dagster_oih:0.0.2 + # image: docker.io/nsfearthcube/dagster-${PROJECT:-eco}:${CONTAINER_TAG:-latest} + # secrets: + # - MINIO_ROOT_ACCESS_KEY + # - MINIO_ROOT_SECRET_KEY + environment: *env + command: "dagster-daemon run" + depends_on: + - dagster-postgres + networks: + - dagster_host + dagster-postgres: + image: postgres:13.3 + ports: + - 5432:5432 + environment: + - POSTGRES_PASSWORD=secret + volumes: + - dagster-postgres:/var/lib/postgresql/data + networks: + - dagster_host + # - traefik_proxy - headless: - # image: chromedp/headless-shell:stable - # stable after 105 causes "devtool: CreateURL: Using unsafe HTTP verb GET to invoke /json/new. 
This action supports only PUT verb.", - image: chromedp/headless-shell:105.0.5195.127 - restart: unless-stopped - shm_size: "2gb" - ports: - - 9222:9222 - environment: - - SERVICE_PORTS=9222 - networks: - - traefik_proxy + headless: + # image: chromedp/headless-shell:stable + # stable after 105 causes "devtool: CreateURL: Using unsafe HTTP verb GET to invoke /json/new. This action supports only PUT verb.", + image: chromedp/headless-shell:105.0.5195.127 + restart: unless-stopped + shm_size: "2gb" + ports: + - 9222:9222 + environment: + - SERVICE_PORTS=9222 + networks: + - dagster_host + # - traefik_proxy diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py index dbbf24c4..a7d0e8d2 100644 --- a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py +++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_sources.py @@ -87,16 +87,20 @@ def gleanerio_tenants(context): def check_for_valid_sitemap( sources_active): validated_sources=[] for source in sources_active: - try: - sm = Sitemap(source['url'], no_progress_bar=True) - - source['sm_url_is_valid'] = sm.validUrl() - validated_sources.append(source) - get_dagster_logger().info(f" sitemap url valid {source['sm_url_is_valid']} for {source['name']} {source['url']}") - except Exception as e: - get_dagster_logger().error(f" sitemap url ERROR for {source['name']} {source['url']} exception {e}") - source['sm_url_is_valid'] = False + if source['sourcetype'] == "sitegraph": + source['sm_url_is_valid'] = True validated_sources.append(source) + get_dagster_logger().info(f" sitegraph url valid {source['sm_url_is_valid']} for {source['name']} {source['url']}") + else: + try: + sm = Sitemap(source['url'], no_progress_bar=True) + source['sm_url_is_valid'] = sm.validUrl() + validated_sources.append(source) + get_dagster_logger().info(f" sitemap url valid {source['sm_url_is_valid']} for {source['name']} {source['url']}") + 
except Exception as e: + get_dagster_logger().error(f" sitemap url ERROR for {source['name']} {source['url']} exception {e}") + source['sm_url_is_valid'] = False + validated_sources.append(source) return validated_sources @multi_asset( @@ -122,6 +126,7 @@ def gleanerio_sources(context ): sources_obj = yaml.safe_load(source) sources_all_value = list(filter(lambda t: t["name"], sources_obj["sources"])) sources_active_value = filter(lambda t: t["active"], sources_all_value ) + # sources_sourcetype_value = filter(lambda t: t["sourcetype"], sources_all_value ) # df not used when set source_sm_validated = list(check_for_valid_sitemap( sources_active_value)) context.log.info(f"validated sitemaps {source_sm_validated} ") sources_active_names = list(map(lambda t: t["name"], filter(lambda t: t["sm_url_is_valid"], source_sm_validated ))) diff --git a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py index c5d6e375..b2f959be 100644 --- a/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py +++ b/dagster/implnets/workflows/ingest/ingest/assets/gleaner_summon_assets.py @@ -45,16 +45,18 @@ def getSource(context, source_name): def validate_sitemap_url(context): source_name = context.asset_partition_key_for_output() source = getSource(context, source_name) - sm = Sitemap(source['url'], no_progress_bar=True) - if sm.validUrl(): - return source['url'] - else: - context.log.error(f"source: {source['name']} bad url: {source['url']}") - raise HTTPError(url=source['url'], - code=404, - hdrs=None, - fp=None, - msg=f"Bad URL ource: {source['name']} bad url: {source['url']}" ) + + if source['sourcetype'] == "sitemap": # ie, skip this for type sitegraph + sm = Sitemap(source['url'], no_progress_bar=True) + if sm.validUrl(): + return source['url'] + else: + context.log.error(f"source: {source['name']} bad url: {source['url']}") + raise HTTPError(url=source['url'], + code=404, 
+ hdrs=None, + fp=None, + msg=f"Bad URL source: {source['name']} bad url: {source['url']}" ) @asset(group_name="load", key_prefix="ingest", @@ -225,7 +227,7 @@ def release_summarize(context) : bucket_name, object_name =s3Minio.putTextFileToStore(summaryttl, s3ObjectInfo) context.add_output_metadata( metadata={ - "source": source, # Metadata can be any key-value pair + "source": source_name, # Metadata can be any key-value pair "run": "release_summarize", "bucket_name": bucket_name, # Metadata can be any key-value pair "object_name": object_name, @@ -270,7 +272,7 @@ def identifier_stats(context): #r = str('identifier stats returned value:{}'.format(returned_value)) report = returned_value.to_json() s3Minio.putReportFile(bucket, source_name, "identifier_stats.json", report) - get_dagster_logger().info(f"identifer stats report returned {r} ") + get_dagster_logger().info(f"identifier stats report returned {r} ") return @asset(group_name="load",key_prefix="ingest", diff --git a/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py b/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py index 6901701a..0f72b4a8 100644 --- a/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py +++ b/dagster/implnets/workflows/ingest/ingest/jobs/summon_assets.py @@ -4,16 +4,18 @@ get_dagster_logger, ) +# from dagster.implnets.templates.v1.implnet_ops_SOURCEVAL import post_to_graph from ..assets.gleaner_summon_assets import * +from ..assets.tenant import * from ..assets.gleaner_sources import sources_partitions_def, gleanerio_sources +# from ..resources.graph import GraphResource # disabling load_graph report until we can move it to tenant build runs. 
summon_asset_job = define_asset_job( name="summon_and_release_job", selection=AssetSelection.assets(validate_sitemap_url, gleanerio_run, release_nabu_run, load_report_s3, release_summarize, identifier_stats, bucket_urls, - graph_stats_report, - #load_report_graph + graph_stats_report #, upload_release ), partitions_def=sources_partitions_def, #tags={"dagster/concurrency_key": 'ingest'}, diff --git a/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py b/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py index bda3276e..2f1a2799 100644 --- a/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py +++ b/dagster/implnets/workflows/ingest/ingest/resources/gleanerio.py @@ -132,7 +132,7 @@ class GleanerioResource(ConfigurableResource): def _get_client(self, docker_container_context: DockerContainerContext): headers = {'X-API-Key': self.GLEANERIO_PORTAINER_APIKEY} - client = docker.DockerClient(base_url=self.GLEANERIO_DOCKER_URL, version="1.47") + client = docker.DockerClient(base_url=self.GLEANERIO_DOCKER_URL, version="1.47") # was 1.43, I changed to 1.47 # client = docker.APIClient(base_url=URL, version="1.35") get_dagster_logger().info(f"create docker client") if (client.api._general_configs): diff --git a/runConfigurations/dagster_ingest_debug (1).run.xml b/runConfigurations/dagster_ingest_debug_oih.run.xml similarity index 84% rename from runConfigurations/dagster_ingest_debug (1).run.xml rename to runConfigurations/dagster_ingest_debug_oih.run.xml index f8db179a..7f1e8620 100644 --- a/runConfigurations/dagster_ingest_debug (1).run.xml +++ b/runConfigurations/dagster_ingest_debug_oih.run.xml @@ -1,17 +1,16 @@ - +