diff --git a/docs/_toc.yml b/docs/_toc.yml index 8fde307..8741fb2 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -9,7 +9,7 @@ parts: numbered: False chapters: - file: introduction/about.md - title: "About" + title: "About & Mission" - file: introduction/implementation.md title: "Implementation overview" - file: introduction/data_sources.md diff --git a/docs/dev_guide/architecture/07_deployment_view.md b/docs/dev_guide/architecture/07_deployment_view.md index 0dd245a..55a9a1e 100644 --- a/docs/dev_guide/architecture/07_deployment_view.md +++ b/docs/dev_guide/architecture/07_deployment_view.md @@ -99,7 +99,7 @@ Mapping of Building Blocks to Infrastructure : *\* ::: -## Infrastructure Level 1 {#_infrastructure_level_1} +## Infrastructure Level 1 UnHIDE is deployed on [HDF-cloud](https://www.fz-juelich.de/en/ias/jsc/systems/scientific-clouds/hdf-cloud) diff --git a/docs/diagrams/make_svgs.ipynb b/docs/diagrams/make_svgs.ipynb index 28f6a9f..34e8b7b 100644 --- a/docs/diagrams/make_svgs.ipynb +++ b/docs/diagrams/make_svgs.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 37, + "execution_count": 1, "id": "803b291a", "metadata": {}, "outputs": [], @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 2, "id": "3267f714", "metadata": {}, "outputs": [ @@ -23,7 +23,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32msuccess\u001b[0m: successfully compiled documentation_deployment.d2 to documentation_deployment.svg in 200.738359ms\r\n" + "\u001b[32msuccess\u001b[0m: successfully compiled documentation_deployment.d2 to documentation_deployment.svg in 538.914272ms\n" ] } ], @@ -33,11 +33,9 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 3, "id": "ee21bf18", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "data": { @@ -916,7 +914,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 4, "id": "9ce63c07", "metadata": {}, 
"outputs": [ @@ -924,7 +922,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32msuccess\u001b[0m: successfully compiled unhide_harvester_datapipeline.d2 to unhide_harvester_datapipeline.svg in 417.583927ms\r\n" + "\u001b[32msuccess\u001b[0m: successfully compiled unhide_harvester_datapipeline.d2 to unhide_harvester_datapipeline.svg in 434.938943ms\n" ] } ], @@ -934,7 +932,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 5, "id": "4ba64fe0", "metadata": { "scrolled": true @@ -1867,7 +1865,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 6, "id": "2803cb0a", "metadata": {}, "outputs": [ @@ -1875,7 +1873,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32msuccess\u001b[0m: successfully compiled unhide_overview_architecture.d2 to unhide_overview_architecture.svg in 14.717877ms\r\n" + "\u001b[32msuccess\u001b[0m: successfully compiled unhide_overview_architecture.d2 to unhide_overview_architecture.svg in 15.805072ms\n" ] } ], @@ -1885,7 +1883,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 7, "id": "9aada51e", "metadata": {}, "outputs": [ @@ -1995,7 +1993,7 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 12, "id": "a03d6465", "metadata": {}, "outputs": [ @@ -2003,7 +2001,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[32msuccess\u001b[0m: successfully compiled unhide_deployment_overview.d2 to unhide_deployment_overview.svg in 678.758987ms\r\n" + "\u001b[32msuccess\u001b[0m: successfully compiled unhide_deployment_overview.d2 to unhide_deployment_overview.svg in 628.711054ms\n" ] } ], @@ -2013,34 +2011,34 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 13, "id": "1030a77d", "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ - "HDF-CloudInternetUnHIDE deploymentVirtual Machine instanceBackupBackup VM 
Imagesearch.unhide.helmholtz-metadaten.deapi.unhide.helmholtz-metadaten.desparql.unhide.helmholtz-metadaten.deMounted Data VolumeData pipelineWeb front endAPIApache Jenanginx-proxyletsencrypt-nginx-proxy-companionSOLR instance:IndexerUnHIDE Data filesSOLR IndexUnHIDE Graph filesHarvestersReact appFastAPIUnHIDE GraphFuseki SPARQL APINGINXApache SOLRindexer reads fromstores data store & retrieve graphstores & retrieve indexqueriesqueriesroutesencryptsrequestsroutesroutesmanualhandles requests\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", + "JSC-CloudInternetHelmholtz Knowledge graph deploymentVirtual Machine instanceBackupBackup VM Imagesearch.unhide.helmholtz-metadaten.deapi.unhide.helmholtz-metadaten.desparql.unhide.helmholtz-metadaten.deMounted Data VolumeData pipelineWeb front endAPIOpenLink Virtuosonginx-proxyletsencrypt-nginx-proxy-companionSOLR instance:IndexerjenaUnHIDE Data filesSOLR IndexUnHIDE Graph filesHarvestersReact appFastAPIUnHIDE GraphOpenLink SPARQL APINGINXApache SOLRindexersparql reads fromstores data store & retrieve graphstores & retrieve indexqueriesqueriesroutesencryptsrequestsroutesroutesmanualhandles requests\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", "" ], "text/plain": [ @@ -2919,6 +2919,25 @@ "show_svg(filename='unhide_deployment_overview.svg')" ] }, + { + "cell_type": "code", + "execution_count": 15, + "id": 
"a7fb7ebc-fa9b-4b70-9727-9e87ed2f0069", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34minfo\u001b[0m: converting to PNG...\n", + "\u001b[32msuccess\u001b[0m: successfully compiled unhide_deployment_overview.d2 to unhide_deployment_overview.pdf in 7.163495982s\n" + ] + } + ], + "source": [ + "!d2 unhide_deployment_overview.d2 unhide_deployment_overview.pdf" + ] + }, { "cell_type": "code", "execution_count": 131, diff --git a/docs/diagrams/unhide_deployment_overview.d2 b/docs/diagrams/unhide_deployment_overview.d2 index d318548..2c0f65e 100644 --- a/docs/diagrams/unhide_deployment_overview.d2 +++ b/docs/diagrams/unhide_deployment_overview.d2 @@ -1,4 +1,4 @@ -title: UnHIDE deployment { +title: Helmholtz Knowledge graph deployment { shape: text near: top-center style: { @@ -6,7 +6,7 @@ title: UnHIDE deployment { } } -hdfcloud: HDF-Cloud{ +hdfcloud: JSC-Cloud{ style: { font-size: 55 } @@ -55,7 +55,7 @@ hdfcloud: HDF-Cloud{ } } - jena: Apache Jena { + virtuoso: OpenLink Virtuoso { style: { font-size: 55 } @@ -63,8 +63,8 @@ hdfcloud: HDF-Cloud{ graph: UnHIDE Graph { icon: https://icons.terrastruct.com/azure%2FManagement%20and%20Governance%20Service%20Color%2FResource%20Graph%20Explorer.svg } - sparql: Fuseki SPARQL API { - icon: ./sparql.svg + sparql: OpenLink SPARQL API { + icon: ./virtuoso_logo.png } } @@ -94,11 +94,11 @@ hdfcloud: HDF-Cloud{ store -> indexer: reads from pipe -> store: stores data - jena <-> store: store & retrieve graph - solr <-> store: stores & retrieve index + virtuoso <-> store.UnHIDE Graph files: store & retrieve graph + solr <-> store.SOLR Index: stores & retrieve index solr <- api: queries - Jena.graph <- jena.sparql: queries - jena.sparql <-> nginx: routes + virtuoso.graph <- virtuoso.sparql: queries + virtuoso.sparql <-> nginx: routes letsencrypt <-> nginx: encrypts web -> api: requests web <-> nginx: routes @@ -126,4 +126,4 @@ Internet { domain3: sparql.unhide.helmholtz-metadaten.de }
-hdfcloud.cloud.nginx <-> Internet: handles requests \ No newline at end of file +hdfcloud.cloud.nginx <-> Internet: handles requests diff --git a/docs/diagrams/unhide_deployment_overview.pdf b/docs/diagrams/unhide_deployment_overview.pdf new file mode 100644 index 0000000..94d7a95 Binary files /dev/null and b/docs/diagrams/unhide_deployment_overview.pdf differ diff --git a/docs/diagrams/unhide_deployment_overview.svg b/docs/diagrams/unhide_deployment_overview.svg index ef79d90..980cda1 100644 --- a/docs/diagrams/unhide_deployment_overview.svg +++ b/docs/diagrams/unhide_deployment_overview.svg @@ -1,24 +1,24 @@ -HDF-CloudInternetUnHIDE deploymentVirtual Machine instanceBackupBackup VM Imagesearch.unhide.helmholtz-metadaten.deapi.unhide.helmholtz-metadaten.desparql.unhide.helmholtz-metadaten.deMounted Data VolumeData pipelineWeb front endAPIApache Jenanginx-proxyletsencrypt-nginx-proxy-companionSOLR instance:IndexerUnHIDE Data filesSOLR IndexUnHIDE Graph filesHarvestersReact appFastAPIUnHIDE GraphFuseki SPARQL APINGINXApache SOLRindexer reads fromstores data store & retrieve graphstores & retrieve indexqueriesqueriesroutesencryptsrequestsroutesroutesmanualhandles requests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +JSC-CloudInternetHelmholtz Knowledge graph deploymentVirtual Machine instanceBackupBackup VM Imagesearch.unhide.helmholtz-metadaten.deapi.unhide.helmholtz-metadaten.desparql.unhide.helmholtz-metadaten.deMounted Data VolumeData pipelineWeb front endAPIOpenLink Virtuosonginx-proxyletsencrypt-nginx-proxy-companionSOLR instance:IndexerjenaUnHIDE Data filesSOLR IndexUnHIDE Graph filesHarvestersReact appFastAPIUnHIDE GraphOpenLink SPARQL APINGINXApache SOLRindexersparql reads fromstores data store & retrieve graphstores & retrieve indexqueriesqueriesroutesencryptsrequestsroutesroutesmanualhandles requests + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/docs/diagrams/virtuoso_logo.png b/docs/diagrams/virtuoso_logo.png new file mode 100644 index 0000000..c6d53c2 Binary files /dev/null and b/docs/diagrams/virtuoso_logo.png differ diff --git a/docs/images/hzb-logo-a4-rgb.png b/docs/images/hzb-logo-a4-rgb.png new file mode 100644 index 0000000..5cd96e8 Binary files /dev/null and b/docs/images/hzb-logo-a4-rgb.png differ diff --git a/docs/intro.md b/docs/intro.md index bdaee51..781853b 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -39,19 +39,22 @@ With the implementation of the Helmholtz-KG, unHIDE will create substantial addi ## Contributors and Partners +% [HZB](https://www.helmholtz-berlin.de/) -[FZJ](https://fz-juelich.de) +[FZJ](https://fz-juelich.de) +![HZB](./images/hzb-logo-a4-rgb.png) -## Acknowledgements +## Acknowledgements -[HMC Logo](https://helmholtz-metadaten.de) This project was developed and funded by the Helmholtz Metadata Collaboration (HMC), an incubator-platform of the Helmholtz Association within the framework of the Information and Data Science strategic initiative. +[HMC Logo](https://helmholtz-metadaten.de) + ## References - [1] https://5stardata.info/en/ diff --git a/docs/introduction/about.md b/docs/introduction/about.md index ff39d4a..0c5798d 100644 --- a/docs/introduction/about.md +++ b/docs/introduction/about.md @@ -1,3 +1,13 @@ -# About UnHIDE +# About UnHIDE and its mission -![unhide_overview](../images/unhide_overview.png) +## Mission + +The efforts of the unHIDE initiative are one part of the efforts by the Helmholtz Metadata Collaboration (HMC) to improve the quality, knowledge management and conservation of research output of the Helmholtz Association with respect to and through metadata. This is accomplished by making research output `FAIR` through better metadata or, differently formulated, by creating to a certain extent a form of semantic web encompassing Helmholtz research.
+ +With the unHIDE initiative our goal is to improve the metadata at the source and make data providers as well as scientists more aware of what metadata they put out on the web, how and with what quality. +For this we create and expose the Helmholtz knowledge graph, which contains open high-level metadata exposed by different Helmholtz infrastructures. Also, such a graph allows for services which serve the needs of certain stakeholder groups to empower their work in different ways. + +Beyond the knowledge graph, in unHIDE we communicate and work together with Helmholtz infrastructures to improve metadata (or make it available in the first place) through consulting, help and fostering networking between the infrastructures and respected experts. + + +![unhide_overview](../images/unhide_overview.png) \ No newline at end of file diff --git a/docs/tech/datapipe.md b/docs/tech/datapipe.md index a1446d8..e902577 100644 --- a/docs/tech/datapipe.md +++ b/docs/tech/datapipe.md @@ -1,8 +1,8 @@ # Data pipeline -In UnHIDE data is harvested from connected providers and partners. -Then data is 'uplifted', i.e semantically enriched and or completed, -where possible from aggregated data or schema.org semantics. +In UnHIDE metadata about research outputs is harvested from connected providers and partners. +Then the original metadata is 'uplifted', i.e. semantically enriched and/or completed, +where possible, from aggregated data or schema.org semantics (as one example of how this can be done). ## Overview @@ -36,4 +36,16 @@ The second direction is there to provide full text search on the data to end use For this an index of each uplifted data record is constructed and uploaded into a single SOLR index, which is exposed to a certain extend via a custom fastAPI. A web front end using the javascript library React provides a user interface for the full text search and supports special use cases as a service -to certain stakeholder groups. \ No newline at end of file +to certain stakeholder groups.
+ + +The technical implementation is currently a minimal running version, exposing each +component and functionality through the command line interface `hmc-unhide` and then using +cron jobs to run them from time to time. On the deployment instance this can be run monthly or +weekly. In the longer term, the pipeline orchestration itself should become more sophisticated. +For this one could deploy a workflow manager with provenance tracking like AiiDA, +or one with less overhead depending on the needs, also if one wants to move to a more event-based system +with more fault tolerance for errors of individual records or data sources. Currently, +in the minimal implementation there is the risk that an uncaught failure in a subtask +fails a larger part of the pipeline. This is then only logged, but has to be resolved manually. + diff --git a/docs/tech/harvesting.md b/docs/tech/harvesting.md index f524043..5970eec 100644 --- a/docs/tech/harvesting.md +++ b/docs/tech/harvesting.md @@ -1,3 +1,30 @@ -# Data harvesting +# Data harvesting: extracting metadata from the web -How does UnHIDE harvested data? \ No newline at end of file +How does UnHIDE harvest data? + +Data harvesting and mining for the knowledge graph is done by `Harvester classes`. +For each interface a specific Harvester class should be implemented. +All Harvester classes should inherit from existing Harvesters or the [`BaseHarvester`](https://codebase.helmholtz.cloud/hmc/hmc-public/unhide/data_harvesting/-/blob/main/data_harvesting/base_harvester.py?ref_type=heads), which currently specifies that: + +1. Each harvester needs a `run` method +2. Can read from the [`config.yml`](https://codebase.helmholtz.cloud/hmc/hmc-public/unhide/data_harvesting/-/blob/main/data_harvesting/configs/config.yaml?ref_type=heads) +3. 
Reads from a `.last_run` file the time the harvester was last run + +Implemented harvester classes include: + +| Name (Cli) | Class Name | Interface | Comment | +|-------------|------------|-----------|---------| +|sitemap | SitemapHarvester | sitemaps | Selecting record links from the sitemap requires expression matching. Relies on the advertools lib.| +|oai | OAIHarvester | OAI-PMH | Relies on the oai lib. For the library providers, dublin core is converted to schema.org | +|git | GitHarvester | Git, Gitlab/Github API | Relies on codemetapy and codemeta-harvester as well as gitlab/github APIs. | +|datacite | DataciteHarvester | REST API & GraphQL endpoint | schema.org extracted through content negotiation.| +|feed | FeedHarvester | RSS & Atom Feeds | Relies on the atoma library, and also only works if on the landing pages schema.org metadata can be extracted. Can only get recent data, useful for event metadata.| +|indico | IndicoHarvester | Indico REST API | Directly extracts schema.org metadata through API, requires an access token | + +Json-ld metadata from landing pages of records is extracted via the `extruct` library, if it cannot be directly retrieved through some standardized interface. + +All harvesters are exposed on the `hmc-unhide` commandline interface. +They store the extracted metadata per default in the internal data model [`LinkedDataObject`](https://codebase.helmholtz.cloud/hmc/hmc-public/unhide/data_harvesting/-/blob/main/data_harvesting/data_model.py?ref_type=heads). +This has a serialization with some provenance information, original source data and uplifted data, and provides methods for validation. + +A single central yaml configuration file called [`config.yml`](https://codebase.helmholtz.cloud/hmc/hmc-public/unhide/data_harvesting/-/blob/main/data_harvesting/configs/config.yaml?ref_type=heads) specifies for each harvester class the sources to harvest and harvester- or source-specific configuration. \ No newline at end of file