datamodel: reference sheet #1

fenekku · 2019-08-15T16:35:25Z

This is a reference sheet for the core metadata shared by InvenioRDM records:

JSON Schema as of 2019-12-20:

{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "id": "http://localhost/schemas/records/record-v1.0.0.json",
  "title": "Invenio Datacite based Record Schema v1.0.0",
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "_access": {
      "metadata_restricted": {
        "default": false,
        "description": "Record metadata accessibility. Public by default (False).",
        "type": "boolean"
      },
      "files_restricted": {
        "default": false,
        "description": "Record associated files accessibility. Public by default (False).",
        "type": "boolean"
      }
    },
    "_bucket": {
      "description": "Record bucket.",
      "type": "string"
    },
    "access_right": {
      "default": "open",
      "description": "Access right for record.",
      "type": "string"
    },
    "additional_descriptions": {
      "type": "array",
      "items": {
          "type": "object",
          "properties": {
              "description": {
                "description": "Description/abstract for record.",
                "type": "string"
              },
              "description_type": {
                "description": "Type of description.",
                "type": "string"
              },
              "lang": {
                "description": "Language of the description. ISO 639-3 language code.",
                "type": "string",
                "maxLength": 3
              }
          },
          "required": ["description", "description_type"]
      },
      "uniqueItems": true
    },
    "additional_titles": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
            "title": {
              "description": "Title of the record.",
              "type": "string"
            },
            "title_type": {
              "description": "Type of title.",
              "type": "string"
            },
            "lang": {
              "description": "Language of the title. ISO 639-3 language code.",
              "type": "string",
              "maxLength": 3
            }
        },
        "required": ["title"]
      },
      "uniqueItems": true
    },
    "contributors": {
      "description": "Contributors in order of importance.",
      "minItems": 1,
      "type": "array",
      "items": {
        "type": "object",
        "additionalProperties": false,
        "properties": {
          "ids": {
            "description": "List of IDs related with the person.",
            "type": "array",
            "uniqueItems": true,
            "items": {
              "additionalProperties": false,
              "type": "object",
              "properties": {
                "source": {
                  "type": "string"
                },
                "value": {
                  "type": "string"
                }
              }
            }
          },
          "name": {
            "description": "Full name of person or organisation. Personal name format: family, given.",
            "type": "string"
          },
          "affiliations": {
            "description": "Affiliation(s) for the purpose of this specific record.",
            "type": "array",
            "uniqueItems": true,
            "items": {
              "type": "string"
            }
          },
          "email": {
            "type": "string",
            "description": "Contact email for the purpose of this specific record.",
            "format": "email"
          },
          "role": {
            "description": "",
            "type": "string"
          }
        },
        "required": [
          "name"
        ]
      }
    },
    "dates": {
      "description": "Date interval.",
      "items": {
        "additionalProperties": false,
        "properties": {
          "description": {
            "description": "Description of the date interval.",
            "type": "string"
          },
          "end": {
            "description": "End date.",
            "type": "string",
            "format": "date-time"
          },
          "start": {
            "description": "Start date.",
            "type": "string",
            "format": "date-time"
          },
          "type": {
            "description": "Type of the date interval."
          }
        },
        "required": [
          "type"
        ],
        "type": "object"
      },
      "type": "array"
    },
    "description": {
      "description": "Description for record.",
      "type": "string"
    },
    "embargo_date": {
      "description": "Embargo date of record (ISO8601 formatted date).",
      "type": "string",
      "format": "date-time"
    },
    "keywords": {
      "description": "Free text keywords.",
      "items": {
        "type": "string"
      },
      "type": "array"
    },
    "language": {
      "description": "Primary language of the resource. ISO 639-3 language code.",
      "type": "string",
      "maxLength": 3
    },
    "owners": {
      "description": "List of user IDs that are owners of the record.",
      "items": {
        "type": "number"
      },
      "type": "array",
      "minItems": 1,
      "uniqueItems": true
    },
    "publication_date": {
      "description": "Record publication date (ISO8601-formatted). EDTF support to be added for field.",
      "type": "string",
      "format": "date-time"
    },
    "recid": {
      "description": "Invenio record identifier (alphanumeric).",
      "type": "string"
    },
    "resource_type": {
      "additionalProperties": false,
      "description": "Record resource type.",
      "properties": {
        "subtype": {
          "description": "Specific resource type.",
          "type": "string"
        },
        "type": {
          "default": "publication",
          "description": "General resource type.",
          "type": "string"
        }
      },
      "required": [
        "type",
        "subtype"
      ],
      "type": "object"
    },
    "rights": {
      "description": "Any rights information for this resource.",
      "type": "array",
      "items": {
          "type": "object",
          "properties": {
              "rights": {
                "description": "The right itself. Free text.",
                "type": "string"
              },
              "uri": {
                "description": "The URI of the license.",
                "type": "string",
                "format": "uri"
              },
              "identifier": {
                "description": "A short, standardized version of the license name.",
                "type": "string"
              },
              "identifier_scheme": {
                "description": "The name of the scheme.",
                "type": "string"
              },
              "scheme_uri": {
                "description": "The URI of the identifier_scheme.",
                "type": "string",
                "format": "uri"
              },
              "lang": {
                "description": "Language of the right information. ISO 639-3 language code.",
                "type": "string",
                "maxLength": 3
              }
          }
      },
      "uniqueItems": true
    },
    "title": {
      "description": "Record title.",
      "type": "string"
    },
    "version": {
      "description": "Record version tag.",
      "type": "string"
    }
  },
  "required": [
    "_access",
    "access_right",
    "contributors",
    "description",
    "owners",
    "publication_date",
    "resource_type",
    "title"
  ]
}

Fields

Extras for all

Add references (need to check whether the Zenodo structure is OK). Not all references have related identifiers; sometimes it is just a reference to the text.
internal notes (discuss with ILS what they have). "curators notes", non-public notes.
Access condition
"internal fields" : See datamodel: treating internal fields with _ #38

How to handle custom fields?

See Zenodo implementation (custom)
See Data model extension #2

Zenodo custom fields

Imprint
Journal
Part of?
Thesis

TODO

Check against ILS schema.

Discussion points for community:

Keywords or only subjects?

*CV: Controlled Vocabulary
updated 2019-08-16 with comments below.
updated 2019-12-20 with issues.

The text was updated successfully, but these errors were encountered:

tmorrell · 2019-08-15T17:06:17Z

Here are some initial thoughts on this draft:

We might want to support more types of author identifiers beyond ORCID, as in DataCite. I feel very strongly that affiliation should be included with creators and contributors. In DataCite 4.3 (just released) it looks like:

"affiliation": [
{
"name": "Université catholique de Louvain",
"affiliationIdentifier": "https://ror.org/02495e989",
"affiliationIdentifierScheme": "ROR"
}
]

Having multiple descriptions with description types is nice:

"descriptions": [
{
"description": "These files provide the original survey data ",
"descriptionType": "Abstract",
"lang": "en"
}
]

I just did a PR for the DataCite JSON module that includes a more flexible identifier block that should work if a site doesn't want to use DOIs. The identifiers block is mandatory, but no specific identifier type is required.

"identifiers": [
{
"identifierType": "DOI",
"identifier": "https://doi.org/10.5281/zenodo.47394"
},
{
"identifierType": "URL",
"identifier": "http://zenodo.org/record/47394"
}
]

There should probably be a separate metadata block that would accept multiple dates about an object. The system can then map the system generated _created field into one of the dates, but providing flexibility around dates is really important.

"dates": [
{
"date": "2014-10-10",
"dateType": "Created"
},
{
"date": "2019-06-01",
"dateType": "Updated"
}
]

We probably want to support the full DataCite funding object

"fundingReferences": [
{
"funderName": "European Commission",
"funderIdentifier": "http://doi.org/10.13039/501100000780",
"funderIdentifierType": "Crossref Funder ID",
"awardNumber": "282625",
"awardUri": "http://cordis.europa.eu/project/rcn/100180_en.html",
"awardTitle": "MOTivational strength of ecosystem services and alternative ways to express the value of BIOdiversity"
}
]

We should stick with the DataCite related identifier structure unless there is a good reason not to

"relatedIdentifiers": [
{
"relatedIdentifier": "urn:nbn:de:bib-cpos-2013-02en8",
"relatedIdentifierType": "URN",
"relationType": "IsIdenticalTo"
}
]

The DataCite license structure is a good place to start, but we should think about whether we want to provide a better connection between these fields and any files attached to the records

"rightsList": [
{
"rights": "Creative Commons License: Attribution-NonCommercial-NoDerivs 3.0 Unported",
"rightsUri": "http://creativecommons.org/licenses/by-nc-nd/3.0/deed",
"lang": "en-US"
}
]

In general I would prefer sticking with DataCite field names as much as possible.

fenekku · 2019-08-15T17:27:51Z

I was thinking about field names too... I wonder why keeping them the same as DataCite is important?

I see this data model as internal. As long as translation to the fields datacite wants is possible and systematic we are good despite our potentially idiosyncratic internal names. People will always name things their own ways and that isn't a problem: what you do in your own house is your own business. It's when we start interacting with other services that it might be a problem. But then, it's just a matter of doing the translation for the other service. After all we will have to do translations for others as well down the road anyway. Having a standards committee's decisions reach all the way to my internal naming scheme seems oppressive 😄 .

That being said, if it was just a matter of naming, I really wouldn't care (I don't mind re-naming to more domain-appropriate names - I am not a domain expert 😸 ). However, I do see possibilities for our internal data layout to be structured in such a way that it is much more flexible than what following datacite's layout would allow us. For instance, coalescing authors and contributors into a common internal structure would allow us to just have one field/structure for both. That structure could even be reused for acknowledgements... I think there are a couple of these potentially elegant simplifications that would not subtract anything from the understandability of the metadata of the record when taken out of the repository.

I agree we should use DataCite to inform our required needs and as the starting point for names and structures.

tmorrell · 2019-08-15T17:58:25Z

I'd argue having the internal data match a standard is good unless there is a compelling reason not to. We could spend a lot of time arguing whether "funder" or "funderName" is a better label (for example), but I'm not sure it would be a productive use of our time. Not having to do a mapping also helps with interoperability.

I also agree on the author and contributor coalescing...DataCite has implemented this in their JSON version and I just added it to the invenio wrapper in inveniosoftware/datacite#50

lnielsen · 2020-11-02T13:49:07Z

I'm closing the ticket, as nearly all items have been implemented, except for specific items that need further design, like access control, files, PIDs, etc.

fenekku mentioned this issue Aug 15, 2019

Data model extension #2

Closed

fenekku mentioned this issue Aug 16, 2019

Metadata Schema Reference sheet galterlibrary/InvenioRDM-at-NU#272

Open

ppanero mentioned this issue Oct 4, 2019

permissions: check compatibility inveniosoftware/invenio-app-rdm#12

Closed

fenekku mentioned this issue Oct 17, 2019

datamodel: create datacite model based on zenodo #18

Closed

17 tasks

fenekku mentioned this issue Oct 22, 2019

rdm: define {crea,contribu}tors schema inveniosoftware/rfcs#11

Merged

fenekku changed the title ~~Data model reference sheet~~ datamodel: reference sheet Dec 20, 2019

fenekku added a commit to fenekku/invenio-rdm-records that referenced this issue Aug 18, 2020

es: fix elasticsearch v6 support (attempt inveniosoftware#1)

0861a66

fenekku added a commit to fenekku/invenio-rdm-records that referenced this issue Aug 27, 2020

tests: improve coverage inveniosoftware#1

c1ada52

fenekku added a commit that referenced this issue Aug 27, 2020

tests: improve coverage #1

ca16bb6

lnielsen self-assigned this Oct 13, 2020

lnielsen closed this as completed Nov 2, 2020

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

datamodel: reference sheet #1

datamodel: reference sheet #1

fenekku commented Aug 15, 2019 •

edited by lnielsen

Loading

tmorrell commented Aug 15, 2019

fenekku commented Aug 15, 2019

tmorrell commented Aug 15, 2019 •

edited

Loading

lnielsen commented Nov 2, 2020

datamodel: reference sheet #1

datamodel: reference sheet #1

Comments

fenekku commented Aug 15, 2019 • edited by lnielsen Loading

Extras for all

How to handle custom fields?

Zenodo custom fields

TODO

Discussion points for community:

tmorrell commented Aug 15, 2019

fenekku commented Aug 15, 2019

tmorrell commented Aug 15, 2019 • edited Loading

lnielsen commented Nov 2, 2020

fenekku commented Aug 15, 2019 •

edited by lnielsen

Loading

tmorrell commented Aug 15, 2019 •

edited

Loading