Add conforms_to as mandatory property #441

Merged · 3 commits · Dec 11, 2023
3 changes: 3 additions & 0 deletions editor/cypress/fixtures/coco.json
@@ -3,6 +3,7 @@
"@language": "en",
"@vocab": "https://schema.org/",
"column": "ml:column",
"conformsTo": "dct:conformsTo",
"data": {
"@id": "ml:data",
"@type": "@json"
@@ -11,6 +12,7 @@
"@id": "ml:dataType",
"@type": "@vocab"
},
"dct": "http://purl.org/dc/terms/",
"extract": "ml:extract",
"field": "ml:field",
"fileProperty": "ml:fileProperty",
@@ -35,6 +37,7 @@
},
"@type": "sc:Dataset",
"name": "COCO",
"conformsTo": "http://mlcommons.org/croissant/1.0",
"description": "COCO is a large-scale object detection, segmentation, and captioning dataset. WARNING: `metadata.json` is incomplete and does not fully define the COCO2014 dataset. It lacks `recordSet` definitions that would enable automatic loading of all the annotations.",
"citation": "@article{DBLP:journals/corr/LinMBHPRDZ14,\n author = {Tsung{-}Yi Lin and\n Michael Maire and\n Serge J. Belongie and\n Lubomir D. Bourdev and\n Ross B. Girshick and\n James Hays and\n Pietro Perona and\n Deva Ramanan and\n Piotr Doll{'{a}}r and\n C. Lawrence Zitnick},\n title = {Microsoft {COCO:} Common Objects in Context},\n journal = {CoRR},\n volume = {abs/1405.0312},\n year = {2014},\n url = {http://arxiv.org/abs/1405.0312},\n archivePrefix = {arXiv},\n eprint = {1405.0312},\n timestamp = {Mon, 13 Aug 2018 16:48:13 +0200},\n biburl = {https://dblp.org/rec/bib/journals/corr/LinMBHPRDZ14},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}",
"license": [
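The two `@context` additions above work together: the short key `conformsTo` is mapped to `dct:conformsTo`, and the new `dct` prefix resolves to `http://purl.org/dc/terms/`, so the top-level `"conformsTo"` property expands to the Dublin Core `conformsTo` term. A minimal sketch of that expansion in plain Python (illustrative only, not mlcroissant's actual JSON-LD machinery):

```python
# Illustrative sketch of how the context entries added in this diff expand a key.
context = {
    "conformsTo": "dct:conformsTo",
    "dct": "http://purl.org/dc/terms/",
}

def expand(key: str) -> str:
    """Resolve a key through the context, then expand a prefix:suffix pair."""
    term = context.get(key, key)
    prefix, _, suffix = term.partition(":")
    if suffix and prefix in context:
        return context[prefix] + suffix
    return term

assert expand("conformsTo") == "http://purl.org/dc/terms/conformsTo"
```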
@@ -68,7 +68,7 @@ def __post_init__(self):
# Check properties.
self.validate_name()
self.validate_version()
self.assert_has_mandatory_properties("name")
self.assert_has_mandatory_properties("name", "conforms_to")
self.assert_has_optional_properties("citation", "license", "version")

# Raise exception if there are errors.
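With this change, `conforms_to` joins `name` as a mandatory property on `Metadata`, so constructing one without it should surface a missing-mandatory-property error (and, per the comment above, raise). A minimal sketch of the expected behaviour, assuming the keyword arguments used in the notebooks below; the concrete exception type is not visible in this diff:

```python
import mlcroissant as mlc

# Sketch: `name` alone is no longer enough, because `conforms_to` is now mandatory.
try:
    mlc.Metadata(name="my_dataset", url="https://example.com")
except Exception as error:  # The exact exception class is not part of this diff.
    print(error)

# Supplying conforms_to satisfies the new check.
metadata = mlc.Metadata(
    name="my_dataset",
    url="https://example.com",
    conforms_to="http://mlcommons.org/croissant/1.0",
)
print(metadata.issues.report())  # No missing-mandatory-property error expected here.
```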
@@ -24,7 +24,7 @@ def test_checks_are_performed():
Node, "validate_name"
) as validate_name_mock:
create_test_node(Metadata, name="field_name")
mandatory_mock.assert_called_once_with("name")
mandatory_mock.assert_called_once_with("name", "conforms_to")
optional_mock.assert_called_once_with("citation", "license", "version")
validate_name_mock.assert_called_once()

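For context, the `*_mock` objects in the hunk above come from nested `mock.patch.object` context managers over the `Node` base class, of which only the last one (`validate_name`) is visible in this diff. A condensed sketch of the full pattern; the commented-out import paths are assumptions, since they are not shown here:

```python
from unittest import mock

# Assumed imports (module paths not visible in this diff):
# from mlcroissant._src.structure_graph.base_node import Node
# from mlcroissant._src.structure_graph.nodes.metadata import Metadata
# from mlcroissant._src.tests.nodes import create_test_node


def test_checks_are_performed():
    # Patch the validation hooks on Node so the test can assert which checks
    # Metadata.__post_init__ requests, without actually running them.
    with mock.patch.object(
        Node, "assert_has_mandatory_properties"
    ) as mandatory_mock, mock.patch.object(
        Node, "assert_has_optional_properties"
    ) as optional_mock, mock.patch.object(
        Node, "validate_name"
    ) as validate_name_mock:
        create_test_node(Metadata, name="field_name")
        mandatory_mock.assert_called_once_with("name", "conforms_to")
        optional_mock.assert_called_once_with("citation", "license", "version")
        validate_name_mock.assert_called_once()
```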
1 change: 1 addition & 0 deletions python/mlcroissant/recipes/bounding-boxes.ipynb
@@ -113,6 +113,7 @@
"metadata = mlc.Metadata(\n",
" name=\"COCO2014\",\n",
" url=\"https://cocodataset.org\",\n",
" conforms_to=\"http://mlcommons.org/croissant/1.0\",\n",
" distribution=distribution,\n",
" record_sets=record_sets,\n",
")"
163 changes: 82 additions & 81 deletions python/mlcroissant/recipes/introduction.ipynb
@@ -1,30 +1,19 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Tutorial for `mlcroissant` 🥐"
],
"metadata": {
"id": "AriH9CP6AKhs"
}
},
"source": [
"# Tutorial for `mlcroissant` 🥐"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Hh-0cehIAErA"
},
"source": [
"## Introduction\n",
"\n",
@@ -37,10 +26,7 @@
"- Programmatically write your JSON-LD Croissant files.\n",
"- Verify your JSON-LD Croissant files.\n",
"- Load data from Croissant datasets."
],
"metadata": {
"id": "Hh-0cehIAErA"
}
]
},
{
"cell_type": "code",
@@ -52,29 +38,34 @@
"source": [
"# https://github.com/mlcommons/croissant/python/mlcroissant\n",
"\n",
"!git clone https://github.com/mlcommons/croissant.git",
"!git clone https://github.com/mlcommons/croissant.git\n",
"\n",
"%cd croissant/python/mlcroissant",
"%cd croissant/python/mlcroissant\n",
"\n",
"!pip install -e .[git]",
"!pip install -e .[git]\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Xwrol5JR_GTY"
},
"source": [
"## Example\n",
"\n",
"Let's try on a very concrete dataset: OpenAI's [`gpt-3`](https://github.com/openai/gpt-3) dataset for LLMs!\n",
"\n",
"In the tutorial, we will generate programmatically the Croissant JSON-LD file describing the dataset. Then we will verify the file and yield data from the dataset."
],
"metadata": {
"id": "Xwrol5JR_GTY"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "7OyQffJv-zso"
},
"outputs": [],
"source": [
"import mlcroissant as mlc\n",
"\n",
@@ -179,6 +170,7 @@
" \" distinguishing from articles written by humans. We discuss broader\"\n",
" \" societal impacts of this finding and of GPT-3 in general.\"\n",
" ),\n",
" conforms_to=\"http://mlcommons.org/croissant/1.0\",\n",
" citation=(\n",
" \"@article{brown2020language, title={Language Models are Few-Shot\"\n",
" \" Learners}, author={Tom B. Brown and Benjamin Mann and Nick Ryder and\"\n",
@@ -196,61 +188,61 @@
" distribution=distribution,\n",
" record_sets=record_sets,\n",
")\n"
],
"metadata": {
"id": "7OyQffJv-zso"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "2RUVgWI-DldZ"
},
"source": [
"When creating `Metadata`:\n",
"- We also check for errors in the configuration.\n",
"- We generate warnings if the configuration doesn't follow guidelines and best practices.\n",
"\n",
"For instance, in this case:"
],
"metadata": {
"id": "2RUVgWI-DldZ"
}
]
},
{
"cell_type": "code",
"source": [
"print(metadata.issues.report())"
],
"execution_count": null,
"metadata": {
"id": "AENcJUwMCd1B"
},
"execution_count": null,
"outputs": []
"outputs": [],
"source": [
"print(metadata.issues.report())"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vES3KHaND4P2"
},
"source": [
"`Property \"https://schema.org/license\" is recommended`...\n",
"\n",
"We can see at a glance that we miss an important metadata to build datasets for responsible AI: the license!"
],
"metadata": {
"id": "vES3KHaND4P2"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "S0BEhzqiEjd0"
},
"source": [
"## Build the Croissant file and yield data\n",
"\n",
"Let's write the Croissant JSON-LD to a file on disk!"
],
"metadata": {
"id": "S0BEhzqiEjd0"
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "-XCycu81ECVq"
},
"outputs": [],
"source": [
"import json\n",
"\n",
@@ -260,66 +252,75 @@
" print(content)\n",
" f.write(content)\n",
" f.write(\"\\n\") # Terminate file with newline"
],
"metadata": {
"id": "-XCycu81ECVq"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "markdown",
"source": [
"From this JSON-LD file, we can easily create a dataset..."
],
"metadata": {
"id": "Ypb_ll3SE6UU"
}
},
"source": [
"From this JSON-LD file, we can easily create a dataset..."
]
},
{
"cell_type": "code",
"source": [
"dataset = mlc.Dataset(file=\"croissant.json\")"
],
"execution_count": null,
"metadata": {
"id": "_JNyQFuAEiIs"
},
"execution_count": null,
"outputs": []
"outputs": [],
"source": [
"dataset = mlc.Dataset(file=\"croissant.json\")"
]
},
{
"cell_type": "markdown",
"source": [
"...and yield records from this dataset:"
],
"metadata": {
"id": "ldwdIGPoFT_p"
}
},
"source": [
"...and yield records from this dataset:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "MHdVY4TBEqZ8"
},
"outputs": [],
"source": [
"records = dataset.records(record_set=\"jsonl\")\n",
"\n",
"for i, record in enumerate(records):\n",
" print(record)\n",
" if i > 10:\n",
" break"
],
"metadata": {
"id": "MHdVY4TBEqZ8"
},
"execution_count": null,
"outputs": []
]
},
{
"cell_type": "code",
"source": [],
"execution_count": null,
"metadata": {
"id": "8a2sCy0GFYCQ"
},
"execution_count": null,
"outputs": []
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
]
},
"nbformat": 4,
"nbformat_minor": 0
}
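Stripped of the cell-metadata reshuffling, the functional flow of the updated introduction notebook is: build `Metadata` (now with the mandatory `conforms_to`), check `metadata.issues.report()`, write `croissant.json`, then load it back and stream records. The last two cells condense to roughly the following (the `jsonl` record set name is the one used in the notebook):

```python
import mlcroissant as mlc

# Load the Croissant JSON-LD written by the previous cell and stream its records.
dataset = mlc.Dataset(file="croissant.json")
records = dataset.records(record_set="jsonl")

for i, record in enumerate(records):
    print(record)
    if i > 10:  # preview only the first few records
        break
```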