Merge from CTuning (#1092)
gfursin authored Feb 2, 2024
2 parents a31610e + 05b1a2f commit 6219ef3
Showing 45 changed files with 802 additions and 113 deletions.
25 changes: 16 additions & 9 deletions README.md
@@ -17,26 +17,33 @@

### About

-Collective Mind (CM) is a [community project](CONTRIBUTING.md) to develop
+**Collective Mind (CM)** is a [community project](https://github.com/mlcommons/ck/blob/master/CONTRIBUTING.md) to develop
a [collection of portable, extensible, technology-agnostic and ready-to-use automation recipes
with a human-friendly interface (aka CM scripts)](https://github.com/mlcommons/ck/tree/master/docs/list_of_scripts.md)
that automate all the manual steps required to build, run, benchmark and optimize complex ML/AI applications on any platform
with any software and hardware.

-CM scripts are being developed based on the feedback from [MLCommons engineers and researchers](docs/taskforce.md)
+CM scripts are being developed based on the feedback from
+[MLCommons engineers and researchers](https://github.com/mlcommons/ck/blob/master/docs/taskforce.md)
to help them assemble, run, benchmark and optimize complex AI/ML applications
across diverse and continuously changing models, data sets, software and hardware
from Nvidia, Intel, AMD, Google, Qualcomm, Amazon and other vendors.
They require Python 3.7+ with minimal dependencies and can run natively on Ubuntu, MacOS, Windows, RHEL, Debian, Amazon Linux
and any other operating system, in a cloud or inside automatically generated containers.

Some key requirements for the CM design are:
-* must be non-intrusive and easy to debug, require zero changes to existing projects and must complement, reuse, wrap and interconnect all existing automation scripts and tools (such as cmake, ML workflows, python poetry and containers) rather than substituting them;
+* must be non-intrusive and easy to debug, require zero changes to existing projects and must complement,
+reuse, wrap and interconnect all existing automation scripts and tools (such as cmake, ML workflows,
+python poetry and containers) rather than substituting them;
* must have a very simple and human-friendly command line with a Python API and minimal dependencies;
-* must require minimal or zero learning curve by using plain Python, native scripts, environment variables and simple JSON/YAML descriptions instead of inventing new languages;
-* must run in a native environment with Ubuntu, Debian, RHEL, Amazon Linux, MacOS, Windows and any other operating system while automatically generating container snapshots with CM recipes for repeatability and reproducibility;
-Below you can find a few examples of this collaborative engineering effort sponsored by [MLCommons (non-profit organization with 125+ organizations)](https://mlcommons.org) -
+* must require minimal or zero learning curve by using plain Python, native scripts, environment variables
+and simple JSON/YAML descriptions instead of inventing new languages;
+* must run in a native environment with Ubuntu, Debian, RHEL, Amazon Linux, MacOS, Windows
+and any other operating system while automatically generating container snapshots
+with CM recipes for repeatability and reproducibility.

+Below you can find a few examples of this collaborative engineering effort sponsored
+by [MLCommons (non-profit organization with 125+ organizations)](https://mlcommons.org) -
a few most-commonly used [automation recipes](https://github.com/mlcommons/ck/tree/master/docs/list_of_scripts.md)
that can be chained into more complex automation workflows [using simple JSON or YAML](https://github.com/mlcommons/ck/blob/master/cm-mlops/script/app-image-classification-onnx-py/_cm.yaml).

@@ -167,7 +174,7 @@ to modularize, run and benchmark other software projects and make it
easier to rerun, reproduce and reuse [research projects from published papers
at Systems and ML conferences]( https://cTuning.org/ae/micro2023.html ).

-Please check the [**Getting Started Guide**](docs/getting-started.md)
+Please check the [**Getting Started Guide**](https://github.com/mlcommons/ck/blob/master/docs/getting-started.md)
to understand how CM automation recipes work, how to use them to automate your own projects,
and how to implement and share new automations in your public or private projects.

@@ -185,7 +192,7 @@ and how to implement and share new automations in your public or private projects

* ACM REP'23 keynote about MLCommons CM: [slides](https://doi.org/10.5281/zenodo.8105339)
* ACM TechTalk'21 about automating research projects: [YouTube](https://www.youtube.com/watch?v=7zpeIVwICa4)
-* MLPerf inference submitter orientation: [v3.1 slides](https://doi.org/10.5281/zenodo.10605079), [v3.0 slides](https://doi.org/10.5281/zenodo.8144274)
+* MLPerf inference submitter orientation: [v4.0 slides](https://doi.org/10.5281/zenodo.10605079), [v3.1 slides](https://doi.org/10.5281/zenodo.8144274)

### Get in touch

2 changes: 1 addition & 1 deletion cm-mlops/automation/script/_cm.json
@@ -7,7 +7,7 @@
},
"desc": "Making native scripts more portable, interoperable and deterministic",
"developers": "[Arjun Suresh](https://www.linkedin.com/in/arjunsuresh), [Grigori Fursin](https://cKnowledge.org/gfursin)",
-    "actions_with_help":["run"],
+    "actions_with_help":["run", "docker"],
"sort": 1000,
"tags": [
"automation"
41 changes: 1 addition & 40 deletions cm-mlops/automation/script/module.py
@@ -652,46 +652,7 @@ def run(self, i):

# Check if has --help
if i.get('help',False):
-            print ('')
-            print ('Help for this CM script (automation recipe):')
-
-            variations = meta.get('variations',{})
-            if len(variations)>0:
-                print ('')
-                print ('Available variations:')
-                print ('')
-                for v in sorted(variations):
-                    print (' _'+v)
-
-            input_mapping = meta.get('input_mapping', {})
-            if len(input_mapping)>0:
-                print ('')
-                print ('Available flags mapped to environment variables:')
-                print ('')
-                for k in sorted(input_mapping):
-                    v = input_mapping[k]
-
-                    print (' --{} -> --env.{}'.format(k,v))
-
-            input_description = meta.get('input_description', {})
-            if len(input_description)>0:
-                print ('')
-                print ('Available flags (Python API dict keys):')
-                print ('')
-                for k in sorted(input_description):
-                    v = input_description[k]
-                    n = v.get('desc','')
-
-                    x = ' --'+k
-                    if n!='': x+=' ({})'.format(n)
-
-                    print (x)
-
-            print ('')
-            input ('Press Enter to see common flags for all scripts')
-
-            return {'return':0}
+            return utils.call_internal_module(self, __file__, 'module_help', 'print_help', {'meta':meta, 'path':path})


deps = meta.get('deps',[])
51 changes: 51 additions & 0 deletions cm-mlops/automation/script/module_help.py
@@ -0,0 +1,51 @@
import os
from cmind import utils

# Print help about script
def print_help(i):

meta = i['meta']
path = i['path']

print ('')
print ('Help for this CM script ({},{}):'.format(meta.get('alias',''), meta.get('uid','')))

print ('')
print ('Path to this automation recipe: {}'.format(path))

variations = meta.get('variations',{})
if len(variations)>0:
print ('')
print ('Available variations:')
print ('')
for v in sorted(variations):
print (' _'+v)

input_mapping = meta.get('input_mapping', {})
if len(input_mapping)>0:
print ('')
print ('Available flags mapped to environment variables:')
print ('')
for k in sorted(input_mapping):
v = input_mapping[k]

print (' --{} -> --env.{}'.format(k,v))

input_description = meta.get('input_description', {})
if len(input_description)>0:
print ('')
print ('Available flags (Python API dict keys):')
print ('')
for k in sorted(input_description):
v = input_description[k]
n = v.get('desc','')

x = ' --'+k
if n!='': x+=' ({})'.format(n)

print (x)

print ('')
input ('Press Enter to see common flags for all scripts')

return {'return':0}
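As a rough illustration of what `print_help` above emits, here is a standalone sketch: the `format_help` helper and the sample metadata are illustrative only (not part of this commit), but the sections it assembles (variations and flag-to-environment mappings) mirror the logic of the new module:

```python
# Hypothetical, self-contained sketch of the help text assembled by
# print_help above; format_help and the sample meta dict are illustrative.
def format_help(meta, path):
    lines = ['Help for this CM script ({},{}):'.format(
        meta.get('alias', ''), meta.get('uid', ''))]
    lines.append('Path to this automation recipe: {}'.format(path))

    # Variations are rendered with a leading underscore, as in CM tags
    variations = meta.get('variations', {})
    if variations:
        lines.append('Available variations:')
        lines.extend('  _' + v for v in sorted(variations))

    # Flags mapped to environment variables, as in input_mapping
    input_mapping = meta.get('input_mapping', {})
    if input_mapping:
        lines.append('Available flags mapped to environment variables:')
        lines.extend('  --{} -> --env.{}'.format(k, input_mapping[k])
                     for k in sorted(input_mapping))

    return '\n'.join(lines)


meta = {'alias': 'app-demo', 'uid': '0123456789abcdef',
        'variations': {'cuda': {}, 'cpu': {}},
        'input_mapping': {'input': 'CM_INPUT'}}
print(format_help(meta, '/path/to/script'))
```

Unlike the committed version, this sketch returns a string instead of printing section by section, which makes the output easy to inspect.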
8 changes: 8 additions & 0 deletions cm-mlops/automation/script/module_misc.py
@@ -1553,6 +1553,9 @@ def docker(i):

meta = artifact.meta

if i.get('help',False):
return utils.call_internal_module(self_module, __file__, 'module_help', 'print_help', {'meta':meta, 'path':artifact.path})

script_path = artifact.path

tags = meta.get("tags", [])
@@ -1691,6 +1694,11 @@ def docker(i):

port_maps = i.get('docker_port_maps', docker_settings.get('port_maps', []))

if detached == '':
detached = docker_settings.get('detached', '')

if interactive == '':
interactive = docker_settings.get('interactive', '')

# # Regenerate run_cmd
# if i.get('cmd'):
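The fallback pattern added above for `detached` and `interactive` — an explicit command-line value wins, otherwise the script's `docker` settings are consulted — can be sketched as a tiny helper (`resolve_flag` is illustrative, not part of the commit):

```python
# Illustrative sketch of the CLI-over-settings fallback used above:
# an empty CLI value falls back to the script's docker settings.
def resolve_flag(cli_value, docker_settings, key):
    if cli_value == '':
        return docker_settings.get(key, '')
    return cli_value


docker_settings = {'detached': 'yes'}
print(resolve_flag('', docker_settings, 'detached'))    # -> yes
print(resolve_flag('no', docker_settings, 'detached'))  # -> no
```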
22 changes: 20 additions & 2 deletions cm-mlops/script/build-mlperf-inference-server-nvidia/_cm.yaml
@@ -77,7 +77,7 @@ deps:

# Detect CMake
- tags: get,cmake
-  version_min: "3.18"
+  version_min: "3.25"

# Detect Google Logger
- tags: get,generic,sys-util,_glog-dev
@@ -106,6 +106,8 @@ deps:
names:
- nvidia-inference-common-code

- tags: get,generic-python-lib,_package.pybind11

# Detect pycuda
- tags: get,generic-python-lib,_pycuda
skip_if_env:
@@ -125,6 +127,7 @@
names:
- nvidia-scratch-space


post_deps:
# Detect nvidia system
- tags: add,custom,system,nvidia
@@ -185,25 +188,40 @@ versions:
add_deps_recursive:
nvidia-inference-common-code:
version: r2.1
nvidia-scratch-space:
tags: version.2_1

r3.0:
add_deps_recursive:
nvidia-inference-common-code:
version: r3.0
nvidia-scratch-space:
tags: version.3_0

r3.1:
add_deps_recursive:
nvidia-inference-common-code:
version: r3.1
nvidia-scratch-space:
tags: version.3_1
deps:
- tags: install,nccl,libs,_cuda
- tags: install,pytorch,from.src,_for-nvidia-mlperf-inference-v3.1-gptj
names:
- pytorch

docker:
skip_run_cmd: 'no'
all_gpus: 'yes'
docker_os: ubuntu
-  docker_real_run: True
+  docker_real_run: False
interactive: True
docker_os_version: '20.04'
base_image: nvcr.io/nvidia/mlperf/mlperf-inference:mlpinf-v3.1-cuda12.2-cudnn8.9-x86_64-ubuntu20.04-public
docker_input_mapping:
imagenet_path: IMAGENET_PATH
gptj_checkpoint_path: GPTJ_CHECKPOINT_PATH
criteo_preprocessed_path: CRITEO_PREPROCESSED_PATH
results_dir: RESULTS_DIR
submission_dir: SUBMISSION_DIR
cudnn_tar_file_path: CM_CUDNN_TAR_FILE_PATH
@@ -11,6 +11,6 @@ if [[ ${CM_MLPERF_DEVICE} == "inferentia" ]]; then
make prebuild
fi

-make ${CM_MAKE_BUILD_COMMAND}
+SKIP_DRIVER_CHECK=1 make ${CM_MAKE_BUILD_COMMAND}

test $? -eq 0 || exit $?
8 changes: 8 additions & 0 deletions cm-mlops/script/download-and-extract/_cm.json
@@ -96,6 +96,14 @@
"CM_DAE_EXTRACT_DOWNLOADED": "yes"
}
},
"rclone": {
"add_deps_recursive": {
"download-script": {
"tags": "_rclone"
}
},
"group": "download-tool"
},
"gdown": {
"add_deps_recursive": {
"download-script": {
11 changes: 11 additions & 0 deletions cm-mlops/script/download-file/_cm.json
@@ -63,6 +63,17 @@
},
"group": "download-tool"
},
"rclone": {
"deps": [
{
"tags": "get,rclone"
}
],
"env": {
"CM_DOWNLOAD_TOOL": "rclone"
},
"group": "download-tool"
},
"url.#": {
"env": {
"CM_DOWNLOAD_URL": "#"
7 changes: 7 additions & 0 deletions cm-mlops/script/download-file/customize.py
@@ -65,6 +65,8 @@ def preprocess(i):
if j>0:
urltail=urltail[:j]
env['CM_DOWNLOAD_FILENAME'] = urltail
elif env.get('CM_DOWNLOAD_TOOL', '') == "rclone":
env['CM_DOWNLOAD_FILENAME'] = urltail
else:
env['CM_DOWNLOAD_FILENAME'] = "index.html"

@@ -104,6 +106,11 @@ def preprocess(i):
elif tool == "gdown":
env['CM_DOWNLOAD_CMD'] = f"gdown {extra_download_options} {url}"

elif tool == "rclone":
if env.get('CM_RCLONE_CONFIG_CMD', '') != '':
env['CM_DOWNLOAD_CONFIG_CMD'] = env['CM_RCLONE_CONFIG_CMD']
env['CM_DOWNLOAD_CMD'] = f"rclone copy {url} ./{env['CM_DOWNLOAD_FILENAME']} -P"

filename = env['CM_DOWNLOAD_FILENAME']
env['CM_DOWNLOAD_DOWNLOADED_FILENAME'] = filename
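The rclone branch added in this commit builds a `rclone copy <url> ./<filename> -P` command (the `-P` flag prints transfer progress). A minimal sketch of that command construction — `build_download_cmd` and the sample remote path are hypothetical, for illustration only:

```python
# Hypothetical sketch mirroring the rclone branch added to preprocess():
# copy the remote URL to the target filename, with progress reporting (-P).
def build_download_cmd(tool, url, filename):
    if tool == "rclone":
        return "rclone copy {} ./{} -P".format(url, filename)
    raise ValueError("unsupported download tool: " + tool)


cmd = build_download_cmd("rclone", "remote:some_bucket/model.onnx", "model.onnx")
print(cmd)  # -> rclone copy remote:some_bucket/model.onnx ./model.onnx -P
```

Note that the committed code also runs `CM_RCLONE_CONFIG_CMD` first, when set, so the rclone remote is configured before the copy starts.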

6 changes: 6 additions & 0 deletions cm-mlops/script/download-file/run.sh
@@ -1,5 +1,11 @@
#!/bin/bash

if [[ -n ${CM_DOWNLOAD_CONFIG_CMD} ]]; then
echo ""
echo "${CM_DOWNLOAD_CONFIG_CMD}"
eval "${CM_DOWNLOAD_CONFIG_CMD}"
fi

if [ -e ${CM_DOWNLOAD_DOWNLOADED_PATH} ]; then
if [[ "${CM_DOWNLOAD_CHECKSUM_CMD}" != "" ]]; then
echo ""
8 changes: 7 additions & 1 deletion cm-mlops/script/get-lib-armnn/_cm.json
@@ -4,7 +4,7 @@
"automation_uid": "5b4e0237da074764",
"cache": true,
"category": "Detection or installation of tools and artifacts",
-    "default_version": "23.05",
+    "default_version": "23.11",
"deps": [
{
"tags": "detect,os"
@@ -36,6 +36,12 @@
],
"uid": "9603a2e90fd44587",
"versions": {
"23.11": {
"env": {
"CM_LIB_ARMNN_VERSION": "v23.11",
"CM_TMP_GIT_BRANCH_NAME": "branches/armnn_23_11"
}
},
"23.05": {
"env": {
"CM_LIB_ARMNN_VERSION": "v23.05",