
Commit

Fix rclone config (#1093)
gfursin authored Feb 3, 2024
2 parents dd9d377 + 2a241cf commit dc9cebf
Showing 21 changed files with 1,347 additions and 287 deletions.
62 changes: 26 additions & 36 deletions README.md
Original file line number Diff line number Diff line change
@@ -17,44 +17,35 @@

### About

**Collective Mind (CM)** is a [community project](https://github.com/mlcommons/ck/blob/master/CONTRIBUTING.md) to develop
a [collection of portable, extensible, technology-agnostic and ready-to-use automation recipes
Collective Mind (CM) is a [community project](https://github.com/mlcommons/ck/blob/master/CONTRIBUTING.md) to develop
a [collection of portable, extensible, technology-agnostic and ready-to-use automation recipes for MLOps and DevOps
with a human-friendly interface (aka CM scripts)](https://github.com/mlcommons/ck/tree/master/docs/list_of_scripts.md)
that automate all the manual steps required to build, run, benchmark and optimize complex ML/AI applications on any platform
with any software and hardware.

CM scripts are being developed based on the feedback from
[MLCommons engineers and researchers](https://github.com/mlcommons/ck/blob/master/docs/taskforce.md)
to help them assemble, run, benchmark and optimize complex AI/ML applications
across diverse and continuously changing models, data sets, software and hardware
from Nvidia, Intel, AMD, Google, Qualcomm, Amazon and other vendors.
that help to automate all the manual steps required to prepare, build, run, benchmark and optimize complex ML/AI applications
on any platform with any software and hardware.
They require Python 3.7+ with minimal dependencies and can run natively on Ubuntu, MacOS, Windows, RHEL, Debian, Amazon Linux
and any other operating system, in a cloud or inside automatically generated containers.

Some key requirements for the CM design are:
* must be non-intrusive and easy to debug, require zero changes to existing projects and must complement,
reuse, wrap and interconnect all existing automation scripts and tools (such as cmake, ML workflows,
CM scripts were originally developed based on the following requirements from the
[MLCommons engineers and researchers](https://github.com/mlcommons/ck/blob/master/docs/taskforce.md)
to help them automatically build, benchmark and optimize complex MLPerf benchmarks
across diverse and continuously changing models, data sets, software and hardware
from Nvidia, Intel, AMD, Google, Qualcomm, Amazon and other vendors:
* must work out of the box with the default options and without the need to edit some paths, environment variables and configuration files;
* must be non-intrusive, easy to debug and must reuse existing
user scripts and automation tools (such as cmake, make, ML workflows,
python poetry and containers) rather than substituting them;
* must have a very simple and human-friendly command line with a Python API and minimal dependencies;
* must require minimal or zero learning curve by using plain Python, native scripts, environment variables
and simple JSON/YAML descriptions instead of inventing new languages;
* must run in a native environment with Ubuntu, Debian, RHEL, Amazon Linux, MacOS, Windows
and any other operating system while automatically generating container snapshots
with CM recipes for repeatability and reproducibility.

Below you can find a few examples of this collaborative engineering effort sponsored
by [MLCommons (non-profit organization with 125+ organizations)](https://mlcommons.org) -
a few most-commonly used [automation recipes](https://github.com/mlcommons/ck/tree/master/docs/list_of_scripts.md)
that can be chained into more complex automation workflows [using simple JSON or YAML](https://github.com/mlcommons/ck/blob/master/cm-mlops/script/app-image-classification-onnx-py/_cm.yaml).
and simple JSON/YAML descriptions instead of inventing new workflow languages;
* must have the same interface to run all automations natively, in a cloud or inside containers.

You can try them yourself (you only need Python 3.7+, PIP, git and wget installed and optionally Docker if you want to
run CM scripts via automatically-generated containers - check the [installation guide](docs/installation.md) for more details).
Below you can find and try a few examples of the most-commonly used [automation recipes](https://github.com/mlcommons/ck/tree/master/docs/list_of_scripts.md)
that can be chained into more complex automation workflows [using simple JSON or YAML](https://github.com/mlcommons/ck/blob/master/cm-mlops/script/app-image-classification-onnx-py/_cm.yaml).

*Note that the Collective Mind concept is to continue improving portability and functionality
of all CM automation recipes across rapidly evolving models, data sets, software and hardware
based on collaborative testing and feedback - don't hesitate to report encountered issues
[here](https://github.com/mlcommons/ck/issues) and/or contact us via [public Discord Server](https://discord.gg/JjWNWXKxwT)
to help this community effort!*
*Note that MLCommons CM is a collaborative engineering effort to gradually improve portability and functionality
across continuously changing models, data sets, software and hardware based on your feedback -
please check this [installation guide](installation.md), report encountered issues [here](https://github.com/mlcommons/ck/issues)
and contact us via [public Discord Server](https://discord.gg/JjWNWXKxwT) to help this community effort!*


<details open>
@@ -174,13 +165,13 @@ to modularize, run and benchmark other software projects and make it
easier to rerun, reproduce and reuse [research projects from published papers
at Systems and ML conferences]( https://cTuning.org/ae/micro2023.html ).

Please check the [**Getting Started Guide**](https://github.com/mlcommons/ck/blob/master/docs/getting-started.md)
Please check the [**Getting Started Guide and FAQ**](https://github.com/mlcommons/ck/blob/master/docs/getting-started.md)
to understand how CM automation recipes work, how to use them to automate your own projects,
and how to implement and share new automations in your public or private projects.

### Documentation

* [Getting Started Guide](docs/getting-started.md)
* [Getting Started Guide and FAQ](docs/getting-started.md)
* [CM interface for MLPerf benchmarks](docs/mlperf)
* [CM interface for ML and Systems conferences](docs/tutorials/common-interface-to-reproduce-research-projects.md)
* [CM automation recipes for MLOps and DevOps](cm-mlops/script)
@@ -190,9 +181,9 @@ and how to implement and share new automations in your public or private project

### Motivation and concepts

* ACM REP'23 keynote about MLCommons CM: [slides](https://doi.org/10.5281/zenodo.8105339)
* ACM TechTalk'21 about automating research projects: [YouTube](https://www.youtube.com/watch?v=7zpeIVwICa4)
* MLPerf inference submitter orientation: [v4.0 slides](https://doi.org/10.5281/zenodo.10605079), [v3.1 slides](https://doi.org/10.5281/zenodo.8144274)
* ACM REP'23 keynote about MLCommons CM: [ [slides](https://doi.org/10.5281/zenodo.8105339) ] [ [YouTube](https://youtu.be/_1f9i_Bzjmg) ]
* ACM TechTalk'21 about automating research projects: [ [YouTube](https://www.youtube.com/watch?v=7zpeIVwICa4) ] [ [slides](https://learning.acm.org/binaries/content/assets/leaning-center/webinar-slides/2021/grigorifursin_techtalk_slides.pdf) ]
* MLPerf inference submitter orientation: [ [v4.0 slides](https://doi.org/10.5281/zenodo.10605079) ] [ [v3.1 slides](https://doi.org/10.5281/zenodo.8144274) ]

### Get in touch

@@ -204,5 +195,4 @@ our goal is to help everyone automate all manual and repetitive tasks
to build, run, benchmark and optimize AI systems including
downloading artifacts, installing tools, resolving dependencies,
running experiments, processing logs, and reproducing results
on any software/hardware stack - you can reach us via [public Discord server](https://discord.gg/JjWNWXKxwT)
to discuss this project.
on any software/hardware stack - don't hesitate to get in touch via [public Discord server](https://discord.gg/JjWNWXKxwT)!
36 changes: 24 additions & 12 deletions cm-mlops/automation/script/module.py
@@ -368,17 +368,6 @@ def run(self, i):
if value != '':
env['CM_' + key.upper()] = value
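The context lines above show the convention used throughout this module: non-empty input options are promoted to `CM_`-prefixed, upper-case environment variables. A minimal sketch of that pattern (the helper name is illustrative, not part of the codebase):

```python
def to_cm_env(options):
    # Promote non-empty input options to CM_-prefixed, upper-case
    # environment variables, mirroring the loop in module.py above.
    env = {}
    for key, value in options.items():
        if value != '':
            env['CM_' + key.upper()] = value
    return env
```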

# Check extra cache tags
x = env.get('CM_EXTRA_CACHE_TAGS','').strip()
extra_cache_tags = [] if x=='' else x.split(',')

if i.get('extra_cache_tags','')!='':
for x in i['extra_cache_tags'].strip().split(','):
if x!='' and x not in extra_cache_tags:
extra_cache_tags.append(x)

if env.get('CM_NAME','')!='':
extra_cache_tags.append('name-'+env['CM_NAME'].strip().lower())


############################################################################################################
@@ -972,6 +961,29 @@ def run(self, i):
update_env_with_values(env)



############################################################################################################
# Check extra cache tags
x = env.get('CM_EXTRA_CACHE_TAGS','').strip()
extra_cache_tags = [] if x=='' else x.split(',')

if i.get('extra_cache_tags','')!='':
for x in i['extra_cache_tags'].strip().split(','):
if x!='':
if '<<<' in x:
import re
tmp_values = re.findall(r'<<<(.*?)>>>', str(x))
for tmp_value in tmp_values:
xx = str(env.get(tmp_value,''))
x = x.replace("<<<"+tmp_value+">>>", xx)
if x not in extra_cache_tags:
extra_cache_tags.append(x)

if env.get('CM_NAME','')!='':
extra_cache_tags.append('name-'+env['CM_NAME'].strip().lower())
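The relocated block above also adds `<<<VAR>>>` placeholder resolution, so extra cache tags can reference environment values that are only known after `update_env_with_values` has run. A self-contained sketch of that expansion logic, assuming unset variables expand to an empty string (the function name is illustrative):

```python
import re

def expand_cache_tags(extra_cache_tags_str, env):
    # Split a comma-separated tag string, resolve <<<VAR>>> placeholders
    # from the env dictionary, and de-duplicate while preserving order.
    tags = []
    for tag in extra_cache_tags_str.strip().split(','):
        if tag == '':
            continue
        for var in re.findall(r'<<<(.*?)>>>', tag):
            tag = tag.replace('<<<' + var + '>>>', str(env.get(var, '')))
        if tag not in tags:
            tags.append(tag)
    return tags
```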



############################################################################################################
# Check if need to clean output files
clean_output_files = meta.get('clean_output_files', [])
@@ -2525,7 +2537,7 @@ def _call_run_deps(script, deps, local_env_keys, local_env_keys_from_meta, env,
##############################################################################
def _run_deps(self, deps, clean_env_keys_deps, env, state, const, const_state, add_deps_recursive, recursion_spaces,
remembered_selections, variation_tags_string='', from_cache=False, debug_script_tags='',
verbose=False, show_time=False, extra_recursion_spaces=' ', run_state={'deps':[], 'fake_deps':[]}):
verbose=False, show_time=False, extra_recursion_spaces=' ', run_state={'deps':[], 'fake_deps':[], 'parent': None}):
"""
Runs all the enabled dependencies and passes them the env minus the local env
"""
15 changes: 12 additions & 3 deletions cm-mlops/automation/script/module_misc.py
@@ -1537,7 +1537,7 @@ def docker(i):
return {'return':1, 'error':'no scripts were found'}

env=i.get('env', {})
env['CM_RUN_STATE_DOCKER'] = True
env['CM_RUN_STATE_DOCKER'] = False

docker_cache = i.get('docker_cache', "yes")
if docker_cache in ["no", False, "False" ]:
@@ -1599,8 +1599,16 @@ def docker(i):
update_path_for_docker('.', mounts, force_path_target=current_path_target)


_os = i.get('docker_os', meta.get('docker_os', 'ubuntu'))
version = i.get('docker_os_version', meta.get('docker_os_version', '22.04'))
_os = i.get('docker_os', docker_settings.get('docker_os', 'ubuntu'))
version = i.get('docker_os_version', docker_settings.get('docker_os_version', '22.04'))

deps = docker_settings.get('deps', [])
if deps:
# Todo: Support state, const and add_deps_recursive
script_automation = i['self_module']
r = script_automation._run_deps(deps, [], env, {}, {}, {}, {}, '',{})
if r['return'] > 0:
return r
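The change just above also switches the fallback source for `docker_os` and `docker_os_version` from the script meta to `docker_settings`. The lookup is a chained `dict.get` precedence: an explicit input value wins, then the script's docker settings, then a hard-coded default. A minimal sketch of that pattern (the function name is illustrative):

```python
def resolve_option(key, user_input, docker_settings, default):
    # Precedence mirrors the diff above: explicit input value first,
    # then the script's docker settings, then the hard-coded default.
    return user_input.get(key, docker_settings.get(key, default))
```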

for key in docker_settings.get('mounts', []):
mounts.append(key)
@@ -1722,6 +1730,7 @@ def docker(i):
if r['return']>0: return r

run_cmd = r['run_cmd_string']
env['CM_RUN_STATE_DOCKER'] = True

if docker_settings.get('mount_current_dir','')=='yes':
run_cmd = 'cd '+current_path_target+' && '+run_cmd
2 changes: 2 additions & 0 deletions cm-mlops/script/build-mlperf-inference-server-nvidia/_cm.yaml
@@ -228,6 +228,8 @@ docker:
tensorrt_tar_file_path: CM_TENSORRT_TAR_FILE_PATH
cuda_run_file_path: CUDA_RUN_FILE_LOCAL_PATH
scratch_path: MLPERF_SCRATCH_PATH
deps:
- tags: get,mlperf,inference,nvidia,scratch,space
mounts:
- "${{ IMAGENET_PATH }}:/data/imagenet-val"
- "${{ RESULTS_DIR }}:/home/cmuser/results_dir"
2 changes: 1 addition & 1 deletion cm-mlops/script/download-file/customize.py
@@ -109,7 +109,7 @@ def preprocess(i):
elif tool == "rclone":
if env.get('CM_RCLONE_CONFIG_CMD', '') != '':
env['CM_DOWNLOAD_CONFIG_CMD'] = env['CM_RCLONE_CONFIG_CMD']
env['CM_DOWNLOAD_CMD'] = f"rclone copy {url} ./{env['CM_DOWNLOAD_FILENAME']} -P"
env['CM_DOWNLOAD_CMD'] = f"rclone copy {url} {os.path.join(os.getcwd(), env['CM_DOWNLOAD_FILENAME'])} -P"

filename = env['CM_DOWNLOAD_FILENAME']
env['CM_DOWNLOAD_DOWNLOADED_FILENAME'] = filename
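The rclone fix in this hunk replaces the cwd-relative `./<filename>` destination with an absolute path built via `os.path.join(os.getcwd(), ...)`, so the generated command no longer depends on the working directory at the moment rclone is executed. A sketch of the command construction (the helper name and arguments are illustrative):

```python
import os

def build_rclone_cmd(url, filename):
    # Build the copy command with an absolute destination path, as in
    # the fixed line above, instead of a relative './<filename>' target.
    dest = os.path.join(os.getcwd(), filename)
    return f"rclone copy {url} {dest} -P"
```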
12 changes: 8 additions & 4 deletions cm-mlops/script/get-ml-model-gptj/_cm.json
@@ -19,6 +19,7 @@
"prehook_deps": [
{
"env": {
"CM_DOWNLOAD_FINAL_ENV_NAME": "GPTJ_CHECKPOINT_PATH",
"CM_EXTRACT_FINAL_ENV_NAME": "GPTJ_CHECKPOINT_PATH",
"CM_EXTRACT_TO_FOLDER": "gpt-j"
},
@@ -81,13 +82,14 @@
"pytorch,fp32": {
"env": {
"CM_DOWNLOAD_EXTRA_OPTIONS": " --output-document checkpoint.zip",
"CM_DOWNLOAD_FILENAME": "checkpoint.zip",
"CM_UNZIP": "yes",
"CM_DOWNLOAD_CHECKSUM_NOT_USED": "e677e28aaf03da84584bb3073b7ee315",
"CM_PACKAGE_URL": "https://cloud.mlcommons.org/index.php/s/QAZ2oM94MkFtbQx/download",
"CM_RCLONE_CONFIG": "rclone config create mlc-inference s3 provider=LyveCloud access_key_id=0LITLNQMHZALM5AK secret_access_key=YQKYTMBY23TMZHLOYFJKL5CHHS0CWYUC endpoint=s3.us-east-1.lyvecloud.seagate.com",
"CM_RCLONE_CONFIG_CMD": "rclone config create mlc-inference s3 provider=LyveCloud access_key_id=0LITLNQMHZALM5AK secret_access_key=YQKYTMBY23TMZHLOYFJKL5CHHS0CWYUC endpoint=s3.us-east-1.lyvecloud.seagate.com",
"CM_RCLONE_URL": "mlc-inference:mlcommons-inference-wg-s3/gpt-j"
},
}
},
"pytorch,fp32,wget": {
"add_deps_recursive": {
"dae": {
"tags": "_extract"
@@ -186,7 +188,8 @@
}
},
"env": {
"CM_DOWNLOAD_URL": "<<<CM_PACKAGE_URL>>>"
"CM_DOWNLOAD_URL": "<<<CM_PACKAGE_URL>>>",
"CM_DOWNLOAD_FILENAME": "checkpoint.zip"
}
},
"rclone": {
@@ -197,6 +200,7 @@
}
},
"env": {
"CM_DOWNLOAD_FILENAME": "checkpoint",
"CM_DOWNLOAD_URL": "<<<CM_RCLONE_URL>>>"
}
},
