diff --git a/.gitignore b/.gitignore
index c52c0a1..f2e9433 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,17 +1,77 @@
-*.gz
-*.npy
-train_log
+# Created by https://www.gitignore.io/api/python,pycharm+all
+
+### PyCharm+all ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff:
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/dictionaries
+
+# Sensitive or high-churn files:
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.xml
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+
+# Gradle:
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# CMake
+cmake-build-debug/
+
+# Mongo Explorer plugin:
+.idea/**/mongoSettings.xml
+
+## File-based project format:
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Ruby plugin and RubyMine
+/.rakeTasks
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+### PyCharm+all Patch ###
+# Ignores the whole idea folder
+# See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360
+
+.idea/
+
+### Python ###
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
+*$py.class
 
 # C extensions
 *.so
 
 # Distribution / packaging
 .Python
-env/
 build/
 develop-eggs/
 dist/
@@ -23,6 +83,7 @@ lib64/
 parts/
 sdist/
 var/
+wheels/
 *.egg-info/
 .installed.cfg
 *.egg
@@ -43,27 +104,71 @@ htmlcov/
 .coverage
 .coverage.*
 .cache
+.pytest_cache/
 nosetests.xml
 coverage.xml
-*,cover
+*.cover
+.hypothesis/
 
 # Translations
 *.mo
 *.pot
 
-# Django stuff:
-*.log
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
 
 # Sphinx documentation
 docs/_build/
 
 # PyBuilder
 target/
 
-*.dat
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule.*
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+
+# End of https://www.gitignore.io/api/python,pycharm+all
+
+*.gz
+*.npy
+train_log
 *.bin
 *.tfmodel
 *.meta
 *.log*
 model-*
-.gitignore
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
deleted file mode 100644
index c23ecac..0000000
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index e0e0c4d..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index d8c1d79..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/mrunner.iml b/.idea/mrunner.iml
deleted file mode 100644
index 6711606..0000000
--- a/.idea/mrunner.iml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 94a25f7..0000000
---
a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml deleted file mode 100644 index 883156d..0000000 --- a/.idea/workspace.xml +++ /dev/null @@ -1,890 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - P - PYTHONPATH - mod - modu - modul - mrunn - EXPERIMENT_ID - , - neptune: - sbatc - imagePullSecrets\:\n \- name\: regsecret - p - p_mr - p_m - name - m_ - script_name - PLGRID_USERNAME - MRUNNER_SCRATCH_SPACE - PLGRID_HOST - m - module - PlgridTask - e - eagle - n - neptune - create_neptune_run_command - create_normal_run_command - MRUNNER_UNDER_NEPTUNE - - - $PROJECT_DIR$ - - - - - - - - - - - true - DEFINITION_ORDER - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - project - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1504867544930 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..0b5fdea --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include mrunner/templates/* diff --git a/README.md b/README.md new file mode 100644 index 0000000..5b7e2bf --- /dev/null +++ b/README.md @@ -0,0 +1,18 @@ +# mrunner + +mrunner is a tool intended to run experiment code on different +computation systems, without manual deployment and with significantly +less configuration. Main features are: + +- prepare remote environment +- deploy code +- run experiments + - use of scheduler, based on mangement of available resources +(if remote system supports it) +- monitor experiments using [neptune](neptune.ml) + +Currently [slurm](https://slurm.schedmd.com) and +[kubernetes](http://kubernetes.io) clusters are supported. +It is also possible to run experiment locally. 
+ +More details may be found in [documentation](https://deepsense-ai.github.io/mrunner) diff --git a/certs/cpascal/.srl b/certs/cpascal/.srl deleted file mode 100644 index c8954c3..0000000 --- a/certs/cpascal/.srl +++ /dev/null @@ -1 +0,0 @@ -EEC6A0E2CC1A7CC8 diff --git a/certs/cpascal/cpascal.crt b/certs/cpascal/cpascal.crt deleted file mode 100644 index fc49b39..0000000 --- a/certs/cpascal/cpascal.crt +++ /dev/null @@ -1,19 +0,0 @@ ------BEGIN CERTIFICATE----- -MIIDGDCCAgACCQDuxqDizBp8yDANBgkqhkiG9w0BAQsFADBFMQswCQYDVQQGEwJQ -TDETMBEGA1UECAwKU29tZS1TdGF0ZTEhMB8GA1UECgwYSW50ZXJuZXQgV2lkZ2l0 -cyBQdHkgTHRkMB4XDTE3MDgxNzIwMjY0N1oXDTQ1MDEwMjIwMjY0N1owVzELMAkG -A1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoMGEludGVybmV0 -IFdpZGdpdHMgUHR5IEx0ZDEQMA4GA1UEAwwHY3Bhc2NhbDCCASIwDQYJKoZIhvcN -AQEBBQADggEPADCCAQoCggEBAOinAywmk0nqW1LFyxxBctGqD6sqbC0d1Spc9WZJ -/oPtoTBX+ESdin3W07myW9h9L+hbeWes2+hRZyuyoiqTgjz6PyE5bfaGNskTZ850 -XufcQ7xQe5Ft2//a3Qigm3XYjrYOjKJm6kGQiIIxDGWokd0U7YtootmD6iv2/wQ/ -kigspeiwouV17vPPj9LOydVqLTMlmR0LiMGMbHd0Qj3EMk7rPixoNiv0alY1QrOG -jQPObWlJr2Lsn7WJVB7zfvrD7+VmjXPaBKXfxV1E0PWCRR0ZlAGWv9cymtZ6dAqm -Nx7QkIDRemIeWH75Iooy6Gbx9qKvPbItjJeX60S7T8orcSsCAwEAATANBgkqhkiG -9w0BAQsFAAOCAQEAIHenrONyzZy3xj0Hz/FJtSHKCdTv0jQL/k8EMlRc4kv+fuGO -tj2HN4Oq+JGtkSXX2jgSS3LiG6qncPATSbSOWq2dieC8G8bmrCSlkKGIJg4WcIYg -kQKlzX5emxANKK9b1mF11Ah4L3/v+Zgucmi74cJj2NHK5anJPP64wFf6y8WV1s/S -S0GkNQCUcEEX6rYf328YImrIaGkZEnYI+L94/RzgLSu03smcAmIjoAUDxpCkHm2x -+beL1yKGN7z+MpQSrya4fjeO38EWiryHIY/4jQM4GCmxZolAl2yvZsg3K1PbPAar -3vUESxoekWCiQ28Y95h5suZaM6bgNk6rw5MMzw== ------END CERTIFICATE----- diff --git a/certs/cpascal/cpascal.csr b/certs/cpascal/cpascal.csr deleted file mode 100644 index b93e8f5..0000000 --- a/certs/cpascal/cpascal.csr +++ /dev/null @@ -1,16 +0,0 @@ ------BEGIN CERTIFICATE REQUEST----- -MIICnDCCAYQCAQAwVzELMAkGA1UEBhMCQVUxEzARBgNVBAgMClNvbWUtU3RhdGUx -ITAfBgNVBAoMGEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZDEQMA4GA1UEAwwHY3Bh -c2NhbDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAOinAywmk0nqW1LF -yxxBctGqD6sqbC0d1Spc9WZJ/oPtoTBX+ESdin3W07myW9h9L+hbeWes2+hRZyuy -oiqTgjz6PyE5bfaGNskTZ850XufcQ7xQe5Ft2//a3Qigm3XYjrYOjKJm6kGQiIIx -DGWokd0U7YtootmD6iv2/wQ/kigspeiwouV17vPPj9LOydVqLTMlmR0LiMGMbHd0 -Qj3EMk7rPixoNiv0alY1QrOGjQPObWlJr2Lsn7WJVB7zfvrD7+VmjXPaBKXfxV1E -0PWCRR0ZlAGWv9cymtZ6dAqmNx7QkIDRemIeWH75Iooy6Gbx9qKvPbItjJeX60S7 -T8orcSsCAwEAAaAAMA0GCSqGSIb3DQEBCwUAA4IBAQDKLi9G8W4TIk+zR8do16Zj -Atpp6eDdD7xhSp+iRw6UAlTfJRv1+Yk7+4hREckKs7c2Tr8uZkiUFxG5twkewLVu -M4yw/7cwQyWgXiZFIf1daV8Z2Dydg9gpWrNMby8fWa316kSEIWubf3NDh8bJlddq -d/SgssejD+o+Bd+pXk+PoDzgqCCGSQQHEKV7CIFqbNA/8Oy0p84LVB15az0nzjgV -hhZTubOmw7VM55e9ctXkNefUyI3ZU4EHE0gMcPXtu+RD5hXuEYiEYTNXPLWYdkdR -qIsq6RgxvJMbm7tFvK/hDJU4gLHS2k+qDxcYR072m2cCtefILsaTpteGvn2Cj7EB ------END CERTIFICATE REQUEST----- diff --git a/certs/cpascal/cpascal.key b/certs/cpascal/cpascal.key deleted file mode 100644 index 35b4292..0000000 --- a/certs/cpascal/cpascal.key +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEpAIBAAKCAQEA6KcDLCaTSepbUsXLHEFy0aoPqypsLR3VKlz1Zkn+g+2hMFf4 -RJ2KfdbTubJb2H0v6Ft5Z6zb6FFnK7KiKpOCPPo/ITlt9oY2yRNnznRe59xDvFB7 -kW3b/9rdCKCbddiOtg6MombqQZCIgjEMZaiR3RTti2ii2YPqK/b/BD+SKCyl6LCi -5XXu88+P0s7J1WotMyWZHQuIwYxsd3RCPcQyTus+LGg2K/RqVjVCs4aNA85taUmv -YuyftYlUHvN++sPv5WaNc9oEpd/FXUTQ9YJFHRmUAZa/1zKa1np0CqY3HtCQgNF6 -Yh5YfvkiijLoZvH2oq89si2Ml5frRLtPyitxKwIDAQABAoIBAB2Vz3N33FcT9FUM -Tg3jzMQYyjFMDcGW/5qJg7NkSXdhapQJO8sN9aSXAwmkQsW+9a6oo0Gp27UpVONb -rf+YDoHOmOSStwcYfjMHwrWgWslFW1/BJFWUENvyaxVDx0EvLsa8BupICBYaWSGl 
-NxxeWV4x4RtPTRY4DjlVCYzywJVXCgeW+yAwMuPADynphkq8tKBu4tzc0oqn3KYh -NMN3x38TlYTCaazBRZl8qrqEd5yCUos84SMl8bZp3qZxZdZU+4mfLkid9LkZiobj -iwnRYpfCBfwbK7u+uNBTL6yABCByUiuUgiKJx46cwppwfaOmxmW4rqVvPfn12TbU -L3ZrIDECgYEA/4FOhkm5aGc0wGY/LG2sss3rN56wD+KOPVHMxDR5a79OZKUiP+OF -AnUlj03V1L0COEgGp/PMJlPZr3cEeDj117AHyvaFD6c5LEuf4KVuOTpB6y7hvob1 -4bY0qdCeUOmY3UGYBVvuxY7C+cH87Qgt6d/VT7JO0nt7UtIJ1MIIYucCgYEA6Rpf -wVdhQukog06PPW+DAu4+RdsC1vG33MNNGAiJx0WclfBzx3629322bdD7DWVFgm3E -J3DgEFEMzrq7Ol7uSmHjdmCyHOUlqDrtMElOtE5Vu28V1O3dmr62tSoBfaGTR+xE -J9RdVxapAW1A2n9/fJ6CjsWCl6tSf9pqBcuLOx0CgYAMAAC2IeRJWP+LQcIZzhTc -WPuBIzzVoihYDO13EZfaTaPa6j6PjZoDoYjgECHI2g+uoQNfA/5Elo+UxBw7oUE6 -nhPfnnBlac8vm2Wo4Vwp5+iMqSkFD8WgkjQq/ZdR6Ya6bYg8SDMQqEHUBR5h645s -e4zDL3awQAhsvVyk13AZZQKBgQCykP2hgmqDc1ERpXCdQ3s9F3ykzq1SM0EB4KqC -KqhA+taFFfnbbPqEhMznOxvWmymwTJ3jCNjcJvqEsw8SG/63BF3XJSPMds1uHhXs -03tzLD8IooQ65SzJiic0iMUzlD7a7ecYC9qlCDrXWPZBa0IiGNsyu6LaxOciAhMN -9WrnEQKBgQDzTYs0CAEQkQCQdPVvxynYeakjZDeYavVXU0mdVPpaajx49sXX990p -Fp5ty0w2wtXQlTdHheUNjNBMZxDCuD0Sc7mkSIZCotUxJ6NgQ1dGLan3uvTnp2Gt -bGPF3jx4gix0qnSbZEObhPNSqF2JstwDI6YY8+cNtO58BXbQvzirSg== ------END RSA PRIVATE KEY----- diff --git a/certs/cpascal_alt/selfsigned.crt b/certs/cpascal_alt/selfsigned.crt deleted file mode 100644 index 8a06d15..0000000 --- a/certs/cpascal_alt/selfsigned.crt +++ /dev/null @@ -1,19 +0,0 @@ ------BEGIN CERTIFICATE----- -MIIDIzCCAgugAwIBAgIQfDerH/EYrOKbgFzg/LRTzzANBgkqhkiG9w0BAQsFADAo -MRQwEgYDVQQKEwtMb2cgQ291cmllcjEQMA4GA1UEAxMHY3Bhc2NhbDAeFw0xNzA4 -MjQxMzI3NTNaFw0xODA4MjQxMzI3NTNaMCgxFDASBgNVBAoTC0xvZyBDb3VyaWVy -MRAwDgYDVQQDEwdjcGFzY2FsMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC -AQEAtHwYPutNpCCoGq3k3jmwMYUjORLvrbPYPXg4XGG8VnMl8zupaFtleTeDQvp9 -mP+BkVEstY7alRgfeLOAaTYWukZW6PTchFlG6i5QfatXnkc9ytKtQigUMPY0b3dd -7d/DISY3L62bIG4/kk1nK8Lz/MO9yXeFtr2QSx6lE9yK27kDXc4lCd9giLq11d+I -/tQ716DM7p2oU3J7gtkEuhjJEaJhRhAnmj74OftlRX0AMgdI14SM6KjODzJlCgUW -dM6TWIYx88kadwNjagVFydp/UG0aIdoQXHgmpSTSFDvx3oJ07rJ4I1TzvDy/oIyr -+S+6Iq6MWZQQ0ngK1SS763gZ6QIDAQABo0kwRzAOBgNVHQ8BAf8EBAMCAqQwEwYD -VR0lBAwwCgYIKwYBBQUHAwEwDwYDVR0TAQH/BAUwAwEB/zAPBgNVHREECDAGhwSs -EQ9qMA0GCSqGSIb3DQEBCwUAA4IBAQAxQ2K/iXsOsGSIP+fAfNjDGc7S0FnwkRQx -byYHFfUOWABPAq5+Okz4+MpS5v1OyecrrnAxc4udiQWKSdb4telvq+buIZvzXXS8 -35i3cdNUIrI9d1jVnAVv2rNsApDdaiUtWCWa491FgpDctQol7S2T0ehDbayzsv/W -AfqpbmAKhKc9DrIABBkAS0bFvCtO+KIflvdX4f9tyPxw4f8FAgQJRl7XeC3TBrIv -a7EFfa9JKmx0saCLWgG75e3P+USX9pEeUUXX6J9Vo9ssYqREs3+2dlYerb7xMg6b -RoF1AJKzmSU6+U3K0tcKqmrm1mdcp4XDMXVBLrksLCeHImHAU5IE ------END CERTIFICATE----- diff --git a/certs/cpascal_alt/selfsigned.key b/certs/cpascal_alt/selfsigned.key deleted file mode 100644 index e0ca647..0000000 --- a/certs/cpascal_alt/selfsigned.key +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEpAIBAAKCAQEAtHwYPutNpCCoGq3k3jmwMYUjORLvrbPYPXg4XGG8VnMl8zup -aFtleTeDQvp9mP+BkVEstY7alRgfeLOAaTYWukZW6PTchFlG6i5QfatXnkc9ytKt -QigUMPY0b3dd7d/DISY3L62bIG4/kk1nK8Lz/MO9yXeFtr2QSx6lE9yK27kDXc4l -Cd9giLq11d+I/tQ716DM7p2oU3J7gtkEuhjJEaJhRhAnmj74OftlRX0AMgdI14SM -6KjODzJlCgUWdM6TWIYx88kadwNjagVFydp/UG0aIdoQXHgmpSTSFDvx3oJ07rJ4 -I1TzvDy/oIyr+S+6Iq6MWZQQ0ngK1SS763gZ6QIDAQABAoIBABfaEoY2KJOZMT4d -XxKzQwK4WphPVVd6Xgh5m9ExpWczXn+PkerVsUpcuL8gRXrzwVbICIWR9Hchsf8M -kdrVqBlD9HZi7cWb2tBWAJMrErA7MAJe50DH9SsnTnSwrLwkBzCZUU++vLKDf2lB -S3IwGHh+EdXPMt+cm7YhvaUrTzqMNsZy2o7WvOabYaYbT/m4fKt7SDCGjDXXQ5aG -2Km3vo3GmIzR4cCeCqUel+KOvW22ZnFx1BSVE0VHtw8sQE2dzHDytJ2bHWw7hxBM -EkVCMCgYZlhkph4iZl0aOF9ZzZ2esLlTafyCBe5UVwCKFNAs4RKcPb5hxHNeqao7 -FXDYg+UCgYEAxFS8g48wcllQCbvSvEeOJAyMniJAOAiCTCB9CEilbTx/nrLTDDPv 
-l+chkZkENoDK1UB19Kl7ahFjvU2ldKoQRDv/9kqEUUEqZlSgKB2i2v+VaBINCuzE -P0aDV/DMPATi1/l3DJ7AqZGbZ3W9NLm2p/vYxrltsvgjrmBvQAS0kScCgYEA61Z2 -Te2vFf1HQSWyaNbT7eGacanoi59IOZ4Nn3AMoytwqi1871gCm2AV8n9+90HO73EY -xtFWAeDuAYkkB4imMCwCH2UOqjGm4SCIxn8/FAGcenCpcrfHZa9OZ9rWVgLgipoh -hKTx21B8W5l46QIevCdUCEofIUFdWH+QZg9vxm8CgYBkJa07lpDp43EWZf6rF/0D -mnsoNuR3MK/2USQ2U4g+3nti4mkRcnSnEbln4ZvqiWvtt8HzmXhgtJoh+DeL+Svv -AoQgiAZSxGDybfx8OhHWkEUQHxUUG45HkJpzA1vKydtEu/XkNB6m4KVa8w4elRT9 -P1RmmjPA9R6OVpapGz1YRQKBgQClYhWxnh807NFuTveaBZ7ZxXmr9yOdO0qecxxP -t4aqBJUggucRrF/ooRXLXR301SJ/0pL+5Y3ztAaOvbm8MaPYppS3S2WK5WnJO/vi -wvmpLZix0wrBu7U9GH2ZLokB2PPDc52qgeAVp/I8zRXTadH/mAPN74kA7daTotHv -Z1m9sQKBgQC/VJYHmUFOqcxFNugtbnzSGUQUEnaRFoIeRnk4OrBza8M1USIBkMa4 -kUXvfdhvxQ+LuVR1+FVQaqBJr8huiWp4i4/uNUsrAUMNynxbQrj6bJv9ENcan247 -WijUsHNjw3T04poPmC4Komr9QgJH9/azWGBkASTE0ybknOEExqGyww== ------END RSA PRIVATE KEY----- diff --git a/certs/devdockerCA.crt b/certs/devdockerCA.crt deleted file mode 100644 index cbb4d86..0000000 --- a/certs/devdockerCA.crt +++ /dev/null @@ -1,21 +0,0 @@ ------BEGIN CERTIFICATE----- -MIIDXTCCAkWgAwIBAgIJAKWrf3HzQceaMA0GCSqGSIb3DQEBCwUAMEUxCzAJBgNV -BAYTAlBMMRMwEQYDVQQIDApTb21lLVN0YXRlMSEwHwYDVQQKDBhJbnRlcm5ldCBX -aWRnaXRzIFB0eSBMdGQwHhcNMTcwODE3MjAyNDQyWhcNNDUwMTAyMjAyNDQyWjBF -MQswCQYDVQQGEwJQTDETMBEGA1UECAwKU29tZS1TdGF0ZTEhMB8GA1UECgwYSW50 -ZXJuZXQgV2lkZ2l0cyBQdHkgTHRkMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB -CgKCAQEArizMVikwgW08Gd3aE1uETUPWGArrfOhX0qSMf7hfnNNqHLfObCIlDsFM -5ympz1sSwVK+wWMOvA77hbWoz0zGcbDz3NxfPYVvJNh8x8Wp0ob2mvmivI7qNjXc -Eq/IxIjkazIX3OeeiHMSSRcGEujLmFAooWnK/DQRCgsbGV++funFeDi8Iu4yKw79 -ML8rCtOsq7URLsAkuM/ry/ru/8dT3av+JDV+2BJCFtzq1SYzEQ57aRSQtWNlj71H -ILYbXP2WmnpWIdeqiGh/ViYj2ZboTcT7UhkaDC4mnOf7WJIbq03DDoTX/i4uxFxp -HVNnHWOJNhBkz1pUm4axGfBjPkjZhwIDAQABo1AwTjAdBgNVHQ4EFgQUkTEDAPvA -ul3MY9eIiuvwCEunHqIwHwYDVR0jBBgwFoAUkTEDAPvAul3MY9eIiuvwCEunHqIw -DAYDVR0TBAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAI6d/Nd8kcLANarXJH9AD -CQaMGMV2xuI68pLgzI55OuMLsaEfhaMLrJGKxRUKxu1kHVoKGQD9UNdsZ0Gfo6Jl -u0BtMW0g2Ui5HBOyWdyWf70PxBcfFK4T/0lJw7vlpAi2SZZxt0pxE67ruaXJFW6e -wKXxBbwYxfS2JiQvn13NHjwFQVPOwsOQvfvrbjaTDn4y1mZoKqxGw7/YyxM1tfnS -imP05DDU2GzHBSkMvC1d/VIJMWv/7dlRl9qUCJrJMeHODM2QRWOWA5NauDcrtCYt -tUqGX0hruwhtOHavqWdV4iQujWlUh9jhvllpGWOjLUDAvVazk91NLFnXlgG11XqJ -ZA== ------END CERTIFICATE----- diff --git a/certs/devdockerCA.key b/certs/devdockerCA.key deleted file mode 100644 index f526d59..0000000 --- a/certs/devdockerCA.key +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEogIBAAKCAQEArizMVikwgW08Gd3aE1uETUPWGArrfOhX0qSMf7hfnNNqHLfO -bCIlDsFM5ympz1sSwVK+wWMOvA77hbWoz0zGcbDz3NxfPYVvJNh8x8Wp0ob2mvmi -vI7qNjXcEq/IxIjkazIX3OeeiHMSSRcGEujLmFAooWnK/DQRCgsbGV++funFeDi8 -Iu4yKw79ML8rCtOsq7URLsAkuM/ry/ru/8dT3av+JDV+2BJCFtzq1SYzEQ57aRSQ -tWNlj71HILYbXP2WmnpWIdeqiGh/ViYj2ZboTcT7UhkaDC4mnOf7WJIbq03DDoTX -/i4uxFxpHVNnHWOJNhBkz1pUm4axGfBjPkjZhwIDAQABAoIBABE4Do9qDQiOkwV/ -oEKPppm43h6MXdGAasZBTD6ILLOlVvU12qtoCL/XrlGgFdCREV6ZXpBhyaAf3w/N -8L1luqycsM8h3iyH14AOAvABA5PzkceqbEM14cOwOWyDglajrelHD4LgRxT4Q1OE -ei4+Y3pQWmPywIWEZDbY+Dne9Ec5jWunpobm1vGabJa8BPu7JHkmLxzrBNalFZgO -MSTIaQjOq6BtpR0oXZYGGDsRVff11ZhTBB9z39MJNjAV+Ew++BBPAGQLuWayWEmp -/Zr18iHwlG5W5xz1FkuSXVHoT7alDKKlh1EgU4zl7BseQsrThflhb8mwK/IxfXZ7 -yZe2QjECgYEA2wHnUO+MOEi/5TmzEyNY0BaIeXKMJYU9UhR2jN2wY6cA4L73T3jL -tHBvgeMBW0UpkEEh5R0u/mm/+RRhSAOyd0n6KoCwmEwomSNFHmuC+gBf7zfbCYlP -ILayaJIakR89OQCVq+9B+5LEtm9GtToNWUgd+9oKaRdKNZJbVxz4SBkCgYEAy5hL -uMfvMvY1Zy5NBe+dTutxaajqeWL9jP0HFQGWsFBItk3XRg9c40wdbVNIPph/HO9r -1jWzYmkUc0HJBBkxqgf0HH6TKL2fcMSW2JXiaxYq/TWmmJjVBDa1jtAzmFaxYfYk 
-MvJyyqKoc3Vp/KRNWCgh24n1fahzp76xIzme4p8CgYAnHXLr8KzRjKTBUwnZvDFr -kOq1DjFbri1ikIcP57ROSBCPE2xmREPVqVMxFpMycurWeduM6SZwiACigvJRA2AH -kC3fE3b27Hs/xp3crgUJk7GOryIu1NhOFnNM2/NkwfiUSrdwqt3VCcSadMtjKMgj -jw7T4YCEfoVhhg9MQr9RWQKBgB545/R7MEkPYsTTwEGAMKpMQQhQeO+8Kjj88h9p -wnUXPxrSIcNhr0Rx8wGW8nwz9OqOxzQji3rSQXZSxVRwtOc/E4FkTIBPiVqmgfFe -1amzgrGtYwb58QAVdC0w6lgFubzwy1A4y4eaut21GCzxJoZoU1D5oB2zpcPYncio -6if/AoGAURs6akekG382ow5IhbCmId1LXfbY9tpeJodSGk8/NZkwG3H+LdTq45Gz -nS/QvGNN7Pdy257MvPoZNw/KcOpN0ozAOksY2ZytLmqLa5VzHcpgCBw3dF9VicYx -u1kJR9Frkj2M9ZqG+ujCUhDUQAyEtCYz82Me46VZx/EXEDZDCkM= ------END RSA PRIVATE KEY----- diff --git a/certs/generate_cert.sh b/certs/generate_cert.sh deleted file mode 100644 index 8953228..0000000 --- a/certs/generate_cert.sh +++ /dev/null @@ -1,7 +0,0 @@ -# https://www.digitalocean.com/community/tutorials/how-to-set-up-a-private-docker-registry-on-ubuntu-1 -openssl genrsa -out devdockerCA.key 2048 -openssl req -x509 -new -nodes -key devdockerCA.key -days 10000 -out devdockerCA.crt -openssl genrsa -out cpascal.key 2048 -# IMPORTANT!: For example, if your Docker registry is going to be running on the domain www.ilovedocker.com, then your input should look like this: -openssl req -new -key cpascal.key -out cpascal.csr -openssl x509 -req -in cpascal.csr -CA devdockerCA.crt -CAkey devdockerCA.key -CAcreateserial -out cpascal.crt -days 10000 diff --git a/certs/lc-tlscert b/certs/lc-tlscert deleted file mode 100755 index 8a8d395..0000000 Binary files a/certs/lc-tlscert and /dev/null differ diff --git a/certs/lc-tlscert.go b/certs/lc-tlscert.go deleted file mode 100644 index 64da165..0000000 --- a/certs/lc-tlscert.go +++ /dev/null @@ -1,204 +0,0 @@ -/* -* Copyright 2014 Jason Woods. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* Derived from Golang src/pkg/crypto/tls/generate_cert.go -* Copyright 2009 The Go Authors. All rights reserved. -* Use of this source code is governed by a BSD-style -* license that can be found in the LICENSE file. - */ - -package main - -import ( - "bufio" - "crypto/rand" - "crypto/rsa" - "crypto/x509" - "crypto/x509/pkix" - "encoding/pem" - "fmt" - "math/big" - "net" - "os" - "strconv" - "time" -) - -var input *bufio.Reader - -func init() { - input = bufio.NewReader(os.Stdin) -} - -func readString(prompt string) string { - fmt.Printf("%s: ", prompt) - - var line []byte - for { - data, prefix, _ := input.ReadLine() - line = append(line, data...) 
- if !prefix { - break - } - } - - return string(line) -} - -func readNumber(prompt string) (num int64) { - var err error - for { - if num, err = strconv.ParseInt(readString(prompt), 0, 64); err != nil { - fmt.Println("Please enter a valid numerical value") - continue - } - break - } - return -} - -func anyKey() { - input.ReadRune() -} - -func main() { - var err error - - template := x509.Certificate{ - Subject: pkix.Name{ - Organization: []string{"Log Courier"}, - }, - NotBefore: time.Now(), - - KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageKeyEncipherment | x509.KeyUsageDigitalSignature, - ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - BasicConstraintsValid: true, - - IsCA: true, - } - - fmt.Println("Specify the Common Name for the certificate. The common name") - fmt.Println("can be anything, but is usually set to the server's primary") - fmt.Println("DNS name. Even if you plan to connect via IP address you") - fmt.Println("should specify the DNS name here.") - fmt.Println() - - template.Subject.CommonName = readString("Common name") - fmt.Println() - - fmt.Println("The next step is to add any additional DNS names and IP") - fmt.Println("addresses that clients may use to connect to the server. If") - fmt.Println("you plan to connect to the server via IP address and not DNS") - fmt.Println("then you must specify those IP addresses here.") - fmt.Println("When you are finished, just press enter.") - fmt.Println() - - var cnt = 0 - var val string - for { - cnt++ - - if val = readString(fmt.Sprintf("DNS or IP address %d", cnt)); val == "" { - break - } - - if ip := net.ParseIP(val); ip != nil { - template.IPAddresses = append(template.IPAddresses, ip) - } else { - template.DNSNames = append(template.DNSNames, val) - } - } - - fmt.Println() - - fmt.Println("How long should the certificate be valid for? 
A year (365") - fmt.Println("days) is usual but requires the certificate to be regenerated") - fmt.Println("within a year or the certificate will cease working.") - fmt.Println() - - template.NotAfter = template.NotBefore.Add(time.Duration(readNumber("Number of days")) * time.Hour * 24) - - fmt.Println("Common name:", template.Subject.CommonName) - fmt.Println("DNS SANs:") - if len(template.DNSNames) == 0 { - fmt.Println(" None") - } else { - for _, e := range template.DNSNames { - fmt.Println(" ", e) - } - } - fmt.Println("IP SANs:") - if len(template.IPAddresses) == 0 { - fmt.Println(" None") - } else { - for _, e := range template.IPAddresses { - fmt.Println(" ", e) - } - } - fmt.Println() - - fmt.Println("The certificate can now be generated") - fmt.Println("Press any key to begin generating the self-signed certificate.") - anyKey() - - priv, err := rsa.GenerateKey(rand.Reader, 2048) - if err != nil { - fmt.Println("Failed to generate private key:", err) - os.Exit(1) - } - - serialNumberLimit := new(big.Int).Lsh(big.NewInt(1), 128) - template.SerialNumber, err = rand.Int(rand.Reader, serialNumberLimit) - if err != nil { - fmt.Println("Failed to generate serial number:", err) - os.Exit(1) - } - - derBytes, err := x509.CreateCertificate(rand.Reader, &template, &template, &priv.PublicKey, priv) - if err != nil { - fmt.Println("Failed to create certificate:", err) - os.Exit(1) - } - - certOut, err := os.Create("selfsigned.crt") - if err != nil { - fmt.Println("Failed to open selfsigned.pem for writing:", err) - os.Exit(1) - } - pem.Encode(certOut, &pem.Block{Type: "CERTIFICATE", Bytes: derBytes}) - certOut.Close() - - keyOut, err := os.OpenFile("selfsigned.key", os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600) - if err != nil { - fmt.Println("failed to open selfsigned.key for writing:", err) - os.Exit(1) - } - pem.Encode(keyOut, &pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(priv)}) - keyOut.Close() - - fmt.Println("Successfully generated certificate") - fmt.Println(" Certificate: selfsigned.crt") - fmt.Println(" Private Key: selfsigned.key") - fmt.Println() - fmt.Println("Copy and paste the following into your Log Courier") - fmt.Println("configuration, adjusting paths as necessary:") - fmt.Println(" \"transport\": \"tls\",") - fmt.Println(" \"ssl ca\": \"path/to/selfsigned.crt\",") - fmt.Println() - fmt.Println("Copy and paste the following into your LogStash configuration, ") - fmt.Println("adjusting paths as necessary:") - fmt.Println(" ssl_certificate => \"path/to/selfsigned.crt\",") - fmt.Println(" ssl_key => \"path/to/selfsigned.key\",") -} diff --git a/certs/pascal-tower01/pascal-tower01.crt b/certs/pascal-tower01/pascal-tower01.crt deleted file mode 100644 index 8ea4bfd..0000000 --- a/certs/pascal-tower01/pascal-tower01.crt +++ /dev/null @@ -1,13 +0,0 @@ ------BEGIN CERTIFICATE----- -MIICATCCAWoCCQDmTxy8bTY1qzANBgkqhkiG9w0BAQsFADBFMQswCQYDVQQGEwJQ -TDETMBEGA1UECAwKU29tZS1TdGF0ZTEhMB8GA1UECgwYSW50ZXJuZXQgV2lkZ2l0 -cyBQdHkgTHRkMB4XDTE3MDgxNzE0NDAyMloXDTE4MDgxNzE0NDAyMlowRTELMAkG -A1UEBhMCUEwxEzARBgNVBAgMClNvbWUtU3RhdGUxITAfBgNVBAoMGEludGVybmV0 -IFdpZGdpdHMgUHR5IEx0ZDCBnzANBgkqhkiG9w0BAQEFAAOBjQAwgYkCgYEAvmYi -B/yi08LJaA7iylTG0xlT7BuTkQUMyJbX4ZYc9aNJgrhuO5jnAarTunHyiuiV2/Oc -HBwYafqSB1lVTCEfV4XH/anLnxG3edUT/ZDIyqwpcbFWayto+uM9ZUhww7i1CSl7 -z6OyEr8Y/3xRaR8e9KmrntFTDYgtFZJ6GqrapqsCAwEAATANBgkqhkiG9w0BAQsF -AAOBgQBBcur0UYxqng9QGFWRz5JvuKUJtcLTcygh5iAwX+zkLs1R52ZHoT/5AlGo -i0fInAXzm9eG8XhlHpLGU/Kl6yd0pnafoUjxyr3U8E9pN+rbIs9Qf/CR2noNiCeY 
-cxenk2J/L5R7ztWOq39pZukuGL9EHCPA6fGa/jXXdh1zIjeF4A==
------END CERTIFICATE-----
diff --git a/certs/pascal-tower01/pascal-tower01.csr b/certs/pascal-tower01/pascal-tower01.csr
deleted file mode 100644
index 7b11eef..0000000
--- a/certs/pascal-tower01/pascal-tower01.csr
+++ /dev/null
@@ -1,11 +0,0 @@
------BEGIN CERTIFICATE REQUEST-----
-MIIBhDCB7gIBADBFMQswCQYDVQQGEwJQTDETMBEGA1UECAwKU29tZS1TdGF0ZTEh
-MB8GA1UECgwYSW50ZXJuZXQgV2lkZ2l0cyBQdHkgTHRkMIGfMA0GCSqGSIb3DQEB
-AQUAA4GNADCBiQKBgQC+ZiIH/KLTwsloDuLKVMbTGVPsG5ORBQzIltfhlhz1o0mC
-uG47mOcBqtO6cfKK6JXb85wcHBhp+pIHWVVMIR9Xhcf9qcufEbd51RP9kMjKrClx
-sVZrK2j64z1lSHDDuLUJKXvPo7ISvxj/fFFpHx70qaue0VMNiC0VknoaqtqmqwID
-AQABoAAwDQYJKoZIhvcNAQELBQADgYEAuyhM12Jp6kDj2kSjaKNXFCRuAB4BshQS
-2UDi5bxTmxR8g59322Ba8PHB4p02EP+ZTB9VoEtXEcXczqhcbTFZElMiiWgs8Ukd
-7QwlL5ex+NO905au4JI4kmjXYp33aaipj/Z6ihc7je0o6wBh4gVU1GTbu5e3YLgv
-V39QKbq2Spo=
------END CERTIFICATE REQUEST-----
diff --git a/certs/pascal-tower01/pascal-tower01.key b/certs/pascal-tower01/pascal-tower01.key
deleted file mode 100644
index c963d9c..0000000
--- a/certs/pascal-tower01/pascal-tower01.key
+++ /dev/null
@@ -1,18 +0,0 @@
------BEGIN RSA PRIVATE KEY-----
-Proc-Type: 4,ENCRYPTED
-DEK-Info: DES-EDE3-CBC,5EB4E1A1D1F44E04
-
-QoJ7nHB+t6HE5RTtZYs7XYXLd8YZfeQRZ814W3IZ835cHcwgkCoNfbJKxz81+rd8
-NoO5hcrSvgwQtHphKkXaOL4anzOYk1DEazKmwC8NHiXzAoHZdiK3U+qIiN0edAdx
-PLQwu1y5ZLhTbzrOy3xkj/UznVW/unF54xy/UD40QtE/PI4+2XMC/ouB1sURPhoC
-ytxuTY5KBbmPFCSzNB+/VpuvBhU+QQnS+2igP5vwLbJpENIjsaX8Ql9p4tLto5yQ
-5sMkmVnG0dL+x2CU+CacvFLxhpkJ7jQk+u0nHHbDZgAwLKjKdriqZZL0R6bKg8ZC
-smn6uf5yKRtO/0PvjRUWcGzW2XUaGNbbHBq1SzHnNSXYlI+2/YrDYHXt7i5w0/J/
-GygVyf5szZC2GnpHGtdJ5CvbUt966wPxdqxnza0hX8tQQOq5E3n7p/l1tUT2/Df9
-iagzwSy/1RDubkFpGLZ33GJ+gkkrl1ZZCuQ55osfKgxwyXLL1yF2BOM75S7PCVWh
-az+0N4qpD06sBxgSHksQ7nFXPUt4I22HSaRjOo6zv/qPlTlM+qEy7FN3dZsNpf0n
-OugslzVYY6W8M4Pt4oEKVdvhxUHI6EP3eYG+TZexRkXna3prR5mbB3wbeiBsi2z0
-HCehCvpj1Y1dOtBdgrBR7qvPBsYRODMsnrzZpyAgsl0O5YI2Rn4qPs+Z98FssqgS
-7ls1RNJPY4UY//LEn/DvhotIjxiesehUjpUU0TmcY/SubkcbBrVc7mYe+X0fgRVH
-fMBB7lRn8SAQee+XYKL45/OVHLF8OFL1E8mdQl5LWu9mZAzfyk6z3g==
------END RSA PRIVATE KEY-----
diff --git a/docs/contribution.md b/docs/contribution.md
new file mode 100644
index 0000000..602ff46
--- /dev/null
+++ b/docs/contribution.md
@@ -0,0 +1,11 @@
+### documentation
+
+To build and publish the documentation use:
+
+```bash
+pip install mkdocs
+mkdocs build
+mkdocs gh-deploy
+```
+
+See the [mkdocs documentation](https://www.mkdocs.org/) for more details.
diff --git a/docs/dispatcher.md b/docs/dispatcher.md
new file mode 100644
index 0000000..f6f627c
--- /dev/null
+++ b/docs/dispatcher.md
@@ -0,0 +1,87 @@
+# dispatcher
+
+- [Call stack](#call-stack)
+  - [standard_mrunner_main](#standard_mrunner_main)
+- [How to debug](#how-to-debug)
+
+## remarks on current state
+
+- only tags from the python spec function are placed in neptune.yaml
+
+### directories
+
+
+
+## Sequence
+
+- mrunner is run with a python config
+  - the spec function from the python config is evaluated, yielding a list of `Experiment` objects
+  - a neptune.yaml is generated for each experiment
+    - uses only: project name, experiment name, parameters, tags, description
+    - neptune.yaml files are generated into ....
+    - for generating the neptune yaml only a dictionary with the following keys is
+      required (see the sketch below):
+      project, name, parameters, [tags], [description]
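+
+      A minimal sketch of such a generated file might look as follows (all values
+      are illustrative; the `parameters` entry layout follows the old-dispatcher
+      notes below):
+
+      ```yaml
+      project: sandbox
+      name: experiment1
+      parameters:
+        - name: param1
+          type: int
+          required: false
+          default: 1
+      tags: [grid_search_k-12-48]
+      description: sample experiment
+      ```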
+
+## Sequence (old dispatcher)
+
+- shell script
+  - env setup (environment variables + venv activate etc)
+  - python dispatcher.py
+    - load list of experiments
+      - special support for composite experiments
+    - register experiments in omninote
+    - obtain XRunConfig
+      - special support for composite experiments
+      - update attributes from command line
+    - for each experiment
+      - generate neptune yaml
+        - name, project
+        - parameters: name, type, required, default
+          - update experiment structure parameters based on CLI arguments
+        - tags
+      - generate mrunner cmd
+        - depends on XRunType
+        - slurm: update env (PLGRID_USERNAME, MRUNNER_SCRATCH_SPACE, PLGRID_HOST)
+    - optionally the experiments/cmds list is shuffled and/or trimmed to size
+    - run experiments
+      - executes the generated mrunner cmd
+      - depending on the mrunner backend, calls are sync (slurm-srun) or async (slurm-sbatch, k8s)
+      - depending on the parallel flag passed in the neptune CLI:
+        - sequential os.system calls if not run in parallel
+        - `subprocess.Popen` executed by n threads otherwise
+
+### standard_mrunner_main
+
+Required environment variables which determine the method of setting the
+**parameters list** and the **experiment directory**:
+
+| key                    | values | description                                         |
+| ---------------------- | ------ | --------------------------------------------------- |
+| MRUNNER_UNDER_NEPTUNE  | 0/1    | both are obtained from neptune                      |
+| PMILOS_DEBUG           | 0/1    | experiment directory passed as cmd argument, parameters evaluated from python file |
+| RESOURCE_DIR_PATH      | path   | experiment directory pointed to by this env var, parameters evaluated from neptune yaml file |
+
+When an experiment is started:
+
+- under neptune - the neptune `storage_dir` and params are used
+- under `PMILOS_DEBUG` additional arguments are parsed (same as in the original `dispatcher.py`)
+  - `--ex` - path to the experiment-describing python file from which the
+    function pointed to by `--spec` is executed; this function shall return a
+    structure with a `parameters` attribute containing the experiment parameters
+  - `--spec` - as mentioned above: name of the function which returns the structure
+    containing the parameters list (see the sketch below)
+  - `--exp_dir_path` - path to the experiment directory
+- when both neptune and the `MRUNNER_DEBUG` environment variable are not set
+  - `--neptune` - path to the neptune yaml file; parameters are obtained from the `parameters`
+    key and `default` values are used
+  - `RESOURCE_DIR_PATH` - environment variable pointing to the experiment directory
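+
+A minimal sketch of such a spec file (the file and structure names here are
+hypothetical; only the `parameters` attribute is assumed by
+`standard_mrunner_main`):
+
+```python
+# experiment_spec.py - hypothetical spec file; run with: --ex experiment_spec.py --spec spec
+from collections import namedtuple
+
+# any structure with a `parameters` attribute will do; a namedtuple is the simplest
+ExperimentSpec = namedtuple('ExperimentSpec', ['parameters'])
+
+
+def spec():
+    # returns the structure containing the experiment parameters
+    return ExperimentSpec(parameters={'param1': 1, 'learning_rate': 0.001})
+```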
+
+The neptune context obtained in this function is ignored.
+**The experiment dir is also often ignored**, except in:
+- `model_rl_experiment_loop.py` - used to set the log dir (see NeptuneLogger)
+- `run_kanapa_ppo.py` - used to set `env_model_path`
+
+## How to debug
+
+TBD
\ No newline at end of file
diff --git a/docs/images/k8s_storage.png b/docs/images/k8s_storage.png
new file mode 100644
index 0000000..8ee08cc
Binary files /dev/null and b/docs/images/k8s_storage.png differ
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..f31acd4
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,14 @@
+mrunner is a tool for running experiment code on different
+computation systems, without manual deployment and with significantly
+less configuration. Its main features are:
+
+- preparing the remote environment
+- deploying code
+- running experiments
+  - use of a scheduler, based on management of available resources
+    (if the remote system supports it)
+- monitoring experiments using [neptune](https://neptune.ml)
+
+Currently [slurm](https://slurm.schedmd.com) and
+[kubernetes](http://kubernetes.io) clusters are supported.
+It is also possible to run experiments locally.
diff --git a/docs/kubernetes.md b/docs/kubernetes.md
new file mode 100644
index 0000000..6067ab8
--- /dev/null
+++ b/docs/kubernetes.md
@@ -0,0 +1,195 @@
+A Kubernetes cluster may be used to manage computation resources and
+schedule experiment jobs accordingly. Read more about
+[Kubernetes objects](https://kubernetes.io/docs/concepts/overview/working-with-objects/kubernetes-objects/).
+
+**So far kubernetes support has been tested only on GKE** -
+thus it may need some code updates in order to run on an
+on-premise cluster.
+
+### Setup kubernetes
+
+(These steps still need to be followed and checked by other people.)
+
+1. To manage cluster resources and jobs, install the
+   [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl)
+   tool and set up a local [docker](https://docs.docker.com/install/#server)
+   engine.
+1. mrunner and kubectl use a kubectl context, which determines which
+   cluster they communicate with. It may be configured by one of
+   the methods below:
+   - while using a google kubernetes cluster (GKE) it is done
+     with the gcloud tool - see the [GKE](#google-kubernetes-engine-gke) section
+     for details,
+   - while using [minikube](https://github.com/kubernetes/minikube),
+     a `minikube` context is created while starting the local cluster,
+   - defining the context in a YAML config (TBD)
+
+   It is possible to switch between already configured contexts
+   using:
+
+   ```commandline
+   kubectl config use-context <context-name>
+   kubectl config current-context    # show context which is used
+   ```
+
+The kubectl configuration is stored in the `~/.kube/config` file and
+can be viewed using:
+
+```commandline
+kubectl config view
+```
+
+##### Google Kubernetes Engine (GKE)
+
+If you plan to use [GKE](https://cloud.google.com/kubernetes-engine/),
+additionally follow the steps below:
+
+1. Install the [gcloud](https://cloud.google.com/sdk/docs/quickstarts) tool,
+   which will provide authorization to access GKE clusters. It also contains functionality
+   to manage GKE clusters and Google Cloud Storage (GCS).
+1. To configure cluster credentials and the kubectl context follow the steps below:
+   - Go to the [GKE console](https://console.cloud.google.com/kubernetes)
+   - Select the project
+   - Press the `connect` button on the clusters list
+   - Copy and paste the `gcloud` command line
+   - Authorize the google cloud sdk by obtaining a token with:
+
+     ```sh
+     gcloud auth application-default login
+     ```
+
+### remote context keys for kubernetes
+
+Possible remote context keys:
+
+| key | req | description | example |
+| --------------- | --- | ------------------------------------------------ | ------------------ |
+| name | R | unique name which identifies the context | rl.sandbox |
+| type | R | shall equal `kubernetes` | kubernetes |
+| storage | R | path to the directory where neptune CLI will store data for experiment provenance (required even when neptune is disabled) | /storage |
+| registry_url | R | url of the docker image repository where built experiment images will be published (shall also be reachable from the cluster) | https://gcr.io |
+| resources | O | defines resource limits for every experiment (by default no resource limits) | {cpu: 4, tpu: 1, mem: 8G} |
+| neptune | O | enable/disable neptune (by default enabled) | true |
+| google_project_id | O | if using GKE, set this key to the google project id | rl-sandbox-1234 |
+| default_pvc_size | O | size of the storage created for a new project (see the [persistent volumes](#persistent-volumes) section; by default a volume of size `KubernetesBackend.DEFAULT_STORAGE_PVC_SIZE` is created) | 100G |
+
+### Run experiment on kubernetes
+
+Available options specific to kubernetes clusters:
+
+| key | req | description / additional information | example |
+| --------------- | --- | ------------------------------------------------ | ------------------ |
+| config | R | path to the neptune experiment configuration; mrunner uses i.a. the project and experiment names and the parameters list | neptune.yaml |
+| base_image | R | name and tag of the base docker image used to build the experiment docker image | python:3 |
+| requirements | R | path to a requirements.txt file with python requirements | requirements.txt |
+
+Sample command call:
+
+```commandline
+mrunner run --config neptune.yaml \
+            --tags "grid_search_k-12-48" --tags new_data \
+            --requirements requirements.txt \
+            --base_image python:3 experiment1.py -- --param1 1
+```
+
+Another example could be:
+
+```commandline
+mrunner --context gke.sandbox run --config neptune.yaml \
+        --base_image python:3 \
+        --requirements requirements.txt \
+        -- experiment1.py -- --epochs 3
+```
+
+Notice (in both examples) that certain flags refer to `mrunner` itself
+(e.g. config, base_image) and others to the experiment/script that we wish to run (e.g. epochs, param1);
+the two sets are separated by '--'. The context is provided to mrunner before `run`.
+
+While running experiments on kubernetes, mrunner performs the following
+steps:
+
+1. Prepares a docker image based on the parameters provided on the command line
+   - see the `templates/Dockerfile.jinja2` file for details
+   - during the build the docker cache is used, so if there is no change
+     in the requirements.txt file, the build shall be relatively fast
+2. If a new image was generated, tags it with a timestamp and publishes it in the
+   docker container repository.
+3. Ensures the kubernetes configuration (creates resources if missing)
+   - a namespace named after the project exists; see the [cluster namespaces](#cluster-namespaces) section
+     for how to switch `kubectl` between them.
+   - a [persistent volume claim](#persistent-volumes)
+4. Generates the kubernetes job - in fact, your experiment
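+
+The image handling in steps 1-2 corresponds roughly to the docker commands
+below (the image name and tag scheme are only illustrative; mrunner derives
+them from the context's `registry_url` and the experiment name):
+
+```commandline
+docker build -t gcr.io/<google_project_id>/experiment1:20180601-1200 .
+docker push gcr.io/<google_project_id>/experiment1:20180601-1200
+```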
+
+
+### Cluster namespaces
+
+For each project, a new namespace is created in the kubernetes cluster.
+This provides freedom in experiment naming, the possibility to manage
+resource quotas per project, and separate storage.
+More details may be found in the
+[kubernetes documentation](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/).
+
+The kubectl tool needs to have a default namespace set. Otherwise, we must pass the `--namespace|-n` option with each kubectl call. To make kubectl access a given namespace's resources by default, set the
+kubectl context with:
+
+```commandline
+kubectl get namespace
+kubectl config set-context $(kubectl config current-context) \
+    --namespace=<namespace>
+```
+
+### Persistent volumes
+
+To gather project data from different experiments in a single place,
+it is required to create a set of Persistent Volume related resources (see the diagram below and the [nfs example](https://github.com/kubernetes/examples/tree/master/staging/volumes/nfs)). During execution of each experiment, this setup is checked for existence and correctness. The size of `pvc/storage` is defined by the `default_pvc_size` key from the mrunner context; if it is not provided, a volume of size `KubernetesBackend.DEFAULT_STORAGE_PVC_SIZE` (40GB) will be created.
+
+![k8s_storage](images/k8s_storage.png)
+
+By default the volume is mounted under the directory pointed to by the `$STORAGE_DIR` environment variable (the same path as passed in the `storage` key of the mrunner context).
+
+### Kubernetes tools cheat sheet
+
+To check which cluster kubectl is communicating with:
+
+```commandline
+kubectl config current-context
+```
+
+To observe the current status of jobs from the command line you may use:
+
+```commandline
+watch 'kubectl get all,pvc,pv -a'
+watch 'kubectl get all,pvc,pv -a -o wide'
+watch 'kubectl get all,pvc,pv -a --field-selector=metadata.namespace!=kube-system --all-namespaces'
+```
+
+To observe logs from a given pod (also completed and failed ones) you may use:
+
+```commandline
+kubectl logs -f <pod-name>
+```
+
+In order to connect to a running pod use:
+
+```commandline
+kubectl attach <pod-name>
+```
+
+To delete a job from the cluster use the command below. Be aware that the related
+pod will also be deleted.
+
+```commandline
+kubectl delete job <job-name>
+```
+
+To show details of a job or pod use:
+
+```commandline
+kubectl describe <resource-name>
+```
+
+To download data from a persistent volume use:
+
+```commandline
+TBD
+```
diff --git a/docs/neptune.md b/docs/neptune.md
new file mode 100644
index 0000000..5734e82
--- /dev/null
+++ b/docs/neptune.md
@@ -0,0 +1,60 @@
+mrunner runs experiments with [neptune](http://neptune.ml).
+Currently it is not possible to disable it.
+
+### Authorization
+
+The method of authorization with the neptune server depends on the neptune version.
+For v1, credentials are stored in the neptune global configuration file
+(by default `~/.neptune.yaml`). Example configuration:
+
+```yaml
+host: <host>
+port: 443
+username: <username>
+password: <password>
+```
+
+For v2, the `neptune account login` command is used to authorize and obtain OAuth2 tokens.
+More details on configuration may be found in the
+[v1.6](http://neptune-docs.deepsense.codilime.com/versions/1.6/reference-guides/cli.html) and
+[v2](https://docs.neptune.ml/config/intro/) documentation.
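+
+For example, with v2 a one-time interactive login on the machine from which
+experiments are started is typically sufficient:
+
+```commandline
+neptune account login
+```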
+
+#### Internals details
+
+Tokens are stored (again depending on the neptune-cli version) in:
+
+- `$HOME/.neptune_tokens/` for neptune-cli>=2.0.0,neptune-cli<=2.8.14
+- `$HOME/.neptune/tokens/` for neptune-cli>=2.8.15
+
+The required connection parameters/tokens are passed during remote execution using
+environment variables or attached as a file to the experiment archive.
+
+### Tags
+
+Experiment related neptune tags may be set in 4 places:
+- fixed tags shall be placed in the `neptune.yaml` file under the `tags` key
+- fixed tags may also be placed in the context under the `tags` key
+- run related tags may additionally be added with the CLI `--tags` parameter
+- tags may be added programmatically to the neptune context (TODO: add sample)
+
+```commandline
+mrunner run --config neptune.yaml \
+            --tags "grid_search_k-12-48" --tags new_data \
+            --requirements requirements.txt \
+            --base_image python:3 experiment1.py -- --param1 1
+```
+
+### Storage
+
+[TODO: describe difference between v1 and v2]
+
+### Running neptune in mpirun/srun experiments
+
+[TODO: test it]
+
+### Requirements overwriting in older versions
+
+There is an issue with the installation of other packages alongside neptune-cli<=2.8.8.
+
+When packages with conflicting requirements are added, it is observed
+that older versions of packages may be installed if they are installed after neptune-cli.
\ No newline at end of file
diff --git a/docs/setup.md b/docs/setup.md
new file mode 100644
index 0000000..cfd96fe
--- /dev/null
+++ b/docs/setup.md
@@ -0,0 +1,63 @@
+mrunner is tested on python2.7 and python3.5, but newer versions
+of python3 shall also work.
+Additionally, we recommend installing and using mrunner in a
+[virtualenv](http://docs.python-guide.org/en/latest/dev/virtualenvs/).
+
+1. To install it use the following commands:
+
+   ```shell
+   pip install neptune-cli==1.6
+   pip install git+ssh://git@pascal-tower01.intra.codilime.com/ml-robotics/mrunner.git@develop
+   ```
+
+   The above sequence is related to neptune
+   [issues](#issue-with-requirements).
+
+1. Set some [remote contexts](#remote-context) and select the active one
+1. Configure clients for at least one system:
+   - [slurm](#slurm)
+   - [kubernetes](#kubernetes)
+
+### Remote context
+
+To avoid passing configuration details of the computation system each time, it is possible
+to store predefined configuration parameters. Each configuration is named and
+can be selected during experiment start or set by default:
+
+```commandline
+mrunner run foo.py -- --param1 2    # run with context defined in current_context config key
+mrunner --context plgrid.agents run foo.py -- --param1 2
+mrunner --context plgrid.agents --config mrunner.yaml run foo.py -- --param1 2    # loads configuration from local file (instead of user configuration directory)
+```
+
+The set of keys depends on the type of the remote context. For a description
+of the available keys go to the proper section (e.g. [slurm](#remote-context-keys-for-slurm),
+[kubernetes](#remote-context-keys-for-kubernetes)).
+
+To manage contexts use the `context` command. Example calls:
+
+```commandline
+mrunner context
+mrunner context add --name gke.sandbox --backend_type kubernetes \
+    --registry_url https://gcr.io --storage /storage \
+    --resources "tpu=1 cpu=4"
+mrunner context edit gke.sandbox            # opens editor with context parameters
+mrunner context set-active gke.sandbox
+mrunner context delete gke.sandbox
+mrunner context copy gke.sandbox gke.new
+mrunner --config mrunner.yaml context set-active gke.sandbox
+```
+
+Example remote context:
+
+```yaml
+name: gke.sandbox
+type: kubernetes
+registry_url: https://gcr.io
+resources:
+  cpu: 4
+  tpu: 1
+neptune: true
+storage: /storage
+```
+
diff --git a/docs/slurm.md b/docs/slurm.md
new file mode 100644
index 0000000..593942f
--- /dev/null
+++ b/docs/slurm.md
@@ -0,0 +1,88 @@
+Read the [presentation](http://www2.chemia.uj.edu.pl/cttc7/plgworkshop/2016.09.08.cttc7.plgrid.workshop.pdf)
+from the PLGrid Workshop to gain knowledge about slurm and PLGrid clusters.
+
+### Setup slurm
+
+### remote context keys for slurm
+
+Possible remote context keys:
+
+| key | req | description | example |
+| -------------------- | --- | --------------------------------------------------- | ------------------ |
+| name | R | unique name which identifies the context | plgrid.plggluna.sandbox |
+| type | R | shall equal `slurm` | slurm |
+| slurm_url | R | username and address of the slurm cluster | chnorris@pro.cyfronet.pl |
+| storage_dir | R | path to the directory where neptune CLI will store data for experiment provenance (required even when neptune is disabled; may use env variables) | /storage |
+| partition | R | request a specific slurm partition for the resource allocation | plgrid-testing |
+| user_id | R | any meaningful user id used to identify the owner of an experiment | pz |
+| scratch_dir | O | subdirectory under the $SCRATCH dir (default `mrunner`) | mrunner |
+| resources | O | defines resource limits for every experiment (by default no resource limits) | {cpu: 4, gpu: 1, mem: 8G} |
+| neptune | O | enable/disable neptune (by default enabled) | true |
+| modules_to_load | O | list of space separated additional slurm modules to load | plgrid/tools/python/3.6.0 plgrid/tools/ffmpeg/3.2.2 |
+| after_module_load_cmd | O | shell oneliner executed after slurm module load, before sourcing the venv | export PATH=/net/people/plghenrykm/anaconda2/bin:$PATH; source activate opensim-rl-2.7 |
+| venv | O | path to the virtual environment; can be overwritten by the CLI `venv` option | /net/people/plghenrykm/ppo_tpu/ppo_env |
+| time | O | sets a limit on the total run time of the job allocation; if the requested time limit exceeds the partition's time limit, the job will be left in a PENDING state (possibly indefinitely) (used as an `sbatch` flag) | 3600000 |
+| ntasks | O | advises the slurm controller that job steps run within the allocation will launch a maximum of this number of tasks, and provides sufficient resources; the default is one task per node, but note that the slurm `--cpus-per-task` option will change this default | |
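+
+An example slurm remote context, analogous to the kubernetes one shown in
+[setup](setup.md), might look as follows (all values are illustrative and
+taken from the examples in the table above):
+
+```yaml
+name: plgrid.plggluna.sandbox
+type: slurm
+slurm_url: chnorris@pro.cyfronet.pl
+storage_dir: /storage
+partition: plgrid-testing
+user_id: pz
+resources:
+  cpu: 4
+  gpu: 1
+  mem: 8G
+neptune: true
+```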
+
+### plgrid
+
+Filesystem:
+- `/net/scratch` - lustre filesystem meant for short term use (cleaned after some period of time? 1 month? number of files quota?)
+  - `$SCRATCH` - your account's scratch directory
+  - `$SCRATCH/mrunner` - the current default scratch directory for mrunner
+- `/net/archive` - lustre filesystem
+  - `$PLG_GROUPS_STORAGE` - currently it points to `/net/archive/groups`
+- `/net/people` - NFS filesystem
+  - `$HOME` - your home directory (it is in `/net/people`)
+
+Read the [best practices](https://proteusmaster.urcf.drexel.edu/urcfwiki/index.php/Lustre_Scratch_Filesystem#Best_Practices)
+for using a lustre filesystem (not written specifically for the plgrid cluster, but they certainly apply there as well).
+
+### Run experiment on slurm
+
+Available options specific to the slurm cluster:
+
+| key | req | description / additional information | example |
+| --------------- | --- | ------------------------------------------------ | ------------------ |
+| config | R | path to the neptune experiment configuration; mrunner uses i.a. the project and experiment names and the parameters list | neptune.yaml |
+| base_image | R | name and tag of the base docker image used to build the experiment docker image | python:3 |
+| requirements | R | path to a requirements.txt file with python requirements | requirements.txt |
+
+Sample command call:
+
+```commandline
+mrunner run --config neptune.yaml \
+            --tags "grid_search_k-12-48" --tags new_data \
+            --requirements requirements.txt \
+            --base_image python:3 experiment1.py -- --param1 1
+```
+
+Another example could be:
+
+```commandline
+mrunner --context gke.sandbox run --config neptune.yaml \
+        --base_image python:3 \
+        --requirements requirements.txt \
+        -- experiment1.py -- --epochs 3
+```
+
+Notice (in both examples) that certain flags refer to `mrunner` itself
+(e.g. config, base_image) and others to the experiment/script that we wish to run (e.g. epochs, param1);
+the two sets are separated by '--'. The context is provided to mrunner before `run`.
+
+While running experiments on kubernetes, mrunner performs the following
+steps:
+
+1. Prepares a docker image based on the parameters provided on the command line
+   - see the `templates/Dockerfile.jinja2` file for details
+   - during the build the docker cache is used, so if there is no change
+     in the requirements.txt file, the build shall be relatively fast
+2. If a new image was generated, tags it with a timestamp and publishes it in the
+   docker container repository.
+3. Ensures the kubernetes configuration (creates resources if missing)
+   - a namespace named after the project exists; see the [cluster namespaces](#cluster-namespaces) section
+     for how to switch `kubectl` between them.
+   - a [persistent volume claim](#persistent-volumes)
+4. Generates the kubernetes job - in fact, your experiment
+
diff --git a/docs/tips.md b/docs/tips.md
new file mode 100644
index 0000000..80eb2de
--- /dev/null
+++ b/docs/tips.md
@@ -0,0 +1,23 @@
+### good practices
+
+- while using [kubernetes](#kubernetes) you can easily clean up unnecessary `jobs` and `pods` by running this command:
+
+  ```commandline
+  kubectl delete job <job-name>
+  ```
+
+  Do not delete `pvc`s or `namespace`s unless you're sure you want that;
+  otherwise you may accidentally delete the storage used by somebody else's job.
+
+### common errors
+
+- installing different major versions of neptune locally and remotely. As there is a difference in `neptune.yaml` semantics
+  between neptune-cli v1 and v2, parse errors are raised.
+  It may look like:
+
+```
+  File "/usr/local/lib/python3.5/dist-packages/pykwalify/core.py", line 209, in _validate
+    self._validate_sequence(value, rule, path, done=None)
+  File "/usr/local/lib/python3.5/dist-packages/pykwalify/core.py", line 288, in _validate_sequence
+    raise NotSequenceError(u"Value: {} is not of a sequence type".format(value))
+pykwalify.errors.NotSequenceError:
+```
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..009f432
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,10 @@
+site_name: mrunner
+pages:
+  - overview: index.md
+  - setup: setup.md
+  - neptune: neptune.md
+  - slurm: slurm.md
+  - kubernetes: kubernetes.md
+  - tips: tips.md
+  - contribution: contribution.md
+theme: readthedocs
diff --git a/mrunner/__init__.py b/mrunner/__init__.py
index caef67a..e69de29 100644
--- a/mrunner/__init__.py
+++ b/mrunner/__init__.py
@@ -1 +0,0 @@
-from mrunner.mrunner_user import mrunner_main
\ No newline at end of file
diff --git a/mrunner/backends/__init__.py b/mrunner/backends/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mrunner/backends/k8s.py b/mrunner/backends/k8s.py
new file mode 100644
index 0000000..d2fbe15
--- /dev/null
+++ b/mrunner/backends/k8s.py
@@ -0,0 +1,270 @@
+# -*- coding: utf-8 -*-
+import logging
+import re
+
+import attr
+from kubernetes import client, config
+
+from mrunner.experiment import COMMON_EXPERIMENT_MANDATORY_FIELDS, COMMON_EXPERIMENT_OPTIONAL_FIELDS
+from mrunner.utils.docker_engine import DockerEngine
+from mrunner.utils.utils import make_attr_class, filter_only_attr
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _generate_project_namespace(args):
+    # k8s namespace names may not contain spaces, dots or underscores
+    return re.sub(r'[ .,_-]+', '-', args.project)
+
+
+def _extract_cmd_without_params(args):
+    # everything before ' -- ' (plus a trailing ' --') is the command itself;
+    # the params after ' -- ' are extracted separately by _extract_params
+    cmd = args.cmd.command
+    if args.cmd and ' -- ' in args.cmd.command:
+        cmd = args.cmd.command.split(' -- ')[0] + ' --'
+    return cmd.split(' ')
+
+
+def _extract_params(args):
+    cmd = ''
+    if args.cmd and ' -- ' in args.cmd.command:
+        cmd = args.cmd.command.split(' -- ')[1].strip()
+    return cmd.split(' ') if cmd else []
+
+
+EXPERIMENT_MANDATORY_FIELDS = [
+    ('registry_url', dict()),  # url to docker registry
+    ('base_image', dict())  # experiment base docker image: name[:version]
+]
+
+EXPERIMENT_OPTIONAL_FIELDS = [
+    ('google_project_id', dict(default='')),
+    ('registry_username', dict(default='')),  # docker image registry credentials (not required for GKE)
+    ('registry_password', dict(default='')),
+    ('cmd_without_params', dict(init=False, default=attr.Factory(_extract_cmd_without_params, takes_self=True))),
+    ('params', dict(init=False, default=attr.Factory(_extract_params, takes_self=True))),
+    ('default_pvc_size', dict(default='')),
+    ('namespace', dict(init=False, default=attr.Factory(_generate_project_namespace, takes_self=True))),
+]
+
+EXPERIMENT_FIELDS = COMMON_EXPERIMENT_MANDATORY_FIELDS + EXPERIMENT_MANDATORY_FIELDS + \
+                    COMMON_EXPERIMENT_OPTIONAL_FIELDS + EXPERIMENT_OPTIONAL_FIELDS
+ExperimentRunOnKubernetes = make_attr_class('ExperimentRunOnKubernetes', EXPERIMENT_FIELDS, frozen=True)
+
+
+class Job(client.V1Job):
+    RESOURCE_NAME_MAP = {'cpu': 'cpu', 'mem': 'memory', 'gpu': 'nvidia.com/gpu', 'tpu': 'cloud-tpus.google.com/v2'}
+
+    def __init__(self, image, experiment):
+        from mrunner.utils.namesgenerator import get_random_name
+
+        experiment_name = re.sub(r'[ ,.\-_:;]+', '-', experiment.name)
+        name = '{}-{}'.format(experiment_name, get_random_name('-'))
+
+        envs = experiment.env.copy()
+        envs.update(experiment.cmd.env if experiment.cmd else {})
+        envs = {k: str(v) for k, v in envs.items()}
+
+        resources = dict([self._map_resources(name, qty) for name, qty in experiment.resources.items()])
+
+        internal_volume_name = 'experiment-storage'
+        vol = client.V1Volume(name=internal_volume_name,
+                              persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                  claim_name=KubernetesBackend.NFS_PVC_NAME))
+        ctr = client.V1Container(name=name, image=image, args=experiment.params,
+                                 volume_mounts=[client.V1VolumeMount(mount_path=experiment.storage_dir,
+                                                                     name=internal_volume_name)],
+                                 resources=client.V1ResourceRequirements(
+                                     limits={k: v for k, v in resources.items()}),
+                                 env=[client.V1EnvVar(name=k, value=v) for k, v in envs.items()])
+        pod_spec = client.V1PodSpec(restart_policy='Never', containers=[ctr], volumes=[vol])
+        pod_template = client.V1PodTemplateSpec(spec=pod_spec)
+        job_spec = client.V1JobSpec(template=pod_template, backoff_limit=0)  # , active_deadline_seconds=100)
+        super(Job, self).__init__(metadata=client.V1ObjectMeta(name=name), spec=job_spec)
+
+    def _map_resources(self, resource_name, resource_qty):
+        name = self.RESOURCE_NAME_MAP[resource_name]
+        if name == 'memory':
+            # k8s expects binary-suffix quantities (e.g. '8Gi') for memory
+            qty = resource_qty + 'i'
+        else:
+            qty = resource_qty
+        return name, qty
+
+    @staticmethod
+    def _escape_arg(arg):
+        return re.sub(r'[ .,_=-]+', '-', arg)
+
+
+class StandardPVC(client.V1PersistentVolumeClaim):
+
+    def __init__(self, name, size, access_mode):
+        resource_req = client.V1ResourceRequirements(requests={'storage': size + 'i'})
+        pvc_spec = client.V1PersistentVolumeClaimSpec(access_modes=[access_mode], resources=resource_req)
+        super(StandardPVC, self).__init__(metadata=client.V1ObjectMeta(name=name), spec=pvc_spec)
+
+
+class NFSDeployment(client.V1Deployment):
+    """
+    Pod which contains an NFS server sharing the mounted volume.
+    See details on https://github.com/kubernetes/examples/tree/master/staging/volumes/nfs
+    """
+    PORTS = {'nfs': 2049, 'mountd': 20048, 'rpcbind': 111}
+    LABELS = {'role': 'nfs-server'}
+    IMAGE = 'k8s.gcr.io/volume-nfs:0.8'
+
+    def __init__(self, name, storage_pvc):
+        internal_volume_name = 'nfs-server-volume'
+        mount_path = '/exports'
+
+        ctr = client.V1Container(name=name, image=self.IMAGE,
+                                 ports=[client.V1ContainerPort(name=k, container_port=v)
+                                        for k, v in self.PORTS.items()],
+                                 security_context=client.V1SecurityContext(privileged=True),
+                                 volume_mounts=[client.V1VolumeMount(mount_path=mount_path, name=internal_volume_name)])
+        volume_source = client.V1PersistentVolumeClaimVolumeSource(claim_name=storage_pvc)
+        volume = client.V1Volume(name=internal_volume_name, persistent_volume_claim=volume_source)
+        pod_spec = client.V1PodSpec(containers=[ctr], volumes=[volume])
+        pod_metadata = client.V1ObjectMeta(labels=self.LABELS)
+        pod_template = client.V1PodTemplateSpec(metadata=pod_metadata, spec=pod_spec)
+        rs_spec = client.V1ReplicaSetSpec(replicas=1,
+                                          selector=client.V1LabelSelector(match_labels=self.LABELS),
+                                          template=pod_template)
+        metadata = client.V1ObjectMeta(name=name, labels=self.LABELS)
+        super(NFSDeployment, self).__init__(metadata=metadata, spec=rs_spec)
+
+
+class NFSSvc(client.V1Service):
+    """
+    Service for the NFS pod
+    """
+
+    def __init__(self, name):
+        nfs_service_spec = client.V1ServiceSpec(ports=[client.V1ServicePort(name=k, port=v)
+                                                       for k, v in NFSDeployment.PORTS.items()],
+                                                selector=NFSDeployment.LABELS)
+        super(NFSSvc, self).__init__(metadata=client.V1ObjectMeta(name=name), spec=nfs_service_spec)
+
+
+class NFSPv(client.V1PersistentVolume):
+    """
+    Persistent volume which wraps the NFS server.
+ Provide PV with ReadWriteMany access mode, which is required to provide storage for pods + scheduled on different nodes + See details on https://github.com/kubernetes/examples/tree/master/staging/volumes/nfs + """ + STORAGE_CLASS = 'nfs' + + def __init__(self, name, nfs_server_ip): + pv_spec = client.V1PersistentVolumeSpec(capacity={'storage': '1Mi'}, + access_modes=['ReadWriteMany'], + storage_class_name=self.STORAGE_CLASS, + persistent_volume_reclaim_policy='Delete', + nfs=client.V1NFSVolumeSource(server=nfs_server_ip, path='/')) + super(NFSPv, self).__init__(metadata=client.V1ObjectMeta(name=name), spec=pv_spec) + + +class NFSPvc(client.V1PersistentVolumeClaim): + """ + Persistent Volume Claim - claim which attaches to NFS PV + """ + + def __init__(self, name): + resource_req = client.V1ResourceRequirements(requests={'storage': '1Mi'}) + pvc_spec = client.V1PersistentVolumeClaimSpec(access_modes=['ReadWriteMany'], + resources=resource_req, + storage_class_name=NFSPv.STORAGE_CLASS) + super(NFSPvc, self).__init__(metadata=client.V1ObjectMeta(name=name), spec=pvc_spec) + + +class KubernetesBackend(object): + DEFAULT_STORAGE_PVC_SIZE = '40G' + DEFAULT_STORAGE_PVC_NAME = 'storage' + NFS_PVC_NAME = 'nfs' + + def __init__(self): + self._check_env() + config.load_kube_config() + self.core_api = client.CoreV1Api() + self.batch_api = client.BatchV1Api() + self.apps_api = client.AppsV1Api() + + def run(self, experiment): + experiment = ExperimentRunOnKubernetes(**filter_only_attr(ExperimentRunOnKubernetes, experiment)) + image = DockerEngine().build_and_publish_image(experiment=experiment) + + self.configure_namespace(experiment) + self.configure_storage_for_project(experiment) + + for experiment in [experiment, ]: + job = Job(image, experiment) + job_name = job.to_dict()['metadata']['name'] + self._ensure_resource('job', experiment.namespace, job_name, job) + + def configure_namespace(self, experiment): + namespace = client.V1Namespace(metadata=client.V1ObjectMeta(name=experiment.namespace)) + self._ensure_resource('namespace', None, experiment.namespace, namespace) + + def configure_storage_for_project(self, experiment): + """Configures storage as in https://github.com/kubernetes/examples/tree/master/staging/volumes/nfs""" + nfs_svc_name = 'nfs-server' + nfs_pv_name = 'pvc-nfs-{}'.format(experiment.namespace) + + self._ensure_resource('pvc', experiment.namespace, self.DEFAULT_STORAGE_PVC_NAME, + StandardPVC(name=self.DEFAULT_STORAGE_PVC_NAME, + size=experiment.default_pvc_size or self.DEFAULT_STORAGE_PVC_SIZE, + access_mode="ReadWriteOnce")) + self._ensure_resource('dep', experiment.namespace, nfs_svc_name, + NFSDeployment(name=nfs_svc_name, storage_pvc=self.DEFAULT_STORAGE_PVC_NAME)) + _, nfs_svc = self._ensure_resource('svc', experiment.namespace, nfs_svc_name, NFSSvc(name=nfs_svc_name)) + + nfs_svc_ip = nfs_svc.spec.cluster_ip + _, nfs_pv = self._ensure_resource('pv', None, nfs_pv_name, NFSPv(nfs_pv_name, nfs_svc_ip)) + if nfs_pv.spec.nfs.server != nfs_svc_ip: + nfs_pv.spec.source.server = nfs_svc_ip + self.core_api.patch_persistent_volume(nfs_pv_name, nfs_pv) + LOGGER.warning('pv/{}: patched NFS server ip (current={})', nfs_pv_name, nfs_svc_ip) + self._ensure_resource('pvc', experiment.namespace, self.NFS_PVC_NAME, NFSPvc(name=self.NFS_PVC_NAME)) + + def _ensure_resource(self, resource_type, namespace, name, resource_body): + list_kwargs = {'field_selector': 'metadata.name={}'.format(name)} + create_kwargs = {'body': resource_body} + + if namespace: + list_kwargs['namespace'] = namespace + 
create_kwargs['namespace'] = namespace + + list_fun, create_fun = { + 'dep': (self.apps_api.list_namespaced_deployment, self.apps_api.create_namespaced_deployment), + 'job': (self.batch_api.list_namespaced_job, self.batch_api.create_namespaced_job), + 'namespace': (self.core_api.list_namespace, self.core_api.create_namespace), + 'pod': (self.core_api.list_namespaced_pod, self.core_api.create_namespaced_pod), + 'pv': (self.core_api.list_persistent_volume, self.core_api.create_persistent_volume), + 'pvc': (self.core_api.list_namespaced_persistent_volume_claim, + self.core_api.create_namespaced_persistent_volume_claim), + 'svc': (self.core_api.list_namespaced_service, self.core_api.create_namespaced_service), + }[resource_type] + response = list_fun(**list_kwargs) + if not response.items: + resource = create_fun(**create_kwargs) + LOGGER.debug('{}/{} created ({})'.format(resource_type, name, resource.to_str())) + else: + resource = response.items[0] + LOGGER.debug('{}/{} exists'.format(resource_type, name)) + return bool(response.items), resource + + @staticmethod + def _check_env(): + import subprocess + + try: + from subprocess import DEVNULL # py3k + except ImportError: + import os + DEVNULL = open(os.devnull, 'wb') + + result = 0 + for cmd, link in [('kubectl', 'https://kubernetes.io/docs/tasks/tools/install-kubectl'), + ('gcloud', 'https://cloud.google.com/sdk/docs/quickstarts')]: + try: + subprocess.call(cmd, stdout=DEVNULL, stderr=DEVNULL) + except OSError: + raise RuntimeError('Missing {} cmd. Please install and setup it first: {}'.format(cmd, link)) + return result diff --git a/mrunner/backends/slurm.py b/mrunner/backends/slurm.py new file mode 100644 index 0000000..4f11585 --- /dev/null +++ b/mrunner/backends/slurm.py @@ -0,0 +1,242 @@ +# -*- coding: utf-8 -*- +import logging +import tarfile +import tempfile + +import attr +from fabric.api import run as fabric_run +from fabric.context_managers import cd +from fabric.contrib.project import rsync_project +from fabric.state import env +from paramiko.agent import Agent +from path import Path + +from mrunner.experiment import COMMON_EXPERIMENT_MANDATORY_FIELDS, COMMON_EXPERIMENT_OPTIONAL_FIELDS +from mrunner.plgrid import PLGRID_USERNAME, PLGRID_HOST, PLGRID_TESTING_PARTITION +from mrunner.utils.namesgenerator import id_generator +from mrunner.utils.neptune import NEPTUNE_LOCAL_VERSION +from mrunner.utils.utils import GeneratedTemplateFile, get_paths_to_copy, make_attr_class, filter_only_attr + +LOGGER = logging.getLogger(__name__) +RECOMMENDED_CPUS_NUMBER = 4 +DEFAULT_SCRATCH_SUBDIR = 'mrunner_scratch' +SCRATCH_DIR_RANDOM_SUFIX_SIZE = 10 + + +def generate_experiment_scratch_dir(experiment): + experiment_subdir = '{name}_{random_id}'.format(name=experiment.name, + random_id=id_generator(SCRATCH_DIR_RANDOM_SUFIX_SIZE)) + project_subdir = generate_project_scratch_dir(experiment) + return project_subdir / experiment_subdir + + +def generate_project_scratch_dir(experiment): + project_subdir = '{user_id}_{project}'.format(user_id=experiment.user_id, project=experiment.project) + scratch_subdir = (experiment.scratch_subdir or DEFAULT_SCRATCH_SUBDIR) + return Path(experiment._slurm_scratch_dir) / scratch_subdir / project_subdir + + +EXPERIMENT_MANDATORY_FIELDS = [ + ('venv', dict()), # path to virtual environment + ('user_id', dict()), # used to generate project scratch dir + ('_slurm_scratch_dir', dict()) # obtained from cluster $SCRATCH env +] + +EXPERIMENT_OPTIONAL_FIELDS = [ + # by default use plgrid configuration + ('slurm_url', 
dict(default='{}@{}'.format(PLGRID_USERNAME, PLGRID_HOST))), + ('partition', dict(default=PLGRID_TESTING_PARTITION)), + + # scratch directory related + ('scratch_subdir', dict(default='')), + ('project_scratch_dir', dict(default=attr.Factory(generate_project_scratch_dir, takes_self=True))), + ('experiment_scratch_dir', dict(default=attr.Factory(generate_experiment_scratch_dir, takes_self=True))), + + # run time related + ('account', dict(default=None)), + ('log_output_path', dict(default=None)), + ('time', dict(default=None)), + ('ntasks', dict(default=None)), + ('modules_to_load', dict(default=attr.Factory(list), type=list)), + ('after_module_load_cmd', dict(default='')), + ('cmd_type', dict(default='srun')), +] + +EXPERIMENT_FIELDS = COMMON_EXPERIMENT_MANDATORY_FIELDS + EXPERIMENT_MANDATORY_FIELDS + \ + COMMON_EXPERIMENT_OPTIONAL_FIELDS + EXPERIMENT_OPTIONAL_FIELDS + +ExperimentRunOnSlurm = make_attr_class('ExperimentRunOnSlurm', EXPERIMENT_FIELDS, frozen=True) + + +class ExperimentScript(GeneratedTemplateFile): + DEFAULT_SLURM_EXPERIMENT_SCRIPT_TEMPLATE = 'slurm_experiment.sh.jinja2' + + def __init__(self, experiment): + # merge env vars + env = experiment.cmd.env.copy() if experiment.cmd else {} + env.update(experiment.env) + env = {k: str(v) for k, v in env.items()} + + if NEPTUNE_LOCAL_VERSION.version[0] == 2: + env['HOME'] = '$(pwd)' # neptune shall loads token local copy of .neptune_tokens|.neptune/tokens + + experiment = attr.evolve(experiment, env=env, experiment_scratch_dir=experiment.experiment_scratch_dir) + + super(ExperimentScript, self).__init__(template_filename=self.DEFAULT_SLURM_EXPERIMENT_SCRIPT_TEMPLATE, + experiment=experiment) + self.experiment = experiment + self.path.chmod('a+x') + + @property + def script_name(self): + e = self.experiment + return '{}.sh'.format(e.experiment_scratch_dir.relpath(e.project_scratch_dir)) + + +class SlurmWrappersCmd(object): + + def __init__(self, experiment, script_path): + self._experiment = experiment + self._script_path = script_path + + @property + def command(self): + # see: https://slurm.schedmd.com/srun.html + # see: https://slurm.schedmd.com/sbatch.html + + if not self._cmd: + raise RuntimeError('Instantiate one of SlurmWrapperCmd subclasses') + + cmd_items = [self._cmd] + + def _extend_cmd_items(cmd_items, option, data_key, default=None): + value = self._getattr(data_key) + if value: + cmd_items += [option, str(value)] + elif default: + cmd_items += [option, default] + + default_log_path = self._experiment.experiment_scratch_dir / 'slurm.log' if self._cmd == 'sbatch' else None + _extend_cmd_items(cmd_items, '-A', 'account') + _extend_cmd_items(cmd_items, '-o', 'log_output_path', default_log_path) # output + _extend_cmd_items(cmd_items, '-p', 'partition') + _extend_cmd_items(cmd_items, '-t', 'time') + + cmd_items += self._resources_items() + cmd_items += [self._script_path] + + return ' '.join(cmd_items) + + def _getattr(self, key): + return getattr(self, key, getattr(self._experiment, key) or None) + + def _resources_items(self): + """mapping from mrunner notation into slurm""" + cmd_items = [] + mrunner_resources = self._getattr('resources') + for resource_type, resource_qty in mrunner_resources.items(): + if resource_type == 'cpu': + ntasks = int(self._getattr('ntasks') or 1) + cores_per_task = int(int(resource_qty) / ntasks) + cmd_items += ['-c', str(cores_per_task)] + + if ntasks > 1: + cmd_items += ['-n', str(ntasks)] + LOGGER.debug('Running {} tasks'.format(ntasks)) + total_cpus = cores_per_task * ntasks + if 
total_cpus != int(resource_qty): + LOGGER.warning('Will request {} CPU instead of {}'.format(total_cpus, int(resource_qty))) + if total_cpus > RECOMMENDED_CPUS_NUMBER: + LOGGER.warning('Requested number of CPU is higher than recommended value {}/4'.format( + total_cpus, RECOMMENDED_CPUS_NUMBER)) + LOGGER.debug('Using {}/{} CPU cores per_task/total'.format(cores_per_task, total_cpus)) + elif resource_type == 'gpu': + cmd_items += ['--gres', 'gpu:{}'.format(resource_qty)] + LOGGER.debug('Using {} gpu'.format(resource_qty)) + elif resource_type == 'mem': + cmd_items += ['--mem', str(resource_qty)] + LOGGER.debug('Using {} memory'.format(resource_qty)) + else: + raise ValueError('Unsupported resource request: {}={}'.format(resource_type, resource_qty)) + + return cmd_items + + +class SBatchWrapperCmd(SlurmWrappersCmd): + + def __init__(self, experiment, script_path, **kwargs): + super(SBatchWrapperCmd, self).__init__(experiment, script_path, **kwargs) + self._cmd = 'sbatch' + + +class SRunWrapperCmd(SlurmWrappersCmd): + + def __init__(self, experiment, script_path, **kwargs): + super(SRunWrapperCmd, self).__init__(experiment, script_path, **kwargs) + self._cmd = 'srun' + + +class SlurmBackend(object): + + def run(self, experiment): + assert Agent().get_keys(), "Add your private key to ssh agent using 'ssh-add' command" + + # configure fabric + slurm_url = experiment.pop('slurm_url', '{}@{}'.format(PLGRID_USERNAME, PLGRID_HOST)) + env['host_string'] = slurm_url + + slurm_scratch_dir = Path(self._fabric_run('echo $SCRATCH')) + experiment = ExperimentRunOnSlurm(slurm_scratch_dir=slurm_scratch_dir, slurm_url=slurm_url, + **filter_only_attr(ExperimentRunOnSlurm, experiment)) + LOGGER.debug('Configuration: {}'.format(experiment)) + + self.ensure_directories(experiment) + script_path = self.deploy_code(experiment) + SCmd = {'sbatch': SBatchWrapperCmd, 'srun': SRunWrapperCmd}[experiment.cmd_type] + cmd = SCmd(experiment=experiment, script_path=script_path) + self._fabric_run(cmd.command) + + def ensure_directories(self, experiment): + self._ensure_dir(experiment.experiment_scratch_dir) + self._ensure_dir(experiment.storage_dir) + + def deploy_code(self, experiment): + paths_to_dump = get_paths_to_copy(exclude=experiment.exclude, paths_to_copy=experiment.paths_to_copy) + with tempfile.NamedTemporaryFile(suffix='.tar.gz') as temp_file: + # archive all files + with tarfile.open(temp_file.name, 'w:gz') as tar_file: + for p in paths_to_dump: + LOGGER.debug('Adding "{}" to deployment archive'.format(p.rel_remote_path)) + tar_file.add(p.local_path, arcname=p.rel_remote_path) + if experiment.neptune_token_files: + neptune_token_path = experiment.neptune_token_files[0] + rel_local_path = Path('.').relpathto(neptune_token_path) + remote_path = '/'.join([p for p in rel_local_path.split('/') if p and p != '..']) + LOGGER.debug('Adding "{}" to deployment archive'.format(remote_path)) + tar_file.add(neptune_token_path, arcname=remote_path) + + # upload archive to cluster and extract + self._put(temp_file.name, experiment.experiment_scratch_dir) + with cd(experiment.experiment_scratch_dir): + archive_remote_path = experiment.experiment_scratch_dir / Path(temp_file.name).name + self._fabric_run('tar xf {tar_filename} && rm {tar_filename}'.format(tar_filename=archive_remote_path)) + + # create and upload experiment script + script = ExperimentScript(experiment) + remote_script_path = experiment.project_scratch_dir / script.script_name + self._put(script.path, remote_script_path) + + return remote_script_path + + 
@staticmethod
+    def _put(local_path, remote_path, quiet=True):
+        quiet_kwargs = {'ssh_opts': '-q', 'extra_opts': '-q'} if quiet else {}
+        rsync_project(remote_path, local_path, **quiet_kwargs)
+
+    @staticmethod
+    def _ensure_dir(directory_path):
+        SlurmBackend._fabric_run('mkdir -p {path}'.format(path=directory_path))
+
+    @staticmethod
+    def _fabric_run(cmd):
+        return fabric_run(cmd)
diff --git a/mrunner/cli/__init__.py b/mrunner/cli/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mrunner/cli/config.py b/mrunner/cli/config.py
new file mode 100644
index 0000000..f49e518
--- /dev/null
+++ b/mrunner/cli/config.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+from copy import deepcopy
+
+import attr
+import click
+import six
+import yaml
+
+from mrunner.utils.utils import make_attr_class
+
+AVAILABLE_RESOURCES = ['cpu', 'mem', 'gpu', 'tpu']
+
+Config = make_attr_class('Config', [
+    ('contexts', dict(default={})),
+    ('current_context', dict(default='')),
+])
+
+
+class ConfigParser(object):
+
+    def __init__(self, file_path):
+        self._config_path = file_path
+
+    def load(self):
+        config = Config()
+        if self._config_path.exists():
+            with self._config_path.open('r') as conf_file:
+                config = Config(**yaml.load(conf_file) or {})
+
+        return config
+
+    def save(self, config):
+        from six import StringIO
+
+        # first dump config to memory
+        output = StringIO()
+        yaml.safe_dump(attr.asdict(config), output, default_flow_style=False)
+        yaml_payload = six.u(output.getvalue())
+
+        # then save to file
+        self._config_path.abspath().parent.makedirs_p()
+        with self._config_path.open('w') as config_file:
+            config_file.write(yaml_payload)
+
+
+@click.group(invoke_without_command=True)
+@click.pass_context
+def context(ctx):
+    """Manage remote contexts"""
+    if not ctx.invoked_subcommand:
+        config = ctx.obj['config']
+        for context_name in config.contexts:
+            # only the configured current context is marked as active
+            # (not a context whose name was merely passed on the command line)
+            active = config.current_context and context_name == config.current_context
+            click.echo('{}{}'.format(context_name, active and '\t(active)' or ''))
+
+
+@context.command(name='add')
+@click.option('--name', required=True, help='Remote context name')
+@click.option('--backend_type', required=True,
+              type=click.Choice(['kubernetes', 'slurm']), help='Type of backend')
+@click.option('--storage', default=None, help='Storage path to which neptune will copy source code')
+@click.option('--resources', default=None,
+              help='Resource to request (ex. 
"mem=2G cpu=4"; available types: {})'.format( + ', '.join(AVAILABLE_RESOURCES))) +@click.option('--registry_url', default=None, help='URL to docker container\'s registry') +@click.option('--neptune/--no-neptune', default=True, help='Use neptune') +@click.pass_context +def context_add(ctx, name, backend_type, storage, resources, registry_url, neptune): + """Add new context""" + config = ctx.obj['config'] + config_path = ctx.obj['config_path'] + + resources = dict([r.split('=') for r in resources.split(' ')]) if resources else {} + if set(resources) - set(AVAILABLE_RESOURCES): + unknown_resources = set(resources) - set(AVAILABLE_RESOURCES) + raise click.ClickException('Unknown resource type: {}'.format(','.join(unknown_resources))) + context = {'context_name': name, 'backend_type': backend_type, 'neptune': neptune, 'storage_dir': storage, + 'registry_url': registry_url, 'resources': resources} + context = {k: v for k, v in context.items() if v} + try: + if name in config.contexts: + raise ValueError('Context "{}" already exists'.format(name)) + config.contexts[name] = context + if len(config.contexts) == 1: + config.current_context = name + except ValueError as e: + raise click.ClickException(e) + ConfigParser(config_path).save(config) + + +@context.command(name='edit') +@click.argument('name') +@click.pass_context +def context_edit(ctx, name): + """Edit context""" + from six import StringIO + + config = ctx.obj['config'] + config_path = ctx.obj['config_path'] + + try: + if name not in config.contexts: + raise ValueError('Context "{}" is missing'.format(name)) + + context = config.contexts[name] + text = yaml.safe_dump(context, default_flow_style=False) + + updated_text = click.edit(text) + if updated_text: + updated_context = updated_text and yaml.load(StringIO(updated_text)) or context + + if updated_context['context_name'] != context['context_name']: + del config.contexts[context['context_name']] + if config.current_context == context['context_name']: + config.current_context = updated_context['context_name'] + + config.contexts[updated_context['context_name']] = updated_context + else: + click.echo('No changes in context') + except ValueError as e: + raise click.ClickException(e) + except yaml.parser.ParserError as e: + raise click.ClickException('Could not parser YAML') + ConfigParser(config_path).save(config) + + +@context.command(name='delete') +@click.argument("name") +@click.pass_context +def context_delete(ctx, name): + """Delete context""" + config = ctx.obj['config'] + config_path = ctx.obj['config_path'] + + try: + if name not in config.contexts: + raise ValueError('Context "{}" is missing'.format(name)) + del config.contexts[name] + if name == config.current_context: + config.current_context = '' + except ValueError as e: + raise click.ClickException(e) + ConfigParser(config_path).save(config) + + +@context.command(name='set-active') +@click.argument("name") +@click.pass_context +def context_set_active(ctx, name): + """Set context as active""" + config = ctx.obj['config'] + config_path = ctx.obj['config_path'] + + try: + if name not in config.contexts: + raise ValueError('Context "{}" is missing'.format(name)) + config.current_context = name + except ValueError as e: + raise click.ClickException(e) + ConfigParser(config_path).save(config) + + +@context.command(name='copy') +@click.argument("src") +@click.argument("dst") +@click.pass_context +def context_copy(ctx, src, dst): + """Make copy of context""" + config = ctx.obj['config'] + config_path = ctx.obj['config_path'] + + try: 
+ if src not in config.contexts: + raise ValueError('Context "{}" is missing'.format(src)) + if dst in config.contexts: + raise ValueError('Context "{}" already set (delete it first)'.format(dst)) + config.contexts[dst] = deepcopy(config.contexts[src]) + except ValueError as e: + raise click.ClickException(e) + ConfigParser(config_path).save(config) diff --git a/mrunner/cli/deprecated/__init__.py b/mrunner/cli/deprecated/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mrunner/command_gen.py b/mrunner/cli/deprecated/command_gen_cli.py old mode 100644 new mode 100755 similarity index 76% rename from mrunner/command_gen.py rename to mrunner/cli/deprecated/command_gen_cli.py index c7956d3..0bf8917 --- a/mrunner/command_gen.py +++ b/mrunner/cli/deprecated/command_gen_cli.py @@ -1,10 +1,13 @@ - +#!/usr/bin/env python +# -*- coding: utf-8 -*- import argparse import random import re import sys +from mrunner.utils.utils import parse_argv + # NOTE(maciek): Code is copied from jobman: https://github.com/crmne/jobman def generate_combination(repl): if repl == []: @@ -65,29 +68,14 @@ def my_generate_commands(argv, repeat=1): return result - -def create_parser(): - parser = argparse.ArgumentParser(description='TODO', fromfile_prefix_chars='@') - parser.add_argument('--repeat', type=int, default=1, help='TODO') - parser.add_argument('--shuffle', action='store_true', help='TODO') - parser.add_argument('--limit', type=int, help='TODO') - return parser - def main(): - argv = sys.argv[1:] - try: - where_is_split = argv.index('--') - control_args = argv[:where_is_split] - proper_args = argv[where_is_split + 1:] - except ValueError: - control_args = [] - proper_args = argv + parser = argparse.ArgumentParser(description='Generate commands', fromfile_prefix_chars='@') + parser.add_argument('--repeat', type=int, default=1, help='Repeat commands') + parser.add_argument('--shuffle', action='store_true', help='Shuffle commands') + parser.add_argument('--limit', type=int, help='Limit number of commands') - # print control_args, proper_args - parser = create_parser() - args = parser.parse_args(control_args) - proper_args = [1] + proper_args + args, proper_args = parse_argv(parser, sys.argv) commands = my_generate_commands(proper_args, repeat=args.repeat) if args.shuffle: @@ -98,10 +86,9 @@ def main(): if len(commands) == 0: commands = [' '.join(argv)] - print '\n'.join(commands) + print('\n'.join(commands)) return 0 + if __name__ == '__main__': sys.exit(main()) - - diff --git a/mrunner/cli/deprecated/local_cli.py b/mrunner/cli/deprecated/local_cli.py new file mode 100755 index 0000000..dafbc0e --- /dev/null +++ b/mrunner/cli/deprecated/local_cli.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import argparse +import logging +import os +import subprocess +import sys + +import yaml + +from mrunner.cmd import Cmd +from mrunner.utils.neptune import NeptuneWrapperCmd +from mrunner.utils.utils import get_experiment_dirname, parse_argv + +LOGGER = logging.getLogger(__name__) + + +def run_task_locally(task, env_update={}): + env = os.environ.copy() + env.update(task.env) + env.update({k: str(v) for k, v in env_update.items() if v}) + + LOGGER.info("Running job locally") + LOGGER.info("cmd: {}".format(task.command)) + LOGGER.info("env: {}".format(yaml.dump(env))) + + try: + process = subprocess.Popen(task.command, shell=True, env=env) + return process.wait() + except KeyboardInterrupt: + LOGGER.warning("Interrupted with keyboard") + return 0 + + +def run_without_neptune(mrunner_args, 
rest_argv): + exp_dir_path = os.path.join(mrunner_args.storage_url, get_experiment_dirname()) + if mrunner_args.docker_image: + raise NotImplementedError() + task = Cmd(rest_argv, exp_dir_path=exp_dir_path) + return run_task_locally(task, env_update={'PYTHONPATH': mrunner_args.pythonpath}) + + +def run_with_neptune(mrunner_args, rest_argv): + if mrunner_args.config is None: + raise ValueError('config is required while running task with neptune') + if mrunner_args.storage_url is None: + raise ValueError('storage_url is required while running task with neptune') + task = NeptuneWrapperCmd(rest_argv, mrunner_args.config, + neptune_storage=mrunner_args.storage_url, + additional_tags=mrunner_args.tags, + paths_to_dump=mrunner_args.paths_to_dump, + docker_image=mrunner_args.docker_image) + + return run_task_locally(task, env_update={'PYTHONPATH': mrunner_args.pythonpath}) + + +def main(): + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser(description='Run experiment on local mahine with optional neptune support', + fromfile_prefix_chars='@') + parser.add_argument('--storage_url', type=str, + help="Path to directory where neptune CLI will store data for experiment provenance") + + # TODO: [PZ] think neptune parameter is redundant (may assume that + parser.add_argument('--neptune', action='store_true', help='Enable neptune for experiment') + parser.add_argument('--config', type=str, help='Path to experiment neptune config') + parser.add_argument('--tags', default=[], type=str, nargs='+', help='Additional (to those from experiment neptune config) tags ' + 'which will be added to neptune call') + parser.add_argument('--docker_image', type=str, help='Docker image name to use while running experiment' + 'with neptune') + parser.add_argument('--paths_to_dump', type=str, nargs='+', + help="List of files or dirs which will be copied by neptune to storage") + parser.add_argument('--pythonpath', type=str, help='Additional paths to be added to PYTHONPATH') + + mrunner_args, rest_argv = parse_argv(parser, sys.argv) + run = {True: run_with_neptune, False: run_without_neptune}[mrunner_args.neptune] + sys.exit(run(mrunner_args, rest_argv)) + + +if __name__ == '__main__': + main() diff --git a/mrunner/cli/mrunner_cli.py b/mrunner/cli/mrunner_cli.py new file mode 100755 index 0000000..74d9f47 --- /dev/null +++ b/mrunner/cli/mrunner_cli.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import logging + +import click +from path import Path + +from mrunner.backends.k8s import KubernetesBackend +from mrunner.backends.slurm import SlurmBackend +from mrunner.cli.config import ConfigParser, context as context_cli +from mrunner.experiment import generate_experiments, get_experiments_spec_handle +from mrunner.utils.neptune import NeptuneWrapperCmd + +LOGGER = logging.getLogger(__name__) + + +def get_default_config_path(ctx): + default_config_file_name = 'config.yaml' + + app_name = Path(ctx.command_path).stem + app_dir = Path(click.get_app_dir(app_name)) + return app_dir / default_config_file_name + + +@click.group() +@click.option('--debug/--no-debug', default=False, help='Enable debug messages') +@click.option('--config', default=None, type=click.Path(dir_okay=False), + help='Path to mrunner yaml configuration') +@click.option('--context', default=None, help='Name of remote context to use ' + '(if not provided, "contexts.current" conf key will be used)') +@click.pass_context +def cli(ctx, debug, config, context): + """Deploy experiments on computation cluster""" + + 
log_tags_to_suppress = ['pykwalify', 'docker', 'kubernetes', 'paramiko', 'requests.packages'] + logging.basicConfig(level=debug and logging.DEBUG or logging.INFO) + for tag in log_tags_to_suppress: + logging.getLogger(tag).setLevel(logging.ERROR) + + # read configuration + config_path = Path(config or get_default_config_path(ctx)) + LOGGER.debug('Using {} as mrunner config'.format(config_path)) + config = ConfigParser(config_path).load() + + cmd_require_context = ctx.invoked_subcommand not in ['context'] + if cmd_require_context: + context_name = context or config.current_context or None + if not context_name: + raise click.ClickException( + 'Provide context name (use CLI "--context" option or use "mrunner context set-active" command)') + if context_name not in config.contexts: + raise click.ClickException( + 'Could not find predefined context: "{}". Use context add command.'.format(context_name)) + + try: + context = config.contexts[context_name] + for k in ['neptune', 'storage_dir', 'backend_type', 'context_name']: + if k not in context: + raise AttributeError('Missing required "{}" context key'.format(k)) + except KeyError: + raise click.ClickException('Unknown context {}'.format(context_name)) + except AttributeError as e: + raise click.ClickException(e) + + ctx.obj = {'config_path': config_path, + 'config': config, + 'context': context} + + +@cli.command() +@click.option('--neptune', type=click.Path(), help="Path to neptune experiment config") +@click.option('--spec', default='spec', help="Name of function providing experiment specification") +@click.option('--tags', multiple=True, help='Additional tags') +@click.option('--requirements_file', type=click.Path(), help='Path to requirements file') +@click.option('--base_image', help='Base docker image used in experiment') +@click.argument('script') +@click.argument('params', nargs=-1) +@click.pass_context +def run(ctx, neptune, spec, tags, requirements_file, base_image, script, params): + """Run experiment""" + + context = ctx.obj['context'] + + # validate options and arguments + requirements = requirements_file and [req.strip() for req in Path(requirements_file).open('r')] or [] + if context['backend_type'] == 'kubernetes' and not base_image: + raise click.ClickException('Provide docker base image') + if context['backend_type'] == 'kubernetes' and not requirements_file: + raise click.ClickException('Provide requirements.txt file') + script_has_spec = get_experiments_spec_handle(script, spec) is not None + neptune_support = context.get('neptune', None) or neptune + if neptune_support and not neptune and not script_has_spec: + raise click.ClickException('Neptune support is enabled in context ' + 'but no neptune config or python experiment descriptor provided') + if neptune and script_has_spec: + raise click.ClickException('Provide only one of: neptune config or python experiment descriptor') + + if not neptune_support: + # TODO: implement it if possible + raise click.ClickException('Currentlu doesn\'t support experiments without neptune') + + neptune_dir = None + try: + # prepare neptune directory in case if neptune yamls shall be generated + if neptune_support and not neptune: + script_path = Path(script) + neptune_dir = script_path.parent / 'neptune_{}'.format(script_path.stem) + neptune_dir.makedirs_p() + + for neptune_path, experiment in generate_experiments(script, neptune, context, spec=spec, + neptune_dir=neptune_dir): + + experiment.update({'base_image': base_image, 'requirements': requirements}) + + if neptune_support: + script = 
experiment.pop('script') + cmd = ' '.join([script] + list(params)) + # tags from neptune.yaml will be extracted by neptune + additional_tags = context.get('tags', []) + list(tags) + cmd = NeptuneWrapperCmd(cmd=cmd, experiment_config_path=neptune_path, + neptune_storage=context['storage_dir'], + paths_to_dump=None, + additional_tags=additional_tags) + experiment['cmd'] = cmd + experiment.setdefault('paths_to_copy', []) + for possible_token_path in ['~/.neptune_tokens/token', '~/.neptune/tokens/token']: + neptune_path = Path(possible_token_path).expanduser().abspath() + if neptune_path.exists(): + neptune_token_files = experiment.setdefault('neptune_token_files', []) + neptune_token_files.append(str(neptune_path)) + + assert len(experiment.get('neptune_token_files', [])) < 2, \ + 'You have multiple neptune tokens ({}); remove obsolete'.format( + ', '.join(experiment['neptune_token_files'])) + else: + # TODO: implement no neptune version + # TODO: for sbatch set log path into something like os.path.join(resource_dir_path, "job_logs.txt") + raise click.ClickException('Not implemented yet') + + run_kwargs = {'experiment': experiment} + backend = { + 'kubernetes': KubernetesBackend, + 'slurm': SlurmBackend + }[experiment['backend_type']]() + # TODO: add calling experiments in parallel + backend.run(**run_kwargs) + finally: + if neptune_dir: + neptune_dir.rmtree_p() + + +cli.add_command(context_cli) + +if __name__ == '__main__': + cli() diff --git a/mrunner/cmd.py b/mrunner/cmd.py new file mode 100644 index 0000000..34bd5d1 --- /dev/null +++ b/mrunner/cmd.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +import logging + +LOGGER = logging.getLogger(__name__) + + +class Cmd(object): + + def __init__(self, cmd, exp_dir_path=None): + self._cmd = cmd + self._exp_dir_path = exp_dir_path + + @property + def command(self): + return ' '.join(self._cmd) + + @property + def env(self): + return {'MRUNNER_EXP_DIR_PATH': self._exp_dir_path, 'MRUNNER_UNDER_NEPTUNE': '0'} + + +class YamlCmd(object): + + def __init__(self, cmd, experiment_config_path, neptune_storage=None, additional_tags=None, paths_to_dump=None, + exp_dir_path=None): + self._cmd = cmd + self._experiment_dir_path = exp_dir_path + self._experiment_config_path = experiment_config_path + self._additional_tags = additional_tags + self._storage = neptune_storage + self._paths_to_dump = paths_to_dump + + @property + def command(self): + base_argv = self._cmd + ['--config', self._experiment_config_path] + tags_argv = ['--tags'] + self._additional_tags if self._additional_tags else [] + dump_argv = ['--paths-to-copy'] + self._paths_to_dump if self._paths_to_dump else [] + storage_arv = ['--storage', self._storage] if self._storage else [] + + cmd = base_argv + storage_arv + tags_argv + dump_argv + ['--'] + return ' '.join(cmd) + + @property + def env(self): + return {'MRUNNER_EXP_DIR_PATH': self._experiment_dir_path, 'MRUNNER_UNDER_NEPTUNE': '0'} diff --git a/mrunner/command.py b/mrunner/command.py deleted file mode 100644 index 8b13789..0000000 --- a/mrunner/command.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/mrunner/experiment.py b/mrunner/experiment.py new file mode 100644 index 0000000..c5a8c47 --- /dev/null +++ b/mrunner/experiment.py @@ -0,0 +1,172 @@ +# -*- coding: utf-8 -*- +import logging +import random +import re +import warnings + +import attr +import six +from path import Path + +from mrunner.utils.namesgenerator import id_generator, get_random_name +from mrunner.utils.neptune import NeptuneConfigFileV1, NeptuneConfigFileV2, 
load_neptune_config, NEPTUNE_LOCAL_VERSION + +LOGGER = logging.getLogger(__name__) + +COMMON_EXPERIMENT_MANDATORY_FIELDS = [ + ('backend_type', dict()), + ('name', dict()), + ('storage_dir', dict()), + ('cmd', dict()) +] + +COMMON_EXPERIMENT_OPTIONAL_FIELDS = [ + ('project', dict(default='sandbox')), + ('requirements', dict(default=attr.Factory(list), type=list)), + ('exclude', dict(default=None, type=list)), + ('paths_to_copy', dict(default=attr.Factory(list), type=list)), + ('env', dict(default=attr.Factory(dict), type=dict)), + ('resources', dict(default=attr.Factory(dict), type=dict)), + ('cwd', dict(default=attr.Factory(Path.getcwd))), + ('neptune_token_files', dict(default=attr.Factory(list), type=list)), +] + + +class Experiment(object): + + def __init__(self, project, name, script, parameters, **kwargs): + def _get_arg(k, sep=' '): + list_type = ['tags', 'paths-to-copy', 'exclude', 'properties', 'python_path'] + v = kwargs.pop(k, [] if k in list_type else '') + return v.split(sep) if isinstance(v, six.string_types) and k in list_type else v + + self.script = script + self.project = project + self.name = name[:16] + self.parameters = parameters + + self.env = kwargs.pop('env', {}) + self.env['PYTHONPATH'] = ':'.join(['$PYTHONPATH', ] + _get_arg('python_path', sep=':')) + + for k in list(kwargs.keys()): + self.__setattr__(k, _get_arg(k)) + + def to_dict(self): + return self.__dict__ + + +class NeptuneExperiment(object): + """Use Experiment class instead""" + + def __init__(self, what, tags, pythonpath, paths_to_dump, name, project_name, parameters, + random_id=None, description=None): + warnings.warn('User Experiment class instead', category=DeprecationWarning, stacklevel=2) + self.what = what + self.tags = tags + # self.pythonpath = pythonpath + self.env = {'PYTHONPATH': '$PYTHONPATH:{}'.format(pythonpath)} + self.paths_to_dump = paths_to_dump.split(' ') if isinstance(paths_to_dump, str) else paths_to_dump + self.name = name[:16] + self.project_name = project_name + self.parameters = parameters + self.description = description or name + self.random_id = "{}".format(random.randint(100000, 999999)) if not random_id else random_id + + def to_dict(self): + attribute_map = { + 'what': 'script', 'project_name': 'project', 'paths_to_dump': 'paths_to_copy' + } + return {attribute_map.get(k, k): v for k, v in self.__dict__.items()} + + +def merge_experiment_parameters(cli_kwargs, neptune_config, context): + config = context.copy() + for k, v in list(neptune_config.items()) + list(cli_kwargs.items()): + if k not in config: + LOGGER.debug('New config["{}"]: {}'.format(k, v)) + config[k] = v + else: + if isinstance(config[k], (list, tuple)): + LOGGER.debug('Extending config["{}"]: {} with {}'.format(k, config[k], v)) + if isinstance(v, (list, tuple)): + config[k].extend(v) + else: + config[k].append(v) + else: + LOGGER.debug('Overwriting config["{}"]: {} -> {}'.format(k, config[k], v)) + config[k] = v + return config + + +def _load_py_experiment_and_generate_neptune_yamls(script, spec, *, neptune_dir, neptune_version=None): + LOGGER.info('Found {} function in {}; will use it as experiments configuration generator'.format(spec, script)) + neptune_support = bool(neptune_dir) + if neptune_support: + if neptune_version and NEPTUNE_LOCAL_VERSION < neptune_version: + # this shall match because we'll later use local neptune to parse them + raise RuntimeError('Current neptune major version: {}, doesn\'t match forced one: {}'.format( + NEPTUNE_LOCAL_VERSION, neptune_version + )) + NeptuneConfigFile = 
{'1': NeptuneConfigFileV1, '2': NeptuneConfigFileV2}[str(NEPTUNE_LOCAL_VERSION.version[0])] + + def _dump_to_neptune(cli_params, neptune_dir): + neptune_path = None + while not neptune_path or neptune_path.exists(): + neptune_path = neptune_dir / 'neptune-{}-{}.yaml'.format(get_random_name(), id_generator(4)) + with neptune_path.open('w') as neptune_file: + NeptuneConfigFile(**cli_params).dump(neptune_file) + LOGGER.debug('Generated neptune file {}: {}'.format(neptune_path, Path(neptune_path).text())) + return neptune_path + + spec_fun = get_experiments_spec_handle(script, spec) + for experiment in spec_fun(): + if isinstance(experiment, dict): + experiment = NeptuneExperiment(**experiment) + elif not hasattr(experiment, 'to_dict'): + experiment = NeptuneExperiment(**experiment.__dict__) + cli_params = experiment.to_dict() + + neptune_path = _dump_to_neptune(cli_params, neptune_dir) if neptune_support else None + + # TODO: possibly part of this shall not be removed on experiments without neptune support + cli_params.pop('parameters', None) + cli_params.pop('project', None) + cli_params.pop('description', None) + cli_params.pop('tags', None) + cli_params.pop('random_id', None) + + yield (neptune_path, cli_params) + + +def generate_experiments(script, neptune, context, *, spec='spec', + neptune_dir=None, neptune_version=None, **cli_kwargs): + spec_fun = get_experiments_spec_handle(script, spec) + if spec_fun: + experiments = _load_py_experiment_and_generate_neptune_yamls(script, spec=spec, + neptune_dir=neptune_dir, + neptune_version=neptune_version) + else: + neptune_config = load_neptune_config(neptune) + experiments = [(neptune, {'script': script, 'name': neptune_config['name']})] + + for neptune_path, cli_kwargs_ in experiments: + cli_kwargs_['name'] = re.sub(r'[ .,_-]+', '-', cli_kwargs_['name'].lower()) + cli_kwargs_['cwd'] = Path.getcwd() + + # neptune_config = load_neptune_config(neptune_path) if neptune_path else {} + # del neptune_config['storage'] + + neptune_config = {} + cli_kwargs_.update(**cli_kwargs) + experiment = merge_experiment_parameters(cli_kwargs_, neptune_config, context) + + yield neptune_path, experiment + + +def get_experiments_spec_handle(script, spec): + vars = {} + exec(open(script).read(), vars) + spec_fun = vars.get(spec, None) + if not callable(spec_fun): + spec_fun = None + return spec_fun diff --git a/mrunner/kubernetes_backend.py b/mrunner/kubernetes_backend.py deleted file mode 100644 index 7ab0681..0000000 --- a/mrunner/kubernetes_backend.py +++ /dev/null @@ -1,200 +0,0 @@ - -import io -import json -import pprint -import subprocess -from collections import namedtuple - -import yaml - -from mrunner.utils import id_generator - -pod_template_yaml = ''' -kind: Pod -apiVersion: v1 -metadata: - name: gpu-pod2 -spec: - containers: - - name: gpu-container - image: gcr.io/tensorflow/tensorflow:latest-gpu - imagePullPolicy: Always - resources: - requests: - alpha.kubernetes.io/nvidia-gpu: 1 - limits: - alpha.kubernetes.io/nvidia-gpu: 1 - - volumeMounts: - - name: nvidia-driver-384 - mountPath: /usr/local/nvidia - readOnly: true - - name: libcuda-so - mountPath: /usr/lib/x86_64-linux-gnu/libcuda.so - - name: libcuda-so-1 - mountPath: /usr/lib/x86_64-linux-gnu/libcuda.so.1 - - name: libcuda-so-384 - mountPath: /usr/lib/x86_64-linux-gnu/libcuda.so.384.59 - - imagePullSecrets: - - name: regsecret - - restartPolicy: Never - volumes: - - name: nvidia-driver-384 - hostPath: - path: /var/lib/nvidia-docker/volumes/nvidia_driver/384.59 - - name: libcuda-so - hostPath: - 
path: /usr/lib/x86_64-linux-gnu/libcuda.so - - name: libcuda-so-1 - hostPath: - path: /usr/lib/x86_64-linux-gnu/libcuda.so.1 - - name: libcuda-so-384 - hostPath: - path: /usr/lib/x86_64-linux-gnu/libcuda.so.384.59 -''' - -pod_template2_yaml = ''' -apiVersion: v1 -metadata: - name: gpu-pod2 -spec: - containers: - - name: gpu-container - image: gcr.io/tensorflow/tensorflow:latest-gpu - imagePullPolicy: Always - command: ["/bin/bash"] - args: ["-c", "for i in {1..100}; sleep 1; echo $i; done"] - resources: - requests: - alpha.kubernetes.io/nvidia-gpu: 1 - limits: - alpha.kubernetes.io/nvidia-gpu: 1 - - restartPolicy: Never -''' -KubeVolumeMount = namedtuple('KubeVolumeMount', ['mountPath', 'name', 'hostPath']) - - -class KubernetesBackend(object): - def __init__(self, kube_config=None): - self.kube_config = kube_config - - @classmethod - def generate_pod_dict(cls, pod_name, image, command, args, nr_gpus, env=None): - pod_dict = yaml.load(pod_template_yaml) - pod_dict['metadata']['name'] = pod_name - if command is not None: - pod_dict['spec']['containers'][0]['command'] = command - - if args is not None: - pod_dict['spec']['containers'][0]['args'] = args - - pod_dict['spec']['containers'][0]['resources']['limits']['alpha.kubernetes.io/nvidia-gpu'] = nr_gpus - pod_dict['spec']['containers'][0]['resources']['requests']['alpha.kubernetes.io/nvidia-gpu'] = nr_gpus - pod_dict['spec']['containers'][0]['image'] = image - - if env is not None: - pod_dict['spec']['containers'][0]['env'] = [] - for key, value in env.items(): - pod_dict['spec']['containers'][0]['env'].append({ - 'name': key, - 'value': value - }) - - return pod_dict - #return yaml.dump(pod_dict) - - @classmethod - def generate_pod_dict2(cls, pod_name, image, command, args, nr_gpus): - pod_dict = yaml.load(pod_template2_yaml) - pod_dict['metadata']['name'] = pod_name - pod_dict['spec']['containers'][0]['command'] = command - pod_dict['spec']['containers'][0]['args'] = args - pod_dict['spec']['containers'][0]['image'] = image - return pod_dict - - def add_volume_mounts(self, pod_dict, volume_mounts): - for volume_mount in volume_mounts: - pod_dict['spec']['containers'][0]['volumeMounts'].append( - {'mountPath': volume_mount.mountPath, 'name': volume_mount.name} - ) - pod_dict['spec']['volumes'].append( - {'hostPath': volume_mount.hostPath, 'name': volume_mount.name} - ) - - def add_working_dir(self, pod_dict, workingDir): - pod_dict['spec']['containers'][0]['workingDir'] = workingDir - - def add_node_selector(self, pod_dict, label_key, label_value): - pod_dict['spec']['nodeSelector'] = {} - pod_dict['spec']['nodeSelector'][label_key] = label_value - - def run_command_in_pod(self, pod_name, image, nr_gpus, - args, - command, - node_selector_key=None, node_selector_value=None, volume_mounts=[], interactive=False, workingDir=None, - env=None): - pod_dict = self.generate_pod_dict(pod_name, image, command=command, - args=args, nr_gpus=nr_gpus, env=env) - self.add_volume_mounts(pod_dict, volume_mounts) - if workingDir is not None: - self.add_working_dir(pod_dict, workingDir) - - if node_selector_key is not None and node_selector_value is not None: - self.add_node_selector(pod_dict, node_selector_key, node_selector_value) - - if interactive is True: - raise NotImplementedError - # json_str = json.dumps(pod_dict) - # kubectl_run_command = ['kubectl', 'run', pod_name, - # '-i','--tty', - # '--restart=Never', - # '--image={}'.format(image), - # "--overrides='{pod_dict_json}'".format(pod_dict_json=json_str) - # ] - # pprint.pprint(pod_dict, indent=4) 
- # print(kubectl_run_command) - # print(' '.join(kubectl_run_command)) - else: - yaml_str = yaml.dump(pod_dict) - - with open('/tmp/final_yaml.yaml', 'w') as f: - # print(yaml_str, file=f) - f.write(yaml_str) - - pprint.pprint(pod_dict) - self.create_pod(yaml_str) - print('Pod {pod_name} created!'.format(pod_name=pod_name)) - print('To attach you should run:') - print('kubectl attach {pod_name}'.format(pod_name=pod_name)) - print('To see logs you should run:') - print('kubectl logs {pod_name}'.format(pod_name=pod_name)) - print('To get shell at pod run:') - print('kubectl exec -it {pod_name} /bin/bash'.format(pod_name=pod_name)) - - - def create_pod(self, yaml_str): - random_path = '/tmp/{}.yaml'.format(id_generator(10)) - with open(random_path, 'w') as f: - # print(yaml_str, file=f) - f.write(yaml_str) - - command = ['kubectl', 'create'] - if self.kube_config is not None: - command += ['--kubeconfig', self.kube_config] - command += ['-f', random_path] - print(' '.join(command)) - # subprocess.run(command, check=True) - subprocess.call(command) - - - -# res = KubernetesApi.generate_yaml(pod_name='test_pod_name', -# image='test_image', -# command='/bin/bash', -# command_args=['-c', 'sleep 10000'], -# nr_gpus=1) -# print(res) - diff --git a/mrunner/mrunner_api.py b/mrunner/mrunner_api.py deleted file mode 100755 index e58d83b..0000000 --- a/mrunner/mrunner_api.py +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -import argparse -import os -import subprocess -import sys -from datetime import datetime - -import yaml - -from mrunner.tasks import CommandWithEnv -from mrunner.utils import mkdir_p, id_generator - - -class NeptuneDummyParser(argparse.ArgumentParser): - def __init__(self): - pass - self.parameters = [] - - - def add_argument(self, *args, **kwargs): - print(args, kwargs) - if len(args) != 1: - - raise RuntimeError() - - name = args[0] - type = kwargs.get('type', str) - default = kwargs.get('default', None) - assert(name[:2] == '--') - - def map_type(t): - for a, b in [(int, 'int'), (str, 'string'), (float, 'double')]: - if t == a: - return b - raise RuntimeError() - - d = {'name': name[2:], - 'type': map_type(type), - 'description': 'empty description', - 'required': False - } - if default is not None: - d['default'] = default - - self.parameters.append(d) - - def gen_yaml(self, name='test', description='empty description', project='test'): - res = {'name': name, - 'description': description, - 'project': project, - 'parameters': self.parameters - } - return yaml.dump(res) - - -class MRunnerHelper(object): - def parser_to_yaml(self, config_path, name, project): - import imp - config_module = imp.load_source('create_parser', config_path) - p = NeptuneDummyParser() - yaml_str = config_module.create_parser(p).gen_yaml(name=name, project=project) - return yaml_str - - def handle_docker_no_neptune(self, mrunner_args, rest_argv): - raise NotImplementedError - t = datetime.now().strftime('%Y_%m_%d_%H_%M_%S') - exp_dir_path = os.path.join(mrunner_args.storage_url, t, 'exp_dir') - resource_dir_path = os.path.join(mrunner_args.storage_url, t, 'res_dir') - mkdir_p(exp_dir_path) - mkdir_p(resource_dir_path) - - docker_img = mrunner_args.docker_img - docker_bin = mrunner_args.docker_bin - paths_to_dump_conf = mrunner_args.paths_to_dump_conf - with open(paths_to_dump_conf) as f: - for line in f.readlines(): - src, dst_rel = line.split(' ') - dst = os.path.join(resource_dir_path, dst_rel) - print(dst_rel) - print(src, dst) - cmd = 'cp -r {src} 
{dst}'.format(src=src, dst=dst) - print(cmd) - os.system(cmd) - - with open(os.path.join(exp_dir_path, 'cmd.txt'), 'w') as f: - print((' '.join(sys.argv)), file=f) - - - command = '{docker_bin} run -i -t -e LOCAL_USER_ID=`id -u $USER` -v {resource_dir_path}:/wdir/res_dir -v {exp_dir_path}:/wdir/exp_dir --workdir /wdir/res_dir {docker_img} '.format( - docker_bin=docker_bin, - docker_img=docker_img, - exp_dir_path=exp_dir_path, - resource_dir_path=resource_dir_path) - command = filter(lambda a: len(a) > 0, command.split(' ')) - rest_cmd = ' '.join(rest_argv) - rest_cmd = ['/bin/bash', '-c', "'{rest_cmd}'".format(rest_cmd=rest_cmd)] - command += rest_cmd - print('COMMAND') - print(command) - self._wait_for_command(command) - - def config_to_yaml(self, config_path, name, project): - assert config_path is not None - if config_path[-2:] == 'py': - yaml_str = self.parser_to_yaml(config_path, name, project) - path = '/tmp/mrunner_config_{id}.yaml'.format(id=id_generator(20)) - f = open(path, 'w') - print(yaml_str, file=f) - f.close() - print(yaml_str) - new_config_path = path - elif config_path[-4:] == 'yaml': - new_config_path = config_path - else: - raise RuntimeError() - return new_config_path - - def create_normal_run_command(self, rest_argv, exp_dir_path): - env = {'MRUNNER_EXP_DIR_PATH': exp_dir_path, 'MRUNNER_UNDER_NEPTUNE': '0'} - command = rest_argv - return CommandWithEnv(command=command, env=env) - - def create_yaml_run_command(self, config_path, paths_to_dump, storage_url, rest_argv, tags, exp_dir_path): - env = {'MRUNNER_EXP_DIR_PATH': exp_dir_path, 'MRUNNER_UNDER_NEPTUNE': '0'} - # main_path = rest_argv[0] - base_argv = rest_argv+['--config', config_path, '--storage', storage_url] - if tags: - tags_argv = ['--tags'] + tags - else: - tags_argv = [] - - if paths_to_dump is not None: - paths_to_dump_argv = ['--paths-to-copy'] + paths_to_dump - else: - paths_to_dump_argv = [] - - neptune_command = base_argv + tags_argv + paths_to_dump_argv - command = neptune_command + ['--'] - - print("PM: command:{}".format(command)) - - return CommandWithEnv(command=command, env=env) - - - - - def create_neptune_run_command(self, config_path, paths_to_dump, storage_url, rest_argv, tags=[], neptune_conf_path=None, docker_image=None, - neptune_host=None, neptune_port=None, neptune_username=None, neptune_password=None): - print('create_neptune {}'.format(config_path)) - print('paths_to_dump {}'.format(paths_to_dump)) - print('storage_url {}'.format(storage_url)) - print('rest_argv {}'.format(rest_argv)) - main_path = rest_argv[0] - if main_path == 'python': - main_path = rest_argv[1] - rest_argv = rest_argv[1:] - - base_argv = ['neptune', 'run', main_path, '--config', config_path, '--storage', storage_url] - if tags: - tags_argv = ['--tags'] + tags - else: - tags_argv = [] - - if paths_to_dump is not None: - paths_to_dump_argv = ['--paths-to-copy'] + paths_to_dump - else: - paths_to_dump_argv = [] - - if docker_image is not None: - print('Will be using docker') - docker_image_argv = ['--docker-image', docker_image] - else: - docker_image_argv = [] - - if neptune_conf_path is None: - assert neptune_host is not None - assert neptune_port is not None - assert neptune_username is not None - assert neptune_password is not None - neptune_credentials_argv = ['--host', neptune_host, '--port', neptune_port, '--username', - neptune_username, '--password', neptune_password] - else: - neptune_credentials_argv = [] - - neptune_command = base_argv + tags_argv + paths_to_dump_argv + docker_image_argv + 
neptune_credentials_argv - command = neptune_command + ['--'] + rest_argv[1:] - - if neptune_conf_path is not None: - with open(neptune_conf_path) as f: - for line in f.readlines(): - command = [line] + command - - - env = { - 'MRUNNER_UNDER_NEPTUNE': '1' - } - return CommandWithEnv(command=command, env=env) - - def run_task_local(self, task): - print('task.command', task.command) - print(' '.join(task.command)) - print('task.env', task.env) - child_env = os.environ.copy() - child_env.update(task.env) - - try: - proc = subprocess.Popen(' '.join(task.command), shell=True, env=child_env) - proc.wait() - except KeyboardInterrupt: - print('Interrupted by user!') - - - - - diff --git a/mrunner/mrunner_cli.py b/mrunner/mrunner_cli.py deleted file mode 100644 index 2abd7ea..0000000 --- a/mrunner/mrunner_cli.py +++ /dev/null @@ -1,149 +0,0 @@ -import argparse - -import sys - -import os - -from datetime import datetime - -from mrunner.mrunner_api import MRunnerHelper - -class MRunnerCLI(object): - def parse_argv(self): - parser = self.create_parser() - divider_pos = self.argv.index('--') - mrunner_argv = self.argv[1:divider_pos] - rest_argv = self.argv[divider_pos + 1:] - - mrunner_args = parser.parse_args(args=mrunner_argv) - return mrunner_args, rest_argv - - def _parse_paths_to_dump(self, resource_dir_path, paths_to_dump_conf, paths_to_dump): - # TODO(maciek): This is broken :P - res_paths_to_dump = [] - - if paths_to_dump_conf is not None: - with open(paths_to_dump_conf) as f: - for line in f.readlines(): - src, dst_rel = line.split(' ') - dst = os.path.join(resource_dir_path, dst_rel) - res_paths_to_dump.append({'src': src, 'dst': dst, 'dst_rel': dst_rel}) - else: - # INFO(maciek): it seems when we pass args from file the list is treated as one arg? - - for maybe_path_to_dump in paths_to_dump: - print('maybe', maybe_path_to_dump) - for path_to_dump in maybe_path_to_dump.split(' '): - if len(path_to_dump) == 0: - continue - print('path_to_dump', path_to_dump, len(path_to_dump)) - #dst_rel = path_to_dump - dst_rel = '.' - src = os.path.abspath(path_to_dump) - dst = os.path.join(resource_dir_path, dst_rel) - # TODO(maciek): I do not understand the semantics here, src, dst, dst_rel??? 
- res_paths_to_dump.append({'src': src, 'dst': dst, 'dst_rel': dst_rel}) - - return res_paths_to_dump - - -class MRunnerLocalCLI(MRunnerCLI): - def __init__(self, mrunner_api): - self.mrunner_api = mrunner_api - - def create_parser(self): - parser = argparse.ArgumentParser(description='', fromfile_prefix_chars='@') - parser.add_argument('--storage_url', type=str) - parser.add_argument('--exp_dir_path', type=str) - - parser.add_argument('--paths_to_dump', type=str, nargs='+') - parser.add_argument('--pythonpath', type=str) - parser.add_argument('--neptune_conf_path', type=str, default=None) - - parser.add_argument('--tags', default=[], type=str, nargs='+') - - parser.add_argument('--paths_to_dump_conf', type=str) - - parser.add_argument('--name', type=str, default='test') - parser.add_argument('--project', type=str, default='test') - - parser.add_argument('--neptune_host', type=str) - parser.add_argument('--neptune_port', type=str) - parser.add_argument('--neptune_username', type=str) - parser.add_argument('--neptune_password', type=str) - - - parser.add_argument('--docker_image', type=str) - parser.add_argument('--docker_bin', type=str, default='docker') - parser.add_argument('--neptune', action='store_true') - parser.add_argument('--config', type=str) - return parser - - def main(self, argv): - # def handler(signum, frame): - # print 'Signal handler called with signal', signum - # - # signal.signal(signal.SIGTERM, handler) - # signal.signal(signal.SIGINT, handler) - self.argv = argv - mrunner_args, rest_argv = self.parse_argv() - - - if mrunner_args.storage_url is not None: - exp_dir_path = os.path.join(mrunner_args.storage_url, datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) - print('exp_dir_path', exp_dir_path) - else: - print('Warning! no exp_dir_path set') - exp_dir_path = '.' 
- - resource_dir_path = os.path.join(exp_dir_path, 'src') - - if mrunner_args.neptune: - if mrunner_args.config is None: - raise RuntimeError('Please supply --config!') - # paths_to_dump = self._parse_paths_to_dump(resource_dir_path, - # mrunner_args.paths_to_dump_conf, - # mrunner_args.paths_to_dump) - - - new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config, - mrunner_args.name, - mrunner_args.project) - - local_task = self.mrunner_api.create_neptune_run_command(config_path=new_local_config_path, - paths_to_dump=mrunner_args.paths_to_dump, - storage_url=mrunner_args.storage_url, - neptune_conf_path=mrunner_args.neptune_conf_path, - tags=mrunner_args.tags, - neptune_host=mrunner_args.neptune_host, - neptune_port=mrunner_args.neptune_port, - neptune_username=mrunner_args.neptune_username, - neptune_password=mrunner_args.neptune_password, - rest_argv=rest_argv, - docker_image=mrunner_args.docker_image) - if mrunner_args.pythonpath: - local_task.env['PYTHONPATH'] = mrunner_args.pythonpath - - self.mrunner_api.run_task_local(local_task) - else: - if mrunner_args.docker_image is not None: - raise NotImplementedError - #return self.mrunner_api.handle_docker_no_neptune(mrunner_args, rest_argv) - - local_task = self.mrunner_api.create_normal_run_command(rest_argv, exp_dir_path=exp_dir_path) - - if mrunner_args.pythonpath: - local_task.env['PYTHONPATH'] = mrunner_args.pythonpath - self.mrunner_api.run_task_local(local_task) - - -def main(): - mrunner_api = MRunnerHelper() - mrunner_cli = MRunnerLocalCLI(mrunner_api) - sys.exit(mrunner_cli.main(sys.argv)) - -if __name__ == '__main__': - main() - - - diff --git a/mrunner/mrunner_kubernetes_cli.py b/mrunner/mrunner_kubernetes_cli.py deleted file mode 100644 index 822e4b3..0000000 --- a/mrunner/mrunner_kubernetes_cli.py +++ /dev/null @@ -1,174 +0,0 @@ -import argparse -import shutil -import sys - -import datetime -import os - -from os.path import isfile - -from mrunner.kubernetes_backend import KubernetesBackend, KubeVolumeMount -from mrunner.mrunner_api import MRunnerHelper -from mrunner.mrunner_cli import MRunnerCLI -from mrunner.utils import mkdir_p, id_generator - -DUMP_SUBDIRECTORY_NAME = 'dump' -class MRunnerKuberntesCLI(MRunnerCLI): - def __init__(self, mrunner_helper, kubernetes_backend): - self.mrunner_helper = mrunner_helper - self.kubernetes_backend = kubernetes_backend - - def create_parser(self): - parser = argparse.ArgumentParser(description='', fromfile_prefix_chars='@') - parser.add_argument('--storage_url', type=str, required=True) - parser.add_argument('--paths_to_dump', type=str, nargs='+', default=[]) - parser.add_argument('--name', type=str, default='test') - parser.add_argument('--project', type=str, default='test') - - - parser.add_argument('--tags', default=[], type=str, nargs='+') -# parser.add_argument('--paths_to_dump_conf', type=str) - parser.add_argument('--nr_gpus', type=int, default=1) - - #parser.add_argument('--venv_path', type=str, default='/net/people/plghenrykm/maciek/venvs/tpack') - parser.add_argument('--docker_image', type=str, required=True) - - parser.add_argument('--pythonpath', type=str) - parser.add_argument('--node_selector_key', type=str) - parser.add_argument('--node_selector_value', type=str) - - parser.add_argument('--interactive', action='store_true') - - parser.add_argument('--neptune', action='store_true') - parser.add_argument('--neptune_exp_config', type=str) - - parser.add_argument('--neptune_host', type=str) - parser.add_argument('--neptune_port', type=str) - 
parser.add_argument('--neptune_username', type=str)
- parser.add_argument('--neptune_password', type=str)
-
- parser.add_argument('--dry_run', action='store_true')
- return parser
-
-
-
- def main(self, argv):
- self.argv = argv
- mrunner_args, rest_argv = self.parse_argv()
-
- # INFO(maciek): This is the directory into which we copy the needed files now, but then
- # we will run 'neptune run', which will itself create a new exp_dir.
- # This layout is messy.
-
- subdir = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + id_generator(4)
- temp_exp_dir_path = os.path.join(mrunner_args.storage_url, subdir)
- print('temp_exp_dir_path', temp_exp_dir_path)
- dump_dir = os.path.join(temp_exp_dir_path, DUMP_SUBDIRECTORY_NAME)
- paths_to_dump = self._parse_paths_to_dump(dump_dir,
- None,
- mrunner_args.paths_to_dump)
- mkdir_p(dump_dir)
- self.copy_paths(paths_to_dump)
- if mrunner_args.neptune:
- tmp_config_path = self.mrunner_helper.config_to_yaml(mrunner_args.neptune_exp_config,
- mrunner_args.name,
- mrunner_args.project)
-
- config_path = os.path.join(dump_dir, 'config.yaml')
- self.copy_path(config_path, tmp_config_path)
- print('config_path', config_path)
- paths_to_dump_for_neptune = [p['dst'] for p in paths_to_dump]
-
- # We construct the neptune run command here
- command_with_env = self.mrunner_helper.create_neptune_run_command(
- config_path=config_path,
- paths_to_dump=paths_to_dump_for_neptune,
- storage_url=mrunner_args.storage_url,
- tags=mrunner_args.tags,
- neptune_host=mrunner_args.neptune_host,
- neptune_port=mrunner_args.neptune_port,
- neptune_username=mrunner_args.neptune_username,
- neptune_password=mrunner_args.neptune_password,
- rest_argv=rest_argv)
- else:
- command_with_env = self.mrunner_helper.create_normal_run_command(rest_argv=rest_argv,
- exp_dir_path=temp_exp_dir_path)
-
- print('command_with_env', command_with_env.command, command_with_env.env)
- pod_name = 'mrunner-pod-{id}'.format(id=id_generator(10))
-
- # TODO(maciek): temporary!!!
- volume_mounts = [
- KubeVolumeMount(name='storage',
- mountPath='/mnt/ml-team/rl/kubernetes_storage',
- hostPath={'path': '/mnt/ml-team/rl/kubernetes_storage'}
- )
- ]
-
-
- # volume_mounts = [
- # KubeVolumeMount(name='storage',
- # mountPath=mrunner_args.storage_url,
- # hostPath={'path': mrunner_args.storage_url}
- # )
- # ]
-
- if mrunner_args.dry_run is True:
- print('only dry-run, not executing!!!')
- else:
- # INFO(maciek): Kubernetes' semantics for what 'args' and 'command' mean are not obvious at all:
- # https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#define-a-command-and-arguments-when-you-create-a-pod
- # see the table that compares Docker and Kube
-
- # INFO(maciek): this will use container's default entrypoint
- args = command_with_env.command
- command = None
- env = {}
- env.update(command_with_env.env)
- if mrunner_args.pythonpath is not None:
- env['PYTHONPATH'] = mrunner_args.pythonpath
-
- print('Pod env will be {}'.format(env))
-
-
- self.kubernetes_backend.run_command_in_pod(pod_name=pod_name,
- image=mrunner_args.docker_image,
- nr_gpus=mrunner_args.nr_gpus,
- args=args,
- command=command,
- volume_mounts=volume_mounts,
- interactive=mrunner_args.interactive,
- workingDir=dump_dir,
- node_selector_key=mrunner_args.node_selector_key,
- node_selector_value=mrunner_args.node_selector_value,
- env=env,
- )
-
- def copy_paths(self, paths_to_dump):
- for d in paths_to_dump:
- dst_path, src_path = d['dst'], d['src']
- self.copy_path(dst_path=dst_path, src_path=src_path)
-
- def copy_path(self, dst_path, src_path):
- print('copy_path dst = {}, src = {}'.format(dst_path, src_path))
- if isfile(src_path):
- shutil.copy(src=src_path, dst=dst_path)
- else:
- os.system('cp -r {src} {dst}'.format(src=src_path, dst=dst_path))
- # shutil.copytree(src_path, dst_path)
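The command/args split noted in the comments above is worth pinning down: with command=None, the pod keeps the image's ENTRYPOINT and Kubernetes passes args to it, whereas in Docker the analogue of args is CMD. A minimal sketch of the container spec this effectively produces (all names, paths, and values here are illustrative, not taken from the code above):

    # Illustrative container spec fragment; field names follow the Kubernetes pod API.
    container_spec = {
        'name': 'mrunner-pod-example',            # hypothetical pod name
        'image': 'registry.example.com/project',  # hypothetical image
        # 'command' omitted: the image's ENTRYPOINT stays in effect,
        # and 'args' below are appended to it.
        'args': ['--config', 'config.yaml'],
        'env': [{'name': 'PYTHONPATH', 'value': '/experiment'}],
        'workingDir': '/storage/dump',
    }
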
-
-
-KUBE_CONFIG_PATH = '~/.kube/config'
-
-
-def main():
- kubernetes_backend = KubernetesBackend()
- mrunner_helper = MRunnerHelper()
- cli = MRunnerKuberntesCLI(mrunner_helper, kubernetes_backend)
- return cli.main(sys.argv)
-
-if __name__ == '__main__':
- sys.exit(main())
-
-
-
diff --git a/mrunner/mrunner_plgrid_cli.py b/mrunner/mrunner_plgrid_cli.py
deleted file mode 100644
index f9f8058..0000000
--- a/mrunner/mrunner_plgrid_cli.py
+++ /dev/null
@@ -1,213 +0,0 @@
-import argparse
-import sys
-import os
-
-from datetime import datetime
-
-from mrunner.mrunner_api import MRunnerHelper
-
-from mrunner.mrunner_cli import MRunnerCLI
-from mrunner.prometheus import PrometheusBackend
-from mrunner.tasks import PlgridTask
-from mrunner.utils import id_generator
-
-PLGRID_USERNAME = os.environ.get('PLGRID_USERNAME', 'plghenrykm')
-MRUNNER_SCRATCH_SPACE = os.environ.get('MRUNNER_SCRATCH_SPACE', '/net/scratch/people/plghenrykm/pmilos/mrunner')
-PLGRID_HOST = os.environ.get('PLGRID_HOST', 'pro.cyfronet.pl')
-
-class MRunnerPLGridCLI(MRunnerCLI):
- def __init__(self, mrunner_api, prometheus_api):
- self.mrunner_api = mrunner_api
- self.prometheus_api = prometheus_api
-
- def create_parser(self):
- parser = argparse.ArgumentParser(description='', fromfile_prefix_chars='@')
-
- parser.add_argument('--storage_url', type=str)
- parser.add_argument('--exp_dir_path', type=str)
-
- parser.add_argument('--partition', type=str, default='plgrid-testing')
- parser.add_argument('--paths_to_dump', type=str, nargs='+')
- parser.add_argument('--pythonpath', type=str)
- parser.add_argument('--neptune_conf', type=str, default=None)
- parser.add_argument('--tags', default=[], type=str, nargs='+')
-
parser.add_argument('--paths_to_dump_conf', type=str) - parser.add_argument('--name', type=str, default='test') - parser.add_argument('--project', type=str, default='test') - parser.add_argument('--config', type=str) - parser.add_argument('--experiment_id', type=str, default="") - parser.add_argument('--with_yaml', action='store_true') - parser.add_argument('--after_module_load_cmd', type=str) - parser.add_argument('--venv_path', type=str) - parser.add_argument('--cores', type=int, default=24) - parser.add_argument('--ntasks', type=int, default=24) - parser.add_argument('--time', type=str, default='24:00:00') - parser.add_argument('--neptune', action='store_true') - parser.add_argument('--srun', action='store_true') - parser.add_argument('--sbatch', action='store_true') - parser.add_argument('--script_name', type=str, default="mrunner") - parser.add_argument('--modules_to_load', type=str) - parser.add_argument('--A', type=str, default=None) - parser.add_argument('--gres', type=str, default=None) - return parser - - - - def main(self, argv): - - self.argv = argv - mrunner_args, rest_argv = self.parse_argv() - - - if mrunner_args.storage_url is not None: - # INFO(maciek): random noise added is for purpose! - exp_dir_path = os.path.join(mrunner_args.storage_url, datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + id_generator(4)) - print('exp_dir_path', exp_dir_path) - else: - print('Warning! no exp_dir_path set') - exp_dir_path = '.' - if int(mrunner_args.srun) + int(mrunner_args.sbatch) != 1: - raise RuntimeError('Please provide exactly one of --srun, --sbatch') - - resource_dir_path = os.path.join(exp_dir_path, 'src') - - paths_to_dump = self._parse_paths_to_dump(resource_dir_path, - mrunner_args.paths_to_dump_conf, - mrunner_args.paths_to_dump) - print(paths_to_dump) - - remote_config_path = os.path.join(resource_dir_path, 'config.yaml') - - if mrunner_args.neptune: - if mrunner_args.config is None: - raise RuntimeError('Please supply --config!') - self.prometheus_api.mkdir(resource_dir_path) - self.prometheus_api.copy_paths_rel(paths_to_dump, resource_dir_path) - new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config, - mrunner_args.name, - mrunner_args.project) - - - self.prometheus_api.copy_path(remote_config_path, new_local_config_path) - - paths_to_dump_for_neptune = [os.path.join(p['dst'], os.path.basename(p['src'])) for p in paths_to_dump] - print(paths_to_dump_for_neptune) - local_task = self.mrunner_api.create_neptune_run_command(config_path=remote_config_path, - paths_to_dump=paths_to_dump_for_neptune, - storage_url=mrunner_args.storage_url, - tags=mrunner_args.tags, - neptune_conf_path=mrunner_args.neptune_conf, - rest_argv=rest_argv) - command_list = local_task.command - - if mrunner_args.neptune_conf is not None: - with open(mrunner_args.neptune_conf) as f: - for line in f.readlines(): - command_list = [line] + command_list - - command = ' '.join(command_list) - import random - sleep_command = "sleep {0:.4f}".format(random.random() * 5) - command = sleep_command + " ; " + command - print(command) - - env = local_task.env - env['EXPERIMENT_ID'] = mrunner_args.experiment_id - env['STORAGE_URL'] = mrunner_args.storage_url - env['RESOURCE_DIR_PATH'] = resource_dir_path - - if mrunner_args.pythonpath: - env['PYTHONPATH'] = mrunner_args.pythonpath - - log_path = '/dev/null' - modules_to_load = [] - if mrunner_args.modules_to_load: - modules_to_load = mrunner_args.modules_to_load.split(":") - modules_to_load = [x for x in modules_to_load if x] # remove empty 
strings - print("Modules to load:{}".format(modules_to_load)) - - task = PlgridTask(command=command, cwd=resource_dir_path, env=env, venv_path=mrunner_args.venv_path, - after_module_load_cmd=mrunner_args.after_module_load_cmd, - script_name=mrunner_args.script_name, modules_to_load=modules_to_load) - - if mrunner_args.with_yaml: - self.prometheus_api.mkdir(resource_dir_path) - self.prometheus_api.copy_paths_rel(paths_to_dump, resource_dir_path) - paths_to_dump_for_neptune = [os.path.join(p['dst'], os.path.basename(p['src'])) for p in paths_to_dump] - new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config, - mrunner_args.name, - mrunner_args.project) - self.prometheus_api.copy_path(remote_config_path, new_local_config_path) - local_task = self.mrunner_api.create_yaml_run_command(config_path=remote_config_path, - paths_to_dump=paths_to_dump_for_neptune, - storage_url=mrunner_args.storage_url, - tags=mrunner_args.tags, - exp_dir_path=exp_dir_path, - rest_argv=rest_argv) - - # parms_argv = rest_argv - # if mrunner_args.with_yaml: - # parms_argv.append(" --yaml {}".format(remote_config_path)) - # new_local_config_path = self.mrunner_api.config_to_yaml(mrunner_args.config, - # mrunner_args.name, - # mrunner_args.project) - # - # self.prometheus_api.copy_path(remote_config_path, new_local_config_path) - # - # local_task = self.mrunner_api.create_normal_run_command(rest_argv, exp_dir_path=exp_dir_path) - - command_list = local_task.command - if mrunner_args.neptune_conf is not None: - with open(mrunner_args.neptune_conf) as f: - for line in f.readlines(): - command_list = [line] + command_list - - - command = ' '.join(command_list) - env = local_task.env - env['EXPERIMENT_ID'] = mrunner_args.experiment_id - env['STORAGE_URL'] = mrunner_args.storage_url - env['RESOURCE_DIR_PATH'] = resource_dir_path - - if mrunner_args.pythonpath: - env['PYTHONPATH'] = "{}:$PYTHONPATH".format(mrunner_args.pythonpath) - - log_path = os.path.join(resource_dir_path, "job_logs.txt") - modules_to_load = [] - if mrunner_args.modules_to_load: - modules_to_load = mrunner_args.modules_to_load.split(":") - modules_to_load = [x for x in modules_to_load if x] #remove empty strings - print("Modules to load:{}".format(modules_to_load)) - - task = PlgridTask(command=command, cwd=resource_dir_path, env=env, venv_path=mrunner_args.venv_path, - after_module_load_cmd=mrunner_args.after_module_load_cmd, - script_name=mrunner_args.script_name, modules_to_load=modules_to_load) - - - if mrunner_args.srun: - self.prometheus_api.srun(task, partition=mrunner_args.partition, - cores=mrunner_args.cores, ntasks=mrunner_args.ntasks, - account=mrunner_args.A, - gres=mrunner_args.gres) - elif mrunner_args.sbatch: - self.prometheus_api.sbatch(task, partition=mrunner_args.partition, - cores=mrunner_args.cores, - time=mrunner_args.time, - stdout_path = log_path, - ntasks=mrunner_args.ntasks, - account=mrunner_args.A, - gres=mrunner_args.gres) - - -def main(): - prometheus = PrometheusBackend(username=PLGRID_USERNAME, host=PLGRID_HOST, scratch_space=MRUNNER_SCRATCH_SPACE) - mrunner_api = MRunnerHelper() - mrunner_cli = MRunnerPLGridCLI(mrunner_api, prometheus) - sys.exit(mrunner_cli.main(sys.argv)) - - -if __name__ == '__main__': - main() - - - diff --git a/mrunner/mrunner_user.py b/mrunner/mrunner_user.py deleted file mode 100644 index 155ddd8..0000000 --- a/mrunner/mrunner_user.py +++ /dev/null @@ -1,18 +0,0 @@ -def job_main_(neptune_ctx, args, exp_dir_path): - raise NotImplementedError - -def mrunner_main(job_main, 
create_parser_fun): - import os - if os.environ.get('MRUNNER_UNDER_NEPTUNE', '0') == '1': - # running under neptune - from deepsense import neptune - ctx = neptune.Context() - args = ctx.params - exp_dir_path = ctx.storage_url - else: - parser = create_parser_fun() - args = parser.parse_args() - ctx = None - exp_dir_path = os.environ.get('MRUNNER_EXP_DIR_PATH', '.') - - job_main(ctx, args, exp_dir_path) diff --git a/mrunner/plgrid.py b/mrunner/plgrid.py new file mode 100644 index 0000000..172ad69 --- /dev/null +++ b/mrunner/plgrid.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +PLGRID_HOST = 'pro.cyfronet.pl' +PLGRID_USERNAME = 'plghenrykm' +PLGRID_TESTING_PARTITION = 'plgrid-testing' diff --git a/mrunner/prometheus.py b/mrunner/prometheus.py deleted file mode 100644 index 8f5588c..0000000 --- a/mrunner/prometheus.py +++ /dev/null @@ -1,159 +0,0 @@ -import hashlib -import os -import os.path as osp -import tarfile -from fabric.api import run -from fabric.context_managers import cd -from fabric.contrib.project import rsync_project -from fabric.operations import put -from fabric.state import env - -from mrunner.utils import id_generator - - -class PrometheusBackend(object): - def __init__(self, username, host, scratch_space): - self.host = host - self.username = username - self.host_string = '{username}@{host}'.format(username=self.username, host=self.host) - self.scratch_space = scratch_space - env['host_string'] = self.host_string - self.mkdir(self.scratch_space) - - def _make_script_name(self, script_name): - return script_name + '_' + id_generator(20) + '.sh' - - def _chmod_x(self, path): - run('chmod +x {path}'.format(path=path)) - - def _create_remote_script(self, command, script_name): - script_name = self._make_script_name(script_name) - script_path = osp.join('/tmp/', script_name) - - with open(script_path, 'w') as f: - print >> f, '#!/bin/bash' - print >> f, command - - remote_path = osp.join(self.scratch_space, script_name) - put(script_path, remote_path) - self._chmod_x(remote_path) - return remote_path - - def sbatch(self, task, partition='plgrid', time='24:00:00', cores=24, stdout_path='/dev/null', ntasks=1, gres=None, account=None): - command = task.construct_command() - script_name = task.script_name - remote_path = self._create_remote_script(command, script_name) - remote_command = 'sbatch -p {partition} -t {time} {gpu_gres} {account} -c {num_cores} -n {ntasks} -o {stdout_path} {script_path}'.format(partition=partition, - time=time, - num_cores=cores, - stdout_path=stdout_path, - gpu_gres=('--gres={}'.format(gres) if gres else ''), - account=('-A {}'.format(account) if gres else ''), - script_path=remote_path, - ntasks=ntasks - ) - print('remote_command=', remote_command) - run(remote_command) - - def srun(self, task, partition='plgrid', cores=24, ntasks=1, gres=None, account=None): - command = task.construct_command() - script_name = task.script_name - remote_path = self._create_remote_script(command, script_name) - - remote_command = 'srun -p {partition} {gpu_gres} {account} -c {num_cores} -n {ntasks} {script_path}'.format(partition=partition, - gpu_gres=('--gres={}'.format(gres) if gres else ''), - account=('-A {}'.format(account) if gres else ''), - num_cores=cores, - script_path=remote_path, - ntasks=ntasks - ) - print('remote_command=', remote_command) - run(remote_command) - - def mkdir(self, path): - run('mkdir -p {path}'.format(path=path)) - - def copy_paths(self, paths_to_dump): - for d in paths_to_dump: - remote_path, local_path = d['dst'], d['src'] - 
self.copy_path(remote_path, local_path) - #put(local_path, remote_path) - - def copy_paths_rel(self, paths_to_dump, dst_dir): - # TODO(maciek): describe the semantics of this!!! - print('copy_paths_rel') - for d in paths_to_dump: - print(d) - tar_filename = '{id}.tar.gz'.format(id=id_generator(20)) - tar_tmp_path = os.path.join('/tmp/', tar_filename) - print('tmp_path', tar_tmp_path) - tar = tarfile.open(tar_tmp_path, 'w:gz') - - for d in paths_to_dump: - remote_rel_path, local_path = d['dst_rel'], d['src'] - # if remote_rel_path != '': - # raise NotImplementedError - print('adding_path', local_path) - tar.add(local_path, arcname=os.path.basename(local_path)) - tar.close() - - print('dst_dir', dst_dir) - self.copy_path(dst_dir, tar_tmp_path) - with cd(dst_dir): - run('pwd') - run('ls *') - run('tar xfz {tar_filename}'.format(tar_filename=tar_filename)) - - def copy_paths_rel_cached(self, paths_to_dump, dst_dir): - raise NotImplementedError - tar_filename = '{id}.tar.gz'.format(id=id_generator(20)) - tar_path = os.path.join('/tmp/', tar_filename) - print('tmp_path', tar_path) - tar = tarfile.open(tar_path, 'w:gz') - - for d in paths_to_dump: - remote_rel_path, local_path = d['dst_rel'], d['src'] - if remote_rel_path != '': - raise NotImplementedError - print('adding_path', local_path) - tar.add(local_path, arcname=os.path.basename(local_path)) - tar.close() - sha_digest = self._sha256_file(tar_path) - print(sha_digest) - - print('dst_dir', dst_dir) - - # WARNING(maciek): this is not concurrent safe - - remote_tar_filename = sha_digest + '.tar.gz' - if not exists(osp.join(self.scratch_space, remote_tar_filename)): - self.copy_path(osp.join(self.scratch_space, remote_tar_filename), tar_path) - - run('cp {path_src} {path_dst}'.format(path_src=osp.join(self.scratch_space, remote_tar_filename), - path_dst=osp.join(dst_dir, remote_tar_filename))) - - with cd(dst_dir): - run('ls *') - run('tar xfz {remote_tar_filename}'.format(remote_tar_filename=remote_tar_filename)) - - def _sha256_file(self, path): - return hashlib.sha256(open(path, 'rb').read()).hexdigest() - - - def copy_path(self, remote_path, local_path): - rsync_project(remote_path, local_path) - - - -if __name__ == '__main__': - - username = 'plghenrykm' - host = 'pro.cyfronet.pl' - - prometheus = PrometheusBackend(username=username, host=host) - #prometheus.sbatch('uptime', time='01:00:00', partition='plgrid-testing') - - command = 'echo test ; sleep 1; echo test2; sleep 1' - #prometheus.srun('nvidia-smi', partition='plgrid-gpu') - prometheus.srun(command, partition='plgrid-testing') - diff --git a/mrunner/ssh_backend.py b/mrunner/ssh_backend.py deleted file mode 100644 index de8a731..0000000 --- a/mrunner/ssh_backend.py +++ /dev/null @@ -1,3 +0,0 @@ - -class SshBackend(object): - pass \ No newline at end of file diff --git a/mrunner/tasks.py b/mrunner/tasks.py deleted file mode 100644 index 2c99196..0000000 --- a/mrunner/tasks.py +++ /dev/null @@ -1,42 +0,0 @@ -class CommandWithEnv(object): - def __init__(self, command, env): - self.command = command - self.env = env - - def generate_one_liner(self): - raise NotImplementedError - - - -class PlgridTask(object): - def __init__(self, command, cwd=None, env={}, modules_to_load=[], venv_path=None, - after_module_load_cmd=None, script_name="mrunner"): - # paths_to_dump (dst_remote_path, local_path) - self.command = command - self.cwd = cwd - self.env = env - self.venv_path = venv_path - self.modules_to_load = modules_to_load - self.after_module_load_cmd = after_module_load_cmd - 
self.script_name = script_name
-
- def construct_command(self):
- command = ''
- if self.cwd is not None:
- command += 'cd {cwd}\n'.format(cwd=self.cwd)
-
- for module in self.modules_to_load:
- command += 'module load {}\n'.format(module)
-
- if self.after_module_load_cmd is not None:
- command += self.after_module_load_cmd + '\n'
-
- # command += "pip install scipy\n"
-
- if self.venv_path is not None:
- command += 'source {venv_path}/bin/activate\n'.format(venv_path=self.venv_path)
-
- for name, val in self.env.iteritems():
- command += 'export {name}={val}\n'.format(name=name, val=val)
- command += self.command
- return command
diff --git a/mrunner/templates/Dockerfile.jinja2 b/mrunner/templates/Dockerfile.jinja2
new file mode 100644
index 0000000..ee75d7c
--- /dev/null
+++ b/mrunner/templates/Dockerfile.jinja2
@@ -0,0 +1,21 @@
+FROM {{ experiment.base_image }}
+
+ARG EXP_DIR=/experiment
+ARG STORAGE_DIR={{ experiment.storage_dir }}
+ARG NEPTUNE_TOKEN=missing
+ARG NEPTUNE_TOKEN_PATH=/root/.neptune/tokens/token
+
+COPY {{ requirements_file }} ${EXP_DIR}/requirements.txt
+RUN pip install --no-cache-dir -r $EXP_DIR/requirements.txt
+{%- for local_path, remote_path in paths_to_copy or [('.', '')] %}
+COPY {{ local_path }} ${EXP_DIR}/{{ remote_path }}
+{%- endfor %}
+ENV STORAGE_DIR=${STORAGE_DIR}
+
+RUN mkdir -p $(dirname ${NEPTUNE_TOKEN_PATH}) && echo ${NEPTUNE_TOKEN} > ${NEPTUNE_TOKEN_PATH}
+
+VOLUME ${STORAGE_DIR}
+VOLUME ${EXP_DIR}
+WORKDIR ${EXP_DIR}
+
+ENTRYPOINT ["{{ experiment.cmd_without_params|join('", "') }}"]
diff --git a/mrunner/templates/slurm_experiment.sh.jinja2 b/mrunner/templates/slurm_experiment.sh.jinja2
new file mode 100644
index 0000000..c904aba
--- /dev/null
+++ b/mrunner/templates/slurm_experiment.sh.jinja2
@@ -0,0 +1,14 @@
+#!/usr/bin/env sh
+set -e
+cd {{ experiment.experiment_scratch_dir }}
+{%- for module_name in experiment.modules_to_load %}
+module load {{ module_name }}
+{%- endfor %}
+{%- if experiment.after_module_load_cmd %}
+{{ experiment.after_module_load_cmd }}
+{%- endif %}
+source {{ experiment.venv }}/bin/activate
+{%- for env_key, env_value in experiment.env.items() %}
+export {{ env_key }}={{ env_value }}
+{%- endfor %}
+{{ experiment.cmd.command }}
\ No newline at end of file
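The slurm template above is plain Jinja2; for reference, a minimal sketch of rendering it with a stand-in experiment object (every attribute value below is made up for illustration):

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader('mrunner/templates'))
    template = env.get_template('slurm_experiment.sh.jinja2')

    class Cmd:  # stand-in for the experiment's cmd object
        command = 'python train.py --config config.yaml'

    class Exp:  # stand-in experiment; attribute names follow the template
        experiment_scratch_dir = '/scratch/example'
        modules_to_load = ['plgrid/tools/python/3.6']
        after_module_load_cmd = ''
        venv = '/scratch/example/venv'
        env = {'STORAGE_DIR': '/storage/example'}
        cmd = Cmd()

    # Prints the generated shell script: cd, module load, venv activation,
    # exported env vars, then the experiment command.
    print(template.render(experiment=Exp()))
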
diff --git a/mrunner/utils.py b/mrunner/utils.py
deleted file mode 100644
index c86da95..0000000
--- a/mrunner/utils.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import errno
-import os
-import random
-import string
-
-
-def mkdir_p(path):
- try:
- os.makedirs(path)
- return path
- except OSError as exc: # Python >2.5
- if exc.errno == errno.EEXIST and os.path.isdir(path):
- return path
- else:
- raise
-
-
-def id_generator(n=10):
- return ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(n))
\ No newline at end of file
diff --git a/mrunner/utils/__init__.py b/mrunner/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mrunner/utils/docker_engine.py b/mrunner/utils/docker_engine.py
new file mode 100644
index 0000000..8e42bea
--- /dev/null
+++ b/mrunner/utils/docker_engine.py
@@ -0,0 +1,152 @@
+# -*- coding: utf-8 -*-
+import logging
+import os
+from subprocess import call
+
+import attr
+from docker.errors import ImageNotFound
+from path import Path
+
+from mrunner.utils.utils import GeneratedTemplateFile, get_paths_to_copy
+
+LOGGER = logging.getLogger(__name__)
+
+
+class RequirementsFile(object):
+
+ def __init__(self, path, requirements):
+ self._path = Path(path)
+ with open(path, 'w') as requirements_file:
+ payload = '\n'.join(requirements)
+ requirements_file.write(payload)
+
+ def __del__(self):
+ self._path.remove_p()
+
+ @property
+ def path(self):
+ return self._path
+
+
+StaticCmd = attr.make_class('StaticCmd', ['command', 'env'], frozen=True)
+
+
+class DockerFile(GeneratedTemplateFile):
+ DEFAULT_DOCKERFILE_TEMPLATE = 'Dockerfile.jinja2'
+
+ def __init__(self, experiment, requirements_file):
+ experiment_data = attr.asdict(experiment)
+ # paths in command shall be relative
+ cmd = experiment_data.pop('cmd')
+ updated_cmd = self._rewrite_paths(experiment.cwd, cmd.command)
+ paths_to_copy = get_paths_to_copy(exclude=experiment.exclude, paths_to_copy=experiment.paths_to_copy)
+ experiment = attr.evolve(experiment, cmd=StaticCmd(command=updated_cmd, env=cmd.env))
+
+ super(DockerFile, self).__init__(template_filename=self.DEFAULT_DOCKERFILE_TEMPLATE,
+ experiment=experiment, requirements_file=requirements_file,
+ paths_to_copy=paths_to_copy)
+
+ def _rewrite_paths(self, cwd, cmd):
+ updated_cmd = []
+ for item in cmd.split(' '):
+ if Path(item).exists():
+ item = Path(cwd).relpathto(item)
+ updated_cmd.append(item)
+ return ' '.join(updated_cmd)
+
+
+class DockerEngine(object):
+
+ def __init__(self, docker_url=None):
+ import docker
+ base_url = docker_url if docker_url else os.environ.get('DOCKER_HOST', 'unix://var/run/docker.sock')
+ self._client = docker.DockerClient(base_url=base_url)
+
+ def _login_with_docker(self, experiment):
+ self._client.login(registry=experiment.registry_url, username=experiment.registry_username,
+ password=experiment.registry_password, reauth=True)
+
+ def _login_with_gcloud(self, experiment):
+ call('gcloud auth configure-docker'.split(' '))
+
+ def build_and_publish_image(self, experiment):
+ registry_url = experiment.registry_url
+ self._is_gcr = registry_url and registry_url.startswith('https://gcr.io')
+ if registry_url:
+ _login = self._login_with_gcloud if self._is_gcr else self._login_with_docker
+ _login(experiment)
+
+ # requirements filename shall be constant for experiment, to use docker cache during build;
+ # thus we don't use dynamic/temporary file names
+ file_path = self._generate_requirements_name(experiment)
+ requirements = RequirementsFile(file_path, experiment.requirements)
+ LOGGER.debug('Requirements file created:')
+ LOGGER.debug(Path(file_path).text())
+
+ dockerfile = DockerFile(experiment=experiment, requirements_file=requirements.path)
+ LOGGER.debug('Dockerfile created:')
+ dockerfile_rel_path = Path(experiment.cwd).relpathto(dockerfile.path)
+
+ # obtain the old image for comparison, to check whether there were any changes
+ repository_name = self._generate_repository_name(experiment)
+ try:
+ old_image = self._client.images.get(repository_name + ':latest')
+ except ImageNotFound:
+ old_image = None
+
+ # build image; use cache if possible
+ LOGGER.debug(Path(dockerfile.path).text())
+ LOGGER.debug('Building docker image')
+ neptune_build_args = self._get_neptune_build_args(experiment)
+ image, _ = self._client.images.build(path=experiment.cwd, tag=repository_name,
+ buildargs=neptune_build_args,
+ dockerfile=dockerfile_rel_path, pull=True, rm=True, forcerm=True)
+
+ is_image_updated = not old_image or old_image.id != image.id
+ LOGGER.debug('Docker image built (updated={})'.format(is_image_updated))
+ if is_image_updated:
+ # if new image is generated - tag it and push to repository
+ tag = self._get_tag()
+ image.tag(repository_name, tag=tag)
+ LOGGER.debug('Docker image tagged: {}'.format(tag))
+ result = self._client.images.push(repository_name,
tag=tag) + LOGGER.debug('Docker image published: {}'.format(tag)) + if 'errorDetail' in result: + raise RuntimeError(result) + image = self._client.images.get(repository_name) + + # obtain image name with our tag + image_name = [tag for tag in image.tags if not tag.endswith('latest')][0] + LOGGER.debug('Docker image {} ready'.format(image_name)) + return image_name + + def _generate_requirements_name(self, experiment): + return 'requirements_{}_{}.txt'.format(experiment.project, experiment.name) + + def _generate_repository_name(self, experiment): + image_name = '{}/{}'.format(experiment.project, experiment.name) + + # while publishing images there is need to prefix them with repository hostname + if experiment.registry_url: + registry_name = experiment.registry_url.split(r'://')[1] + image_name = '{}/{}'.format(registry_name, image_name) + + if self._is_gcr: + assert experiment.google_project_id, 'Configure google_project_id key for current context' + image_name = image_name.replace('/{}/'.format(experiment.project), + '/{}/'.format(experiment.google_project_id)) + + return image_name + + def _get_tag(self): + from datetime import datetime + return datetime.utcnow().strftime('%Y%m%d_%H%M%S') + + def _get_neptune_build_args(self, experiment): + args = {} + if experiment.neptune_token_files: + neptune_token_path = Path(experiment.neptune_token_files[0]) + rel_path = Path('.').relpathto(neptune_token_path) + args['NEPTUNE_TOKEN'] = neptune_token_path.text() + args['NEPTUNE_TOKEN_PATH'] = '/'.join(['/root', ] + [p for p in rel_path.split('/') if p and p != '..']) + return args diff --git a/mrunner/utils/namesgenerator.py b/mrunner/utils/namesgenerator.py new file mode 100644 index 0000000..247f4d7 --- /dev/null +++ b/mrunner/utils/namesgenerator.py @@ -0,0 +1,606 @@ +# coding: utf-8 + +# Docker names generator, Python port +# https://github.com/shamrin/namesgenerator +# Copyright (c) 2017 Alexey Shamrin +# MIT License + +import random +import string + +left = [ + 'admiring', + 'adoring', + 'affectionate', + 'agitated', + 'amazing', + 'angry', + 'awesome', + 'blissful', + 'boring', + 'brave', + 'clever', + 'cocky', + 'compassionate', + 'competent', + 'condescending', + 'confident', + 'cranky', + 'dazzling', + 'determined', + 'distracted', + 'dreamy', + 'eager', + 'ecstatic', + 'elastic', + 'elated', + 'elegant', + 'eloquent', + 'epic', + 'fervent', + 'festive', + 'flamboyant', + 'focused', + 'friendly', + 'frosty', + 'gallant', + 'gifted', + 'goofy', + 'gracious', + 'happy', + 'hardcore', + 'heuristic', + 'hopeful', + 'hungry', + 'infallible', + 'inspiring', + 'jolly', + 'jovial', + 'keen', + 'kind', + 'laughing', + 'loving', + 'lucid', + 'mystifying', + 'modest', + 'musing', + 'naughty', + 'nervous', + 'nifty', + 'nostalgic', + 'objective', + 'optimistic', + 'peaceful', + 'pedantic', + 'pensive', + 'practical', + 'priceless', + 'quirky', + 'quizzical', + 'relaxed', + 'reverent', + 'romantic', + 'sad', + 'serene', + 'sharp', + 'silly', + 'sleepy', + 'stoic', + 'stupefied', + 'suspicious', + 'tender', + 'thirsty', + 'trusting', + 'unruffled', + 'upbeat', + 'vibrant', + 'vigilant', + 'vigorous', + 'wizardly', + 'wonderful', + 'xenodochial', + 'youthful', + 'zealous', + 'zen', +] + +right = [ + # Muhammad ibn Jābir al-Ḥarrānī al-Battānī was a founding father of astronomy. https://en.wikipedia.org/wiki/Mu%E1%B8%A5ammad_ibn_J%C4%81bir_al-%E1%B8%A4arr%C4%81n%C4%AB_al-Batt%C4%81n%C4%AB + 'albattani', + + # Frances E. Allen, became the first female IBM Fellow in 1989. 
In 2006, she became the first female recipient of the ACM's Turing Award. https://en.wikipedia.org/wiki/Frances_E._Allen + 'allen', + + # June Almeida - Scottish virologist who took the first pictures of the rubella virus - https://en.wikipedia.org/wiki/June_Almeida + 'almeida', + + # Maria Gaetana Agnesi - Italian mathematician, philosopher, theologian and humanitarian. She was the first woman to write a mathematics handbook and the first woman appointed as a Mathematics Professor at a University. https://en.wikipedia.org/wiki/Maria_Gaetana_Agnesi + 'agnesi', + + # Archimedes was a physicist, engineer and mathematician who invented too many things to list them here. https://en.wikipedia.org/wiki/Archimedes + 'archimedes', + + # Maria Ardinghelli - Italian translator, mathematician and physicist - https://en.wikipedia.org/wiki/Maria_Ardinghelli + 'ardinghelli', + + # Aryabhata - Ancient Indian mathematician-astronomer during 476-550 CE https://en.wikipedia.org/wiki/Aryabhata + 'aryabhata', + + # Wanda Austin - Wanda Austin is the President and CEO of The Aerospace Corporation, a leading architect for the US security space programs. https://en.wikipedia.org/wiki/Wanda_Austin + 'austin', + + # Charles Babbage invented the concept of a programmable computer. https://en.wikipedia.org/wiki/Charles_Babbage. + 'babbage', + + # Stefan Banach - Polish mathematician, was one of the founders of modern functional analysis. https://en.wikipedia.org/wiki/Stefan_Banach + 'banach', + + # John Bardeen co-invented the transistor - https://en.wikipedia.org/wiki/John_Bardeen + 'bardeen', + + # Jean Bartik, born Betty Jean Jennings, was one of the original programmers for the ENIAC computer. https://en.wikipedia.org/wiki/Jean_Bartik + 'bartik', + + # Laura Bassi, the world's first female professor https://en.wikipedia.org/wiki/Laura_Bassi + 'bassi', + + # Hugh Beaver, British engineer, founder of the Guinness Book of World Records https://en.wikipedia.org/wiki/Hugh_Beaver + 'beaver', + + # Alexander Graham Bell - an eminent Scottish-born scientist, inventor, engineer and innovator who is credited with inventing the first practical telephone - https://en.wikipedia.org/wiki/Alexander_Graham_Bell + 'bell', + + # Karl Friedrich Benz - a German automobile engineer. Inventor of the first practical motorcar. https://en.wikipedia.org/wiki/Karl_Benz + 'benz', + + # Homi J Bhabha - was an Indian nuclear physicist, founding director, and professor of physics at the Tata Institute of Fundamental Research. Colloquially known as 'father of Indian nuclear programme'- https://en.wikipedia.org/wiki/Homi_J._Bhabha + 'bhabha', + + # Bhaskara II - Ancient Indian mathematician-astronomer whose work on calculus predates Newton and Leibniz by over half a millennium - https://en.wikipedia.org/wiki/Bh%C4%81skara_II#Calculus + 'bhaskara', + + # Elizabeth Blackwell - American doctor and first American woman to receive a medical degree - https://en.wikipedia.org/wiki/Elizabeth_Blackwell + 'blackwell', + + # Niels Bohr is the father of quantum theory. https://en.wikipedia.org/wiki/Niels_Bohr. + 'bohr', + + # Kathleen Booth, she's credited with writing the first assembly language. https://en.wikipedia.org/wiki/Kathleen_Booth + 'booth', + + # Anita Borg - Anita Borg was the founding director of the Institute for Women and Technology (IWT). https://en.wikipedia.org/wiki/Anita_Borg + 'borg', + + # Satyendra Nath Bose - He provided the foundation for Bose–Einstein statistics and the theory of the Bose–Einstein condensate. 
- https://en.wikipedia.org/wiki/Satyendra_Nath_Bose + 'bose', + + # Evelyn Boyd Granville - She was one of the first African-American woman to receive a Ph.D. in mathematics; she earned it in 1949 from Yale University. https://en.wikipedia.org/wiki/Evelyn_Boyd_Granville + 'boyd', + + # Brahmagupta - Ancient Indian mathematician during 598-670 CE who gave rules to compute with zero - https://en.wikipedia.org/wiki/Brahmagupta#Zero + 'brahmagupta', + + # Walter Houser Brattain co-invented the transistor - https://en.wikipedia.org/wiki/Walter_Houser_Brattain + 'brattain', + + # Emmett Brown invented time travel. https://en.wikipedia.org/wiki/Emmett_Brown (thanks Brian Goff) + 'brown', + + # Rachel Carson - American marine biologist and conservationist, her book Silent Spring and other writings are credited with advancing the global environmental movement. https://en.wikipedia.org/wiki/Rachel_Carson + 'carson', + + # Subrahmanyan Chandrasekhar - Astrophysicist known for his mathematical theory on different stages and evolution in structures of the stars. He has won nobel prize for physics - https://en.wikipedia.org/wiki/Subrahmanyan_Chandrasekhar + 'chandrasekhar', + + # Claude Shannon - The father of information theory and founder of digital circuit design theory. (https://en.wikipedia.org/wiki/Claude_Shannon) + 'shannon', + + # Joan Clarke - Bletchley Park code breaker during the Second World War who pioneered techniques that remained top secret for decades. Also an accomplished numismatist https://en.wikipedia.org/wiki/Joan_Clarke + 'clarke', + + # Jane Colden - American botanist widely considered the first female American botanist - https://en.wikipedia.org/wiki/Jane_Colden + 'colden', + + # Gerty Theresa Cori - American biochemist who became the third woman—and first American woman—to win a Nobel Prize in science, and the first woman to be awarded the Nobel Prize in Physiology or Medicine. Cori was born in Prague. https://en.wikipedia.org/wiki/Gerty_Cori + 'cori', + + # Seymour Roger Cray was an American electrical engineer and supercomputer architect who designed a series of computers that were the fastest in the world for decades. https://en.wikipedia.org/wiki/Seymour_Cray + 'cray', + + # This entry reflects a husband and wife team who worked together: + # Joan Curran was a Welsh scientist who developed radar and invented chaff, a radar countermeasure. https://en.wikipedia.org/wiki/Joan_Curran + # Samuel Curran was an Irish physicist who worked alongside his wife during WWII and invented the proximity fuse. https://en.wikipedia.org/wiki/Samuel_Curran + 'curran', + + # Marie Curie discovered radioactivity. https://en.wikipedia.org/wiki/Marie_Curie. + 'curie', + + # Charles Darwin established the principles of natural evolution. https://en.wikipedia.org/wiki/Charles_Darwin. + 'darwin', + + # Leonardo Da Vinci invented too many things to list here. https://en.wikipedia.org/wiki/Leonardo_da_Vinci. + 'davinci', + + # Edsger Wybe Dijkstra was a Dutch computer scientist and mathematical scientist. https://en.wikipedia.org/wiki/Edsger_W._Dijkstra. + 'dijkstra', + + # Donna Dubinsky - played an integral role in the development of personal digital assistants (PDAs) serving as CEO of Palm, Inc. and co-founding Handspring. https://en.wikipedia.org/wiki/Donna_Dubinsky + 'dubinsky', + + # Annie Easley - She was a leading member of the team which developed software for the Centaur rocket stage and one of the first African-Americans in her field. 
https://en.wikipedia.org/wiki/Annie_Easley + 'easley', + + # Thomas Alva Edison, prolific inventor https://en.wikipedia.org/wiki/Thomas_Edison + 'edison', + + # Albert Einstein invented the general theory of relativity. https://en.wikipedia.org/wiki/Albert_Einstein + 'einstein', + + # Gertrude Elion - American biochemist, pharmacologist and the 1988 recipient of the Nobel Prize in Medicine - https://en.wikipedia.org/wiki/Gertrude_Elion + 'elion', + + # Douglas Engelbart gave the mother of all demos: https://en.wikipedia.org/wiki/Douglas_Engelbart + 'engelbart', + + # Euclid invented geometry. https://en.wikipedia.org/wiki/Euclid + 'euclid', + + # Leonhard Euler invented large parts of modern mathematics. https://de.wikipedia.org/wiki/Leonhard_Euler + 'euler', + + # Pierre de Fermat pioneered several aspects of modern mathematics. https://en.wikipedia.org/wiki/Pierre_de_Fermat + 'fermat', + + # Enrico Fermi invented the first nuclear reactor. https://en.wikipedia.org/wiki/Enrico_Fermi. + 'fermi', + + # Richard Feynman was a key contributor to quantum mechanics and particle physics. https://en.wikipedia.org/wiki/Richard_Feynman + 'feynman', + + # Benjamin Franklin is famous for his experiments in electricity and the invention of the lightning rod. + 'franklin', + + # Galileo was a founding father of modern astronomy, and faced politics and obscurantism to establish scientific truth. https://en.wikipedia.org/wiki/Galileo_Galilei + 'galileo', + + # William Henry 'Bill' Gates III is an American business magnate, philanthropist, investor, computer programmer, and inventor. https://en.wikipedia.org/wiki/Bill_Gates + 'gates', + + # Adele Goldberg, was one of the designers and developers of the Smalltalk language. https://en.wikipedia.org/wiki/Adele_Goldberg_(computer_scientist) + 'goldberg', + + # Adele Goldstine, born Adele Katz, wrote the complete technical description for the first electronic digital computer, ENIAC. https://en.wikipedia.org/wiki/Adele_Goldstine + 'goldstine', + + # Shafi Goldwasser is a computer scientist known for creating theoretical foundations of modern cryptography. Winner of 2012 ACM Turing Award. https://en.wikipedia.org/wiki/Shafi_Goldwasser + 'goldwasser', + + # James Golick, all around gangster. + 'golick', + + # Jane Goodall - British primatologist, ethologist, and anthropologist who is considered to be the world's foremost expert on chimpanzees - https://en.wikipedia.org/wiki/Jane_Goodall + 'goodall', + + # Lois Haibt - American computer scientist, part of the team at IBM that developed FORTRAN - https://en.wikipedia.org/wiki/Lois_Haibt + 'haibt', + + # Margaret Hamilton - Director of the Software Engineering Division of the MIT Instrumentation Laboratory, which developed on-board flight software for the Apollo space program. https://en.wikipedia.org/wiki/Margaret_Hamilton_(scientist) + 'hamilton', + + # Stephen Hawking pioneered the field of cosmology by combining general relativity and quantum mechanics. https://en.wikipedia.org/wiki/Stephen_Hawking + 'hawking', + + # Werner Heisenberg was a founding father of quantum mechanics. https://en.wikipedia.org/wiki/Werner_Heisenberg + 'heisenberg', + + # Grete Hermann was a German philosopher noted for her philosophical work on the foundations of quantum mechanics. https://en.wikipedia.org/wiki/Grete_Hermann + 'hermann', + + # Jaroslav Heyrovský was the inventor of the polarographic method, father of the electroanalytical method, and recipient of the Nobel Prize in 1959. His main field of work was polarography. 
https://en.wikipedia.org/wiki/Jaroslav_Heyrovsk%C3%BD + 'heyrovsky', + + # Dorothy Hodgkin was a British biochemist, credited with the development of protein crystallography. She was awarded the Nobel Prize in Chemistry in 1964. https://en.wikipedia.org/wiki/Dorothy_Hodgkin + 'hodgkin', + + # Erna Schneider Hoover revolutionized modern communication by inventing a computerized telephone switching method. https://en.wikipedia.org/wiki/Erna_Schneider_Hoover + 'hoover', + + # Grace Hopper developed the first compiler for a computer programming language and is credited with popularizing the term 'debugging' for fixing computer glitches. https://en.wikipedia.org/wiki/Grace_Hopper + 'hopper', + + # Frances Hugle, she was an American scientist, engineer, and inventor who contributed to the understanding of semiconductors, integrated circuitry, and the unique electrical principles of microscopic materials. https://en.wikipedia.org/wiki/Frances_Hugle + 'hugle', + + # Hypatia - Greek Alexandrine Neoplatonist philosopher in Egypt who was one of the earliest mothers of mathematics - https://en.wikipedia.org/wiki/Hypatia + 'hypatia', + + # Mary Jackson, American mathematician and aerospace engineer who earned the highest title within NASA's engineering department - https://en.wikipedia.org/wiki/Mary_Jackson_(engineer) + 'jackson', + + # Yeong-Sil Jang was a Korean scientist and astronomer during the Joseon Dynasty; he invented the first metal printing press and water gauge. https://en.wikipedia.org/wiki/Jang_Yeong-sil + 'jang', + + # Betty Jennings - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Jean_Bartik + 'jennings', + + # Mary Lou Jepsen, was the founder and chief technology officer of One Laptop Per Child (OLPC), and the founder of Pixel Qi. https://en.wikipedia.org/wiki/Mary_Lou_Jepsen + 'jepsen', + + # Katherine Coleman Goble Johnson - American physicist and mathematician contributed to the NASA. https://en.wikipedia.org/wiki/Katherine_Johnson + 'johnson', + + # Irène Joliot-Curie - French scientist who was awarded the Nobel Prize for Chemistry in 1935. Daughter of Marie and Pierre Curie. https://en.wikipedia.org/wiki/Ir%C3%A8ne_Joliot-Curie + 'joliot', + + # Karen Spärck Jones came up with the concept of inverse document frequency, which is used in most search engines today. https://en.wikipedia.org/wiki/Karen_Sp%C3%A4rck_Jones + 'jones', + + # A. P. J. Abdul Kalam - is an Indian scientist aka Missile Man of India for his work on the development of ballistic missile and launch vehicle technology - https://en.wikipedia.org/wiki/A._P._J._Abdul_Kalam + 'kalam', + + # Susan Kare, created the icons and many of the interface elements for the original Apple Macintosh in the 1980s, and was an original employee of NeXT, working as the Creative Director. https://en.wikipedia.org/wiki/Susan_Kare + 'kare', + + # Mary Kenneth Keller, Sister Mary Kenneth Keller became the first American woman to earn a PhD in Computer Science in 1965. https://en.wikipedia.org/wiki/Mary_Kenneth_Keller + 'keller', + + # Johannes Kepler, German astronomer known for his three laws of planetary motion - https://en.wikipedia.org/wiki/Johannes_Kepler + 'kepler', + + # Har Gobind Khorana - Indian-American biochemist who shared the 1968 Nobel Prize for Physiology - https://en.wikipedia.org/wiki/Har_Gobind_Khorana + 'khorana', + + # Jack Kilby invented silicone integrated circuits and gave Silicon Valley its name. 
- https://en.wikipedia.org/wiki/Jack_Kilby + 'kilby', + + # Maria Kirch - German astronomer and first woman to discover a comet - https://en.wikipedia.org/wiki/Maria_Margarethe_Kirch + 'kirch', + + # Donald Knuth - American computer scientist, author of 'The Art of Computer Programming' and creator of the TeX typesetting system. https://en.wikipedia.org/wiki/Donald_Knuth + 'knuth', + + # Sophie Kowalevski - Russian mathematician responsible for important original contributions to analysis, differential equations and mechanics - https://en.wikipedia.org/wiki/Sofia_Kovalevskaya + 'kowalevski', + + # Marie-Jeanne de Lalande - French astronomer, mathematician and cataloguer of stars - https://en.wikipedia.org/wiki/Marie-Jeanne_de_Lalande + 'lalande', + + # Hedy Lamarr - Actress and inventor. The principles of her work are now incorporated into modern Wi-Fi, CDMA and Bluetooth technology. https://en.wikipedia.org/wiki/Hedy_Lamarr + 'lamarr', + + # Leslie B. Lamport - American computer scientist. Lamport is best known for his seminal work in distributed systems and was the winner of the 2013 Turing Award. https://en.wikipedia.org/wiki/Leslie_Lamport + 'lamport', + + # Mary Leakey - British paleoanthropologist who discovered the first fossilized Proconsul skull - https://en.wikipedia.org/wiki/Mary_Leakey + 'leakey', + + # Henrietta Swan Leavitt - she was an American astronomer who discovered the relation between the luminosity and the period of Cepheid variable stars. https://en.wikipedia.org/wiki/Henrietta_Swan_Leavitt + 'leavitt', + + # Daniel Lewin - Mathematician, Akamai co-founder, soldier, 9/11 victim-- Developed optimization techniques for routing traffic on the internet. Died attempting to stop the 9-11 hijackers. https://en.wikipedia.org/wiki/Daniel_Lewin + 'lewin', + + # Ruth Lichterman - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Ruth_Teitelbaum + 'lichterman', + + # Barbara Liskov - co-developed the Liskov substitution principle. Liskov was also the winner of the Turing Prize in 2008. - https://en.wikipedia.org/wiki/Barbara_Liskov + 'liskov', + + # Ada Lovelace invented the first algorithm. https://en.wikipedia.org/wiki/Ada_Lovelace (thanks James Turnbull) + 'lovelace', + + # Auguste and Louis Lumière - the first filmmakers in history - https://en.wikipedia.org/wiki/Auguste_and_Louis_Lumi%C3%A8re + 'lumiere', + + # Mahavira - Ancient Indian mathematician during 9th century AD who discovered basic algebraic identities - https://en.wikipedia.org/wiki/Mah%C4%81v%C4%ABra_(mathematician) + 'mahavira', + + # Maria Mayer - American theoretical physicist and Nobel laureate in Physics for proposing the nuclear shell model of the atomic nucleus - https://en.wikipedia.org/wiki/Maria_Mayer + 'mayer', + + # John McCarthy invented LISP: https://en.wikipedia.org/wiki/John_McCarthy_(computer_scientist) + 'mccarthy', + + # Barbara McClintock - a distinguished American cytogeneticist, 1983 Nobel Laureate in Physiology or Medicine for discovering transposons. https://en.wikipedia.org/wiki/Barbara_McClintock + 'mcclintock', + + # Malcolm McLean invented the modern shipping container: https://en.wikipedia.org/wiki/Malcom_McLean + 'mclean', + + # Kay McNulty - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Kathleen_Antonelli + 'mcnulty', + + # Lise Meitner - Austrian/Swedish physicist who was involved in the discovery of nuclear fission. 
The element meitnerium is named after her - https://en.wikipedia.org/wiki/Lise_Meitner + 'meitner', + + # Carla Meninsky, was the game designer and programmer for Atari 2600 games Dodge 'Em and Warlords. https://en.wikipedia.org/wiki/Carla_Meninsky + 'meninsky', + + # Johanna Mestorf - German prehistoric archaeologist and first female museum director in Germany - https://en.wikipedia.org/wiki/Johanna_Mestorf + 'mestorf', + + # Marvin Minsky - Pioneer in Artificial Intelligence, co-founder of the MIT's AI Lab, won the Turing Award in 1969. https://en.wikipedia.org/wiki/Marvin_Minsky + 'minsky', + + # Maryam Mirzakhani - an Iranian mathematician and the first woman to win the Fields Medal. https://en.wikipedia.org/wiki/Maryam_Mirzakhani + 'mirzakhani', + + # Samuel Morse - contributed to the invention of a single-wire telegraph system based on European telegraphs and was a co-developer of the Morse code - https://en.wikipedia.org/wiki/Samuel_Morse + 'morse', + + # Ian Murdock - founder of the Debian project - https://en.wikipedia.org/wiki/Ian_Murdock + 'murdock', + + # John von Neumann - todays computer architectures are based on the von Neumann architecture. https://en.wikipedia.org/wiki/Von_Neumann_architecture + 'neumann', + + # Isaac Newton invented classic mechanics and modern optics. https://en.wikipedia.org/wiki/Isaac_Newton + 'newton', + + # Florence Nightingale, more prominently known as a nurse, was also the first female member of the Royal Statistical Society and a pioneer in statistical graphics https://en.wikipedia.org/wiki/Florence_Nightingale#Statistics_and_sanitary_reform + 'nightingale', + + # Alfred Nobel - a Swedish chemist, engineer, innovator, and armaments manufacturer (inventor of dynamite) - https://en.wikipedia.org/wiki/Alfred_Nobel + 'nobel', + + # Emmy Noether, German mathematician. Noether's Theorem is named after her. https://en.wikipedia.org/wiki/Emmy_Noether + 'noether', + + # Poppy Northcutt. Poppy Northcutt was the first woman to work as part of NASA’s Mission Control. http://www.businessinsider.com/poppy-northcutt-helped-apollo-astronauts-2014-12?op=1 + 'northcutt', + + # Robert Noyce invented silicone integrated circuits and gave Silicon Valley its name. - https://en.wikipedia.org/wiki/Robert_Noyce + 'noyce', + + # Panini - Ancient Indian linguist and grammarian from 4th century CE who worked on the world's first formal system - https://en.wikipedia.org/wiki/P%C4%81%E1%B9%87ini#Comparison_with_modern_formal_systems + 'panini', + + # Ambroise Pare invented modern surgery. https://en.wikipedia.org/wiki/Ambroise_Par%C3%A9 + 'pare', + + # Louis Pasteur discovered vaccination, fermentation and pasteurization. https://en.wikipedia.org/wiki/Louis_Pasteur. + 'pasteur', + + # Cecilia Payne-Gaposchkin was an astronomer and astrophysicist who, in 1925, proposed in her Ph.D. thesis an explanation for the composition of stars in terms of the relative abundances of hydrogen and helium. https://en.wikipedia.org/wiki/Cecilia_Payne-Gaposchkin + 'payne', + + # Radia Perlman is a software designer and network engineer and most famous for her invention of the spanning-tree protocol (STP). https://en.wikipedia.org/wiki/Radia_Perlman + 'perlman', + + # Rob Pike was a key contributor to Unix, Plan 9, the X graphic system, utf-8, and the Go programming language. https://en.wikipedia.org/wiki/Rob_Pike + 'pike', + + # Henri Poincaré made fundamental contributions in several fields of mathematics. 
https://en.wikipedia.org/wiki/Henri_Poincar%C3%A9 + 'poincare', + + # Laura Poitras is a director and producer whose work, made possible by open source crypto tools, advances the causes of truth and freedom of information by reporting disclosures by whistleblowers such as Edward Snowden. https://en.wikipedia.org/wiki/Laura_Poitras + 'poitras', + + # Claudius Ptolemy - a Greco-Egyptian writer of Alexandria, known as a mathematician, astronomer, geographer, astrologer, and poet of a single epigram in the Greek Anthology - https://en.wikipedia.org/wiki/Ptolemy + 'ptolemy', + + # C. V. Raman - Indian physicist who won the Nobel Prize in 1930 for proposing the Raman effect. - https://en.wikipedia.org/wiki/C._V._Raman + 'raman', + + # Srinivasa Ramanujan - Indian mathematician and autodidact who made extraordinary contributions to mathematical analysis, number theory, infinite series, and continued fractions. - https://en.wikipedia.org/wiki/Srinivasa_Ramanujan + 'ramanujan', + + # Sally Kristen Ride was an American physicist and astronaut. She was the first American woman in space, and the youngest American astronaut. https://en.wikipedia.org/wiki/Sally_Ride + 'ride', + + # Rita Levi-Montalcini - Won Nobel Prize in Physiology or Medicine jointly with colleague Stanley Cohen for the discovery of nerve growth factor (https://en.wikipedia.org/wiki/Rita_Levi-Montalcini) + 'montalcini', + + # Dennis Ritchie - co-creator of UNIX and the C programming language. - https://en.wikipedia.org/wiki/Dennis_Ritchie + 'ritchie', + + # Wilhelm Conrad Röntgen - German physicist who was awarded the first Nobel Prize in Physics in 1901 for the discovery of X-rays (Röntgen rays). https://en.wikipedia.org/wiki/Wilhelm_R%C3%B6ntgen + 'roentgen', + + # Rosalind Franklin - British biophysicist and X-ray crystallographer whose research was critical to the understanding of DNA - https://en.wikipedia.org/wiki/Rosalind_Franklin + 'rosalind', + + # Meghnad Saha - Indian astrophysicist best known for his development of the Saha equation, used to describe chemical and physical conditions in stars - https://en.wikipedia.org/wiki/Meghnad_Saha + 'saha', + + # Jean E. Sammet developed FORMAC, the first widely used computer language for symbolic manipulation of mathematical formulas. https://en.wikipedia.org/wiki/Jean_E._Sammet + 'sammet', + + # Carol Shaw - Originally an Atari employee, Carol Shaw is said to be the first female video game designer. https://en.wikipedia.org/wiki/Carol_Shaw_(video_game_designer) + 'shaw', + + # Dame Stephanie 'Steve' Shirley - Founded a software company in 1962 employing women working from home. https://en.wikipedia.org/wiki/Steve_Shirley + 'shirley', + + # William Shockley co-invented the transistor - https://en.wikipedia.org/wiki/William_Shockley + 'shockley', + + # Françoise Barré-Sinoussi - French virologist and Nobel Prize Laureate in Physiology or Medicine; her work was fundamental in identifying HIV as the cause of AIDS. https://en.wikipedia.org/wiki/Fran%C3%A7oise_Barr%C3%A9-Sinoussi + 'sinoussi', + + # Betty Snyder - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Betty_Holberton + 'snyder', + + # Frances Spence - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Frances_Spence + 'spence', + + # Richard Matthew Stallman - the founder of the Free Software movement, the GNU project, the Free Software Foundation, and the League for Programming Freedom. 
He also invented the concept of copyleft to protect the ideals of this movement, and enshrined this concept in the widely-used GPL (General Public License) for software. https://en.wikiquote.org/wiki/Richard_Stallman + 'stallman', + + # Michael Stonebraker is a database research pioneer and architect of Ingres, Postgres, VoltDB and SciDB. Winner of 2014 ACM Turing Award. https://en.wikipedia.org/wiki/Michael_Stonebraker + 'stonebraker', + + # Janese Swanson (with others) developed the first of the Carmen Sandiego games. She went on to found Girl Tech. https://en.wikipedia.org/wiki/Janese_Swanson + 'swanson', + + # Aaron Swartz was influential in creating RSS, Markdown, Creative Commons, Reddit, and much of the internet as we know it today. He was devoted to freedom of information on the web. https://en.wikiquote.org/wiki/Aaron_Swartz + 'swartz', + + # Bertha Swirles was a theoretical physicist who made a number of contributions to early quantum theory. https://en.wikipedia.org/wiki/Bertha_Swirles + 'swirles', + + # Nikola Tesla invented the AC electric system and every gadget ever used by a James Bond villain. https://en.wikipedia.org/wiki/Nikola_Tesla + 'tesla', + + # Ken Thompson - co-creator of UNIX and the C programming language - https://en.wikipedia.org/wiki/Ken_Thompson + 'thompson', + + # Linus Torvalds invented Linux and Git. https://en.wikipedia.org/wiki/Linus_Torvalds + 'torvalds', + + # Alan Turing was a founding father of computer science. https://en.wikipedia.org/wiki/Alan_Turing. + 'turing', + + # Varahamihira - Ancient Indian mathematician who discovered trigonometric formulae during 505-587 CE - https://en.wikipedia.org/wiki/Var%C4%81hamihira#Contributions + 'varahamihira', + + # Sir Mokshagundam Visvesvaraya - is a notable Indian engineer. He is a recipient of the Indian Republic's highest honour, the Bharat Ratna, in 1955. On his birthday, 15 September is celebrated as Engineer's Day in India in his memory - https://en.wikipedia.org/wiki/Visvesvaraya + 'visvesvaraya', + + # Christiane Nüsslein-Volhard - German biologist, won Nobel Prize in Physiology or Medicine in 1995 for research on the genetic control of embryonic development. https://en.wikipedia.org/wiki/Christiane_N%C3%BCsslein-Volhard + 'volhard', + + # Marlyn Wescoff - one of the original programmers of the ENIAC. https://en.wikipedia.org/wiki/ENIAC - https://en.wikipedia.org/wiki/Marlyn_Meltzer + 'wescoff', + + # Andrew Wiles - Notable British mathematician who proved the enigmatic Fermat's Last Theorem - https://en.wikipedia.org/wiki/Andrew_Wiles + 'wiles', + + # Roberta Williams, did pioneering work in graphical adventure games for personal computers, particularly the King's Quest series. https://en.wikipedia.org/wiki/Roberta_Williams + 'williams', + + # Sophie Wilson designed the first Acorn Micro-Computer and the instruction set for ARM processors. https://en.wikipedia.org/wiki/Sophie_Wilson + 'wilson', + + # Jeannette Wing - co-developed the Liskov substitution principle. - https://en.wikipedia.org/wiki/Jeannette_Wing + 'wing', + + # Steve Wozniak invented the Apple I and Apple II. 
https://en.wikipedia.org/wiki/Steve_Wozniak + 'wozniak', + + # The Wright brothers, Orville and Wilbur - credited with inventing and building the world's first successful airplane and making the first controlled, powered and sustained heavier-than-air human flight - https://en.wikipedia.org/wiki/Wright_brothers + 'wright', + + # Rosalyn Sussman Yalow - an American medical physicist and co-winner of the 1977 Nobel Prize in Physiology or Medicine for the development of the radioimmunoassay technique. https://en.wikipedia.org/wiki/Rosalyn_Sussman_Yalow + 'yalow', + + # Ada Yonath - an Israeli crystallographer, the first woman from the Middle East to win a Nobel Prize in the sciences. https://en.wikipedia.org/wiki/Ada_Yonath + 'yonath', +] + + +def get_random_name(sep='_'): + r = random.SystemRandom() + while True: + name = '%s%s%s' % (r.choice(left), sep, r.choice(right)) + if name == 'boring' + sep + 'wozniak': # Steve Wozniak is not boring + continue + return name + + +def id_generator(n=10): + return ''.join(random.SystemRandom().choice(string.ascii_lowercase + string.digits) for _ in range(n)) + + +if __name__ == '__main__': + print(get_random_name()) diff --git a/mrunner/utils/neptune.py b/mrunner/utils/neptune.py new file mode 100644 index 0000000..fb70614 --- /dev/null +++ b/mrunner/utils/neptune.py @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- +import logging +from collections import namedtuple +from copy import copy +from distutils.version import LooseVersion + +import six +try: + from deepsense import version as neptune_version +except ImportError: + from neptune import version as neptune_version + +LOGGER = logging.getLogger(__name__) + +NEPTUNE_LOCAL_VERSION = LooseVersion(neptune_version.__version__) + + +class NeptuneConfigFileBase(object): + # the set of attributes is forced by the neptune v1 yaml format + + class Parameter(namedtuple('Parameter', 'name type default required')): + + @staticmethod + def create(name, value): + if isinstance(value, bool): + param_type = 'boolean' + elif isinstance(value, int): + param_type = 'int' + elif isinstance(value, float): + param_type = 'double' + else: + param_type = 'string' + value = str(value) + return NeptuneConfigFileBase.Parameter(name, param_type, value, False) + + def __init__(self, project, name, parameters, tags=None, description=None, **kwargs): + self._project = project + self._name = name + self._parameters = [self.Parameter.create(k, v) for k, v in parameters.items()] + self._description = description + self._tags = tags + self._exclude = kwargs.get('exclude', []) + + def dump(self, fh): + import yaml + yaml.dump(self._format_data(), fh, default_flow_style=False) + + def _format_data(self): + raise NotImplementedError() + + +class NeptuneConfigFileV1(NeptuneConfigFileBase): + # see http://neptune-docs.deepsense.codilime.com/versions/1.6/reference-guides/cli.html#configuration-files + def _format_data(self): + def format_parameter(p): + return dict(p._asdict()) + + data = { + 'name': self._name, + 'project': self._project, + 'parameters': [format_parameter(p) for p in self._parameters] + } + if self._description: + data['description'] = self._description + if self._tags: + data['tags'] = self._tags + if self._exclude: + data['exclude'] = self._exclude + return data + + +class NeptuneConfigFileV2(NeptuneConfigFileBase): + # see: https://docs.neptune.ml/cli/config/ + def _format_data(self): + def format_parameter(p): + return p.default + + data = { + 'open-webbrowser': False, + 'name': self._name, + 'project': self._project, + 'parameters':
{p.name: format_parameter(p) for p in self._parameters} + } + if self._description: + data['description'] = self._description + if self._tags: + data['tags'] = self._tags + if self._exclude: + data['exclude'] = self._exclude + return data + + +def load_neptune_config(neptune_config_path): + from deepsense.neptune.common.config import neptune_config + + global_config = neptune_config.load_global_config() + local_config = neptune_config.load_local_config(neptune_config_path) if neptune_config_path else {} + config = neptune_config.NeptuneConfig(global_config=global_config, local_config=local_config) + + if len(config.name) > 16: + raise ValueError('Neptune config "name" key (experiment name) must be at most 16 characters long') + + config_dict = config.config_dict + + def _rename(d, k1, k2): + if k1 in d: + d[k2] = copy(d[k1]) + del d[k1] + + _rename(config_dict, 'host', 'neptune_host') + _rename(config_dict, 'port', 'neptune_port') + _rename(config_dict, 'username', 'neptune_username') + _rename(config_dict, 'password', 'neptune_password') + _rename(config_dict, 'paths-to-copy', 'paths_to_copy') + + return config_dict + + +class NeptuneWrapperCmd(object): + + def __init__(self, cmd, experiment_config_path, neptune_storage=None, additional_tags=None, paths_to_dump=None, + docker_image=None): + self._cmd = cmd + self._experiment_config_path = experiment_config_path + self._additional_tags = additional_tags + self._storage = neptune_storage + self._paths_to_dump = paths_to_dump + self._docker_image = docker_image + + @property + def command(self): + cmd = self._cmd.split(' ') if isinstance(self._cmd, six.string_types) else self._cmd + while cmd[0].startswith('python'): + cmd = cmd[1:] + + base_argv = ['neptune', 'run', '--config', str(self._experiment_config_path)] + if NEPTUNE_LOCAL_VERSION.version[0] == 1: + storage_argv = ['--storage', self._storage] if self._storage else [] + tags_argv = ['--tags'] + self._additional_tags if self._additional_tags else [] + # in v2 this is handled by the 'exclude' config key instead + dump_argv = ['--paths-to-copy'] + self._paths_to_dump if self._paths_to_dump else [] + cmd = [cmd[0], '--'] + cmd[1:] + else: + storage_argv = [] + tags_argv = [] + for tag in (self._additional_tags or []): + tags_argv.extend(['--tag', tag]) + dump_argv = [] + docker_argv = ['--docker-image', self._docker_image] if self._docker_image else [] + + cmd = base_argv + storage_argv + tags_argv + dump_argv + docker_argv + cmd + return ' '.join(cmd) + + @property + def env(self): + # set up env variables carrying the neptune connection config + neptune_env = {} + if NEPTUNE_LOCAL_VERSION.version[0] == 1: + # neptune connection config is required because experiments from different neptune accounts may be + # started from the same system account + config = self.conf + assert config['username'], "Use ~/.neptune.yaml to set up credentials" + assert config['password'], "Use ~/.neptune.yaml to set up credentials" + + neptune_env = {'NEPTUNE_PASSWORD': str(config['password']), 'NEPTUNE_USER': str(config['username'])} + if 'host' in config: + neptune_env['NEPTUNE_HOST'] = str(config['host']) + if 'port' in config: + neptune_env['NEPTUNE_PORT'] = str(config['port']) + + # TODO: [PZ] because neptune env vars are set, maybe it is not required to set additional env var?
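+ # illustrative only (not part of this changeset): for v1, assuming credentials in ~/.neptune.yaml, + # the result looks roughly like {'NEPTUNE_USER': 'user@example.com', 'NEPTUNE_PASSWORD': '...', 'MRUNNER_UNDER_NEPTUNE': '1'}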
+ neptune_env.update({'MRUNNER_UNDER_NEPTUNE': '1'}) + return neptune_env + + @property + def conf(self): + """Extracts neptune configuration from global config""" + try: + from deepsense.neptune.common.config import neptune_config + global_config = neptune_config.load_global_config() + # loads also NEPTUNE_* env vars + return neptune_config.NeptuneConfig(global_config=global_config) + except ImportError: + raise RuntimeError('Install neptune-cli first and configure connection') diff --git a/mrunner/utils/utils.py b/mrunner/utils/utils.py new file mode 100644 index 0000000..66982c1 --- /dev/null +++ b/mrunner/utils/utils.py @@ -0,0 +1,118 @@ +import datetime +import logging +from collections import namedtuple, OrderedDict +from tempfile import NamedTemporaryFile + +import attr +from jinja2 import Environment, PackageLoader, StrictUndefined +from path import Path + +from mrunner.utils.namesgenerator import id_generator + +LOGGER = logging.getLogger(__name__) + + +def get_experiment_dirname(): + return datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '_' + id_generator(4) + + +def parse_argv(parser, argv): + try: + divider_pos = argv.index('--') + mrunner_argv = argv[1:divider_pos] + rest_argv = argv[divider_pos + 1:] + except ValueError: + # when the '--' separator is missing, everything after the program name belongs to mrunner + mrunner_argv = argv[1:] + rest_argv = [] + return parser.parse_args(args=mrunner_argv), rest_argv + + +template_env = Environment( + loader=PackageLoader('mrunner', 'templates'), + undefined=StrictUndefined +) + + +class TempFile(object): + + def __init__(self, dir=None): + self._file = NamedTemporaryFile(prefix='mrunner_', dir=dir) + + def write(self, payload): + self._file.write(payload) + self._file.flush() + + @property + def path(self): + return Path(self._file.name) + + +class GeneratedTemplateFile(TempFile): + + def __init__(self, template_filename=None, **kwargs): + super(GeneratedTemplateFile, self).__init__() + template = template_env.get_template(template_filename) + payload = template.render(**kwargs).encode(encoding='utf-8') + self.write(payload) + + +PathToDump = namedtuple('PathToDump', 'local_path rel_remote_path') + + +def get_paths_to_copy(paths_to_copy=None, exclude=None): + """Lists paths to copy from the current working directory, skipping anything from the exclude list; + entries from paths_to_copy are added on top""" + + if paths_to_copy is None: + paths_to_copy = [] + if exclude is None: + exclude = ['.git', '.gitignore', '.gitmodules'] + exclude = [Path(e).abspath() for e in exclude] + + def _list_dir(d): + paths = [] + for p in Path(d).listdir(): + p = p.abspath() + excluded = False + for e in exclude: + if not e.relpath(p).startswith('..'): + excluded = True + # the excluded path lies deeper inside p: descend, so p's other children are still copied + if e != p: + paths += _list_dir(p) + break + if not excluded: + paths.append(PathToDump(p.relpath('.'), p.relpath('.'))) + return paths + + result = _list_dir(Path('.')) + for external in paths_to_copy: + if ':' in external: + src, rel_dst = external.split(':') + else: + src = external + # make dst relative to cwd: split on '/' and drop empty and relative ('..') parts + rel_dst = '/'.join([item for item in Path(external).relpath('.').splitall() if item and item != '..']) + result.append(PathToDump(Path(src).relpath('.'), Path(rel_dst).relpath('.'))) + + result = set(result) + LOGGER.debug('get_paths_to_copy(paths_to_copy={}, exclude={}) result={}'.format( + paths_to_copy, exclude, [str(s) for s, d in result] + )) + return result + + +def make_attr_class(class_name,
fields, **class_kwargs): + fields = OrderedDict([(k, attr.ib(**kwargs) if isinstance(kwargs, dict) else kwargs) for k, kwargs in fields]) + return attr.make_class(class_name, fields, **class_kwargs) + + +def filter_only_attr(AttrClass, d): + available_fields = [f.name for f in attr.fields(AttrClass)] + for k, v in d.items(): + if k in available_fields: + continue + LOGGER.debug('Ignoring argument {}={}'.format(k, v)) + return {k: v for k, v in d.items() if k in available_fields} diff --git a/parser_to_yaml.py b/parser_to_yaml.py deleted file mode 100644 index 237ffd7..0000000 --- a/parser_to_yaml.py +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env python -import sys - -from mrunner import parser_to_yaml - -config_path = sys.argv[1] - -print parser_to_yaml(config_path) diff --git a/setup.py b/setup.py index 566bea4..dbd0868 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,15 @@ from setuptools import setup, find_packages + setup( name='mrunner', - version='0.0.15', + version='0.2.1', packages=find_packages(), - install_requires=['PyYAML', 'fabric'], + include_package_data=True, + install_requires=['PyYAML', 'fabric3', 'path.py', 'jinja2', 'six', 'attrs>=17.3', 'click', + 'docker', 'kubernetes>=5.0.0', 'google-cloud'], entry_points={ 'console_scripts': [ - 'mrunner_local=mrunner.mrunner_cli:main', - 'mrunner_plgrid=mrunner.mrunner_plgrid_cli:main', - 'mrunner_kube=mrunner.mrunner_kubernetes_cli:main', - 'command_gen=mrunner.command_gen:main', + 'mrunner=mrunner.cli.mrunner_cli:cli' ], }, ) diff --git a/template/config.py b/template/config.py deleted file mode 100644 index 1d6bd96..0000000 --- a/template/config.py +++ /dev/null @@ -1,7 +0,0 @@ -import argparse - -def create_parser(parser=None): - if parser is None: - parser = argparse.ArgumentParser(description='') - return parser - diff --git a/template/main.py b/template/main.py deleted file mode 100644 index a6dc20a..0000000 --- a/template/main.py +++ /dev/null @@ -1,6 +0,0 @@ -import sys - -from mrunner.template import create_parser - - - diff --git a/tests/kubernetes_no_gpu/Dockerfile b/tests/kubernetes_no_gpu/Dockerfile deleted file mode 100644 index 978022a..0000000 --- a/tests/kubernetes_no_gpu/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM ubuntu:16.04 -RUN echo 'test' -RUN apt-get update -RUN apt-get -y install python-pip iputils-ping -RUN mkdir -p /mnt/mhome diff --git a/tests/kubernetes_no_gpu/build_docker_cpascal.sh b/tests/kubernetes_no_gpu/build_docker_cpascal.sh deleted file mode 100755 index 23cd089..0000000 --- a/tests/kubernetes_no_gpu/build_docker_cpascal.sh +++ /dev/null @@ -1 +0,0 @@ -docker build -t cpascal:5001/mrunner_test:kubernetes_no_gpu . diff --git a/tests/kubernetes_no_gpu/build_docker_local.sh b/tests/kubernetes_no_gpu/build_docker_local.sh deleted file mode 100755 index df50c68..0000000 --- a/tests/kubernetes_no_gpu/build_docker_local.sh +++ /dev/null @@ -1 +0,0 @@ -docker build -t mrunner_test:kubernetes_no_gpu . 
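A minimal sketch of how the new make_attr_class/filter_only_attr helpers from mrunner/utils/utils.py above are meant to compose; the Ctx class and its fields are illustrative only, not part of this changeset:

    from mrunner.utils.utils import make_attr_class, filter_only_attr

    # build an attrs class from (name, attr.ib kwargs) pairs
    Ctx = make_attr_class('Ctx', [('storage', dict(default='')), ('partition', dict(default=None))])
    # keys the class does not declare are logged and dropped
    kwargs = filter_only_attr(Ctx, {'storage': '/tmp/storage', 'unknown': 1})
    ctx = Ctx(**kwargs)
    assert ctx.storage == '/tmp/storage'

filter_only_attr makes it safe to feed a loosely-typed config dict straight into the generated class constructor.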
diff --git a/tests/kubernetes_no_gpu/main.py b/tests/kubernetes_no_gpu/main.py deleted file mode 100644 index 600444a..0000000 --- a/tests/kubernetes_no_gpu/main.py +++ /dev/null @@ -1,6 +0,0 @@ -import time -import sys -import tensorflow -print(10) -time.sleep(10000) -sys.stdout.flush() diff --git a/tests/neptune_test.py b/tests/neptune_test.py new file mode 100644 index 0000000..7dde491 --- /dev/null +++ b/tests/neptune_test.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +import tempfile +import unittest + +import six +import yaml +from path import tempdir + +from mrunner.cmd import Cmd +from mrunner.utils.neptune import NeptuneWrapperCmd, NeptuneConfigFileV1, load_neptune_config, NeptuneConfigFileV2 + + +class NeptuneWrapperCmdTestCase(unittest.TestCase): + + def test_cmd_generation(self): + self.assertEqual(NeptuneWrapperCmd('python exp1.py --param1 value1 -f', experiment_config_path='foo.yaml', + neptune_storage='/tmp/storage').command, + 'neptune run exp1.py --config foo.yaml --storage /tmp/storage -- --param1 value1 -f') + self.assertEqual(NeptuneWrapperCmd('./exp1.py --param1 value1 -f', experiment_config_path='foo.yaml', + neptune_storage='/tmp/storage').command, + 'neptune run ./exp1.py --config foo.yaml --storage /tmp/storage -- --param1 value1 -f') + self.assertEqual(NeptuneWrapperCmd('./exp1.py --param1 value1 -f', experiment_config_path='foo.yaml', + additional_tags=['tag1', 'tag2', 'tag3'], + neptune_storage='/tmp/storage').command, + 'neptune run ./exp1.py --config foo.yaml --storage /tmp/storage ' + '--tags tag1 tag2 tag3 -- --param1 value1 -f') + + def test_env_generation(self): + with tempfile.NamedTemporaryFile(suffix='.yaml') as config_file: + config_file.write(yaml.dump({'user': 'foo@bar.com', 'password': 'foobar'}).encode('utf-8')) + config_file.flush() + cmd = NeptuneWrapperCmd('python exp1.py --param1 value1 -f', experiment_config_path=config_file.name, + neptune_storage='/tmp/storage') + + vars_with_no_str_values = {k: v for k, v in cmd.env.items() if not isinstance(v, six.string_types)} + self.assertTrue(not vars_with_no_str_values) + self.assertIn('NEPTUNE_PASSWORD', cmd.env) + self.assertIn('NEPTUNE_USER', cmd.env) + + cmd = Cmd('python exp1.py --param1 value1 -f', exp_dir_path='/tmp/storage') + vars_with_no_str_values = {k: v for k, v in cmd.env.items() if not isinstance(v, six.string_types)} + self.assertTrue(not vars_with_no_str_values) + self.assertNotIn('NEPTUNE_PASSWORD', cmd.env) + self.assertNotIn('NEPTUNE_USER', cmd.env) + + +class NeptuneConfigFileTestCase(unittest.TestCase): + CONFIG_ORIG = { + 'project': 'project', + 'name': 'name', + 'parameters': {'param1': 'param1', 'param2': 2, 'param3': True, 'param4': 1.2}, + 'tags': ['tag1', 'tag2', 'tag3'] + } + + def test_generate_v1(self): + from deepsense.version import __version__ + version = int(__version__.split('.')[0]) + if version == 1: + # test only if neptune-cli==1 is installed + with tempdir() as d: + temp_path = d / 'neptune.yaml' + with temp_path.open('w') as fh: + NeptuneConfigFileV1(**self.CONFIG_ORIG).dump(fh) + config_read = load_neptune_config(temp_path) + # trim to keys from config_orig + config_trimmed = {k: v for k, v in config_read.items() if k in self.CONFIG_ORIG} + + # convert from v1 parameters list to dict + config_trimmed['parameters'] = {d['name']: d['default'] for d in config_trimmed['parameters']} + config_trimmed['tags'] = sorted(config_trimmed['tags']) # not sure why order of tags is not preserved + + self.assertEqual(self.CONFIG_ORIG, config_trimmed) + + def test_generate_v2(self): + from deepsense.version import __version__ + 
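# note: neptune-cli v1 ships as the 'deepsense' package (see the try/except import in mrunner/utils/neptune.py); + # the installed major version decides which config format this test exercises + 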
version = int(__version__.split('.')[0]) + if version == 2: + # test only if neptune-cli==2 is installed + with tempdir() as d: + temp_path = d / 'neptune.yaml' + with temp_path.open('w') as fh: + NeptuneConfigFileV2(**self.CONFIG_ORIG).dump(fh) + config_read = load_neptune_config(temp_path) + # trim to keys from config_orig + config_trimmed = {k: v for k, v in config_read.items() if k in self.CONFIG_ORIG} + self.assertEqual(self.CONFIG_ORIG, config_trimmed) + + def test_generate_extended_args(self): + config_orig = self.CONFIG_ORIG.copy() + config_orig['foo'] = 1 + config_orig['bar'] = 2 + # unknown keys must be accepted without raising + config = NeptuneConfigFileV1(**config_orig) + self.assertIsInstance(config, NeptuneConfigFileV1) diff --git a/tests/slurm_test.py b/tests/slurm_test.py new file mode 100644 index 0000000..5b650d5 --- /dev/null +++ b/tests/slurm_test.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- +import os +import shutil +import sys +import tempfile +import unittest +from contextlib import contextmanager + +from fabric.operations import run +from fabric.state import env +from paramiko import Agent +from path import Path + +from mrunner.cli.config import Context +from mrunner.backends.slurm import SlurmBackend, ExperimentScript, ExperimentRunOnSlurm + + +class TmpCmd(object): + + @property + def command(self): + return 'python experiment1.py --foo bar' + + @property + def env(self): + return {'CMD_VAR': 2} + + +class FabricTestCase(unittest.TestCase): + + def setUp(self): + # assume that openssh-server is installed and tester public key is added to own account + env['host_string'] = '{}@localhost'.format(os.path.basename(os.path.expanduser('~'))) + + def test_fabric_connection_to_localhost(self): + assert Agent().get_keys(), "Add your private key to ssh agent using 'ssh-add' command" + run('echo "Hello world"') + + +class SlurmTestCase(unittest.TestCase): + + def setUp(self): + # assume that openssh-server is installed and tester public key is added to own account + slurm_url = '{}@localhost'.format(os.path.basename(os.path.expanduser('~'))) + self.tmp_dir = Path(tempfile.mkdtemp()) + + self.scratch_dir = self.tmp_dir / 'scratch' + self.storage_dir = self.tmp_dir / 'storage' + self.context = Context(slurm_url=slurm_url, + scratch_dir=self.scratch_dir, + storage=self.storage_dir, + partition='plgrid-testing', + user_id='jj', + modules_to_load='plgrid/tools/python/3.6.0 plgrid/tools/imagemagick/6.9.1') + self.experiment = ExperimentRunOnSlurm(self.context, project='project-name', + name='experiment-name', + cmd=TmpCmd(), env={'EXPERIMENT_VAR': 3}) + + self.paths_to_dump = [ + {'src': os.path.abspath('tests'), 'dst': self.tmp_dir, 'dst_rel': 'tests'}, + {'src': os.path.abspath('mrunner'), 'dst': self.tmp_dir, 'dst_rel': 'mrunner'}, + {'src': os.path.abspath('certs/cpascal'), 'dst': self.tmp_dir, 'dst_rel': 'certs/cpascal'}, + ] + + def tearDown(self): + if os.path.exists(self.tmp_dir): + shutil.rmtree(self.tmp_dir) + + def test_expand_vars(self): + os.environ['SCRATCH'] = '/tmp/scratch' + os.environ['STORAGE'] = '/tmp/storage' + + context = Context(storage='$STORAGE/bar', scratch_dir='foo', + slurm_url=None, partition=None, + modules_to_load=None, user_id='jj') + s = SlurmBackend(context=context) + self.assertEqual(s.scratch_dir, '/tmp/scratch/foo') + self.assertEqual(s.storage_dir, '/tmp/storage/bar') + self.assertTrue(isinstance(s.scratch_dir, Path)) + self.assertTrue(isinstance(s.storage_dir, Path)) + + def test_slurm_create_directories(self): + s = SlurmBackend(self.context) + s.ensure_directories(self.experiment) + self.assertTrue(self.scratch_dir.exists()) + 
self.assertTrue(self.storage_dir.exists()) + + def test_slurm_experiment_script_generation(self): + context = Context(slurm_url=self.context.slurm_url, + scratch_dir='/tmp/scratch', + storage='/tmp/storage', + partition='plgrid-testing', + modules_to_load='plgrid/tools/python/3.6.0 plgrid/tools/imagemagick/6.9.1', + after_module_load_cmd='echo loaded', + venv='/tmp/home/.venvs/project', + env={'CONTEXT_VAR': '1'}, + user_id='jj') + + experiment = ExperimentRunOnSlurm(context=context, cmd=TmpCmd(), env={'EXPERIMENT_VAR': 3}) + + # pykwalify reloads sys, which breaks stdout/stderr while debugging in pycharm, + # so generate the shell script with the streams secured + with secure_stdout_context(): + script = ExperimentScript(context, experiment) + script_payload = script.path.text() + expected_script_regexp = "#!/usr/bin/env sh\n" \ "set -e\n" \ "cd /tmp/scratch/.*\n" \ "module load plgrid/tools/python/3.6.0\n" \ "module load plgrid/tools/imagemagick/6.9.1\n" \ "echo loaded\n" \ "source /tmp/home/.venvs/project/bin/activate\n" \ "export .*\n" \ "export .*\n" \ "export .*\n" \ "python experiment1.py --foo bar" + + self.assertRegexpMatches(script_payload, expected_script_regexp) + + def test_slurm_script_name(self): + # pykwalify reloads sys, which breaks stdout/stderr while debugging in pycharm, + # so generate the shell script with the streams secured + with secure_stdout_context(): + script = ExperimentScript(self.context, self.experiment) + self.assertRegexpMatches(script.script_name, 'jj_project-name_experiment-name_.*.sh') + + # TODO: check if venv is overwritten + # TODO: ensure venv has no trailing slash + + +@contextmanager +def secure_stdout_context(): + # pykwalify reloads sys, thus breaking stdout/stderr; save them and restore on exit + stdout_tmp, stderr_tmp = sys.stdout, sys.stderr + try: + yield + finally: + sys.stdout, sys.stderr = stdout_tmp, stderr_tmp diff --git a/tests/template_test.py b/tests/template_test.py new file mode 100644 index 0000000..b3d6421 --- /dev/null +++ b/tests/template_test.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +import unittest + +from path import Path + +from mrunner.cli.config import Context +from mrunner.experiment import Experiment +from mrunner.utils.utils import GeneratedTemplateFile, DObject + + +class GeneratedTemplatesTestCase(unittest.TestCase): + + def test_generate_template(self): + context = Context(storage='/storage') + experiment = Experiment(base_image='python:3', + paths_to_copy=['.', 'src', 'tests'], + cmd=DObject(command='neptune run foo.py --storage /storage -- --epochs 2', env={})) + dockerfile = GeneratedTemplateFile(template_filename='Dockerfile.jinja2', + context=context, experiment=experiment, requirements_file='requirements.txt') + dockerfile_payload = Path(dockerfile.path).text(encoding='utf-8') + expected_dockerfile_payload = '''FROM python:3 + +ARG EXP_DIR=/experiment +ARG STORAGE_DIR=/storage + +COPY requirements.txt ${EXP_DIR}/requirements.txt +RUN pip install --no-cache-dir -r $EXP_DIR/requirements.txt +COPY . ${EXP_DIR}/. 
+COPY src ${EXP_DIR}/src +COPY tests ${EXP_DIR}/tests +ENV STORAGE_DIR=${STORAGE_DIR} + +VOLUME ${STORAGE_DIR} +VOLUME ${EXP_DIR} +WORKDIR ${EXP_DIR} + +ENTRYPOINT ["neptune", "run", "foo.py", "--storage", "/storage", "--"]''' + self.assertEqual(dockerfile_payload, expected_dockerfile_payload) diff --git a/tests/tensorflow_with_neptune/Dockerfile b/tests/tensorflow_with_neptune/Dockerfile deleted file mode 100644 index 13482c6..0000000 --- a/tests/tensorflow_with_neptune/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM gcr.io/tensorflow/tensorflow:latest-gpu -RUN apt-get -y update -RUN apt-get install -y python3-pip python-pip iputils-ping -RUN pip install neptune-cli==1.6.3 -RUN pip install keras -RUN mkdir -p /mnt/mhome diff --git a/tests/tensorflow_with_neptune/build_docker_local.sh b/tests/tensorflow_with_neptune/build_docker_local.sh deleted file mode 100755 index fbe56ce..0000000 --- a/tests/tensorflow_with_neptune/build_docker_local.sh +++ /dev/null @@ -1 +0,0 @@ -docker build -t mrunner_test:neptune . diff --git a/tests/utils_test.py b/tests/utils_test.py new file mode 100644 index 0000000..8fba92a --- /dev/null +++ b/tests/utils_test.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- +import unittest + +from path import tempdir, Path + +from mrunner.cli.config import ConfigParser +from mrunner.utils.utils import get_paths_to_copy, PathToDump + + +class ConfigTestCase(unittest.TestCase): + + def test_on_missing_config_file(self): + with tempdir() as tmp: + config_path = tmp / 'config.yaml' + config = ConfigParser(config_path).load() + self.assertEqual({}, config.contexts) + self.assertEqual('', config.current_context) + self.assertTrue(not config_path.exists()) + + +class UtilsTestCase(unittest.TestCase): + + # def test_dobject(self): + # ctx = DObject(a='foo', b='bar') + # self.assertEqual(ctx.a, 'foo') + # self.assertEqual(ctx.b, 'bar') + # self.assertRaises(AttributeError, ctx.__setattr__, 'a', 'a') + # self.assertRaises(AttributeError, ctx.__setattr__, 'c', 'c') + # + # ctx = DObject(b='foo', c='bar') + # self.assertEqual(ctx.b, 'foo') + # self.assertEqual(ctx.c, 'bar') + # self.assertRaises(AttributeError, ctx.__setattr__, 'a', 'a') + # + # def test_deep_dobject(self): + # ctx = DObject(a={'b': 'foo'}, c=1) + # self.assertEqual(ctx.a, Context(b='foo')) + # self.assertEqual(ctx.a.b, 'foo') + # self.assertEqual(ctx.c, 1) + # + # def test_deep_dobject_as_getitem(self): + # ctx = DObject(a={'b': 'foo'}, c=1) + # self.assertEqual(ctx['a'], Context(b='foo')) + # + # def test_raise_on_setting_computed_attr(self): + # class Foo(DObject): + # foo = property(lambda self: 1) + # + # # self.assertRaises(AttributeError, Foo.__new__, foo='bar') + + def test_paths_to_copy(self): + with tempdir() as tmp: + tmp.chdir() + (tmp / 'a/1').makedirs() + (tmp / 'a/2').makedirs() + (tmp / 'b/1').makedirs() + (tmp / 'b/2').makedirs() + (tmp / 'c/1/a').makedirs() + (tmp / 'file1').write_text('file1') + (tmp / 'file2').write_text('file2') + (tmp / 'file3').write_text('file3') + (tmp / 'a/file_a1').write_text('file_a1') + (tmp / 'a/1/file_a1_1').write_text('file_a1_1') + + # list all dirs and files + self.assertEqual({PathToDump(Path(p), Path(p)) for p in {'a', 'b', 'c', 'file1', 'file2', 'file3'}}, + set(get_paths_to_copy())) + + # exclude 'a' and 'file1' + self.assertEqual({PathToDump(Path(p), Path(p)) for p in {'b', 'c', 'file2', 'file3'}}, + set(get_paths_to_copy(exclude=['a', 'file1']))) + + # exclude 'a/1 and 'file1' + self.assertEqual({PathToDump(Path(p), Path(p)) for p in {'a/2', 'a/file_a1', 'b', 'c', 
'file2', 'file3'}}, + set(get_paths_to_copy(exclude=['a/1', 'file1', 'a/10/file_a10_1']))) + + # add external resource + self.assertEqual({PathToDump(Path(s), Path(d)) for s, d in {('../external1', 'external1'), + ('a', 'a'), ('b', 'b'), ('c', 'c'), + ('file1', 'file1'), + ('file2', 'file2'), + ('file3', 'file3')}}, + set(get_paths_to_copy(paths_to_copy=[tmp / '../external1']))) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..abf4bb5 --- /dev/null +++ b/tox.ini @@ -0,0 +1,17 @@ +[tox] +envlist = py27,py35 +[testenv:py27] +# required for fabric ssh connections used in tests +passenv=SSH_AGENT_PID SSH_AUTH_SOCK +deps= + pytest + neptune-cli==1.6 +commands=pytest +[testenv:py35] +# required for fabric ssh connections used in tests +passenv=SSH_AGENT_PID SSH_AUTH_SOCK +deps= + pytest + neptune-cli==1.6 +# output capture must be disabled for fabric3 +commands=pytest --capture no
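Assuming tox is installed and an ssh-agent is running (the passenv lines above forward the agent socket into the test environments), a single environment can be exercised with, for example:

    tox -e py35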