diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 000000000..e69de29bb diff --git a/404.html b/404.html new file mode 100644 index 000000000..1ebe729c5 --- /dev/null +++ b/404.html @@ -0,0 +1 @@ + Sherlock

404

Uh oh, you found a ... nothing.

But it's probably not what you were looking for, sorry...

You're on this page because you tried to access a URL that doesn't exist or a page that's moved elsewhere. Now, if you really want, you can try to use the "Search" box above to find some more useful things. But you don't have to, your call.
\ No newline at end of file diff --git a/CNAME b/CNAME new file mode 100644 index 000000000..bf2122d60 --- /dev/null +++ b/CNAME @@ -0,0 +1 @@ +www.sherlock.stanford.edu diff --git a/assets/images/banner.png b/assets/images/banner.png new file mode 100644 index 000000000..1b6717ccc Binary files /dev/null and b/assets/images/banner.png differ diff --git a/assets/images/bg_hero.jpg b/assets/images/bg_hero.jpg new file mode 100644 index 000000000..a0c1473b5 Binary files /dev/null and b/assets/images/bg_hero.jpg differ diff --git a/assets/images/bg_svc.jpg b/assets/images/bg_svc.jpg new file mode 100644 index 000000000..9b30031a9 Binary files /dev/null and b/assets/images/bg_svc.jpg differ diff --git a/assets/images/favicon.png b/assets/images/favicon.png new file mode 100644 index 000000000..1cf13b9f9 Binary files /dev/null and b/assets/images/favicon.png differ diff --git a/assets/images/logo.png b/assets/images/logo.png new file mode 100644 index 000000000..85e0a12a1 Binary files /dev/null and b/assets/images/logo.png differ diff --git a/assets/images/logo_small.png b/assets/images/logo_small.png new file mode 100644 index 000000000..203cef64f Binary files /dev/null and b/assets/images/logo_small.png differ diff --git a/assets/images/social/docs/advanced-topics/connection.png b/assets/images/social/docs/advanced-topics/connection.png new file mode 100644 index 000000000..1e8004840 Binary files /dev/null and b/assets/images/social/docs/advanced-topics/connection.png differ diff --git a/assets/images/social/docs/advanced-topics/job-management.png b/assets/images/social/docs/advanced-topics/job-management.png new file mode 100644 index 000000000..d838e6878 Binary files /dev/null and b/assets/images/social/docs/advanced-topics/job-management.png differ diff --git a/assets/images/social/docs/advanced-topics/node-features.png b/assets/images/social/docs/advanced-topics/node-features.png new file mode 100644 index 000000000..a19246cdb Binary files /dev/null and 
b/assets/images/social/docs/advanced-topics/node-features.png differ diff --git a/assets/images/social/docs/concepts.png b/assets/images/social/docs/concepts.png new file mode 100644 index 000000000..2de2ee0fc Binary files /dev/null and b/assets/images/social/docs/concepts.png differ diff --git a/assets/images/social/docs/credits.png b/assets/images/social/docs/credits.png new file mode 100644 index 000000000..3112f831b Binary files /dev/null and b/assets/images/social/docs/credits.png differ diff --git a/assets/images/social/docs/getting-started/connecting.png b/assets/images/social/docs/getting-started/connecting.png new file mode 100644 index 000000000..7b8b1d6ac Binary files /dev/null and b/assets/images/social/docs/getting-started/connecting.png differ diff --git a/assets/images/social/docs/getting-started/index.png b/assets/images/social/docs/getting-started/index.png new file mode 100644 index 000000000..a004b6b3c Binary files /dev/null and b/assets/images/social/docs/getting-started/index.png differ diff --git a/assets/images/social/docs/getting-started/submitting.png b/assets/images/social/docs/getting-started/submitting.png new file mode 100644 index 000000000..c8aa4e99f Binary files /dev/null and b/assets/images/social/docs/getting-started/submitting.png differ diff --git a/assets/images/social/docs/glossary.png b/assets/images/social/docs/glossary.png new file mode 100644 index 000000000..6d2157040 Binary files /dev/null and b/assets/images/social/docs/glossary.png differ diff --git a/assets/images/social/docs/index.png b/assets/images/social/docs/index.png new file mode 100644 index 000000000..1731205ac Binary files /dev/null and b/assets/images/social/docs/index.png differ diff --git a/assets/images/social/docs/orders.png b/assets/images/social/docs/orders.png new file mode 100644 index 000000000..5f8548439 Binary files /dev/null and b/assets/images/social/docs/orders.png differ diff --git a/assets/images/social/docs/software/index.png 
b/assets/images/social/docs/software/index.png new file mode 100644 index 000000000..72aaaa9ef Binary files /dev/null and b/assets/images/social/docs/software/index.png differ diff --git a/assets/images/social/docs/software/install.png b/assets/images/social/docs/software/install.png new file mode 100644 index 000000000..6fa586277 Binary files /dev/null and b/assets/images/social/docs/software/install.png differ diff --git a/assets/images/social/docs/software/list.png b/assets/images/social/docs/software/list.png new file mode 100644 index 000000000..1fa51edc9 Binary files /dev/null and b/assets/images/social/docs/software/list.png differ diff --git a/assets/images/social/docs/software/modules.png b/assets/images/social/docs/software/modules.png new file mode 100644 index 000000000..8a13e34dd Binary files /dev/null and b/assets/images/social/docs/software/modules.png differ diff --git a/assets/images/social/docs/software/using/R.png b/assets/images/social/docs/software/using/R.png new file mode 100644 index 000000000..d985d0dbd Binary files /dev/null and b/assets/images/social/docs/software/using/R.png differ diff --git a/assets/images/social/docs/software/using/anaconda.png b/assets/images/social/docs/software/using/anaconda.png new file mode 100644 index 000000000..259b80ec8 Binary files /dev/null and b/assets/images/social/docs/software/using/anaconda.png differ diff --git a/assets/images/social/docs/software/using/clustershell.png b/assets/images/social/docs/software/using/clustershell.png new file mode 100644 index 000000000..37a56c823 Binary files /dev/null and b/assets/images/social/docs/software/using/clustershell.png differ diff --git a/assets/images/social/docs/software/using/julia.png b/assets/images/social/docs/software/using/julia.png new file mode 100644 index 000000000..c1f3773c5 Binary files /dev/null and b/assets/images/social/docs/software/using/julia.png differ diff --git a/assets/images/social/docs/software/using/mariadb.png 
b/assets/images/social/docs/software/using/mariadb.png new file mode 100644 index 000000000..1817d6d34 Binary files /dev/null and b/assets/images/social/docs/software/using/mariadb.png differ diff --git a/assets/images/social/docs/software/using/matlab.png b/assets/images/social/docs/software/using/matlab.png new file mode 100644 index 000000000..8e397e799 Binary files /dev/null and b/assets/images/social/docs/software/using/matlab.png differ diff --git a/assets/images/social/docs/software/using/perl.png b/assets/images/social/docs/software/using/perl.png new file mode 100644 index 000000000..f3df003a5 Binary files /dev/null and b/assets/images/social/docs/software/using/perl.png differ diff --git a/assets/images/social/docs/software/using/postgresql.png b/assets/images/social/docs/software/using/postgresql.png new file mode 100644 index 000000000..6caeb5a13 Binary files /dev/null and b/assets/images/social/docs/software/using/postgresql.png differ diff --git a/assets/images/social/docs/software/using/python.png b/assets/images/social/docs/software/using/python.png new file mode 100644 index 000000000..4ba81739f Binary files /dev/null and b/assets/images/social/docs/software/using/python.png differ diff --git a/assets/images/social/docs/software/using/quantum-espresso.png b/assets/images/social/docs/software/using/quantum-espresso.png new file mode 100644 index 000000000..252f111ba Binary files /dev/null and b/assets/images/social/docs/software/using/quantum-espresso.png differ diff --git a/assets/images/social/docs/software/using/rclone.png b/assets/images/social/docs/software/using/rclone.png new file mode 100644 index 000000000..49d9abb2d Binary files /dev/null and b/assets/images/social/docs/software/using/rclone.png differ diff --git a/assets/images/social/docs/software/using/schrodinger.png b/assets/images/social/docs/software/using/schrodinger.png new file mode 100644 index 000000000..dd5eb12cd Binary files /dev/null and 
b/assets/images/social/docs/software/using/schrodinger.png differ diff --git a/assets/images/social/docs/software/using/singularity.png b/assets/images/social/docs/software/using/singularity.png new file mode 100644 index 000000000..4d09056cc Binary files /dev/null and b/assets/images/social/docs/software/using/singularity.png differ diff --git a/assets/images/social/docs/software/using/spark.png b/assets/images/social/docs/software/using/spark.png new file mode 100644 index 000000000..2c2324c7d Binary files /dev/null and b/assets/images/social/docs/software/using/spark.png differ diff --git a/assets/images/social/docs/storage/data-protection.png b/assets/images/social/docs/storage/data-protection.png new file mode 100644 index 000000000..67545ad55 Binary files /dev/null and b/assets/images/social/docs/storage/data-protection.png differ diff --git a/assets/images/social/docs/storage/data-sharing.png b/assets/images/social/docs/storage/data-sharing.png new file mode 100644 index 000000000..c5e3551f2 Binary files /dev/null and b/assets/images/social/docs/storage/data-sharing.png differ diff --git a/assets/images/social/docs/storage/data-transfer.png b/assets/images/social/docs/storage/data-transfer.png new file mode 100644 index 000000000..21a9cc6e2 Binary files /dev/null and b/assets/images/social/docs/storage/data-transfer.png differ diff --git a/assets/images/social/docs/storage/filesystems.png b/assets/images/social/docs/storage/filesystems.png new file mode 100644 index 000000000..4aac7b49c Binary files /dev/null and b/assets/images/social/docs/storage/filesystems.png differ diff --git a/assets/images/social/docs/storage/index.png b/assets/images/social/docs/storage/index.png new file mode 100644 index 000000000..1d3518286 Binary files /dev/null and b/assets/images/social/docs/storage/index.png differ diff --git a/assets/images/social/docs/tags.png b/assets/images/social/docs/tags.png new file mode 100644 index 000000000..a526aad17 Binary files /dev/null and 
b/assets/images/social/docs/tags.png differ diff --git a/assets/images/social/docs/tech/facts.png b/assets/images/social/docs/tech/facts.png new file mode 100644 index 000000000..a62116a97 Binary files /dev/null and b/assets/images/social/docs/tech/facts.png differ diff --git a/assets/images/social/docs/tech/index.png b/assets/images/social/docs/tech/index.png new file mode 100644 index 000000000..ddb3b6a4c Binary files /dev/null and b/assets/images/social/docs/tech/index.png differ diff --git a/assets/images/social/docs/tech/status.png b/assets/images/social/docs/tech/status.png new file mode 100644 index 000000000..f36243f2c Binary files /dev/null and b/assets/images/social/docs/tech/status.png differ diff --git a/assets/images/social/docs/user-guide/gpu.png b/assets/images/social/docs/user-guide/gpu.png new file mode 100644 index 000000000..28a0f1db2 Binary files /dev/null and b/assets/images/social/docs/user-guide/gpu.png differ diff --git a/assets/images/social/docs/user-guide/ondemand.png b/assets/images/social/docs/user-guide/ondemand.png new file mode 100644 index 000000000..aebe28aec Binary files /dev/null and b/assets/images/social/docs/user-guide/ondemand.png differ diff --git a/assets/images/social/docs/user-guide/running-jobs.png b/assets/images/social/docs/user-guide/running-jobs.png new file mode 100644 index 000000000..04ba125df Binary files /dev/null and b/assets/images/social/docs/user-guide/running-jobs.png differ diff --git a/assets/images/social/docs/user-guide/troubleshoot.png b/assets/images/social/docs/user-guide/troubleshoot.png new file mode 100644 index 000000000..47e2d03ef Binary files /dev/null and b/assets/images/social/docs/user-guide/troubleshoot.png differ diff --git a/assets/images/social/index.png b/assets/images/social/index.png new file mode 100644 index 000000000..876b2842d Binary files /dev/null and b/assets/images/social/index.png differ diff --git a/assets/javascripts/bundle.b9c29124.min.js 
b/assets/javascripts/bundle.b9c29124.min.js new file mode 100644 index 000000000..777456b4f --- /dev/null +++ b/assets/javascripts/bundle.b9c29124.min.js @@ -0,0 +1,3 @@ +"use strict";(()=>{var la=Object.create;var Mr=Object.defineProperty;var ua=Object.getOwnPropertyDescriptor;var ma=Object.getOwnPropertyNames,Ut=Object.getOwnPropertySymbols,da=Object.getPrototypeOf,Lr=Object.prototype.hasOwnProperty,un=Object.prototype.propertyIsEnumerable;var ln=(e,t,r)=>t in e?Mr(e,t,{enumerable:!0,configurable:!0,writable:!0,value:r}):e[t]=r,P=(e,t)=>{for(var r in t||(t={}))Lr.call(t,r)&&ln(e,r,t[r]);if(Ut)for(var r of Ut(t))un.call(t,r)&&ln(e,r,t[r]);return e};var mn=(e,t)=>{var r={};for(var n in e)Lr.call(e,n)&&t.indexOf(n)<0&&(r[n]=e[n]);if(e!=null&&Ut)for(var n of Ut(e))t.indexOf(n)<0&&un.call(e,n)&&(r[n]=e[n]);return r};var Ot=(e,t)=>()=>(t||e((t={exports:{}}).exports,t),t.exports);var ha=(e,t,r,n)=>{if(t&&typeof t=="object"||typeof t=="function")for(let o of ma(t))!Lr.call(e,o)&&o!==r&&Mr(e,o,{get:()=>t[o],enumerable:!(n=ua(t,o))||n.enumerable});return e};var it=(e,t,r)=>(r=e!=null?la(da(e)):{},ha(t||!e||!e.__esModule?Mr(r,"default",{value:e,enumerable:!0}):r,e));var hn=Ot((Ar,dn)=>{(function(e,t){typeof Ar=="object"&&typeof dn!="undefined"?t():typeof define=="function"&&define.amd?define(t):t()})(Ar,function(){"use strict";function e(r){var n=!0,o=!1,i=null,a={text:!0,search:!0,url:!0,tel:!0,email:!0,password:!0,number:!0,date:!0,month:!0,week:!0,time:!0,datetime:!0,"datetime-local":!0};function s(M){return!!(M&&M!==document&&M.nodeName!=="HTML"&&M.nodeName!=="BODY"&&"classList"in M&&"contains"in M.classList)}function c(M){var qe=M.type,Ue=M.tagName;return!!(Ue==="INPUT"&&a[qe]&&!M.readOnly||Ue==="TEXTAREA"&&!M.readOnly||M.isContentEditable)}function p(M){M.classList.contains("focus-visible")||(M.classList.add("focus-visible"),M.setAttribute("data-focus-visible-added",""))}function 
f(M){!M.hasAttribute("data-focus-visible-added")||(M.classList.remove("focus-visible"),M.removeAttribute("data-focus-visible-added"))}function l(M){M.metaKey||M.altKey||M.ctrlKey||(s(r.activeElement)&&p(r.activeElement),n=!0)}function m(M){n=!1}function d(M){!s(M.target)||(n||c(M.target))&&p(M.target)}function h(M){!s(M.target)||(M.target.classList.contains("focus-visible")||M.target.hasAttribute("data-focus-visible-added"))&&(o=!0,window.clearTimeout(i),i=window.setTimeout(function(){o=!1},100),f(M.target))}function b(M){document.visibilityState==="hidden"&&(o&&(n=!0),$())}function $(){document.addEventListener("mousemove",V),document.addEventListener("mousedown",V),document.addEventListener("mouseup",V),document.addEventListener("pointermove",V),document.addEventListener("pointerdown",V),document.addEventListener("pointerup",V),document.addEventListener("touchmove",V),document.addEventListener("touchstart",V),document.addEventListener("touchend",V)}function X(){document.removeEventListener("mousemove",V),document.removeEventListener("mousedown",V),document.removeEventListener("mouseup",V),document.removeEventListener("pointermove",V),document.removeEventListener("pointerdown",V),document.removeEventListener("pointerup",V),document.removeEventListener("touchmove",V),document.removeEventListener("touchstart",V),document.removeEventListener("touchend",V)}function 
V(M){M.target.nodeName&&M.target.nodeName.toLowerCase()==="html"||(n=!1,X())}document.addEventListener("keydown",l,!0),document.addEventListener("mousedown",m,!0),document.addEventListener("pointerdown",m,!0),document.addEventListener("touchstart",m,!0),document.addEventListener("visibilitychange",b,!0),$(),r.addEventListener("focus",d,!0),r.addEventListener("blur",h,!0),r.nodeType===Node.DOCUMENT_FRAGMENT_NODE&&r.host?r.host.setAttribute("data-js-focus-visible",""):r.nodeType===Node.DOCUMENT_NODE&&(document.documentElement.classList.add("js-focus-visible"),document.documentElement.setAttribute("data-js-focus-visible",""))}if(typeof window!="undefined"&&typeof document!="undefined"){window.applyFocusVisiblePolyfill=e;var t;try{t=new CustomEvent("focus-visible-polyfill-ready")}catch(r){t=document.createEvent("CustomEvent"),t.initCustomEvent("focus-visible-polyfill-ready",!1,!1,{})}window.dispatchEvent(t)}typeof document!="undefined"&&e(document)})});var bn=Ot(Cr=>{(function(e){var t=function(){try{return!!Symbol.iterator}catch(p){return!1}},r=t(),n=function(p){var f={next:function(){var l=p.shift();return{done:l===void 0,value:l}}};return r&&(f[Symbol.iterator]=function(){return f}),f},o=function(p){return encodeURIComponent(p).replace(/%20/g,"+")},i=function(p){return decodeURIComponent(String(p).replace(/\+/g," "))},a=function(){var p=function(l){Object.defineProperty(this,"_entries",{writable:!0,value:{}});var m=typeof l;if(m!=="undefined")if(m==="string")l!==""&&this._fromString(l);else if(l instanceof p){var d=this;l.forEach(function(X,V){d.append(V,X)})}else if(l!==null&&m==="object")if(Object.prototype.toString.call(l)==="[object Array]")for(var h=0;hd[0]?1:0}),p._entries&&(p._entries={});for(var l=0;l1?i(d[1]):"")}})})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Cr);(function(e){var t=function(){try{var o=new e.URL("b","http://a");return o.pathname="c 
d",o.href==="http://a/c%20d"&&o.searchParams}catch(i){return!1}},r=function(){var o=e.URL,i=function(c,p){typeof c!="string"&&(c=String(c)),p&&typeof p!="string"&&(p=String(p));var f=document,l;if(p&&(e.location===void 0||p!==e.location.href)){p=p.toLowerCase(),f=document.implementation.createHTMLDocument(""),l=f.createElement("base"),l.href=p,f.head.appendChild(l);try{if(l.href.indexOf(p)!==0)throw new Error(l.href)}catch(M){throw new Error("URL unable to set base "+p+" due to "+M)}}var m=f.createElement("a");m.href=c,l&&(f.body.appendChild(m),m.href=m.href);var d=f.createElement("input");if(d.type="url",d.value=c,m.protocol===":"||!/:/.test(m.href)||!d.checkValidity()&&!p)throw new TypeError("Invalid URL");Object.defineProperty(this,"_anchorElement",{value:m});var h=new e.URLSearchParams(this.search),b=!0,$=!0,X=this;["append","delete","set"].forEach(function(M){var qe=h[M];h[M]=function(){qe.apply(h,arguments),b&&($=!1,X.search=h.toString(),$=!0)}}),Object.defineProperty(this,"searchParams",{value:h,enumerable:!0});var V=void 0;Object.defineProperty(this,"_updateSearchParams",{enumerable:!1,configurable:!1,writable:!1,value:function(){this.search!==V&&(V=this.search,$&&(b=!1,this.searchParams._fromString(this.search),b=!0))}})},a=i.prototype,s=function(c){Object.defineProperty(a,c,{get:function(){return this._anchorElement[c]},set:function(p){this._anchorElement[c]=p},enumerable:!0})};["hash","host","hostname","port","protocol"].forEach(function(c){s(c)}),Object.defineProperty(a,"search",{get:function(){return this._anchorElement.search},set:function(c){this._anchorElement.search=c,this._updateSearchParams()},enumerable:!0}),Object.defineProperties(a,{toString:{get:function(){var c=this;return function(){return c.href}}},href:{get:function(){return this._anchorElement.href.replace(/\?$/,"")},set:function(c){this._anchorElement.href=c,this._updateSearchParams()},enumerable:!0},pathname:{get:function(){return 
this._anchorElement.pathname.replace(/(^\/?)/,"/")},set:function(c){this._anchorElement.pathname=c},enumerable:!0},origin:{get:function(){var c={"http:":80,"https:":443,"ftp:":21}[this._anchorElement.protocol],p=this._anchorElement.port!=c&&this._anchorElement.port!=="";return this._anchorElement.protocol+"//"+this._anchorElement.hostname+(p?":"+this._anchorElement.port:"")},enumerable:!0},password:{get:function(){return""},set:function(c){},enumerable:!0},username:{get:function(){return""},set:function(c){},enumerable:!0}}),i.createObjectURL=function(c){return o.createObjectURL.apply(o,arguments)},i.revokeObjectURL=function(c){return o.revokeObjectURL.apply(o,arguments)},e.URL=i};if(t()||r(),e.location!==void 0&&!("origin"in e.location)){var n=function(){return e.location.protocol+"//"+e.location.hostname+(e.location.port?":"+e.location.port:"")};try{Object.defineProperty(e.location,"origin",{get:n,enumerable:!0})}catch(o){setInterval(function(){e.location.origin=n()},100)}}})(typeof global!="undefined"?global:typeof window!="undefined"?window:typeof self!="undefined"?self:Cr)});var jn=Ot((rc,Vt)=>{var vn,gn,yn,xn,En,wn,Sn,On,Tn,Wt,kr,_n,Mn,Ln,at,An,Cn,kn,Rn,Hn,Pn,$n,In,Dt;(function(e){var t=typeof global=="object"?global:typeof self=="object"?self:typeof this=="object"?this:{};typeof define=="function"&&define.amd?define("tslib",["exports"],function(n){e(r(t,r(n)))}):typeof Vt=="object"&&typeof Vt.exports=="object"?e(r(t,r(Vt.exports))):e(r(t));function r(n,o){return n!==t&&(typeof Object.create=="function"?Object.defineProperty(n,"__esModule",{value:!0}):n.__esModule=!0),function(i,a){return n[i]=o?o(i,a):a}}})(function(e){var t=Object.setPrototypeOf||{__proto__:[]}instanceof Array&&function(n,o){n.__proto__=o}||function(n,o){for(var i in o)Object.prototype.hasOwnProperty.call(o,i)&&(n[i]=o[i])};vn=function(n,o){if(typeof o!="function"&&o!==null)throw new TypeError("Class extends value "+String(o)+" is not a constructor or null");t(n,o);function 
i(){this.constructor=n}n.prototype=o===null?Object.create(o):(i.prototype=o.prototype,new i)},gn=Object.assign||function(n){for(var o,i=1,a=arguments.length;i=0;f--)(p=n[f])&&(c=(s<3?p(c):s>3?p(o,i,c):p(o,i))||c);return s>3&&c&&Object.defineProperty(o,i,c),c},En=function(n,o){return function(i,a){o(i,a,n)}},wn=function(n,o){if(typeof Reflect=="object"&&typeof Reflect.metadata=="function")return Reflect.metadata(n,o)},Sn=function(n,o,i,a){function s(c){return c instanceof i?c:new i(function(p){p(c)})}return new(i||(i=Promise))(function(c,p){function f(d){try{m(a.next(d))}catch(h){p(h)}}function l(d){try{m(a.throw(d))}catch(h){p(h)}}function m(d){d.done?c(d.value):s(d.value).then(f,l)}m((a=a.apply(n,o||[])).next())})},On=function(n,o){var i={label:0,sent:function(){if(c[0]&1)throw c[1];return c[1]},trys:[],ops:[]},a,s,c,p;return p={next:f(0),throw:f(1),return:f(2)},typeof Symbol=="function"&&(p[Symbol.iterator]=function(){return this}),p;function f(m){return function(d){return l([m,d])}}function l(m){if(a)throw new TypeError("Generator is already executing.");for(;i;)try{if(a=1,s&&(c=m[0]&2?s.return:m[0]?s.throw||((c=s.return)&&c.call(s),0):s.next)&&!(c=c.call(s,m[1])).done)return c;switch(s=0,c&&(m=[m[0]&2,c.value]),m[0]){case 0:case 1:c=m;break;case 4:return i.label++,{value:m[1],done:!1};case 5:i.label++,s=m[1],m=[0];continue;case 7:m=i.ops.pop(),i.trys.pop();continue;default:if(c=i.trys,!(c=c.length>0&&c[c.length-1])&&(m[0]===6||m[0]===2)){i=0;continue}if(m[0]===3&&(!c||m[1]>c[0]&&m[1]=n.length&&(n=void 0),{value:n&&n[a++],done:!n}}};throw new TypeError(o?"Object is not iterable.":"Symbol.iterator is not defined.")},kr=function(n,o){var i=typeof Symbol=="function"&&n[Symbol.iterator];if(!i)return n;var a=i.call(n),s,c=[],p;try{for(;(o===void 0||o-- >0)&&!(s=a.next()).done;)c.push(s.value)}catch(f){p={error:f}}finally{try{s&&!s.done&&(i=a.return)&&i.call(a)}finally{if(p)throw p.error}}return c},_n=function(){for(var n=[],o=0;o1||f(b,$)})})}function 
f(b,$){try{l(a[b]($))}catch(X){h(c[0][3],X)}}function l(b){b.value instanceof at?Promise.resolve(b.value.v).then(m,d):h(c[0][2],b)}function m(b){f("next",b)}function d(b){f("throw",b)}function h(b,$){b($),c.shift(),c.length&&f(c[0][0],c[0][1])}},Cn=function(n){var o,i;return o={},a("next"),a("throw",function(s){throw s}),a("return"),o[Symbol.iterator]=function(){return this},o;function a(s,c){o[s]=n[s]?function(p){return(i=!i)?{value:at(n[s](p)),done:s==="return"}:c?c(p):p}:c}},kn=function(n){if(!Symbol.asyncIterator)throw new TypeError("Symbol.asyncIterator is not defined.");var o=n[Symbol.asyncIterator],i;return o?o.call(n):(n=typeof Wt=="function"?Wt(n):n[Symbol.iterator](),i={},a("next"),a("throw"),a("return"),i[Symbol.asyncIterator]=function(){return this},i);function a(c){i[c]=n[c]&&function(p){return new Promise(function(f,l){p=n[c](p),s(f,l,p.done,p.value)})}}function s(c,p,f,l){Promise.resolve(l).then(function(m){c({value:m,done:f})},p)}},Rn=function(n,o){return Object.defineProperty?Object.defineProperty(n,"raw",{value:o}):n.raw=o,n};var r=Object.create?function(n,o){Object.defineProperty(n,"default",{enumerable:!0,value:o})}:function(n,o){n.default=o};Hn=function(n){if(n&&n.__esModule)return n;var o={};if(n!=null)for(var i in n)i!=="default"&&Object.prototype.hasOwnProperty.call(n,i)&&Dt(o,n,i);return r(o,n),o},Pn=function(n){return n&&n.__esModule?n:{default:n}},$n=function(n,o,i,a){if(i==="a"&&!a)throw new TypeError("Private accessor was defined without a getter");if(typeof o=="function"?n!==o||!a:!o.has(n))throw new TypeError("Cannot read private member from an object whose class did not declare it");return i==="m"?a:i==="a"?a.call(n):a?a.value:o.get(n)},In=function(n,o,i,a,s){if(a==="m")throw new TypeError("Private method is not writable");if(a==="a"&&!s)throw new TypeError("Private accessor was defined without a setter");if(typeof o=="function"?n!==o||!s:!o.has(n))throw new TypeError("Cannot write private member to an object whose class did not 
declare it");return a==="a"?s.call(n,i):s?s.value=i:o.set(n,i),i},e("__extends",vn),e("__assign",gn),e("__rest",yn),e("__decorate",xn),e("__param",En),e("__metadata",wn),e("__awaiter",Sn),e("__generator",On),e("__exportStar",Tn),e("__createBinding",Dt),e("__values",Wt),e("__read",kr),e("__spread",_n),e("__spreadArrays",Mn),e("__spreadArray",Ln),e("__await",at),e("__asyncGenerator",An),e("__asyncDelegator",Cn),e("__asyncValues",kn),e("__makeTemplateObject",Rn),e("__importStar",Hn),e("__importDefault",Pn),e("__classPrivateFieldGet",$n),e("__classPrivateFieldSet",In)})});var rn=Ot((It,tn)=>{(function(t,r){typeof It=="object"&&typeof tn=="object"?tn.exports=r():typeof define=="function"&&define.amd?define([],r):typeof It=="object"?It.ClipboardJS=r():t.ClipboardJS=r()})(It,function(){return function(){var e={686:function(n,o,i){"use strict";i.d(o,{default:function(){return fa}});var a=i(279),s=i.n(a),c=i(370),p=i.n(c),f=i(817),l=i.n(f);function m(F){try{return document.execCommand(F)}catch(L){return!1}}var d=function(L){var _=l()(L);return m("cut"),_},h=d;function b(F){var L=document.documentElement.getAttribute("dir")==="rtl",_=document.createElement("textarea");_.style.fontSize="12pt",_.style.border="0",_.style.padding="0",_.style.margin="0",_.style.position="absolute",_.style[L?"right":"left"]="-9999px";var I=window.pageYOffset||document.documentElement.scrollTop;return _.style.top="".concat(I,"px"),_.setAttribute("readonly",""),_.value=F,_}var $=function(L,_){var I=b(L);_.container.appendChild(I);var j=l()(I);return m("copy"),I.remove(),j},X=function(L){var _=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body},I="";return typeof L=="string"?I=$(L,_):L instanceof HTMLInputElement&&!["text","search","url","tel","password"].includes(L==null?void 0:L.type)?I=$(L.value,_):(I=l()(L),m("copy")),I},V=X;function M(F){return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?M=function(_){return typeof _}:M=function(_){return _&&typeof 
Symbol=="function"&&_.constructor===Symbol&&_!==Symbol.prototype?"symbol":typeof _},M(F)}var qe=function(){var L=arguments.length>0&&arguments[0]!==void 0?arguments[0]:{},_=L.action,I=_===void 0?"copy":_,j=L.container,Q=L.target,Ae=L.text;if(I!=="copy"&&I!=="cut")throw new Error('Invalid "action" value, use either "copy" or "cut"');if(Q!==void 0)if(Q&&M(Q)==="object"&&Q.nodeType===1){if(I==="copy"&&Q.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');if(I==="cut"&&(Q.hasAttribute("readonly")||Q.hasAttribute("disabled")))throw new Error(`Invalid "target" attribute. You can't cut text from elements with "readonly" or "disabled" attributes`)}else throw new Error('Invalid "target" value, use a valid Element');if(Ae)return V(Ae,{container:j});if(Q)return I==="cut"?h(Q):V(Q,{container:j})},Ue=qe;function Pe(F){return typeof Symbol=="function"&&typeof Symbol.iterator=="symbol"?Pe=function(_){return typeof _}:Pe=function(_){return _&&typeof Symbol=="function"&&_.constructor===Symbol&&_!==Symbol.prototype?"symbol":typeof _},Pe(F)}function ra(F,L){if(!(F instanceof L))throw new TypeError("Cannot call a class as a function")}function fn(F,L){for(var _=0;_0&&arguments[0]!==void 0?arguments[0]:{};this.action=typeof j.action=="function"?j.action:this.defaultAction,this.target=typeof j.target=="function"?j.target:this.defaultTarget,this.text=typeof j.text=="function"?j.text:this.defaultText,this.container=Pe(j.container)==="object"?j.container:document.body}},{key:"listenClick",value:function(j){var Q=this;this.listener=p()(j,"click",function(Ae){return Q.onClick(Ae)})}},{key:"onClick",value:function(j){var 
Q=j.delegateTarget||j.currentTarget,Ae=this.action(Q)||"copy",Ft=Ue({action:Ae,container:this.container,target:this.target(Q),text:this.text(Q)});this.emit(Ft?"success":"error",{action:Ae,text:Ft,trigger:Q,clearSelection:function(){Q&&Q.focus(),window.getSelection().removeAllRanges()}})}},{key:"defaultAction",value:function(j){return _r("action",j)}},{key:"defaultTarget",value:function(j){var Q=_r("target",j);if(Q)return document.querySelector(Q)}},{key:"defaultText",value:function(j){return _r("text",j)}},{key:"destroy",value:function(){this.listener.destroy()}}],[{key:"copy",value:function(j){var Q=arguments.length>1&&arguments[1]!==void 0?arguments[1]:{container:document.body};return V(j,Q)}},{key:"cut",value:function(j){return h(j)}},{key:"isSupported",value:function(){var j=arguments.length>0&&arguments[0]!==void 0?arguments[0]:["copy","cut"],Q=typeof j=="string"?[j]:j,Ae=!!document.queryCommandSupported;return Q.forEach(function(Ft){Ae=Ae&&!!document.queryCommandSupported(Ft)}),Ae}}]),_}(s()),fa=pa},828:function(n){var o=9;if(typeof Element!="undefined"&&!Element.prototype.matches){var i=Element.prototype;i.matches=i.matchesSelector||i.mozMatchesSelector||i.msMatchesSelector||i.oMatchesSelector||i.webkitMatchesSelector}function a(s,c){for(;s&&s.nodeType!==o;){if(typeof s.matches=="function"&&s.matches(c))return s;s=s.parentNode}}n.exports=a},438:function(n,o,i){var a=i(828);function s(f,l,m,d,h){var b=p.apply(this,arguments);return f.addEventListener(m,b,h),{destroy:function(){f.removeEventListener(m,b,h)}}}function c(f,l,m,d,h){return typeof f.addEventListener=="function"?s.apply(null,arguments):typeof m=="function"?s.bind(null,document).apply(null,arguments):(typeof f=="string"&&(f=document.querySelectorAll(f)),Array.prototype.map.call(f,function(b){return s(b,l,m,d,h)}))}function p(f,l,m,d){return function(h){h.delegateTarget=a(h.target,l),h.delegateTarget&&d.call(f,h)}}n.exports=c},879:function(n,o){o.node=function(i){return i!==void 0&&i instanceof 
HTMLElement&&i.nodeType===1},o.nodeList=function(i){var a=Object.prototype.toString.call(i);return i!==void 0&&(a==="[object NodeList]"||a==="[object HTMLCollection]")&&"length"in i&&(i.length===0||o.node(i[0]))},o.string=function(i){return typeof i=="string"||i instanceof String},o.fn=function(i){var a=Object.prototype.toString.call(i);return a==="[object Function]"}},370:function(n,o,i){var a=i(879),s=i(438);function c(m,d,h){if(!m&&!d&&!h)throw new Error("Missing required arguments");if(!a.string(d))throw new TypeError("Second argument must be a String");if(!a.fn(h))throw new TypeError("Third argument must be a Function");if(a.node(m))return p(m,d,h);if(a.nodeList(m))return f(m,d,h);if(a.string(m))return l(m,d,h);throw new TypeError("First argument must be a String, HTMLElement, HTMLCollection, or NodeList")}function p(m,d,h){return m.addEventListener(d,h),{destroy:function(){m.removeEventListener(d,h)}}}function f(m,d,h){return Array.prototype.forEach.call(m,function(b){b.addEventListener(d,h)}),{destroy:function(){Array.prototype.forEach.call(m,function(b){b.removeEventListener(d,h)})}}}function l(m,d,h){return s(document.body,m,d,h)}n.exports=c},817:function(n){function o(i){var a;if(i.nodeName==="SELECT")i.focus(),a=i.value;else if(i.nodeName==="INPUT"||i.nodeName==="TEXTAREA"){var s=i.hasAttribute("readonly");s||i.setAttribute("readonly",""),i.select(),i.setSelectionRange(0,i.value.length),s||i.removeAttribute("readonly"),a=i.value}else{i.hasAttribute("contenteditable")&&i.focus();var c=window.getSelection(),p=document.createRange();p.selectNodeContents(i),c.removeAllRanges(),c.addRange(p),a=c.toString()}return a}n.exports=o},279:function(n){function o(){}o.prototype={on:function(i,a,s){var c=this.e||(this.e={});return(c[i]||(c[i]=[])).push({fn:a,ctx:s}),this},once:function(i,a,s){var c=this;function p(){c.off(i,p),a.apply(s,arguments)}return p._=a,this.on(i,p,s)},emit:function(i){var 
a=[].slice.call(arguments,1),s=((this.e||(this.e={}))[i]||[]).slice(),c=0,p=s.length;for(c;c{"use strict";var Is=/["'&<>]/;Mi.exports=js;function js(e){var t=""+e,r=Is.exec(t);if(!r)return t;var n,o="",i=0,a=0;for(i=r.index;i{function e(o,i){parent.postMessage(o,i||"*")}function t(...o){return o.reduce((i,a)=>i.then(()=>new Promise(s=>{let c=document.createElement("script");c.src=a,c.addEventListener("load",()=>s()),document.body.appendChild(c)})),Promise.resolve())}function r(){let o=document.createElement("iframe");return o.width=o.height=o.frameBorder="0",o}var n=class{constructor(o,i){if(this.url=o,this.onerror=null,this.onmessage=null,this.onmessageerror=null,this.handleMessage=s=>{s.source===this.worker&&(s.stopImmediatePropagation(),this.dispatchEvent(new MessageEvent("message",{data:s.data})),this.onmessage&&this.onmessage(s))},this.handleError=(s,c,p,f,l)=>{if(c===this.url.toString()){let m=new ErrorEvent("error",{message:s,filename:c,lineno:p,colno:f,error:l});this.dispatchEvent(m),this.onerror&&this.onerror(m)}},typeof i!="undefined")throw new TypeError("Options are not supported for iframe workers");let a=new EventTarget;this.addEventListener=a.addEventListener.bind(a),this.removeEventListener=a.removeEventListener.bind(a),this.dispatchEvent=a.dispatchEvent.bind(a),document.body.appendChild(this.iframe=r()),this.worker.document.open(),this.worker.document.write(` + + + +Redirecting... + + diff --git a/docs/advanced-topics/connection/index.html b/docs/advanced-topics/connection/index.html new file mode 100644 index 000000000..4531cdf69 --- /dev/null +++ b/docs/advanced-topics/connection/index.html @@ -0,0 +1,21 @@ + Connection options - Sherlock

Advanced connection options#

Login nodes#

Sherlock login nodes are grouped behind a single DNS alias: login.sherlock.stanford.edu.

This alias provides a load-balanced login environment, and the assurance that you will be connected to the least loaded login node when you connect to Sherlock.

If for any reason, you want to directly connect to a specific login node and bypass the automatic load-balanced dispatching of new connections (which we don't recommend), you can use that login node's hostname explicitly. For instance:

$ ssh <sunetid>@ln21.sherlock.stanford.edu
+

This can be useful if you run long-standing processes on the login nodes, such as screen or tmux sessions. To find them again when you reconnect to Sherlock, you will need to log in to the same login node you started them on.

The drawback is that by connecting to a specific login node, you will forfeit the load-balancing benefits, which could result in a crowded environment, or even in login errors in case that specific login node is unavailable.

Authentication methods#

Public-key authentication

SSH public-key authentication is not supported on Sherlock.

The recommended way to authenticate to Sherlock is to simply use your SUNet ID and password, as described in the Connecting page.

Passwords are not stored on Sherlock. Sherlock login nodes will delegate password authentication to the University central Kerberos service.

GSSAPI#

For compatibility with previous generations of Sherlock, GSSAPI[1] authentication is still allowed, and could be considered a more convenient option, as this mechanism doesn't require entering your password for each connection.

GSSAPI authentication relies on a token system, where users obtain Kerberos ticket-granting tickets, transmit them via SSH to the server they want to connect to, which will, in turn, verify their validity. That way, passwords are never stored locally, and never transit over the network. That's why Kerberos is usually considered the most secure method to authenticate.

To connect using GSSAPI on Sherlock, you'll need to go through a few steps[2]:

  1. make sure the Kerberos user tools are installed on your local machine. You'll need the kinit (and optionally klist and kdestroy) utilities. Please refer to your OS documentation to install them if required.

  2. download and install the Stanford krb5.conf file, which contains information about the Stanford Kerberos environment:

    $ sudo curl -o /etc/krb5.conf https://web.stanford.edu/dept/its/support/kerberos/dist/krb5.conf
    +
  3. configure your SSH client, by modifying (or creating if it doesn't exist already) the .ssh/config file in your home directory on your local machine. Using a text editor, you can add the following lines to your ~/.ssh/config file (indentation is important):

    Host login.sherlock.stanford.edu
    +    GSSAPIDelegateCredentials yes
    +    GSSAPIAuthentication yes
    +

Once everything is in place (you only need to do this once), you'll be able to test that your Kerberos installation works by running kinit <sunetid>@stanford.edu. You should get a password prompt, and upon success, you'll be able to list your Kerberos credentials with the klist command:

$ kinit kilian@stanford.edu
+Password for kilian@stanford.edu:
+$ klist
+Ticket cache: FILE:/tmp/krb5cc_215845_n4S4I6KgyM
+Default principal: kilian@stanford.edu
+
+Valid starting     Expires            Service principal
+07/28/17 17:33:54  07/29/17 18:33:32  krbtgt/stanford.edu@stanford.edu
+        renew until 08/04/17 17:33:32
+

Kerberos ticket expiration

Kerberos tickets have a 25-hour lifetime. So you'll need to run the kinit command pretty much once a day to continue being able to authenticate to Sherlock.

Please note that when your Kerberos ticket expires, existing Sherlock connections will not be interrupted. So you'll be able to keep connections open to Sherlock for several days without any issue.

You're now ready to connect to Sherlock using GSSAPI. Simply SSH as usual:

$ ssh <sunetid>@login.sherlock.stanford.edu
+

and if everything goes well, you should directly see the two-factor (Duo) prompt, without having to enter your password.

If you want to destroy your Kerberos ticket before its expiration, you can use the kdestroy command.

SSH options#

OpenSSH offers a variety of configuration options that you can use in ~/.ssh/config on your local computer. The following section describes some of the options you can use with Sherlock that may make connecting and transferring files more convenient.

Avoiding multiple Duo prompts#

In order to avoid getting a second-factor (Duo) prompt every time you want to open a new connection to Sherlock, you can take advantage of the multiplexing features provided by OpenSSH.

Simply add the following lines to your ~/.ssh/config file on your local machine to activate the ControlMaster option. If you already have a Host login.sherlock.stanford.edu block in your configuration file, simply add the Control* option lines in the same block.

Host login.sherlock.stanford.edu
+    ControlMaster auto
+    ControlPath ~/.ssh/%l%r@%h:%p
+

It will allow SSH to re-use an existing connection to Sherlock each time you open a new session (create a new SSH connection), thus avoiding subsequent 2FA prompts once the initial connection is established.

The slight disadvantage of this approach is that once you have a connection open to one of Sherlock's login nodes, all your subsequent connections will be using the same login node. This will somewhat defeat the purpose of the load-balancing mechanism used by the login nodes.

Connection failure with unix_listener error

If your connection fails with the following error message:

unix_listener: "..." too long for Unix domain socket
+
you're being hit by a macOS limitation, and you should replace the ControlPath line above by:
ControlPath ~/.ssh/%C
+

Connecting from abroad#

VPN

As a good security practice, we always recommend using the Stanford VPN when connecting from untrusted networks.

Access to Sherlock is not restricted to campus, meaning that you can connect to Sherlock from pretty much anywhere, including when traveling abroad. We don't restrict inbound SSH connections to any specific IP address range or geographical location, so you shouldn't have any issue reaching the login nodes from anywhere.

Regarding two-step authentication, University IT provides alternate authentication options when phone service or Duo Mobile push notifications are not available.


  1. The Generic Security Service Application Program Interface (GSSAPI, also GSS-API) is an application programming interface for programs to access security services. It allows programs to interact with security services such as Kerberos for user authentication. 

  2. Those instructions should work on Linux and macOS computers. For Windows, we recommend using the WSL, as described in the Prerequisites page. 

\ No newline at end of file diff --git a/docs/advanced-topics/job-management/index.html b/docs/advanced-topics/job-management/index.html new file mode 100644 index 000000000..f41300aef --- /dev/null +++ b/docs/advanced-topics/job-management/index.html @@ -0,0 +1,19 @@ + Job management - Sherlock

Job management

Job submission limits#

You may have encountered situations where your jobs get rejected at submission with errors like this:

sbatch: error: MaxSubmitJobsPerAccount
+sbatch: error: MaxSubmitJobsPerUser
+

There are a number of limits on Sherlock that are put in place to guarantee that all users have fair access to resources and a smooth experience while using them. One of those limits is about the total number of jobs a single user (and a single group) can have in queue at any given time. This helps ensure that the scheduler is able to continue operating in an optimal fashion, without being overloaded by a single user or group.

To see the job submission limits on Sherlock run the sh_part command.

To run longer than 2 days on the normal partition you will need to add the "long" QOS to your submission scripts. For example to run for exactly 3 days add the following two lines to your sbatch script:

#SBATCH --time=3-00:00:00
+#SBATCH --qos=long
+

If you have access to an owners partition you will not need to add this QOS since the MaxWall on owners is 7 days.

Minimizing the number of jobs in queue#

It's generally a good practice to try reducing the number of jobs submitted to the scheduler, and depending on your workflow, there are various approaches for this. One solution may be to pack more work within a single job, which could help in reducing the overall number of jobs you'll have to submit.

Imagine you have a 100-task array job, where you run 1 app task per array item, which looks like this:

#!/bin/bash
+#SBATCH --array=1-100
+#SBATCH -n 1
+
+./app ${SLURM_ARRAY_TASK_ID}
+

This script would create 100 jobs in queue (even though they would all be regrouped under the same job array), each using 1 CPU to run 1 task.

Instead of that 100-task array job, you can try something like this:

#!/bin/bash
+#SBATCH --array=0-99:10
+#SBATCH -n 10
+
+for i in {0..9}; do
+    srun -n 1 ./app $((SLURM_ARRAY_TASK_ID+i)) &
+done
+
+wait # important to make sure the job doesn't exit before the background tasks are done
+
  • --array=0-99:10 will use job array indexes 0, 10, 20 ... 90
  • -n 10 will make sure each job can be subdivided in 10 1-CPU steps
  • the for loop will launch 10 tasks, with indexes from SLURM_ARRAY_TASK_ID to SLURM_ARRAY_TASK_ID + 9.

This would submit a 10-task array job, with each task running 10 steps simultaneously on the 10 CPUs that each job array item will be allocated.

In the end, you'll have run the same number of app instances, but you'll have divided the number of jobs submitted by 10, which allows you to submit the same amount of work to the scheduler while staying under the submission limits.

\ No newline at end of file diff --git a/docs/advanced-topics/node-features/index.html b/docs/advanced-topics/node-features/index.html new file mode 100644 index 000000000..6b7ead4c1 --- /dev/null +++ b/docs/advanced-topics/node-features/index.html @@ -0,0 +1,23 @@ + Node features - Sherlock

Node features

In heterogeneous environments, computing resources are often grouped together into single pools of resources, to make things easier and more accessible. Most applications can run on any type of hardware, so having all resources regrouped in the same partitions maximizes utilization and makes job submission much easier, as users don't have dozens of options to choose from.

But for more specific use cases, it may be necessary to specifically select the hardware jobs will run on, either for performance or reproducibility purposes.

To that end, all the compute nodes on Sherlock have feature tags assigned to them. Multiple characteristics are available for each node, such as their class, CPU manufacturer, generation, part number and frequency, as well as Infiniband and GPU characteristics.

Requiring specific node features is generally not necessary

Using node features is an advanced topic which is generally not necessary to run simple jobs on Sherlock. If you're just starting, you most likely don't need to worry about those, they're only useful in very specific cases.

Available features#

The table below lists the possible features defined for each node.

Feature name Description Examples
CLASS:xxx Node type, as defined in the Sherlock catalog CLASS:SH3_CBASE, CLASS:SH3_G4TF64
CPU_MNF:xxx CPU manufacturer CPU_MNF:INTEL, CPU_MNF:AMD
CPU_GEN:xxx CPU generation CPU_GEN:RME for AMD Rome
CPU_GEN:SKX for Intel Skylake
CPU_SKU:xxx CPU name CPU_SKU:5118, CPU_SKU:7502P
CPU_FRQ:xxx CPU core base frequency CPU_FRQ:2.50GHz, CPU_FRQ:2.75GHz
GPU_BRD:xxx GPU brand GPU_BRD:GEFORCE, GPU_BRD:TESLA
GPU_GEN:xxx GPU generation GPU_GEN:VLT for Volta
GPU_GEN:AMP for Ampere
GPU_SKU:xxx GPU name GPU_SKU:A100_SXM4, GPU_SKU:RTX_3090
GPU_MEM:xxx GPU memory GPU_MEM:32GB, GPU_MEM:80GB
GPU_CC:xxx GPU Compute Capabilities GPU_CC:6.1, GPU_CC:8.0
IB:xxx Infiniband generation/speed IB:EDR, IB:HDR
NO_GPU special tag set on CPU-only nodes

Listing the features available in a partition#

All the node features available in a partition can be listed with the sh_node_feat command.

For instance, to list all the GPU types in the gpu partition:

$ sh_node_feat -p gpu | grep GPU_SKU
+GPU_SKU:P100_PCIE
+GPU_SKU:P40
+GPU_SKU:RTX_2080Ti
+GPU_SKU:V100_PCIE
+GPU_SKU:V100S_PCIE
+GPU_SKU:V100_SXM2
+

To list all the CPU generations available in the normal partition:

$ sh_node_feat -p normal | grep CPU_GEN
+CPU_GEN:BDW
+CPU_GEN:MLN
+CPU_GEN:RME
+CPU_GEN:SKX
+

Requesting specific node features#

Those node features can be used in job submission options, as additional constraints for the job, so that the scheduler will only select nodes that match the requested features.

Adding job constraints often increases job pending times

It's important to keep in mind that requesting specific node features usually increases job pending times in queue. The more constraints the scheduler has to satisfy, the smaller the pool of compute nodes jobs can run on, hence the longer it may take for the scheduler to find eligible resources to run those jobs.

To specify a node feature as a job constraint, the -C/--constraint option can be used.

For instance, to submit a job that should only run on an AMD Rome CPU, you can add the following to your job submission options:

#SBATCH -C CPU_GEN:RME
+

Or to make sure that your training job will run on a GPU with 80GB of GPU memory:

#SBATCH -G 1
+#SBATCH -C GPU_MEM:80GB
+

Multiple constraints#

For more complex cases, multiple constraints could be composed in different ways, using logical operators.

Many node feature combinations are impossible to satisfy

Many combinations will result in impossible conditions, and will make jobs impossible to run on any node. The scheduler is usually able to detect this and reject the job at submission time.

For instance, submitting a job requesting an Intel CPU on the HDR IB fabric:

#SBATCH -C 'CPU_MNF:INTEL&IB:HDR'
+

will result in the following error:

error: Job submit/allocate failed: Requested node configuration is not available
+

as all the compute nodes on the HDR IB fabric use AMD CPUs. Constraints must be used carefully and sparingly to avoid unexpected surprises.

Some of the possible logical operations between constraints are listed below:

AND#

Only nodes with all the requested features are eligible to run the job. The ampersand sign (&) is used as the AND operator. For example:

#SBATCH -C 'GPU_MEM:32GB&IB:HDR'
+

will request a GPU with 32GB of memory on the HDR Infiniband fabric to run the job.

OR#

Only nodes with at least one of the specified features will be eligible to run the job. The pipe sign (|) is used as the OR operator.

In multi-node jobs, it means that nodes allocated to the job may end up having different features. For example, the following options:

#SBATCH -N 2
+#SBATCH -C "CPU_GEN:RME|CPU_GEN:MLN"
+

may result in a two-node job where one node has an AMD Rome CPU, and the other node has an AMD Milan CPU.

Matching OR:#

When you need all nodes in a multi-node job to have the same set of features, a matching OR condition can be defined by enclosing the options within square brackets ([,]).

For instance, the following options may be used to request a job to run on nodes with the same frequency, either 2.50 GHz or 2.75 GHz:

#SBATCH -C "[CPU_FRQ:2.50GHz|CPU_FRQ:2.75GHz]"
+

Node features are text tags

Node features are text tags, they have no associated numerical value, meaning that they can't be compared.

For instance, it's not possible to add a constraint for GPU Compute Capabilities greater than 8.0. The workaround is to add a job constraint that satisfies all the possible values of that tag, like:

#SBATCH -C "GPU_CC:8.0|GPU_CC:8.6"
+

For more information, complete details about the --constraint/-C job submission option and its syntax can be found in the official Slurm documentation.

\ No newline at end of file diff --git a/docs/concepts/index.html b/docs/concepts/index.html new file mode 100644 index 000000000..5918d6ccb --- /dev/null +++ b/docs/concepts/index.html @@ -0,0 +1 @@ + Concepts - Sherlock

Concepts

Sherlock, a shared resource#

Sherlock is a shared compute cluster available for use by all Stanford faculty members and their research teams to support departmental or sponsored research.

Sherlock is a resource for research

Sherlock is not suitable for course work, class assignments or general-use training sessions.

Users interested in using computing resources in such contexts are encouraged to investigate FarmShare, Stanford’s community computing environment, which is primarily intended for supporting coursework.

It is open to the Stanford community as a computing resource to support departmental or sponsored research, thus a faculty member's sponsorship is required for all user accounts.

Usage policy

Please note that your use of this system falls under the "Computer and Network Usage Policy", as described in the Stanford Administrative Guide. In particular, sharing authentication credentials is strictly prohibited. Violation of this policy will result in termination of access to Sherlock.

Sherlock has been designed, deployed, and is maintained and operated by the Stanford Research Computing Center (SRCC) staff. The SRCC is a joint effort of the Dean of Research and IT Services to build and support a comprehensive program to advance computational research at Stanford.

Sherlock has been initially purchased and supported with seed funding from Stanford's Provost. It comprises a set of freely available compute nodes, a few specific resources such as large-memory machines and GPU servers, as well as the associated networking equipment and storage. These resources can be used to run computational codes and programs, and are managed through a job scheduler using a fair-share algorithm.

Data risk classification#

Low and Moderate Risk data

Sherlock is approved for computing with Low and Moderate Risk data only.

High Risk data

Sherlock is NOT approved to store or process HIPAA, PHI, PII nor any kind of High Risk data. The system is approved for computing with Low and Moderate Risk data only, and is not suitable to process High Risk data.

Users are responsible for ensuring the compliance of their own data.

For more information about data risk classifications, see the Information Security Risk Classification page.

Investing in Sherlock#

For users who need more than casual access to a shared computing environment, Sherlock also offers Faculty members the possibility to invest in additional, dedicated computing resources.

Unlike traditional clusters, Sherlock is a collaborative system where the majority of nodes are purchased and shared by the cluster users. When a user (typically a PI) purchases one or more nodes, they become an owner. Owners choose from a standard set of server configurations supported by SRCC staff (known as the Sherlock catalog) to add to the cluster.

When they're not in use, PI-purchased compute nodes can be used by other owners. This model also allows Sherlock owners to benefit from the scale of the cluster by giving them access to more compute nodes than their individual purchase, which gives them much greater flexibility than owning a standalone cluster.

The majority of Sherlock nodes are owners nodes

The vast majority of Sherlock's compute nodes have been purchased by individual PIs and groups, and PI purchases are the main driver behind the rapid expansion of the cluster, which went from 120 nodes to more than 1,000 nodes in less than 3 years.

The resource scheduler configuration works like this:

  • owners and their research teams get immediate and exclusive access to the resources they purchased,
  • when those nodes are idle, other owners can use them,
  • when the purchasing owners want to use their resources, jobs from other owners that may be running on them are preempted (i.e., killed and re-queued).

This provides a way to get more resources to run less important jobs in the background, while making sure that an owner always gets immediate access to his/her own nodes.

Participating owners also have shared access to the public, shared Sherlock nodes, along with everyone else.

Benefits#

Benefits to owners include:

no wait time in queue: immediate and exclusive access to the purchased nodes

access to more resources: possibility to submit jobs to the other owners' nodes when they're not in use

Compared to hosting and managing computing resources on your own, purchasing nodes on Sherlock provides:

  • data center hosting, including backup power and cooling
  • system configuration, maintenance and administration
  • hardware diagnostics and repairs

Those benefits come in addition to the other Sherlock advantages:

  • access to high-performance, large parallel scratch storage space
  • access to snapshot'ed, replicated, enterprise-class storage space
  • optimized software stack, especially tailored for a range of research needs
  • tools to build and install additional software applications as needed
  • user support

Limitations#

Purchasing nodes on Sherlock is different from traditional server hosting.

In particular, purchasing your own compute nodes on Sherlock will NOT allow:

root access: owner nodes on Sherlock are still managed by SRCC in accordance with Stanford's Minimum Security Standards. Although users are welcome to install (or request) any software they may need, purchasing compute nodes on Sherlock does not allow root access to the nodes.

running permanent services: permanent processes such as web servers or databases can only run on owner nodes through the scheduler, using recurring or persistent jobs. Purchasing compute nodes on Sherlock does not provide a way to run anything that couldn't run on freely-available nodes.

direct network connectivity: owners' nodes are connected to the Sherlock's internal network and are not directly accessible from the outside, which means that they can't host public services like web or application servers.

bypassing the scheduler: jobs running on owners' nodes still need to be submitted to the scheduler. Direct shell access to the nodes is not possible outside of scheduled interactive sessions.

hardware changes: the hardware components of purchased nodes cannot be modified, removed, swapped or upgraded during the nodes' service lifetime.

configuration: the configuration of purchased nodes is tuned to provide optimal performance over a majority of use cases and applications, is identical on all nodes across the cluster, and cannot be changed, modified or altered in any way.

persistent local storage: local storage space provided on the compute nodes is only usable for the duration of a job and cannot be used to store long-term data.

additional storage space: purchasing compute nodes on Sherlock does not provide additional storage space. Please note that SRCC does offer the possibility for PIs to purchase their own storage space on Oak, for their long-term research data needs.

Purchasing nodes#

If you are interested in becoming an owner, you can find the latest information about ordering Sherlock nodes on the ordering page. Feel free to contact us if you have any additional questions.

Cluster generations#

The research computing landscape evolves very quickly, and to both accommodate growth and technological advances, it's necessary to adapt the Sherlock environment to these evolutions.

Every year or so, a new generation of processors is released, which is why, over a span of several years, multiple generations of CPUs and GPUs make their way into Sherlock. This provides users with access to the latest features and performance enhancements, but it also adds some heterogeneity to the cluster, which is important to keep in mind when compiling software and requesting resources to run them.

Another key component of Sherlock is the interconnect network that links all of Sherlock's compute nodes together and acts as a backbone for the whole cluster. This network fabric is of finite capacity, and based on the individual networking switches characteristics and the typical research computing workflows, it can accommodate up to about 850 compute nodes.

As nodes get added to Sherlock, the number of available ports decreases, and at some point, the fabric gets full and no more nodes can be added. Sherlock reached that stage for the first time in late 2016, which prompted the installation of a whole new fabric, to allow for further system expansion.

This kind of evolution is the perfect opportunity to upgrade other components too: management software, ancillary services architecture and user applications. In January 2017, those components were completely overhauled and a new, completely separate cluster was kick-started, using a different set of hardware and software, while conserving the same storage infrastructure, to ease the transition process.

After a transition period, the older Sherlock hardware, compute and login nodes, have been merged into the new cluster, and from a logical perspective (connection, job scheduling and computing resources), nodes attached to each of the fabrics have been reunited to form a single cluster again.

As Sherlock continues to evolve and grow, the new fabric will also approach capacity again, and the same process will happen again to start the next generation of Sherlock.

Maintenances and upgrades#

The SRCC institutes a monthly scheduled maintenance window on Sherlock, to ensure optimal operation, avoid potential issues and prepare for future expansions. This window will be used to make hardware repairs, software and firmware updates, and perform general manufacturer recommended maintenance on our environment.

As often as possible, maintenance tasks are performed in a rolling, non-disruptive fashion, but downtimes are sometimes an unfortunate necessity to allow disruptive operations that can't be conducted while users are working on the system.

Maintenance schedule

As often as possible, maintenances will take place on the first Tuesday of every month, from 08:00 to 12:00 Pacific time (noon), and will be announced 2 weeks in advance, through the usual communication channels.

In case an exceptional amount of work is required, the maintenance window could be extended to 10 hours (from 08:00 to 18:00).

During these times, access to Sherlock will be unavailable, login will be disabled and jobs won't run. A reservation will be placed in the scheduler so running jobs can finish before the maintenance, and jobs that wouldn't finish by the maintenance window would be pushed after it.

Common questions#

Q: Why do maintenances at all?

A: Due to the scale of our computing environment and the increasing complexity of the systems we deploy, it is prudent to arrange for a regular time when we can comfortably and without pressure fix problems or update facilities with minimal impact to our customers. Most, if not all, major HPC centers have regular maintenance schedules. We also need to enforce the Minimum Security rules instituted by the Stanford Information Security Office, which mandate deployment of security patches in a timely manner.

Q: Why Tuesdays 08:00-12:00? Why not do this late at night?

A: We have observed that the least busy time for our services is at the beginning of the week in the morning hours. Using this time period should not interrupt most of our users. If the remote possibility of a problem that extends past the scheduled downtime occurs, we would have our full staff fresh and available to assist in repairs and quickly restore service.

Q: I have jobs running, what will happen to them?

A: For long-running jobs, we strongly recommend checkpointing your results on a periodic basis. Besides, we will place a reservation in the scheduler for each maintenance that would prevent jobs from running past it. This means that the scheduler will only allow jobs to run if they can finish by the time the maintenance starts. If you submit a long job soon before the maintenance, it will be delayed until after the maintenance. That will ensure that no work is lost when the maintenance starts.

\ No newline at end of file diff --git a/docs/credits/index.html b/docs/credits/index.html new file mode 100644 index 000000000..993be3c6b --- /dev/null +++ b/docs/credits/index.html @@ -0,0 +1,10 @@ + Credits - Sherlock

About us#

SRCC#

logo

The Stanford Research Computing Center (SRCC) is a joint effort of the Dean of Research and IT Services to build and support a comprehensive program to advance computational research at Stanford. That includes offering and supporting traditional high performance computing (HPC) systems, as well as systems for high throughput and data-intensive computing.

The SRCC also helps researchers transition their analyses and models from the desktop to more capable and plentiful resources, providing the opportunity to explore their data and answer research questions at a scale typically not possible on desktops or departmental servers. Partnering with national initiatives like the NSF XSEDE program as well as vendors, the SRCC offers training and learning opportunities around HPC tools and technologies.

For more information, please see the SRCC website

Credits#

We would like to thank the following companies for their generous sponsorship, and for providing services and resources that help us manage Sherlock every day:

The Sherlock website and documentation also rely on the following projects:

Why the Sherlock name?#

If you're curious about where the Sherlock name came from, we always considered that computing resources in general and HPC clusters in particular should be the catalyst of innovation, be ahead of their time, and spur new discoveries.

And what better account of what's happening on a high-performance computing cluster than Benedict Cumberbatch describing his role as Sherlock Holmes in the BBC's modern adaptation of Arthur Conan Doyle's classic?

Benedict Cumberbatch, about Sherlock

There's a great charge you get from playing him, because of the volume of words in your head and the speed of thought – you really have to make your connections incredibly fast. He is one step ahead of the audience, and of anyone around him with normal intellect. They can't quite fathom where his leaps are taking him.

Yes, exactly. That's Sherlock.

Sherlock, of HBO fame#

And finally, we couldn't resist the pleasure of citing the most prestigious accomplishment of Sherlock to date: a mention in HBO's Silicon Valley Season 4 finale!

screencap screencap

Yep, you got that right, Richard Hendricks wanted to use our very own Sherlock!

compression_stars Kudos to the show's crew and a big thank you to HBO Data compression stars, Professor Tsachy Weissman and Dmitri Pavlichin, for this incredible Sherlock shout-out. This has been an everlasting source of pride and amazement for the whole SRCC team! ❤

\ No newline at end of file diff --git a/docs/getting-started/connecting/index.html b/docs/getting-started/connecting/index.html new file mode 100644 index 000000000..3c3b275c4 --- /dev/null +++ b/docs/getting-started/connecting/index.html @@ -0,0 +1,70 @@ + Connecting - Sherlock

Connecting to Sherlock #

Sherlock account required

To be able to connect to Sherlock, you must first obtain a Sherlock account.

Credentials#

All users must have a Stanford SUNet ID and a Sherlock account to log in to Sherlock. Your Sherlock account uses the same username/password as your SUNet ID:

Username: SUNet ID
+Password: SUNet ID password
+

To request a Sherlock account, please see the Prerequisites page.

Resetting passwords

Sherlock does not store your SUNet ID password. As a consequence, we are unable to reset your password. If you require password assistance, please see the SUNet Account page.

Connection#

Access to Sherlock is provided via Secure Shell (SSH) login. Most Unix-like operating systems provide an SSH client by default that can be accessed by typing the ssh command in a terminal window.

To log in to Sherlock, open a terminal and type the following command, where <sunetid> should be replaced by your actual SUNet ID:

$ ssh <sunetid>@login.sherlock.stanford.edu
+

Upon logging in, you will be connected to one of Sherlock's load-balanced login nodes. You should be automatically directed to the least-loaded login node at the moment of your connection, which should give you the best possible environment to work.

Host keys#

Upon your very first connection to Sherlock, you will be greeted by a warning such as:

The authenticity of host 'login.sherlock.stanford.edu' can't be established.
+ECDSA key fingerprint is SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg.
+Are you sure you want to continue connecting (yes/no)?
+

The same warning will be displayed if you try to connect to one of the Data Transfer Nodes (DTN):

The authenticity of host 'dtn.sherlock.stanford.edu' can't be established.
+ECDSA key fingerprint is SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg.
+Are you sure you want to continue connecting (yes/no)?
+

This warning is normal: your SSH client warns you that it is the first time it sees that new computer. To make sure you are actually connecting to the right machine, you should compare the ECDSA key fingerprint shown in the message with one of the fingerprints below:

Key type Key Fingerprint
RSA SHA256:T1q1Tbq8k5XBD5PIxvlCfTxNMi1ORWwKNRPeZPXUfJA
legacy format: f5:8f:01:46:d1:f9:66:5d:33:58:b4:82:d8:4a:34:41
ECDSA SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg
legacy format: 70:4c:76:ea:ae:b2:0f:81:4b:9c:c6:5a:52:4c:7f:64

If they match, you can proceed and type ‘yes’. Your SSH program will then store that key and will verify it for every subsequent SSH connection, to make sure that the server you're connecting to is indeed Sherlock.

Host keys warning#

If you've connected to Sherlock 1.0 before, there's a good chance the Sherlock 1.0 keys were stored by your local SSH client. In that case, when connecting to Sherlock 2.0 using the sherlock.stanford.edu alias, you will be presented with the following message:

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ WARNING: POSSIBLE DNS SPOOFING DETECTED! @
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+The RSA host key for sherlock.stanford.edu has changed, and the key for
+the corresponding IP address 171.66.97.101 is unknown. This could
+either mean that DNS SPOOFING is happening or the IP address for the
+host and its host key have changed at the same time.
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+IT IS POSSIBLE THAT SOMEONE IS DOING SOMETHING NASTY!
+Someone could be eavesdropping on you right now (man-in-the-middle
+attack)!  It is also possible that a host key has just been changed.
+The fingerprint for the RSA key sent by the remote host is
+SHA256:T1q1Tbq8k5XBD5PIxvlCfTxNMi1ORWwKNRPeZPXUfJA.
+Please contact your system administrator.
+

You can just check that the SHA256 key listed in that warning message correctly matches the one listed in the table above, and if that's the case, you can safely remove the sherlock.stanford.edu entry from your ~/.ssh/known_hosts file with the following command on your local machine:

$ ssh-keygen -R sherlock.stanford.edu
+

and then connect again. You'll see the first-connection prompt mentioned above, and your SSH client will store the new keys for future connections.

Authentication#

Password#

To ease access and increase compatibility1 with different platforms, Sherlock allows a simple password-based authentication mechanism for SSH.2.

Upon connection, you will be asked for your SUNet ID password with the following prompt:

<sunetid>@login.sherlock.stanford.edu's password:
+

Enter your password, and if it's correct, you should see the following line:

Authenticated with partial success.
+

Second factor (2FA)#

Sherlock implements Stanford's Minimum Security Standards policies which mandate two-step authentication to access the cluster.

Two-step authentication protects your personal information and credentials by combining something only you know (your password) with something only you have (your phone, tablet or token). This prevents an attacker who would steal your password from actually using it to impersonate you. For more details about two-step authentication at Stanford, please refer to the University IT two-step page.

After successfully entering your password, you'll be prompted for your second authentication factor with a message like this:

Duo two-factor login for <sunetid>
+
+Enter a passcode or select one of the following options:
+
+ 1. Duo Push to XXX-XXX-9999
+ 2. Phone call to XXX-XXX-9999
+ 3. SMS passcodes to XXX-XXX-9999 (next code starts with: 9)
+
+Passcode or option (1-3):
+

Avoiding two-factor prompt on each connection

If you routinely open multiple sessions to Sherlock, having to confirm each one of them with a second authentication factor could rapidly become cumbersome. To work around this, the OpenSSH client allows multiplexing channels and re-using existing authenticated connections for opening new sessions. Please see the Advanced Connection Options page for more details.

If your second factor is accepted, you'll see the following message:

Success. Logging you in...
+

Troubleshooting#

Timeouts#

If you ever encounter timeout errors when connecting to Sherlock, like these:

$ ssh login.sherlock.stanford.edu
+ssh: connect to host login.sherlock.stanford.edu port 22: Operation timed out
+

you can try to either:

  • switch to a wired connection if you're connecting over wifi,
  • connect via the Stanford VPN

Authentication failures#

Excessive authentication failures

Entering an invalid password multiple times will result in a (temporary) ban of your IP address.

To prevent brute-force password guessing attacks on Sherlock login nodes, we automatically block IP addresses that generate too many authentication failures in a given time span. This results in a temporary ban of the infringing IP address, and the impossibility for the user to connect to Sherlock from that IP address.

When this happens, your SSH connection attempts will result in the following error:

ssh: connect to host login.sherlock.stanford.edu port 22: Connection refused
+

IP addresses blocked by this mechanism will automatically be authorized again after a few minutes.

SSHFS on macOS

SSHFS on macOS is known to try to automatically reconnect filesystem mounts after resuming from sleep or suspend, even without any valid credentials. As a result, it will generate a lot of failed connection attempts and likely make your IP address blacklisted on login nodes.

Make sure to unmount your SSHFS drives before putting your macOS system to sleep to avoid this situation.

VPN

If your IP got blocked and you have an urgent need to connect, before the automatic blacklist expiration, we recommend trying to connect through Stanford's VPN: your computer will then use a different IP address and will not be affected by the ban on your regular IP address.

Login#

Congratulations! You've successfully connected to Sherlock. You'll be greeted by the following message of the day:

             --*-*- Stanford Research Computing Center -*-*--
+                  ____  _               _            _
+                 / ___|| |__   ___ _ __| | ___   ___| | __
+                 \___ \| '_ \ / _ \ '__| |/ _ \ / __| |/ /
+                  ___) | | | |  __/ |  | | (_) | (__|   <
+                 |____/|_| |_|\___|_|  |_|\___/ \___|_|\_\
+
+-----------------------------------------------------------------------------
+  This system is for authorized users only and users must comply with all
+  Stanford computing, network and research policies. All activity may be
+  recorded for security and monitoring purposes. For more information, see
+  https://doresearch.stanford.edu/policies/research-policy-handbook and
+  https://adminguide.stanford.edu/chapter-6/subchapter-2/policy-6-2-1
+-----------------------------------------------------------------------------
+  Sherlock is *NOT* approved for storing or processing HIPAA, PHI, PII nor
+  any kind of High Risk data. Users are responsible for the compliance of
+  their data.
+  See https://uit.stanford.edu/guide/riskclassifications for details.
+-----------------------------------------------------------------------------
+
+        Docs         https://www.sherlock.stanford.edu/docs
+        Support      https://www.sherlock.stanford.edu/docs/#support
+
+        Web          https://www.sherlock.stanford.edu
+        News         https://news.sherlock.stanford.edu
+        Status       https://status.sherlock.stanford.edu
+
+-----------------------------------------------------------------------------
+

Once authenticated to Sherlock, you'll see the following prompt:

[<sunetid>@sh03-ln01 login! ~]$

It indicates the name of the login node you've been connected to, and a reminder that you're actually connected to a login node, not a compute node.

Login nodes are not for computing

Login nodes are shared among many users and therefore must not be used to run computationally intensive tasks. Those should be submitted to the scheduler which will dispatch them on compute nodes.

By contrast, the shell prompt on a compute node looks like this:

[<sunetid>@sh03-01n01 ~]$

Start computing#

To start computing, there's still an extra step required, which is requesting resources to run your application. It's all described in the next section.


  1. On Sherlock 1.0, GSSAPI tokens (based on Kerberos tickets) were the only allowed authentication method, which could cause some interoperability issues with third-party SSH clients. 

  2. For other methods of authentication, see the Advanced Connection Options page. 

\ No newline at end of file diff --git a/docs/getting-started/index.html b/docs/getting-started/index.html new file mode 100644 index 000000000..bb3e50133 --- /dev/null +++ b/docs/getting-started/index.html @@ -0,0 +1 @@ + Getting started - Sherlock

Getting started#

Prerequisites#

To start using Sherlock, you will need:

  • an active SUNet ID,

    What is a SUNet ID?

    A SUNet ID is a unique 3-8 character account name that identifies you as a member of the Stanford community, with access to the Stanford University Network of computing resources and services. Not to be confused with University ID (an 8-digit number that appears on your Stanford ID Card), your SUNet ID is a permanent and visible part of your Stanford identity and often appears in your Stanford email address (e.g. sunetid@stanford.edu).

    SUNet IDs are not managed by Research Computing. For more information, see https://accounts.stanford.edu/

    SUNet ID service levels and external collaborators

    Base-level service is sufficient for Sherlock accounts. External collaborators, or users without a SUNet ID, can be sponsored by a PI and get a sponsored SUNet ID at no cost. Please see the sponsorship page for more information.

  • a Sherlock account,

  • an SSH client,
  • good understanding of the concepts and terms used throughout this documentation,
  • some familiarity with Unix/Linux command-line environments, and notions of shell scripting.

How to request an account#

To request an account, the sponsoring Stanford faculty member should email srcc-support@stanford.edu, specifying the names and SUNet IDs of his/her research team members needing an account.

Sherlock is open to the Stanford community as a computing resource to support departmental or sponsored research, thus a faculty member's explicit consent is required for account requests.

Sherlock is a resource for research

Sherlock is a resource to help and support research, and is not suitable for course work, class assignments or general-use training sessions.

There is no fee associated with using Sherlock, and no limit on the number of accounts each faculty member can request. We will periodically ensure that all accounts associated with each PI are still active, and reserve the right to close any Sherlock account whose SUNet ID is expired.

SSH clients#

Linux #

Linux distributions usually come with a version of the OpenSSH client already installed. So no additional software installation is required. If not, please refer to your distribution's documentation to install it.

macOS #

macOS systems usually come with a version of the OpenSSH client already installed. So no additional software installation is required.

Windows #

Microsoft Windows includes an SSH client by default, that can be used to connect to Sherlock from a Windows terminal.

Windows also has a feature called the "Windows Subsystem for Linux" (WSL), which provides a Linux-like experience and makes switching across systems more seamless. Please refer to the official documentation or this HOWTO for installation instructions.

The two options above will ensure the best compatibility with the Sherlock environment. If you'd like to explore other avenues, many other SSH client implementations are available, but have not necessarily been tested with Sherlock, so your mileage may vary.

Unix/Linux resources#

A full tutorial on using Unix/Linux is beyond the scope of this documentation. However, there are many tutorials for beginning to use Unix/Linux on the web.

A few tutorials we recommend are:

More specifically about HPC and Research Computing:

Text editors#

Multiple text editors are available on Sherlock. For beginners, we recommend the use of nano. And for more advanced uses, you'll also find below some resources about using vim

Note: you can also create/edit files with the Sherlock OnDemand File editor

Shell scripting#

Compute jobs launched on Sherlock are most often initialized by user-written shell scripts. Beyond that, many common operations can be simplified and automated using shell scripts.

For an introduction to shell scripting, you can refer to:

\ No newline at end of file diff --git a/docs/getting-started/prerequisites/index.html b/docs/getting-started/prerequisites/index.html new file mode 100644 index 000000000..e0c38c74d --- /dev/null +++ b/docs/getting-started/prerequisites/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/getting-started/submitting/index.html b/docs/getting-started/submitting/index.html new file mode 100644 index 000000000..82cfe8e10 --- /dev/null +++ b/docs/getting-started/submitting/index.html @@ -0,0 +1,26 @@ + Submitting jobs - Sherlock

Submitting jobs

Principle#

Login nodes are not for computing

Login nodes are shared among many users and therefore must not be used to run computationally intensive tasks. Those should be submitted to the scheduler which will dispatch them on compute nodes.

Requesting resources#

A mandatory prerequisite for running computational tasks on Sherlock is to request computing resources. This is done via a resource scheduler, whose very purpose is to match compute resources in the cluster (CPUs, GPUs, memory, ...) with user resource requests.

The scheduler provides three key functions:

  1. it allocates access to resources (compute nodes) to users for some duration of time so they can perform work.
  2. it provides a framework for starting, executing, and monitoring work (typically a parallel job such as MPI) on a set of allocated nodes.
  3. it arbitrates contention for resources by managing a queue of pending jobs

Slurm#

Sherlock uses Slurm, an open-source resource manager and job scheduler, used by many of the world's supercomputers and computer clusters.

Slurm supports a variety of job submission techniques. By accurately requesting the resources you need, you’ll be able to get your work done.

Wait times in queue

As a quick rule of thumb, it's important to keep in mind that the more resources your job requests (CPUs, GPUs, memory, nodes, and time), the longer it may have to wait in queue before it could start.

In other words: accurately requesting resources to match your job's needs will minimize your wait times.

How to submit a job#

A job consists of two parts: resource requests and job steps.

Resource requests describe the amount of computing resource (CPUs, GPUs, memory, expected run time, etc.) that the job will need to successfully run.

Job steps describe tasks that must be executed.

Batch scripts#

The typical way of creating a job is to write a job submission script. A submission script is a shell script (e.g. a Bash script) whose first comments, if they are prefixed with #SBATCH, are interpreted by Slurm as parameters describing resource requests and submission options1.

The submission script itself is a job step. Other job steps are created with the srun command.

For instance, the following script would request one task with one CPU for 10 minutes, along with 2 GB of memory, in the default partition:

submit.sh
#!/bin/bash
+#
+#SBATCH --job-name=test
+#
+#SBATCH --time=10:00
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=1
+#SBATCH --mem-per-cpu=2G
+
+srun hostname
+srun sleep 60
+

When started, the job would run a first job step srun hostname, which will launch the command hostname on the node on which the requested CPU was allocated. Then, a second job step will start the sleep command.

You can create this job submission script on Sherlock using a text editor such as nano or vim, and save it as submit.sh.

#SBATCH directives syntax

#SBATCH directives must be at the top of the script

Slurm will ignore all #SBATCH directives after the first non-comment line (that is, the first line in the script that doesn't start with a # character). Always put your #SBATCH parameters at the top of your batch script.

Spaces in parameters will cause #SBATCH directives to be ignored

Slurm will ignore all #SBATCH directives after the first white space. For instance directives like those:

#SBATCH --job-name=big job
+
#SBATCH --mem=16 G
+
#SBATCH --partition=normal, owners
+
will cause all following #SBATCH directives to be ignored and the job to be submitted with the default parameters.

Job submission#

Once the submission script is written properly, you can submit it to the scheduler with the sbatch command. Upon success, sbatch will return the ID it has assigned to the job (the jobid).

$ sbatch submit.sh
+Submitted batch job 1377
+

Check the job#

Once submitted, the job enters the queue in the PENDING state. When resources become available and the job has sufficient priority, an allocation is created for it and it moves to the RUNNING state. If the job completes correctly, it goes to the COMPLETED state, otherwise, its state is set to FAILED.

You'll be able to check the status of your job and follow its evolution with the squeue -u $USER command:

$ squeue -u $USER
+     JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
+      1377    normal     test   kilian  R       0:12      1 sh02-01n01
+

The scheduler will automatically create an output file that will contain the result of the commands run in the script file. That output file is named slurm-<jobid>.out by default, but can be customized via submission options. In the above example, you can list the contents of that output file with the following commands:

$ cat slurm-1377.out
+sh02-01n01
+

Congratulations, you've submitted your first batch job on Sherlock!

What's next?#

Actually, quite a lot. Although you now know how to submit a simple batch job, there are many other options and areas to explore in the next sections:


  1. You can get the complete list of parameters by referring to the sbatch manual page (man sbatch). 

\ No newline at end of file diff --git a/docs/glossary/index.html b/docs/glossary/index.html new file mode 100644 index 000000000..d98abd6b5 --- /dev/null +++ b/docs/glossary/index.html @@ -0,0 +1 @@ + Glossary - Sherlock

Glossary

What's a cluster?#

A computing cluster is a federation of multiple compute nodes (independent computers), most commonly linked together through a high-performance interconnect network.

What makes it a "super-computer" is the ability for a program to address resources (such as memory, CPU cores) located in different compute nodes, through the high-performance interconnect network.

overview

On a computing cluster, users typically connect to login nodes, using a secure remote login protocol such as SSH. Unlike in traditional interactive environments, users then need to prepare compute jobs to submit to a resource scheduler. Based on a set of rules and limits, the scheduler will then try to match the jobs' resource requirements with available resources such as CPUs, memory or computing accelerators such as GPUs. It will then execute the user defined tasks on the selected resources, and generate output files in one of the different storage locations available on the cluster, for the user to review and analyze.

Cluster components#

The terms that are typically used to describe cluster components could be confusing, so in an effort to clarify things, here's a schema of the most important ones, and their definition. components

CPU#

A Central Processing Unit (CPU), or core, or CPU core, is the smallest unit in a microprocessor that can carry out computational tasks, that is, run programs. Modern processors typically have multiple cores.

Socket#

A socket is the connector that houses the microprocessor. By extension, it represents the physical package of a processor, that typically contains multiple cores.

Node#

A node is a physical, stand-alone computer, that can handle computing tasks and run jobs. It's connected to other compute nodes via a fast network interconnect, and contains CPUs, memory and devices managed by an operating system.

Cluster#

A cluster is the complete collection of nodes with networking and file storage facilities. It's usually a group of independent computers connected via a fast network interconnect, managed by a resource manager, which acts as a large parallel computer.

Other commonly used terms#

To make this documentation more accessible, we try to explain key terms in a non-technical way. When reading these pages, please keep in mind the following definitions, presented in alphabetical order:

Application#

An application is a computer program designed to perform a group of coordinated functions, tasks, or activities for the benefit of the user. In the context of scientific computing, an application typically performs computations related to a scientific goal (molecular dynamics simulations, genome assembly, computational fluid dynamics simulations, etc).

Backfill#

Backfill scheduling is a method that a scheduler can use in order to maximize utilization. It allows smaller (both in terms of size and time requirements), lower priority jobs to start before larger, higher priority ones, as long as doing so doesn't push back the higher-priority jobs' expected start time.

Executable#

A binary (or executable) program refers to the machine-code compiled version of an application. This is a binary file that a computer can execute directly, as opposed to the application source code, which is the human-readable version of the application internal instructions, and which needs to be compiled by a compiler to produce the executable binary.

Fairshare#

A resource scheduler ranks jobs by priority for execution. Each job's priority in queue is determined by multiple factors, one of which is the user's fairshare score. A user's fairshare score is computed based on a target (the given portion of the resources that this user should be able to use) and the user's effective usage, i.e. the amount of resources (s)he effectively used in the past. As a result, the more resources past jobs have used, the lower the priority of the next jobs will be. Past usage is computed based on a sliding window and progressively forgotten over time. This enables all users on a shared resource to get a fair portion of it for their own use, by giving higher priority to users who have been underserved in the past.

FLOPS#

Floating-point Operations Per Second (FLOPS) are a measure of computing performance, and represent the number of floating-point operations that a CPU can perform each second. Modern CPUs and GPUs are capable of doing TeraFLOPS (10^12 floating-point operations per second), depending on the precision of those operations (half-precision: 16 bits, single-precision: 32 bits, double-precision: 64 bits).

GPU#

A Graphical Processing Unit (GPU) is a specialized device initially designed to generate graphical output. On modern computing architecture, they are used to accelerate certain types of computation, which they are much faster than CPUs at. GPUs have their own memory, and are attached to CPUs, within a node. Each compute node can host one or more GPUs.

HPC#

High Performance Computing (HPC) refers to the practice of aggregating computing power to achieve higher performance than would be possible by using a typical computer.

Infiniband#

Infiniband is a networking standard that features high bandwidth and low latency. The current Infiniband devices are capable of transferring data at up to 200 Gbits/sec with less than a microsecond latency. As of this writing, the popular Infiniband versions are HDR (High Data Rate) with 200 Gbits/sec and EDR (Enhanced Data Rate) with 100 Gbits/sec.

IOPS#

Input/output operations per second (IOPS, pronounced eye-ops) is an input/output performance measurement used to characterize computer storage system performance.

Job#

A job, or batch job, is the scheduler’s base unit of computing by which resources are allocated to a user for a specified amount of time. Users create job submission scripts to ask the scheduler for resources such as cores, memory, runtime, etc. The scheduler puts the requests in a queue and allocates requested resources based on jobs’ priority.

Job step#

Job steps are sets of (possibly parallel) tasks within a job.

Login nodes#

Login nodes are points of access to a compute cluster. Users usually connect to login nodes via SSH to compile and debug their code, review their results, do some simple tests, and submit their batch jobs to the parallel computer.

Login nodes are not for computing

Login nodes are usually shared among many users and therefore must not be used to run computationally intensive tasks. Those should be submitted to the scheduler which will dispatch them on compute nodes.

Modules#

Environment modules, or software modules, are a type of software management tool used in most HPC environments. Using modules enables users to selectively pick the software that they want to use and add it to their environment. This allows users to switch between different versions or flavors of the same software, pick compilers, libraries and software components and avoid conflicts between them.

MPI#

Message Passing Interface (MPI) is a standardized and portable message-passing system designed to exchange information between processes running on different nodes. There are several implementations of the MPI standard, which is the most common way used to scale parallel applications beyond a single compute node.

OpenMP#

Open Multi Processing (OpenMP) is a parallel programming model designed for shared memory architecture. It's based on pragmas that can be added in applications to let the compiler generate a code that can run on multiple cores, within the same node.

Partition#

A partition is a set of compute nodes within a cluster with a common feature. For example, compute nodes with GPU, or compute nodes belonging to same owner, could form a partition.

On Sherlock, you can see detailed partition information with the sh_part or sinfo commands.

QOS#

A Quality Of Service (QOS) is the set of rules and limitations that apply to a category of jobs. The combination of a partition (set of machines where a job can run) and QOS (set of rules that applies to that job) makes what is often referred to as a scheduler queue.

Run time#

The run time, or walltime, of a job is the time required to finish its execution.

Scheduler#

The goal of a job scheduler is to find the appropriate resources to run a set of computational tasks in the most efficient manner. Based on resource requirements and job descriptions, it will prioritize those jobs, allocate resources (nodes, CPUs, memory) and schedule their execution.

Slurm#

Simple Linux Utility for Resource Management (SLURM) is software that manages computing resources and schedules tasks on them. Slurm coordinates the running of many programs on a shared facility and makes sure that resources are used in an optimal manner.

SSH#

Secure Shell (SSH) is a protocol to securely access remote computers. Based on the client-server model, multiple users with an SSH client can access a remote computer. Some operating systems such as Linux and Mac OS have a built-in SSH client and others can use one of many publicly available clients.

Thread#

A process, in the simplest terms, is an executing program. One or more threads run in the context of the process. A thread is the basic unit to which the operating system allocates processor time. A thread can execute any part of the process code, including parts currently being executed by another thread. Threads are co-located on the same node.

Task#

In the Slurm context, a task is to be understood as a process. A multi-process program is made of several tasks. A task is typically used to schedule a MPI process, that in turn can use several CPUs. By contrast, a multi-threaded program is composed of only one task, which uses several CPUs.
\ No newline at end of file diff --git a/docs/images/bighead.png b/docs/images/bighead.png new file mode 100644 index 000000000..9e73be97f Binary files /dev/null and b/docs/images/bighead.png differ diff --git a/docs/images/cluster_components.png b/docs/images/cluster_components.png new file mode 100644 index 000000000..52af1f68a Binary files /dev/null and b/docs/images/cluster_components.png differ diff --git a/docs/images/cluster_overview.png b/docs/images/cluster_overview.png new file mode 100644 index 000000000..52ce8f864 Binary files /dev/null and b/docs/images/cluster_overview.png differ diff --git a/docs/images/compression_stars.png b/docs/images/compression_stars.png new file mode 100644 index 000000000..7f23e1d12 Binary files /dev/null and b/docs/images/compression_stars.png differ diff --git a/docs/images/richard.png b/docs/images/richard.png new file mode 100644 index 000000000..4c2bbbf13 Binary files /dev/null and b/docs/images/richard.png differ diff --git a/docs/images/srcc.png b/docs/images/srcc.png new file mode 100644 index 000000000..29e6225a7 Binary files /dev/null and b/docs/images/srcc.png differ diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 000000000..2459739d7 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,16 @@ + Sherlock documentation - Sherlock

Sherlock documentation#

Sherlock

Welcome to Sherlock!#

Sherlock is a High-Performance Computing (HPC) cluster, operated by the Stanford Research Computing Center to provide computing resources to the Stanford community at large. You'll find all the documentation, tips, FAQs and information about Sherlock among these pages.

Why use Sherlock?#

Using Sherlock for your work provides many advantages over individual solutions: hosted in an on-premises, state-of-the-art datacenter, the Sherlock cluster is powered and cooled by installations that are optimized for scientific computing.

On Sherlock, simulations and workloads benefit from performance levels that only large scale HPC systems can offer: high-performance I/O infrastructure, petabytes of storage, large variety of hardware configurations, GPU accelerators, centralized system administration and management provided by the Stanford Research Computing Center (SRCC).

Such features are not easily accessible at the departmental level, and often require both significant initial investments and recurring costs. Joining Sherlock allows researchers and faculty members to avoid those costs and benefit from economies of scale, as well as to access larger, professionally managed computing resources than what would be available on an individual or even departmental basis.

How much does it cost?#

Sherlock is free to use for anyone doing departmental or sponsored research at Stanford. Any faculty member can request access for research purposes, and get an account with a base storage allocation and unlimited compute time on the global, shared pool of resources.

No CPU.hour charge

Unlike all Cloud Service Providers and many HPC systems, there is no usage charge on Sherlock.

When you submit your work on Sherlock, you don't need to keep an eye on the clock and worry about how much that run will cost you. There is no limit on the total amount of computing you can run on the cluster, as long as resources are available, and there's no charge to use them, no matter how large or small your computations are.

In case those free resources are not sufficient, Stanford Research Computing offers Faculty members the opportunity to invest in the cluster, and get access to additional computing resources for their research teams. Using a traditional compute cluster condominium model, participating faculty and their teams get priority access to the resources they purchase. When they're idle, those resources are available for use by other owners on the cluster, giving them access to virtually unlimited resources.

Information sources#

Searching the docs

If you're looking for information on a specific topic, the Search feature of this site will allow you to quickly find the page you're looking for. Just press S, F or / to open the Search bar and start typing.

To help users take their first steps on Sherlock, we provide documentation and information through various channels:

Channel URL Purpose
Documentation
You are here
www.sherlock.stanford.edu/docs information to help new users start on Sherlock, and more in-depth documentation for users already familiar with the environment.
Changelog news.sherlock.stanford.edu announcements, news and updates about Sherlock.
Dashboard status.sherlock.stanford.edu status of Sherlock's main components and services, outages, planned maintenance.

To get started, you can take a look at the concepts and glossary pages to get familiar with the terminology used throughout the documentation pages. Then, we recommend going through the following sections:

Acknowledgment / citation#

It is important and expected that publications resulting from computations performed on Sherlock acknowledge this. The following wording is suggested:

Acknowledgment

Some of the computing for this project was performed on the Sherlock cluster. We would like to thank Stanford University and the Stanford Research Computing Center for providing computational resources and support that contributed to these research results.

Support#

Research Computing support can be reached by sending an email to srcc-support@stanford.edu and mentioning Sherlock.

How to submit effective support requests

To ensure a timely and relevant response, please make sure to include some additional details, such as job ids, commands executed and error messages received, so we can help you better. For more details, see the Troubleshooting page.

As a member of the Sherlock community, you're also automatically subscribed to the sherlock-announce mailing-list, which is only used by the SRCC team to send important announcements about Sherlock.

Onboarding sessions#

We offer regular onboarding sessions for new Sherlock users.

On-boarding session times

On-boarding sessions are offered every first Wednesday of the month, 1PM-2PM PST, via Zoom

These one-hour sessions are a brief introduction to Sherlock's layout, its scheduler, the different file systems available on the cluster, as well as some job submission and software installation best practices for new users. They are a good intro course if you are new to Sherlock or HPC in general.

If you can't attend live on-boarding sessions, you can still take a look at the on-boarding slides as well as at this session recording.

Office hours#

Sending a question to srcc-support@stanford.edu is always the best first option for questions. That way you can include detailed descriptions of the problem or question, valuable output and error messages and any steps you took when you encountered your error. Also, everyone on our team will see your ticket, enabling the most appropriate group member to respond.

Office hours are a good place for more generalized questions about Sherlock, Slurm, Linux usage, data storage, queue structures/scheduling, job optimization and general capabilities of Sherlock. It's also useful for more technically nuanced questions that may not be easily answered with our ticketing system. In office hours some problems can indeed be solved quickly or progress can be made so that you can then work self-sufficiently towards a solution on your own.

COVID-19 update

We'll be holding remote office hours via Zoom, for the time being.

Office hours times

Click here to join the Sherlock Office Hours Zoom

  • Tuesday 10-11am
  • Thursday 3-4pm

You'll need a full-service SUNet ID (basically, a @stanford.edu email address) in order to authenticate and join Office Hours via Zoom. If you do not have a full service account, please contact us at srcc-support@stanford.edu.

If you can't make any of the Office Hours sessions, you can also make an appointment with Sherlock's support team.

What to expect#

  • We cannot accommodate walk-ins: we're unfortunately not staffed to welcome unscheduled visits, so please make sure that you're planning to stop by during office hours. We will not be able to help you otherwise.

  • We can rarely help with application-specific or algorithm problems.

  • You should plan your projects sufficiently in advance and not come to office hours at the last minute before a deadline. Sherlock is a busy resource with several thousand users and you should not expect your jobs to complete before a given date.

  • Not all questions and problems can be answered or solved during office hours, especially ones involving hardware, filesystem or network issues. Sherlock features several thousand computing, networking and storage components, that are constantly being monitored by our team. You can be sure that when Sherlock has an issue, we are aware of it and working on it.

User community#

Sherlock is present on the Stanford Slack Grid, and you're more than welcome to join the following channels:

  • #sherlock-announce, for announcements related to Sherlock and its surrounding services,
  • #sherlock-users, as a place for Sherlock users to connect directly with each other. If you have general questions about Sherlock, want to reach out to other Sherlock users to share tips, good practices, tutorials or other info, please feel free to do so there.

For more details about the SRCC Slack Workspace, and instructions on how to join this workspace and its channels, please see the SRCC support page.

Slack is not an official support channel

Please note that while SRCC staff will monitor these channels, the official way to get support is still to email us at srcc-support@stanford.edu.

Quick Start#

If you're in a rush1, here's a 3-step ultra-quick start:

  1. connect to Sherlock
$ ssh login.sherlock.stanford.edu
+
  1. get an interactive session on a compute node
[kilian@sh-ln01 login! ~]$ sh_dev
+
  1. run a command
[kilian@sh02-01n58 ~]$ module load python
+[kilian@sh02-01n58 ~]$ python -c "print('Hello Sherlock')"
+Hello Sherlock
+

Congrats! You ran your first job on Sherlock!

Replay#

Here's what it looks like in motion:


  1. even in a rush, you'll still need an account on the cluster. See the Prerequisites page for details. 

\ No newline at end of file diff --git a/docs/orders/index.html b/docs/orders/index.html new file mode 100644 index 000000000..ac510f91a --- /dev/null +++ b/docs/orders/index.html @@ -0,0 +1,16 @@ + Ordering nodes - Sherlock

Ordering nodes on Sherlock#

For research groups needing access to additional, dedicated computing resources on Sherlock, we offer the possibility for PIs to purchase their own compute nodes to add to the cluster.

Operating costs for managing and housing PI-purchased compute nodes are waived in exchange for letting other users make use of any idle compute cycles on the PI-owned nodes. Owners have priority access to the computing resources they purchase, but can access more nodes for their research if they need to. This provides the PI with much greater flexibility than owning a standalone cluster.

Conditions#

Service term#

Compute nodes are purchased for a duration of 4 years

Compute nodes are purchased and maintained based on a 4-year lifecycle, which is the duration of the equipment warranty and vendor support.

Owners will be notified during the 4th year that their nodes' lifetime is about to reach its term, at which point they'll be welcome to either:

  • renew their investment by purchasing new nodes,
  • continue to use the public portion of Sherlock's resources.

At the end of their service term, compute nodes are physically retired from the cluster, to make room for new equipment. Compute nodes may be kept running for an additional year at most after the end of their service term, while PIs plan for equipment refresh. Nodes failing during this period may not be repaired, and failed hardware will be disabled or removed from the system.

Please note that outside of exceptional circumstances, nodes purchased in Sherlock cannot be removed from the cluster before the end of their service term.

Shared ownership#

Minimum order of one node per PI

The number of nodes in a shared order must be greater than or equal to the number of purchasing PI groups.

For operational, administrative as well as usability reasons, we do not support shared ownership of equipment, meaning that multiple PI groups cannot purchase and share a single compute node. Shared orders have a minimum of one node per purchasing PI group.

Compute nodes catalog#

SRCC offers a select number of compute node configurations that have been tested and validated on Sherlock and that aim to cover most computing needs.

Sherlock catalog

Complete details are available in the Sherlock compute nodes catalog 3

Configurations#

We try to provide hardware configurations that can cover the needs and requirements of a wide range of computing applications, in various scientific fields, and to propose a spectrum of pricing tiers, as shown in the table below:

Type Description Recommended usage Price range
CBASE Base configuration Best per-core performance for serial applications, multi-threaded (OpenMP) and distributed (MPI) applications.
Most flexible and cost-effective configuration
$
CPERF High-core count configuration Multi-threaded applications requiring higher numbers of CPU cores $$
CBIGMEM Large-memory configuration Serial or multi-threaded applications requiring terabytes of memory (genome assembly, etc...) $$$$
G4FP32 Base GPU configuration Single-precision (FP32) GPU-accelerated applications (CryoEM, MD...) with low GPU memory requirements $$
G4FP64 HPC GPU configuration AI, ML/DL and GPU-accelerated HPC codes requiring double-precision (FP64) and larger amounts of GPU memory $$$
G4TF64
G8TF64
Best-in-class GPU configuration AI, ML/DL and GPU-accelerated HPC codes requiring double-precision (FP64), large amounts of GPU memory, and heavy multi-GPU scaling $$$$
Choosing the best node configuration for your needs

Although some configurations may appear cheaper when looking at the dollar/core ratio, this is not the only point to consider when determining the best configuration for your workload.

Performance per core

There are other factors to take into account, notably the memory and I/O bandwidth per core, which could be lower on higher core-count configurations like CPERF. With multiple times more cores than CBASE, they still provide the same total amount of bandwidth to remote and local storage, as well as, to a lesser extent, to memory. Higher core-count CPUs also often offer lower core frequencies, which combined with less bandwidth per core, may result in lower performance for serial jobs.

CPERF nodes are an excellent fit for multi-threaded applications that don't span multiple nodes. But for more diverse workloads, they don't offer the same level of flexibility as the CBASE nodes, which can run a mix of serial, multi-threaded and MPI applications equally well.

Resources availability

Another important factor to take into account is that having fewer nodes for a given number of cores offers less resilience against potential hardware failures: if a 128-core node becomes unavailable for some reason, that's 128 cores that nobody can use while the node is being repaired. But with 128 cores in 4x 32-core nodes, if a node fails, there are still 96 cores that can be used.

We'll be happy to help you determine the best configuration for your computing needs, feel free to reach out to schedule a consultation.

Configuration details for the different compute node types are listed in the Sherlock compute nodes catalog 3

Prices#

Prices for the different compute node types are listed in the Sherlock compute nodes catalog 3. They include tax and shipping fees, and are subject to change when quoted: they tend to follow the market-wide variations induced by global political and economic events, which are way outside of our control. Prices are provided there as a guideline for expectations.

There are two components in the cost of a compute node purchase:

  1. the cost of the hardware itself (capital purchase),

  2. a one-time, per-node infrastructure fee1 that will be charged to cover the costs of connecting the nodes to the cluster infrastructure (racks, PDUs, networking switches, cables...)

No recurring fees

There is currently no recurring fee associated with purchasing compute nodes on Sherlock. In particular, there is no CPU.hour charge: purchased nodes are available to their owners 100% of the time, at no additional cost.

Currently, there are no user, administrative or management fees associated with ongoing system administration of the Sherlock environment. However, PIs should anticipate the eventuality of modest system administration and support fees being levied within the 4 year lifetime of their compute nodes.

Purchasing process#

Minimum purchase

Please note that the minimum purchase is one physical server per PI group. We cannot accommodate multiple PIs pooling funds for a single node.

Single-node orders may incur additional delays

Some node configurations need to be ordered from the vendor in sets of 4 nodes (see the Sherlock catalog for details). So orders for quantities that are not multiples of 4 will need to be grouped with other PIs' orders, which may incur additional delays.

Purchasing nodes on Sherlock is usually a 5-step process:

  1. the PI uses the order form to submit an order,
  2. SRCC requests a formal vendor quote to finalize pricing and communicate it back to the PI for approval,
  3. SRCC submits a Stanford PO to the vendor,
  4. SRCC takes delivery of the hardware and proceeds to its installation,
  5. SRCC notifies the PI that their nodes are ready to be used.

The delay between a PO submission to the vendor and the availability of the compute nodes to the PIs is typically between 4 and 8 weeks.

Supply chain disruption and component shortages

Global supply chain issues and component shortages have considerably increased lead times, and compute node deliveries are currently in the 6-month range.

Required information#

To place an order, we'll need the following information:

  • The SUNet ID of the PI making the purchase request
  • A PTA2 number to charge the hardware (capital) portion of the purchase
  • A PTA2 number to charge the per-node infrastructure fees (non-capital)
    It could be the same PTA used for the capital portion of the purchase, or a different one

Hardware costs could be spread over multiple PTAs (with a maximum of 2 PTAs per order). But please note that the infrastructure fees have to be charged to a single PTA.

Placing an order#

To start ordering compute nodes for Sherlock:

check the Sherlock catalog 3 to review prices and select your configurations

Choose

fill in the order form 3 to submit your request and provide the required information

Order

And we'll be in touch shortly!


  1. infrastructure fees are considered non-capital for cost accounting purposes and may incur indirect cost burdens on cost-reimbursable contracts and grants. 

  2. PTA is an acronym used for a Project-Task-Award combination representing an account in the Stanford Financial system. 

  3. SUNet ID required, document restricted to @stanford.edu accounts. 

\ No newline at end of file diff --git a/docs/overview/about/index.html b/docs/overview/about/index.html new file mode 100644 index 000000000..6f1fcf0e4 --- /dev/null +++ b/docs/overview/about/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/concepts/index.html b/docs/overview/concepts/index.html new file mode 100644 index 000000000..d7ee56184 --- /dev/null +++ b/docs/overview/concepts/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/index.html b/docs/overview/index.html new file mode 100644 index 000000000..e0c38c74d --- /dev/null +++ b/docs/overview/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/introduction/index.html b/docs/overview/introduction/index.html new file mode 100644 index 000000000..083e1867d --- /dev/null +++ b/docs/overview/introduction/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/orders/index.html b/docs/overview/orders/index.html new file mode 100644 index 000000000..f9894f70c --- /dev/null +++ b/docs/overview/orders/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/orders/process/index.html b/docs/overview/orders/process/index.html new file mode 100644 index 000000000..4e23a8b7f --- /dev/null +++ b/docs/overview/orders/process/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/tech/facts/index.html b/docs/overview/tech/facts/index.html new file mode 100644 index 000000000..4d32a662b --- /dev/null +++ b/docs/overview/tech/facts/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... 
+ + diff --git a/docs/overview/tech/glossary/index.html b/docs/overview/tech/glossary/index.html new file mode 100644 index 000000000..82c15389f --- /dev/null +++ b/docs/overview/tech/glossary/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/tech/index.html b/docs/overview/tech/index.html new file mode 100644 index 000000000..9b729243b --- /dev/null +++ b/docs/overview/tech/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/tech/specs/index.html b/docs/overview/tech/specs/index.html new file mode 100644 index 000000000..875a2d208 --- /dev/null +++ b/docs/overview/tech/specs/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/overview/tech/status/index.html b/docs/overview/tech/status/index.html new file mode 100644 index 000000000..a7f144eca --- /dev/null +++ b/docs/overview/tech/status/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/software/containers/index.html b/docs/software/containers/index.html new file mode 100644 index 000000000..93cc6218d --- /dev/null +++ b/docs/software/containers/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/software/index.html b/docs/software/index.html new file mode 100644 index 000000000..6e3cba5d8 --- /dev/null +++ b/docs/software/index.html @@ -0,0 +1 @@ + Software on Sherlock - Sherlock

Software on Sherlock#

Available software#

A set of supported software installations is provided for use on Sherlock. This software is made available through a Software Modules system. For the complete list of available software, please refer to the Software List page.

Licensed software can be used on Sherlock, under certain conditions. Feel free to contact us for more details or if you have questions. For more information about purchasing software licenses, you can contact the Stanford Software Licensing office.

Installation requests#

Installation requests

The SRCC team installs, for general use, a set of libraries, tools and software applications that are commonly used across many research groups. However, our staff resources are quite limited and don't allow us to build or maintain custom software applications that may be requested by or be of use to a small number of users.

We strongly encourage users to build custom and field- or domain-specific software themselves, and install it in their own personal or group shared directories. That way, they can share the software installations with the rest of the users in their group, if necessary.

Users may even maintain and publish their own local module files to dynamically configure a running environment to use the software. They could share those modules with other users to simplify the use of their own custom software installations.

Installing your own software

For more information about building your own software on Sherlock, please see the Software Installation page.

If the software you need is not in the list of available software, and you have trouble installing it on your own, please contact us with as many details about the package as possible, and we will try to help you install it.

If it's widely used software that could benefit multiple users across different scientific communities, we will consider installing it globally as resources permit1.

Contributed software#

PI groups and labs can share their software installations and modules with the whole Sherlock user community, and let everyone benefit from their tuning efforts and software developments.

Contributed software is supported and maintained by each lab, and contact information is usually provided in the contribs module. See the Modules page for more information about using software modules on Sherlock.

If you're interested in sharing your software installations beyond your own group on Sherlock, please let us know, and we'll get in touch.


  1. Software requests, including version upgrades, are fulfilled in the order they are received, and as time permits. We don't have any dedicated team for software installations, and requests are handled along with other duties, typically within two to three weeks of being received. 

\ No newline at end of file diff --git a/docs/software/install/index.html b/docs/software/install/index.html new file mode 100644 index 000000000..c7c9af378 --- /dev/null +++ b/docs/software/install/index.html @@ -0,0 +1 @@ + Installation - Sherlock

Installation

Software installation requests

For more information about software installation requests, please see the Software Overview page

If the software package or version you need is not available in the list of provided software, you may compile and install it yourself. The recommended location for user-installed software is the $GROUP_HOME group shared directory, which is snapshotted and replicated off-site, and can easily be shared with members of a research group.

🚧 Work in progress 🚧

This page is a work in progress and is not complete yet. We are actively working on adding more content and information.

\ No newline at end of file diff --git a/docs/software/list/index.html b/docs/software/list/index.html new file mode 100644 index 000000000..53aeb4857 --- /dev/null +++ b/docs/software/list/index.html @@ -0,0 +1 @@ + List - Sherlock

Software list#

The full list of software centrally installed and managed on Sherlock is in the tables below.

Permanent work in progress

Software installations on Sherlock are an ever ongoing process. We're continuously adding new software to the list. If you're looking for something that is not in the list, there may be other options.

Subscribe to updates

Never want to miss a software update again? Stay up-to-date with new software updates by following the Sherlock software update RSS feed.

Categories#

Software modules on Sherlock are organized in categories, by scientific field or functional class. It means that you will have to first load a category module before getting access to individual modules. The math and devel categories are loaded by default. See the Modules page for further details and examples.

We currently provide 570 software modules, in 7 categories, covering 93 fields of science:

  • biology clinical science, computational biology, cryo-em, genomics, molecular biology, neurology, pathology, phylogenetics, population genetics, radiology, workflow management

  • chemistry cheminformatics, computational chemistry, crystallography, docking, electrostatics, molecular dynamics, quantum chemistry, tools

  • devel build, compiler, data, data analytics, debug, engine, framework, IDE, language, lib, mpi, networking, parser, profiling, runtime

  • math computational geometry, deep learning, graph computing, lib, linear algebra, machine learning, numerical analysis, numerical library, optimization, scientific computing, statistics, symbolic, technical computing, topic modelling

  • physics astronomy, CFD, cliemate modeling, climate modeling, geophysics, geoscience, lib, magnetism, materials science, micromagnetics, particle, photonics, quantum information science, quantum mechanics

  • system backup, benchmark, checkpointing, cloud interface, compiler, compression, containers, database, document management, document processing, file management, file transfer, framework, hardware, job management, language, libs, media, performance, resource monitoring, scm, shell, testing, tools

  • viz data, gis, graphs, imaging, molecular visualization, plotting, remote display

Licensed software

Access to software modules marked with in the tables below is restricted to properly licensed user groups.

SRCC is not funded to provide commercial software on Sherlock and researchers are responsible for the costs of purchasing and renewing commercial software licenses. For more information, please feel free to contact us and see the Stanford Software Licensing page for purchasing information.

Additional flags and features

Some of the modules listed below have been built to support specific architectures or parallel execution modes:

  • versions marked with support GPU acceleration
  • versions marked with support MPI parallel execution
  • versions marked with are the default version for the module

biology#

Field Module name Version(s) URL Description
clinical science simvascular 20180704
Website Simvascular is a blood flow simulation and analysis toolkit. This module provides the svFSI (Fluid Solid Interaction) solver.
computational biology py-biopython 1.70_py27
1.79_py36
1.79_py39
Website Biopython is a set of freely available tools for biological computation written in Python.
computational biology rosetta 3.8 
Website Rosetta is the premier software suite for modeling macromolecular structures. As a flexible, multi-purpose application, it includes tools for structure prediction, design, and remodeling of proteins and nucleic acids.
cryo-em ctffind 4.1.13
Website ctffind is a program for finding CTFs of electron micrographs.
cryo-em eman2 2.2  
2.91  
Website EMAN2 is a broadly based greyscale scientific image processing suite with a primary focus on processing data from transmission electron microscopes.
cryo-em imod 4.9.12 
4.11.5 
Website IMOD is a set of image processing, modeling and display programs used for tomographic reconstruction and for 3D reconstruction of EM serial sections and optical sections.
cryo-em motioncor2 1.3.1  
1.5.0 
1.6.4 
Website MotionCor2 is a multi-GPU accelerated program which corrects anisotropic image motion at the single pixel level.
cryo-em py-topaz 0.2.4_py36 
0.2.5_py39 
Website A pipeline for particle detection in cryo-electron microscopy images using convolutional neural networks trained from positive and unlabeled examples.
cryo-em relion 2.0.3  
2.1  
4.0.1  
Website RELION (for REgularised LIkelihood OptimisatioN, pronounce rely-on) is a stand-alone computer program that employs an empirical Bayesian approach to refinement of (multiple) 3D reconstructions or 2D class averages in electron cryo-microscopy (cryo-EM).
genomics angsd 0.919
0.931
Website ANGSD is a software for analyzing next generation sequencing data.
genomics augustus 3.3.2
Website AUGUSTUS is a program that predicts genes in eukaryotic genomic sequences.
genomics bamtools 2.5.1
Website BamTools is a project that provides both a C++ API and a command-line toolkit for reading, writing, and manipulating BAM (genome alignment) files.
genomics bcftools 1.6
1.8
1.16
Website BCFtools is a program for variant calling and manipulating files in the Variant Call Format (VCF) and its binary counterpart BCF.
genomics bcl-convert 4.2.7
Website The BCL Convert App generates demultiplexed FASTQ files from a run as input.
genomics bcl2fastq 2.20
Website The bcl2fastq2 conversion software can be used to convert BCL files from MiniSeq, MiSeq, NextSeq, HiSeq, iSeq and NovaSeq sequencing systems.
genomics bedops 2.4.40
Website BEDOPS is an open-source command-line toolkit that performs highly efficient and scalable Boolean and other set operations, statistical calculations, archiving, conversion and other management of genomic data of arbitrary scale.
genomics bedtools 2.27.1
2.30.0
Website The bedtools utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks.
genomics bgen 1.1.4
Website bgen is the reference implementation of the BGEN format, a binary file format for imputed genotype and haplotype data.
genomics bowtie 1.2.2
Website Bowtie is an ultrafast, memory-efficient short read aligner.
genomics bowtie2 2.3.4.1
Website Bowtie 2 is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences.
genomics breseq 0.38.1
Website breseq is a computational pipeline for finding mutations relative to a reference sequence in short-read DNA resequencing data.
genomics bwa 0.7.17
Website BWA (Burrows-Wheeler Aligner) is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome.
genomics canu 1.8
Website A single molecule sequence assembler for genomes large and small.
genomics cellranger 7.1.0
Website Cell Ranger is a set of analysis pipelines that process Chromium single-cell RNA-seq output to align reads, generate gene-cell matrices and perform clustering and gene expression analysis.
genomics cufflinks 2.2.1
Website Cufflinks assembles transcripts, estimates their abundances, and tests for differential expression and regulation in RNA-Seq samples.
genomics dorado 0.3.4
Website Dorado is a high-performance, easy-to-use, open source basecaller for Oxford Nanopore reads.
genomics fastqc 0.11.8
Website FastQC aims to provide a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines.
genomics fastx_toolkit 0.0.14
Website The FASTX-Toolkit is a collection of command line tools for Short-Reads FASTA/FASTQ files preprocessing.
genomics freebayes 1.2.0
Website FreeBayes is a Bayesian genetic variant detector designed to find small polymorphisms.
genomics gatk 4.1.0.0
4.1.4.1
Website GATK (Genome Analysis Toolkit) offers a wide variety of tools with a primary focus on variant discovery and genotyping.
genomics gemma 0.98.5
Website GEMMA is a software toolkit for fast application of linear mixed models (LMMs) and related models to genome-wide association studies (GWAS) and other large-scale data sets.
genomics hic-pro 2.10.0
Website HiC-Pro: An optimized and flexible pipeline for Hi-C data processing.
genomics hisat2 2.1.0
Website HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes (as well as to a single reference genome).
genomics htslib 1.6
1.8
1.10.2
1.14
1.16
Website C library for high-throughput sequencing data formats.
genomics jellyfish 2.2.10
Website A fast multi-threaded k-mer counter.
genomics kallisto 0.44.0 
0.46.1
0.50.1
Website kallisto is a program for quantifying abundances of transcripts from RNA-Seq data using high-throughput sequencing reads.
genomics metal 20110325
Website The METAL software is designed to facilitate meta-analysis of large datasets (such as several whole genome scans) in a convenient, rapid and memory efficient manner.
genomics mixcr 2.1.12
4.6.0
Website MiXCR is a universal framework that processes big immunome data from raw sequences to quantitated clonotypes.
genomics ncbi-blast+ 2.6.0
2.7.1
2.11.0
Website NCBI BLAST+ is a suite of command-line tools to run BLAST (Basic Local Alignment Search Tool), an algorithm for comparing primary biological sequence information.
genomics ncbi-vdb 3.0.7
Website NCBI VDB is the database engine used by NCBI SRA tools.
genomics plink 1.07
1.90b5.3
2.0a1
2.0a2
Website PLINK is a free, open-source whole genome association analysis toolset, designed to perform a range of basic, large-scale analyses in a computationally efficient manner.
genomics popscle 0.1
Website popscle is a suite of population scale analysis tools for single-cell genomics data.
genomics py-busco 3.0.2_py27
Website Assessing genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs (BUSCO).
genomics py-bx-python 0.8.1_py27
0.8.13_py39
Website Tools for manipulating biological data, particularly multiple sequence alignments.
genomics py-cutadapt 1.18_py27 
1.18_py36
Website Cutadapt finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads.
genomics py-deeplabcut 2.2.3_py39 
Website A software package for animal pose estimation.
genomics py-deeptools 3.3.1_py36
Website Tools to process and analyze deep sequencing data.
genomics py-fithic 1.1.3_py27
Website Fit-Hi-C is a tool for assigning statistical confidence estimates to chromosomal contact maps produced by genome architecture assays.
genomics py-htseq 2.0.1_py39
Website HTSeq is a Python library to facilitate processing and analysis of data from high-throughput sequencing (HTS) experiments.
genomics py-macs2 2.1.1_py27
Website MACS (Model-based Analysis of ChIP-Seq) implements a novel ChIP-Seq analysis method.
genomics py-mageck 0.5.9.4_py36
Website Model-based Analysis of Genome-wide CRISPR-Cas9 Knockout (MAGeCK) is a computational tool to identify important genes from the recent genome-scale CRISPR-Cas9 knockout screens technology.
genomics py-mapdamage 2.2.1_py36
Website mapDamage2 is a computational framework which tracks and quantifies DNA damage patterns among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms.
genomics py-multiqc 1.6_py27 
1.6_py36
Website MultiQC is a reporting tool that parses summary statistics from results and log files generated by other bioinformatics tools.
genomics py-obitools 1.2.13_py27
Website OBITools is a set of programs designed for analyzing NGS data in a DNA metabarcoding context.
genomics py-orthofinder 2.5.4_py39
Website OrthoFinder is a fast, accurate and comprehensive platform for comparative genomics.
genomics py-pybedtools 0.8.0_py27
0.8.2_py36
0.9.0_py39
Website Pybedtools wraps and extends BEDTools and offers feature-level manipulations from within Python.
genomics py-pysam 0.14.1_py27
0.15.3_py36
0.18.0_py39
Website Pysam is a python module for reading, manipulating and writing genomic data sets.
genomics py-scanpy 1.8.2_py39
Website Scanpy is a scalable toolkit for analyzing single-cell gene expression data.
genomics py-vcf2gwas 0.8.9_py39
Website Python API for comprehensive GWAS analysis using GEMMA.
genomics py-vispr 0.4.17_py36
Website A visualization framework for CRISPR/Cas9 knockout screens, analyzed with MAGeCK.
genomics regenie 2.2.4
Website regenie is a C++ program for whole genome regression modelling of large genome-wide association studies.
genomics rsem 1.3.3
Website RSEM is a software package for estimating gene and isoform expression levels from RNA-Seq data.
genomics salmon 0.12.0
Website Highly-accurate & wicked fast transcript-level quantification from RNA-seq reads using lightweight alignments.
genomics samtools 1.6
1.8
1.16.1
Website Tools (written in C using htslib) for manipulating next-generation sequencing data.
genomics sentieon 201808.01 
202112.01 
Website Sentieon Genomics software is a set of software tools that perform analysis of genomic data obtained from DNA sequencing.
genomics shapeit 4.0.0 
4.2.2
Website SHAPEIT4 is a fast and accurate method for estimation of haplotypes (aka phasing) for SNP array and high coverage sequencing data.
genomics sra-tools 2.11.0
3.0.7
Website The SRA Toolkit and SDK from NCBI is a collection of tools and libraries for using data in the INSDC Sequence Read Archives.
genomics star 2.5.4b
2.7.10b
Website STAR: ultrafast universal RNA-seq aligner.
genomics stringtie 2.2.1
Website StringTie is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts.
genomics tophat 2.1.1
Website TopHat is a fast splice junction mapper for RNA-Seq reads.
genomics trim_galore 0.5.0
Website Trim Galore! is a wrapper script to automate quality and adapter trimming as well as quality control, with some added functionality to remove biased methylation positions for RRBS sequence files.
genomics trinity 2.8.4
2.13.1
Website Trinity RNA-Seq de novo transcriptome assembly.
genomics vcflib 1.0.0
Website A C++ library for parsing and manipulating VCF files.
genomics vcftools 0.1.15
Website VCFtools is a program package designed for working with VCF files, such as those generated by the 1000 Genomes Project.
genomics viennarna 2.5.1
Website A C code library and several stand-alone programs for the prediction and comparison of RNA secondary structures.
molecular biology dssp 4.0.3
Website DSSP is an application to assign secondary structure to proteins.
molecular biology libcifpp 3.0.0
Website Library to work with mmCIF and PDB files.
neurology afni 17.2.07
18.2.04
21.3.00
Website AFNI (Analysis of Functional NeuroImages) is a set of C programs for processing, analyzing, and displaying functional MRI (FMRI) data - a technique for mapping human brain activity.
neurology ants 2.1.0
2.3.1
2.4.0
Website ANTs computes high-dimensional mappings to capture the statistics of brain structure and function.
neurology bart 0.7.00 
Website BART is a toolbox for Computational Magnetic Resonance Imaging.
neurology dcm2niix 1.0.20171215
1.0.20211006
Website dcm2niix is a program designed to convert neuroimaging data from the DICOM format to the NIfTI format.
neurology freesurfer 6.0.1
7.1.1
7.2.0
7.3.2
7.4.1
Website An open source software suite for processing and analyzing (human) brain MRI images.
neurology fsl 5.0.10 
Website FSL is a comprehensive library of analysis tools for FMRI, MRI and DTI brain imaging data.
neurology mricron 20160502
Website MRIcron is a cross-platform NIfTI format image viewer.
neurology mrtrix 0.3.16
3.0.3
Website MRtrix3 provides a set of tools to perform various types of diffusion MRI analyses, from various forms of tractography through to next-generation group-level analyses.
neurology py-mdt 0.10.9_py36 
Website The Maastricht Diffusion Toolbox, MDT, is a framework and library for parallelized (GPU and multi-core CPU) diffusion Magnetic Resonance Imaging (MRI) modeling.
neurology py-nipype 1.1.3_py27
1.1.3_py36
Website Nipype is a Python project that provides a uniform interface to existing neuroimaging software and facilitates interaction between these packages within a single workflow.
neurology spm 12
Website The SPM software package has been designed for the analysis of brain imaging data sequences. The sequences can be a series of images from different cohorts, or time-series from the same subject.
neurology workbench 1.3.1
Website Connectome Workbench is an open source, freely available visualization and discovery tool used to map neuroimaging data, especially data generated by the Human Connectome Project.
pathology openslide 3.4.1
Website OpenSlide is a C library that provides a simple interface to read whole-slide images (also known as virtual slides).
pathology py-openslide-python 1.1.1_py27 
1.1.1_py36
Website OpenSlide Python is a Python interface to the OpenSlide library.
phylogenetics py-ete 3.0.0_py27
Website A Python framework for the analysis and visualization of trees.
population genetics py-admixfrog 0.6.1_py36
Website Admixfrog is a HMM to infer ancestry frogments (fragments) from low-coverage, contaminated data.
radiology nbia-data-retriever 4.2
Website The NBIA Data Retriever is an application to download radiology images from the TCIA Radiology Portal.
workflow management nextflow 23.04.3
Website Nextflow is a bioinformatics workflow manager that enables the development of portable and reproducible workflows.

chemistry#

Field Module name Version(s) URL Description
cheminformatics py-rdkit 2018.09.1_py27 
2018.09.1_py36
2022.09.1_py39
Website RDKit is a collection of cheminformatics and machine-learning software written in C++ and Python.
computational chemistry gaussian g16.A03  
g16.B01  
Website Gaussian is a general purpose computational chemistry software package.
computational chemistry libint 1.1.4
2.0.3
2.6.0
Website Libint computes molecular integrals.
computational chemistry libxc 3.0.0
5.2.2
Website Libxc is a library of exchange-correlation functionals for density-functional theory.
computational chemistry nwchem 6.8  
7.0.2  
Website NWChem is an ab initio computational chemistry software package which also includes quantum chemical and molecular dynamics functionality.
computational chemistry py-ase 3.14.1_py27
3.22.1_py39
Website The Atomic Simulation Environment (ASE) is a set of tools and Python modules for setting up, manipulating, running, visualizing and analyzing atomistic simulations.
computational chemistry schrodinger 2021-1    
2017-3   
2018-1   
2018-2   
2019-2   
2020-2   
2022-3   
Website Schrödinger Suites (Small-molecule Drug Discovery Suite, Material Science Suite, Biologics Suite) provide a set of molecular modelling software.
computational chemistry vasp 5.4.1    
6.1.1   
6.3.2   
6.4.1   
Website The Vienna Ab initio Simulation Package (VASP) is a computer program for atomic scale materials modelling, e.g. electronic structure calculations and quantum-mechanical molecular dynamics, from first principles.
crystallography clipper 2.1.20180802
Website Crystallographic automation and complex data manipulation libraries.
crystallography mmdb2 2.0.20
Website A C++ toolkit for working with macromolecular coordinate files.
crystallography ssm 1.4
Website A macromolecular superposition library.
crystallography vesta 3.4.4
Website VESTA is a 3D visualization program for structural models, volumetric data such as electron/nuclear densities, and crystal morphologies.
docking gnina 1.0.2 
Website A deep learning framework for molecular docking
electrostatics apbs 1.5
Website APBS solves the equations of continuum electrostatics for large biomolecular assemblages.
molecular dynamics gromacs 2016.3  
2018  
2021.3  
2023.1  
Website GROMACS is a versatile package to perform molecular dynamics, i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles.
molecular dynamics lammps 20180316 
20200303  
Website LAMMPS is a classical molecular dynamics code that models an ensemble of particles in a liquid, solid, or gaseous state.
molecular dynamics openmm 7.1.1 
Website A high performance toolkit for molecular simulation.
molecular dynamics plumed 2.3.2 
Website PLUMED is an open source library for free energy calculations in molecular systems.
molecular dynamics py-raspa2 2.0.3_py27
Website RASPA2 is a general purpose classical simulation package that can be used for the simulation of molecules in gases, fluids, zeolites, aluminosilicates, metal-organic frameworks, carbon nanotubes and external fields.
molecular dynamics qbox 1.65.0 
Website Qbox is a First-Principles Molecular Dynamics code.
molecular dynamics quip 20170901 
20220426 
Website The QUIP package is a collection of software tools to carry out molecular dynamics simulations.
quantum chemistry cp2k 4.1   
9.1  
Website CP2K is a quantum chemistry and solid state physics software package that can perform atomistic simulations of solid state, liquid, molecular, periodic, material, crystal, and biological systems.
quantum chemistry ocean 2.9.7 
Website OCEAN is a versatile and user-friendly package for calculating core edge spectroscopy including excitonic effects.
quantum chemistry orca 4.2.1 
5.0.0 
5.0.3 
Website ORCA is a flexible, efficient and easy-to-use general purpose tool for quantum chemistry.
quantum chemistry quantum-espresso 6.2.1 
6.6 
7.0 
7.1 
Website Quantum ESPRESSO is an integrated suite of Open-Source computer codes for electronic-structure calculations and materials modeling at the nanoscale. It is based on density-functional theory, plane waves, and pseudopotentials.
quantum chemistry quantum-espresso_gpu 1.1  
7.0  
7.1  
Website Quantum ESPRESSO is an integrated suite of Open-Source computer codes for electronic-structure calculations and materials modeling at the nanoscale. It is based on density-functional theory, plane waves, and pseudopotentials.
quantum chemistry terachem 1.95A   
1.96H-beta  
Website TeraChem is general purpose quantum chemistry software designed to run on NVIDIA GPU architectures.
tools openbabel 3.1.1
Website Open Babel is a chemical toolbox designed to speak the many languages of chemical data.
tools py-openbabel 3.1.1.1_py39
Website Python bindings for Open Babel.

devel#

Field Module name Version(s) URL Description
build bazel 0.16.1
0.26.1
0.29.1
Website Bazel is a fast, scalable, multi-language and extensible build system.
build bazelisk 1.3.0
1.8.0
Website Bazelisk is a wrapper for Bazel written in Go.
build binutils 2.38
Website The GNU Binutils are a collection of binary tools.
build cmake 3.8.1
3.11.1
3.13.1
3.20.3
3.24.2
Website CMake is an extensible, open-source system that manages the build process in an operating system and in a compiler-independent manner.
build kerl 1.8.5
Website Kerl is a tool to easily build and install Erlang/OTP instances.
build make 4.4
Website GNU Make is a tool which controls the generation of executables and other non-source files of a program from the program's source files.
build ninja 1.9.0
Website Ninja is a small build system with a focus on speed.
build py-meson 0.51.1_py36
Website Meson is an open source build system meant to be both extremely fast, and, even more importantly, as user friendly as possible.
build py-scons 3.0.5_py27 
3.0.5_py36
Website SCons is an Open Source software construction tool.
compiler aocc 2.1.0
2.2.0
Website AMD Optimizing C/C++ Compiler - AOCC is a highly optimized C, C++ and Fortran compiler for x86 targets especially for Zen based AMD processors.
compiler gcc 6.3.0 
7.1.0
7.3.0
8.1.0
9.1.0
10.1.0
10.3.0
12.1.0
Website The GNU Compiler Collection includes front ends for C, C++, Fortran, Java, and Go, as well as libraries for these languages (libstdc++, libgcj,...).
compiler icc 2017.u2
2018.u1
2018
2019
Website Intel C++ Compiler, also known as icc or icl, is a group of C and C++ compilers from Intel
compiler ifort 2017.u2
2018.u1
2018
2019
Website Intel Fortran Compiler, also known as ifort, is a group of Fortran compilers from Intel
compiler llvm 7.0.0 
3.8.1
4.0.0
5.0.0
9.0.1
15.0.3
Website The LLVM Project is a collection of modular and reusable compiler and toolchain technologies. Clang is an LLVM native C/C++/Objective-C compiler.
compiler nvhpc 21.5 
21.7  
22.3  
23.3  
Website NVIDIA HPC Software Development Kit (SDK) including C, C++, and Fortran compilers.
compiler pgi 19.10
Website PGI compilers and tools, including Open MPI (Community Edition).
compiler smlnj 110.81
Website Standard ML of New Jersey (abbreviated SML/NJ) is a compiler for the Standard ML '97 programming language.
data h5utils 1.12.1
Website h5utils is a set of utilities for visualization and conversion of scientific data in the free, portable HDF5 format.
data hdf5 1.10.6  
1.10.0p1
1.10.2 
1.12.0
1.12.2 
Website HDF5 is a data model, library, and file format for storing and managing data. It supports an unlimited variety of datatypes, and is designed for flexible and efficient I/O and for high volume and complex data.
data hiredis 0.13.3
Website Hiredis is a minimalistic C client library for the Redis database.
data ncl 6.4.0
6.6.2
Website NCL is a free interpreted language designed specifically for scientific data processing and visualization.
data nco 4.8.0 
5.0.6
Website The NCO toolkit manipulates and analyzes data stored in netCDF-accessible formats.
data netcdf 4.4.1.1
4.8.1
Website NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data.
data netcdf-c 4.9.0 
Website NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data. This module provides C libraries.
data netcdf-cxx 4.3.1 
Website NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data. This module provides C++ libraries.
data netcdf-fortran 4.5.4 
Website NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data. This module provides Fortran libraries.
data pnetcdf 1.8.1 
1.12.3 
Website Parallel netCDF (PnetCDF) is a parallel I/O library for accessing NetCDF files in CDF-1, 2, and 5 formats.
data protobuf 3.4.0 
3.20.0
21.9
Website Protocol Buffers (a.k.a., protobuf) are Google's language-neutral, platform-neutral, extensible mechanism for serializing structured data.
data py-pandas 0.23.0_py27
0.23.0_py36
1.0.3_py36
1.3.1_py39
2.0.1_py39
Website pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
data py-protobuf 3.4.0_py27 
3.4.0_py36
3.6.1_py27
3.6.1_py36
3.15.8_py36
3.20.1_py39
4.21.9_py39
Website Python bindings for Google's Protocol Buffers data interchange format.
data redis 4.0.1
Website Redis is an open source, in-memory data structure store, used as a database, cache and message broker.
data zfp 1.0.0
Website zfp is an open-source library for compressed floating-point and integer arrays that support high throughput read and write random access.
data analytics hadoop 3.1.0 
3.3.1
Website The Apache Hadoop software library is a framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models.
data analytics py-sparkhpc 0.3_py27
Website Launching and controlling spark on HPC clusters
data analytics spark 2.3.0 
3.2.1
Website Apache Spark™ is a unified analytics engine for large-scale data processing.
debug gdb 8.2.1
Website GDB is the GNU Project debugger.
debug valgrind 3.14.0
Website Valgrind is an instrumentation framework for building dynamic analysis tools.
engine v8 8.4.371.22
Website V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
framework dotnet 2.1.500
6.0.413
Website .NET is a free, cross-platform, open source developer platform for building many different types of applications.
framework ga 5.8.2
Website Global Arrays (GA) is a Partitioned Global Address Space (PGAS) programming model.
framework py-kedro 0.18.0_py39
Website Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code.
IDE code-server 4.16.1
Website Run VS Code on any machine anywhere and access it in the browser.
language cuda 9.0.176  
8.0.61 
9.1.85 
9.2.88 
9.2.148 
10.0.130 
10.1.105 
10.1.168 
10.2.89 
11.0.3 
11.1.1 
11.2.0 
11.3.1 
11.4.1 
11.5.0 
11.7.1 
12.0.0 
12.1.1 
12.2.0 
Website CUDA is a parallel computing platform and application programming interface (API) model created by Nvidia. It allows software developers and software engineers to use a CUDA-enabled graphics processing unit (GPU) for general purpose processing.
language erlang 21.3
Website Erlang is a programming language used to build massively scalable soft real-time systems with requirements on high availability.
language gcl 2.6.14
Website GCL is the official Common Lisp for the GNU project.
language go 1.9
1.14
1.18.2
Website Go is an open source programming language that makes it easy to build simple, reliable, and efficient software.
language guile 2.0.11
2.2.2
Website GNU Guile is the preferred extension system for the GNU Project, which features an implementation of the Scheme programming language.
language haskell 8.6.5
Website Haskell is a statically typed, purely functional programming language with type inference and lazy evaluation.
language java 1.8.0_131 
11.0.11
12.0.2
17.0.4
18.0.2
Website Java is a general-purpose computer programming language that is concurrent, class-based, object-oriented, and specifically designed to have as few implementation dependencies as possible.
language julia 1.3.1
1.4.0
1.5.1
1.6.2
1.7.2
1.8.4
1.9.0
1.10.0
Website Julia is a high-level, high-performance dynamic programming language for numerical computing.
language lua 5.3.4
Website Lua is a powerful, efficient, lightweight, embeddable scripting language. It supports procedural programming, object-oriented programming, functional programming, data-driven programming, and data description.
language luarocks 2.4.3
Website LuaRocks is the package manager for Lua modules.
language manticore 20180301
Website Manticore is a high-level parallel programming language aimed at general-purpose applications running on multi-core processors.
language nodejs 8.9.4
9.5.0
16.13.0
18.15.0
Website Node.js is a JavaScript runtime built on Chrome's V8 JavaScript engine. It provides the npm package manager.
language perl 5.26.0
5.36.1
Website Perl 5 is a highly capable, feature-rich programming language with over 29 years of development.
language php 7.3.0
Website PHP (recursive acronym for PHP: Hypertext Preprocessor) is an open source general-purpose scripting language that is especially suited for web development.
language py-cython 0.27.3_py27
0.27.3_py36
0.29.21_py36
0.29.28_py39
Website Cython is an optimising static compiler for both the Python programming language and the extended Cython programming language (based on Pyrex).
language py-ipython 5.4.1_py27 
6.1.0_py36
8.3.0_py39
Website IPython is a command shell for interactive computing in multiple programming languages, originally developed for the Python programming language.
language py-jupyter 1.0.0_py27 
1.0.0_py36
1.0.0_py39
Website Jupyter is a browser-based interactive notebook for programming, mathematics, and data science. It supports a number of languages via plugins.
language py-jupyterlab 2.3.2_py36
4.0.8_py39
Website Jupyter is a browser-based interactive notebook for programming, mathematics, and data science. It supports a number of languages via plugins.
language python 2.7.13 
3.6.1
3.9.0
3.12.1
Website Python is an interpreted, interactive, object-oriented programming language.
language ruby 2.4.1
2.7.1
3.1.2
Website A dynamic, open source programming language with a focus on simplicity and productivity. It has an elegant syntax that is natural to read and easy to write.
language rust 1.35.0
1.56.1
1.63.0
1.72.0
Website A language empowering everyone to build reliable and efficient software.
language scala 2.12.6
Website Scala combines object-oriented and functional programming in one concise, high-level language.
lib ant 1.10.1
Website Apache Ant is a Java library and command-line tool whose mission is to drive processes described in build files as targets and extension points dependent upon each other.
lib boost 1.64.0
1.69.0 
1.75.0 
1.76.0 
1.79.0 
Website Boost is a set of libraries for the C++ programming language that provide support for tasks and structures such as linear algebra, pseudorandom number generation, multithreading, image processing, regular expressions, and unit testing.
lib chai 2.2.2  
Website Copy-hiding array abstraction to automatically migrate data between memory spaces.
lib cnmem 1.0.0 
Website CNMeM is a simple library to help the Deep Learning frameworks manage CUDA memory.
lib conduit 0.5.1  
Website Simplified Data Exchange for HPC Simulations.
lib cub 1.7.3 
1.10.0 
Website CUB is a flexible library of cooperative threadblock primitives and other utilities for CUDA kernel programming.
lib cutlass 0.1.0
3.1.0 
Website CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA.
lib dtcmp 1.1.3
Website Datatype Compare (DTCMP) Library for sorting and ranking distributed data using MPI.
lib eigen 3.3.3
3.4.0
Website Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms.
lib libcircle 0.3.0 
Website libcircle is an API for distributing embarrassingly parallel workloads using self-stabilization.
lib libctl 3.2.2
4.0.1
4.5.0
Website libctl is a library for supporting flexible control files in scientific simulations.
lib libevent 2.1.12
Website The libevent API provides a mechanism to execute a callback function when a specific event occurs on a file descriptor or after a timeout has been reached.
lib libgpuarray 0.7.5 
Website Library to manipulate tensors on the GPU.
lib libtree 2.0.0
Website libtree prints shared object dependencies as a tree.
lib lwgrp 1.0.4 
Website The Light-weight Group Library provides methods for MPI codes to quickly create and destroy process groups.
lib nccl 1.3.4 
2.0.4 
2.1.15 
2.2.13 
2.3.7 
2.4.8 
2.5.6 
2.8.4 
2.11.4 
2.17.1 
Website NCCL (pronounced 'Nickel') is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe.
lib pugixml 1.12.1
Website Light-weight, simple and fast XML parser for C++ with XPath support.
lib py-cutlass 3.1.0_py39 
Website Python interface for CUTLASS
lib py-h5py 2.7.1_py27 
2.8.0_py36
2.10.0_py36
3.1.0_py36
3.7.0_py39
Website The h5py package is a Pythonic interface to the HDF5 binary data format.
lib py-netcdf4 1.3.1_py27 
1.3.1_py36
Website netcdf4-python is a Python interface to the netCDF C library.
lib py-nose 1.3.7_py39
Website nose is nicer testing for python.
lib py-numba 0.35.0_py27 
0.35.0_py36
0.53.1_py36
0.54.1_py39
Website Numba is a compiler for Python array and numerical functions that gives you the power to speed up your applications with high performance functions written directly in Python.
lib py-parsl 1.2.0_py39
Website Parsl is a flexible and scalable parallel programming library for Python.
lib py-pycuda 2017.1.1_py27 
2021.1_py36 
Website PyCUDA lets you access Nvidia's CUDA parallel computation API from Python.
lib py-rmm 23.04.00_py39 
Website Python interface for RMM
lib py-schwimmbad 0.3.1_py36 
0.3.2_py39 
Website schwimmbad provides a uniform interface to parallel processing pools and enables switching easily between local development (e.g., serial processing or with multiprocessing) and deployment on a cluster or supercomputer (via, e.g., MPI or JobLib).
lib py-scikit-image 0.13.0_py27
0.14.0_py27
0.15.0_py27
0.15.0_py36
0.17.2_py36
0.19.3_py39
0.20.0_py39
Website scikit-image is a collection of algorithms for image processing.
lib rabbitmq 3.7.13
Website RabbitMQ is an open-source message broker.
lib raja 0.12.1  
Website Collection of C++ software abstractions that enable architecture portability for HPC applications.
lib rmm 23.04.00 
Website RAPIDS Memory Manager library
lib swig 3.0.12
Website SWIG is an interface compiler that connects programs written in C and C++ with scripting languages such as Perl, Python, Ruby, and Tcl.
lib tbb 2017.u2
2018.u1
2018
2019
Website Intel® Threading Building Blocks (Intel® TBB) is a widely used C++ library for shared-memory parallel programming and heterogeneous computing (intra-node distributed memory programming).
lib trilinos 12.12.1 
Website Trilinos is a collection of open-source software libraries, called packages, intended to be used as building blocks for the development of scientific applications.
lib xsimd 7.6.0
8.1.0
Website C++ wrappers for SIMD intrinsics and parallelized, optimized mathematical functions (SSE, AVX, NEON, AVX512)
lib zeromq 4.2.2
Website ZeroMQ (also spelled ØMQ, 0MQ or ZMQ) is a high-performance asynchronous messaging library, aimed at use in distributed or concurrent applications.
mpi hpcx 2.6.0  
2.7.0  
2.8.1  
Website Mellanox HPC-X toolkit is a comprehensive software package that includes MPI and SHMEM/PGAS communications libraries.
mpi impi 2017.u2 
2018.u1 
2018 
2019 
Website Intel® MPI Library is a multi-fabric message passing library that implements the Message Passing Interface, version 3.1 (MPI-3.1) specification.
mpi openmpi 2.0.2 
2.1.1 
3.1.2  
4.0.3  
4.0.5  
4.1.0  
4.1.2  
Website The Open MPI Project is an open source Message Passing Interface implementation that is developed and maintained by a consortium of academic, research, and industry partners.
mpi py-mpi4py 3.0.0_py27 
3.0.3_py36 
3.1.3_py39 
Website MPI for Python provides Python bindings for the Message Passing Interface (MPI) standard. It is implemented on top of the MPI-1/2/3 specification and exposes an API which grounds on the standard MPI-2 C++ bindings.
networking gasnet 1.30.0 
Website GASNet is a language-independent, low-level networking layer that provides network-independent, high-performance communication primitives tailored for implementing parallel global address space SPMD languages and libraries.
networking libfabric 1.6.0
1.6.2
1.7.1
1.9.1
1.10.1
1.11.1
1.14.0
Website The Open Fabrics Interfaces (OFI) is a framework focused on exporting fabric communication services to applications. Libfabric is the library that defines and exports the user-space API of OFI.
networking py-ucx-py 0.24.0_py39
Website Python bindings for UCX.
networking ucx 1.3.1
1.8.1 
1.9.0 
1.10.0 
1.12.1 
Website UCX is a communication library implementing high-performance messaging for MPI/PGAS frameworks.
parser antlr 2.7.7
Website ANTLR (ANother Tool for Language Recognition) is a powerful parser generator for reading, processing, executing, or translating structured text or binary files.
parser xerces-c 3.2.1
Website Xerces-C++ is a validating XML parser written in a portable subset of C++.
profiling amd-uprof 3.3.462
Website AMD uProf is a performance analysis tool for applications.
profiling darshan 3.4.4
Website Darshan is a scalable HPC I/O characterization tool.
runtime starpu 1.3.2 
Website StarPU is a unified runtime system that offers support for heterogeneous multicore architectures

math#

Field Module name Version(s) URL Description
computational geometry cgal 4.10
Website The Computational Geometry Algorithms Library (CGAL) is a C++ library that aims to provide easy access to efficient and reliable algorithms in computational geometry.
computational geometry dealii 9.4.1
Website deal.II is a C++ program library targeted at the computational solution of partial differential equations using adaptive finite elements.
computational geometry gmsh 4.10.1
Website Gmsh is an open source 3D finite element mesh generator with a built-in CAD engine and post-processor.
computational geometry opencascade 7.6.2
Website Open CASCADE Technology (OCCT) is an open-source full-scale 3D geometry library
computational geometry polymake 4.10
Website polymake is open source software for research in polyhedral geometry.
computational geometry qhull 2015.2
Website Qhull computes the convex hull, Delaunay triangulation, Voronoi diagram, halfspace intersection about a point, furthest-site Delaunay triangulation, and furthest-site Voronoi diagram.
computational geometry silo 4.11
Website A mesh and field I/O library and scientific database.
deep learning cudnn 6.0 
7.0.1 
7.0.4 
7.0.5 
7.1.4 
7.4.1.5 
7.6.4 
7.6.5 
8.1.1.33 
8.3.3.40 
8.6.0.163 
8.9.0.131 
Website NVIDIA cuDNN is a GPU-accelerated library of primitives for deep neural networks.
deep learning cutensor 1.2.0 
1.5.0.3 
Website GPU-accelerated tensor linear algebra library.
deep learning py-gym 0.21.0_py39
Website Gym is a toolkit for developing and comparing reinforcement learning algorithms.
deep learning py-horovod 0.12.1_py27   
0.12.1_py36  
Website Horovod is a distributed training framework for TensorFlow. The goal of Horovod is to make distributed Deep Learning fast and easy to use.
deep learning py-keras 2.1.5_py27  
2.0.8_py27 
2.1.5_py36 
2.2.4_py27 
2.2.4_py36 
2.3.1_py36 
Website Keras is a high-level neural networks API, written in Python and capable of running on top of TensorFlow, CNTK, or Theano.
deep learning py-onnx 1.0.1_py27
1.8.1_py36
1.12.0_py39
Website ONNX is an open format to represent deep learning models.
deep learning py-pytorch 0.3.0_py27  
0.2.0_py27 
0.2.0_py36 
0.3.0_py36 
1.0.0_py27 
1.0.0_py36 
1.4.0_py36 
1.6.0_py36 
1.8.1_py39 
1.11.0_py39  
2.0.0_py39  
Website PyTorch is a deep learning framework that puts Python first.
deep learning py-tensorboardx 1.8_py27 
Website TensorboardX is TensorBoard™ for PyTorch (and Chainer, MXNet, NumPy...)
deep learning py-tensorflow 2.1.0_py36  
1.4.0_py27 
1.5.0_py27 
1.5.0_py36 
1.9.0_py27 
1.9.0_py36 
2.4.1_py36 
2.6.2_py36 
2.9.1_py39 
2.10.0_py39 
Website TensorFlow™ is an open source software library for numerical computation using data flow graphs.
deep learning py-tensorlayer 1.6.3_py27 
Website TensorLayer is a Deep Learning (DL) and Reinforcement Learning (RL) library extended from Google TensorFlow.
deep learning py-tensorrt 8.5.1.7_py39 
Website Python bindings for the TensorRT library.
deep learning py-theano 1.0.1_py27 
Website Theano is a Python library that allows you to define, optimize, and evaluate mathematical expressions involving multi-dimensional arrays efficiently.
deep learning py-torchvision 0.15.1_py39
Website Datasets, model architectures, and common image transformations for computer vision for PyTorch.
deep learning py-triton 1.0.0_py39 
Website Triton is a language and compiler for writing highly efficient custom Deep-Learning primitives.
deep learning tensorrt 3.0.1 
3.0.4 
4.0.1.6 
5.0.2.6 
6.0.1.8 
7.0.0.11 
7.2.3.4 
8.5.1.7 
Website NVIDIA TensorRT™ is a high-performance deep learning inference optimizer and runtime that delivers low latency, high-throughput inference for deep learning applications.
deep learning torch 20180202 
Website Torch is a scientific computing framework with wide support for machine learning algorithms that puts GPUs first.
graph computing bliss 0.73
Website A tool for computing automorphism groups and canonical forms of graphs.
lib opencv 3.3.0  
4.5.2 
4.5.5 
4.7.0 
Website OpenCV (Open Source Computer Vision Library) is an open source computer vision and machine learning software library.
linear algebra armadillo 8.200.1
Website Armadillo is a high quality linear algebra library (matrix maths) for the C++ language, aiming towards a good balance between speed and ease of use.
linear algebra cusparselt 0.2.0.1 
Website NVIDIA cuSPARSELt is a high-performance CUDA library for sparse matrix-matrix multiplication.
machine learning py-scikit-learn 0.19.1_py27 
0.19.1_py36
0.24.2_py36
1.0.2_py39
1.3.2_py39
Website Scikit-learn is a free software machine learning library for the Python programming language.
numerical analysis matlab R2017a 
R2017b 
R2018a 
R2019a 
R2020a 
R2022b 
Website MATLAB is a multi-paradigm numerical computing environment and proprietary programming language developed by MathWorks.
numerical analysis octave 4.2.1
Website GNU Octave is a high-level language primarily intended for numerical computations.
numerical library arpack 3.5.0
3.7.0 
3.9.0 
Website Collection of Fortran77 subroutines designed to solve large scale eigenvalue problems.
numerical library blis 2.1
2.2.4
3.1.0
Website BLIS is a portable software framework for instantiating high-performance BLAS-like dense linear algebra libraries.
numerical library fftw 2.1.5
3.3.6 
3.3.8 
3.3.9
3.3.10 
Website The Fastest Fourier Transform in the West (FFTW) is a software library for computing discrete Fourier transforms (DFTs).
numerical library flexiblas 3.1.3
Website FlexiBLAS is a BLAS and LAPACK wrapper library with runtime exchangeable backends.
numerical library flint 2.9.0
Website FLINT is a C library for doing number theory.
numerical library glpk 4.63
Website The GLPK (GNU Linear Programming Kit) package is intended for solving large-scale linear programming (LP), mixed integer programming (MIP), and other related problems.
numerical library gmp 6.1.2
6.2.1
Website GMP is a free library for arbitrary precision arithmetic, operating on signed integers, rational numbers, and floating-point numbers.
numerical library gsl 1.16
2.3
2.7
Website The GNU Scientific Library (GSL) is a numerical library for C and C++ programmers. The library provides a wide range of mathematical routines such as random number generators, special functions and least-squares fitting.
numerical library harminv 1.4.1
Website harminv is a program designed to solve the problem of harmonic inversion: given a time series consisting of a sum of sinusoids (modes), extract their frequencies and amplitudes.
numerical library hypre 2.20.0 
Website HYPRE is a library of high performance preconditioners and solvers featuring multigrid methods for the solution of large, sparse linear systems of equations on massively parallel computers.
numerical library imkl 2017.u2
2018.u1
2018
2019
Website Intel Math Kernel Library (Intel MKL) is a library of optimized math routines for science, engineering, and financial applications. Core math functions include BLAS, LAPACK, ScaLAPACK, sparse solvers, fast Fourier transforms, and vector math. The routines in MKL are hand-optimized specifically for Intel processors.
numerical library libflame 2.1
2.2.4
3.1.0
Website libflame is a portable library for dense matrix computations, providing much of the functionality present in LAPACK
numerical library libxsmm 1.8.1
1.17
Website LIBXSMM is a library for small dense and small sparse matrix-matrix multiplications as well as for deep learning primitives such as small convolutions
numerical library metis 5.1.0
Website METIS is a set of serial programs for partitioning graphs, partitioning finite element meshes, and producing fill reducing orderings for sparse matrices.
numerical library mpc 1.2.1
Website GNU MPC is a C library for the arithmetic of complex numbers with arbitrarily high precision and correct rounding of the result.
numerical library mpfr 3.1.5
4.1.0
Website The MPFR library is a C library for multiple-precision floating-point computations with correct rounding.
numerical library mumps 5.1.2
Website A parallel sparse direct solver.
numerical library openblas 0.3.10 
0.2.19
0.3.4
0.3.9
0.3.20
0.3.26
Website OpenBLAS is an optimized BLAS library
numerical library parmetis 4.0.3 
Website ParMETIS is an MPI-based parallel library that implements a variety of algorithms for partitioning unstructured graphs, meshes, and for computing fill-reducing orderings of sparse matrices.
numerical library petsc 3.10.3 
3.18.5 
Website PETSc, the Portable, Extensible Toolkit for Scientific Computation, is a suite of data structures and routines for the scalable (parallel) solution of scientific applications modeled by partial differential equations.
numerical library py-autograd 1.0_py39 
Website Autograd can automatically differentiate native Python and Numpy code.
numerical library py-cupy 7.8.0_py36 
10.2.0_py39 
12.1.0_py39 
Website CuPy is an implementation of NumPy-compatible multi-dimensional array on CUDA.
numerical library py-gmpy2 2.0.8_py36
Website gmpy2 is a C-coded Python extension module that supports multiple-precision arithmetic.
numerical library py-jax 0.4.7_py39
Website JAX is Autograd and XLA, brought together for high-performance numerical computing.
numerical library py-jaxlib 0.4.7_py39
Website XLA library for Jax.
numerical library py-numpy 1.14.3_py27 
1.14.3_py36
1.17.2_py36
1.18.1_py36
1.19.2_py36
1.20.3_py39
1.24.2_py39
1.26.3_py312
Website NumPy is the fundamental package for scientific computing with Python.
numerical library py-petsc4py 3.18.5_py39
Website Python bindings for PETSc, the Portable, Extensible Toolkit for Scientific Computation.
numerical library py-psbody-mesh 0.4_py39
Website The MPI-IS Mesh Processing Library contains core functions for manipulating meshes and visualizing them.
numerical library py-pyublas 2017.1_py27
Website PyUblas provides a seamless glue layer between Numpy and Boost.Ublas for use with Boost.Python.
numerical library py-scipy 1.1.0_py27 
1.1.0_py36
1.4.1_py36
1.6.3_py39
1.10.1_py39
Website The SciPy library provides many user-friendly and efficient numerical routines such as routines for numerical integration and optimization.
numerical library py-slepc4py 3.18.2_py39
Website Python bindings for SLEPc.
numerical library py-tabmat 3.1.2_py39
Website Efficient matrix representations for working with tabular data.
numerical library qrupdate 1.1.2
Website qrupdate is a Fortran library for fast updates of QR and Cholesky decompositions.
numerical library scalapack 2.0.2 
2.1 
2.2.0 
Website ScaLAPACK is a library of high-performance linear algebra routines for parallel distributed memory machines.
numerical library scotch 6.0.4 
Website Software package and libraries for sequential and parallel graph partitioning, static mapping and clustering, sequential mesh and hypergraph partitioning, and sequential and parallel sparse matrix block ordering.
numerical library slepc 3.18.2 
Website SLEPc is a Scalable Library for Eigenvalue Problem Computations.
numerical library suitesparse 7.4.0 
Website SuiteSparse is a suite of sparse matrix algorithms.
numerical library superlu 5.2.1 
Website SuperLU is a general purpose library for the direct solution of large, sparse, nonsymmetric systems of linear equations.
numerical library tetgen 1.6.0
Website TetGen provides various features to generate good quality and adaptive tetrahedral meshes suitable for numerical methods, such as finite element or finite volume methods.
numerical library xblas 1.0.248
Website Extra precise basic linear algebra subroutines.
optimization gurobi 7.5.1
8.0.1_py27
8.0.1_py36
9.0.3_py36
10.0.1_py39
Website The Gurobi Optimizer is a commercial optimization solver for mathematical programming.
optimization knitro 10.3.0 
12.4.0
Website Artelys Knitro is an optimization solver for difficult large-scale nonlinear problems.
optimization nlopt 2.6.2
Website NLopt is a free/open-source library for nonlinear optimization.
optimization octeract 3.3.0
Website Octeract Engine is a proprietary massively parallel deterministic global optimization solver for general Mixed-Integer Nonlinear Programs (MINLP).
optimization py-optuna 2.10.0_py39
Website Optuna is an automatic hyperparameter optimization software framework, particularly designed for machine learning.
optimization sundials 6.4.1
Website SUNDIALS is a family of software packages providing robust and efficient time integrators and nonlinear solvers that can easily be incorporated into existing simulation codes.
scientific computing py-scipystack 1.0_py27 
1.0_py36
Website The SciPy Stack is a collection of open source software for scientific computing in Python. It provides the following packages: numpy, scipy, matplotlib, ipython, jupyter, pandas, sympy and nose.
statistics datamash 1.3
Website GNU datamash is a command-line program which performs basic numeric, textual and statistical operations on input textual data files.
statistics jags 4.3.0
4.3.1
Website Just another Gibbs sampler (JAGS) is a program for simulation from Bayesian hierarchical models using Markov chain Monte Carlo (MCMC).
statistics py-emcee 3.1.4_py39
Website The Python ensemble sampling toolkit for affine-invariant MCMC
statistics py-glum 2.1.2_py39
Website glum is a fast, modern, Python-first GLM estimation library.
statistics py-rpy2 2.8.6_py27
2.9.2_py36
Website rpy2 is an interface to R running embedded in a Python process.
statistics R 3.5.1 
3.4.0
3.6.1
4.0.2
4.1.2
4.2.0
4.3.2
Website R is a free software environment for statistical computing and graphics.
statistics rstudio 1.3.1093 
2023.09.1
Website RStudio is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management.
statistics rstudio-desktop 2022.02.2-485
Website RStudio is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management. This is the X11/GUI version.
statistics sas 9.4 
Website SAS is a software suite developed by SAS Institute for advanced analytics, multivariate analyses, business intelligence, data management, and predictive analytics.
statistics stata 15  
14 
16 
17 
18 
Website Stata is a complete, integrated statistical software package that provides everything you need for data analysis, data management, and graphics.
symbolic libmatheval 1.1.11
Website GNU libmatheval is a library (callable from C and Fortran) to parse and evaluate symbolic expressions input as text.
symbolic maxima 5.47.0
Website Maxima is a system for the manipulation of symbolic and numerical expressions.
symbolic py-pysr 0.12.3_py39
Website High-Performance Symbolic Regression in Python and Julia.
symbolic py-sympy 1.1.1_py27
1.1.1_py36
1.11.1_py39
Website SymPy is a Python library for symbolic mathematics.
technical computing mathematica 13.1.0 
Website A symbolic language and platform for modern technical computing.
topic modelling py-gensim 4.2.0_py39
Website Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora.

physics#

Field Module name Version(s) URL Description
astronomy cfitsio 4.0.0
Website FITSIO is a library of C and Fortran subroutines for reading and writing data files in FITS (Flexible Image Transport System) data format.
astronomy heasoft 6.22.1
6.26.1
Website HEAsoft is a Unified Release of the FTOOLS (General and mission-specific tools to manipulate FITS files) and XANADU (High-level, multi-mission tasks for X-ray astronomical spectral, timing, and imaging data analysis) software packages.
astronomy py-astropy 4.0.1_py36
Website The Astropy Project is a community effort to develop a common core package for Astronomy in Python and foster an ecosystem of interoperable astronomy packages.
astronomy py-lenstools 1.0_py36
Website This python package collects together a suite of widely used analysis tools in Weak Gravitational Lensing.
astronomy py-namaster 1.2.2_py36
Website NaMaster is a C library, Python module and standalone program to compute full-sky angular cross-power spectra of masked fields with arbitrary spin and an arbitrary number of known contaminants using a pseudo-Cl (aka MASTER) approach.
CFD su2 7.0.3
Website SU2: An Open-Source Suite for Multiphysics Simulation and Design
climate modeling fre-nctools 2022.01 
Website FRE-NCtools is a collection of tools to help with the creation and manipulation of netCDF files used for climate modeling.
climate modeling cdo 1.9.7.1
2.1.1
Website CDO is a collection of command line Operators to manipulate and analyse Climate and NWP model Data.
geophysics opensees 2.5.0 
Website OpenSees is a software framework for developing applications to simulate the performance of structural and geotechnical systems subjected to earthquakes.
geoscience gdal 3.4.1 
2.2.1
3.5.2
Website GDAL is a translator library for raster and vector geospatial data formats.
geoscience geos 3.6.2 
3.11.0
3.12.1
Website GEOS (Geometry Engine - Open Source) is a C++ port of Java Topology Suite (JTS).
geoscience geosx 0.2.0-20220523  
Website GEOSX is a simulation framework for modeling coupled flow, transport, and geomechanics in the subsurface.
geoscience gmtsar 6.2.2
Website An InSAR processing system based on GMT (Generic Mapping Tools).
geoscience proj 8.2.1 
4.9.3
9.1.0
Website PROJ is a generic coordinate transformation software that transforms geospatial coordinates from one coordinate reference system (CRS) to another.
geoscience py-opendrift 1.0.3_py27
Website OpenDrift is a software for modeling the trajectories and fate of objects or substances drifting in the ocean, or even in the atmosphere.
geoscience py-pyproj 1.9.5.1_py27 
1.9.5.1_py36
3.4.0_py39
Website Python interface to PROJ4 library for cartographic transformations.
geoscience swash 9.01a 
Website SWASH (an acronym of Simulating WAves till SHore) is a non-hydrostatic wave-flow model.
geoscience udunits 2.2.26
Website The UDUNITS package from Unidata is a C-based package for the programmatic handling of units of physical quantities.
lib libgdsii 0.21
Website libGDSII is a C++ library and command-line utility for reading GDSII geometry files.
magnetism mumax 3.10 
Website mumax3 is a GPU-accelerated micromagnetic simulation program.
materials science atat 3.36
Website Alloy Theoretic Automated Toolkit: a software toolkit for modeling coupled configurational and vibrational disorder in alloy systems.
materials science py-megnet 1.3.0_py39 
Website The MatErials Graph Network (MEGNet) is an implementation of DeepMind's graph networks for universal machine learning in materials science.
materials science py-pymatgen 2022.5.26_py39
Website Pymatgen (Python Materials Genomics) is a robust, open-source Python library for materials analysis.
micromagnetics oommf 1.2b4
Website OOMMF is a set of portable, extensible public domain micromagnetic programs and associated tools.
particle openmc 0.10.0
Website OpenMC is a Monte Carlo particle transport simulation code focused on neutron criticality calculations.
photonics meep 1.3 
1.4.3 
1.24.0 
Website Meep is a free finite-difference time-domain (FDTD) simulation software package to model electromagnetic systems.
photonics mpb 1.5 
1.6.2 
1.11.1 
Website MPB is a free software package for computing the band structures, or dispersion relations, and electromagnetic modes of periodic dielectric structures, on both serial and parallel computers.
quantum information science cuquantum 22.03.0.40 
Website NVIDIA cuQuantum is an SDK of optimized libraries and tools for accelerating quantum computing workflows.
quantum information science py-cuquantum-python 22.3.0_py39
Website NVIDIA cuQuantum Python provides Python bindings and high-level object-oriented models for accessing the full functionalities of NVIDIA cuQuantum SDK from Python.
quantum mechanics py-quspin 0.3.5_py36
Website QuSpin is an open-source Python package for exact diagonalization and quantum dynamics of arbitrary boson, fermion and spin many-body systems.
quantum mechanics py-qutip 4.5.2_py36
Website QuTiP is open-source software for simulating the dynamics of closed and open quantum systems.

system#

Field Module name Version(s) URL Description
backup restic 0.9.5
0.12.1
0.16.3
Website Fast, secure, efficient backup program.
benchmark hp2p 3.2 
Website Heavy Peer To Peer: an MPI-based benchmark for network diagnostics.
benchmark mpibench 20190729 
Website Times MPI collectives over a series of message sizes.
benchmark mprime 29.4
Website mprime is used by GIMPS, a distributed computing project dedicated to finding new Mersenne prime numbers, and which is commonly used as a stability testing utility.
benchmark osu-micro-benchmarks 5.6.1 
5.6.3  
5.7  
5.9  
Website The OSU MicroBenchmarks carry out a variety of message passing performance tests using MPI.
benchmark py-linktest 2.1.19_py39 
Website LinkTest is a communication API benchmarking tool that tests point-to-point connections.
checkpointing dmtcp 2.6.0
Website DMTCP (Distributed MultiThreaded Checkpointing) transparently checkpoints a single-host or distributed computation in user-space -- with no modifications to user code or to the O/S.
cloud interface aws-cli 2.0.50
Website This package provides a unified command line interface to Amazon Web Services.
cloud interface google-cloud-sdk 338.0.0
400.0.0
448.0.0
Website Command-line interface for Google Cloud Platform products and services.
cloud interface s5cmd 2.0.0
Website Parallel S3 and local filesystem execution tool.
cloud interface steampipe 0.14.6
Website Steampipe is an open source tool for querying cloud APIs in a universal way and reasoning about the data in SQL.
compiler mrc 1.3.3
Website MRC is a resource compiler that can create self-contained applications, by including all the required data inside executable files.
compression libarchive 3.3.2
3.4.2
3.5.2
Website The libarchive project develops a portable, efficient C library that can read and write streaming archives in a variety of formats.
compression libzip 1.5.1
Website libzip is a C library for reading, creating, and modifying zip archives.
compression lz4 1.8.0
Website LZ4 is a lossless compression algorithm.
compression lzo 2.10
Website LZO is a portable lossless data compression library written in ANSI C.
compression mpibzip2 0.6 
Website MPIBZIP2 is a parallel implementation of the bzip2 block-sorting file compressor that uses MPI and achieves significant speedup on cluster machines.
compression p7zip 16.02
Website p7zip is a Linux port of 7zip, a file archiver with high compression ratio.
compression pbzip2 1.1.12
Website PBZIP2 is a parallel implementation of the bzip2 block-sorting file compressor that uses pthreads and achieves near-linear speedup on SMP machines.
compression pigz 2.4
Website A parallel implementation of gzip for modern multi-processor, multi-core machines.
compression szip 2.1.1
Website Szip compression software, providing lossless compression of scientific data, is an implementation of the extended-Rice lossless compression algorithm.
compression xz 5.2.3
Website XZ Utils, the successor to LZMA Utils, is free general-purpose data compression software with a high compression ratio.
compression zlib 1.2.11
Website zlib is designed to be a free, general-purpose, legally unencumbered -- that is, not covered by any patents -- lossless data-compression library for use on virtually any computer hardware and operating system.
compression zstd 1.5.2
Website Zstandard, or zstd, is a fast lossless compression algorithm, targeting real-time compression scenarios at zlib-level and better compression ratios.
containers libnvidia-container 1.0.0rc2 
Website libnvidia-container is a library and a simple CLI utility to automatically configure GNU/Linux containers leveraging NVIDIA hardware.
containers proot 5.2.0 
5.1.0
Website PRoot is a user-space implementation of chroot, mount --bind, and binfmt_misc.
containers py-spython 0.3.13_py39
0.3.13_py312
Website Singularity Python (spython) is the Python API for working with Singularity containers.
database bdb 6.2.32
Website Berkeley DB (BDB) is a software library intended to provide a high-performance embedded database for key/value data.
database mariadb 10.2.11 
10.6.9
Website MariaDB is a community-developed fork of the MySQL relational database management system intended to remain free under the GNU GPL.
database postgresql 10.5
14.5
Website PostgreSQL is a powerful, open source object-relational database system with a strong focus on reliability, feature robustness, and performance.
database sqlite 3.18.0
3.37.2
3.44.2
Website SQLite is a self-contained, high-reliability, embedded, full-featured, public-domain, SQL database engine.
database sqliteodbc 0.9998
Website ODBC driver for SQLite
database unixodbc 2.3.9
Website unixODBC is an open-source project that implements the ODBC API.
document management pandoc 2.7.3
Website Pandoc is a universal document converter.
document processing ghostscript 9.53.2
Website Ghostscript is an interpreter for the PostScript language and PDF files.
document processing groff 1.23.0
Website groff (GNU roff) is a typesetting system that reads plain text input files that include formatting commands to produce output in PostScript, PDF, HTML, or DVI formats or for display to a terminal.
document processing lyx 2.3.2
Website LyX is a document processor.
document processing poppler 0.47.0
Website Poppler is a PDF rendering library.
document processing texinfo 6.6
Website Texinfo is the official documentation format of the GNU project.
document processing texlive 2019
Website TeX Live is an easy way to get up and running with the TeX document production system.
file management dua-cli 2.20.1
Website dua (-> Disk Usage Analyzer) is a tool to conveniently learn about the usage of disk space of a given directory.
file management duc 1.4.4
Website Duc is a collection of tools for indexing, inspecting and visualizing disk usage.
file management exa 0.8.0
Website exa is a replacement for ls written in Rust.
file management fdupes 2.2.1
Website FDUPES is a program for identifying or deleting duplicate files residing within specified directories.
file management fpart 0.9.3
Website fpart sorts files and packs them into partitions.
file management midnight-commander 4.8.29
Website GNU Midnight Commander is a visual file manager.
file management ncdu 1.18.1 
1.15.1
2.2.1
Website Ncdu is a disk usage analyzer with an ncurses interface.
file management py-pcircle 0.17_py27 
Website pcircle contains a suite of file system tools developed at OLCF to take advantage of highly scalable parallel file systems such as Lustre.
file management rmlint 2.8.0
Website rmlint finds space waste and other broken things on your filesystem and offers to remove it.
file management tdu 1.36
Website tdu estimates the disk space occupied by all files in a given path.
file transfer aria2 1.35.0
Website aria2 is a lightweight multi-protocol & multi-source command-line download utility.
file transfer aspera-cli 3.9.6
Website The IBM Aspera Command-Line Interface (the Aspera CLI) is a collection of Aspera tools for performing high-speed, secure data transfers from the command line.
file transfer gsutil 4.31
Website gsutil is a Python application that lets you access Cloud Storage from the command line.
file transfer lftp 4.8.1
Website LFTP is a sophisticated file transfer program supporting a number of network protocols (ftp, http, sftp, fish, torrent).
file transfer mpifileutils 0.10.1 
0.11 
0.11.1 
Website mpiFileUtils is a suite of MPI-based tools to manage large datasets, which may vary from large directory trees to large files.
file transfer py-globus-cli 1.2.0
1.9.0_py27
1.9.0_py36
3.2.0_py39
3.8.0_py39
3.19.0_py39
Website A command line wrapper over the Globus SDK for Python.
file transfer py-httpie 3.2.1_py39
Website HTTPie is a command-line HTTP client designed for testing, debugging, and generally interacting with APIs and HTTP servers.
file transfer rclone 1.55.1
1.59.1
1.65.0
Website Rclone is a command line program to sync files and directories to and from: Google Drive, Amazon S3, Dropbox, Google Cloud Storage, Amazon Drive, Microsoft One Drive, Hubic, Backblaze B2, Yandex Disk, or the local filesystem.
framework mono 5.12.0.301
5.20.1.19
Website Mono is an open source implementation of Microsoft's .NET Framework based on the ECMA standards for C# and the Common Language Runtime.
hardware hwloc 2.7.0
2.9.3
Website The Portable Hardware Locality (hwloc) software package provides a portable abstraction of the hierarchical topology of modern architectures.
hardware libpciaccess 0.16
Website Generic PCI access library.
job management slurm-drmaa 1.1.2
Website DRMAA for Slurm Workload Manager (Slurm) is an implementation of Open Grid Forum Distributed Resource Management Application API (DRMAA) version 1 for submission and control of jobs to Slurm.
language tcltk 8.6.6
Website Tcl (Tool Command Language) is a dynamic programming language, suitable for web and desktop applications, networking, administration, testing. Tk is a graphical user interface toolkit.
libs apr 1.6.3
Website The Apache Portable Runtime is a supporting library for the Apache web server. It provides a set of APIs that map to the underlying operating system.
libs apr-util 1.6.1
Website The Apache Portable Runtime is a supporting library for the Apache web server. It provides a set of APIs that map to the underlying operating system.
libs atk 2.24.0
Website ATK is the Accessibility Toolkit. It provides a set of generic interfaces allowing accessibility technologies such as screen readers to interact with a graphical user interface.
libs benchmark 1.2.0
Website A microbenchmark support library
libs cairo 1.14.10
Website Cairo is a 2D graphics library with support for multiple output devices.
libs cups 2.2.4
Website CUPS is the standards-based, open source printing system.
libs dbus 1.10.22
Website D-Bus is a message bus system, a simple way for applications to talk to one another.
libs enchant 1.6.1
2.2.3
Website Enchant is a library (and command-line program) that wraps a number of different spelling libraries and programs with a consistent interface.
libs fltk 1.3.4
Website FLTK (pronounced 'fulltick') is a cross-platform C++ GUI toolkit.
libs fontconfig 2.12.4
Website Fontconfig is a library for configuring and customizing font access.
libs freeglut 3.0.0
Website FreeGLUT is a free-software/open-source alternative to the OpenGL Utility Toolkit (GLUT) library.
libs freetype 2.8.1
2.9.1
Website FreeType is a software font engine that is designed to be small, efficient, highly customizable, and portable while capable of producing high-quality output (glyph images).
libs fribidi 1.0.12
Website The Free Implementation of the Unicode Bidirectional Algorithm.
libs ftgl 2.1.2
Website FTGL is a free cross-platform Open Source C++ library that uses Freetype2 to simplify rendering fonts in OpenGL applications.
libs gc 7.6.0
Website The Boehm-Demers-Weiser conservative garbage collector can be used as a garbage collecting replacement for C malloc or C++ new.
libs gconf 2.9.91
Website GConf is a system for storing application preferences.
libs gdk-pixbuf 2.36.8
Website The GdkPixbuf library provides facilities for loading images in a variety of file formats.
libs gflags 2.2.1
2.2.2
Website The gflags package contains a C++ library that implements commandline flags processing.
libs giflib 5.1.4
Website GIFLIB is a package of portable tools and library routines for working with GIF images.
libs glib 2.52.3
Website The GLib library provides core non-graphical functionality such as high level data types, Unicode manipulation, and an object and type system to C programs.
libs glog 0.3.5
Website C++ implementation of the Google logging module.
libs gnutls 3.5.9
Website GnuTLS is a secure communications library implementing the SSL, TLS and DTLS protocols and technologies around them.
libs gobject-introspection 1.52.1
Website GObject introspection is a middleware layer between C libraries (using GObject) and language bindings.
libs googletest 1.8.0
Website Google Test is Google's C++ test framework.
libs gstreamer 1.12.0
Website GStreamer is a library for constructing graphs of media-handling components.
libs gtk+ 2.24.30
3.22.18
Website GTK+, or the GIMP Toolkit, is a multi-platform toolkit for creating graphical user interfaces.
libs harfbuzz 1.4.8
Website HarfBuzz is an OpenType text shaping engine.
libs hunspell 1.6.2
Website Hunspell is a spell checker.
libs hyphen 2.8.8
Website Hyphen is a hyphenation library to use converted TeX hyphenation patterns.
libs icu 59.1
Website ICU is a set of C/C++ and Java libraries providing Unicode and Globalization support for software applications.
libs jansson 2.13.1
Website C library for encoding, decoding and manipulating JSON data.
libs jemalloc 5.3.0
Website jemalloc is a general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support.
libs json-glib 1.4.4
Website JSON-GLib is a library providing serialization and deserialization support for the JavaScript Object Notation (JSON) format described by RFC 4627.
libs leptonica 1.82.0
Website Leptonica is an open source library containing software that is broadly useful for image processing and image analysis applications.
libs libaio 0.3.111
Website libaio provides the Linux-native API for async I/O.
libs libart_lgpl 2.3.21
Website Libart is a library for high-performance 2D graphics.
libs libcroco 0.6.13
Website Libcroco is a standalone css2 parsing and manipulation library.
libs libepoxy 1.4.1
Website Epoxy is a library for handling OpenGL function pointer management for you.
libs libexif 0.6.21
Website A library for parsing, editing, and saving EXIF data.
libs libffi 3.2.1
Website libffi is a portable Foreign Function Interface library.
libs libgcrypt 1.8.2
Website Libgcrypt is a general purpose cryptographic library originally based on code from GnuPG.
libs libgd 2.2.5
Website GD is an open source code library for the dynamic creation of images by programmers.
libs libgdiplus 5.6
Website C-based implementation of the GDI+ API
libs libglvnd 1.2.0
Website libglvnd is a vendor-neutral dispatch layer for arbitrating OpenGL API calls between multiple vendors.
libs libgnomecanvas 2.30.3
Website Library for the GNOME canvas, an engine for structured graphics that offers a rich imaging model, high performance rendering, and a powerful, high-level API.
libs libgpg-error 1.27
Website Libgpg-error is a small library that originally defined common error values for all GnuPG components.
libs libiconv 1.16
Website libiconv is a conversion library for string encoding.
libs libidl 0.8.14
Website The libIDL package contains libraries for Interface Definition Language files. This is a specification for defining portable interfaces.
libs libjpeg-turbo 1.5.1 
2.1.4
Website libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems
libs libmng 2.0.3
Website THE reference library for reading, displaying, writing and examining Multiple-Image Network Graphics. MNG is the animation extension to the popular PNG image-format.
libs libpng 1.2.57
1.6.29
Website libpng is the official PNG reference library. It supports almost all PNG features, is extensible, and has been extensively tested for over 20 years.
libs libproxy 0.4.15
Website libproxy is a library that provides automatic proxy configuration management.
libs libressl 2.5.3
3.2.1
Website LibreSSL is a version of the TLS/crypto stack forked from OpenSSL in 2014, with goals of modernizing the codebase, improving security, and applying best practice development processes.
libs librsvg 2.36.4
Website Librsvg is a library to render SVG files using cairo as a rendering engine.
libs libseccomp 2.3.3
Website The libseccomp library provides an easy to use, platform independent, interface to the Linux Kernel's syscall filtering mechanism.
libs libsodium 1.0.18
Website Sodium is a modern, easy-to-use software library for encryption, decryption, signatures, password hashing and more.
libs libsoup 2.61.2
Website libsoup is an HTTP client/server library for GNOME.
libs libtasn1 4.13
Website Libtasn1 is the ASN.1 library used by GnuTLS, p11-kit and some other packages.
libs libtiff 4.0.8 
4.4.0
4.5.0
Website libtiff provides support for the Tag Image File Format (TIFF), a widely used format for storing image data.
libs libunistring 0.9.7
Website Libunistring provides functions for manipulating Unicode strings and for manipulating C strings according to the Unicode standard.
libs libuuid 1.0.3
Website Portable uuid C library.
libs libuv 1.38.1
Website libuv is a multi-platform support library with a focus on asynchronous I/O.
libs libwebp 0.6.1
Website WebP is a modern image format that provides superior lossless and lossy compression for images on the web.
libs libxkbcommon 0.9.1
Website libxkbcommon is a keyboard keymap compiler and support library which processes a reduced subset of keymaps as defined by the XKB (X Keyboard Extension) specification.
libs libxml2 2.9.4
Website Libxml2 is a XML C parser and toolkit.
libs libxslt 1.1.32
Website Libxslt is the XSLT C library developed for the GNOME project. XSLT itself is an XML language to define transformations for XML.
libs mesa 17.1.6
Website Mesa is an open-source implementation of the OpenGL, Vulkan and other specifications.
libs minipmi 1.0
Website Implementation of a minimal subset of the PMI1 and PMI2 specifications.
libs ncurses 6.0
6.4
Website The ncurses (new curses) library is a free software emulation of curses in System V Release 4.0 (SVr4), and more.
libs nettle 3.3
Website Nettle is a cryptographic library that is designed to fit easily in more or less any context.
libs openjpeg 2.3.1
Website OpenJPEG is an open-source JPEG 2000 codec written in C language.
libs openssl 3.0.7
Website OpenSSL is a full-featured toolkit for general-purpose cryptography and secure communication.
libs orbit 2.14.19
Website ORBit2 is a CORBA 2.4-compliant Object Request Broker (ORB) featuring mature C, C++ and Python bindings.
libs pango 1.40.10
Website Pango is a library for laying out and rendering of text, with an emphasis on internationalization.
libs pcre 8.40
Website The PCRE library is a set of functions that implement regular expression pattern matching using the same syntax and semantics as Perl 5.
libs pcre2 10.35
10.40
Website The PCRE2 library is a set of functions that implement regular expression pattern matching using the same syntax and semantics as Perl 5.
libs popt 1.16
Website Library for parsing command line options.
libs py-lmdb 0.93
Website Universal Python binding for the LMDB 'Lightning' Database.
libs py-mako 1.0.7_py27 
1.0.7_py36
Website Mako is a template library written in Python. It provides a familiar, non-XML syntax which compiles into Python modules for maximum performance.
libs py-pygobject 3.32.2_py36
Website PyGObject is a Python package which provides bindings for GObject based libraries such as GTK, GStreamer, WebKitGTK, GLib, GIO and many more.
libs py-pyopengl 3.1.5_py39
Website Standard OpenGL bindings for Python.
libs py-pyqt5 5.9.1_py36
Website PyQt5 is a comprehensive set of Python bindings for Qt v5.
libs readline 7.0
8.2
Website The GNU Readline library provides a set of functions for use by applications that allow users to edit command lines as they are typed in.
libs serf 1.3.9
Website The serf library is a high performance C-based HTTP client library built upon the Apache Portable Runtime (APR) library.
libs sionlib 1.7.7 
Website Scalable I/O library for parallel access to task-local files.
libs snappy 1.1.7
Website A fast compressor/decompressor.
libs talloc 2.1.14
Website talloc is a hierarchical, reference counted memory pool system with destructors.
libs tesseract 5.1.0
Website Tesseract is an open source text recognition (OCR) Engine.
libs utf8proc 2.4.0
Website utf8proc is a small, clean C library that provides Unicode normalization, case-folding, and other operations for data in the UTF-8 encoding.
libs wxwidgets 3.0.4
Website wxWidgets is a C++ library that lets developers create applications for Windows, macOS, Linux and other platforms with a single code base.
libs yaml-cpp 0.7.0
Website yaml-cpp is a YAML parser and emitter in C++ matching the YAML 1.2 spec.
media ffmpeg 4.0
4.2.1
5.0
Website FFmpeg is the leading multimedia framework, able to decode, encode, transcode, mux, demux, stream, filter and play pretty much anything that humans and machines have created.
media libsndfile 1.0.28
Website Libsndfile is a C library for reading and writing files containing sampled sound (such as MS Windows WAV and the Apple/SGI AIFF format) through one standard library interface.
performance likwid 4.3.2
5.2.1 
Website Likwid is a simple toolsuite of command line applications for performance oriented programmers.
resource monitoring nvtop 1.1.0 
2.0.3 
3.0.2 
Website Nvtop stands for NVidia TOP, a (h)top like task monitor for NVIDIA GPUs.
resource monitoring remora 1.8.5
Website Remora is a tool to monitor runtime resource utilization.
resource monitoring ruse 2.0 
Website A command line tool to measure process resource usage.
scm gh 1.9.1
Website gh is GitHub on the command line. It brings pull requests, issues, and other GitHub concepts to the terminal next to where you are already working with git and your code.
scm git 2.39.1
Website Git is a free and open source distributed version control system designed to handle everything from small to very large projects with speed and efficiency.
scm git-annex 8.20210622
Website git-annex allows managing files with git, without checking the file contents into git.
scm git-credential-manager 2.0.696
Website Secure, cross-platform Git credential storage with authentication to GitHub, Azure Repos, and other popular Git hosting services.
scm git-lfs 2.4.0
Website Git Large File Storage (LFS) replaces large files such as audio samples, videos, datasets, and graphics with text pointers inside Git, while storing the file contents on a remote server.
scm libgit2 1.1.0
Website libgit2 is a portable, pure C implementation of the Git core methods provided as a re-entrant linkable library with a solid API
scm mercurial 4.5.3
Website Mercurial is a free, distributed source control management tool.
scm py-dvc 0.91.1_py36
Website Data Version Control or DVC is an open-source tool for data science and machine learning projects.
scm subversion 1.9.7
1.12.2
Website Subversion is an open source version control system.
shell powershell 7.1.5
Website PowerShell Core is a cross-platform automation and configuration tool/framework.
testing py-pytest 7.1.3_py39
Website pytest is a full-featured Python testing framework
tools clinfo 2.2.18.04.06 
Website clinfo is a simple command-line application that enumerates all possible (known) properties of the OpenCL platform and devices available on the system.
tools curl 8.4.0
Website curl is an open source command line tool and library for transferring data with URL syntax.
tools depot_tools 20200731
Website Tools for working with Chromium development.
tools expat 2.2.3
Website Expat is a stream-oriented XML parser library written in C.
tools graphicsmagick 1.3.26
Website GraphicsMagick is the swiss army knife of image processing.
tools imagemagick 7.0.7-2
Website ImageMagick is a free and open-source software suite for displaying, converting, and editing raster image and vector image files.
tools jq 1.6
Website jq is a lightweight and flexible command-line JSON processor.
tools leveldb 1.20
Website LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.
tools lmdb 0.9.21
Website Symas LMDB is an extraordinarily fast, memory-efficient database we developed for the Symas OpenLDAP Project.
tools motif 2.3.7
Website Motif is the toolkit for the Common Desktop Environment.
tools parallel 20180122
20200822
Website GNU parallel is a shell tool for executing jobs in parallel using one or more computers.
tools password-store 1.7.4
Website Simple password manager using gpg and ordinary unix directories.
tools py-clustershell 1.9.0_py39
Website ClusterShell is an event-driven open source Python library, designed to run local or distant commands in parallel on server farms or on large Linux clusters.
tools py-matlab-proxy 0.9.1_py39
0.10.0_py39
Website matlab-proxy is a Python package which enables you to launch MATLAB and access it from a web browser.
tools py-pyside 5.15.2.1_py39
Website PySide is the official Python module from the Qt for Python project, which provides access to the complete Qt framework.
tools py-wxpython 4.0.7_py39
4.2.0_py39
Website wxPython is the cross-platform GUI toolkit for the Python language.
tools qt 5.9.1 
6.4.0
Website Qt is a cross-platform application framework that is used for developing application software that can be run on various software and hardware platforms.
tools ripgrep 11.0.1
Website ripgrep recursively searches directories for a regex pattern.
tools rocksdb 5.7.3
Website A library that provides an embeddable, persistent key-value store for fast storage.
tools x11 7.7
Website The X.Org project provides an open source implementation of the X Window System.
tools xkeyboard-config 2.21
Website The non-arch keyboard configuration database for X Window.

viz#

Field Module name Version(s) URL Description
data ncview 2.1.7
Website Ncview is a visual browser for netCDF format files.
gis gmt 6.4.0
Website GMT (The Generic Mapping Tools) is an open source collection of command-line tools for manipulating geographic and Cartesian data sets.
gis panoply 4.10.8
Website Panoply plots geo-referenced and other arrays from netCDF, HDF, GRIB, and other datasets.
gis py-cartopy 0.21.0_py39
Website Cartopy is a Python package designed for geospatial data processing in order to produce maps and other geospatial data analyses.
graphs graphviz 2.40.1
2.44.1
Website Graphviz is open source graph visualization software.
imaging py-pillow 5.1.0_py27 
5.1.0_py36
7.0.0_py36
8.2.0_py39
9.3.0_py39
Website Pillow is a friendly PIL (Python Imaging Library) fork.
imaging py-pillow-simd 7.0.0.post3_py36
9.2.0_py39
Website Pillow-SIMD is an optimized version of Pillow
molecular visualization ovito 3.7.11
Website OVITO is a scientific visualization and data analysis solution for atomistic and other particle-based models.
molecular visualization pymol 1.8.6.2 
2.5.3 
Website PyMOL is a Python-enhanced molecular graphics tool.
plotting gnuplot 5.2.0
Website Gnuplot is a portable command-line driven graphing utility for Linux, OS/2, MS Windows, OSX, VMS, and many other platforms.
plotting grace 5.1.25
Website Grace is a WYSIWYG tool to make two-dimensional plots of numerical data.
plotting mathgl 8.0.1
Website MathGL is a library to make high-quality scientific graphics.
plotting py-basemap 1.1.0_py27 
1.1.0_py36
Website The matplotlib basemap toolkit is a library for plotting 2D data on maps in Python.
plotting py-matplotlib 2.2.2_py27 
2.1.2_py27
2.1.2_py36
2.2.2_py36
3.1.1_py36
3.2.1_py36
3.4.2_py39
3.7.1_py39
Website Matplotlib is a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms.
plotting py-plotly 2.4.1_py27
Website Plotly's Python graphing library makes interactive, publication-quality graphs online.
plotting py-seaborn 0.12.1_py39
Website Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.
plotting veusz 3.3.1
Website Veusz is a scientific plotting and graphing program with a graphical user interface, designed to produce publication-ready 2D and 3D plots.
remote display virtualgl 2.5.2
Website VirtualGL is an open source toolkit that gives any Unix or Linux remote display software the ability to run OpenGL applications with full 3D hardware acceleration.
\ No newline at end of file diff --git a/docs/software/modules/index.html b/docs/software/modules/index.html new file mode 100644 index 000000000..ee8f26fec --- /dev/null +++ b/docs/software/modules/index.html @@ -0,0 +1,155 @@ + Modules - Sherlock

Modules

Environment modules#

Software is provided on Sherlock in the form of loadable environment modules.

Software is only accessible via modules

The use of a module system means that most software is not accessible by default and has to be loaded using the module command. This mechanism allows us to provide multiple versions of the same software concurrently, and gives users the possibility to easily switch between software versions.

Sherlock uses Lmod to manage software installations. The module system helps set up the user's shell environment to give access to applications, and makes running and compiling software easier. It also allows us to provide multiple versions of the same software that would otherwise conflict with each other, and to abstract things away from the operating system's sometimes rigid versions and dependencies.

When you first log into Sherlock, you'll be presented with a default, bare-bones environment with minimal software available. The module system is used to manage the user environment and to activate software packages on demand. In order to use software installed on Sherlock, you must first load the corresponding software module.

When you load a module, the system will set or modify your user environment variables to enable access to the software package provided by that module. For instance, the $PATH environment variable might be updated so that appropriate executables for that package can be used.

Module categories#

Modules on Sherlock are organized by scientific field, in distinct categories. This is to limit the information overload that can result when displaying the full list of available modules. Given the large diversity of the Sherlock user population, not all users are interested in the same kind of software, and high-energy physicists may not want to see their screens cluttered with the latest bioinformatics packages.

Module categories

You will first have to load a category module before getting access to individual modules. The math and devel categories are loaded by default, and modules in those categories can be loaded directly.

For instance, to be able to load the gromacs module, you'll first need to load the chemistry module. This can be done in a single command, by specifying first the category, then the actual application module name:

$ module load chemistry gromacs
+

The math and devel categories, which are loaded by default, provide direct access to compilers, languages, and MPI and numerical libraries.

For a complete list of software module categories, please refer to the list of available software

Searching for a module

To know how to access a module, you can use the module spider <module_name> command. It will search through all the installed modules, even if they're masked, and display instructions to load them. See the Examples section for details.

Module usage#

The most common module commands are outlined in the following table. module commands may be shortened with the ml alias, with slightly different semantics.

Module names auto-completion

The module command supports auto-completion, so you can just start typing the name of a module, and press Tab to let the shell automatically complete the module name and/or version.

Module command Short version Description
module avail ml av List available software1
module spider gromacs ml spider gromacs Search for particular software
module keyword blas ml key blas Search for blas in module names and descriptions
module whatis gcc ml whatis gcc Display information about the gcc module
module help gcc ml help gcc Display module specific help
module load gcc ml gcc Load a module to use the associated software
module load gsl/2.3 ml gsl/2.3 Load specific version of a module
module unload gcc ml -gcc Unload a module
module swap gcc icc ml -gcc icc Swap a module (unload gcc and replace it with icc)
module purge ml purge Remove all modules2
module save foo ml save foo Save the state of all loaded modules in a collection named foo
module restore foo ml restore foo Restore the state of saved modules from the foo collection

Additional module sub-commands are documented in the module help command. For complete reference, please refer to the official Lmod documentation.

Module properties#

Multiple versions

When multiple versions of the same module exist, module will load the one marked as Default (D). For the sake of reproducibility, we recommend always specifying the module version you want to load, as defaults may evolve over time.

To quickly see some of the modules' characteristics, module avail will display colored property attributes next to the module names. The main module properties are:

  • S: Module is sticky, requires --force to unload or purge
  • L: Module is currently loaded
  • D: Default module that will be loaded when multiple versions are available
  • r: Restricted access, typically software under license. Contact us for details
  • g: GPU-accelerated software, will only run on GPU nodes
  • m: Software supports parallel execution using MPI

Searching for modules#

You can search through all the available modules for either:

  • a module name (if you already know it), using module spider
  • any string within module names and descriptions, using module keyword

For instance, if you want to know how to load the gromacs module, you can do:

$ module spider gromacs
+

If you don't know the module name, or want to list all the modules that contain a specific string of characters in their name or description, you can use module keyword. For instance, the following command will list all the modules providing a BLAS library:

$ module keyword blas
+

Examples#

Listing#

To list all the modules that can be loaded, you can do:

$ ml av
+
+-- math -- numerical libraries, statistics, deep-learning, computer science ---
+   R/3.4.0             gsl/1.16             openblas/0.2.19
+   cudnn/5.1  (g)      gsl/2.3       (D)    py-scipystack/1.0_py27 (D)
+   cudnn/6.0  (g,D)    imkl/2017.u2         py-scipystack/1.0_py36
+   fftw/3.3.6          matlab/R2017a (r)
+
+------------------ devel -- compilers, MPI, languages, libs -------------------
+   boost/1.64.0          icc/2017.u2           python/2.7.13    (D)
+   cmake/3.8.1           ifort/2017.u2         python/3.6.1
+   cuda/8.0.61    (g)    impi/2017.u2   (m)    scons/2.5.1_py27 (D)
+   eigen/3.3.3           java/1.8.0_131        scons/2.5.1_py36
+   gcc/6.3.0      (D)    julia/0.5.1           sqlite/3.18.0
+   gcc/7.1.0             llvm/4.0.0            tbb/2017.u2
+   h5utils/1.12.1        nccl/1.3.4     (g)    tcltk/8.6.6
+   hdf5/1.10.0p1         openmpi/2.0.2  (m)
+
+-------------- categories -- load to make more modules available --------------
+   biology      devel (S,L)    physics    system
+   chemistry    math  (S,L)    staging    viz
+
+  Where:
+   S:  Module is Sticky, requires --force to unload or purge
+   r:  Restricted access
+   g:  GPU support
+   L:  Module is loaded
+   m:  MPI support
+   D:  Default Module
+
+Use "module spider" to find all possible modules.
+Use "module keyword key1 key2 ..." to search for all possible modules matching
+any of the "keys".
+

Searching#

To search for a specific string in module names and descriptions, you can run:

$ module keyword numpy
+---------------------------------------------------------------------------
+
+The following modules match your search criteria: "numpy"
+---------------------------------------------------------------------------
+
+  py-scipystack: py-scipystack/1.0_py27, py-scipystack/1.0_py36
+    The SciPy Stack is a collection of open source software for scientific
+    computing in Python. It provides the following packages: numpy, scipy,
+    matplotlib, ipython, jupyter, pandas, sympy and nose.
+
+---------------------------------------------------------------------------
+[...]
+$ ml key compiler
+---------------------------------------------------------------------------
+
+The following modules match your search criteria: "compiler"
+---------------------------------------------------------------------------
+
+  cmake: cmake/3.8.1
+    CMake is an extensible, open-source system that manages the build
+    process in an operating system and in a compiler-independent manner.
+
+  gcc: gcc/6.3.0, gcc/7.1.0
+    The GNU Compiler Collection includes front ends for C, C++, Fortran,
+    Java, and Go, as well as libraries for these languages (libstdc++,
+    libgcj,...).
+
+  icc: icc/2017.u2
+    Intel C++ Compiler, also known as icc or icl, is a group of C and C++
+    compilers from Intel
+
+  ifort: ifort/2017.u2
+    Intel Fortran Compiler, also known as ifort, is a group of Fortran
+    compilers from Intel
+
+  llvm: llvm/4.0.0
+    The LLVM Project is a collection of modular and reusable compiler and
+    toolchain technologies. Clang is an LLVM native C/C++/Objective-C
+    compiler,
+
+---------------------------------------------------------------------------
+

To get information about a specific module, especially how to load it, the following command can be used:

$ module spider gromacs
+
+-------------------------------------------------------------------------------
+  gromacs: gromacs/2016.3
+-------------------------------------------------------------------------------
+    Description:
+      GROMACS is a versatile package to perform molecular dynamics, i.e.
+      simulate the Newtonian equations of motion for systems with hundreds to
+      millions of particles.
+
+    Properties:
+      GPU support      MPI support
+
+    You will need to load all module(s) on any one of the lines below before
+    the "gromacs/2016.3" module is available to load.
+
+      chemistry
+

Loading#

Loading a category module gives access to field-specific software:

$ ml chemistry
+$ ml av
+
+------------- chemistry -- quantum chemistry, molecular dynamics --------------
+   gromacs/2016.3 (g,m)    vasp/5.4.1 (g,r,m)
+
+-- math -- numerical libraries, statistics, deep-learning, computer science ---
+   R/3.4.0             gsl/1.16             openblas/0.2.19
+   cudnn/5.1  (g)      gsl/2.3       (D)    py-scipystack/1.0_py27 (D)
+   cudnn/6.0  (g,D)    imkl/2017.u2         py-scipystack/1.0_py36
+   fftw/3.3.6          matlab/R2017a (r)
+
+------------------ devel -- compilers, MPI, languages, libs -------------------
+   boost/1.64.0          icc/2017.u2           python/2.7.13    (D)
+   cmake/3.8.1           ifort/2017.u2         python/3.6.1
+   cuda/8.0.61    (g)    impi/2017.u2   (m)    scons/2.5.1_py27 (D)
+   eigen/3.3.3           java/1.8.0_131        scons/2.5.1_py36
+   gcc/6.3.0      (D)    julia/0.5.1           sqlite/3.18.0
+   gcc/7.1.0             llvm/4.0.0            tbb/2017.u2
+   h5utils/1.12.1        nccl/1.3.4     (g)    tcltk/8.6.6
+   hdf5/1.10.0p1         openmpi/2.0.2  (m)
+
+-------------- categories -- load to make more modules available --------------
+   biology          devel (S,L)    physics    system
+   chemistry (L)    math  (S,L)    staging    viz
+
+[...]
+

Resetting the modules environment#

If you want to reset your modules environment as it was when you initially connected to Sherlock, you can use the ml reset command: it will remove all the modules you have loaded, and restore the original state where only the math and devel categories are accessible.

If you want to remove all modules from your environment, including the default math and devel modules, you can use ml --force purge.

Loading modules in jobs#

In order for an application running in a Slurm job to have access to any necessary module-provided software packages, we recommend loading those modules in the job script directly. Since Slurm propagates all user environment variables by default, this is not strictly necessary, as jobs will inherit the modules loaded at submission time. But to make sure things are reproducible and avoid issues, it is preferable to explicitly load the modules in the batch scripts.

module load commands should be placed right after #SBATCH directives and before the actual executable calls. For instance:

#!/bin/bash
+#SBATCH ...
+#SBATCH ...
+#SBATCH ...
+
+ml reset
+ml load gromacs/2016.3
+
+srun gmx_mpi ...
+

Custom modules#

Users are welcome and encouraged to build and install their own software on Sherlock. To that end, and to facilitate usage or sharing of their custom software installations, they can create their own module repositories.

See the Software Installation page for more details.

Contributed software#

PI groups, labs or departments can share their software installations and modules with the whole Sherlock community of users, and let everyone benefit from their tuning efforts and software developments.

Those modules are available in the specific contribs category, and organized by contributor name.

For instance, listing the available contributed modules can be done with:

$ ml contribs
+$ ml av
+-------------------- contribs -- contributed software ----------------------
+   poldrack
+

To get information about a specific lab module:

$ ml show poldrack
+----------------------------------------------------------------------------
+   /share/software/modules/contribs/poldrack.lua:
+----------------------------------------------------------------------------
+prepend_path("MODULEPATH","/home/groups/russpold/modules")
+whatis("Name:        poldrack")
+whatis("Version:     1.0")
+whatis("Category:    contribs")
+whatis("URL:         https://github.com/poldracklab/lmod_modules")
+whatis("Description: Software modules contributed by the Poldrack Lab.")
+

And to list the available software modules contributed by the lab:

$ ml poldrack
+$ ml av
+
+------------------------ /home/groups/russpold/modules -------------------------
+   afni/17.3.03           freesurfer/6.0.1            gsl/2.3      (D)
+   anaconda/5.0.0-py36    fsl/5.0.9                   pigz/2.4
+   ants/2.1.0.post710     fsl/5.0.11           (D)    remora/1.8.2
+   c3d/1.1.0              git-annex/6.20171109        xft/2.3.2
+[...]
+

  1. If a module is not listed here, it might be unavailable in the loaded modules categories, and require loading another category module. Search for not-listed software using the module spider command. 

  2. The math and devel category modules will not be unloaded with module purge as they are "sticky". If a user wants to unload a sticky module, they must specify the --force option. 

\ No newline at end of file diff --git a/docs/software/overview/index.html b/docs/software/overview/index.html new file mode 100644 index 000000000..e0c38c74d --- /dev/null +++ b/docs/software/overview/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/software/updates.xml b/docs/software/updates.xml new file mode 100644 index 000000000..6c38e90ff --- /dev/null +++ b/docs/software/updates.xml @@ -0,0 +1,252 @@ + + + +Sherlock software updates +Sherlock software update feed +https://www.sherlock.stanford.edu/docs/software/list + + + New module: biology/bcl-convert version 4.2.7 + The BCL Convert App generates demultiplexed FASTQ files from a run as input. + https://emea.support.illumina.com/sequencing/sequencing_software/bcl-convert.html + https://www.sherlock.stanford.edu/docs/software/list/?add:v=4.2.7#bcl-convert + biology, genomics + kilian@stanford.edu (Kilian Cavalotti) + Mon, 5 Feb 2024 10:12:19 -0800 + + + New module: system/py-spython version 0.3.13_py312 + Singularity Python (spython) is the Python API for working with Singularity containers. + https://github.com/singularityhub/singularity-cli + https://www.sherlock.stanford.edu/docs/software/list/?add:v=0.3.13_py312#py-spython + system, containers + kilian@stanford.edu (Kilian Cavalotti) + Thu, 25 Jan 2024 12:08:39 -0800 + + + New module: system/py-spython version 0.3.13_py39 + Singularity Python (spython) is the Python API for working with Singularity containers. + https://github.com/singularityhub/singularity-cli + https://www.sherlock.stanford.edu/docs/software/list/?add:v=0.3.13_py39#py-spython + system, containers + kilian@stanford.edu (Kilian Cavalotti) + Thu, 25 Jan 2024 12:08:39 -0800 + + + New version: system/restic version 0.16.3 + Fast, secure, efficient backup program. 
+ https://restic.net + https://www.sherlock.stanford.edu/docs/software/list/?add:v=0.16.3#restic + system, backup + kilian@stanford.edu (Kilian Cavalotti) + Fri, 19 Jan 2024 14:49:47 -0800 + + + New version: math/py-numpy version 1.26.3_py312 + NumPy is the fundamental package for scientific computing with Python. + http://www.numpy.org + https://www.sherlock.stanford.edu/docs/software/list/?add:v=1.26.3_py312#py-numpy + math, numerical library + kilian@stanford.edu (Kilian Cavalotti) + Fri, 12 Jan 2024 18:23:23 -0800 + + + New version: math/openblas version 0.3.26 + OpenBLAS is an optimized BLAS library + http://www.openblas.net/ + https://www.sherlock.stanford.edu/docs/software/list/?add:v=0.3.26#openblas + math, numerical library + kilian@stanford.edu (Kilian Cavalotti) + Fri, 12 Jan 2024 18:19:06 -0800 + + + New version: devel/python version 3.12.1 + Python is an interpreted, interactive, object-oriented programming language. + https://www.python.org + https://www.sherlock.stanford.edu/docs/software/list/?add:v=3.12.1#python + devel, language + kilian@stanford.edu (Kilian Cavalotti) + Fri, 12 Jan 2024 17:11:46 -0800 + + + New version: system/sqlite version 3.44.2 + SQLite is a self-contained, high-reliability, embedded, full-featured, public-domain, SQL database engine. + https://www.sqlite.org + https://www.sherlock.stanford.edu/docs/software/list/?add:v=3.44.2#sqlite + system, database + kilian@stanford.edu (Kilian Cavalotti) + Fri, 12 Jan 2024 16:35:14 -0800 + + + New version: system/ncurses version 6.4 + The ncurses (new curses) library is a free software emulation of curses in System V Release 4.0 (SVr4), and more. 
+ https://www.gnu.org/software/ncurses + https://www.sherlock.stanford.edu/docs/software/list/?add:v=6.4#ncurses + system, libs + kilian@stanford.edu (Kilian Cavalotti) + Fri, 12 Jan 2024 16:14:15 -0800 + + + New version: biology/mixcr version 4.6.0 + MiXCR is a universal framework that processes big immunome data from raw sequences to quantitated clonotypes. + https://github.com/milaboratory/mixcr + https://www.sherlock.stanford.edu/docs/software/list/?add:v=4.6.0#mixcr + biology, genomics + kilian@stanford.edu (Kilian Cavalotti) + Fri, 12 Jan 2024 15:18:30 -0800 + + + New version: system/readline version 8.2 + The GNU Readline library provides a set of functions for use by applications that allow users to edit command lines as they are typed in. + https://cnswww.cns.cwru.edu/php/chet/readline/rltop.html + https://www.sherlock.stanford.edu/docs/software/list/?add:v=8.2#readline + system, libs + kilian@stanford.edu (Kilian Cavalotti) + Wed, 10 Jan 2024 18:17:31 -0800 + + + New version: math/R version 4.3.2 + R is a free software environment for statistical computing and graphics. + http://r-project.org + https://www.sherlock.stanford.edu/docs/software/list/?add:v=4.3.2#R + math, statistics + kilian@stanford.edu (Kilian Cavalotti) + Wed, 10 Jan 2024 17:02:35 -0800 + + + New version: biology/eman2 version 2.91 + EMAN2 is a broadly based greyscale scientific image processing suite with a primary focus on processing data from transmission electron microscopes. + http://blake.bcm.edu/emanwiki/EMAN2 + https://www.sherlock.stanford.edu/docs/software/list/?add:v=2.91#eman2 + biology, cryo-em + kilian@stanford.edu (Kilian Cavalotti) + Tue, 9 Jan 2024 16:42:47 -0800 + + + New module: system/ftgl version 2.1.2 + FTGL is a free cross-platform Open Source C++ library that uses Freetype2 to simplify rendering fonts in OpenGL applications. 
+ https://sourceforge.net/projects/ftgl/ + https://www.sherlock.stanford.edu/docs/software/list/?add:v=2.1.2#ftgl + system, libs + kilian@stanford.edu (Kilian Cavalotti) + Tue, 9 Jan 2024 15:27:48 -0800 + + + New module: devel/py-nose version 1.3.7_py39 + nose is nicer testing for python. + https://github.com/nose-devs/nose + https://www.sherlock.stanford.edu/docs/software/list/?add:v=1.3.7_py39#py-nose + devel, lib + kilian@stanford.edu (Kilian Cavalotti) + Tue, 9 Jan 2024 14:38:55 -0800 + + + New module: math/suitesparse version 7.4.0 + SuiteSparse is a suite of sparse matrix algorithms. + https://people.engr.tamu.edu/davis/suitesparse.html + https://www.sherlock.stanford.edu/docs/software/list/?add:v=7.4.0#suitesparse + math, numerical library + kilian@stanford.edu (Kilian Cavalotti) + Tue, 9 Jan 2024 14:01:12 -0800 + + + New module: math/maxima version 5.47.0 + Maxima is a system for the manipulation of symbolic and numerical expressions. + https://maxima.sourceforge.io/ + https://www.sherlock.stanford.edu/docs/software/list/?add:v=5.47.0#maxima + math, symbolic + kilian@stanford.edu (Kilian Cavalotti) + Tue, 9 Jan 2024 12:01:51 -0800 + + + New version: devel/julia version 1.10.0 + Julia is a high-level, high-performance dynamic programming language for numerical computing. + https://julialang.org + https://www.sherlock.stanford.edu/docs/software/list/?add:v=1.10.0#julia + devel, language + kilian@stanford.edu (Kilian Cavalotti) + Mon, 8 Jan 2024 14:28:41 -0800 + + + New module: devel/gcl version 2.6.14 + GCL is the official Common Lisp for the GNU project. + https://www.gnu.org/software/gcl/ + https://www.sherlock.stanford.edu/docs/software/list/?add:v=2.6.14#gcl + devel, language + kilian@stanford.edu (Kilian Cavalotti) + Mon, 8 Jan 2024 14:23:50 -0800 + + + New version: physics/geos version 3.12.1 + GEOS is a C/C++ library for computational geometry with a focus on algorithms used in geographic information systems (GIS) software. 
+ https://libgeos.org + https://www.sherlock.stanford.edu/docs/software/list/?add:v=3.12.1#geos + physics, geoscience + kilian@stanford.edu (Kilian Cavalotti) + Tue, 12 Dec 2023 11:17:10 -0800 + + + New version: biology/kallisto version 0.50.1 + kallisto is a program for quantifying abundances of transcripts from RNA-Seq data using high-throughput sequencing reads. + https://pachterlab.github.io/kallisto/ + https://www.sherlock.stanford.edu/docs/software/list/?add:v=0.50.1#kallisto + biology, genomics + kilian@stanford.edu (Kilian Cavalotti) + Wed, 29 Nov 2023 13:50:00 -0800 + + + New version: system/py-matlab-proxy version 0.10.0_py39 + matlab-proxy is a Python package which enables you to launch MATLAB and access it from a web browser. + https://github.com/mathworks/matlab-proxy + https://www.sherlock.stanford.edu/docs/software/list/?add:v=0.10.0_py39#py-matlab-proxy + system, tools + kilian@stanford.edu (Kilian Cavalotti) + Wed, 29 Nov 2023 10:37:32 -0800 + + + New version: system/rclone version 1.65.0 + Rclone is a command line program to sync files and directories to and from + https://rclone.org + https://www.sherlock.stanford.edu/docs/software/list/?add:v=1.65.0#rclone + system, file transfer + kilian@stanford.edu (Kilian Cavalotti) + Tue, 28 Nov 2023 07:59:13 -0800 + + + New module: devel/darshan version 3.4.4 + Darshan is a scalable HPC I/O characterization tool. + https://www.mcs.anl.gov/research/projects/darshan/ + https://www.sherlock.stanford.edu/docs/software/list/?add:v=3.4.4#darshan + devel, profiling + kilian@stanford.edu (Kilian Cavalotti) + Thu, 16 Nov 2023 17:34:11 -0800 + + + New module: system/py-matlab-proxy version 0.9.1_py39 + matlab-proxy is a Python package which enables you to launch MATLAB and access it from a web browser. 
+ https://github.com/mathworks/matlab-proxy + https://www.sherlock.stanford.edu/docs/software/list/?add:v=0.9.1_py39#py-matlab-proxy + system, tools + kilian@stanford.edu (Kilian Cavalotti) + Thu, 16 Nov 2023 13:19:14 -0800 + + + New version: system/py-globus-cli version 3.19.0_py39 + A command line wrapper over the Globus SDK for Python. + https://github.com/globus/globus-cli + https://www.sherlock.stanford.edu/docs/software/list/?add:v=3.19.0_py39#py-globus-cli + system, file transfer + kilian@stanford.edu (Kilian Cavalotti) + Thu, 16 Nov 2023 13:18:58 -0800 + + + New version: math/rstudio version 2023.09.1 + RStudio is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management. + http://www.rstudio.com + https://www.sherlock.stanford.edu/docs/software/list/?add:v=2023.09.1#rstudio + math, statistics + kilian@stanford.edu (Kilian Cavalotti) + Thu, 16 Nov 2023 13:18:43 -0800 + + + diff --git a/docs/software/using/R/index.html b/docs/software/using/R/index.html new file mode 100644 index 000000000..83a872995 --- /dev/null +++ b/docs/software/using/R/index.html @@ -0,0 +1,310 @@ + R - Sherlock

R

Introduction#

R is a programming language and software environment for statistical computing and graphics. It is similar to the S language and environment developed at Bell Laboratories. R provides a wide variety of statistical and graphical techniques and is highly extensible.

More documentation#

The following documentation is specifically intended for using R on Sherlock. For more complete documentation about R in general, please see the R documentation.

R on Sherlock#

R is available on Sherlock and the corresponding module can be loaded with:

$ ml R
+

For a list of available versions, you can execute ml spider R at the Sherlock prompt, or refer to the Software list page.

Using R#

Once your environment is configured (i.e. when the R module is loaded), R can be started by simply typing R at the shell prompt:

$ R
+
+R version 3.5.1 (2018-07-02) -- "Feather Spray"
+Copyright (C) 2018 The R Foundation for Statistical Computing
+Platform: x86_64-pc-linux-gnu (64-bit)
+[...]
+Type 'demo()' for some demos, 'help()' for on-line help, or
+'help.start()' for an HTML browser interface to help.
+Type 'q()' to quit R.
+
+>
+

For a listing of command line options:

$ R --help
+

Running a R script#

There are several ways to launch an R script on the command line, which will have different ways of presenting the script's output:

Method Output
Rscript script.R displayed on screen, on stdout
R CMD BATCH script.R redirected to a script.Rout file
R --no-save < script.R displayed on screen, on stdout

Submitting a R job#

Here's an example R batch script that can be submitted via sbatch. It runs a simple matrix multiplication example, and demonstrates how to feed R code as a HEREDOC to R directly, so no intermediate R script is necessary:

#!/usr/bin/bash
+#SBATCH --time=00:10:00
+#SBATCH --mem=10G
+#SBATCH --output=Rtest.log
+
+# load the module
+ml R
+
+# run R code
+R --no-save << EOF
+set.seed (1)
+m <- 4000
+n <- 4000
+A <- matrix (runif (m*n),m,n)
+system.time (B <- crossprod(A))
+EOF
+

You can save this script as Rtest.sbatch and submit it to the scheduler with:

$ sbatch Rtest.sbatch
+

Once the job is done, you should get an Rtest.log file in the current directory (as set by the --output option in the batch script), with the following contents:

R version 3.5.1 (2018-07-02) -- "Feather Spray"
+[...]
+> set.seed (1)
+> m <- 4000
+> n <- 4000
+> A <- matrix (runif (m*n),m,n)
+> system.time (B <- crossprod(A))
+   user  system elapsed
+  2.649   0.077   2.726
+

R packages#

R comes with a single package library in $R_HOME/library, which contains the standard and most common packages. This is usually in a system location and is not writable by end-users.

To accommodate individual users' requirements, R provides a way for each user to install packages in the location of their choice. The default value for a directory where users can install their own R packages is $HOME/R/x86_64-pc-linux-gnu-library/<R_version> where <R_version> depends on the R version that is used. For instance, if you have the R/3.5.1 module loaded, the default R user library path will be $HOME/R/x86_64-pc-linux-gnu-library/3.5.

This directory doesn't exist by default. The first time a user installs a package, R will ask if she wants to use the default location and create the directory.

Installing packages#

Install R packages in a standard shell session

Make sure to install your packages in a standard Sherlock shell session, not in an RStudio session.

To install a R package in your personal environment, the first thing to do is load the R module:

$ ml R
+

Then start a R session, and use the install.packages() function at the R prompt. For instance, the following example will install the doParallel package, using the US mirror of the CRAN repository:

$ R
+
+R version 3.5.1 (2018-07-02) -- "Feather Spray"
+[...]
+
+> install.packages('doParallel', repos='http://cran.us.r-project.org')
+

It should give the following warning:

Warning in install.packages("doParallel", repos = "http://cran.us.r-project.org") :
+  'lib = "/share/software/user/open/R/3.5.1/lib64/R/library"' is not writable
+Would you like to use a personal library instead? (yes/No/cancel)
+Would you like to create a personal library
+‘~/R/x86_64-pc-linux-gnu-library/3.5’
+to install packages into? (yes/No/cancel) y
+

Answering y twice will make R create a ~/R/x86_64-pc-linux-gnu-library/3.5 directory and instruct it to install future R packages there.

The installation will then proceed:

trying URL 'http://cran.us.r-project.org/src/contrib/doParallel_1.0.14.tar.gz'
+Content type 'application/x-gzip' length 173607 bytes (169 KB)
+==================================================
+downloaded 169 KB
+
+* installing *source* package ‘doParallel’ ...
+** package ‘doParallel’ successfully unpacked and MD5 sums checked
+** R
+** demo
+** inst
+** byte-compile and prepare package for lazy loading
+** help
+*** installing help indices
+** building package indices
+** installing vignettes
+** testing if installed package can be loaded
+* DONE (doParallel)
+
+The downloaded source packages are in
+        ‘/tmp/Rtmp0RHrMZ/downloaded_packages’
+>
+

and when it's done, you should be able to load the package within R with:

> library(doParallel)
+Loading required package: foreach
+Loading required package: iterators
+Loading required package: parallel
+>
+
Installing large packages#

Installing large R packages can sometimes be very time consuming. To speed things up, R can utilize multiple CPUs in parallel when the Ncpus=n option is added to the install.packages() command (where n is the number of CPUs you'd like to use).

For instance, you can get an interactive session with 4 CPU cores with sh_dev:

$ sh_dev -c 4
+$ ml R
+$ R
+> install.packages("dplyr", repos = "http://cran.us.r-project.org", Ncpus=4)
+
Alternative installation path#

To install R packages in a different location, you'll need to create that directory, and instruct R to install the packages there:

$ mkdir ~/R_libs/
+$ R
+> install.packages('doParallel', repos='http://cran.us.r-project.org', lib="~/R_libs")
+

The installation will proceed normally and the doParallel package will be installed in $HOME/R_libs/.

Specifying the full destination path for each package installation could quickly become tiresome, so to avoid this, you can create a .Renviron file in your $HOME directory, and define your R_libs path there:

$ cat << EOF > $HOME/.Renviron
+R_LIBS=~/R_libs
+EOF
+

With this, whenever R is started, the $HOME/R_libs/ directory will be added to the list of places R will look for packages, and you won't need to specify this installation path when using install.packages() anymore.

Where does R look for packages?

To see the directories where R searches for packages and libraries, you can use the following command in R:

> .libPaths()
+

Sharing R packages

If you'd like to share R packages within your group, you can simply define $R_LIBS to point to a shared directory, such as $GROUP_HOME/R_libs and have each user in the group use the instructions above (creating a .Renviron file) to define it in their own environment.

Setting the installation repository#

When installing a package, R needs to know from which repository the package should be downloaded. If it's not specified, it will prompt for it and display a list of available CRAN mirrors.

To avoid setting the CRAN mirror each time you run install.packages you can permanently set the mirror by creating a .Rprofile file in your $HOME directory, which R will execute each time it starts.

For instance, adding the following contents to your ~/.Rprofile will make sure that every install.packages() invocation will use the closest CRAN mirror:

## local creates a new, empty environment
+## This avoids polluting the global environment with
+## the object r
+local({
+  r = getOption("repos")
+  r["CRAN"] = "https://cloud.r-project.org/"
+  options(repos = r)
+})
+

Once this is set, you only need to specify the name of the package to install, and R will use the mirror you defined automatically:

> install.packages("doParallel")
+[...]
+trying URL 'https://cloud.r-project.org/src/contrib/doParallel_1.0.14.tar.gz'
+Content type 'application/x-gzip' length 173607 bytes (169 KB)
+==================================================
+downloaded 169 KB
+
Installing packages from GitHub#

R packages can be directly installed from GitHub using the devtools package. devtools needs to be installed first, with:

> install.packages("devtools")
+

You can then install an R package directly from its GitHub repository. For instance, to install dplyr from tidyverse/dplyr:

> library(devtools)
+> install_github("tidyverse/dplyr")
+

Package dependencies#

Sometimes when installing R packages, other software is needed for the installation and/or compilation. For instance, when trying to install the sf package, you may encounter the following error messages:

> install.packages("sf")
+[...]
+Configuration failed because libudunits2.so was not found. Try installing:...
+[...]
+configure: error: gdal-config not found or not executable.
+

This is because sf needs a few dependencies, like udunits and gdal in order to compile and install successfully. Fortunately those dependencies are already available as modules on Sherlock.

Whenever you see "not found" errors, you may want to try searching the modules inventory with module spider:

$ module spider udunits
+
+----------------------------------------------------------------------------
+  udunits: udunits/2.2.26
+----------------------------------------------------------------------------
+    Description:
+      The UDUNITS package from Unidata is a C-based package for the
+      programmatic handling of units of physical quantities.
+
+
+    You will need to load all module(s) on any one of the lines below before
+    the "udunits/2.2.26" module is available to load.
+
+      physics
+

So for sf, in order to load the dependencies, exit R, load the physics category along with the udunits, gdal and geos modules, and try installing sf again:

$ ml load physics udunits gdal geos
+$ ml R
+$ R
+> install.packages("sf")
+

Getting dependencies right could be a matter of trial and error. You may have to load R, install packages, search modules, load modules, install packages again and so forth. Fortunately, R packages only need to be installed once, and many R package dependencies are already available as modules on Sherlock, you just need to search for them with module spider and load them.

And in case you're stuck, you can of course always send us an email and we'll be happy to assist.

Updating Packages#

To upgrade R packages, you can use the update.packages() function within a R session.

For instance, to update the doParallel package:

> update.packages(oldPkgs='doParallel')
+

When the package name is omitted, update.packages() will try to update all the packages that are installed, which is the most efficient way to ensure that all the packages in your local R library are up to date.

Centrally installed packages can not be updated

Note that attempting to update centrally installed packages will fail. You will have to use install.packages() to install your own version of the packages in your $HOME directory instead.

Removing packages#

To remove a package from your local R library, you can use the remove.packages() function. For instance:

> remove.packages('doParallel')
+

Examples#

Installing devtools#

devtools is a package that provides R functions that simplify many common tasks. While its core functionality revolves around package development, devtools can also be used to install packages, particularly those on GitHub.

Installing devtools is somewhat memory-intensive and has several dependencies. The following example shows how to run an interactive session with 4 CPUs, load the modules for the necessary dependencies, and install devtools for R version 4.2.0.

# Launch interactive dev session with 4 CPUs
+
+$ sh_dev -c 4
+
+# Load the required modules
+
+$ ml purge
+$ ml R/4.2.0
+$ ml system harfbuzz fribidi
+$ ml cmake libgit2
+$ ml openssl
+
+# Launch R and install devtools
+
+$ R
+> install.packages("devtools", repos = "http://cran.us.r-project.org", Ncpus=4)
+

Single node#

R has a couple of powerful and easy-to-use tools to parallelize your R jobs. doParallel is one of them. If the doParallel package is not installed in your environment yet, you can install it in a few easy steps.

Here is a quick doParallel example that uses one node and 16 cores on Sherlock (more nodes or CPU cores can be requested, as needed).

Save the two scripts below in a directory on Sherlock, as doParallel_test.R and doParallel_test.sbatch:

# Example doParallel script
+
+if(!require(doParallel)) install.packages("doParallel")
+library(doParallel)
+
+# use the environment variable SLURM_NTASKS_PER_NODE to set
+# the number of cores to use
+registerDoParallel(cores=(Sys.getenv("SLURM_NTASKS_PER_NODE")))
+
+# bootstrap iteration example
+x <- iris[which(iris[,5] != "setosa"), c(1,5)]
+iterations <- 10000# Number of iterations to run
+
+# parallel loop
+# note the '%dopar%' instruction
+parallel_time <- system.time({
+  r <- foreach(icount(iterations), .combine=cbind) %dopar% {
+    ind <- sample(100, 100, replace=TRUE)
+    result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
+    coefficients(result1)
+  }
+})[3]
+
+# show the number of parallel workers to be used
+getDoParWorkers()
+
+# execute the function
+parallel_time
+
#!/bin/bash
+
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=16
+#SBATCH --output=doParallel_test.log
+
+# --ntasks-per-node will be used in doParallel_test.R to specify the number
+# of cores to use on the machine.
+
+# load modules
+ml R/3.5.1
+
+# execute script
+Rscript doParallel_test.R
+

And then submit the job with:

$ sbatch doParallel_test.sbatch
+

Once the job has completed, the output file should contain something like this:

$ cat doParallel_test.log
+[1] "16"
+elapsed
+  3.551
+

Bonus points: observe the scalability of the doParallel loop by submitting the same script using a varying number of CPU cores:

$ for i in 2 4 8 16; do
+    sbatch --out=doP_${i}.out --ntasks-per-node=$i doParallel_test.sbatch
+done
+

When the jobs are done:

$ for i in 2 4 8 16; do
+    printf "%2i cores: %4.1fs\n" $i $(tail -n1 doP_$i.out)
+done
+ 2 cores: 13.6s
+ 4 cores:  7.8s
+ 8 cores:  4.9s
+16 cores:  3.6s
+

Multiple nodes#

To distribute parallel R tasks on multiple nodes, you can use the Rmpi package, which provides MPI bindings for R.

To install the Rmpi package, a module providing an MPI library must first be loaded. For instance:

$ ml openmpi R
+$ R
+> install.packages("Rmpi")
+

Once the package is installed, the following scripts demonstrate a very basic Rmpi example.

# Example Rmpi script
+
+if (!require("Rmpi")) install.packages("Rmpi")
+library(Rmpi)
+
+# initialize an Rmpi environment
+ns <- mpi.universe.size() - 1
+mpi.spawn.Rslaves(nslaves=ns, needlog=TRUE)
+
+# send these commands to the slaves
+mpi.bcast.cmd( id <- mpi.comm.rank() )
+mpi.bcast.cmd( ns <- mpi.comm.size() )
+mpi.bcast.cmd( host <- mpi.get.processor.name() )
+
+# all slaves execute this command
+mpi.remote.exec(paste("I am", id, "of", ns, "running on", host))
+
+# close down the Rmpi environment
+mpi.close.Rslaves(dellog = FALSE)
+mpi.exit()
+
#!/bin/bash
+
+#SBATCH --nodes=2
+#SBATCH --ntasks=4
+#SBATCH --output=Rmpi-test.log
+
+## load modules
+# openmpi is not loaded by default with R, so it must be loaded explicitly
+ml R openmpi
+
+## run script
+# we use '-np 1' since Rmpi does its own task management
+mpirun -np 1 Rscript Rmpi-test.R
+

You can save those scripts as Rmpi-test.R and Rmpi-test.sbatch and then submit your job with:

$ sbatch Rmpi-test.sbatch
+

When the job is done, its output should look like this:

$ cat Rmpi-test.log
+        3 slaves are spawned successfully. 0 failed.
+master (rank 0, comm 1) of size 4 is running on: sh-06-33
+slave1 (rank 1, comm 1) of size 4 is running on: sh-06-33
+slave2 (rank 2, comm 1) of size 4 is running on: sh-06-33
+slave3 (rank 3, comm 1) of size 4 is running on: sh-06-34
+$slave1
+[1] "I am 1 of 4 running on sh-06-33"
+
+$slave2
+[1] "I am 2 of 4 running on sh-06-33"
+
+$slave3
+[1] "I am 3 of 4 running on sh-06-34"
+
+[1] 1
+[1] "Detaching Rmpi. Rmpi cannot be used unless relaunching R."
+

GPUs#

Here's a quick example that compares running a matrix multiplication on a CPU and on a GPU using R. It requires submitting a job to a GPU node and the gpuR R package.

# Example gpuR script
+
+if (!require("gpuR")) install.packages("gpuR")
+library(gpuR)
+
+print("CPU times")
+for(i in seq(1:7)) {
+    ORDER = 64*(2^i)
+    A = matrix(rnorm(ORDER^2), nrow=ORDER)
+    B = matrix(rnorm(ORDER^2), nrow=ORDER)
+    print(paste(i, sprintf("%5.2f", system.time({C = A %*% B})[3])))
+}
+
+print("GPU times")
+for(i in seq(1:7)) {
+    ORDER = 64*(2^i)
+    A = matrix(rnorm(ORDER^2), nrow=ORDER)
+    B = matrix(rnorm(ORDER^2), nrow=ORDER)
+    gpuA = gpuMatrix(A, type="double")
+    gpuB = gpuMatrix(B, type="double")
+    print(paste(i, sprintf("%5.2f", system.time({gpuC = gpuA %*% gpuB})[3])))
+}
+
#!/bin/bash
+
+#SBATCH --partition gpu
+#SBATCH --mem 8GB
+#SBATCH --gres gpu:1
+#SBATCH --output=gpuR-test.log
+
+## load modules
+# cuda is not loaded by default with R, so it must be loaded explicitly
+ml R cuda
+
+Rscript gpuR-test.R
+

After submitting the job with sbatch gpuR-test.sbatch, the output file should contain something like this:

[1] "CPU times"
+[1] "1  0.00"
+[1] "2  0.00"
+[1] "3  0.02"
+[1] "4  0.13"
+[1] "5  0.97"
+[1] "6  7.56"
+[1] "7 60.47"
+
+[1] "GPU times"
+[1] "1  0.10"
+[1] "2  0.04"
+[1] "3  0.02"
+[1] "4  0.07"
+[1] "5  0.39"
+[1] "6  2.04"
+[1] "7 11.59"
+

which shows a decent speedup for running on a GPU for the largest matrix sizes.

\ No newline at end of file diff --git a/docs/software/using/anaconda/index.html b/docs/software/using/anaconda/index.html new file mode 100644 index 000000000..ef1c4085e --- /dev/null +++ b/docs/software/using/anaconda/index.html @@ -0,0 +1 @@ + Anaconda - Sherlock

Anaconda

Introduction#

Anaconda is a Python/R distribution that aims to simplify package management and deployment for scientific computing. Although it can have merits on individual computers, it's often counter-productive on shared HPC systems like Sherlock.

Avoid using Anaconda on Sherlock

We recommend NOT using Anaconda on Sherlock, and instead consider other options like virtual environments or containers.

Why Anaconda should be avoided on Sherlock#

Anaconda is widely used in several scientific domains, like data science, AI/ML and bio-informatics, and is often listed in software documentation as the recommended (if not only) way to install that software.

It is a useful solution for simplifying the management of Python and scientific libraries on a personal computer. However, on highly-specialized HPC systems like Sherlock, management of these libraries and dependencies should be done by SRCC staff, to ensure compatibility and optimal performance on the cluster hardware.

For instance:

  • Anaconda very often installs software (compilers, scientific libraries etc.) which already exists on Sherlock as modules, and does so in a sub-optimal fashion, by installing non-optimized versions and configurations,
  • It installs binaries which are not optimized for the processor architectures on Sherlock,
  • it makes incorrect assumptions about the location of various system libraries,
  • Anaconda installs software in $HOME by default, where it writes large amounts of files. A single Anaconda installation can easily fill up your $HOME directory quota, and makes things difficult to manage,
  • Anaconda installations can't easily be relocated,
  • Anaconda modifies your $HOME/.bashrc file, which can easily cause conflicts and slow things down when you log in.

Worse, a Conda recipe can force the installation of R (even though it's already available on Sherlock). This installation won't perform nearly as well as the version we provide as a module (which uses optimized libraries), or may not work at all: the jobs launched with it may crash and end up wasting both computing resources and your time.

Installation issues

If you absolutely need to install anaconda/miniconda, please note that because of the large number of files that the installer will try to open, this will likely fail on a login node. So make sure to run the installation on a compute node, for instance using the sh_dev command.

What to do instead#

Use a virtual environment#

Instead of using Anaconda for your project, or when the installation instructions of the software you want to install are using it, you can use a virtual environment.

A virtual environment offers all the functionality you need to use Python on Sherlock. You can convert Anaconda instructions and use a virtual environment instead, by following these steps:

  1. list the dependencies (also called requirements) of the application you want to use:
    • check if there is a requirements.txt file in the Git repository or in the software sources,
    • or, check the install_requires variable in the setup.py file, which lists the requirements.
  2. find which dependencies are Python modules and which are libraries provided by Anaconda. For example, CUDA and CuDNN are libraries that Anaconda can install, but which should not be re-installed as they are already available as modules on Sherlock,
  3. remove from the list of dependencies everything which is not a Python module (e.g. cudatoolkit and cudnn),
  4. create a virtual environment to install your dependencies.

And that's it: your software should run, without Anaconda. If you have any issues, please don't hesitate to contact us.

Use a container#

In some situations, the complexity of a program's dependencies requires the use of a solution where you can control the entire software environment. In these situations, we recommend using a container.

Tip

Existing Docker images can easily be converted into Apptainer/Singularity images.

The only potential downside of using containers is their size and the associated storage usage. But if your research group plans on using several container images, it could be useful to collect them all in a single location (like $GROUP_HOME) to avoid duplication.

\ No newline at end of file diff --git a/docs/software/using/clustershell/index.html b/docs/software/using/clustershell/index.html new file mode 100644 index 000000000..480669ba4 --- /dev/null +++ b/docs/software/using/clustershell/index.html @@ -0,0 +1,87 @@ + ClusterShell - Sherlock

ClusterShell

Introduction#

ClusterShell is a command-line tool and library that helps run commands in parallel on multiple servers. It allows executing arbitrary commands across multiple hosts. On Sherlock, it provides an easy way to run commands on nodes your jobs are running on, and collect back information. The two most useful commands provided are cluset, which can manipulate lists of nodenames, and clush, which can run commands on multiple nodes at once.

More documentation#

The following documentation is specifically intended for using ClusterShell on Sherlock. For more complete documentation about ClusterShell in general, please see the ClusterShell documentation.

The ClusterShell library can also be directly integrated in your Python scripts, to add a wide range of functionality. See the ClusterShell Python API documentation for reference.

ClusterShell on Sherlock#

ClusterShell is available on Sherlock and the corresponding module can be loaded with:

$ ml system py-clustershell
+

cluset#

The cluset command can be used to easily manipulate lists of node names, and to expand, fold, or count them:

$ cluset --expand sh03-01n[01-06]
+sh03-01n01 sh03-01n02 sh03-01n03 sh03-01n04 sh03-01n05 sh03-01n06
+
+$ cluset --count sh03-01n[01-06]
+6
+
+$ cluset --fold sh03-01n01 sh03-01n02 sh03-01n03 sh03-01n06
+sh03-01n[01-03,06]
+

clush#

The clush command uses the same node list syntax to allow running the same commands simultaneously on those nodes. clush uses SSH to connect to each of these nodes.

Warning

You can only SSH to nodes where your jobs are running, and as a consequence, clush will only work on those nodes.

For instance, to check the load on multiple compute nodes at once:

$ clush -w sh03-01n[01-03] cat /proc/loadavg
+sh03-01n01: 19.48 14.43 11.76 22/731 22897
+sh03-01n02: 13.20 13.29 13.64 14/831 1163
+sh03-01n03: 11.60 11.48 11.82 18/893 23945
+

Gathering identical output

Using the -b option will regroup similar output lines to make large outputs easier to read. By default, the output of each node will be presented separately.

For instance, without -b:

$ clush -w sh03-01n[01-03] echo ok
+sh03-01n02: ok
+sh03-01n03: ok
+sh03-01n01: ok
+

With -b:

$ clush -bw sh03-01n[01-03] echo ok
+---------------
+sh03-01n[01-03] (3)
+---------------
+ok
+

Slurm integration#

On Sherlock, ClusterShell is also tightly integrated with the job scheduler, and can directly provide information about a user's jobs and the nodes they're running on. You can use the following groups to get specific node lists:

group name short name action example
@user: @u: list nodes where user has jobs running cluset -f @user:$USER
@job: @j: list nodes where job is running cluset -f @job:123456
@nodestate: @node:,@n: list nodes in given state cluset -f @nodestate:idle
@partition: @part:,@p: list nodes in given partition cluset -f @partition:gpu

For instance, to get the list of nodes where job 123456 is running:

$ cluset -f @job:123456
+

Examples#

Job information#

For instance, if job 1988522 from user kilian is running on nodes sh02-01n[59-60], squeue would display this:

$ squeue -u kilian
+       JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
+     1988522    normal interact   kilian  R       1:30      2 sh02-01n[59-60]
+     1988523    normal interact   kilian  R       1:28      2 sh02-01n[61-62]
+

With ClusterShell, you could get:

  • the list of node names where user kilian has jobs running:

    $ cluset -f @user:kilian
    +sh02-01n[59-62]
    +
  • the nodes where job 1988522 is running, in an expanded form:

    $ cluset -e @job:1988522
    +sh02-01n59 sh02-01n60
    +

Node states#

You can also use those bindings to get lists of nodes in a particular state, in a given partition. For instance, to list the nodes that are in "mixed" state in the dev partition, you can request the intersection between the @nodestate:mixed and @partition:dev node lists:

$ cluset -f @nodestate:mixed -i @partition:dev
+sh02-01n[57-58]
+

Local storage#

To get a list of files in $L_SCRATCH on all the nodes that are part of job 1988522:

$ clush -w@j:1988522 tree $L_SCRATCH
+sh02-01n59: /lscratch/kilian
+sh02-01n59: ├── 1988522
+sh02-01n59: │   └── foo
+sh02-01n59: │       └── bar
+sh02-01n59: └── 1993608
+sh02-01n59:
+sh02-01n59: 3 directories, 1 file
+sh02-01n60: /lscratch/kilian
+sh02-01n60: └── 1988522
+sh02-01n60:
+sh02-01n60: 1 directory, 0 files
+

Process tree#

To display your process tree across all the nodes your jobs are running on:

$ clush -w @u:$USER pstree -au $USER
+sh02-09n71: mpiBench
+sh02-09n71:   `-3*[{mpiBench}]
+sh02-09n71: mpiBench
+sh02-09n71:   `-3*[{mpiBench}]
+sh02-09n71: mpiBench
+sh02-09n71:   `-3*[{mpiBench}]
+sh02-09n71: mpiBench
+sh02-09n71:   `-3*[{mpiBench}]
+sh02-10n01: mpiBench
+sh02-10n01:   `-3*[{mpiBench}]
+sh02-10n01: mpiBench
+sh02-10n01:   `-3*[{mpiBench}]
+sh02-10n01: mpiBench
+sh02-10n01:   `-3*[{mpiBench}]
+sh02-10n01: mpiBench
+sh02-10n01:   `-3*[{mpiBench}]
+

CPU usage#

To get the CPU and memory usage of your processes in job 2003264:

$ clush -w @j:2003264 ps -u$USER -o%cpu,rss,cmd
+sh03-07n12: %CPU   RSS CMD
+sh03-07n12:  0.0  4780 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-07n12:  0.0  4784 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-07n12:  0.0  4784 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-07n12:  0.0  4780 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n06: %CPU   RSS CMD
+sh03-06n06:  0.0 59596 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n06:  0.0 59576 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n06:  0.0 59580 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n06:  0.0 59588 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n05: %CPU   RSS CMD
+sh03-06n05:  0.0  7360 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n05:  0.0  7328 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n05:  0.0  7344 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n05:  0.0  7340 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n11: %CPU   RSS CMD
+sh03-06n11: 17.0 59604 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n11: 17.0 59588 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n11: 17.0 59592 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+sh03-06n11: 17.0 59580 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000
+

GPU usage#

To show what's running on all the GPUs on the nodes associated with job 123456:

$ clush -bw @job:123456 nvidia-smi --format=csv --query-compute-apps=process_name,utilization.memory
+sh03-12n01: /share/software/user/open/python/3.6.1/bin/python3.6, 15832 MiB
+sh02-12n04: /share/software/user/open/python/3.6.1/bin/python3.6, 15943 MiB
+
\ No newline at end of file diff --git a/docs/software/using/images/ngc_namd.png b/docs/software/using/images/ngc_namd.png new file mode 100644 index 000000000..716360901 Binary files /dev/null and b/docs/software/using/images/ngc_namd.png differ diff --git a/docs/software/using/julia/index.html b/docs/software/using/julia/index.html new file mode 100644 index 000000000..383d7476d --- /dev/null +++ b/docs/software/using/julia/index.html @@ -0,0 +1,116 @@ + Julia - Sherlock

Julia

Introduction#

Julia is a high-level general-purpose dynamic programming language that was originally designed to address the needs of high-performance numerical analysis and computational science, without the typical need of separate compilation to be fast, also usable for client and server web use, low-level systems programming or as a specification language. Julia aims to create an unprecedented combination of ease-of-use, power, and efficiency in a single language.

More documentation#

The following documentation is specifically intended for using Julia on Sherlock. For more complete documentation about Julia in general, please see the Julia documentation.

Julia on Sherlock#

Julia is available on Sherlock and the corresponding module can be loaded with:

$ ml julia
+

For a list of available versions, you can execute ml spider julia at the Sherlock prompt, or refer to the Software list page.

Using Julia#

Once your environment is configured (i.e. when the julia module is loaded), julia can be started by simply typing julia at the shell prompt:

$ julia
+
+_
+   _       _ _(_)_     |  Documentation: https://docs.julialang.org
+  (_)     | (_) (_)    |
+   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
+  | | | | | | |/ _` |  |
+  | | |_| | | | (_| |  |  Version 1.0.0 (2018-08-08)
+ _/ |\__'_|_|_|\__'_|  |  Official https://julialang.org/ release
+|__/                   |
+
+julia>
+

For a listing of command line options:

$ julia --help
+
+julia [switches] -- [programfile] [args...]
+ -v, --version             Display version information
+ -h, --help                Print this message
+
+ -J, --sysimage <file>     Start up with the given system image file
+ -H, --home <dir>          Set location of `julia` executable
+ --startup-file={yes|no}   Load `~/.julia/config/startup.jl`
+ --handle-signals={yes|no} Enable or disable Julia's default signal handlers
+ --sysimage-native-code={yes|no}
+                           Use native code from system image if available
+ --compiled-modules={yes|no}
+                           Enable or disable incremental precompilation of modules
+
+ -e, --eval <expr>         Evaluate <expr>
+ -E, --print <expr>        Evaluate <expr> and display the result
+ -L, --load <file>         Load <file> immediately on all processors
+
+ -p, --procs {N|auto}      Integer value N launches N additional local worker processes
+                           "auto" launches as many workers as the number
+                           of local CPU threads (logical cores)
+ --machine-file <file>     Run processes on hosts listed in <file>
+
+ -i                        Interactive mode; REPL runs and isinteractive() is true
+ -q, --quiet               Quiet startup: no banner, suppress REPL warnings
+

Running a Julia script#

A Julia program is easy to run on the command line outside of its interactive mode.

Here is an example where we create a simple Hello World program and launch it with Julia:

$ echo 'println("hello world")' > helloworld.jl
+

That script can now simply be executed by calling julia <script_name>:

$ julia helloworld.jl
+hello world
+

Submitting a Julia job#

Here's an example Julia sbatch script that can be submitted via sbatch:

#!/bin/bash
+
+#SBATCH --time=00:10:00
+#SBATCH --mem=4G
+#SBATCH --output=julia_test.log
+
+# load the module
+ml julia
+
+# run the Julia application
+julia helloworld.jl
+

You can save this script as julia_test.sbatch and submit it to the scheduler with:

$ sbatch julia_test.sbatch
+

Once the job is done, you should get a julia_test.log file in the current directory, with the following contents:

$ cat julia_test.log
+hello world
+

Julia packages#

Julia provides an ever-growing list of packages that can be used to install add-on functionality to your Julia code.

Installing packages with Julia is very simple. Julia includes a package module in its base installation that handles installing, updating, and removing packages.

First import the Pkg module:

julia> import Pkg
+julia> Pkg.status()
+    Status `~/.julia/environments/v1.0/Project.toml`
+

Julia packages only need to be installed once

You only need to install Julia packages once on Sherlock. Since filesystems are shared, packages installed on one node will immediately be available on all nodes on the cluster.

Installing packages#

You can first check the status of packages installed on Julia using the status function of the Pkg module:

julia> Pkg.status()
+No packages installed.
+

You can then add packages using the add function of the Pkg module:

julia> Pkg.add("Distributions")
+INFO: Cloning cache of Distributions from git://github.com/JuliaStats/Distributions.jl.git
+INFO: Cloning cache of NumericExtensions from git://github.com/lindahua/NumericExtensions.jl.git
+INFO: Cloning cache of Stats from git://github.com/JuliaStats/Stats.jl.git
+INFO: Installing Distributions v0.2.7
+INFO: Installing NumericExtensions v0.2.17
+INFO: Installing Stats v0.2.6
+INFO: REQUIRE updated.
+

Using the status function again, you can see that the package and its dependencies have been installed:

julia> Pkg.status()
+Required packages:
+ - Distributions                 0.2.7
+Additional packages:
+ - NumericExtensions             0.2.17
+ - Stats                         0.2.6
+

Updating Packages#

The update function of the Pkg module can update all packages installed:

julia> Pkg.update()
+INFO: Updating METADATA...
+INFO: Computing changes...
+INFO: Upgrading Distributions: v0.2.8 => v0.2.10
+INFO: Upgrading Stats: v0.2.7 => v0.2.8
+

Removing packages#

The rm function of the Pkg module can remove any packages installed as well:

julia> Pkg.rm("Distributions")
+INFO: Removing Distributions v0.2.7
+INFO: Removing Stats v0.2.6
+INFO: Removing NumericExtensions v0.2.17
+INFO: REQUIRE updated.
+
+julia> Pkg.status()
+Required packages:
+ - SHA                           0.3.2
+
+julia> Pkg.rm("SHA")
+INFO: Removing SHA v0.3.2
+INFO: REQUIRE updated.
+
+julia> Pkg.status()
+No packages installed.
+

Examples#

Parallel job#

Julia can natively spawn parallel workers across multiple compute nodes, without using MPI. There are two main modes of operation:

  1. ClusterManager: in this mode, you can spawn workers from within the Julia interpreter, and each worker will actually submit jobs to the scheduler, executing instructions within those jobs.

  2. using the --machine-file option: here, you submit a SLURM job and run the Julia interpreter in parallel mode within the job's resources.

The second mode is easier to use, and more convenient, since you have all your resources available and ready to use when the job starts. In mode 1, you'll need to wait for jobs to be dispatched and executed inside Julia.

Here is a quick example on how to use the --machine-file option on Sherlock.

Given the following Julia script (julia_parallel_test.jl) that will print a line with the process id and the node it's executing on, in parallel:

using Distributed
+@everywhere println("process: $(myid()) on host $(gethostname())")
+

You can submit the following job:

#!/bin/bash
+#SBATCH --nodes 2
+#SBATCH --ntasks-per-node 4
+#SBATCH --time 5:0
+
+ml julia
+julia --machine-file <(srun hostname -s)  ./julia_parallel_test.jl
+

Save as julia_test.sbatch, and then:

$ sbatch  julia_test.sbatch
+

It will:

  1. Request 2 nodes, 4 tasks per node (8 tasks total)
  2. load the julia module
  3. Run Julia in parallel with a machine file that is automatically generated, listing the nodes that are assigned to your job.

It should output something like this in your job's output file:

process: 1 on host sh-06-33.int
+      From worker 2:    process: 2 on host sh-06-33.int
+      From worker 3:    process: 3 on host sh-06-34.int
+      From worker 5:    process: 5 on host sh-06-33.int
+      From worker 4:    process: 4 on host sh-06-33.int
+      From worker 6:    process: 6 on host sh-06-33.int
+      From worker 8:    process: 8 on host sh-06-34.int
+      From worker 9:    process: 9 on host sh-06-34.int
+      From worker 7:    process: 7 on host sh-06-34.int
+
\ No newline at end of file diff --git a/docs/software/using/mariadb/index.html b/docs/software/using/mariadb/index.html new file mode 100644 index 000000000..7cf01bb7e --- /dev/null +++ b/docs/software/using/mariadb/index.html @@ -0,0 +1,97 @@ + MariaDB - Sherlock

MariaDB

Introduction#

MariaDB is a community-developed fork of the MySQL relational database management system. It is completely compatible with MySQL and can be used as a drop-in replacement in the vast majority of cases.

More documentation#

The following documentation is specifically intended for using MariaDB on Sherlock. For more complete documentation about MariaDB in general, please see the MariaDB documentation.

MariaDB on Sherlock#

We don't provide any centralized database service on Sherlock, but we provide a centralized installation of MariaDB, and each user is welcome to start their own instance of the database server to fit their jobs' needs.

The overall process to run an instance of MariaDB on Sherlock would look like this:

  1. configure and initialize your environment so you can start a database instance under your user account,
  2. start the database server,
  3. run SQL queries from the same node (via a local socket), or from other nodes and/or jobs (via the network).

Single-node access#

In that example, the database server and client will run within the same job, on the same compute node.

Preparation#

You first need to let MariaDB know where to store its database, where to log things, and how to allow connections from clients. The commands below only need to be executed once.

For this, you'll need to create a .my.cnf file in your home directory. Assuming you'll want to store your database files in a db/ directory in your $SCRATCH folder, you can run the following commands:

$ export DB_DIR=$SCRATCH/db
+$ mkdir $DB_DIR
+
+$ cat << EOF > ~/.my.cnf
+[mysqld]
+datadir=$DB_DIR
+socket=$DB_DIR/mariadb.sock
+user=$USER
+symbolic-links=0
+skip-networking
+
+[mysqld_safe]
+log-error=$DB_DIR/mariadbd.log
+pid-file=$DB_DIR/mariadbd.pid
+
+[mysql]
+socket=$DB_DIR/mariadb.sock
+EOF
+

.my.cnf doesn't support environment variables

Please note that if you edit your ~/.my.cnf file directly in a file editor, without using the HEREDOC syntax above, environment variables such as $DB_DIR, $HOME or $USER won't work: you will need to specify absolute paths explicitly, such as /scratch/users/kilian/db/mariadbd.log.

If you use the HEREDOC syntax, you can verify that the resulting .my.cnf file does actually contain full paths, and not environment variables anymore.

Once you have the .my.cnf file in place, you need to initialize your database with some internal data that MariaDB needs. In the same terminal, run the following commands:

$ ml system mariadb
+$ $MARIADB_DIR/scripts/mysql_install_db --basedir=$MARIADB_DIR  --datadir=$DB_DIR
+

Start the server#

You can now start the MariaDB server. For this, first get an allocation on a compute node, note the hostname of the compute node your job has been allocated, load the mariadb module, and then run the mysqld_safe process:

$ srun --pty bash
+$ echo $SLURM_JOB_NODELIST
+sh-01-01
+$ ml system mariadb
+$ mysqld_safe
+180705 18:14:27 mysqld_safe Logging to '/home/users/kilian/db/mysqld.log'.
+180705 18:14:28 mysqld_safe Starting mysqld daemon with databases from /home/users/kilian/db/
+

The mysqld_safe will be blocking, meaning it will not give the prompt back for as long as the MariaDB server runs.

If it does return on its own, it probably means that something went wrong, and you'll find more information about the issue in the $DB_DIR/mariadbd.log file you defined in ~/.my.cnf.

Run queries#

You're now ready to run queries against that MariaDB instance, from the same node your job is running on.

From another terminal on Sherlock, connect to your job's compute node (here, it's sh-01-01, as shown above), load the mariadb module, and then run the mysql command: it will open the MariaDB shell, ready to run your SQL queries:

$ ssh sh-01-01
+$ ml system mariadb
+$ mysql
+Welcome to the MariaDB monitor.  Commands end with ; or \g.
+Your MariaDB connection id is 8
+Server version: 10.2.11-MariaDB Source distribution
+
+Copyright (c) 2000, 2017, Oracle, MariaDB Corporation Ab and others.
+
+Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
+
+MariaDB [(none)]>
+

Once you're done with your MariaDB instance, you can just terminate your job, and all the processes will be terminated automatically.

Multi-node access#

In case you need to run a more persistent instance of MariaDB, you can for instance submit a dedicated job to run the server, make it accessible over the network, and run queries from other jobs and/or nodes.

Enable network access#

The preparation steps are pretty similar to the single-node case, except the MariaDB server instance will be accessed over the network rather than through a local socket.

Network access must be secured

When running a networked instance of MariaDB, please keep in mind that any user on Sherlock will be able to connect to the TCP ports that mysqld runs on, and that proper configuration must be done to prevent unauthorized access.

Like in the single-node case, you need to create a ~/.my.cnf file, but without the skip-networking directive.

$ export DB_DIR=$SCRATCH/db
+$ mkdir $DB_DIR
+
+$ cat << EOF > ~/.my.cnf
+[mysqld]
+datadir=$DB_DIR
+socket=$DB_DIR/mariadb.sock
+user=$USER
+symbolic-links=0
+
+[mysqld_safe]
+log-error=$DB_DIR/mariadbd.log
+pid-file=$DB_DIR/mariadbd.pid
+
+[mysql]
+socket=$DB_DIR/mariadb.sock
+EOF
+

And then initialize the database:

$ ml system mariadb
+$ $MARIADB_DIR/scripts/mysql_install_db --basedir=$MARIADB_DIR  --datadir=$DB_DIR
+

Secure access#

We will now set a password for the MariaDB root user to a random string, just for the purpose of preventing unauthorized access, since we won't need it for anything.

We will actually create a MariaDB user with all privileges on the databases, that will be able to connect to this instance from any node. This user will need a real password, though. So please make sure to replace the my-secure-password string below by the actual password of your choice.

Choose a proper password

This password will only be used to access this specific instance of MariaDB. Note that anybody knowing that password will be allowed to connect to your MariaDB instances and modify data in the tables.

  • do NOT literally use my-secure-password
  • do NOT use your SUNet ID password

Once you've chosen your password, you can start the mysqld process on a compute node, like before:

$ srun --pty bash
+$ echo $SLURM_JOB_NODELIST
+sh-01-01
+$ ml system mariadb
+$ mysqld_safe
+

And then, from another terminal, run the following commands to secure access to your MariaDB database.

$ ssh sh-01-01
+$ mysql -u root << EOF
+UPDATE mysql.user SET Password=PASSWORD(RAND()) WHERE User='root';
+DELETE FROM mysql.user WHERE User='root' AND Host NOT IN ('localhost', '127.0.0.1', '::1');
+DELETE FROM mysql.user WHERE User='';
+DELETE FROM mysql.db WHERE Db='test' OR Db='test_%';
+GRANT ALL PRIVILEGES ON *.* TO '$USER'@'%' IDENTIFIED BY 'my-secure-password' WITH GRANT OPTION;
+FLUSH PRIVILEGES;
+EOF
+

Once you've done that, you're ready to terminate that interactive job, and start a dedicated MariaDB server job.

Start MariaDB in a job#

You can use the following mariadb.sbatch job as a template:

#!/bin/bash
+
+#SBATCH --job-name=mariadb
+#SBATCH --time=8:0:0
+#SBATCH --dependency=singleton
+
+ml system mariadb
+mysqld_safe
+

and submit it with:

$ sbatch mariadb.sbatch
+

Concurrent instances will lead to data corruption

An important thing to keep in mind is that having multiple instances of a MariaDB server running at the same time, using the same database files, will certainly lead to catastrophic situations and the corruption of those files.

To prevent this from happening, the --dependency=singleton job submission option will make sure that only one instance of that job (based on its name and user) will run at any given time.

Connect to the running instance#

Now, from any node on Sherlock, whether from a login node, an interactive job, or a batch job, using the mysql CLI or any application binding in any language, you should be able to connect to your running MariaDB instance.

First, identify the node your job is running on with squeue:

$ squeue -u $USER -n mariadb
+             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
+          21383445    normal  mariadb   kilian  R       0:07      1 sh-01-02
+

and then, point your MariaDB client to that node:

$ ml system mariadb
+$ mysql -h sh-01-02 -p
+Enter password:
+Welcome to the MariaDB monitor.  Commands end with ; or \g.
+Your MariaDB connection id is 15
+Server version: 10.2.11-MariaDB Source distribution
+
+Copyright (c) 2000, 2017, Oracle, MariaDB Corporation Ab and others.
+
+Type 'help;' or '\h' for help. Type '\c' to clear the current input statement.
+
+MariaDB [(none)]>
+

That's it! You can now run SQL queries from anywhere on Sherlock to your own MariaDB instance.

Persistent DB instances#

SQL data is persistent

All the data you import in your SQL databases will be persistent across jobs. Meaning that you can run a MariaDB server job for the day, import data in its database, stop the job, and resubmit the same MariaDB server job the next day: all your data will still be there as long as the location you've chosen for your database (the $DB_DIR defined in the Preparation steps) is on a persistent storage location.

If you need database access for more than the maximum runtime of a job, you can use the instructions provided to define self-resubmitting recurring jobs and submit long-running database instances.

\ No newline at end of file diff --git a/docs/software/using/matlab/index.html b/docs/software/using/matlab/index.html new file mode 100644 index 000000000..0a9308161 --- /dev/null +++ b/docs/software/using/matlab/index.html @@ -0,0 +1,78 @@ + Matlab - Sherlock

Matlab

Introduction#

MATLAB is a numerical computing environment and proprietary programming language developed by MathWorks.

More documentation#

The following documentation is specifically intended for using Matlab on Sherlock. For more complete documentation about Matlab in general, please see the official MATLAB documentation.

MATLAB on Sherlock#

Licensing#

MATLAB is a commercial software suite, which is now available at no cost for all Stanford Faculty, students, and staff.

Note: a number of free, open-source alternatives exist and can be used in many situations: Octave, R, Julia, or Python are all available on Sherlock, and can often replace MATLAB with good results.

Using MATLAB#

The MATLAB module can be loaded with:

$ ml load matlab
+

This will load the current default version. For a list of available versions run ml spider matlab at the Sherlock prompt, or refer to the Software list page.

MATLAB can't run on login nodes

Running MATLAB directly on login nodes is not supported and will produce the following message:

-----------------------------------------------------------------------
+WARNING: running MATLAB directly on login nodes is not supported.  Please
+make sure you request an interactive session on a compute node with "sh_dev"
+for instance) before launching MATLAB interactively.
+-----------------------------------------------------------------------
+
You will need to submit a job or request an interactive session on a compute node before you can start MATLAB.

Once you are on a compute node and your environment is configured (i.e. when the matlab module is loaded), MATLAB can be started by simply typing matlab at the shell prompt.

$ sh_dev
+$ ml load matlab
+$ matlab
+MATLAB is selecting SOFTWARE OPENGL rendering.
+                          < M A T L A B (R) >
+                Copyright 1984-2019 The MathWorks, Inc.
+                R2019a (9.6.0.1072779) 64-bit (glnxa64)
+                             March 8, 2019
+
+To get started, type doc.
+For product information, visit www.mathworks.com.
+
+>>
+

For a listing of command line options:

$ matlab -help
+

Running a MATLAB script#

There are several ways to launch a MATLAB script on the command line, as documented in the MATLAB documentation:

Method Output
matlab -nodesktop < script.m MATLAB will run the code from script.m and display output on stdout
matlab -nodisplay Start MATLAB in CLI mode, without its graphical desktop environment
matlab -nojvm do not start the JVM1

MATLAB GUI#

It's often best to use your laptop or desktop to develop, debug MATLAB and visualize the output. If you do need to use the MATLAB GUI on a large cluster like Sherlock, you will need to enable X11 forwarding in your SSH client.

For instance:

$ ssh -X <YourSUNetID>@login.sherlock.stanford.edu
+

And then, once on Sherlock:

$ sh_dev
+$ ml load matlab
+$ matlab
+

For more info on X11 forwarding, you can refer to this UIT page.

Examples#

Simple MATLAB job#

Here is an example MATLAB batch script that can be submitted with sbatch:

#!/bin/bash
+#SBATCH --job-name=matlab_test
+#SBATCH --output=matlab_test."%j".out
+#SBATCH --error=matlab_test."%j".err
+#SBATCH --partition=normal
+#SBATCH --time=00:10:00
+#SBATCH --cpus-per-task=1
+#SBATCH --mem=8G
+#SBATCH --mail-type=ALL
+
+module load matlab
+matlab -nodisplay < example.m
+

This simple job, named matlab_test, will run a MATLAB script named example.m in the normal partition, for a duration of 10 minutes, and use 1 CPU and 8GB of RAM. It will send you an email (to whatever email you used when you signed up for Sherlock) when it begins, ends or fails.

Additionally, to aid in debugging, it will log any errors and output to the files matlab_test.JOBID.{out,err} with the jobid appended to the filename (%j).

To create the script, open a text editor on Sherlock, copy the contents of the script, and save it as matlab_test.sbatch

Then, submit the job with the sbatch command:

$ sbatch matlab_test.sbatch
+Submitted batch job 59942277
+

You can check the status of the job with the squeue command, and check the contents of the matlab_test.JOBID.{out,err} files to see the results.

Parallel loop#

You can run your MATLAB code across multiple CPUs on Sherlock using parfor loops, to take advantage of the multiple CPU cores that each node features. You can submit a job requesting as many CPUs as there are on a node in a single job. The key is to grab the SLURM environment variable $SLURM_CPUS_PER_TASK and create the worker pool in your MATLAB code with:

parpool('local', str2num(getenv('SLURM_CPUS_PER_TASK')))
+

Here is an example of a sbatch submission script that requests 16 CPUs on a node, and runs a simple MATLAB script using parfor.

Save the two scripts below as parfor.sbatch and parfor_loop.m:

#!/bin/bash
+#SBATCH -J pfor_matlab
+#SBATCH -o pfor".%j".out
+#SBATCH -e pfor".%j".err
+#SBATCH -t 20:00
+#SBATCH -p normal
+#SBATCH -c 16
+#SBATCH --mail-type=ALL
+
+module load matlab
+matlab -batch parfor_loop
+
%============================================================================
+% Parallel Monte Carlo calculation of PI
+%============================================================================
+parpool('local', str2num(getenv('SLURM_CPUS_PER_TASK')))
+R = 1;
+darts = 1e7;
+count = 0;
+tic
+parfor i = 1:darts
+   % Compute the X and Y coordinates of where the dart hit the...............
+   % square using Uniform distribution.......................................
+   x = R*rand(1);
+   y = R*rand(1);
+   if x^2 + y^2 <= R^2
+      % Increment the count of darts that fell inside of the.................
+      % circle...............................................................
+     count = count + 1; % Count is a reduction variable.
+   end
+end
+% Compute pi.................................................................
+myPI = 4*count/darts;
+T = toc;
+fprintf('The computed value of pi is %8.7f.\n',myPI);
+fprintf('The parallel Monte-Carlo method is executed in %8.2f seconds.\n', T);
+delete(gcp);
+exit;
+

You can now submit the job with the following command:

sbatch parfor.sbatch
+

If you run htop or pstree -u $USER on the compute node that is running your job, you will see all 16 cores allocated to your MATLAB code.

You can also try that same job with different numbers of CPUs, and see how well it scales.


  1. MATLAB uses the Java® Virtual Machine (JVM™) software to run the desktop and to display graphics. The -nojvm option enables you to start MATLAB without the JVM. Using this option minimizes memory usage and improves initial start-up speed, but restricts functionality. 

\ No newline at end of file diff --git a/docs/software/using/perl/index.html b/docs/software/using/perl/index.html new file mode 100644 index 000000000..c914e05aa --- /dev/null +++ b/docs/software/using/perl/index.html @@ -0,0 +1,6 @@ + Perl - Sherlock

Perl

Introduction#

Perl is a high-level, general-purpose, interpreted, dynamic programming language. Originally developed by Larry Wall in 1987 as a general-purpose Unix scripting language to make report processing easier, it has since undergone many changes and revisions.

Perl provides a framework allowing users to easily extend the language by installing new modules in their local environment. The Comprehensive Perl Archive Network (CPAN1) is an archive of over 25,000 distributions of software written in Perl, as well as documentation for it. It is searchable at http://metacpan.org or http://search.cpan.org and mirrored in over 270 locations around the world.

More documentation#

The following documentation is specifically intended for using Perl on Sherlock. For more complete documentation about Perl in general, please see the Perl documentation.

Perl modules on Sherlock#

To install Perl modules from CPAN, we recommend using the (provided) App::cpanminus module and local::lib modules:

  • App::cpanminus is a popular alternative CPAN client that can be used to manage Perl distributions. It has many great features, including uninstalling modules.
  • local::lib allows users to install Perl modules in the directory of their choice (typically their home directory) without administrative privileges.

Both are already installed on Sherlock, and are automatically enabled and configured when you load the perl module. You don't need to add anything in your ~/.bashrc file, the Sherlock perl module will automatically create everything that is required so you can directly run a command to install Perl modules locally.

Installation#

Perl modules installation is only necessary once

You only need to install Perl modules once on Sherlock. Since filesystems are shared, modules installed on one node will immediately be available on all nodes on the cluster.

As an example, to install the DateTime::TimeZone module, you can do the following:

$ ml perl
+$ cpanm DateTime::TimeZone
+

Usage#

Once installed, you can use the Perl modules directly, no specific options or syntax is required.

For instance, to check that the DateTime::TimeZone module is correctly installed:

$ perl -MDateTime::TimeZone -e 'print $DateTime::TimeZone::VERSION . "\n"';
+2.13
+

Uninstallation#

To uninstall a Perl module:

$ cpanm -U DateTime::TimeZone
+

  1. CPAN can denote either the archive network itself, or the Perl program that acts as an interface to the network and as an automated software installer (somewhat like a package manager). Most software on CPAN is free and open source. 

\ No newline at end of file diff --git a/docs/software/using/postgresql/index.html b/docs/software/using/postgresql/index.html new file mode 100644 index 000000000..f814f055f --- /dev/null +++ b/docs/software/using/postgresql/index.html @@ -0,0 +1,63 @@ + PostgreSQL - Sherlock

PostgreSQL

Introduction#

PostgreSQL is a powerful, open source object-relational database system with a strong focus on reliability, feature robustness, and performance.

More documentation#

The following documentation is specifically intended for using PostgreSQL on Sherlock. For more complete documentation about PostgreSQL in general, please see the PostgreSQL documentation.

PostgreSQL on Sherlock#

We don't provide any centralized database service on Sherlock, but we provide a centralized installation of PostgreSQL, and each user is welcome to start their own instance of the database server to fit their jobs' needs.

The overall process to run an instance of PostgreSQL on Sherlock would look like this:

  1. configure and initialize your environment so you can start a database instance under your user account,
  2. start the database server,
  3. run SQL queries from the same node (via a local socket), or from other nodes and/or jobs (via the network).

Single-node access#

In that example, the database server and client will run within the same job, on the same compute node.

Preparation#

You first need to let PostgreSQL know where to store its database. The commands below only need to be executed once.

Assuming you'll want to store your database files in a db/ directory in your $SCRATCH folder, you can run the following commands:

$ export DB_DIR=$SCRATCH/db
+$ mkdir $DB_DIR
+

Once you have your $DB_DIR in place, you need to initialize your database with some internal data that PostgreSQL needs. In the same terminal, run the following commands:

$ ml system postgresql
+$ initdb $DB_DIR
+

Start the server#

You can now start the PostgreSQL server. For this, first get an allocation on a compute node, note the hostname of the compute node your job has been allocated, load the postgresql module, and then run the postgresql server:

$ srun --pty bash
+$ echo $SLURM_JOB_NODELIST
+sh-01-01
+$ ml system postgresql
+$ export DB_DIR=$SCRATCH/db
+$ postgres -D $DB_DIR
+[...]
+2018-10-09 17:42:08.094 PDT [3841] LOG:  database system is ready to accept connections
+

The postgres process will be blocking, meaning it will not give the prompt back for as long as the PostgreSQL server runs.

Run queries#

You're now ready to run queries against that PostgreSQL instance, from the same node your job is running on.

From another terminal on Sherlock, connect to your job's compute node (here, it's sh-01-01, as shown above), load the postgresql module, and then run the createdb command: it will create a database that you can use as a testbed:

$ ssh sh-01-01
+$ ml system postgresql
+$ createdb test_db
+

Once this is done, from the same shell, you can run the psql command, which will open the PostgreSQL shell, ready to run your SQL queries:

$ psql test_db
+psql (10.5)
+Type "help" for help.
+
+test_db=#
+

Once you're done with your PostgreSQL instance, you can just terminate your job, and all the processes will be terminated automatically.

Multi-node access#

In case you need to run a more persistent instance of PostgreSQL, you can for instance submit a dedicated job to run the server, make it accessible over the network, and run queries from other jobs and/or nodes.

Enable network access#

The preparation steps are pretty similar to the single-node case, except the PostgreSQL server instance will be accessed over the network rather than through a local socket.

Network access must be secured

When running a networked instance of PostgreSQL, please keep in mind that any user on Sherlock could potentially be able to connect to the TCP ports that postgres runs on, and that proper configuration must be done to prevent unauthorized access.

Like in the single-node case, you need to start the postgres server process, but with the -i option to enable network connections, and define user access in your $DB_DIR/pg_hba.conf file (see below).

Secure access#

To allow network connections to the database server, a password will need to be defined for the PostgreSQL user. That will allow this user to connect to the PostgreSQL instance from any node. Please make sure to replace the my-secure-password string below by the actual password of your choice.

Choose a proper password

This password will only be used to access this specific instance of PostgreSQL. Note that anybody knowing that password will be allowed to connect to your PostgreSQL instances and modify data in the tables.

  • do NOT use my-secure-password
  • do NOT use your SUNet ID password

Once you've chosen your password, you can now start the PostgreSQL server on a compute node, as described in the previous section, initialize the database, and set the user password:

$ srun --pty bash
+
+$ echo $SLURM_JOB_NODELIST
+sh-01-01
+$ export DB_DIR=$SCRATCH/db
+$ mkdir $DB_DIR
+
+$ ml system postgresql
+$ initdb $DB_DIR
+$ createdb test_db
+
+$ psql -c "ALTER USER $USER PASSWORD 'my-secure-password';" test_db
+

Then, we need to edit the $DB_DIR/pg_hba.conf file to allow network access for user $USER:

$ cat << EOF > $DB_DIR/pg_hba.conf
+local   all             all                                     trust
+host    all             all             127.0.0.1/32            trust
+host    all             all             ::1/128                 trust
+host    all             $USER           samenet                 md5
+EOF
+

Once you've done that, you're ready to terminate that interactive job, and start a dedicated PostgreSQL server job.

$ pg_ctl stop -D $DB_DIR
+$ logout
+

Start PostgreSQL in a job#

You can use the following postgresql.sbatch job as a template:

#!/bin/bash
+
+#SBATCH --job-name=postgresql
+#SBATCH --time=8:0:0
+#SBATCH --dependency=singleton
+
+export DB_DIR=$SCRATCH/db
+
+ml system postgresql
+
+postgres -i -D $DB_DIR
+

and submit it with:

$ sbatch postgresql.sbatch
+

Concurrent instances will lead to data corruption

An important thing to keep in mind is that having multiple instances of a PostgreSQL server running at the same time, using the same database files, will certainly lead to catastrophic situations and the corruption of those files.

To prevent this from happening, the --dependency=singleton job submission option will make sure that only one instance of that job (based on its name and user) will run at any given time.

Connect to the running instance#

Now, from any node on Sherlock, whether from a login node, an interactive job, or a batch job, using the psql CLI or any application binding in any language, you should be able to connect to your running PostgreSQL instance.

First, identify the node your job is running on with squeue:

$ squeue -u $USER -n postgresql
+             JOBID PARTITION       NAME     USER ST       TIME  NODES NODELIST(REASON)
+          21383445    normal postgresql   kilian  R       0:07      1 sh-01-02
+

and then, point your PostgreSQL client to that node:

$ ml system postgresql
+$ psql -h sh-01-02 test_db
+Password:
+psql (10.5)
+Type "help" for help.
+
+test_db=#
+

That's it! You can now run SQL queries from anywhere on Sherlock to your own PostgreSQL instance.

Persistent DB instances#

SQL data is persistent

All the data you import in your SQL databases will be persistent across jobs. Meaning that you can run a PostgreSQL server job for the day, import data in its database, stop the job, and resubmit the same PostgreSQL server job the next day: all your data will still be there as long as the location you've chosen for your database (the $DB_DIR defined in the Preparation steps) is on a persistent storage location.

If you need database access for more than the maximum runtime of a job, you can use the instructions provided to define self-resubmitting recurring jobs and submit long-running database instances.

\ No newline at end of file diff --git a/docs/software/using/python/index.html b/docs/software/using/python/index.html new file mode 100644 index 000000000..c68e47c6b --- /dev/null +++ b/docs/software/using/python/index.html @@ -0,0 +1,63 @@ + Python - Sherlock

Python

Introduction#

Python is an interpreted high-level programming language for general-purpose programming. Its design philosophy emphasizes code readability. It provides constructs that enable clear programming on both small and large scales, which makes it both easy to learn and very well-suited for rapid prototyping.

More documentation#

The following documentation is specifically intended for using Python on Sherlock. For more complete documentation about Python in general, please see the Python documentation.

Python on Sherlock#

Sherlock features multiple versions of Python.

Some applications only work with legacy features of version 2.x, while more recent code will require specific version 3.x features. Modules on Sherlock may only be available in a single flavor (as denoted by their suffix: _py27 or _py36), because the application only supports one or the other.

You can load either version on Sherlock by doing the following commands:

$ ml python/2.7.13
+

or

$ ml python/3.6.1
+

The Python3 interpreter is python3

The Python3 executable is named python3, not python. So, once you have the "python/3.6.1" module loaded on Sherlock, you will need to use python3 to invoke the proper interpreter. python will still refer to the default, older system-level Python installation, and may result in errors when trying to run Python3 code.

This is an upstream decision detailed in PEP-394, not something specific to Sherlock.

Using Python#

Once your environment is configured (ie. when the Python module is loaded), Python can be started by simply typing python at the shell prompt:

$ python
+Python 2.7.13 (default, Apr 27 2017, 14:19:21)
+[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux2
+Type "help", "copyright", "credits" or "license" for more information.
+>>>
+

Python in batch jobs#

Python output is buffered by default

By default, Python buffers console output. It means that when running Python in a batch job through Slurm, you may see output less often than you would when running interactively.

When output is being buffered, the print statements are aggregated until there is enough data to print, and then the messages are all printed at once. As a consequence, job output files (as specified with the --output and --error job submission options) will be refreshed less often and may give the impression that the job is not running.

For debugging or checking that a Python script is producing the correct output, you may want to switch off buffering.

Switching off buffering#

For a single python script you can use the -u option, as in python -u my_script.py. The -u option stands for "unbuffered".

For instance:

#!/bin/bash
+#SBATCH -n 1
+
+python -u my_script.py
+

Tip

You can also use the environment variable PYTHONUNBUFFERED to set unbuffered I/O for your whole batch script.

#!/bin/bash
+#SBATCH -n 1
+
+export PYTHONUNBUFFERED=True
+python my_script.py
+

NB: There is some performance penalty for having unbuffered print statements, so you may want to reduce the number of print statements, or run buffered for production runs.

Python packages#

The capabilities of Python can be extended with packages developed by third parties. In general, to simplify operations, it is left up to individual users and groups to install these third-party packages in their own directories. However, Sherlock provides tools to help you install the third-party packages that you need.

Among many others, the following common Python packages are provided on Sherlock:

Python modules on Sherlock generally follow the naming scheme below:

py-<package_name>/version_py<python_version>
+

For instance, NumPy modules are:

You can list all available module versions for a package with ml spider <package_name>. For instance:

$ ml spider tensorflow
+-------------------------------------------------------------------------------
+  py-tensorflow:
+-------------------------------------------------------------------------------
+    Description:
+      TensorFlow™ is an open source software library for numerical computation using data flow graphs.
+
+     Versions:
+        py-tensorflow/1.6.0_py27
+        py-tensorflow/1.6.0_py36
+        py-tensorflow/1.7.0_py27
+        py-tensorflow/1.9.0_py27
+        py-tensorflow/1.9.0_py36
+

Dependencies are handled automatically

When you decide to use NumPy on Sherlock, you just need to load the py-numpy module of your choice, and the correct Python interpreter will be loaded automatically. No need to load a python module explicitly.

Installing packages#

If you need to use a Python package that is not already provided as a module on Sherlock, you can use the pip command. This command takes care of compiling and installing most Python packages and their dependencies. All of pip's commands and options are explained in detail in the Pip user guide.

A comprehensive index of Python packages can be found at PyPI.

To install Python packages with pip, you'll need to use the --user option. This will make sure that those packages are installed in a user-writable location (by default, your $HOME directory). Since your $HOME directory is shared across nodes on Sherlock, you'll only need to install your Python packages once, and they'll be ready to be used on every single node in the cluster.

For example:

$ pip install --user <package_name>
+

For Python 3, use pip3:

$ pip3 install --user <package_name>
+

Python packages will be installed in $HOME/.local/lib/python<version>/site-packages, meaning that packages for Python 2.x and Python 3.x will be kept separate. This means both that they won't interfere with each other, and that if you need to use a package with both Python 2.x and 3.x, you'll need to install it twice, once for each Python version.

List installed packages#

You can easily see the list of the Python packages installed in your environment, and their location, with pip list:

$ pip list -v
+Package    Version Location                                                            Installer
+---------- ------- ------------------------------------------------------------------- ---------
+pip        18.1    /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip
+setuptools 28.8.0  /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip
+urllib3    1.24    /home/users/kilian/.local/lib/python2.7/site-packages               pip
+virtualenv 15.1.0  /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip
+
Alternative installation path#

Python paths

While theoretically possible, installing Python packages in alternate locations can be tricky, so we recommend trying to stick to the pip install --user way as often as possible. But in case you absolutely need it, we provide some guidelines below.

One common case of needing to install Python packages in alternate locations is to share those packages with a group of users. Here's an example that will show how to install the urllib3 Python package in a group-shared location and let users from the group use it without having to install it themselves.

First, you need to create a directory to store those packages. We'll put it in $GROUP_HOME:

$ mkdir -p $GROUP_HOME/python/
+

Then, we load the Python module we need, and we instruct pip to install its packages in the directory we just created:

$ ml python/2.7.13
+$ PYTHONUSERBASE=$GROUP_HOME/python pip install --user urllib3
+

We still use the --user option, but with PYTHONUSERBASE pointing to a different directory, pip will install packages there.

Now, to be able to use that Python module, since it's not been installed in a default directory, you (and all the members of the group who will want to use that module) need to set their PYTHONPATH to include our new shared directory1:

$ export PYTHONPATH=$GROUP_HOME/python/lib/python2.7/site-packages:$PYTHONPATH
+

And now, the module should be visible:

$ pip list -v
+Package    Version Location                                                            Installer
+---------- ------- ------------------------------------------------------------------- ---------
+pip        18.1    /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip
+setuptools 28.8.0  /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip
+urllib3    1.24    /home/groups/ruthm/python/lib/python2.7/site-packages               pip
+virtualenv 15.1.0  /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip
+

$PYTHONPATH depends on the Python version

The $PYTHONPATH environment variable is dependent on the Python version you're using, so for Python 3.6, it should include $GROUP_HOME/python/lib/python3.6/site-packages

$PATH may also need to be updated

Some Python packages also install executable scripts. To make them easily accessible in your environment, you may also want to modify your $PATH to include their installation directory.

For instance, if you installed Python packages in $GROUP_HOME/python:

$ export PATH=$GROUP_HOME/python/bin:$PATH
+

Installing from GitHub#

pip also supports installing packages from a variety of sources, including GitHub repositories.

For instance, to install HTTPie, you can do:

$ pip install --user git+https://github.com/jkbr/httpie.git
+
Installing from a requirements file#

pip allows installing a list of packages listed in a file, which can be pretty convenient to install several dependencies at once.

In order to do this, create a text file called requirements.txt and place each package you would like to install on its own line:

numpy
+scikit-learn
+keras
+tensorflow
+

You can now install your modules like so:

$ ml python
+$ pip install --user -r requirements.txt
+

Upgrading packages#

pip can update already installed packages with the following command:

$ pip install --user --upgrade <package_name>
+

Upgrading packages also works with requirements.txt files:

$ pip install --user --upgrade -r requirements.txt
+

Uninstalling packages#

To uninstall a Python package, you can use the pip uninstall command (note that it doesn't take any --user option):

$ pip uninstall <package_name>
+$ pip uninstall -r requirements.txt
+

  1. This line can also be added to a user's ~/.profile file, for a more permanent setting. 

\ No newline at end of file diff --git a/docs/software/using/quantum-espresso/index.html b/docs/software/using/quantum-espresso/index.html new file mode 100644 index 000000000..7acc00ae6 --- /dev/null +++ b/docs/software/using/quantum-espresso/index.html @@ -0,0 +1,73 @@ + Quantum Espresso - Sherlock

Quantum Espresso

Introduction#

Quantum ESPRESSO is an integrated suite of Open-Source computer codes for electronic-structure calculations and materials modeling at the nanoscale. It is based on density-functional theory, plane waves, and pseudo-potentials.

Quantum ESPRESSO has evolved into a distribution of independent and inter-operable codes in the spirit of an open-source project. The Quantum ESPRESSO distribution consists of a “historical” core set of components, and a set of plug-ins that perform more advanced tasks, plus a number of third-party packages designed to be inter-operable with the core components. Researchers active in the field of electronic-structure calculations are encouraged to participate in the project by contributing their own codes or by implementing their own ideas into existing codes.

More documentation#

The following documentation is specifically intended for using Quantum Espresso on Sherlock. For more complete documentation about Quantum Espresso in general, please see the Quantum Espresso documentation.

Quantum Espresso on Sherlock#

To run Quantum Espresso on Sherlock, you can use one of the provided modules, or run it from a container.

The CPU version of Quantum Espresso can be loaded via the quantum-espresso module:

$ ml chemistry quantum-espresso
+

and the GPU version can be loaded via the quantum-espresso_gpu module:

$ ml chemistry quantum-espresso_gpu
+

Examples#

Here are a few examples showing how to run the AUSURF112 benchmark.

Preparation#

The first step is to get the benchmark files:

$ cd $SCRATCH
+$ git clone https://github.com/QEF/benchmarks qe_benchmarks
+$ cd qe_benchmarks/AUSURF112
+

CPU version#

To submit a Quantum Espresso job to run the AUSURF112 benchmark on CPU nodes, the following submission script can be used:

#!/bin/bash
+#SBATCH --nodes=2                # number of nodes for the job
+#SBATCH --ntasks-per-node=16     # number of tasks per node
+#SBATCH --time=00:30:00          # total run time limit (HH:MM:SS)
+#SBATCH --mail-type=begin        # send email when job begins
+#SBATCH --mail-type=end          # send email when job ends
+
+module reset
+module load chemistry
+module load quantum-espresso/7.0
+
+cd $SCRATCH/qe_benchmarks
+cd AUSURF112
+
+srun pw.x -input ausurf.in -npool 2
+

In this example, the job will request 32 CPU cores on 2 nodes, 30 minutes of run time, and will send an email notification when the job starts and when it ends.

The job can be submitted with:

$ sbatch qe-bench_cpu.sbatch
+

GPU version#

Native#

The GPU version can be loaded through the quantum-espresso_gpu module.

Using the same benchmark files as for the CPU version above, you can create a job submission script like this:

#!/bin/bash
+#SBATCH --partition=gpu          # partition to submit the job to
+#SBATCH --nodes=2                # number of nodes for the job
+#SBATCH --gpus-per-node=1        # number of GPUs per node
+#SBATCH --time=00:30:00          # total run time limit (HH:MM:SS)
+#SBATCH --mail-type=begin        # send email when job begins
+#SBATCH --mail-type=end          # send email when job ends
+
+module reset
+module load chemistry
+module load quantum-espresso_gpu/7.0
+
+cd $SCRATCH/qe_benchmarks
+cd AUSURF112
+
+srun pw.x -input ausurf.in -npool 2
+

In this example, the job will request 2 GPUs on 2 nodes (1 GPU per node), 30 minutes of run time, and will send an email notification when the job starts and when it ends.

It can be submitted with:

$ sbatch qe-bench_gpu.sbatch
+

NGC container#

Another option to run a GPU version of Quantum Espresso is to use a 3rd-party container.

The NVIDIA GPU Cloud (NGC) hosts a Quantum Espresso container that could be used on Sherlock.

With Singularity#

To use the container with Singularity, first pull the Quantum Espresso container with:

$ cd $SCRATCH
+$ singularity pull docker://nvcr.io/hpc/quantum_espresso:qe-7.0
+

Then create the following script:

#!/bin/bash
+#SBATCH --partition=gpu          # partition to submit the job to
+#SBATCH --nodes=2                # number of nodes for the job
+#SBATCH --gpus-per-node=1        # number of GPUs per node
+#SBATCH --mem=32GB               # memory per node
+#SBATCH --time=00:30:00          # total run time limit (HH:MM:SS)
+#SBATCH --mail-type=begin        # send email when job begins
+#SBATCH --mail-type=end          # send email when job ends
+
+cd $SCRATCH/qe_benchmarks
+cd AUSURF112
+
+srun singularity run --nv \
+    $SCRATCH/quantum_espresso_qe-7.0.sif \
+    pw.x -input ausurf.in -npool 2
+

and submit it:

$ sbatch qe-bench_gpu_singularity.sbatch
+
With pyxis/enroot#

To use the container with pyxis/enroot, you can directly submit the following script:

#!/bin/bash
+#SBATCH --partition=gpu          # partition to submit the job to
+#SBATCH --nodes=2                # number of nodes for the job
+#SBATCH --gpus-per-node=1        # number of GPUs per node
+#SBATCH --mem=32GB               # memory per node
+#SBATCH --time=00:30:00          # total run time limit (HH:MM:SS)
+#SBATCH --mail-type=begin        # send email when job begins
+#SBATCH --mail-type=end          # send email when job ends
+
+cd $SCRATCH/qe_benchmarks
+cd AUSURF112
+
+srun --container-image nvcr.io/hpc/quantum_espresso:qe-7.0 \
+     --container-workdir $PWD \
+     pw.x -input ausurf.in -npool 2
+

and submit it:

$ sbatch qe-bench_gpu_enroot.sbatch
+
\ No newline at end of file diff --git a/docs/software/using/rclone/index.html b/docs/software/using/rclone/index.html new file mode 100644 index 000000000..41c57863f --- /dev/null +++ b/docs/software/using/rclone/index.html @@ -0,0 +1,134 @@ + Rclone - Sherlock

Rclone

Introduction#

If you need to sync files between cloud storage and Sherlock, rclone is a command line program that can help. You can easily use it to transfer files from a cloud storage provider to Sherlock or Oak, or vice versa. The following tutorial walks through transferring files between Google Drive and Oak storage.

More documentation#

For more information on running rclone, please see the official documentation.

Setup#

rclone config#

Before transferring data for the first time, you will need to configure rclone so that it can access your Google Drive. This will require use of your browser, so you will need to connect to Sherlock with local port forwarding (ssh -L). You only need to do this when you are configuring rclone for the first time.

When running rclone config you will be prompted to enter names and values, indicated by the > symbol. To leave it empty, press Enter.

# Connect to Sherlock with local port forwarding
+$ ssh -L localhost:53682:localhost:53682 <SUNetID>@login.sherlock.stanford.edu
+
+
+# Load the rclone module
+$ ml system rclone
+
+
+# Run the rclone configuration tool
+$ rclone config
+
+No remotes found, make a new one?
+n) New remote
+s) Set configuration password
+q) Quit config
+n/s/q> n
+
+Enter name for new remote.
+name> gdrive
+
+Option Storage.
+Type of storage to configure.
+Choose a number from below, or type in your own value.
+ 1 / 1Fichier
+   \ (fichier)
+ 2 / Akamai NetStorage
+   \ (netstorage)
+       ...
+18 / Google Drive
+   \ (drive)
+       ...
+48 / premiumize.me
+   \ (premiumizeme)
+49 / seafile
+   \ (seafile)
+Storage> drive
+
+Option client_id.
+Google Application Client Id
+...
+Enter a value. Press Enter to leave empty.
+client_id>
+
+Option client_secret.
+OAuth Client Secret.
+Leave blank normally.
+Enter a value. Press Enter to leave empty.
+client_secret>
+
+Option scope.
+Scope that rclone should use when requesting access from drive.
+Choose a number from below, or type in your own value.
+Press Enter to leave empty.
+ 1 / Full access all files, excluding Application Data Folder.
+   \ (drive)
+...
+scope> 1
+
+Option service_account_file.
+Service Account Credentials JSON file path.
+Leave blank normally.
+...
+Enter a value. Press Enter to leave empty.
+service_account_file>
+
+Edit advanced config?
+y) Yes
+n) No (default)
+y/n> n
+
+Use auto config?
+ * Say Y if not sure
+ * Say N if you are working on a remote or headless machine
+
+y) Yes (default)
+n) No
+y/n> y
+
+2023/09/12 10:51:55 NOTICE: If your browser doesn't open automatically go to the
+following link: http://127.0.0.1:53682/auth?state=#################
+2023/09/12 10:51:55 NOTICE: Log in and authorize rclone for access
+2023/09/12 10:51:55 NOTICE: Waiting for code...
+

At this point, you can copy and paste the provided link into your browser. You will be asked to confirm that you want to allow rclone to access your files. Once you have successfully done so, you can complete the configuration in the terminal.

Configure this as a Shared Drive (Team Drive)?
+
+y) Yes
+n) No (default)
+y/n> n
+
+Configuration complete.
+Options:
+...
+Keep this "gdrive" remote?
+y) Yes this is OK (default)
+e) Edit this remote
+d) Delete this remote
+y/e/d> y
+
+Current remotes:
+
+Name                 Type
+====                 ====
+gdrive               drive
+
+e) Edit existing remote
+n) New remote
+d) Delete remote
+r) Rename remote
+c) Copy remote
+s) Set configuration password
+q) Quit config
+e/n/d/r/c/s/q> q
+

Examples#

rclone copy#

To transfer data between cloud storage and Sherlock or Oak, you can use the rclone copy command.

# Start an interactive dev session
+$ sh_dev
+
+# Load the rclone module
+$ ml system rclone
+
+# Copy a folder from Google Drive to Oak
+$ rclone copy gdrive:<folder name> /oak/stanford/groups/<group_name>/<folder name>
+
+# Copy a single file from Oak to Google Drive
+$ rclone copy /oak/stanford/groups/<group name>/<file name> gdrive:
+

rclone ls/lsd#

To view the files and folders in your cloud storage, you can use the rclone ls and rclone lsd commands, respectively.

# Load the rclone module
+$ ml system rclone
+
+# List all top-level directories in Google Drive
+$ rclone lsd gdrive: --max-depth 1
+
+# List all files in a directory
+$ rclone ls gdrive:<folder name>
+
+# List all files on Google Drive (including those in folders)
+$ rclone ls gdrive:
+
\ No newline at end of file diff --git a/docs/software/using/schrodinger/index.html b/docs/software/using/schrodinger/index.html new file mode 100644 index 000000000..a16ec9358 --- /dev/null +++ b/docs/software/using/schrodinger/index.html @@ -0,0 +1,40 @@ + Schrödinger - Sherlock

Schrödinger

Introduction#

The Schrödinger suite is a commercial and licensed software used to simulate and model molecular behavior at the atomic level. The Schrödinger software tools include molecular dynamics simulations, quantum mechanics calculations, virtual screening and visualization tools.

More documentation#

The following documentation is specifically intended for using Schrödinger on Sherlock. For more complete documentation about Schrödinger in general, please contact Schrödinger support.

Schrödinger on Sherlock#

Licensing#

Stanford Libraries have purchased a site license for the Schrödinger suite. Please contact Stanford Libraries at sciencelibrary@stanford.edu and CC srcc-support@stanford.edu if you would like to access Schrödinger on Sherlock: after we receive confirmation, your PI group will be granted access on Sherlock.

Using Schrödinger#

You can use Schrödinger software after having loaded the corresponding software module with the module command. To load the current default version:

module load chemistry schrodinger
+

To see all the available versions, you can use the module spider command:

$ module spider schrodinger
+

Once loaded, the $SCHRODINGER environment variable is automatically set to allow all Schrödinger commands to run. For example, to run the jaguar command:

$ jaguar run -WAIT H20.in
+

To call the basic Schrödinger run command, just enter:

$ run
+

or glide:

$ glide
+usage: glide_startup.py [options] <input_file>
+glide_startup.py: error: the following arguments are required: input_file
+

Maestro GUI#

OnDemand shell sessions

Opening an X11/GUI session will not work in a Sherlock OnDemand terminal session. You will need to use the method mentioned below, i.e. a standard terminal session with an X11 client.

To launch the Maestro GUI, once you have loaded the Schrödinger module, simply run:

$ maestro
+

You'll need to enable X11 forwarding in your initial connection to Sherlock, and request it as well for your job allocation.

Here are some example commands you can run:

# on your local machine
+$ ssh -X login.sherlock.stanford.edu
+
+# then from a Sherlock login node
+$ sh_dev -m 16GB
+
+# and finally on the allocated compute node:
+$ ml load chemistry schrodinger
+$ maestro
+

This will launch Maestro on a compute node and display its graphical user interface on your local machine's display.

GUI performance

Please note that running graphical user interfaces (GUIs) over the network via X11 over SSH may not necessarily yield the best performance. Graphical analysis is often best done on a local machine, while intensive, batch scheduled computations are carried out on the cluster.

For more information about X11 forwarding, you can refer to this page.

Examples#

batch job submission#

Here's an example batch script, requesting 1 CPU, for 10 minutes on the normal partition, that can be saved as water.sbatch:

#!/usr/bin/bash
+#SBATCH -o water.%j.out
+#SBATCH -e water.%j.err
+#SBATCH -n 1
+#SBATCH -t 10:00
+#SBATCH -p normal
+
+# Load required modules
+module load chemistry schrodinger
+
+# Run Schrödinger, -WAIT is often required
+jaguar run -WAIT H20.in
+

Save this input file as H20.in:

&gen
+&
+&echo
+&
+&zmat
+O       0.0000000000000   0.0000000000000  -0.1135016000000
+H1      0.0000000000000   0.7531080000000   0.4540064000000
+H2      0.0000000000000  -0.7531080000000   0.4540064000000
+&
+

And you can submit the batch script with:

$ sbatch water.sbatch
+

After execution, you should find a H20.out output file in the current directory, as well as a log file (H20.log). If you don't, you can check for errors in the job output and error files: water.<jobid>.{out,err}.

\ No newline at end of file diff --git a/docs/software/using/singularity/index.html b/docs/software/using/singularity/index.html new file mode 100644 index 000000000..bdfd09420 --- /dev/null +++ b/docs/software/using/singularity/index.html @@ -0,0 +1,107 @@ + Singularity - Sherlock

Singularity#

Singularity is a tool for running containers on HPC systems, similar to Docker.

Introduction#

Containers are a solution to the problem of how to get software to run reliably when moved from one computing environment to another. They also resolve installation problems by packaging all the dependencies of an application within a self-sustainable image, a.k.a. a container.

What's a container?

Put simply, a container consists of an entire runtime environment: an application, plus all its dependencies, libraries and other binaries, and configuration files needed to run it, bundled into one package. By containerizing the application platform and its dependencies, differences in OS distributions and underlying infrastructure are abstracted away.

Why not Docker?#

Docker has long been the reference and the most popular container framework in DevOps and Enterprise IT environments, so why not use Docker on Sherlock? Well, for a variety of technical reasons, mostly related to security.

Docker has never been designed nor developed to run in multi-tenant environments, and even less on HPC clusters. Specifically:

  • Docker requires a daemon running as root on all of the compute nodes, which has serious security implications,
  • all authenticated actions (such as login, push ...) are also executed as root, meaning that multiple users can't use those functions on the same node,
  • Docker uses cgroups to isolate containers, as does the Slurm scheduler, which uses cgroups to allocate resources to jobs and enforce limits. Those uses are unfortunately conflicting.
  • but most importantly, allowing users to run Docker containers will give them root privileges inside that container, which will in turn let them access any of the clusters' filesystems as root. This opens the door to user impersonation, inappropriate file tampering or stealing, and is obviously not something that can be allowed on a shared resource.

That last point is certainly the single most important reason why we won't use Docker on Sherlock.

Why Singularity?#

Singularity is Docker for HPC systems

Singularity allows running Docker containers natively, and is a perfect replacement for Docker on HPC systems such as Sherlock. That means that existing Docker containers can be directly imported and natively run with Singularity.

Despite Docker's shortcomings on HPC systems, the appeal of containers for scientific computing is undeniable, which is why we provide Singularity on Sherlock. Singularity is an alternative container framework, especially designed to run scientific applications on HPC clusters.

Singularity provides the same functionalities as Docker, without any of the drawbacks listed above. Using a completely different implementation, it doesn't require any privilege to run containers, and allows direct interaction with existing Docker containers.

The main motivation to use Singularity over Docker is the fact that it's been developed with HPC systems in mind, to solve those specific problems:

  • security: a user in the container is the same user as the one running the container, so no privilege escalation possible,
  • ease of deployment: no daemon running as root on each node, a container is simply an executable,
  • no need to mount filesystems or do bind mappings to access devices,
  • ability to run MPI jobs based on containers,
  • and more...

More documentation#

The following documentation is specifically intended for using Singularity on Sherlock. For more complete documentation about building and running containers with Singularity, please see the Singularity documentation.

Singularity on Sherlock#

As announced during the SC'18 Supercomputing Conference, Singularity is an integral part of the Sherlock cluster, and Singularity commands can be executed natively on any login or compute node, without the need to load any additional module.

Importing containers#

Pre-built containers can be obtained from a variety of sources. For instance:

  • DockerHub contains containers for various software packages, which can be directly used with Singularity,
  • SingularityHub is a registry for scientific linux containers,
  • the NVIDIA GPU Cloud registry for GPU-optimized containers,
  • many individual projects contain specific instructions for installation via Docker and/or Singularity, and may provide pre-built images in other locations.

To illustrate how Singularity can import and run Docker containers, here's an example of how to install and run the OpenFOAM CFD solver using Singularity. OpenFOAM can be quite difficult to install manually, but Singularity makes it very easy.

Interactive or batch usage

This example shows how to use Singularity interactively, but Singularity containers can be run in batch jobs as well.

The first step is to request an interactive shell, and to load the singularity module. Singularity images can be pulled directly from the compute nodes, and Singularity uses multiple CPU cores when assembling the image, so requesting multiple cores in your job can make the pull operation faster:

$ srun -c 4 --pty bash
+

We recommend storing Singularity images in $GROUP_HOME, as container images can take significant space in your $HOME directory.

$ mkdir -p $GROUP_HOME/$USER/simg
+$ cd $GROUP_HOME/$USER/simg
+

Then, the OpenFOAM container could be pulled directly from DockerHub by Singularity. This can take a moment to complete:

$ singularity pull docker://openfoam/openfoam6-paraview54
+Docker image path: index.docker.io/openfoam/openfoam6-paraview54:latest
+Cache folder set to /scratch/users/kilian/.singularity/docker
+Importing: base Singularity environment
+Exploding layer: sha256:1be7f2b886e89a58e59c4e685fcc5905a26ddef3201f290b96f1eff7d778e122.tar.gz
+[...]
+Building Singularity image...
+Singularity container built: ./openfoam6-paraview54.simg
+Cleaning up...
+Done. Container is at: ./openfoam6-paraview54.simg
+

Running containers#

Once the image is downloaded, you are ready to run OpenFOAM from the container. The singularity shell command can be used to start the container, and run a shell within that image:

By default on Sherlock, all the filesystems that are available on the compute node will also be available in the container. If you want to start your shell in a specific directory, you can use the --pwd /path/ option. For instance, we'll create a /tmp/openfoam_test/ directory to store our test results (that will be wiped out at the end of the job), and start the container shell there:

$ mkdir /tmp/openfoam_test
+$ singularity shell --pwd /tmp/openfoam_test openfoam6-paraview54.simg
+Singularity: Invoking an interactive shell within container...
+Singularity openfoam6-paraview54.simg:/tmp/openfoam_test>
+

You're now in the container, as denoted by the shell prompt (Singularity[...].simg:[path]>), which is different from the prompt displayed on the compute node (which usually looks like [login]@[compute_node] [path]$).

OpenFOAM provides a convenience script that can be sourced to make OpenFOAM commands directly accessible and set a few useful environment variables:

> source /opt/openfoam6/etc/bashrc
+

Now, we can run a simple example using OpenFOAM:

> cp -r $FOAM_TUTORIALS/incompressible/simpleFoam/pitzDaily .
+> cd pitzDaily
+> blockMesh
+[...]
+End
+
+> simpleFoam
+/*---------------------------------------------------------------------------*\
+  =========                 |
+  \\      /  F ield         | OpenFOAM: The Open Source CFD Toolbox
+   \\    /   O peration     | Website:  https://openfoam.org
+    \\  /    A nd           | Version:  6
+     \\/     M anipulation  |
+\*---------------------------------------------------------------------------*/
+Build  : 6-1a0c91b3baa8
+Exec   : simpleFoam
+Date   : Oct 05 2018
+Time   : 23:37:30
+Host   : "sh01-06n33.int"
+PID    : 14670
+I/O    : uncollated
+Case   : /tmp/openfoam_test/pitzDaily
+nProcs : 1
+sigFpe : Enabling floating point exception trapping (FOAM_SIGFPE).
+fileModificationChecking : Monitoring run-time modified files using timeStampMaster (fileModificationSkew 10)
+allowSystemOperations : Allowing user-supplied system call operations
+
+// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
+Create time
+[...]
+SIMPLE solution converged in 288 iterations
+
+streamLine streamlines write:
+    seeded 10 particles
+    Tracks:10
+    Total samples:11980
+    Writing data to "/tmp/openfoam_test/pitzDaily/postProcessing/sets/streamlines/288"
+End
+
+>
+

When the simulation is done, you can exit the container with:

> exit
+

Because the container can see all the compute node's filesystems, the simulation output will be available in /tmp/openfoam_test after you exit the container:

$ ls /tmp/openfoam_test/pitzDaily/postProcessing/
+sets
+

GPU-enabled containers#

Sherlock also supports the use of container images provided by NVIDIA in the NVIDIA GPU Cloud (NGC). This registry provides GPU-accelerated containers for the most popular HPC and deep-learning scientific applications.

GPU support

Containers provided on NGC are only supported on Pascal and Volta architectures (TITAN Xp, Tesla P40, P100 or V100). For GPUs from the previous generations (GTX TITAN Black/X, Tesla K20/K80), things may or may not work.

We recommend making sure to select a supported GPU generation by adding the following directive to your batch script when submitting a job to run GPU-enabled containers from NGC:

#SBATCH -C "GPU_GEN:PSC|GPU_GEN:VLT"
+

Pulling NGC images#

As before, we start by requesting an interactive shell with multiple CPU cores, loading the Singularity module and moving to the directory where we'll save those images:

$ srun -c 4 --pty bash
+$ cd $GROUP_HOME/simg
+

A GPU is not required for pulling GPU-enabled containers

GPU-enabled containers can be pulled on any node, including nodes without a GPU. But their execution requires a GPU and thus, they need to be executed within a GPU job. See the GPU job section for more information.

To be able to pull an image from NGC, authentication credentials must be set. Users need to register and create an NGC API key; complete details can be found in the NGC Getting Started Guide.

You can then set the following environment variables to allow Singularity to authenticate with NGC:

$ export SINGULARITY_DOCKER_USERNAME='$oauthtoken'
+$ export SINGULARITY_DOCKER_PASSWORD=<NVIDIA NGC API key>
+

Note

The SINGULARITY_DOCKER_USERNAME environment variable must be set to the literal $oauthtoken string, for every user. It should not be replaced by anything else. Only the API key is specific to each user.

Once credentials are set in the environment, container images can be pulled from the NGC registry normally.

The general form of the Singularity command used to pull NGC containers is: $ singularity pull docker://nvcr.io/<registry>/<app:tag>

For example to pull the NAMD NGC container tagged with version 2.12-171025 the corresponding command would be:

$ singularity pull docker://nvcr.io/hpc/namd:2.12-171025
+

After this command has finished, we'll have a Singularity image file in the current directory, named namd-2.12-171025.simg.

Running NGC containers#

Instructions about running NGC containers are provided on the NGC website, under each application:

NAMD on NGC

Each application comes with specific running instructions, so we recommend to follow the container's particular guidelines before running it with Singularity.

Containers that lack Singularity documentation have not been tested with Singularity.

Since all NGC containers are optimized for GPU acceleration, they will always be executed with the --nv Singularity option, to enable GPU support within the container.

We also need to submit a job requesting a GPU to run GPU-enabled containers. For instance:

$ srun -p gpu -c 4 --gres gpu:1 --pty bash
+

This will start an interactive shell on a GPU node, with 4 CPU cores and 1 GPU.

The NAMD container that was pulled just before can now be started with the following commands. We start by creating a temporary directory to hold the execution results, and start the container using this as the current directory:

$ mkdir /tmp/namd_test
+$ singularity shell --nv --pwd /tmp/namd_test $GROUP_HOME/simg/namd-2.12-171025.simg
+Singularity: Invoking an interactive shell within container...
+
+Singularity namd-2.12-171025.simg:/tmp/namd_test>
+

From there, we can run a NAMD test to verify that everything is working as expected.

> cp -r /workspace/examples .
+> /opt/namd/namd-multicore +p4 +idlepoll examples/apoa1/apoa1.namd
+Charm++: standalone mode (not using charmrun)
+Charm++> Running in Multicore mode:  4 threads
+Charm++> Using recursive bisection (scheme 3) for topology aware partitions
+Converse/Charm++ Commit ID: v6.8.2
+[...]
+Info: Built with CUDA version 9000
+Did not find +devices i,j,k,... argument, using all
+Pe 1 physical rank 1 will use CUDA device of pe 2
+Pe 3 physical rank 3 will use CUDA device of pe 2
+Pe 0 physical rank 0 will use CUDA device of pe 2
+Pe 2 physical rank 2 binding to CUDA device 0 on sh02-14n13.int: 'TITAN Xp'  Mem: 12196MB  Rev: 6.1
+Info: NAMD 2.12 for Linux-x86_64-multicore-CUDA
+[...]
+Info: SIMULATION PARAMETERS:
+Info: TIMESTEP               1
+[...]
+ENERGY:    2000     20247.5090     20325.4554      5719.0088       183.9328        -340639.3103     25366.3986         0.0000         0.0000     46364.9951        -222432.0107       168.6631   -268797.0057   -222054.5175       168.8733          -1129.9509     -1799.6459    921491.4634     -2007.8380     -2007.4145
+
+WRITING EXTENDED SYSTEM TO OUTPUT FILE AT STEP 2000
+WRITING COORDINATES TO OUTPUT FILE AT STEP 2000
+The last position output (seq=-2) takes 0.001 seconds, 559.844 MB of memory in use
+WRITING VELOCITIES TO OUTPUT FILE AT STEP 2000
+The last velocity output (seq=-2) takes 0.001 seconds, 559.844 MB of memory in use
+====================================================
+
+WallClock: 17.593451  CPUTime: 17.497925  Memory: 559.843750 MB
+[Partition 0][Node 0] End of program
+

The simulation should take a few seconds to run. You can verify that it correctly executed on a GPU in the output above. When it's done, you can exit the container with:

> exit
+

Because the container can see all the compute node's filesystems, the simulation output will be available in /tmp/namd_test after you exit the container:

$ cd /tmp/namd_test/examples/apoa1/
+$ ls apoa1-out*
+apoa1-out.coor  apoa1-out.vel  apoa1-out.xsc
+

Building your own containers#

Building Singularity containers requires root privileges, and as such, cannot be done on Sherlock directly.

If you need to modify existing containers or build your own from scratch, the recommended workflow is to prepare and build your containers on your local Linux machine (it could either be a workstation, a laptop or a virtual machine), transfer the resulting container image to Sherlock, and run it there.

For complete details about how to build Singularity containers, please refer to the Singularity documentation.


  1. For more information about using modules on Sherlock, please see the software modules documentation

\ No newline at end of file diff --git a/docs/software/using/spark/index.html b/docs/software/using/spark/index.html new file mode 100644 index 000000000..11afb6e70 --- /dev/null +++ b/docs/software/using/spark/index.html @@ -0,0 +1,83 @@ + Spark - Sherlock

Spark

Introduction#

Apache Spark™ is a general engine for large-scale data processing. This document gives a quick introduction on how to get a first test program in Spark running on Sherlock.

More documentation#

The following documentation is specifically intended for using Spark on Sherlock. For more complete documentation about Spark in general, please see the Apache Spark documentation.

Spark on Sherlock#

Running Apache Spark on Sherlock is a bit different from using a traditional Spark/Hadoop cluster in that it requires some level of integration with the scheduler. In a sense, the computing resources (memory and CPU) need to be allocated twice. First, sufficient resources for the Spark application need to be allocated via Slurm; and secondly, spark-submit resource allocation flags need to be properly specified.

In order to use Spark, three steps have to be kept in mind when submitting a job to the queuing system:

  1. a new Spark cluster has to be started on the allocated nodes
  2. once the Spark cluster is up and running, Spark jobs have to be submitted to the cluster
  3. after all Spark jobs have finished running, the cluster has to be shut down

The following scripts show how to implement these three steps, and use the Pi Monte-Carlo calculation as an example.

Single-node job#

In this example, all the Spark processes run on the same compute node, which makes for a fairly simple sbatch script. The following example will start an 8-core job on a single node, and run a Spark task within that allocation:

#!/bin/bash
+
+#SBATCH --job-name=spark_singlenode
+#SBATCH --nodes=1
+#SBATCH --cpus-per-task=8
+#SBATCH --time=10
+
+module load spark
+
+# This syntax tells spark to use all cpu cores on the node.
+export MASTER="local[*]"
+
+# This is a Scala example
+run-example SparkPi 1000
+
+# This is a Python example.
+spark-submit --master $MASTER $SPARK_HOME/examples/src/main/python/pi.py 1000
+

Multi-node job#

To start a Spark cluster and run a task on multiple nodes, more preliminary steps are necessary. Here's an example script that will span 2 nodes, start 2 Spark workers on each node, and allow each worker to use 8 cores:

#!/bin/bash
+#SBATCH --nodes=2
+#SBATCH --mem-per-cpu=4G
+#SBATCH --cpus-per-task=8
+#SBATCH --ntasks-per-node=2
+#SBATCH --output=sparkjob-%j.out
+
+## --------------------------------------
+## 0. Preparation
+## --------------------------------------
+
+# load the Spark module
+module load spark
+
+# identify the Spark cluster with the Slurm jobid
+export SPARK_IDENT_STRING=$SLURM_JOBID
+
+# prepare directories
+export SPARK_WORKER_DIR=${SPARK_WORKER_DIR:-$HOME/.spark/worker}
+export SPARK_LOG_DIR=${SPARK_LOG_DIR:-$HOME/.spark/logs}
+export SPARK_LOCAL_DIRS=${SPARK_LOCAL_DIRS:-/tmp/spark}
+mkdir -p $SPARK_LOG_DIR $SPARK_WORKER_DIR
+
+## --------------------------------------
+## 1. Start the Spark cluster master
+## --------------------------------------
+
+start-master.sh
+sleep 1
+MASTER_URL=$(grep -Po '(?=spark://).*' \
+             $SPARK_LOG_DIR/spark-${SPARK_IDENT_STRING}-org.*master*.out)
+
+## --------------------------------------
+## 2. Start the Spark cluster workers
+## --------------------------------------
+
+# get the resource details from the Slurm job
+export SPARK_WORKER_CORES=${SLURM_CPUS_PER_TASK:-1}
+export SPARK_MEM=$(( ${SLURM_MEM_PER_CPU:-4096} * ${SLURM_CPUS_PER_TASK:-1} ))M
+export SPARK_DAEMON_MEMORY=$SPARK_MEM
+export SPARK_WORKER_MEMORY=$SPARK_MEM
+export SPARK_EXECUTOR_MEMORY=$SPARK_MEM
+
+# start the workers on each node allocated to the job
+export SPARK_NO_DAEMONIZE=1
+srun  --output=$SPARK_LOG_DIR/spark-%j-workers.out --label \
+      start-slave.sh ${MASTER_URL} &
+
+## --------------------------------------
+## 3. Submit a task to the Spark cluster
+## --------------------------------------
+
+spark-submit --master ${MASTER_URL} \
+             --total-executor-cores $((SLURM_NTASKS * SLURM_CPUS_PER_TASK)) \
+             $SPARK_HOME/examples/src/main/python/pi.py 10000
+
+## --------------------------------------
+## 4. Clean up
+## --------------------------------------
+
+# stop the workers
+scancel ${SLURM_JOBID}.0
+
+# stop the master
+stop-master.sh
+
\ No newline at end of file diff --git a/docs/storage/data-protection/index.html b/docs/storage/data-protection/index.html new file mode 100644 index 000000000..4df839eb5 --- /dev/null +++ b/docs/storage/data-protection/index.html @@ -0,0 +1,20 @@ + Data protection - Sherlock

Data protection

Data protection is mostly a task for the user

Except for $HOME and $GROUP_HOME, data on Sherlock is not backed up, nor archived. It's up to each user and group to make sure they maintain multiple copies of their data if needed.

Snapshots#

File system snapshots represent the state of the file system at a particular point in time. They allow accessing the file system contents as they were at different times in the past, and getting back data that may have been deleted or modified since the snapshot was taken.

Important

Snapshots are only available on $HOME and $GROUP_HOME.

Accessing snapshots#

Snapshots taken in $HOME and $GROUP_HOME are accessible in a .snapshot directory at any level of the hierarchy. Those .snapshot directories don't appear when listing directory contents with ls, but they can be listed explicitly or accessed with cd:

$ cd $HOME
+$ ls -ald .snapshot/users*
+[...]
+drwx------ 118 sunetid group  6680 Jul 21 11:16 .snapshot/users.daily.20170721
+drwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.daily.20170722
+drwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.daily.20170723
+drwx------ 118 sunetid group  6702 Jul 24 10:57 .snapshot/users.daily.20170724
+drwx------ 118 sunetid group  6702 Jul 24 10:57 .snapshot/users.daily.latest
+drwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-16:00
+drwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-17:00
+drwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-18:00
+drwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-19:00
+drwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-20:00
+[...]
+$ cd .snapshot/users.daily.latest
+

For instance:

  • the $HOME/.snapshot/users.daily.latest directory is the latest daily snapshot available, and stores the contents of the $HOME directory as they were when the last daily snapshot was taken,
  • the $HOME/foo/.snapshot/users.hourly.20170722-18:00 can be used to retrieve the contents of the $HOME/foo directory as it was at 6pm on July 22nd, 2017.

Restoring from a snapshot#

If you deleted a file or modified it and want to restore an earlier version, you can simply copy the file from its saved version in the appropriate snapshot.

Examples:

  • to restore the last known version of $HOME/foo/bar:

    $ cp $HOME/foo/.snapshot/users.hourly.latest/bar $HOME/foo/bar
    +

    or

    $ cp $HOME/.snapshot/users.hourly.latest/foo/bar $HOME/foo/bar
    +

    (both commands are equivalent)

  • to restore your ~/.bashrc file from 2 days ago:

    $ SNAP_DATE=$(date +%Y%m%d -d "2 days ago")
    +$ cp $HOME/.snapshot/users.daily.${SNAP_DATE}/.bashrc $HOME/.bashrc
    +

Snapshot policy#

The current[1] policy is to take snapshots on an hourly, daily and weekly basis. Older snapshots automatically expire after their retention period. The snapshot policy applies to both $HOME and $GROUP_HOME storage spaces.

Snapshot frequency Retention period Number of snapshots
hourly 2 days 48
daily 1 week 7
weekly 1 month 4

The shortest interval between snapshots is an hour. That means that if you create a file and then delete it within the hour, it won't appear in snapshots, and you won't be able to restore it.

If a file exists for more than an hour, and is then deleted, it will be present in the hourly snapshots for the next 48 hours, and you'll be able to retrieve it during that period. Similarly, if a file exists for more than a day, it could be restored for up to 7 days.

Snapshots don't count towards your quota.

Snapshots, as well as the entire filesystem, are replicated to an off-site system, to ensure that data could be retrieved even in case of a catastrophic failure of the whole system or datacenter-level disaster.

Backups#

Although the SRCC doesn't offer any backup service per se, we do provide all the tools required to transfer data in and out of Sherlock.

Suggested options to back up your data include:


  1. The snapshot policy is subject to change and may be adjusted as the storage system usage conditions evolve. 

\ No newline at end of file diff --git a/docs/storage/data-sharing/index.html b/docs/storage/data-sharing/index.html new file mode 100644 index 000000000..6e3c4a5cb --- /dev/null +++ b/docs/storage/data-sharing/index.html @@ -0,0 +1,47 @@ + Data sharing - Sherlock

Data sharing

The following sections present and detail options to share data across users and groups on Sherlock.

Sharing data locally on Sherlock#

Traditional Unix permissions#

Standard Unix file permissions are supported on Sherlock and provide read, write and execute permissions for the three distinct access classes.

The access classes are defined as follows:

  • Files and directories are owned by a user. The owner determines the file's user class. Distinct permissions apply to the owner.
  • Files and directories are assigned a group, which defines the file's group class. Distinct permissions apply to members of the file's group. The owner may be a member of the file's group.
  • Users who are not the owner, nor a member of the group, comprise a file's others class. Distinct permissions apply to others.

The following permissions apply to each class:

  • The read permission grants the ability to read a file. When set for a directory, this permission grants the ability to read the names of files in the directory, but not to find out any further information about them such as contents, file type, size, ownership, permissions.
  • The write permission grants the ability to modify a file. When set for a directory, this permission grants the ability to modify entries in the directory. This includes creating files, deleting files, and renaming files.
  • The execute permission grants the ability to execute a file. This permission must be set for executable programs, including shell scripts, in order to allow the operating system to run them. When set for a directory, this permission grants the ability to access file contents and meta-information if its name is known, but not list files inside the directory, unless read is set also.

Shared directories traversal

If you need to give access to one of your files to another user, they will at least need execute permission on each directory within the path to that file.

The effective permissions are determined based on the first class the user falls within in the order of user, group then others. For example, the user who is the owner of the file will have the permissions given to the user class regardless of the permissions assigned to the group class or others class.

While traditional Unix permissions are sufficient in most cases to share files with all the users within the same group, they are not enough to share files with a specific subset of users, or with users from other groups. Access Control Lists (ACLs) can be used for that purpose.

There are two types of ACLs supported on Sherlock, depending on the underlying filesystem:

Type Filesystems
NFSv4 ACLs $HOME and $GROUP_HOME
POSIX ACLs $SCRATCH, $GROUP_SCRATCH, $L_SCRATCH and $OAK

POSIX ACLs#

POSIX ACLs allow you to grant or deny access to files and directories for different users (or groups), independently of the file owner or group.

Two types of POSIX ACLs can be defined:

  • Access ACLs: grant permission for a specific file or directory.
  • Default ACLs: allow setting a default set of ACLs that will be applied to any file or directory without any already defined ACL. Can only be set on directories.

ACLs are set with the setfacl command, and displayed with getfacl. For more details and examples, please refer to this documentation.

In the example below, we allow two users to access a restricted directory located at $GROUP_SCRATCH/restricted-dir/:

$ cd $GROUP_SCRATCH
+
+### Create new directory
+$ mkdir restricted-dir
+
+### Remove 'group' and 'other' access
+$ chmod g-rwx,o-rwx restricted-dir
+
+### Give user bob read and traversal permissions to the directory
+$ setfacl -m u:bob:rX restricted-dir
+
+### Use default ACLs (-d) to give user bob read access to all new
+### files and sub-directories that will be created in "restricted-dir"
+$ setfacl -d -m u:bob:rX restricted-dir
+
+### Give user alice read, write and traversal permissions for the directory
+$ setfacl -m u:alice:rwX restricted-dir
+
+### Use default ACLs (-d) to give user alice read and write access to all
+### new files and sub-directories
+$ setfacl -d -m u:alice:rwX restricted-dir
+
+### Show ACLs
+$ getfacl restricted-dir
+# file: restricted-dir/
+# owner: joe
+# group: grp
+# flags: -s-
+user::rwx
+user:alice:rwx
+user:bob:r-x
+group::---
+mask::rwx
+other::---
+default:user::rwx
+default:user:alice:rwx
+default:user:bob:r-x
+default:group::---
+default:mask::rwx
+default:other::---
+

Default permissions on $GROUP_SCRATCH

By default, the Unix permissions on the root directory $GROUP_SCRATCH don't allow read nor traversal access for others (ie. any user not part of your PI group). If you need to share files with users outside of your own group, please contact us so we can set the appropriate permissions on your folder.

For $SCRATCH, you're the owner of the directory and so you can change the permissions yourself.

NFSv4 ACLs#

$HOME and $GROUP_HOME also allow setting ACLs, albeit with different syntax and semantics than POSIX ACLs. The principle is very similar, though.

An ACL in NFSv4 is a list of rules setting permissions on files or directories. A permission rule, or Access Control Entry (ACE), is of the form type:flags:principal:permissions.

Commonly used entries for these fields are:

  • type: A (allow) or D (deny)
  • flags: g (group), d (directory-inherit), f (file-inherit), n (no-propagate-inherit), or i (inherit-only)
  • principal: a named user (user@sherlock), a group, or one of three special principals: OWNER@, GROUP@, and EVERYONE@.
  • permissions: there are 14 permission characters, as well as the shortcuts R, W, and X. Here is a list of possible permissions that can be included in the permissions field (options are Case Sensitive)
  • r read-data (files) / list-directory (directories)
  • w write-data (files) / create-file (directories)
  • x execute (files) / change-directory (directories)
  • a append-data (files) / create-subdirectory (directories)
  • t read-attributes: read the attributes of the file/directory.
  • T write-attributes: write the attributes of the file/directory.
  • n read-named-attributes: read the named attributes of the file/directory.
  • N write-named-attributes: write the named attributes of the file/directory.
  • c read-ACL: read the file/directory NFSv4 ACL.
  • C write-ACL: write the file/directory NFSv4 ACL.
  • o write-owner: change ownership of the file/directory.
  • y synchronize: allow clients to use synchronous I/O with the server.
  • d delete: delete the file/directory. Some servers will allow a delete to occur if either this permission is set in the file/directory or if the delete-child permission is set in its parent directory.
  • D delete-child: remove a file or subdirectory from within the given directory (directories only)

A comprehensive listing of allowable field strings is given in the manual page nfs4_acl(5)

To see what permissions are set on a particular file, use the nfs4_getfacl command. For example, newly created file1 may have default permissions listed by ls -l as -rw-r--r--. Listing the permissions with nfs4_getfacl would display the following:

$ nfs4_getfacl file1
+A::OWNER@:rwatTnNcCoy
+A:g:GROUP@:rtncy
+A::EVERYONE@:rtncy
+

To set permissions on a file, use the nfs4_setfacl command. For convenience, NFSv4 provides the shortcuts R, W and X for setting read, write, and execute permissions. For example, to add write permissions for the current group on file1, use nfs4_setfacl with the -a switch:

$ nfs4_setfacl -a A::GROUP@:W file1
+

This command switched the GROUP@ permission field from rtncy to rwatTnNcCoy. However, be aware that NFSv4 file permission shortcuts have different meanings than the traditional Unix r, w, and x. For example, issuing chmod g+w file1 will set GROUP@ to rwatncy.

Although the shortcut permissions can be handy, often rules need to be more customized. Use nfs4_setfacl -e file1 to open the ACL for file1 in a text editor.

Access Control Entries allow more fine grained control over file and directory permissions than does the chmod command. For example, if user joe wants to give read, write and traverse permissions to jack for her directory private, she would issue:

$ nfs4_setfacl -R -a A::jack@sherlock:RWX private/
+

The -R switch recursively applies the rule to the files and directories within private/ as well.

To allow jack to create files and subdirectories within private/ with the permissions as granted above, inheritance rules need to be applied.

$ nfs4_setfacl -R -a A:fd:jack@sherlock:RWX private/
+

By default, each permission is in the Deny state and an ACE is required to explicitly allow a permission. However, be aware that a server may silently override a user's ACE, usually to a less permissive setting.

For complete documentation and examples on using NFSv4 ACLs, please see the manual page at nfs4_acl(5).

Default permissions on $GROUP_HOME

By default, the Unix permissions on the root directory $GROUP_HOME don't allow read nor traversal access for others (ie. any user not part of your PI group). If you need to share files with users outside of your own group, please contact us so we can set the appropriate permissions on your folder.

For $HOME, you're the owner of the directory and so you can change the permissions yourself.

Sharing data outside of Sherlock#

If you'd like to share data stored on Sherlock with external collaborators, there are two possibilities:

  1. sponsor a SUNet ID1 for these collaborators, and contact us to create an account for them on Sherlock. This will grant them access to your resources on Sherlock (compute as well as storage) and give them access to your group shared files, like any other user in your group.

  2. if you don't want to grant full access to your Sherlock resources to your external collaborators, you can use the Globus data sharing feature. This won't require your collaborators to get Stanford accounts, and will allow easy sharing of the datasets of your choice.

    Globus Sharing is only available through the Oak endpoint

    Globus Sharing is only available on $OAK, using the Oak Globus Endpoint 2 (srcc#oak).

    For complete details about sharing data with Globus, please see the Globus documentation at https://docs.globus.org/how-to/share-files/


  1. a base-level SUNet ID (free) is sufficient to get an account on Sherlock. For more details about SUNet ID levels and associated services, please see the Stanford UIT SUNet IDs page

  2. SUNet ID required 

\ No newline at end of file diff --git a/docs/storage/data-transfer/index.html b/docs/storage/data-transfer/index.html new file mode 100644 index 000000000..5194be402 --- /dev/null +++ b/docs/storage/data-transfer/index.html @@ -0,0 +1,68 @@ + Data transfer - Sherlock

Data transfer

Transfer protocols#

A number of methods allow transferring data in/out of Sherlock. For most cases, we recommend using SSH-based file transfer commands, such as scp, sftp, or rsync. They will provide the best performance for data transfers from and to campus.

For large transfers, using DTNs is recommended

Most casual data transfers could be done through the login nodes, by pointing your transfer tool to login.sherlock.stanford.edu. But because of resource limits on the login nodes, larger transfers may not work as expected.

For transferring large amounts of data, Sherlock features a specific Data Transfer Node, with dedicated bandwidth, as well as a managed Globus endpoint, that can be used for scheduled, unattended data transfers.

We also provide tools on Sherlock to transfer data to various Cloud providers, such as AWS, Google Drive, Dropbox, Box, etc.

Prerequisites#

Most of the commands detailed below require a terminal and an SSH client1 on your local machine to launch commands.

You'll need to start a terminal and type the given example commands at the prompt, omitting the initial $ character (it just indicates a command prompt, and thus should not be typed in).

Host keys#

Upon your very first connection to Sherlock, you will be greeted by a warning such as:

The authenticity of host 'login.sherlock.stanford.edu' can't be established.
+ECDSA key fingerprint is SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg.
+Are you sure you want to continue connecting (yes/no)?
+

The same warning will be displayed if you try to connect to one of the Data Transfer Nodes (DTNs):

The authenticity of host 'dtn.sherlock.stanford.edu' can't be established.
+ECDSA key fingerprint is SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg.
+Are you sure you want to continue connecting (yes/no)?
+

This warning is normal: your SSH client warns you that it is the first time it sees that new computer. To make sure you are actually connecting to the right machine, you should compare the ECDSA key fingerprint shown in the message with one of the fingerprints below:

Key type Key Fingerprint
RSA SHA256:T1q1Tbq8k5XBD5PIxvlCfTxNMi1ORWwKNRPeZPXUfJA
legacy format: f5:8f:01:46:d1:f9:66:5d:33:58:b4:82:d8:4a:34:41
ECDSA SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg
legacy format: 70:4c:76:ea:ae:b2:0f:81:4b:9c:c6:5a:52:4c:7f:64

If they match, you can proceed and type ‘yes’. Your SSH program will then store that key and will verify it for every subsequent SSH connection, to make sure that the server you're connecting to is indeed Sherlock.

Host keys warning#

If you've connected to Sherlock 1.0 before, there's a good chance the Sherlock 1.0 keys were stored by your local SSH client. In that case, when connecting to Sherlock 2.0 using the sherlock.stanford.edu alias, you will be presented with the following message:

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ WARNING: POSSIBLE DNS SPOOFING DETECTED! @
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+The RSA host key for sherlock.stanford.edu has changed, and the key for
+the corresponding IP address 171.66.97.101 is unknown. This could
+either mean that DNS SPOOFING is happening or the IP address for the
+host and its host key have changed at the same time.
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+IT IS POSSIBLE THAT SOMEONE IS DOING SOMETHING NASTY!
+Someone could be eavesdropping on you right now (man-in-the-middle
+attack)!  It is also possible that a host key has just been changed.
+The fingerprint for the RSA key sent by the remote host is
+SHA256:T1q1Tbq8k5XBD5PIxvlCfTxNMi1ORWwKNRPeZPXUfJA.
+Please contact your system administrator.
+

You can just check that the SHA256 key listed in that warning message correctly matches the one listed in the table above, and if that's the case, you can safely remove the sherlock.stanford.edu entry from your ~/.ssh/known_hosts file with the following command on your local machine:

$ ssh-keygen -R sherlock.stanford.edu
+

and then connect again. You'll see the first-connection prompt mentioned above, and your SSH client will store the new keys for future connections.

SSH-based protocols#

User name

In all the examples below, you'll need to replace <sunetid> by your actual SUNet ID. If you happen to use the same login name on your local machine, you can omit it.

SCP (Secure Copy)#

The easiest command to use to transfer files to/from Sherlock is scp. It works like the cp command, except it can work over the network to copy files from one computer to another, using the secure SSH protocol.

The general syntax to copy a file to a remote server is:

$ scp <source_file_path> <username>@<remote_host>:<destination_path>
+

For instance, the following command will copy the file named foo from your local machine to your home directory on Sherlock:

$ scp foo <sunetid>@login.sherlock.stanford.edu:
+
Note the : character, that separates the hostname from the destination path. Here, the destination path is empty, which will instruct scp to copy the file in your home directory.

You can copy foo under a different name, or to another directory, with the following commands:

$ scp foo <sunetid>@login.sherlock.stanford.edu:bar
+$ scp foo <sunetid>@login.sherlock.stanford.edu:~/subdir/baz
+

To copy back files from Sherlock to your local machine, you just need to reverse the order of the arguments:

$ scp <sunetid>@login.sherlock.stanford.edu:foo local_foo
+

And finally, scp also supports recursive copying of directories, with the -r option:

$ scp -r dir/ <sunetid>@login.sherlock.stanford.edu:dir/
+
This will copy the dir/ directory and all of its contents in your home directory on Sherlock.

SFTP (Secure File Transfer Protocol)#

SFTP clients are interactive file transfer programs, similar to FTP, which perform all operations over an encrypted transport.

A variety of graphical SFTP clients are available for different OSes:

When setting up your connection to Sherlock in the above programs, use the following information:

Hostname: login.sherlock.stanford.edu
+Port:     22
+Username: SUNet ID
+Password: SUNet ID password
+

OpenSSH also provides a command-line SFTP client, originally named sftp.

To log in to Sherlock:

$ sftp <sunetid>@login.sherlock.stanford.edu
+Connected to login.sherlock.stanford.edu.
+sftp>
+
For more information about using the command-line SFTP client, you can refer to this tutorial for more details and examples.

rsync#

If you have complex hierarchies of files to transfer, or if you need to synchronize a set of files and directories between your local machine and Sherlock, rsync will be the best tool for the job. It will efficiently transfer and synchronize files across systems, by checking the timestamp and size of files. Which means that it won't re-transfer files that have not changed since the last transfer, and will complete faster.

For instance, to transfer the whole ~/data/ folder tree from your local machine to your home directory on Sherlock, you can use the following command:

$ rsync -a ~/data/ <sunetid>@login.sherlock.stanford.edu:data/
+
Note the slash (/) at the end of the directory names, which is important to instruct rsync to synchronize the whole directories.

To get more information about the transfer rate and follow its progress, you can use additional options:

$ rsync -avP ~/data/ <sunetid>@login.sherlock.stanford.edu:data/
+sending incremental file list
+./
+file1
+      1,755,049 100%    2.01MB/s    0:00:00 (xfr#2, to-chk=226/240)
+file2
+      2,543,699 100%    2.48MB/s    0:00:00 (xfr#3, to-chk=225/240)
+file3
+     34,930,688  19%   72.62MB/s    0:00:08
+
+[...]
+
For more information about using rsync, you can refer to this tutorial for more details and examples.

SSHFS#

Sometimes, moving files in and out of the cluster, and maintaining two copies of each of the files you work on, both on your local machine and on Sherlock, may be painful. Fortunately, Sherlock offers the ability to mount any of its filesystems to your local machine, using a secure and encrypted connection.

With SSHFS, a FUSE-based filesystem implementation used to mount remote SSH-accessible filesystems, you can access your files on Sherlock as if they were locally stored on your own computer.

This comes particularly handy when you need to access those files from an application that is not available on Sherlock, but that you already use or can install on your local machine. Like a data processing program that you have licensed for your own computer but can't use on Sherlock, a specific text editor that only runs on macOS, or any data-intensive 3D rendering software that wouldn't work comfortably enough over a forwarded X11 connection.

SSHFS is available for Linux , macOS , and Windows .

SSHFS on macOS

SSHFS on macOS is known to try to automatically reconnect filesystem mounts after resuming from sleep or suspend, even without any valid credentials. As a result, it will generate a lot of failed connection attempts and likely make your IP address blacklisted on login nodes.

Make sure to unmount your SSHFS drives before putting your macOS system to sleep to avoid this situation.

The following option could also be useful to avoid some permission issues: -o defer_permissions

For instance, on a Linux machine with SSHFS installed, you could mount your Sherlock home directory via a Sherlock DTN with the following commands:

$ mkdir ~/sherlock_home
+$ sshfs <sunetid>@dtn.sherlock.stanford.edu:./ ~/sherlock_home
+

Using DTNs for data transfer

Using the Sherlock DTNs instead of login nodes will ensure optimal performance for data transfers. Login nodes only have limited resources, that could limit data transfer rates or disconnect during long data transfers.

And to unmount it:

$ umount ~/sherlock_home
+

For more information about using SSHFS on your local machine, you can refer to this tutorial for more details and examples.

Globus#

Globus improves SSH-based file transfer protocols by providing the following features:

  • automates large data transfers,
  • handles transient errors, and can resume failed transfers,
  • simplifies the implementation of high-performance transfers between computing centers.

Globus is a Software as a Service (SaaS) system that provides end-users with a browser interface to initiate data transfers between endpoints. Globus allows users to "drag and drop" files from one endpoint to another. Endpoints are terminals for data; they can be laptops or supercomputers, and anything in between. The Globus web service negotiates, monitors, and optimizes transfers through firewalls and across network address translation (NAT). Under certain circumstances, with high-performance hardware, transfer rates exceeding 1 GB/s are possible. For more information about Globus, please see the Globus documentation.

Authentication#

To use Globus, you will first need to authenticate at Globus.org. You can either sign up for a Globus account, or use your SUNet ID account for authentication to Globus (which will be required to authenticate to the Sherlock endpoint).

To use your SUNet ID, choose "Stanford University" from the drop down menu at the Login page and follow the instructions from there.

Transfer#

Endpoint name

The Globus endpoint name for Sherlock is SRCC Sherlock.

Oak endpoint

The Sherlock endpoint only provides access to Sherlock-specific file systems ($HOME, $GROUP_HOME, $SCRATCH and $GROUP_SCRATCH). Oak features its own Globus endpoint: SRCC Oak.

You can use Globus to transfer data between your local workstation (e.g., your laptop or desktop) and Sherlock. In this workflow, you configure your local workstation as a Globus endpoint by installing the Globus Connect software.

  1. Log in to Globus.org
  2. Use the Manage Endpoints interface to "add Globus Connect Personal" as an endpoint (you'll need to install Globus Connect Personal on your local machine)
  3. Transfer Files, using your new workstation endpoint for one side of the transfer, and the Sherlock endpoint (SRCC Sherlock) on the other side.

You can also transfer data between two remote endpoints, by choosing another endpoint you have access to instead of your local machine.

CLI and API#

Globus also provides a command-line interface (CLI) and application programming interface (API) as alternatives to its web interface.

For more information about the API, please see the Globus API documentation for more details.

For more information about the CLI, please see the Globus CLI documentation and Globus CLI quick start. Note that the Globus CLI is available through the module system on Sherlock:

$ module load system py-globus-cli
+$ globus login
+# follow instructions to get set up
+

Once you've authorized the application, you can use the globus CLI to copy files in between endpoints and collections that you have access to. Endpoints and collections are identified by their unique UUID4 identifiers, which are viewable through the Globus web app. The CLI will step you through any additional authorizations required for you to access the endpoints or collections.

For example, to asynchronously copy files between Sherlock and Oak (provided that you have already been allocated Oak storage):

$ GLOBUS_SHERLOCK_UUID="6881ae2e-db26-11e5-9772-22000b9da45e"
+$ GLOBUS_OAK_UUID="8b3a8b64-d4ab-4551-b37e-ca0092f769a7"
+$ globus transfer --recursive \
+    "$GLOBUS_SHERLOCK_UUID:$SCRATCH/my-interesting-project" \
+    "$GLOBUS_OAK_UUID:$OAK/my-interesting-project-copy"
+

Data Transfer Nodes (DTNs)#

No shell

The DTNs don't provide any interactive shell, so connecting via SSH directly won't work. They will only accept scp, sftp, rsync or bbcp connections.

A pool of dedicated Data Transfer Nodes is available on Sherlock, to provide exclusive resources for large-scale data transfers.

The main benefit of using it is that transfer tasks can't be disrupted by other users' interactive tasks or filesystem access and I/O-related workloads on the login nodes.

By using the Sherlock DTNs, you'll make sure that your data flows will go through a computer whose sole purpose is to move data around.

It supports:

To transfer files via the DTNs, simply use dtn.sherlock.stanford.edu as a remote server host name. For instance:

$ scp foo <sunetid>@dtn.sherlock.stanford.edu:~/foo
+

$HOME on DTNs

One important difference to keep in mind when transferring files through the Sherlock DTNs is that the default destination path for files, unless specified, is the user $SCRATCH directory, not $HOME.

That means that the following command:

$ scp foo <sunetid>@dtn.sherlock.stanford.edu:
+
will create the foo file in $SCRATCH/foo, and not in $HOME/foo.

You can transfer files to your $HOME directory via the DTNs by specifying the full path as the destination: $ scp foo <sunetid>@dtn.sherlock.stanford.edu:$HOME/foo

Cloud storage#

If you need to back up some of your Sherlock files to cloud-based storage services, we also provide a set of utilities that can help.

Google Drive#

Google Drive storage for Stanford users

Google Drive is free for educational institutions. Meaning you can get free and unlimited storage on Google Drive using your @stanford.edu account. See the University IT Google Drive page for more details.

We provide the rclone tool on Sherlock to interact with Google Drive. You'll just need to load the rclone module to be able to use it to move your files from/to Google Drive:

$ module load system rclone
+$ rclone --help
+

This tutorial provides an example of transferring files between Google Drive and Oak storage.

The Globus CLI (see above) can also be used to copy files from Sherlock to Stanford's Google Drive.

AWS#

You can also access AWS storage from the Sherlock command line with the AWS Command Line Interface:

$ module load system aws-cli
+$ aws help
+

Other services#

If you need to access other cloud storage services, you can use rclone: it can be used to sync files and directories to and from Google Drive, Amazon S3, Box, Dropbox, Google Cloud Storage, Amazon Drive, Microsoft OneDrive and many more.

$ ml load system rclone
+$ rclone -h
+

For more details about how to use rclone, please see the official documentation.


  1. For more details, see the SSH clients page

  2. Fetch is a commercial program, and is available as part of the Essential Stanford Software bundle. 

\ No newline at end of file diff --git a/docs/storage/filesystems/index.html b/docs/storage/filesystems/index.html new file mode 100644 index 000000000..9612c4eb2 --- /dev/null +++ b/docs/storage/filesystems/index.html @@ -0,0 +1,6 @@ + Filesystems - Sherlock

Filesystems

The following sections describe the characteristics and best uses of each of Sherlock's filesystems.

$HOME#

Summary

$HOME is your home directory. It's the best place to keep your code and important data as it provides snapshots and off-site replication. It is not meant to host data that will be actively read and written to by compute jobs.

Characteristics
Type high speed, distributed NFS file system
Quota 15 GB for the whole $HOME directory
Snapshots yes (cf. Snapshots for more info)
Backups off-site replication
Purge policy not purged
Scope all login and compute nodes

$HOME is best suited for personal configuration files, scripts, small reference files or datasets, source code and individual software installation

When you log in, the system automatically sets the current working directory to $HOME: it's the location you'll end up when connecting to Sherlock. You can store your source code and build your executables there.

We strongly recommend using $HOME to reference your home directory in scripts, rather than its explicit path.

Checking quota usage#

The sh_quota tool can be used to display quota usage on $HOME

$ sh_quota -f HOME
+

See the Checking Quotas section for more details.


$GROUP_HOME#

Summary

$GROUP_HOME is your group home directory. It's the best place to keep your group's shared code, software installations and important data as it provides snapshots and off-site replication. It is not meant to host data that will be actively read and written to by compute jobs.

$HOME and $GROUP_HOME are based on the same physical file system.

Characteristics
Type high speed, distributed NFS file system
Quota 1 TB for the whole $GROUP_HOME directory
Snapshots yes (cf. Snapshots for more info)
Backups off-site replication
Purge policy not purged
Scope all login and compute nodes

$GROUP_HOME is best suited for group shared source code, common software installations, shared data sets and scripts.

We strongly recommend using $GROUP_HOME to reference your group home directory in scripts, rather than its explicit path.

Checking quota usage#

The sh_quota tool can be used to display quota usage on $GROUP_HOME

$ sh_quota -f GROUP_HOME
+

See the Checking Quotas section for more details.


$SCRATCH#

Summary

$SCRATCH is your personal scratch space. It's the best place to store temporary files, such as raw job output, intermediate files, unprocessed results, and so on.

Purge policy

Files are automatically purged from $SCRATCH after an inactivity period:

  • files that are not modified after 90 days are automatically deleted,
  • contents need to change for a file to be considered modified. The touch command does not modify file contents and thus does not extend a file's lifetime on the filesystem.

$SCRATCH is not meant to store permanent data, and should only be used for data associated with currently running jobs. It's not a target for backups, archived data, etc. See the Expiration Policy section for details.

Characteristics
Type Parallel, high-performance Lustre file system
Quota 100 TB / 20,000,000 inodes2
Snapshots NO
Backups NO
Purge policy data not modified in the last 90 days are automatically purged
Scope all login and compute nodes

$SCRATCH is best suited for large files, such as raw job output, intermediate job files, unprocessed simulation results, and so on. This is the recommended location to run jobs from, and to store files that will be read or written to during job execution.

Old files are automatically purged on $SCRATCH so users should avoid storing long-term data there.

Each compute node has a low-latency, high-bandwidth Infiniband link to $SCRATCH. The aggregate bandwidth of the filesystem is about 75GB/s. So any job with high data performance requirements will benefit from using $SCRATCH for I/O.

We strongly recommend using $SCRATCH to reference your scratch directory in scripts, rather than its explicit path.

Checking quota usage#

The sh_quota tool can be used to display quota usage on $SCRATCH

$ sh_quota -f SCRATCH
+

See the Checking Quotas section for more details.

Expiration policy#

Inactive files are automatically purged

Files that are not modified in the last 90 days will be automatically deleted from the filesystem.

To manage available space and maintain optimal performance for all jobs, all files on $SCRATCH are subject to automatic purges. Meaning that after a period of inactivity, files that are not used anymore will be automatically deleted from the filesystem.

File activity is defined based on the last time a file's contents (the actual data in the file) have been modified. Meaning that files whose contents have not been modified in the previous 90 days will be automatically deleted.

Each time a file's contents are modified, the expiration countdown is reset, and the file gets another 90 days of lifetime.

Metadata changes don't qualify as an update

Modifying a file's contents is the only way to reset the expiration countdown and extend the file's lifetime on the filesystem.

Metadata modifications such as: reading the file, renaming it, moving it to a different directory, changing its permissions or its ownership, "touching" it to update its last modification or access times, won't have any effect on the purge countdown.

Purges are based on an internal filesystem property that reflects the last date a file's data has been modified, and which is unfortunately not readily accessible by users.

Please note that tools like ls will only display the date of the last metadata1 modification for a file, which is not necessarily relevant to determine a file's eligibility for deletion. For instance, using the touch command on a file to update its last modification date will only update the metadata, not the data, and as such, will not reset the purge countdown timer.

Filesystem purges are a continuous process: they don't run at particular times, but are carried out in a permanent background fashion. Files are not necessarily deleted right away when they become eligible for deletion. For instance, if you create a file on February 1st and don't ever modify it afterwards, it will automatically become eligible for deletion on May 1st, and can be deleted anytime after this date.

Empty directory trees that stay devoid of any file for more than 90 days will be automatically cleaned up as well.


$GROUP_SCRATCH#

$SCRATCH and $GROUP_SCRATCH are based on the same physical file system.

Summary

$GROUP_SCRATCH is your group shared scratch space. It's the best place to store temporary files, such as raw job output, intermediate files, or unprocessed results that need to be shared among users within a group.

$GROUP_SCRATCH is NOT a backup target

$GROUP_SCRATCH is not meant to store permanent data, and should only be used for data associated with currently running jobs. It's not a target for backups, archived data, etc.

Characteristics
Type parallel, high-performance Lustre file system
Quota 100 TB / 20,000,000 inodes2
Snapshots NO
Backups NO
Purge policy data not modified in the last 90 days are automatically purged
Scope all login and compute nodes

$GROUP_SCRATCH is best suited for large files, such as raw job output, intermediate job files, unprocessed simulation results, and so on. This is the recommended location to run jobs from, and to store files that will be read or written to during job execution.

Old files are automatically purged on $GROUP_SCRATCH so users should avoid storing long-term data there.

We strongly recommend using $GROUP_SCRATCH to reference your group scratch directory in scripts, rather than its explicit path.

Checking quota usage#

The sh_quota tool can be used to display quota usage on $GROUP_SCRATCH

$ sh_quota -f GROUP_SCRATCH
+

See the Checking Quotas section for more details.

Expiration policy#

As $SCRATCH and $GROUP_SCRATCH are on the same filesystem, the same expiration policy applies to both. Please see the $SCRATCH section above for more details.


$L_SCRATCH#

Summary

$L_SCRATCH is local to each compute node, and could be used to store temporary files for jobs with high IOPS requirements. Files stored in $L_SCRATCH are purged at the end of the job.

Characteristics
Type local filesystem, specific to each node, based on SSD
Quota n/a (usable space limited by the size of the physical storage devices, typically around 150 GB)
Snapshots NO
Backups NO
Purge policy data immediately purged at the end of the job
Scope locally on each node, not shared across nodes

$L_SCRATCH is best suited for small temporary files and applications which require low latency and high IOPS levels, typically intermediate job files, checkpoints, dumps of temporary states, etc.

Files stored in $L_SCRATCH are local to each node and can't be accessed from other nodes, nor from login nodes.

Please note that an additional, job-specific environment variable, $L_SCRATCH_JOB, will be set to a subdirectory of $L_SCRATCH for each job. So, if you have two jobs running on the same compute node, $L_SCRATCH will be the same and accessible from both jobs, while $L_SCRATCH_JOB will be different for each job.

For instance, if you have jobs 98423 and 98672 running on the same node, the variables will be set as follows:

Job id $L_SCRATCH $L_SCRATCH_JOB
98423 /lscratch/kilian /lscratch/kilian/98423
98672 /lscratch/kilian /lscratch/kilian/98672

We strongly recommend using $L_SCRATCH to reference your local scratch directory in scripts, rather than its full path.

Expiration policy#

All files stored in $L_SCRATCH_JOB are automatically purged at the end of the job, whether the job was successful or not. If you need to conserve files that were generated in $L_SCRATCH_JOB after the job ends, don't forget to add a command at the end of your batch script to copy them to one of the more persistent storage locations, such as $HOME or $SCRATCH.

Data stored in $L_SCRATCH will be purged at the end of a job, only if no other job from the same user is still running on the node. This means that data stored in $L_SCRATCH (but not in $L_SCRATCH_JOB) will persist on the node until the last job from the user terminates.


$OAK#

Summary

$OAK is SRCC's research data storage offering. It provides an affordable, longer-term storage option for labs and researchers, and is ideally suited to host large datasets, or curated, post-processed results from job campaigns, as well as final results used for publication.

Order $OAK

Oak storage can be easily ordered online using the Oak Storage Service page.

$OAK is opt-in and is available as an option on Sherlock, meaning that only members of groups which have purchased storage on Oak can access this filesystem.

For complete details and characteristics, including pricing, please refer to the Oak Storage Service page.

Characteristics
Type parallel, capacitive Lustre filesystem
Quota amount purchased (in 10 TB increments)
Snapshots NO
Backups optional cloud backup available
please contact us for details
Purge policy not purged
Scope all login and compute nodes
also available through gateways outside of Sherlock

$OAK is ideally suited for large shared datasets, archival data and curated, post-processed results from job campaigns, as well as final results used for publication.

Although jobs can directly read and write to $OAK during execution, it is recommended to first stage files from $OAK to $SCRATCH at the beginning of a series of jobs, and save the desired results back from $SCRATCH to $OAK at the end of the job campaign.

We strongly recommend using $OAK to reference your group's Oak directory in scripts, rather than its explicit path.

$OAK is not backed up

$OAK is not backed up or replicated, by design, and deleted files cannot be recovered. We recommend all researchers to keep an additional copy of their important files (for instance, in Google Drive).

Cloud backup option

For additional data security, SRCC now offers "cloud backup" of Oak data as a managed service option. For an additional monthly fee, data on Oak can be backed up to the cloud (researchers are responsible for cloud storage costs). Please contact us if you'd like additional information.

Checking quota usage#

The sh_quota tool can be used to display quota usage on $OAK

$ sh_quota -f OAK
+

See the Checking Quotas section for more details.


  1. Metadata are data such as a file's size, name, path, owner, permissions, etc. 

  2. An inode (index node) is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory. 

\ No newline at end of file diff --git a/docs/storage/index.html b/docs/storage/index.html new file mode 100644 index 000000000..8f02ba719 --- /dev/null +++ b/docs/storage/index.html @@ -0,0 +1,45 @@ + Storage on Sherlock - Sherlock

Storage on Sherlock#

Sherlock provides access to several file systems, each with distinct storage characteristics. Each user and PI group get access to a set of pre-defined directories in these file systems to store their data.

Sherlock is a compute cluster, not a storage system

Sherlock's storage resources are limited and are shared among many users. They are meant to store data and code associated with projects for which you are using Sherlock's computational resources. This space is for work actively being computed on with Sherlock, and should not be used as a target for backups from other systems.

If you're looking for a long-term storage solution for research data, SRCC offers the Oak storage system, which is specifically intended for this usage.

Those file systems are shared with other users, and are subject to quota limits and for some of them, purge policies (time-residency limits).

Filesystem overview#

Features and purpose#

Name Type Backups / Snapshots Performance Purpose Cost
$HOME, $GROUP_HOME NFS / low small, important files (source code, executables, configuration files...) free
$SCRATCH, $GROUP_SCRATCH Lustre / high bandwidth large, temporary files (checkpoints, raw application output...) free
$L_SCRATCH local SSD / low latency, high IOPS job specific output requiring high IOPS free
$OAK Lustre option / moderate long term storage of research data volume-based1

Access scope#

Name Scope Access sharing level
$HOME cluster user
$GROUP_HOME cluster group
$SCRATCH cluster user
$GROUP_SCRATCH cluster group
$L_SCRATCH compute node user
$OAK cluster (optional, purchase required) group

Group storage locations are typically shared between all the members of the same PI group. User locations are only accessible by the user.

Quotas and limits#

Volume and inodes

Quotas are applied on both volume (the amount of data stored in bytes) and inodes: an inode (index node) is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory. In practice, each filesystem entry (file, directory, link) counts as an inode.

Name Quota type Volume quota Inode quota Retention
$HOME directory 15 GB n/a
$GROUP_HOME directory 1 TB n/a
$SCRATCH directory 100 TB 20 million time limited
$GROUP_SCRATCH directory 100 TB 20 million time limited
$L_SCRATCH n/a n/a n/a job lifetime
$OAK directory amount purchased function of the volume purchased

Quota types:

  • directory: based on file location; accounts for all the files that are in a given directory.
  • user: based on file ownership; accounts for all the files that belong to a given user.
  • group: based on file ownership; accounts for all the files that belong to a given group.

Retention types:

  • : files are kept as long as the user account exists on Sherlock.
  • time limited: files are kept for a fixed length of time after they've been last modified. Once the limit is reached, files expire and are automatically deleted.
  • job lifetime: files are only kept for the duration of the job and are automatically purged when the job ends.

Global failsafe user and group quotas on /scratch

To prevent potential issues which would result in the file system filling up completely and making it unusable for everyone, additional user and group-level quotas are in place on the /scratch file system, as a failsafe:

  • a user will not be able to use more than 250 TB (50M inodes) in total, in all the /scratch directories they have access to.

  • a group will not be able to use more than 1 PB (200M inodes) in total across all the /scratch directories its group members have access to.

Checking quotas#

To check your quota usage on the different filesystems you have access to, you can use the sh_quota command:

$ sh_quota
++---------------------------------------------------------------------------+
+| Disk usage for user kilian (group: ruthm)                                 |
++---------------------------------------------------------------------------+
+|   Filesystem |  volume /   limit                  | inodes /  limit       |
++---------------------------------------------------------------------------+
+          HOME |   9.4GB /  15.0GB [||||||     62%] |      - /      - (  -%)
+    GROUP_HOME | 562.6GB /   1.0TB [|||||      56%] |      - /      - (  -%)
+       SCRATCH |  65.0GB / 100.0TB [            0%] | 143.8K /  20.0M (  0%)
+ GROUP_SCRATCH | 172.2GB / 100.0TB [            0%] |  53.4K /  20.0M (  0%)
+           OAK |  30.8TB / 240.0TB [|          12%] |   6.6M /  36.0M ( 18%)
++---------------------------------------------------------------------------+
+

Several options are provided to allow listing quotas for a specific filesystem only, or in the context of a different group (for users who are members of several PI groups). Please see the sh_quota usage information for details:

$ sh_quota -h
+sh_quota: display user and group quota information for all accessible filesystems.
+
+Usage: sh_quota [OPTIONS]
+    Optional arguments:
+        -f FILESYSTEM   only display quota information for FILESYSTEM.
+                        For instance: "-f $HOME"
+        -g GROUP        for users with multiple group memberships, display
+                        group quotas in the context of that group
+        -n              don't display headers
+        -j              JSON output (implies -n)
+
Examples#

For instance, to only display your quota usage on $HOME:

$ sh_quota -f HOME
+

If you belong to multiple groups, you can display the group quotas for your secondary groups with:

$ sh_quota -g <group_name>
+

And finally, for greater output control, quota usage can be displayed in JSON format with the -j option:

$ sh_quota -f SCRATCH -j
+{
+  "SCRATCH": {
+    "quotas": {
+      "type": "user",
+      "blocks": {
+        "usage": "47476660",
+        "limit": "21474836480"
+      },
+      "inodes": {
+        "usage": "97794",
+        "limit": "20000000"
+      }
+    }
+  }
+}
+

Where should I store my files?#

Not all filesystems are equivalent

Choosing the appropriate storage location for your files is an essential step towards making your utilization of the cluster as efficient as possible. It will make your own experience much smoother, yield better performance for your jobs and simulations, and contribute to making Sherlock a useful and well-functioning resource for everyone.

Here is where we recommend storing different types of files and data on Sherlock:

  • personal scripts, configuration files and software installations → $HOME
  • group-shared scripts, software installations and medium-sized datasets → $GROUP_HOME
  • temporary output of jobs, large checkpoint files → $SCRATCH
  • curated output of job campaigns, large group-shared datasets, archives → $OAK

Accessing filesystems#

On Sherlock#

Filesystem environment variables

To facilitate access and data management, user and group storage locations on Sherlock are identified by a set of environment variables, such as $HOME or $SCRATCH.

We strongly recommend using those variables in your scripts rather than explicit paths, to facilitate transition to new systems for instance. By using those environment variables, you'll be sure that your scripts will continue to work even if the underlying filesystem paths change.

To see the contents of these variables, you can use the echo command. For instance, to see the absolute path of your $SCRATCH directory:

$ echo $SCRATCH
+/scratch/users/kilian
+

Or for instance, to move to your group-shared home directory:

$ cd $GROUP_HOME
+

From other systems#

External filesystems cannot be mounted on Sherlock

For a variety of security, manageability and technical considerations, we can't mount external filesystems nor data storage systems on Sherlock. The recommended approach is to make Sherlock's data available on external systems.

You can mount any of your Sherlock directories on any external system you have access to by using SSHFS. For more details, please refer to the Data Transfer page.


  1. For more information about Oak, its characteristics and cost model, please see the Oak Service Description page

\ No newline at end of file diff --git a/docs/storage/overview/index.html b/docs/storage/overview/index.html new file mode 100644 index 000000000..e0c38c74d --- /dev/null +++ b/docs/storage/overview/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/docs/tags/index.html b/docs/tags/index.html new file mode 100644 index 000000000..68b13cb2b --- /dev/null +++ b/docs/tags/index.html @@ -0,0 +1 @@ + Tags - Sherlock
\ No newline at end of file diff --git a/docs/tech/facts/index.html b/docs/tech/facts/index.html new file mode 100644 index 000000000..b9650960c --- /dev/null +++ b/docs/tech/facts/index.html @@ -0,0 +1,13 @@ + Facts - Sherlock

Sherlock facts#

as of February 2024

Users#

  • 7,054 user accounts

  • 1,115 PI groups

    from all Stanford's seven Schools, SLAC, Stanford Institutes, etc.

  • 201 owner groups

Interfaces#

  • 12 login nodes

  • 3 data transfer nodes (DTNs)

Computing#

  • 5.00 PFLOPs (FP64)

    18.73 PFLOPs (FP32)

  • 53,488 CPU cores

    4 CPU generations (13 CPU models)

  • 756 GPUs

    4 GPU generations (12 GPU models)

Hardware#

  • 1,731 compute nodes

    19 server models (from 3 different manufacturers)

  • 37 racks

    1,147 rack units

Energy#

  • 564.7 kW

    total power usage

  • 57 PDUs

Storage#

  • 9.7 PB $SCRATCH

    parallel, distributed filesystem, delivering over 200 GB/s of I/O bandwidth

  • 51.3 PB $OAK

    long term research data storage

Networking#

  • 104 Infiniband switches

    across 2 Infiniband fabrics (EDR, HDR)

  • 5,740 Infiniband cables

    spanning about 30.23 km

  • 53 Ethernet switches

Scheduler#

  • 178 Slurm partitions

  • 47,065 CPU.hours/day

    over 5 years of computing in a single day

  • $3,144,743 /month

    to run the same workload on t2.large on-demand cloud instances

\ No newline at end of file diff --git a/docs/tech/index.html b/docs/tech/index.html new file mode 100644 index 000000000..e89056db7 --- /dev/null +++ b/docs/tech/index.html @@ -0,0 +1,3 @@ + Technical specifications - Sherlock

Technical specifications#

In a nutshell#

Sherlock features over 1,700 compute nodes, 53,400+ CPU cores and 700+ GPUs, for a total computing power of more than 5.0 Petaflops. That would rank it in the Top500 list of the most powerful supercomputers in the world.

The cluster currently extends across 2 Infiniband fabrics (EDR, HDR). A 9.7 PB parallel, distributed filesystem, delivering over 200 GB/s of I/O bandwidth, provides scratch storage for more than 7,000 users, and 1,100 PI groups.

Resources#

The Sherlock cluster was initiated in January 2014 with a base of freely available computing resources (about 2,000 CPU cores) and the accompanying networking and storage infrastructure (about 1 PB of shared storage).

Since then, it's been constantly expanding, spawning multiple cluster generations, with numerous contributions from many research groups on campus.

Cluster generations

For more information about Sherlock's ongoing evolution and expansion, please see Cluster generations.

Interface#

Type Qty Details
login nodes 12 sherlock.stanford.edu (load-balanced)
data transfer nodes 3 dedicated bandwidth for large data transfers

Computing#

Access to computing resources

Computing resources marked with below are freely available to every Sherlock user. Resources marked with are only accessible to Sherlock owners and their research teams.

Type Access Nodes CPU cores Details
compute nodes
normal partition
195 5,236 - 57x 20 (Intel E5-2640v4), 128 GB RAM, EDR IB
- 40x 24 (Intel 5118), 191 GB RAM, EDR IB
- 28x 32 (AMD 7543), 256 GB RAM, HDR IB
- 70x 32 (AMD 7502), 256 GB RAM, HDR IB
development nodes
dev partition
4 104 - 2x 20 (Intel E5-2640v4), 128 GB RAM, EDR IB
- 2x 32 (AMD 7543P), 256 GB RAM, HDR IB
- 32x Tesla A30_MIG-1g.6gb
large memory nodes
bigmem partition
9 504 - 4x 24 (Intel 5118), 384 GB RAM, EDR IB
- 1x 32 (Intel E5-2697Av4), 512 GB RAM, EDR IB
- 1x 56 (Intel E5-4650v4), 3072 GB RAM, EDR IB
- 1x 64 (AMD 7502), 4096 GB RAM, HDR IB
- 2x 128 (AMD 7742), 1024 GB RAM, HDR IB
GPU nodes
gpu partition
26 748 - 1x 20 (Intel E5-2640v4), 256 GB RAM, EDR IB
- 4x Tesla P100 PCIe
- 1x 20 (Intel E5-2640v4), 256 GB RAM, EDR IB
- 4x Tesla P40
- 3x 20 (Intel E5-2640v4), 256 GB RAM, EDR IB
- 4x Tesla V100_SXM2
- 1x 24 (Intel 5118), 191 GB RAM, EDR IB
- 4x Tesla V100_SXM2
- 2x 24 (Intel 5118), 191 GB RAM, EDR IB
- 4x Tesla V100 PCIe
- 16x 32 (AMD 7502P), 256 GB RAM, HDR IB
- 4x Geforce RTX_2080Ti
- 2x 32 (AMD 7502P), 256 GB RAM, HDR IB
- 4x Tesla V100S PCIe
privately-owned nodes
owners partition
1,493 48,648 40 different node configurations, including GPU and bigmem nodes
Total 1,731 53,488 756

Storage#

More information

For more information about storage options on Sherlock, please refer to the Storage section of the documentation.

Sherlock is architected around shared storage components, meaning that users can find the same files and directories from all of the Sherlock nodes.

  • Highly-available NFS filesystem for user and group home directories (with hourly snapshots and off-site replication)
  • High-performance Lustre scratch filesystem (9.7 PB parallel, distributed filesystem, delivering over 200 GB/s of I/O bandwidth)
  • Direct access to SRCC's Oak long-term research data storage system (51.3 PB)
\ No newline at end of file diff --git a/docs/tech/status/index.html b/docs/tech/status/index.html new file mode 100644 index 000000000..0db1c4e70 --- /dev/null +++ b/docs/tech/status/index.html @@ -0,0 +1,26 @@ + Status - Sherlock

Status

Scheduled maintenances

Maintenance operations and upgrades are scheduled on Sherlock on a regular basis. Per the University's Minimum Security policies, we deploy security patches on Sherlock as required for compliance.

Components and services#

Sherlock status is

For more details about Sherlock components and services, see the status dashboard.

Current usage#

\ No newline at end of file diff --git a/docs/user-guide/gpu/index.html b/docs/user-guide/gpu/index.html new file mode 100644 index 000000000..31e0c63d8 --- /dev/null +++ b/docs/user-guide/gpu/index.html @@ -0,0 +1,78 @@ + GPU nodes - Sherlock

GPU nodes

To support the latest computing advancements in many fields of science, Sherlock features a number of compute nodes with GPUs that can be used to run a variety of GPU-accelerated applications. Those nodes are available to everyone, but are a scarce, highly-demanded resource, so getting access to them may require some wait time in queue.

Getting your own GPU nodes

If you need frequent access to GPU nodes, we recommend considering becoming an owner on Sherlock, so you can have immediate access to your GPU nodes when you need them.

GPU nodes#

A limited number of GPU nodes are available in the gpu partition. Anybody running on Sherlock can submit a job there. As owners contribute to expand Sherlock, more GPU nodes are added to the owners partition, for use by PI groups which purchased their own compute nodes.

There are a variety of different GPU configurations available in the gpu partition. To see the available GPU types, please see the GPU types section.

Submitting a GPU job#

To submit a GPU job, you'll need to use the --gpus (or -G) option in your batch script or command line submission options.

For instance, the following script will request one GPU for two hours in the gpu partition, and run the GPU-enabled version of gromacs:

#!/bin/bash
+#SBATCH -p gpu
+#SBATCH -c 10
+#SBATCH -G 1
+
+ml load gromacs/2016.3
+
+srun gmx_gpu ...
+

You can also directly run GPU processes on compute nodes with srun. For instance, the following command will display details about the GPUs allocated to your job:

$ srun -p gpu --gpus 2 nvidia-smi
+Fri Jul 28 12:41:49 2017
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 375.51                 Driver Version: 375.51                    |
+|-------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+|===============================+======================+======================|
+|   0  Tesla P40           On   | 0000:03:00.0     Off |                    0 |
+| N/A   26C    P8    10W / 250W |      0MiB / 22912MiB |      0%   E. Process |
++-------------------------------+----------------------+----------------------+
+|   1  Tesla P40           On   | 0000:04:00.0     Off |                    0 |
+| N/A   24C    P8    10W / 250W |      0MiB / 22912MiB |      0%   E. Process |
++-------------------------------+----------------------+----------------------+
+
++-----------------------------------------------------------------------------+
+| Processes:                                                       GPU Memory |
+|  GPU       PID  Type  Process name                               Usage      |
+|=============================================================================|
+|  No running processes found                                                 |
++-----------------------------------------------------------------------------+
+

GPU resources MUST be requested explicitly

Jobs will be rejected at submission time if they don't explicitly request GPU resources.

The gpu partition only accepts jobs explicitly requesting GPU resources. If they don't, they will be rejected with the following message:

$ salloc -p gpu
+srun: error: Unable to allocate resources: Job violates accounting/QOS policy (job submit limit, user's size and/or time limits)
+

Interactive sessions#

As for any other compute node, you can submit an interactive job and request a shell on a GPU node with the following command:

$ salloc -p gpu --gpus 1
+salloc: job 38068928 queued and waiting for resources
+salloc: job 38068928 has been allocated resources
+$ nvidia-smi --query-gpu=index,name --format=csv,noheader
+0, Tesla V100-SXM2-16GB
+

Instant lightweight GPU instances#

Given that some tasks don't necessarily require a full-fledged, top-of-the-line GPU, lightweight GPU instances are provided to allow instant access to GPU resources for quick debugging, prototyping or testing jobs.

Lightweight GPU instances

Lightweight GPU instances leverage NVIDIA’s Multi-Instance GPU (MIG) to provide multiple fully isolated GPU instances on the same physical GPU, each with their own high-bandwidth memory, cache, and compute cores.

Those GPU instances are instantly available via the dev partition, and can be requested with the sh_dev command:

# sh_dev -g 1
+[...]
+[kilian@sh03-17n15 ~] (job 17628407) $ nvidia-smi -L
+GPU 0: NVIDIA A30 (UUID: GPU-ac772b5a-123a-dc76-9480-5998f435fe84)
+  MIG 1g.6gb      Device  0: (UUID: MIG-87e5d835-8046-594a-b237-ccc770b868ef)
+

For interactive apps in the Sherlock OnDemand interface, requesting a GPU in the dev partition will initiate an interactive session with access to a lightweight GPU instance.

gpu_dev

GPU types#

Since Sherlock features many different types of GPUs, each with its own technical characteristics, performance profiles and specificities, you may want to ensure that your job runs on a specific type of GPU.

To that end, Slurm allows users to specify constraints when submitting jobs, which will indicate to the scheduler that only nodes having features matching the job constraints can be used to satisfy the request. Multiple constraints may be specified and combined with various operators (please refer to the official Slurm documentation for details).

The list of available features on compute nodes can be obtained with the node_feat1 command. More specifically, to list the GPU-related features of nodes in the gpu partition:

$ node_feat -p gpu | grep GPU_
+GPU_BRD:TESLA
+GPU_GEN:PSC
+GPU_MEM:16GB
+GPU_MEM:24GB
+GPU_SKU:TESLA_P100_PCIE
+GPU_SKU:TESLA_P40
+

You can use node_feat without any option to list all the features of all the nodes in all the partitions. But please note that node_feat will only list the features of nodes from partitions you have access to, so output may vary depending on your group membership.

The different characteristics2 of various GPU types are listed in the following table:

Slurm feature Description Possible values Example job constraint
GPU_BRD GPU brand GEFORCE: GeForce / TITAN
TESLA: Tesla
#SBATCH -C GPU_BRD:TESLA
GPU_GEN GPU generation PSC: Pascal
MXW: Maxwell
#SBATCH -C GPU_GEN:PSC
GPU_MEM Amount of GPU memory 16GB, 24GB #SBATCH -C GPU_MEM:16GB
GPU_SKU GPU model TESLA_P100_PCIE
TESLA_P40
#SBATCH -C GPU_SKU:TESLA_P40

Depending on the partitions you have access to, more features may be available to be requested in your jobs.

For instance, to request a Tesla GPU for your job, you can use the following submission options:

$ srun -p gpu -G 1 -C GPU_BRD:TESLA nvidia-smi -L
+GPU 0: Tesla P100-SXM2-16GB (UUID: GPU-4f91f58f-f3ea-d414-d4ce-faf587c5c4d4)
+

Unsatisfiable constraints

If you specify a constraint that can't be satisfied in the partition you're submitting your job to, the job will be rejected by the scheduler. For instance, requesting an RTX3090 GPU in the gpu partition, which doesn't feature any, will result in an error:

$ srun -p gpu -G 1 -C GPU_SKU:RTX_3090 nvidia-smi -L
+srun: error: Unable to allocate resources: Requested node configuration is not available
+

For more information about requesting specific node features and adding job constraints, you can also refer to the "Node features" page.

GPU compute modes#

By default, GPUs on Sherlock are set in the Exclusive Process compute mode3, to provide the best performance and an isolated environment for jobs, out of the box.

Some software may require GPUs to be set to a different compute mode, for instance to share a GPU across different processes within the same application.

To handle that case, we developed a specific option, --gpu_cmode, that users can add to their srun and sbatch submission options, to choose the compute mode for the GPUs allocated to their job.

Here's the list of the different compute modes supported on Sherlock's GPUs:

GPU compute mode --gpu_cmode option Description
"Default" shared Multiple contexts are allowed per device (NVIDIA default)
"Exclusive Process" exclusive Only one context is allowed per device, usable from multiple threads at a time (Sherlock default)
"Prohibited" prohibited No CUDA context can be created on the device

By default, or if the --gpu_cmode option is not specified, GPUs will be set in the "Exclusive Process" mode, as demonstrated by this example command:

$ srun -p gpu -G 1 nvidia-smi
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 387.26                 Driver Version: 387.26                    |
+|-------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+|===============================+======================+======================|
+|   0  Tesla P40           On   | 00000000:03:00.0 Off |                    0 |
+| N/A   22C    P8    10W / 250W |      0MiB / 22912MiB |      0%   E. Process |
++-------------------------------+----------------------+----------------------+
+

With the --gpu_cmode option, the scheduler will set the GPU compute mode to the desired value before execution:

$ srun -p gpu -G 1 --gpu_cmode=shared nvidia-smi
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 387.26                 Driver Version: 387.26                    |
+|-------------------------------+----------------------+----------------------+
+| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
+| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
+|===============================+======================+======================|
+|   0  Tesla P40           On   | 00000000:03:00.0 Off |                    0 |
+| N/A   22C    P8    10W / 250W |      0MiB / 22912MiB |      0%      Default |
++-------------------------------+----------------------+----------------------+
+

Tip

"Default" is the name that the NVIDIA System Management Interface (nvidia-smi) uses to describe the mode where a GPU can be shared between different processes. It does not represent the default GPU compute mode on Sherlock, which is "Exclusive Process".

Advanced options#

A number of submission options are available when submitting GPU jobs, to request specific resource mapping or task binding options.

Here are some examples to allocate a set of resources as a function of the number of requested GPUs:

  • --cpus-per-gpu: requests a number of CPUs per allocated GPU.

    For instance, the following options will allocate 2 GPUs and 4 CPUs:

    $ salloc -p gpu -G 2 --cpus-per-gpu=2
    +
  • --gpus-per-node: requests a number of GPUs per node,

  • --gpus-per-task: requests a number of GPUs per spawned task,
  • --mem-per-gpu: allocates (host) memory per allocated GPU.

Other options can help set particular GPU properties (topology, frequency...):

  • --gpu-bind: specify task/GPU binding mode.

    By default, every spawned task can access every GPU allocated to the job. This option can help make sure that tasks are bound to the closest GPU, for better performance.

  • --gpu-freq: specify GPU and memory frequency. For instance:

    $ srun -p test -G 1 --gpu-freq=highm1,verbose /bin/true
    +GpuFreq=memory_freq:2600,graphics_freq:758
    +

Those options are all available to the srun/sbatch/salloc commands, and more details about each of them can be found in the Slurm documentation.

Conflicting options

Given the multitude of options, it's very easy to submit a job with conflicting options. In most cases the job will be rejected.

For instance:

$ sbatch --gpus-per-task=1 --cpus-per-gpu=2  --cpus-per-task=1 ...
+
Here, the first two options implicitly set cpus-per-task to 2, while the third option explicitly sets cpus-per-task to 1. So the job's requirements are conflicting and can't be satisfied.

Environment and diagnostic tools#

nvtop#

GPU usage information can be shown with the nvtop tool. nvtop is available as a module, which can be loaded like this:

$ ml load system nvtop
+

nvtop provides an htop-like interactive view of GPU utilization. Users can monitor, estimate and fine tune their GPU resource requests with this tool. Percent GPU and memory utilization is shown as a user's GPU code is running.

nvtop


  1. See node_feat -h for more details. 

  2. The lists of values provided in the table are non exhaustive. 

  3. The list of available GPU compute modes and relevant details are available in the CUDA Toolkit Documentation 

\ No newline at end of file diff --git a/docs/user-guide/images/file_explorer_btn1.png b/docs/user-guide/images/file_explorer_btn1.png new file mode 100644 index 000000000..a33a461a7 Binary files /dev/null and b/docs/user-guide/images/file_explorer_btn1.png differ diff --git a/docs/user-guide/images/file_explorer_btn2.png b/docs/user-guide/images/file_explorer_btn2.png new file mode 100644 index 000000000..302a47345 Binary files /dev/null and b/docs/user-guide/images/file_explorer_btn2.png differ diff --git a/docs/user-guide/images/gpu_dev.png b/docs/user-guide/images/gpu_dev.png new file mode 100644 index 000000000..3304c7dc1 Binary files /dev/null and b/docs/user-guide/images/gpu_dev.png differ diff --git a/docs/user-guide/images/nvtop.png b/docs/user-guide/images/nvtop.png new file mode 100644 index 000000000..036f303dc Binary files /dev/null and b/docs/user-guide/images/nvtop.png differ diff --git a/docs/user-guide/images/ood_code-server.png b/docs/user-guide/images/ood_code-server.png new file mode 100644 index 000000000..5969d8e2c Binary files /dev/null and b/docs/user-guide/images/ood_code-server.png differ diff --git a/docs/user-guide/images/ood_dashboard.png b/docs/user-guide/images/ood_dashboard.png new file mode 100644 index 000000000..763f8ff57 Binary files /dev/null and b/docs/user-guide/images/ood_dashboard.png differ diff --git a/docs/user-guide/images/ood_jup.png b/docs/user-guide/images/ood_jup.png new file mode 100644 index 000000000..df9f0274e Binary files /dev/null and b/docs/user-guide/images/ood_jup.png differ diff --git a/docs/user-guide/images/ood_jup_notebook.png b/docs/user-guide/images/ood_jup_notebook.png new file mode 100644 index 000000000..d5010ed10 Binary files /dev/null and b/docs/user-guide/images/ood_jup_notebook.png differ diff --git a/docs/user-guide/images/ood_juplab.png b/docs/user-guide/images/ood_juplab.png new file mode 100644 index 000000000..f32581cc9 Binary files /dev/null and b/docs/user-guide/images/ood_juplab.png 
differ diff --git a/docs/user-guide/images/ood_logo.png b/docs/user-guide/images/ood_logo.png new file mode 100644 index 000000000..08c186736 Binary files /dev/null and b/docs/user-guide/images/ood_logo.png differ diff --git a/docs/user-guide/images/ood_matlab.png b/docs/user-guide/images/ood_matlab.png new file mode 100644 index 000000000..dee5c0849 Binary files /dev/null and b/docs/user-guide/images/ood_matlab.png differ diff --git a/docs/user-guide/images/ood_my_jobs.png b/docs/user-guide/images/ood_my_jobs.png new file mode 100644 index 000000000..968385e25 Binary files /dev/null and b/docs/user-guide/images/ood_my_jobs.png differ diff --git a/docs/user-guide/images/ood_new_job.png b/docs/user-guide/images/ood_new_job.png new file mode 100644 index 000000000..0b59aefdf Binary files /dev/null and b/docs/user-guide/images/ood_new_job.png differ diff --git a/docs/user-guide/images/ood_rstudio.png b/docs/user-guide/images/ood_rstudio.png new file mode 100644 index 000000000..50f183e5a Binary files /dev/null and b/docs/user-guide/images/ood_rstudio.png differ diff --git a/docs/user-guide/images/ood_sess.png b/docs/user-guide/images/ood_sess.png new file mode 100644 index 000000000..6bad1590f Binary files /dev/null and b/docs/user-guide/images/ood_sess.png differ diff --git a/docs/user-guide/images/ood_sess_support.png b/docs/user-guide/images/ood_sess_support.png new file mode 100644 index 000000000..5767d08a9 Binary files /dev/null and b/docs/user-guide/images/ood_sess_support.png differ diff --git a/docs/user-guide/images/ood_shell.png b/docs/user-guide/images/ood_shell.png new file mode 100644 index 000000000..09952328f Binary files /dev/null and b/docs/user-guide/images/ood_shell.png differ diff --git a/docs/user-guide/images/ood_submit_job.png b/docs/user-guide/images/ood_submit_job.png new file mode 100644 index 000000000..5d35e9179 Binary files /dev/null and b/docs/user-guide/images/ood_submit_job.png differ diff --git a/docs/user-guide/images/ood_tb.png 
b/docs/user-guide/images/ood_tb.png new file mode 100644 index 000000000..bf0597521 Binary files /dev/null and b/docs/user-guide/images/ood_tb.png differ diff --git a/docs/user-guide/ondemand/index.html b/docs/user-guide/ondemand/index.html new file mode 100644 index 000000000..56f52fa39 --- /dev/null +++ b/docs/user-guide/ondemand/index.html @@ -0,0 +1,11 @@ + OnDemand - Sherlock

OnDemand

Introduction#

The Sherlock OnDemand interface allows you to conduct your research on Sherlock through a web browser. You can manage files (create, edit and move them), submit and monitor your jobs, see their output, check the status of the job queue, run a Jupyter notebook and much more, without logging in to Sherlock the traditional way, via an SSH terminal connection.

Quote

In neuroimaging there are a number of software pipelines that output HTML reports heavy on images files. Sherlock OnDemand allows users to check those as they appear on their $SCRATCH folder, for quick quality control, instead of having to mount remote filesystems, download data locally or move to any other storage location. Since the data itself is already quite big and costly to move, OnDemand is extremely helpful for fast assessment.

-- Carolina Ramirez, Williams PANLab

More documentation#

Open OnDemand was created by the Ohio Supercomputer Center. ood

The following documentation is specifically intended for using OnDemand on Sherlock. For more complete documentation about OnDemand in general, please see the extensive documentation for OnDemand created by OSC, including many video tutorials.

Connecting#

Connection information

To connect to Sherlock OnDemand, simply point your browser to https://ondemand.sherlock.stanford.edu

Sherlock OnDemand requires the same level of authentication as connecting to Sherlock over SSH. You will be prompted for your SUNet ID and password, and will go through the regular two-step authentication process.

The Sherlock OnDemand Dashboard will then open. From there, you can use the menus across the top of the page to manage files, get a shell on Sherlock, submit jobs or open interactive applications such as Jupyter Notebooks or RStudio sessions.

ood_dashboard

To end your Sherlock OnDemand session, click on the "Log Out" link at the top right of the Dashboard window and close your browser.

Getting a shell#

You can get shell access to Sherlock by choosing Clusters > Sherlock Shell Access from the top menu in the OnDemand Dashboard.

In the window that will open, you'll be logged in to one of Sherlock's login nodes, exactly as if you were using SSH to connect. Except you don't need to install any SSH client on your local machine, configure Kerberos or deal with your SSH client configuration to avoid endless two-factor prompts. How cool is that?

ood_shell

Managing files#

To create, edit or move files, click on the Files menu from the Dashboard page. A drop-down menu will appear, listing your most common storage locations on Sherlock: $HOME, $GROUP_HOME, $SCRATCH, $GROUP_SCRATCH, and all Oak storage you have access to, including your main $OAK1. Any rclone remotes you create on Sherlock to connect to cloud storage will appear here as well.

Choosing one of the file spaces opens the File Explorer in a new browser tab. The files in the selected directory are listed.

There are two sets of buttons in the File Explorer.

  • Under the three vertical dots menu next to each filename: fs_btn1 Those buttons allow you to View, Edit, Rename, Download, or Delete a file.

  • At the top of the window, on the right side: fs_btn2

    Button Function
    Open in Terminal Open a terminal window on Sherlock in a new browser tab
    Refresh Refresh the list of directory contents
    New File Create a new, empty file
    New Directory Create a new sub-directory
    Upload Copy a file from your local machine to Sherlock
    Download Download selected files to your local machine
    Copy/Move Copy or move selected files (after moving to a different directory)
    Delete Delete selected files
    Change directory Change your current working directory
    Copy path Copy the current working directory path to your clipboard
    Show Dotfiles Toggle the display of dotfiles (files starting with a ., which are usually hidden)
    Show Owner/Mode Toggle the display of owner and permission settings

Creating and editing jobs#

You can create new job scripts, edit existing scripts, and submit them to the scheduler through the Sherlock OnDemand interface.

From the top menus in the Dashboard, choose Jobs > Job Composer. A Job Composer window will open. There are two tabs at the top: Jobs and Templates.

In the Jobs tab, you'll find a list of the jobs you've submitted through OnDemand. The Templates tab will allow you to define your own job templates.

Creating a new job script#

To create a new job script, you'll need to follow the steps below.

Select a template#

Go to the Jobs tab in the Jobs Composer interface. You'll find a default template there: "Simple Sequential Job".

To create a new job script, click the blue New Job > From Default Template button in the upper left. You'll see a green message at the top of the page indicating: "Job was successfully created".

At the right of the Jobs page, you can see the Job Details, including the location of the script and the script name (by default, main_job.sh). Under that, you will see the contents of the job script in a section named Submit Script.

ood_new_job

Edit the job script#

You'll need to edit the job script, so it contains the commands and workflow that you want to submit to the scheduler.

If you need more resources than the defaults, you must include options to change them in the job script. For more details, see the Running jobs section.

You can edit the script in several ways:

  • click the blue Edit Files button at the top of the Jobs tab in the Jobs Composer window,
  • in the Jobs tab in the Jobs Composer window, find the Submit Script section at the bottom right. Click the blue Open Editor button.

After you save the file, the editor window remains open, but if you return to the Jobs Composer window, you will see that the content of your script has changed.

Edit the job options#

In the Jobs tab in the Jobs Composer window, click the blue Job Options button. The options for the selected job, such as its name, the job script to run, and the account it runs under, are displayed and can be edited. Click Save or Cancel to return to the job listing.

Submitting jobs#

To submit a job, select it in the Jobs tab in the Jobs Composer page. Click the green Submit button to submit the selected job. A message at the top of the window shows whether the job submission was successful or not. If it is not, you can edit the job script or options and resubmit. When the job is submitted successfully, the status of the job in the Jobs Composer window will change to Queued or Running. When the job completes, the status will change to Completed.

ood_submit_job

Monitoring jobs#

From the Dashboard page, the Jobs > Active Jobs top-level menu will bring you to a live view of Sherlock's scheduler queue. You'll be able to see all the jobs currently in queue, including running and pending jobs, as well as some details about individual jobs.

ood_my_jobs

At the bottom of the detailed view, you'll find two buttons that will bring you to the directory where that job's files are located, either in the File Manager or in a Shell session.

Interactive applications#

One of the main features of Sherlock OnDemand is the ability to run interactive applications directly from the web interface, without leaving your web browser.

Jupyter Notebooks#

You can run Jupyter Notebooks (using Python, Julia or other languages) through Sherlock OnDemand.

Some preliminary setup may be required

Before running your first Jupyter Notebook with IJulia, you'll need to run the following steps (this only needs to be done once):

$ ml julia
+$ julia
+julia> using Pkg;
+julia> Pkg.add("IJulia")
+

When you see the message that IJulia has been installed, you can end your interactive session.

To start a Jupyter session from Sherlock OnDemand:

  1. Select Interactive Apps > Jupyter Notebook from the top menu in the Dashboard page.

  2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your notebook starts.

ood_jup

  1. Click the blue Launch button to start your JupyterHub session. You may have to wait in the queue for resources to become available for you.

  2. When your session starts, you can click on the blue Connect to Jupyter button to open your Jupyter Notebook. The Dashboard window will display information about your Jupyter session, including the name of the compute node it is running on, when it started, and how much time remains. ood_sess

  3. In your new Jupyter Notebook tab, you'll see 3 tabs: Files, Running and Clusters. ood_jup_notebook

By default, you are in the Files tab; that displays the contents of your $HOME directory on Sherlock. You can navigate through your files there.

Under the Running tab, you will see the list of all the notebooks or terminal sessions that you have currently running.

  1. You can now start a Jupyter Notebook:

    1. To open an existing Jupyter Notebook, which is already stored on Sherlock, navigate to its location in the Files tab and click on its name. A new window running the notebook will open.
    2. To create a new Jupyter Notebook, click on the New button at the top right of the file listing, and choose the kernel of your choice from the drop down.

To terminate your Jupyter Notebook session, go back to the Dashboard, and click on My Interactive Sessions in the top menu. This will bring you to a page listing all your currently active interactive sessions. Identify the one you'd like to terminate and click on the red Cancel button.

JupyterLab#

To run JupyterLab via Sherlock OnDemand:

  1. Select Interactive Apps > JupyterLab from the top menu in the Dashboard page.

  2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

  3. Click the blue Launch button to start your JupyterLab session. You may have to wait in the queue for resources to become available.

  4. When your session starts, click the blue Connect to JupyterLab button. A new window opens with the JupyterLab interface.

  5. The first time you connect to JupyterLab via Sherlock OnDemand, you'll see 2 tabs: Files and Launcher.

ood_juplab

The Files tab displays the contents of your $HOME directory on Sherlock. You can navigate through your files there.

In the Launcher tab, you will have the option to create a new Jupyter Notebook or a new Console session by clicking the tile showing the kernel of your choice. You can also open the Terminal or a text editor for a variety of file types by clicking the corresponding tile.

To create a new kernel for IJulia:

  1. In the Launcher, click the Terminal tile in the "Other" section.

  2. In the Terminal, run the following commands:

    $ ml julia
    +$ julia
    +julia> using Pkg;
    +julia> Pkg.add("IJulia")
    +
  3. Open a new Launcher tab by clicking the + sign next to your open Terminal tab. Julia will now be listed in the "Notebook" and "Console" sections as an available kernel.

To create a custom kernel for a virtual environment using Python 3.x:

  1. In a shell session, activate your environment and run the following:

    $ pip3 install ipykernel
    +$ python3 -m ipykernel install --user --name env --display-name "My Env"
    +

    This will create a kernel for the environment env. It will appear as My Env in the JupyterLab Launcher.

    Creating a custom kernel for a Python 2.x environment

    When working with a Python 2.x environment, use the python/pip commands instead.

  2. The custom kernel will now be listed as an option in the "Notebook" and "Console" sections in the JupyterLab Launcher. To start a Jupyter Notebook using your virtual environment, click on the tile for that kernel.

    Creating a custom kernel for a conda environment

    In order to use a kernel created from a conda environment, you must unload the python and py-jupyterlab modules from your JupyterLab session. This can be done using the JupyterLab Lmod extension. To use the Lmod extension, select the bottom tab in the left side menu of your JupyterLab window. You may also need to restart the kernel for your notebook or console.

MATLAB#

To run MATLAB via Sherlock OnDemand:

  1. Select Interactive Apps > MATLAB from the top menu in the Dashboard page.

  2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

  3. Click the blue Launch button to start your MATLAB session. You may have to wait in the queue for resources to become available.

  4. When your session starts, click the blue Connect to MATLAB button. A new window opens with the MATLAB interface.

ood_matlab

RStudio#

To run RStudio via Sherlock OnDemand:

  1. Select Interactive Apps > RStudio Server from the top menu in the Dashboard page.

  2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

  3. Click the blue Launch button to start your RStudio session. You may have to wait in the queue for resources to become available.

  4. When your session starts, click the blue Connect to RStudio Server button. A new window opens with the RStudio interface.

ood_rstudio

Installing packages in RStudio

You may encounter errors while installing R packages within RStudio. First try installing R packages in a shell session on the Sherlock command line. See our R packages documentation for more information.

TensorBoard#

To run TensorBoard via Sherlock OnDemand:

  1. Select Interactive Apps > TensorBoard from the top menu in the Dashboard page.

  2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

  3. Click the blue Launch button to start your TensorBoard session. You may have to wait in the queue for resources to become available.

  4. When your session starts, click the blue Connect to TensorBoard button. A new window opens with the TensorBoard interface.

ood_tb

VS Code#

You can use VS Code on Sherlock through the code-server interactive app.

Using your local VS Code with remote SSH

Connecting to Sherlock from VS Code on your local machine is not supported at this time due to a known issue with the closed-source "Remote SSH" extension.

To start a VS Code session via Sherlock OnDemand:

  1. Select Interactive Apps > code-server from the top menu in the Dashboard page.

  2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

  3. Click the blue Launch button to start your code-server session. You may have to wait in the queue for resources to become available.

  4. When your session starts, click the blue Connect to code-server button. A new window opens with the code-server interface.

ood_code-server

Support#

If you are experiencing issues with Sherlock or your interactive session, you can contact us directly from Sherlock OnDemand.

To submit a ticket about Sherlock or Sherlock OnDemand in general:

  1. Select Help -> Submit Support Ticket from the top menu in the Dashboard page.

  2. In the screen that opens, complete the Support Ticket form. When applicable, please provide:

    • the full path to any files involved in your question or problem,

    • the command(s) you ran, and/or the job submission script(s) you used,

    • the exact, entire error message (or trace) you received.

  3. Click the blue Submit support ticket button. Research Computing support will respond to you as soon as we are able.

To submit a ticket about your current or recent interactive session:

  1. Select My Interactive Sessions from the top menu in the Dashboard page.

  2. In the screen that opens, find the card for the session you need help with. Active sessions will have a green header, and past sessions will have a gray header. Click that card's Submit support ticket link to open the Support Ticket form. ood_sess_support

  3. Complete the Support Ticket form. When applicable, please provide:

    • the full path to any files involved in your question or problem,

    • the command(s) you ran, and/or the job submission script(s) you used,

    • the exact, entire error message (or trace) you received.

  4. Click the blue Submit support ticket button. Research Computing support will respond to you as soon as we are able.


  1. if you have access to the Oak storage system

\ No newline at end of file diff --git a/docs/user-guide/running-jobs/index.html b/docs/user-guide/running-jobs/index.html new file mode 100644 index 000000000..22109453c --- /dev/null +++ b/docs/user-guide/running-jobs/index.html @@ -0,0 +1,155 @@ + Running jobs - Sherlock

Running jobs

Login nodes#

Login nodes are not for computing

Login nodes are shared among many users and therefore must not be used to run computationally intensive tasks. Those should be submitted to the scheduler which will dispatch them on compute nodes.

The key principle of a shared computing environment is that resources are shared among users and must be scheduled. It is mandatory to schedule work by submitting jobs to the scheduler on Sherlock. And since login nodes are a shared resource, they must not be used to execute computing tasks.

Acceptable uses of login nodes include:

  • lightweight file transfers,
  • script and configuration file editing,
  • job submission and monitoring.

Resource limits are enforced

To minimize disruption and ensure a comfortable working environment for users, resource limits are enforced on login nodes, and processes started there will automatically be terminated if their resource usage (including CPU time, memory and run time) exceeds those limits.

Slurm commands#

Slurm allows requesting resources and submitting jobs in a variety of ways. The main Slurm commands to submit jobs are listed in the table below:

Command Description Behavior
salloc Requests resources and allocates them to a job Starts a new shell, but does not execute anything
srun Requests resources and runs a command on the allocated compute node(s) Blocking command: will not return until the job ends
sbatch Requests resources and runs a script on the allocated compute node(s) Asynchronous command: will return as soon as the job is submitted

Interactive jobs#

Dedicated nodes#

Interactive jobs allow users to log in to a compute node to run commands interactively on the command line. They could be an integral part of an interactive programming and debugging workflow. The simplest way to establish an interactive session on Sherlock is to use the sh_dev command:

$ sh_dev
+

This will open a login shell using one core and 4 GB of memory on one node for one hour. The sh_dev sessions run on dedicated compute nodes. This ensures minimal wait times when you need to access a node to test scripts, debug code or do any kind of interactive work.

sh_dev also provides X11 forwarding via the submission host (typically the login node you're connected to) and can thus be used to run GUI applications.

Compute nodes#

If you need more resources1, you can pass options to sh_dev, to request more CPU cores, more nodes, or even run in a different partition. sh_dev -h will provide more information:

$ sh_dev -h
+sh_dev: start an interactive shell on a compute node.
+
+Usage: sh_dev [OPTIONS]
+    Optional arguments:
+        -c      number of CPU cores to request (OpenMP/pthreads, default: 1)
+        -n      number of tasks to request (MPI ranks, default: 1)
+        -N      number of nodes to request (default: 1)
+        -m      memory amount to request (default: 4GB)
+        -p      partition to run the job in (default: dev)
+        -t      time limit (default: 01:00:00)
+        -r      allocate resources from the named reservation (default: none)
+        -J      job name (default: sh_dev)
+        -q      quality of service to request for the job (default: normal)
+
+    Note: the default partition only allows for limited amount of resources.
+    If you need more, your job will be rejected unless you specify an
+    alternative partition with -p.
+

Another way to get an interactive session on a compute node is to use srun to execute a shell through the scheduler. For instance, to start a bash session on a compute node, with the default resource requirements (one core for 2 hours), you can run:

$ srun --pty bash
+

The main advantage of this approach is that it will allow you to specify the whole range of submission options that sh_dev may not support.

Finally, if you prefer to submit an existing job script or other executable as an interactive job, you can use the salloc command:

$ salloc script.sh
+

If you don't provide a command to execute, salloc will start a Slurm job and allocate resources for it, but it will not automatically connect you to the allocated node(s). It will only start a new shell on the same node you launched salloc from, and set up the appropriate $SLURM_* environment variables. So you will typically need to look at them to see what nodes have been assigned to your job. For instance:

$ salloc
+salloc: Granted job allocation 655914
+$ echo $SLURM_NODELIST
+sh02-01n55
+$ ssh sh02-01n55
+[...]
+sh02-01n55 ~ $
+

Connecting to nodes#

Login to compute nodes

Users are not allowed to login to compute nodes unless they have a job running there.

If you SSH to a compute node without any active job allocation, you'll be greeted by the following message:

$ ssh sh02-01n01
+Access denied by pam_slurm_adopt: you have no active jobs on this node
+Connection closed
+$
+

Once you have a job running on a node, you can SSH directly to it and run additional processes2, or observe how your application behaves, debug issues, and so on.

The salloc command supports the same parameters as sbatch, and can override any default configuration. Note that any #SBATCH directive in your job script will not be interpreted by salloc when it is executed in this way. You must specify all arguments directly on the command line for them to be taken into account.

Batch jobs#

It's easy to schedule batch jobs on Sherlock. A job is simply an instance of your program, for example your R, Python or Matlab script that is submitted to and executed by the scheduler (Slurm). When you submit a job with the sbatch command it's called a batch job and it will either run immediately or will pend (wait) in the queue.

The length of time a job will pend is determined by several factors; how many other jobs are in the queue ahead of your job and how many resources your job is requesting are the most important factors. One key principle when requesting resources is to always try to request as few resources as you need to get your job done. This will ensure your job pends in the queue for as little time as necessary. To get a rough idea of what resources are needed, you can profile your code/jobs in an sh_dev session in real-time with htop, nvtop, sacct etc. The basic concept is to tell the scheduler what resources your job needs and how long it should run. These resources are:

CPUs: How many CPUs the program you are calling in the sbatch script needs. Unless it can utilize multiple CPUs at once, you should request a single CPU. Check your code's documentation or try running in an interactive session with sh_dev and run htop if you are unsure.

GPUs: If your code is GPU enabled, how many GPUs does your code need? Use the diagnostic tool nvtop to see if your code is capable of running on multiple GPUs and how much GPU memory it's using in real-time.

memory (RAM): How much memory your job will consume. Some things to consider, will it load a large file or matrix into memory? Does it consume a lot of memory on your laptop? Often the default memory is sufficient for many jobs.

time: How long will it take for your code to run to completion?

partition: What set of compute nodes on Sherlock will you run on, normal, gpu, owners, bigmem? Use the sh_part command to see what partitions you are allowed to run on. The default partition on Sherlock is the normal partition.

Next, you tell the scheduler what your job should do: load modules and run your code. Note that any logic you can code into a bash script with the bash scripting language can also be coded into an sbatch script.

This example job will run the Python script mycode.py for 10 minutes on the normal partition using 1 CPU and 8 GB of memory. To aid in debugging we are naming this job "test_job" and appending the Job ID (%j) to the two output files that Slurm creates when a job is run. The output files are written to the directory from which you launched your job; you can also specify a different path. One file will contain any errors and the other will contain non-error output. Look in these 2 files ending in .err and .out for useful debugging information and error output.

Because it's a Python 3 script that uses some Numpy code, we need to load the python/3.6.1 and the py-numpy/1.19.2_py36 modules. The Python script is then called just as you would on the command line at the end of the sbatch script:

sbatch script:

#!/usr/bin/bash
+#SBATCH --job-name=test_job
+#SBATCH --output=test_job.%j.out
+#SBATCH --error=test_job.%j.err
+#SBATCH --time=10:00
+#SBATCH -p normal
+#SBATCH -c 1
+#SBATCH --mem=8GB
+module load python/3.6.1
+module load py-numpy/1.19.2_py36
+python3 mycode.py
+
Create and edit the sbatch script with a text editor like vim/nano or the OnDemand file manager. Then save the file, in this example we call it "test.sbatch".

Submit to the scheduler with the sbatch command:

$sbatch test.sbatch
+
Monitor your job and job ID in the queue with the squeue command:

$squeue -u $USER
+   JOBID     PARTITION     NAME     USER    ST       TIME  NODES  NODELIST(REASON)
+   44915821    normal    test_job  <userID>  PD       0:00      1 (Priority)
+

Notice that the job's state (ST) is pending (PD).

Once the job starts to run that will change to R:

$squeue -u $USER
+    JOBID     PARTITION     NAME     USER     ST      TIME  NODES   NODELIST(REASON)
+    44915854    normal test_job  <userID>     R      0:10     1     sh02-01n49
+

Here you can see it has been running (R) on the compute node sh02-01n49 for 10 seconds. While your job is running you have ssh access to that node and can run diagnostic tools such as htop and nvtop in order to monitor your job's memory and CPU/GPU utilization in real-time. You can also manage this job based on the JobID assigned to it (44915854). For example the job can be cancelled with the scancel command.

Resource requests#

To get a better idea of the amount of resources your job will need, you can use the ruse command, available as a module:

$ module load system ruse
+

ruse is a command line tool developed by Jan Moren to measure a process' resource usage. It periodically measures the resource use of a process and its subprocesses, and can help you find out how much resource to allocate to your job. It will determine the actual memory, execution time and cores that individual programs or MPI applications need to request in their job submission options.

ruse periodically samples the process and its subprocesses and keeps track of the CPU, time and maximum memory use. It also optionally records the sampled values over time. The purpose of ruse is not to profile processes in detail, but to follow jobs that run for many minutes, hours or days, with no performance impact and without changing the measured application in any way.

You'll find complete documentation and details about ruse's usage on the project webpage, but here are a few useful examples.

Sizing a job#

In its simplest form, ruse can help discover how much resources a new script or application will need. For instance, you can start a sizing session on a compute node with an overestimated amount of resources, and start your application like this:

$ ruse ./myapp
+

This will generate a <myapp>-<pid>.ruse output file in the current directory, looking like this:

Time:           02:55:47
+Memory:         7.4 GB
+Cores:          4
+Total_procs:    3
+Active_procs:   2
+Proc(%): 99.9  99.9
+

It shows that myapp:

  • ran for almost 3 hours
  • used a little less than 8 GB of memory
  • had 4 cores available,
  • spawned 3 processes, among which at most 2 were active at the same time,
  • that both active processes each used 99.9% of a CPU core

This information could be useful in tailoring the job resource requirements to its exact needs, making sure that the job won't be killed for exceeding one of its resource limits, and that the job won't have to wait too long in queue for resources that it won't use. The corresponding job request could look like this:

#SBATCH --time 3:00:00
+#SBATCH --mem 8GB
+#SBATCH --cpus-per-task 2
+
Verifying a job's usage#

It's also important to verify that applications, especially parallel ones, stay in the confines of the resources they've requested. For instance, a number of parallel computing libraries will make the assumption that they can use all the resources on the host, will automatically determine the number of physical CPU cores present on the compute node, and start as many processes. This could be a significant issue if the job requested fewer CPUs, as more processes will be constrained on fewer CPU cores, which will result in node overload and degraded performance for the application.

To avoid this, you can start your application with ruse and report usage for each time step specified with -t. You can also request the reports to be displayed directly on stdout rather than stored in a file.

For instance, this will report usage every 10 seconds:

$ ruse -s -t10 --stdout ./myapp
+   time         mem   processes  process usage
+  (secs)        (MB)  tot  actv  (sorted, %CPU)
+     10        57.5    17    16   33  33  33  25  25  25  25  25  25  25  25  20  20  20  20  20
+     20        57.5    17    16   33  33  33  25  25  25  25  25  25  25  25  20  20  20  20  20
+     30        57.5    17    16   33  33  33  25  25  25  25  25  25  25  25  20  20  20  20  20
+
+Time:           00:00:30
+Memory:         57.5 MB
+Cores:          4
+Total_procs:   17
+Active_procs:  16
+Proc(%): 33.3  33.3  33.2  25.0  25.0  25.0  25.0  25.0  25.0  24.9  24.9  20.0  20.0  20.0  20.0  19.9
+

Here, we can see that despite having been allocated 4 CPUs, the application started 17 threads, 16 of which were active running intensive computations, with the unfortunate consequence that each process could only use a fraction of a CPU.

In that case, to ensure optimal performance and system operation, it's important to modify the application parameters to make sure that it doesn't start more computing processes than the number of requested CPU cores.

Available resources#

Whether you are submitting a batch job or an interactive job, it's important to know the resources that are available to you. For this reason, we provide sh_part, a command-line tool to help answer questions such as:

  • which partitions do I have access to?
  • how many jobs are running on them?
  • how many CPUs can I use?
  • where should I submit my jobs?

sh_part can be executed on any login or compute node to see what partitions are available to you, and its output looks like this:

$ sh_part
+     QUEUE STA   FREE  TOTAL   FREE  TOTAL RESORC  OTHER MAXJOBTIME    CORES       NODE   GRES
+ PARTITION TUS  CORES  CORES  NODES  NODES PENDNG PENDNG  DAY-HR:MN    /NODE     MEM-GB (COUNT)
+    normal   *    153   1792      0     84    23k    127    7-00:00    20-24    128-191 -
+    bigmem         29     88      0      2      0      8    1-00:00    32-56   512-3072 -
+       dev         31     40      0      2      0      0    0-02:00       20        128 -
+       gpu         47    172      0      8    116      1    7-00:00    20-24    191-256 gpu:4(S:0-1)(2),gpu:4(S:0)(6)
+

The above example shows four possible partitions where jobs can be submitted: normal, bigmem, dev, or gpu. It also provides additional information such as the maximum amount of time allowed in each partition (MAXJOBTIME), the number of other jobs already in queue, along with the ranges of memory available on nodes in each partition.

  • in the QUEUE PARTITION column, the * character indicates the default partition.
  • the RESOURCE PENDING column shows the core count of pending jobs that are waiting on resources,
  • the OTHER PENDING column lists core counts for jobs that are pending for other reasons, such as licenses, user, group or any other limit,
  • the GRES column shows the number and type of Generic RESsources available in that partition (typically, GPUs), which CPU socket they're available from, and the number of nodes that feature that specific GRES combination. So for instance, in the output above, gpu:4(S:0-1)(2) means that the gpu partition features 2 nodes with 4 GPUs each, and that those GPUs are accessible from both CPU sockets (S:0-1).

Recurring jobs#

Warning

Cron tasks are not supported on Sherlock.

Users are not allowed to create cron jobs on Sherlock, for a variety of reasons:

  • resource limits cannot be easily enforced in cron jobs, meaning that a single user can end up monopolizing all the resources of a login node,
  • no amount of resources can be guaranteed when executing a cron job, leading to unreliable runtime and performance,
  • user cron jobs have the potential of bringing down whole nodes by creating fork bombs, if they're not carefully crafted and tested,
  • compute and login nodes could be redeployed at any time, meaning that cron jobs scheduled there could go away without the user being notified, and cause all sorts of unexpected results,
  • cron jobs could be mistakenly scheduled on several nodes and run multiple times, which could result in corrupted files.

As an alternative, if you need to run recurring tasks at regular intervals, we recommend the following approach: by using the --begin job submission option, and creating a job that resubmits itself once it's done, you can virtually emulate the behavior and benefits of a cron job, without its disadvantages: your task will be scheduled on a compute node, and use all of the resources it requested, without being impacted by anything else.

Depending on your recurring job's specificities, where you submit it and the state of the cluster at the time of execution, the starting time of that task may not be guaranteed and result in a delay in execution, as it will be scheduled by Slurm like any other jobs. Typical recurring jobs, such as file synchronization, database updates or backup tasks don't require strict starting times, though, so most users find this an acceptable trade-off.

The table below summarizes the advantages and drawbacks of each approach:

Cron tasks Recurring jobs
Authorized on Sherlock
Dedicated resources for the task
Persistent across node redeployments
Unique, controlled execution
Precise schedule

Recurrent job example#

The script below presents an example of such a recurrent job, that would emulate a cron task. It will append a timestamped line to a cron.log file in your $HOME directory and run every 7 days.

cron.sbatch
#!/bin/bash
+#SBATCH --job-name=cron
+#SBATCH --begin=now+7days
+#SBATCH --dependency=singleton
+#SBATCH --time=00:02:00
+#SBATCH --mail-type=FAIL
+
+
+## Insert the command to run below. Here, we're just storing the date in a
+## cron.log file
+date -R >> $HOME/cron.log
+
+## Resubmit the job for the next execution
+sbatch $0
+

If the job payload (here the date command) fails for some reason and generates an error, the job will not be resubmitted, and the user will be notified by email.

We encourage users to get familiar with the submission options used in this script by giving a look at the sbatch man page, but some details are given below:

Submission option or command Explanation
--job-name=cron makes it easy to identify the job, is used by the --dependency=singleton option to identify identical jobs, and will allow cancelling the job by name (because its jobid will change each time it's submitted)
--begin=now+7days will instruct the scheduler to not even consider the job for scheduling before 7 days after it's been submitted
--dependency=singleton will make sure that only one cron job runs at any given time
--time=00:02:00 runtime limit for the job (here 2 minutes). You'll need to adjust the value depending on the task you need to run (shorter runtime requests usually result in the job running closer to the clock mark)
--mail-type=FAIL will send an email notification to the user if the job ever fails
sbatch $0 will resubmit the job script by calling its own name ($0) after successful execution

You can save the script as cron.sbatch or any other name, and submit it with:

$ sbatch cron.sbatch
+

It will start running for the first time 7 days after you submit it, and it will continue to run until you cancel it with the following command (using the job name, as defined by the --job-name option):

$ scancel -n cron
+

Persistent jobs#

Recurring jobs described above are a good way to emulate cron jobs on Sherlock, but don't fit all needs, especially when a persistent service is required.

For instance, workflows that require a persistent database connection would benefit from an ever-running database server instance. We don't provide persistent database services on Sherlock, but instructions and examples on how to submit database server jobs are provided for MariaDB or PostgreSQL.

In case those database instances need to run pretty much continuously (within the limits of available resources and runtime maximums), the previous approach described in the recurring jobs section could fall a bit short. Recurring jobs are mainly designed for jobs that have a fixed execution time and don't reach their time limit, but need to run at given intervals (like synchronization or backup jobs, for instance).

Because a database server process will never end within the job, and will continue until the job reaches its time limit, the last resubmission command (sbatch $0) will actually never be executed, and the job won't be resubmitted.

To work around this, a possible approach is to catch a specific signal sent by the scheduler at a predefined time, before the time limit is reached, and then re-queue the job. This is easily done with the Bash trap command, which can be instructed to re-submit a job when it receives the SIGUSR1 signal.

Automatically resubmitting a job doesn't make it immediately runnable

Jobs that are automatically re-submitted using this technique won't restart right away: they will get back in the queue and stay pending until their execution conditions (priority, resources, usage limits...) are satisfied.

Persistent job example#

Here's the recurring job example from above, modified to:

  1. instruct the scheduler to send a SIGUSR1 signal to the job 90 seconds3 before reaching its time limit (with the #SBATCH --signal option),
  2. re-submit itself upon receiving that SIGUSR1 signal (with the trap command)
persistent.sbatch
#!/bin/bash
+#
+#SBATCH --job-name=persistent
+#SBATCH --dependency=singleton
+#SBATCH --time=00:05:00
+#SBATCH --signal=B:SIGUSR1@90
+
+# catch the SIGUSR1 signal
+_resubmit() {
+    ## Resubmit the job for the next execution
+    echo "$(date): job $SLURM_JOBID received SIGUSR1 at $(date), re-submitting"
+    sbatch $0
+}
+trap _resubmit SIGUSR1
+
+## Insert the command to run below. Here, we're just outputting the date every
+## 60 seconds, forever
+
+echo "$(date): job $SLURM_JOBID starting on $SLURM_NODELIST"
+while true; do
+    echo "$(date): normal execution"
+    sleep 60
+done
+

Long running processes need to run in the background

If your job's actual payload (the application or command you want to run) is running continuously for the whole duration of the job, it needs to be executed in the background, so the trap can be processed.

To run your application in the background, just add a & at the end of the command and then add a wait statement at the end of the script, to make the shell wait until the end of the job.

For instance, if you were to run a PostgreSQL database server, the while true ... done loop in the previous example could be replaced by something like this:

postgres -i -D $DB_DIR &
+wait
+

Persistent $JOBID#

One potential issue with having a persistent job re-submit itself when it reaches its runtime limit is that it will get a different $JOBID each time it's (re-)submitted.

This could be particularly challenging when other jobs depend on it, like in the database server scenario, where client jobs would need to start only if the database server is running. This can be achieved with job dependencies, but those dependencies have to be expressed using jobids, so having the server job's id changing at each re-submission will be difficult to handle.

To avoid this, the re-submission command (sbatch $0) can be replaced by a re-queuing command:

scontrol requeue $SLURM_JOBID
+

The benefit of that change is that the job will keep the same $JOBID across all re-submissions. And now, dependencies can be added to other jobs using that specific $JOBID, without having to worry about it changing. And there will be only one $JOBID to track for that database server job.

The previous example can then be modified as follows:

persistent.sbatch
#!/bin/bash
+#SBATCH --job-name=persistent
+#SBATCH --dependency=singleton
+#SBATCH --time=00:05:00
+#SBATCH --signal=B:SIGUSR1@90
+
+# catch the SIGUSR1 signal
+_requeue() {
+    echo "$(date): job $SLURM_JOBID received SIGUSR1, re-queueing"
+    scontrol requeue $SLURM_JOBID
+}
+trap '_requeue' SIGUSR1
+
+## Insert the command to run below. Here, we're just outputting the date every
+## 60 seconds, forever
+
+echo "$(date): job $SLURM_JOBID starting on $SLURM_NODELIST"
+while true; do
+    echo "$(date): normal execution"
+    sleep 60
+done
+

Submitting that job will produce an output similar to this:

Mon Nov  5 10:30:59 PST 2018: Job 31182239 starting on sh-06-34
+Mon Nov  5 10:30:59 PST 2018: normal execution
+Mon Nov  5 10:31:59 PST 2018: normal execution
+Mon Nov  5 10:32:59 PST 2018: normal execution
+Mon Nov  5 10:33:59 PST 2018: normal execution
+Mon Nov  5 10:34:59 PST 2018: Job 31182239 received SIGUSR1, re-queueing
+slurmstepd: error: *** JOB 31182239 ON sh-06-34 CANCELLED AT 2018-11-05T10:35:06 DUE TO JOB REQUEUE ***
+Mon Nov  5 10:38:11 PST 2018: Job 31182239 starting on sh-06-34
+Mon Nov  5 10:38:11 PST 2018: normal execution
+Mon Nov  5 10:39:11 PST 2018: normal execution
+

The job runs for 5 minutes, then receives the SIGUSR1 signal, is re-queued, restarts for 5 minutes, and so on, until it's properly scancelled.


  1. The dedicated partition that sh_dev uses by default only allows up to 2 cores and 8 GB of memory per user at any given time. So if you need more resources for your interactive session, you may have to specify a different partition. See the Partitions section for more details. 

  2. Please note that your SSH session will be attached to your running job, and that resources used by that interactive shell will count towards your job's resource limits. So if you start a process using large amounts of memory via SSH while your job is running, you may hit the job's memory limits, which will trigger its termination. 

  3. Due to the resolution of event handling by the scheduler, the signal may be sent up to 60 seconds earlier than specified. 

\ No newline at end of file diff --git a/docs/user-guide/troubleshoot/index.html b/docs/user-guide/troubleshoot/index.html new file mode 100644 index 000000000..7b28c03be --- /dev/null +++ b/docs/user-guide/troubleshoot/index.html @@ -0,0 +1 @@ + Troubleshooting - Sherlock

Troubleshooting

Sherlock is a resource for research, and as such, it is in perpetual evolution, as hardware, applications, libraries, and modules are added, updated, and/or modified on a regular basis. Sometimes issues can appear where none existed before. When you find something missing or a behavior that seems odd, please let us know.

How to submit a support request#

Google it first!

When encountering issues with software, if the misbehavior involves an error message, the first step should always be to look up the error message online. There's a good chance somebody stumbled upon the same hurdles before, and may even provide some fix or workaround.

One of the most helpful Google searches is your_application sbatch. For example if you're having trouble submitting jobs or allocating resources (CPUs, time, memory) with Cell Ranger, search for cell ranger sbatch to see how others have successfully run your application on a cluster.

If you're facing issues you can't figure out, we're here to help. Feel free to email us at srcc-support@stanford.edu, but please keep the following points in mind to ensure a timely and relevant response to your support requests.

Please provide relevant information

We need to understand the issue you're facing, and in most cases, we need to be able to reproduce it, so it can be diagnosed and addressed. Please make sure to provide enough information so we can help you in the best possible way.

This typically involves providing the following information:

  • your SUNet ID,
  • some context about your problem (were you submitting a job, copying a file, compiling an application?),
  • if relevant, the full path to the files involved in your question or problem,
  • the name of the node where you received the error (usually displayed in your command-line prompt),
  • the command(s) you ran, and/or the job submission script(s) you used,
  • the relevant job ID(s),
  • the exact, entire error message (or trace) you received.

Error messages are critical

This is very important. Without proper error messages, there is nothing we can do to help. And "it doesn't work" is not a proper error message. Also, please cut and paste the actual text of the output, commands, and error messages rather than screenshots in your tickets.
That way it is much easier for us to try to replicate your errors.

You can avoid email back and forth where we ask for all the relevant details, and thus delay the problem resolution, by providing all this information from the start. This will help us get to your problem immediately.

\ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 000000000..155127405 --- /dev/null +++ b/index.html @@ -0,0 +1,6 @@ + Sherlock
Sherlock
The HPC cluster for all your computing needs
Need to access computing resources to support your sponsored or departmental research at Stanford? You may want to try out the Sherlock cluster! Funded and supported by the Provost and Dean of Research, Sherlock is a shared computing cluster available for use by all Stanford faculty and their research teams.
More information

Services

A one-stop shop for all your scientific computing needs

compute

All the resources you need in one place: compute nodes, GPUs, large memory nodes, blazing fast interconnect, parallel filesystems, and more!

explore

Sherlock provides all the software tools and storage resources you'll need to explore and analyze your research data.

discover

With a whole range of computational tools at your fingertips, scientific breakthroughs will just be a batch job away.

In a nutshell

All about Sherlock

What is Sherlock?#

Sherlock is a shared computing cluster available for use by all Stanford Faculty members and their research teams, for sponsored or departmental faculty research. All research teams on Sherlock have access to a base set of managed computing resources, GPU-based servers, and a multi-petabyte, high-performance parallel file system for short-term storage.

Faculty can supplement these shared nodes by purchasing additional servers, and become Sherlock owners. By investing in the cluster, PI groups not only receive exclusive access to the nodes they purchase, but also get access to all of the other owner compute nodes when they're not in use, thus giving them access to the whole breadth of Sherlock resources.

Why should I use Sherlock?#

Using Sherlock for your work provides many advantages over individual solutions: hosted in an on-premises, state-of-the-art datacenter dedicated to research computing systems, the Sherlock cluster is powered and cooled by installations that are optimized for scientific computing.

On Sherlock, simulations and workloads benefit from performance levels that only large scale HPC systems can offer: high-performance I/O infrastructure, petabytes of storage, large variety of hardware configurations, GPU accelerators, centralized system administration and management provided by the Stanford Research Computing Center (SRCC).

Such features are not easily accessible at the departmental level, and often require both significant initial investments and recurring costs. Joining Sherlock allows researchers and Faculty members to avoid those costs and benefit from economies of scale, as well as to access larger, professionally managed computing resources than what would be available on an individual or even departmental basis.

How much does it cost?#

Sherlock is free to use for anyone doing departmental or sponsored research at Stanford.

Any Faculty member can request access for research purposes, and get an account with a base storage allocation and unlimited compute time on the global, shared pool of resources.

Stanford Research Computing provides faculty with the opportunity to purchase from a catalog of recommended compute node configurations, for the use of their research teams. Using a traditional compute cluster condominium model, participating faculty and their teams get priority access to the resources they purchase. When those resources are idle, other "owners" can use them, until the purchasing owner wants to use them. When this happens, those other owners' jobs are re-queued to free up resources. Participating owner PIs also have shared access to the original base Sherlock nodes, along with everyone else.

How big is it?#

Quite big! It's actually difficult to give a definitive answer, as Sherlock is constantly evolving and expanding with new hardware additions.

As of February 2024, Sherlock features over 6,500 CPU cores available to all researchers, and more than 48,600 additional CPU cores available to Sherlock owners, faculty who have augmented the cluster with their own purchases. With a computing power over 5.0 Petaflops, Sherlock would have its place in the Top500 list of the 500 most powerful computer systems in the world.

For more details about Sherlock size and technical specifications, please refer to the tech specs section of the documentation. And for even more numbers and figures, see the Sherlock facts page.

OK, I'm sold, how do I start?#

You can request an account right now, take a look at the documentation, and drop us an email if you have any questions.

I want my own nodes!#

If you're interested in becoming an owner on Sherlock, and benefit from all the advantages associated, please take a look at the catalog of configurations, feel free to use the ordering form to submit your request, and we'll get back to you.

\ No newline at end of file diff --git a/ondemand/index.html b/ondemand/index.html new file mode 100644 index 000000000..de5e416c7 --- /dev/null +++ b/ondemand/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/order/index.html b/order/index.html new file mode 100644 index 000000000..57f07d053 --- /dev/null +++ b/order/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/search/search_index.json b/search/search_index.json new file mode 100644 index 000000000..9d1cf7bab --- /dev/null +++ b/search/search_index.json @@ -0,0 +1 @@ +{"config":{"lang":["en"],"separator":"[\\s\\-,:!=\\[\\]()\"/]+|\\.(?!\\d)|&[lg]t;|(?!\\b)(?=[A-Z][a-z])"},"docs":[{"title":"Sherlock","text":"","location":""},{"title":"What is Sherlock?","text":"

Sherlock is a shared computing cluster available for use by all Stanford Faculty members and their research teams, for sponsored or departmental faculty research. All research teams on Sherlock have access to a base set of managed computing resources, GPU-based servers, and a multi-petabyte, high-performance parallel file system for short-term storage.

Faculty can supplement these shared nodes by purchasing additional servers, and become Sherlock owners. By investing in the cluster, PI groups not only receive exclusive access to the nodes they purchase, but also get access to all of the other owner compute nodes when they're not in use, thus giving them access to the whole breadth of Sherlock resources.

","location":"#what-is-sherlock"},{"title":"Why should I use Sherlock?","text":"

Using Sherlock for your work provides many advantages over individual solutions: hosted in an on-premises, state-of-the-art datacenter dedicated to research computing systems, the Sherlock cluster is powered and cooled by installations that are optimized for scientific computing.

On Sherlock, simulations and workloads benefit from performance levels that only large scale HPC systems can offer: high-performance I/O infrastructure, petabytes of storage, large variety of hardware configurations, GPU accelerators, centralized system administration and management provided by the Stanford Research Computing Center (SRCC).

Such features are not easily accessible at the departmental level, and often require both significant initial investments and recurring costs. Joining Sherlock allows researchers and Faculty members to avoid those costs and benefit from economies of scale, as well as to access larger, professionally managed computing resources than what would be available on an individual or even departmental basis.

","location":"#why-should-i-use-sherlock"},{"title":"How much does it cost?","text":"

Sherlock is free to use for anyone doing departmental or sponsored research at Stanford.

Any Faculty member can request access for research purposes, and get an account with a base storage allocation and unlimited compute time on the global, shared pool of resources.

Stanford Research Computing provides faculty with the opportunity to purchase from a catalog of recommended compute node configurations, for the use of their research teams. Using a traditional compute cluster condominium model, participating faculty and their teams get priority access to the resources they purchase. When those resources are idle, other \"owners\" can use them, until the purchasing owner wants to use them. When this happens, those other owners' jobs are re-queued to free up resources. Participating owner PIs also have shared access to the original base Sherlock nodes, along with everyone else.

","location":"#how-much-does-it-cost"},{"title":"How big is it?","text":"

Quite big! It's actually difficult to give a definitive answer, as Sherlock is constantly evolving and expanding with new hardware additions.

As of February 2024, Sherlock features over 6,500 CPU cores available to all researchers, and more than 48,600 additional CPU cores available to Sherlock owners, faculty who have augmented the cluster with their own purchases. With a computing power over 5.0 Petaflops, Sherlock would have its place in the Top500 list of the 500 most powerful computer systems in the world.

For more details about Sherlock size and technical specifications, please refer to the tech specs section of the documentation. And for even more numbers and figures, see the Sherlock facts page.

","location":"#how-big-is-it"},{"title":"OK, I'm sold, how do I start?","text":"

You can request an account right now, take a look at the documentation, and drop us an email if you have any questions.

","location":"#ok-im-sold-how-do-i-start"},{"title":"I want my own nodes!","text":"

If you're interested in becoming an owner on Sherlock, and benefit from all the advantages associated, please take a look at the catalog of configurations, feel free to use the ordering form to submit your request, and we'll get back to you.

","location":"#i-want-my-own-nodes"},{"title":"Sherlock documentation","text":"

","location":"docs/"},{"title":"Welcome to Sherlock!","text":"

Sherlock is a High-Performance Computing (HPC) cluster, operated by the Stanford Research Computing Center to provide computing resources to the Stanford community at large. You'll find all the documentation, tips, FAQs and information about Sherlock among these pages.

","location":"docs/#welcome-to-sherlock"},{"title":"Why use Sherlock?","text":"

Using Sherlock for your work provides many advantages over individual solutions: hosted in an on-premises, state-of-the-art datacenter, the Sherlock cluster is powered and cooled by installations that are optimized for scientific computing.

On Sherlock, simulations and workloads benefit from performance levels that only large scale HPC systems can offer: high-performance I/O infrastructure, petabytes of storage, large variety of hardware configurations, GPU accelerators, centralized system administration and management provided by the Stanford Research Computing Center (SRCC).

Such features are not easily accessible at the departmental level, and often require both significant initial investments and recurring costs. Joining Sherlock allows researchers and faculty members to avoid those costs and benefit from economies of scale, as well as to access larger, professionally managed computing resources than what would be available on an individual or even departmental basis.

","location":"docs/#why-use-sherlock"},{"title":"How much does it cost?","text":"

Sherlock is free to use for anyone doing departmental or sponsored research at Stanford. Any faculty member can request access for research purposes, and get an account with a base storage allocation and unlimited compute time on the global, shared pool of resources.

No CPU.hour charge

Unlike all Cloud Service Providers and many HPC systems, there is no usage charge on Sherlock.

When you submit your work on Sherlock, you don't need to keep an eye on the clock and worry about how much that run will cost you. There is no limit on the total amount of computing you can run on the cluster, as long as resources are available, and there's no charge to use them, no matter how large or small your computations are.

In case those free resources are not sufficient, Stanford Research Computing offers Faculty members the opportunity to invest into the cluster, and get access to additional computing resources for their research teams. Using a traditional compute cluster condominium model, participating faculty and their teams get priority access to the resources they purchase. When they're idle, those resources are available to use by other owners on the cluster, giving them access to virtually unlimited resources.

","location":"docs/#how-much-does-it-cost"},{"title":"Information sources","text":"

Searching the docs

If you're looking for information on a specific topic, the Search feature of this site will allow you to quickly find the page you're looking for. Just press S, F or / to open the Search bar and start typing.

To help users take their first steps on Sherlock, we provide documentation and information through various channels:

Channel URL Purpose Documentation You are here www.sherlock.stanford.edu/docs information to help new users start on Sherlock, and more in-depth documentation for users already familiar with the environment. Changelog news.sherlock.stanford.edu announcements, news and updates about Sherlock. Dashboard status.sherlock.stanford.edu status of Sherlock's main components and services, outages, planned maintenance.

To get started, you can take a look at the concepts and glossary pages to get familiar with the terminology used throughout the documentation pages. Then, we recommend going through the following sections:

  • Prerequisites
  • Connecting to the cluster
  • Submitting jobs
","location":"docs/#information-sources"},{"title":"Acknowledgment / citation","text":"

It is important and expected that publications resulting from computations performed on Sherlock acknowledge this. The following wording is suggested:

Acknowledgment

Some of the computing for this project was performed on the Sherlock cluster. We would like to thank Stanford University and the Stanford Research Computing Center for providing computational resources and support that contributed to these research results.

","location":"docs/#acknowledgment-citation"},{"title":"Support","text":"","location":"docs/#support"},{"title":"Email (recommended)","text":"

Research Computing support can be reached by sending an email to srcc-support@stanford.edu and mentioning Sherlock.

How to submit effective support requests

To ensure a timely and relevant response, please make sure to include some additional details, such as job ids, commands executed and error messages received, so we can help you better. For more details, see the Troubleshooting page.

As a member of the Sherlock community, you're also automatically subscribed to the sherlock-announce mailing-list, which is only used by the SRCC team to send important announcements about Sherlock.

","location":"docs/#email-recommended"},{"title":"Onboarding sessions","text":"

We offer regular onboarding sessions for new Sherlock users.

On-boarding session times

On-boarding sessions are offered every first Wednesday of the month, 1PM-2PM PST, via Zoom

These one-hour sessions are a brief introduction to Sherlock's layout, its scheduler, the different file systems available on the cluster, as well as some job submission and software installation best practices for new users. They are a good intro course if you are new to Sherlock or HPC in general.

If you can't attend live on-boarding sessions, you can still take a look at the on-boarding slides as well as at this session recording.

","location":"docs/#onboarding-sessions"},{"title":"Office hours","text":"

Sending a question to srcc-support@stanford.edu is always the best first option for questions. That way you can include detailed descriptions of the problem or question, valuable output and error messages and any steps you took when you encountered your error. Also, everyone on our team will see your ticket, enabling the most appropriate group member to respond.

Office hours are a good place for more generalized questions about Sherlock, Slurm, Linux usage, data storage, queue structures/scheduling, job optimization and general capabilities of Sherlock. It's also useful for more technically nuanced questions that may not be easily answered with our ticketing system. In office hours some problems can indeed be solved quickly or progress can be made so that you can then work self-sufficiently towards a solution on your own.

COVID-19 update

We'll be holding remote office hours via Zoom, for the time being.

Office hours times

Click here to join the Sherlock Office Hours Zoom

  • Tuesday 10-11am
  • Thursday 3-4pm

You'll need a full-service SUNet ID (basically, a @stanford.edu email address) in order to authenticate and join Office Hours via Zoom. If you do not have a full service account, please contact us at srcc-support@stanford.edu.

If you can't make any of the Office Hours sessions, you can also make an appointment with Sherlock's support team.

","location":"docs/#office-hours"},{"title":"What to expect","text":"
  • We cannot accommodate walk-ins: we're unfortunately not staffed to welcome unscheduled visits, so please make sure that you're planning to stop by during office hours. We will not be able to help you otherwise.

  • We can rarely help with application-specific or algorithm problems.

  • You should plan your projects sufficiently in advance and not come to office hours at the last minute before a deadline. Sherlock is a busy resource with several thousand users and you should not expect your jobs to complete before a given date.

  • Not all questions and problems can be answered or solved during office hours, especially ones involving hardware, filesystem or network issues. Sherlock features several thousand computing, networking and storage components, that are constantly being monitored by our team. You can be sure that when Sherlock has an issue, we are aware of it and working on it.

","location":"docs/#what-to-expect"},{"title":"User community","text":"

Sherlock is present on the Stanford Slack Grid, and you're more than welcome to join the following channels:

  • #sherlock-announce, for announcements related to Sherlock and its surrounding services,
  • #sherlock-users, as a place for Sherlock users to connect directly with each other. If you have general questions about Sherlock, want to reach out to other Sherlock users to share tips, good practices, tutorials or other info, please feel free to do so there.

For more details about the SRCC Slack Workspace, and instructions on how to join this workspace and its channels, please see the SRCC support page.

Slack is not an official support channel

Please note that while SRCC staff will monitor these channels, the official way to get support is still to email us at srcc-support@stanford.edu.

","location":"docs/#user-community"},{"title":"Quick Start","text":"

If you're in a rush1, here's a 3-step ultra-quick start:

  1. connect to Sherlock
$ ssh login.sherlock.stanford.edu\n
  1. get an interactive session on a compute node
[kilian@sh-ln01 login! ~]$ sh_dev\n
  1. run a command
[kilian@sh02-01n58 ~]$ module load python\n[kilian@sh02-01n58 ~]$ python -c \"print('Hello Sherlock')\"\nHello Sherlock\n

Congrats! You ran your first job on Sherlock!

","location":"docs/#quick-start"},{"title":"Replay","text":"

Here's what it looks like in motion:

  1. even in a rush, you'll still need an account on the cluster. See the Prerequisites page for details.\u00a0\u21a9

","location":"docs/#replay"},{"title":"Concepts","text":"","location":"docs/concepts/"},{"title":"Sherlock, a shared resource","text":"

Sherlock is a shared compute cluster available for use by all Stanford faculty members and their research teams to support departmental or sponsored research.

Sherlock is a resource for research

Sherlock is not suitable for course work, class assignments or general-use training sessions.

Users interested in using computing resources in such contexts are encouraged to investigate FarmShare, Stanford\u2019s community computing environment, which is primarily intended for supporting coursework.

It is open to the Stanford community as a computing resource to support departmental or sponsored research, thus a faculty member's sponsorship is required for all user accounts.

Usage policy

Please note that your use of this system falls under the \"Computer and Network Usage Policy\", as described in the Stanford Administrative Guide. In particular, sharing authentication credentials is strictly prohibited. Violation of this policy will result in termination of access to Sherlock.

Sherlock has been designed, deployed, and is maintained and operated by the Stanford Research Computing Center (SRCC) staff. The SRCC is a joint effort of the Dean of Research and IT Services to build and support a comprehensive program to advance computational research at Stanford.

Sherlock has been initially purchased and supported with seed funding from Stanford's Provost. It comprises a set of freely available compute nodes, a few specific resources such as large-memory machines and GPU servers, as well as the associated networking equipment and storage. These resources can be used to run computational codes and programs, and are managed through a job scheduler using a fair-share algorithm.

","location":"docs/concepts/#sherlock-a-shared-resource"},{"title":"Data risk classification","text":"

Low and Moderate Risk data

Sherlock is approved for computing with Low and Moderate Risk data only.

High Risk data

Sherlock is NOT approved to store or process HIPAA, PHI, PII nor any kind of High Risk data. The system is approved for computing with Low and Moderate Risk data only, and is not suitable to process High Risk data.

Users are responsible for ensuring the compliance of their own data.

For more information about data risk classifications, see the Information Security Risk Classification page.

","location":"docs/concepts/#data-risk-classification"},{"title":"Investing in Sherlock","text":"

For users who need more than casual access to a shared computing environment, Sherlock also offers Faculty members the possibility to invest in additional, dedicated computing resources.

Unlike traditional clusters, Sherlock is a collaborative system where the majority of nodes are purchased and shared by the cluster users. When a user (typically a PI) purchases one or more nodes, they become an owner. Owners choose from a standard set of server configurations supported by SRCC staff (known as the Sherlock catalog) to add to the cluster.

When they're not in use, PI-purchased compute nodes can be used by other owners. This model also allows Sherlock owners to benefit from the scale of the cluster by giving them access to more compute nodes than their individual purchase, which gives them much greater flexibility than owning a standalone cluster.

The majority of Sherlock nodes are owners nodes

The vast majority of Sherlock's compute nodes have been purchased by individual PIs and groups, and PI purchases are the main driver behind the rapid expansion of the cluster, which went from 120 nodes to more than 1,000 nodes in less than 3 years.

The resource scheduler configuration works like this:

  • owners and their research teams get immediate and exclusive access to the resources they purchased,
  • when those nodes are idle, other owners can use them,
  • when the purchasing owners want to use their resources, jobs from other owners that may be running on them are preempted (ie. killed and re-queued).

This provides a way to get more resources to run less important jobs in the background, while making sure that an owner always gets immediate access to his/her own nodes.

Participating owners also have shared access to the public, shared Sherlock nodes, along with everyone else.

","location":"docs/concepts/#investing-in-sherlock"},{"title":"Benefits","text":"

Benefits to owners include:

no wait time in queue: immediate and exclusive access to the purchased nodes

access to more resources: possibility to submit jobs to the other owners' nodes when they're not in use

Compared to hosting and managing computing resources on your own, purchasing nodes on Sherlock provides:

  • data center hosting, including backup power and cooling
  • system configuration, maintenance and administration
  • hardware diagnostics and repairs

Those benefits come in addition to the other Sherlock advantages:

  • access to high-performance, large parallel scratch storage space
  • access to snapshot'ed, replicated, enterprise-class storage space
  • optimized software stack, especially tailored for a range of research needs
  • tools to build and install additional software applications as needed
  • user support
","location":"docs/concepts/#benefits"},{"title":"Limitations","text":"

Purchasing nodes on Sherlock is different from traditional server hosting.

In particular, purchasing your own compute nodes on Sherlock will NOT allow:

root access: owner nodes on Sherlock are still managed by SRCC in accordance with Stanford's Minimum Security Standards. Although users are welcome to install (or request) any software they may need, purchasing compute nodes on Sherlock does not allow root access to the nodes.

running permanent services: permanent processes such as web servers or databases can only run on owner nodes through the scheduler, using recurring or persistent jobs. Purchasing compute nodes on Sherlock does not provide a way to run anything that couldn't run on freely-available nodes.

direct network connectivity: owners' nodes are connected to Sherlock's internal network and are not directly accessible from the outside, which means that they can't host public services like web or application servers.

bypassing the scheduler: jobs running on owners' nodes still need to be submitted to the scheduler. Direct shell access to the nodes is not possible outside of scheduled interactive sessions.

hardware changes: the hardware components of purchased nodes cannot be modified, removed, swapped or upgraded during the nodes' service lifetime.

configuration: the configuration of purchased nodes is tuned to provide optimal performance over a majority of use cases and applications, is identical on all nodes across the cluster, and cannot be changed, modified or altered in any way.

persistent local storage: local storage space provided on the compute nodes is only usable for the duration of a job and cannot be used to store long-term data.

additional storage space: purchasing compute nodes on Sherlock does not provide additional storage space. Please note that SRCC does offer the possibility for PIs to purchase their own storage space on Oak, for their long-term research data needs.

","location":"docs/concepts/#limitations"},{"title":"Purchasing nodes","text":"

If you are interested in becoming an owner, you can find the latest information about ordering Sherlock nodes on the ordering page. Feel free to contact us if you have any additional questions.

","location":"docs/concepts/#purchasing-nodes"},{"title":"Cluster generations","text":"

The research computing landscape evolves very quickly, and to both accommodate growth and technological advances, it's necessary to adapt the Sherlock environment to these evolutions.

Every year or so, a new generation of processors is released, which is why, over a span of several years, multiple generations of CPUs and GPUs make their way into Sherlock. This provides users with access to the latest features and performance enhancements, but it also adds some heterogeneity to the cluster, which is important to keep in mind when compiling software and requesting resources to run them.

Another key component of Sherlock is the interconnect network that links all of Sherlock's compute nodes together and acts as a backbone for the whole cluster. This network fabric is of finite capacity, and based on the individual networking switches characteristics and the typical research computing workflows, it can accommodate up to about 850 compute nodes.

As nodes get added to Sherlock, the number of available ports decreases, and at some point, the fabric gets full and no more nodes can be added. Sherlock reached that stage for the first time in late 2016, which prompted the installation of a whole new fabric, to allow for further system expansion.

This kind of evolution is the perfect opportunity to upgrade other components too: management software, ancillary services architecture and user applications. In January 2017, those components were completely overhauled and a new, completely separate cluster was kick-started, using a different set of hardware and software, while conserving the same storage infrastructure, to ease the transition process.

After a transition period, the older Sherlock hardware, compute and login nodes, have been merged into the new cluster, and from a logical perspective (connection, job scheduling and computing resources), nodes attached to each of the fabrics have been reunited to form a single cluster again.

As Sherlock continues to evolve and grow, the new fabric will also approach capacity again, and the same process will happen again to start the next generation of Sherlock.

","location":"docs/concepts/#cluster-generations"},{"title":"Maintenances and upgrades","text":"

The SRCC institutes a monthly scheduled maintenance window on Sherlock, to ensure optimal operation, avoid potential issues and prepare for future expansions. This window will be used to make hardware repairs, software and firmware updates, and perform general manufacturer recommended maintenance on our environment.

As often as possible, maintenance tasks are performed in a rolling, non-disruptive fashion, but downtimes are sometimes an unfortunate necessity to allow disruptive operations that can't be conducted while users are working on the system.

Maintenance schedule

As often as possible, maintenances will take place on the first Tuesday of every month, from 08:00 to 12:00 Pacific time (noon), and will be announced 2 weeks in advance, through the usual communication channels.

In case an exceptional amount of work is required, the maintenance window could be extended to 10 hours (from 08:00 to 18:00).

During these times, access to Sherlock will be unavailable, login will be disabled and jobs won't run. A reservation will be placed in the scheduler so running jobs can finish before the maintenance, and jobs that wouldn't finish by the maintenance window would be pushed after it.

","location":"docs/concepts/#maintenances-and-upgrades"},{"title":"Common questions","text":"

Q: Why do maintenance at all?

A: Due to the scale of our computing environment and the increasing complexity of the systems we deploy, it is prudent to arrange for a regular time when we can comfortably and without pressure fix problems or update facilities with minimal impact to our customers. Most, if not all, major HPC centers have regular maintenance schedules. We also need to enforce the Minimum Security rules instituted by the Stanford Information Security Office, which mandate deployment of security patches in a timely manner.

Q: Why Tuesdays 08:00-12:00? Why not do this late at night?

A: We have observed that the least busy time for our services is at the beginning of the week in the morning hours. Using this time period should not interrupt most of our users. If the remote possibility of a problem that extends past the scheduled downtime occurs, we would have our full staff fresh and available to assist in repairs and quickly restore service.

Q: I have jobs running, what will happen to them?

A: For long-running jobs, we strongly recommend checkpointing your results on a periodic basis. Besides, we will place a reservation in the scheduler for each maintenance that would prevent jobs from running past it. This means that the scheduler will only allow jobs to run if they can finish by the time the maintenance starts. If you submit a long job shortly before the maintenance, it will be delayed until after the maintenance. That will ensure that no work is lost when the maintenance starts.

","location":"docs/concepts/#common-questions"},{"title":"About us","text":"","location":"docs/credits/"},{"title":"SRCC","text":"

The Stanford Research Computing Center (SRCC) is a joint effort of the Dean of Research and IT Services to build and support a comprehensive program to advance computational research at Stanford. That includes offering and supporting traditional high performance computing (HPC) systems, as well as systems for high throughput and data-intensive computing.

The SRCC also helps researchers transition their analyses and models from the desktop to more capable and plentiful resources, providing the opportunity to explore their data and answer research questions at a scale typically not possible on desktops or departmental servers. Partnering with national initiatives like NSF XSEDE program as well as vendors, the SRCC offers training and learning opportunities around HPC tools and technologies.

For more information, please see the SRCC website

","location":"docs/credits/#srcc"},{"title":"Credits","text":"

We would like to thank the following companies for their generous sponsorship, and for providing services and resources that help us manage Sherlock every day:

  • GitHub
  • Hund
  • Noticeable

The Sherlock website and documentation also rely on the following projects:

  • MkDocs
  • Material for MkDocs
","location":"docs/credits/#credits"},{"title":"Why the Sherlock name?","text":"

If you're curious about where the Sherlock name came from, we always considered that computing resources in general and HPC clusters in particular should be the catalyst of innovation, be ahead of their time, and spur new discoveries.

And what better account of what's happening on a high-performance computing cluster than Benedict Cumberbatch describing his role as Sherlock Holmes in the BBC's modern adaptation of Arthur Conan Doyle's classic?

Benedict Cumberbatch, about Sherlock

There's a great charge you get from playing him, because of the volume of words in your head and the speed of thought \u2013 you really have to make your connections incredibly fast. He is one step ahead of the audience, and of anyone around him with normal intellect. They can't quite fathom where his leaps are taking him.

Yes, exactly. That's Sherlock.

","location":"docs/credits/#why-the-sherlock-name"},{"title":"Sherlock, of HBO fame","text":"

And finally, we couldn't resist the pleasure of citing the most prestigious accomplishment of Sherlock to date: a mention in HBO's Silicon Valley Season 4 finale!

Yep, you got that right, Richard Hendricks wanted to use our very own Sherlock!

Kudos to the show's crew and a big thank you to HBO Data compression stars, Professor Tsachy Weissman and Dmitri Pavlichin, for this incredible Sherlock shout-out. This has been an everlasting source of pride and amazement for the whole SRCC team!

","location":"docs/credits/#sherlock-of-hbo-fame"},{"title":"Glossary","text":"","location":"docs/glossary/"},{"title":"What's a cluster?","text":"

A computing cluster is a federation of multiple compute nodes (independent computers), most commonly linked together through a high-performance interconnect network.

What makes it a \"super-computer\" is the ability for a program to address resources (such as memory, CPU cores) located in different compute nodes, through the high-performance interconnect network.

On a computing cluster, users typically connect to login nodes, using a secure remote login protocol such as SSH. Unlike in traditional interactive environments, users then need to prepare compute jobs to submit to a resource scheduler. Based on a set of rules and limits, the scheduler will then try to match the jobs' resource requirements with available resources such as CPUs, memory or computing accelerators such as GPUs. It will then execute the user defined tasks on the selected resources, and generate output files in one of the different storage locations available on the cluster, for the user to review and analyze.

","location":"docs/glossary/#whats-a-cluster"},{"title":"Cluster components","text":"

The terms that are typically used to describe cluster components could be confusing, so in an effort to clarify things, here's a schema of the most important ones, and their definition.

","location":"docs/glossary/#cluster-components"},{"title":"CPU","text":"A Central Processing Unit (CPU), or core, or CPU core, is the smallest unit in a microprocessor that can carry out computational tasks, that is, run programs. Modern processors typically have multiple cores.","location":"docs/glossary/#cpu"},{"title":"Socket","text":"A socket is the connector that houses the microprocessor. By extension, it represents the physical package of a processor, that typically contains multiple cores.","location":"docs/glossary/#socket"},{"title":"Node","text":"A node is a physical, stand-alone computer, that can handle computing tasks and run jobs. It's connected to other compute nodes via a fast network interconnect, and contains CPUs, memory and devices managed by an operating system.","location":"docs/glossary/#node"},{"title":"Cluster","text":"A cluster is the complete collection of nodes with networking and file storage facilities. It's usually a group of independent computers connected via a fast network interconnect, managed by a resource manager, which acts as a large parallel computer.","location":"docs/glossary/#cluster"},{"title":"Other commonly used terms","text":"

To make this documentation more accessible, we try to explain key terms in a non-technical way. When reading these pages, please keep in mind the following definitions, presented in alphabetical order:

","location":"docs/glossary/#other-commonly-used-terms"},{"title":"Application","text":"An application is a computer program designed to perform a group of coordinated functions, tasks, or activities for the benefit of the user. In the context of scientific computing, an application typically performs computations related to a scientific goal (molecular dynamics simulations, genome assembly, computational fluid dynamics simulations, etc).","location":"docs/glossary/#application"},{"title":"Backfill","text":"Backfill scheduling is a method that a scheduler can use in order to maximize utilization. It allows smaller (both in terms of size and time requirements), lower priority jobs to start before larger, higher priority ones, as long as doing so doesn't push back the higher-priority jobs' expected start time.","location":"docs/glossary/#backfill"},{"title":"Executable","text":"A binary (or executable) program refers to the machine-code compiled version of an application. This is a binary file that a computer can execute directly, as opposed to the application source code, which is the human-readable version of the application internal instructions, and which needs to be compiled by a compiler to produce the executable binary.","location":"docs/glossary/#executable"},{"title":"Fairshare","text":"A resource scheduler ranks jobs by priority for execution. Each job's priority in queue is determined by multiple factors, one of which is the user's fairshare score. A user's fairshare score is computed based on a target (the given portion of the resources that this user should be able to use) and the user's effective usage, i.e. the amount of resources (s)he effectively used in the past. As a result, the more resources past jobs have used, the lower the priority of the next jobs will be. Past usage is computed based on a sliding window and progressively forgotten over time. 
This enables all users on a shared resource to get a fair portion of it for their own use, by giving higher priority to users who have been underserved in the past.","location":"docs/glossary/#fairshare"},{"title":"FLOPS","text":"Floating-point Operations Per Second (FLOPS) are a measure of computing performance, and represent the number of floating-point operations that a CPU can perform each second. Modern CPUs and GPUs are capable of doing TeraFLOPS (10^12 floating-point operations per second), depending on the precision of those operations (half-precision: 16 bits, single-precision: 32 bits, double-precision: 64 bits).","location":"docs/glossary/#flops"},{"title":"GPU","text":"A Graphics Processing Unit (GPU) is a specialized device initially designed to generate graphical output. On modern computing architectures, GPUs are used to accelerate certain types of computation, at which they are much faster than CPUs. GPUs have their own memory, and are attached to CPUs, within a node. Each compute node can host one or more GPUs.","location":"docs/glossary/#gpu"},{"title":"HPC","text":"High Performance Computing (HPC) refers to the practice of aggregating computing power to achieve higher performance than would be possible by using a typical computer.","location":"docs/glossary/#hpc"},{"title":"Infiniband","text":"Infiniband is a networking standard that features high bandwidth and low latency. The current Infiniband devices are capable of transferring data at up to 200 Gbits/sec with less than a microsecond latency. 
As of this writing, the popular Infiniband versions are HDR (High Data Rate) with 200 Gbits/sec and EDR (Enhanced Data Rate) with 100 Gbits/sec.","location":"docs/glossary/#infiniband"},{"title":"IOPS","text":"Input/output operations per second (IOPS, pronounced eye-ops) is an input/output performance measurement used to characterize computer storage system performance.","location":"docs/glossary/#iops"},{"title":"Job","text":"A job, or batch job, is the scheduler\u2019s base unit of computing by which resources are allocated to a user for a specified amount of time. Users create job submission scripts to ask the scheduler for resources such as cores, memory, runtime, etc. The scheduler puts the requests in a queue and allocates requested resources based on jobs\u2019 priority.","location":"docs/glossary/#job"},{"title":"Job step","text":"Job steps are sets of (possibly parallel) tasks within a job","location":"docs/glossary/#job-step"},{"title":"Login nodes","text":"

Login nodes are points of access to a compute cluster. Users usually connect to login nodes via SSH to compile and debug their code, review their results, do some simple tests, and submit their batch jobs to the parallel computer.

Login nodes are not for computing

Login nodes are usually shared among many users and therefore must not be used to run computationally intensive tasks. Those should be submitted to the scheduler which will dispatch them on compute nodes.

","location":"docs/glossary/#login-nodes"},{"title":"Modules","text":"Environment modules, or software modules, are a type of software management tool used on in most HPC environments. Using modules enable users to selectively pick the software that they want to use and add them to their environment. This allows to switch between different versions or flavors of the same software, pick compilers, libraries and software components and avoid conflicts between them.","location":"docs/glossary/#modules"},{"title":"MPI","text":"Message Passing Interface (MPI) is a standardized and portable message-passing system designed to exchange information between processes running on different nodes. There are several implementations of the MPI standard, which is the most common way used to scale parallel applications beyond a single compute node.","location":"docs/glossary/#mpi"},{"title":"OpenMP","text":"Open Multi Processing (OpenMP) is a parallel programming model designed for shared memory architecture. It's based on pragmas that can be added in applications to let the compiler generate a code that can run on multiple cores, within the same node.","location":"docs/glossary/#openmp"},{"title":"Partition","text":"

A partition is a set of compute nodes within a cluster with a common feature. For example, compute nodes with GPU, or compute nodes belonging to the same owner, could form a partition.

On Sherlock, you can see detailed partition information with the sh_part or sinfo commands.

","location":"docs/glossary/#partition"},{"title":"QOS","text":"A Quality Of Service (QOS) is the set of rules and limitations that apply to a categories of job. The combination of a partition (set of machines where a job can run) and QOS (set of rules that applies to that job) makes what is often referred to as a scheduler queue.","location":"docs/glossary/#qos"},{"title":"Run time","text":"The run time, or walltime, of a job is the time required to finish its execution.","location":"docs/glossary/#run-time"},{"title":"Scheduler","text":"The goal of a job scheduler is to find the appropriate resources to run a set of computational tasks in the most efficient manner. Based on resource requirements and job descriptions, it will prioritize those jobs, allocate resources (nodes, CPUs, memory) and schedule their execution.","location":"docs/glossary/#scheduler"},{"title":"Slurm","text":"Simple Linux Utility for Resource Management (SLURM) is a software that manages computing resources and schedule tasks on them. Slurm coordinates running of many programs on a shared facility and makes sure that resources are used in an optimal manner.","location":"docs/glossary/#slurm"},{"title":"SSH","text":"Secure Shell (SSH) is a protocol to securely access remote computers. Based on the client-server model, multiple users with an SSH client can access a remote computer. Some operating systems such as Linux and Mac OS have a built-in SSH client and others can use one of many publicly available clients.","location":"docs/glossary/#ssh"},{"title":"Thread","text":"A process, in the simplest terms, is an executing program. One or more threads run in the context of the process. A thread is the basic unit to which the operating system allocates processor time. A thread can execute any part of the process code, including parts currently being executed by another thread. 
Threads are co-located on the same node.","location":"docs/glossary/#thread"},{"title":"Task","text":"In the Slurm context, a task is to be understood as a process. A multi-process program is made of several tasks. A task is typically used to schedule a MPI process, that in turn can use several CPUs. By contrast, a multi-threaded program is composed of only one task, which uses several CPUs.","location":"docs/glossary/#task"},{"title":"Ordering nodes on Sherlock","text":"

For research groups needing access to additional, dedicated computing resources on Sherlock, we offer the possibility for PIs to purchase their own compute nodes to add to the cluster.

Operating costs for managing and housing PI-purchased compute nodes are waived in exchange for letting other users make use of any idle compute cycles on the PI-owned nodes. Owners have priority access to the computing resources they purchase, but can access more nodes for their research if they need to. This provides the PI with much greater flexibility than owning a standalone cluster.

","location":"docs/orders/"},{"title":"Conditions","text":"","location":"docs/orders/#conditions"},{"title":"Service term","text":"

Compute nodes are purchased for a duration of 4 years

Compute nodes are purchased and maintained based on a 4-year lifecycle, which is the duration of the equipment warranty and vendor support.

Owners will be notified during the 4th year that their nodes' lifetime is about to reach its term, at which point they'll be welcome to either:

  • renew their investment by purchasing new nodes,
  • continue to use the public portion of Sherlock's resources.

At the end of their service term, compute nodes are physically retired from the cluster, to make room for new equipment. Compute nodes may be kept running for an additional year at most after the end of their service term, while PIs plan for equipment refresh. Nodes failing during this period may not be repaired, and failed hardware will be disabled or removed from the system.

Please note that outside of exceptional circumstances, nodes purchased in Sherlock cannot be removed from the cluster before the end of their service term.

","location":"docs/orders/#service-term"},{"title":"Shared ownership","text":"

Minimum order of one node per PI

The number of nodes in a shared order must be greater than or equal to the number of purchasing PI groups.

For operational, administrative as well as usability reasons, we do not support shared ownership of equipment, meaning that multiple PI groups cannot purchase and share a single compute node. Shared orders have a minimum of one node per purchasing PI group.

","location":"docs/orders/#shared-ownership"},{"title":"Compute nodes catalog","text":"

SRCC offers a select number of compute node configurations that have been tested and validated on Sherlock and that aim to cover most computing needs.

Sherlock catalog

Complete details are available in the Sherlock compute nodes catalog 3

","location":"docs/orders/#compute-nodes-catalog"},{"title":"Configurations","text":"

We try to provide hardware configurations that can cover the needs and requirements of a wide range of computing applications, in various scientific fields, and to propose a spectrum of pricing tiers, as shown in the table below:

Type Description Recommended usage Price range CBASE Base configuration Best per-core performance for serial applications, multi-threaded (OpenMP) and distributed (MPI) applications. Most flexible and cost-effective configuration $ CPERF High-core count configuration Multi-threaded applications requiring higher numbers of CPU cores $$ CBIGMEM Large-memory configuration Serial or multi-threaded applications requiring terabytes of memory (genome assembly, etc...) $$$$ G4FP32 Base GPU configuration Single-precision (FP32) GPU-accelerated applications (CryoEM, MD...) with low GPU memory requirements $$ G4FP64 HPC GPU configuration AI, ML/DL and GPU-accelerated HPC codes requiring double-precision (FP64) and larger amounts of GPU memory $$$ G4TF64G8TF64 Best-in-class GPU configuration AI, ML/DL and GPU-accelerated HPC codes requiring double-precision (FP64), large amounts of GPU memory, and heavy multi-GPU scaling $$$$ Choosing the best node configuration for your needs

Although some configurations may appear cheaper when looking at the dollar/core ratio, this is not the only point to consider when determining the best configuration for your workload.

Performance per core

There are other factors to take into account, notably the memory and I/O bandwidth per core, which could be lower on higher core-count configurations like CPERF. With multiple times more cores than CBASE, they still provide the same total amount of bandwidth to remote and local storage, as well as, to a lesser extent, to memory. Higher core-count CPUs also often offer lower core frequencies, which combined with less bandwidth per core, may result in lower performance for serial jobs.

CPERF nodes are an excellent fit for multi-threaded applications that don't span multiple nodes. But for more diverse workloads, they don't offer the same level of flexibility as the CBASE nodes, which can run a mix of serial, multi-threaded and MPI applications equally well.

Resources availability

Another important factor to take into account is that fewer nodes for a given number of cores offer less resilience against potential hardware failures: if a 128-core node becomes unavailable for some reason, that's 128 cores that nobody can use while the node is being repaired. But with 128 cores in 4x 32-core nodes, if a node fails, there are still 96 cores that can be used.

We'll be happy to help you determine the best configuration for your computing needs, feel free to reach out to schedule a consultation.

Configuration details for the different compute node types are listed in the Sherlock compute nodes catalog 3

","location":"docs/orders/#configurations"},{"title":"Prices","text":"

Prices for the different compute node types are listed in the Sherlock compute nodes catalog 3. They include tax and shipping fees, and are subject to change when quoted: they tend to follow the market-wide variations induced by global political and economic events, which are way outside of our control. Prices are provided there as a guideline for expectations.

There are two components in the cost of a compute node purchase:

  1. the cost of the hardware itself (capital purchase),

  2. a one-time, per-node infrastructure fee1 that will be charged to cover the costs of connecting the nodes to the cluster infrastructure (racks, PDUs, networking switches, cables...)

No recurring fees

There is currently no recurring fee associated with purchasing compute nodes on Sherlock. In particular, there is no CPU.hour charge, purchased nodes are available to their owners 100% of the time, at no additional cost.

Currently, there are no user, administrative or management fees associated with ongoing system administration of the Sherlock environment. However, PIs should anticipate the eventuality of modest system administration and support fees being levied within the 4 year lifetime of their compute nodes.

","location":"docs/orders/#prices"},{"title":"Purchasing process","text":"

Minimum purchase

Please note that the minimum purchase is one physical server per PI group. We cannot accommodate multiple PIs pooling funds for a single node.

Single-node orders may incur additional delays

Some node configurations need to be ordered from the vendor by sets of 4 nodes (see the Sherlock catalog for details). So orders for quantities that are not multiples of 4 will need to be grouped with other PIs' orders, which may incur additional delays.

Purchasing nodes on Sherlock is usually a 5-step process:

  1. the PI uses the order form to submit an order,
  2. SRCC requests a formal vendor quote to finalize pricing and communicate it back to the PI for approval,
  3. SRCC submits a Stanford PO to the vendor,
  4. SRCC takes delivery of the hardware and proceeds to its installation,
  5. SRCC notifies the PI that their nodes are ready to be used.

The typical delay between a PO submission to the vendor and the availability of the compute nodes to the PIs is usually between 4 and 8 weeks.

Supply chain disruption and component shortages

Global supply chain issues and component shortages have considerably increased lead times, and compute node deliveries are currently in the 6-month range.

","location":"docs/orders/#purchasing-process"},{"title":"Required information","text":"

To place an order, we'll need the following information:

  • The SUNet ID of the PI making the purchase request
  • A PTA2 number to charge the hardware (capital) portion of the purchase
  • A PTA2 number to charge the per-node infrastructure fees (non-capital). It could be the same PTA used for the capital portion of the purchase, or a different one

Hardware costs could be spread over multiple PTAs (with a maximum of 2 PTAs per order). But please note that the infrastructure fees have to be charged to a single PTA.

","location":"docs/orders/#required-information"},{"title":"Placing an order","text":"

To start ordering compute nodes for Sherlock:

check the Sherlock catalog 3 to review prices and select your configurations

Choose

fill in the order form 3 to submit your request and provide the required information

Order

And we'll be in touch shortly!

  1. infrastructure fees are considered non-capital for cost accounting purposes and may incur indirect cost burdens on cost-reimbursable contracts and grants.\u00a0\u21a9

  2. PTA is an acronym used for a Project-Task-Award combination representing an account in the Stanford Financial system.\u00a0\u21a9\u21a9

  3. SUNet ID required, document restricted to @stanford.edu accounts.\u00a0\u21a9\u21a9\u21a9\u21a9\u21a9

","location":"docs/orders/#placing-an-order"},{"title":"Tags","text":"

Here is a list of documentation tags:

","location":"docs/tags/"},{"title":"advanced","text":"
  • Node features
","location":"docs/tags/#advanced"},{"title":"connection","text":"
  • Connection options
  • Connecting
  • Data transfer
","location":"docs/tags/#connection"},{"title":"slurm","text":"
  • Job management
  • Node features
  • Submitting jobs
  • Running jobs
","location":"docs/tags/#slurm"},{"title":"tech","text":"
  • Technical specifications
  • Facts
","location":"docs/tags/#tech"},{"title":"Advanced connection options","text":"","location":"docs/advanced-topics/connection/","tags":["connection"]},{"title":"Login nodes","text":"

Sherlock login nodes are regrouped behind a single DNS alias: login.sherlock.stanford.edu.

This alias provides a load-balanced login environment, and the assurance that you will be connected to the least loaded login node when you connect to Sherlock.

If for any reason, you want to directly connect to a specific login node and bypass the automatic load-balanced dispatching of new connections (which we don't recommend), you can use that login node's hostname explicitly. For instance:

$ ssh <sunetid>@ln21.sherlock.stanford.edu\n

This can be useful if you run long-standing processes on the login nodes, such as screen or tmux sessions. To get back to them when you reconnect to Sherlock, you will need to log in to the same login node you started them on.

The drawback is that by connecting to a specific login node, you will forfeit the load-balancing benefits, which could result in a crowded environment, or even in login errors in case that specific login node is unavailable.

","location":"docs/advanced-topics/connection/#login-nodes","tags":["connection"]},{"title":"Authentication methods","text":"

Public-key authentication

SSH public-key authentication is not supported on Sherlock.

","location":"docs/advanced-topics/connection/#authentication-methods","tags":["connection"]},{"title":"Password (recommended)","text":"

The recommended way to authenticate to Sherlock is to simply use your SUNet ID and password, as described in the Connecting page.

Passwords are not stored on Sherlock. Sherlock login nodes will delegate password authentication to the University central Kerberos service.

","location":"docs/advanced-topics/connection/#password-recommended","tags":["connection"]},{"title":"GSSAPI","text":"

For compatibility with previous generations of Sherlock, GSSAPI1 authentication is still allowed, and could be considered a more convenient option, as this mechanism doesn't require entering your password for each connection.

GSSAPI authentication relies on a token system, where users obtain Kerberos ticket-granting tickets, transmit them via SSH to the server they want to connect to, which will, in turn, verify their validity. That way, passwords are never stored locally, and never transit over the network. That's why Kerberos is usually considered the most secure method to authenticate.

To connect using GSSAPI on Sherlock, you'll need to go through a few steps2:

  1. make sure the Kerberos user tools are installed on your local machine. You'll need the kinit (and optionally klist and kdestroy) utilities. Please refer to your OS documentation to install them if required.

  2. download and install the Stanford krb5.conf file, which contains information about the Stanford Kerberos environment:

    $ sudo curl -o /etc/krb5.conf https://web.stanford.edu/dept/its/support/kerberos/dist/krb5.conf\n
  3. configure your SSH client, by modifying (or creating if it doesn't exist already) the .ssh/config file in your home directory on your local machine. Using a text editor, you can add the following lines to your ~/.ssh/config file (indentation is important):

    Host login.sherlock.stanford.edu\n    GSSAPIDelegateCredentials yes\n    GSSAPIAuthentication yes\n

Once everything is in place (you only need to do this once), you'll be able to test that your Kerberos installation works by running kinit <sunetid>@stanford.edu. You should get a password prompt, and upon success, you'll be able to list your Kerberos credentials with the klist command:

$ kinit kilian@stanford.edu\nPassword for kilian@stanford.edu:\n$ klist\nTicket cache: FILE:/tmp/krb5cc_215845_n4S4I6KgyM\nDefault principal: kilian@stanford.edu\n\nValid starting     Expires            Service principal\n07/28/17 17:33:54  07/29/17 18:33:32  krbtgt/stanford.edu@stanford.edu\n        renew until 08/04/17 17:33:32\n

Kerberos ticket expiration

Kerberos tickets have a 25-hour lifetime. So you'll need to run the kinit command pretty much once a day to continue being able to authenticate to Sherlock.

Please note that when your Kerberos ticket expires, existing Sherlock connections will not be interrupted. So you'll be able to keep connections open to Sherlock for several days without any issue.

You're now ready to connect to Sherlock using GSSAPI. Simply SSH as usual:

$ ssh <sunetid>@login.sherlock.stanford.edu\n

and if everything goes well, you should directly see the two-factor (Duo) prompt, without having to enter your password.

If you want to destroy your Kerberos ticket before its expiration, you can use the kdestroy command.

","location":"docs/advanced-topics/connection/#gssapi","tags":["connection"]},{"title":"SSH options","text":"

OpenSSH offers a variety of configuration options that you can use in ~/.ssh/config on your local computer. The following section describes some of the options you can use with Sherlock that may make connecting and transferring files more convenient.

","location":"docs/advanced-topics/connection/#ssh-options","tags":["connection"]},{"title":"Avoiding multiple Duo prompts","text":"

In order to avoid getting a second-factor (Duo) prompt every time you want to open a new connection to Sherlock, you can take advantage of the multiplexing features provided by OpenSSH.

Simply add the following lines to your ~/.ssh/config file on your local machine to activate the ControlMaster option. If you already have a Host login.sherlock.stanford.edu block in your configuration file, simply add the Control* option lines in the same block.

Host login.sherlock.stanford.edu\n    ControlMaster auto\n    ControlPath ~/.ssh/%l%r@%h:%p\n

It will allow SSH to re-use an existing connection to Sherlock each time you open a new session (create a new SSH connection), thus avoiding subsequent 2FA prompts once the initial connection is established.

The slight disadvantage of this approach is that once you have a connection open to one of Sherlock's login nodes, all your subsequent connections will be using the same login node. This will somewhat defeat the purpose of the load-balancing mechanism used by the login nodes.

Connection failure with unix_listener error

If your connection fails with the following error message:

unix_listener: \"...\" too long for Unix domain socket\n
you're being hit by a macOS limitation, and you should replace the ControlPath line above by:
ControlPath ~/.ssh/%C\n

","location":"docs/advanced-topics/connection/#avoiding-multiple-duo-prompts","tags":["connection"]},{"title":"Connecting from abroad","text":"

VPN

As a good security practice, we always recommend using the Stanford VPN when connecting from untrusted networks.

Access to Sherlock is not restricted to campus, meaning that you can connect to Sherlock from pretty much anywhere, including when traveling abroad. We don't restrict inbound SSH connections to any specific IP address range or geographical location, so you shouldn't have any issue reaching the login nodes from anywhere.

Regarding two-step authentication, University IT provides alternate authentication options when phone service or Duo Mobile push notifications are not available.

  1. The Generic Security Service Application Program Interface (GSSAPI, also GSS-API) is an application programming interface for programs to access security services. It allows programs to interact with security services such as Kerberos for user authentication.\u00a0\u21a9

  2. Those instructions should work on Linux and macOS computers. For Windows, we recommend using the WSL, as described in the Prerequisites page.\u00a0\u21a9

","location":"docs/advanced-topics/connection/#connecting-from-abroad","tags":["connection"]},{"title":"Job management","text":"","location":"docs/advanced-topics/job-management/","tags":["slurm"]},{"title":"Job submission limits","text":"

You may have encountered situations where your jobs get rejected at submission with errors like this:

sbatch: error: MaxSubmitJobsPerAccount\nsbatch: error: MaxSubmitJobsPerUser\n

There are a number of limits on Sherlock, that are put in place to guarantee that all of the users can have fair access to resources and a smooth experience while using them. One of those limits is about the total number of jobs a single user (and a single group) can have in queue at any given time. This helps ensure that the scheduler is able to continue operating in an optimal fashion, without being overloaded by a single user or group.

To see the job submission limits on Sherlock run the sh_part command.

To run longer than 2 days on the normal partition you will need to add the \"long\" QOS to your submission scripts. For example to run for exactly 3 days add the following two lines to your sbatch script:

#SBATCH --time=3-00:00:00\n#SBATCH --qos=long\n

If you have access to an owners partition you will not need to add this QOS since the MaxWall on owners is 7 days.

","location":"docs/advanced-topics/job-management/#job-submission-limits","tags":["slurm"]},{"title":"Minimizing the number of jobs in queue","text":"

It's generally a good practice to try reducing the number of jobs submitted to the scheduler, and depending on your workflow, there are various approaches for this. One solution may be to pack more work within a single job, which could help in reducing the overall number of jobs you'll have to submit.

Imagine you have a 100-task array job, where you run 1 app task per array item, which looks like this:

#!/bin/bash\n#SBATCH --array=1-100\n#SBATCH -n 1\n\n./app ${SLURM_ARRAY_TASK_ID}\n

This script would create 100 jobs in queue (even though they would all be regrouped under the same job array), each using 1 CPU to run 1 task.

Instead of that 100-task array job, you can try something like this:

#!/bin/bash\n#SBATCH --array=0-99:10\n#SBATCH -n 10\n\nfor i in {0..9}; do\n\u00a0 \u00a0 srun -n 1 ./app $((SLURM_ARRAY_TASK_ID+i)) &\ndone\n\nwait # important to make sure the job doesn't exit before the background tasks are done\n
  • --array=0-99:10 will use job array indexes 0, 10, 20 ... 90
  • -n 10 will make sure each job can be subdivided in 10 1-CPU steps
  • the for loop will launch 10 tasks, with indexes from SLURM_ARRAY_TASK_ID to SLURM_ARRAY_TASK_ID + 9.

This would submit a 10-task array job, each of them running 10 steps simultaneously, on the 10 CPUs that each job array item will be allocated.

In the end, you'll have run the same number of app instances, but you'll have divided the number of jobs submitted by 10, which will allow you to submit the same amount of work to the scheduler, while staying under the submission limits.

","location":"docs/advanced-topics/job-management/#minimizing-the-number-of-jobs-in-queue","tags":["slurm"]},{"title":"Node features","text":"

In heterogeneous environments, computing resources are often grouped together into single pools of resources, to make things easier and more accessible. Most applications can run on any type of hardware, so having all resources regrouped in the same partitions maximizes utilization and makes job submission much easier, as users don't have dozens of options to choose from.

But for more specific use cases, it may be necessary to specifically select the hardware jobs will run on, either for performance or reproducibility purposes.

To that end, all the compute nodes on Sherlock have feature tags assigned to them. Multiple characteristics are available for each node, such as their class, CPU manufacturer, generation, part number and frequency, as well as Infiniband and GPU characteristics.

Requiring specific node features is generally not necessary

Using node features is an advanced topic which is generally not necessary to run simple jobs on Sherlock. If you're just starting, you most likely don't need to worry about those, they're only useful in very specific cases.

","location":"docs/advanced-topics/node-features/","tags":["slurm","advanced"]},{"title":"Available features","text":"

The table below lists the possible features defined for each node.

Feature name Description Examples CLASS:xxx Node type, as defined in the Sherlock catalog CLASS:SH3_CBASE, CLASS:SH3_G4TF64 CPU_MNF:xxx CPU manufacturer CPU_MNF:INTEL, CPU_MNF:AMD CPU_GEN:xxx CPU generation CPU_GEN:RME for AMD Rome, CPU_GEN:SKX for Intel Skylake CPU_SKU:xxx CPU name CPU_SKU:5118, CPU_SKU:7502P CPU_FRQ:xxx CPU core base frequency CPU_FRQ:2.50GHz, CPU_FRQ:2.75GHz GPU_BRD:xxx GPU brand GPU_BRD:GEFORCE, GPU_BRD:TESLA GPU_GEN:xxx GPU generation GPU_GEN:VLT for Volta, GPU_GEN:AMP for Ampere GPU_SKU:xxx GPU name GPU_SKU:A100_SXM4, GPU_SKU:RTX_3090 GPU_MEM:xxx GPU memory GPU_MEM:32GB, GPU_MEM:80GB GPU_CC:xxx GPU Compute Capabilities GPU_CC:6.1, GPU_CC:8.0 IB:xxx Infiniband generation/speed IB:EDR, IB:HDR NO_GPU special tag set on CPU-only nodes","location":"docs/advanced-topics/node-features/#available-features","tags":["slurm","advanced"]},{"title":"Listing the features available in a partition","text":"

All the node features available in a partition can be listed with the sh_node_feat command.

For instance, to list all the GPU types in the gpu partition:

$ sh_node_feat -p gpu | grep GPU_SKU\nGPU_SKU:P100_PCIE\nGPU_SKU:P40\nGPU_SKU:RTX_2080Ti\nGPU_SKU:V100_PCIE\nGPU_SKU:V100S_PCIE\nGPU_SKU:V100_SXM2\n

To list all the CPU generations available in the normal partition:

$ sh_node_feat -p normal | grep CPU_GEN\nCPU_GEN:BDW\nCPU_GEN:MLN\nCPU_GEN:RME\nCPU_GEN:SKX\n
","location":"docs/advanced-topics/node-features/#listing-the-features-available-in-a-partition","tags":["slurm","advanced"]},{"title":"Requesting specific node features","text":"

Those node features can be used in job submission options, as additional constraints for the job, so that the scheduler will only select nodes that match the requested features.

Adding job constraints often increases job pending times

It's important to keep in mind that requesting specific node features usually increases job pending times in queue. The more constraints the scheduler has to satisfy, the smaller the pool of compute nodes jobs can run on, hence the longer it may take for the scheduler to find eligible resources to run those jobs.

To specify a node feature as a job constraint, the -C/--constraint option can be used.

For instance, to submit a job that should only run on an AMD Rome CPU, you can add the following to your job submission options:

#SBATCH -C CPU_GEN:RME\n

Or to make sure that your training job will run on a GPU with 80GB of GPU memory:

#SBATCH -G 1\n#SBATCH -C GPU_MEM:80GB\n
","location":"docs/advanced-topics/node-features/#requesting-specific-node-features","tags":["slurm","advanced"]},{"title":"Multiple constraints","text":"

For more complex cases, multiple constraints could be composed in different ways, using logical operators.

Many node feature combinations are impossible to satisfy

Many combinations will result in impossible conditions, and will make jobs impossible to run on any node. The scheduler is usually able to detect this and reject the job at submission time.

For instance, submitting a job requesting an Intel CPU on the HDR IB fabric:

#SBATCH -C 'CPU_MNF:INTEL&IB:HDR'\n

will result in the following error:

error: Job submit/allocate failed: Requested node configuration is not available\n

as all the compute nodes on the HDR IB fabric use AMD CPUs. Constraints must be used carefully and sparingly to avoid unexpected surprises.

Some of the possible logical operations between constraints are listed below:

","location":"docs/advanced-topics/node-features/#multiple-constraints","tags":["slurm","advanced"]},{"title":"AND","text":"

Only nodes with all the requested features are eligible to run the job. The ampersand sign (&) is used as the AND operator. For example:

#SBATCH -C 'GPU_MEM:32GB&IB:HDR'\n

will request a GPU with 32GB of memory on the HDR Infiniband fabric to run the job.

","location":"docs/advanced-topics/node-features/#and","tags":["slurm","advanced"]},{"title":"OR","text":"

Only nodes with at least one of the specified features will be eligible to run the job. The pipe sign (|) is used as the OR operator.

In multi-node jobs, it means that nodes allocated to the job may end up having different features. For example, the following options:

#SBATCH -N 2\n#SBATCH -C \"CPU_GEN:RME|CPU_GEN:MLN\"\n

may result in a two-node job where one node has an AMD Rome CPU, and the other node has an AMD Milan CPU.

","location":"docs/advanced-topics/node-features/#or","tags":["slurm","advanced"]},{"title":"Matching OR:","text":"

When you need all nodes in a multi-node job to have the same set of features, a matching OR condition can be defined by enclosing the options within square brackets ([,]).

For instance, the following options may be used to request a job to run on nodes with the same frequency, either 2.5 GHz or 2.75 GHz:

#SBATCH -C \"[CPU_FRQ:2.50GHz|CPU_FRQ:2.75GHz]\"\n

Node features are text tags

Node features are text tags, they have no associated numerical value, meaning that they can't be compared.

For instance, it's not possible to add a constraint for GPU Compute Capabilities greater than 8.0. The workaround is to add a job constraint that satisfies all the possible values of that tag, like:

#SBATCH -C \"GPU_CC:8.0|GPU_CC:8.6\"\n

For more information, complete details about the --constraint/-C job submission option and its syntax can be found in the official Slurm documentation.

","location":"docs/advanced-topics/node-features/#matching-or","tags":["slurm","advanced"]},{"title":"Getting started","text":"","location":"docs/getting-started/"},{"title":"Prerequisites","text":"

To start using Sherlock, you will need:

  • an active SUNet ID,

    What is a SUNet ID?

    A SUNet ID is a unique 3-8 character account name that identifies you as a member of the Stanford community, with access to the Stanford University Network of computing resources and services. Not to be confused with University ID (an 8-digit number that appears on your Stanford ID Card), your SUNet ID is a permanent and visible part of your Stanford identity and often appears in your Stanford email address (e.g. sunetid@stanford.edu).

    SUNet IDs are not managed by Research Computing. For more information, see https://accounts.stanford.edu/

    SUNet ID service levels and external collaborators

    Base-level service is sufficient for Sherlock accounts. External collaborators, or users without a SUNet ID, can be sponsored by a PI and get a sponsored SUNet ID at no cost. Please see the sponsorship page for more information.

  • a Sherlock account,

  • a SSH client,
  • good understanding of the concepts and terms used throughout this documentation,
  • some familiarity with Unix/Linux command-line environments, and notions of shell scripting.
","location":"docs/getting-started/#prerequisites"},{"title":"How to request an account","text":"

To request an account, the sponsoring Stanford faculty member should email srcc-support@stanford.edu, specifying the names and SUNet IDs of his/her research team members needing an account.

Sherlock is open to the Stanford community as a computing resource to support departmental or sponsored research, thus a faculty member's explicit consent is required for account requests.

Sherlock is a resource for research

Sherlock is a resource to help and support research, and is not suitable for course work, class assignments or general-use training sessions.

There is no fee associated with using Sherlock, and no limit on the number of accounts each faculty member can request. We will periodically ensure that all accounts associated with each PI are still active, and reserve the right to close any Sherlock account whose SUNet ID is expired.

","location":"docs/getting-started/#how-to-request-an-account"},{"title":"SSH clients","text":"","location":"docs/getting-started/#ssh-clients"},{"title":"Linux","text":"

Linux distributions usually come with a version of the OpenSSH client already installed. So no additional software installation is required. If not, please refer to your distribution's documentation to install it.

","location":"docs/getting-started/#linux"},{"title":"macOS","text":"

macOS systems usually come with a version of the OpenSSH client already installed. So no additional software installation is required.

","location":"docs/getting-started/#macos"},{"title":"Windows","text":"

Microsoft Windows includes a SSH client by default, that can be used to connect to Sherlock from a Windows terminal.

Windows also has a feature called the \"Windows Subsystem for Linux\" (WSL), which provides a Linux-like experience and makes switching across systems more seamless. Please refer to the official documentation or this HOWTO for installation instructions.

The two options above will ensure the best compatibility with the Sherlock environment. If you'd like to explore other avenues, many other SSH client implementations are available, but have not necessarily been tested with Sherlock, so your mileage may vary.

","location":"docs/getting-started/#windows"},{"title":"Unix/Linux resources","text":"

A full tutorial on using Unix/Linux is beyond the scope of this documentation. However, there are many tutorials for beginning to use Unix/Linux on the web.

A few tutorials we recommend are:

  • Unix Tutorial for Beginners (University of Surrey, UK)
  • Introduction to Unix (Imperial College, London)
  • The Unix Shell (Software Carpentry)

More specifically about HPC and Research Computing:

  • Intro to HPC (HPC Carpentry)
  • HPC in a day (Software Carpentry)
  • Research Computing Q&A (Ask.Cyberinfrastructure)
","location":"docs/getting-started/#unixlinux-resources"},{"title":"Text editors","text":"

Multiple text editors are available on Sherlock. For beginners, we recommend the use of nano. And for more advanced uses, you'll also find below some resources about using vim:

  • nano guide (Gentoo wiki)
  • vim guide (Gentoo wiki)

Note: you can also create/edit files with the Sherlock OnDemand File editor

","location":"docs/getting-started/#text-editors"},{"title":"Shell scripting","text":"

Compute jobs launched on Sherlock are most often initialized by user-written shell scripts. Beyond that, many common operations can be simplified and automated using shell scripts.

For an introduction to shell scripting, you can refer to:

  • Bash Programming - Introduction HOWTO
","location":"docs/getting-started/#shell-scripting"},{"title":"Connecting to Sherlock","text":"

Sherlock account required

To be able to connect to Sherlock, you must first obtain a Sherlock account.

","location":"docs/getting-started/connecting/","tags":["connection"]},{"title":"Credentials","text":"

All users must have a Stanford SUNet ID and a Sherlock account to log in to Sherlock. Your Sherlock account uses the same username/password as your SUNet ID:

Username: SUNet ID\nPassword: SUNet ID password\n

To request a Sherlock account, please see the Prerequisites page.

Resetting passwords

Sherlock does not store your SUNet ID password. As a consequence, we are unable to reset your password. If you require password assistance, please see the SUNet Account page.

","location":"docs/getting-started/connecting/#credentials","tags":["connection"]},{"title":"Connection","text":"

Access to Sherlock is provided via Secure Shell (SSH) login. Most Unix-like operating systems provide an SSH client by default that can be accessed by typing the ssh command in a terminal window.

To login to Sherlock, open a terminal and type the following command, where <sunetid> should be replaced by your actual SUNet ID:

$ ssh <sunetid>@login.sherlock.stanford.edu\n

Upon logging in, you will be connected to one of Sherlock's load-balanced login nodes. You should be automatically directed to the least-loaded login node at the moment of your connection, which should give you the best possible environment to work.

","location":"docs/getting-started/connecting/#connection","tags":["connection"]},{"title":"Host keys","text":"

Upon your very first connection to Sherlock, you will be greeted by a warning such as:

The authenticity of host 'login.sherlock.stanford.edu' can't be established.\nECDSA key fingerprint is SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg.\nAre you sure you want to continue connecting (yes/no)?\n

The same warning will be displayed if you try to connect to one of the Data Transfer Nodes (DTN):

The authenticity of host 'dtn.sherlock.stanford.edu' can't be established.\nECDSA key fingerprint is SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg.\nAre you sure you want to continue connecting (yes/no)?\n

This warning is normal: your SSH client warns you that it is the first time it sees that new computer. To make sure you are actually connecting to the right machine, you should compare the ECDSA key fingerprint shown in the message with one of the fingerprints below:

Key type Key Fingerprint RSA SHA256:T1q1Tbq8k5XBD5PIxvlCfTxNMi1ORWwKNRPeZPXUfJA (legacy format: f5:8f:01:46:d1:f9:66:5d:33:58:b4:82:d8:4a:34:41) ECDSA SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg (legacy format: 70:4c:76:ea:ae:b2:0f:81:4b:9c:c6:5a:52:4c:7f:64)

If they match, you can proceed and type \u2018yes\u2019. Your SSH program will then store that key and will verify it for every subsequent SSH connection, to make sure that the server you're connecting to is indeed Sherlock.

","location":"docs/getting-started/connecting/#host-keys","tags":["connection"]},{"title":"Host keys warning","text":"

If you've connected to Sherlock 1.0 before, there's a good chance the Sherlock 1.0 keys were stored by your local SSH client. In that case, when connecting to Sherlock 2.0 using the sherlock.stanford.edu alias, you will be presented with the following message:

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n@ WARNING: POSSIBLE DNS SPOOFING DETECTED! @\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\nThe RSA host key for sherlock.stanford.edu has changed, and the key for\nthe corresponding IP address 171.66.97.101 is unknown. This could\neither mean that DNS SPOOFING is happening or the IP address for the\nhost and its host key have changed at the same time.\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n@ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\nIT IS POSSIBLE THAT SOMEONE IS DOING SOMETHING NASTY!\nSomeone could be eavesdropping on you right now (man-in-the-middle\nattack)!  It is also possible that a host key has just been changed.\nThe fingerprint for the RSA key sent by the remote host is\nSHA256:T1q1Tbq8k5XBD5PIxvlCfTxNMi1ORWwKNRPeZPXUfJA.\nPlease contact your system administrator.\n

You can just check that the SHA256 key listed in that warning message correctly matches the one listed in the table above, and if that's the case, you can safely remove the sherlock.stanford.edu entry from your ~/.ssh/known_hosts file with the following command on your local machine:

$ ssh-keygen -R sherlock.stanford.edu\n

and then connect again. You'll see the first-connection prompt mentioned above, and your SSH client will store the new keys for future connections.

","location":"docs/getting-started/connecting/#host-keys-warning","tags":["connection"]},{"title":"Authentication","text":"","location":"docs/getting-started/connecting/#authentication","tags":["connection"]},{"title":"Password","text":"

To ease access and increase compatibility1 with different platforms, Sherlock allows a simple password-based authentication mechanism for SSH.2.

Upon connection, you will be asked for your SUNet ID password with the following prompt:

<sunetid>@login.sherlock.stanford.edu's password:\n

Enter your password, and if it's correct, you should see the following line:

Authenticated with partial success.\n
","location":"docs/getting-started/connecting/#password","tags":["connection"]},{"title":"Second factor (2FA)","text":"

Sherlock implements Stanford's Minimum Security Standards policies which mandate two-step authentication to access the cluster.

Two-step authentication protects your personal information and credentials by combining something only you know (your password) with something only you have (your phone, tablet or token). This prevents an attacker who would steal your password from actually using it to impersonate you. For more details about two-step authentication at Stanford, please refer to the University IT two-step page.

After successfully entering your password, you'll be prompted for your second authentication factor with a message like this:

Duo two-factor login for <sunetid>\n\nEnter a passcode or select one of the following options:\n\n 1. Duo Push to XXX-XXX-9999\n 2. Phone call to XXX-XXX-9999\n 3. SMS passcodes to XXX-XXX-9999 (next code starts with: 9)\n\nPasscode or option (1-3):\n

Avoiding two-factor prompt on each connection

If you routinely open multiple sessions to Sherlock, having to confirm each one of them with a second authentication factor could rapidly become cumbersome. To work around this, the OpenSSH client allows multiplexing channels and re-using existing authenticated connections for opening new sessions. Please see the Advanced Connection Options page for more details.

If your second factor is accepted, you'll see the following message:

Success. Logging you in...\n
","location":"docs/getting-started/connecting/#second-factor-2fa","tags":["connection"]},{"title":"Troubleshooting","text":"","location":"docs/getting-started/connecting/#troubleshooting","tags":["connection"]},{"title":"Timeouts","text":"

If you ever encounter timeout errors when connecting to Sherlock, like these:

$ ssh login.sherlock.stanford.edu\nssh: connect to host login.sherlock.stanford.edu port 22: Operation timed out\n

you can try to either:

  • switch to a wired connection if you're connecting over wifi,
  • connect via the Stanford VPN
","location":"docs/getting-started/connecting/#timeouts","tags":["connection"]},{"title":"Authentication failures","text":"

Excessive authentication failures

Entering an invalid password multiple times will result in a (temporary) ban of your IP address.

To prevent brute-force password guessing attacks on Sherlock login nodes, we automatically block IP addresses that generate too many authentication failures in a given time span. This results in a temporary ban of the infringing IP address, and the impossibility for the user to connect to Sherlock from that IP address.

When this happens, your SSH connection attempts will result in the following error:

ssh: connect to host login.sherlock.stanford.edu port 22: Connection refused\n

IP addresses blocked by this mechanism will automatically be authorized again after a few minutes.

SSHFS on macOS

SSHFS on macOS is known to try to automatically reconnect filesystem mounts after resuming from sleep or suspend, even without any valid credentials. As a result, it will generate a lot of failed connection attempts and likely make your IP address blacklisted on login nodes.

Make sure to unmount your SSHFS drives before putting your macOS system to sleep to avoid this situation.

VPN

If your IP got blocked and you have an urgent need to connect, before the automatic blacklist expiration, we recommend trying to connect through Stanford's VPN: your computer will then use a different IP address and will not be affected by the ban on your regular IP address.

","location":"docs/getting-started/connecting/#authentication-failures","tags":["connection"]},{"title":"Login","text":"

Congratulations! You've successfully connected to Sherlock. You'll be greeted by the following message of the day:

             --*-*- Stanford Research Computing Center -*-*--\n                  ____  _               _            _\n                 / ___|| |__   ___ _ __| | ___   ___| | __\n                 \\___ \\| '_ \\ / _ \\ '__| |/ _ \\ / __| |/ /\n                  ___) | | | |  __/ |  | | (_) | (__|   <\n                 |____/|_| |_|\\___|_|  |_|\\___/ \\___|_|\\_\\\n\n-----------------------------------------------------------------------------\n  This system is for authorized users only and users must comply with all\n  Stanford computing, network and research policies. All activity may be\n  recorded for security and monitoring purposes. For more information, see\n  https://doresearch.stanford.edu/policies/research-policy-handbook and\n  https://adminguide.stanford.edu/chapter-6/subchapter-2/policy-6-2-1\n-----------------------------------------------------------------------------\n  Sherlock is *NOT* approved for storing or processing HIPAA, PHI, PII nor\n  any kind of High Risk data. Users are responsible for the compliance of\n  their data.\n  See https://uit.stanford.edu/guide/riskclassifications for details.\n-----------------------------------------------------------------------------\n\n        Docs         https://www.sherlock.stanford.edu/docs\n        Support      https://www.sherlock.stanford.edu/docs/#support\n\n        Web          https://www.sherlock.stanford.edu\n        News         https://news.sherlock.stanford.edu\n        Status       https://status.sherlock.stanford.edu\n\n-----------------------------------------------------------------------------\n

Once authenticated to Sherlock, you'll see the following prompt:

[<sunetid>@sh03-ln01 login! ~]$

It indicates the name of the login node you've been connected to, and a reminder that you're actually connected to a login node, not a compute node.

Login nodes are not for computing

Login nodes are shared among many users and therefore must not be used to run computationally intensive tasks. Those should be submitted to the scheduler which will dispatch them on compute nodes.

By contrast, the shell prompt on a compute node looks like this:

[<sunetid>@sh03-01n01 ~]$

","location":"docs/getting-started/connecting/#login","tags":["connection"]},{"title":"Start computing","text":"

To start computing, there's still an extra step required, which is requesting resources to run your application. It's all described in the next section.

  1. On Sherlock 1.0, GSSAPI tokens (based on Kerberos tickets) were the only allowed authentication method, which could cause some interoperability issues with third-party SSH clients.\u00a0\u21a9

  2. For other methods of authentication, see the Advanced Connection Options page.\u00a0\u21a9

","location":"docs/getting-started/connecting/#start-computing","tags":["connection"]},{"title":"Submitting jobs","text":"","location":"docs/getting-started/submitting/","tags":["slurm"]},{"title":"Principle","text":"

Login nodes are not for computing

Login nodes are shared among many users and therefore must not be used to run computationally intensive tasks. Those should be submitted to the scheduler which will dispatch them on compute nodes.

","location":"docs/getting-started/submitting/#principle","tags":["slurm"]},{"title":"Requesting resources","text":"

A mandatory prerequisite for running computational tasks on Sherlock is to request computing resources. This is done via a resource scheduler, whose very purpose is to match compute resources in the cluster (CPUs, GPUs, memory, ...) with user resource requests.

The scheduler provides three key functions:

  1. it allocates access to resources (compute nodes) to users for some duration of time so they can perform work.
  2. it provides a framework for starting, executing, and monitoring work (typically a parallel job such as MPI) on a set of allocated nodes.
  3. it arbitrates contention for resources by managing a queue of pending jobs
","location":"docs/getting-started/submitting/#requesting-resources","tags":["slurm"]},{"title":"Slurm","text":"

Sherlock uses Slurm, an open-source resource manager and job scheduler, used by many of the world's supercomputers and computer clusters.

Slurm supports a variety of job submission techniques. By accurately requesting the resources you need, you\u2019ll be able to get your work done.

Wait times in queue

As a quick rule of thumb, it's important to keep in mind that the more resources your job requests (CPUs, GPUs, memory, nodes, and time), the longer it may have to wait in queue before it could start.

In other words: accurately requesting resources to match your job's needs will minimize your wait times.

","location":"docs/getting-started/submitting/#slurm","tags":["slurm"]},{"title":"How to submit a job","text":"A job consists in two parts: resource requests and job steps.

Resource requests describe the amount of computing resource (CPUs, GPUs, memory, expected run time, etc.) that the job will need to successfully run.

Job steps describe tasks that must be executed.

","location":"docs/getting-started/submitting/#how-to-submit-a-job","tags":["slurm"]},{"title":"Batch scripts","text":"

The typical way of creating a job is to write a job submission script. A submission script is a shell script (e.g. a Bash script) whose first comments, if they are prefixed with #SBATCH, are interpreted by Slurm as parameters describing resource requests and submission options1.

The submission script itself is a job step. Other job steps are created with the srun command.

For instance, the following script would request one task with one CPU for 10 minutes, along with 2 GB of memory, in the default partition:

submit.sh
#!/bin/bash\n#\n#SBATCH --job-name=test\n#\n#SBATCH --time=10:00\n#SBATCH --ntasks=1\n#SBATCH --cpus-per-task=1\n#SBATCH --mem-per-cpu=2G\n\nsrun hostname\nsrun sleep 60\n

When started, the job would run a first job step srun hostname, which will launch the command hostname on the node on which the requested CPU was allocated. Then, a second job step will start the sleep command.

You can create this job submission script on Sherlock using a text editor such as nano or vim, and save it as submit.sh.

#SBATCH directives syntax

#SBATCH directives must be at the top of the script

Slurm will ignore all #SBATCH directives after the first non-comment line (that is, the first line in the script that doesn't start with a # character). Always put your #SBATCH parameters at the top of your batch script.

Spaces in parameters will cause #SBATCH directives to be ignored

Slurm will ignore all #SBATCH directives after the first white space. For instance, directives like these:

#SBATCH --job-name=big job\n
#SBATCH --mem=16 G\n
#SBATCH --partition=normal, owners\n
will cause all following #SBATCH directives to be ignored and the job to be submitted with the default parameters.

","location":"docs/getting-started/submitting/#batch-scripts","tags":["slurm"]},{"title":"Job submission","text":"

Once the submission script is written properly, you can submit it to the scheduler with the sbatch command. Upon success, sbatch will return the ID it has assigned to the job (the jobid).

$ sbatch submit.sh\nSubmitted batch job 1377\n
","location":"docs/getting-started/submitting/#job-submission","tags":["slurm"]},{"title":"Check the job","text":"

Once submitted, the job enters the queue in the PENDING state. When resources become available and the job has sufficient priority, an allocation is created for it and it moves to the RUNNING state. If the job completes correctly, it goes to the COMPLETED state, otherwise, its state is set to FAILED.

You'll be able to check the status of your job and follow its evolution with the squeue -u $USER command:

$ squeue -u $USER\n     JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)\n      1377    normal     test   kilian  R       0:12      1 sh02-01n01\n

The scheduler will automatically create an output file that will contain the result of the commands run in the script file. That output file is named slurm-<jobid>.out by default, but can be customized via submission options. In the above example, you can list the contents of that output file with the following command:

$ cat slurm-1377.out\nsh02-01n01\n

Congratulations, you've submitted your first batch job on Sherlock!

","location":"docs/getting-started/submitting/#check-the-job","tags":["slurm"]},{"title":"What's next?","text":"

Actually, quite a lot. Although you now know how to submit a simple batch job, there are many other options and areas to explore in the next sections:

  • Data transfer
  • Storage
  • Running jobs
  1. You can get the complete list of parameters by referring to the sbatch manual page (man sbatch).\u00a0\u21a9

","location":"docs/getting-started/submitting/#whats-next","tags":["slurm"]},{"title":"Software on Sherlock","text":"","location":"docs/software/"},{"title":"Available software","text":"

A set of supported software installations is provided for use on Sherlock. This software is made available through a Software Modules system. For the complete list of available software, please refer to the Software List page.

Licensed software can be used on Sherlock, under certain conditions. Feel free to contact us for more details or if you have questions. For more information about purchasing software licenses, you can contact the Stanford Software Licensing office.

","location":"docs/software/#available-software"},{"title":"Installation requests","text":"

Installation requests

The SRCC team installs, for general use, a set of libraries, tools and software applications that are commonly used across many research groups. However, our staff resources are quite limited and don't allow us to build nor maintain custom software applications that may be requested by or be of use to a small number of users.

We strongly encourage users to build custom and field- or domain-specific software themselves, and install it in their own personal or group shared directories. That way, they can share the software installations with the rest of the users in their group, if necessary.

Users may even maintain and publish their own local module files to dynamically configure a running environment to use the software. They could share those modules with other users to simplify the use of their own custom software installations.

Installing your own software

For more information about building your own software on Sherlock, please see the Software Installation page.

If the software you need is not in the list of available software, and you have trouble installing it on your own, please contact us with as many details about the package as possible, and we will try to help you install it.

If it's a widely used software that could benefit multiple users across different scientific communities, we will consider installing it globally as resources permit1.

","location":"docs/software/#installation-requests"},{"title":"Contributed software","text":"

PI groups and labs can share their software installations and modules with the whole Sherlock user community, and let everyone benefit from their tuning efforts and software developments.

Contributed software is supported and maintained by each lab, and contact information is usually provided in the contribs module. See the Modules page for more information about using software modules on Sherlock.

If you're interested in sharing your software installations beyond your own group on Sherlock, please let us know, and we'll get in touch.

  1. Software requests, including version upgrades, are fulfilled in the order they are received, and as time permits. We don't have any dedicated team for software installations, and requests are handled along with other duties, typically within two to three weeks of being received.\u00a0\u21a9

","location":"docs/software/#contributed-software"},{"title":"Installation","text":"

Software installation requests

For more information about software installation requests, please see the Software Overview page.

If the software package or version you need is not available in the list of provided software, you may compile and install it yourself. The recommended location for user-installed software is the $GROUP_HOME group shared directory, which is snapshotted and replicated off-site, and can easily be shared with members of a research group.

Work in progress

This page is a work in progress and is not complete yet. We are actively working on adding more content and information.

","location":"docs/software/install/"},{"title":"List","text":"

","location":"docs/software/list/"},{"title":"Software list","text":"

The full list of software centrally installed and managed on Sherlock is in the tables below.

Permanent work in progress

Software installations on Sherlock are an ever ongoing process. We're continuously adding new software to the list. If you're looking for something that is not in the list, there may be other options.

Subscribe to updates

Never want to miss a software update again? Stay up-to-date with new software updates by following the Sherlock software update RSS feed.

","location":"docs/software/list/#software-list"},{"title":"Categories","text":"

Software modules on Sherlock are organized in categories, by scientific field or functional class. It means that you will have to first load a category module before getting access to individual modules. The math and devel categories are loaded by default. See the Modules page for further details and examples.

We currently provide 570 software modules, in 7 categories, covering 93 fields of science:

  • biology clinical science, computational biology, cryo-em, genomics, molecular biology, neurology, pathology, phylogenetics, population genetics, radiology, workflow management

  • chemistry cheminformatics, computational chemistry, crystallography, docking, electrostatics, molecular dynamics, quantum chemistry, tools

  • devel build, compiler, data, data analytics, debug, engine, framework, IDE, language, lib, mpi, networking, parser, profiling, runtime

  • math computational geometry, deep learning, graph computing, lib, linear algebra, machine learning, numerical analysis, numerical library, optimization, scientific computing, statistics, symbolic, technical computing, topic modelling

  • physics astronomy, CFD, cliemate modeling, climate modeling, geophysics, geoscience, lib, magnetism, materials science, micromagnetics, particle, photonics, quantum information science, quantum mechanics

  • system backup, benchmark, checkpointing, cloud interface, compiler, compression, containers, database, document management, document processing, file management, file transfer, framework, hardware, job management, language, libs, media, performance, resource monitoring, scm, shell, testing, tools

  • viz data, gis, graphs, imaging, molecular visualization, plotting, remote display

Licensed software

Access to software modules marked with in the tables below is restricted to properly licensed user groups.

SRCC is not funded to provide commercial software on Sherlock and researchers are responsible for the costs of purchasing and renewing commercial software licenses. For more information, please feel free to contact us and see the Stanford Software Licensing page for purchasing information.

Additional flags and features

Some of the modules listed below have been built to support specific architectures or parallel execution modes:

  • versions marked with support GPU acceleration
  • versions marked with support MPI parallel execution
  • versions marked with are the default version for the module
","location":"docs/software/list/#categories"},{"title":"biology","text":"Field Module\u00a0name Version(s) URL Description clinical science simvascular 20180704 Website Simvascular is a blood flow simulation and analysis toolkit. This module provides the svFSI (Fluid Solid Interaction) solver. computational biology py-biopython 1.70_py271.79_py361.79_py39 Website Biopython is a set of freely available tools for biological computation written in Python. computational biology rosetta 3.8 Website Rosetta is the premier software suite for modeling macromolecular structures. As a flexible, multi-purpose application, it includes tools for structure prediction, design, and remodeling of proteins and nucleic acids. cryo-em ctffind 4.1.13 Website ctffind is a program for finding CTFs of electron micrographs. cryo-em eman2 2.2 2.91 Website EMAN2 is a broadly based greyscale scientific image processing suite with a primary focus on processing data from transmission electron microscopes. cryo-em imod 4.9.12 4.11.5 Website IMOD is a set of image processing, modeling and display programs used for tomographic reconstruction and for 3D reconstruction of EM serial sections and optical sections. cryo-em motioncor2 1.3.1 1.5.0 1.6.4 Website MotionCor2 is a multi-GPU accelerated program which corrects anisotropic image motion at the single pixel level. cryo-em py-topaz 0.2.4_py36 0.2.5_py39 Website A pipeline for particle detection in cryo-electron microscopy images using convolutional neural networks trained from positive and unlabeled examples. cryo-em relion 2.0.3 2.1 4.0.1 Website RELION (for REgularised LIkelihood OptimisatioN, pronounce rely-on) is a stand-alone computer program that employs an empirical Bayesian approach to refinement of (multiple) 3D reconstructions or 2D class averages in electron cryo-microscopy (cryo-EM). genomics angsd 0.9190.931 Website ANGSD is a software for analyzing next generation sequencing data. 
genomics augustus 3.3.2 Website AUGUSTUS is a program that predicts genes in eukaryotic genomic sequences. genomics bamtools 2.5.1 Website BamTools is a project that provides both a C++ API and a command-line toolkit for reading, writing, and manipulating BAM (genome alignment) files. genomics bcftools 1.61.81.16 Website BCFtools is a program for variant calling and manipulating files in the Variant Call Format (VCF) and its binary counterpart BCF. genomics bcl-convert 4.2.7 Website The BCL Convert App generates demultiplexed FASTQ files from a run as input. genomics bcl2fastq 2.20 Website The bcl2fastq2 conversion software can be used to convert BCL files from MiniSeq, MiSeq, NextSeq, HiSeq, iSeq and NovaSeq sequencing systems. genomics bedops 2.4.40 Website BEDOPS is an open-source command-line toolkit that performs highly efficient and scalable Boolean and other set operations, statistical calculations, archiving, conversion and other management of genomic data of arbitrary scale. genomics bedtools 2.27.12.30.0 Website The bedtools utilities are a swiss-army knife of tools for a wide-range of genomics analysis tasks. genomics bgen 1.1.4 Website bgen is the reference implementation of the BGEN format, a binary file format for imputed genotype and haplotype data. genomics bowtie 1.2.2 Website Bowtie is an ultrafast, memory-efficient short read aligner. genomics bowtie2 2.3.4.1 Website Bowtie 2 is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. genomics breseq 0.38.1 Website breseq is a computational pipeline for finding mutations relative to a reference sequence in short-read DNA resequencing data. genomics bwa 0.7.17 Website BWA (Burrows-Wheeler Aligner) is a software package for mapping low-divergent sequences against a large reference genome, such as the human genome. genomics canu 1.8 Website A single molecule sequence assembler for genomes large and small. 
genomics cellranger 7.1.0 Website Cell Ranger is a set of analysis pipelines that process Chromium single-cell RNA-seq output to align reads, generate gene-cell matrices and perform clustering and gene expression analysis. genomics cufflinks 2.2.1 Website Cufflinks assembles transcripts, estimates their abundances, and tests for differential expression and regulation in RNA-Seq samples. genomics dorado 0.3.4 Website Dorado is a high-performance, easy-to-use, open source basecaller for Oxford Nanopore reads. genomics fastqc 0.11.8 Website FastQC aims to provide a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. genomics fastx_toolkit 0.0.14 Website The FASTX-Toolkit is a collection of command line tools for Short-Reads FASTA/FASTQ files preprocessing. genomics freebayes 1.2.0 Website FreeBayes is a Bayesian genetic variant detector designed to find small polymorphisms. genomics gatk 4.1.0.04.1.4.1 Website GATK (Genome Analysis Toolkit) offers a wide variety of tools with a primary focus on variant discovery and genotyping. genomics gemma 0.98.5 Website GEMMA is a software toolkit for fast application of linear mixed models (LMMs) and related models to genome-wide association studies (GWAS) and other large-scale data sets. genomics hic-pro 2.10.0 Website HiC-Pro: An optimized and flexible pipeline for Hi-C data processing. genomics hisat2 2.1.0 Website HISAT2 is a fast and sensitive alignment program for mapping next-generation sequencing reads (both DNA and RNA) to a population of human genomes (as well as to a single reference genome). genomics htslib 1.61.81.10.21.141.16 Website C library for high-throughput sequencing data formats. genomics jellyfish 2.2.10 Website A fast multi-threaded k-mer counter. genomics kallisto 0.44.0 0.46.10.50.1 Website kallisto is a program for quantifying abundances of transcripts from RNA-Seq data using high-throughput sequencing reads. 
genomics metal 20110325 Website The METAL software is designed to facilitate meta-analysis of large datasets (such as several whole genome scans) in a convenient, rapid and memory efficient manner. genomics mixcr 2.1.124.6.0 Website MiXCR is a universal framework that processes big immunome data from raw sequences to quantitated clonotypes. genomics ncbi-blast+ 2.6.02.7.12.11.0 Website NCBI BLAST+ is a suite of command-line tools to run BLAST (Basic Local Alignment Search Tool), an algorithm for comparing primary biological sequence information. genomics ncbi-vdb 3.0.7 Website NCBI VDB is the database engine used by NCBI SRA tools. genomics plink 1.071.90b5.32.0a12.0a2 Website PLINK is a free, open-source whole genome association analysis toolset, designed to perform a range of basic, large-scale analyses in a computationally efficient manner. genomics popscle 0.1 Website popscle is a suite of population scale analysis tools for single-cell genomics data. genomics py-busco 3.0.2_py27 Website Assessing genome assembly and annotation completeness with Benchmarking Universal Single-Copy Orthologs (BUSCO). genomics py-bx-python 0.8.1_py270.8.13_py39 Website Tools for manipulating biological data, particularly multiple sequence alignments. genomics py-cutadapt 1.18_py27 1.18_py36 Website Cutadapt finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads. genomics py-deeplabcut 2.2.3_py39 Website A software package for animal pose estimation. genomics py-deeptools 3.3.1_py36 Website Tools to process and analyze deep sequencing data. genomics py-fithic 1.1.3_py27 Website Fit-Hi-C is a tool for assigning statistical confidence estimates to chromosomal contact maps produced by genome architecture assays. genomics py-htseq 2.0.1_py39 Website HTSeq is a Python library to facilitate processing and analysis of data from high-throughput sequencing (HTS) experiments. 
genomics py-macs2 2.1.1_py27 Website MACS (Model-based Analysis of ChIP-Seq) implements a novel ChIP-Seq analysis method. genomics py-mageck 0.5.9.4_py36 Website Model-based Analysis of Genome-wide CRISPR-Cas9 Knockout (MAGeCK) is a computational tool to identify important genes from the recent genome-scale CRISPR-Cas9 knockout screens technology. genomics py-mapdamage 2.2.1_py36 Website mapDamage2 is a computational framework which tracks and quantifies DNA damage patterns among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms. genomics py-multiqc 1.6_py27 1.6_py36 Website MultiQC is a reporting tool that parses summary statistics from results and log files generated by other bioinformatics tools. genomics py-obitools 1.2.13_py27 Website OBITools is a set of programs designed for analyzing NGS data in a DNA metabarcoding context. genomics py-orthofinder 2.5.4_py39 Website OrthoFinder is a fast, accurate and comprehensive platform for comparative genomics. genomics py-pybedtools 0.8.0_py270.8.2_py360.9.0_py39 Website Pybedtools wraps and extends BEDTools and offers feature-level manipulations from within Python. genomics py-pysam 0.14.1_py270.15.3_py360.18.0_py39 Website Pysam is a python module for reading, manipulating and writing genomic data sets. genomics py-scanpy 1.8.2_py39 Website Scanpy is a scalable toolkit for analyzing single-cell gene expression data. genomics py-vcf2gwas 0.8.9_py39 Website Python API for comprehensive GWAS analysis using GEMMA. genomics py-vispr 0.4.17_py36 Website A visualization framework for CRISPR/Cas9 knockout screens, analyzed with MAGeCK. genomics regenie 2.2.4 Website regenie is a C++ program for whole genome regression modelling of large genome-wide association studies. genomics rsem 1.3.3 Website RSEM is a software package for estimating gene and isoform expression levels from RNA-Seq data. 
genomics salmon 0.12.0 Website Highly-accurate & wicked fast transcript-level quantification from RNA-seq reads using lightweight alignments. genomics samtools 1.61.81.16.1 Website Tools (written in C using htslib) for manipulating next-generation sequencing data. genomics sentieon 201808.01 202112.01 Website Sentieon Genomics software is a set of software tools that perform analysis of genomic data obtained from DNA sequencing. genomics shapeit 4.0.0 4.2.2 Website SHAPEIT4 is a fast and accurate method for estimation of haplotypes (aka phasing) for SNP array and high coverage sequencing data. genomics sra-tools 2.11.03.0.7 Website The SRA Toolkit and SDK from NCBI is a collection of tools and libraries for using data in the INSDC Sequence Read Archives. genomics star 2.5.4b2.7.10b Website STAR: ultrafast universal RNA-seq aligner. genomics stringtie 2.2.1 Website StringTie is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. genomics tophat 2.1.1 Website TopHat is a fast splice junction mapper for RNA-Seq reads. genomics trim_galore 0.5.0 Website Trim Galore! is a wrapper script to automate quality and adapter trimming as well as quality control, with some added functionality to remove biased methylation positions for RRBS sequence files. genomics trinity 2.8.42.13.1 Website Trinity RNA-Seq de novo transcriptome assembly. genomics vcflib 1.0.0 Website A C++ library for parsing and manipulating VCF files. genomics vcftools 0.1.15 Website VCFtools is a program package designed for working with VCF files, such as those generated by the 1000 Genomes Project. genomics viennarna 2.5.1 Website A C code library and several stand-alone programs for the prediction and comparison of RNA secondary structures. molecular biology dssp 4.0.3 Website DSSP is an application to assign secondary structure to proteins. molecular biology libcifpp 3.0.0 Website Library to work with mmCIF and PDB files. 
neurology afni 17.2.0718.2.0421.3.00 Website AFNI (Analysis of Functional NeuroImages) is a set of C programs for processing, analyzing, and displaying functional MRI (FMRI) data - a technique for mapping human brain activity. neurology ants 2.1.02.3.12.4.0 Website ANTs computes high-dimensional mappings to capture the statistics of brain structure and function. neurology bart 0.7.00 Website BART is a toolbox for Computational Magnetic Resonance Imaging. neurology dcm2niix 1.0.201712151.0.20211006 Website dcm2niix is a program designed to convert neuroimaging data from the DICOM format to the NIfTI format. neurology freesurfer 6.0.17.1.17.2.07.3.27.4.1 Website An open source software suite for processing and analyzing (human) brain MRI images. neurology fsl 5.0.10 Website FSL is a comprehensive library of analysis tools for FMRI, MRI and DTI brain imaging data. neurology mricron 20160502 Website MRIcron is a cross-platform NIfTI format image viewer. neurology mrtrix 0.3.163.0.3 Website MRtrix3 provides a set of tools to perform various types of diffusion MRI analyses, from various forms of tractography through to next-generation group-level analyses. neurology py-mdt 0.10.9_py36 Website The Maastricht Diffusion Toolbox, MDT, is a framework and library for parallelized (GPU and multi-core CPU) diffusion Magnetic Resonance Imaging (MRI) modeling. neurology py-nipype 1.1.3_py271.1.3_py36 Website Nipype is a Python project that provides a uniform interface to existing neuroimaging software and facilitates interaction between these packages within a single workflow. neurology spm 12 Website The SPM software package has been designed for the analysis of brain imaging data sequences. The sequences can be a series of images from different cohorts, or time-series from the same subject. 
neurology workbench 1.3.1 Website Connectome Workbench is an open source, freely available visualization and discovery tool used to map neuroimaging data, especially data generated by the Human Connectome Project. pathology openslide 3.4.1 Website OpenSlide is a C library that provides a simple interface to read whole-slide images (also known as virtual slides). pathology py-openslide-python 1.1.1_py27 1.1.1_py36 Website OpenSlide Python is a Python interface to the OpenSlide library. phylogenetics py-ete 3.0.0_py27 Website A Python framework for the analysis and visualization of trees. population genetics py-admixfrog 0.6.1_py36 Website Admixfrog is a HMM to infer ancestry frogments (fragments) from low-coverage, contaminated data. radiology nbia-data-retriever 4.2 Website The NBIA Data Retriever is an application to download radiology images from the TCIA Radiology Portal. workflow management nextflow 23.04.3 Website Nextflow is a bioinformatics workflow manager that enables the development of portable and reproducible workflows.","location":"docs/software/list/#biology"},{"title":"chemistry","text":"Field Module\u00a0name Version(s) URL Description cheminformatics py-rdkit 2018.09.1_py27 2018.09.1_py362022.09.1_py39 Website RDKit is a collection of cheminformatics and machine-learning software written in C++ and Python. computational chemistry gaussian g16.A03 g16.B01 Website Gaussian is a general purpose computational chemistry software package. computational chemistry libint 1.1.42.0.32.6.0 Website Libint computes molecular integrals. computational chemistry libxc 3.0.05.2.2 Website Libxc is a library of exchange-correlation functionals for density-functional theory. computational chemistry nwchem 6.8 7.0.2 Website NWChem is an ab initio computational chemistry software package which also includes quantum chemical and molecular dynamics functionality. 
computational chemistry py-ase 3.14.1_py273.22.1_py39 Website The Atomic Simulation Environment (ASE) is a set of tools and Python modules for setting up, manipulating, running, visualizing and analyzing atomistic simulations. computational chemistry schrodinger 2021-1 2017-3 2018-1 2018-2 2019-2 2020-2 2022-3 Website Schr\u00f6dinger Suites (Small-molecule Drug Discovery Suite, Material Science Suite, Biologics Suite) provide a set of molecular modelling software. computational chemistry vasp 5.4.1 6.1.1 6.3.2 6.4.1 Website The Vienna Ab initio Simulation Package (VASP) is a computer program for atomic scale materials modelling, e.g. electronic structure calculations and quantum-mechanical molecular dynamics, from first principles. crystallography clipper 2.1.20180802 Website Crystallographic automation and complex data manipulation libraries. crystallography mmdb2 2.0.20 Website A C++ toolkit for working with macromolecular coordinate files. crystallography ssm 1.4 Website A macromolecular superposition library. crystallography vesta 3.4.4 Website VESTA is a 3D visualization program for structural models, volumetric data such as electron/nuclear densities, and crystal morphologies. docking gnina 1.0.2 Website A deep learning framework for molecular docking electrostatics apbs 1.5 Website APBS solves the equations of continuum electrostatics for large biomolecular assemblages. molecular dynamics gromacs 2016.3 2018 2021.3 2023.1 Website GROMACS is a versatile package to perform molecular dynamics, i.e. simulate the Newtonian equations of motion for systems with hundreds to millions of particles. molecular dynamics lammps 20180316 20200303 Website LAMMPS is a classical molecular dynamics code that models an ensemble of particles in a liquid, solid, or gaseous state. molecular dynamics openmm 7.1.1 Website A high performance toolkit for molecular simulation. 
molecular dynamics plumed 2.3.2 Website PLUMED is an open source library for free energy calculations in molecular systems. molecular dynamics py-raspa2 2.0.3_py27 Website RASPA2 is a general purpose classical simulation package that can be used for the simulation of molecules in gases, fluids, zeolites, aluminosilicates, metal-organic frameworks, carbon nanotubes and external fields. molecular dynamics qbox 1.65.0 Website Qbox is a First-Principles Molecular Dynamics code. molecular dynamics quip 20170901 20220426 Website The QUIP package is a collection of software tools to carry out molecular dynamics simulations. quantum chemistry cp2k 4.1 9.1 Website CP2K is a quantum chemistry and solid state physics software package that can perform atomistic simulations of solid state, liquid, molecular, periodic, material, crystal, and biological systems. quantum chemistry ocean 2.9.7 Website OCEAN is a versatile and user-friendly package for calculating core edge spectroscopy including excitonic effects. quantum chemistry orca 4.2.1 5.0.0 5.0.3 Website ORCA is a flexible, efficient and easy-to-use general purpose tool for quantum chemistry. quantum chemistry quantum-espresso 6.2.1 6.6 7.0 7.1 Website Quantum ESPRESSO is an integrated suite of Open-Source computer codes for electronic-structure calculations and materials modeling at the nanoscale. It is based on density-functional theory, plane waves, and pseudopotentials. quantum chemistry quantum-espresso_gpu 1.1 7.0 7.1 Website Quantum ESPRESSO is an integrated suite of Open-Source computer codes for electronic-structure calculations and materials modeling at the nanoscale. It is based on density-functional theory, plane waves, and pseudopotentials. quantum chemistry terachem 1.95A 1.96H-beta Website TeraChem is general purpose quantum chemistry software designed to run on NVIDIA GPU architectures. tools openbabel 3.1.1 Website Open Babel is a chemical toolbox designed to speak the many languages of chemical data. 
tools py-openbabel 3.1.1.1_py39 Website Python bindings for Open Babel.","location":"docs/software/list/#chemistry"},{"title":"devel","text":"Field Module\u00a0name Version(s) URL Description build bazel 0.16.10.26.10.29.1 Website Bazel is a fast, scalable, multi-language and extensible build system. build bazelisk 1.3.01.8.0 Website Bazelisk is a wrapper for Bazel written in Go. build binutils 2.38 Website The GNU Binutils are a collection of binary tools. build cmake 3.8.13.11.13.13.13.20.33.24.2 Website CMake is an extensible, open-source system that manages the build process in an operating system and in a compiler-independent manner. build kerl 1.8.5 Website Kerl is a tool to easily build and install Erlang/OTP instances. build make 4.4 Website GNU Make is a tool which controls the generation of executables and other non-source files of a program from the program's source files. build ninja 1.9.0 Website Ninja is a small build system with a focus on speed. build py-meson 0.51.1_py36 Website Meson is an open source build system meant to be both extremely fast, and, even more importantly, as user friendly as possible. build py-scons 3.0.5_py27 3.0.5_py36 Website SCons is an Open Source software construction tool. compiler aocc 2.1.02.2.0 Website AMD Optimizing C/C++ Compiler - AOCC is a highly optimized C, C++ and Fortran compiler for x86 targets especially for Zen based AMD processors. compiler gcc 6.3.0 7.1.07.3.08.1.09.1.010.1.010.3.012.1.0 Website The GNU Compiler Collection includes front ends for C, C++, Fortran, Java, and Go, as well as libraries for these languages (libstdc++, libgcj,...). 
compiler icc 2017.u22018.u120182019 Website Intel C++ Compiler, also known as icc or icl, is a group of C and C++ compilers from Intel compiler ifort 2017.u22018.u120182019 Website Intel Fortran Compiler, also known as ifort, is a group of Fortran compilers from Intel compiler llvm 7.0.0 3.8.14.0.05.0.09.0.115.0.3 Website The LLVM Project is a collection of modular and reusable compiler and toolchain technologies. Clang is an LLVM native C/C++/Objective-C compiler, compiler nvhpc 21.5 21.7 22.3 23.3 Website NVIDIA HPC Software Development Kit (SDK) including C, C++, and Fortran compilers. compiler pgi 19.10 Website PGI compilers and tools, including Open MPI (Community Edition). compiler smlnj 110.81 Website Standard ML of New Jersey (abbreviated SML/NJ) is a compiler for the Standard ML '97 programming language. data h5utils 1.12.1 Website h5utils is a set of utilities for visualization and conversion of scientific data in the free, portable HDF5 format. data hdf5 1.10.6 1.10.0p11.10.2 1.12.01.12.2 Website HDF5 is a data model, library, and file format for storing and managing data. It supports an unlimited variety of datatypes, and is designed for flexible and efficient I/O and for high volume and complex data. data hiredis 0.13.3 Website Hiredis is a minimalistic C client library for the Redis database. data ncl 6.4.06.6.2 Website NCL is a free interpreted language designed specifically for scientific data processing and visualization. data nco 4.8.0 5.0.6 Website The NCO toolkit manipulates and analyzes data stored in netCDF-accessible formats. data netcdf 4.4.1.14.8.1 Website NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data. data netcdf-c 4.9.0 Website NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data. 
This module provides C libraries. data netcdf-cxx 4.3.1 Website NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data. This module provides C++ libraries. data netcdf-fortran 4.5.4 Website NetCDF is a set of software libraries and self-describing, machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data. This module provides Fortran libraries. data pnetcdf 1.8.1 1.12.3 Website Parallel netCDF (PnetCDF) is a parallel I/O library for accessing NetCDF files in CDF-1, 2, and 5 formats. data protobuf 3.4.0 3.20.021.9 Website Protocol Buffers (a.k.a., protobuf) are Google's language-neutral, platform-neutral, extensible mechanism for serializing structured data. data py-pandas 0.23.0_py270.23.0_py361.0.3_py361.3.1_py392.0.1_py39 Website pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language. data py-protobuf 3.4.0_py27 3.4.0_py363.6.1_py273.6.1_py363.15.8_py363.20.1_py394.21.9_py39 Website Python bindings for Google's Protocol Buffers data interchange format. data redis 4.0.1 Website Redis is an open source, in-memory data structure store, used as a database, cache and message broker. data zfp 1.0.0 Website zfp is an open-source library for compressed floating-point and integer arrays that support high throughput read and write random access. data analytics hadoop 3.1.0 3.3.1 Website The Apache Hadoop software library is a framework that allows for the distributed processing of large data sets across clusters of computers using simple programming models. data analytics py-sparkhpc 0.3_py27 Website Launching and controlling spark on HPC clusters data analytics spark 2.3.0 3.2.1 Website Apache Spark\u2122 is a unified analytics engine for large-scale data processing. 
debug gdb 8.2.1 Website GDB is the GNU Project debugger. debug valgrind 3.14.0 Website Valgrind is an instrumentation framework for building dynamic analysis tools. engine v8 8.4.371.22 Website V8 is Google\u2019s open source high-performance JavaScript and WebAssembly engine, written in C++. framework dotnet 2.1.5006.0.413 Website .NET is a free, cross-platform, open source developer platform for building many different types of applications. framework ga 5.8.2 Website Global Arrays (GA) is a Partitioned Global Address Space (PGAS) programming model. framework py-kedro 0.18.0_py39 Website Kedro is an open-source Python framework for creating reproducible, maintainable and modular data science code. IDE code-server 4.16.1 Website Run VS Code on any machine anywhere and access it in the browser. language cuda 9.0.176 8.0.61 9.1.85 9.2.88 9.2.148 10.0.130 10.1.105 10.1.168 10.2.89 11.0.3 11.1.1 11.2.0 11.3.1 11.4.1 11.5.0 11.7.1 12.0.0 12.1.1 12.2.0 Website CUDA is a parallel computing platform and application programming interface (API) model created by Nvidia. It allows software developers and software engineers to use a CUDA-enabled graphics processing unit (GPU) for general purpose processing. language erlang 21.3 Website Erlang is a programming language used to build massively scalable soft real-time systems with requirements on high availability. language gcl 2.6.14 Website GCL is the official Common Lisp for the GNU project. language go 1.91.141.18.2 Website Go is an open source programming language that makes it easy to build simple, reliable, and efficient software. language guile 2.0.112.2.2 Website GNU Guile is the preferred extension system for the GNU Project, which features an implementation of the Scheme programming language. language haskell 8.6.5 Website Haskell is a statically typed, purely functional programming language with type inference and lazy evaluation. 
language java 1.8.0_131 11.0.1112.0.217.0.418.0.2 Website Java is a general-purpose computer programming language that is concurrent, class-based, object-oriented, and specifically designed to have as few implementation dependencies as possible. language julia 1.3.11.4.01.5.11.6.21.7.21.8.41.9.01.10.0 Website Julia is a high-level, high-performance dynamic programming language for numerical computing. language lua 5.3.4 Website Lua is a powerful, efficient, lightweight, embeddable scripting language. It supports procedural programming, object-oriented programming, functional programming, data-driven programming, and data description. language luarocks 2.4.3 Website LuaRocks is the package manager for Lua modules. language manticore 20180301 Website Manticore is a high-level parallel programming language aimed at general-purpose applications running on multi-core processors. language nodejs 8.9.49.5.016.13.018.15.0 Website Node.js is a JavaScript runtime built on Chrome's V8 JavaScript engine. It provides the npm package manager. language perl 5.26.05.36.1 Website Perl 5 is a highly capable, feature-rich programming language with over 29 years of development. language php 7.3.0 Website PHP (recursive acronym for PHP: Hypertext Preprocessor) is an open source general-purpose scripting language that is especially suited for web development. language py-cython 0.27.3_py270.27.3_py360.29.21_py360.29.28_py39 Website Cython is an optimising static compiler for both the Python programming language and the extended Cython programming language (based on Pyrex). language py-ipython 5.4.1_py27 6.1.0_py368.3.0_py39 Website IPython is a command shell for interactive computing in multiple programming languages, originally developed for the Python programming language. language py-jupyter 1.0.0_py27 1.0.0_py361.0.0_py39 Website Jupyter is a browser-based interactive notebook for programming, mathematics, and data science. It supports a number of languages via plugins. 
language py-jupyterlab 2.3.2_py364.0.8_py39 Website Jupyter is a browser-based interactive notebook for programming, mathematics, and data science. It supports a number of languages via plugins. language python 2.7.13 3.6.13.9.03.12.1 Website Python is an interpreted, interactive, object-oriented programming language. language ruby 2.4.12.7.13.1.2 Website A dynamic, open source programming language with a focus on simplicity and productivity. It has an elegant syntax that is natural to read and easy to write. language rust 1.35.01.56.11.63.01.72.0 Website A language empowering everyone to build reliable and efficient software. language scala 2.12.6 Website Scala combines object-oriented and functional programming in one concise, high-level language. lib ant 1.10.1 Website Apache Ant is a Java library and command-line tool whose mission is to drive processes described in build files as targets and extension points dependent upon each other. lib boost 1.64.01.69.0 1.75.0 1.76.0 1.79.0 Website Boost is a set of libraries for the C++ programming language that provide support for tasks and structures such as linear algebra, pseudorandom number generation, multithreading, image processing, regular expressions, and unit testing. lib chai 2.2.2 Website Copy-hiding array abstraction to automatically migrate data between memory spaces. lib cnmem 1.0.0 Website CNMeM is a simple library to help the Deep Learning frameworks manage CUDA memory. lib conduit 0.5.1 Website Simplified Data Exchange for HPC Simulations. lib cub 1.7.3 1.10.0 Website CUB is a flexible library of cooperative threadblock primitives and other utilities for CUDA kernel programming. lib cutlass 0.1.03.1.0 Website CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. lib dtcmp 1.1.3 Website Datatype Compare (DTCMP) Library for sorting and ranking distributed data using MPI. 
lib eigen 3.3.33.4.0 Website Eigen is a C++ template library for linear algebra: matrices, vectors, numerical solvers, and related algorithms. lib libcircle 0.3.0 Website libcircle is an API for distributing embarrassingly parallel workloads using self-stabilization. lib libctl 3.2.24.0.14.5.0 Website libctl is a library for supporting flexible control files in scientific simulations. lib libevent 2.1.12 Website The libevent API provides a mechanism to execute a callback function when a specific event occurs on a file descriptor or after a timeout has been reached. lib libgpuarray 0.7.5 Website Library to manipulate tensors on the GPU. lib libtree 2.0.0 Website libtree prints shared object dependencies as a tree. lib lwgrp 1.0.4 Website The Light-weight Group Library provides methods for MPI codes to quickly create and destroy process groups. lib nccl 1.3.4 2.0.4 2.1.15 2.2.13 2.3.7 2.4.8 2.5.6 2.8.4 2.11.4 2.17.1 Website NCCL (pronounced 'Nickel') is a stand-alone library of standard collective communication routines, such as all-gather, reduce, broadcast, etc., that have been optimized to achieve high bandwidth over PCIe. lib pugixml 1.12.1 Website Light-weight, simple and fast XML parser for C++ with XPath support. lib py-cutlass 3.1.0_py39 Website Python interface for CUTLASS lib py-h5py 2.7.1_py27 2.8.0_py362.10.0_py363.1.0_py363.7.0_py39 Website The h5py package is a Pythonic interface to the HDF5 binary data format. lib py-netcdf4 1.3.1_py27 1.3.1_py36 Website netcdf4-python is a Python interface to the netCDF C library. lib py-nose 1.3.7_py39 Website nose is nicer testing for python. lib py-numba 0.35.0_py27 0.35.0_py360.53.1_py360.54.1_py39 Website Numba is a compiler for Python array and numerical functions that gives you the power to speed up your applications with high performance functions written directly in Python. lib py-parsl 1.2.0_py39 Website Parsl is a flexible and scalable parallel programming library for Python. 
lib py-pycuda 2017.1.1_py27 2021.1_py36 Website PyCUDA lets you access Nvidia\u2018s CUDA parallel computation API from Python. lib py-rmm 23.04.00_py39 Website Python interface for RMM lib py-schwimmbad 0.3.1_py36 0.3.2_py39 Website schwimmbad provides a uniform interface to parallel processing pools and enables switching easily between local development (e.g., serial processing or with multiprocessing) and deployment on a cluster or supercomputer (via, e.g., MPI or JobLib). lib py-scikit-image 0.13.0_py270.14.0_py270.15.0_py270.15.0_py360.17.2_py360.19.3_py390.20.0_py39 Website scikit-image is a collection of algorithms for image processing. lib rabbitmq 3.7.13 Website RabbitMQ is an open-source message broker. lib raja 0.12.1 Website Collection of C++ software abstractions that enable architecture portability for HPC applications. lib rmm 23.04.00 Website RAPIDS Memory Manager library lib swig 3.0.12 Website SWIG is an interface compiler that connects programs written in C and C++ with scripting languages such as Perl, Python, Ruby, and Tcl. lib tbb 2017.u22018.u120182019 Website Intel\u00ae Threading Building Blocks (Intel\u00ae TBB) is a widely used C++ library for shared-memory parallel programming and heterogeneous computing (intra-node distributed memory programming). lib trilinos 12.12.1 Website Trilinos is a collection of open-source software libraries, called packages, intended to be used as building blocks for the development of scientific applications. lib xsimd 7.6.08.1.0 Website C++ wrappers for SIMD intrinsics and parallelized, optimized mathematical functions (SSE, AVX, NEON, AVX512) lib zeromq 4.2.2 Website ZeroMQ (also spelled \u00d8MQ, 0MQ or ZMQ) is a high-performance asynchronous messaging library, aimed at use in distributed or concurrent applications. mpi hpcx 2.6.0 2.7.0 2.8.1 Website Mellanox HPC-X toolkit is a comprehensive software package that includes MPI and SHMEM/PGAS communications libraries. 
mpi impi 2017.u2 2018.u1 2018 2019 Website Intel\u00ae MPI Library is a multi-fabric message passing library that implements the Message Passing Interface, version 3.1 (MPI-3.1) specification. mpi openmpi 2.0.2 2.1.1 3.1.2 4.0.3 4.0.5 4.1.0 4.1.2 Website The Open MPI Project is an open source Message Passing Interface implementation that is developed and maintained by a consortium of academic, research, and industry partners. mpi py-mpi4py 3.0.0_py27 3.0.3_py36 3.1.3_py39 Website MPI for Python provides Python bindings for the Message Passing Interface (MPI) standard. It is implemented on top of the MPI-1/2/3 specification and exposes an API which grounds on the standard MPI-2 C++ bindings. networking gasnet 1.30.0 Website GASNet is a language-independent, low-level networking layer that provides network-independent, high-performance communication primitives tailored for implementing parallel global address space SPMD languages and libraries. networking libfabric 1.6.01.6.21.7.11.9.11.10.11.11.11.14.0 Website The Open Fabrics Interfaces (OFI) is a framework focused on exporting fabric communication services to applications. Libfabric is the library that defines and exports the user-space API of OFI. networking py-ucx-py 0.24.0_py39 Website Python bindings for UCX. networking ucx 1.3.11.8.1 1.9.0 1.10.0 1.12.1 Website UCX is a communication library implementing high-performance messaging for MPI/PGAS frameworks. parser antlr 2.7.7 Website ANTLR (ANother Tool for Language Recognition) is a powerful parser generator for reading, processing, executing, or translating structured text or binary files. parser xerces-c 3.2.1 Website Xerces-C++ is a validating XML parser written in a portable subset of C++. profiling amd-uprof 3.3.462 Website AMD uProf is a performance analysis tool for applications. profiling darshan 3.4.4 Website Darshan is a scalable HPC I/O characterization tool. 
runtime starpu 1.3.2 Website StarPU is a unified runtime system that offers support for heterogeneous multicore architectures","location":"docs/software/list/#devel"},{"title":"math","text":"Field Module\u00a0name Version(s) URL Description computational geometry cgal 4.10 Website The Computational Geometry Algorithms Library (CGAL) is a C++ library that aims to provide easy access to efficient and reliable algorithms in computational geometry. computational geometry dealii 9.4.1 Website deal.II is a C++ program library targeted at the computational solution of partial differential equations using adaptive finite elements. computational geometry gmsh 4.10.1 Website Gmsh is an open source 3D finite element mesh generator with a built-in CAD engine and post-processor. computational geometry opencascade 7.6.2 Website Open CASCADE Technology (OCCT) is an open-source full-scale 3D geometry library computational geometry polymake 4.10 Website polymake is open source software for research in polyhedral geometry. computational geometry qhull 2015.2 Website Qhull computes the convex hull, Delaunay triangulation, Voronoi diagram, halfspace intersection about a point, furthest-site Delaunay triangulation, and furthest-site Voronoi diagram. computational geometry silo 4.11 Website A mesh and field I/O library and scientific database. deep learning cudnn 6.0 7.0.1 7.0.4 7.0.5 7.1.4 7.4.1.5 7.6.4 7.6.5 8.1.1.33 8.3.3.40 8.6.0.163 8.9.0.131 Website NVIDIA cuDNN is a GPU-accelerated library of primitives for deep neural networks. deep learning cutensor 1.2.0 1.5.0.3 Website GPU-accelerated tensor linear algebra library. deep learning py-gym 0.21.0_py39 Website Gym is a toolkit for developing and comparing reinforcement learning algorithms. deep learning py-horovod 0.12.1_py27 0.12.1_py36 Website Horovod is a distributed training framework for TensorFlow. The goal of Horovod is to make distributed Deep Learning fast and easy to use. 
deep learning py-keras 2.1.5_py27 2.0.8_py27 2.1.5_py36 2.2.4_py27 2.2.4_py36 2.3.1_py36 Website Keras is a high-level neural networks API, written in Python and capable of running on top of TensorFlow, CNTK, or Theano. deep learning py-onnx 1.0.1_py271.8.1_py361.12.0_py39 Website ONNX is an open format to represent deep learning models. deep learning py-pytorch 0.3.0_py27 0.2.0_py27 0.2.0_py36 0.3.0_py36 1.0.0_py27 1.0.0_py36 1.4.0_py36 1.6.0_py36 1.8.1_py39 1.11.0_py39 2.0.0_py39 Website PyTorch is a deep learning framework that puts Python first. deep learning py-tensorboardx 1.8_py27 Website TensorboardX is TensorBoard\u2122 for PyTorch (and Chainer, MXNet, NumPy...) deep learning py-tensorflow 2.1.0_py36 1.4.0_py27 1.5.0_py27 1.5.0_py36 1.9.0_py27 1.9.0_py36 2.4.1_py36 2.6.2_py36 2.9.1_py39 2.10.0_py39 Website TensorFlow\u2122 is an open source software library for numerical computation using data flow graphs. deep learning py-tensorlayer 1.6.3_py27 Website TensorLayer is a Deep Learning (DL) and Reinforcement Learning (RL) library extended from Google TensorFlow. deep learning py-tensorrt 8.5.1.7_py39 Website Python bindings for the TensorRT library. deep learning py-theano 1.0.1_py27 Website Theano is a Python library that allows you to define, optimize, and evaluate mathematical expressions involving multi-dimensional arrays efficiently. deep learning py-torchvision 0.15.1_py39 Website Datasets, model architectures, and common image transformations for computer vision for PyTorch. deep learning py-triton 1.0.0_py39 Website Triton is a language and compiler for writing highly efficient custom Deep-Learning primitives. deep learning tensorrt 3.0.1 3.0.4 4.0.1.6 5.0.2.6 6.0.1.8 7.0.0.11 7.2.3.4 8.5.1.7 Website NVIDIA TensorRT\u2122 is a high-performance deep learning inference optimizer and runtime that delivers low latency, high-throughput inference for deep learning applications. 
deep learning torch 20180202 Website Torch is a scientific computing framework with wide support for machine learning algorithms that puts GPUs first. graph computing bliss 0.73 Website A tool for computing automorphism groups and canonical forms of graphs. lib opencv 3.3.0 4.5.2 4.5.5 4.7.0 Website OpenCV (Open Source Computer Vision Library) is an open source computer vision and machine learning software library. linear algebra armadillo 8.200.1 Website Armadillo is a high quality linear algebra library (matrix maths) for the C++ language, aiming towards a good balance between speed and ease of use. linear algebra cusparselt 0.2.0.1 Website NVIDIA cuSPARSELt is a high-performance CUDA library for sparse matrix-matrix multiplication. machine learning py-scikit-learn 0.19.1_py27 0.19.1_py360.24.2_py361.0.2_py391.3.2_py39 Website Scikit-learn is a free software machine learning library for the Python programming language. numerical analysis matlab R2017a R2017b R2018a R2019a R2020a R2022b Website MATLAB is a multi-paradigm numerical computing environment and proprietary programming language developed by MathWorks. numerical analysis octave 4.2.1 Website GNU Octave is a high-level language primarily intended for numerical computations. numerical library arpack 3.5.03.7.0 3.9.0 Website Collection of Fortran77 subroutines designed to solve large scale eigenvalue problems. numerical library blis 2.12.2.43.1.0 Website BLIS is a portable software framework for instantiating high-performance BLAS-like dense linear algebra libraries. numerical library fftw 2.1.53.3.6 3.3.8 3.3.93.3.10 Website The Fastest Fourier Transform in the West (FFTW) is a software library for computing discrete Fourier transforms (DFTs). numerical library flexiblas 3.1.3 Website FlexiBLAS is a BLAS and LAPACK wrapper library with runtime exchangeable backends. numerical library flint 2.9.0 Website FLINT is a C library for doing number theory. 
numerical library glpk 4.63 Website The GLPK (GNU Linear Programming Kit) package is intended for solving large-scale linear programming (LP), mixed integer programming (MIP), and other related problems. numerical library gmp 6.1.26.2.1 Website GMP is a free library for arbitrary precision arithmetic, operating on signed integers, rational numbers, and floating-point numbers. numerical library gsl 1.162.32.7 Website The GNU Scientific Library (GSL) is a numerical library for C and C++ programmers. The library provides a wide range of mathematical routines such as random number generators, special functions and least-squares fitting. numerical library harminv 1.4.1 Website harminv is a program designed to solve the problem of harmonic inversion: given a time series consisting of a sum of sinusoids (modes), extract their frequencies and amplitudes. numerical library hypre 2.20.0 Website HYPRE is a library of high performance preconditioners and solvers featuring multigrid methods for the solution of large, sparse linear systems of equations on massively parallel computers. numerical library imkl 2017.u22018.u120182019 Website Intel Math Kernel Library (Intel MKL) is a library of optimized math routines for science, engineering, and financial applications. 
Core math functions include BLAS, LAPACK, ScaLAPACK, sparse solvers, fast Fourier transforms, and vector math.[3] The routines in MKL are hand-optimized specifically for Intel processors numerical library libflame 2.12.2.43.1.0 Website libflame is a portable library for dense matrix computations, providing much of the functionality present in LAPACK numerical library libxsmm 1.8.11.17 Website LIBXSMM is a library for small dense and small sparse matrix-matrix multiplications as well as for deep learning primitives such as small convolutions numerical library metis 5.1.0 Website METIS is a set of serial programs for partitioning graphs, partitioning finite element meshes, and producing fill reducing orderings for sparse matrices. numerical library mpc 1.2.1 Website GNU MPC is a C library for the arithmetic of complex numbers with arbitrarily high precision and correct rounding of the result. numerical library mpfr 3.1.54.1.0 Website The MPFR library is a C library for multiple-precision floating-point computations with correct rounding. numerical library mumps 5.1.2 Website A parallel sparse direct solver. numerical library openblas 0.3.10 0.2.190.3.40.3.90.3.200.3.26 Website OpenBLAS is an optimized BLAS library numerical library parmetis 4.0.3 Website ParMETIS is an MPI-based parallel library that implements a variety of algorithms for partitioning unstructured graphs, meshes, and for computing fill-reducing orderings of sparse matrices. numerical library petsc 3.10.3 3.18.5 Website PETSc, the Portable, Extensible Toolkit for Scientific Computation, is a suite of data structures and routines for the scalable (parallel) solution of scientific applications modeled by partial differential equations. numerical library py-autograd 1.0_py39 Website Autograd can automatically differentiate native Python and Numpy code. numerical library py-cupy 7.8.0_py36 10.2.0_py39 12.1.0_py39 Website CuPy is an implementation of NumPy-compatible multi-dimensional array on CUDA. 
numerical library py-gmpy2 2.0.8_py36 Website gmpy2 is a C-coded Python extension module that supports multiple-precision arithmetic. numerical library py-jax 0.4.7_py39 Website JAX is Autograd and XLA, brought together for high-performance numerical computing. numerical library py-jaxlib 0.4.7_py39 Website XLA library for Jax. numerical library py-numpy 1.14.3_py27 1.14.3_py361.17.2_py361.18.1_py361.19.2_py361.20.3_py391.24.2_py391.26.3_py312 Website NumPy is the fundamental package for scientific computing with Python. numerical library py-petsc4py 3.18.5_py39 Website Python bindings for PETSc, the Portable, Extensible Toolkit for Scientific Computation. numerical library py-psbody-mesh 0.4_py39 Website The MPI-IS Mesh Processing Library contains core functions for manipulating meshes and visualizing them. numerical library py-pyublas 2017.1_py27 Website PyUblas provides a seamless glue layer between Numpy and Boost.Ublas for use with Boost.Python. numerical library py-scipy 1.1.0_py27 1.1.0_py361.4.1_py361.6.3_py391.10.1_py39 Website The SciPy library provides many user-friendly and efficient numerical routines such as routines for numerical integration and optimization. numerical library py-slepc4py 3.18.2_py39 Website Python bindings for SLEPc. numerical library py-tabmat 3.1.2_py39 Website Efficient matrix representations for working with tabular data. numerical library qrupdate 1.1.2 Website qrupdate is a Fortran library for fast updates of QR and Cholesky decompositions. numerical library scalapack 2.0.2 2.1 2.2.0 Website ScaLAPACK is a library of high-performance linear algebra routines for parallel distributed memory machines. numerical library scotch 6.0.4 Website Software package and libraries for sequential and parallel graph partitioning, static mapping and clustering, sequential mesh and hypergraph partitioning, and sequential and parallel sparse matrix block ordering. 
numerical library slepc 3.18.2 Website SLEPc is a Scalable Library for Eigenvalue Problem Computations. numerical library suitesparse 7.4.0 Website SuiteSparse is a suite of sparse matrix algorithms. numerical library superlu 5.2.1 Website SuperLU is a general purpose library for the direct solution of large, sparse, nonsymmetric systems of linear equations. numerical library tetgen 1.6.0 Website TetGen provides various features to generate good quality and adaptive tetrahedral meshes suitable for numerical methods, such as finite element or finite volume methods. numerical library xblas 1.0.248 Website Extra precise basic linear algebra subroutines. optimization gurobi 7.5.18.0.1_py278.0.1_py369.0.3_py3610.0.1_py39 Website The Gurobi Optimizer is a commercial optimization solver for mathematical programming. optimization knitro 10.3.0 12.4.0 Website Artelys Knitro is an optimization solver for difficult large-scale nonlinear problems. optimization nlopt 2.6.2 Website NLopt is a free/open-source library for nonlinear optimization. optimization octeract 3.3.0 Website Octeract Engine is a proprietary massively parallel deterministic global optimization solver for general Mixed-Integer Nonlinear Programs (MINLP). optimization py-optuna 2.10.0_py39 Website Optuna is an automatic hyperparameter optimization software framework, particularly designed for machine learning. optimization sundials 6.4.1 Website SUNDIALS is a family of software packages providing robust and efficient time integrators and nonlinear solvers that can easily be incorporated into existing simulation codes. scientific computing py-scipystack 1.0_py27 1.0_py36 Website The SciPy Stack is a collection of open source software for scientific computing in Python. It provides the following packages: numpy, scipy, matplotlib, ipython, jupyter, pandas, sympy and nose. 
statistics datamash 1.3 Website GNU datamash is a command-line program which performs basic numeric, textual and statistical operations on input textual data files. statistics jags 4.3.04.3.1 Website Just another Gibbs sampler (JAGS) is a program for simulation from Bayesian hierarchical models using Markov chain Monte Carlo (MCMC). statistics py-emcee 3.1.4_py39 Website The Python ensemble sampling toolkit for affine-invariant MCMC statistics py-glum 2.1.2_py39 Website glum is a fast, modern, Python-first GLM estimation library. statistics py-rpy2 2.8.6_py272.9.2_py36 Website rpy2 is an interface to R running embedded in a Python process. statistics R 3.5.1 3.4.03.6.14.0.24.1.24.2.04.3.2 Website R is a free software environment for statistical computing and graphics. statistics rstudio 1.3.1093 2023.09.1 Website RStudio is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management. statistics rstudio-desktop 2022.02.2-485 Website RStudio is an integrated development environment (IDE) for R. It includes a console, syntax-highlighting editor that supports direct code execution, as well as tools for plotting, history, debugging and workspace management. This is the X11/GUI version. statistics sas 9.4 Website SAS is a software suite developed by SAS Institute for advanced analytics, multivariate analyses, business intelligence, data management, and predictive analytics. statistics stata 15 14 16 17 18 Website Stata is a complete, integrated statistical software package that provides everything you need for data analysis, data management, and graphics. symbolic libmatheval 1.1.11 Website GNU libmatheval is a library (callable from C and Fortran) to parse and evaluate symbolic expressions input as text. symbolic maxima 5.47.0 Website Maxima is a system for the manipulation of symbolic and numerical expressions. 
symbolic py-pysr 0.12.3_py39 Website High-Performance Symbolic Regression in Python and Julia. symbolic py-sympy 1.1.1_py271.1.1_py361.11.1_py39 Website SymPy is a Python library for symbolic mathematics. technical computing mathematica 13.1.0 Website A symbolic language and platform for modern technical computing. topic modelling py-gensim 4.2.0_py39 Website Gensim is a Python library for topic modelling, document indexing and similarity retrieval with large corpora.","location":"docs/software/list/#math"},{"title":"physics","text":"Field Module\u00a0name Version(s) URL Description astronomy cfitsio 4.0.0 Website FITSIO is a library of C and Fortran subroutines for reading and writing data files in FITS (Flexible Image Transport System) data format. astronomy heasoft 6.22.16.26.1 Website HEAsoft is a Unified Release of the FTOOLS (General and mission-specific tools to manipulate FITS files) and XANADU (High-level, multi-mission tasks for X-ray astronomical spectral, timing, and imaging data analysis) software packages. astronomy py-astropy 4.0.1_py36 Website The Astropy Project is a community effort to develop a common core package for Astronomy in Python and foster an ecosystem of interoperable astronomy packages. astronomy py-lenstools 1.0_py36 Website This python package collects together a suite of widely used analysis tools in Weak Gravitational Lensing. astronomy py-namaster 1.2.2_py36 Website NaMaster is a C library, Python module and standalone program to compute full-sky angular cross-power spectra of masked fields with arbitrary spin and an arbitrary number of known contaminants using a pseudo-Cl (aka MASTER) approach. CFD su2 7.0.3 Website SU2: An Open-Source Suite for Multiphysics Simulation and Design climate modeling fre-nctools 2022.01 Website FRE-NCtools is a collection of tools to help with the creation and manipulation of netCDF files used for climate modeling. 
climate modeling cdo 1.9.7.12.1.1 Website CDO is a collection of command line Operators to manipulate and analyse Climate and NWP model Data. geophysics opensees 2.5.0 Website OpenSees is a software framework for developing applications to simulate the performance of structural and geotechnical systems subjected to earthquakes. geoscience gdal 3.4.1 2.2.13.5.2 Website GDAL is a translator library for raster and vector geospatial data formats. geoscience geos 3.6.2 3.11.03.12.1 Website GEOS (Geometry Engine - Open Source) is a C++ port of Java Topology Suite (JTS). geoscience geosx 0.2.0-20220523 Website GEOSX is a simulation framework for modeling coupled flow, transport, and geomechanics in the subsurface. geoscience gmtsar 6.2.2 Website An InSAR processing system based on GMT (Generic Mapping Tools). geoscience proj 8.2.1 4.9.39.1.0 Website PROJ is a generic coordinate transformation software that transforms geospatial coordinates from one coordinate reference system (CRS) to another. geoscience py-opendrift 1.0.3_py27 Website OpenDrift is a software for modeling the trajectories and fate of objects or substances drifting in the ocean, or even in the atmosphere. geoscience py-pyproj 1.9.5.1_py27 1.9.5.1_py363.4.0_py39 Website Python interface to PROJ4 library for cartographic transformations. geoscience swash 9.01a Website SWASH (an acronym of Simulating WAves till SHore) is a non-hydrostatic wave-flow model. geoscience udunits 2.2.26 Website The UDUNITS package from Unidata is a C-based package for the programmatic handling of units of physical quantities. lib libgdsii 0.21 Website libGDSII C++ is a library and command-line utility for reading GDSII geometry files. magnetism mumax 3.10 Website mumax3 is a GPU-accelerated micromagnetic simulation program. materials science atat 3.36 Website Alloy Theoretic Automated Toolkit: a software toolkit for modeling coupled configurational and vibrational disorder in alloy systems. 
materials science py-megnet 1.3.0_py39 Website The MatErials Graph Network (MEGNet) is an implementation of DeepMind's graph networks[1] for universal machine learning in materials science. materials science py-pymatgen 2022.5.26_py39 Website Pymatgen (Python Materials Genomics) is a robust, open-source Python library for materials analysis. micromagnetics oommf 1.2b4 Website OOMMF is a set of portable, extensible public domain micromagnetic program and associated tools. particle openmc 0.10.0 Website OpenMC is a Monte Carlo particle transport simulation code focused on neutron criticality calculations. photonics meep 1.3 1.4.3 1.24.0 Website Meep is a free finite-difference time-domain (FDTD) simulation software package to model electromagnetic systems. photonics mpb 1.5 1.6.2 1.11.1 Website MPB is a free software package for computing the band structures, or dispersion relations, and electromagnetic modes of periodic dielectric structures, on both serial and parallel computers. quantum information science cuquantum 22.03.0.40 Website NVIDIA cuQuantum is an SDK of optimized libraries and tools for accelerating quantum computing workflows. quantum information science py-cuquantum-python 22.3.0_py39 Website NVIDIA cuQuantum Python provides Python bindings and high-level object-oriented models for accessing the full functionalities of NVIDIA cuQuantum SDK from Python. quantum mechanics py-quspin 0.3.5_py36 Website QuSpin is an open-source Python package for exact diagonalization and quantum dynamics of arbitrary boson, fermion and spin many-body systems. quantum mechanics py-qutip 4.5.2_py36 Website QuTiP is open-source software for simulating the dynamics of closed and open quantum systems.","location":"docs/software/list/#physics"},{"title":"system","text":"Field Module\u00a0name Version(s) URL Description backup restic 0.9.50.12.10.16.3 Website Fast, secure, efficient backup program. 
benchmark hp2p 3.2 Website Heavy Peer To Peer: a MPI based benchmark for network diagnostic. benchmark mpibench 20190729 Website Times MPI collectives over a series of message sizes. benchmark mprime 29.4 Website mprime is used by GIMPS, a distributed computing project dedicated to finding new Mersenne prime numbers, and which is commonly used as a stability testing utility. benchmark osu-micro-benchmarks 5.6.1 5.6.3 5.7 5.9 Website The OSU MicroBenchmarks carry out a variety of message passing performance tests using MPI. benchmark py-linktest 2.1.19_py39 Website LinkTest is a communication API benchmarking tool that tests point-to-point connections. checkpointing dmtcp 2.6.0 Website DMTCP (Distributed MultiThreaded Checkpointing) transparently checkpoints a single-host or distributed computation in user-space -- with no modifications to user code or to the O/S. cloud interface aws-cli 2.0.50 Website This package provides a unified command line interface to Amazon Web Services. cloud interface google-cloud-sdk 338.0.0400.0.0448.0.0 Website Command-line interface for Google Cloud Platform products and services. cloud interface s5cmd 2.0.0 Website Parallel S3 and local filesystem execution tool. cloud interface steampipe 0.14.6 Website Steampipe is an open source tool for querying cloud APIs in a universal way and reasoning about the data in SQL. compiler mrc 1.3.3 Website MRC is a resource compiler that can create self-contained applications, by including all the required data inside executable files. compression libarchive 3.3.23.4.23.5.2 Website The libarchive project develops a portable, efficient C library that can read and write streaming archives in a variety of formats. compression libzip 1.5.1 Website libzip is a C library for reading, creating, and modifying zip archives. compression lz4 1.8.0 Website LZ4 is a lossless compression algorithm. compression lzo 2.10 Website LZO is a portable lossless data compression library written in ANSI C. 
compression mpibzip2 0.6 Website MPIBZIP2 is a parallel implementation of the bzip2 block-sorting file compressor that uses MPI and achieves significant speedup on cluster machines. compression p7zip 16.02 Website p7zip is a Linux port of 7zip, a file archiver with high compression ratio. compression pbzip2 1.1.12 Website PBZIP2 is a parallel implementation of the bzip2 block-sorting file compressor that uses pthreads and achieves near-linear speedup on SMP machines. compression pigz 2.4 Website A parallel implementation of gzip for modern multi-processor, multi-core machines. compression szip 2.1.1 Website Szip compression software, providing lossless compression of scientific data, is an implementation of the extended-Rice lossless compression algorithm. compression xz 5.2.3 Website XZ Utils, the successor to LZMA Utils, is free general-purpose data compression software with a high compression ratio. compression zlib 1.2.11 Website zlib is designed to be a free, general-purpose, legally unencumbered -- that is, not covered by any patents -- lossless data-compression library for use on virtually any computer hardware and operating system. compression zstd 1.5.2 Website Zstandard, or zstd, is a fast lossless compression algorithm, targeting real-time compression scenarios at zlib-level and better compression ratios. containers libnvidia-container 1.0.0rc2 Website libnvidia-container is a library and a simple CLI utility to automatically configure GNU/Linux containers leveraging NVIDIA hardware. containers proot 5.2.0 5.1.0 Website PRoot is a user-space implementation of chroot, mount --bind, and binfmt_misc. containers py-spython 0.3.13_py390.3.13_py312 Website Singularity Python (spython) is the Python API for working with Singularity containers. database bdb 6.2.32 Website Berkeley DB (BDB) is a software library intended to provide a high-performance embedded database for key/value data. 
database mariadb 10.2.11 10.6.9 Website MariaDB is a community-developed fork of the MySQL relational database management system intended to remain free under the GNU GPL. database postgresql 10.514.5 Website PostgreSQL is a powerful, open source object-relational database system with a strong focus on reliability, feature robustness, and performance. database sqlite 3.18.03.37.23.44.2 Website SQLite is a self-contained, high-reliability, embedded, full-featured, public-domain, SQL database engine. database sqliteodbc 0.9998 Website ODBC driver for SQLite database unixodbc 2.3.9 Website unixODBC is an open-source project that implements the ODBC API. document management pandoc 2.7.3 Website Pandoc is a universal document converter. document processing ghostscript 9.53.2 Website Ghostscript is an interpreter for the PostScript language and PDF files. document processing groff 1.23.0 Website groff (GNU roff) is a typesetting system that reads plain text input files that include formatting commands to produce output in PostScript, PDF, HTML, or DVI formats or for display to a terminal. document processing lyx 2.3.2 Website LyX is a document processor. document processing poppler 0.47.0 Website Poppler is a PDF rendering library. document processing texinfo 6.6 Website Texinfo is the official documentation format of the GNU project. document processing texlive 2019 Website TeX Live is an easy way to get up and running with the TeX document production system. file management dua-cli 2.20.1 Website dua (-> Disk Usage Analyzer) is a tool to conveniently learn about the usage of disk space of a given directory. file management duc 1.4.4 Website Duc is a collection of tools for indexing, inspecting and visualizing disk usage. file management exa 0.8.0 Website exa is a replacement for ls written in Rust. file management fdupes 2.2.1 Website FDUPES is a program for identifying or deleting duplicate files residing within specified directories. 
file management fpart 0.9.3 Website fpart sorts files and packs them into partitions. file management midnight-commander 4.8.29 Website GNU Midnight Commander is a visual file manager. file management ncdu 1.18.1 1.15.12.2.1 Website Ncdu is a disk usage analyzer with an ncurses interface. file management py-pcircle 0.17_py27 Website pcircle contains a suite of file system tools developed at OLCF to take advantage of highly scalable parallel file system such as Lustre. file management rmlint 2.8.0 Website rmlint finds space waste and other broken things on your filesystem and offers to remove it. file management tdu 1.36 Website tdu estimates the disk space occupied by all files in a given path. file transfer aria2 1.35.0 Website aria2 is a lightweight multi-protocol & multi-source command-line download utility. file transfer aspera-cli 3.9.6 Website The IBM Aspera Command-Line Interface (the Aspera CLI) is a collection of Aspera tools for performing high-speed, secure data transfers from the command line. file transfer gsutil 4.31 Website gsutil is a Python application that lets you access Cloud Storage from the command line. file transfer lftp 4.8.1 Website LFTP is a sophisticated file transfer program supporting a number of network protocols (ftp, http, sftp, fish, torrent). file transfer mpifileutils 0.10.1 0.11 0.11.1 Website mpiFileUtils is a suite of MPI-based tools to manage large datasets, which may vary from large directory trees to large files. file transfer py-globus-cli 1.2.01.9.0_py271.9.0_py363.2.0_py393.8.0_py393.19.0_py39 Website A command line wrapper over the Globus SDK for Python. file transfer py-httpie 3.2.1_py39 Website HTTPie is a command-line HTTP client designed for testing, debugging, and generally interacting with APIs and HTTP servers. 
file transfer rclone 1.55.11.59.11.65.0 Website Rclone is a command line program to sync files and directories to and from: Google Drive, Amazon S3, Dropbox, Google Cloud Storage, Amazon Drive, Microsoft One Drive, Hubic, Backblaze B2, Yandex Disk, or the local filesystem. framework mono 5.12.0.3015.20.1.19 Website Mono is an open source implementation of Microsoft's .NET Framework based on the ECMA standards for C# and the Common Language Runtime. hardware hwloc 2.7.02.9.3 Website The Portable Hardware Locality (hwloc) software package provides a portable abstraction of the hierarchical topology of modern architectures. hardware libpciaccess 0.16 Website Generic PCI access library. job management slurm-drmaa 1.1.2 Website DRMAA for Slurm Workload Manager (Slurm) is an implementation of Open Grid Forum Distributed Resource Management Application API (DRMAA) version 1 for submission and control of jobs to Slurm. language tcltk 8.6.6 Website Tcl (Tool Command Language) is a dynamic programming language, suitable for web and desktop applications, networking, administration, testing. Tk is a graphical user interface toolkit. libs apr 1.6.3 Website The Apache Portable Runtime is a supporting library for the Apache web server. It provides a set of APIs that map to the underlying operating system. libs apr-util 1.6.1 Website The Apache Portable Runtime is a supporting library for the Apache web server. It provides a set of APIs that map to the underlying operating system. libs atk 2.24.0 Website ATK is the Accessibility Toolkit. It provides a set of generic interfaces allowing accessibility technologies such as screen readers to interact with a graphical user interface. libs benchmark 1.2.0 Website A microbenchmark support library libs cairo 1.14.10 Website Cairo is a 2D graphics library with support for multiple output devices. libs cups 2.2.4 Website CUPS is the standards-based, open source printing system. 
libs dbus 1.10.22 Website D-Bus is a message bus system, a simple way for applications to talk to one another. libs enchant 1.6.12.2.3 Website Enchant is a library (and command-line program) that wraps a number of different spelling libraries and programs with a consistent interface. libs fltk 1.3.4 Website FLTK (pronounced 'fulltick') is a cross-platform C++ GUI toolkit. libs fontconfig 2.12.4 Website Fontconfig is a library for configuring and customizing font access. libs freeglut 3.0.0 Website FreeGLUT is a free-software/open-source alternative to the OpenGL Utility Toolkit (GLUT) library. libs freetype 2.8.12.9.1 Website FreeType is a software font engine that is designed to be small, efficient, highly customizable, and portable while capable of producing high-quality output (glyph images). libs fribidi 1.0.12 Website The Free Implementation of the Unicode Bidirectional Algorithm. libs ftgl 2.1.2 Website FTGL is a free cross-platform Open Source C++ library that uses Freetype2 to simplify rendering fonts in OpenGL applications. libs gc 7.6.0 Website The Boehm-Demers-Weiser conservative garbage collector can be used as a garbage collecting replacement for C malloc or C++ new. libs gconf 2.9.91 Website GConf is a system for storing application preferences. libs gdk-pixbuf 2.36.8 Website The GdkPixbuf library provides facilities for loading images in a variety of file formats. libs gflags 2.2.12.2.2 Website The gflags package contains a C++ library that implements commandline flags processing. libs giflib 5.1.4 Website GIFLIB is a package of portable tools and library routines for working with GIF images. libs glib 2.52.3 Website The GLib library provides core non-graphical functionality such as high level data types, Unicode manipulation, and an object and type system to C programs. libs glog 0.3.5 Website C++ implementation of the Google logging module. 
libs gnutls 3.5.9 Website GnuTLS is a secure communications library implementing the SSL, TLS and DTLS protocols and technologies around them. libs gobject-introspection 1.52.1 Website GObject introspection is a middleware layer between C libraries (using GObject) and language bindings. libs googletest 1.8.0 Website Google Test is Google's C++ test framework. libs gstreamer 1.12.0 Website GStreamer is a library for constructing graphs of media-handling components. libs gtk+ 2.24.303.22.18 Website GTK+, or the GIMP Toolkit, is a multi-platform toolkit for creating graphical user interfaces. libs harfbuzz 1.4.8 Website HarfBuzz is an OpenType text shaping engine. libs hunspell 1.6.2 Website Hunspell is a spell checker. libs hyphen 2.8.8 Website Hyphen is a hyphenation library to use converted TeX hyphenation patterns. libs icu 59.1 Website ICU is a set of C/C++ and Java libraries providing Unicode and Globalization support for software applications. libs jansson 2.13.1 Website C library for encoding, decoding and manipulating JSON data. libs jemalloc 5.3.0 Website jemalloc is a general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support. libs json-glib 1.4.4 Website JSON-GLib is a library providing serialization and deserialization support for the JavaScript Object Notation (JSON) format described by RFC 4627. libs leptonica 1.82.0 Website Leptonica is an open source library containing software that is broadly useful for image processing and image analysis applications. libs libaio 0.3.111 Website libaio provides the Linux-native API for async I/O. libs libart_lgpl 2.3.21 Website Libart is a library for high-performance 2D graphics. libs libcroco 0.6.13 Website Libcroco is a standalone css2 parsing and manipulation library. libs libepoxy 1.4.1 Website Epoxy is a library for handling OpenGL function pointer management for you. libs libexif 0.6.21 Website A library for parsing, editing, and saving EXIF data. 
libs libffi 3.2.1 Website libffi is a portable Foreign Function Interface library. libs libgcrypt 1.8.2 Website Libgcrypt is a general purpose cryptographic library originally based on code from GnuPG. libs libgd 2.2.5 Website GD is an open source code library for the dynamic creation of images by programmers. libs libgdiplus 5.6 Website C-based implementation of the GDI+ API libs libglvnd 1.2.0 Website libglvnd is a vendor-neutral dispatch layer for arbitrating OpenGL API calls between multiple vendors. libs libgnomecanvas 2.30.3 Website Library for the GNOME canvas, an engine for structured graphics that offers a rich imaging model, high performance rendering, and a powerful, high-level API. libs libgpg-error 1.27 Website Libgpg-error is a small library that originally defined common error values for all GnuPG components. libs libiconv 1.16 Website libiconv is a conversion library for string encoding. libs libidl 0.8.14 Website The libIDL package contains libraries for Interface Definition Language files. This is a specification for defining portable interfaces. libs libjpeg-turbo 1.5.1 2.1.4 Website libjpeg-turbo is a JPEG image codec that uses SIMD instructions (MMX, SSE2, AVX2, NEON, AltiVec) to accelerate baseline JPEG compression and decompression on x86, x86-64, ARM, and PowerPC systems libs libmng 2.0.3 Website THE reference library for reading, displaying, writing and examining Multiple-Image Network Graphics. MNG is the animation extension to the popular PNG image-format. libs libpng 1.2.571.6.29 Website libpng is the official PNG reference library. It supports almost all PNG features, is extensible, and has been extensively tested for over 20 years. libs libproxy 0.4.15 Website libproxy is a library that provides automatic proxy configuration management. 
libs libressl 2.5.33.2.1 Website LibreSSL is a version of the TLS/crypto stack forked from OpenSSL in 2014, with goals of modernizing the codebase, improving security, and applying best practice development processes. libs librsvg 2.36.4 Website Librsvg is a library to render SVG files using cairo as a rendering engine. libs libseccomp 2.3.3 Website The libseccomp library provides an easy to use, platform independent, interface to the Linux Kernel's syscall filtering mechanism. libs libsodium 1.0.18 Website Sodium is a modern, easy-to-use software library for encryption, decryption, signatures, password hashing and more. libs libsoup 2.61.2 Website libsoup is an HTTP client/server library for GNOME. libs libtasn1 4.13 Website Libtasn1 is the ASN.1 library used by GnuTLS, p11-kit and some other packages. libs libtiff 4.0.8 4.4.04.5.0 Website libtiff provides support for the Tag Image File Format (TIFF), a widely used format for storing image data. libs libunistring 0.9.7 Website Libunistring provides functions for manipulating Unicode strings and for manipulating C strings according to the Unicode standard. libs libuuid 1.0.3 Website Portable uuid C library. libs libuv 1.38.1 Website libuv is a multi-platform support library with a focus on asynchronous I/O. libs libwebp 0.6.1 Website WebP is a modern image format that provides superior lossless and lossy compression for images on the web. libs libxkbcommon 0.9.1 Website libxkbcommon is a keyboard keymap compiler and support library which processes a reduced subset of keymaps as defined by the XKB (X Keyboard Extension) specification. libs libxml2 2.9.4 Website Libxml2 is an XML C parser and toolkit. libs libxslt 1.1.32 Website Libxslt is the XSLT C library developed for the GNOME project. XSLT itself is an XML language to define transformation for XML. libs mesa 17.1.6 Website Mesa is an open-source implementation of the OpenGL, Vulkan and other specifications. 
libs minipmi 1.0 Website Implementation of a minimal subset of the PMI1 and PMI2 specifications. libs ncurses 6.06.4 Website The ncurses (new curses) library is a free software emulation of curses in System V Release 4.0 (SVr4), and more. libs nettle 3.3 Website Nettle is a cryptographic library that is designed to fit easily in more or less any context. libs openjpeg 2.3.1 Website OpenJPEG is an open-source JPEG 2000 codec written in C language. libs openssl 3.0.7 Website OpenSSL is a full-featured toolkit for general-purpose cryptography and secure communication. libs orbit 2.14.19 Website ORBit2 is a CORBA 2.4-compliant Object Request Broker (ORB) featuring mature C, C++ and Python bindings. libs pango 1.40.10 Website Pango is a library for laying out and rendering of text, with an emphasis on internationalization. libs pcre 8.40 Website The PCRE library is a set of functions that implement regular expression pattern matching using the same syntax and semantics as Perl 5. libs pcre2 10.3510.40 Website The PCRE2 library is a set of functions that implement regular expression pattern matching using the same syntax and semantics as Perl 5. libs popt 1.16 Website Library for parsing command line options. libs py-lmdb 0.93 Website Universal Python binding for the LMDB 'Lightning' Database. libs py-mako 1.0.7_py27 1.0.7_py36 Website Mako is a template library written in Python. It provides a familiar, non-XML syntax which compiles into Python modules for maximum performance. libs py-pygobject 3.32.2_py36 Website PyGObject is a Python package which provides bindings for GObject based libraries such as GTK, GStreamer, WebKitGTK, GLib, GIO and many more. libs py-pyopengl 3.1.5_py39 Website Standard OpenGL bindings for Python. libs py-pyqt5 5.9.1_py36 Website PyQt5 is a comprehensive set of Python bindings for Qt v5. 
libs readline 7.08.2 Website The GNU Readline library provides a set of functions for use by applications that allow users to edit command lines as they are typed in. libs serf 1.3.9 Website The serf library is a high performance C-based HTTP client library built upon the Apache Portable Runtime (APR) library. libs sionlib 1.7.7 Website Scalable I/O library for parallel access to task-local files. libs snappy 1.1.7 Website A fast compressor/decompressor. libs talloc 2.1.14 Website talloc is a hierarchical, reference counted memory pool system with destructors. libs tesseract 5.1.0 Website Tesseract is an open source text recognition (OCR) Engine. libs utf8proc 2.4.0 Website utf8proc is a small, clean C library that provides Unicode normalization, case-folding, and other operations for data in the UTF-8 encoding. libs wxwidgets 3.0.4 Website wxWidgets is a C++ library that lets developers create applications for Windows, macOS, Linux and other platforms with a single code base. libs yaml-cpp 0.7.0 Website yaml-cpp is a YAML parser and emitter in C++ matching the YAML 1.2 spec. media ffmpeg 4.04.2.15.0 Website FFmpeg is the leading multimedia framework, able to decode, encode, transcode, mux, demux, stream, filter and play pretty much anything that humans and machines have created. media libsndfile 1.0.28 Website Libsndfile is a C library for reading and writing files containing sampled sound (such as MS Windows WAV and the Apple/SGI AIFF format) through one standard library interface. performance likwid 4.3.25.2.1 Website Likwid is a simple toolsuite of command line applications for performance oriented programmers. resource monitoring nvtop 1.1.0 2.0.3 3.0.2 Website Nvtop stands for NVidia TOP, a (h)top like task monitor for NVIDIA GPUs. resource monitoring remora 1.8.5 Website Remora is a tool to monitor runtime resource utilization. resource monitoring ruse 2.0 Website A command line tool to measure process resource usage. 
scm gh 1.9.1 Website gh is GitHub on the command line. It brings pull requests, issues, and other GitHub concepts to the terminal next to where you are already working with git and your code. scm git 2.39.1 Website Git is a free and open source distributed version control system designed to handle everything from small to very large projects with speed and efficiency. scm git-annex 8.20210622 Website git-annex allows managing files with git, without checking the file contents into git. scm git-credential-manager 2.0.696 Website Secure, cross-platform Git credential storage with authentication to GitHub, Azure Repos, and other popular Git hosting services. scm git-lfs 2.4.0 Website Git Large File Storage (LFS) replaces large files such as audio samples, videos, datasets, and graphics with text pointers inside Git, while storing the file contents on a remote server. scm libgit2 1.1.0 Website libgit2 is a portable, pure C implementation of the Git core methods provided as a re-entrant linkable library with a solid API scm mercurial 4.5.3 Website Mercurial is a free, distributed source control management tool. scm py-dvc 0.91.1_py36 Website Data Version Control or DVC is an open-source tool for data science and machine learning projects. scm subversion 1.9.71.12.2 Website Subversion is an open source version control system. shell powershell 7.1.5 Website PowerShell Core is a cross-platform automation and configuration tool/framework. testing py-pytest 7.1.3_py39 Website pytest is a full-featured Python testing framework tools clinfo 2.2.18.04.06 Website clinfo is a simple command-line application that enumerates all possible (known) properties of the OpenCL platform and devices available on the system. tools curl 8.4.0 Website curl is an open source command line tool and library for transferring data with URL syntax. tools depot_tools 20200731 Website Tools for working with Chromium development. 
tools expat 2.2.3 Website Expat is a stream-oriented XML parser library written in C. tools graphicsmagick 1.3.26 Website GraphicsMagick is the swiss army knife of image processing. tools imagemagick 7.0.7-2 Website ImageMagick is a free and open-source software suite for displaying, converting, and editing raster image and vector image files. tools jq 1.6 Website jq is a lightweight and flexible command-line JSON processor. tools leveldb 1.20 Website Symas LMDB is an extraordinarily fast, memory-efficient database we developed for the Symas OpenLDAP Project. tools lmdb 0.9.21 Website Symas LMDB is an extraordinarily fast, memory-efficient database we developed for the Symas OpenLDAP Project. tools motif 2.3.7 Website Motif is the toolkit for the Common Desktop Environment. tools parallel 2018012220200822 Website GNU parallel is a shell tool for executing jobs in parallel using one or more computers. tools password-store 1.7.4 Website Simple password manager using gpg and ordinary unix directories. tools py-clustershell 1.9.0_py39 Website ClusterShell is an event-driven open source Python library, designed to run local or distant commands in parallel on server farms or on large Linux clusters. tools py-matlab-proxy 0.9.1_py390.10.0_py39 Website matlab-proxy is a Python package which enables you to launch MATLAB and access it from a web browser. tools py-pyside 5.15.2.1_py39 Website PySide is the official Python module from the Qt for Python project, which provides access to the complete Qt framework. tools py-wxpython 4.0.7_py394.2.0_py39 Website wxPython is the cross-platform GUI toolkit for the Python language, tools qt 5.9.1 6.4.0 Website QT is a cross-platform application framework that is used for developing application software that can be run on various software and hardware platforms. tools ripgrep 11.0.1 Website ripgrep recursively searches directories for a regex pattern. 
tools rocksdb 5.7.3 Website A library that provides an embeddable, persistent key-value store for fast storage. tools x11 7.7 Website The X.Org project provides an open source implementation of the X Window System. tools xkeyboard-config 2.21 Website The non-arch keyboard configuration database for X Window.","location":"docs/software/list/#system"},{"title":"viz","text":"Field Module\u00a0name Version(s) URL Description data ncview 2.1.7 Website Ncview is a visual browser for netCDF format files. gis gmt 6.4.0 Website GMT (The Generic Mapping Tools) is an open source collection of command-line tools for manipulating geographic and Cartesian data sets. gis panoply 4.10.8 Website Panoply plots geo-referenced and other arrays from netCDF, HDF, GRIB, and other datasets. gis py-cartopy 0.21.0_py39 Website Cartopy is a Python package designed for geospatial data processing in order to produce maps and other geospatial data analyses. graphs graphviz 2.40.12.44.1 Website Graphviz is open source graph visualization software. imaging py-pillow 5.1.0_py27 5.1.0_py367.0.0_py368.2.0_py399.3.0_py39 Website Pillow is a friendly PIL (Python Imaging Library) fork. imaging py-pillow-simd 7.0.0.post3_py369.2.0_py39 Website Pillow-SIMD is an optimized version of Pillow molecular visualization ovito 3.7.11 Website OVITO is a scientific visualization and data analysis solution for atomistic and other particle-based models. molecular visualization pymol 1.8.6.2 2.5.3 Website PyMOL is a Python-enhanced molecular graphics tool. plotting gnuplot 5.2.0 Website Gnuplot is a portable command-line driven graphing utility for Linux, OS/2, MS Windows, OSX, VMS, and many other platforms. plotting grace 5.1.25 Website Grace is a WYSIWYG tool to make two-dimensional plots of numerical data. plotting mathgl 8.0.1 Website MathGL is a library to make high-quality scientific graphics. 
plotting py-basemap 1.1.0_py27 1.1.0_py36 Website The matplotlib basemap toolkit is a library for plotting 2D data on maps in Python. plotting py-matplotlib 2.2.2_py27 2.1.2_py272.1.2_py362.2.2_py363.1.1_py363.2.1_py363.4.2_py393.7.1_py39 Website Matplotlib is a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms. plotting py-plotly 2.4.1_py27 Website Plotly's Python graphing library makes interactive, publication-quality graphs online. plotting py-seaborn 0.12.1_py39 Website Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics. plotting veusz 3.3.1 Website Veusz is a scientific plotting and graphing program with a graphical user interface, designed to produce publication-ready 2D and 3D plots. remote display virtualgl 2.5.2 Website VirtualGL is an open source toolkit that gives any Unix or Linux remote display software the ability to run OpenGL applications with full 3D hardware acceleration.","location":"docs/software/list/#viz"},{"title":"Modules","text":"","location":"docs/software/modules/"},{"title":"Environment modules","text":"

Software is provided on Sherlock under the form of loadable environment modules.

Software is only accessible via modules

The use of a module system means that most software is not accessible by default and has to be loaded using the module command. This mechanism allows us to provide multiple versions of the same software concurrently, and gives users the possibility to easily switch between software versions.

Sherlock uses Lmod to manage software installations. The modules system helps set up the user's shell environment to give access to applications, and makes running and compiling software easier. It also allows us to provide multiple versions of the same software, that would otherwise conflict with each other, and abstract things from the OS's sometimes rigid versions and dependencies.

When you first log into Sherlock, you'll be presented with a default, bare-bones environment with minimal software available. The module system is used to manage the user environment and to activate software packages on demand. In order to use software installed on Sherlock, you must first load the corresponding software module.

When you load a module, the system will set or modify your user environment variables to enable access to the software package provided by that module. For instance, the $PATH environment variable might be updated so that appropriate executables for that package can be used.

","location":"docs/software/modules/#environment-modules"},{"title":"Module categories","text":"

Modules on Sherlock are organized by scientific field, in distinct categories. This is to limit the information overload that can result when displaying the full list of available modules. Given the large diversity of the Sherlock user population, not all users are interested in the same kind of software, and high-energy physicists may not want to see their screens cluttered with the latest bioinformatics packages.

Module categories

You will first have to load a category module before getting access to individual modules. The math and devel categories are loaded by default, and modules in those categories can be loaded directly.

For instance, to be able to load the gromacs module, you'll first need to load the chemistry module. This can be done in a single command, by specifying first the category, then the actual application module name:

$ module load chemistry gromacs\n

The math and devel categories, which are loaded by default, provide direct access to compilers, languages, and MPI and numerical libraries.

For a complete list of software module categories, please refer to the list of available software.

Searching for a module

To know how to access a module, you can use the module spider <module_name> command. It will search through all the installed modules, even if they're masked, and display instructions to load them. See the Examples section for details.

","location":"docs/software/modules/#module-categories"},{"title":"Module usage","text":"

The most common module commands are outlined in the following table. module commands may be shortened with the ml alias, with slightly different semantics.

Module names auto-completion

The module command supports auto-completion, so you can just start typing the name of a module, and press Tab to let the shell automatically complete the module name and/or version.

Module\u00a0command Short\u00a0version Description module avail ml av List\u00a0available\u00a0software1 module spider gromacs ml spider gromacs Search for particular software module keyword blas ml key blas Search for blas in module names and descriptions module whatis gcc ml whatis gcc Display information about the gcc module module help gcc ml help gcc Display module specific help module load gcc ml gcc Load a module to use the associated software module load gsl/2.3 ml gsl/2.3 Load specific version of a module module unload gcc ml -gcc Unload a module module swap gcc icc ml -gcc icc Swap a module (unload gcc and replace it with icc) module purge ml purge Remove all modules2 module save foo ml save foo Save the state of all loaded modules in a collection named foo module restore foo ml restore foo Restore the state of saved modules from the foo collection

Additional module sub-commands are documented in the module help command. For complete reference, please refer to the official Lmod documentation.

","location":"docs/software/modules/#module-usage"},{"title":"Module properties","text":"

Multiple versions

When multiple versions of the same module exist, module will load the one marked as Default (D). For the sake of reproducibility, we recommend always specifying the module version you want to load, as defaults may evolve over time.

To quickly see some of the modules' characteristics, module avail will display colored property attributes next to the module names. The main module properties are:

  • S: Module is sticky, requires --force to unload or purge
  • L: Indicates a currently loaded module
  • D: Default module that will be loaded when multiple versions are available
  • r: Restricted access, typically software under license. Contact us for details
  • g: GPU-accelerated software, will only run on GPU nodes
  • m: Software supports parallel execution using MPI
","location":"docs/software/modules/#module-properties"},{"title":"Searching for modules","text":"

You can search through all the available modules for either:

  • a module name (if you already know it), using module spider
  • any string within module names and descriptions, using module keyword

For instance, if you want to know how to load the gromacs module, you can do:

$ module spider gromacs\n

If you don't know the module name, or want to list all the modules that contain a specific string of characters in their name or description, you can use module keyword. For instance, the following command will list all the modules providing a BLAS library:

$ module keyword blas\n
","location":"docs/software/modules/#searching-for-modules"},{"title":"Examples","text":"","location":"docs/software/modules/#examples"},{"title":"Listing","text":"

To list all the modules that can be loaded, you can do:

$ ml av\n\n-- math -- numerical libraries, statistics, deep-learning, computer science ---\n   R/3.4.0             gsl/1.16             openblas/0.2.19\n   cudnn/5.1  (g)      gsl/2.3       (D)    py-scipystack/1.0_py27 (D)\n   cudnn/6.0  (g,D)    imkl/2017.u2         py-scipystack/1.0_py36\n   fftw/3.3.6          matlab/R2017a (r)\n\n------------------ devel -- compilers, MPI, languages, libs -------------------\n   boost/1.64.0          icc/2017.u2           python/2.7.13    (D)\n   cmake/3.8.1           ifort/2017.u2         python/3.6.1\n   cuda/8.0.61    (g)    impi/2017.u2   (m)    scons/2.5.1_py27 (D)\n   eigen/3.3.3           java/1.8.0_131        scons/2.5.1_py36\n   gcc/6.3.0      (D)    julia/0.5.1           sqlite/3.18.0\n   gcc/7.1.0             llvm/4.0.0            tbb/2017.u2\n   h5utils/1.12.1        nccl/1.3.4     (g)    tcltk/8.6.6\n   hdf5/1.10.0p1         openmpi/2.0.2  (m)\n\n-------------- categories -- load to make more modules available --------------\n   biology      devel (S,L)    physics    system\n   chemistry    math  (S,L)    staging    viz\n\n  Where:\n   S:  Module is Sticky, requires --force to unload or purge\n   r:  Restricted access\n   g:  GPU support\n   L:  Module is loaded\n   m:  MPI support\n   D:  Default Module\n\nUse \"module spider\" to find all possible modules.\nUse \"module keyword key1 key2 ...\" to search for all possible modules matching\nany of the \"keys\".\n
","location":"docs/software/modules/#listing"},{"title":"Searching","text":"

To search for a specific string in module names and descriptions, you can run:

$ module keyword numpy\n---------------------------------------------------------------------------\n\nThe following modules match your search criteria: \"numpy\"\n---------------------------------------------------------------------------\n\n  py-scipystack: py-scipystack/1.0_py27, py-scipystack/1.0_py36\n    The SciPy Stack is a collection of open source software for scientific\n    computing in Python. It provides the following packages: numpy, scipy,\n    matplotlib, ipython, jupyter, pandas, sympy and nose.\n\n---------------------------------------------------------------------------\n[...]\n$ ml key compiler\n---------------------------------------------------------------------------\n\nThe following modules match your search criteria: \"compiler\"\n---------------------------------------------------------------------------\n\n  cmake: cmake/3.8.1\n    CMake is an extensible, open-source system that manages the build\n    process in an operating system and in a compiler-independent manner.\n\n  gcc: gcc/6.3.0, gcc/7.1.0\n    The GNU Compiler Collection includes front ends for C, C++, Fortran,\n    Java, and Go, as well as libraries for these languages (libstdc++,\n    libgcj,...).\n\n  icc: icc/2017.u2\n    Intel C++ Compiler, also known as icc or icl, is a group of C and C++\n    compilers from Intel\n\n  ifort: ifort/2017.u2\n    Intel Fortran Compiler, also known as ifort, is a group of Fortran\n    compilers from Intel\n\n  llvm: llvm/4.0.0\n    The LLVM Project is a collection of modular and reusable compiler and\n    toolchain technologies. Clang is an LLVM native C/C++/Objective-C\n    compiler,\n\n---------------------------------------------------------------------------\n

To get information about a specific module, especially how to load it, the following command can be used:

$ module spider gromacs\n\n-------------------------------------------------------------------------------\n  gromacs: gromacs/2016.3\n-------------------------------------------------------------------------------\n    Description:\n      GROMACS is a versatile package to perform molecular dynamics, i.e.\n      simulate the Newtonian equations of motion for systems with hundreds to\n      millions of particles.\n\n    Properties:\n      GPU support      MPI support\n\n    You will need to load all module(s) on any one of the lines below before\n    the \"gromacs/2016.3\" module is available to load.\n\n      chemistry\n
","location":"docs/software/modules/#searching"},{"title":"Loading","text":"

Loading a category module gives you access to field-specific software:

$ ml chemistry\n$ ml av\n\n------------- chemistry -- quantum chemistry, molecular dynamics --------------\n   gromacs/2016.3 (g,m)    vasp/5.4.1 (g,r,m)\n\n-- math -- numerical libraries, statistics, deep-learning, computer science ---\n   R/3.4.0             gsl/1.16             openblas/0.2.19\n   cudnn/5.1  (g)      gsl/2.3       (D)    py-scipystack/1.0_py27 (D)\n   cudnn/6.0  (g,D)    imkl/2017.u2         py-scipystack/1.0_py36\n   fftw/3.3.6          matlab/R2017a (r)\n\n------------------ devel -- compilers, MPI, languages, libs -------------------\n   boost/1.64.0          icc/2017.u2           python/2.7.13    (D)\n   cmake/3.8.1           ifort/2017.u2         python/3.6.1\n   cuda/8.0.61    (g)    impi/2017.u2   (m)    scons/2.5.1_py27 (D)\n   eigen/3.3.3           java/1.8.0_131        scons/2.5.1_py36\n   gcc/6.3.0      (D)    julia/0.5.1           sqlite/3.18.0\n   gcc/7.1.0             llvm/4.0.0            tbb/2017.u2\n   h5utils/1.12.1        nccl/1.3.4     (g)    tcltk/8.6.6\n   hdf5/1.10.0p1         openmpi/2.0.2  (m)\n\n-------------- categories -- load to make more modules available --------------\n   biology          devel (S,L)    physics    system\n   chemistry (L)    math  (S,L)    staging    viz\n\n[...]\n
","location":"docs/software/modules/#loading"},{"title":"Resetting the modules environment","text":"

If you want to reset your modules environment as it was when you initially connected to Sherlock, you can use the ml reset command: it will remove all the modules you have loaded, and restore the original state where only the math and devel categories are accessible.

If you want to remove all modules from your environment, including the default math and devel modules, you can use ml --force purge.

","location":"docs/software/modules/#resetting-the-modules-environment"},{"title":"Loading modules in jobs","text":"

In order for an application running in a Slurm job to have access to any necessary module-provided software packages, we recommend loading those modules in the job script directly. Since Slurm propagates all user environment variables by default, this is not strictly necessary, as jobs will inherit the modules loaded at submission time. But to make sure things are reproducible and avoid issues, it is preferable to explicitly load the modules in the batch scripts.

module load commands should be placed right after #SBATCH directives and before the actual executable calls. For instance:

#!/bin/bash\n#SBATCH ...\n#SBATCH ...\n#SBATCH ...\n\nml reset\nml load gromacs/2016.3\n\nsrun gmx_mpi ...\n
","location":"docs/software/modules/#loading-modules-in-jobs"},{"title":"Custom modules","text":"

Users are welcome and encouraged to build and install their own software on Sherlock. To that end, and to facilitate usage or sharing of their custom software installations, they can create their own module repositories.

See the Software Installation page for more details.

","location":"docs/software/modules/#custom-modules"},{"title":"Contributed software","text":"

PI groups, labs or departments can share their software installations and modules with the whole Sherlock community of users, and let everyone benefit from their tuning efforts and software developments.

Those modules are available in the specific contribs category, and organized by contributor name.

For instance, listing the available contributed modules can be done with:

$ ml contribs\n$ ml av\n-------------------- contribs -- contributed software ----------------------\n   poldrack\n

To get information about a specific lab module:

$ ml show poldrack\n----------------------------------------------------------------------------\n   /share/software/modules/contribs/poldrack.lua:\n----------------------------------------------------------------------------\nprepend_path(\"MODULEPATH\",\"/home/groups/russpold/modules\")\nwhatis(\"Name:        poldrack\")\nwhatis(\"Version:     1.0\")\nwhatis(\"Category:    contribs\")\nwhatis(\"URL:         https://github.com/poldracklab/lmod_modules\")\nwhatis(\"Description: Software modules contributed by the Poldrack Lab.\")\n

And to list the available software modules contributed by the lab:

$ ml poldrack\n$ ml av\n\n------------------------ /home/groups/russpold/modules -------------------------\n   afni/17.3.03           freesurfer/6.0.1            gsl/2.3      (D)\n   anaconda/5.0.0-py36    fsl/5.0.9                   pigz/2.4\n   ants/2.1.0.post710     fsl/5.0.11           (D)    remora/1.8.2\n   c3d/1.1.0              git-annex/6.20171109        xft/2.3.2\n[...]\n
  1. If a module is not listed here, it might be unavailable in the loaded modules categories, and require loading another category module. Search for not-listed software using the module spider command.\u00a0\u21a9

  2. The math and devel category modules will not be unloaded with module purge as they are \"sticky\". If a user wants to unload a sticky module, they must specify the --force option.\u00a0\u21a9

","location":"docs/software/modules/#contributed-software"},{"title":"R","text":"","location":"docs/software/using/R/"},{"title":"Introduction","text":"

R is a programming language and software environment for statistical computing and graphics. It is similar to the S language and environment developed at Bell Laboratories. R provides a wide variety of statistical and graphical techniques and is highly extensible.

","location":"docs/software/using/R/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using R on Sherlock. For more complete documentation about R in general, please see the R documentation.

","location":"docs/software/using/R/#more-documentation"},{"title":"R on Sherlock","text":"

R is available on Sherlock and the corresponding module can be loaded with:

$ ml R\n

For a list of available versions, you can execute ml spider R at the Sherlock prompt, or refer to the Software list page.

","location":"docs/software/using/R/#r-on-sherlock"},{"title":"Using R","text":"

Once your environment is configured (i.e. when the R module is loaded), R can be started by simply typing R at the shell prompt:

$ R\n\nR version 3.5.1 (2018-07-02) -- \"Feather Spray\"\nCopyright (C) 2018 The R Foundation for Statistical Computing\nPlatform: x86_64-pc-linux-gnu (64-bit)\n[...]\nType 'demo()' for some demos, 'help()' for on-line help, or\n'help.start()' for an HTML browser interface to help.\nType 'q()' to quit R.\n\n>\n

For a listing of command line options:

$ R --help\n
","location":"docs/software/using/R/#using-r"},{"title":"Running a R script","text":"

There are several ways to launch an R script on the command line, which will have different ways of presenting the script's output:

Method Output Rscript script.R displayed on screen, on stdout R CMD BATCH script.R redirected to a script.Rout file R --no-save < script.R displayed on screen, on stdout","location":"docs/software/using/R/#running-a-r-script"},{"title":"Submitting a R job","text":"

Here's an example R batch script that can be submitted via sbatch. It runs a simple matrix multiplication example, and demonstrates how to feed R code as a HEREDOC to R directly, so no intermediate R script is necessary:

Rtest.sbatch
#!/usr/bin/bash\n#SBATCH --time=00:10:00\n#SBATCH --mem=10G\n#SBATCH --output=Rtest.log\n\n# load the module\nml R\n\n# run R code\nR --no-save << EOF\nset.seed (1)\nm <- 4000\nn <- 4000\nA <- matrix (runif (m*n),m,n)\nsystem.time (B <- crossprod(A))\nEOF\n

You can save this script as Rtest.sbatch and submit it to the scheduler with:

$ sbatch Rtest.sbatch\n

Once the job is done, you should get a Rtest.log file in the current directory, with the following contents:

R version 3.5.1 (2018-07-02) -- \"Feather Spray\"\n[...]\n> set.seed (1)\n> m <- 4000\n> n <- 4000\n> A <- matrix (runif (m*n),m,n)\n> system.time (B <- crossprod(A))\n   user  system elapsed\n  2.649   0.077   2.726\n
","location":"docs/software/using/R/#submitting-a-r-job"},{"title":"R packages","text":"

R comes with a single package library in $R_HOME/library, which contains the standard and most common packages. This is usually in a system location and is not writable by end-users.

To accommodate individual user's requirements, R provides a way for each user to install packages in the location of their choice. The default value for a directory where users can install their own R packages is $HOME/R/x86_64-pc-linux-gnu-library/<R_version> where <R_version> depends on the R version that is used. For instance, if you have the R/3.5.1 module loaded, the default R user library path will be $HOME/R/x86_64-pc-linux-gnu-library/3.5.

This directory doesn't exist by default. The first time a user installs a package, R will ask if she wants to use the default location and create the directory.

","location":"docs/software/using/R/#r-packages"},{"title":"Installing packages","text":"

Install R packages in a standard shell session

Make sure to install your packages in a standard Sherlock shell session, not in an RStudio session.

To install a R package in your personal environment, the first thing to do is load the R module:

$ ml R\n

Then start a R session, and use the install.packages() function at the R prompt. For instance, the following example will install the doParallel package, using the US mirror of the CRAN repository:

$ R\n\nR version 3.5.1 (2018-07-02) -- \"Feather Spray\"\n[...]\n\n> install.packages('doParallel', repos='http://cran.us.r-project.org')\n

It should give the following warning:

Warning in install.packages(\"doParallel\", repos = \"http://cran.us.r-project.org\") :\n  'lib = \"/share/software/user/open/R/3.5.1/lib64/R/library\"' is not writable\nWould you like to use a personal library instead? (yes/No/cancel)\nWould you like to create a personal library\n\u2018~/R/x86_64-pc-linux-gnu-library/3.5\u2019\nto install packages into? (yes/No/cancel) y\n

Answering y twice will make R create a ~/R/x86_64-pc-linux-gnu-library/3.5 directory and instruct it to install future R packages there.

The installation will then proceed:

trying URL 'http://cran.us.r-project.org/src/contrib/doParallel_1.0.14.tar.gz'\nContent type 'application/x-gzip' length 173607 bytes (169 KB)\n==================================================\ndownloaded 169 KB\n\n* installing *source* package \u2018doParallel\u2019 ...\n** package \u2018doParallel\u2019 successfully unpacked and MD5 sums checked\n** R\n** demo\n** inst\n** byte-compile and prepare package for lazy loading\n** help\n*** installing help indices\n** building package indices\n** installing vignettes\n** testing if installed package can be loaded\n* DONE (doParallel)\n\nThe downloaded source packages are in\n        \u2018/tmp/Rtmp0RHrMZ/downloaded_packages\u2019\n>\n

and when it's done, you should be able to load the package within R with:

> library(doParallel)\nLoading required package: foreach\nLoading required package: iterators\nLoading required package: parallel\n>\n
","location":"docs/software/using/R/#installing-packages"},{"title":"Installing large packages","text":"

Installing large R packages can sometimes be very time consuming. To speed things up, R can utilize multiple CPUs in parallel when the Ncpus=n option is added to the install.packages() command (where n is the number of CPUs you'd like to use).

For instance, you can get an interactive session with 4 CPU cores with sh_dev:

$ sh_dev -c 4\n$ ml R\n$ R\n> install.packages(\"dplyr\", repos = \"http://cran.us.r-project.org\", Ncpus=4)\n
","location":"docs/software/using/R/#installing-large-packages"},{"title":"Alternative installation path","text":"

To install R packages in a different location, you'll need to create that directory, and instruct R to install the packages there:

$ mkdir ~/R_libs/\n$ R\n> install.packages('doParallel', repos='http://cran.us.r-project.org', lib=\"~/R_libs\")\n

The installation will proceed normally and the doParallel package will be installed in $HOME/R_libs/.

Specifying the full destination path for each package installation could quickly become tiresome, so to avoid this, you can create a .Renviron file in your $HOME directory, and define your R_libs path there:

$ cat << EOF > $HOME/.Renviron\nR_LIBS=~/R_libs\nEOF\n

With this, whenever R is started, the $HOME/R_libs/ directory will be added to the list of places R will look for packages, and you won't need to specify this installation path when using install.packages() anymore.

Where does R look for packages?

To see the directories where R searches for packages and libraries, you can use the following command in R:

> .libPaths()\n

Sharing R packages

If you'd like to share R packages within your group, you can simply define $R_LIBS to point to a shared directory, such as $GROUP_HOME/R_libs and have each user in the group use the instructions above to define it in their own environment.

","location":"docs/software/using/R/#alternative-installation-path"},{"title":"Setting the installation repository","text":"

When installing a package, R needs to know from which repository the package should be downloaded. If it's not specified, it will prompt for it and display a list of available CRAN mirrors.

To avoid setting the CRAN mirror each time you run install.packages you can permanently set the mirror by creating a .Rprofile file in your $HOME directory, which R will execute each time it starts.

For instance, adding the following contents to your ~/.Rprofile will make sure that every install.packages() invocation will use the closest CRAN mirror:

## local creates a new, empty environment\n## This avoids polluting the global environment with\n## the object r\nlocal({\n  r = getOption(\"repos\")\n  r[\"CRAN\"] = \"https://cloud.r-project.org/\"\n  options(repos = r)\n})\n

Once this is set, you only need to specify the name of the package to install, and R will use the mirror you defined automatically:

> install.packages(\"doParallel\")\n[...]\ntrying URL 'https://cloud.r-project.org/src/contrib/doParallel_1.0.14.tar.gz'\nContent type 'application/x-gzip' length 173607 bytes (169 KB)\n==================================================\ndownloaded 169 KB\n
","location":"docs/software/using/R/#setting-the-installation-repository"},{"title":"Installing packages from GitHub","text":"

R packages can be directly installed from GitHub using the devtools package. devtools needs to be installed first, with:

> install.packages(\"devtools\")\n

And then, you can install a R package directly from its GitHub repository. For instance, to install dplyr from tidyverse/dplyr:

> library(devtools)\n> install_github(\"tidyverse/dplyr\")\n
","location":"docs/software/using/R/#installing-packages-from-github"},{"title":"Package dependencies","text":"

Sometimes when installing R packages, other software is needed for the installation and/or compilation. For instance, when trying to install the sf package, you may encounter the following error messages:

> install.packages(\"sf\")\n[...]\nConfiguration failed because libudunits2.so was not found. Try installing:...\n[...]\nconfigure: error: gdal-config not found or not executable.\n

This is because sf needs a few dependencies, like udunits and gdal in order to compile and install successfully. Fortunately those dependencies are already available as modules on Sherlock.

Whenever you see \"not found\" errors, you may want to try searching the modules inventory with module spider:

$ module spider udunits\n\n----------------------------------------------------------------------------\n  udunits: udunits/2.2.26\n----------------------------------------------------------------------------\n    Description:\n      The UDUNITS package from Unidata is a C-based package for the\n      programmatic handling of units of physical quantities.\n\n\n    You will need to load all module(s) on any one of the lines below before\n    the \"udunits/2.2.26\" module is available to load.\n\n      physics\n

So for sf, in order to load the dependencies, exit R, load the udunits and gdal modules, and try installing sf again:

$ ml load physics udunits gdal geos\n$ ml R\n$ R\n> install.packages(\"sf\")\n

Getting dependencies right could be a matter of trial and error. You may have to load R, install packages, search modules, load modules, install packages again and so forth. Fortunately, R packages only need to be installed once, and many R package dependencies are already available as modules on Sherlock, you just need to search for them with module spider and load them.

And in case you're stuck, you can of course always send us an email and we'll be happy to assist.

","location":"docs/software/using/R/#package-dependencies"},{"title":"Updating Packages","text":"

To upgrade R packages, you can use the update.packages() function within a R session.

For instance, to update the doParallel package:

> update.packages(oldPkgs='doParallel')\n

When the package name is omitted, update.packages() will try to update all the packages that are installed, which is the most efficient way to ensure that all the packages in your local R library are up to date.

Centrally installed packages can not be updated

Note that attempting to update centrally installed packages will fail. You will have to use install.packages() to install your own version of the packages in your $HOME directory instead.

","location":"docs/software/using/R/#updating-packages"},{"title":"Removing packages","text":"

To remove a package from your local R library, you can use the remove.packages() function. For instance:

> remove.packages('doParallel')\n
","location":"docs/software/using/R/#removing-packages"},{"title":"Examples","text":"","location":"docs/software/using/R/#examples"},{"title":"Installing devtools","text":"

devtools is a package that provides R functions that simplify many common tasks. While its core functionality revolves around package development, devtools can also be used to install packages, particularly those on GitHub.

Installing devtools is somewhat memory-intensive and has several dependencies. The following example shows how to run an interactive session with 4 CPUs, load the modules for the necessary dependencies, and install devtools for R version 4.2.0.

# Launch interactive dev session with 4 CPUs\n\n$ sh_dev -c 4\n\n# Load the required modules\n\n$ ml purge\n$ ml R/4.2.0\n$ ml system harfbuzz fribidi\n$ ml cmake libgit2\n$ ml openssl\n\n# Launch R and install devtools\n\n$ R\n> install.packages(\"devtools\", repos = \"http://cran.us.r-project.org\", Ncpus=4)\n
","location":"docs/software/using/R/#installing-devtools"},{"title":"Single node","text":"

R has a couple of powerful and easy-to-use tools to parallelize your R jobs. doParallel is one of them. If the doParallel package is not installed in your environment yet, you can install it in a few easy steps.

Here is a quick doParallel example that uses one node and 16 cores on Sherlock (more nodes or CPU cores can be requested, as needed).

Save the two scripts below in a directory on Sherlock:

doParallel_test.RdoParallel_test.sbatch
# Example doParallel script\n\nif(!require(doParallel)) install.packages(\"doParallel\")\nlibrary(doParallel)\n\n# use the environment variable SLURM_NTASKS_PER_NODE to set\n# the number of cores to use\nregisterDoParallel(cores=(Sys.getenv(\"SLURM_NTASKS_PER_NODE\")))\n\n# bootstrap iteration example\nx <- iris[which(iris[,5] != \"setosa\"), c(1,5)]\niterations <- 10000# Number of iterations to run\n\n# parallel loop\n# note the '%dopar%' instruction\nparallel_time <- system.time({\n  r <- foreach(icount(iterations), .combine=cbind) %dopar% {\n    ind <- sample(100, 100, replace=TRUE)\n    result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))\n    coefficients(result1)\n  }\n})[3]\n\n# show the number of parallel workers to be used\ngetDoParWorkers()\n\n# execute the function\nparallel_time\n
#!/bin/bash\n\n#SBATCH --nodes=1\n#SBATCH --ntasks-per-node=16\n#SBATCH --output=doParallel_test.log\n\n# --ntasks-per-node will be used in doParallel_test.R to specify the number\n# of cores to use on the machine.\n\n# load modules\nml R/3.5.1\n\n# execute script\nRscript doParallel_test.R\n

And then submit the job with:

$ sbatch doParallel_test.sbatch\n

Once the job has completed, the output file should contain something like this:

$ cat doParallel_test.log\n[1] \"16\"\nelapsed\n  3.551\n

Bonus points: observe the scalability of the doParallel loop by submitting the same script using a varying number of CPU cores:

$ for i in 2 4 8 16; do\n    sbatch --out=doP_${i}.out --ntasks-per-node=$i doParallel_test.sbatch\ndone\n

When the jobs are done:

$ for i in 2 4 8 16; do\n    printf \"%2i cores: %4.1fs\\n\" $i $(tail -n1 doP_$i.out)\ndone\n 2 cores: 13.6s\n 4 cores:  7.8s\n 8 cores:  4.9s\n16 cores:  3.6s\n
","location":"docs/software/using/R/#single-node"},{"title":"Multiple nodes","text":"

To distribute parallel R tasks on multiple nodes, you can use the Rmpi package, which provides MPI bindings for R.

To install the Rmpi package, a module providing an MPI library must first be loaded. For instance:

$ ml openmpi R\n$ R\n> install.packages(\"Rmpi\")\n

Once the package is installed, the following scripts demonstrate a very basic Rmpi example.

Rmpi-test.RRmpi-test.sbatch
# Example Rmpi script\n\nif (!require(\"Rmpi\")) install.packages(\"Rmpi\")\nlibrary(Rmpi)\n\n# initialize an Rmpi environment\nns <- mpi.universe.size() - 1\nmpi.spawn.Rslaves(nslaves=ns, needlog=TRUE)\n\n# send these commands to the slaves\nmpi.bcast.cmd( id <- mpi.comm.rank() )\nmpi.bcast.cmd( ns <- mpi.comm.size() )\nmpi.bcast.cmd( host <- mpi.get.processor.name() )\n\n# all slaves execute this command\nmpi.remote.exec(paste(\"I am\", id, \"of\", ns, \"running on\", host))\n\n# close down the Rmpi environment\nmpi.close.Rslaves(dellog = FALSE)\nmpi.exit()\n
#!/bin/bash\n\n#SBATCH --nodes=2\n#SBATCH --ntasks=4\n#SBATCH --output=Rmpi-test.log\n\n## load modules\n# openmpi is not loaded by default with R, so it must be loaded explicitly\nml R openmpi\n\n## run script\n# we use '-np 1' since Rmpi does its own task management\nmpirun -np 1 Rscript Rmpi-test.R\n

You can save those scripts as Rmpi-test.R and Rmpi-test.sbatch and then submit your job with:

$ sbatch Rmpi-test.sbatch\n

When the job is done, its output should look like this:

$ cat Rmpi-test.log\n        3 slaves are spawned successfully. 0 failed.\nmaster (rank 0, comm 1) of size 4 is running on: sh-06-33\nslave1 (rank 1, comm 1) of size 4 is running on: sh-06-33\nslave2 (rank 2, comm 1) of size 4 is running on: sh-06-33\nslave3 (rank 3, comm 1) of size 4 is running on: sh-06-34\n$slave1\n[1] \"I am 1 of 4 running on sh-06-33\"\n\n$slave2\n[1] \"I am 2 of 4 running on sh-06-33\"\n\n$slave3\n[1] \"I am 3 of 4 running on sh-06-34\"\n\n[1] 1\n[1] \"Detaching Rmpi. Rmpi cannot be used unless relaunching R.\"\n
","location":"docs/software/using/R/#multiple-nodes"},{"title":"GPUs","text":"

Here's a quick example that compares running a matrix multiplication on a CPU and on a GPU using R. It requires submitting a job to a GPU node and the gpuR R package.

gpuR-test.RgpuR-test.sbatch
# Example gpuR script\n\nif (!require(\"gpuR\")) install.packages(\"gpuR\")\nlibrary(gpuR)\n\nprint(\"CPU times\")\nfor(i in seq(1:7)) {\n    ORDER = 64*(2^i)\n    A = matrix(rnorm(ORDER^2), nrow=ORDER)\n    B = matrix(rnorm(ORDER^2), nrow=ORDER)\n    print(paste(i, sprintf(\"%5.2f\", system.time({C = A %*% B})[3])))\n}\n\nprint(\"GPU times\")\nfor(i in seq(1:7)) {\n    ORDER = 64*(2^i)\n    A = matrix(rnorm(ORDER^2), nrow=ORDER)\n    B = matrix(rnorm(ORDER^2), nrow=ORDER)\n    gpuA = gpuMatrix(A, type=\"double\")\n    gpuB = gpuMatrix(B, type=\"double\")\n    print(paste(i, sprintf(\"%5.2f\", system.time({gpuC = gpuA %*% gpuB})[3])))\n}\n
#!/bin/bash\n\n#SBATCH --partition gpu\n#SBATCH --mem 8GB\n#SBATCH --gres gpu:1\n#SBATCH --output=gpuR-test.log\n\n## load modules\n# cuda is not loaded by default with R, so it must be loaded explicitly\nml R cuda\n\nRscript gpuR-test.R\n

After submitting the job with sbatch gpuR-test.sbatch, the output file should contain something like this:

[1] \"CPU times\"\n[1] \"1  0.00\"\n[1] \"2  0.00\"\n[1] \"3  0.02\"\n[1] \"4  0.13\"\n[1] \"5  0.97\"\n[1] \"6  7.56\"\n[1] \"7 60.47\"\n\n[1] \"GPU times\"\n[1] \"1  0.10\"\n[1] \"2  0.04\"\n[1] \"3  0.02\"\n[1] \"4  0.07\"\n[1] \"5  0.39\"\n[1] \"6  2.04\"\n[1] \"7 11.59\"\n

which shows a decent speedup for running on a GPU for the largest matrix sizes.

","location":"docs/software/using/R/#gpus"},{"title":"Anaconda","text":"","location":"docs/software/using/anaconda/"},{"title":"Introduction","text":"

Anaconda is a Python/R distribution that aims to simplify package management and deployment for scientific computing. Although it can have merits on individual computers, it's often counter-productive on shared HPC systems like Sherlock.

Avoid using Anaconda on Sherlock

We recommend NOT using Anaconda on Sherlock, and instead consider other options like virtual environments or containers.

","location":"docs/software/using/anaconda/#introduction"},{"title":"Why Anaconda should be avoided on Sherlock","text":"

Anaconda is widely used in several scientific domains like data science, AI/ML, bio-informatics, and is often listed in some software documentation as the recommended (if not only) way to install it.

It is a useful solution for simplifying the management of Python and scientific libraries on a personal computer. However, on highly-specialized HPC systems like Sherlock, management of these libraries and dependencies should be done by SRCC staff, to ensure compatibility and optimal performance on the cluster hardware.

For instance:

  • Anaconda very often installs software (compilers, scientific libraries etc.) which already exist on Sherlock as modules, and does so in a sub-optimal fashion, by installing sub-optimal versions and configurations,
  • It installs binaries which are not optimized for the processor architectures on Sherlock,
  • It makes incorrect assumptions about the location of various system libraries,
  • Anaconda installs software in $HOME by default, where it writes large amounts of files. A single Anaconda installation can easily fill up your $HOME directory quota, and makes things difficult to manage,
  • Anaconda installations can't easily be relocated,
  • Anaconda modifies your $HOME/.bashrc file, which can easily cause conflicts and slow things down when you log in.

Worse, a Conda recipe can force the installation of R (even though it's already available on Sherlock). This installation won't perform nearly as well as the version we provide as a module (which uses optimized libraries), or may not work at all: the jobs launched with it may crash and end up wasting both computing resources and your time.

Installation issues

If you absolutely need to install anaconda/miniconda, please note that because of the large number of files that the installer will try to open, this will likely fail on a login node. So make sure to run the installation on a compute node, for instance using the sh_dev command.

","location":"docs/software/using/anaconda/#why-anaconda-should-be-avoided-on-sherlock"},{"title":"What to do instead","text":"","location":"docs/software/using/anaconda/#what-to-do-instead"},{"title":"Use a virtual environment","text":"

Instead of using Anaconda for your project, or when the installation instructions of the software you want to install are using it, you can use a virtual environment.

A virtual environment offers all the functionality you need to use Python on Sherlock. You can convert Anaconda instructions and use a virtual environment instead, by following these steps:

  1. list the dependencies (also called requirements) of the application you want to use:
    • check if there is a requirements.txt file in the Git repository or in the software sources,
    • or, check the variable install_requires in the setup.py file, which lists the requirements.
  2. find which dependencies are Python modules and which are libraries provided by Anaconda. For example, CUDA and CuDNN are libraries that Anaconda can install, but which should not be re-installed as they are already available as modules on Sherlock,
  3. remove from the list of dependencies everything which is not a Python module (e.g. cudatoolkit and cudnn),
  4. create a virtual environment to install your dependencies.

And that's it: your software should run, without Anaconda. If you have any issues, please don't hesitate to contact us.

","location":"docs/software/using/anaconda/#use-a-virtual-environment"},{"title":"Use a container","text":"

In some situations, the complexity of a program's dependencies requires the use of a solution where you can control the entire software environment. In these situations, we recommend using a container.

Tip

Existing Docker images can easily be converted into Apptainer/Singularity images.

The only potential downside of using containers is their size and the associated storage usage. But if your research group plans on using several container images, it could be useful to collect them all in a single location (like $GROUP_HOME) to avoid duplication.

","location":"docs/software/using/anaconda/#use-a-container"},{"title":"ClusterShell","text":"","location":"docs/software/using/clustershell/"},{"title":"Introduction","text":"

ClusterShell is a command-line tool and library that helps running commands in parallel on multiple servers. It allows executing arbitrary commands across multiple hosts. On Sherlock, it provides an easy way to run commands on nodes your jobs are running on, and collect back information. The two most useful commands provided are cluset, which can manipulate lists of nodenames, and clush, which can run commands on multiple nodes at once.

","location":"docs/software/using/clustershell/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using ClusterShell on Sherlock. For more complete documentation about ClusterShell in general, please see the ClusterShell documentation.

The ClusterShell library can also be directly integrated in your Python scripts, to add a wide range of functionality. See the ClusterShell Python API documentation for reference.

","location":"docs/software/using/clustershell/#more-documentation"},{"title":"ClusterShell on Sherlock","text":"

ClusterShell is available on Sherlock and the corresponding module can be loaded with:

$ ml system py-clustershell\n
","location":"docs/software/using/clustershell/#clustershell-on-sherlock"},{"title":"cluset","text":"

The cluset command can be used to easily manipulate lists of node names, and to expand, fold, or count them:

$ cluset --expand sh03-01n[01-06]\nsh03-01n01 sh03-01n02 sh03-01n03 sh03-01n04 sh03-01n05 sh03-01n06\n\n$ cluset --count sh03-01n[01-06]\n6\n\n$ cluset --fold sh03-01n01 sh03-01n02 sh03-01n03 sh03-01n06\nsh03-01n[01-03,06]\n
","location":"docs/software/using/clustershell/#cluset"},{"title":"clush","text":"

The clush command uses the same node list syntax to allow running the same commands simultaneously on those nodes. clush uses SSH to connect to each of these nodes.

Warning

You can only SSH to nodes where your jobs are running, and as a consequence, clush will only work on those nodes.

For instance, to check the load on multiple compute nodes at once:

$ clush -w sh03-01n[01-03] cat /proc/loadavg\nsh03-01n01: 19.48 14.43 11.76 22/731 22897\nsh03-01n02: 13.20 13.29 13.64 14/831 1163\nsh03-01n03: 11.60 11.48 11.82 18/893 23945\n

Gathering identical output

Using the -b option will regroup similar output lines to make large outputs easier to read. By default, the output of each node will be presented separately.

For instance, without -b:

$ clush -w sh03-01n[01-03] echo ok\nsh03-01n02: ok\nsh03-01n03: ok\nsh03-01n01: ok\n

With -b:

$ clush -bw sh03-01n[01-03] echo ok\n---------------\nsh03-01n[01-03] (3)\n---------------\nok\n
","location":"docs/software/using/clustershell/#clush"},{"title":"Slurm integration","text":"

On Sherlock, ClusterShell is also tightly integrated with the job scheduler, and can directly provide information about a user's jobs and the nodes they're running on. You can use the following groups to get specific node lists:

group name short name action example @user: @u: list nodes where user has jobs running cluset -f @user:$USER @job: @j: list nodes where job is running cluset -f @job:123456 @nodestate: @node:,@n: list nodes in given state cluset -f @nodestate:idle @partition: @part:,@p: list nodes in given partition cluset -f @partition:gpu

For instance, to get the list of nodes where job 123456 is running:

$ cluset -f @job:123456\n
","location":"docs/software/using/clustershell/#slurm-integration"},{"title":"Examples","text":"","location":"docs/software/using/clustershell/#examples"},{"title":"Job information","text":"

For instance, if job 1988522 from user kilian is running on nodes sh02-01n[59-60], squeue would display this:

$ squeue -u kilian\n       JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)\n     1988522    normal interact   kilian  R       1:30      2 sh02-01n[59-60]\n     1988523    normal interact   kilian  R       1:28      2 sh02-01n[61-62]\n

With ClusterShell, you could get:

  • the list of node names where user kilian has jobs running:

    $ cluset -f @user:kilian\nsh02-01n[59-62]\n
  • the nodes where job 1988522 is running, in an expanded form:

    $ cluset -e @job:1988522\nsh02-01n59 sh02-01n60\n
","location":"docs/software/using/clustershell/#job-information"},{"title":"Node states","text":"

You can also use those bindings to get lists of nodes in a particular state, in a given partition. For instance, to list the nodes that are in \"mixed\" state in the dev partition, you can request the intersection between the @nodestate:mixed and @partition:dev node lists:

$ cluset -f @nodestate:mixed -i @partition:dev\nsh02-01n[57-58]\n
","location":"docs/software/using/clustershell/#node-states"},{"title":"Local storage","text":"

To get a list of files in $L_SCRATCH on all the nodes that are part of job 1988522:

$ clush -w@j:1988522 tree $L_SCRATCH\nsh02-01n59: /lscratch/kilian\nsh02-01n59: \u251c\u2500\u2500 1988522\nsh02-01n59: \u2502\u00a0\u00a0 \u2514\u2500\u2500 foo\nsh02-01n59: \u2502\u00a0\u00a0     \u2514\u2500\u2500 bar\nsh02-01n59: \u2514\u2500\u2500 1993608\nsh02-01n59:\nsh02-01n59: 3 directories, 1 file\nsh02-01n60: /lscratch/kilian\nsh02-01n60: \u2514\u2500\u2500 1988522\nsh02-01n60:\nsh02-01n60: 1 directory, 0 files\n
","location":"docs/software/using/clustershell/#local-storage"},{"title":"Process tree","text":"

To display your process tree across all the nodes your jobs are running on:

$ clush -w @u:$USER pstree -au $USER\nsh02-09n71: mpiBench\nsh02-09n71:   `-3*[{mpiBench}]\nsh02-09n71: mpiBench\nsh02-09n71:   `-3*[{mpiBench}]\nsh02-09n71: mpiBench\nsh02-09n71:   `-3*[{mpiBench}]\nsh02-09n71: mpiBench\nsh02-09n71:   `-3*[{mpiBench}]\nsh02-10n01: mpiBench\nsh02-10n01:   `-3*[{mpiBench}]\nsh02-10n01: mpiBench\nsh02-10n01:   `-3*[{mpiBench}]\nsh02-10n01: mpiBench\nsh02-10n01:   `-3*[{mpiBench}]\nsh02-10n01: mpiBench\nsh02-10n01:   `-3*[{mpiBench}]\n
","location":"docs/software/using/clustershell/#process-tree"},{"title":"CPU usage","text":"

To get the CPU and memory usage of your processes in job 2003264:

$ clush -w @j:2003264 ps -u$USER -o%cpu,rss,cmd\nsh03-07n12: %CPU   RSS CMD\nsh03-07n12:  0.0  4780 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-07n12:  0.0  4784 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-07n12:  0.0  4784 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-07n12:  0.0  4780 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n06: %CPU   RSS CMD\nsh03-06n06:  0.0 59596 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n06:  0.0 59576 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n06:  0.0 59580 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n06:  0.0 59588 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n05: %CPU   RSS CMD\nsh03-06n05:  0.0  7360 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n05:  0.0  7328 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n05:  0.0  7344 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n05:  0.0  7340 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n11: %CPU   RSS CMD\nsh03-06n11: 17.0 59604 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n11: 17.0 59588 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n11: 17.0 59592 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\nsh03-06n11: 17.0 59580 /home/users/kilian/benchs/MPI/mpiBench/mpiBench -i 1000000\n
","location":"docs/software/using/clustershell/#cpu-usage"},{"title":"GPU usage","text":"

To show what's running on all the GPUs on the nodes associated with job 123456:

$ clush -bw @job:123456 nvidia-smi --format=csv --query-compute-apps=process_name,utilization.memory\nsh03-12n01: /share/software/user/open/python/3.6.1/bin/python3.6, 15832 MiB\nsh02-12n04: /share/software/user/open/python/3.6.1/bin/python3.6, 15943 MiB\n
","location":"docs/software/using/clustershell/#gpu-usage"},{"title":"Julia","text":"","location":"docs/software/using/julia/"},{"title":"Introduction","text":"

Julia is a high-level general-purpose dynamic programming language that was originally designed to address the needs of high-performance numerical analysis and computational science, without the typical need of separate compilation to be fast, also usable for client and server web use, low-level systems programming or as a specification language. Julia aims to create an unprecedented combination of ease-of-use, power, and efficiency in a single language.

","location":"docs/software/using/julia/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using Julia on Sherlock. For more complete documentation about Julia in general, please see the Julia documentation.

","location":"docs/software/using/julia/#more-documentation"},{"title":"Julia on Sherlock","text":"

Julia is available on Sherlock and the corresponding module can be loaded with:

$ ml julia\n

For a list of available versions, you can execute ml spider julia at the Sherlock prompt, or refer to the Software list page.

","location":"docs/software/using/julia/#julia-on-sherlock"},{"title":"Using Julia","text":"

Once your environment is configured (i.e. when the julia module is loaded), julia can be started by simply typing julia at the shell prompt:

$ julia\n\n_\n   _       _ _(_)_     |  Documentation: https://docs.julialang.org\n  (_)     | (_) (_)    |\n   _ _   _| |_  __ _   |  Type \"?\" for help, \"]?\" for Pkg help.\n  | | | | | | |/ _` |  |\n  | | |_| | | | (_| |  |  Version 1.0.0 (2018-08-08)\n _/ |\\__'_|_|_|\\__'_|  |  Official https://julialang.org/ release\n|__/                   |\n\njulia>\n

For a listing of command line options:

$ julia --help\n\njulia [switches] -- [programfile] [args...]\n -v, --version             Display version information\n -h, --help                Print this message\n\n -J, --sysimage <file>     Start up with the given system image file\n -H, --home <dir>          Set location of `julia` executable\n --startup-file={yes|no}   Load `~/.julia/config/startup.jl`\n --handle-signals={yes|no} Enable or disable Julia's default signal handlers\n --sysimage-native-code={yes|no}\n                           Use native code from system image if available\n --compiled-modules={yes|no}\n                           Enable or disable incremental precompilation of modules\n\n -e, --eval <expr>         Evaluate <expr>\n -E, --print <expr>        Evaluate <expr> and display the result\n -L, --load <file>         Load <file> immediately on all processors\n\n -p, --procs {N|auto}      Integer value N launches N additional local worker processes\n                           \"auto\" launches as many workers as the number\n                           of local CPU threads (logical cores)\n --machine-file <file>     Run processes on hosts listed in <file>\n\n -i                        Interactive mode; REPL runs and isinteractive() is true\n -q, --quiet               Quiet startup: no banner, suppress REPL warnings\n
","location":"docs/software/using/julia/#using-julia"},{"title":"Running a Julia script","text":"

A Julia program is easy to run on the command line outside of its interactive mode.

Here is an example where we create a simple Hello World program and launch it with Julia:

$ echo 'println(\"hello world\")' > helloworld.jl\n

That script can now simply be executed by calling julia <script_name>:

$ julia helloworld.jl\nhello world\n
","location":"docs/software/using/julia/#running-a-julia-script"},{"title":"Submitting a Julia job","text":"

Here's an example Julia sbatch script that can be submitted via sbatch:

julia_test.sbatch
#!/bin/bash\n\n#SBATCH --time=00:10:00\n#SBATCH --mem=4G\n#SBATCH --output=julia_test.log\n\n# load the module\nml julia\n\n# run the Julia application\njulia helloworld.jl\n

You can save this script as julia_test.sbatch and submit it to the scheduler with:

$ sbatch julia_test.sbatch\n

Once the job is done, you should get a julia_test.log file in the current directory, with the following contents:

$ cat julia_test.log\nhello world\n
","location":"docs/software/using/julia/#submitting-a-julia-job"},{"title":"Julia packages","text":"

Julia provides an ever-growing list of packages that can be used to install add-on functionality to your Julia code.

Installing packages with Julia is very simple. Julia includes a package module in its base installation that handles installing, updating, and removing packages.

First import the Pkg module:

julia> import Pkg\njulia> Pkg.status()\n    Status `~/.julia/environments/v1.0/Project.toml`\n

Julia packages only need to be installed once

You only need to install Julia packages once on Sherlock. Since filesystems are shared, packages installed on one node will immediately be available on all nodes on the cluster.

","location":"docs/software/using/julia/#julia-packages"},{"title":"Installing packages","text":"

You can first check the status of packages installed on Julia using the status function of the Pkg module:

julia> Pkg.status()\nNo packages installed.\n

You can then add packages using the add function of the Pkg module:

julia> Pkg.add(\"Distributions\")\nINFO: Cloning cache of Distributions from git://github.com/JuliaStats/Distributions.jl.git\nINFO: Cloning cache of NumericExtensions from git://github.com/lindahua/NumericExtensions.jl.git\nINFO: Cloning cache of Stats from git://github.com/JuliaStats/Stats.jl.git\nINFO: Installing Distributions v0.2.7\nINFO: Installing NumericExtensions v0.2.17\nINFO: Installing Stats v0.2.6\nINFO: REQUIRE updated.\n

Using the status function again, you can see that the package and its dependencies have been installed:

julia> Pkg.status()\nRequired packages:\n - Distributions                 0.2.7\nAdditional packages:\n - NumericExtensions             0.2.17\n - Stats                         0.2.6\n
","location":"docs/software/using/julia/#installing-packages"},{"title":"Updating Packages","text":"

The update function of the Pkg module can update all packages installed:

julia> Pkg.update()\nINFO: Updating METADATA...\nINFO: Computing changes...\nINFO: Upgrading Distributions: v0.2.8 => v0.2.10\nINFO: Upgrading Stats: v0.2.7 => v0.2.8\n
","location":"docs/software/using/julia/#updating-packages"},{"title":"Removing packages","text":"

The rm function of the Pkg module can remove any installed packages as well:

julia> Pkg.rm(\"Distributions\")\nINFO: Removing Distributions v0.2.7\nINFO: Removing Stats v0.2.6\nINFO: Removing NumericExtensions v0.2.17\nINFO: REQUIRE updated.\n\njulia> Pkg.status()\nRequired packages:\n - SHA                           0.3.2\n\njulia> Pkg.rm(\"SHA\")\nINFO: Removing SHA v0.3.2\nINFO: REQUIRE updated.\n\njulia> Pkg.status()\nNo packages installed.\n
","location":"docs/software/using/julia/#removing-packages"},{"title":"Examples","text":"","location":"docs/software/using/julia/#examples"},{"title":"Parallel job","text":"

Julia can natively spawn parallel workers across multiple compute nodes, without using MPI. There are two main modes of operation:

  1. ClusterManager: in this mode, you can spawn workers from within the Julia interpreter, and each worker will actually submit jobs to the scheduler, executing instructions within those jobs.

  2. using the --machine-file option: here, you submit a SLURM job and run the Julia interpreter in parallel mode within the job's resources.

The second mode is easier to use, and more convenient, since you have all your resources available and ready to use when the job starts. In mode 1, you'll need to wait for jobs to be dispatched and executed inside Julia.

Here is a quick example on how to use the --machine-file option on Sherlock.

Given the following Julia script (julia_parallel_test.jl) that will print a line with the process id and the node it's executing on, in parallel:

julia_parallel_test.jl
using Distributed\n@everywhere println(\"process: $(myid()) on host $(gethostname())\")\n

You can submit the following job:

julia_test.sbatch
#!/bin/bash\n#SBATCH --nodes 2\n#SBATCH --ntasks-per-node 4\n#SBATCH --time 5:0\n\nml julia\njulia --machine-file <(srun hostname -s)  ./julia_parallel_test.jl\n

Save as julia_test.sbatch, and then:

$ sbatch  julia_test.sbatch\n

It will:

  1. Request 2 nodes, 4 tasks per node (8 tasks total)
  2. Load the julia module
  3. Run Julia in parallel with a machine file that is automatically generated, listing the nodes that are assigned to your job.

It should output something like this in your job's output file:

process: 1 on host sh-06-33.int\n      From worker 2:    process: 2 on host sh-06-33.int\n      From worker 3:    process: 3 on host sh-06-34.int\n      From worker 5:    process: 5 on host sh-06-33.int\n      From worker 4:    process: 4 on host sh-06-33.int\n      From worker 6:    process: 6 on host sh-06-33.int\n      From worker 8:    process: 8 on host sh-06-34.int\n      From worker 9:    process: 9 on host sh-06-34.int\n      From worker 7:    process: 7 on host sh-06-34.int\n
","location":"docs/software/using/julia/#parallel-job"},{"title":"MariaDB","text":"","location":"docs/software/using/mariadb/"},{"title":"Introduction","text":"

MariaDB is a community-developed fork of the MySQL relational database management system. It is completely compatible with MySQL and can be used as a drop-in replacement in the vast majority of cases.

","location":"docs/software/using/mariadb/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using MariaDB on Sherlock. For more complete documentation about MariaDB in general, please see the MariaDB documentation.

","location":"docs/software/using/mariadb/#more-documentation"},{"title":"MariaDB on Sherlock","text":"

We don't provide any centralized database service on Sherlock, but we provide a centralized installation of MariaDB, and each user is welcome to start their own instance of the database server to fit their jobs' needs.

The overall process to run an instance of MariaDB on Sherlock would look like this:

  1. configure and initialize your environment so you can start a database instance under your user account,
  2. start the database server,
  3. run SQL queries from the same node (via a local socket), or from other nodes and/or jobs (via the network).
","location":"docs/software/using/mariadb/#mariadb-on-sherlock"},{"title":"Single-node access","text":"

In this example, the database server and client will run within the same job, on the same compute node.

","location":"docs/software/using/mariadb/#single-node-access"},{"title":"Preparation","text":"

You first need to let MariaDB know where to store its database, where to log things, and how to allow connections from clients. The commands below only need to be executed once.

For this, you'll need to create a .my.cnf file in your home directory. Assuming you'll want to store your database files in a db/ directory in your $SCRATCH folder, you can run the following commands:

$ export DB_DIR=$SCRATCH/db\n$ mkdir $DB_DIR\n\n$ cat << EOF > ~/.my.cnf\n[mysqld]\ndatadir=$DB_DIR\nsocket=$DB_DIR/mariadb.sock\nuser=$USER\nsymbolic-links=0\nskip-networking\n\n[mysqld_safe]\nlog-error=$DB_DIR/mariadbd.log\npid-file=$DB_DIR/mariadbd.pid\n\n[mysql]\nsocket=$DB_DIR/mariadb.sock\nEOF\n

.my.cnf doesn't support environment variables

Please note that if you edit your ~/.my.cnf file directly in a file editor, without using the HEREDOC syntax above, environment variables such as $DB_DIR, $HOME or $USER won't work: you will need to specify absolute paths explicitly, such as /scratch/users/kilian/db/mariadbd.log.

If you use the HEREDOC syntax, you can verify that the resulting .my.cnf file does actually contain full paths, and not environment variables anymore.

Once you have the .my.cnf file in place, you need to initialize your database with some internal data that MariaDB needs. In the same terminal, run the following commands:

$ ml system mariadb\n$ $MARIADB_DIR/scripts/mysql_install_db --basedir=$MARIADB_DIR  --datadir=$DB_DIR\n
","location":"docs/software/using/mariadb/#preparation"},{"title":"Start the server","text":"

You can now start the MariaDB server. For this, first get an allocation on a compute node, note the hostname of the compute node your job has been allocated, load the mariadb module, and then run the mysqld_safe process:

$ srun --pty bash\n$ echo $SLURM_JOB_NODELIST\nsh-01-01\n$ ml system mariadb\n$ mysqld_safe\n180705 18:14:27 mysqld_safe Logging to '/home/users/kilian/db/mysqld.log'.\n180705 18:14:28 mysqld_safe Starting mysqld daemon with databases from /home/users/kilian/db/\n

The mysqld_safe process will be blocking, meaning it will not give the prompt back for as long as the MariaDB server runs.

If it does return on its own, it probably means that something went wrong, and you'll find more information about the issue in the $DB_DIR/mariadbd.log file you defined in ~/.my.cnf.

","location":"docs/software/using/mariadb/#start-the-server"},{"title":"Run queries","text":"

You're now ready to run queries against that MariaDB instance, from the same node your job is running on.

From another terminal on Sherlock, connect to your job's compute node (here, it's sh-01-01, as shown above), load the mariadb module, and then run the mysql command: it will open the MariaDB shell, ready to run your SQL queries:

$ ssh sh-01-01\n$ ml system mariadb\n$ mysql\nWelcome to the MariaDB monitor.  Commands end with ; or \\g.\nYour MariaDB connection id is 8\nServer version: 10.2.11-MariaDB Source distribution\n\nCopyright (c) 2000, 2017, Oracle, MariaDB Corporation Ab and others.\n\nType 'help;' or '\\h' for help. Type '\\c' to clear the current input statement.\n\nMariaDB [(none)]>\n

Once you're done with your MariaDB instance, you can just terminate your job, and all the processes will be terminated automatically.

","location":"docs/software/using/mariadb/#run-queries"},{"title":"Multi-node access","text":"

In case you need to run a more persistent instance of MariaDB, you can for instance submit a dedicated job to run the server, make it accessible over the network, and run queries from other jobs and/or nodes.

","location":"docs/software/using/mariadb/#multi-node-access"},{"title":"Enable network access","text":"

The preparation steps are pretty similar to the single-node case, except the MariaDB server instance will be accessed over the network rather than through a local socket.

Network access must be secured

When running a networked instance of MariaDB, please keep in mind that any user on Sherlock will be able to connect to the TCP ports that mysqld runs on, and that proper configuration must be done to prevent unauthorized access.

Like in the single-node case, you need to create a ~/.my.cnf file, but without the skip-networking directive.

$ export DB_DIR=$SCRATCH/db\n$ mkdir $DB_DIR\n\n$ cat << EOF > ~/.my.cnf\n[mysqld]\ndatadir=$DB_DIR\nsocket=$DB_DIR/mariadb.sock\nuser=$USER\nsymbolic-links=0\n\n[mysqld_safe]\nlog-error=$DB_DIR/mariadbd.log\npid-file=$DB_DIR/mariadbd.pid\n\n[mysql]\nsocket=$DB_DIR/mariadb.sock\nEOF\n

And then initiate the database:

$ ml system mariadb\n$ $MARIADB_DIR/scripts/mysql_install_db --basedir=$MARIADB_DIR  --datadir=$DB_DIR\n
","location":"docs/software/using/mariadb/#enable-network-access"},{"title":"Secure access","text":"

We will now set a password for the MariaDB root user to a random string, just for the purpose of preventing unauthorized access, since we won't need it for anything.

We will actually create a MariaDB user with all privileges on the databases, that will be able to connect to this instance from any node. This user will need a real password, though. So please make sure to replace the my-secure-password string below with the actual password of your choice.

Choose a proper password

This password will only be used to access this specific instance of MariaDB. Note that anybody knowing that password will be allowed to connect to your MariaDB instances and modify data in the tables.

  • do NOT literally use my-secure-password
  • do NOT use your SUNet ID password

Once you've chosen your password, you can start the mysqld process on a compute node, like before:

$ srun --pty bash\n$ echo $SLURM_JOB_NODELIST\nsh-01-01\n$ ml system mariadb\n$ mysqld_safe\n

And then, from another terminal, run the following commands to secure access to your MariaDB database.

$ ssh sh-01-01\n$ mysql -u root << EOF\nUPDATE mysql.user SET Password=PASSWORD(RAND()) WHERE User='root';\nDELETE FROM mysql.user WHERE User='root' AND Host NOT IN ('localhost', '127.0.0.1', '::1');\nDELETE FROM mysql.user WHERE User='';\nDELETE FROM mysql.db WHERE Db='test' OR Db='test_%';\nGRANT ALL PRIVILEGES ON *.* TO '$USER'@'%' IDENTIFIED BY 'my-secure-password' WITH GRANT OPTION;\nFLUSH PRIVILEGES;\nEOF\n

Once you've done that, you're ready to terminate that interactive job, and start a dedicated MariaDB server job.

","location":"docs/software/using/mariadb/#secure-access"},{"title":"Start MariaDB in a job","text":"

You can use the following mariadb.sbatch job as a template:

#!/bin/bash\n\n#SBATCH --job-name=mariadb\n#SBATCH --time=8:0:0\n#SBATCH --dependency=singleton\n\nml system mariadb\nmysqld_safe\n

and submit it with:

$ sbatch mariadb.sbatch\n

Concurrent instances will lead to data corruption

An important thing to keep in mind is that having multiple instances of a MariaDB server running at the same time, using the same database files, will certainly lead to catastrophic situations and the corruption of those files.

To prevent this from happening, the --dependency=singleton job submission option will make sure that only one instance of that job (based on its name and user) will run at any given time.

","location":"docs/software/using/mariadb/#start-mariadb-in-a-job"},{"title":"Connect to the running instance","text":"

Now, from any node on Sherlock, whether from a login node, an interactive job, or a batch job, using the mysql CLI or any application binding in any language, you should be able to connect to your running MariaDB instance.

First, identify the node your job is running on with squeue:

$ squeue -u $USER -n mariadb\n             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)\n          21383445    normal  mariadb   kilian  R       0:07      1 sh-01-02\n

and then, point your MariaDB client to that node:

$ ml system mariadb\n$ mysql -h sh-01-02 -p\nEnter password:\nWelcome to the MariaDB monitor.  Commands end with ; or \\g.\nYour MariaDB connection id is 15\nServer version: 10.2.11-MariaDB Source distribution\n\nCopyright (c) 2000, 2017, Oracle, MariaDB Corporation Ab and others.\n\nType 'help;' or '\\h' for help. Type '\\c' to clear the current input statement.\n\nMariaDB [(none)]>\n

That's it! You can now run SQL queries from anywhere on Sherlock to your own MariaDB instance.

","location":"docs/software/using/mariadb/#connect-to-the-running-instance"},{"title":"Persistent DB instances","text":"

SQL data is persistent

All the data you import in your SQL databases will be persistent across jobs. Meaning that you can run a MariaDB server job for the day, import data in its database, stop the job, and resubmit the same MariaDB server job the next day: all your data will still be there as long as the location you've chosen for your database (the $DB_DIR defined in the Preparation steps) is on a persistent storage location.

If you need database access for more than the maximum runtime of a job, you can use the instructions provided to define self-resubmitting recurring jobs and submit long-running database instances.

","location":"docs/software/using/mariadb/#persistent-db-instances"},{"title":"Matlab","text":"","location":"docs/software/using/matlab/"},{"title":"Introduction","text":"

MATLAB is a numerical computing environment and proprietary programming language developed by MathWorks.

","location":"docs/software/using/matlab/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using Matlab on Sherlock. For more complete documentation about Matlab in general, please see the official MATLAB documentation.

","location":"docs/software/using/matlab/#more-documentation"},{"title":"MATLAB on Sherlock","text":"","location":"docs/software/using/matlab/#matlab-on-sherlock"},{"title":"Licensing","text":"

MATLAB is a commercial software suite, which is now available at no cost for all Stanford Faculty, students, and staff.

Note: a number of free, open-source alternatives exist and can be used in many situations: Octave, R, Julia, or Python are all available on Sherlock, and can often replace MATLAB with good results.

","location":"docs/software/using/matlab/#licensing"},{"title":"Using MATLAB","text":"

The MATLAB module can be loaded with:

$ ml load matlab\n

This will load the current default version. For a list of available versions run ml spider matlab at the Sherlock prompt, or refer to the Software list page.

MATLAB can't run on login nodes

Running MATLAB directly on login nodes is not supported and will produce the following message:

-----------------------------------------------------------------------\nWARNING: running MATLAB directly on login nodes is not supported.  Please\nmake sure you request an interactive session on a compute node with \"sh_dev\"\nfor instance) before launching MATLAB interactively.\n-----------------------------------------------------------------------\n
You will need to submit a job or request an interactive session on a compute node before you can start MATLAB.

Once you are on a compute node and your environment is configured (i.e. when the matlab module is loaded), MATLAB can be started by simply typing matlab at the shell prompt.

$ sh_dev\n$ ml load matlab\n$ matlab\nMATLAB is selecting SOFTWARE OPENGL rendering.\n                          < M A T L A B (R) >\n                Copyright 1984-2019 The MathWorks, Inc.\n                R2019a (9.6.0.1072779) 64-bit (glnxa64)\n                             March 8, 2019\n\nTo get started, type doc.\nFor product information, visit www.mathworks.com.\n\n>>\n

For a listing of command line options:

$ matlab -help\n
","location":"docs/software/using/matlab/#using-matlab"},{"title":"Running a MATLAB script","text":"

There are several ways to launch a MATLAB script on the command line, as documented in the MATLAB documentation:

Method Output matlab -nodesktop < script.m MATLAB will run the code from script.m and display output on stdout matlab -nodisplay Start MATLAB in CLI mode, without its graphical desktop environment matlab -nojvm do not start the JVM1","location":"docs/software/using/matlab/#running-a-matlab-script"},{"title":"MATLAB GUI","text":"

It's often best to use your laptop or desktop to develop and debug MATLAB code, and to visualize the output. If you do need to use the MATLAB GUI on a large cluster like Sherlock, you will need to enable X11 forwarding in your SSH client.

For instance:

$ ssh -X <YourSUNetID>@login.sherlock.stanford.edu\n

And then, once on Sherlock:

$ sh_dev\n$ ml load matlab\n$ matlab\n

For more info on X11 forwarding, you can refer to this UIT page.

","location":"docs/software/using/matlab/#matlab-gui"},{"title":"Examples","text":"","location":"docs/software/using/matlab/#examples"},{"title":"Simple MATLAB job","text":"

Here is an example MATLAB batch script that can be submitted with sbatch:

#!/bin/bash\n#SBATCH --job-name=matlab_test\n#SBATCH --output=matlab_test.\"%j\".out\n#SBATCH --error=matlab_test.\"%j\".err\n#SBATCH --partition=normal\n#SBATCH --time=00:10:00\n#SBATCH --cpus-per-task=1\n#SBATCH --mem=8G\n#SBATCH --mail-type=ALL\n\nmodule load matlab\nmatlab -nodisplay < example.m\n

This simple job, named matlab_test, will run a MATLAB script named example.m in the normal partition, for a duration of 10 minutes, and use 1 CPU and 8GB of RAM. It will send you an email (to whatever email you used when you signed up for Sherlock) when it begins, ends or fails.

Additionally, to aid in debugging, it will log any errors and output to the files matlab_test.JOBID.{out,err} with the jobid appended to the filename (%j).

To create the script, open a text editor on Sherlock, copy the contents of the script, and save it as matlab_test.sbatch

Then, submit the job with the sbatch command:

$ sbatch matlab_test.sbatch\nSubmitted batch job 59942277\n

You can check the status of the job with the squeue command, and check the contents of the matlab_test.JOBID.{out,err} files to see the results.

","location":"docs/software/using/matlab/#simple-matlab-job"},{"title":"Parallel loop","text":"

You can run your MATLAB code across multiple CPUs on Sherlock using parfor loops, to take advantage of the multiple CPU cores that each node features. You can submit a job requesting as many CPUs as there are on a node in a single job. The key is to grab the SLURM environment variable $SLURM_CPUS_PER_TASK and create the worker pool in your MATLAB code with:

parpool('local', str2num(getenv('SLURM_CPUS_PER_TASK')))\n

Here is an example of a sbatch submission script that requests 16 CPUs on a node, and runs a simple MATLAB script using parfor.

Save the two scripts below as parfor.sbatch and parfor_loop.m:

parfor.sbatchparfor_loop.m
#!/bin/bash\n#SBATCH -J pfor_matlab\n#SBATCH -o pfor\".%j\".out\n#SBATCH -e pfor\".%j\".err\n#SBATCH -t 20:00\n#SBATCH -p normal\n#SBATCH -c 16\n#SBATCH --mail-type=ALL\n\nmodule load matlab\nmatlab -batch parfor_loop\n
%============================================================================\n% Parallel Monte Carlo calculation of PI\n%============================================================================\nparpool('local', str2num(getenv('SLURM_CPUS_PER_TASK')))\nR = 1;\ndarts = 1e7;\ncount = 0;\ntic\nparfor i = 1:darts\n   % Compute the X and Y coordinates of where the dart hit the...............\n   % square using Uniform distribution.......................................\n   x = R*rand(1);\n   y = R*rand(1);\n   if x^2 + y^2 <= R^2\n      % Increment the count of darts that fell inside of the.................\n      % circle...............................................................\n     count = count + 1; % Count is a reduction variable.\n   end\nend\n% Compute pi.................................................................\nmyPI = 4*count/darts;\nT = toc;\nfprintf('The computed value of pi is %8.7f.\\n',myPI);\nfprintf('The parallel Monte-Carlo method is executed in %8.2f seconds.\\n', T);\ndelete(gcp);\nexit;\n

You can now submit the job with the following command:

sbatch parfor.sbatch\n

If you run htop or pstree -u $USER on the compute node that is running your job, you will see all 16 cores allocated to your MATLAB code.

You can also try that same job with different numbers of CPUs, and see how well it scales.

  1. MATLAB uses the Java\u00ae Virtual Machine (JVM\u2122) software to run the desktop and to display graphics. The -nojvm option enables you to start MATLAB without the JVM. Using this option minimizes memory usage and improves initial start-up speed, but restricts functionality.\u00a0\u21a9

","location":"docs/software/using/matlab/#parallel-loop"},{"title":"Perl","text":"","location":"docs/software/using/perl/"},{"title":"Introduction","text":"

Perl is a high-level, general-purpose, interpreted, dynamic programming language. Originally developed by Larry Wall in 1987 as a general-purpose Unix scripting language to make report processing easier, it has since undergone many changes and revisions.

Perl provides a framework allowing users to easily extend the language by installing new modules in their local environment. The Comprehensive Perl Archive Network (CPAN1) is an archive of over 25,000 distributions of software written in Perl, as well as documentation for it. It is searchable at http://metacpan.org or http://search.cpan.org and mirrored in over 270 locations around the world.

","location":"docs/software/using/perl/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using Perl on Sherlock. For more complete documentation about Perl in general, please see the Perl documentation.

","location":"docs/software/using/perl/#more-documentation"},{"title":"Perl modules on Sherlock","text":"

To install Perl modules from CPAN, we recommend using the (provided) App::cpanminus and local::lib modules:

  • App::cpanminus is a popular alternative CPAN client that can be used to manage Perl distributions. It has many great features, including uninstalling modules.
  • local::lib allows users to install Perl modules in the directory of their choice (typically their home directory) without administrative privileges.

Both are already installed on Sherlock, and are automatically enabled and configured when you load the perl module. You don't need to add anything in your ~/.bashrc file: the Sherlock perl module will automatically create everything that is required so you can directly run a command to install Perl modules locally.

","location":"docs/software/using/perl/#perl-modules-on-sherlock"},{"title":"Installation","text":"

Perl modules installation is only necessary once

You only need to install Perl modules once on Sherlock. Since filesystems are shared, modules installed on one node will immediately be available on all nodes on the cluster.

As an example, to install the DateTime::TimeZone module, you can do the following:

$ ml perl\n$ cpanm DateTime::TimeZone\n
","location":"docs/software/using/perl/#installation"},{"title":"Usage","text":"

Once installed, you can use the Perl modules directly, no specific options or syntax is required.

For instance, to check that the DateTime::TimeZone module is correctly installed:

$ perl -MDateTime::TimeZone -e 'print $DateTime::TimeZone::VERSION . \"\\n\"';\n2.13\n
","location":"docs/software/using/perl/#usage"},{"title":"Uninstallation","text":"

To uninstall a Perl module:

$ cpanm -U DateTime::TimeZone\n
  1. CPAN can denote either the archive network itself, or the Perl program that acts as an interface to the network and as an automated software installer (somewhat like a package manager). Most software on CPAN is free and open source.\u00a0\u21a9

","location":"docs/software/using/perl/#uninstallation"},{"title":"PostgreSQL","text":"","location":"docs/software/using/postgresql/"},{"title":"Introduction","text":"

PostgreSQL is a powerful, open source object-relational database system with a strong focus on reliability, feature robustness, and performance.

","location":"docs/software/using/postgresql/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using PostgreSQL on Sherlock. For more complete documentation about PostgreSQL in general, please see the PostgreSQL documentation.

","location":"docs/software/using/postgresql/#more-documentation"},{"title":"PostgreSQL on Sherlock","text":"

We don't provide any centralized database service on Sherlock, but we provide a centralized installation of PostgreSQL, and each user is welcome to start their own instance of the database server to fit their jobs' needs.

The overall process to run an instance of PostgreSQL on Sherlock would look like this:

  1. configure and initialize your environment so you can start a database instance under your user account,
  2. start the database server,
  3. run SQL queries from the same node (via a local socket), or from other nodes and/or jobs (via the network).
","location":"docs/software/using/postgresql/#postgresql-on-sherlock"},{"title":"Single-node access","text":"

In that example, the database server and client will run within the same job, on the same compute node.

","location":"docs/software/using/postgresql/#single-node-access"},{"title":"Preparation","text":"

You first need to let PostgreSQL know where to store its database. The commands below only need to be executed once.

Assuming you'll want to store your database files in a db/ directory in your $SCRATCH folder, you can run the following commands:

$ export DB_DIR=$SCRATCH/db\n$ mkdir $DB_DIR\n

Once you have your $DB_DIR in place, you need to initialize your database with some internal data that PostgreSQL needs. In the same terminal, run the following commands:

$ ml system postgresql\n$ initdb $DB_DIR\n
","location":"docs/software/using/postgresql/#preparation"},{"title":"Start the server","text":"

You can now start the PostgreSQL server. For this, first get an allocation on a compute node, note the hostname of the compute node your job has been allocated, load the postgresql module, and then run the postgresql server:

$ srun --pty bash\n$ echo $SLURM_JOB_NODELIST\nsh-01-01\n$ ml system postgresql\n$ export DB_DIR=$SCRATCH/db\n$ postgres -D $DB_DIR\n[...]\n2018-10-09 17:42:08.094 PDT [3841] LOG:  database system is ready to accept connections\n

The postgres process will be blocking, meaning it will not give the prompt back for as long as the PostgreSQL server runs.

","location":"docs/software/using/postgresql/#start-the-server"},{"title":"Run queries","text":"

You're now ready to run queries against that PostgreSQL instance, from the same node your job is running on.

From another terminal on Sherlock, connect to your job's compute node (here, it's sh-01-01, as shown above), load the postgresql module, and then run the createdb command: it will create a database that you can use as a testbed:

$ ssh sh-01-01\n$ ml system postgresql\n$ createdb test_db\n

Once this is done, from the same shell, you can run the psql command, which will open the PostgreSQL shell, ready to run your SQL queries:

$ psql test_db\npsql (10.5)\nType \"help\" for help.\n\ntest_db=#\n

Once you're done with your PostgreSQL instance, you can just terminate your job, and all the processes will be terminated automatically.

","location":"docs/software/using/postgresql/#run-queries"},{"title":"Multi-node access","text":"

In case you need to run a more persistent instance of PostgreSQL, you can for instance submit a dedicated job to run the server, make it accessible over the network, and run queries from other jobs and/or nodes.

","location":"docs/software/using/postgresql/#multi-node-access"},{"title":"Enable network access","text":"

The preparation steps are pretty similar to the single-node case, except the PostgreSQL server instance will be accessed over the network rather than through a local socket.

Network access must be secured

When running a networked instance of PostgreSQL, please keep in mind that any user on Sherlock could potentially be able to connect to the TCP ports that postgres runs on, and that proper configuration must be done to prevent unauthorized access.

Like in the single-node case, you need to start the postgres server process, but with the -i option to enable network connections, and define user access in your $DB_DIR/pg_hba.conf file (see below).

","location":"docs/software/using/postgresql/#enable-network-access"},{"title":"Secure access","text":"

To allow network connections to the database server, a password will need to be defined for the PostgreSQL user. That will allow this user to connect to the PostgreSQL instance from any node. Please make sure to replace the my-secure-password string below by the actual password of your choice.

Choose a proper password

This password will only be used to access this specific instance of PostgreSQL. Note that anybody knowing that password will be allowed to connect to your PostgreSQL instances and modify data in the tables.

  • do NOT use my-secure-password
  • do NOT use your SUNet ID password

Once you've chosen your password, you can now start the PostgreSQL server on a compute node, as described in the previous section, initialize the database, and set the user password:

$ srun --pty bash\n\n$ echo $SLURM_JOB_NODELIST\nsh-01-01\n$ export DB_DIR=$SCRATCH/db\n$ mkdir $DB_DIR\n\n$ ml system postgresql\n$ initdb $DB_DIR\n$ createdb test_db\n\n$ psql -c \"ALTER USER $USER PASSWORD 'my-secure-password';\" test_db\n

Then, we need to edit the $DB_DIR/pg_hba.conf file to allow network access for user $USER:

$ cat << EOF > $DB_DIR/pg_hba.conf\nlocal   all             all                                     trust\nhost    all             all             127.0.0.1/32            trust\nhost    all             all             ::1/128                 trust\nhost    all             $USER           samenet                 md5\nEOF\n

Once you've done that, you're ready to terminate that interactive job, and start a dedicated PostgreSQL server job.

$ pg_ctl stop -D $DB_DIR\n$ logout\n
","location":"docs/software/using/postgresql/#secure-access"},{"title":"Start PostgreSQL in a job","text":"

You can use the following postgresql.sbatch job as a template:

#!/bin/bash\n\n#SBATCH --job-name=postgresql\n#SBATCH --time=8:0:0\n#SBATCH --dependency=singleton\n\nexport DB_DIR=$SCRATCH/db\n\nml system postgresql\n\npostgres -i -D $DB_DIR\n

and submit it with:

$ sbatch postgresql.sbatch\n

Concurrent instances will lead to data corruption

An important thing to keep in mind is that having multiple instances of a PostgreSQL server running at the same time, using the same database files, will certainly lead to catastrophic situations and the corruption of those files.

To prevent this from happening, the --dependency=singleton job submission option will make sure that only one instance of that job (based on its name and user) will run at any given time.

","location":"docs/software/using/postgresql/#start-postgresql-in-a-job"},{"title":"Connect to the running instance","text":"

Now, from any node on Sherlock, whether from a login node, an interactive job, or a batch job, using the psql CLI or any application binding in any language, you should be able to connect to your running PostgreSQL instance.

First, identify the node your job is running on with squeue:

$ squeue -u $USER -n postgresql\n             JOBID PARTITION       NAME     USER ST       TIME  NODES NODELIST(REASON)\n          21383445    normal postgresql   kilian  R       0:07      1 sh-01-02\n

and then, point your PostgreSQL client to that node:

$ ml system postgresql\n$ psql -h sh-01-02 test_db\nPassword:\npsql (10.5)\nType \"help\" for help.\n\ntest_db=#\n

That's it! You can now run SQL queries from anywhere on Sherlock to your own PostgreSQL instance.

","location":"docs/software/using/postgresql/#connect-to-the-running-instance"},{"title":"Persistent DB instances","text":"

SQL data is persistent

All the data you import in your SQL databases will be persistent across jobs. Meaning that you can run a PostgreSQL server job for the day, import data in its database, stop the job, and resubmit the same PostgreSQL server job the next day: all your data will still be there as long as the location you've chosen for your database (the $DB_DIR defined in the Preparation steps) is on a persistent storage location.

If you need database access for more than the maximum runtime of a job, you can use the instructions provided to define self-resubmitting recurring jobs and submit long-running database instances.

","location":"docs/software/using/postgresql/#persistent-db-instances"},{"title":"Python","text":"","location":"docs/software/using/python/"},{"title":"Introduction","text":"

Python is an interpreted high-level programming language for general-purpose programming. Its design philosophy emphasizes code readability. It provides constructs that enable clear programming on both small and large scales, which makes it both easy to learn and very well-suited for rapid prototyping.

","location":"docs/software/using/python/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using Python on Sherlock. For more complete documentation about Python in general, please see the Python documentation.

","location":"docs/software/using/python/#more-documentation"},{"title":"Python on Sherlock","text":"

Sherlock features multiple versions of Python.

Some applications only work with legacy features of version 2.x, while more recent code will require specific version 3.x features. Modules on Sherlock may only be available in a single flavor (as denoted by their suffix: _py27 or _py36), because the application only supports one or the other.

You can load either version on Sherlock by doing the following commands:

$ ml python/2.7.13\n

or

$ ml python/3.6.1\n

The Python3 interpreter is python3

The Python3 executable is named python3, not python. So, once you have the \"python/3.6.1\" module loaded on Sherlock, you will need to use python3 to invoke the proper interpreter. python will still refer to the default, older system-level Python installation, and may result in errors when trying to run Python3 code.

This is an upstream decision detailed in PEP-394, not something specific to Sherlock.

","location":"docs/software/using/python/#python-on-sherlock"},{"title":"Using Python","text":"

Once your environment is configured (i.e. when the Python module is loaded), Python can be started by simply typing python at the shell prompt:

$ python\nPython 2.7.13 (default, Apr 27 2017, 14:19:21)\n[GCC 4.8.5 20150623 (Red Hat 4.8.5-11)] on linux2\nType \"help\", \"copyright\", \"credits\" or \"license\" for more information.\n>>>\n
","location":"docs/software/using/python/#using-python"},{"title":"Python in batch jobs","text":"

Python output is buffered by default

By default, Python buffers console output. It means that when running Python in a batch job through Slurm, you may see output less often than you would when running interactively.

When output is being buffered, the print statements are aggregated until there is enough data to print, and then the messages are all printed at once. As a consequence, job output files (as specified with the --output and --error job submission options) will be refreshed less often and may give the impression that the job is not running.

For debugging or checking that a Python script is producing the correct output, you may want to switch off buffering.

","location":"docs/software/using/python/#python-in-batch-jobs"},{"title":"Switching off buffering","text":"

For a single python script you can use the -u option, as in python -u my_script.py. The -u option stands for \"unbuffered\".

For instance:

#!/bin/bash\n#SBATCH -n 1\n\npython -u my_script.py\n

Tip

You can also use the environment variable PYTHONUNBUFFERED to set unbuffered I/O for your whole batch script.

#!/bin/bash\n#SBATCH -n 1\n\nexport PYTHONUNBUFFERED=True\npython my_script.py\n

NB: There is some performance penalty for having unbuffered print statements, so you may want to reduce the number of print statements, or run buffered for production runs.

","location":"docs/software/using/python/#switching-off-buffering"},{"title":"Python packages","text":"

The capabilities of Python can be extended with packages developed by third parties. In general, to simplify operations, it is left up to individual users and groups to install these third-party packages in their own directories. However, Sherlock provides tools to help you install the third-party packages that you need.

Among many others, the following common Python packages are provided on Sherlock:

  • NumPy
  • SciPy

Python modules on Sherlock generally follow the naming scheme below:

py-<package_name>/version_py<python_version>\n

For instance, NumPy modules are:

  • py-numpy/1.14.3_py27
  • py-numpy/1.14.3_py36

You can list all available module versions for a package with ml spider <package_name>. For instance:

$ ml spider tensorflow\n-------------------------------------------------------------------------------\n  py-tensorflow:\n-------------------------------------------------------------------------------\n    Description:\n      TensorFlow\u2122 is an open source software library for numerical computation using data flow graphs.\n\n     Versions:\n        py-tensorflow/1.6.0_py27\n        py-tensorflow/1.6.0_py36\n        py-tensorflow/1.7.0_py27\n        py-tensorflow/1.9.0_py27\n        py-tensorflow/1.9.0_py36\n

Dependencies are handled automatically

When you decide to use NumPy on Sherlock, you just need to load the py-numpy module of your choice, and the correct Python interpreter will be loaded automatically. No need to load a python module explicitly.

","location":"docs/software/using/python/#python-packages"},{"title":"Installing packages","text":"

If you need to use a Python package that is not already provided as a module on Sherlock, you can use the pip command. This command takes care of compiling and installing most Python packages and their dependencies. All of pip's commands and options are explained in detail in the Pip user guide.

A comprehensive index of Python packages can be found at PyPI.

To install Python packages with pip, you'll need to use the --user option. This will make sure that those packages are installed in a user-writable location (by default, your $HOME directory). Since your $HOME directory is shared across nodes on Sherlock, you'll only need to install your Python packages once, and they'll be ready to be used on every single node in the cluster.

For example:

$ pip install --user <package_name>\n

For Python 3, use pip3:

$ pip3 install --user <package_name>\n

Python packages will be installed in $HOME/.local/lib/python<version>/site-packages, meaning that packages for Python 2.x and Python 3.x will be kept separate. This means both that they won't interfere with each other, and that if you need to use a package with both Python 2.x and 3.x, you'll need to install it twice, once for each Python version.

","location":"docs/software/using/python/#installing-packages"},{"title":"List installed packages","text":"

You can easily see the list of the Python packages installed in your environment, and their location, with pip list:

$ pip list -v\nPackage    Version Location                                                            Installer\n---------- ------- ------------------------------------------------------------------- ---------\npip        18.1    /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip\nsetuptools 28.8.0  /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip\nurllib3    1.24    /home/users/kilian/.local/lib/python2.7/site-packages               pip\nvirtualenv 15.1.0  /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip\n
","location":"docs/software/using/python/#list-installed-packages"},{"title":"Alternative installation path","text":"

Python paths

While theoretically possible, installing Python packages in alternate locations can be tricky, so we recommend trying to stick to the pip install --user way as often as possible. But in case you absolutely need it, we provide some guidelines below.

One common case of needing to install Python packages in alternate locations is to share those packages with a group of users. Here's an example that will show how to install the urllib3 Python package in a group-shared location and let users from the group use it without having to install it themselves.

First, you need to create a directory to store those packages. We'll put it in $GROUP_HOME:

$ mkdir -p $GROUP_HOME/python/\n

Then, we load the Python module we need, and we instruct pip to install its packages in the directory we just created:

$ ml python/2.7.13\n$ PYTHONUSERBASE=$GROUP_HOME/python pip install --user urllib3\n

We still use the --user option, but with PYTHONUSERBASE pointing to a different directory, pip will install packages there.

Now, to be able to use that Python module, since it's not been installed in a default directory, you (and all the members of the group who will want to use that module) need to set their PYTHONPATH to include our new shared directory1:

$ export PYTHONPATH=$GROUP_HOME/python/lib/python2.7/site-packages:$PYTHONPATH\n

And now, the module should be visible:

$ pip list -v\nPackage    Version Location                                                            Installer\n---------- ------- ------------------------------------------------------------------- ---------\npip        18.1    /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip\nsetuptools 28.8.0  /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip\nurllib3    1.24    /home/groups/ruthm/python/lib/python2.7/site-packages               pip\nvirtualenv 15.1.0  /share/software/user/open/python/2.7.13/lib/python2.7/site-packages pip\n

$PYTHONPATH depends on the Python version

The $PYTHONPATH environment variable is dependent on the Python version you're using, so for Python 3.6, it should include $GROUP_HOME/python/lib/python3.6/site-packages

$PATH may also need to be updated

Some Python packages also install executable scripts. To make them easily accessible in your environment, you may also want to modify your $PATH to include their installation directory.

For instance, if you installed Python packages in $GROUP_HOME/python:

$ export PATH=$GROUP_HOME/python/bin:$PATH\n

","location":"docs/software/using/python/#alternative-installation-path"},{"title":"Installing from GitHub","text":"

pip also supports installing packages from a variety of sources, including GitHub repositories.

For instance, to install HTTPie, you can do:

$ pip install --user git+git://github.com/jkbr/httpie.git\n
","location":"docs/software/using/python/#installing-from-github"},{"title":"Installing from a requirements file","text":"

pip allows installing a list of packages listed in a file, which can be pretty convenient to install several dependencies at once.

In order to do this, create a text file called requirements.txt and place each package you would like to install on its own line:

requirements.txt
numpy\nscikit-learn\nkeras\ntensorflow\n

You can now install your modules like so:

$ ml python\n$ pip install --user -r requirements.txt\n
","location":"docs/software/using/python/#installing-from-a-requirements-file"},{"title":"Upgrading packages","text":"

pip can update already installed packages with the following command:

$ pip install --user --upgrade <package_name>\n

Upgrading packages also works with requirements.txt files:

$ pip install --user --upgrade -r requirements.txt\n
","location":"docs/software/using/python/#upgrading-packages"},{"title":"Uninstalling packages","text":"

To uninstall a Python package, you can use the pip uninstall command (note that it doesn't take any --user option):

$ pip uninstall <package_name>\n$ pip uninstall -r requirements.txt\n
  1. This line can also be added to a user's ~/.profile file, for a more permanent setting.\u00a0\u21a9

","location":"docs/software/using/python/#uninstalling-packages"},{"title":"Quantum Espresso","text":"","location":"docs/software/using/quantum-espresso/"},{"title":"Introduction","text":"

Quantum ESPRESSO is an integrated suite of Open-Source computer codes for electronic-structure calculations and materials modeling at the nanoscale. It is based on density-functional theory, plane waves, and pseudo-potentials.

Quantum ESPRESSO has evolved into a distribution of independent and inter-operable codes in the spirit of an open-source project. The Quantum ESPRESSO distribution consists of a \u201chistorical\u201d core set of components, and a set of plug-ins that perform more advanced tasks, plus a number of third-party packages designed to be inter-operable with the core components. Researchers active in the field of electronic-structure calculations are encouraged to participate in the project by contributing their own codes or by implementing their own ideas into existing codes.

","location":"docs/software/using/quantum-espresso/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using Quantum Espresso on Sherlock. For more complete documentation about Quantum Espresso in general, please see the Quantum Espresso documentation.

","location":"docs/software/using/quantum-espresso/#more-documentation"},{"title":"Quantum Espresso on Sherlock","text":"

To run Quantum Espresso on Sherlock, you can use one of the provided modules, or run it from a container.

The CPU version of Quantum Espresso can be loaded via the quantum-espresso module:

$ ml chemistry quantum-espresso\n

and the GPU version can be loaded via the quantum-espresso_gpu module:

$ ml chemistry quantum-espresso_gpu\n
","location":"docs/software/using/quantum-espresso/#quantum-espresso-on-sherlock"},{"title":"Examples","text":"

Here are a few examples showing how to run the AUSURF112 benchmark.

","location":"docs/software/using/quantum-espresso/#examples"},{"title":"Preparation","text":"

The first step is to get the benchmark files:

$ cd $SCRATCH\n$ git clone https://github.com/QEF/benchmarks qe_benchmarks\n$ cd qe_benchmarks/AUSURF112\n
","location":"docs/software/using/quantum-espresso/#preparation"},{"title":"CPU version","text":"

To submit a Quantum Espresso job to run the AUSURF112 benchmark on CPU nodes, the following submission script can be used:

qe-bench_cpu.sbatch
#!/bin/bash\n#SBATCH --nodes=2                # number of nodes for the job\n#SBATCH --ntasks-per-node=16     # number of tasks per node\n#SBATCH --time=00:30:00          # total run time limit (HH:MM:SS)\n#SBATCH --mail-type=begin        # send email when job begins\n#SBATCH --mail-type=end          # send email when job ends\n\nmodule reset\nmodule load chemistry\nmodule load quantum-espresso/7.0\n\ncd $SCRATCH/qe_benchmarks\ncd AUSURF112\n\nsrun pw.x -input ausurf.in -npool 2\n

In this example, the job will request 32 CPU cores on 2 nodes, 30 minutes of run time, and will send an email notification when the job starts and when it ends.

The job can be submitted with:

$ sbatch qe-bench_cpu.sbatch\n
","location":"docs/software/using/quantum-espresso/#cpu-version"},{"title":"GPU version","text":"","location":"docs/software/using/quantum-espresso/#gpu-version"},{"title":"Native","text":"

The GPU version can be loaded through the quantum-espresso_gpu module.

Using the same benchmark files as for the CPU version above, you can create a job submission script like this:

qe-bench_gpu.sbatch
#!/bin/bash\n#SBATCH --partition=gpu          # partition to submit the job to\n#SBATCH --nodes=2                # number of nodes for the job\n#SBATCH --gpus-per-node=1        # number of GPUs per node\n#SBATCH --time=00:30:00          # total run time limit (HH:MM:SS)\n#SBATCH --mail-type=begin        # send email when job begins\n#SBATCH --mail-type=end          # send email when job ends\n\nmodule reset\nmodule load chemistry\nmodule load quantum-espresso_gpu/7.0\n\ncd $SCRATCH/qe_benchmarks\ncd AUSURF112\n\nsrun pw.x -input ausurf.in -npool 2\n

In this example, the job will request 2 GPUs on 2 nodes, 30 minutes of run time, and will send an email notification when the job starts and when it ends.

It can be submitted with:

$ sbatch qe-bench_gpu.sbatch\n
","location":"docs/software/using/quantum-espresso/#native"},{"title":"NGC container","text":"

Another option to run a GPU version of Quantum Espresso is to use a 3rd-party container.

The NVIDIA GPU Cloud (NGC) hosts a Quantum Espresso container that could be used on Sherlock.

","location":"docs/software/using/quantum-espresso/#ngc-container"},{"title":"With Singularity","text":"

To use the container with Singularity, first pull the Quantum Espresso container with:

$ cd $SCRATCH\n$ singularity pull docker://nvcr.io/hpc/quantum_espresso:qe-7.0\n

Then create the following script:

qe-bench_gpu_singularity.sbatch
#!/bin/bash\n#SBATCH --partition=gpu          # partition to submit the job to\n#SBATCH --nodes=2                # number of nodes for the job\n#SBATCH --gpus-per-node=1        # number of GPUs per node\n#SBATCH --mem=32GB               # memory per node\n#SBATCH --time=00:30:00          # total run time limit (HH:MM:SS)\n#SBATCH --mail-type=begin        # send email when job begins\n#SBATCH --mail-type=end          # send email when job ends\n\ncd $SCRATCH/qe_benchmarks\ncd AUSURF112\n\nsrun singularity run --nv \\\n    $SCRATCH/quantum_espresso_qe-7.0.sif \\\n    pw.x -input ausurf.in -npool 2\n

and submit it:

$ sbatch qe-bench_gpu_singularity.sbatch\n
","location":"docs/software/using/quantum-espresso/#with-singularity"},{"title":"With pyxis/enroot","text":"

To use the container with pyxis/enroot, you can directly submit the following script:

qe-bench_gpu_enroot.sbatch
#!/bin/bash\n#SBATCH --partition=gpu          # partition to submit the job to\n#SBATCH --nodes=2                # number of nodes for the job\n#SBATCH --gpus-per-node=1        # number of GPUs per node\n#SBATCH --mem=32GB               # memory per node\n#SBATCH --time=00:30:00          # total run time limit (HH:MM:SS)\n#SBATCH --mail-type=begin        # send email when job begins\n#SBATCH --mail-type=end          # send email when job ends\n\ncd $SCRATCH/qe_benchmarks\ncd AUSURF112\n\nsrun --container-image nvcr.io/hpc/quantum_espresso:qe-7.0 \\\n     --container-workdir $PWD \\\n     pw.x -input ausurf.in -npool 2\n

and submit it:

$ sbatch qe-bench_gpu_enroot.sbatch\n
","location":"docs/software/using/quantum-espresso/#with-pyxisenroot"},{"title":"Rclone","text":"","location":"docs/software/using/rclone/"},{"title":"Introduction","text":"

If you need to sync files between cloud storage and Sherlock, rclone is a command line program that can help. You can easily use it to transfer files from a cloud storage provider to Sherlock or Oak, or vice versa. The following tutorial walks through transferring files between Google Drive and Oak storage.

","location":"docs/software/using/rclone/#introduction"},{"title":"More documentation","text":"

For more information on running rclone, please see the official documentation.

","location":"docs/software/using/rclone/#more-documentation"},{"title":"Setup","text":"","location":"docs/software/using/rclone/#setup"},{"title":"rclone config","text":"

Before transferring data for the first time, you will need to configure rclone so that it can access your Google Drive. This will require use of your browser, so you will need to connect to Sherlock with local port forwarding (ssh -L). You only need to do this when you are configuring rclone for the first time.

When running rclone config you will be prompted to enter names and values, indicated by the > symbol. To leave it empty, press Enter.

# Connect to Sherlock with local port forwarding\n$ ssh -L localhost:53682:localhost:53682 <SUNetID>@login.sherlock.stanford.edu\n\n\n# Load the rclone module\n$ ml system rclone\n\n\n# Run the rclone configuration tool\n$ rclone config\n\nNo remotes found, make a new one?\nn) New remote\ns) Set configuration password\nq) Quit config\nn/s/q> n\n\nEnter name for new remote.\nname> gdrive\n\nOption Storage.\nType of storage to configure.\nChoose a number from below, or type in your own value.\n 1 / 1Fichier\n   \\ (fichier)\n 2 / Akamai NetStorage\n   \\ (netstorage)\n       ...\n18 / Google Drive\n   \\ (drive)\n       ...\n48 / premiumize.me\n   \\ (premiumizeme)\n49 / seafile\n   \\ (seafile)\nStorage> drive\n\nOption client_id.\nGoogle Application Client Id\n...\nEnter a value. Press Enter to leave empty.\nclient_id>\n\nOption client_secret.\nOAuth Client Secret.\nLeave blank normally.\nEnter a value. Press Enter to leave empty.\nclient_secret>\n\nOption scope.\nScope that rclone should use when requesting access from drive.\nChoose a number from below, or type in your own value.\nPress Enter to leave empty.\n 1 / Full access all files, excluding Application Data Folder.\n   \\ (drive)\n...\nscope> 1\n\nOption service_account_file.\nService Account Credentials JSON file path.\nLeave blank normally.\n...\nEnter a value. Press Enter to leave empty.\nservice_account_file>\n\nEdit advanced config?\ny) Yes\nn) No (default)\ny/n> n\n\nUse auto config?\n * Say Y if not sure\n * Say N if you are working on a remote or headless machine\n\ny) Yes (default)\nn) No\ny/n> y\n\n2023/09/12 10:51:55 NOTICE: If your browser doesn't open automatically go to the\nfollowing link: http://127.0.0.1:53682/auth?state=#################\n2023/09/12 10:51:55 NOTICE: Log in and authorize rclone for access\n2023/09/12 10:51:55 NOTICE: Waiting for code...\n

At this point, you can copy and paste the provided link into your browser. You will be asked to confirm that you want to allow rclone to access your files. Once you have successfully done so, you can complete the configuration in the terminal.

Configure this as a Shared Drive (Team Drive)?\n\ny) Yes\nn) No (default)\ny/n> n\n\nConfiguration complete.\nOptions:\n...\nKeep this \"gdrive\" remote?\ny) Yes this is OK (default)\ne) Edit this remote\nd) Delete this remote\ny/e/d> y\n\nCurrent remotes:\n\nName                 Type\n====                 ====\ngdrive               drive\n\ne) Edit existing remote\nn) New remote\nd) Delete remote\nr) Rename remote\nc) Copy remote\ns) Set configuration password\nq) Quit config\ne/n/d/r/c/s/q> q\n
","location":"docs/software/using/rclone/#rclone-config"},{"title":"Examples","text":"","location":"docs/software/using/rclone/#examples"},{"title":"rclone copy","text":"

To transfer data between cloud storage and Sherlock or Oak, you can use the rclone copy command.

# Start an interactive dev session\n$ sh_dev\n\n# Load the rclone module\n$ ml system rclone\n\n# Copy a folder from Google Drive to Oak\n$ rclone copy gdrive:<folder name> /oak/stanford/groups/<group_name>/<folder name>\n\n# Copy a single file from Oak to Google Drive\n$ rclone copy /oak/stanford/groups/<group name>/<file name> gdrive:\n
","location":"docs/software/using/rclone/#rclone-copy"},{"title":"rclone ls/lsd","text":"

To view the files and folders in your cloud storage, you can use the rclone ls and rclone lsd commands, respectively.

# Load the rclone module\n$ ml system rclone\n\n# List all top-level directories in Google Drive\n$ rclone lsd gdrive: --max-depth 1\n\n# List all files in a directory\n$ rclone ls gdrive:<folder name>\n\n# List all files on Google Drive (including those in folders)\n$ rclone ls gdrive:\n
","location":"docs/software/using/rclone/#rclone-lslsd"},{"title":"Schr\u00f6dinger","text":"","location":"docs/software/using/schrodinger/"},{"title":"Introduction","text":"

The Schr\u00f6dinger suite is a commercial and licensed software used to simulate and model molecular behavior at the atomic level. The Schr\u00f6dinger software tools include molecular dynamics simulations, quantum mechanics calculations, virtual screening and visualization tools.

","location":"docs/software/using/schrodinger/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using Schr\u00f6dinger on Sherlock. For more complete documentation about Schr\u00f6dinger in general, please contact Schr\u00f6dinger support.

","location":"docs/software/using/schrodinger/#more-documentation"},{"title":"Schr\u00f6dinger on Sherlock","text":"","location":"docs/software/using/schrodinger/#schrodinger-on-sherlock"},{"title":"Licensing","text":"

Stanford Libraries have purchased a site license for the Schr\u00f6dinger suite. Please contact Stanford Libraries at sciencelibrary@stanford.edu and CC srcc-support@stanford.edu if you would like to access Schr\u00f6dinger on Sherlock: after we receive confirmation, your PI group will be granted access on Sherlock.

","location":"docs/software/using/schrodinger/#licensing"},{"title":"Using Schr\u00f6dinger","text":"

You can use Schr\u00f6dinger software after having loaded the corresponding software module with the module command. To load the current default version:

module load chemistry schrodinger\n

To see all the available versions, you can use the module spider command:

$ module spider schrodinger\n

Once loaded, the $SCHRODINGER environment variable is automatically set to allow all Schr\u00f6dinger commands to run. For example, to run the jaguar command:

$ jaguar run -WAIT H20.in\n

To call the basic Schr\u00f6dinger run command, just enter:

$ run\n

or glide:

$ glide\nusage: glide_startup.py [options] <input_file>\nglide_startup.py: error: the following arguments are required: input_file\n
","location":"docs/software/using/schrodinger/#using-schrodinger"},{"title":"Maestro GUI","text":"

OnDemand shell sessions

Opening an X11/GUI session will not work in a Sherlock OnDemand terminal session. You will need to use the method mentioned below, i.e. a standard terminal session with an X11 client.

To launch the Maestro GUI, once you have loaded the Schr\u00f6dinger module, simply run:

$ maestro\n

You'll need to enable X11 forwarding in your initial connection to Sherlock, and request it as well for your job allocation.

Here are some example commands you can run:

# on your local machine\n$ ssh -X login.sherlock.stanford.edu\n\n# then from a Sherlock login node\n$ sh_dev -m 16GB\n\n# and finally on the allocated compute node:\n$ ml load chemistry schrodinger\n$ maestro\n

This will launch Maestro on a compute node and display its graphical user interface on your local machine's display.

GUI performance

Please note that running graphical user interfaces (GUIs) over the network via X11 over SSH may not necessarily yield the best performance. Graphical analysis is often best done on a local machine, while intensive, batch scheduled computations are carried out on the cluster.

For more information about X11 forwarding, you can refer to this page.

","location":"docs/software/using/schrodinger/#maestro-gui"},{"title":"Examples","text":"","location":"docs/software/using/schrodinger/#examples"},{"title":"batch job submission","text":"

Here's an example batch script, requesting 1 CPU, for 10 minutes on the normal partition, that can be saved as water.sbatch:

#!/usr/bin/bash\n#SBATCH -o water.%j.out\n#SBATCH -e water.%j.err\n#SBATCH -n 1\n#SBATCH -t 10:00\n#SBATCH -p normal\n\n# Load required modules\nmodule load chemistry schrodinger\n\n# Run Schr\u00f6dinger, -WAIT is often required\njaguar run -WAIT H20.in\n

Save this input file as H20.in (the file name referenced by the batch script above):

&gen\n&\n&echo\n&\n&zmat\nO       0.0000000000000   0.0000000000000  -0.1135016000000\nH1      0.0000000000000   0.7531080000000   0.4540064000000\nH2      0.0000000000000  -0.7531080000000   0.4540064000000\n&\n

And you can submit the batch script with:

$ sbatch water.sbatch\n

After execution, you should find a H20.out output file in the current directory, as well as a log file (H20.log). If you don't, you can check for errors in the job output and error files: water.<jobid>.{out,err}.

","location":"docs/software/using/schrodinger/#batch-job-submission"},{"title":"Singularity","text":"

Singularity is a tool for running containers on HPC systems, similar to Docker.

","location":"docs/software/using/singularity/"},{"title":"Introduction","text":"

Containers are a solution to the problem of how to get software to run reliably when moved from one computing environment to another. They also resolve installation problems by packaging all the dependencies of an application within a self-contained image, a.k.a. a container.

What's a container?

Put simply, a container consists of an entire runtime environment: an application, plus all its dependencies, libraries and other binaries, and configuration files needed to run it, bundled into one package. By containerizing the application platform and its dependencies, differences in OS distributions and underlying infrastructure are abstracted away.

","location":"docs/software/using/singularity/#introduction"},{"title":"Why not Docker?","text":"

Docker has long been the reference and the most popular container framework in DevOps and Enterprise IT environments, so why not use Docker on Sherlock? Well, for a variety of technical reasons, mostly related to security.

Docker has never been designed nor developed to run in multi-tenant environments, and even less on HPC clusters. Specifically:

  • Docker requires a daemon running as root on all of the compute nodes, which has serious security implications,
  • all authenticated actions (such as login, push ...) are also executed as root, meaning that multiple users can't use those functions on the same node,
  • Docker uses cgroups to isolate containers, as does the Slurm scheduler, which uses cgroups to allocate resources to jobs and enforce limits. Those uses are unfortunately conflicting.
  • but most importantly, allowing users to run Docker containers will give them root privileges inside that container, which will in turn let them access any of the cluster's filesystems as root. This opens the door to user impersonation, inappropriate file tampering or stealing, and is obviously not something that can be allowed on a shared resource.

That last point is certainly the single most important reason why we won't use Docker on Sherlock.

","location":"docs/software/using/singularity/#why-not-docker"},{"title":"Why Singularity?","text":"

Singularity is Docker for HPC systems

Singularity allows running Docker containers natively, and is a perfect replacement for Docker on HPC systems such as Sherlock. That means that existing Docker containers can be directly imported and natively run with Singularity.

Despite Docker's shortcomings on HPC systems, the appeal of containers for scientific computing is undeniable, which is why we provide Singularity on Sherlock. Singularity is an alternative container framework, especially designed to run scientific applications on HPC clusters.

Singularity provides the same functionalities as Docker, without any of the drawbacks listed above. Using a completely different implementation, it doesn't require any privilege to run containers, and allows direct interaction with existing Docker containers.

The main motivation to use Singularity over Docker is the fact that it's been developed with HPC systems in mind, to solve those specific problems:

  • security: a user in the container is the same user as the one running the container, so no privilege escalation possible,
  • ease of deployment: no daemon running as root on each node, a container is simply an executable,
  • no need to mount filesystems or do bind mappings to access devices,
  • ability to run MPI jobs based on containers,
  • and more...
","location":"docs/software/using/singularity/#why-singularity"},{"title":"More documentation","text":"

The following documentation is specifically intended for using Singularity on Sherlock. For more complete documentation about building and running containers with Singularity, please see the Singularity documentation.

","location":"docs/software/using/singularity/#more-documentation"},{"title":"Singularity on Sherlock","text":"

As announced during the SC'18 Supercomputing Conference, Singularity is an integral part of the Sherlock cluster, and Singularity commands can be executed natively on any login or compute node, without the need to load any additional module.

","location":"docs/software/using/singularity/#singularity-on-sherlock"},{"title":"Importing containers","text":"

Pre-built containers can be obtained from a variety of sources. For instance:

  • DockerHub contains containers for various software packages, which can be directly used with Singularity,
  • SingularityHub is a registry for scientific linux containers,
  • the NVIDIA GPU Cloud registry for GPU-optimized containers,
  • many individual projects contain specific instructions for installation via Docker and/or Singularity, and may provide pre-built images in other locations.

To illustrate how Singularity can import and run Docker containers, here's an example of how to install and run the OpenFOAM CFD solver using Singularity. OpenFOAM can be quite difficult to install manually, but Singularity makes it very easy.

Interactive or batch usage

This example shows how to use Singularity interactively, but Singularity containers can be run in batch jobs as well.

The first step is to request an interactive shell, and to load the singularity module. Singularity images can be pulled directly from the compute nodes, and Singularity uses multiple CPU cores when assembling the image, so requesting multiple cores in your job can make the pull operation faster:

$ srun -c 4 --pty bash\n

We recommend storing Singularity images in $GROUP_HOME, as container images can take significant space in your $HOME directory.

$ mkdir -p $GROUP_HOME/$USER/simg\n$ cd $GROUP_HOME/$USER/simg\n

Then, the OpenFOAM container could be pulled directly from DockerHub by Singularity. This can take a moment to complete:

$ singularity pull docker://openfoam/openfoam6-paraview54\nDocker image path: index.docker.io/openfoam/openfoam6-paraview54:latest\nCache folder set to /scratch/users/kilian/.singularity/docker\nImporting: base Singularity environment\nExploding layer: sha256:1be7f2b886e89a58e59c4e685fcc5905a26ddef3201f290b96f1eff7d778e122.tar.gz\n[...]\nBuilding Singularity image...\nSingularity container built: ./openfoam6-paraview54.simg\nCleaning up...\nDone. Container is at: ./openfoam6-paraview54.simg\n
","location":"docs/software/using/singularity/#importing-containers"},{"title":"Running containers","text":"

Once the image is downloaded, you are ready to run OpenFOAM from the container. The singularity shell command can be used to start the container, and run a shell within that image:

By default on Sherlock, all the filesystems that are available on the compute node will also be available in the container. If you want to start your shell in a specific directory, you can use the --pwd /path/ option. For instance, we'll create a /tmp/openfoam_test/ directory to store our test results (that will be wiped out at the end of the job), and start the container shell there:

$ mkdir /tmp/openfoam_test\n$ singularity shell --pwd /tmp/openfoam_test openfoam6-paraview54.simg\nSingularity: Invoking an interactive shell within container...\nSingularity openfoam6-paraview54.simg:/tmp/openfoam_test>\n

You're now in the container, as denoted by the shell prompt (Singularity[...].simg:[path]>), which is different from the prompt displayed on the compute node (which usually looks like [login]@[compute_node] [path]$).

OpenFOAM provides a convenience script that can be sourced to make OpenFOAM commands directly accessible and set a few useful environment variables:

> source /opt/openfoam6/etc/bashrc\n

Now, we can run a simple example using OpenFOAM:

> cp -r $FOAM_TUTORIALS/incompressible/simpleFoam/pitzDaily .\n> cd pitzDaily\n> blockMesh\n[...]\nEnd\n\n> simpleFoam\n/*---------------------------------------------------------------------------*\\\n  =========                 |\n  \\\\      /  F ield         | OpenFOAM: The Open Source CFD Toolbox\n   \\\\    /   O peration     | Website:  https://openfoam.org\n    \\\\  /    A nd           | Version:  6\n     \\\\/     M anipulation  |\n\\*---------------------------------------------------------------------------*/\nBuild  : 6-1a0c91b3baa8\nExec   : simpleFoam\nDate   : Oct 05 2018\nTime   : 23:37:30\nHost   : \"sh01-06n33.int\"\nPID    : 14670\nI/O    : uncollated\nCase   : /tmp/openfoam_test/pitzDaily\nnProcs : 1\nsigFpe : Enabling floating point exception trapping (FOAM_SIGFPE).\nfileModificationChecking : Monitoring run-time modified files using timeStampMaster (fileModificationSkew 10)\nallowSystemOperations : Allowing user-supplied system call operations\n\n// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //\nCreate time\n[...]\nSIMPLE solution converged in 288 iterations\n\nstreamLine streamlines write:\n    seeded 10 particles\n    Tracks:10\n    Total samples:11980\n    Writing data to \"/tmp/openfoam_test/pitzDaily/postProcessing/sets/streamlines/288\"\nEnd\n\n>\n

When the simulation is done, you can exit the container with:

> exit\n

Because the container can see all the compute node's filesystems, the simulation output will be available in /tmp/openfoam_test after you exit the container:

$ ls /tmp/openfoam_test/pitzDaily/postProcessing/\nsets\n
","location":"docs/software/using/singularity/#running-containers"},{"title":"GPU-enabled containers","text":"

Sherlock also supports the use of container images provided by NVIDIA in the NVIDIA GPU Cloud (NGC). This registry provides GPU-accelerated containers for the most popular HPC and deep-learning scientific applications.

GPU support

Containers provided on NGC are only supported on Pascal and Volta architectures (TITAN Xp, Tesla P40, P100 or V100). For GPUs from the previous generations (GTX TITAN Black/X, Tesla K20/K80), things may or may not work.

We recommend making sure to select a supported GPU generation by adding the following directive to your batch script when submitting a job to run GPU-enabled containers from NGC:

#SBATCH -C \"GPU_GEN:PSC|GPU_GEN:VLT\"\n

","location":"docs/software/using/singularity/#gpu-enabled-containers"},{"title":"Pulling NGC images","text":"

As before, we start by requesting an interactive shell with multiple CPU cores, loading the Singularity module and moving to the directory where we'll save those images:

$ srun -c 4 --pty bash\n$ cd $GROUP_HOME/simg\n

A GPU is not required for pulling GPU-enabled containers

GPU-enabled containers can be pulled on any node, including nodes without a GPU. But their execution requires a GPU and thus, they need to be executed within a GPU job. See the GPU job section for more information.

To be able to pull an image from NGC, authentication credentials must be set. Users need to register and create an NGC API key; complete details can be found in the NGC Getting Started Guide.

You can then set the following environment variable to allow Singularity to authenticate with NGC:

$ export SINGULARITY_DOCKER_USERNAME='$oauthtoken'\n$ export SINGULARITY_DOCKER_PASSWORD=<NVIDIA NGC API key>\n

Note

The SINGULARITY_DOCKER_USERNAME environment variable must be set to the literal $oauthtoken string, for every user. It should not be replaced by anything else. Only the API key is specific to each user.

Once credentials are set in the environment, container images can be pulled from the NGC registry normally.

The general form of the Singularity command used to pull NGC containers is: $ singularity pull docker://nvcr.io/<registry>/<app:tag>

For example to pull the NAMD NGC container tagged with version 2.12-171025 the corresponding command would be:

$ singularity pull docker://nvcr.io/hpc/namd:2.12-171025\n

After this command has finished, we'll have a Singularity image file in the current directory, named namd-2.12-171025.simg.

","location":"docs/software/using/singularity/#pulling-ngc-images"},{"title":"Running NGC containers","text":"

Instructions about running NGC containers are provided on the NGC website, under each application:

Each application comes with specific running instructions, so we recommend following the container's particular guidelines before running it with Singularity.

Containers that lack Singularity documentation have not been tested with Singularity.

Since all NGC containers are optimized for GPU acceleration, they will always be executed with the --nv Singularity option, to enable GPU support within the container.

We also need to submit a job requesting a GPU to run GPU-enabled containers. For instance:

$ srun -p gpu -c 4 --gres gpu:1 --pty bash\n

This will start an interactive shell on a GPU node, with 4 CPU cores and 1 GPU.

The NAMD container that was pulled just before can now be started with the following commands. We start by creating a temporary directory to hold the execution results, and start the container using this as the current directory:

$ mkdir /tmp/namd_test\n$ singularity shell --nv --pwd /tmp/namd_test $GROUP_HOME/simg/namd-2.12-171025.simg\nSingularity: Invoking an interactive shell within container...\n\nSingularity namd-2.12-171025.simg:/tmp/namd_test>\n

From there, we can run a NAMD test to verify that everything is working as expected.

> cp -r /workspace/examples .\n> /opt/namd/namd-multicore +p4 +idlepoll examples/apoa1/apoa1.namd\nCharm++: standalone mode (not using charmrun)\nCharm++> Running in Multicore mode:  4 threads\nCharm++> Using recursive bisection (scheme 3) for topology aware partitions\nConverse/Charm++ Commit ID: v6.8.2\n[...]\nInfo: Built with CUDA version 9000\nDid not find +devices i,j,k,... argument, using all\nPe 1 physical rank 1 will use CUDA device of pe 2\nPe 3 physical rank 3 will use CUDA device of pe 2\nPe 0 physical rank 0 will use CUDA device of pe 2\nPe 2 physical rank 2 binding to CUDA device 0 on sh02-14n13.int: 'TITAN Xp'  Mem: 12196MB  Rev: 6.1\nInfo: NAMD 2.12 for Linux-x86_64-multicore-CUDA\n[...]\nInfo: SIMULATION PARAMETERS:\nInfo: TIMESTEP               1\n[...]\nENERGY:    2000     20247.5090     20325.4554      5719.0088       183.9328        -340639.3103     25366.3986         0.0000         0.0000     46364.9951        -222432.0107       168.6631   -268797.0057   -222054.5175       168.8733          -1129.9509     -1799.6459    921491.4634     -2007.8380     -2007.4145\n\nWRITING EXTENDED SYSTEM TO OUTPUT FILE AT STEP 2000\nWRITING COORDINATES TO OUTPUT FILE AT STEP 2000\nThe last position output (seq=-2) takes 0.001 seconds, 559.844 MB of memory in use\nWRITING VELOCITIES TO OUTPUT FILE AT STEP 2000\nThe last velocity output (seq=-2) takes 0.001 seconds, 559.844 MB of memory in use\n====================================================\n\nWallClock: 17.593451  CPUTime: 17.497925  Memory: 559.843750 MB\n[Partition 0][Node 0] End of program\n

The simulation should take a few seconds to run. You can verify that it correctly executed on a GPU in the output above. When it's done, you can exit the container with:

> exit\n

Because the container can see all the compute node's filesystems, the simulation output will be available in /tmp/namd_test after you exit the container:

$ cd /tmp/namd_test/examples/apoa1/\n$ ls apoa1-out*\napoa1-out.coor  apoa1-out.vel  apoa1-out.xsc\n
","location":"docs/software/using/singularity/#running-ngc-containers"},{"title":"Building your own containers","text":"

Building Singularity containers requires root privileges, and as such, cannot be done on Sherlock directly.

If you need to modify existing containers or build your own from scratch, the recommended workflow is to prepare and build your containers on your local Linux machine (it could either be a workstation, a laptop or a virtual machine), transfer the resulting container image to Sherlock, and run it there.

For complete details about how to build Singularity containers, please refer to the Singularity documentation.

  1. For more information about using modules on Sherlock, please see the software modules documentation.\u00a0\u21a9

","location":"docs/software/using/singularity/#building-your-own-containers"},{"title":"Spark","text":"","location":"docs/software/using/spark/"},{"title":"Introduction","text":"

Apache Spark\u2122 is a general engine for large-scale data processing. This document gives a quick introduction on how to get a first test program in Spark running on Sherlock.

","location":"docs/software/using/spark/#introduction"},{"title":"More documentation","text":"

The following documentation is specifically intended for using Spark on Sherlock. For more complete documentation about Spark in general, please see the Apache Spark documentation.

","location":"docs/software/using/spark/#more-documentation"},{"title":"Spark on Sherlock","text":"

Running Apache Spark on Sherlock is a bit different from using a traditional Spark/Hadoop cluster in that it requires some level of integration with the scheduler. In a sense, the computing resources (memory and CPU) need to be allocated twice. First, sufficient resources for the Spark application need to be allocated via Slurm; and secondly, spark-submit resource allocation flags need to be properly specified.

In order to use Spark, three steps have to be kept in mind when submitting a job to the queuing system:

  1. a new Spark cluster has to be started on the allocated nodes
  2. once the Spark cluster is up and running, Spark jobs have to be submitted to the cluster
  3. after all Spark jobs have finished running, the cluster has to be shut down

The following scripts show how to implement these three steps, and use the Pi Monte-Carlo calculation as an example.

","location":"docs/software/using/spark/#spark-on-sherlock"},{"title":"Single-node job","text":"

In this example, all the Spark processes run on the same compute node, which makes for a fairly simple sbatch script. The following example will start an 8-core job on a single node, and run a Spark task within that allocation:

#!/bin/bash\n\n#SBATCH --job-name=spark_singlenode\n#SBATCH --nodes=1\n#SBATCH --cpus-per-task=8\n#SBATCH --time=10\n\nmodule load spark\n\n# This syntax tells spark to use all cpu cores on the node.\nexport MASTER=\"local[*]\"\n\n# This is a Scala example\nrun-example SparkPi 1000\n\n# This is a Python example.\nspark-submit --master $MASTER $SPARK_HOME/examples/src/main/python/pi.py 1000\n
","location":"docs/software/using/spark/#single-node-job"},{"title":"Multi-node job","text":"

To start a Spark cluster and run a task on multiple nodes, more preliminary steps are necessary. Here's an example script that will span 2 nodes, start 2 Spark workers on each node, and allow each worker to use 8 cores:

#!/bin/bash\n#SBATCH --nodes=2\n#SBATCH --mem-per-cpu=4G\n#SBATCH --cpus-per-task=8\n#SBATCH --ntasks-per-node=2\n#SBATCH --output=sparkjob-%j.out\n\n## --------------------------------------\n## 0. Preparation\n## --------------------------------------\n\n# load the Spark module\nmodule load spark\n\n# identify the Spark cluster with the Slurm jobid\nexport SPARK_IDENT_STRING=$SLURM_JOBID\n\n# prepare directories\nexport SPARK_WORKER_DIR=${SPARK_WORKER_DIR:-$HOME/.spark/worker}\nexport SPARK_LOG_DIR=${SPARK_LOG_DIR:-$HOME/.spark/logs}\nexport SPARK_LOCAL_DIRS=${SPARK_LOCAL_DIRS:-/tmp/spark}\nmkdir -p $SPARK_LOG_DIR $SPARK_WORKER_DIR\n\n## --------------------------------------\n## 1. Start the Spark cluster master\n## --------------------------------------\n\nstart-master.sh\nsleep 1\nMASTER_URL=$(grep -Po '(?=spark://).*' \\\n             $SPARK_LOG_DIR/spark-${SPARK_IDENT_STRING}-org.*master*.out)\n\n## --------------------------------------\n## 2. Start the Spark cluster workers\n## --------------------------------------\n\n# get the resource details from the Slurm job\nexport SPARK_WORKER_CORES=${SLURM_CPUS_PER_TASK:-1}\nexport SPARK_MEM=$(( ${SLURM_MEM_PER_CPU:-4096} * ${SLURM_CPUS_PER_TASK:-1} ))M\nexport SPARK_DAEMON_MEMORY=$SPARK_MEM\nexport SPARK_WORKER_MEMORY=$SPARK_MEM\nexport SPARK_EXECUTOR_MEMORY=$SPARK_MEM\n\n# start the workers on each node allocated to the tjob\nexport SPARK_NO_DAEMONIZE=1\nsrun  --output=$SPARK_LOG_DIR/spark-%j-workers.out --label \\\n      start-slave.sh ${MASTER_URL} &\n\n## --------------------------------------\n## 3. Submit a task to the Spark cluster\n## --------------------------------------\n\nspark-submit --master ${MASTER_URL} \\\n             --total-executor-cores $((SLURM_NTASKS * SLURM_CPUS_PER_TASK)) \\\n             $SPARK_HOME/examples/src/main/python/pi.py 10000\n\n## --------------------------------------\n## 4. 
Clean up\n## --------------------------------------\n\n# stop the workers\nscancel ${SLURM_JOBID}.0\n\n# stop the master\nstop-master.sh\n
","location":"docs/software/using/spark/#multi-node-job"},{"title":"Storage on Sherlock","text":"

Sherlock provides access to several file systems, each with distinct storage characteristics. Each user and PI group get access to a set of pre-defined directories in these file systems to store their data.

Sherlock is a compute cluster, not a storage system

Sherlock's storage resources are limited and are shared among many users. They are meant to store data and code associated with projects for which you are using Sherlock's computational resources. This space is for work actively being computed on with Sherlock, and should not be used as a target for backups from other systems.

If you're looking for a long-term storage solution for research data, SRCC offers the Oak storage system, which is specifically intended for this usage.

Those file systems are shared with other users, and are subject to quota limits and for some of them, purge policies (time-residency limits).

","location":"docs/storage/"},{"title":"Filesystem overview","text":"","location":"docs/storage/#filesystem-overview"},{"title":"Features and purpose","text":"Name Type Backups / Snapshots Performance Purpose Cost $HOME, $GROUP_HOME NFS / low small, important files (source code, executables, configuration files...) free $SCRATCH, $GROUP_SCRATCH Lustre / high bandwidth large, temporary files (checkpoints, raw application output...) free $L_SCRATCH local SSD / low latency, high IOPS job specific output requiring high IOPS free $OAK Lustre option / moderate long term storage of research data volume-based1","location":"docs/storage/#features-and-purpose"},{"title":"Access scope","text":"Name Scope Access sharing level $HOME cluster user $GROUP_HOME cluster group $SCRATCH cluster user $GROUP_SCRATCH cluster group $L_SCRATCH compute node user $OAK cluster (optional, purchase required) group

Group storage locations are typically shared between all the members of the same PI group. User locations are only accessible by the user.

","location":"docs/storage/#access-scope"},{"title":"Quotas and limits","text":"

Volume and inodes

Quotas are applied on both volume (the amount of data stored in bytes) and inodes: an inode (index node) is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory. In practice, each filesystem entry (file, directory, link) counts as an inode.

Name Quota type Volume quota Inode quota Retention $HOME directory 15 GB n/a $GROUP_HOME directory 1 TB n/a $SCRATCH directory 100 TB 20 million time limited $GROUP_SCRATCH directory 100 TB 20 million time limited $L_SCRATCH n/a n/a n/a job lifetime $OAK directory amount purchased function of the volume purchased

Quota types:

  • directory: based on file location, and accounts for all the files that are in a given directory.
  • user: based on file ownership, and accounts for all the files that belong to a given user.
  • group: based on file ownership, and accounts for all the files that belong to a given group.

Retention types:

  • unlimited (no value shown in the Retention column): files are kept as long as the user account exists on Sherlock.
  • time limited: files are kept for a fixed length of time after they've been last modified. Once the limit is reached, files expire and are automatically deleted.
  • job lifetime: files are only kept for the duration of the job and are automatically purged when the job ends.

Global failsafe user and group quotas on /scratch

To prevent potential issues which would result in the file system filling up completely and making it unusable for everyone, additional user and group-level quotas are in place on the /scratch file system, as a failsafe:

  • a user will not be able to use more than 250 TB (50M inodes) in total, in all the /scratch directories they have access to.

  • a group will not be able to use more than 1 PB (200M inodes) in total across all the /scratch directories its group members have access to.

","location":"docs/storage/#quotas-and-limits"},{"title":"Checking quotas","text":"

To check your quota usage on the different filesystems you have access to, you can use the sh_quota command:

$ sh_quota\n+---------------------------------------------------------------------------+\n| Disk usage for user kilian (group: ruthm)                                 |\n+---------------------------------------------------------------------------+\n|   Filesystem |  volume /   limit                  | inodes /  limit       |\n+---------------------------------------------------------------------------+\n          HOME |   9.4GB /  15.0GB [||||||     62%] |      - /      - (  -%)\n    GROUP_HOME | 562.6GB /   1.0TB [|||||      56%] |      - /      - (  -%)\n       SCRATCH |  65.0GB / 100.0TB [            0%] | 143.8K /  20.0M (  0%)\n GROUP_SCRATCH | 172.2GB / 100.0TB [            0%] |  53.4K /  20.0M (  0%)\n           OAK |  30.8TB / 240.0TB [|          12%] |   6.6M /  36.0M ( 18%)\n+---------------------------------------------------------------------------+\n

Several options are provided to allow listing quotas for a specific filesystem only, or in the context of a different group (for users who are members of several PI groups). Please see the sh_quota usage information for details:

$ sh_quota -h\nsh_quota: display user and group quota information for all accessible filesystems.\n\nUsage: sh_quota [OPTIONS]\n    Optional arguments:\n        -f FILESYSTEM   only display quota information for FILESYSTEM.\n                        For instance: \"-f $HOME\"\n        -g GROUP        for users with multiple group memberships, display\n                        group quotas in the context of that group\n        -n              don't display headers\n        -j              JSON output (implies -n)\n
","location":"docs/storage/#checking-quotas"},{"title":"Examples","text":"

For instance, to only display your quota usage on $HOME:

$ sh_quota -f HOME\n

If you belong to multiple groups, you can display the group quotas for your secondary groups with:

$ sh_quota -g <group_name>\n

And finally, for greater output control, an option to display quota usage in JSON is provided via the -j option:

$ sh_quota -f SCRATCH -j\n{\n  \"SCRATCH\": {\n    \"quotas\": {\n      \"type\": \"user\",\n      \"blocks\": {\n        \"usage\": \"47476660\",\n        \"limit\": \"21474836480\"\n      },\n      \"inodes\": {\n        \"usage\": \"97794\",\n        \"limit\": \"20000000\"\n      }\n    }\n  }\n}\n
","location":"docs/storage/#examples"},{"title":"Where should I store my files?","text":"

Not all filesystems are equivalent

Choosing the appropriate storage location for your files is an essential step towards making your utilization of the cluster the most efficient possible. It will make your own experience much smoother, yield better performance for your jobs and simulations, and contribute to make Sherlock a useful and well-functioning resource for everyone.

Here is where we recommend storing different types of files and data on Sherlock:

  • personal scripts, configuration files and software installations \u2192 $HOME
  • group-shared scripts, software installations and medium-sized datasets \u2192 $GROUP_HOME
  • temporary output of jobs, large checkpoint files \u2192 $SCRATCH
  • curated output of job campaigns, large group-shared datasets, archives \u2192 $OAK
","location":"docs/storage/#where-should-i-store-my-files"},{"title":"Accessing filesystems","text":"","location":"docs/storage/#accessing-filesystems"},{"title":"On Sherlock","text":"

Filesystem environment variables

To facilitate access and data management, user and group storage locations on Sherlock are identified by a set of environment variables, such as $HOME or $SCRATCH.

We strongly recommend using those variables in your scripts rather than explicit paths, to facilitate transition to new systems for instance. By using those environment variables, you'll be sure that your scripts will continue to work even if the underlying filesystem paths change.

To see the contents of these variables, you can use the echo command. For instance, to see the absolute path of your $SCRATCH directory:

$ echo $SCRATCH\n/scratch/users/kilian\n

Or for instance, to move to your group-shared home directory:

$ cd $GROUP_HOME\n
","location":"docs/storage/#on-sherlock"},{"title":"From other systems","text":"

External filesystems cannot be mounted on Sherlock

For a variety of security, manageability and technical considerations, we can't mount external filesystems nor data storage systems on Sherlock. The recommended approach is to make Sherlock's data available on external systems.

You can mount any of your Sherlock directories on any external system you have access to by using SSHFS. For more details, please refer to the Data Transfer page.

  1. For more information about Oak, its characteristics and cost model, please see the Oak Service Description page.\u00a0\u21a9

","location":"docs/storage/#from-other-systems"},{"title":"Data protection","text":"

Data protection is mostly a task for the user

Except for $HOME and $GROUP_HOME, data on Sherlock is not backed up, nor archived. It's up to each user and group to make sure they maintain multiple copies of their data if needed.

","location":"docs/storage/data-protection/"},{"title":"Snapshots","text":"

File system snapshots represent the state of the file system at a particular point in time. They allow accessing the file system contents as they were at different times in the past, and getting back data that may have been deleted or modified since the snapshot was taken.

Important

Snapshots are only available on $HOME and $GROUP_HOME.

","location":"docs/storage/data-protection/#snapshots"},{"title":"Accessing snapshots","text":"

Snapshots taken in $HOME and $GROUP_HOME are accessible in a .snapshot directory at any level of the hierarchy. Those .snapshot directories don't appear when listing directory contents with ls, but they can be listed explicitly or accessed with cd:

$ cd $HOME\n$ ls -ald .snapshot/users*\n[...]\ndrwx------ 118 sunetid group  6680 Jul 21 11:16 .snapshot/users.daily.20170721\ndrwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.daily.20170722\ndrwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.daily.20170723\ndrwx------ 118 sunetid group  6702 Jul 24 10:57 .snapshot/users.daily.20170724\ndrwx------ 118 sunetid group  6702 Jul 24 10:57 .snapshot/users.daily.latest\ndrwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-16:00\ndrwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-17:00\ndrwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-18:00\ndrwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-19:00\ndrwx------ 118 sunetid group  6702 Jul 21 16:19 .snapshot/users.hourly.20170722-20:00\n[...]\n$ cd .snapshot/users.daily.latest\n

For instance:

  • the $HOME/.snapshot/users.daily.latest directory is the latest daily snapshot available, and stores the contents of the $HOME directory as they were when the last daily snapshot was taken,
  • the $HOME/foo/.snapshot/users.hourly.20170722-18:00 can be used to retrieve the contents of the $HOME/foo directory as it was at 6pm on July 22nd, 2017.
","location":"docs/storage/data-protection/#accessing-snapshots"},{"title":"Restoring from a snapshot","text":"

If you deleted a file or modified it and want to restore an earlier version, you can simply copy the file from its saved version in the appropriate snapshot.

Examples:

  • to restore the last known version of $HOME/foo/bar:

    $ cp $HOME/foo/.snapshot/users.hourly.latest/bar $HOME/foo/bar\n

    or

    $ cp $HOME/.snapshot/users.hourly.latest/foo/bar $HOME/foo/bar\n

    (both commands are equivalent)

  • to restore your ~/.bashrc file from 2 days ago:

    $ SNAP_DATE=$(date +%Y%m%d -d \"2 days ago\")\n$ cp $HOME/.snapshot/users.daily.${SNAP_DATE}/.bashrc $HOME/.bashrc\n
","location":"docs/storage/data-protection/#restoring-from-a-snapshot"},{"title":"Snapshot policy","text":"

The current1 policy is to take snapshots on an hourly, daily and weekly basis. Older snapshots automatically expire after their retention period. The snapshot policy applies to both $HOME and $GROUP_HOME storage spaces.

Snapshot frequency Retention period Number of snapshots hourly 2 days 48 daily 1 week 7 weekly 1 month 4

The shortest interval between snapshots is an hour. That means that if you create a file and then delete it within the hour, it won't appear in snapshots, and you won't be able to restore it.

If a file exists for more than an hour, and is then deleted, it will be present in the hourly snapshots for the next 48 hours, and you'll be able to retrieve it during that period. Similarly, if a file exists for more than a day, it could be restored for up to 7 days.

Snapshots don't count towards your quota.

Snapshots, as well as the entire filesystem, are replicated to an off-site system, to ensure that data could be retrieved even in case of a catastrophic failure of the whole system or datacenter-level disaster.

","location":"docs/storage/data-protection/#snapshot-policy"},{"title":"Backups","text":"

Although the SRCC doesn't offer any backup service per se, we do provide all the tools required to transfer data in and out of Sherlock.

Suggested options to backup your data include:

  • Oak, SRCC's long-term research data storage service (Recommended)
  • University IT Storage options and backup services
  • Cloud storage providers (see the Data transfer page for information about the tools we provide to transfer files to/from the cloud)
  1. The snapshot policy is subject to change and may be adjusted as the storage system usage conditions evolve.\u00a0\u21a9

","location":"docs/storage/data-protection/#backups"},{"title":"Data sharing","text":"

The following sections present and detail options to share data across users and groups on Sherlock.

","location":"docs/storage/data-sharing/"},{"title":"Sharing data locally on Sherlock","text":"","location":"docs/storage/data-sharing/#sharing-data-locally-on-sherlock"},{"title":"Traditional Unix permissions","text":"

Standard Unix file permissions are supported on Sherlock and provide read, write and execute permissions for the three distinct access classes.

The access classes are defined as follows:

  • Files and directories are owned by a user. The owner determines the file's user class. Distinct permissions apply to the owner.
  • Files and directories are assigned a group, which defines the file's group class. Distinct permissions apply to members of the file's group. The owner may be a member of the file's group.
  • Users who are not the owner, nor a member of the group, comprise a file's others class. Distinct permissions apply to others.

The following permissions apply to each class:

  • The read permission grants the ability to read a file. When set for a directory, this permission grants the ability to read the names of files in the directory, but not to find out any further information about them such as contents, file type, size, ownership, permissions.
  • The write permission grants the ability to modify a file. When set for a directory, this permission grants the ability to modify entries in the directory. This includes creating files, deleting files, and renaming files.
  • The execute permission grants the ability to execute a file. This permission must be set for executable programs, including shell scripts, in order to allow the operating system to run them. When set for a directory, this permission grants the ability to access file contents and meta-information if its name is known, but not list files inside the directory, unless read is set also.

Shared directories traversal

If you need to give access to one of your files to another user, they will at least need execute permission on each directory within the path to that file.

The effective permissions are determined based on the first class the user falls within in the order of user, group then others. For example, the user who is the owner of the file will have the permissions given to the user class regardless of the permissions assigned to the group class or others class.

While traditional Unix permissions are sufficient in most cases to share files with all the users within the same group, they are not enough to share files with a specific subset of users, or with users from other groups. Access Control Lists (ACLs) can be used for that purpose.

There are two types of ACLs supported on Sherlock depending on the underlying filesystem:

Type Filesystems NFSv4 ACLs $HOME and $GROUP_HOME POSIX ACLs $SCRATCH, $GROUP_SCRATCH, $L_SCRATCH and $OAK","location":"docs/storage/data-sharing/#traditional-unix-permissions"},{"title":"POSIX ACLs","text":"

POSIX ACLs allow you to grant or deny access to files and directories for different users (or groups), independently of the file owner or group.

Two types of POSIX ACLs can be defined:

  • Access ACLs: grant permission for a specific file or directory.
  • Default ACLs: allow to set a default set of ACLs that will be applied to any file or directory without any already defined ACL. Can only be set on directories.

ACLs are set with the setfacl command, and displayed with getfacl. For more details and examples, please refer to this documentation.

In the example below, we allow two users to access a restricted directory located at $GROUP_SCRATCH/restricted-dir/:

$ cd $GROUP_SCRATCH\n\n### Create new directory\n$ mkdir restricted-dir\n\n### Remove 'group' and 'other' access\n$ chmod g-rwx,o-rwx restricted-dir\n\n### Give user bob read and traversal permissions to the directory\n$ setfacl -m u:bob:rX restricted-dir\n\n### Use default ACLs (-d) to give user bob read access to all new\n### files and sub-directories that will be created in \"restricted-dir\"\n$ setfacl -d -m u:bob:rX restricted-dir\n\n### Give user alice read, write and traversal permissions for the directory\n$ setfacl -m u:alice:rwX restricted-dir\n\n### Use default ACLs (-d) to give user alice read and write access to all\n### new files and sub-directories\n$ setfacl -d -m u:alice:rwX restricted-dir\n\n### Show ACLs\n$ getfacl restricted-dir\n# file: restricted-dir/\n# owner: joe\n# group: grp\n# flags: -s-\nuser::rwx\nuser:bob:r-x\ngroup::---\nmask::r-x\nother::---\ndefault:user::rwx\ndefault:user:alice:rwx\ndefault:user:bob:r-x\ndefault:group::---\ndefault:mask::rwx\ndefault:other::---\n

Default permissions on $GROUP_SCRATCH

By default, the Unix permissions on the root directory $GROUP_SCRATCH don't allow read nor traversal access for others (ie. any user not part of your PI group). If you need to share files with users outside of your own group, please contact us so we can set the appropriate permissions on your folder.

For $SCRATCH, you're the owner of the directory and so you can change the permissions yourself.

","location":"docs/storage/data-sharing/#posix-acls"},{"title":"NFSv4 ACLs","text":"

$HOME and $GROUP_HOME also allow setting ACLs, albeit with different syntax and semantics than POSIX ACLs. The principle is very similar, though.

An ACL in NFSv4 is a list of rules setting permissions on files or directories. A permission rule, or Access Control Entry (ACE), is of the form type:flags:principal:permissions.

Commonly used entries for these fields are:

  • type: A (allow) or D (deny)
  • flags: g (group), d (directory-inherit), f (file-inherit), n (no-propagate-inherit), or i (inherit-only)
  • principal: a named user (user@sherlock), a group, or one of three special principals: OWNER@, GROUP@, and EVERYONE@.
  • permissions: there are 14 permission characters, as well as the shortcuts R, W, and X. Here is a list of possible permissions that can be included in the permissions field (options are Case Sensitive)
  • r read-data (files) / list-directory (directories)
  • w write-data (files) / create-file (directories)
  • x execute (files) / change-directory (directories)
  • a append-data (files) / create-subdirectory (directories)
  • t read-attributes: read the attributes of the file/directory.
  • T write-attributes: write the attributes of the file/directory.
  • n read-named-attributes: read the named attributes of the file/directory.
  • N write-named-attributes: write the named attributes of the file/directory.
  • c read-ACL: read the file/directory NFSv4 ACL.
  • C write-ACL: write the file/directory NFSv4 ACL.
  • o write-owner: change ownership of the file/directory.
  • y synchronize: allow clients to use synchronous I/O with the server.
  • d delete: delete the file/directory. Some servers will allow a delete to occur if either this permission is set in the file/directory or if the delete-child permission is set in its parent directory.
  • D delete-child: remove a file or subdirectory from within the given directory (directories only)
  • A comprehensive listing of allowable field strings is given in the manual page nfs4_acl(5)

    To see what permissions are set on a particular file, use the nfs4_getfacl command. For example, newly created file1 may have default permissions listed by ls -l as -rw-r--r--. Listing the permissions with nfs4_getfacl would display the following:

    $ nfs4_getfacl file1\nA::OWNER@:rwatTnNcCoy\nA:g:GROUP@:rtncy\nA::EVERYONE@:rtncy\n

    To set permissions on a file, use the nfs4_setfacl command. For convenience, NFSv4 provides the shortcuts R, W and X for setting read, write, and execute permissions. For example, to add write permissions for the current group on file1, use nfs4_setfacl with the -a switch:

    $ nfs4_setfacl -a A::GROUP@:W file1\n

    This command switched the GROUP@ permission field from rtncy to rwatTnNcCoy. However, be aware that NFSv4 file permission shortcuts have different meanings than the traditional Unix r, w, and x. For example, issuing chmod g+w file1 will set GROUP@ to rwatncy.

    Although the shortcut permissions can be handy, often rules need to be more customized. Use nfs4_setfacl -e file1 to open the ACL for file1 in a text editor.

    Access Control Entries allow more fine grained control over file and directory permissions than does the chmod command. For example, if user joe wants to give read, write and traverse permissions to jack for her directory private, she would issue:

    $ nfs4_setfacl -R -a A::jack@sherlock:RWX private/\n

    The -R switch recursively applies the rule to the files and directories within private/ as well.

    To allow jack to create files and subdirectories within private/ with the permissions as granted above, inheritance rules need to be applied.

    $ nfs4_setfacl -R -a A:fd:jack@sherlock:RWX private/\n

    By default, each permission is in the Deny state and an ACE is required to explicitly allow a permission. However, be aware that a server may silently override a user's ACE, usually to a less permissive setting.

    For complete documentation and examples on using NFSv4 ACLs, please see the manual page at nfs4_acl(5).

    Default permissions on $GROUP_HOME

    By default, the Unix permissions on the root directory $GROUP_HOME don't allow read nor traversal access for others (ie. any user not part of your PI group). If you need to share files with users outside of your own group, please contact us so we can set the appropriate permissions on your folder.

    For $HOME, you're the owner of the directory and so you can change the permissions yourself.

    ","location":"docs/storage/data-sharing/#nfsv4-acls"},{"title":"Sharing data outside of Sherlock","text":"

    If you'd like to share data stored on Sherlock with external collaborators, there are two possibilities:

    1. sponsor a SUNet ID1 for these collaborators, and contact us to create an account for them on Sherlock. This will grant them access to your resources on Sherlock (compute as well as storage) and give them access to your group shared files, like any other user in your group.

    2. if you don't want to grant full access to your Sherlock resources to your external collaborators, you can use the Globus data sharing feature. This won't require your collaborators to get Stanford accounts, and will allow easy sharing of the datasets of your choice.

      Globus Sharing is only available through the Oak endpoint

      Globus Sharing is only available on $OAK, using the Oak Globus Endpoint 2 (srcc#oak).

      For complete details about sharing data with Globus, please see the Globus documentation at https://docs.globus.org/how-to/share-files/

    1. a base-level SUNet ID (free) is sufficient to get an account on Sherlock. For more details about SUNet ID levels and associated services, please see the Stanford UIT SUNet IDs page.\u00a0\u21a9

    2. SUNet ID required\u00a0\u21a9

    ","location":"docs/storage/data-sharing/#sharing-data-outside-of-sherlock"},{"title":"Data transfer","text":"","location":"docs/storage/data-transfer/","tags":["connection"]},{"title":"Transfer protocols","text":"

    A number of methods allow transferring data in/out of Sherlock. For most cases, we recommend using SSH-based file transfer commands, such as scp, sftp, or rsync. They will provide the best performance for data transfers from and to campus.

    For large transfers, using DTNs is recommended

    Most casual data transfers could be done through the login nodes, by pointing your transfer tool to login.sherlock.stanford.edu. But because of resource limits on the login nodes, larger transfers may not work as expected.

    For transferring large amounts of data, Sherlock features a specific Data Transfer Node, with dedicated bandwidth, as well as a managed Globus endpoint, that can be used for scheduled, unattended data transfers.

    We also provide tools on Sherlock to transfer data to various Cloud providers, such as AWS, Google Drive, Dropbox, Box, etc.

    ","location":"docs/storage/data-transfer/#transfer-protocols","tags":["connection"]},{"title":"Prerequisites","text":"

    Most of the commands detailed below require a terminal and an SSH client1 on your local machine to launch commands.

    You'll need to start a terminal and type the given example commands at the prompt, omitting the initial $ character (it just indicates a command prompt, and should not be typed in).

    ","location":"docs/storage/data-transfer/#prerequisites","tags":["connection"]},{"title":"Host keys","text":"

    Upon your very first connection to Sherlock, you will be greeted by a warning such as:

    The authenticity of host 'login.sherlock.stanford.edu' can't be established.\nECDSA key fingerprint is SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg.\nAre you sure you want to continue connecting (yes/no)?\n

    The same warning will be displayed if you try to connect to one of the Data Transfer Nodes (DTNs):

    The authenticity of host 'dtn.sherlock.stanford.edu' can't be established.\nECDSA key fingerprint is SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg.\nAre you sure you want to continue connecting (yes/no)?\n

    This warning is normal: your SSH client warns you that it is the first time it sees that new computer. To make sure you are actually connecting to the right machine, you should compare the ECDSA key fingerprint shown in the message with one of the fingerprints below:

    Key type Key Fingerprint RSA SHA256:T1q1Tbq8k5XBD5PIxvlCfTxNMi1ORWwKNRPeZPXUfJA (legacy format: f5:8f:01:46:d1:f9:66:5d:33:58:b4:82:d8:4a:34:41) ECDSA SHA256:eB0bODKdaCWtPgv0pYozsdC5ckfcBFVOxeMwrNKdkmg (legacy format: 70:4c:76:ea:ae:b2:0f:81:4b:9c:c6:5a:52:4c:7f:64)

    If they match, you can proceed and type \u2018yes\u2019. Your SSH program will then store that key and will verify it for every subsequent SSH connection, to make sure that the server you're connecting to is indeed Sherlock.

    ","location":"docs/storage/data-transfer/#host-keys","tags":["connection"]},{"title":"Host keys warning","text":"

    If you've connected to Sherlock 1.0 before, there's a good chance the Sherlock 1.0 keys were stored by your local SSH client. In that case, when connecting to Sherlock 2.0 using the sherlock.stanford.edu alias, you will be presented with the following message:

    @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n@ WARNING: POSSIBLE DNS SPOOFING DETECTED! @\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\nThe RSA host key for sherlock.stanford.edu has changed, and the key for\nthe corresponding IP address 171.66.97.101 is unknown. This could\neither mean that DNS SPOOFING is happening or the IP address for the\nhost and its host key have changed at the same time.\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n@ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\nIT IS POSSIBLE THAT SOMEONE IS DOING SOMETHING NASTY!\nSomeone could be eavesdropping on you right now (man-in-the-middle\nattack)!  It is also possible that a host key has just been changed.\nThe fingerprint for the RSA key sent by the remote host is\nSHA256:T1q1Tbq8k5XBD5PIxvlCfTxNMi1ORWwKNRPeZPXUfJA.\nPlease contact your system administrator.\n

    You can just check that the SHA256 key listed in that warning message correctly matches the one listed in the table above, and if that's the case, you can safely remove the sherlock.stanford.edu entry from your ~/.ssh/known_hosts file with the following command on your local machine:

    $ ssh-keygen -R sherlock.stanford.edu\n

    and then connect again. You'll see the first-connection prompt mentioned above, and your SSH client will store the new keys for future connections.

    ","location":"docs/storage/data-transfer/#host-keys-warning","tags":["connection"]},{"title":"SSH-based protocols","text":"

    User name

    In all the examples below, you'll need to replace <sunetid> by your actual SUNet ID. If you happen to use the same login name on your local machine, you can omit it.

    ","location":"docs/storage/data-transfer/#ssh-based-protocols","tags":["connection"]},{"title":"SCP (Secure Copy)","text":"

    The easiest command to use to transfer files to/from Sherlock is scp. It works like the cp command, except it can work over the network to copy files from one computer to another, using the secure SSH protocol.

    The general syntax to copy a file to a remote server is:

    $ scp <source_file_path> <username>@<remote_host>:<destination_path>\n

    For instance, the following command will copy the file named foo from your local machine to your home directory on Sherlock:

    $ scp foo <sunetid>@login.sherlock.stanford.edu:\n
    Note the : character, that separates the hostname from the destination path. Here, the destination path is empty, which will instruct scp to copy the file in your home directory.

    You can copy foo under a different name, or to another directory, with the following commands:

    $ scp foo <sunetid>@login.sherlock.stanford.edu:bar\n$ scp foo <sunetid>@login.sherlock.stanford.edu:~/subdir/baz\n

    To copy back files from Sherlock to your local machine, you just need to reverse the order of the arguments:

    $ scp <sunetid>@login.sherlock.stanford.edu:foo local_foo\n

    And finally, scp also supports recursive copying of directories, with the -r option:

    $ scp -r dir/ <sunetid>@login.sherlock.stanford.edu:dir/\n
    This will copy the dir/ directory and all of its contents in your home directory on Sherlock.

    ","location":"docs/storage/data-transfer/#scp-secure-copy","tags":["connection"]},{"title":"SFTP (Secure File Transfer Protocol)","text":"

    SFTP clients are interactive file transfer programs, similar to FTP, which perform all operations over an encrypted transport.

    A variety of graphical SFTP clients are available for different OSes:

    • WinSCP
    • SecureFX ,
    • Fetch2
    • CyberDuck

    When setting up your connection to Sherlock in the above programs, use the following information:

    Hostname: login.sherlock.stanford.edu\nPort:     22\nUsername: SUNet ID\nPassword: SUNet ID password\n

    OpenSSH also provides a command-line SFTP client, originally named sftp.

    To log in to Sherlock:

    $ sftp <sunetid>@login.sherlock.stanford.edu\nConnected to login.sherlock.stanford.edu.\nsftp>\n
    For more information about using the command-line SFTP client, you can refer to this tutorial for more details and examples.

    ","location":"docs/storage/data-transfer/#sftp-secure-file-transfer-protocol","tags":["connection"]},{"title":"rsync","text":"

    If you have complex hierarchies of files to transfer, or if you need to synchronize a set of files and directories between your local machine and Sherlock, rsync will be the best tool for the job. It will efficiently transfer and synchronize files across systems, by checking the timestamp and size of files. Which means that it won't re-transfer files that have not changed since the last transfer, and will complete faster.

    For instance, to transfer the whole ~/data/ folder tree from your local machine to your home directory on Sherlock, you can use the following command:

    $ rsync -a ~/data/ <sunetid>@login.sherlock.stanford.edu:data/\n
    Note the slash (/) at the end of the directory names, which is important to instruct rsync to synchronize the whole directories.

    To get more information about the transfer rate and follow its progress, you can use additional options:

    $ rsync -avP ~/data/ <sunetid>@login.sherlock.stanford.edu:data/\nsending incremental file list\n./\nfile1\n      1,755,049 100%    2.01MB/s    0:00:00 (xfr#2, to-chk=226/240)\nfile2\n      2,543,699 100%    2.48MB/s    0:00:00 (xfr#3, to-chk=225/240)\nfile3\n     34,930,688  19%   72.62MB/s    0:00:08\n\n[...]\n
    For more information about using rsync, you can refer to this tutorial for more details and examples.

    ","location":"docs/storage/data-transfer/#rsync","tags":["connection"]},{"title":"SSHFS","text":"

    Sometimes, moving files in and out of the cluster, and maintaining two copies of each of the files you work on, both on your local machine and on Sherlock, may be painful. Fortunately, Sherlock offers the ability to mount any of its filesystems to your local machine, using a secure and encrypted connection.

    With SSHFS, a FUSE-based filesystem implementation used to mount remote SSH-accessible filesystems, you can access your files on Sherlock as if they were locally stored on your own computer.

    This comes particularly handy when you need to access those files from an application that is not available on Sherlock, but that you already use or can install on your local machine. Like a data processing program that you have licensed for your own computer but can't use on Sherlock, a specific text editor that only runs on macOS, or any data-intensive 3D rendering software that wouldn't work comfortably enough over a forwarded X11 connection.

    SSHFS is available for Linux , macOS , and Windows .

    SSHFS on macOS

    SSHFS on macOS is known to try to automatically reconnect filesystem mounts after resuming from sleep or suspend, even without any valid credentials. As a result, it will generate a lot of failed connection attempts and likely make your IP address blacklisted on login nodes.

    Make sure to unmount your SSHFS drives before putting your macOS system to sleep to avoid this situation.

    The following option could also be useful to avoid some permission issues: -o defer_permissions

    For instance, on a Linux machine with SSHFS installed, you could mount your Sherlock home directory via a Sherlock DTN with the following commands:

    $ mkdir ~/sherlock_home\n$ sshfs <sunetid>@dtn.sherlock.stanford.edu:./ ~/sherlock_home\n

    Using DTNs for data transfer

    Using the Sherlock DTNs instead of login nodes will ensure optimal performance for data transfers. Login nodes only have limited resources, that could limit data transfer rates or disconnect during long data transfers.

    And to unmount it:

    $ umount ~/sherlock_home\n

    For more information about using SSHFS on your local machine, you can refer to this tutorial for more details and examples.

    ","location":"docs/storage/data-transfer/#sshfs","tags":["connection"]},{"title":"Globus","text":"

    Globus improves SSH-based file transfer protocols by providing the following features:

    • automates large data transfers,
    • handles transient errors, and can resume failed transfers,
    • simplifies the implementation of high-performance transfers between computing centers.

    Globus is a Software as a Service (SaaS) system that provides end-users with a browser interface to initiate data transfers between endpoints. Globus allows users to \"drag and drop\" files from one endpoint to another. Endpoints are terminals for data; they can be laptops or supercomputers, and anything in between. The Globus web service negotiates, monitors, and optimizes transfers through firewalls and across network address translation (NAT). Under certain circumstances, with high-performance hardware, transfer rates exceeding 1 GB/s are possible. For more information about Globus, please see the Globus documentation.

    ","location":"docs/storage/data-transfer/#globus","tags":["connection"]},{"title":"Authentication","text":"

    To use Globus, you will first need to authenticate at Globus.org. You can either sign up for a Globus account, or use your SUNet ID account for authentication to Globus (which will be required to authenticate to the Sherlock endpoint).

    To use your SUNet ID, choose \"Stanford University\" from the drop down menu at the Login page and follow the instructions from there.

    ","location":"docs/storage/data-transfer/#authentication","tags":["connection"]},{"title":"Transfer","text":"

    Endpoint name

    The Globus endpoint name for Sherlock is SRCC Sherlock.

    Oak endpoint

    The Sherlock endpoint only provides access to Sherlock-specific file systems ($HOME, $GROUP_HOME, $SCRATCH and $GROUP_SCRATCH). Oak features its own Globus endpoint: SRCC Oak.

    You can use Globus to transfer data between your local workstation (e.g., your laptop or desktop) and Sherlock. In this workflow, you configure your local workstation as a Globus endpoint by installing the Globus Connect software.

    1. Log in to Globus.org
    2. Use the Manage Endpoints interface to \"add Globus Connect Personal\" as an endpoint (you'll need to install Globus Connect Personal on your local machine)
    3. Transfer Files, using your new workstation endpoint for one side of the transfer, and the Sherlock endpoint (SRCC Sherlock) on the other side.

    You can also transfer data between two remote endpoints, by choosing another endpoint you have access to instead of your local machine.

    ","location":"docs/storage/data-transfer/#transfer","tags":["connection"]},{"title":"CLI and API","text":"

    Globus also provides a command-line interface (CLI) and application programming interface (API) as alternatives to its web interface.

    For more information about the API, please see the Globus API documentation for more details.

    For more information about the CLI, please see the Globus CLI documentation and Globus CLI quick start. Note that the Globus CLI is available through the module system on Sherlock:

    $ module load system py-globus-cli\n$ globus login\n# follow instructions to get set up\n

    Once you've authorized the application, you can use the globus CLI to copy files in between endpoints and collections that you have access to. Endpoints and collections are identified by their unique UUID4 identifiers, which are viewable through the Globus web app. The CLI will step you through any additional authorizations required for you to access the endpoints or collections.

    For example, to asynchronously copy files between Sherlock and Oak (provided that you have already been allocated Oak storage):

    $ GLOBUS_SHERLOCK_UUID=\"6881ae2e-db26-11e5-9772-22000b9da45e\"\n$ GLOBUS_OAK_UUID=\"8b3a8b64-d4ab-4551-b37e-ca0092f769a7\"\n$ globus transfer --recursive \\\n    \"$GLOBUS_SHERLOCK_UUID:$SCRATCH/my-interesting-project\" \\\n    \"$GLOBUS_OAK_UUID:$OAK/my-interesting-project-copy\"\n
    ","location":"docs/storage/data-transfer/#cli-and-api","tags":["connection"]},{"title":"Data Transfer Nodes (DTNs)","text":"

    No shell

    The DTNs don't provide any interactive shell, so connecting via SSH directly won't work. They will only accept scp, sftp, rsync or bbcp connections.

    A pool of dedicated Data Transfer Nodes is available on Sherlock, to provide exclusive resources for large-scale data transfers.

    The main benefit of using it is that transfer tasks can't be disrupted by other users' interactive tasks or filesystem access and I/O-related workloads on the login nodes.

    By using the Sherlock DTNs, you'll make sure that your data flows will go through a computer whose sole purpose is to move data around.

    It supports:

    • SSH-based protocols (such as the ones described above)
    • bbcp
    • Globus

    To transfer files via the DTNs, simply use dtn.sherlock.stanford.edu as a remote server host name. For instance:

    $ scp foo <sunetid>@dtn.sherlock.stanford.edu:~/foo\n

    $HOME on DTNs

    One important difference to keep in mind when transferring files through the Sherlock DTNs is that the default destination path for files, unless specified, is the user $SCRATCH directory, not $HOME.

    That means that the following command:

    $ scp foo <sunetid>@dtn.sherlock.stanford.edu:\n
    will create the foo file in $SCRATCH/foo, and not in $HOME/foo.

    You can transfer files to your $HOME directory via the DTNs by specifying the full path as the destination: $ scp foo <sunetid>@dtn.sherlock.stanford.edu:$HOME/foo

    ","location":"docs/storage/data-transfer/#data-transfer-nodes-dtns","tags":["connection"]},{"title":"Cloud storage","text":"

    If you need to back up some of your Sherlock files to cloud-based storage services, we also provide a set of utilities that can help.

    ","location":"docs/storage/data-transfer/#cloud-storage","tags":["connection"]},{"title":"Google Drive","text":"

    Google Drive storage for Stanford users

    Google Drive is free for educational institutions. Meaning you can get free and unlimited storage on Google Drive using your @stanford.edu account. See the University IT Google Drive page for more details.

    We provide the rclone tool on Sherlock to interact with Google Drive. You'll just need to load the rclone module to be able to use it to move your files from/to Google Drive:

    $ module load system rclone\n$ rclone --help\n

    This tutorial provides an example of transferring files between Google Drive and Oak storage.

    The Globus CLI (see above) can also be used to copy files from Sherlock to Stanford's Google Drive.

    ","location":"docs/storage/data-transfer/#google-drive","tags":["connection"]},{"title":"AWS","text":"

    You can also access AWS storage from the Sherlock command line with the AWS Command Line Interface:

    $ module load system aws-cli\n$ aws help\n
    ","location":"docs/storage/data-transfer/#aws","tags":["connection"]},{"title":"Other services","text":"

    If you need to access other cloud storage services, you can use rclone: it can be used to sync files and directories to and from Google Drive, Amazon S3, Box, Dropbox, Google Cloud Storage, Amazon Drive, Microsoft OneDrive and many more.

    $ ml load system rclone\n$ rclone -h\n

    For more details about how to use rclone, please see the official documentation.

    1. For more details, see the SSH clients page.\u00a0\u21a9

    2. Fetch is a commercial program, and is available as part of the Essential Stanford Software bundle.\u00a0\u21a9

    ","location":"docs/storage/data-transfer/#other-services","tags":["connection"]},{"title":"Filesystems","text":"

    The following sections describe the characteristics and best uses of each of Sherlock's filesystems.

    ","location":"docs/storage/filesystems/"},{"title":"$HOME","text":"

    Summary

    $HOME is your home directory. It's the best place to keep your code and important data as it provides snapshots and off-site replication. It is not meant to host data that will be actively read and written to by compute jobs.

    Characteristics Type high speed, distributed NFS file system Quota 15 GB for the whole $HOME directory Snapshots yes (cf. Snapshots for more info) Backups off-site replication Purge policy not purged Scope all login and compute nodes","location":"docs/storage/filesystems/#home"},{"title":"Recommended usage","text":"

    $HOME is best suited for personal configuration files, scripts, small reference files or datasets, source code and individual software installation

    When you log in, the system automatically sets the current working directory to $HOME: it's the location you'll end up when connecting to Sherlock. You can store your source code and build your executables there.

    We strongly recommend using $HOME to reference your home directory in scripts, rather than its explicit path.

    ","location":"docs/storage/filesystems/#recommended-usage"},{"title":"Checking quota usage","text":"

    The sh_quota tool can be used to display quota usage on $HOME

    $ sh_quota -f HOME\n

    See the Checking Quotas section for more details.

    ","location":"docs/storage/filesystems/#checking-quota-usage"},{"title":"$GROUP_HOME","text":"

    Summary

    $GROUP_HOME is your group home directory. It's the best place to keep your group's shared code, software installations and important data as it provides snapshots and off-site replication. It is not meant to host data that will be actively read and written to by compute jobs.

    $HOME and $GROUP_HOME are based on the same physical file system.

    Characteristics Type high speed, distributed NFS file system Quota 1 TB for the whole $GROUP_HOME directory Snapshots yes (cf. Snapshots for more info) Backups off-site replication Purge policy not purged Scope all login and compute nodes","location":"docs/storage/filesystems/#group_home"},{"title":"Recommended usage","text":"

    $GROUP_HOME is best suited for group shared source code, common software installations, shared data sets and scripts.

    We strongly recommend using $GROUP_HOME to reference your group home directory in scripts, rather than its explicit path.

    ","location":"docs/storage/filesystems/#recommended-usage_1"},{"title":"Checking quota usage","text":"

    The sh_quota tool can be used to display quota usage on $GROUP_HOME

    $ sh_quota -f GROUP_HOME\n

    See the Checking Quotas section for more details.

    ","location":"docs/storage/filesystems/#checking-quota-usage_1"},{"title":"$SCRATCH","text":"

    Summary

    $SCRATCH is your personal scratch space. It's the best place to store temporary files, such as raw job output, intermediate files, unprocessed results, and so on.

    Purge policy

    Files are automatically purged from $SCRATCH after an inactivity period:

    • files that are not modified after 90 days are automatically deleted,
    • contents need to change for a file to be considered modified. The touch command does not modify file contents and thus does not extend a file's lifetime on the filesystem.

    $SCRATCH is not meant to store permanent data, and should only be used for data associated with currently running jobs. It's not a target for backups, archived data, etc. See the Expiration Policy section for details.

    Characteristics Type Parallel, high-performance Lustre file system Quota 100 TB / 20,000,000 inodes2 Snapshots NO Backups NO Purge policy data not modified in the last 90 days are automatically purged Scope all login and compute nodes","location":"docs/storage/filesystems/#scratch"},{"title":"Recommended usage","text":"

    $SCRATCH is best suited for large files, such as raw job output, intermediate job files, unprocessed simulation results, and so on. This is the recommended location to run jobs from, and to store files that will be read or written to during job execution.

    Old files are automatically purged on $SCRATCH so users should avoid storing long-term data there.

    Each compute node has a low latency, high-bandwidth Infiniband link to $SCRATCH. The aggregate bandwidth of the filesystem is about 75GB/s. So any job with high data performance requirements will benefit from using $SCRATCH for I/O.

    We strongly recommend using $SCRATCH to reference your scratch directory in scripts, rather than its explicit path.

    ","location":"docs/storage/filesystems/#recommended-usage_2"},{"title":"Checking quota usage","text":"

    The sh_quota tool can be used to display quota usage on $SCRATCH

    $ sh_quota -f SCRATCH\n

    See the Checking Quotas section for more details.

    ","location":"docs/storage/filesystems/#checking-quota-usage_2"},{"title":"Expiration policy","text":"

    Inactive files are automatically purged

    Files that are not modified in the last 90 days will be automatically deleted from the filesystem.

    To manage available space and maintain optimal performance for all jobs, all files on $SCRATCH are subject to automatic purges. Meaning that after a period of inactivity, files that are not used anymore will be automatically deleted from the filesystem.

    File activity is defined based on the last time a file's contents (the actual data in the file) have been modified. Meaning that files whose contents have not been modified in the previous 90 days will be automatically deleted.

    Each time a file's contents are modified, the expiration countdown is reset, and the file gets another 90 days of lifetime.

    Metadata changes don't qualify as an update

    Modifying a file's contents is the only way to reset the expiration countdown and extend the file's lifetime on the filesystem.

    Metadata modifications such as: reading the file, renaming it, moving it to a different directory, changing its permissions or its ownership, \"touching\" it to update its last modification or access times, won't have any effect on the purge countdown.

    Purges are based on an internal filesystem property that reflects the last date a file's data has been modified, and which is unfortunately not readily accessible by users.

    Please note that tools like ls will only display the date of the last metadata1 modification for a file, which is not necessarily relevant to determine a file's eligibility for deletion. For instance, using the touch command on a file to update its last modification date will only update the metadata, not the data, and as such, will not reset the purge countdown timer.

    Filesystem purges are a continuous process: they don't run at particular times, but are carried out in a permanent background fashion. Files are not necessarily deleted right away when they become eligible for deletion. For instance, if you create a file on February 1st and don't ever modify it afterwards, it will automatically become eligible for deletion on May 1st, and can be deleted anytime after this date.

    Empty directory trees that stay devoid of any file for more than 90 days will be automatically cleaned up as well.

    ","location":"docs/storage/filesystems/#expiration-policy"},{"title":"$GROUP_SCRATCH","text":"

    $SCRATCH and $GROUP_SCRATCH are based on the same physical file system.

    Summary

    $GROUP_SCRATCH is your group shared scratch space. It's the best place to store temporary files, such as raw job output, intermediate files, or unprocessed results that need to be shared among users within a group.

    $GROUP_SCRATCH is NOT a backup target

    $GROUP_SCRATCH is not meant to store permanent data, and should only be used for data associated with currently running jobs. It's not a target for backups, archived data, etc.

    Characteristics Type parallel, high-performance Lustre file system Quota 100 TB / 20,000,000 inodes2 Snapshots NO Backups NO Purge policy data not modified in the last 90 days are automatically purged Scope all login and compute nodes","location":"docs/storage/filesystems/#group_scratch"},{"title":"Recommended usage","text":"

    $GROUP_SCRATCH is best suited for large files, such as raw job output, intermediate job files, unprocessed simulation results, and so on. This is the recommended location to run jobs from, and to store files that will be read or written to during job execution.

    Old files are automatically purged on $GROUP_SCRATCH so users should avoid storing long-term data there.

    We strongly recommend using $GROUP_SCRATCH to reference your group scratch directory in scripts, rather than its explicit path.

    ","location":"docs/storage/filesystems/#recommended-usage_3"},{"title":"Checking quota usage","text":"

    The sh_quota tool can be used to display quota usage on $GROUP_SCRATCH

    $ sh_quota -f GROUP_SCRATCH\n

    See the Checking Quotas section for more details.

    ","location":"docs/storage/filesystems/#checking-quota-usage_3"},{"title":"Expiration policy","text":"

    As $SCRATCH and $GROUP_SCRATCH are on the same filesystem, the same expiration policy applies to both. Please see the $SCRATCH section above for more details.

    ","location":"docs/storage/filesystems/#expiration-policy_1"},{"title":"$L_SCRATCH","text":"

    Summary

    $L_SCRATCH is local to each compute node, and could be used to store temporary files for jobs with high IOPS requirements. Files stored in $L_SCRATCH are purged at the end of the job.

    Characteristics Type local filesystem, specific to each node, based on SSD Quota n/a (usable space limited by the size of the physical storage devices, typically around 150 GB) Snapshots NO Backups NO Purge policy data immediately purged at the end of the job Scope locally on each node, not shared across nodes","location":"docs/storage/filesystems/#l_scratch"},{"title":"Recommended usage","text":"

    $L_SCRATCH is best suited for small temporary files and applications which require low latency and high IOPS levels, typically intermediate job files, checkpoints, dumps of temporary states, etc.

    Files stored in $L_SCRATCH are local to each node and can't be accessed from other nodes, nor from login nodes.

    Please note that an additional, job-specific environment variable, $L_SCRATCH_JOB, will be set to a subdirectory of $L_SCRATCH for each job. So, if you have two jobs running on the same compute node, $L_SCRATCH will be the same and accessible from both jobs, while $L_SCRATCH_JOB will be different for each job.

    For instance, if you have jobs 98423 and 98672 running on the same node, the variables will be set as follows:

    Job id $L_SCRATCH $L_SCRATCH_JOB 98423 /lscratch/kilian /lscratch/kilian/98423 98672 /lscratch/kilian /lscratch/kilian/98672

    We strongly recommend using $L_SCRATCH to reference your local scratch directory in scripts, rather than its full path.

    ","location":"docs/storage/filesystems/#recommended-usage_4"},{"title":"Expiration policy","text":"

    All files stored in $L_SCRATCH_JOB are automatically purged at the end of the job, whether the job was successful or not. If you need to conserve files that were generated in $L_SCRATCH_JOB after the job ends, don't forget to add a command at the end of your batch script to copy them to one of the more persistent storage locations, such as $HOME or $SCRATCH.

    Data stored in $L_SCRATCH will be purged at the end of a job, only if no other job from the same user is still running on the node. Which means that data stored in $L_SCRATCH (but not in $L_SCRATCH_JOB) will persist on the node until the last job from the user terminates.

    ","location":"docs/storage/filesystems/#expiration-policy_2"},{"title":"$OAK","text":"

    Summary

    $OAK is SRCC's research data storage offering. It provides an affordable, longer-term storage option for labs and researchers, and is ideally suited to host large datasets, or curated, post-processed results from job campaigns, as well as final results used for publication.

    Order $OAK

    Oak storage can be easily ordered online using the Oak Storage Service page.

    $OAK is opt-in and is available as an option on Sherlock. Meaning that only members of groups which have purchased storage on Oak can access this filesystem.

    For complete details and characteristics, including pricing, please refer to the Oak Storage Service page.

    Characteristics Type parallel, capacitive Lustre filesystem Quota amount purchased (in 10 TB increments) Snapshots NO Backups optional cloud backup available please contact us for details Purge policy not purged Scope all login and compute nodes also available through gateways outside of Sherlock","location":"docs/storage/filesystems/#oak"},{"title":"Recommended usage","text":"

    $OAK is ideally suited for large shared datasets, archival data and curated, post-processed results from job campaigns, as well as final results used for publication.

    Although jobs can directly read and write to $OAK during execution, it is recommended to first stage files from $OAK to $SCRATCH at the beginning of a series of jobs, and save the desired results back from $SCRATCH to $OAK at the end of the job campaign.

    We strongly recommend using $OAK to reference your group's Oak directory in scripts, rather than its explicit path.

    $OAK is not backed up

    $OAK is not backed up or replicated, by design, and deleted files cannot be recovered. We recommend all researchers to keep an additional copy of their important files (for instance, in Google Drive).

    Cloud backup option

    For additional data security, SRCC now offers \"cloud backup\" of Oak data as a managed service option. For an additional monthly fee, data on Oak can be backed up to the cloud (researchers are responsible for cloud storage costs). Please contact us if you'd like additional information.

    ","location":"docs/storage/filesystems/#recommended-usage_5"},{"title":"Checking quota usage","text":"

    The sh_quota tool can be used to display quota usage on $OAK

    $ sh_quota -f OAK\n

    See the Checking Quotas section for more details.

    1. Metadata are data such as a file's size, name, path, owner, permissions, etc.\u00a0\u21a9

    2. An inode (index node) is a data structure in a Unix-style file system that describes a file-system object such as a file or a directory.\u00a0\u21a9\u21a9

    ","location":"docs/storage/filesystems/#checking-quota-usage_4"},{"title":"Technical specifications","text":"","location":"docs/tech/","tags":["tech"]},{"title":"In a nutshell","text":"

    Sherlock features over 1,700 compute nodes, 53,400+ CPU cores and 700+ GPUs, for a total computing power of more than 5.0 Petaflops. That would rank it in the Top500 list of the most powerful supercomputers in the world.

    The cluster currently extends across 2 Infiniband fabrics (EDR, HDR). A 9.7 PB parallel, distributed filesystem, delivering over 200 GB/s of I/O bandwidth, provides scratch storage for more than 7,000 users, and 1,100 PI groups.

    ","location":"docs/tech/#in-a-nutshell","tags":["tech"]},{"title":"Resources","text":"

    The Sherlock cluster was initiated in January 2014 with a base of freely available computing resources (about 2,000 CPU cores) and the accompanying networking and storage infrastructure (about 1 PB of shared storage).

    Since then, it's been constantly expanding, spanning multiple cluster generations, with numerous contributions from many research groups on campus.

    Cluster generations

    For more information about Sherlock's ongoing evolution and expansion, please see Cluster generations.

    ","location":"docs/tech/#resources","tags":["tech"]},{"title":"Interface","text":"Type Qty Details login nodes 12 sherlock.stanford.edu (load-balanced) data transfer nodes 3 dedicated bandwidth for large data transfers","location":"docs/tech/#interface","tags":["tech"]},{"title":"Computing","text":"

    Access to computing resources

    Computing resources marked with below are freely available to every Sherlock user. Resources marked with are only accessible to Sherlock owners and their research teams.

    Type Access Nodes CPU cores Details compute nodesnormal partition 195 5,236 - 57x 20 (Intel E5-2640v4), 128 GB RAM, EDR IB- 40x 24 (Intel 5118), 191 GB RAM, EDR IB- 28x 32 (AMD 7543), 256 GB RAM, HDR IB- 70x 32 (AMD 7502), 256 GB RAM, HDR IB development nodesdev partition 4 104 - 2x 20 (Intel E5-2640v4), 128 GB RAM, EDR IB- 2x 32 (AMD 7543P), 256 GB RAM, HDR IB- 32x Tesla A30_MIG-1g.6gb large memory nodesbigmem partition 9 504 - 4x 24 (Intel 5118), 384 GB RAM, EDR IB- 1x 32 (Intel E5-2697Av4), 512 GB RAM, EDR IB- 1x 56 (Intel E5-4650v4), 3072 GB RAM, EDR IB- 1x 64 (AMD 7502), 4096 GB RAM, HDR IB- 2x 128 (AMD 7742), 1024 GB RAM, HDR IB GPU nodesgpu partition 26 748 - 1x 20 (Intel E5-2640v4), 256 GB RAM, EDR IB- 4x Tesla P100 PCIe - 1x 20 (Intel E5-2640v4), 256 GB RAM, EDR IB- 4x Tesla P40 - 3x 20 (Intel E5-2640v4), 256 GB RAM, EDR IB- 4x Tesla V100_SXM2 - 1x 24 (Intel 5118), 191 GB RAM, EDR IB- 4x Tesla V100_SXM2 - 2x 24 (Intel 5118), 191 GB RAM, EDR IB- 4x Tesla V100 PCIe - 16x 32 (AMD 7502P), 256 GB RAM, HDR IB- 4x Geforce RTX_2080Ti - 2x 32 (AMD 7502P), 256 GB RAM, HDR IB- 4x Tesla V100S PCIe privately-owned nodesowners partition 1,493 48,648 40 different node configurations, including GPU and bigmem nodes Total 1,731 53,488 756","location":"docs/tech/#computing","tags":["tech"]},{"title":"Storage","text":"

    More information

    For more information about storage options on Sherlock, please refer to the Storage section of the documentation.

    Sherlock is architected around shared storage components, meaning that users can find the same files and directories from all of the Sherlock nodes.

    • Highly-available NFS filesystem for user and group home directories (with hourly snapshots and off-site replication)
    • High-performance Lustre scratch filesystem (9.7 PB parallel, distributed filesystem, delivering over 200 GB/s of I/O bandwidth)
    • Direct access to SRCC's Oak long-term research data storage system (51.3 PB)
    ","location":"docs/tech/#storage","tags":["tech"]},{"title":"Sherlock facts","text":"

    as of February 2024

    ","location":"docs/tech/facts/","tags":["tech"]},{"title":"Users","text":"
    • 7,054 user accounts

    • 1,115 PI groups

      from all Stanford's seven Schools, SLAC, Stanford Institutes, etc.

    • 201 owner groups

    ","location":"docs/tech/facts/#users","tags":["tech"]},{"title":"Interfaces","text":"
    • 12 login nodes

    • 3 data transfer nodes (DTNs)

    ","location":"docs/tech/facts/#interfaces","tags":["tech"]},{"title":"Computing","text":"
    • 5.00 PFLOPs (FP64)

      18.73 (FP32) PFLOPs

    • 53,488 CPU cores

      4 CPU generations (13 CPU models)

    • 756 GPUs

      4 GPU generations (12 GPU models)

    ","location":"docs/tech/facts/#computing","tags":["tech"]},{"title":"Hardware","text":"
    • 1,731 compute nodes

      19 server models (from 3 different manufacturers)

    • 37 racks

      1,147 rack units

    ","location":"docs/tech/facts/#hardware","tags":["tech"]},{"title":"Energy","text":"
    • 564.7 kW

      total power usage

    • 57 PDUs

    ","location":"docs/tech/facts/#energy","tags":["tech"]},{"title":"Storage","text":"
    • 9.7 PB $SCRATCH

      parallel, distributed filesystem, delivering over 200 GB/s of I/O bandwidth

    • 51.3 PB $OAK

      long term research data storage

    ","location":"docs/tech/facts/#storage","tags":["tech"]},{"title":"Networking","text":"
    • 104 Infiniband switches

      across 2 Infiniband fabrics (EDR, HDR)

    • 5,740 Infiniband cables

      spanning about 30.23 km

    • 53 Ethernet switches

    ","location":"docs/tech/facts/#networking","tags":["tech"]},{"title":"Scheduler","text":"
    • 178 Slurm partitions

    • 47,065 CPU.hours/day

      over 5 years of computing in a single day

    • $3,144,743 /month

      to run the same workload on t2.large on-demand cloud instances

    ","location":"docs/tech/facts/#scheduler","tags":["tech"]},{"title":"Status","text":"var statusWidget = new Status.Widget({ hostname: \"status.sherlock.stanford.edu\", selector: \"#sh_status\", display: { ledPosition: \"left\", } });

    Scheduled maintenances

    Maintenance operations and upgrades are scheduled on Sherlock on a regular basis. Per the University's Minimum Security policies, we deploy security patches on Sherlock as required for compliance.

    ","location":"docs/tech/status/"},{"title":"Components and services","text":"

    Sherlock status is

    For more details about Sherlock components and services, see the status dashboard.

    ","location":"docs/tech/status/#components-and-services"},{"title":"Current usage","text":"","location":"docs/tech/status/#current-usage"},{"title":"GPU nodes","text":"

    To support the latest computing advancements in many fields of science, Sherlock features a number of compute nodes with GPUs that can be used to run a variety of GPU-accelerated applications. Those nodes are available to everyone, but are a scarce, highly-demanded resource, so getting access to them may require some wait time in queue.

    Getting your own GPU nodes

    If you need frequent access to GPU nodes, we recommend considering becoming an owner on Sherlock, so you can have immediate access to your GPU nodes when you need them.

    ","location":"docs/user-guide/gpu/"},{"title":"GPU nodes","text":"

    A limited number of GPU nodes are available in the gpu partition. Anybody running on Sherlock can submit a job there. As owners contribute to expand Sherlock, more GPU nodes are added to the owners partition, for use by PI groups which purchased their own compute nodes.

    There are a variety of different GPU configurations available in the gpu partition. To see the available GPU types, please see the GPU types section.

    ","location":"docs/user-guide/gpu/#gpu-nodes"},{"title":"Submitting a GPU job","text":"

    To submit a GPU job, you'll need to use the --gpus (or -G) option in your batch script or command line submission options.

    For instance, the following script will request one GPU for two hours in the gpu partition, and run the GPU-enabled version of gromacs:

    #!/bin/bash\n#SBATCH -p gpu\n#SBATCH -c 10\n#SBATCH -G 1\n\nml load gromacs/2016.3\n\nsrun gmx_gpu ...\n

    You can also directly run GPU processes on compute nodes with srun. For instance, the following command will display details about the GPUs allocated to your job:

    $ srun -p gpu --gpus 2 nvidia-smi\nFri Jul 28 12:41:49 2017\n+-----------------------------------------------------------------------------+\n| NVIDIA-SMI 375.51                 Driver Version: 375.51                    |\n|-------------------------------+----------------------+----------------------+\n| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n|===============================+======================+======================|\n|   0  Tesla P40           On   | 0000:03:00.0     Off |                    0 |\n| N/A   26C    P8    10W / 250W |      0MiB / 22912MiB |      0%   E. Process |\n+-------------------------------+----------------------+----------------------+\n|   1  Tesla P40           On   | 0000:04:00.0     Off |                    0 |\n| N/A   24C    P8    10W / 250W |      0MiB / 22912MiB |      0%   E. Process |\n+-------------------------------+----------------------+----------------------+\n\n+-----------------------------------------------------------------------------+\n| Processes:                                                       GPU Memory |\n|  GPU       PID  Type  Process name                               Usage      |\n|=============================================================================|\n|  No running processes found                                                 |\n+-----------------------------------------------------------------------------+\n

    GPU resources MUST be requested explicitly

    Jobs will be rejected at submission time if they don't explicitly request GPU resources.

    The gpu partition only accepts jobs explicitly requesting GPU resources. If they don't, they will be rejected with the following message:

    $ salloc -p gpu\nsrun: error: Unable to allocate resources: Job violates accounting/QOS policy (job submit limit, user's size and/or time limits)\n
    ","location":"docs/user-guide/gpu/#submitting-a-gpu-job"},{"title":"Interactive sessions","text":"

    As for any other compute node, you can submit an interactive job and request a shell on a GPU node with the following command:

    $ salloc -p gpu --gpus 1\nsalloc: job 38068928 queued and waiting for resources\nsalloc: job 38068928 has been allocated resources\n$ nvidia-smi --query-gpu=index,name --format=csv,noheader\n0, Tesla V100-SXM2-16GB\n
    ","location":"docs/user-guide/gpu/#interactive-sessions"},{"title":"Instant lightweight GPU instances","text":"

    Given that some tasks don't necessarily require a full-fledged, top-of-the-line GPU, lightweight GPU instances are provided to allow instant access to GPU resources for quick debugging, prototyping or testing jobs.

    Lightweight GPU instances

    Lightweight GPU instances leverage NVIDIA\u2019s Multi-Instance GPU (MIG) to provide multiple fully isolated GPU instances on the same physical GPU, each with their own high-bandwidth memory, cache, and compute cores.

    Those GPU instances are instantly available via the dev partition, and can be requested with the sh_dev command:

    # sh_dev -g 1\n[...]\n[kilian@sh03-17n15 ~] (job 17628407) $ nvidia-smi -L\nGPU 0: NVIDIA A30 (UUID: GPU-ac772b5a-123a-dc76-9480-5998f435fe84)\n  MIG 1g.6gb      Device  0: (UUID: MIG-87e5d835-8046-594a-b237-ccc770b868ef)\n

    For interactive apps in the Sherlock OnDemand interface, requesting a GPU in the dev partition will initiate an interactive session with access to a lightweight GPU instance.

    ","location":"docs/user-guide/gpu/#instant-lightweight-gpu-instances"},{"title":"GPU types","text":"

    Since Sherlock features many different types of GPUs, each with its own technical characteristics, performance profiles and specificities, you may want to ensure that your job runs on a specific type of GPU.

    To that end, Slurm allows users to specify constraints when submitting jobs, which will indicate to the scheduler that only nodes having features matching the job constraints can be used to satisfy the request. Multiple constraints may be specified and combined with various operators (please refer to the official Slurm documentation for details).

    The list of available features on compute nodes can be obtained with the node_feat1 command. And more specifically, to list the GPU-related features of nodes in the gpu partition:

    $ node_feat -p gpu | grep GPU_\nGPU_BRD:TESLA\nGPU_GEN:PSC\nGPU_MEM:16GB\nGPU_MEM:24GB\nGPU_SKU:TESLA_P100_PCIE\nGPU_SKU:TESLA_P40\n

    You can use node_feat without any option to list all the features of all the nodes in all the partitions. But please note that node_feat will only list the features of nodes from partitions you have access to, so output may vary depending on your group membership.

    The different characteristics2 of various GPU types are listed in the following table

    Slurm\u00a0feature Description Possible values Example job constraint GPU_BRD GPU brand GEFORCE: GeForce / TITANTESLA: Tesla #SBATCH -C GPU_BRD:TESLA GPU_GEN GPU generation PSC: PascalMXW: Maxwell #SBATCH -C GPU_GEN:PSC GPU_MEM Amount of GPU memory 16GB, 24GB #SBATCH -C GPU_MEM:16GB GPU_SKU GPU model TESLA_P100_PCIETESLA_P40 #SBATCH -C GPU_SKU:TESLA_P40

    Depending on the partitions you have access to, more features may be available to be requested in your jobs.

    For instance, to request a Tesla GPU for your job, you can use the following submission options:

    $ srun -p gpu -G 1 -C GPU_BRD:TESLA nvidia-smi -L\nGPU 0: Tesla P100-SXM2-16GB (UUID: GPU-4f91f58f-f3ea-d414-d4ce-faf587c5c4d4)\n

    Unsatisfiable constraints

    If you specify a constraint that can't be satisfied in the partition you're submitting your job to, the job will be rejected by the scheduler. For instance, requesting a RTX3090 GPU in the gpu partition, which doesn't feature any, will result in an error:

    $ srun -p gpu -G 1 -C GPU_SKU:RTX_3090 nvidia-smi -L\nsrun: error: Unable to allocate resources: Requested node configuration is not available\n

    For more information about requesting specific node features and adding job constraints, you can also refer to the \"Node features\" page.

    ","location":"docs/user-guide/gpu/#gpu-types"},{"title":"GPU compute modes","text":"

    By default, GPUs on Sherlock are set in the Exclusive Process compute mode3, to provide the best performance and an isolated environment for jobs, out of the box.

    Some software may require GPUs to be set to a different compute mode, for instance to share a GPU across different processes within the same application.

    To handle that case, we developed a specific option, --gpu_cmode, that users can add to their srun and sbatch submission options, to choose the compute mode for the GPUs allocated to their job.

    Here's the list of the different compute modes supported on Sherlock's GPUs:

    GPU\u00a0compute\u00a0mode --gpu_cmode option Description \"Default\" shared Multiple contexts are allowed per device (NVIDIA default) \"Exclusive Process\" exclusive Only one context is allowed per device, usable from multiple threads at a time (Sherlock default) \"Prohibited\" prohibited No CUDA context can be created on the device

    By default, or if the --gpu_cmode option is not specified, GPUs will be set in the \"Exclusive Process\" mode, as demonstrated by this example command:

    $ srun -p gpu -G 1 nvidia-smi\n+-----------------------------------------------------------------------------+\n| NVIDIA-SMI 387.26                 Driver Version: 387.26                    |\n|-------------------------------+----------------------+----------------------+\n| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n|===============================+======================+======================|\n|   0  Tesla P40           On   | 00000000:03:00.0 Off |                    0 |\n| N/A   22C    P8    10W / 250W |      0MiB / 22912MiB |      0%   E. Process |\n+-------------------------------+----------------------+----------------------+\n

    With the --gpu_cmode option, the scheduler will set the GPU compute mode to the desired value before execution:

    $ srun -p gpu -G 1 --gpu_cmode=shared nvidia-smi\n+-----------------------------------------------------------------------------+\n| NVIDIA-SMI 387.26                 Driver Version: 387.26                    |\n|-------------------------------+----------------------+----------------------+\n| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |\n| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |\n|===============================+======================+======================|\n|   0  Tesla P40           On   | 00000000:03:00.0 Off |                    0 |\n| N/A   22C    P8    10W / 250W |      0MiB / 22912MiB |      0%      Default |\n+-------------------------------+----------------------+----------------------+\n

    Tip

    \"Default\" is the name that the NVIDIA System Management Interface (nvidia-smi) uses to describe the mode where a GPU can be shared between different processes. It does not represent the default GPU compute mode on Sherlock, which is \"Exclusive Process\".

    ","location":"docs/user-guide/gpu/#gpu-compute-modes"},{"title":"Advanced options","text":"

    A number of submission options are available when submitting GPU jobs, to request specific resource mapping or task binding options.

    Here are some examples to allocate a set of resources as a function of the number of requested GPUs:

    • --cpus-per-gpu: requests a number of CPUs per allocated GPU.

      For instance, the following options will allocate 2 GPUs and 4 CPUs:

      $ salloc -p gpu -G 2 --cpus-per-gpu=2\n
    • --gpus-per-node: requests a number of GPUs per node,

    • --gpus-per-task: requests a number of GPUs per spawned task,
    • --mem-per-gpu: allocates (host) memory per allocated GPU.

    Other options can help set particular GPU properties (topology, frequency...):

    • --gpu-bind: specify task/GPU binding mode.

      By default, every spawned task can access every GPU allocated to the job. This option can help make sure that tasks are bound to the closest GPU, for better performance.

    • --gpu-freq: specify GPU and memory frequency. For instance:

      $ srun -p test -G 1 --gpu-freq=highm1,verbose /bin/true\nGpuFreq=memory_freq:2600,graphics_freq:758\n

    Those options are all available to the srun/sbatch/salloc commands, and more details about each of them can be found in the Slurm documentation.

    Conflicting options

    Given the multitude of options, it's very easy to submit a job with conflicting options. In most cases the job will be rejected.

    For instance:

    $ sbatch --gpus-per-task=1 --cpus-per-gpu=2  --cpus-per-task=1 ...\n
    Here, the first two options implicitly set cpus-per-task to 2, while the third option explicitly sets cpus-per-task to 1. So the job's requirements are conflicting and can't be satisfied.

    ","location":"docs/user-guide/gpu/#advanced-options"},{"title":"Environment and diagnostic tools","text":"","location":"docs/user-guide/gpu/#environment-and-diagnostic-tools"},{"title":"nvtop","text":"

    GPU usage information can be shown with the nvtop tool. nvtop is available as a module, which can be loaded like this:

    $ ml load system nvtop\n

    nvtop provides an htop-like interactive view of GPU utilization. Users can monitor, estimate and fine tune their GPU resource requests with this tool. Percent GPU and memory utilization is shown as a user's GPU code is running.

    1. See node_feat -h for more details.\u00a0\u21a9

    2. The lists of values provided in the table are non exhaustive.\u00a0\u21a9

    3. The list of available GPU compute modes and relevant details are available in the CUDA Toolkit Documentation \u21a9

    ","location":"docs/user-guide/gpu/#nvtop"},{"title":"OnDemand","text":"","location":"docs/user-guide/ondemand/"},{"title":"Introduction","text":"

    The Sherlock OnDemand interface allows you to conduct your research on Sherlock through a web browser. You can manage files (create, edit and move them), submit and monitor your jobs, see their output, check the status of the job queue, run a Jupyter notebook and much more, without logging in to Sherlock the traditional way, via a SSH terminal connection.

    Quote

    In neuroimaging there are a number of software pipelines that output HTML reports heavy on images files. Sherlock OnDemand allows users to check those as they appear on their $SCRATCH folder, for quick quality control, instead of having to mount remote filesystems, download data locally or move to any other storage location. Since the data itself is already quite big and costly to move, OnDemand is extremely helpful for fast assessment.

    -- Carolina Ramirez, Williams PANLab

    ","location":"docs/user-guide/ondemand/#introduction"},{"title":"More documentation","text":"

    Open OnDemand was created by the Ohio Supercomputer Center.

    The following documentation is specifically intended for using OnDemand on Sherlock. For more complete documentation about OnDemand in general, please see the extensive documentation for OnDemand created by OSC, including many video tutorials.

    ","location":"docs/user-guide/ondemand/#more-documentation"},{"title":"Connecting","text":"

    Connection information

    To connect to Sherlock OnDemand, simply point your browser to https://ondemand.sherlock.stanford.edu

    Sherlock OnDemand requires the same level of authentication as connecting to Sherlock over SSH. You will be prompted for your SUNet ID and password, and will go through the regular two-step authentication process.

    The Sherlock OnDemand Dashboard will then open. From there, you can use the menus across the top of the page to manage files, get a shell on Sherlock, submit jobs or open interactive applications such as Jupyter Notebooks or RStudio sessions.

    To end your Sherlock OnDemand session, click on the \"Log Out\" link at the top right of the Dashboard window and close your browser.

    ","location":"docs/user-guide/ondemand/#connecting"},{"title":"Getting a shell","text":"

    You can get shell access to Sherlock by choosing Clusters > Sherlock Shell Access from the top menu in the OnDemand Dashboard.

    In the window that will open, you'll be logged in to one of Sherlock's login nodes, exactly as if you were using SSH to connect. Except you don't need to install any SSH client on your local machine, configure Kerberos or deal with your SSH client configuration to avoid endless two-factor prompts. How cool is that?

    ","location":"docs/user-guide/ondemand/#getting-a-shell"},{"title":"Managing files","text":"

    To create, edit or move files, click on the Files menu from the Dashboard page. A drop-down menu will appear, listing your most common storage locations on Sherlock: $HOME, $GROUP_HOME, $SCRATCH, $GROUP_SCRATCH, and all Oak storage you have access to, including your main $OAK1. Any rclone remotes you create on Sherlock to connect to cloud storage will appear here as well.

    Choosing one of the file spaces opens the File Explorer in a new browser tab. The files in the selected directory are listed.

    There are two sets of buttons in the File Explorer.

    • Under the three vertical dots menu next to each filename: Those buttons allow you to View, Edit, Rename, Download, or Delete a file.

    • At the top of the window, on the right side:

      Button Function Open in Terminal Open a terminal window on Sherlock in a new browser tab Refresh Refresh the list of directory contents New File Create a new, empty file New Directory Create a new sub-directory Upload Copy a file from your local machine to Sherlock Download Download selected files to your local machine Copy/Move Copy or move selected files (after moving to a different directory) Delete Delete selected files Change directory Change your current working directory Copy path Copy the current working directory path to your clipboard Show Dotfiles Toggle the display of dotfiles (files starting with a ., which are usually hidden) Show Owner/Mode Toggle the display of owner and permission settings
    ","location":"docs/user-guide/ondemand/#managing-files"},{"title":"Creating and editing jobs","text":"

    You can create new job scripts, edit existing scripts, and submit them to the scheduler through the Sherlock OnDemand interface.

    From the top menus in the Dashboard, choose Jobs > Job Composer. A Job Composer window will open. There are two tabs at the top: Jobs and Templates.

    In the Jobs tab, you'll find a list of the jobs you've submitted through OnDemand. The Templates tab will allow you to define your own job templates.

    ","location":"docs/user-guide/ondemand/#creating-and-editing-jobs"},{"title":"Creating a new job script","text":"

    To create a new job script, you'll need to follow the steps below.

    ","location":"docs/user-guide/ondemand/#creating-a-new-job-script"},{"title":"Select a template","text":"

    Go to the Jobs tab in the Jobs Composer interface. You'll find a default template there: \"Simple Sequential Job\".

    To create a new job script, click the blue New Job > From Default Template button in the upper left. You'll see a green message at the top of the page indicating: \"Job was successfully created\".

    At the right of the Jobs page, you can see the Job Details, including the location of the script and the script name (by default, main_job.sh). Under that, you will see the contents of the job script in a section named Submit Script.

    ","location":"docs/user-guide/ondemand/#select-a-template"},{"title":"Edit the job script","text":"

    You'll need to edit the job script, so it contains the commands and workflow that you want to submit to the scheduler.

    If you need more resources than the defaults, you must include options to change them in the job script. For more details, see the Running jobs section.

    You can edit the script in several ways:

    • click the blue Edit Files button at the top of the Jobs tab in the Jobs Composer window,
    • in the Jobs tab in the Jobs Composer window, find the Submit Script section at the bottom right. Click the blue Open Editor button.

    After you save the file, the editor window remains open, but if you return to the Jobs Composer window, you will see that the content of your script has changed.

    ","location":"docs/user-guide/ondemand/#edit-the-job-script"},{"title":"Edit the job options","text":"

    In the Jobs tab in the Jobs Composer window, click the blue Job Options button. The options for the selected job, such as its name, the job script to run, and the account it runs under, are displayed and can be edited. Click Save or Cancel to return to the job listing.

    ","location":"docs/user-guide/ondemand/#edit-the-job-options"},{"title":"Submitting jobs","text":"

    To submit a job, select it in the Jobs tab in the Jobs Composer page. Click the green Submit button to submit the selected job. A message at the top of the window shows whether the job submission was successful or not. If it is not, you can edit the job script or options and resubmit. When the job is submitted successfully, the status of the job in the Jobs Composer window will change to Queued or Running. When the job completes, the status will change to Completed.

    ","location":"docs/user-guide/ondemand/#submitting-jobs"},{"title":"Monitoring jobs","text":"

    From the Dashboard page, the Jobs > Active Jobs top-level menu will bring you to a live view of Sherlock's scheduler queue. You'll be able to see all the jobs currently in queue, including running and pending jobs, as well as some details about individual jobs.

    At the bottom of the detailed view, you'll find two buttons that will bring you to the directory where that job's files are located, either in the File Manager or in a Shell session.

    ","location":"docs/user-guide/ondemand/#monitoring-jobs"},{"title":"Interactive applications","text":"

    One of the main features of Sherlock OnDemand is the ability to run interactive applications directly from the web interface, without leaving your web browser.

    ","location":"docs/user-guide/ondemand/#interactive-applications"},{"title":"Jupyter Notebooks","text":"

    You can run Jupyter Notebooks (using Python, Julia or other languages) through Sherlock OnDemand.

    Some preliminary setup may be required

    Before running your first Jupyter Notebook with IJulia, you'll need to run the following steps (this only needs to be done once):

    $ ml julia\n$ julia\njulia> using Pkg;\njulia> Pkg.add(\"IJulia\")\n

    When you see the message that IJulia has been installed, you can end your interactive session.

    To start a Jupyter session from Sherlock OnDemand:

    1. Select Interactive Apps > Jupyter Notebook from the top menu in the Dashboard page.

    2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your notebook starts.

    1. Click the blue Launch button to start your JupyterHub session. You may have to wait in the queue for resources to become available for you.

    2. When your session starts, you can click on the blue Connect to Jupyter button to open your Jupyter Notebook. The Dashboard window will display information about your Jupyter session, including the name of the compute node it is running on, when it started, and how much time remains.

    3. In your new Jupyter Notebook tab, you'll see 3 tabs: Files, Running and Clusters.

    By default, you are in the Files tab; that displays the contents of your $HOME directory on Sherlock. You can navigate through your files there.

    Under the Running tab, you will see the list of all the notebooks or terminal sessions that you have currently running.

    1. You can now start a Jupyter Notebook:

      1. To open an existing Jupyter Notebook, which is already stored on Sherlock, navigate to its location in the Files tab and click on its name. A new window running the notebook will open.
      2. To create a new Jupyter Notebook, click on the New button at the top right of the file listing, and choose the kernel of your choice from the drop down.

    To terminate your Jupyter Notebook session, go back to the Dashboard, and click on My Interactive Sessions in the top menu. This will bring you to a page listing all your currently active interactive sessions. Identify the one you'd like to terminate and click on the red Cancel button.

    ","location":"docs/user-guide/ondemand/#jupyter-notebooks"},{"title":"JupyterLab","text":"

    To run JupyterLab via Sherlock OnDemand:

    1. Select Interactive Apps > JupyterLab from the top menu in the Dashboard page.

    2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

    3. Click the blue Launch button to start your JupyterLab session. You may have to wait in the queue for resources to become available.

    4. When your session starts, click the blue Connect to JupyterLab button. A new window opens with the JupyterLab interface.

    5. The first time you connect to JupyterLab via Sherlock OnDemand, you'll see 2 tabs: Files and Launcher.

    The Files tab displays the contents of your $HOME directory on Sherlock. You can navigate through your files there.

    In the Launcher tab, you will have the option to create a new Jupyter Notebook or a new Console session by clicking the tile showing the kernel of your choice. You can also open the Terminal or a text editor for a variety of file types by clicking the corresponding tile.

    To create a new kernel for IJulia:

    1. In the Launcher, click the Terminal tile in the \"Other\" section.

    2. In the Terminal, run the following commands:

      $ ml julia\n$ julia\njulia> using Pkg;\njulia> Pkg.add(\"IJulia\")\n
    3. Open a new Launcher tab by clicking the + sign next to your open Terminal tab. Julia will now be listed in the \"Notebook\" and \"Console\" sections as an available kernel.

    To create a custom kernel for a virtual environment using Python 3.x:

    1. In a shell session, activate your environment and run the following:

      $ pip3 install ipykernel\n$ python3 -m ipykernel install --user --name env --display-name \"My Env\"\n

      This will create a kernel for the environment env. It will appear as My Env in the JupyterLab Launcher.

      Creating a custom kernel for a Python 2.x environment

      When working with a Python 2.x environment, use the python/pip commands instead.

    2. The custom kernel will now be listed as an option in the \"Notebook\" and \"Console\" sections in the JupyterLab Launcher. To start a Jupyter Notebook using your virtual environment, click on the tile for that kernel.

      Creating a custom kernel for a conda environment

      In order to use a kernel created from a conda environment, you must unload the python and py-jupyterlab modules from your JupyterLab session. This can be done using the JupyterLab Lmod extension. To use the Lmod extension, select the bottom tab in the left side menu of your JupyterLab window. You may also need to restart the kernel for your notebook or console.

    ","location":"docs/user-guide/ondemand/#jupyterlab"},{"title":"MATLAB","text":"

    To run MATLAB via Sherlock OnDemand:

    1. Select Interactive Apps > MATLAB from the top menu in the Dashboard page.

    2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

    3. Click the blue Launch button to start your MATLAB session. You may have to wait in the queue for resources to become available.

    4. When your session starts, click the blue Connect to MATLAB button. A new window opens with the MATLAB interface.

    ","location":"docs/user-guide/ondemand/#matlab"},{"title":"RStudio","text":"

    To run RStudio via Sherlock OnDemand:

    1. Select Interactive Apps > RStudio Server from the top menu in the Dashboard page.

    2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

    3. Click the blue Launch button to start your RStudio session. You may have to wait in the queue for resources to become available.

    4. When your session starts, click the blue Connect to RStudio Server button. A new window opens with the RStudio interface.

    Installing packages in RStudio

    You may encounter errors while installing R packages within RStudio. First try installing R packages in a shell session on the Sherlock command line. See our R packages documentation for more information.

    ","location":"docs/user-guide/ondemand/#rstudio"},{"title":"TensorBoard","text":"

    To run TensorBoard via Sherlock OnDemand:

    1. Select Interactive Apps > TensorBoard from the top menu in the Dashboard page.

    2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

    3. Click the blue Launch button to start your TensorBoard session. You may have to wait in the queue for resources to become available.

    4. When your session starts, click the blue Connect to TensorBoard button. A new window opens with the TensorBoard interface.

    ","location":"docs/user-guide/ondemand/#tensorboard"},{"title":"VS Code","text":"

    You can use VS Code on Sherlock through the code-server interactive app.

    Using your local VS Code with remote SSH

    Connecting to Sherlock from VS Code on your local machine is not supported at this time due to a known issue with the closed-source \"Remote SSH\" extension.

    To start a VS Code session via Sherlock OnDemand:

    1. Select Interactive Apps > code-server from the top menu in the Dashboard page.

    2. In the screen that opens, specify the different parameters for your job (time limit, number of nodes, CPUs, partition to use, etc.). You can also choose to be notified by email when your session starts.

    3. Click the blue Launch button to start your code-server session. You may have to wait in the queue for resources to become available.

    4. When your session starts, click the blue Connect to code-server button. A new window opens with the code-server interface.

    ","location":"docs/user-guide/ondemand/#vs-code"},{"title":"Support","text":"

    If you are experiencing issues with Sherlock or your interactive session, you can contact us directly from Sherlock OnDemand.

    To submit a ticket about Sherlock or Sherlock OnDemand in general:

    1. Select Help -> Submit Support Ticket from the top menu in the Dashboard page.

    2. In the screen that opens, complete the Support Ticket form. When applicable, please provide:

      • the full path to any files involved in your question or problem,

      • the command(s) you ran, and/or the job submission script(s) you used,

      • the exact, entire error message (or trace) you received.

    3. Click the blue Submit support ticket form. Research Computing support will respond to you as soon as we are able.

    To submit a ticket about your current or recent interactive session:

    1. Select My Interactive Sessions from the top menu in the Dashboard page.

    2. In the screen that opens, find the card for the session you need help with. Active sessions will have a green header, and past sessions will have a gray header. Click that card's Submit support ticket link to open the Support Ticket form.

    3. Complete the Support Ticket form. When applicable, please provide:

      • the full path to any files involved in your question or problem,

      • the command(s) you ran, and/or the job submission script(s) you used,

      • the exact, entire error message (or trace) you received.

    4. Click the blue Submit support ticket form. Research Computing support will respond to you as soon as we are able.

    1. if you have access to the Oak storage system.\u00a0\u21a9

    ","location":"docs/user-guide/ondemand/#support"},{"title":"Running jobs","text":"","location":"docs/user-guide/running-jobs/","tags":["slurm"]},{"title":"Login nodes","text":"

    Login nodes are not for computing

    Login nodes are shared among many users and therefore must not be used to run computationally intensive tasks. Those should be submitted to the scheduler which will dispatch them on compute nodes.

    The key principle of a shared computing environment is that resources are shared among users and must be scheduled. It is mandatory to schedule work by submitting jobs to the scheduler on Sherlock. And since login nodes are a shared resource, they must not be used to execute computing tasks.

    Acceptable uses of login nodes include:

    • lightweight file transfers,
    • script and configuration file editing,
    • job submission and monitoring.

    Resource limits are enforced

    To minimize disruption and ensure a comfortable working environment for users, resource limits are enforced on login nodes, and processes started there will automatically be terminated if their resource usage (including CPU time, memory and run time) exceeds those limits.

    ","location":"docs/user-guide/running-jobs/#login-nodes","tags":["slurm"]},{"title":"Slurm commands","text":"

    Slurm allows requesting resources and submitting jobs in a variety of ways. The main Slurm commands to submit jobs are listed in the table below:

    Command Description Behavior salloc Requests resources and allocates them to a job Starts a new shell, but does not execute anything srun Requests resources and runs a command on the allocated compute node(s) Blocking command: will not return until the job ends sbatch Requests resources and runs a script on the allocated compute node(s) Asynchronous command: will return as soon as the job is submitted","location":"docs/user-guide/running-jobs/#slurm-commands","tags":["slurm"]},{"title":"Interactive jobs","text":"","location":"docs/user-guide/running-jobs/#interactive-jobs","tags":["slurm"]},{"title":"Dedicated nodes","text":"

    Interactive jobs allow users to log in to a compute node to run commands interactively on the command line. They could be an integral part of an interactive programming and debugging workflow. The simplest way to establish an interactive session on Sherlock is to use the sh_dev command:

    $ sh_dev\n

    This will open a login shell using one core and 4 GB of memory on one node for one hour. The sh_dev sessions run on dedicated compute nodes. This ensures minimal wait times when you need to access a node to test scripts, debug code or do any kind of interactive work.

    sh_dev also provides X11 forwarding via the submission host (typically the login node you're connected to) and can thus be used to run GUI applications.

    ","location":"docs/user-guide/running-jobs/#dedicated-nodes","tags":["slurm"]},{"title":"Compute nodes","text":"

    If you need more resources1, you can pass options to sh_dev, to request more CPU cores, more nodes, or even run in a different partition. sh_dev -h will provide more information:

    $ sh_dev -h\nsh_dev: start an interactive shell on a compute node.\n\nUsage: sh_dev [OPTIONS]\n    Optional arguments:\n        -c      number of CPU cores to request (OpenMP/pthreads, default: 1)\n        -n      number of tasks to request (MPI ranks, default: 1)\n        -N      number of nodes to request (default: 1)\n        -m      memory amount to request (default: 4GB)\n        -p      partition to run the job in (default: dev)\n        -t      time limit (default: 01:00:00)\n        -r      allocate resources from the named reservation (default: none)\n        -J      job name (default: sh_dev)\n        -q      quality of service to request for the job (default: normal)\n\n    Note: the default partition only allows for limited amount of resources.\n    If you need more, your job will be rejected unless you specify an\n    alternative partition with -p.\n

    Another way to get an interactive session on a compute node is to use srun to execute a shell through the scheduler. For instance, to start a bash session on a compute node, with the default resource requirements (one core for 2 hours), you can run:

    $ srun --pty bash\n

    The main advantage of this approach is that it will allow you to specify the whole range of submission options that sh_dev may not support.

    Finally, if you prefer to submit an existing job script or other executable as an interactive job, you can use the salloc command:

    $ salloc script.sh\n

    If you don't provide a command to execute, salloc will start a Slurm job and allocate resources for it, but it will not automatically connect you to the allocated node(s). It will only start a new shell on the same node you launched salloc from, and set up the appropriate $SLURM_* environment variables. So you will typically need to look at them to see what nodes have been assigned to your job. For instance:

    $ salloc\nsalloc: Granted job allocation 655914\n$ echo $SLURM_NODELIST\nsh02-01n55\n$ ssh sh02-01n55\n[...]\nsh02-01n55 ~ $\n
    ","location":"docs/user-guide/running-jobs/#compute-nodes","tags":["slurm"]},{"title":"Connecting to nodes","text":"

    Login to compute nodes

    Users are not allowed to login to compute nodes unless they have a job running there.

    If you SSH to a compute node without any active job allocation, you'll be greeted by the following message:

    $ ssh sh02-01n01\nAccess denied by pam_slurm_adopt: you have no active jobs on this node\nConnection closed\n$\n

    Once you have a job running on a node, you can SSH directly to it and run additional processes2, or observe how your application behaves, debug issues, and so on.

    The salloc command supports the same parameters as sbatch, and can override any default configuration. Note that any #SBATCH directive in your job script will not be interpreted by salloc when it is executed in this way. You must specify all arguments directly on the command line for them to be taken into account.

    ","location":"docs/user-guide/running-jobs/#connecting-to-nodes","tags":["slurm"]},{"title":"Batch jobs","text":"

    It's easy to schedule batch jobs on Sherlock. A job is simply an instance of your program, for example your R, Python or Matlab script that is submitted to and executed by the scheduler (Slurm). When you submit a job with the sbatch command it's called a batch job and it will either run immediately or will pend (wait) in the queue.

    The length of time a job will pend is determined by several factors; how many other jobs are in the queue ahead of your job and how many resources your job is requesting are the most important factors. One key principle when requesting resources is to always try to request as few resources as you need to get your job done. This will ensure your job pends in the queue for as little time as necessary. To get a rough idea of what resources are needed, you can profile your code/jobs in an sh_dev session in real-time with htop, nvtop, sacct etc. The basic concept is to tell the scheduler what resources your job needs and how long it should run. These resources are:

    CPUs: How many CPUs the program you are calling in the sbatch script needs. Unless it can utilize multiple CPUs at once, you should request a single CPU. Check your code's documentation or try running in an interactive session with sh_dev and run htop if you are unsure.

    GPUs: If your code is GPU enabled, how many GPUs does your code need? Use the diagnostic tool nvtop to see if your code is capable of running on multiple GPUs and how much GPU memory it's using in real-time.

    memory (RAM): How much memory your job will consume. Some things to consider, will it load a large file or matrix into memory? Does it consume a lot of memory on your laptop? Often the default memory is sufficient for many jobs.

    time: How long will it take for your code to run to completion?

    partition: What set of compute nodes on Sherlock will you run on, normal, gpu, owners, bigmem? Use the sh_part command to see what partitions you are allowed to run on. The default partition on Sherlock is the normal partition.

    Next, you tell the scheduler what your job should do: load modules and run your code. Note that any logic you can code into a bash script with the bash scripting language can also be coded into an sbatch script.

    This example job will run the Python script mycode.py for 10 minutes on the normal partition using 1 CPU and 8 GB of memory. To aid in debugging we are naming this job \"test_job\" and appending the Job ID (%j) to the two output files that Slurm creates when a job is run. The output files are written to the directory from which you launched your job; you can also specify a different path. One file will contain any errors and the other will contain non-error output. Look in these 2 files ending in .err and .out for useful debugging information and error output.

    Because it's a Python 3 script that uses some Numpy code, we need to load the python/3.6.1 and the py-numpy/1.19.2_py36 modules. The Python script is then called just as you would on the command line at the end of the sbatch script:

    sbatch script:

    #!/usr/bin/bash\n#SBATCH --job-name=test_job\n#SBATCH --output=test_job.%j.out\n#SBATCH --error=test_job.%j.err\n#SBATCH --time=10:00\n#SBATCH -p normal\n#SBATCH -c 1\n#SBATCH --mem=8GB\nmodule load python/3.6.1\nmodule load py-numpy/1.19.2_py36\npython3 mycode.py\n
    Create and edit the sbatch script with a text editor like vim/nano or the OnDemand file manager. Then save the file, in this example we call it \"test.sbatch\".

    Submit to the scheduler with the sbatch command:

    $sbatch test.sbatch\n
    Monitor your job and job ID in the queue with the squeue command:

    $squeue -u $USER\n   JOBID     PARTITION     NAME     USER    ST       TIME  NODES  NODELIST(REASON)\n   44915821    normal    test_job  <userID>  PD       0:00      1 (Priority)\n

    Notice that the job's state (ST) is pending (PD).

    Once the job starts to run that will change to R:

    $squeue -u $USER\n    JOBID     PARTITION     NAME     USER     ST      TIME  NODES   NODELIST(REASON)\n    44915854    normal test_job  <userID>     R      0:10     1     sh02-01n49\n

    Here you can see it has been running (R) on the compute node sh02-01n49 for 10 seconds. While your job is running you have ssh access to that node and can run diagnostic tools such as htop and nvtop in order to monitor your job's memory and CPU/GPU utilization in real-time. You can also manage this job based on the JobID assigned to it (44915854). For example the job can be cancelled with the scancel command.

    ","location":"docs/user-guide/running-jobs/#batch-jobs","tags":["slurm"]},{"title":"Resource requests","text":"

    To get a better idea of the amount of resources your job will need, you can use the ruse command, available as a module:

    $ module load system ruse\n

    ruse is a command line tool developed by Jan Moren to measure a process' resource usage. It periodically measures the resource use of a process and its subprocesses, and can help you find out how much resource to allocate to your job. It will determine the actual memory, execution time and cores that individual programs or MPI applications need to request in their job submission options.

    ruse periodically samples the process and its subprocesses and keeps track of the CPU, time and maximum memory use. It also optionally records the sampled values over time. The purpose of ruse is not to profile processes in detail, but to follow jobs that run for many minutes, hours or days, with no performance impact and without changing the measured application in any way.

    You'll find complete documentation and details about ruse's usage on the project webpage, but here are a few useful examples.

    ","location":"docs/user-guide/running-jobs/#resource-requests","tags":["slurm"]},{"title":"Sizing a job","text":"

    In its simplest form, ruse can help discover how much resources a new script or application will need. For instance, you can start a sizing session on a compute node with an overestimated amount of resources, and start your application like this:

    $ ruse ./myapp\n

    This will generate a <myapp>-<pid>/ruse output file in the current directory, looking like this:

    Time:           02:55:47\nMemory:         7.4 GB\nCores:          4\nTotal_procs:    3\nActive_procs:   2\nProc(%): 99.9  99.9\n

    It shows that myapp:

    • ran for almost 3 hours
    • used a little less than 8 GB of memory,
    • had 4 cores available,
    • spawned 3 processes, among which at most 2 were active at the same time,
    • that both active processes each used 99.9% of a CPU core

    This information could be useful in tailoring the job resource requirements to its exact needs, making sure that the job won't be killed for exceeding one of its resource limits, and that the job won't have to wait too long in queue for resources that it won't use. The corresponding job request could look like this:

    #SBATCH --time 3:00:00\n#SBATCH --mem 8GB\n#SBATCH --cpus-per-task 2\n
    ","location":"docs/user-guide/running-jobs/#sizing-a-job","tags":["slurm"]},{"title":"Verifying a job's usage","text":"

    It's also important to verify that applications, especially parallel ones, stay in the confines of the resources they've requested. For instance, a number of parallel computing libraries will make the assumption that they can use all the resources on the host, will automatically determine the number of physical CPU cores present on the compute node, and start as many processes. This could be a significant issue if the job requested fewer CPUs, as more processes will be constrained on fewer CPU cores, which will result in node overload and degraded performance for the application.

    To avoid this, you can start your application with ruse and report usage for each time step specified with -t. You can also request the reports to be displayed directly on stdout rather than stored in a file.

    For instance, this will report usage every 10 seconds:

    $ ruse -s -t10 --stdout ./myapp\n   time         mem   processes  process usage\n  (secs)        (MB)  tot  actv  (sorted, %CPU)\n     10        57.5    17    16   33  33  33  25  25  25  25  25  25  25  25  20  20  20  20  20\n     20        57.5    17    16   33  33  33  25  25  25  25  25  25  25  25  20  20  20  20  20\n     30        57.5    17    16   33  33  33  25  25  25  25  25  25  25  25  20  20  20  20  20\n\nTime:           00:00:30\nMemory:         57.5 MB\nCores:          4\nTotal_procs:   17\nActive_procs:  16\nProc(%): 33.3  33.3  33.2  25.0  25.0  25.0  25.0  25.0  25.0  24.9  24.9  20.0  20.0  20.0  20.0  19.9\n

    Here, we can see that despite having been allocated 4 CPUs, the application started 17 threads, 16 of which were active running intensive computations, with the unfortunate consequence that each process could only use a fraction of a CPU.

    In that case, to ensure optimal performance and system operation, it's important to modify the application parameters to make sure that it doesn't start more computing processes than the number of requested CPU cores.

    ","location":"docs/user-guide/running-jobs/#verifying-a-jobs-usage","tags":["slurm"]},{"title":"Available resources","text":"

    Whether you are submitting a batch job or an interactive job, it's important to know the resources that are available to you. For this reason, we provide sh_part, a command-line tool to help answer questions such as:

    • which partitions do I have access to?
    • how many jobs are running on them?
    • how many CPUs can I use?
    • where should I submit my jobs?

    sh_part can be executed on any login or compute node to see what partitions are available to you, and its output looks like this:

    $ sh_part\n     QUEUE STA   FREE  TOTAL   FREE  TOTAL RESORC  OTHER MAXJOBTIME    CORES       NODE   GRES\n PARTITION TUS  CORES  CORES  NODES  NODES PENDNG PENDNG  DAY-HR:MN    /NODE     MEM-GB (COUNT)\n    normal   *    153   1792      0     84    23k    127    7-00:00    20-24    128-191 -\n    bigmem         29     88      0      2      0      8    1-00:00    32-56   512-3072 -\n       dev         31     40      0      2      0      0    0-02:00       20        128 -\n       gpu         47    172      0      8    116      1    7-00:00    20-24    191-256 gpu:4(S:0-1)(2),gpu:4(S:0)(6)\n

    The above example shows four possible partitions where jobs can be submitted: normal, bigmem, dev, or gpu. It also provides additional information such as the maximum amount of time allowed in each partition (MAXJOBTIME), the number of other jobs already in queue, along with the ranges of memory available on nodes in each partition.

    • in the QUEUE PARTITION column, the * character indicates the default partition.
    • the RESOURCE PENDING column shows the core count of pending jobs that are waiting on resources,
    • the OTHER PENDING column lists core counts for jobs that are pending for other reasons, such as licenses, user, group or any other limit,
    • the GRES column shows the number and type of Generic RESources available in that partition (typically, GPUs), which CPU socket they're available from, and the number of nodes that feature that specific GRES combination. So for instance, in the output above, gpu:4(S:0-1)(2) means that the gpu partition features 2 nodes with 4 GPUs each, and that those GPUs are accessible from both CPU sockets (S:0-1).
    ","location":"docs/user-guide/running-jobs/#available-resources","tags":["slurm"]},{"title":"Recurring jobs","text":"

    Warning

    Cron tasks are not supported on Sherlock.

    Users are not allowed to create cron jobs on Sherlock, for a variety of reasons:

    • resource limits cannot be easily enforced in cron jobs, meaning that a single user can end up monopolizing all the resources of a login node,
    • no amount of resources can be guaranteed when executing a cron job, leading to unreliable runtime and performance,
    • user cron jobs have the potential of bringing down whole nodes by creating fork bombs, if they're not carefully crafted and tested,
    • compute and login nodes could be redeployed at any time, meaning that cron jobs scheduled there could go away without the user being notified, and cause all sorts of unexpected results,
    • cron jobs could be mistakenly scheduled on several nodes and run multiple times, which could result in corrupted files.

    As an alternative, if you need to run recurring tasks at regular intervals, we recommend the following approach: by using the --begin job submission option, and creating a job that resubmits itself once it's done, you can virtually emulate the behavior and benefits of a cron job, without its disadvantages: your task will be scheduled on a compute node, and use all of the resources it requested, without being impacted by anything else.

    Depending on your recurring job's specificities, where you submit it and the state of the cluster at the time of execution, the starting time of that task may not be guaranteed and result in a delay in execution, as it will be scheduled by Slurm like any other jobs. Typical recurring jobs, such as file synchronization, database updates or backup tasks don't require strict starting times, though, so most users find this an acceptable trade-off.

    The table below summarizes the advantages and drawbacks of each approach:

    Cron tasks Recurring jobs Authorized on Sherlock Dedicated resources for the task Persistent across node redeployments Unique, controlled execution Precise schedule","location":"docs/user-guide/running-jobs/#recurring-jobs","tags":["slurm"]},{"title":"Recurrent job example","text":"

    The script below presents an example of such a recurrent job, that would emulate a cron task. It will append a timestamped line to a cron.log file in your $HOME directory and run every 7 days.

    cron.sbatch
    #!/bin/bash\n#SBATCH --job-name=cron\n#SBATCH --begin=now+7days\n#SBATCH --dependency=singleton\n#SBATCH --time=00:02:00\n#SBATCH --mail-type=FAIL\n\n\n## Insert the command to run below. Here, we're just storing the date in a\n## cron.log file\ndate -R >> $HOME/cron.log\n\n## Resubmit the job for the next execution\nsbatch $0\n

    If the job payload (here the date command) fails for some reason and generates an error, the job will not be resubmitted, and the user will be notified by email.

    We encourage users to get familiar with the submission options used in this script by giving a look at the sbatch man page, but some details are given below:

    Submission\u00a0option\u00a0or\u00a0command Explanation --job-name=cron makes it easy to identify the job, is used by the --dependency=singleton option to identify identical jobs, and will allow cancelling the job by name (because its jobid will change each time it's submitted) --begin=now+7days will instruct the scheduler to not even consider the job for scheduling before 7 days after it's been submitted --dependency=singleton will make sure that only one cron job runs at any given time --time=00:02:00 runtime limit for the job (here 2 minutes). You'll need to adjust the value depending on the task you need to run (shorter runtime requests usually result in the job running closer to the clock mark) --mail-type=FAIL will send an email notification to the user if the job ever fails sbatch $0 will resubmit the job script by calling its own name ($0) after successful execution

    You can save the script as cron.sbatch or any other name, and submit it with:

    $ sbatch cron.sbatch\n

    It will start running for the first time 7 days after you submit it, and it will continue to run until you cancel it with the following command (using the job name, as defined by the --job-name option):

    $ scancel -n cron\n
    ","location":"docs/user-guide/running-jobs/#recurrent-job-example","tags":["slurm"]},{"title":"Persistent jobs","text":"

    Recurring jobs described above are a good way to emulate cron jobs on Sherlock, but don't fit all needs, especially when a persistent service is required.

    For instance, workflows that require a persistent database connection would benefit from an ever-running database server instance. We don't provide persistent database services on Sherlock, but instructions and examples on how to submit database server jobs are provided for MariaDB or PostgreSQL.

    In case those database instances need to run pretty much continuously (within the limits of available resources and runtime maximums), the previous approach described in the recurring jobs section could fall a bit short. Recurring jobs are mainly designed for jobs that have a fixed execution time and don't reach their time limit, but need to run at given intervals (like synchronization or backup jobs, for instance).

    Because a database server process will never end within the job, and will continue until the job reaches its time limit, the last resubmission command (sbatch $0) will actually never be executed, and the job won't be resubmitted.

    To work around this, a possible approach is to catch a specific signal sent by the scheduler at a predefined time, before the time limit is reached, and then re-queue the job. This is easily done with the Bash trap command, which can be instructed to re-submit a job when it receives the SIGUSR1 signal.

    Automatically resubmitting a job doesn't make it immediately runnable

    Jobs that are automatically re-submitted using this technique won't restart right away: they will get back in queue and stay pending until their execution conditions (priority, resources, usage limits...) are satisfied.

    ","location":"docs/user-guide/running-jobs/#persistent-jobs","tags":["slurm"]},{"title":"Persistent job example","text":"

    Here's the recurring job example from above, modified to:

    1. instruct the scheduler to send a SIGUSR1 signal to the job 90 seconds3 before reaching its time limit (with the #SBATCH --signal option),
    2. re-submit itself upon receiving that SIGUSR1 signal (with the trap command)
    persistent.sbatch
    #!/bin/bash\n#\n#SBATCH --job-name=persistent\n#SBATCH --dependency=singleton\n#SBATCH --time=00:05:00\n#SBATCH --signal=B:SIGUSR1@90\n\n# catch the SIGUSR1 signal\n_resubmit() {\n    ## Resubmit the job for the next execution\n    echo \"$(date): job $SLURM_JOBID received SIGUSR1 at $(date), re-submitting\"\n    sbatch $0\n}\ntrap _resubmit SIGUSR1\n\n## Insert the command to run below. Here, we're just outputting the date every\n## 60 seconds, forever\n\necho \"$(date): job $SLURM_JOBID starting on $SLURM_NODELIST\"\nwhile true; do\n    echo \"$(date): normal execution\"\n    sleep 60\ndone\n

    Long running processes need to run in the background

    If your job's actual payload (the application or command you want to run) is running continuously for the whole duration of the job, it needs to be executed in the background, so the trap can be processed.

    To run your application in the background, just add a & at the end of the command and then add a wait statement at the end of the script, to make the shell wait until the end of the job.

    For instance, if you were to run a PostgreSQL database server, the while true ... done loop in the previous example could be replaced by something like this:

    postgres -i -D $DB_DIR &\nwait\n
    ","location":"docs/user-guide/running-jobs/#persistent-job-example","tags":["slurm"]},{"title":"Persistent $JOBID","text":"

    One potential issue with having a persistent job re-submit itself when it reaches its runtime limit is that it will get a different $JOBID each time it's (re-)submitted.

    This could be particularly challenging when other jobs depend on it, like in the database server scenario, where client jobs would need to start only if the database server is running. This can be achieved with job dependencies, but those dependencies have to be expressed using jobids, so having the server job's id changing at each re-submission will be difficult to handle.

    To avoid this, the re-submission command (sbatch $0) can be replaced by a re-queuing command:

    scontrol requeue $SLURM_JOBID\n

    The benefit of that change is that the job will keep the same $JOBID across all re-submissions. And now, dependencies can be added to other jobs using that specific $JOBID, without having to worry about it changing. And there will be only one $JOBID to track for that database server job.

    The previous example can then be modified as follows:

    persistent.sbatch
    #!/bin/bash\n#SBATCH --job-name=persistent\n#SBATCH --dependency=singleton\n#SBATCH --time=00:05:00\n#SBATCH --signal=B:SIGUSR1@90\n\n# catch the SIGUSR1 signal\n_requeue() {\n    echo \"$(date): job $SLURM_JOBID received SIGUSR1, re-queueing\"\n    scontrol requeue $SLURM_JOBID\n}\ntrap '_requeue' SIGUSR1\n\n## Insert the command to run below. Here, we're just outputting the date every\n## 60 seconds, forever\n\necho \"$(date): job $SLURM_JOBID starting on $SLURM_NODELIST\"\nwhile true; do\n    echo \"$(date): normal execution\"\n    sleep 60\ndone\n

    Submitting that job will produce an output similar to this:

    Mon Nov  5 10:30:59 PST 2018: Job 31182239 starting on sh-06-34\nMon Nov  5 10:30:59 PST 2018: normal execution\nMon Nov  5 10:31:59 PST 2018: normal execution\nMon Nov  5 10:32:59 PST 2018: normal execution\nMon Nov  5 10:33:59 PST 2018: normal execution\nMon Nov  5 10:34:59 PST 2018: Job 31182239 received SIGUSR1, re-queueing\nslurmstepd: error: *** JOB 31182239 ON sh-06-34 CANCELLED AT 2018-11-05T10:35:06 DUE TO JOB REQUEUE ***\nMon Nov  5 10:38:11 PST 2018: Job 31182239 starting on sh-06-34\nMon Nov  5 10:38:11 PST 2018: normal execution\nMon Nov  5 10:39:11 PST 2018: normal execution\n

    The job runs for 5 minutes, then receives the SIGUSR1 signal, is re-queued, restarts for 5 minutes, and so on, until it's properly scancelled.

    1. The dedicated partition that sh_dev uses by default only allows up to 2 cores and 8 GB of memory per user at any given time. So if you need more resources for your interactive session, you may have to specify a different partition. See the Partitions section for more details.\u00a0\u21a9

    2. Please note that your SSH session will be attached to your running job, and that resources used by that interactive shell will count towards your job's resource limits. So if you start a process using large amounts of memory via SSH while your job is running, you may hit the job's memory limits, which will trigger its termination.\u00a0\u21a9

    3. Due to the resolution of event handling by the scheduler, the signal may be sent up to 60 seconds earlier than specified.\u00a0\u21a9

    ","location":"docs/user-guide/running-jobs/#persistent-jobid","tags":["slurm"]},{"title":"Troubleshooting","text":"

    Sherlock is a resource for research, and as such, it is in perpetual evolution, as hardware, applications, libraries, and modules are added, updated, and/or modified on a regular basis. Sometimes issues can appear where none existed before. When you find something missing or a behavior that seems odd, please let us know.

    ","location":"docs/user-guide/troubleshoot/"},{"title":"How to submit a support request","text":"

    Google it first!

    When encountering issues with software, if the misbehavior involves an error message, the first step should always be to look up the error message online. There's a good chance somebody stumbled upon the same hurdles before, and may even provide some fix or workaround.

    One of the most helpful Google searches is your_application sbatch. For example if you're having trouble submitting jobs or allocating resources (CPUs, time, memory) with Cell Ranger, search for cell ranger sbatch to see how others have successfully run your application on a cluster.

    If you're facing issues you can't figure out, we're here to help. Feel free to email us at srcc-support@stanford.edu, but please keep the following points in mind to ensure a timely and relevant response to your support requests.

    Please provide relevant information

    We need to understand the issue you're facing, and in most cases, we need to be able to reproduce it, so it could be diagnosed and addressed. Please make sure to provide enough information so we could help you in the best possible way.

    This typically involves providing the following information:

    • your SUNet ID,
    • some context about your problem (were you submitting a job, copying a file, compiling an application?),
    • if relevant, the full path to the files involved in your question or problem,
    • the name of node where you received the error (usually displayed in your command-line prompt),
    • the command(s) you ran, and/or the job submission script(s) you used,
    • the relevant job ID(s),
    • the exact, entire error message (or trace) you received.

    Error messages are critical

    This is very important. Without proper error messages, there is nothing we can do to help. And \"it doesn't work\" is not a proper error message. Also, please cut and paste the actual text of the output, commands, and error messages rather than screenshots in your tickets. That way it is much easier for us to try to replicate your errors.

    You can avoid email back and forth where we ask for all the relevant details, and thus delay the problem resolution, by providing all this information from the start. This will help us get to your problem immediately.

    ","location":"docs/user-guide/troubleshoot/#how-to-submit-a-support-request"},{"title":"Tags","text":"

    Here is a list of documentation tags:

    ","location":"docs/tags/"},{"title":"advanced","text":"
    • Node features
    ","location":"docs/tags/#advanced"},{"title":"connection","text":"
    • Connection options
    • Connecting
    • Data transfer
    ","location":"docs/tags/#connection"},{"title":"slurm","text":"
    • Job management
    • Node features
    • Submitting jobs
    • Running jobs
    ","location":"docs/tags/#slurm"},{"title":"tech","text":"
    • Technical specifications
    • Facts
    ","location":"docs/tags/#tech"}]} \ No newline at end of file diff --git a/shell/index.html b/shell/index.html new file mode 100644 index 000000000..a7d419316 --- /dev/null +++ b/shell/index.html @@ -0,0 +1,15 @@ + + + + + + Redirecting... + + + + + + +Redirecting... + + diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 000000000..a07a96e82 --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,223 @@ + + + + https://www.sherlock.stanford.edu/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/concepts/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/credits/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/glossary/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/orders/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/tags/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/advanced-topics/connection/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/advanced-topics/job-management/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/advanced-topics/node-features/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/getting-started/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/getting-started/connecting/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/getting-started/submitting/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/install/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/list/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/modules/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/R/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/anaconda/ + 2024-02-05 + daily + + + 
https://www.sherlock.stanford.edu/docs/software/using/clustershell/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/julia/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/mariadb/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/matlab/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/perl/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/postgresql/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/python/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/quantum-espresso/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/rclone/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/schrodinger/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/singularity/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/software/using/spark/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/storage/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/storage/data-protection/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/storage/data-sharing/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/storage/data-transfer/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/storage/filesystems/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/tech/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/tech/facts/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/tech/status/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/user-guide/gpu/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/user-guide/ondemand/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/user-guide/running-jobs/ + 2024-02-05 
+ daily + + + https://www.sherlock.stanford.edu/docs/user-guide/troubleshoot/ + 2024-02-05 + daily + + + https://www.sherlock.stanford.edu/docs/tags/ + 2024-02-05 + daily + + \ No newline at end of file diff --git a/sitemap.xml.gz b/sitemap.xml.gz new file mode 100644 index 000000000..3b6bf28c4 Binary files /dev/null and b/sitemap.xml.gz differ