diff --git a/404.html b/404.html index a9891b3feb..b0b82b2813 100644 --- a/404.html +++ b/404.html @@ -16,13 +16,13 @@ - - + +
-
Skip to main content

Sorry! Page Not Found

We have been doing some work on our website, chances are that the page you're looking for is in the new docs section.

- - +
Skip to main content

Sorry! Page Not Found

We have been doing some work on our website, chances are that the page you're looking for is in the new docs section.

+ + \ No newline at end of file diff --git a/assets/js/cb7c2a83.0688a6d4.js b/assets/js/01a15f20.86c0ca94.js similarity index 93% rename from assets/js/cb7c2a83.0688a6d4.js rename to assets/js/01a15f20.86c0ca94.js index 00406b45c9..58af46bc98 100644 --- a/assets/js/cb7c2a83.0688a6d4.js +++ b/assets/js/01a15f20.86c0ca94.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1933],{3905:function(e,t,n){n.d(t,{Zo:function(){return u},kt:function(){return m}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function i(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},u=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},y=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,o=e.originalType,l=e.parentName,u=s(e,["components","mdxType","originalType","parentName"]),y=c(n),m=a,f=y["".concat(l,".").concat(m)]||y[m]||p[m]||o;return n?r.createElement(f,i(i({ref:t},u),{},{components:n})):r.createElement(f,i({ref:t},u))}));function m(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,i=new Array(o);i[0]=y;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,i[1]=s;for(var c=2;c=0||(a[n]=e[n]);return 
a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},u=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},y=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,o=e.originalType,l=e.parentName,u=s(e,["components","mdxType","originalType","parentName"]),y=c(n),m=a,f=y["".concat(l,".").concat(m)]||y[m]||p[m]||o;return n?r.createElement(f,i(i({ref:t},u),{},{components:n})):r.createElement(f,i({ref:t},u))}));function m(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,i=new Array(o);i[0]=y;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,i[1]=s;for(var c=2;c=0||(n[r]=e[r]);return n}(e,a);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(t=0;t=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(n[r]=e[r])}return n}var p=t.createContext({}),c=function(e){var a=t.useContext(p),r=a;return e&&(r="function"==typeof e?e(a):o(o({},a),e)),r},i=function(e){var a=c(e.components);return t.createElement(p.Provider,{value:a},e.children)},u={inlineCode:"code",wrapper:function(e){var a=e.children;return t.createElement(t.Fragment,{},a)}},k=t.forwardRef((function(e,a){var r=e.components,n=e.mdxType,s=e.originalType,p=e.parentName,i=l(e,["components","mdxType","originalType","parentName"]),k=c(r),m=n,d=k["".concat(p,".").concat(m)]||k[m]||u[m]||s;return r?t.createElement(d,o(o({ref:a},i),{},{components:r})):t.createElement(d,o({ref:a},i))}));function m(e,a){var r=arguments,n=a&&a.mdxType;if("string"==typeof e||n){var s=r.length,o=new Array(s);o[0]=k;var l={};for(var p in 
a)hasOwnProperty.call(a,p)&&(l[p]=a[p]);l.originalType=e,l.mdxType="string"==typeof e?e:n,o[1]=l;for(var c=2;c=0||(n[r]=e[r]);return n}(e,a);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(t=0;t=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(n[r]=e[r])}return n}var p=t.createContext({}),c=function(e){var a=t.useContext(p),r=a;return e&&(r="function"==typeof e?e(a):o(o({},a),e)),r},i=function(e){var a=c(e.components);return t.createElement(p.Provider,{value:a},e.children)},u={inlineCode:"code",wrapper:function(e){var a=e.children;return t.createElement(t.Fragment,{},a)}},k=t.forwardRef((function(e,a){var r=e.components,n=e.mdxType,s=e.originalType,p=e.parentName,i=l(e,["components","mdxType","originalType","parentName"]),k=c(r),m=n,d=k["".concat(p,".").concat(m)]||k[m]||u[m]||s;return r?t.createElement(d,o(o({ref:a},i),{},{components:r})):t.createElement(d,o({ref:a},i))}));function m(e,a){var r=arguments,n=a&&a.mdxType;if("string"==typeof e||n){var s=r.length,o=new Array(s);o[0]=k;var l={};for(var p in a)hasOwnProperty.call(a,p)&&(l[p]=a[p]);l.originalType=e,l.mdxType="string"==typeof e?e:n,o[1]=l;for(var c=2;c=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var l=r.createContext({}),p=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},d=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},c=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,l=e.parentName,d=s(e,["components","mdxType","originalType","parentName"]),c=p(n),u=o,h=c["".concat(l,".").concat(u)]||c[u]||m[u]||a;return n?r.createElement(h,i(i({ref:t},d),{},{components:n})):r.createElement(h,i({ref:t},d))}));function u(e,t){var 
n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=c;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:o,i[1]=s;for(var p=2;p=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var l=r.createContext({}),p=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},d=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},c=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,l=e.parentName,d=s(e,["components","mdxType","originalType","parentName"]),c=p(n),u=o,h=c["".concat(l,".").concat(u)]||c[u]||m[u]||a;return n?r.createElement(h,i(i({ref:t},d),{},{components:n})):r.createElement(h,i({ref:t},d))}));function u(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=c;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:o,i[1]=s;for(var p=2;p=0||(r[i]=e[i]);return r}(e,n);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(t=0;t=0||Object.prototype.propertyIsEnumerable.call(e,i)&&(r[i]=e[i])}return r}var l=t.createContext({}),p=function(e){var n=t.useContext(l),i=n;return e&&(i="function"==typeof e?e(n):o(o({},n),e)),i},c=function(e){var n=p(e.components);return t.createElement(l.Provider,{value:n},e.children)},u={inlineCode:"code",wrapper:function(e){var n=e.children;return t.createElement(t.Fragment,{},n)}},d=t.forwardRef((function(e,n){var i=e.components,r=e.mdxType,a=e.originalType,l=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),d=p(i),m=r,g=d["".concat(l,".").concat(m)]||d[m]||u[m]||a;return 
i?t.createElement(g,o(o({ref:n},c),{},{components:i})):t.createElement(g,o({ref:n},c))}));function m(e,n){var i=arguments,r=n&&n.mdxType;if("string"==typeof e||r){var a=i.length,o=new Array(a);o[0]=d;var s={};for(var l in n)hasOwnProperty.call(n,l)&&(s[l]=n[l]);s.originalType=e,s.mdxType="string"==typeof e?e:r,o[1]=s;for(var p=2;p=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var i=r.createContext({}),p=function(e){var n=r.useContext(i),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},u=function(e){var n=p(e.components);return r.createElement(i.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},d=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,i=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),d=p(t),m=a,f=d["".concat(i,".").concat(m)]||d[m]||c[m]||l;return t?r.createElement(f,s(s({ref:n},u),{},{components:t})):r.createElement(f,s({ref:n},u))}));function m(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,s=new Array(l);s[0]=d;var o={};for(var i in n)hasOwnProperty.call(n,i)&&(o[i]=n[i]);o.originalType=e,o.mdxType="string"==typeof e?e:a,s[1]=o;for(var p=2;p [?? x 4]\n# Database: spark_connection\n eruptions waiting eruptions_output waiting_output\n \n 1 3.600 79 3.600 79\n 2 1.800 54 1.800 54\n 3 3.333 74 3.333 74\n 4 2.283 62 2.283 62\n 5 4.533 85 4.533 85\n 6 2.883 55 2.883 55\n 7 4.700 88 4.700 88\n 8 3.600 85 3.600 85\n 9 1.950 51 1.950 51\n 10 4.350 85 4.350 85\n # ... 
with more rows\n...\n")),(0,l.kt)("h2",{id:"azure-databricks"},"Azure Databricks"),(0,l.kt)("p",null,'In Azure Databricks, you can install devtools and the spark package from URL\nand then use spark_connect with method = "databricks":'),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'install.packages("devtools")\ndevtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-0.11.3.zip")\nlibrary(sparklyr)\nlibrary(dplyr)\nsc <- spark_connect(method = "databricks")\nfaithful_df <- copy_to(sc, faithful)\nunfit_model = ml_light_gbmregressor(sc, maxDepth=20, featuresCol="waiting", labelCol="eruptions", numIterations=10, unfit.model=TRUE)\nml_train_regressor(faithful_df, labelCol="eruptions", unfit_model)\n')),(0,l.kt)("h2",{id:"building-from-source"},"Building from Source"),(0,l.kt)("p",null,"Our R bindings are built as part of the ",(0,l.kt)("a",{parentName:"p",href:"../Developer%20Setup"},"normal build\nprocess"),". To get a quick build, start at the root\nof the synapseml directory, and find the generated files. 
For instance,\nto find the R files for deep-learning, run"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-bash"},"sbt packageR\nls ./deep-learning/target/scala-2.12/generated/src/R/synapseml/R\n")),(0,l.kt)("p",null,"You can then run R in a terminal and install the above files directly:"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'...\ndevtools::install_local("./deep-learning/target/scala-2.12/generated/src/R/synapseml/R")\n...\n')))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/052105dd.822a82f9.js b/assets/js/052105dd.822a82f9.js new file mode 100644 index 0000000000..c532876096 --- /dev/null +++ b/assets/js/052105dd.822a82f9.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[3036],{3905:function(e,a,t){t.d(a,{Zo:function(){return c},kt:function(){return d}});var n=t(7294);function r(e,a,t){return a in e?Object.defineProperty(e,a,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[a]=t,e}function o(e,a){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);a&&(n=n.filter((function(a){return Object.getOwnPropertyDescriptor(e,a).enumerable}))),t.push.apply(t,n)}return t}function l(e){for(var a=1;a=0||(r[t]=e[t]);return r}(e,a);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(r[t]=e[t])}return r}var i=n.createContext({}),p=function(e){var a=n.useContext(i),t=a;return e&&(t="function"==typeof e?e(a):l(l({},a),e)),t},c=function(e){var a=p(e.components);return n.createElement(i.Provider,{value:a},e.children)},u={inlineCode:"code",wrapper:function(e){var a=e.children;return n.createElement(n.Fragment,{},a)}},m=n.forwardRef((function(e,a){var 
t=e.components,r=e.mdxType,o=e.originalType,i=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=p(t),d=r,f=m["".concat(i,".").concat(d)]||m[d]||u[d]||o;return t?n.createElement(f,l(l({ref:a},c),{},{components:t})):n.createElement(f,l({ref:a},c))}));function d(e,a){var t=arguments,r=a&&a.mdxType;if("string"==typeof e||r){var o=t.length,l=new Array(o);l[0]=m;var s={};for(var i in a)hasOwnProperty.call(a,i)&&(s[i]=a[i]);s.originalType=e,s.mdxType="string"==typeof e?e:r,l[1]=s;for(var p=2;p=0||(i[a]=t[a]);return i}(t,e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(t);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(t,a)&&(i[a]=t[a])}return i}var s=n.createContext({}),p=function(t){var e=n.useContext(s),a=e;return t&&(a="function"==typeof t?t(e):o(o({},e),t)),a},c=function(t){var e=p(t.components);return n.createElement(s.Provider,{value:e},t.children)},d={inlineCode:"code",wrapper:function(t){var e=t.children;return n.createElement(n.Fragment,{},e)}},u=n.forwardRef((function(t,e){var a=t.components,i=t.mdxType,r=t.originalType,s=t.parentName,c=l(t,["components","mdxType","originalType","parentName"]),u=p(a),m=i,b=u["".concat(s,".").concat(m)]||u[m]||d[m]||r;return a?n.createElement(b,o(o({ref:e},c),{},{components:a})):n.createElement(b,o({ref:e},c))}));function m(t,e){var a=arguments,i=e&&e.mdxType;if("string"==typeof t||i){var r=a.length,o=new Array(r);o[0]=u;var l={};for(var s in e)hasOwnProperty.call(e,s)&&(l[s]=e[s]);l.originalType=t,l.mdxType="string"==typeof t?t:i,o[1]=l;for(var p=2;p=0||(i[a]=t[a]);return i}(t,e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(t);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(t,a)&&(i[a]=t[a])}return i}var s=n.createContext({}),p=function(t){var e=n.useContext(s),a=e;return t&&(a="function"==typeof t?t(e):o(o({},e),t)),a},c=function(t){var e=p(t.components);return 
n.createElement(s.Provider,{value:e},t.children)},d={inlineCode:"code",wrapper:function(t){var e=t.children;return n.createElement(n.Fragment,{},e)}},u=n.forwardRef((function(t,e){var a=t.components,i=t.mdxType,r=t.originalType,s=t.parentName,c=l(t,["components","mdxType","originalType","parentName"]),u=p(a),m=i,b=u["".concat(s,".").concat(m)]||u[m]||d[m]||r;return a?n.createElement(b,o(o({ref:e},c),{},{components:a})):n.createElement(b,o({ref:e},c))}));function m(t,e){var a=arguments,i=e&&e.mdxType;if("string"==typeof t||i){var r=a.length,o=new Array(r);o[0]=u;var l={};for(var s in e)hasOwnProperty.call(e,s)&&(l[s]=e[s]);l.originalType=t,l.mdxType="string"==typeof t?t:i,o[1]=l;for(var p=2;p=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var s=r.createContext({}),p=function(e){var n=r.useContext(s),t=n;return e&&(t="function"==typeof e?e(n):o(o({},n),e)),t},c=function(e){var n=p(e.components);return r.createElement(s.Provider,{value:n},e.children)},u={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},d=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,i=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(t),f=a,m=d["".concat(s,".").concat(f)]||d[f]||u[f]||i;return t?r.createElement(m,o(o({ref:n},c),{},{components:t})):r.createElement(m,o({ref:n},c))}));function f(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var i=t.length,o=new Array(i);o[0]=d;var l={};for(var s in n)hasOwnProperty.call(n,s)&&(l[s]=n[s]);l.originalType=e,l.mdxType="string"==typeof e?e:a,o[1]=l;for(var p=2;p=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var s=r.createContext({}),p=function(e){var 
n=r.useContext(s),t=n;return e&&(t="function"==typeof e?e(n):o(o({},n),e)),t},c=function(e){var n=p(e.components);return r.createElement(s.Provider,{value:n},e.children)},u={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},d=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,i=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(t),f=a,m=d["".concat(s,".").concat(f)]||d[f]||u[f]||i;return t?r.createElement(m,o(o({ref:n},c),{},{components:t})):r.createElement(m,o({ref:n},c))}));function f(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var i=t.length,o=new Array(i);o[0]=d;var l={};for(var s in n)hasOwnProperty.call(n,s)&&(l[s]=n[s]);l.originalType=e,l.mdxType="string"==typeof e?e:a,o[1]=l;for(var p=2;p=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var c=r.createContext({}),l=function(e){var t=r.useContext(c),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},u=function(e){var t=l(e.components);return r.createElement(c.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,c=e.parentName,u=s(e,["components","mdxType","originalType","parentName"]),d=l(n),f=o,m=d["".concat(c,".").concat(f)]||d[f]||p[f]||a;return n?r.createElement(m,i(i({ref:t},u),{},{components:n})):r.createElement(m,i({ref:t},u))}));function f(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=d;var s={};for(var c in t)hasOwnProperty.call(t,c)&&(s[c]=t[c]);s.originalType=e,s.mdxType="string"==typeof e?e:o,i[1]=s;for(var l=2;l=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var 
a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var c=r.createContext({}),l=function(e){var t=r.useContext(c),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},u=function(e){var t=l(e.components);return r.createElement(c.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,c=e.parentName,u=s(e,["components","mdxType","originalType","parentName"]),d=l(n),f=o,m=d["".concat(c,".").concat(f)]||d[f]||p[f]||a;return n?r.createElement(m,i(i({ref:t},u),{},{components:n})):r.createElement(m,i({ref:t},u))}));function f(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=d;var s={};for(var c in t)hasOwnProperty.call(t,c)&&(s[c]=t[c]);s.originalType=e,s.mdxType="string"==typeof e?e:o,i[1]=s;for(var l=2;l=0||(o[t]=e[t]);return o}(e,n);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(o[t]=e[t])}return o}var l=r.createContext({}),p=function(e){var n=r.useContext(l),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},m=function(e){var n=p(e.components);return r.createElement(l.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},u=r.forwardRef((function(e,n){var t=e.components,o=e.mdxType,a=e.originalType,l=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),u=p(t),d=o,f=u["".concat(l,".").concat(d)]||u[d]||c[d]||a;return t?r.createElement(f,s(s({ref:n},m),{},{components:t})):r.createElement(f,s({ref:n},m))}));function d(e,n){var t=arguments,o=n&&n.mdxType;if("string"==typeof e||o){var a=t.length,s=new Array(a);s[0]=u;var i={};for(var l in 
n)hasOwnProperty.call(n,l)&&(i[l]=n[l]);i.originalType=e,i.mdxType="string"==typeof e?e:o,s[1]=i;for(var p=2;p" + c + "" for c in cols])\n\n style = """\n\n\n\n\n"""\n\n table = []\n for row in rows:\n table.append("")\n for col in cols:\n if col in image_cols:\n rep = \'\'.format(row[col])\n else:\n rep = row[col]\n table.append("{}".format(rep))\n table.append("")\n tableHTML = "".join(table)\n\n body = """\n\n\n \n {} \n \n {}\n
\n\n\n """.format(\n header, tableHTML\n )\n try:\n if running_on_databricks():\n displayHTML(style + body)\n else:\n import IPython\n\n IPython.display.HTML(style + body)\n except:\n pass\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'snowLeopardQueries = ["snow leopard"]\nsnowLeopardUrls = bingPhotoSearch("snow leopard", snowLeopardQueries, pages=100)\ndisplayDF(snowLeopardUrls)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'randomWords = spark.read.parquet(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet"\n).cache()\nrandomWords.show()\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'randomLinks = (\n randomWords.mlTransform(\n BingImageSearch()\n .setSubscriptionKey(bing_search_key)\n .setCount(10)\n .setQueryCol("words")\n .setOutputCol("images")\n )\n .mlTransform(BingImageSearch.getUrlTransformer("images", "urls"))\n .withColumn("label", lit("other"))\n .limit(400)\n)\n\ndisplayDF(randomLinks)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'images = (\n snowLeopardUrls.union(randomLinks)\n .distinct()\n .repartition(100)\n .mlTransform(\n BingImageSearch.downloadFromUrls("urls", "image", concurrency=5, timeout=5000)\n )\n .dropna()\n)\n\ntrain, test = images.randomSplit([0.7, 0.3], seed=1)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml import Pipeline\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql.functions import udf\nfrom synapse.ml.onnx import ImageFeaturizer\nfrom synapse.ml.stages import UDFTransformer\nfrom pyspark.sql.types import *\n\n\ndef getIndex(row):\n return float(row[1])\n\n\nmodel = Pipeline(\n stages=[\n StringIndexer(inputCol="labels", outputCol="index"),\n ImageFeaturizer(\n inputCol="image",\n outputCol="features",\n 
autoConvertToColor=True,\n ignoreDecodingErrors=True,\n ).setModel("ResNet50"),\n LogisticRegression(maxIter=5, labelCol="index", regParam=10.0),\n UDFTransformer()\n .setUDF(udf(getIndex, DoubleType()))\n .setInputCol("probability")\n .setOutputCol("leopard_prob"),\n ]\n)\n\nfitModel = model.fit(train)\n')),(0,a.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/SLPipeline.PNG",width:"900"}),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'def plotConfusionMatrix(df, label, prediction, classLabels):\n from synapse.ml.plot import confusionMatrix\n import matplotlib.pyplot as plt\n\n fig = plt.figure(figsize=(4.5, 4.5))\n confusionMatrix(df, label, prediction, classLabels)\n display(fig)\n\n\nif not running_on_synapse():\n plotConfusionMatrix(\n fitModel.transform(test), "index", "prediction", fitModel.stages[0].labels\n )\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'import urllib.request\nfrom synapse.ml.explainers import ImageLIME\n\ntest_image_url = (\n "https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg"\n)\nwith urllib.request.urlopen(test_image_url) as url:\n barr = url.read()\ntest_subsample = spark.createDataFrame([(bytearray(barr),)], ["image"])\n\nlime = (\n ImageLIME()\n .setModel(fitModel)\n .setTargetCol("leopard_prob")\n .setOutputCol("weights")\n .setInputCol("image")\n .setCellSize(100.0)\n .setModifier(50.0)\n .setNumSamples(300)\n)\n\nresult = lime.transform(test_subsample)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'import matplotlib.pyplot as plt\nimport PIL, io, numpy as np\n\n\ndef plot_superpixels(row):\n image_bytes = row["image"]\n superpixels = row["superpixels"]["clusters"]\n weights = list(row["weights"][0])\n mean_weight = np.percentile(weights, 90)\n img = (PIL.Image.open(io.BytesIO(image_bytes))).convert("RGBA")\n image_array = np.asarray(img).copy()\n for 
(sp, w) in zip(superpixels, weights):\n if w > mean_weight:\n for (x, y) in sp:\n image_array[y, x, 1] = 255\n image_array[y, x, 3] = 200\n plt.clf()\n plt.imshow(image_array)\n display()\n\n\n# Gets first row from the LIME-transformed data frame\nif not running_on_synapse():\n plot_superpixels(result.take(1)[0])\n')),(0,a.kt)("h3",{id:"your-results-will-look-like"},"Your results will look like:"),(0,a.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/lime_results.png",width:"900"}))}d.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[745],{3905:function(e,n,t){t.d(n,{Zo:function(){return m},kt:function(){return d}});var r=t(7294);function o(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function a(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);n&&(r=r.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,r)}return t}function s(e){for(var n=1;n=0||(o[t]=e[t]);return o}(e,n);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(o[t]=e[t])}return o}var l=r.createContext({}),p=function(e){var n=r.useContext(l),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},m=function(e){var n=p(e.components);return r.createElement(l.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},u=r.forwardRef((function(e,n){var t=e.components,o=e.mdxType,a=e.originalType,l=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),u=p(t),d=o,f=u["".concat(l,".").concat(d)]||u[d]||c[d]||a;return t?r.createElement(f,s(s({ref:n},m),{},{components:t})):r.createElement(f,s({ref:n},m))}));function d(e,n){var t=arguments,o=n&&n.mdxType;if("string"==typeof 
e||o){var a=t.length,s=new Array(a);s[0]=u;var i={};for(var l in n)hasOwnProperty.call(n,l)&&(i[l]=n[l]);i.originalType=e,i.mdxType="string"==typeof e?e:o,s[1]=i;for(var p=2;p" + c + "" for c in cols])\n\n style = """\n\n\n\n\n"""\n\n table = []\n for row in rows:\n table.append("")\n for col in cols:\n if col in image_cols:\n rep = \'\'.format(row[col])\n else:\n rep = row[col]\n table.append("{}".format(rep))\n table.append("")\n tableHTML = "".join(table)\n\n body = """\n\n\n \n {} \n \n {}\n
\n\n\n """.format(\n header, tableHTML\n )\n try:\n if running_on_databricks():\n displayHTML(style + body)\n else:\n import IPython\n\n IPython.display.HTML(style + body)\n except:\n pass\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'snowLeopardQueries = ["snow leopard"]\nsnowLeopardUrls = bingPhotoSearch("snow leopard", snowLeopardQueries, pages=100)\ndisplayDF(snowLeopardUrls)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'randomWords = spark.read.parquet(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet"\n).cache()\nrandomWords.show()\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'randomLinks = (\n randomWords.mlTransform(\n BingImageSearch()\n .setSubscriptionKey(bing_search_key)\n .setCount(10)\n .setQueryCol("words")\n .setOutputCol("images")\n )\n .mlTransform(BingImageSearch.getUrlTransformer("images", "urls"))\n .withColumn("label", lit("other"))\n .limit(400)\n)\n\ndisplayDF(randomLinks)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'images = (\n snowLeopardUrls.union(randomLinks)\n .distinct()\n .repartition(100)\n .mlTransform(\n BingImageSearch.downloadFromUrls("urls", "image", concurrency=5, timeout=5000)\n )\n .dropna()\n)\n\ntrain, test = images.randomSplit([0.7, 0.3], seed=1)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml import Pipeline\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql.functions import udf\nfrom synapse.ml.onnx import ImageFeaturizer\nfrom synapse.ml.stages import UDFTransformer\nfrom pyspark.sql.types import *\n\n\ndef getIndex(row):\n return float(row[1])\n\n\nmodel = Pipeline(\n stages=[\n StringIndexer(inputCol="labels", outputCol="index"),\n ImageFeaturizer(\n inputCol="image",\n outputCol="features",\n 
autoConvertToColor=True,\n ignoreDecodingErrors=True,\n ).setModel("ResNet50"),\n LogisticRegression(maxIter=5, labelCol="index", regParam=10.0),\n UDFTransformer()\n .setUDF(udf(getIndex, DoubleType()))\n .setInputCol("probability")\n .setOutputCol("leopard_prob"),\n ]\n)\n\nfitModel = model.fit(train)\n')),(0,a.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/SLPipeline.PNG",width:"900"}),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'def plotConfusionMatrix(df, label, prediction, classLabels):\n from synapse.ml.plot import confusionMatrix\n import matplotlib.pyplot as plt\n\n fig = plt.figure(figsize=(4.5, 4.5))\n confusionMatrix(df, label, prediction, classLabels)\n display(fig)\n\n\nif not running_on_synapse():\n plotConfusionMatrix(\n fitModel.transform(test), "index", "prediction", fitModel.stages[0].labels\n )\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'import urllib.request\nfrom synapse.ml.explainers import ImageLIME\n\ntest_image_url = (\n "https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg"\n)\nwith urllib.request.urlopen(test_image_url) as url:\n barr = url.read()\ntest_subsample = spark.createDataFrame([(bytearray(barr),)], ["image"])\n\nlime = (\n ImageLIME()\n .setModel(fitModel)\n .setTargetCol("leopard_prob")\n .setOutputCol("weights")\n .setInputCol("image")\n .setCellSize(100.0)\n .setModifier(50.0)\n .setNumSamples(300)\n)\n\nresult = lime.transform(test_subsample)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'import matplotlib.pyplot as plt\nimport PIL, io, numpy as np\n\n\ndef plot_superpixels(row):\n image_bytes = row["image"]\n superpixels = row["superpixels"]["clusters"]\n weights = list(row["weights"][0])\n mean_weight = np.percentile(weights, 90)\n img = (PIL.Image.open(io.BytesIO(image_bytes))).convert("RGBA")\n image_array = np.asarray(img).copy()\n for 
(sp, w) in zip(superpixels, weights):\n if w > mean_weight:\n for (x, y) in sp:\n image_array[y, x, 1] = 255\n image_array[y, x, 3] = 200\n plt.clf()\n plt.imshow(image_array)\n display()\n\n\n# Gets first row from the LIME-transformed data frame\nif not running_on_synapse():\n plot_superpixels(result.take(1)[0])\n')),(0,a.kt)("h3",{id:"your-results-will-look-like"},"Your results will look like:"),(0,a.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/lime_results.png",width:"900"}))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/09bc1516.83fe0d81.js b/assets/js/09bc1516.83fe0d81.js new file mode 100644 index 0000000000..c88cf9bcf4 --- /dev/null +++ b/assets/js/09bc1516.83fe0d81.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2408],{3905:function(e,t,a){a.d(t,{Zo:function(){return c},kt:function(){return y}});var n=a(7294);function r(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function i(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function s(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var l=n.createContext({}),p=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):s(s({},t),e)),a},c=function(e){var t=p(e.components);return n.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},m=n.forwardRef((function(e,t){var 
a=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,c=o(e,["components","mdxType","originalType","parentName"]),m=p(a),y=r,d=m["".concat(l,".").concat(y)]||m[y]||u[y]||i;return a?n.createElement(d,s(s({ref:t},c),{},{components:a})):n.createElement(d,s({ref:t},c))}));function y(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=a.length,s=new Array(i);s[0]=m;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p=0||(o[r]=e[r]);return o}(e,t);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(o[r]=e[r])}return o}var c=n.createContext({}),u=function(e){var t=n.useContext(c),r=t;return e&&(r="function"==typeof e?e(t):i(i({},t),e)),r},p=function(e){var t=u(e.components);return n.createElement(c.Provider,{value:t},e.children)},l={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},f=n.forwardRef((function(e,t){var r=e.components,o=e.mdxType,s=e.originalType,c=e.parentName,p=a(e,["components","mdxType","originalType","parentName"]),f=u(r),m=o,v=f["".concat(c,".").concat(m)]||f[m]||l[m]||s;return r?n.createElement(v,i(i({ref:t},p),{},{components:r})):n.createElement(v,i({ref:t},p))}));function m(e,t){var r=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var s=r.length,i=new Array(s);i[0]=f;var a={};for(var c in t)hasOwnProperty.call(t,c)&&(a[c]=t[c]);a.originalType=e,a.mdxType="string"==typeof e?e:o,i[1]=a;for(var u=2;u ",(0,s.kt)("strong",{parentName:"li"},"Go to resource"),". Once at the resource, you can get the key from ",(0,s.kt)("strong",{parentName:"li"},"Resource Management")," > ",(0,s.kt)("strong",{parentName:"li"},"Keys and Endpoint"),". Copy the key and paste it into the notebook. Store keys securely and do not share them. 
")),(0,s.kt)("h2",{id:"cognitive-services"},"Cognitive Services"),(0,s.kt)("p",null,"To set up ",(0,s.kt)("a",{parentName:"p",href:"https://azure.microsoft.com/products/cognitive-services/"},"Cognitive Services")," for use with SynapseML you first need to:"),(0,s.kt)("ul",null,(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("a",{parentName:"li",href:"https://learn.microsoft.com/azure/role-based-access-control/role-assignments-steps"},"Assign yourself the Cognitive Services Contributor role")," to agree to the responsible AI terms and create a resource. "),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("a",{parentName:"li",href:"https://portal.azure.com/#create/Microsoft.CognitiveServicesAllInOne"},"Create an Azure Cognitive multi-service (Decision, Language, Speech, Vision) resource"),". Alternatively, you can follow the steps to ",(0,s.kt)("a",{parentName:"li",href:"https://learn.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account?tabs=decision%2Canomaly-detector%2Clanguage-service%2Ccomputer-vision%2Cwindows#create-a-new-azure-cognitive-services-resource"},"create Single-service resource"),". "),(0,s.kt)("li",{parentName:"ul"},"Get your Cognitive Service resource's key. After your resource is successfully deployed, select ",(0,s.kt)("strong",{parentName:"li"},"Next Steps")," > ",(0,s.kt)("strong",{parentName:"li"},"Go to resource"),". Once at the resource, you can get the key from ",(0,s.kt)("strong",{parentName:"li"},"Resource Management")," > ",(0,s.kt)("strong",{parentName:"li"},"Keys and Endpoint"),". Copy the key and paste it into the notebook. 
Store keys securely and do not share them.")))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/0a65b9de.ce66e4a9.js b/assets/js/0a65b9de.ce66e4a9.js deleted file mode 100644 index 7f44724dac..0000000000 --- a/assets/js/0a65b9de.ce66e4a9.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[939],{3905:function(e,t,r){r.d(t,{Zo:function(){return p},kt:function(){return m}});var n=r(7294);function o(e,t,r){return t in e?Object.defineProperty(e,t,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[t]=r,e}function s(e,t){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),r.push.apply(r,n)}return r}function i(e){for(var t=1;t=0||(o[r]=e[r]);return o}(e,t);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(o[r]=e[r])}return o}var c=n.createContext({}),u=function(e){var t=n.useContext(c),r=t;return e&&(r="function"==typeof e?e(t):i(i({},t),e)),r},p=function(e){var t=u(e.components);return n.createElement(c.Provider,{value:t},e.children)},l={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},f=n.forwardRef((function(e,t){var r=e.components,o=e.mdxType,s=e.originalType,c=e.parentName,p=a(e,["components","mdxType","originalType","parentName"]),f=u(r),m=o,v=f["".concat(c,".").concat(m)]||f[m]||l[m]||s;return r?n.createElement(v,i(i({ref:t},p),{},{components:r})):n.createElement(v,i({ref:t},p))}));function m(e,t){var r=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var s=r.length,i=new Array(s);i[0]=f;var a={};for(var c in t)hasOwnProperty.call(t,c)&&(a[c]=t[c]);a.originalType=e,a.mdxType="string"==typeof e?e:o,i[1]=a;for(var u=2;u ",(0,s.kt)("strong",{parentName:"li"},"Go to resource"),". 
Once at the resource, you can get the key from ",(0,s.kt)("strong",{parentName:"li"},"Resource Management")," > ",(0,s.kt)("strong",{parentName:"li"},"Keys and Endpoint"),". Copy the key and paste it into the notebook. Store keys securely and do not share them. ")),(0,s.kt)("h2",{id:"cognitive-services"},"Cognitive Services"),(0,s.kt)("p",null,"To set up ",(0,s.kt)("a",{parentName:"p",href:"https://azure.microsoft.com/products/cognitive-services/"},"Cognitive Services")," for use with SynapseML you first need to:"),(0,s.kt)("ul",null,(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("a",{parentName:"li",href:"https://learn.microsoft.com/azure/role-based-access-control/role-assignments-steps"},"Assign yourself the Cognitive Services Contributor role")," to agree to the responsible AI terms and create a resource. "),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("a",{parentName:"li",href:"https://portal.azure.com/#create/Microsoft.CognitiveServicesAllInOne"},"Create an Azure Cognitive multi-service (Decision, Language, Speech, Vision) resource"),". Alternatively, you can follow the steps to ",(0,s.kt)("a",{parentName:"li",href:"https://learn.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account?tabs=decision%2Canomaly-detector%2Clanguage-service%2Ccomputer-vision%2Cwindows#create-a-new-azure-cognitive-services-resource"},"create Single-service resource"),". "),(0,s.kt)("li",{parentName:"ul"},"Get your Cognitive Service resource's key. After your resource is successfully deployed, select ",(0,s.kt)("strong",{parentName:"li"},"Next Steps")," > ",(0,s.kt)("strong",{parentName:"li"},"Go to resource"),". Once at the resource, you can get the key from ",(0,s.kt)("strong",{parentName:"li"},"Resource Management")," > ",(0,s.kt)("strong",{parentName:"li"},"Keys and Endpoint"),". Copy the key and paste it into the notebook. 
Store keys securely and do not share them.")))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/0b416bde.9dc40de7.js b/assets/js/0b416bde.9dc40de7.js new file mode 100644 index 0000000000..d866d13a25 --- /dev/null +++ b/assets/js/0b416bde.9dc40de7.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[8579],{3905:function(e,t,n){n.d(t,{Zo:function(){return p},kt:function(){return m}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),d=c(n),m=a,f=d["".concat(l,".").concat(m)]||d[m]||u[m]||i;return n?r.createElement(f,o(o({ref:t},p),{},{components:n})):r.createElement(f,o({ref:t},p))}));function m(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,o=new Array(i);o[0]=d;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,o[1]=s;for(var c=2;c=0||(s[a]=e[a]);return s}(e,t);if(Object.getOwnPropertySymbols){var 
o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(s[a]=e[a])}return s}var i=n.createContext({}),c=function(e){var t=n.useContext(i),a=t;return e&&(a="function"==typeof e?e(t):l(l({},t),e)),a},m=function(e){var t=c(e.components);return n.createElement(i.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var a=e.components,s=e.mdxType,o=e.originalType,i=e.parentName,m=r(e,["components","mdxType","originalType","parentName"]),u=c(a),y=s,g=u["".concat(i,".").concat(y)]||u[y]||p[y]||o;return a?n.createElement(g,l(l({ref:t},m),{},{components:a})):n.createElement(g,l({ref:t},m))}));function y(e,t){var a=arguments,s=t&&t.mdxType;if("string"==typeof e||s){var o=a.length,l=new Array(o);l[0]=u;var r={};for(var i in t)hasOwnProperty.call(t,i)&&(r[i]=t[i]);r.originalType=e,r.mdxType="string"==typeof e?e:s,l[1]=r;for(var c=2;c child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:a.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function u(e){var t=e.values,a=e.children;return(0,s.useMemo)((function(){var e=null!=t?t:p(a);return function(e){var t=(0,c.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . 
Every value needs to be unique.')}(e),e}),[t,a])}function y(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function g(e){var t=e.queryString,a=void 0!==t&&t,n=e.groupId,o=(0,r.k6)(),l=function(e){var t=e.queryString,a=void 0!==t&&t,n=e.groupId;if("string"==typeof a)return a;if(!1===a)return null;if(!0===a&&!n)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=n?n:null}({queryString:a,groupId:n});return[(0,i._X)(l),(0,s.useCallback)((function(e){if(l){var t=new URLSearchParams(o.location.search);t.set(l,e),o.replace(Object.assign({},o.location,{search:t.toString()}))}}),[l,o])]}function d(e){var t,a,n,o,l=e.defaultValue,r=e.queryString,i=void 0!==r&&r,c=e.groupId,p=u(e),d=(0,s.useState)((function(){return function(e){var t,a=e.defaultValue,n=e.tabValues;if(0===n.length)throw new Error("Docusaurus error: the component requires at least one children component");if(a){if(!y({value:a,tabValues:n}))throw new Error('Docusaurus error: The has a defaultValue "'+a+'" but none of its children has the corresponding value. Available values are: '+n.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return a}var s=null!=(t=n.find((function(e){return e.default})))?t:n[0];if(!s)throw new Error("Unexpected error: 0 tabValues");return s.value}({defaultValue:l,tabValues:p})})),v=d[0],f=d[1],h=g({queryString:i,groupId:c}),b=h[0],T=h[1],k=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:c}.groupId),a=(0,m.Nk)(t),n=a[0],o=a[1],[n,(0,s.useCallback)((function(e){t&&o.set(e)}),[t,o])]),I=k[0],S=k[1],_=function(){var e=null!=b?b:I;return y({value:e,tabValues:p})?e:null}();return(0,s.useLayoutEffect)((function(){_&&f(_)}),[_]),{selectedValue:v,selectValue:(0,s.useCallback)((function(e){if(!y({value:e,tabValues:p}))throw new Error("Can't select invalid tab value="+e);f(e),T(e),S(e)}),[T,S,p]),tabValues:p}}var v=a(2389),f="tabList__CuJ",h="tabItem_LNqP";function b(e){var t=e.className,a=e.block,r=e.selectedValue,i=e.selectValue,c=e.tabValues,m=[],p=(0,l.o5)().blockElementScrollPositionUntilNextRender,u=function(e){var t=e.currentTarget,a=m.indexOf(t),n=c[a].value;n!==r&&(p(t),i(n))},y=function(e){var t,a=null;switch(e.key){case"Enter":u(e);break;case"ArrowRight":var n,s=m.indexOf(e.currentTarget)+1;a=null!=(n=m[s])?n:m[0];break;case"ArrowLeft":var o,l=m.indexOf(e.currentTarget)-1;a=null!=(o=m[l])?o:m[m.length-1]}null==(t=a)||t.focus()};return s.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.Z)("tabs",{"tabs--block":a},t)},c.map((function(e){var t=e.value,a=e.label,l=e.attributes;return s.createElement("li",(0,n.Z)({role:"tab",tabIndex:r===t?0:-1,"aria-selected":r===t,key:t,ref:function(e){return m.push(e)},onKeyDown:y,onClick:u},l,{className:(0,o.Z)("tabs__item",h,null==l?void 0:l.className,{"tabs__item--active":r===t})}),null!=a?a:t)})))}function T(e){var t=e.lazy,a=e.children,n=e.selectedValue,o=(Array.isArray(a)?a:[a]).filter(Boolean);if(t){var l=o.find((function(e){return e.props.value===n}));return 
l?(0,s.cloneElement)(l,{className:"margin-top--md"}):null}return s.createElement("div",{className:"margin-top--md"},o.map((function(e,t){return(0,s.cloneElement)(e,{key:t,hidden:e.props.value!==n})})))}function k(e){var t=d(e);return s.createElement("div",{className:(0,o.Z)("tabs-container",f)},s.createElement(b,(0,n.Z)({},e,t)),s.createElement(T,(0,n.Z)({},e,t)))}function I(e){var t=(0,v.Z)();return s.createElement(k,(0,n.Z)({key:String(t)},e))}},1989:function(e,t,a){var n=a(7294),s=a(2263);t.Z=function(e){var t=e.className,a=e.py,o=e.scala,l=e.csharp,r=e.sourceLink,i=(0,s.Z)().siteConfig.customFields.version,c="https://mmlspark.blob.core.windows.net/docs/"+i+"/pyspark/"+a,m="https://mmlspark.blob.core.windows.net/docs/"+i+"/scala/"+o,p="https://mmlspark.blob.core.windows.net/docs/"+i+"/dotnet/"+l;return n.createElement("table",null,n.createElement("tbody",null,n.createElement("tr",null,n.createElement("td",null,n.createElement("strong",null,"Python API: "),n.createElement("a",{href:c},t)),n.createElement("td",null,n.createElement("strong",null,"Scala API: "),n.createElement("a",{href:m},t)),n.createElement("td",null,n.createElement("strong",null,".NET API: "),n.createElement("a",{href:p},t)),n.createElement("td",null,n.createElement("strong",null,"Source: "),n.createElement("a",{href:r},t)))))}},7804:function(e,t,a){a.r(t),a.d(t,{assets:function(){return X},contentTitle:function(){return U},default:function(){return Q},frontMatter:function(){return j},metadata:function(){return H},toc:function(){return J}});var n=a(3117),s=a(102),o=(a(7294),a(3905)),l=a(4866),r=a(5162),i=a(1989),c=["components"],m=[{value:"Text Analytics",id:"text-analytics",level:2},{value:"EntityDetector",id:"entitydetector",level:3},{value:"KeyPhraseExtractor",id:"keyphraseextractor",level:3},{value:"LanguageDetector",id:"languagedetector",level:3},{value:"NER",id:"ner",level:3},{value:"PII",id:"pii",level:3},{value:"TextSentiment",id:"textsentiment",level:3}],p={toc:m};function u(e){var 
t=e.components,a=(0,s.Z)(e,c);return(0,o.kt)("wrapper",(0,n.Z)({},p,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"text-analytics"},"Text Analytics"),(0,o.kt)("h3",{id:"entitydetector"},"EntityDetector"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntextKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("1", "Microsoft released Windows 10"),\n ("2", "In 1975, Bill Gates III and Paul Allen founded the company.")\n], ["id", "text"])\n\nentity = (EntityDetector()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguage("en")\n .setOutputCol("replies")\n .setErrorCol("error"))\n\nentity.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.text.EntityDetector\nimport spark.implicits._\nimport org.apache.spark.sql.functions.{col, flatten}\n\nval textKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n ("1", "Microsoft released Windows 10"),\n ("2", "In 1975, Bill Gates III and Paul Allen founded the company.")\n ).toDF("id", "text")\n\nval entity = (new EntityDetector()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguage("en")\n 
.setOutputCol("replies"))\n\nentity.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"EntityDetector",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.EntityDetector",scala:"com/microsoft/azure/synapse/ml/cognitive/EntityDetector.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1EntityDetector.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextAnalytics.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"keyphraseextractor"},"KeyPhraseExtractor"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntextKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("en", "Hello world. This is some input text that I love."),\n ("fr", "Bonjour tout le monde"),\n ("es", "La carretera estaba atascada. Hab\xeda mucho tr\xe1fico el d\xeda de ayer.")\n], ["lang", "text"])\n\nkeyPhrase = (KeyPhraseExtractor()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguageCol("lang")\n .setOutputCol("replies")\n .setErrorCol("error"))\n\nkeyPhrase.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.text.KeyPhraseExtractor\nimport spark.implicits._\n\nval textKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n ("en", "Hello world. This is some input text that I love."),\n ("fr", "Bonjour tout le monde"),\n ("es", "La carretera estaba atascada. 
Hab\xeda mucho tr\xe1fico el d\xeda de ayer."),\n ("en", null)\n ).toDF("lang", "text")\n\nval keyPhrase = (new KeyPhraseExtractor()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguageCol("lang")\n .setOutputCol("replies"))\n\nkeyPhrase.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"KeyPhraseExtractor",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.KeyPhraseExtractor",scala:"com/microsoft/azure/synapse/ml/cognitive/KeyPhraseExtractor.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1KeyPhraseExtractor.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextAnalytics.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"languagedetector"},"LanguageDetector"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntextKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("Hello World",),\n ("Bonjour tout le monde",),\n ("La carretera estaba atascada. Hab\xeda mucho tr\xe1fico el d\xeda de ayer.",),\n ("\u4f60\u597d",),\n ("\u3053\u3093\u306b\u3061\u306f",),\n (":) :( :D",)\n], ["text",])\n\nlanguage = (LanguageDetector()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setTextCol("text")\n .setOutputCol("language")\n .setErrorCol("error"))\n\nlanguage.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.text.LanguageDetector\nimport spark.implicits._\n\nval textKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n "Hello World",\n "Bonjour tout le monde",\n "La carretera estaba atascada. 
Hab\xeda mucho tr\xe1fico el d\xeda de ayer.",\n ":) :( :D"\n ).toDF("text")\n\nval language = (new LanguageDetector()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setOutputCol("replies"))\n\nlanguage.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"LanguageDetector",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.LanguageDetector",scala:"com/microsoft/azure/synapse/ml/cognitive/LanguageDetector.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1LanguageDetector.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextAnalytics.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"ner"},"NER"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntextKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("1", "en", "I had a wonderful trip to Seattle last week."),\n ("2", "en", "I visited Space Needle 2 times.")\n], ["id", "language", "text"])\n\nner = (NER()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguageCol("language")\n .setOutputCol("replies")\n .setErrorCol("error"))\n\nner.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.text.NER\nimport spark.implicits._\n\nval textKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n ("1", "en", "I had a wonderful trip to Seattle last week."),\n ("2", "en", "I visited Space Needle 2 times.")\n ).toDF("id", "language", "text")\n\nval ner = (new NER()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguage("en")\n 
.setOutputCol("response"))\n\nner.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"NER",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.NER",scala:"com/microsoft/azure/synapse/ml/cognitive/NER.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1NER.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextAnalytics.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"pii"},"PII"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntextKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("1", "en", "My SSN is 859-98-0987"),\n ("2", "en",\n "Your ABA number - 111000025 - is the first 9 digits in the lower left hand corner of your personal check."),\n ("3", "en", "Is 998.214.865-68 your Brazilian CPF number?")\n], ["id", "language", "text"])\n\npii = (PII()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguage("en")\n .setOutputCol("response"))\n\npii.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.text.PII\nimport spark.implicits._\n\nval textKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n ("1", "en", "My SSN is 859-98-0987"),\n ("2", "en",\n "Your ABA number - 111000025 - is the first 9 digits in the lower left hand corner of your personal check."),\n ("3", "en", "Is 998.214.865-68 your Brazilian CPF number?")\n ).toDF("id", "language", "text")\n\nval pii = (new PII()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguage("en")\n 
.setOutputCol("response"))\n\npii.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"PII",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.PII",scala:"com/microsoft/azure/synapse/ml/cognitive/PII.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1PII.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextAnalytics.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"textsentiment"},"TextSentiment"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntextKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("I am so happy today, its sunny!", "en-US"),\n ("I am frustrated by this rush hour traffic", "en-US"),\n ("The cognitive services on spark aint bad", "en-US"),\n], ["text", "language"])\n\nsentiment = (TextSentiment()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setTextCol("text")\n .setOutputCol("sentiment")\n .setErrorCol("error")\n .setLanguageCol("language"))\n\nsentiment.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.text.TextSentiment\nimport spark.implicits._\n\nval textKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n ("en", "Hello world. This is some input text that I love."),\n ("fr", "Bonjour tout le monde"),\n ("es", "La carretera estaba atascada. 
Hab\xeda mucho tr\xe1fico el d\xeda de ayer."),\n (null, "ich bin ein berliner"),\n (null, null),\n ("en", null)\n ).toDF("lang", "text")\n\nval sentiment = (new TextSentiment()\n .setSubscriptionKey(textKey)\n .setLocation("eastus")\n .setLanguageCol("lang")\n .setModelVersion("latest")\n .setShowStats(true)\n .setOutputCol("replies"))\n\nsentiment.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"TextSentiment",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.TextSentiment",scala:"com/microsoft/azure/synapse/ml/cognitive/TextSentiment.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1TextSentiment.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextAnalytics.scala",mdxType:"DocTable"}))}u.isMDXComponent=!0;var y=["components"],g=[{value:"Translator",id:"translator",level:2},{value:"Translate",id:"translate",level:3},{value:"Transliterate",id:"transliterate",level:3},{value:"Detect",id:"detect",level:3},{value:"BreakSentence",id:"breaksentence",level:3},{value:"DictionaryLookup",id:"dictionarylookup",level:3},{value:"DictionaryExamples",id:"dictionaryexamples",level:3},{value:"DocumentTranslator",id:"documenttranslator",level:3}],d={toc:g};function v(e){var t=e.components,a=(0,s.Z)(e,y);return(0,o.kt)("wrapper",(0,n.Z)({},d,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"translator"},"Translator"),(0,o.kt)("h3",{id:"translate"},"Translate"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntranslatorKey = os.environ.get("TRANSLATOR_KEY", getSecret("translator-key"))\ndf = spark.createDataFrame([\n (["Hello, what is your name?", "Bye"],)\n], ["text",])\n\ntranslate = (Translate()\n .setSubscriptionKey(translatorKey)\n 
.setLocation("eastus")\n .setTextCol("text")\n .setToLanguage(["zh-Hans", "fr"])\n .setOutputCol("translation")\n .setConcurrency(5))\n\n(translate\n .transform(df)\n .withColumn("translation", flatten(col("translation.translations")))\n .withColumn("translation", col("translation.text"))\n .select("translation")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.translate.Translate\nimport spark.implicits._\nimport org.apache.spark.sql.functions.{col, flatten}\n\nval translatorKey = sys.env.getOrElse("TRANSLATOR_KEY", None)\nval df = Seq(List("Hello, what is your name?", "Bye")).toDF("text")\n\nval translate = (new Translate()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setTextCol("text")\n .setToLanguage(Seq("zh-Hans", "fr"))\n .setOutputCol("translation")\n .setConcurrency(5))\n\n(translate\n .transform(df)\n .withColumn("translation", flatten(col("translation.translations")))\n .withColumn("translation", col("translation.text"))\n .select("translation")).show()\n')))),(0,o.kt)(i.Z,{className:"Translate",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.Translate",scala:"com/microsoft/azure/synapse/ml/cognitive/Translate.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1Translate.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextTranslator.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"transliterate"},"Transliterate"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntranslatorKey = os.environ.get("TRANSLATOR_KEY", getSecret("translator-key"))\ndf = spark.createDataFrame([\n 
(["\u3053\u3093\u306b\u3061\u306f", "\u3055\u3088\u3046\u306a\u3089"],)\n], ["text",])\n\ntransliterate = (Transliterate()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setLanguage("ja")\n .setFromScript("Jpan")\n .setToScript("Latn")\n .setTextCol("text")\n .setOutputCol("result"))\n\n(transliterate\n .transform(df)\n .withColumn("text", col("result.text"))\n .withColumn("script", col("result.script"))\n .select("text", "script")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.translate.Transliterate\nimport spark.implicits._\nimport org.apache.spark.sql.functions.col\n\nval translatorKey = sys.env.getOrElse("TRANSLATOR_KEY", None)\nval df = Seq(List("\u3053\u3093\u306b\u3061\u306f", "\u3055\u3088\u3046\u306a\u3089")).toDF("text")\n\nval transliterate = (new Transliterate()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setLanguage("ja")\n .setFromScript("Jpan")\n .setToScript("Latn")\n .setTextCol("text")\n .setOutputCol("result"))\n\n(transliterate\n .transform(df)\n .withColumn("text", col("result.text"))\n .withColumn("script", col("result.script"))\n .select("text", "script")).show()\n')))),(0,o.kt)(i.Z,{className:"Transliterate",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.Transliterate",scala:"com/microsoft/azure/synapse/ml/cognitive/Transliterate.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1Transliterate.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextTranslator.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"detect"},"Detect"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from 
synapse.ml.cognitive import *\n\ntranslatorKey = os.environ.get("TRANSLATOR_KEY", getSecret("translator-key"))\ndf = spark.createDataFrame([\n (["Hello, what is your name?"],)\n], ["text",])\n\ndetect = (Detect()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setTextCol("text")\n .setOutputCol("result"))\n\n(detect\n .transform(df)\n .withColumn("language", col("result.language"))\n .select("language")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.translate.Detect\nimport spark.implicits._\nimport org.apache.spark.sql.functions.col\n\nval translatorKey = sys.env.getOrElse("TRANSLATOR_KEY", None)\nval df = Seq(List("Hello, what is your name?")).toDF("text")\n\nval detect = (new Detect()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setTextCol("text")\n .setOutputCol("result"))\n\n(detect\n .transform(df)\n .withColumn("language", col("result.language"))\n .select("language")).show()\n')))),(0,o.kt)(i.Z,{className:"Detect",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.Detect",scala:"com/microsoft/azure/synapse/ml/cognitive/Detect.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1Detect.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextTranslator.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"breaksentence"},"BreakSentence"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntranslatorKey = os.environ.get("TRANSLATOR_KEY", getSecret("translator-key"))\ndf = spark.createDataFrame([\n (["Hello, what is your name?"],)\n], ["text",])\n\nbreakSentence = 
(BreakSentence()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setTextCol("text")\n .setOutputCol("result"))\n\n(breakSentence\n .transform(df)\n .withColumn("sentLen", flatten(col("result.sentLen")))\n .select("sentLen")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.translate.BreakSentence\nimport spark.implicits._\nimport org.apache.spark.sql.functions.{col, flatten}\n\nval translatorKey = sys.env.getOrElse("TRANSLATOR_KEY", None)\nval df = Seq(List("Hello, what is your name?")).toDF("text")\n\nval breakSentence = (new BreakSentence()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setTextCol("text")\n .setOutputCol("result"))\n\n(breakSentence\n .transform(df)\n .withColumn("sentLen", flatten(col("result.sentLen")))\n .select("sentLen")).show()\n')))),(0,o.kt)(i.Z,{className:"BreakSentence",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.BreakSentence",scala:"com/microsoft/azure/synapse/ml/cognitive/BreakSentence.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1BreakSentence.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextTranslator.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"dictionarylookup"},"DictionaryLookup"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntranslatorKey = os.environ.get("TRANSLATOR_KEY", getSecret("translator-key"))\ndf = spark.createDataFrame([\n (["fly"],)\n], ["text",])\n\ndictionaryLookup = (DictionaryLookup()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setFromLanguage("en")\n .setToLanguage("es")\n 
.setTextCol("text")\n .setOutputCol("result"))\n\n(dictionaryLookup\n .transform(df)\n .withColumn("translations", flatten(col("result.translations")))\n .withColumn("normalizedTarget", col("translations.normalizedTarget"))\n .select("normalizedTarget")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.translate.DictionaryLookup\nimport spark.implicits._\nimport org.apache.spark.sql.functions.{col, flatten}\n\nval translatorKey = sys.env.getOrElse("TRANSLATOR_KEY", None)\nval df = Seq(List("fly")).toDF("text")\n\nval dictionaryLookup = (new DictionaryLookup()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setFromLanguage("en")\n .setToLanguage("es")\n .setTextCol("text")\n .setOutputCol("result"))\n\n(dictionaryLookup\n .transform(df)\n .withColumn("translations", flatten(col("result.translations")))\n .withColumn("normalizedTarget", col("translations.normalizedTarget"))\n .select("normalizedTarget")).show()\n')))),(0,o.kt)(i.Z,{className:"DictionaryLookup",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.DictionaryLookup",scala:"com/microsoft/azure/synapse/ml/cognitive/DictionaryLookup.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1DictionaryLookup.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextTranslator.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"dictionaryexamples"},"DictionaryExamples"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntranslatorKey = os.environ.get("TRANSLATOR_KEY", getSecret("translator-key"))\ndf = (spark.createDataFrame([\n ("fly", "volar")\n], 
["text", "translation"])\n .withColumn("textAndTranslation", array(struct(col("text"), col("translation")))))\n\ndictionaryExamples = (DictionaryExamples()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setFromLanguage("en")\n .setToLanguage("es")\n .setTextAndTranslationCol("textAndTranslation")\n .setOutputCol("result"))\n\n(dictionaryExamples\n .transform(df)\n .withColumn("examples", flatten(col("result.examples")))\n .select("examples")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.translate.{DictionaryExamples, TextAndTranslation}\nimport spark.implicits._\nimport org.apache.spark.sql.functions.{col, flatten}\n\nval translatorKey = sys.env.getOrElse("TRANSLATOR_KEY", None)\nval df = Seq(List(TextAndTranslation("fly", "volar"))).toDF("textAndTranslation")\n\nval dictionaryExamples = (new DictionaryExamples()\n .setSubscriptionKey(translatorKey)\n .setLocation("eastus")\n .setFromLanguage("en")\n .setToLanguage("es")\n .setTextAndTranslationCol("textAndTranslation")\n .setOutputCol("result"))\n\n(dictionaryExamples\n .transform(df)\n .withColumn("examples", flatten(col("result.examples")))\n 
.select("examples")).show()\n')))),(0,o.kt)(i.Z,{className:"DictionaryExamples",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.DictionaryExamples",scala:"com/microsoft/azure/synapse/ml/cognitive/DictionaryExamples.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1DictionaryExamples.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/TextTranslator.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"documenttranslator"},"DocumentTranslator"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ntranslatorKey = os.environ.get("TRANSLATOR_KEY", getSecret("translator-key"))\ntranslatorName = os.environ.get("TRANSLATOR_NAME", "mmlspark-translator")\n\ndocumentTranslator = (DocumentTranslator()\n .setSubscriptionKey(translatorKey)\n .setServiceName(translatorName)\n .setSourceUrlCol("sourceUrl")\n .setTargetsCol("targets")\n .setOutputCol("translationStatus"))\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.translate.DocumentTranslator\nimport spark.implicits._\n\nval translatorKey = sys.env.getOrElse("TRANSLATOR_KEY", None)\nval translatorName = sys.env.getOrElse("TRANSLATOR_NAME", None)\n\nval documentTranslator = (new DocumentTranslator()\n .setSubscriptionKey(translatorKey)\n .setServiceName(translatorName)\n .setSourceUrlCol("sourceUrl")\n .setTargetsCol("targets")\n 
.setOutputCol("translationStatus"))\n')))),(0,o.kt)(i.Z,{className:"DocumentTranslator",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.DocumentTranslator",scala:"com/microsoft/azure/synapse/ml/cognitive/DocumentTranslator.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1DocumentTranslator.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/DocumentTranslator.scala",mdxType:"DocTable"}))}v.isMDXComponent=!0;var f=["components"],h=[{value:"Computer Vision",id:"computer-vision",level:2},{value:"OCR",id:"ocr",level:3},{value:"AnalyzeImage",id:"analyzeimage",level:3},{value:"RecognizeText",id:"recognizetext",level:3},{value:"ReadImage",id:"readimage",level:3},{value:"RecognizeDomainSpecificContent",id:"recognizedomainspecificcontent",level:3},{value:"GenerateThumbnails",id:"generatethumbnails",level:3},{value:"TagImage",id:"tagimage",level:3},{value:"DescribeImage",id:"describeimage",level:3}],b={toc:h};function T(e){var t=e.components,a=(0,s.Z)(e,f);return(0,o.kt)("wrapper",(0,n.Z)({},b,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"computer-vision"},"Computer Vision"),(0,o.kt)("h3",{id:"ocr"},"OCR"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\n\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", ),\n ], ["url", ])\n\nocr = (OCR()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setDetectOrientation(True)\n 
.setOutputCol("ocr"))\n\nocr.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.vision.OCR\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg"\n ).toDF("url")\n\n\nval ocr = (new OCR()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setDetectOrientation(true)\n .setOutputCol("ocr"))\n\nocr.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"OCR",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.OCR",scala:"com/microsoft/azure/synapse/ml/cognitive/OCR.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1OCR.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/ComputerVision.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"analyzeimage"},"AnalyzeImage"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", "en"),\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png", None),\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png", "en")\n ], ["image", "language"])\n\n\nai = (AnalyzeImage()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("image")\n .setLanguageCol("language")\n .setVisualFeatures(["Categories", "Tags", "Description", "Faces", "ImageType", "Color", "Adult", "Objects", "Brands"])\n 
.setDetails(["Celebrities", "Landmarks"])\n .setOutputCol("features"))\n\nai.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.vision.AnalyzeImage\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", "en"),\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png", null),\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png", "en")\n ).toDF("url", "language")\n\nval ai = (new AnalyzeImage()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setLanguageCol("language")\n .setVisualFeatures(Seq("Categories", "Tags", "Description", "Faces", "ImageType", "Color", "Adult", "Objects", "Brands"))\n .setDetails(Seq("Celebrities", "Landmarks"))\n .setOutputCol("features"))\n\nai.transform(df).select("url", "features").show()\n')))),(0,o.kt)(i.Z,{className:"AnalyzeImage",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AnalyzeImage",scala:"com/microsoft/azure/synapse/ml/cognitive/AnalyzeImage.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AnalyzeImage.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/ComputerVision.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"recognizetext"},"RecognizeText"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", 
),\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png", ),\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png", )\n ], ["url", ])\n\nrt = (RecognizeText()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setMode("Printed")\n .setOutputCol("ocr")\n .setConcurrency(5))\n\nrt.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.vision.RecognizeText\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg",\n "https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png",\n "https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png"\n ).toDF("url")\n\nval rt = (new RecognizeText()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setMode("Printed")\n .setOutputCol("ocr")\n .setConcurrency(5))\n\nrt.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"RecognizeText",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.RecognizeText",scala:"com/microsoft/azure/synapse/ml/cognitive/RecognizeText.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1RecognizeText.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/ComputerVision.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"readimage"},"ReadImage"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n 
("https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg", ),\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png", ),\n ("https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png", )\n ], ["url", ])\n\nri = (ReadImage()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("ocr")\n .setConcurrency(5))\n\nri.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.vision.ReadImage\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/OCR/test1.jpg",\n "https://mmlspark.blob.core.windows.net/datasets/OCR/test2.png",\n "https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png"\n ).toDF("url")\n\nval ri = (new ReadImage()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("ocr")\n .setConcurrency(5))\n\nri.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"ReadImage",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.ReadImage",scala:"com/microsoft/azure/synapse/ml/cognitive/ReadImage.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1ReadImage.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/ComputerVision.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"recognizedomainspecificcontent"},"RecognizeDomainSpecificContent"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = 
spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg", )\n ], ["url", ])\n\nceleb = (RecognizeDomainSpecificContent()\n .setSubscriptionKey(cognitiveKey)\n .setModel("celebrities")\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("celebs"))\n\nceleb.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.vision.RecognizeDomainSpecificContent\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg"\n ).toDF("url")\n\nval celeb = (new RecognizeDomainSpecificContent()\n .setSubscriptionKey(cognitiveKey)\n .setModel("celebrities")\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("celebs"))\n\nceleb.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"RecognizeDomainSpecificContent",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.RecognizeDomainSpecificContent",scala:"com/microsoft/azure/synapse/ml/cognitive/RecognizeDomainSpecificContent.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1RecognizeDomainSpecificContent.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/ComputerVision.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"generatethumbnails"},"GenerateThumbnails"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg", )\n ], ["url", 
])\n\ngt = (GenerateThumbnails()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setHeight(50)\n .setWidth(50)\n .setSmartCropping(True)\n .setImageUrlCol("url")\n .setOutputCol("thumbnails"))\n\ngt.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.vision.GenerateThumbnails\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df: DataFrame = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg"\n ).toDF("url")\n\nval gt = (new GenerateThumbnails()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setHeight(50)\n .setWidth(50)\n .setSmartCropping(true)\n .setImageUrlCol("url")\n .setOutputCol("thumbnails"))\n\ngt.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"GenerateThumbnails",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.GenerateThumbnails",scala:"com/microsoft/azure/synapse/ml/cognitive/GenerateThumbnails.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1GenerateThumbnails.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/ComputerVision.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"tagimage"},"TagImage"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg", )\n ], ["url", ])\n\nti = (TagImage()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n 
.setOutputCol("tags"))\n\nti.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.vision.TagImage\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg"\n ).toDF("url")\n\nval ti = (new TagImage()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("tags"))\n\nti.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"TagImage",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.TagImage",scala:"com/microsoft/azure/synapse/ml/cognitive/TagImage.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1TagImage.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/ComputerVision.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"describeimage"},"DescribeImage"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg", )\n ], ["url", ])\n\ndi = (DescribeImage()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setMaxCandidates(3)\n .setImageUrlCol("url")\n .setOutputCol("descriptions"))\n\ndi.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.vision.DescribeImage\nimport spark.implicits._\n\nval cognitiveKey = 
sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg"\n ).toDF("url")\n\nval di = (new DescribeImage()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setMaxCandidates(3)\n .setImageUrlCol("url")\n .setOutputCol("descriptions"))\n\ndi.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"DescribeImage",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.DescribeImage",scala:"com/microsoft/azure/synapse/ml/cognitive/DescribeImage.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1DescribeImage.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/ComputerVision.scala",mdxType:"DocTable"}))}T.isMDXComponent=!0;var k=["components"],I=[{value:"Form Recognizer",id:"form-recognizer",level:2},{value:"AnalyzeLayout",id:"analyzelayout",level:3},{value:"AnalyzeReceipts",id:"analyzereceipts",level:3},{value:"AnalyzeBusinessCards",id:"analyzebusinesscards",level:3},{value:"AnalyzeInvoices",id:"analyzeinvoices",level:3},{value:"AnalyzeIDDocuments",id:"analyzeiddocuments",level:3},{value:"AnalyzeCustomModel",id:"analyzecustommodel",level:3},{value:"GetCustomModel",id:"getcustommodel",level:3},{value:"ListCustomModels",id:"listcustommodels",level:3},{value:"Form Recognizer V3",id:"form-recognizer-v3",level:2},{value:"AnalyzeDocument",id:"analyzedocument",level:3}],S={toc:I};function _(e){var t=e.components,a=(0,s.Z)(e,k);return(0,o.kt)("wrapper",(0,n.Z)({},S,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"form-recognizer"},"Form Recognizer"),(0,o.kt)("h3",{id:"analyzelayout"},"AnalyzeLayout"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = 
os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nimageDf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg",)\n], ["source",])\n\nanalyzeLayout = (AnalyzeLayout()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n .setOutputCol("layout")\n .setConcurrency(5))\n\n(analyzeLayout.transform(imageDf)\n .withColumn("lines", flatten(col("layout.analyzeResult.readResults.lines")))\n .withColumn("readLayout", col("lines.text"))\n .withColumn("tables", flatten(col("layout.analyzeResult.pageResults.tables")))\n .withColumn("cells", flatten(col("tables.cells")))\n .withColumn("pageLayout", col("cells.text"))\n .select("source", "readLayout", "pageLayout")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.AnalyzeLayout\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval imageDf = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg"\n ).toDF("source")\n\nval analyzeLayout = (new AnalyzeLayout()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n .setOutputCol("layout")\n 
.setConcurrency(5))\n\nanalyzeLayout.transform(imageDf).show()\n')))),(0,o.kt)(i.Z,{className:"AnalyzeLayout",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AnalyzeLayout",scala:"com/microsoft/azure/synapse/ml/cognitive/AnalyzeLayout.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AnalyzeLayout.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizer.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"analyzereceipts"},"AnalyzeReceipts"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nimageDf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png",),\n ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png",)\n], ["image",])\n\nanalyzeReceipts = (AnalyzeReceipts()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("image")\n .setOutputCol("receipts")\n .setConcurrency(5))\n\nanalyzeReceipts.transform(imageDf).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.AnalyzeReceipts\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval imageDf = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png",\n "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png"\n ).toDF("source")\n\nval analyzeReceipts = (new AnalyzeReceipts()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n 
.setOutputCol("receipts")\n .setConcurrency(5))\n\nanalyzeReceipts.transform(imageDf).show()\n')))),(0,o.kt)(i.Z,{className:"AnalyzeReceipts",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AnalyzeReceipts",scala:"com/microsoft/azure/synapse/ml/cognitive/AnalyzeReceipts.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AnalyzeReceipts.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizer.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"analyzebusinesscards"},"AnalyzeBusinessCards"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nimageDf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/business_card.jpg",)\n], ["source",])\n\nanalyzeBusinessCards = (AnalyzeBusinessCards()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n .setOutputCol("businessCards")\n .setConcurrency(5))\n\nanalyzeBusinessCards.transform(imageDf).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.AnalyzeBusinessCards\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval imageDf = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/business_card.jpg"\n ).toDF("source")\n\nval analyzeBusinessCards = (new AnalyzeBusinessCards()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n .setOutputCol("businessCards")\n 
.setConcurrency(5))\n\nanalyzeBusinessCards.transform(imageDf).show()\n')))),(0,o.kt)(i.Z,{className:"AnalyzeBusinessCards",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AnalyzeBusinessCards",scala:"com/microsoft/azure/synapse/ml/cognitive/AnalyzeBusinessCards.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AnalyzeBusinessCards.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizer.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"analyzeinvoices"},"AnalyzeInvoices"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nimageDf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/invoice2.png",)\n], ["source",])\n\nanalyzeInvoices = (AnalyzeInvoices()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n .setOutputCol("invoices")\n .setConcurrency(5))\n\n(analyzeInvoices\n .transform(imageDf)\n .withColumn("documents", explode(col("invoices.analyzeResult.documentResults.fields")))\n .select("source", "documents")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.AnalyzeInvoices\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval imageDf = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/invoice2.png"\n ).toDF("source")\n\nval analyzeInvoices = (new AnalyzeInvoices()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n 
.setOutputCol("invoices")\n .setConcurrency(5))\n\nanalyzeInvoices.transform(imageDf).show()\n')))),(0,o.kt)(i.Z,{className:"AnalyzeInvoices",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AnalyzeInvoices",scala:"com/microsoft/azure/synapse/ml/cognitive/AnalyzeInvoices.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AnalyzeInvoices.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizer.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"analyzeiddocuments"},"AnalyzeIDDocuments"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nimageDf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/id1.jpg",)\n], ["source",])\n\nanalyzeIDDocuments = (AnalyzeIDDocuments()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n .setOutputCol("ids")\n .setConcurrency(5))\n\n(analyzeIDDocuments\n .transform(imageDf)\n .withColumn("documents", explode(col("ids.analyzeResult.documentResults.fields")))\n .select("source", "documents")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.AnalyzeIDDocuments\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval imageDf = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/id1.jpg"\n ).toDF("source")\n\nval analyzeIDDocuments = (new AnalyzeIDDocuments()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n 
.setOutputCol("ids")\n .setConcurrency(5))\n\nanalyzeIDDocuments.transform(imageDf).show()\n')))),(0,o.kt)(i.Z,{className:"AnalyzeIDDocuments",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AnalyzeIDDocuments",scala:"com/microsoft/azure/synapse/ml/cognitive/AnalyzeIDDocuments.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AnalyzeIDDocuments.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizer.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"analyzecustommodel"},"AnalyzeCustomModel"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nmodelId = "02bc2f58-2beb-4ae3-84fb-08f011b2f7b8" # put your own modelId here\nimageDf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/invoice2.png",)\n], ["source",])\n\nanalyzeCustomModel = (AnalyzeCustomModel()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setModelId(modelId)\n .setImageUrlCol("source")\n .setOutputCol("output")\n .setConcurrency(5))\n\n(analyzeCustomModel\n .transform(imageDf)\n .withColumn("keyValuePairs", flatten(col("output.analyzeResult.pageResults.keyValuePairs")))\n .withColumn("keys", col("keyValuePairs.key.text"))\n .withColumn("values", col("keyValuePairs.value.text"))\n .withColumn("keyValuePairs", create_map(lit("key"), col("keys"), lit("value"), col("values")))\n .select("source", "keyValuePairs")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.AnalyzeCustomModel\nimport 
spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval modelId = "02bc2f58-2beb-4ae3-84fb-08f011b2f7b8" // put your own modelId here\nval imageDf = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/invoice2.png"\n ).toDF("source")\n\nval analyzeCustomModel = (new AnalyzeCustomModel()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setModelId(modelId)\n .setImageUrlCol("source")\n .setOutputCol("output")\n .setConcurrency(5))\n\nanalyzeCustomModel.transform(imageDf).show()\n')))),(0,o.kt)(i.Z,{className:"AnalyzeCustomModel",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AnalyzeCustomModel",scala:"com/microsoft/azure/synapse/ml/cognitive/AnalyzeCustomModel.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AnalyzeCustomModel.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizer.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"getcustommodel"},"GetCustomModel"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nmodelId = "02bc2f58-2beb-4ae3-84fb-08f011b2f7b8" # put your own modelId here\nemptyDf = spark.createDataFrame([("",)])\n\ngetCustomModel = (GetCustomModel()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setModelId(modelId)\n .setIncludeKeys(True)\n .setOutputCol("model")\n .setConcurrency(5))\n\n(getCustomModel\n .transform(emptyDf)\n .withColumn("modelInfo", col("model.ModelInfo"))\n .withColumn("trainResult", col("model.TrainResult"))\n .select("modelInfo", 
"trainResult")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.GetCustomModel\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval modelId = "02bc2f58-2beb-4ae3-84fb-08f011b2f7b8" // put your own modelId here\nval emptyDf = Seq("").toDF()\n\nval getCustomModel = (new GetCustomModel()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setModelId(modelId)\n .setIncludeKeys(true)\n .setOutputCol("model")\n .setConcurrency(5))\n\ngetCustomModel.transform(emptyDf).show()\n')))),(0,o.kt)(i.Z,{className:"GetCustomModel",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.GetCustomModel",scala:"com/microsoft/azure/synapse/ml/cognitive/GetCustomModel.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1GetCustomModel.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizer.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"listcustommodels"},"ListCustomModels"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nemptyDf = spark.createDataFrame([("",)])\n\nlistCustomModels = (ListCustomModels()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOp("full")\n .setOutputCol("models")\n .setConcurrency(5))\n\n(listCustomModels\n .transform(emptyDf)\n .withColumn("modelIds", col("models.modelList.modelId"))\n 
.select("modelIds")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.ListCustomModels\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval emptyDf = Seq("").toDF()\n\nval listCustomModels = (new ListCustomModels()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOp("full")\n .setOutputCol("models")\n .setConcurrency(5))\n\nlistCustomModels.transform(emptyDf).show()\n')))),(0,o.kt)(i.Z,{className:"ListCustomModels",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.ListCustomModels",scala:"com/microsoft/azure/synapse/ml/cognitive/ListCustomModels.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1ListCustomModels.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizer.scala",mdxType:"DocTable"}),(0,o.kt)("h2",{id:"form-recognizer-v3"},"Form Recognizer V3"),(0,o.kt)("h3",{id:"analyzedocument"},"AnalyzeDocument"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nimageDf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg",)\n], ["source",])\n\nanalyzeDocument = (AnalyzeDocument()\n # For supported prebuilt models, please go to documentation page for details\n .setPrebuiltModelId("prebuilt-layout")\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n .setOutputCol("result")\n .setConcurrency(5))\n\n(analyzeDocument.transform(imageDf)\n .withColumn("content", 
col("result.analyzeResult.content"))\n .withColumn("cells", flatten(col("result.analyzeResult.tables.cells")))\n .withColumn("cells", col("cells.content"))\n .select("source", "result", "content", "cells")).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.form.AnalyzeDocument\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval imageDf = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg"\n ).toDF("source")\n\nval analyzeDocument = (new AnalyzeDocument()\n .setPrebuiltModelId("prebuilt-layout")\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("source")\n .setOutputCol("result")\n .setConcurrency(5))\n\nanalyzeDocument.transform(imageDf).show()\n')))),(0,o.kt)(i.Z,{className:"AnalyzeDocument",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AnalyzeDocument",scala:"com/microsoft/azure/synapse/ml/cognitive/AnalyzeDocument.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AnalyzeDocument.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/FormRecognizerV3.scala",mdxType:"DocTable"}))}_.isMDXComponent=!0;var C=["components"],x=[{value:"Anomaly Detection",id:"anomaly-detection",level:2},{value:"DetectLastAnomaly",id:"detectlastanomaly",level:3},{value:"DetectAnomalies",id:"detectanomalies",level:3},{value:"SimpleDetectAnomalies",id:"simpledetectanomalies",level:3}],D={toc:x};function w(e){var t=e.components,a=(0,s.Z)(e,C);return(0,o.kt)("wrapper",(0,n.Z)({},D,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"anomaly-detection"},"Anomaly 
Detection"),(0,o.kt)("h3",{id:"detectlastanomaly"},"DetectLastAnomaly"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\nfrom pyspark.sql.functions import lit\n\nanomalyKey = os.environ.get("ANOMALY_API_KEY", getSecret("anomaly-api-key"))\ndf = (spark.createDataFrame([\n ("1972-01-01T00:00:00Z", 826.0),\n ("1972-02-01T00:00:00Z", 799.0),\n ("1972-03-01T00:00:00Z", 890.0),\n ("1972-04-01T00:00:00Z", 900.0),\n ("1972-05-01T00:00:00Z", 766.0),\n ("1972-06-01T00:00:00Z", 805.0),\n ("1972-07-01T00:00:00Z", 821.0),\n ("1972-08-01T00:00:00Z", 20000.0),\n ("1972-09-01T00:00:00Z", 883.0),\n ("1972-10-01T00:00:00Z", 898.0),\n ("1972-11-01T00:00:00Z", 957.0),\n ("1972-12-01T00:00:00Z", 924.0),\n ("1973-01-01T00:00:00Z", 881.0),\n ("1973-02-01T00:00:00Z", 837.0),\n ("1973-03-01T00:00:00Z", 90000.0)\n], ["timestamp", "value"])\n .withColumn("group", lit(1))\n .withColumn("inputs", struct(col("timestamp"), col("value")))\n .groupBy(col("group"))\n .agg(sort_array(collect_list(col("inputs"))).alias("inputs")))\n\ndla = (DetectLastAnomaly()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("anomalies")\n .setSeriesCol("inputs")\n .setGranularity("monthly")\n .setErrorCol("errors"))\n\ndla.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.anomaly.DetectLastAnomaly\nimport spark.implicits._\nimport org.apache.spark.sql.functions.{col, collect_list, lit, sort_array, struct}\n\nval anomalyKey = sys.env.getOrElse("ANOMALY_API_KEY", None)\nval df = (Seq(\n ("1972-01-01T00:00:00Z", 826.0),\n ("1972-02-01T00:00:00Z", 799.0),\n ("1972-03-01T00:00:00Z", 890.0),\n ("1972-04-01T00:00:00Z", 
900.0),\n ("1972-05-01T00:00:00Z", 766.0),\n ("1972-06-01T00:00:00Z", 805.0),\n ("1972-07-01T00:00:00Z", 821.0),\n ("1972-08-01T00:00:00Z", 20000.0),\n ("1972-09-01T00:00:00Z", 883.0),\n ("1972-10-01T00:00:00Z", 898.0),\n ("1972-11-01T00:00:00Z", 957.0),\n ("1972-12-01T00:00:00Z", 924.0),\n ("1973-01-01T00:00:00Z", 881.0),\n ("1973-02-01T00:00:00Z", 837.0),\n ("1973-03-01T00:00:00Z", 90000.0)\n ).toDF("timestamp","value")\n .withColumn("group", lit(1))\n .withColumn("inputs", struct(col("timestamp"), col("value")))\n .groupBy(col("group"))\n .agg(sort_array(collect_list(col("inputs"))).alias("inputs")))\n\nval dla = (new DetectLastAnomaly()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("anomalies")\n .setSeriesCol("inputs")\n .setGranularity("monthly")\n .setErrorCol("errors"))\n\ndla.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"DetectLastAnomaly",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.DetectLastAnomaly",scala:"com/microsoft/azure/synapse/ml/cognitive/DetectLastAnomaly.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1DetectLastAnomaly.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/AnomalyDetection.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"detectanomalies"},"DetectAnomalies"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\nanomalyKey = os.environ.get("ANOMALY_API_KEY", getSecret("anomaly-api-key"))\ndf = (spark.createDataFrame([\n ("1972-01-01T00:00:00Z", 826.0),\n ("1972-02-01T00:00:00Z", 799.0),\n ("1972-03-01T00:00:00Z", 890.0),\n ("1972-04-01T00:00:00Z", 900.0),\n ("1972-05-01T00:00:00Z", 766.0),\n ("1972-06-01T00:00:00Z", 805.0),\n ("1972-07-01T00:00:00Z", 821.0),\n 
("1972-08-01T00:00:00Z", 20000.0),\n ("1972-09-01T00:00:00Z", 883.0),\n ("1972-10-01T00:00:00Z", 898.0),\n ("1972-11-01T00:00:00Z", 957.0),\n ("1972-12-01T00:00:00Z", 924.0),\n ("1973-01-01T00:00:00Z", 881.0),\n ("1973-02-01T00:00:00Z", 837.0),\n ("1973-03-01T00:00:00Z", 90000.0)\n], ["timestamp", "value"])\n .withColumn("group", lit(1))\n .withColumn("inputs", struct(col("timestamp"), col("value")))\n .groupBy(col("group"))\n .agg(sort_array(collect_list(col("inputs"))).alias("inputs")))\n\nda = (DetectAnomalies()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("anomalies")\n .setSeriesCol("inputs")\n .setGranularity("monthly"))\n\nda.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.anomaly.DetectAnomalies\nimport spark.implicits._\n\nval anomalyKey = sys.env.getOrElse("ANOMALY_API_KEY", None)\nval df = (Seq(\n ("1972-01-01T00:00:00Z", 826.0),\n ("1972-02-01T00:00:00Z", 799.0),\n ("1972-03-01T00:00:00Z", 890.0),\n ("1972-04-01T00:00:00Z", 900.0),\n ("1972-05-01T00:00:00Z", 766.0),\n ("1972-06-01T00:00:00Z", 805.0),\n ("1972-07-01T00:00:00Z", 821.0),\n ("1972-08-01T00:00:00Z", 20000.0),\n ("1972-09-01T00:00:00Z", 883.0),\n ("1972-10-01T00:00:00Z", 898.0),\n ("1972-11-01T00:00:00Z", 957.0),\n ("1972-12-01T00:00:00Z", 924.0),\n ("1973-01-01T00:00:00Z", 881.0),\n ("1973-02-01T00:00:00Z", 837.0),\n ("1973-03-01T00:00:00Z", 90000.0)\n ).toDF("timestamp","value")\n .withColumn("group", lit(1))\n .withColumn("inputs", struct(col("timestamp"), col("value")))\n .groupBy(col("group"))\n .agg(sort_array(collect_list(col("inputs"))).alias("inputs")))\n\nval da = (new DetectAnomalies()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("anomalies")\n .setSeriesCol("inputs")\n 
.setGranularity("monthly"))\n\nda.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"DetectAnomalies",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.DetectAnomalies",scala:"com/microsoft/azure/synapse/ml/cognitive/DetectAnomalies.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1DetectAnomalies.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/AnomalyDetection.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"simpledetectanomalies"},"SimpleDetectAnomalies"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\nanomalyKey = os.environ.get("ANOMALY_API_KEY", getSecret("anomaly-api-key"))\ndf = (spark.createDataFrame([\n ("1972-01-01T00:00:00Z", 826.0, 1.0),\n ("1972-02-01T00:00:00Z", 799.0, 1.0),\n ("1972-03-01T00:00:00Z", 890.0, 1.0),\n ("1972-04-01T00:00:00Z", 900.0, 1.0),\n ("1972-05-01T00:00:00Z", 766.0, 1.0),\n ("1972-06-01T00:00:00Z", 805.0, 1.0),\n ("1972-07-01T00:00:00Z", 821.0, 1.0),\n ("1972-08-01T00:00:00Z", 20000.0, 1.0),\n ("1972-09-01T00:00:00Z", 883.0, 1.0),\n ("1972-10-01T00:00:00Z", 898.0, 1.0),\n ("1972-11-01T00:00:00Z", 957.0, 1.0),\n ("1972-12-01T00:00:00Z", 924.0, 1.0),\n ("1973-01-01T00:00:00Z", 881.0, 1.0),\n ("1973-02-01T00:00:00Z", 837.0, 1.0),\n ("1973-03-01T00:00:00Z", 90000.0, 1.0),\n ("1972-01-01T00:00:00Z", 826.0, 2.0),\n ("1972-02-01T00:00:00Z", 799.0, 2.0),\n ("1972-03-01T00:00:00Z", 890.0, 2.0),\n ("1972-04-01T00:00:00Z", 900.0, 2.0),\n ("1972-05-01T00:00:00Z", 766.0, 2.0),\n ("1972-06-01T00:00:00Z", 805.0, 2.0),\n ("1972-07-01T00:00:00Z", 821.0, 2.0),\n ("1972-08-01T00:00:00Z", 20000.0, 2.0),\n ("1972-09-01T00:00:00Z", 883.0, 2.0),\n ("1972-10-01T00:00:00Z", 898.0, 2.0),\n ("1972-11-01T00:00:00Z", 957.0, 2.0),\n 
("1972-12-01T00:00:00Z", 924.0, 2.0),\n ("1973-01-01T00:00:00Z", 881.0, 2.0),\n ("1973-02-01T00:00:00Z", 837.0, 2.0),\n ("1973-03-01T00:00:00Z", 90000.0, 2.0)\n], ["timestamp", "value", "group"]))\n\nsda = (SimpleDetectAnomalies()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("anomalies")\n .setGroupbyCol("group")\n .setGranularity("monthly"))\n\nsda.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.anomaly.SimpleDetectAnomalies\nimport spark.implicits._\n\nval anomalyKey = sys.env.getOrElse("ANOMALY_API_KEY", None)\nval baseSeq = Seq(\n ("1972-01-01T00:00:00Z", 826.0),\n ("1972-02-01T00:00:00Z", 799.0),\n ("1972-03-01T00:00:00Z", 890.0),\n ("1972-04-01T00:00:00Z", 900.0),\n ("1972-05-01T00:00:00Z", 766.0),\n ("1972-06-01T00:00:00Z", 805.0),\n ("1972-07-01T00:00:00Z", 821.0),\n ("1972-08-01T00:00:00Z", 20000.0),\n ("1972-09-01T00:00:00Z", 883.0),\n ("1972-10-01T00:00:00Z", 898.0),\n ("1972-11-01T00:00:00Z", 957.0),\n ("1972-12-01T00:00:00Z", 924.0),\n ("1973-01-01T00:00:00Z", 881.0),\n ("1973-02-01T00:00:00Z", 837.0),\n ("1973-03-01T00:00:00Z", 9000.0)\n )\nval df = (baseSeq.map(p => (p._1,p._2,1.0))\n .++(baseSeq.map(p => (p._1,p._2,2.0)))\n .toDF("timestamp","value","group"))\n\nval sda = (new SimpleDetectAnomalies()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("anomalies")\n .setGroupbyCol("group")\n 
.setGranularity("monthly"))\n\nsda.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"SimpleDetectAnomalies",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.SimpleDetectAnomalies",scala:"com/microsoft/azure/synapse/ml/cognitive/SimpleDetectAnomalies.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1SimpleDetectAnomalies.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/AnomalyDetection.scala",mdxType:"DocTable"}))}w.isMDXComponent=!0;var N=["components"],K=[{value:"Face",id:"face",level:2},{value:"DetectFace",id:"detectface",level:3},{value:"FindSimilarFace",id:"findsimilarface",level:3},{value:"GroupFaces",id:"groupfaces",level:3},{value:"IdentifyFaces",id:"identifyfaces",level:3},{value:"VerifyFaces",id:"verifyfaces",level:3}],L={toc:K};function z(e){var t=e.components,a=(0,s.Z)(e,N);return(0,o.kt)("wrapper",(0,n.Z)({},L,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"face"},"Face"),(0,o.kt)("h3",{id:"detectface"},"DetectFace"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",),\n], ["url"])\n\nface = (DetectFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("detected_faces")\n .setReturnFaceId(True)\n .setReturnFaceLandmarks(False)\n .setReturnFaceAttributes(["age", "gender", "headPose", "smile", "facialHair", "glasses", "emotion",\n "hair", "makeup", "occlusion", "accessories", "blur", "exposure", 
"noise"]))\n\nface.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.face.DetectFace\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df: DataFrame = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg"\n ).toDF("url")\n\nval face = (new DetectFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("face")\n .setReturnFaceId(true)\n .setReturnFaceLandmarks(true)\n .setReturnFaceAttributes(Seq(\n "age", "gender", "headPose", "smile", "facialHair", "glasses", "emotion",\n "hair", "makeup", "occlusion", "accessories", "blur", "exposure", "noise")))\n\nface.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"DetectFace",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.DetectFace",scala:"com/microsoft/azure/synapse/ml/cognitive/DetectFace.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1DetectFace.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/Face.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"findsimilarface"},"FindSimilarFace"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",),\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",),\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg",)\n], ["url"])\n\ndetector = (DetectFace()\n .setSubscriptionKey(cognitiveKey)\n 
.setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("detected_faces")\n .setReturnFaceId(True)\n .setReturnFaceLandmarks(False)\n .setReturnFaceAttributes([]))\n\nfaceIdDF = detector.transform(df).select("detected_faces").select(col("detected_faces").getItem(0).getItem("faceId").alias("id"))\nfaceIds = [row.asDict()[\'id\'] for row in faceIdDF.collect()]\n\nfindSimilar = (FindSimilarFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("similar")\n .setFaceIdCol("id")\n .setFaceIds(faceIds))\n\nfindSimilar.transform(faceIdDF).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.face.{DetectFace, FindSimilarFace}\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df: DataFrame = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg"\n ).toDF("url")\nval detector = (new DetectFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("detected_faces")\n .setReturnFaceId(true)\n .setReturnFaceLandmarks(false)\n .setReturnFaceAttributes(Seq()))\n\nval faceIdDF = (detector.transform(df)\n .select(col("detected_faces").getItem(0).getItem("faceId").alias("id"))\n .cache())\nval faceIds = faceIdDF.collect().map(row => row.getAs[String]("id"))\n\nval findSimilar = (new FindSimilarFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("similar")\n .setFaceIdCol("id")\n 
.setFaceIds(faceIds))\n\nfindSimilar.transform(faceIdDF).show()\n')))),(0,o.kt)(i.Z,{className:"FindSimilarFace",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.FindSimilarFace",scala:"com/microsoft/azure/synapse/ml/cognitive/FindSimilarFace.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1FindSimilarFace.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/Face.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"groupfaces"},"GroupFaces"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",),\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",),\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg",)\n], ["url"])\n\ndetector = (DetectFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("detected_faces")\n .setReturnFaceId(True)\n .setReturnFaceLandmarks(False)\n .setReturnFaceAttributes([]))\n\nfaceIdDF = detector.transform(df).select("detected_faces").select(col("detected_faces").getItem(0).getItem("faceId").alias("id"))\nfaceIds = [row.asDict()[\'id\'] for row in faceIdDF.collect()]\n\ngroup = (GroupFaces()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("grouping")\n .setFaceIds(faceIds))\n\ngroup.transform(faceIdDF).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.face.{DetectFace, GroupFaces}\nimport 
spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df: DataFrame = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg"\n ).toDF("url")\nval detector = (new DetectFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("detected_faces")\n .setReturnFaceId(true)\n .setReturnFaceLandmarks(false)\n .setReturnFaceAttributes(Seq()))\n\nval faceIdDF = (detector.transform(df)\n .select(col("detected_faces").getItem(0).getItem("faceId").alias("id"))\n .cache())\nval faceIds = faceIdDF.collect().map(row => row.getAs[String]("id"))\n\nval group = (new GroupFaces()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("grouping")\n .setFaceIds(faceIds))\n\ngroup.transform(faceIdDF).show()\n')))),(0,o.kt)(i.Z,{className:"GroupFaces",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.GroupFaces",scala:"com/microsoft/azure/synapse/ml/cognitive/GroupFaces.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1GroupFaces.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/Face.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"identifyfaces"},"IdentifyFaces"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\npgId = "PUT_YOUR_PERSON_GROUP_ID"\n\nidentifyFaces = (IdentifyFaces()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setFaceIdsCol("faces")\n .setPersonGroupId(pgId)\n 
.setOutputCol("identified_faces"))\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.face.IdentifyFaces\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval pgId = "PUT_YOUR_PERSON_GROUP_ID"\n\nval identifyFaces = (new IdentifyFaces()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setFaceIdsCol("faces")\n .setPersonGroupId(pgId)\n .setOutputCol("identified_faces"))\n')))),(0,o.kt)(i.Z,{className:"IdentifyFaces",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.IdentifyFaces",scala:"com/microsoft/azure/synapse/ml/cognitive/IdentifyFaces.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1IdentifyFaces.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/Face.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"verifyfaces"},"VerifyFaces"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",),\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",),\n ("https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg",)\n], ["url"])\n\ndetector = (DetectFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("detected_faces")\n .setReturnFaceId(True)\n .setReturnFaceLandmarks(False)\n .setReturnFaceAttributes([]))\n\nfaceIdDF = 
detector.transform(df).select("detected_faces").select(col("detected_faces").getItem(0).getItem("faceId").alias("faceId1"))\nfaceIdDF2 = faceIdDF.withColumn("faceId2", lit(faceIdDF.take(1)[0].asDict()[\'faceId1\']))\n\nverify = (VerifyFaces()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("same")\n .setFaceId1Col("faceId1")\n .setFaceId2Col("faceId2"))\n\nverify.transform(faceIdDF2).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.face.{DetectFace, VerifyFaces}\nimport spark.implicits._\n\nval cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df: DataFrame = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg",\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg",\n "https://mmlspark.blob.core.windows.net/datasets/DSIR/test3.jpg"\n ).toDF("url")\n\nval detector = (new DetectFace()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setImageUrlCol("url")\n .setOutputCol("detected_faces")\n .setReturnFaceId(true)\n .setReturnFaceLandmarks(false)\n .setReturnFaceAttributes(Seq()))\n\nval faceIdDF = (detector.transform(df)\n .select(col("detected_faces").getItem(0).getItem("faceId").alias("faceId1"))\n .cache())\nval faceIdDF2 = faceIdDF.withColumn("faceId2", lit(faceIdDF.take(1).head.getString(0)))\n\nval verify = (new VerifyFaces()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("same")\n .setFaceId1Col("faceId1")\n 
.setFaceId2Col("faceId2"))\n\nverify.transform(faceIdDF2).show()\n')))),(0,o.kt)(i.Z,{className:"VerifyFaces",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.VerifyFaces",scala:"com/microsoft/azure/synapse/ml/cognitive/VerifyFaces.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1VerifyFaces.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/Face.scala",mdxType:"DocTable"}))}z.isMDXComponent=!0;var Z=["components"],E=[{value:"Speech To Text",id:"speech-to-text",level:2},{value:"SpeechToText",id:"speechtotext",level:3},{value:"SpeechToTextSDK",id:"speechtotextsdk",level:3}],A={toc:E};function O(e){var t=e.components,a=(0,s.Z)(e,Z);return(0,o.kt)("wrapper",(0,n.Z)({},A,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"speech-to-text"},"Speech To Text"),(0,o.kt)("h3",{id:"speechtotext"},"SpeechToText"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\nimport requests\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\nlink = "https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav"\naudioBytes = requests.get(link).content\ndf = spark.createDataFrame([(audioBytes,)\n ], ["audio"])\n\nstt = (SpeechToText()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("text")\n .setAudioDataCol("audio")\n .setLanguage("en-US")\n .setFormat("simple"))\n\nstt.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.speech.SpeechToText\nimport org.apache.commons.compress.utils.IOUtils\nimport spark.implicits._\nimport java.net.URL\n\nval 
cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval audioBytes = IOUtils.toByteArray(new URL("https://mmlspark.blob.core.windows.net/datasets/Speech/test1.wav").openStream())\n\nval df: DataFrame = Seq(\n Tuple1(audioBytes)\n ).toDF("audio")\n\nval stt = (new SpeechToText()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("text")\n .setAudioDataCol("audio")\n .setLanguage("en-US")\n .setFormat("simple"))\n\nstt.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"SpeechToText",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.SpeechToText",scala:"com/microsoft/azure/synapse/ml/cognitive/SpeechToText.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1SpeechToText.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/SpeechToText.scala",mdxType:"DocTable"}),(0,o.kt)("h3",{id:"speechtotextsdk"},"SpeechToTextSDK"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\nimport requests\n\ncognitiveKey = os.environ.get("COGNITIVE_API_KEY", getSecret("cognitive-api-key"))\ndf = spark.createDataFrame([("https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav",)\n ], ["url"])\n\nspeech_to_text = (SpeechToTextSDK()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("text")\n .setAudioDataCol("url")\n .setLanguage("en-US")\n .setProfanity("Masked"))\n\nspeech_to_text.transform(df).show()\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.speech.SpeechToTextSDK\nimport spark.implicits._\nimport org.apache.commons.compress.utils.IOUtils\nimport java.net.URL\n\nval 
cognitiveKey = sys.env.getOrElse("COGNITIVE_API_KEY", None)\nval df: DataFrame = Seq(\n "https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav"\n ).toDF("url")\n\nval speech_to_text = (new SpeechToTextSDK()\n .setSubscriptionKey(cognitiveKey)\n .setLocation("eastus")\n .setOutputCol("text")\n .setAudioDataCol("url")\n .setLanguage("en-US")\n .setProfanity("Masked"))\n\nspeech_to_text.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"SpeechToTextSDK",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.SpeechToTextSDK",scala:"com/microsoft/azure/synapse/ml/cognitive/SpeechToTextSDK.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1SpeechToTextSDK.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/SpeechToTextSDK.scala",mdxType:"DocTable"}))}O.isMDXComponent=!0;var F=["components"],R=[{value:"Azure Search",id:"azure-search",level:2},{value:"AzureSearch",id:"azuresearch",level:3}],P={toc:R};function M(e){var t=e.components,a=(0,s.Z)(e,F);return(0,o.kt)("wrapper",(0,n.Z)({},P,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"azure-search"},"Azure Search"),(0,o.kt)("h3",{id:"azuresearch"},"AzureSearch"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\nazureSearchKey = os.environ.get("AZURE_SEARCH_KEY", getSecret("azure-search-key"))\ntestServiceName = "mmlspark-azure-search"\n\nindexName = "test-website"\n\ndef createSimpleIndexJson(indexName):\n json_str = """\n {\n "name": "%s",\n "fields": [\n {\n "name": "id",\n "type": "Edm.String",\n "key": true,\n "facetable": false\n },\n {\n "name": "fileName",\n "type": "Edm.String",\n "searchable": false,\n "sortable": false,\n "facetable": false\n },\n {\n "name": "text",\n "type": 
"Edm.String",\n "filterable": false,\n "sortable": false,\n "facetable": false\n }\n ]\n }\n """\n\n return json_str % indexName\n\ndf = (spark.createDataFrame([\n ("upload", "0", "file0", "text0"),\n ("upload", "1", "file1", "text1"),\n ("upload", "2", "file2", "text2"),\n ("upload", "3", "file3", "text3")\n], ["searchAction", "id", "fileName", "text"]))\n\nad = (AddDocuments()\n .setSubscriptionKey(azureSearchKey)\n .setServiceName(testServiceName)\n .setOutputCol("out")\n .setErrorCol("err")\n .setIndexName(indexName)\n .setActionCol("searchAction"))\n\nad.transform(df).show()\n\nAzureSearchWriter.writeToAzureSearch(df,\n subscriptionKey=azureSearchKey,\n actionCol="searchAction",\n serviceName=testServiceName,\n indexJson=createSimpleIndexJson(indexName))\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.search.{AddDocuments, AzureSearchWriter}\nimport spark.implicits._\n\nval azureSearchKey = sys.env.getOrElse("AZURE_SEARCH_KEY", None)\nval testServiceName = "mmlspark-azure-search"\n\nval indexName = "test-website"\n\ndef createSimpleIndexJson(indexName: String) = {\n s"""\n |{\n | "name": "$indexName",\n | "fields": [\n | {\n | "name": "id",\n | "type": "Edm.String",\n | "key": true,\n | "facetable": false\n | },\n | {\n | "name": "fileName",\n | "type": "Edm.String",\n | "searchable": false,\n | "sortable": false,\n | "facetable": false\n | },\n | {\n | "name": "text",\n | "type": "Edm.String",\n | "filterable": false,\n | "sortable": false,\n | "facetable": false\n | }\n | ]\n | }\n """.stripMargin\n}\n\nval df = ((0 until 4)\n .map(i => ("upload", s"$i", s"file$i", s"text$i"))\n .toDF("searchAction", "id", "fileName", "text"))\n\nval ad = (new AddDocuments()\n .setSubscriptionKey(azureSearchKey)\n .setServiceName(testServiceName)\n .setOutputCol("out")\n .setErrorCol("err")\n .setIndexName(indexName)\n 
.setActionCol("searchAction"))\n\nad.transform(df).show()\n\nAzureSearchWriter.write(df,\n Map("subscriptionKey" -> azureSearchKey,\n "actionCol" -> "searchAction",\n "serviceName" -> testServiceName,\n "indexJson" -> createSimpleIndexJson(indexName)))\n')))),(0,o.kt)(i.Z,{className:"AzureSearch",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.AzureSearch",scala:"com/microsoft/azure/synapse/ml/cognitive/AzureSearch.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1AddDocuments.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/AzureSearch.scala",mdxType:"DocTable"}))}M.isMDXComponent=!0;var V=["components"],G=[{value:"Bing Image Search",id:"bing-image-search",level:2},{value:"BingImageSearch",id:"bingimagesearch",level:3}],Y={toc:G};function q(e){var t=e.components,a=(0,s.Z)(e,V);return(0,o.kt)("wrapper",(0,n.Z)({},Y,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"bing-image-search"},"Bing Image Search"),(0,o.kt)("h3",{id:"bingimagesearch"},"BingImageSearch"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(r.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\nbingSearchKey = os.environ.get("BING_SEARCH_KEY", getSecret("bing-search-key"))\n\n# Number of images Bing will return per query\nimgsPerBatch = 10\n# A list of offsets, used to page into the search results\noffsets = [(i*imgsPerBatch,) for i in range(100)]\n# Since web content is our data, we create a dataframe with options on that data: offsets\nbingParameters = spark.createDataFrame(offsets, ["offset"])\n\n# Run the Bing Image Search service with our text query\nbingSearch = (BingImageSearch()\n .setSubscriptionKey(bingSearchKey)\n .setOffsetCol("offset")\n .setQuery("Martin Luther King Jr. 
quotes")\n .setCount(imgsPerBatch)\n .setOutputCol("images"))\n\n# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\ngetUrls = BingImageSearch.getUrlTransformer("images", "url")\n\n# This displays the full results returned\nbingSearch.transform(bingParameters).show()\n\n# Since we have two services, they are put into a pipeline\npipeline = PipelineModel(stages=[bingSearch, getUrls])\n\n# Show the results of your search: image URLs\npipeline.transform(bingParameters).show()\n\n'))),(0,o.kt)(r.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.bing.BingImageSearch\nimport spark.implicits._\n\nval bingSearchKey = sys.env.getOrElse("BING_SEARCH_KEY", None)\n\n// Number of images Bing will return per query\nval imgsPerBatch = 10\n// A list of offsets, used to page into the search results\nval offsets = (0 until 100).map(i => i*imgsPerBatch)\n// Since web content is our data, we create a dataframe with options on that data: offsets\nval bingParameters = Seq(offsets).toDF("offset")\n\n// Run the Bing Image Search service with our text query\nval bingSearch = (new BingImageSearch()\n .setSubscriptionKey(bingSearchKey)\n .setOffsetCol("offset")\n .setQuery("Martin Luther King Jr. 
quotes")\n .setCount(imgsPerBatch)\n .setOutputCol("images"))\n\n// Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\nval getUrls = BingImageSearch.getUrlTransformer("images", "url")\n\n// This displays the full results returned\nbingSearch.transform(bingParameters).show()\n\n// Show the results of your search: image URLs\ngetUrls.transform(bingSearch.transform(bingParameters)).show()\n')))),(0,o.kt)(i.Z,{className:"BingImageSearch",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.BingImageSearch",scala:"com/microsoft/azure/synapse/ml/cognitive/BingImageSearch.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1BingImageSearch.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/BingImageSearch.scala",mdxType:"DocTable"}))}q.isMDXComponent=!0;var B=["components"],j={title:"Transformers - Cognitive",sidebar_label:"Cognitive",hide_title:!0},U=void 0,H={unversionedId:"Quick Examples/transformers/transformers_cognitive",id:"version-0.11.3/Quick Examples/transformers/transformers_cognitive",title:"Transformers - Cognitive",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/transformers/transformers_cognitive.md",sourceDirName:"Quick Examples/transformers",slug:"/Quick Examples/transformers/transformers_cognitive",permalink:"/SynapseML/docs/0.11.3/Quick Examples/transformers/transformers_cognitive",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Transformers - Cognitive",sidebar_label:"Cognitive",hide_title:!0}},X={},J=[].concat(m,g,h,I,x,K,E,R,G),W={toc:J};function Q(e){var 
t=e.components,a=(0,s.Z)(e,B);return(0,o.kt)("wrapper",(0,n.Z)({},W,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)(u,{mdxType:"TextAnalytics"}),(0,o.kt)(v,{mdxType:"Translator"}),(0,o.kt)(T,{mdxType:"ComputerVision"}),(0,o.kt)(_,{mdxType:"FormRecognizer"}),(0,o.kt)(w,{mdxType:"AnomalyDetection"}),(0,o.kt)(z,{mdxType:"Face"}),(0,o.kt)(O,{mdxType:"SpeechToText"}),(0,o.kt)(M,{mdxType:"AzureSearch"}),(0,o.kt)(q,{mdxType:"BingImageSearch"}))}Q.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/6ff7775f.255e5c21.js b/assets/js/0daa5b3f.838eb04c.js similarity index 97% rename from assets/js/6ff7775f.255e5c21.js rename to assets/js/0daa5b3f.838eb04c.js index 0a761b3861..ca0f0dbbd1 100644 --- a/assets/js/6ff7775f.255e5c21.js +++ b/assets/js/0daa5b3f.838eb04c.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[9461],{3905:function(e,t,n){n.d(t,{Zo:function(){return m},kt:function(){return c}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function l(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},m=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var 
n=e.components,a=e.mdxType,o=e.originalType,s=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),d=p(n),c=a,f=d["".concat(s,".").concat(c)]||d[c]||u[c]||o;return n?r.createElement(f,l(l({ref:t},m),{},{components:n})):r.createElement(f,l({ref:t},m))}));function c(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,l=new Array(o);l[0]=d;var i={};for(var s in t)hasOwnProperty.call(t,s)&&(i[s]=t[s]);i.originalType=e,i.mdxType="string"==typeof e?e:a,l[1]=i;for(var p=2;p bytes:\n from onnxmltools.convert import convert_lightgbm\n from onnxconverter_common.data_types import FloatTensorType\n\n initial_types = [("input", FloatTensorType([-1, input_size]))]\n onnx_model = convert_lightgbm(\n lgbm_model, initial_types=initial_types, target_opset=9\n )\n return onnx_model.SerializeToString()\n\n\nbooster_model_str = model.getLightGBMBooster().modelStr().get()\nbooster = lgb.Booster(model_str=booster_model_str)\nmodel_payload_ml = convertModel(booster, len(feature_cols))\n')),(0,o.kt)("p",null,"After conversion, load the ONNX payload into an ",(0,o.kt)("inlineCode",{parentName:"p"},"ONNXModel")," and inspect the model inputs and outputs:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.onnx import ONNXModel\n\nonnx_ml = ONNXModel().setModelPayload(model_payload_ml)\n\nprint("Model inputs:" + str(onnx_ml.getModelInputs()))\nprint("Model outputs:" + str(onnx_ml.getModelOutputs()))\n')),(0,o.kt)("p",null,"Map the model input to the input dataframe's column name (FeedDict), and map the output dataframe's column names to the model outputs (FetchDict)."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'onnx_ml = (\n onnx_ml.setDeviceType("CPU")\n .setFeedDict({"input": "features"})\n .setFetchDict({"probability": "probabilities", "prediction": "label"})\n .setMiniBatchSize(5000)\n)\n')),(0,o.kt)("h2",{id:"use-the-model-for-inference"},"Use the model 
for inference"),(0,o.kt)("p",null,"To perform inference with the model, the following code creates testing data and transforms the data through the ONNX model."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.feature import VectorAssembler\nimport pandas as pd\nimport numpy as np\n\nn = 1000 * 1000\nm = 95\ntest = np.random.rand(n, m)\ntestPdf = pd.DataFrame(test)\ncols = list(map(str, testPdf.columns))\ntestDf = spark.createDataFrame(testPdf)\ntestDf = testDf.union(testDf).repartition(200)\ntestDf = (\n VectorAssembler()\n .setInputCols(cols)\n .setOutputCol("features")\n .transform(testDf)\n .drop(*cols)\n .cache()\n)\n\ndisplay(onnx_ml.transform(testDf))\n')),(0,o.kt)("p",null,"The output should look similar to the following table, though the values and number of rows may differ:"),(0,o.kt)("table",null,(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:null},"Index"),(0,o.kt)("th",{parentName:"tr",align:null},"Features"),(0,o.kt)("th",{parentName:"tr",align:null},"Prediction"),(0,o.kt)("th",{parentName:"tr",align:null},"Probability"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"1"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"type":1,"values":[0.105...')),(0,o.kt)("td",{parentName:"tr",align:null},"0"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"0":0.835...'))),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"2"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"type":1,"values":[0.814...')),(0,o.kt)("td",{parentName:"tr",align:null},"0"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"0":0.658...'))))))}c.isMDXComponent=!0}}]); \ No newline at end of file +"use 
strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[5024],{3905:function(e,t,n){n.d(t,{Zo:function(){return m},kt:function(){return c}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function l(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},m=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,o=e.originalType,s=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),d=p(n),c=a,f=d["".concat(s,".").concat(c)]||d[c]||u[c]||o;return n?r.createElement(f,l(l({ref:t},m),{},{components:n})):r.createElement(f,l({ref:t},m))}));function c(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,l=new Array(o);l[0]=d;var i={};for(var s in t)hasOwnProperty.call(t,s)&&(i[s]=t[s]);i.originalType=e,i.mdxType="string"==typeof e?e:a,l[1]=i;for(var p=2;p bytes:\n from onnxmltools.convert import convert_lightgbm\n from onnxconverter_common.data_types import FloatTensorType\n\n initial_types = [("input", FloatTensorType([-1, input_size]))]\n onnx_model = convert_lightgbm(\n lgbm_model, initial_types=initial_types, target_opset=9\n )\n return onnx_model.SerializeToString()\n\n\nbooster_model_str = model.getLightGBMBooster().modelStr().get()\nbooster = 
lgb.Booster(model_str=booster_model_str)\nmodel_payload_ml = convertModel(booster, len(feature_cols))\n')),(0,o.kt)("p",null,"After conversion, load the ONNX payload into an ",(0,o.kt)("inlineCode",{parentName:"p"},"ONNXModel")," and inspect the model inputs and outputs:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.onnx import ONNXModel\n\nonnx_ml = ONNXModel().setModelPayload(model_payload_ml)\n\nprint("Model inputs:" + str(onnx_ml.getModelInputs()))\nprint("Model outputs:" + str(onnx_ml.getModelOutputs()))\n')),(0,o.kt)("p",null,"Map the model input to the input dataframe's column name (FeedDict), and map the output dataframe's column names to the model outputs (FetchDict)."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'onnx_ml = (\n onnx_ml.setDeviceType("CPU")\n .setFeedDict({"input": "features"})\n .setFetchDict({"probability": "probabilities", "prediction": "label"})\n .setMiniBatchSize(5000)\n)\n')),(0,o.kt)("h2",{id:"use-the-model-for-inference"},"Use the model for inference"),(0,o.kt)("p",null,"To perform inference with the model, the following code creates testing data and transforms the data through the ONNX model."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.feature import VectorAssembler\nimport pandas as pd\nimport numpy as np\n\nn = 1000 * 1000\nm = 95\ntest = np.random.rand(n, m)\ntestPdf = pd.DataFrame(test)\ncols = list(map(str, testPdf.columns))\ntestDf = spark.createDataFrame(testPdf)\ntestDf = testDf.union(testDf).repartition(200)\ntestDf = (\n VectorAssembler()\n .setInputCols(cols)\n .setOutputCol("features")\n .transform(testDf)\n .drop(*cols)\n .cache()\n)\n\ndisplay(onnx_ml.transform(testDf))\n')),(0,o.kt)("p",null,"The output should look similar to the following table, though the values and number of rows may 
differ:"),(0,o.kt)("table",null,(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:null},"Index"),(0,o.kt)("th",{parentName:"tr",align:null},"Features"),(0,o.kt)("th",{parentName:"tr",align:null},"Prediction"),(0,o.kt)("th",{parentName:"tr",align:null},"Probability"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"1"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"type":1,"values":[0.105...')),(0,o.kt)("td",{parentName:"tr",align:null},"0"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"0":0.835...'))),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"2"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"type":1,"values":[0.814...')),(0,o.kt)("td",{parentName:"tr",align:null},"0"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"0":0.658...'))))))}c.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/c4d09a44.ce4796a6.js b/assets/js/0e0ee9b2.fb4b07d7.js similarity index 95% rename from assets/js/c4d09a44.ce4796a6.js rename to assets/js/0e0ee9b2.fb4b07d7.js index c17ff1473e..b81f54d2d5 100644 --- a/assets/js/c4d09a44.ce4796a6.js +++ b/assets/js/0e0ee9b2.fb4b07d7.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2406],{3905:function(e,t,n){n.d(t,{Zo:function(){return u},kt:function(){return m}});var r=n(7294);function i(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function a(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var 
t=1;t=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var l=r.createContext({}),p=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},u=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,i=e.mdxType,a=e.originalType,l=e.parentName,u=s(e,["components","mdxType","originalType","parentName"]),d=p(n),m=i,f=d["".concat(l,".").concat(m)]||d[m]||c[m]||a;return n?r.createElement(f,o(o({ref:t},u),{},{components:n})):r.createElement(f,o({ref:t},u))}));function m(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var a=n.length,o=new Array(a);o[0]=d;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:i,o[1]=s;for(var p=2;p=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var l=r.createContext({}),p=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},u=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,i=e.mdxType,a=e.originalType,l=e.parentName,u=s(e,["components","mdxType","originalType","parentName"]),d=p(n),m=i,f=d["".concat(l,".").concat(m)]||d[m]||c[m]||a;return n?r.createElement(f,o(o({ref:t},u),{},{components:n})):r.createElement(f,o({ref:t},u))}));function m(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var a=n.length,o=new Array(a);o[0]=d;var s={};for(var l in 
t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:i,o[1]=s;for(var p=2;p=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var o=r.createContext({}),u=function(e){var t=r.useContext(o),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},m=function(e){var t=u(e.components);return r.createElement(o.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},p=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,o=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),p=u(n),d=a,h=p["".concat(o,".").concat(d)]||p[d]||c[d]||i;return n?r.createElement(h,l(l({ref:t},m),{},{components:n})):r.createElement(h,l({ref:t},m))}));function d(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,l=new Array(i);l[0]=p;var s={};for(var o in t)hasOwnProperty.call(t,o)&&(s[o]=t[o]);s.originalType=e,s.mdxType="string"==typeof e?e:a,l[1]=s;for(var u=2;u=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var o=r.createContext({}),u=function(e){var t=r.useContext(o),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},m=function(e){var t=u(e.components);return r.createElement(o.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},p=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,o=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),p=u(n),d=a,h=p["".concat(o,".").concat(d)]||p[d]||c[d]||i;return n?r.createElement(h,l(l({ref:t},m),{},{components:n})):r.createElement(h,l({ref:t},m))}));function d(e,t){var 
n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,l=new Array(i);l[0]=p;var s={};for(var o in t)hasOwnProperty.call(t,o)&&(s[o]=t[o]);s.originalType=e,s.mdxType="string"==typeof e?e:a,l[1]=s;for(var u=2;u=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=a.createContext({}),p=function(e){var t=a.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},c=function(e){var t=p(e.components);return a.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,s=e.parentName,c=o(e,["components","mdxType","originalType","parentName"]),d=p(n),f=r,m=d["".concat(s,".").concat(f)]||d[f]||u[f]||i;return n?a.createElement(m,l(l({ref:t},c),{},{components:n})):a.createElement(m,l({ref:t},c))}));function f(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,l=new Array(i);l[0]=d;var o={};for(var s in t)hasOwnProperty.call(t,s)&&(o[s]=t[s]);o.originalType=e,o.mdxType="string"==typeof e?e:r,l[1]=o;for(var p=2;p=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var p=n.createContext({}),s=function(e){var t=n.useContext(p),a=t;return e&&(a="function"==typeof e?e(t):i(i({},t),e)),a},m=function(e){var t=s(e.components);return n.createElement(p.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,o=e.originalType,p=e.parentName,m=l(e,["components","mdxType","originalType","parentName"]),u=s(a),c=r,N=u["".concat(p,".").concat(c)]||u[c]||d[c]||o;return 
a?n.createElement(N,i(i({ref:t},m),{},{components:a})):n.createElement(N,i({ref:t},m))}));function c(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=a.length,i=new Array(o);i[0]=u;var l={};for(var p in t)hasOwnProperty.call(t,p)&&(l[p]=t[p]);l.originalType=e,l.mdxType="string"==typeof e?e:r,i[1]=l;for(var s=2;sModel Slicing',id:"model-slicing",level:2},{value:"Example",id:"example",level:2}],u={toc:d};function c(e){var t=e.components,a=(0,r.Z)(e,i);return(0,o.kt)("wrapper",(0,n.Z)({},u,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"onnx-model-inferencing-on-spark"},"ONNX model inferencing on Spark"),(0,o.kt)("h2",{id:"onnx"},"ONNX"),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://onnx.ai/"},"ONNX")," is an open format to represent both deep learning and traditional machine learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools and choose the combination that is best for them."),(0,o.kt)("p",null,"SynapseML now includes a Spark transformer to bring a trained ONNX model to Apache Spark, so you can run inference on your data with Spark's large-scale data processing power."),(0,o.kt)("h2",{id:"onnxhub"},"ONNXHub"),(0,o.kt)("p",null,'Although you can use your own local model, many popular existing models are provided through the ONNXHub. You can use\na model\'s ONNXHub name (for example "MNIST") and download the bytes of the model, and some metadata about the model. 
You can also list\navailable models, optionally filtering by name or tags.'),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},' // List models\n val hub = new ONNXHub()\n val models = hub.listModels(model = Some("mnist"), tags = Some(Seq("vision")))\n\n // Retrieve and transform with a model\n val info = hub.getModelInfo("resnet50")\n val bytes = hub.load(name)\n val model = new ONNXModel()\n .setModelPayload(bytes)\n .setFeedDict(Map("data" -> "features"))\n .setFetchDict(Map("rawPrediction" -> "resnetv24_dense0_fwd"))\n .setSoftMaxDict(Map("rawPrediction" -> "probability"))\n .setArgMaxDict(Map("rawPrediction" -> "prediction"))\n .setMiniBatchSize(1)\n\n val (probability, _) = model.transform({YOUR_DATAFRAME})\n .select("probability", "prediction")\n .as[(Vector, Double)]\n .head\n')),(0,o.kt)("h2",{id:"usage"},"Usage"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Create a ",(0,o.kt)("inlineCode",{parentName:"p"},"com.microsoft.azure.synapse.ml.onnx.ONNXModel")," object and use ",(0,o.kt)("inlineCode",{parentName:"p"},"setModelLocation")," or ",(0,o.kt)("inlineCode",{parentName:"p"},"setModelPayload")," to load the ONNX model."),(0,o.kt)("p",{parentName:"li"},"For example:"),(0,o.kt)("pre",{parentName:"li"},(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'val onnx = new ONNXModel().setModelLocation("/path/to/model.onnx")\n')),(0,o.kt)("p",{parentName:"li"},"Optionally, create the model from the ONNXHub."),(0,o.kt)("pre",{parentName:"li"},(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'val onnx = new ONNXModel().setModelPayload(hub.load("MNIST"))\n'))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Use ONNX visualization tool (for example, ",(0,o.kt)("a",{parentName:"p",href:"https://netron.app/"},"Netron"),") to inspect the ONNX model's input and output 
nodes."),(0,o.kt)("p",{parentName:"li"},(0,o.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/ONNXModelInputsOutputs.png",alt:"Screenshot that illustrates an ONNX model's input and output nodes"}))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Set the parameters properly to the ",(0,o.kt)("inlineCode",{parentName:"p"},"ONNXModel")," object."),(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("inlineCode",{parentName:"p"},"com.microsoft.azure.synapse.ml.onnx.ONNXModel")," class provides a set of parameters to control the behavior of the inference."),(0,o.kt)("table",{parentName:"li"},(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:"left"},"Parameter"),(0,o.kt)("th",{parentName:"tr",align:"left"},"Description"),(0,o.kt)("th",{parentName:"tr",align:"left"},"Default Value"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"feedDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Map the ONNX model's expected input node names to the input DataFrame's column names. Make sure the input DataFrame's column schema matches with the corresponding input's shape of the ONNX model. For example, an image classification model may have an input node of shape ",(0,o.kt)("inlineCode",{parentName:"td"},"[1, 3, 224, 224]")," with type Float. It's assumed that the first dimension (1) is the batch size. Then the input DataFrame's corresponding column's type should be ",(0,o.kt)("inlineCode",{parentName:"td"},"ArrayType(ArrayType(ArrayType(FloatType)))"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"fetchDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Map the output DataFrame's column names to the ONNX model's output node names. 
NOTE: If you put outputs that are intermediate in the model, transform will automatically slice at those outputs. See the section on ",(0,o.kt)("a",{parentName:"td",href:"#slicing"},"Slicing"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"miniBatcher"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify the MiniBatcher to use."),(0,o.kt)("td",{parentName:"tr",align:"left"},(0,o.kt)("inlineCode",{parentName:"td"},"FixedMiniBatchTransformer")," with batch size 10")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"softMaxDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"A map between output DataFrame columns, where the value column will be computed from taking the softmax of the key column. If the 'rawPrediction' column contains logits outputs, then one can set softMaxDict to ",(0,o.kt)("inlineCode",{parentName:"td"},'Map("rawPrediction" -> "probability")')," to obtain the probability outputs."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"argMaxDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"A map between output DataFrame columns, where the value column will be computed from taking the argmax of the key column. This parameter can be used to convert probability or logits output to the predicted label."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"deviceType"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify a device type the model inference runs on. Supported types are: CPU or CUDA. 
If not specified, auto detection will be used."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"optimizationLevel"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify the ",(0,o.kt)("a",{parentName:"td",href:"https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html#graph-optimization-levels"},"optimization level")," for the ONNX graph optimizations. Supported values are: ",(0,o.kt)("inlineCode",{parentName:"td"},"NO_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"BASIC_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"EXTENDED_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"ALL_OPT"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},(0,o.kt)("inlineCode",{parentName:"td"},"ALL_OPT")))))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Call ",(0,o.kt)("inlineCode",{parentName:"p"},"transform")," method to run inference on the input DataFrame."))),(0,o.kt)("h2",{id:"model-slicing"},(0,o.kt)("a",{name:"slicing"}),"Model Slicing"),(0,o.kt)("p",null,"By default, an ONNX model is treated as a black box with inputs and outputs.\nIf you want to use intermediate nodes of a model, you can slice the model at particular nodes. Slicing will create a new model,\nkeeping only parts of the model that are needed for those nodes. This new model's outputs will be the outputs from\nthe intermediate nodes. You can save the sliced model and use it to transform just like any other ONNXModel."),(0,o.kt)("p",null,"This slicing feature is used implicitly by the ImageFeaturizer, which uses ONNX models. 
The OnnxHub manifest entry for each model\nincludes which intermediate node outputs should be used for featurization, so the ImageFeaturizer will automatically slice at the correct nodes."),(0,o.kt)("p",null,"The below example shows how to perform the slicing manually with a direct ONNXModel."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},' // create a df: Dataframe with image data\n val hub = new ONNXHub()\n val info = hub.getModelInfo("resnet50")\n val bytes = hub.load(name)\n val intermediateOutputName = "resnetv24_pool1_fwd"\n val slicedModel = new ONNXModel()\n .setModelPayload(bytes)\n .setFeedDict(Map("data" -> "features"))\n .setFetchDict(Map("rawFeatures" -> intermediateOutputName)) // automatic slicing based on fetch dictionary\n // -- or --\n // .sliceAtOutput(intermediateOutputName) // manual slicing\n\n val slicedModelDf = slicedModel.transform(df)\n')),(0,o.kt)("h2",{id:"example"},"Example"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"../../Responsible%20AI/Image%20Explainers"},"Image Explainers")),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"../Quickstart%20-%20ONNX%20Model%20Inference"},"Quickstart - ONNX Model Inference"))))}c.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/0fa36e32.f356a13c.js b/assets/js/0fa36e32.f356a13c.js new file mode 100644 index 0000000000..b83b41dae5 --- /dev/null +++ b/assets/js/0fa36e32.f356a13c.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6555],{3905:function(t,e,r){r.d(e,{Zo:function(){return p},kt:function(){return f}});var n=r(7294);function a(t,e,r){return e in t?Object.defineProperty(t,e,{value:r,enumerable:!0,configurable:!0,writable:!0}):t[e]=r,t}function i(t,e){var r=Object.keys(t);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(t);e&&(n=n.filter((function(e){return 
Object.getOwnPropertyDescriptor(t,e).enumerable}))),r.push.apply(r,n)}return r}function o(t){for(var e=1;e=0||(a[r]=t[r]);return a}(t,e);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(t);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(t,r)&&(a[r]=t[r])}return a}var l=n.createContext({}),c=function(t){var e=n.useContext(l),r=e;return t&&(r="function"==typeof t?t(e):o(o({},e),t)),r},p=function(t){var e=c(t.components);return n.createElement(l.Provider,{value:e},t.children)},u={inlineCode:"code",wrapper:function(t){var e=t.children;return n.createElement(n.Fragment,{},e)}},d=n.forwardRef((function(t,e){var r=t.components,a=t.mdxType,i=t.originalType,l=t.parentName,p=s(t,["components","mdxType","originalType","parentName"]),d=c(r),f=a,m=d["".concat(l,".").concat(f)]||d[f]||u[f]||i;return r?n.createElement(m,o(o({ref:e},p),{},{components:r})):n.createElement(m,o({ref:e},p))}));function f(t,e){var r=arguments,a=e&&e.mdxType;if("string"==typeof t||a){var i=r.length,o=new Array(i);o[0]=d;var s={};for(var l in e)hasOwnProperty.call(e,l)&&(s[l]=e[l]);s.originalType=t,s.mdxType="string"==typeof t?t:a,o[1]=s;for(var c=2;c=0||(n[a]=e[a]);return n}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(n[a]=e[a])}return n}var l=r.createContext({}),p=function(e){var t=r.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):s(s({},t),e)),a},u=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var a=e.components,n=e.mdxType,i=e.originalType,l=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),m=p(a),d=n,g=m["".concat(l,".").concat(d)]||m[d]||c[d]||i;return a?r.createElement(g,s(s({ref:t},u),{},{components:a})):r.createElement(g,s({ref:t},u))}));function d(e,t){var 
a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var i=a.length,s=new Array(i);s[0]=m;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:n,s[1]=o;for(var p=2;p=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var n=e.components,i=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),m=p(n),d=i,f=m["".concat(s,".").concat(d)]||m[d]||u[d]||a;return n?r.createElement(f,o(o({ref:t},c),{},{components:n})):r.createElement(f,o({ref:t},c))}));function d(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var a=n.length,o=new Array(a);o[0]=m;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:i,o[1]=l;for(var p=2;p 3).cast(LongType()))\n .select("label", "text")\n .cache()\n)\n\ndisplay(data)\n')),(0,a.kt)("p",null,"We train a text classification model, and randomly sample 10 rows to explain."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'train, test = data.randomSplit([0.60, 0.40])\n\npipeline = Pipeline(\n stages=[\n TextFeaturizer(\n inputCol="text",\n outputCol="features",\n useStopWordsRemover=True,\n useIDF=True,\n minDocFreq=20,\n numFeatures=1 << 16,\n ),\n LogisticRegression(maxIter=100, regParam=0.005, labelCol="label", featuresCol="features"),\n ]\n)\n\nmodel = pipeline.fit(train)\n\nprediction = model.transform(test)\n\nexplain_instances = 
prediction.orderBy(rand()).limit(10)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'def plotConfusionMatrix(df, label, prediction, classLabels):\n from synapse.ml.plot import confusionMatrix\n import matplotlib.pyplot as plt\n\n fig = plt.figure(figsize=(4.5, 4.5))\n confusionMatrix(df, label, prediction, classLabels)\n if running_on_synapse():\n plt.show()\n else:\n display(fig)\n\n\nplotConfusionMatrix(model.transform(test), "label", "prediction", [0, 1])\n')),(0,a.kt)("p",null,"First we use the LIME text explainer to explain the model's predicted probability for a given observation."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'lime = TextLIME(\n model=model,\n outputCol="weights",\n inputCol="text",\n targetCol="probability",\n targetClasses=[1],\n tokensCol="tokens",\n samplingFraction=0.7,\n numSamples=2000,\n)\n\nlime_results = (\n lime.transform(explain_instances)\n .select("tokens", "weights", "r2", "probability", "text")\n .withColumn("probability", vec_access("probability", lit(1)))\n .withColumn("weights", vec2array(col("weights").getItem(0)))\n .withColumn("r2", vec_access("r2", lit(0)))\n .withColumn("tokens_weights", arrays_zip("tokens", "weights"))\n)\n\ndisplay(lime_results.select("probability", "r2", "tokens_weights", "text").orderBy(col("probability").desc()))\n')),(0,a.kt)("p",null,"Then we use the Kernel SHAP text explainer to explain the model's predicted probability for a given observation."),(0,a.kt)("blockquote",null,(0,a.kt)("p",{parentName:"blockquote"},"Notice that we drop the base value from the SHAP output before displaying the SHAP values. 
The base value is the model output for an empty string.")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'shap = TextSHAP(\n model=model,\n outputCol="shaps",\n inputCol="text",\n targetCol="probability",\n targetClasses=[1],\n tokensCol="tokens",\n numSamples=5000,\n)\n\nshap_results = (\n shap.transform(explain_instances)\n .select("tokens", "shaps", "r2", "probability", "text")\n .withColumn("probability", vec_access("probability", lit(1)))\n .withColumn("shaps", vec2array(col("shaps").getItem(0)))\n .withColumn("shaps", slice(col("shaps"), lit(2), size(col("shaps"))))\n .withColumn("r2", vec_access("r2", lit(0)))\n .withColumn("tokens_shaps", arrays_zip("tokens", "shaps"))\n)\n\ndisplay(shap_results.select("probability", "r2", "tokens_shaps", "text").orderBy(col("probability").desc()))\n')))}d.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[8933],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var r=n(7294);function i(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function a(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var 
n=e.components,i=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),m=p(n),d=i,f=m["".concat(s,".").concat(d)]||m[d]||u[d]||a;return n?r.createElement(f,o(o({ref:t},c),{},{components:n})):r.createElement(f,o({ref:t},c))}));function d(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var a=n.length,o=new Array(a);o[0]=m;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:i,o[1]=l;for(var p=2;p 3).cast(LongType()))\n .select("label", "text")\n .cache()\n)\n\ndisplay(data)\n')),(0,a.kt)("p",null,"We train a text classification model, and randomly sample 10 rows to explain."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'train, test = data.randomSplit([0.60, 0.40])\n\npipeline = Pipeline(\n stages=[\n TextFeaturizer(\n inputCol="text",\n outputCol="features",\n useStopWordsRemover=True,\n useIDF=True,\n minDocFreq=20,\n numFeatures=1 << 16,\n ),\n LogisticRegression(maxIter=100, regParam=0.005, labelCol="label", featuresCol="features"),\n ]\n)\n\nmodel = pipeline.fit(train)\n\nprediction = model.transform(test)\n\nexplain_instances = prediction.orderBy(rand()).limit(10)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'def plotConfusionMatrix(df, label, prediction, classLabels):\n from synapse.ml.plot import confusionMatrix\n import matplotlib.pyplot as plt\n\n fig = plt.figure(figsize=(4.5, 4.5))\n confusionMatrix(df, label, prediction, classLabels)\n if running_on_synapse():\n plt.show()\n else:\n display(fig)\n\n\nplotConfusionMatrix(model.transform(test), "label", "prediction", [0, 1])\n')),(0,a.kt)("p",null,"First we use the LIME text explainer to explain the model's predicted probability for a given observation."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'lime = TextLIME(\n model=model,\n outputCol="weights",\n inputCol="text",\n targetCol="probability",\n targetClasses=[1],\n tokensCol="tokens",\n 
samplingFraction=0.7,\n numSamples=2000,\n)\n\nlime_results = (\n lime.transform(explain_instances)\n .select("tokens", "weights", "r2", "probability", "text")\n .withColumn("probability", vec_access("probability", lit(1)))\n .withColumn("weights", vec2array(col("weights").getItem(0)))\n .withColumn("r2", vec_access("r2", lit(0)))\n .withColumn("tokens_weights", arrays_zip("tokens", "weights"))\n)\n\ndisplay(lime_results.select("probability", "r2", "tokens_weights", "text").orderBy(col("probability").desc()))\n')),(0,a.kt)("p",null,"Then we use the Kernel SHAP text explainer to explain the model's predicted probability for a given observation."),(0,a.kt)("blockquote",null,(0,a.kt)("p",{parentName:"blockquote"},"Notice that we drop the base value from the SHAP output before displaying the SHAP values. The base value is the model output for an empty string.")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'shap = TextSHAP(\n model=model,\n outputCol="shaps",\n inputCol="text",\n targetCol="probability",\n targetClasses=[1],\n tokensCol="tokens",\n numSamples=5000,\n)\n\nshap_results = (\n shap.transform(explain_instances)\n .select("tokens", "shaps", "r2", "probability", "text")\n .withColumn("probability", vec_access("probability", lit(1)))\n .withColumn("shaps", vec2array(col("shaps").getItem(0)))\n .withColumn("shaps", slice(col("shaps"), lit(2), size(col("shaps"))))\n .withColumn("r2", vec_access("r2", lit(0)))\n .withColumn("tokens_shaps", arrays_zip("tokens", "shaps"))\n)\n\ndisplay(shap_results.select("probability", "r2", "tokens_shaps", "text").orderBy(col("probability").desc()))\n')))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/12d85ab5.3ede2f06.js b/assets/js/12d85ab5.3ede2f06.js new file mode 100644 index 0000000000..1c1f4e6ed7 --- /dev/null +++ b/assets/js/12d85ab5.3ede2f06.js @@ -0,0 +1 @@ +"use 
strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2412],{3905:function(e,t,a){a.d(t,{Zo:function(){return m},kt:function(){return h}});var n=a(7294);function s(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function o(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function r(e){for(var t=1;t=0||(s[a]=e[a]);return s}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(s[a]=e[a])}return s}var l=n.createContext({}),c=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):r(r({},t),e)),a},m=function(e){var t=c(e.components);return n.createElement(l.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var a=e.components,s=e.mdxType,o=e.originalType,l=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),u=c(a),h=s,d=u["".concat(l,".").concat(h)]||u[h]||p[h]||o;return a?n.createElement(d,r(r({ref:t},m),{},{components:a})):n.createElement(d,r({ref:t},m))}));function h(e,t){var a=arguments,s=t&&t.mdxType;if("string"==typeof e||s){var o=a.length,r=new Array(o);r[0]=u;var i={};for(var l in t)hasOwnProperty.call(t,l)&&(i[l]=t[l]);i.originalType=e,i.mdxType="string"==typeof e?e:s,r[1]=i;for(var c=2;c=0||(s[a]=e[a]);return s}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(s[a]=e[a])}return s}var l=n.createContext({}),c=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):r(r({},t),e)),a},m=function(e){var t=c(e.components);return 
n.createElement(l.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var a=e.components,s=e.mdxType,o=e.originalType,l=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),u=c(a),h=s,d=u["".concat(l,".").concat(h)]||u[h]||p[h]||o;return a?n.createElement(d,r(r({ref:t},m),{},{components:a})):n.createElement(d,r({ref:t},m))}));function h(e,t){var a=arguments,s=t&&t.mdxType;if("string"==typeof e||s){var o=a.length,r=new Array(o);r[0]=u;var i={};for(var l in t)hasOwnProperty.call(t,l)&&(i[l]=t[l]);i.originalType=e,i.mdxType="string"==typeof e?e:s,r[1]=i;for(var c=2;c=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var l=n.createContext({}),p=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},m=function(e){var t=p(e.components);return n.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},c=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),c=p(a),d=r,b=c["".concat(l,".").concat(d)]||c[d]||u[d]||i;return a?n.createElement(b,o(o({ref:t},m),{},{components:a})):n.createElement(b,o({ref:t},m))}));function d(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=a.length,o=new Array(i);o[0]=c;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:r,o[1]=s;for(var p=2;p=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var p=a.createContext({}),c=function(e){var 
t=a.useContext(p),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},l=function(e){var t=c(e.components);return a.createElement(p.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,o=e.mdxType,r=e.originalType,p=e.parentName,l=s(e,["components","mdxType","originalType","parentName"]),d=c(n),m=o,h=d["".concat(p,".").concat(m)]||d[m]||u[m]||r;return n?a.createElement(h,i(i({ref:t},l),{},{components:n})):a.createElement(h,i({ref:t},l))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var r=n.length,i=new Array(r);i[0]=d;var s={};for(var p in t)hasOwnProperty.call(t,p)&&(s[p]=t[p]);s.originalType=e,s.mdxType="string"==typeof e?e:o,i[1]=s;for(var c=2;c=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var p=a.createContext({}),c=function(e){var t=a.useContext(p),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},l=function(e){var t=c(e.components);return a.createElement(p.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,o=e.mdxType,r=e.originalType,p=e.parentName,l=s(e,["components","mdxType","originalType","parentName"]),d=c(n),m=o,h=d["".concat(p,".").concat(m)]||d[m]||u[m]||r;return n?a.createElement(h,i(i({ref:t},l),{},{components:n})):a.createElement(h,i({ref:t},l))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var r=n.length,i=new Array(r);i[0]=d;var s={};for(var p in t)hasOwnProperty.call(t,p)&&(s[p]=t[p]);s.originalType=e,s.mdxType="string"==typeof e?e:o,i[1]=s;for(var c=2;c=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var 
o=Object.getOwnPropertySymbols(e);for(l=0;l=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=l.createContext({}),c=function(e){var t=l.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):a(a({},t),e)),n},m=function(e){var t=c(e.components);return l.createElement(s.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return l.createElement(l.Fragment,{},t)}},u=l.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,s=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),u=c(n),d=r,f=u["".concat(s,".").concat(d)]||u[d]||p[d]||o;return n?l.createElement(f,a(a({ref:t},m),{},{components:n})):l.createElement(f,a({ref:t},m))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,a=new Array(o);a[0]=u;var i={};for(var s in t)hasOwnProperty.call(t,s)&&(i[s]=t[s]);i.originalType=e,i.mdxType="string"==typeof e?e:r,a[1]=i;for(var c=2;c ",(0,o.kt)("strong",{parentName:"li"},"External connections")," -> ",(0,o.kt)("strong",{parentName:"li"},"Linked services"),", select ",(0,o.kt)("strong",{parentName:"li"},"+ New")),(0,o.kt)("li",{parentName:"ul"},"Select the workspace you want to log the model in and create the linked service. 
You need the ",(0,o.kt)("strong",{parentName:"li"},"name of the linked service")," to set up connection.")),(0,o.kt)("h4",{id:"auth-synapse-workspace"},"Auth Synapse Workspace"),(0,o.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/Documentation/ml_linked_service_2.png",width:"600"}),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"Go to the ",(0,o.kt)("strong",{parentName:"li"},"Azure Machine Learning workspace")," resource -> ",(0,o.kt)("strong",{parentName:"li"},"access control (IAM)")," -> ",(0,o.kt)("strong",{parentName:"li"},"Role assignment"),", select ",(0,o.kt)("strong",{parentName:"li"},"+ Add"),", choose ",(0,o.kt)("strong",{parentName:"li"},"Add role assignment")),(0,o.kt)("li",{parentName:"ul"},"Choose ",(0,o.kt)("strong",{parentName:"li"},"contributor"),", select next"),(0,o.kt)("li",{parentName:"ul"},"In members page, choose ",(0,o.kt)("strong",{parentName:"li"},"Managed identity"),", select ",(0,o.kt)("strong",{parentName:"li"},"+ select members"),". Under ",(0,o.kt)("strong",{parentName:"li"},"managed identity"),", choose Synapse workspace. Under ",(0,o.kt)("strong",{parentName:"li"},"Select"),", choose the workspace you run your experiment on. 
Click ",(0,o.kt)("strong",{parentName:"li"},"Select"),", ",(0,o.kt)("strong",{parentName:"li"},"Review + assign"),".")),(0,o.kt)("h4",{id:"use-mlflow-in-synapse-with-linked-service"},"Use MLFlow in Synapse with Linked Service"),(0,o.kt)("p",null,"Set up connection"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'\n#AML\xa0workspace\xa0authentication\xa0using\xa0linked\xa0service\nfrom\xa0notebookutils.mssparkutils\xa0import\xa0azureML\nlinked_service_name = "YourLinkedServiceName"\nws\xa0=\xa0azureML.getWorkspace(linked_service_name)\nmlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())\n\n#Set\xa0MLflow\xa0experiment.\xa0\nexperiment_name\xa0=\xa0"synapse-mlflow-experiment"\nmlflow.set_experiment(experiment_name)\xa0\n')),(0,o.kt)("h4",{id:"use-mlflow-in-synapse-without-a-linked-service"},"Use MLFlow in Synapse without a Linked Service"),(0,o.kt)("p",null,"Once you create an AML workspace, you can obtain the MLflow tracking URL directly. The AML start page is where you can locate the MLflow tracking URL."),(0,o.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/Documentation/mlflow_tracking_url.png",width:"600"}),'You can set it tracking url with ```python mlflow.set_tracking_uri("your mlflow tracking url") ```',(0,o.kt)("h2",{id:"mlflow-api-reference"},"MLFlow API 
Reference"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.save_model"},"mlflow.spark.save_model")),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.log_model"},"mlflow.spark.log_model")),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"https://www.mlflow.org/docs/latest/python_api/mlflow.spark.html#mlflow.spark.load_model"},"mlflow.spark.load_model")),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.log_metric"},"mlflow.log_metric"))),(0,o.kt)("h2",{id:"examples"},"Examples"),(0,o.kt)("h3",{id:"lightgbmclassifier"},"LightGBMClassifier"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'import mlflow\nfrom synapse.ml.featurize import Featurize\nfrom synapse.ml.lightgbm import *\nfrom synapse.ml.train import ComputeModelStatistics\n\nwith mlflow.start_run():\n\n feature_columns = ["Number of times pregnant","Plasma glucose concentration a 2 hours in an oral glucose tolerance test",\n "Diastolic blood pressure (mm Hg)","Triceps skin fold thickness (mm)","2-Hour serum insulin (mu U/ml)",\n "Body mass index (weight in kg/(height in m)^2)","Diabetes pedigree function","Age (years)"]\n df = spark.createDataFrame([\n (0,131,66,40,0,34.3,0.196,22,1),\n (7,194,68,28,0,35.9,0.745,41,1),\n (3,139,54,0,0,25.6,0.402,22,1),\n (6,134,70,23,130,35.4,0.542,29,1),\n (9,124,70,33,402,35.4,0.282,34,0),\n (0,93,100,39,72,43.4,1.021,35,0),\n (4,110,76,20,100,28.4,0.118,27,0),\n (2,127,58,24,275,27.7,1.6,25,0),\n (0,104,64,37,64,33.6,0.51,22,1),\n (2,120,54,0,0,26.8,0.455,27,0),\n (7,178,84,0,0,39.9,0.331,41,1),\n (2,88,58,26,16,28.4,0.766,22,0),\n (1,91,64,24,0,29.2,0.192,21,0),\n (10,101,76,48,180,32.9,0.171,63,0),\n 
(5,73,60,0,0,26.8,0.268,27,0),\n (3,158,70,30,328,35.5,0.344,35,1),\n (2,105,75,0,0,23.3,0.56,53,0),\n (12,84,72,31,0,29.7,0.297,46,1),\n (9,119,80,35,0,29.0,0.263,29,1),\n (6,93,50,30,64,28.7,0.356,23,0),\n (1,126,60,0,0,30.1,0.349,47,1)\n ], feature_columns+["labels"]).repartition(2)\n\n\n featurize = (Featurize()\n .setOutputCol("features")\n .setInputCols(feature_columns)\n .setOneHotEncodeCategoricals(True)\n .setNumFeatures(4096))\n\n df_trans = featurize.fit(df).transform(df)\n\n lightgbm_classifier = (LightGBMClassifier()\n .setFeaturesCol("features")\n .setRawPredictionCol("rawPrediction")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10)\n .setObjective("binary")\n .setLabelCol("labels")\n .setLeafPredictionCol("leafPrediction")\n .setFeaturesShapCol("featuresShap"))\n\n lightgbm_model = lightgbm_classifier.fit(df_trans)\n\n # Use mlflow.spark.save_model to save the model to your path\n mlflow.spark.save_model(lightgbm_model, "lightgbm_model")\n # Use mlflow.spark.log_model to log the model if you have a connected mlflow service\n mlflow.spark.log_model(lightgbm_model, "lightgbm_model")\n\n # Use mlflow.pyfunc.load_model to load model back as PyFuncModel and apply predict\n prediction = mlflow.pyfunc.load_model("lightgbm_model").predict(df_trans.toPandas())\n prediction = list(map(str, prediction))\n mlflow.log_param("prediction", ",".join(prediction))\n\n # Use mlflow.spark.load_model to load model back as PipelineModel and apply transform\n predictions = mlflow.spark.load_model("lightgbm_model").transform(df_trans)\n metrics = ComputeModelStatistics(evaluationMetric="classification", labelCol=\'labels\', scoredLabelsCol=\'prediction\').transform(predictions).collect()\n mlflow.log_metric("accuracy", metrics[0][\'accuracy\'])\n')),(0,o.kt)("h3",{id:"cognitive-services"},"Cognitive Services"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'import mlflow\nfrom synapse.ml.cognitive import *\n\nwith 
mlflow.start_run():\n\n text_key = "YOUR_COG_SERVICE_SUBSCRIPTION_KEY"\n df = spark.createDataFrame([\n ("I am so happy today, its sunny!", "en-US"),\n ("I am frustrated by this rush hour traffic", "en-US"),\n ("The cognitive services on spark aint bad", "en-US"),\n ], ["text", "language"])\n\n sentiment_model = (TextSentiment()\n .setSubscriptionKey(text_key)\n .setLocation("eastus")\n .setTextCol("text")\n .setOutputCol("prediction")\n .setErrorCol("error")\n .setLanguageCol("language"))\n\n display(sentiment_model.transform(df))\n\n mlflow.spark.save_model(sentiment_model, "sentiment_model")\n mlflow.spark.log_model(sentiment_model, "sentiment_model")\n\n output_df = mlflow.spark.load_model("sentiment_model").transform(df)\n display(output_df)\n\n # In order to call the predict function successfully you need to specify the\n # outputCol name as `prediction`\n prediction = mlflow.pyfunc.load_model("sentiment_model").predict(df.toPandas())\n prediction = list(map(str, prediction))\n mlflow.log_param("prediction", ",".join(prediction))\n')))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/15a59c33.33bffc88.js b/assets/js/15a59c33.33bffc88.js new file mode 100644 index 0000000000..723302cf80 --- /dev/null +++ b/assets/js/15a59c33.33bffc88.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[7924],{3905:function(e,t,n){n.d(t,{Zo:function(){return u},kt:function(){return m}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function i(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var 
o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=a.createContext({}),p=function(e){var t=a.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},u=function(e){var t=p(e.components);return a.createElement(s.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},c=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,s=e.parentName,u=l(e,["components","mdxType","originalType","parentName"]),c=p(n),m=r,h=c["".concat(s,".").concat(m)]||c[m]||d[m]||o;return n?a.createElement(h,i(i({ref:t},u),{},{components:n})):a.createElement(h,i({ref:t},u))}));function m(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,i=new Array(o);i[0]=c;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:r,i[1]=l;for(var p=2;p)\n\n# See notebook examples for how to create and save several\n# examples of CNTK models\nnetwork = CNTKModel.load("file:///path/to/my_cntkmodel.mml")\n\ntransformed_df = network.transform(df).makeReply()\n\nserver = transformed_df \\\n .writeStream \\\n .server() \\\n .replyTo("my_api") \\\n .queryName("my_query") \\\n .option("checkpointLocation", "file:///path/to/checkpoints") \\\n .start()\n')),(0,o.kt)("h2",{id:"architecture"},"Architecture"),(0,o.kt)("p",null,"Spark Serving adds special streaming sources and sinks to turn any\nstructured streaming job into a web service. Spark Serving comes\nwith two deployment options that vary based on what form of load balancing\nis being used. 
"),(0,o.kt)("p",null,"In brief you can use:\n",(0,o.kt)("inlineCode",{parentName:"p"},"spark.readStream.server()"),": For head node load balanced services\n",(0,o.kt)("inlineCode",{parentName:"p"},"spark.readStream.distributedServer()"),": For custom load balanced services\n",(0,o.kt)("inlineCode",{parentName:"p"},"spark.readStream.continuousServer()"),": For a custom load balanced, submillisecond-latency continuous server"),(0,o.kt)("p",null,"to create the various different serving dataframes and use the equivalent statements after ",(0,o.kt)("inlineCode",{parentName:"p"},"df.writeStream"),"\nfor replying to the web requests."),(0,o.kt)("h3",{id:"head-node-load-balanced"},"Head Node Load Balanced"),(0,o.kt)("p",null,"You can deploy head node load balancing with the ",(0,o.kt)("inlineCode",{parentName:"p"},"HTTPSource")," and\n",(0,o.kt)("inlineCode",{parentName:"p"},"HTTPSink")," classes. This mode spins up a queue on the head node,\ndistributes work across partitions, then collects response data back to\nthe head node. All HTTP requests are kept and replied to on the head\nnode. In both python and Scala these classes can be access by using\n",(0,o.kt)("inlineCode",{parentName:"p"},"spark.readStream.server()")," after importing SynapseML.\nThis mode allows for more complex windowing, repartitioning, and\nSQL operations. This option is also idea for rapid setup and testing,\nas it doesn't require any further load balancing or network\nswitches. 
A diagram of this configuration can be seen in this image:"),(0,o.kt)("p",{align:"center"},(0,o.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/HeadNodeDistributed2.png",width:"600"})),(0,o.kt)("h3",{id:"fully-distributed-custom-load-balancer"},"Fully Distributed (Custom Load Balancer)"),(0,o.kt)("p",null,"You can configure Spark Serving for a custom load balancer using the\n",(0,o.kt)("inlineCode",{parentName:"p"},"DistributedHTTPSource")," and ",(0,o.kt)("inlineCode",{parentName:"p"},"DistributedHTTPSink")," classes. This mode\nspins up servers on each executor JVM.\nIn both python and Scala these classes can be access by using\n",(0,o.kt)("inlineCode",{parentName:"p"},"spark.readStream.distributedServer()")," after importing SynapseML.\nEach server will feed its\nexecutor's partitions in parallel. This mode is key for high throughput\nand low latency as data doesn't need to be transferred to and from the\nhead node. This deployment results in several web services that all\nroute into the same spark computation. You can deploy an external load\nbalancer to unify the executor's services under a single IP address.\nSupport for automatic load balancer management and deployment is\ntargeted for the next release of SynapseML. A diagram of this\nconfiguration can be seen here:"),(0,o.kt)("p",{align:"center"},(0,o.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/FullyDistributed2.png",width:"600"})),(0,o.kt)("p",null,"Queries that involve data movement across workers, such as a nontrivial\nSQL join, need special consideration. The user must ensure that the\nright machine replies to each request. One can route data back to the\noriginating partition with a broadcast join. 
In the future, request\nrouting will be automatically handled by the sink."),(0,o.kt)("h3",{id:"sub-millisecond-latency-with-continuous-processing"},"Sub-Millisecond Latency with Continuous Processing"),(0,o.kt)("p",{align:"center"},(0,o.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/latency_comparison.png",width:"600"})),(0,o.kt)("p",null,"Continuous processing can be enabled by hooking into the ",(0,o.kt)("inlineCode",{parentName:"p"},"HTTPSourceV2")," class using:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},"spark.readStream.continuousServer()\n ...\n")),(0,o.kt)("p",null,"In continuous serving, much like continuous streaming you need to add a trigger to your write statement:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},' df.writeStream\n .continuousServer()\n .trigger(continuous="1 second")\n ...\n')),(0,o.kt)("p",null,"The architecture is similar to the custom load balancer setup described earlier.\nMore specifically, Spark will manage a web service on each partition.\nThese webservices can be unified together using an Azure Load Balancer,\nKubernetes Service Endpoint, Azure Application gateway or any other way to load balance a distributed service.\nIt's currently the user's responsibility to optionally unify these services as they see fit.\nIn the future, we'll include options to dynamically spin up and manage a load balancer."),(0,o.kt)("h4",{id:"databricks-setup"},"Databricks Setup"),(0,o.kt)("p",null,"Databricks is a managed architecture and they've restricted\nall incoming traffic to the nodes of the cluster.\nIf you create a web service in your databricks cluster (head or worker nodes),\nyour cluster can communicate with the service, but the outside world can't.\nHowever, in the future, Databricks will support Virtual Network Injection, so problem will not arise.\nIn the meantime, you must use SSH tunneling to forward the services to another machine(s)\nto act as a networking gateway. 
This machine can be any machine that accepts SSH traffic and requests.\nWe have included settings to automatically configure this SSH tunneling for convenience."),(0,o.kt)("h5",{id:"linux-gateway-setup---azure"},"Linux Gateway Setup - Azure"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("a",{parentName:"li",href:"https://docs.microsoft.com/en-us/azure/virtual-machines/linux/quick-create-portal"},"Create a Linux VM using SSH")),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("a",{parentName:"li",href:"https://docs.microsoft.com/en-us/azure/virtual-machines/windows/nsg-quickstart-portal"},"Open ports 8000-9999 from the Azure portal")),(0,o.kt)("li",{parentName:"ol"},"Open the port on the firewall on the VM",(0,o.kt)("pre",{parentName:"li"},(0,o.kt)("code",{parentName:"pre",className:"language-$xslt"},'firewall-cmd --zone=public --add-port=8000-10000/tcp --permanent\nfirewall-cmd --reload\necho "GatewayPorts yes" >> /etc/ssh/sshd_config\nservice ssh --full-restart\n'))),(0,o.kt)("li",{parentName:"ol"},"Add your private key to a private container in ",(0,o.kt)("a",{parentName:"li",href:"https://docs.microsoft.com/en-us/azure/storage/common/storage-quickstart-create-account?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&tabs=portal"},"Azure Storage Blob"),"."),(0,o.kt)("li",{parentName:"ol"},"Generate a SAS link for your key and save it."),(0,o.kt)("li",{parentName:"ol"},'Include the following parameters on your reader to configure the SSH tunneling:\nserving_inputs = (spark.readStream.continuousServer()\n.option("numPartitions", 1)\n.option("forwarding.enabled", True) # enable ssh forwarding to a gateway machine\n.option("forwarding.username", "username")\n.option("forwarding.sshHost", "ip or dns")\n.option("forwarding.keySas", "SAS url from the previous step")\n.address("localhost", 8904, "my_api")\n.load()')),(0,o.kt)("p",null,"This setup will make your service require an extra jump and affect latency.\nIt's important to pick a gateway that has good connectivity 
to your spark cluster.\nFor best performance and ease of configuration, we suggest using Spark Serving\non an open cluster environment such as Kubernetes, Mesos, or Azure Batch."),(0,o.kt)("h2",{id:"parameters"},"Parameters"),(0,o.kt)("table",null,(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:null},"Parameter Name"),(0,o.kt)("th",{parentName:"tr",align:null},"Description"),(0,o.kt)("th",{parentName:"tr",align:null},"Necessary"),(0,o.kt)("th",{parentName:"tr",align:null},"Default Value"),(0,o.kt)("th",{parentName:"tr",align:null},"Applicable When"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"host"),(0,o.kt)("td",{parentName:"tr",align:null},"The host to spin up a server on"),(0,o.kt)("td",{parentName:"tr",align:null},"Yes"),(0,o.kt)("td",{parentName:"tr",align:null}),(0,o.kt)("td",{parentName:"tr",align:null})),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"port"),(0,o.kt)("td",{parentName:"tr",align:null},"The starting port when creating the web services. Web services will increment this port several times to find an open port. In the future, the flexibility of this param will be expanded"),(0,o.kt)("td",{parentName:"tr",align:null},"yes"),(0,o.kt)("td",{parentName:"tr",align:null}),(0,o.kt)("td",{parentName:"tr",align:null})),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"name"),(0,o.kt)("td",{parentName:"tr",align:null},"The Path of the api a user would call. 
The format is ",(0,o.kt)("inlineCode",{parentName:"td"},"hostname:port/name")),(0,o.kt)("td",{parentName:"tr",align:null},"yes"),(0,o.kt)("td",{parentName:"tr",align:null}),(0,o.kt)("td",{parentName:"tr",align:null})),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"forwarding.enabled"),(0,o.kt)("td",{parentName:"tr",align:null},"Whether to forward the services to a gateway machine"),(0,o.kt)("td",{parentName:"tr",align:null},"no"),(0,o.kt)("td",{parentName:"tr",align:null},"false"),(0,o.kt)("td",{parentName:"tr",align:null},"When you need to forward services out of a protected network. Only Supported for Continuous Serving.")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"forwarding.username"),(0,o.kt)("td",{parentName:"tr",align:null},"the username to connect to on the remote host"),(0,o.kt)("td",{parentName:"tr",align:null},"no"),(0,o.kt)("td",{parentName:"tr",align:null}),(0,o.kt)("td",{parentName:"tr",align:null})),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"forwarding.sshport"),(0,o.kt)("td",{parentName:"tr",align:null},"the port to ssh connect to"),(0,o.kt)("td",{parentName:"tr",align:null},"no"),(0,o.kt)("td",{parentName:"tr",align:null},"22"),(0,o.kt)("td",{parentName:"tr",align:null})),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"forwarding.sshHost"),(0,o.kt)("td",{parentName:"tr",align:null},"the host of the gateway machine"),(0,o.kt)("td",{parentName:"tr",align:null},"no"),(0,o.kt)("td",{parentName:"tr",align:null}),(0,o.kt)("td",{parentName:"tr",align:null})),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"forwarding.keySas"),(0,o.kt)("td",{parentName:"tr",align:null},"A Secure access link that can be used to automatically download the required ssh private 
key"),(0,o.kt)("td",{parentName:"tr",align:null},"no"),(0,o.kt)("td",{parentName:"tr",align:null}),(0,o.kt)("td",{parentName:"tr",align:null},"Sometimes more convenient than a directory")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"forwarding.keyDir"),(0,o.kt)("td",{parentName:"tr",align:null},"A directory on the machines holding the private key"),(0,o.kt)("td",{parentName:"tr",align:null},"no"),(0,o.kt)("td",{parentName:"tr",align:null},'"~/.ssh"'),(0,o.kt)("td",{parentName:"tr",align:null},"Useful if you can't send keys over the wire securely")))))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/109f6864.63edc8b3.js b/assets/js/15ea8379.449501db.js similarity index 97% rename from assets/js/109f6864.63edc8b3.js rename to assets/js/15ea8379.449501db.js index 7ad89af24a..3c847bb002 100644 --- a/assets/js/109f6864.63edc8b3.js +++ b/assets/js/15ea8379.449501db.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[4994],{3905:function(e,t,a){a.d(t,{Zo:function(){return u},kt:function(){return d}});var r=a(7294);function n(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function i(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,r)}return a}function s(e){for(var t=1;t=0||(n[a]=e[a]);return n}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(n[a]=e[a])}return n}var l=r.createContext({}),p=function(e){var t=r.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):s(s({},t),e)),a},u=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return 
r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var a=e.components,n=e.mdxType,i=e.originalType,l=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),m=p(a),d=n,g=m["".concat(l,".").concat(d)]||m[d]||c[d]||i;return a?r.createElement(g,s(s({ref:t},u),{},{components:a})):r.createElement(g,s({ref:t},u))}));function d(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var i=a.length,s=new Array(i);s[0]=m;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:n,s[1]=o;for(var p=2;p=0||(n[a]=e[a]);return n}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(n[a]=e[a])}return n}var l=r.createContext({}),p=function(e){var t=r.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):s(s({},t),e)),a},u=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var a=e.components,n=e.mdxType,i=e.originalType,l=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),m=p(a),d=n,g=m["".concat(l,".").concat(d)]||m[d]||c[d]||i;return a?r.createElement(g,s(s({ref:t},u),{},{components:a})):r.createElement(g,s({ref:t},u))}));function d(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var i=a.length,s=new Array(i);s[0]=m;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:n,s[1]=o;for(var p=2;p=0||(n[a]=e[a]);return n}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(n[a]=e[a])}return n}var u=r.createContext({}),i=function(e){var t=r.useContext(u),a=t;return e&&(a="function"==typeof e?e(t):l(l({},t),e)),a},c=function(e){var t=i(e.components);return 
r.createElement(u.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var a=e.components,n=e.mdxType,o=e.originalType,u=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=i(a),b=n,f=m["".concat(u,".").concat(b)]||m[b]||p[b]||o;return a?r.createElement(f,l(l({ref:t},c),{},{components:a})):r.createElement(f,l({ref:t},c))}));function b(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var o=a.length,l=new Array(o);l[0]=m;var s={};for(var u in t)hasOwnProperty.call(t,u)&&(s[u]=t[u]);s.originalType=e,s.mdxType="string"==typeof e?e:n,l[1]=s;for(var i=2;i child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:a.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function m(e){var t=e.values,a=e.children;return(0,n.useMemo)((function(){var e=null!=t?t:p(a);return function(e){var t=(0,i.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,a])}function b(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function f(e){var t=e.queryString,a=void 0!==t&&t,r=e.groupId,o=(0,s.k6)(),l=function(e){var t=e.queryString,a=void 0!==t&&t,r=e.groupId;if("string"==typeof a)return a;if(!1===a)return null;if(!0===a&&!r)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. 
You can also provide an explicit value such as queryString="my-search-param".');return null!=r?r:null}({queryString:a,groupId:r});return[(0,u._X)(l),(0,n.useCallback)((function(e){if(l){var t=new URLSearchParams(o.location.search);t.set(l,e),o.replace(Object.assign({},o.location,{search:t.toString()}))}}),[l,o])]}function d(e){var t,a,r,o,l=e.defaultValue,s=e.queryString,u=void 0!==s&&s,i=e.groupId,p=m(e),d=(0,n.useState)((function(){return function(e){var t,a=e.defaultValue,r=e.tabValues;if(0===r.length)throw new Error("Docusaurus error: the component requires at least one children component");if(a){if(!b({value:a,tabValues:r}))throw new Error('Docusaurus error: The has a defaultValue "'+a+'" but none of its children has the corresponding value. Available values are: '+r.map((function(e){return e.value})).join(", ")+". If you intend to show no default tab, use defaultValue={null} instead.");return a}var n=null!=(t=r.find((function(e){return e.default})))?t:r[0];if(!n)throw new Error("Unexpected error: 0 tabValues");return n.value}({defaultValue:l,tabValues:p})})),v=d[0],y=d[1],w=f({queryString:u,groupId:i}),h=w[0],g=w[1],E=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:i}.groupId),a=(0,c.Nk)(t),r=a[0],o=a[1],[r,(0,n.useCallback)((function(e){t&&o.set(e)}),[t,o])]),k=E[0],V=E[1],x=function(){var e=null!=h?h:k;return b({value:e,tabValues:p})?e:null}();return(0,n.useLayoutEffect)((function(){x&&y(x)}),[x]),{selectedValue:v,selectValue:(0,n.useCallback)((function(e){if(!b({value:e,tabValues:p}))throw new Error("Can't select invalid tab value="+e);y(e),g(e),V(e)}),[g,V,p]),tabValues:p}}var v=a(2389),y="tabList__CuJ",w="tabItem_LNqP";function h(e){var t=e.className,a=e.block,s=e.selectedValue,u=e.selectValue,i=e.tabValues,c=[],p=(0,l.o5)().blockElementScrollPositionUntilNextRender,m=function(e){var t=e.currentTarget,a=c.indexOf(t),r=i[a].value;r!==s&&(p(t),u(r))},b=function(e){var t,a=null;switch(e.key){case"Enter":m(e);break;case"ArrowRight":var 
r,n=c.indexOf(e.currentTarget)+1;a=null!=(r=c[n])?r:c[0];break;case"ArrowLeft":var o,l=c.indexOf(e.currentTarget)-1;a=null!=(o=c[l])?o:c[c.length-1]}null==(t=a)||t.focus()};return n.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.Z)("tabs",{"tabs--block":a},t)},i.map((function(e){var t=e.value,a=e.label,l=e.attributes;return n.createElement("li",(0,r.Z)({role:"tab",tabIndex:s===t?0:-1,"aria-selected":s===t,key:t,ref:function(e){return c.push(e)},onKeyDown:b,onClick:m},l,{className:(0,o.Z)("tabs__item",w,null==l?void 0:l.className,{"tabs__item--active":s===t})}),null!=a?a:t)})))}function g(e){var t=e.lazy,a=e.children,r=e.selectedValue,o=(Array.isArray(a)?a:[a]).filter(Boolean);if(t){var l=o.find((function(e){return e.props.value===r}));return l?(0,n.cloneElement)(l,{className:"margin-top--md"}):null}return n.createElement("div",{className:"margin-top--md"},o.map((function(e,t){return(0,n.cloneElement)(e,{key:t,hidden:e.props.value!==r})})))}function E(e){var t=d(e);return n.createElement("div",{className:(0,o.Z)("tabs-container",y)},n.createElement(h,(0,r.Z)({},e,t)),n.createElement(g,(0,r.Z)({},e,t)))}function k(e){var t=(0,v.Z)();return n.createElement(E,(0,r.Z)({key:String(t)},e))}},1989:function(e,t,a){var r=a(7294),n=a(2263);t.Z=function(e){var t=e.className,a=e.py,o=e.scala,l=e.csharp,s=e.sourceLink,u=(0,n.Z)().siteConfig.customFields.version,i="https://mmlspark.blob.core.windows.net/docs/"+u+"/pyspark/"+a,c="https://mmlspark.blob.core.windows.net/docs/"+u+"/scala/"+o,p="https://mmlspark.blob.core.windows.net/docs/"+u+"/dotnet/"+l;return r.createElement("table",null,r.createElement("tbody",null,r.createElement("tr",null,r.createElement("td",null,r.createElement("strong",null,"Python API: "),r.createElement("a",{href:i},t)),r.createElement("td",null,r.createElement("strong",null,"Scala API: "),r.createElement("a",{href:c},t)),r.createElement("td",null,r.createElement("strong",null,".NET API: 
"),r.createElement("a",{href:p},t)),r.createElement("td",null,r.createElement("strong",null,"Source: "),r.createElement("a",{href:s},t)))))}},9956:function(e,t,a){a.r(t),a.d(t,{assets:function(){return y},contentTitle:function(){return d},default:function(){return g},frontMatter:function(){return f},metadata:function(){return v},toc:function(){return w}});var r=a(3117),n=a(102),o=(a(7294),a(3905)),l=a(4866),s=a(5162),u=a(1989),i=["components"],c=[{value:"VowpalWabbitRegressor",id:"vowpalwabbitregressor",level:2},{value:"VowpalWabbitContextualBandit",id:"vowpalwabbitcontextualbandit",level:2}],p={toc:c};function m(e){var t=e.components,a=(0,n.Z)(e,i);return(0,o.kt)("wrapper",(0,r.Z)({},p,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"vowpalwabbitregressor"},"VowpalWabbitRegressor"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(s.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.vw import *\n\nvw = (VowpalWabbitRegressor()\n .setLabelCol("Y1")\n .setFeaturesCol("features")\n .setPredictionCol("pred"))\n\nvwRegressor = (VowpalWabbitRegressor()\n .setNumPasses(20)\n .setPassThroughArgs("--holdout_off --loss_function quantile -q :: -l 0.1"))\n'))),(0,o.kt)(s.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.vw._\n\nval vw = (new VowpalWabbitRegressor()\n .setLabelCol("Y1")\n .setFeaturesCol("features")\n .setPredictionCol("pred"))\n\nval vwRegressor = (new VowpalWabbitRegressor()\n .setNumPasses(20)\n .setPassThroughArgs("--holdout_off --loss_function quantile -q :: -l 
0.1"))\n\n')))),(0,o.kt)(u.Z,{className:"VowpalWabbitRegressor",py:"synapse.ml.vw.html#module-synapse.ml.vw.VowpalWabbitRegressor",scala:"com/microsoft/azure/synapse/ml/vw/VowpalWabbitRegressor.html",csharp:"classSynapse_1_1ML_1_1Vw_1_1VowpalWabbitRegressor.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VowpalWabbitRegressor.scala",mdxType:"DocTable"}),(0,o.kt)("h2",{id:"vowpalwabbitcontextualbandit"},"VowpalWabbitContextualBandit"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(s.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.vw import *\n\ncb = (VowpalWabbitContextualBandit()\n .setPassThroughArgs("--cb_explore_adf --epsilon 0.2 --quiet")\n .setLabelCol("cost")\n .setProbabilityCol("prob")\n .setChosenActionCol("chosen_action")\n .setSharedCol("shared_features")\n .setFeaturesCol("action_features")\n .setUseBarrierExecutionMode(False))\n'))),(0,o.kt)(s.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.vw._\n\nval cb = (new VowpalWabbitContextualBandit()\n .setPassThroughArgs("--cb_explore_adf --epsilon 0.2 --quiet")\n .setLabelCol("cost")\n .setProbabilityCol("prob")\n .setChosenActionCol("chosen_action")\n .setSharedCol("shared_features")\n .setFeaturesCol("action_features")\n 
.setUseBarrierExecutionMode(false))\n\n')))),(0,o.kt)(u.Z,{className:"VowpalWabbitContextualBandit",py:"synapse.ml.vw.html#module-synapse.ml.vw.VowpalWabbitContextualBandit",scala:"com/microsoft/azure/synapse/ml/vw/VowpalWabbitContextualBandit.html",csharp:"classSynapse_1_1ML_1_1Vw_1_1VowpalWabbitContextualBandit.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VowpalWabbitContextualBandit.scala",mdxType:"DocTable"}))}m.isMDXComponent=!0;var b=["components"],f={title:"Estimators - Vowpal Wabbit",sidebar_label:"Vowpal Wabbit",hide_title:!0},d="Vowpal Wabbit",v={unversionedId:"Quick Examples/estimators/estimators_vw",id:"version-0.11.3/Quick Examples/estimators/estimators_vw",title:"Estimators - Vowpal Wabbit",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/estimators/estimators_vw.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_vw",permalink:"/SynapseML/docs/Quick Examples/estimators/estimators_vw",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Estimators - Vowpal Wabbit",sidebar_label:"Vowpal Wabbit",hide_title:!0}},y={},w=[].concat(c),h={toc:w};function g(e){var t=e.components,a=(0,n.Z)(e,b);return(0,o.kt)("wrapper",(0,r.Z)({},h,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"vowpal-wabbit"},"Vowpal Wabbit"),(0,o.kt)(m,{mdxType:"VW"}))}g.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[4535],{3905:function(e,t,a){a.d(t,{Zo:function(){return c},kt:function(){return b}});var r=a(7294);function n(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function o(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,r)}return a}function 
l(e){for(var t=1;t=0||(n[a]=e[a]);return n}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(n[a]=e[a])}return n}var u=r.createContext({}),i=function(e){var t=r.useContext(u),a=t;return e&&(a="function"==typeof e?e(t):l(l({},t),e)),a},c=function(e){var t=i(e.components);return r.createElement(u.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var a=e.components,n=e.mdxType,o=e.originalType,u=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=i(a),b=n,f=m["".concat(u,".").concat(b)]||m[b]||p[b]||o;return a?r.createElement(f,l(l({ref:t},c),{},{components:a})):r.createElement(f,l({ref:t},c))}));function b(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var o=a.length,l=new Array(o);l[0]=m;var s={};for(var u in t)hasOwnProperty.call(t,u)&&(s[u]=t[u]);s.originalType=e,s.mdxType="string"==typeof e?e:n,l[1]=s;for(var i=2;i child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:a.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function m(e){var t=e.values,a=e.children;return(0,n.useMemo)((function(){var e=null!=t?t:p(a);return function(e){var t=(0,i.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . 
Every value needs to be unique.')}(e),e}),[t,a])}function b(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function f(e){var t=e.queryString,a=void 0!==t&&t,r=e.groupId,o=(0,s.k6)(),l=function(e){var t=e.queryString,a=void 0!==t&&t,r=e.groupId;if("string"==typeof a)return a;if(!1===a)return null;if(!0===a&&!r)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=r?r:null}({queryString:a,groupId:r});return[(0,u._X)(l),(0,n.useCallback)((function(e){if(l){var t=new URLSearchParams(o.location.search);t.set(l,e),o.replace(Object.assign({},o.location,{search:t.toString()}))}}),[l,o])]}function d(e){var t,a,r,o,l=e.defaultValue,s=e.queryString,u=void 0!==s&&s,i=e.groupId,p=m(e),d=(0,n.useState)((function(){return function(e){var t,a=e.defaultValue,r=e.tabValues;if(0===r.length)throw new Error("Docusaurus error: the component requires at least one children component");if(a){if(!b({value:a,tabValues:r}))throw new Error('Docusaurus error: The has a defaultValue "'+a+'" but none of its children has the corresponding value. Available values are: '+r.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return a}var n=null!=(t=r.find((function(e){return e.default})))?t:r[0];if(!n)throw new Error("Unexpected error: 0 tabValues");return n.value}({defaultValue:l,tabValues:p})})),v=d[0],y=d[1],w=f({queryString:u,groupId:i}),h=w[0],g=w[1],E=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:i}.groupId),a=(0,c.Nk)(t),r=a[0],o=a[1],[r,(0,n.useCallback)((function(e){t&&o.set(e)}),[t,o])]),k=E[0],V=E[1],x=function(){var e=null!=h?h:k;return b({value:e,tabValues:p})?e:null}();return(0,n.useLayoutEffect)((function(){x&&y(x)}),[x]),{selectedValue:v,selectValue:(0,n.useCallback)((function(e){if(!b({value:e,tabValues:p}))throw new Error("Can't select invalid tab value="+e);y(e),g(e),V(e)}),[g,V,p]),tabValues:p}}var v=a(2389),y="tabList__CuJ",w="tabItem_LNqP";function h(e){var t=e.className,a=e.block,s=e.selectedValue,u=e.selectValue,i=e.tabValues,c=[],p=(0,l.o5)().blockElementScrollPositionUntilNextRender,m=function(e){var t=e.currentTarget,a=c.indexOf(t),r=i[a].value;r!==s&&(p(t),u(r))},b=function(e){var t,a=null;switch(e.key){case"Enter":m(e);break;case"ArrowRight":var r,n=c.indexOf(e.currentTarget)+1;a=null!=(r=c[n])?r:c[0];break;case"ArrowLeft":var o,l=c.indexOf(e.currentTarget)-1;a=null!=(o=c[l])?o:c[c.length-1]}null==(t=a)||t.focus()};return n.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.Z)("tabs",{"tabs--block":a},t)},i.map((function(e){var t=e.value,a=e.label,l=e.attributes;return n.createElement("li",(0,r.Z)({role:"tab",tabIndex:s===t?0:-1,"aria-selected":s===t,key:t,ref:function(e){return c.push(e)},onKeyDown:b,onClick:m},l,{className:(0,o.Z)("tabs__item",w,null==l?void 0:l.className,{"tabs__item--active":s===t})}),null!=a?a:t)})))}function g(e){var t=e.lazy,a=e.children,r=e.selectedValue,o=(Array.isArray(a)?a:[a]).filter(Boolean);if(t){var l=o.find((function(e){return e.props.value===r}));return 
l?(0,n.cloneElement)(l,{className:"margin-top--md"}):null}return n.createElement("div",{className:"margin-top--md"},o.map((function(e,t){return(0,n.cloneElement)(e,{key:t,hidden:e.props.value!==r})})))}function E(e){var t=d(e);return n.createElement("div",{className:(0,o.Z)("tabs-container",y)},n.createElement(h,(0,r.Z)({},e,t)),n.createElement(g,(0,r.Z)({},e,t)))}function k(e){var t=(0,v.Z)();return n.createElement(E,(0,r.Z)({key:String(t)},e))}},1989:function(e,t,a){var r=a(7294),n=a(2263);t.Z=function(e){var t=e.className,a=e.py,o=e.scala,l=e.csharp,s=e.sourceLink,u=(0,n.Z)().siteConfig.customFields.version,i="https://mmlspark.blob.core.windows.net/docs/"+u+"/pyspark/"+a,c="https://mmlspark.blob.core.windows.net/docs/"+u+"/scala/"+o,p="https://mmlspark.blob.core.windows.net/docs/"+u+"/dotnet/"+l;return r.createElement("table",null,r.createElement("tbody",null,r.createElement("tr",null,r.createElement("td",null,r.createElement("strong",null,"Python API: "),r.createElement("a",{href:i},t)),r.createElement("td",null,r.createElement("strong",null,"Scala API: "),r.createElement("a",{href:c},t)),r.createElement("td",null,r.createElement("strong",null,".NET API: "),r.createElement("a",{href:p},t)),r.createElement("td",null,r.createElement("strong",null,"Source: "),r.createElement("a",{href:s},t)))))}},3314:function(e,t,a){a.r(t),a.d(t,{assets:function(){return y},contentTitle:function(){return d},default:function(){return g},frontMatter:function(){return f},metadata:function(){return v},toc:function(){return w}});var r=a(3117),n=a(102),o=(a(7294),a(3905)),l=a(4866),s=a(5162),u=a(1989),i=["components"],c=[{value:"VowpalWabbitRegressor",id:"vowpalwabbitregressor",level:2},{value:"VowpalWabbitContextualBandit",id:"vowpalwabbitcontextualbandit",level:2}],p={toc:c};function m(e){var 
t=e.components,a=(0,n.Z)(e,i);return(0,o.kt)("wrapper",(0,r.Z)({},p,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"vowpalwabbitregressor"},"VowpalWabbitRegressor"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(s.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.vw import *\n\nvw = (VowpalWabbitRegressor()\n .setLabelCol("Y1")\n .setFeaturesCol("features")\n .setPredictionCol("pred"))\n\nvwRegressor = (VowpalWabbitRegressor()\n .setNumPasses(20)\n .setPassThroughArgs("--holdout_off --loss_function quantile -q :: -l 0.1"))\n'))),(0,o.kt)(s.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.vw._\n\nval vw = (new VowpalWabbitRegressor()\n .setLabelCol("Y1")\n .setFeaturesCol("features")\n .setPredictionCol("pred"))\n\nval vwRegressor = (new VowpalWabbitRegressor()\n .setNumPasses(20)\n .setPassThroughArgs("--holdout_off --loss_function quantile -q :: -l 0.1"))\n\n')))),(0,o.kt)(u.Z,{className:"VowpalWabbitRegressor",py:"synapse.ml.vw.html#module-synapse.ml.vw.VowpalWabbitRegressor",scala:"com/microsoft/azure/synapse/ml/vw/VowpalWabbitRegressor.html",csharp:"classSynapse_1_1ML_1_1Vw_1_1VowpalWabbitRegressor.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VowpalWabbitRegressor.scala",mdxType:"DocTable"}),(0,o.kt)("h2",{id:"vowpalwabbitcontextualbandit"},"VowpalWabbitContextualBandit"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(s.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.vw import *\n\ncb = (VowpalWabbitContextualBandit()\n 
.setPassThroughArgs("--cb_explore_adf --epsilon 0.2 --quiet")\n .setLabelCol("cost")\n .setProbabilityCol("prob")\n .setChosenActionCol("chosen_action")\n .setSharedCol("shared_features")\n .setFeaturesCol("action_features")\n .setUseBarrierExecutionMode(False))\n'))),(0,o.kt)(s.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.vw._\n\nval cb = (new VowpalWabbitContextualBandit()\n .setPassThroughArgs("--cb_explore_adf --epsilon 0.2 --quiet")\n .setLabelCol("cost")\n .setProbabilityCol("prob")\n .setChosenActionCol("chosen_action")\n .setSharedCol("shared_features")\n .setFeaturesCol("action_features")\n .setUseBarrierExecutionMode(false))\n\n')))),(0,o.kt)(u.Z,{className:"VowpalWabbitContextualBandit",py:"synapse.ml.vw.html#module-synapse.ml.vw.VowpalWabbitContextualBandit",scala:"com/microsoft/azure/synapse/ml/vw/VowpalWabbitContextualBandit.html",csharp:"classSynapse_1_1ML_1_1Vw_1_1VowpalWabbitContextualBandit.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VowpalWabbitContextualBandit.scala",mdxType:"DocTable"}))}m.isMDXComponent=!0;var b=["components"],f={title:"Estimators - Vowpal Wabbit",sidebar_label:"Vowpal Wabbit",hide_title:!0},d="Vowpal Wabbit",v={unversionedId:"Quick Examples/estimators/estimators_vw",id:"version-0.11.4/Quick Examples/estimators/estimators_vw",title:"Estimators - Vowpal Wabbit",description:"",source:"@site/versioned_docs/version-0.11.4/Quick Examples/estimators/estimators_vw.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_vw",permalink:"/SynapseML/docs/Quick Examples/estimators/estimators_vw",draft:!1,tags:[],version:"0.11.4",frontMatter:{title:"Estimators - Vowpal Wabbit",sidebar_label:"Vowpal Wabbit",hide_title:!0}},y={},w=[].concat(c),h={toc:w};function g(e){var 
t=e.components,a=(0,n.Z)(e,b);return(0,o.kt)("wrapper",(0,r.Z)({},h,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"vowpal-wabbit"},"Vowpal Wabbit"),(0,o.kt)(m,{mdxType:"VW"}))}g.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/1dc2f362.cdb0b9ed.js b/assets/js/1dc2f362.cdb0b9ed.js new file mode 100644 index 0000000000..9b6a87374d --- /dev/null +++ b/assets/js/1dc2f362.cdb0b9ed.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[9864],{3905:function(e,t,n){n.d(t,{Zo:function(){return m},kt:function(){return u}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function o(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=a.createContext({}),p=function(e){var t=a.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},m=function(e){var t=p(e.components);return a.createElement(s.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,s=e.parentName,m=l(e,["components","mdxType","originalType","parentName"]),d=p(n),u=r,g=d["".concat(s,".").concat(u)]||d[u]||c[u]||i;return n?a.createElement(g,o(o({ref:t},m),{},{components:n})):a.createElement(g,o({ref:t},m))}));function u(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,o=new Array(i);o[0]=d;var l={};for(var s in 
t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:r,o[1]=l;for(var p=2;p=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,s=e.originalType,l=e.parentName,p=o(e,["components","mdxType","originalType","parentName"]),m=c(n),d=a,h=m["".concat(l,".").concat(d)]||m[d]||u[d]||s;return n?r.createElement(h,i(i({ref:t},p),{},{components:n})):r.createElement(h,i({ref:t},p))}));function d(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var s=n.length,i=new Array(s);i[0]=m;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:a,i[1]=o;for(var c=2;c=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,s=e.originalType,l=e.parentName,p=o(e,["components","mdxType","originalType","parentName"]),m=c(n),d=a,h=m["".concat(l,".").concat(d)]||m[d]||u[d]||s;return n?r.createElement(h,i(i({ref:t},p),{},{components:n})):r.createElement(h,i({ref:t},p))}));function d(e,t){var 
n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var s=n.length,i=new Array(s);i[0]=m;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:a,i[1]=o;for(var c=2;c=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var l=a.createContext({}),p=function(e){var t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},m=function(e){var t=p(e.components);return a.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,l=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),d=p(n),c=r,f=d["".concat(l,".").concat(c)]||d[c]||u[c]||o;return n?a.createElement(f,s(s({ref:t},m),{},{components:n})):a.createElement(f,s({ref:t},m))}));function c(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,s=new Array(o);s[0]=d;var i={};for(var l in t)hasOwnProperty.call(t,l)&&(i[l]=t[l]);i.originalType=e,i.mdxType="string"==typeof e?e:r,s[1]=i;for(var p=2;p 3) or bad based on the text of the review. 
You\naccomplish it by training LogisticRegression learners with different\nhyperparameters and choosing the best model."),(0,o.kt)("h2",{id:"setup"},"Setup"),(0,o.kt)("p",null,"Import necessary Python libraries and get a spark session."),(0,o.kt)("h2",{id:"read-the-data"},"Read the data"),(0,o.kt)("p",null,"Download and read in the data."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'rawData = spark.read.parquet(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet"\n)\nrawData.show(5)\n')),(0,o.kt)("h2",{id:"extract-features-and-process-data"},"Extract features and process data"),(0,o.kt)("p",null,"Real data is more complex than the above dataset. It's common\nfor a dataset to have features of multiple types, such as text, numeric, and\ncategorical. To illustrate how difficult it's to work with these\ndatasets, add two numerical features to the dataset: the ",(0,o.kt)("strong",{parentName:"p"},"word count")," of the review and the ",(0,o.kt)("strong",{parentName:"p"},"mean word length"),"."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},"from pyspark.sql.functions import udf\nfrom pyspark.sql.types import *\n\n\ndef wordCount(s):\n return len(s.split())\n\n\ndef wordLength(s):\n import numpy as np\n\n ss = [len(w) for w in s.split()]\n return round(float(np.mean(ss)), 2)\n\n\nwordLengthUDF = udf(wordLength, DoubleType())\nwordCountUDF = udf(wordCount, IntegerType())\n")),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.stages import UDFTransformer\n\nwordLength = "wordLength"\nwordCount = "wordCount"\nwordLengthTransformer = UDFTransformer(\n inputCol="text", outputCol=wordLength, udf=wordLengthUDF\n)\nwordCountTransformer = UDFTransformer(\n inputCol="text", outputCol=wordCount, udf=wordCountUDF\n)\n')),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml 
import Pipeline\n\ndata = (\n Pipeline(stages=[wordLengthTransformer, wordCountTransformer])\n .fit(rawData)\n .transform(rawData)\n .withColumn("label", rawData["rating"] > 3)\n .drop("rating")\n)\n')),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},"data.show(5)\n")),(0,o.kt)("h2",{id:"classify-using-pyspark"},"Classify using pyspark"),(0,o.kt)("p",null,"To choose the best LogisticRegression classifier using the ",(0,o.kt)("inlineCode",{parentName:"p"},"pyspark"),"\nlibrary, you need to ",(0,o.kt)("em",{parentName:"p"},"explicitly")," perform the following steps:"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},"Process the features:",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},"Tokenize the text column"),(0,o.kt)("li",{parentName:"ul"},"Hash the tokenized column into a vector using hashing"),(0,o.kt)("li",{parentName:"ul"},"Merge the numeric features with the vector"))),(0,o.kt)("li",{parentName:"ol"},"Process the label column: cast it into the proper type."),(0,o.kt)("li",{parentName:"ol"},"Train multiple LogisticRegression algorithms on the ",(0,o.kt)("inlineCode",{parentName:"li"},"train")," dataset\nwith different hyperparameters"),(0,o.kt)("li",{parentName:"ol"},"Compute the area under the ROC curve for each of the trained models\nand select the model with the highest metric as computed on the\n",(0,o.kt)("inlineCode",{parentName:"li"},"test")," dataset"),(0,o.kt)("li",{parentName:"ol"},"Evaluate the best model on the ",(0,o.kt)("inlineCode",{parentName:"li"},"validation")," set")),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.feature import Tokenizer, HashingTF\nfrom pyspark.ml.feature import VectorAssembler\n\n# Featurize text column\ntokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText")\nnumFeatures = 10000\nhashingScheme = HashingTF(\n inputCol="tokenizedText", outputCol="TextFeatures", numFeatures=numFeatures\n)\ntokenizedData = 
tokenizer.transform(data)\nfeaturizedData = hashingScheme.transform(tokenizedData)\n\n# Merge text and numeric features in one feature column\nfeatureColumnsArray = ["TextFeatures", "wordCount", "wordLength"]\nassembler = VectorAssembler(inputCols=featureColumnsArray, outputCol="features")\nassembledData = assembler.transform(featurizedData)\n\n# Select only columns of interest\n# Convert rating column from boolean to int\nprocessedData = assembledData.select("label", "features").withColumn(\n "label", assembledData.label.cast(IntegerType())\n)\n')),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.evaluation import BinaryClassificationEvaluator\nfrom pyspark.ml.classification import LogisticRegression\n\n# Prepare data for learning\ntrain, test, validation = processedData.randomSplit([0.60, 0.20, 0.20], seed=123)\n\n# Train the models on the \'train\' data\nlrHyperParams = [0.05, 0.1, 0.2, 0.4]\nlogisticRegressions = [\n LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n]\nevaluator = BinaryClassificationEvaluator(\n rawPredictionCol="rawPrediction", metricName="areaUnderROC"\n)\nmetrics = []\nmodels = []\n\n# Select the best model\nfor learner in logisticRegressions:\n model = learner.fit(train)\n models.append(model)\n scoredData = model.transform(test)\n metrics.append(evaluator.evaluate(scoredData))\nbestMetric = max(metrics)\nbestModel = models[metrics.index(bestMetric)]\n\n# Get AUC on the validation dataset\nscoredVal = bestModel.transform(validation)\nprint(evaluator.evaluate(scoredVal))\n')),(0,o.kt)("h2",{id:"classify-using-synapseml"},"Classify using SynapseML"),(0,o.kt)("p",null,"The pipeline can be simplified by using SynapseML:"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("strong",{parentName:"p"},(0,o.kt)("inlineCode",{parentName:"strong"},"TrainClassifier"))," Estimator featurizes the data internally,\nas long as the 
columns selected in the ",(0,o.kt)("inlineCode",{parentName:"p"},"train"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"test"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"validation"),"\ndataset represent the features")),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("strong",{parentName:"p"},(0,o.kt)("inlineCode",{parentName:"strong"},"FindBestModel"))," Estimator finds the best model from a pool of\ntrained models by finding the model that performs best on the ",(0,o.kt)("inlineCode",{parentName:"p"},"test"),"\ndataset given the specified metric")),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("strong",{parentName:"p"},(0,o.kt)("inlineCode",{parentName:"strong"},"ComputeModelStatistics"))," Transformer computes the different\nmetrics on a scored dataset (in our case, the ",(0,o.kt)("inlineCode",{parentName:"p"},"validation")," dataset)\nat the same time"))),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.train import TrainClassifier, ComputeModelStatistics\nfrom synapse.ml.automl import FindBestModel\n\n# Prepare data for learning\ntrain, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123)\n\n# Train the models on the \'train\' data\nlrHyperParams = [0.05, 0.1, 0.2, 0.4]\nlogisticRegressions = [\n LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n]\nlrmodels = [\n TrainClassifier(model=lrm, labelCol="label", numFeatures=10000).fit(train)\n for lrm in logisticRegressions\n]\n\n# Select the best model\nbestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)\n\n\n# Get AUC on the validation dataset\npredictions = bestModel.transform(validation)\nmetrics = ComputeModelStatistics().transform(predictions)\nprint(\n "Best model\'s AUC on validation set = "\n + "{0:.2f}%".format(metrics.first()["AUC"] * 100)\n)\n')))}c.isMDXComponent=!0}}]); \ No newline at end of file diff --git 
a/assets/js/20641120.b0de029a.js b/assets/js/20641120.b0de029a.js new file mode 100644 index 0000000000..4a767172ba --- /dev/null +++ b/assets/js/20641120.b0de029a.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[787],{3905:function(e,t,n){n.d(t,{Zo:function(){return m},kt:function(){return c}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function s(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var l=a.createContext({}),p=function(e){var t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},m=function(e){var t=p(e.components);return a.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,l=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),d=p(n),c=r,f=d["".concat(l,".").concat(c)]||d[c]||u[c]||o;return n?a.createElement(f,s(s({ref:t},m),{},{components:n})):a.createElement(f,s({ref:t},m))}));function c(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,s=new Array(o);s[0]=d;var i={};for(var l in t)hasOwnProperty.call(t,l)&&(i[l]=t[l]);i.originalType=e,i.mdxType="string"==typeof e?e:r,s[1]=i;for(var p=2;p 3) or bad based on the text of the review. 
You\naccomplish it by training LogisticRegression learners with different\nhyperparameters and choosing the best model."),(0,o.kt)("h2",{id:"setup"},"Setup"),(0,o.kt)("p",null,"Import necessary Python libraries and get a spark session."),(0,o.kt)("h2",{id:"read-the-data"},"Read the data"),(0,o.kt)("p",null,"Download and read in the data."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'rawData = spark.read.parquet(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet"\n)\nrawData.show(5)\n')),(0,o.kt)("h2",{id:"extract-features-and-process-data"},"Extract features and process data"),(0,o.kt)("p",null,"Real data is more complex than the above dataset. It's common\nfor a dataset to have features of multiple types, such as text, numeric, and\ncategorical. To illustrate how difficult it's to work with these\ndatasets, add two numerical features to the dataset: the ",(0,o.kt)("strong",{parentName:"p"},"word count")," of the review and the ",(0,o.kt)("strong",{parentName:"p"},"mean word length"),"."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},"from pyspark.sql.functions import udf\nfrom pyspark.sql.types import *\n\n\ndef wordCount(s):\n return len(s.split())\n\n\ndef wordLength(s):\n import numpy as np\n\n ss = [len(w) for w in s.split()]\n return round(float(np.mean(ss)), 2)\n\n\nwordLengthUDF = udf(wordLength, DoubleType())\nwordCountUDF = udf(wordCount, IntegerType())\n")),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.stages import UDFTransformer\n\nwordLength = "wordLength"\nwordCount = "wordCount"\nwordLengthTransformer = UDFTransformer(\n inputCol="text", outputCol=wordLength, udf=wordLengthUDF\n)\nwordCountTransformer = UDFTransformer(\n inputCol="text", outputCol=wordCount, udf=wordCountUDF\n)\n')),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml 
import Pipeline\n\ndata = (\n Pipeline(stages=[wordLengthTransformer, wordCountTransformer])\n .fit(rawData)\n .transform(rawData)\n .withColumn("label", rawData["rating"] > 3)\n .drop("rating")\n)\n')),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},"data.show(5)\n")),(0,o.kt)("h2",{id:"classify-using-pyspark"},"Classify using pyspark"),(0,o.kt)("p",null,"To choose the best LogisticRegression classifier using the ",(0,o.kt)("inlineCode",{parentName:"p"},"pyspark"),"\nlibrary, you need to ",(0,o.kt)("em",{parentName:"p"},"explicitly")," perform the following steps:"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},"Process the features:",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},"Tokenize the text column"),(0,o.kt)("li",{parentName:"ul"},"Hash the tokenized column into a vector using hashing"),(0,o.kt)("li",{parentName:"ul"},"Merge the numeric features with the vector"))),(0,o.kt)("li",{parentName:"ol"},"Process the label column: cast it into the proper type."),(0,o.kt)("li",{parentName:"ol"},"Train multiple LogisticRegression algorithms on the ",(0,o.kt)("inlineCode",{parentName:"li"},"train")," dataset\nwith different hyperparameters"),(0,o.kt)("li",{parentName:"ol"},"Compute the area under the ROC curve for each of the trained models\nand select the model with the highest metric as computed on the\n",(0,o.kt)("inlineCode",{parentName:"li"},"test")," dataset"),(0,o.kt)("li",{parentName:"ol"},"Evaluate the best model on the ",(0,o.kt)("inlineCode",{parentName:"li"},"validation")," set")),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.feature import Tokenizer, HashingTF\nfrom pyspark.ml.feature import VectorAssembler\n\n# Featurize text column\ntokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText")\nnumFeatures = 10000\nhashingScheme = HashingTF(\n inputCol="tokenizedText", outputCol="TextFeatures", numFeatures=numFeatures\n)\ntokenizedData = 
tokenizer.transform(data)\nfeaturizedData = hashingScheme.transform(tokenizedData)\n\n# Merge text and numeric features in one feature column\nfeatureColumnsArray = ["TextFeatures", "wordCount", "wordLength"]\nassembler = VectorAssembler(inputCols=featureColumnsArray, outputCol="features")\nassembledData = assembler.transform(featurizedData)\n\n# Select only columns of interest\n# Convert rating column from boolean to int\nprocessedData = assembledData.select("label", "features").withColumn(\n "label", assembledData.label.cast(IntegerType())\n)\n')),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.evaluation import BinaryClassificationEvaluator\nfrom pyspark.ml.classification import LogisticRegression\n\n# Prepare data for learning\ntrain, test, validation = processedData.randomSplit([0.60, 0.20, 0.20], seed=123)\n\n# Train the models on the \'train\' data\nlrHyperParams = [0.05, 0.1, 0.2, 0.4]\nlogisticRegressions = [\n LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n]\nevaluator = BinaryClassificationEvaluator(\n rawPredictionCol="rawPrediction", metricName="areaUnderROC"\n)\nmetrics = []\nmodels = []\n\n# Select the best model\nfor learner in logisticRegressions:\n model = learner.fit(train)\n models.append(model)\n scoredData = model.transform(test)\n metrics.append(evaluator.evaluate(scoredData))\nbestMetric = max(metrics)\nbestModel = models[metrics.index(bestMetric)]\n\n# Get AUC on the validation dataset\nscoredVal = bestModel.transform(validation)\nprint(evaluator.evaluate(scoredVal))\n')),(0,o.kt)("h2",{id:"classify-using-synapseml"},"Classify using SynapseML"),(0,o.kt)("p",null,"The pipeline can be simplified by using SynapseML:"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("strong",{parentName:"p"},(0,o.kt)("inlineCode",{parentName:"strong"},"TrainClassifier"))," Estimator featurizes the data internally,\nas long as the 
columns selected in the ",(0,o.kt)("inlineCode",{parentName:"p"},"train"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"test"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"validation"),"\ndataset represent the features")),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("strong",{parentName:"p"},(0,o.kt)("inlineCode",{parentName:"strong"},"FindBestModel"))," Estimator finds the best model from a pool of\ntrained models by finding the model that performs best on the ",(0,o.kt)("inlineCode",{parentName:"p"},"test"),"\ndataset given the specified metric")),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("strong",{parentName:"p"},(0,o.kt)("inlineCode",{parentName:"strong"},"ComputeModelStatistics"))," Transformer computes the different\nmetrics on a scored dataset (in our case, the ",(0,o.kt)("inlineCode",{parentName:"p"},"validation")," dataset)\nat the same time"))),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.train import TrainClassifier, ComputeModelStatistics\nfrom synapse.ml.automl import FindBestModel\n\n# Prepare data for learning\ntrain, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123)\n\n# Train the models on the \'train\' data\nlrHyperParams = [0.05, 0.1, 0.2, 0.4]\nlogisticRegressions = [\n LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n]\nlrmodels = [\n TrainClassifier(model=lrm, labelCol="label", numFeatures=10000).fit(train)\n for lrm in logisticRegressions\n]\n\n# Select the best model\nbestModel = FindBestModel(evaluationMetric="AUC", models=lrmodels).fit(test)\n\n\n# Get AUC on the validation dataset\npredictions = bestModel.transform(validation)\nmetrics = ComputeModelStatistics().transform(predictions)\nprint(\n "Best model\'s AUC on validation set = "\n + "{0:.2f}%".format(metrics.first()["AUC"] * 100)\n)\n')))}c.isMDXComponent=!0}}]); \ No newline at end of file diff --git 
a/assets/js/9c5088a2.5c1faf65.js b/assets/js/20e0fe38.5a07c469.js similarity index 96% rename from assets/js/9c5088a2.5c1faf65.js rename to assets/js/20e0fe38.5a07c469.js index 8c488569c3..86a5302f7a 100644 --- a/assets/js/9c5088a2.5c1faf65.js +++ b/assets/js/20e0fe38.5a07c469.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[7925],{3905:function(e,t,o){o.d(t,{Zo:function(){return c},kt:function(){return d}});var n=o(7294);function l(e,t,o){return t in e?Object.defineProperty(e,t,{value:o,enumerable:!0,configurable:!0,writable:!0}):e[t]=o,e}function r(e,t){var o=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),o.push.apply(o,n)}return o}function a(e){for(var t=1;t=0||(l[o]=e[o]);return l}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,o)&&(l[o]=e[o])}return l}var s=n.createContext({}),p=function(e){var t=n.useContext(s),o=t;return e&&(o="function"==typeof e?e(t):a(a({},t),e)),o},c=function(e){var t=p(e.components);return n.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},m=n.forwardRef((function(e,t){var o=e.components,l=e.mdxType,r=e.originalType,s=e.parentName,c=i(e,["components","mdxType","originalType","parentName"]),m=p(o),d=l,f=m["".concat(s,".").concat(d)]||m[d]||u[d]||r;return o?n.createElement(f,a(a({ref:t},c),{},{components:o})):n.createElement(f,a({ref:t},c))}));function d(e,t){var o=arguments,l=t&&t.mdxType;if("string"==typeof e||l){var r=o.length,a=new Array(r);a[0]=m;var i={};for(var s in t)hasOwnProperty.call(t,s)&&(i[s]=t[s]);i.originalType=e,i.mdxType="string"==typeof e?e:l,a[1]=i;for(var p=2;p@.blob.core.windows.net/PATH_TO_YOUR/log_model_allowlist.txt")),(0,r.kt)("li",{parentName:"ul"},"In 
Databricks ",(0,r.kt)("inlineCode",{parentName:"li"},"/dbfs/FileStore/PATH_TO_YOUR/log_model_allowlist.txt"),".")),(0,r.kt)("ol",{start:2},(0,r.kt)("li",{parentName:"ol"},"Set spark configuration ",(0,r.kt)("inlineCode",{parentName:"li"},"spark.mlflow.pysparkml.autolog.logModelAllowlistFile")," to the path of your ",(0,r.kt)("inlineCode",{parentName:"li"},"log_model_allowlist.txt")," file."),(0,r.kt)("li",{parentName:"ol"},"Call ",(0,r.kt)("inlineCode",{parentName:"li"},"mlflow.pyspark.ml.autolog()")," before your training code to enable autologging for all supported models.")),(0,r.kt)("p",null,"Note:"),(0,r.kt)("ol",null,(0,r.kt)("li",{parentName:"ol"},"If you want to support autologging of PySpark models not present in the log_model_allowlist file, you can add such models to the file."),(0,r.kt)("li",{parentName:"ol"},"If you've enabled autologging, then don't write explicit ",(0,r.kt)("inlineCode",{parentName:"li"},"with mlflow.start_run()")," as it might cause multiple runs for one single model or one run for multiple models.")),(0,r.kt)("h2",{id:"configuration-process-in-databricks-as-an-example"},"Configuration process in Databricks as an example"),(0,r.kt)("ol",null,(0,r.kt)("li",{parentName:"ol"},"Install latest MLflow via ",(0,r.kt)("inlineCode",{parentName:"li"},"%pip install mlflow")),(0,r.kt)("li",{parentName:"ol"},"Upload your customized ",(0,r.kt)("inlineCode",{parentName:"li"},"log_model_allowlist.txt")," file to dbfs by clicking File/Upload Data button on Databricks UI."),(0,r.kt)("li",{parentName:"ol"},"Set Cluster Spark configuration following ",(0,r.kt)("a",{parentName:"li",href:"https://docs.microsoft.com/en-us/azure/databricks/clusters/configure#spark-configuration"},"this documentation"))),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre"},"spark.mlflow.pysparkml.autolog.logModelAllowlistFile /dbfs/FileStore/PATH_TO_YOUR/log_model_allowlist.txt\n")),(0,r.kt)("ol",{start:4},(0,r.kt)("li",{parentName:"ol"},"Run the following line before 
your training code executes.")),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre"},"mlflow.pyspark.ml.autolog()\n")),(0,r.kt)("p",null,"You can customize how autologging works by supplying appropriate ",(0,r.kt)("a",{parentName:"p",href:"https://www.mlflow.org/docs/latest/python_api/mlflow.pyspark.ml.html#mlflow.pyspark.ml.autolog"},"parameters"),"."),(0,r.kt)("ol",{start:5},(0,r.kt)("li",{parentName:"ol"},"To find your experiment's results via the ",(0,r.kt)("inlineCode",{parentName:"li"},"Experiments")," tab of the MLFlow UI.",(0,r.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/adb_experiments.png",width:"1200"}))),(0,r.kt)("h2",{id:"example-for-conditionalknnmodel"},"Example for ConditionalKNNModel"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.linalg import Vectors\nfrom synapse.ml.nn import *\n\ndf = spark.createDataFrame([\n (Vectors.dense(2.0,2.0,2.0), "foo", 1),\n (Vectors.dense(2.0,2.0,4.0), "foo", 3),\n (Vectors.dense(2.0,2.0,6.0), "foo", 4),\n (Vectors.dense(2.0,2.0,8.0), "foo", 3),\n (Vectors.dense(2.0,2.0,10.0), "foo", 1),\n (Vectors.dense(2.0,2.0,12.0), "foo", 2),\n (Vectors.dense(2.0,2.0,14.0), "foo", 0),\n (Vectors.dense(2.0,2.0,16.0), "foo", 1),\n (Vectors.dense(2.0,2.0,18.0), "foo", 3),\n (Vectors.dense(2.0,2.0,20.0), "foo", 0),\n (Vectors.dense(2.0,4.0,2.0), "foo", 2),\n (Vectors.dense(2.0,4.0,4.0), "foo", 4),\n (Vectors.dense(2.0,4.0,6.0), "foo", 2),\n (Vectors.dense(2.0,4.0,8.0), "foo", 2),\n (Vectors.dense(2.0,4.0,10.0), "foo", 4),\n (Vectors.dense(2.0,4.0,12.0), "foo", 3),\n (Vectors.dense(2.0,4.0,14.0), "foo", 2),\n (Vectors.dense(2.0,4.0,16.0), "foo", 1),\n (Vectors.dense(2.0,4.0,18.0), "foo", 4),\n (Vectors.dense(2.0,4.0,20.0), "foo", 4)\n], ["features","values","labels"])\n\ncnn = (ConditionalKNN().setOutputCol("prediction"))\ncnnm = cnn.fit(df)\n\ntest_df = spark.createDataFrame([\n (Vectors.dense(2.0,2.0,2.0), "foo", 1, [0, 1]),\n (Vectors.dense(2.0,2.0,4.0), 
"foo", 4, [0, 1]),\n (Vectors.dense(2.0,2.0,6.0), "foo", 2, [0, 1]),\n (Vectors.dense(2.0,2.0,8.0), "foo", 4, [0, 1]),\n (Vectors.dense(2.0,2.0,10.0), "foo", 4, [0, 1])\n], ["features","values","labels","conditioner"])\n\ndisplay(cnnm.transform(test_df))\n')),(0,r.kt)("p",null,"This code should log one run with a ConditionalKNNModel artifact and its parameters."),(0,r.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/autologgingRunSample.png",width:"1200"}))}d.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1102],{3905:function(e,t,o){o.d(t,{Zo:function(){return c},kt:function(){return d}});var n=o(7294);function l(e,t,o){return t in e?Object.defineProperty(e,t,{value:o,enumerable:!0,configurable:!0,writable:!0}):e[t]=o,e}function r(e,t){var o=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),o.push.apply(o,n)}return o}function a(e){for(var t=1;t=0||(l[o]=e[o]);return l}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,o)&&(l[o]=e[o])}return l}var s=n.createContext({}),p=function(e){var t=n.useContext(s),o=t;return e&&(o="function"==typeof e?e(t):a(a({},t),e)),o},c=function(e){var t=p(e.components);return n.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},m=n.forwardRef((function(e,t){var o=e.components,l=e.mdxType,r=e.originalType,s=e.parentName,c=i(e,["components","mdxType","originalType","parentName"]),m=p(o),d=l,f=m["".concat(s,".").concat(d)]||m[d]||u[d]||r;return o?n.createElement(f,a(a({ref:t},c),{},{components:o})):n.createElement(f,a({ref:t},c))}));function d(e,t){var o=arguments,l=t&&t.mdxType;if("string"==typeof e||l){var r=o.length,a=new 
Array(r);a[0]=m;var i={};for(var s in t)hasOwnProperty.call(t,s)&&(i[s]=t[s]);i.originalType=e,i.mdxType="string"==typeof e?e:l,a[1]=i;for(var p=2;p@.blob.core.windows.net/PATH_TO_YOUR/log_model_allowlist.txt")),(0,r.kt)("li",{parentName:"ul"},"In Databricks ",(0,r.kt)("inlineCode",{parentName:"li"},"/dbfs/FileStore/PATH_TO_YOUR/log_model_allowlist.txt"),".")),(0,r.kt)("ol",{start:2},(0,r.kt)("li",{parentName:"ol"},"Set spark configuration ",(0,r.kt)("inlineCode",{parentName:"li"},"spark.mlflow.pysparkml.autolog.logModelAllowlistFile")," to the path of your ",(0,r.kt)("inlineCode",{parentName:"li"},"log_model_allowlist.txt")," file."),(0,r.kt)("li",{parentName:"ol"},"Call ",(0,r.kt)("inlineCode",{parentName:"li"},"mlflow.pyspark.ml.autolog()")," before your training code to enable autologging for all supported models.")),(0,r.kt)("p",null,"Note:"),(0,r.kt)("ol",null,(0,r.kt)("li",{parentName:"ol"},"If you want to support autologging of PySpark models not present in the log_model_allowlist file, you can add such models to the file."),(0,r.kt)("li",{parentName:"ol"},"If you've enabled autologging, then don't write explicit ",(0,r.kt)("inlineCode",{parentName:"li"},"with mlflow.start_run()")," as it might cause multiple runs for one single model or one run for multiple models.")),(0,r.kt)("h2",{id:"configuration-process-in-databricks-as-an-example"},"Configuration process in Databricks as an example"),(0,r.kt)("ol",null,(0,r.kt)("li",{parentName:"ol"},"Install latest MLflow via ",(0,r.kt)("inlineCode",{parentName:"li"},"%pip install mlflow")),(0,r.kt)("li",{parentName:"ol"},"Upload your customized ",(0,r.kt)("inlineCode",{parentName:"li"},"log_model_allowlist.txt")," file to dbfs by clicking File/Upload Data button on Databricks UI."),(0,r.kt)("li",{parentName:"ol"},"Set Cluster Spark configuration following ",(0,r.kt)("a",{parentName:"li",href:"https://docs.microsoft.com/en-us/azure/databricks/clusters/configure#spark-configuration"},"this 
documentation"))),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre"},"spark.mlflow.pysparkml.autolog.logModelAllowlistFile /dbfs/FileStore/PATH_TO_YOUR/log_model_allowlist.txt\n")),(0,r.kt)("ol",{start:4},(0,r.kt)("li",{parentName:"ol"},"Run the following line before your training code executes.")),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre"},"mlflow.pyspark.ml.autolog()\n")),(0,r.kt)("p",null,"You can customize how autologging works by supplying appropriate ",(0,r.kt)("a",{parentName:"p",href:"https://www.mlflow.org/docs/latest/python_api/mlflow.pyspark.ml.html#mlflow.pyspark.ml.autolog"},"parameters"),"."),(0,r.kt)("ol",{start:5},(0,r.kt)("li",{parentName:"ol"},"To find your experiment's results via the ",(0,r.kt)("inlineCode",{parentName:"li"},"Experiments")," tab of the MLFlow UI.",(0,r.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/adb_experiments.png",width:"1200"}))),(0,r.kt)("h2",{id:"example-for-conditionalknnmodel"},"Example for ConditionalKNNModel"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.linalg import Vectors\nfrom synapse.ml.nn import *\n\ndf = spark.createDataFrame([\n (Vectors.dense(2.0,2.0,2.0), "foo", 1),\n (Vectors.dense(2.0,2.0,4.0), "foo", 3),\n (Vectors.dense(2.0,2.0,6.0), "foo", 4),\n (Vectors.dense(2.0,2.0,8.0), "foo", 3),\n (Vectors.dense(2.0,2.0,10.0), "foo", 1),\n (Vectors.dense(2.0,2.0,12.0), "foo", 2),\n (Vectors.dense(2.0,2.0,14.0), "foo", 0),\n (Vectors.dense(2.0,2.0,16.0), "foo", 1),\n (Vectors.dense(2.0,2.0,18.0), "foo", 3),\n (Vectors.dense(2.0,2.0,20.0), "foo", 0),\n (Vectors.dense(2.0,4.0,2.0), "foo", 2),\n (Vectors.dense(2.0,4.0,4.0), "foo", 4),\n (Vectors.dense(2.0,4.0,6.0), "foo", 2),\n (Vectors.dense(2.0,4.0,8.0), "foo", 2),\n (Vectors.dense(2.0,4.0,10.0), "foo", 4),\n (Vectors.dense(2.0,4.0,12.0), "foo", 3),\n (Vectors.dense(2.0,4.0,14.0), "foo", 2),\n (Vectors.dense(2.0,4.0,16.0), "foo", 1),\n (Vectors.dense(2.0,4.0,18.0), "foo", 4),\n 
(Vectors.dense(2.0,4.0,20.0), "foo", 4)\n], ["features","values","labels"])\n\ncnn = (ConditionalKNN().setOutputCol("prediction"))\ncnnm = cnn.fit(df)\n\ntest_df = spark.createDataFrame([\n (Vectors.dense(2.0,2.0,2.0), "foo", 1, [0, 1]),\n (Vectors.dense(2.0,2.0,4.0), "foo", 4, [0, 1]),\n (Vectors.dense(2.0,2.0,6.0), "foo", 2, [0, 1]),\n (Vectors.dense(2.0,2.0,8.0), "foo", 4, [0, 1]),\n (Vectors.dense(2.0,2.0,10.0), "foo", 4, [0, 1])\n], ["features","values","labels","conditioner"])\n\ndisplay(cnnm.transform(test_df))\n')),(0,r.kt)("p",null,"This code should log one run with a ConditionalKNNModel artifact and its parameters."),(0,r.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/autologgingRunSample.png",width:"1200"}))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/b8963dc0.6f694e38.js b/assets/js/22f921cc.b4ab4462.js similarity index 97% rename from assets/js/b8963dc0.6f694e38.js rename to assets/js/22f921cc.b4ab4462.js index d25cfa89c4..28a84b91b3 100644 --- a/assets/js/b8963dc0.6f694e38.js +++ b/assets/js/22f921cc.b4ab4462.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[5287],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function s(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var i=a.createContext({}),u=function(e){var t=a.useContext(i),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},c=function(e){var 
t=u(e.components);return a.createElement(i.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},p=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,i=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),p=u(n),d=r,f=p["".concat(i,".").concat(d)]||p[d]||m[d]||o;return n?a.createElement(f,s(s({ref:t},c),{},{components:n})):a.createElement(f,s({ref:t},c))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,s=new Array(o);s[0]=p;var l={};for(var i in t)hasOwnProperty.call(t,i)&&(l[i]=t[i]);l.originalType=e,l.mdxType="string"==typeof e?e:r,s[1]=l;for(var u=2;u=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var i=a.createContext({}),u=function(e){var t=a.useContext(i),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},c=function(e){var t=u(e.components);return a.createElement(i.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},p=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,i=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),p=u(n),d=r,f=p["".concat(i,".").concat(d)]||p[d]||m[d]||o;return n?a.createElement(f,s(s({ref:t},c),{},{components:n})):a.createElement(f,s({ref:t},c))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,s=new Array(o);s[0]=p;var l={};for(var i in t)hasOwnProperty.call(t,i)&&(l[i]=t[i]);l.originalType=e,l.mdxType="string"==typeof e?e:r,s[1]=l;for(var u=2;u=0||(o[t]=e[t]);return o}(e,n);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(o[t]=e[t])}return o}var 
l=r.createContext({}),p=function(e){var n=r.useContext(l),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},m=function(e){var n=p(e.components);return r.createElement(l.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},u=r.forwardRef((function(e,n){var t=e.components,o=e.mdxType,a=e.originalType,l=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),u=p(t),d=o,f=u["".concat(l,".").concat(d)]||u[d]||c[d]||a;return t?r.createElement(f,s(s({ref:n},m),{},{components:t})):r.createElement(f,s({ref:n},m))}));function d(e,n){var t=arguments,o=n&&n.mdxType;if("string"==typeof e||o){var a=t.length,s=new Array(a);s[0]=u;var i={};for(var l in n)hasOwnProperty.call(n,l)&&(i[l]=n[l]);i.originalType=e,i.mdxType="string"==typeof e?e:o,s[1]=i;for(var p=2;p" + c + "" for c in cols])\n\n style = """\n\n\n\n\n"""\n\n table = []\n for row in rows:\n table.append("")\n for col in cols:\n if col in image_cols:\n rep = \'\'.format(row[col])\n else:\n rep = row[col]\n table.append("{}".format(rep))\n table.append("")\n tableHTML = "".join(table)\n\n body = """\n\n\n \n {} \n \n {}\n
\n\n\n """.format(\n header, tableHTML\n )\n try:\n if running_on_databricks():\n displayHTML(style + body)\n else:\n import IPython\n\n IPython.display.HTML(style + body)\n except:\n pass\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'snowLeopardQueries = ["snow leopard"]\nsnowLeopardUrls = bingPhotoSearch("snow leopard", snowLeopardQueries, pages=100)\ndisplayDF(snowLeopardUrls)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'randomWords = spark.read.parquet(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet"\n).cache()\nrandomWords.show()\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'randomLinks = (\n randomWords.mlTransform(\n BingImageSearch()\n .setSubscriptionKey(bing_search_key)\n .setCount(10)\n .setQueryCol("words")\n .setOutputCol("images")\n )\n .mlTransform(BingImageSearch.getUrlTransformer("images", "urls"))\n .withColumn("label", lit("other"))\n .limit(400)\n)\n\ndisplayDF(randomLinks)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'images = (\n snowLeopardUrls.union(randomLinks)\n .distinct()\n .repartition(100)\n .mlTransform(\n BingImageSearch.downloadFromUrls("urls", "image", concurrency=5, timeout=5000)\n )\n .dropna()\n)\n\ntrain, test = images.randomSplit([0.7, 0.3], seed=1)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml import Pipeline\nfrom pyspark.ml.feature import StringIndexer\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql.functions import udf\nfrom synapse.ml.onnx import ImageFeaturizer\nfrom synapse.ml.stages import UDFTransformer\nfrom pyspark.sql.types import *\n\n\ndef getIndex(row):\n return float(row[1])\n\n\nmodel = Pipeline(\n stages=[\n StringIndexer(inputCol="labels", outputCol="index"),\n ImageFeaturizer(\n inputCol="image",\n outputCol="features",\n 
autoConvertToColor=True,\n ignoreDecodingErrors=True,\n ).setModel("ResNet50"),\n LogisticRegression(maxIter=5, labelCol="index", regParam=10.0),\n UDFTransformer()\n .setUDF(udf(getIndex, DoubleType()))\n .setInputCol("probability")\n .setOutputCol("leopard_prob"),\n ]\n)\n\nfitModel = model.fit(train)\n')),(0,a.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/SLPipeline.PNG",width:"900"}),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'def plotConfusionMatrix(df, label, prediction, classLabels):\n from synapse.ml.plot import confusionMatrix\n import matplotlib.pyplot as plt\n\n fig = plt.figure(figsize=(4.5, 4.5))\n confusionMatrix(df, label, prediction, classLabels)\n display(fig)\n\n\nif not running_on_synapse():\n plotConfusionMatrix(\n fitModel.transform(test), "index", "prediction", fitModel.stages[0].labels\n )\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'import urllib.request\nfrom synapse.ml.explainers import ImageLIME\n\ntest_image_url = (\n "https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg"\n)\nwith urllib.request.urlopen(test_image_url) as url:\n barr = url.read()\ntest_subsample = spark.createDataFrame([(bytearray(barr),)], ["image"])\n\nlime = (\n ImageLIME()\n .setModel(fitModel)\n .setTargetCol("leopard_prob")\n .setOutputCol("weights")\n .setInputCol("image")\n .setCellSize(100.0)\n .setModifier(50.0)\n .setNumSamples(300)\n)\n\nresult = lime.transform(test_subsample)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre",className:"language-python"},'import matplotlib.pyplot as plt\nimport PIL, io, numpy as np\n\n\ndef plot_superpixels(row):\n image_bytes = row["image"]\n superpixels = row["superpixels"]["clusters"]\n weights = list(row["weights"][0])\n mean_weight = np.percentile(weights, 90)\n img = (PIL.Image.open(io.BytesIO(image_bytes))).convert("RGBA")\n image_array = np.asarray(img).copy()\n for 
(sp, w) in zip(superpixels, weights):\n if w > mean_weight:\n for (x, y) in sp:\n image_array[y, x, 1] = 255\n image_array[y, x, 3] = 200\n plt.clf()\n plt.imshow(image_array)\n display()\n\n\n# Gets first row from the LIME-transformed data frame\nif not running_on_synapse():\n plot_superpixels(result.take(1)[0])\n')),(0,a.kt)("h3",{id:"your-results-will-look-like"},"Your results will look like:"),(0,a.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/lime_results.png",width:"900"}))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/295a8e78.cc742d78.js b/assets/js/295a8e78.cc742d78.js new file mode 100644 index 0000000000..9e03b02ad1 --- /dev/null +++ b/assets/js/295a8e78.cc742d78.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6021],{3905:function(t,e,n){n.d(e,{Zo:function(){return p},kt:function(){return m}});var r=n(7294);function a(t,e,n){return e in t?Object.defineProperty(t,e,{value:n,enumerable:!0,configurable:!0,writable:!0}):t[e]=n,t}function i(t,e){var n=Object.keys(t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(t);e&&(r=r.filter((function(e){return Object.getOwnPropertyDescriptor(t,e).enumerable}))),n.push.apply(n,r)}return n}function o(t){for(var e=1;e=0||(a[n]=t[n]);return a}(t,e);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(t);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(t,n)&&(a[n]=t[n])}return a}var l=r.createContext({}),c=function(t){var e=r.useContext(l),n=e;return t&&(n="function"==typeof t?t(e):o(o({},e),t)),n},p=function(t){var e=c(t.components);return r.createElement(l.Provider,{value:e},t.children)},u={inlineCode:"code",wrapper:function(t){var e=t.children;return r.createElement(r.Fragment,{},e)}},d=r.forwardRef((function(t,e){var 
n=t.components,a=t.mdxType,i=t.originalType,l=t.parentName,p=s(t,["components","mdxType","originalType","parentName"]),d=c(n),m=a,f=d["".concat(l,".").concat(m)]||d[m]||u[m]||i;return n?r.createElement(f,o(o({ref:e},p),{},{components:n})):r.createElement(f,o({ref:e},p))}));function m(t,e){var n=arguments,a=e&&e.mdxType;if("string"==typeof t||a){var i=n.length,o=new Array(i);o[0]=d;var s={};for(var l in e)hasOwnProperty.call(e,l)&&(s[l]=e[l]);s.originalType=t,s.mdxType="string"==typeof t?t:a,o[1]=s;for(var c=2;c=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=a.createContext({}),p=function(e){var t=a.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},m=function(e){var t=p(e.components);return a.createElement(s.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},u=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,s=e.parentName,m=l(e,["components","mdxType","originalType","parentName"]),u=p(n),d=r,g=u["".concat(s,".").concat(d)]||u[d]||c[d]||i;return n?a.createElement(g,o(o({ref:t},m),{},{components:n})):a.createElement(g,o({ref:t},m))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,o=new Array(i);o[0]=u;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:r,o[1]=l;for(var p=2;p=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=a.createContext({}),p=function(e){var t=a.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},m=function(e){var t=p(e.components);return 
a.createElement(s.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},u=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,s=e.parentName,m=l(e,["components","mdxType","originalType","parentName"]),u=p(n),d=r,g=u["".concat(s,".").concat(d)]||u[d]||c[d]||i;return n?a.createElement(g,o(o({ref:t},m),{},{components:n})):a.createElement(g,o({ref:t},m))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,o=new Array(i);o[0]=u;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:r,o[1]=l;for(var p=2;p=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var l=a.createContext({}),p=function(e){var t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},m=function(e){var t=p(e.components);return a.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,m=o(e,["components","mdxType","originalType","parentName"]),d=p(n),c=r,k=d["".concat(l,".").concat(c)]||d[c]||u[c]||i;return n?a.createElement(k,s(s({ref:t},m),{},{components:n})):a.createElement(k,s({ref:t},m))}));function c(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,s=new Array(i);s[0]=d;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var l=a.createContext({}),p=function(e){var 
t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},m=function(e){var t=p(e.components);return a.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,m=o(e,["components","mdxType","originalType","parentName"]),d=p(n),c=r,k=d["".concat(l,".").concat(c)]||d[c]||u[c]||i;return n?a.createElement(k,s(s({ref:t},m),{},{components:n})):a.createElement(k,s({ref:t},m))}));function c(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,s=new Array(i);s[0]=d;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var p=n.createContext({}),m=function(e){var t=n.useContext(p),a=t;return e&&(a="function"==typeof e?e(t):i(i({},t),e)),a},d=function(e){var t=m(e.components);return n.createElement(p.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},s=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,l=e.originalType,p=e.parentName,d=o(e,["components","mdxType","originalType","parentName"]),s=m(a),k=r,g=s["".concat(p,".").concat(k)]||s[k]||u[k]||l;return a?n.createElement(g,i(i({ref:t},d),{},{components:a})):n.createElement(g,i({ref:t},d))}));function k(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var l=a.length,i=new Array(l);i[0]=s;var o={};for(var p in t)hasOwnProperty.call(t,p)&&(o[p]=t[p]);o.originalType=e,o.mdxType="string"==typeof e?e:r,i[1]=o;for(var 
m=2;mTabularLIME",id:"tabularlime",level:3},{value:"TabularSHAP",id:"tabularshap",level:3},{value:"VectorLIME",id:"vectorlime",level:3},{value:"VectorSHAP",id:"vectorshap",level:3},{value:"ImageLIME",id:"imagelime",level:3},{value:"ImageSHAP",id:"imageshap",level:3},{value:"TextLIME",id:"textlime",level:3},{value:"TextSHAP",id:"textshap",level:3},{value:"Result interpretation",id:"result-interpretation",level:2},{value:"LIME explainers",id:"lime-explainers",level:3},{value:"SHAP explainers",id:"shap-explainers",level:3},{value:"Base value",id:"base-value",level:4}],s={toc:u};function k(e){var t=e.components,a=(0,r.Z)(e,i);return(0,l.kt)("wrapper",(0,n.Z)({},s,a,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h1",{id:"model-interpretation-on-spark"},"Model Interpretation on Spark"),(0,l.kt)("h2",{id:"interpretable-machine-learning"},"Interpretable Machine Learning"),(0,l.kt)("p",null,"Interpretable Machine Learning helps developers, data scientists and business stakeholders in the organization gain a comprehensive understanding of their machine learning models. It can also be used to debug models, explain predictions and enable auditing to meet compliance with regulatory requirements."),(0,l.kt)("h2",{id:"why-run-model-interpretation-on-spark"},"Why run model interpretation on Spark"),(0,l.kt)("p",null,"Model-agnostic interpretation methods can be computationally expensive due to the multiple evaluations needed to compute the explanations. Model interpretation on Spark enables users to interpret a black-box model at massive scales with the Apache Spark\u2122 distributed computing ecosystem. 
Various components support local interpretation for tabular, vector, image and text classification models, with two popular model-agnostic interpretation methods: ",(0,l.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/1602.04938"},"LIME")," and ",(0,l.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/1705.07874"},"Kernel SHAP"),"."),(0,l.kt)("h2",{id:"usage"},"Usage"),(0,l.kt)("p",null,"Both LIME and Kernel SHAP are local interpretation methods. Local interpretation explains why does the model predict certain outcome for a given observation."),(0,l.kt)("p",null,"Both explainers extends from ",(0,l.kt)("inlineCode",{parentName:"p"},"org.apache.spark.ml.Transformer"),". After setting up the explainer parameters, simply call the ",(0,l.kt)("inlineCode",{parentName:"p"},"transform")," function on a ",(0,l.kt)("inlineCode",{parentName:"p"},"DataFrame")," of observations to interpret the model behavior on these observations."),(0,l.kt)("p",null,"To see examples of model interpretability on Spark in action, take a look at these sample notebooks:"),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Tabular%20Explainers"},"Tabular Explainers")),(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Image%20Explainers"},"Image Explainers")),(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Text%20Explainers"},"Text Explainers"))),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null}),(0,l.kt)("th",{parentName:"tr",align:null},"Tabular models"),(0,l.kt)("th",{parentName:"tr",align:null},"Vector models"),(0,l.kt)("th",{parentName:"tr",align:null},"Image models"),(0,l.kt)("th",{parentName:"tr",align:null},"Text models"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"LIME 
explainers"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#tabularlime"},"TabularLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#vectorlime"},"VectorLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#imagelime"},"ImageLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#textlime"},"TextLIME"))),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"Kernel SHAP explainers"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#tabularshap"},"TabularSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#vectorshap"},"VectorSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#imageshap"},"ImageSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#textshap"},"TextSHAP"))))),(0,l.kt)("h3",{id:"common-local-explainer-params"},"Common local explainer params"),(0,l.kt)("p",null,"All local explainers support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"probability"'),(0,l.kt)("td",{parentName:"tr",align:null},'The column name of the prediction target to explain (i.e. the response variable). 
This is usually set to "prediction" for regression models and "probability" for probabilistic classification models.')),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetClasses"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[Int]")),(0,l.kt)("td",{parentName:"tr",align:null},"empty array"),(0,l.kt)("td",{parentName:"tr",align:null},"The indices of the classes for multinomial classification models.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetClassesCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The name of the column that specifies the indices of the classes for multinomial classification models.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"outputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The name of the output column for interpretation results.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"model"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Transformer")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The model to be explained.")))),(0,l.kt)("h3",{id:"common-lime-explainer-params"},"Common LIME explainer params"),(0,l.kt)("p",null,"All LIME based explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularlime"},"TabularLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorlime"},"VectorLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#imagelime"},"ImageLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#textlime"},"TextLIME"),") support the following 
params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"regularization"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0"),(0,l.kt)("td",{parentName:"tr",align:null},"Regularization param for the underlying lasso regression.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"kernelWidth"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"sqrt(number of features) * 0.75"),(0,l.kt)("td",{parentName:"tr",align:null},"Kernel width for the exponential kernel.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"numSamples"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Int")),(0,l.kt)("td",{parentName:"tr",align:null},"1000"),(0,l.kt)("td",{parentName:"tr",align:null},"Number of samples to generate.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"metricsCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"r2"'),(0,l.kt)("td",{parentName:"tr",align:null},"Column name for fitting metrics.")))),(0,l.kt)("h3",{id:"common-shap-explainer-params"},"Common SHAP explainer params"),(0,l.kt)("p",null,"All Kernel SHAP based explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularshap"},"TabularSHAP"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorshap"},"VectorSHAP"),", 
",(0,l.kt)("a",{parentName:"p",href:"#imageshap"},"ImageSHAP"),", ",(0,l.kt)("a",{parentName:"p",href:"#textshap"},"TextSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"infWeight"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"1E8"),(0,l.kt)("td",{parentName:"tr",align:null},"The double value to represent infinite weight.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"numSamples"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Int")),(0,l.kt)("td",{parentName:"tr",align:null},"2 * (number of features) + 2048"),(0,l.kt)("td",{parentName:"tr",align:null},"Number of samples to generate.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"metricsCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"r2"'),(0,l.kt)("td",{parentName:"tr",align:null},"Column name for fitting metrics.")))),(0,l.kt)("h3",{id:"tabular-model-explainer-params"},"Tabular model explainer params"),(0,l.kt)("p",null,"All tabular model explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularlime"},"TabularLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#tabularshap"},"TabularSHAP"),") support the following 
params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCols"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[String]")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input columns to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"backgroundData"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"DataFrame")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"A dataframe containing background data. 
It must contain all the input columns needed by the black-box model.")))),(0,l.kt)("h3",{id:"vector-model-explainer-params"},"Vector model explainer params"),(0,l.kt)("p",null,"All vector model explainers (",(0,l.kt)("a",{parentName:"p",href:"#vectorlime"},"VectorLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorshap"},"VectorSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input vector column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"backgroundData"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"DataFrame")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"A dataframe containing background data. 
It must contain the input vector column needed by the black-box model.")))),(0,l.kt)("h3",{id:"image-model-explainer-params"},"Image model explainer params"),(0,l.kt)("p",null,"All image model explainers (",(0,l.kt)("a",{parentName:"p",href:"#imagelime"},"ImageLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#imageshap"},"ImageSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input image column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"cellSize"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"16"),(0,l.kt)("td",{parentName:"tr",align:null},"Number that controls the size of the super-pixels.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"modifier"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"130"),(0,l.kt)("td",{parentName:"tr",align:null},"Controls the trade-off spatial and color distance of 
super-pixels.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"superpixelCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"superpixels"'),(0,l.kt)("td",{parentName:"tr",align:null},"The column holding the super-pixel decompositions.")))),(0,l.kt)("h3",{id:"text-model-explainer-params"},"Text model explainer params"),(0,l.kt)("p",null,"All text model explainers (",(0,l.kt)("a",{parentName:"p",href:"#textlime"},"TextLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#textshap"},"TextSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input text column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"tokensCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"tokens"'),(0,l.kt)("td",{parentName:"tr",align:null},"The column holding the text 
tokens.")))),(0,l.kt)("h3",{id:"tabularlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"TabularLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"categoricalFeatures"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[String]")),(0,l.kt)("td",{parentName:"tr",align:null},"empty array"),(0,l.kt)("td",{parentName:"tr",align:null},"The name of columns that should be treated as categorical variables.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},"For categorical features, ",(0,l.kt)("inlineCode",{parentName:"p"},"TabularLIME")," creates new samples by drawing samples based on the value distribution from the background dataset. 
For numerical features, it creates new samples by drawing from a normal distribution with mean taken from the target value to be explained, and standard deviation taken from the background dataset.")),(0,l.kt)("h3",{id:"tabularshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"TabularSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"vectorlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"VectorLIME")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"VectorLIME")," assumes all features are numerical, and categorical features are not supported in ",(0,l.kt)("inlineCode",{parentName:"p"},"VectorLIME"),".")),(0,l.kt)("h3",{id:"vectorshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"VectorSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"imagelime"},(0,l.kt)("inlineCode",{parentName:"h3"},"ImageLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"samplingFraction"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0.7"),(0,l.kt)("td",{parentName:"tr",align:null},"The fraction of super-pixels to keep on during sampling.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"ImageLIME")," creates new samples by randomly turning super-pixels on or off with probability of keeping on set to 
",(0,l.kt)("inlineCode",{parentName:"p"},"SamplingFraction"),".")),(0,l.kt)("h3",{id:"imageshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"ImageSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"textlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"TextLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"samplingFraction"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0.7"),(0,l.kt)("td",{parentName:"tr",align:null},"The fraction of word tokens to keep on during sampling.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"TextLIME")," creates new samples by randomly turning word tokens on or off with probability of keeping on set to ",(0,l.kt)("inlineCode",{parentName:"p"},"SamplingFraction"),".")),(0,l.kt)("h3",{id:"textshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"TextSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h2",{id:"result-interpretation"},"Result interpretation"),(0,l.kt)("h3",{id:"lime-explainers"},"LIME explainers"),(0,l.kt)("p",null,"LIME explainers return an array of vectors, and each vector maps to a class being explained. 
Each component of the vector is the coefficient for the corresponding feature, super-pixel, or word token from the local surrogate model."),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},"For categorical variables, super-pixels, or word tokens, the coefficient shows the average change in model outcome if this feature is unknown to the model, if the super-pixel is replaced with background color (black), or if the word token is replaced with empty string."),(0,l.kt)("li",{parentName:"ul"},"For numeric variables, the coefficient shows the change in model outcome if the feature value is incremented by 1 unit.")),(0,l.kt)("h3",{id:"shap-explainers"},"SHAP explainers"),(0,l.kt)("p",null,"SHAP explainers return an array of vectors, and each vector maps to a class being explained. Each vector starts with the ",(0,l.kt)("a",{parentName:"p",href:"#base-value"},"base value"),", and each following component of the vector is the Shapley value for each feature, super-pixel, or token."),(0,l.kt)("p",null,"The base value and Shapley values are additive, and they should add up to the model output for the target observation."),(0,l.kt)("h4",{id:"base-value"},"Base value"),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},"For tabular and vector models, the base value represents the mean outcome of the model for the background dataset."),(0,l.kt)("li",{parentName:"ul"},"For image models, the base value represents the model outcome for a background (all black) image."),(0,l.kt)("li",{parentName:"ul"},"For text models, the base value represents the model outcome for an empty string.")))}k.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2428],{3905:function(e,t,a){a.d(t,{Zo:function(){return d},kt:function(){return k}});var n=a(7294);function r(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function l(e,t){var 
a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function i(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var p=n.createContext({}),m=function(e){var t=n.useContext(p),a=t;return e&&(a="function"==typeof e?e(t):i(i({},t),e)),a},d=function(e){var t=m(e.components);return n.createElement(p.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},s=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,l=e.originalType,p=e.parentName,d=o(e,["components","mdxType","originalType","parentName"]),s=m(a),k=r,g=s["".concat(p,".").concat(k)]||s[k]||u[k]||l;return a?n.createElement(g,i(i({ref:t},d),{},{components:a})):n.createElement(g,i({ref:t},d))}));function k(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var l=a.length,i=new Array(l);i[0]=s;var o={};for(var p in t)hasOwnProperty.call(t,p)&&(o[p]=t[p]);o.originalType=e,o.mdxType="string"==typeof e?e:r,i[1]=o;for(var m=2;mTabularLIME",id:"tabularlime",level:3},{value:"TabularSHAP",id:"tabularshap",level:3},{value:"VectorLIME",id:"vectorlime",level:3},{value:"VectorSHAP",id:"vectorshap",level:3},{value:"ImageLIME",id:"imagelime",level:3},{value:"ImageSHAP",id:"imageshap",level:3},{value:"TextLIME",id:"textlime",level:3},{value:"TextSHAP",id:"textshap",level:3},{value:"Result interpretation",id:"result-interpretation",level:2},{value:"LIME explainers",id:"lime-explainers",level:3},{value:"SHAP explainers",id:"shap-explainers",level:3},{value:"Base value",id:"base-value",level:4}],s={toc:u};function k(e){var 
t=e.components,a=(0,r.Z)(e,i);return(0,l.kt)("wrapper",(0,n.Z)({},s,a,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h1",{id:"model-interpretation-on-spark"},"Model Interpretation on Spark"),(0,l.kt)("h2",{id:"interpretable-machine-learning"},"Interpretable Machine Learning"),(0,l.kt)("p",null,"Interpretable Machine Learning helps developers, data scientists and business stakeholders in the organization gain a comprehensive understanding of their machine learning models. It can also be used to debug models, explain predictions and enable auditing to meet compliance with regulatory requirements."),(0,l.kt)("h2",{id:"why-run-model-interpretation-on-spark"},"Why run model interpretation on Spark"),(0,l.kt)("p",null,"Model-agnostic interpretation methods can be computationally expensive due to the multiple evaluations needed to compute the explanations. Model interpretation on Spark enables users to interpret a black-box model at massive scales with the Apache Spark\u2122 distributed computing ecosystem. Various components support local interpretation for tabular, vector, image and text classification models, with two popular model-agnostic interpretation methods: ",(0,l.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/1602.04938"},"LIME")," and ",(0,l.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/1705.07874"},"Kernel SHAP"),"."),(0,l.kt)("h2",{id:"usage"},"Usage"),(0,l.kt)("p",null,"Both LIME and Kernel SHAP are local interpretation methods. Local interpretation explains why does the model predict certain outcome for a given observation."),(0,l.kt)("p",null,"Both explainers extends from ",(0,l.kt)("inlineCode",{parentName:"p"},"org.apache.spark.ml.Transformer"),". 
After setting up the explainer parameters, simply call the ",(0,l.kt)("inlineCode",{parentName:"p"},"transform")," function on a ",(0,l.kt)("inlineCode",{parentName:"p"},"DataFrame")," of observations to interpret the model behavior on these observations."),(0,l.kt)("p",null,"To see examples of model interpretability on Spark in action, take a look at these sample notebooks:"),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Tabular%20Explainers"},"Tabular Explainers")),(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Image%20Explainers"},"Image Explainers")),(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Text%20Explainers"},"Text Explainers"))),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null}),(0,l.kt)("th",{parentName:"tr",align:null},"Tabular models"),(0,l.kt)("th",{parentName:"tr",align:null},"Vector models"),(0,l.kt)("th",{parentName:"tr",align:null},"Image models"),(0,l.kt)("th",{parentName:"tr",align:null},"Text models"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"LIME explainers"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#tabularlime"},"TabularLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#vectorlime"},"VectorLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#imagelime"},"ImageLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#textlime"},"TextLIME"))),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"Kernel SHAP 
explainers"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#tabularshap"},"TabularSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#vectorshap"},"VectorSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#imageshap"},"ImageSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#textshap"},"TextSHAP"))))),(0,l.kt)("h3",{id:"common-local-explainer-params"},"Common local explainer params"),(0,l.kt)("p",null,"All local explainers support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"probability"'),(0,l.kt)("td",{parentName:"tr",align:null},'The column name of the prediction target to explain (i.e. the response variable). 
This is usually set to "prediction" for regression models and "probability" for probabilistic classification models.')),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetClasses"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[Int]")),(0,l.kt)("td",{parentName:"tr",align:null},"empty array"),(0,l.kt)("td",{parentName:"tr",align:null},"The indices of the classes for multinomial classification models.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetClassesCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The name of the column that specifies the indices of the classes for multinomial classification models.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"outputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The name of the output column for interpretation results.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"model"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Transformer")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The model to be explained.")))),(0,l.kt)("h3",{id:"common-lime-explainer-params"},"Common LIME explainer params"),(0,l.kt)("p",null,"All LIME based explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularlime"},"TabularLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorlime"},"VectorLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#imagelime"},"ImageLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#textlime"},"TextLIME"),") support the following 
params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"regularization"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0"),(0,l.kt)("td",{parentName:"tr",align:null},"Regularization param for the underlying lasso regression.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"kernelWidth"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"sqrt(number of features) * 0.75"),(0,l.kt)("td",{parentName:"tr",align:null},"Kernel width for the exponential kernel.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"numSamples"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Int")),(0,l.kt)("td",{parentName:"tr",align:null},"1000"),(0,l.kt)("td",{parentName:"tr",align:null},"Number of samples to generate.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"metricsCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"r2"'),(0,l.kt)("td",{parentName:"tr",align:null},"Column name for fitting metrics.")))),(0,l.kt)("h3",{id:"common-shap-explainer-params"},"Common SHAP explainer params"),(0,l.kt)("p",null,"All Kernel SHAP based explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularshap"},"TabularSHAP"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorshap"},"VectorSHAP"),", 
",(0,l.kt)("a",{parentName:"p",href:"#imageshap"},"ImageSHAP"),", ",(0,l.kt)("a",{parentName:"p",href:"#textshap"},"TextSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"infWeight"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"1E8"),(0,l.kt)("td",{parentName:"tr",align:null},"The double value to represent infinite weight.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"numSamples"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Int")),(0,l.kt)("td",{parentName:"tr",align:null},"2 * (number of features) + 2048"),(0,l.kt)("td",{parentName:"tr",align:null},"Number of samples to generate.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"metricsCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"r2"'),(0,l.kt)("td",{parentName:"tr",align:null},"Column name for fitting metrics.")))),(0,l.kt)("h3",{id:"tabular-model-explainer-params"},"Tabular model explainer params"),(0,l.kt)("p",null,"All tabular model explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularlime"},"TabularLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#tabularshap"},"TabularSHAP"),") support the following 
params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCols"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[String]")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input columns to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"backgroundData"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"DataFrame")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"A dataframe containing background data. 
It must contain all the input columns needed by the black-box model.")))),(0,l.kt)("h3",{id:"vector-model-explainer-params"},"Vector model explainer params"),(0,l.kt)("p",null,"All vector model explainers (",(0,l.kt)("a",{parentName:"p",href:"#vectorlime"},"VectorLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorshap"},"VectorSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input vector column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"backgroundData"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"DataFrame")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"A dataframe containing background data. 
It must contain the input vector column needed by the black-box model.")))),(0,l.kt)("h3",{id:"image-model-explainer-params"},"Image model explainer params"),(0,l.kt)("p",null,"All image model explainers (",(0,l.kt)("a",{parentName:"p",href:"#imagelime"},"ImageLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#imageshap"},"ImageSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input image column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"cellSize"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"16"),(0,l.kt)("td",{parentName:"tr",align:null},"Number that controls the size of the super-pixels.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"modifier"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"130"),(0,l.kt)("td",{parentName:"tr",align:null},"Controls the trade-off spatial and color distance of 
super-pixels.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"superpixelCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"superpixels"'),(0,l.kt)("td",{parentName:"tr",align:null},"The column holding the super-pixel decompositions.")))),(0,l.kt)("h3",{id:"text-model-explainer-params"},"Text model explainer params"),(0,l.kt)("p",null,"All text model explainers (",(0,l.kt)("a",{parentName:"p",href:"#textlime"},"TextLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#textshap"},"TextSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input text column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"tokensCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"tokens"'),(0,l.kt)("td",{parentName:"tr",align:null},"The column holding the text 
tokens.")))),(0,l.kt)("h3",{id:"tabularlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"TabularLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"categoricalFeatures"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[String]")),(0,l.kt)("td",{parentName:"tr",align:null},"empty array"),(0,l.kt)("td",{parentName:"tr",align:null},"The name of columns that should be treated as categorical variables.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},"For categorical features, ",(0,l.kt)("inlineCode",{parentName:"p"},"TabularLIME")," creates new samples by drawing samples based on the value distribution from the background dataset. 
For numerical features, it creates new samples by drawing from a normal distribution with mean taken from the target value to be explained, and standard deviation taken from the background dataset.")),(0,l.kt)("h3",{id:"tabularshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"TabularSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"vectorlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"VectorLIME")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"VectorLIME")," assumes all features are numerical, and categorical features are not supported in ",(0,l.kt)("inlineCode",{parentName:"p"},"VectorLIME"),".")),(0,l.kt)("h3",{id:"vectorshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"VectorSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"imagelime"},(0,l.kt)("inlineCode",{parentName:"h3"},"ImageLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"samplingFraction"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0.7"),(0,l.kt)("td",{parentName:"tr",align:null},"The fraction of super-pixels to keep on during sampling.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"ImageLIME")," creates new samples by randomly turning super-pixels on or off with probability of keeping on set to 
",(0,l.kt)("inlineCode",{parentName:"p"},"SamplingFraction"),".")),(0,l.kt)("h3",{id:"imageshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"ImageSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"textlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"TextLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"samplingFraction"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0.7"),(0,l.kt)("td",{parentName:"tr",align:null},"The fraction of word tokens to keep on during sampling.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"TextLIME")," creates new samples by randomly turning word tokens on or off with probability of keeping on set to ",(0,l.kt)("inlineCode",{parentName:"p"},"SamplingFraction"),".")),(0,l.kt)("h3",{id:"textshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"TextSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h2",{id:"result-interpretation"},"Result interpretation"),(0,l.kt)("h3",{id:"lime-explainers"},"LIME explainers"),(0,l.kt)("p",null,"LIME explainers return an array of vectors, and each vector maps to a class being explained. 
Each component of the vector is the coefficient for the corresponding feature, super-pixel, or word token from the local surrogate model."),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},"For categorical variables, super-pixels, or word tokens, the coefficient shows the average change in model outcome if this feature is unknown to the model, if the super-pixel is replaced with background color (black), or if the word token is replaced with empty string."),(0,l.kt)("li",{parentName:"ul"},"For numeric variables, the coefficient shows the change in model outcome if the feature value is incremented by 1 unit.")),(0,l.kt)("h3",{id:"shap-explainers"},"SHAP explainers"),(0,l.kt)("p",null,"SHAP explainers return an array of vectors, and each vector maps to a class being explained. Each vector starts with the ",(0,l.kt)("a",{parentName:"p",href:"#base-value"},"base value"),", and each following component of the vector is the Shapley value for each feature, super-pixel, or token."),(0,l.kt)("p",null,"The base value and Shapley values are additive, and they should add up to the model output for the target observation."),(0,l.kt)("h4",{id:"base-value"},"Base value"),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},"For tabular and vector models, the base value represents the mean outcome of the model for the background dataset."),(0,l.kt)("li",{parentName:"ul"},"For image models, the base value represents the model outcome for a background (all black) image."),(0,l.kt)("li",{parentName:"ul"},"For text models, the base value represents the model outcome for an empty string.")))}k.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/325e22bd.73aba1fd.js b/assets/js/325e22bd.5135cb71.js similarity index 96% rename from assets/js/325e22bd.73aba1fd.js rename to assets/js/325e22bd.5135cb71.js index ad6cf377b9..f9f1408acd 100644 --- a/assets/js/325e22bd.73aba1fd.js +++ b/assets/js/325e22bd.5135cb71.js @@ -1 +1 @@ -"use 
strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2321],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function l(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var u=a.createContext({}),s=function(e){var t=a.useContext(u),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},c=function(e){var t=s(e.components);return a.createElement(u.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},p=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,u=e.parentName,c=o(e,["components","mdxType","originalType","parentName"]),p=s(n),d=r,f=p["".concat(u,".").concat(d)]||p[d]||m[d]||i;return n?a.createElement(f,l(l({ref:t},c),{},{components:n})):a.createElement(f,l({ref:t},c))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,l=new Array(i);l[0]=p;var o={};for(var u in t)hasOwnProperty.call(t,u)&&(o[u]=t[u]);o.originalType=e,o.mdxType="string"==typeof e?e:r,l[1]=o;for(var s=2;s child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:n.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function p(e){var t=e.values,n=e.children;return(0,r.useMemo)((function(){var e=null!=t?t:m(n);return 
function(e){var t=(0,s.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,n])}function d(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function f(e){var t=e.queryString,n=void 0!==t&&t,a=e.groupId,i=(0,o.k6)(),l=function(e){var t=e.queryString,n=void 0!==t&&t,a=e.groupId;if("string"==typeof n)return n;if(!1===n)return null;if(!0===n&&!a)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:n,groupId:a});return[(0,u._X)(l),(0,r.useCallback)((function(e){if(l){var t=new URLSearchParams(i.location.search);t.set(l,e),i.replace(Object.assign({},i.location,{search:t.toString()}))}}),[l,i])]}function v(e){var t,n,a,i,l=e.defaultValue,o=e.queryString,u=void 0!==o&&o,s=e.groupId,m=p(e),v=(0,r.useState)((function(){return function(e){var t,n=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the component requires at least one children component");if(n){if(!d({value:n,tabValues:a}))throw new Error('Docusaurus error: The has a defaultValue "'+n+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return n}var r=null!=(t=a.find((function(e){return e.default})))?t:a[0];if(!r)throw new Error("Unexpected error: 0 tabValues");return r.value}({defaultValue:l,tabValues:m})})),y=v[0],b=v[1],g=f({queryString:u,groupId:s}),h=g[0],E=g[1],w=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:s}.groupId),n=(0,c.Nk)(t),a=n[0],i=n[1],[a,(0,r.useCallback)((function(e){t&&i.set(e)}),[t,i])]),T=w[0],k=w[1],S=function(){var e=null!=h?h:T;return d({value:e,tabValues:m})?e:null}();return(0,r.useLayoutEffect)((function(){S&&b(S)}),[S]),{selectedValue:y,selectValue:(0,r.useCallback)((function(e){if(!d({value:e,tabValues:m}))throw new Error("Can't select invalid tab value="+e);b(e),E(e),k(e)}),[E,k,m]),tabValues:m}}var y=n(2389),b="tabList__CuJ",g="tabItem_LNqP";function h(e){var t=e.className,n=e.block,o=e.selectedValue,u=e.selectValue,s=e.tabValues,c=[],m=(0,l.o5)().blockElementScrollPositionUntilNextRender,p=function(e){var t=e.currentTarget,n=c.indexOf(t),a=s[n].value;a!==o&&(m(t),u(a))},d=function(e){var t,n=null;switch(e.key){case"Enter":p(e);break;case"ArrowRight":var a,r=c.indexOf(e.currentTarget)+1;n=null!=(a=c[r])?a:c[0];break;case"ArrowLeft":var i,l=c.indexOf(e.currentTarget)-1;n=null!=(i=c[l])?i:c[c.length-1]}null==(t=n)||t.focus()};return r.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,i.Z)("tabs",{"tabs--block":n},t)},s.map((function(e){var t=e.value,n=e.label,l=e.attributes;return r.createElement("li",(0,a.Z)({role:"tab",tabIndex:o===t?0:-1,"aria-selected":o===t,key:t,ref:function(e){return c.push(e)},onKeyDown:d,onClick:p},l,{className:(0,i.Z)("tabs__item",g,null==l?void 0:l.className,{"tabs__item--active":o===t})}),null!=n?n:t)})))}function E(e){var t=e.lazy,n=e.children,a=e.selectedValue,i=(Array.isArray(n)?n:[n]).filter(Boolean);if(t){var l=i.find((function(e){return e.props.value===a}));return 
l?(0,r.cloneElement)(l,{className:"margin-top--md"}):null}return r.createElement("div",{className:"margin-top--md"},i.map((function(e,t){return(0,r.cloneElement)(e,{key:t,hidden:e.props.value!==a})})))}function w(e){var t=v(e);return r.createElement("div",{className:(0,i.Z)("tabs-container",b)},r.createElement(h,(0,a.Z)({},e,t)),r.createElement(E,(0,a.Z)({},e,t)))}function T(e){var t=(0,y.Z)();return r.createElement(w,(0,a.Z)({key:String(t)},e))}},1989:function(e,t,n){var a=n(7294),r=n(2263);t.Z=function(e){var t=e.className,n=e.py,i=e.scala,l=e.csharp,o=e.sourceLink,u=(0,r.Z)().siteConfig.customFields.version,s="https://mmlspark.blob.core.windows.net/docs/"+u+"/pyspark/"+n,c="https://mmlspark.blob.core.windows.net/docs/"+u+"/scala/"+i,m="https://mmlspark.blob.core.windows.net/docs/"+u+"/dotnet/"+l;return a.createElement("table",null,a.createElement("tbody",null,a.createElement("tr",null,a.createElement("td",null,a.createElement("strong",null,"Python API: "),a.createElement("a",{href:s},t)),a.createElement("td",null,a.createElement("strong",null,"Scala API: "),a.createElement("a",{href:c},t)),a.createElement("td",null,a.createElement("strong",null,".NET API: "),a.createElement("a",{href:m},t)),a.createElement("td",null,a.createElement("strong",null,"Source: "),a.createElement("a",{href:o},t)))))}},3819:function(e,t,n){n.r(t),n.d(t,{assets:function(){return b},contentTitle:function(){return v},default:function(){return E},frontMatter:function(){return f},metadata:function(){return y},toc:function(){return g}});var a=n(3117),r=n(102),i=(n(7294),n(3905)),l=n(4866),o=n(5162),u=n(1989),s=["components"],c=[{value:"SimpleFitMultivariateAnomaly",id:"simplefitmultivariateanomaly",level:2}],m={toc:c};function p(e){var 
t=e.components,n=(0,r.Z)(e,s);return(0,i.kt)("wrapper",(0,a.Z)({},m,n,{components:t,mdxType:"MDXLayout"}),(0,i.kt)("h2",{id:"simplefitmultivariateanomaly"},"SimpleFitMultivariateAnomaly"),(0,i.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,i.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\nanomalyKey = os.environ.get("ANOMALY_API_KEY", getSecret("anomaly-api-key"))\nstartTime = "2021-01-01T00:00:00Z"\nendTime = "2021-01-03T01:59:00Z"\ntimestampColumn = "timestamp"\ninputColumns = ["feature0", "feature1", "feature2"]\nintermediateSaveDir = "wasbs://madtest@anomalydetectiontest.blob.core.windows.net/intermediateData"\n\nsimpleFitMultivariateAnomaly = (SimpleFitMultivariateAnomaly()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("result")\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setIntermediateSaveDir(intermediateSaveDir)\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .setSlidingWindow(50))\n\n# uncomment below for fitting your own dataframe\n# model = simpleFitMultivariateAnomaly.fit(df)\n# simpleFitMultivariateAnomaly.cleanUpIntermediateData()\n'))),(0,i.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.anomaly.FitMultivariateAnomaly\n\nval startTime: String = "2021-01-01T00:00:00Z"\nval endTime: String = "2021-01-02T12:00:00Z"\nval timestampColumn: String = "timestamp"\nval inputColumns: Array[String] = Array("feature0", "feature1", "feature2")\nval intermediateSaveDir: String = "wasbs://madtest@anomalydetectiontest.blob.core.windows.net/intermediateData"\nval anomalyKey = sys.env.getOrElse("ANOMALY_API_KEY", None)\n\nval simpleFitMultivariateAnomaly = (new SimpleFitMultivariateAnomaly()\n 
.setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("result")\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setIntermediateSaveDir(intermediateSaveDir)\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .setSlidingWindow(50))\n\nval df = (spark.read.format("csv")\n .option("header", True)\n .load("wasbs://datasets@mmlspark.blob.core.windows.net/MAD/mad_example.csv"))\n\nval model = simpleFitMultivariateAnomaly.fit(df)\n\nval result = (model\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setOutputCol("result")\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .transform(df))\n\nresult.show()\n\nsimpleFitMultivariateAnomaly.cleanUpIntermediateData()\nmodel.cleanUpIntermediateData()\n')))),(0,i.kt)(u.Z,{className:"SimpleFitMultivariateAnomaly",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.SimpleFitMultivariateAnomaly",scala:"com/microsoft/azure/synapse/ml/cognitive/SimpleFitMultivariateAnomaly.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1SimpleFitMultivariateAnomaly.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/MultivariateAnomalyDetection.scala",mdxType:"DocTable"}))}p.isMDXComponent=!0;var d=["components"],f={title:"Estimators - Cognitive",sidebar_label:"Cognitive",hide_title:!0},v=void 0,y={unversionedId:"Quick Examples/estimators/estimators_cognitive",id:"version-0.11.3/Quick Examples/estimators/estimators_cognitive",title:"Estimators - Cognitive",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/estimators/estimators_cognitive.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_cognitive",permalink:"/SynapseML/docs/Quick Examples/estimators/estimators_cognitive",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Estimators - Cognitive",sidebar_label:"Cognitive",hide_title:!0}},b={},g=[].concat(c),h={toc:g};function E(e){var 
t=e.components,n=(0,r.Z)(e,d);return(0,i.kt)("wrapper",(0,a.Z)({},h,n,{components:t,mdxType:"MDXLayout"}),(0,i.kt)(p,{mdxType:"MAD"}))}E.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2321],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function l(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var u=a.createContext({}),s=function(e){var t=a.useContext(u),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},c=function(e){var t=s(e.components);return a.createElement(u.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},p=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,u=e.parentName,c=o(e,["components","mdxType","originalType","parentName"]),p=s(n),d=r,f=p["".concat(u,".").concat(d)]||p[d]||m[d]||i;return n?a.createElement(f,l(l({ref:t},c),{},{components:n})):a.createElement(f,l({ref:t},c))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,l=new Array(i);l[0]=p;var o={};for(var u in t)hasOwnProperty.call(t,u)&&(o[u]=t[u]);o.originalType=e,o.mdxType="string"==typeof e?e:r,l[1]=o;for(var s=2;s child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:n.filter(Boolean))?t:[]}(e).map((function(e){var 
t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function p(e){var t=e.values,n=e.children;return(0,r.useMemo)((function(){var e=null!=t?t:m(n);return function(e){var t=(0,s.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,n])}function d(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function f(e){var t=e.queryString,n=void 0!==t&&t,a=e.groupId,i=(0,o.k6)(),l=function(e){var t=e.queryString,n=void 0!==t&&t,a=e.groupId;if("string"==typeof n)return n;if(!1===n)return null;if(!0===n&&!a)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:n,groupId:a});return[(0,u._X)(l),(0,r.useCallback)((function(e){if(l){var t=new URLSearchParams(i.location.search);t.set(l,e),i.replace(Object.assign({},i.location,{search:t.toString()}))}}),[l,i])]}function v(e){var t,n,a,i,l=e.defaultValue,o=e.queryString,u=void 0!==o&&o,s=e.groupId,m=p(e),v=(0,r.useState)((function(){return function(e){var t,n=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the component requires at least one children component");if(n){if(!d({value:n,tabValues:a}))throw new Error('Docusaurus error: The has a defaultValue "'+n+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return n}var r=null!=(t=a.find((function(e){return e.default})))?t:a[0];if(!r)throw new Error("Unexpected error: 0 tabValues");return r.value}({defaultValue:l,tabValues:m})})),y=v[0],b=v[1],g=f({queryString:u,groupId:s}),h=g[0],E=g[1],w=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:s}.groupId),n=(0,c.Nk)(t),a=n[0],i=n[1],[a,(0,r.useCallback)((function(e){t&&i.set(e)}),[t,i])]),T=w[0],k=w[1],S=function(){var e=null!=h?h:T;return d({value:e,tabValues:m})?e:null}();return(0,r.useLayoutEffect)((function(){S&&b(S)}),[S]),{selectedValue:y,selectValue:(0,r.useCallback)((function(e){if(!d({value:e,tabValues:m}))throw new Error("Can't select invalid tab value="+e);b(e),E(e),k(e)}),[E,k,m]),tabValues:m}}var y=n(2389),b="tabList__CuJ",g="tabItem_LNqP";function h(e){var t=e.className,n=e.block,o=e.selectedValue,u=e.selectValue,s=e.tabValues,c=[],m=(0,l.o5)().blockElementScrollPositionUntilNextRender,p=function(e){var t=e.currentTarget,n=c.indexOf(t),a=s[n].value;a!==o&&(m(t),u(a))},d=function(e){var t,n=null;switch(e.key){case"Enter":p(e);break;case"ArrowRight":var a,r=c.indexOf(e.currentTarget)+1;n=null!=(a=c[r])?a:c[0];break;case"ArrowLeft":var i,l=c.indexOf(e.currentTarget)-1;n=null!=(i=c[l])?i:c[c.length-1]}null==(t=n)||t.focus()};return r.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,i.Z)("tabs",{"tabs--block":n},t)},s.map((function(e){var t=e.value,n=e.label,l=e.attributes;return r.createElement("li",(0,a.Z)({role:"tab",tabIndex:o===t?0:-1,"aria-selected":o===t,key:t,ref:function(e){return c.push(e)},onKeyDown:d,onClick:p},l,{className:(0,i.Z)("tabs__item",g,null==l?void 0:l.className,{"tabs__item--active":o===t})}),null!=n?n:t)})))}function E(e){var t=e.lazy,n=e.children,a=e.selectedValue,i=(Array.isArray(n)?n:[n]).filter(Boolean);if(t){var l=i.find((function(e){return e.props.value===a}));return 
l?(0,r.cloneElement)(l,{className:"margin-top--md"}):null}return r.createElement("div",{className:"margin-top--md"},i.map((function(e,t){return(0,r.cloneElement)(e,{key:t,hidden:e.props.value!==a})})))}function w(e){var t=v(e);return r.createElement("div",{className:(0,i.Z)("tabs-container",b)},r.createElement(h,(0,a.Z)({},e,t)),r.createElement(E,(0,a.Z)({},e,t)))}function T(e){var t=(0,y.Z)();return r.createElement(w,(0,a.Z)({key:String(t)},e))}},1989:function(e,t,n){var a=n(7294),r=n(2263);t.Z=function(e){var t=e.className,n=e.py,i=e.scala,l=e.csharp,o=e.sourceLink,u=(0,r.Z)().siteConfig.customFields.version,s="https://mmlspark.blob.core.windows.net/docs/"+u+"/pyspark/"+n,c="https://mmlspark.blob.core.windows.net/docs/"+u+"/scala/"+i,m="https://mmlspark.blob.core.windows.net/docs/"+u+"/dotnet/"+l;return a.createElement("table",null,a.createElement("tbody",null,a.createElement("tr",null,a.createElement("td",null,a.createElement("strong",null,"Python API: "),a.createElement("a",{href:s},t)),a.createElement("td",null,a.createElement("strong",null,"Scala API: "),a.createElement("a",{href:c},t)),a.createElement("td",null,a.createElement("strong",null,".NET API: "),a.createElement("a",{href:m},t)),a.createElement("td",null,a.createElement("strong",null,"Source: "),a.createElement("a",{href:o},t)))))}},3819:function(e,t,n){n.r(t),n.d(t,{assets:function(){return b},contentTitle:function(){return v},default:function(){return E},frontMatter:function(){return f},metadata:function(){return y},toc:function(){return g}});var a=n(3117),r=n(102),i=(n(7294),n(3905)),l=n(4866),o=n(5162),u=n(1989),s=["components"],c=[{value:"SimpleFitMultivariateAnomaly",id:"simplefitmultivariateanomaly",level:2}],m={toc:c};function p(e){var 
t=e.components,n=(0,r.Z)(e,s);return(0,i.kt)("wrapper",(0,a.Z)({},m,n,{components:t,mdxType:"MDXLayout"}),(0,i.kt)("h2",{id:"simplefitmultivariateanomaly"},"SimpleFitMultivariateAnomaly"),(0,i.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,i.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\nanomalyKey = os.environ.get("ANOMALY_API_KEY", getSecret("anomaly-api-key"))\nstartTime = "2021-01-01T00:00:00Z"\nendTime = "2021-01-03T01:59:00Z"\ntimestampColumn = "timestamp"\ninputColumns = ["feature0", "feature1", "feature2"]\nintermediateSaveDir = "wasbs://madtest@anomalydetectiontest.blob.core.windows.net/intermediateData"\n\nsimpleFitMultivariateAnomaly = (SimpleFitMultivariateAnomaly()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("result")\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setIntermediateSaveDir(intermediateSaveDir)\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .setSlidingWindow(50))\n\n# uncomment below for fitting your own dataframe\n# model = simpleFitMultivariateAnomaly.fit(df)\n# simpleFitMultivariateAnomaly.cleanUpIntermediateData()\n'))),(0,i.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.anomaly.FitMultivariateAnomaly\n\nval startTime: String = "2021-01-01T00:00:00Z"\nval endTime: String = "2021-01-02T12:00:00Z"\nval timestampColumn: String = "timestamp"\nval inputColumns: Array[String] = Array("feature0", "feature1", "feature2")\nval intermediateSaveDir: String = "wasbs://madtest@anomalydetectiontest.blob.core.windows.net/intermediateData"\nval anomalyKey = sys.env.getOrElse("ANOMALY_API_KEY", None)\n\nval simpleFitMultivariateAnomaly = (new SimpleFitMultivariateAnomaly()\n 
.setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("result")\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setIntermediateSaveDir(intermediateSaveDir)\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .setSlidingWindow(50))\n\nval df = (spark.read.format("csv")\n .option("header", True)\n .load("wasbs://datasets@mmlspark.blob.core.windows.net/MAD/mad_example.csv"))\n\nval model = simpleFitMultivariateAnomaly.fit(df)\n\nval result = (model\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setOutputCol("result")\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .transform(df))\n\nresult.show()\n\nsimpleFitMultivariateAnomaly.cleanUpIntermediateData()\nmodel.cleanUpIntermediateData()\n')))),(0,i.kt)(u.Z,{className:"SimpleFitMultivariateAnomaly",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.SimpleFitMultivariateAnomaly",scala:"com/microsoft/azure/synapse/ml/cognitive/SimpleFitMultivariateAnomaly.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1SimpleFitMultivariateAnomaly.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/MultivariateAnomalyDetection.scala",mdxType:"DocTable"}))}p.isMDXComponent=!0;var d=["components"],f={title:"Estimators - Cognitive",sidebar_label:"Cognitive",hide_title:!0},v=void 0,y={unversionedId:"Quick Examples/estimators/estimators_cognitive",id:"version-0.11.3/Quick Examples/estimators/estimators_cognitive",title:"Estimators - Cognitive",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/estimators/estimators_cognitive.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_cognitive",permalink:"/SynapseML/docs/0.11.3/Quick Examples/estimators/estimators_cognitive",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Estimators - Cognitive",sidebar_label:"Cognitive",hide_title:!0}},b={},g=[].concat(c),h={toc:g};function 
E(e){var t=e.components,n=(0,r.Z)(e,d);return(0,i.kt)("wrapper",(0,a.Z)({},h,n,{components:t,mdxType:"MDXLayout"}),(0,i.kt)(p,{mdxType:"MAD"}))}E.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/327be84b.ce7070a2.js b/assets/js/327be84b.ce7070a2.js new file mode 100644 index 0000000000..fa027f7e3b --- /dev/null +++ b/assets/js/327be84b.ce7070a2.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[8399],{3905:function(e,t,a){a.d(t,{Zo:function(){return c},kt:function(){return m}});var n=a(7294);function r(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function i(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function o(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var s=n.createContext({}),p=function(e){var t=n.useContext(s),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},c=function(e){var t=p(e.components);return n.createElement(s.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,i=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),u=p(a),m=r,f=u["".concat(s,".").concat(m)]||u[m]||d[m]||i;return a?n.createElement(f,o(o({ref:t},c),{},{components:a})):n.createElement(f,o({ref:t},c))}));function m(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=a.length,o=new Array(i);o[0]=u;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:r,o[1]=l;for(var p=2;p 0, 
1.0).otherwise(0.0))\n .select(["label", "text"])\n)\n')),(0,i.kt)("h2",{id:"vw-synapseml-training"},"VW SynapseML Training"),(0,i.kt)("p",null,"Now we are ready to define a pipeline which consists of feature engineering steps and the VW model."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'# Specify featurizers\ntokenizer = RegexTokenizer(inputCol="text", outputCol="words")\n\ncount_vectorizer = CountVectorizer(inputCol="words", outputCol="features")\n\n# Define VW classification model\nargs = "--loss_function=logistic --quiet --holdout_off"\nvw_model = VowpalWabbitClassifier(\n featuresCol="features", labelCol="label", passThroughArgs=args, numPasses=10\n)\n\n# Create a pipeline\nvw_pipeline = Pipeline(stages=[tokenizer, count_vectorizer, vw_model])\n')),(0,i.kt)("p",null,"With the prepared training data, we can fit the model pipeline as follows."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},"vw_trained = vw_pipeline.fit(df_train)\n")),(0,i.kt)("h2",{id:"model-performance-evaluation"},"Model Performance Evaluation"),(0,i.kt)("p",null,"After training the model, we evaluate the performance of the model using the test set which is manually labeled."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'df_test = pd.read_csv(\n os.path.join(".", DATA_FOLDER, TEST_FILENAME),\n header=None,\n names=COL_NAMES,\n encoding=ENCODING,\n)\ndf_test = spark.createDataFrame(df_test, verifySchema=False)\n')),(0,i.kt)("p",null,"We only use positive and negative tweets in the test set to evaluate the model, since our model is a binary classification model trained with only positive and negative tweets."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'print("Number of test samples before filtering: ", df_test.count())\ndf_test = (\n df_test.filter(col("label") != 2.0)\n .withColumn("label", when(col("label") > 0, 1.0).otherwise(0.0))\n 
.select(["label", "text"])\n)\nprint("Number of test samples after filtering: ", df_test.count())\n')),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},"# Make predictions\npredictions = vw_trained.transform(df_test)\npredictions.limit(10).toPandas()\n")),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'# Compute model performance metrics\nmetrics = ComputeModelStatistics(\n evaluationMetric="classification", labelCol="label", scoredLabelsCol="prediction"\n).transform(predictions)\nmetrics.toPandas()\n')),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'# Utility class for plotting ROC curve (https://stackoverflow.com/questions/52847408/pyspark-extract-roc-curve)\nclass CurveMetrics(BinaryClassificationMetrics):\n def __init__(self, *args):\n super(CurveMetrics, self).__init__(*args)\n\n def get_curve(self, method):\n rdd = getattr(self._java_model, method)().toJavaRDD()\n points = []\n for row in rdd.collect():\n points += [(float(row._1()), float(row._2()))]\n return points\n\n\npreds = predictions.select("label", "probability").rdd.map(\n lambda row: (float(row["probability"][1]), float(row["label"]))\n)\nroc_points = CurveMetrics(preds).get_curve("roc")\n\n# Plot ROC curve\nfig = plt.figure()\nx_val = [x[0] for x in roc_points]\ny_val = [x[1] for x in roc_points]\nplt.title("ROC curve on test set")\nplt.xlabel("False positive rate")\nplt.ylabel("True positive rate")\nplt.plot(x_val, y_val)\n# Use display() if you\'re on Azure Databricks or you can do plt.show()\nplt.show()\n')),(0,i.kt)("p",null,"You should see an ROC curve like the following after the above cell is executed. 
"),(0,i.kt)("img",{src:"https://user-images.githubusercontent.com/20047467/69376052-9b0a3380-0c77-11ea-9266-11aa44350cbe.png",width:"400",height:"320"}))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/37ab9beb.de640460.js b/assets/js/37ab9beb.de640460.js new file mode 100644 index 0000000000..a4fbc16199 --- /dev/null +++ b/assets/js/37ab9beb.de640460.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[4992],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),p=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,l=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=p(n),d=a,f=m["".concat(l,".").concat(d)]||m[d]||u[d]||i;return n?r.createElement(f,o(o({ref:t},c),{},{components:n})):r.createElement(f,o({ref:t},c))}));function d(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,o=new Array(i);o[0]=m;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,o[1]=s;for(var 
p=2;p=0||(n[t]=a[t]);return n}(a,e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(a);for(s=0;s=0||Object.prototype.propertyIsEnumerable.call(a,t)&&(n[t]=a[t])}return n}var i=s.createContext({}),p=function(a){var e=s.useContext(i),t=e;return a&&(t="function"==typeof a?a(e):l(l({},e),a)),t},o=function(a){var e=p(a.components);return s.createElement(i.Provider,{value:e},a.children)},c={inlineCode:"code",wrapper:function(a){var e=a.children;return s.createElement(s.Fragment,{},e)}},u=s.forwardRef((function(a,e){var t=a.components,n=a.mdxType,r=a.originalType,i=a.parentName,o=m(a,["components","mdxType","originalType","parentName"]),u=p(t),N=n,k=u["".concat(i,".").concat(N)]||u[N]||c[N]||r;return t?s.createElement(k,l(l({ref:e},o),{},{components:t})):s.createElement(k,l({ref:e},o))}));function N(a,e){var t=arguments,n=e&&e.mdxType;if("string"==typeof a||n){var r=t.length,l=new Array(r);l[0]=u;var m={};for(var i in e)hasOwnProperty.call(e,i)&&(m[i]=e[i]);m.originalType=a,m.mdxType="string"==typeof a?a:n,l[1]=m;for(var p=2;p50K} to {0, 1} to represent our binary classification label column\nlabel_col = "income"\ndf = df.withColumn(\n label_col, F.when(F.col(label_col).contains("<=50K"), F.lit(0)).otherwise(F.lit(1))\n)\n')),(0,r.kt)("h3",{id:"perform-preliminary-analysis-on-columns-of-interest"},"Perform preliminary analysis on columns of interest"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'display(df.groupBy("race").count())\n')),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'display(df.groupBy("sex").count())\n')),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'# Choose columns/features to do data balance analysis on\ncols_of_interest = ["race", "sex"]\ndisplay(df.select(cols_of_interest + 
[label_col]))\n')),(0,r.kt)("h3",{id:"calculate-feature-balance-measures"},(0,r.kt)("a",{parentName:"h3",href:"../Data%20Balance%20Analysis"},"Calculate Feature Balance Measures")),(0,r.kt)("p",null,"Feature Balance Measures allow us to see whether each combination of sensitive feature is receiving the positive outcome (true prediction) at equal rates."),(0,r.kt)("p",null,"In this context, we define a feature balance measure, also referred to as the parity, for label y as the absolute difference between the association metrics of two different sensitive classes ",(0,r.kt)("span",{parentName:"p",className:"math math-inline"},(0,r.kt)("span",{parentName:"span",className:"katex"},(0,r.kt)("span",{parentName:"span",className:"katex-mathml"},(0,r.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,r.kt)("semantics",{parentName:"math"},(0,r.kt)("mrow",{parentName:"semantics"},(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"A")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"B")),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,r.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[x_A, x_B]")))),(0,r.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mopen"},"["),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mclose"},"]"))))),", with 
respect to the association metric ",(0,r.kt)("span",{parentName:"p",className:"math math-inline"},(0,r.kt)("span",{parentName:"span",className:"katex"},(0,r.kt)("span",{parentName:"span",className:"katex-mathml"},(0,r.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,r.kt)("semantics",{parentName:"math"},(0,r.kt)("mrow",{parentName:"semantics"},(0,r.kt)("mi",{parentName:"mrow"},"A"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"i")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,r.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"A(x_i, y)")))),(0,r.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.31166399999999994em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"i")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")"))))),". That is:"),(0,r.kt)("p",null,(0,r.kt)("span",{parentName:"p",className:"math math-inline"},(0,r.kt)("span",{parentName:"span",className:"katex"},(0,r.kt)("span",{parentName:"span",className:"katex-mathml"},(0,r.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,r.kt)("semantics",{parentName:"math"},(0,r.kt)("mrow",{parentName:"semantics"},(0,r.kt)("mi",{parentName:"mrow"},"p"),(0,r.kt)("mi",{parentName:"mrow"},"a"),(0,r.kt)("mi",{parentName:"mrow"},"r"),(0,r.kt)("mi",{parentName:"mrow"},"i"),(0,r.kt)("mi",{parentName:"mrow"},"t"),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"A")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"B")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("mi",{parentName:"mrow"},"A"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("mo",{parentName:"mrow"},"\u22c5"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,r.kt)("mo",{parentName:"mrow"},(0,r.kt)("mi",{parentNam
e:"mo",mathvariant:"normal"},"\u2254")),(0,r.kt)("mi",{parentName:"mrow"},"A"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"A")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,r.kt)("mo",{parentName:"mrow"},"\u2212"),(0,r.kt)("mi",{parentName:"mrow"},"A"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"B")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,r.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"parity(y \\vert x_A, x_B, A(\\cdot)) \\coloneqq A(x_A, y) - A(x_B, y)")))),(0,r.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"p"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight",style:{marginRight:"0.05017em"}},"B")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord"},"\u22c5"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")"),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,r.kt)("span",{parentName:"span",className:"mrel"},(0,r.kt)("span",{parentName:"span",className:"mrel"},(0,r.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,r.kt)("span",{parentName:"span",className:"mrel"},(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,r.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")"),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}}),(0,r.kt)("span",{parentName:"span",className:"mbin"},"\u2212"),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}})),(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")")))))),(0,r.kt)("p",null,"Using the dataset, we can see if the various sexes and races are receiving >50k income at equal or unequal rates."),(0,r.kt)("p",null,"Note: Many of these metrics were influenced by this paper ",(0,r.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/2103.03417"},"Measuring Model Biases in the Absence of Ground Truth"),"."),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.exploratory import FeatureBalanceMeasure\n\nfeature_balance_measures = (\n FeatureBalanceMeasure()\n .setSensitiveCols(cols_of_interest)\n .setLabelCol(label_col)\n .setVerbose(True)\n .transform(df)\n)\n\n# Sort by Statistical Parity descending for all features\ndisplay(feature_balance_measures.sort(F.abs("FeatureBalanceMeasure.dp").desc()))\n')),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'# Drill 
down to feature == "sex"\ndisplay(\n feature_balance_measures.filter(F.col("FeatureName") == "sex").sort(\n F.abs("FeatureBalanceMeasure.dp").desc()\n )\n)\n')),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'# Drill down to feature == "race"\ndisplay(\n feature_balance_measures.filter(F.col("FeatureName") == "race").sort(\n F.abs("FeatureBalanceMeasure.dp").desc()\n )\n)\n')),(0,r.kt)("h4",{id:"visualize-feature-balance-measures"},"Visualize Feature Balance Measures"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'races = [row["race"] for row in df.groupBy("race").count().select("race").collect()]\ndp_rows = (\n feature_balance_measures.filter(F.col("FeatureName") == "race")\n .select("ClassA", "ClassB", "FeatureBalanceMeasure.dp")\n .collect()\n)\nrace_dp_values = [(row["ClassA"], row["ClassB"], row["dp"]) for row in dp_rows]\n\nrace_dp_array = np.zeros((len(races), len(races)))\nfor class_a, class_b, dp_value in race_dp_values:\n i, j = races.index(class_a), races.index(class_b)\n dp_value = round(dp_value, 2)\n race_dp_array[i, j] = dp_value\n race_dp_array[j, i] = -1 * dp_value\n\ncolormap = "RdBu"\ndp_min, dp_max = -1.0, 1.0\n\nfig, ax = plt.subplots()\nim = ax.imshow(race_dp_array, vmin=dp_min, vmax=dp_max, cmap=colormap)\n\ncbar = ax.figure.colorbar(im, ax=ax)\ncbar.ax.set_ylabel("Statistical Parity", rotation=-90, va="bottom")\n\nax.set_xticks(np.arange(len(races)))\nax.set_yticks(np.arange(len(races)))\nax.set_xticklabels(races)\nax.set_yticklabels(races)\n\nplt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")\n\nfor i in range(len(races)):\n for j in range(len(races)):\n text = ax.text(j, i, race_dp_array[i, j], ha="center", va="center", color="k")\n\nax.set_title("Statistical Parity of Races in Adult 
Dataset")\nfig.tight_layout()\nplt.show()\n')),(0,r.kt)("p",null,(0,r.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_AdultCensusIncome_RacesDP.png",alt:"Statistical Parity of Races in Adult Dataset"})),(0,r.kt)("h4",{id:"interpret-feature-balance-measures"},"Interpret Feature Balance Measures"),(0,r.kt)("p",null,"Statistical Parity:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},"When it is positive, it means that ClassA sees the positive outcome more than ClassB."),(0,r.kt)("li",{parentName:"ul"},"When it is negative, it means that ClassB sees the positive outcome more than ClassA.")),(0,r.kt)("hr",null),(0,r.kt)("p",null,"From the results, we can tell the following:"),(0,r.kt)("p",null,"For Sex:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},'SP(Male, Female) = 0.1963 shows "Male" observations are associated with ">50k" income label more often than "Female" observations.')),(0,r.kt)("p",null,"For Race:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},'SP(Other, Asian-Pac-Islander) = -0.1734 shows "Other" observations are associated with ">50k" income label less than "Asian-Pac-Islander" observations.'),(0,r.kt)("li",{parentName:"ul"},'SP(White, Other) = 0.1636 shows "White" observations are associated with ">50k" income label more often than "Other" observations.'),(0,r.kt)("li",{parentName:"ul"},'SP(Asian-Pac-Islander, Amer-Indian-Eskimo) = 0.1494 shows "Asian-Pac-Islander" observations are associated with ">50k" income label more often than "Amer-Indian-Eskimo" observations.')),(0,r.kt)("p",null,"Again, you can take mitigation steps to upsample/downsample your data to be less biased towards certain features and feature values."),(0,r.kt)("p",null,"Built-in mitigation steps are coming soon."),(0,r.kt)("h3",{id:"calculate-distribution-balance-measures"},"Calculate ",(0,r.kt)("a",{parentName:"h3",href:"../Data%20Balance%20Analysis"},"Distribution Balance 
Measures")),(0,r.kt)("p",null,"Distribution Balance Measures allow us to compare our data with a reference distribution (i.e. uniform distribution). They are calculated per sensitive column and don't use the label column. |"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.exploratory import DistributionBalanceMeasure\n\ndistribution_balance_measures = (\n DistributionBalanceMeasure().setSensitiveCols(cols_of_interest).transform(df)\n)\n\n# Sort by JS Distance descending\ndisplay(\n distribution_balance_measures.sort(\n F.abs("DistributionBalanceMeasure.js_dist").desc()\n )\n)\n')),(0,r.kt)("h4",{id:"visualize-distribution-balance-measures"},"Visualize Distribution Balance Measures"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'distribution_rows = distribution_balance_measures.collect()\nrace_row = [row for row in distribution_rows if row["FeatureName"] == "race"][0][\n "DistributionBalanceMeasure"\n]\nsex_row = [row for row in distribution_rows if row["FeatureName"] == "sex"][0][\n "DistributionBalanceMeasure"\n]\n\nmeasures_of_interest = [\n "kl_divergence",\n "js_dist",\n "inf_norm_dist",\n "total_variation_dist",\n "wasserstein_dist",\n]\nrace_measures = [round(race_row[measure], 4) for measure in measures_of_interest]\nsex_measures = [round(sex_row[measure], 4) for measure in measures_of_interest]\n\nx = np.arange(len(measures_of_interest))\nwidth = 0.35\n\nfig, ax = plt.subplots()\nrects1 = ax.bar(x - width / 2, race_measures, width, label="Race")\nrects2 = ax.bar(x + width / 2, sex_measures, width, label="Sex")\n\nax.set_xlabel("Measure")\nax.set_ylabel("Value")\nax.set_title("Distribution Balance Measures of Sex and Race in Adult Dataset")\nax.set_xticks(x)\nax.set_xticklabels(measures_of_interest)\nax.legend()\n\nplt.setp(ax.get_xticklabels(), rotation=20, ha="right", rotation_mode="default")\n\n\ndef autolabel(rects):\n for rect in rects:\n height = 
rect.get_height()\n ax.annotate(\n "{}".format(height),\n xy=(rect.get_x() + rect.get_width() / 2, height),\n xytext=(0, 1), # 1 point vertical offset\n textcoords="offset points",\n ha="center",\n va="bottom",\n )\n\n\nautolabel(rects1)\nautolabel(rects2)\n\nfig.tight_layout()\n\nplt.show()\n')),(0,r.kt)("p",null,(0,r.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_AdultCensusIncome_DistributionMeasures.png",alt:"Distribution Balance Measures of Sex and Race in Adult Dataset"})),(0,r.kt)("h4",{id:"interpret-distribution-balance-measures"},"Interpret Distribution Balance Measures"),(0,r.kt)("p",null,"Race has a JS Distance of 0.5104 while Sex has a JS Distance of 0.1217."),(0,r.kt)("p",null,"Knowing that JS Distance is between ","[0, 1]"," where 0 means perfectly balanced distribution, we can tell that:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},"There is a larger disparity between various races than various sexes in our dataset."),(0,r.kt)("li",{parentName:"ul"},"Race is nowhere close to a perfectly balanced distribution (i.e. some races are seen ALOT more than others in our dataset)."),(0,r.kt)("li",{parentName:"ul"},"Sex is fairly close to a perfectly balanced distribution.")),(0,r.kt)("h3",{id:"calculate-aggregate-balance-measures"},"Calculate ",(0,r.kt)("a",{parentName:"h3",href:"../Data%20Balance%20Analysis"},"Aggregate Balance Measures")),(0,r.kt)("p",null,"Aggregate Balance Measures allow us to obtain a higher notion of inequality. They are calculated on the global set of sensitive columns and don't use the label column."),(0,r.kt)("p",null,"These measures look at distribution of records across all combinations of sensitive columns. 
For example, if Sex and Race are sensitive columns, it shall try to quantify imbalance across all combinations - (Male, Black), (Female, White), (Male, Asian-Pac-Islander), etc."),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},"from synapse.ml.exploratory import AggregateBalanceMeasure\n\naggregate_balance_measures = (\n AggregateBalanceMeasure().setSensitiveCols(cols_of_interest).transform(df)\n)\n\ndisplay(aggregate_balance_measures)\n")),(0,r.kt)("h4",{id:"interpret-aggregate-balance-measures"},"Interpret Aggregate Balance Measures"),(0,r.kt)("p",null,"An Atkinson Index of 0.7779 lets us know that 77.79% of data points need to be foregone to have a more equal share among our features."),(0,r.kt)("p",null,"It lets us know that our dataset is leaning towards maximum inequality, and we should take actionable steps to:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},"Upsample data points where the feature value is barely observed."),(0,r.kt)("li",{parentName:"ul"},"Downsample data points where the feature value is observed much more than others.")),(0,r.kt)("h3",{id:"summary"},"Summary"),(0,r.kt)("p",null,"Throughout the course of this sample notebook, we have:"),(0,r.kt)("ol",null,(0,r.kt)("li",{parentName:"ol"},'Chosen "Race" and "Sex" as columns of interest in the Adult Census Income dataset.'),(0,r.kt)("li",{parentName:"ol"},"Done preliminary analysis on our dataset. 
"),(0,r.kt)("li",{parentName:"ol"},"Ran the 3 groups of measures that compose our ",(0,r.kt)("strong",{parentName:"li"},"Data Balance Analysis"),":")),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},(0,r.kt)("strong",{parentName:"li"},"Feature Balance Measures"),(0,r.kt)("ul",{parentName:"li"},(0,r.kt)("li",{parentName:"ul"},'Calculated Feature Balance Measures to see that the highest Statistical Parity is in "Sex": Males see >50k income much more than Females.'),(0,r.kt)("li",{parentName:"ul"},"Visualized Statistical Parity of Races to see that Asian-Pac-Islander sees >50k income much more than Other, in addition to other race combinations."))),(0,r.kt)("li",{parentName:"ul"},(0,r.kt)("strong",{parentName:"li"},"Distribution Balance Measures")," ",(0,r.kt)("ul",{parentName:"li"},(0,r.kt)("li",{parentName:"ul"},'Calculated Distribution Balance Measures to see that "Sex" is much closer to a perfectly balanced distribution than "Race".'),(0,r.kt)("li",{parentName:"ul"},'Visualized various distribution balance measures to compare their values for "Race" and "Sex".'))),(0,r.kt)("li",{parentName:"ul"},(0,r.kt)("strong",{parentName:"li"},"Aggregate Balance Measures"),(0,r.kt)("ul",{parentName:"li"},(0,r.kt)("li",{parentName:"ul"},"Calculated Aggregate Balance Measures to see that we need to forego 77.79% of data points to have a perfectly balanced dataset. We identified that our dataset is leaning towards maximum inequality, and we should take actionable steps to:"),(0,r.kt)("li",{parentName:"ul"},"Upsample data points where the feature value is barely observed."),(0,r.kt)("li",{parentName:"ul"},"Downsample data points where the feature value is observed much more than others.")))),(0,r.kt)("p",null,(0,r.kt)("strong",{parentName:"p"},"In conclusion:")),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},"These measures provide an indicator of disparity on the data, allowing for users to explore potential mitigations before proceeding to train. 
"),(0,r.kt)("li",{parentName:"ul"},'Users can use these measures to set thresholds on their level of "tolerance" for data representation.'),(0,r.kt)("li",{parentName:"ul"},"Production pipelines can use these measures as baseline for models that require frequent retraining on new data. "),(0,r.kt)("li",{parentName:"ul"},"These measures can also be saved as key metadata for the model/service built and added as part of model cards or transparency notes helping drive overall accountability for the ML service built and its performance across different demographics or sensitive attributes.")))}N.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[5990],{3905:function(a,e,t){t.d(e,{Zo:function(){return o},kt:function(){return N}});var s=t(7294);function n(a,e,t){return e in a?Object.defineProperty(a,e,{value:t,enumerable:!0,configurable:!0,writable:!0}):a[e]=t,a}function r(a,e){var t=Object.keys(a);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(a);e&&(s=s.filter((function(e){return Object.getOwnPropertyDescriptor(a,e).enumerable}))),t.push.apply(t,s)}return t}function l(a){for(var e=1;e=0||(n[t]=a[t]);return n}(a,e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(a);for(s=0;s=0||Object.prototype.propertyIsEnumerable.call(a,t)&&(n[t]=a[t])}return n}var i=s.createContext({}),p=function(a){var e=s.useContext(i),t=e;return a&&(t="function"==typeof a?a(e):l(l({},e),a)),t},o=function(a){var e=p(a.components);return s.createElement(i.Provider,{value:e},a.children)},c={inlineCode:"code",wrapper:function(a){var e=a.children;return s.createElement(s.Fragment,{},e)}},u=s.forwardRef((function(a,e){var t=a.components,n=a.mdxType,r=a.originalType,i=a.parentName,o=m(a,["components","mdxType","originalType","parentName"]),u=p(t),N=n,k=u["".concat(i,".").concat(N)]||u[N]||c[N]||r;return 
t?s.createElement(k,l(l({ref:e},o),{},{components:t})):s.createElement(k,l({ref:e},o))}));function N(a,e){var t=arguments,n=e&&e.mdxType;if("string"==typeof a||n){var r=t.length,l=new Array(r);l[0]=u;var m={};for(var i in e)hasOwnProperty.call(e,i)&&(m[i]=e[i]);m.originalType=a,m.mdxType="string"==typeof a?a:n,l[1]=m;for(var p=2;p50K} to {0, 1} to represent our binary classification label column\nlabel_col = "income"\ndf = df.withColumn(\n label_col, F.when(F.col(label_col).contains("<=50K"), F.lit(0)).otherwise(F.lit(1))\n)\n')),(0,r.kt)("h3",{id:"perform-preliminary-analysis-on-columns-of-interest"},"Perform preliminary analysis on columns of interest"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'display(df.groupBy("race").count())\n')),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'display(df.groupBy("sex").count())\n')),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'# Choose columns/features to do data balance analysis on\ncols_of_interest = ["race", "sex"]\ndisplay(df.select(cols_of_interest + [label_col]))\n')),(0,r.kt)("h3",{id:"calculate-feature-balance-measures"},(0,r.kt)("a",{parentName:"h3",href:"../Data%20Balance%20Analysis"},"Calculate Feature Balance Measures")),(0,r.kt)("p",null,"Feature Balance Measures allow us to see whether each combination of sensitive feature is receiving the positive outcome (true prediction) at equal rates."),(0,r.kt)("p",null,"In this context, we define a feature balance measure, also referred to as the parity, for label y as the absolute difference between the association metrics of two different sensitive classes ",(0,r.kt)("span",{parentName:"p",className:"math 
math-inline"},(0,r.kt)("span",{parentName:"span",className:"katex"},(0,r.kt)("span",{parentName:"span",className:"katex-mathml"},(0,r.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,r.kt)("semantics",{parentName:"math"},(0,r.kt)("mrow",{parentName:"semantics"},(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"A")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"B")),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,r.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[x_A, x_B]")))),(0,r.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mopen"},"["),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"A")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mclose"},"]"))))),", with respect to the association metric ",(0,r.kt)("span",{parentName:"p",className:"math 
math-inline"},(0,r.kt)("span",{parentName:"span",className:"katex"},(0,r.kt)("span",{parentName:"span",className:"katex-mathml"},(0,r.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,r.kt)("semantics",{parentName:"math"},(0,r.kt)("mrow",{parentName:"semantics"},(0,r.kt)("mi",{parentName:"mrow"},"A"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"i")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,r.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"A(x_i, y)")))),(0,r.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.31166399999999994em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"i")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")"))))),". That is:"),(0,r.kt)("p",null,(0,r.kt)("span",{parentName:"p",className:"math math-inline"},(0,r.kt)("span",{parentName:"span",className:"katex"},(0,r.kt)("span",{parentName:"span",className:"katex-mathml"},(0,r.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,r.kt)("semantics",{parentName:"math"},(0,r.kt)("mrow",{parentName:"semantics"},(0,r.kt)("mi",{parentName:"mrow"},"p"),(0,r.kt)("mi",{parentName:"mrow"},"a"),(0,r.kt)("mi",{parentName:"mrow"},"r"),(0,r.kt)("mi",{parentName:"mrow"},"i"),(0,r.kt)("mi",{parentName:"mrow"},"t"),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"A")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"B")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("mi",{parentName:"mrow"},"A"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("mo",{parentName:"mrow"},"\u22c5"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,r.kt)("mo",{parentName:"mrow"},(0,r.kt)("mi",{parentNam
e:"mo",mathvariant:"normal"},"\u2254")),(0,r.kt)("mi",{parentName:"mrow"},"A"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"A")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,r.kt)("mo",{parentName:"mrow"},"\u2212"),(0,r.kt)("mi",{parentName:"mrow"},"A"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,r.kt)("msub",{parentName:"mrow"},(0,r.kt)("mi",{parentName:"msub"},"x"),(0,r.kt)("mi",{parentName:"msub"},"B")),(0,r.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,r.kt)("mi",{parentName:"mrow"},"y"),(0,r.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,r.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"parity(y \\vert x_A, x_B, A(\\cdot)) \\coloneqq A(x_A, y) - A(x_B, y)")))),(0,r.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"p"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight",style:{marginRight:"0.05017em"}},"B")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord"},"\u22c5"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")"),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,r.kt)("span",{parentName:"span",className:"mrel"},(0,r.kt)("span",{parentName:"span",className:"mrel"},(0,r.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,r.kt)("span",{parentName:"span",className:"mrel"},(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,r.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")"),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}}),(0,r.kt)("span",{parentName:"span",className:"mbin"},"\u2212"),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}})),(0,r.kt)("span",{parentName:"span",className:"base"},(0,r.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,r.kt)("span",{parentName:"span",className:"mopen"},"("),(0,r.kt)("span",{parentName:"span",className:"mord"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,r.kt)("span",{parentName:"span",className:"msupsub"},(0,r.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,r.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,r.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,r.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,r.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,r.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,r.kt)("span",{parentName:"span",className:"vlist-r"},(0,r.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,r.kt)("span",{parentName:"span"})))))),(0,r.kt)("span",{parentName:"span",className:"mpunct"},","),(0,r.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,r.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,r.kt)("span",{parentName:"span",className:"mclose"},")")))))),(0,r.kt)("p",null,"Using the dataset, we can see if the various sexes and races are receiving >50k income at equal or unequal rates."),(0,r.kt)("p",null,"Note: Many of these metrics were influenced by this paper ",(0,r.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/2103.03417"},"Measuring Model Biases in the Absence of Ground Truth"),"."),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.exploratory import FeatureBalanceMeasure\n\nfeature_balance_measures = (\n FeatureBalanceMeasure()\n .setSensitiveCols(cols_of_interest)\n .setLabelCol(label_col)\n .setVerbose(True)\n .transform(df)\n)\n\n# Sort by Statistical Parity descending for all features\ndisplay(feature_balance_measures.sort(F.abs("FeatureBalanceMeasure.dp").desc()))\n')),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'# Drill 
down to feature == "sex"\ndisplay(\n feature_balance_measures.filter(F.col("FeatureName") == "sex").sort(\n F.abs("FeatureBalanceMeasure.dp").desc()\n )\n)\n')),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'# Drill down to feature == "race"\ndisplay(\n feature_balance_measures.filter(F.col("FeatureName") == "race").sort(\n F.abs("FeatureBalanceMeasure.dp").desc()\n )\n)\n')),(0,r.kt)("h4",{id:"visualize-feature-balance-measures"},"Visualize Feature Balance Measures"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'races = [row["race"] for row in df.groupBy("race").count().select("race").collect()]\ndp_rows = (\n feature_balance_measures.filter(F.col("FeatureName") == "race")\n .select("ClassA", "ClassB", "FeatureBalanceMeasure.dp")\n .collect()\n)\nrace_dp_values = [(row["ClassA"], row["ClassB"], row["dp"]) for row in dp_rows]\n\nrace_dp_array = np.zeros((len(races), len(races)))\nfor class_a, class_b, dp_value in race_dp_values:\n i, j = races.index(class_a), races.index(class_b)\n dp_value = round(dp_value, 2)\n race_dp_array[i, j] = dp_value\n race_dp_array[j, i] = -1 * dp_value\n\ncolormap = "RdBu"\ndp_min, dp_max = -1.0, 1.0\n\nfig, ax = plt.subplots()\nim = ax.imshow(race_dp_array, vmin=dp_min, vmax=dp_max, cmap=colormap)\n\ncbar = ax.figure.colorbar(im, ax=ax)\ncbar.ax.set_ylabel("Statistical Parity", rotation=-90, va="bottom")\n\nax.set_xticks(np.arange(len(races)))\nax.set_yticks(np.arange(len(races)))\nax.set_xticklabels(races)\nax.set_yticklabels(races)\n\nplt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")\n\nfor i in range(len(races)):\n for j in range(len(races)):\n text = ax.text(j, i, race_dp_array[i, j], ha="center", va="center", color="k")\n\nax.set_title("Statistical Parity of Races in Adult 
Dataset")\nfig.tight_layout()\nplt.show()\n')),(0,r.kt)("p",null,(0,r.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_AdultCensusIncome_RacesDP.png",alt:"Statistical Parity of Races in Adult Dataset"})),(0,r.kt)("h4",{id:"interpret-feature-balance-measures"},"Interpret Feature Balance Measures"),(0,r.kt)("p",null,"Statistical Parity:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},"When it is positive, it means that ClassA sees the positive outcome more than ClassB."),(0,r.kt)("li",{parentName:"ul"},"When it is negative, it means that ClassB sees the positive outcome more than ClassA.")),(0,r.kt)("hr",null),(0,r.kt)("p",null,"From the results, we can tell the following:"),(0,r.kt)("p",null,"For Sex:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},'SP(Male, Female) = 0.1963 shows "Male" observations are associated with ">50k" income label more often than "Female" observations.')),(0,r.kt)("p",null,"For Race:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},'SP(Other, Asian-Pac-Islander) = -0.1734 shows "Other" observations are associated with ">50k" income label less than "Asian-Pac-Islander" observations.'),(0,r.kt)("li",{parentName:"ul"},'SP(White, Other) = 0.1636 shows "White" observations are associated with ">50k" income label more often than "Other" observations.'),(0,r.kt)("li",{parentName:"ul"},'SP(Asian-Pac-Islander, Amer-Indian-Eskimo) = 0.1494 shows "Asian-Pac-Islander" observations are associated with ">50k" income label more often than "Amer-Indian-Eskimo" observations.')),(0,r.kt)("p",null,"Again, you can take mitigation steps to upsample/downsample your data to be less biased towards certain features and feature values."),(0,r.kt)("p",null,"Built-in mitigation steps are coming soon."),(0,r.kt)("h3",{id:"calculate-distribution-balance-measures"},"Calculate ",(0,r.kt)("a",{parentName:"h3",href:"../Data%20Balance%20Analysis"},"Distribution Balance 
Measures")),(0,r.kt)("p",null,"Distribution Balance Measures allow us to compare our data with a reference distribution (i.e. uniform distribution). They are calculated per sensitive column and don't use the label column. |"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.exploratory import DistributionBalanceMeasure\n\ndistribution_balance_measures = (\n DistributionBalanceMeasure().setSensitiveCols(cols_of_interest).transform(df)\n)\n\n# Sort by JS Distance descending\ndisplay(\n distribution_balance_measures.sort(\n F.abs("DistributionBalanceMeasure.js_dist").desc()\n )\n)\n')),(0,r.kt)("h4",{id:"visualize-distribution-balance-measures"},"Visualize Distribution Balance Measures"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'distribution_rows = distribution_balance_measures.collect()\nrace_row = [row for row in distribution_rows if row["FeatureName"] == "race"][0][\n "DistributionBalanceMeasure"\n]\nsex_row = [row for row in distribution_rows if row["FeatureName"] == "sex"][0][\n "DistributionBalanceMeasure"\n]\n\nmeasures_of_interest = [\n "kl_divergence",\n "js_dist",\n "inf_norm_dist",\n "total_variation_dist",\n "wasserstein_dist",\n]\nrace_measures = [round(race_row[measure], 4) for measure in measures_of_interest]\nsex_measures = [round(sex_row[measure], 4) for measure in measures_of_interest]\n\nx = np.arange(len(measures_of_interest))\nwidth = 0.35\n\nfig, ax = plt.subplots()\nrects1 = ax.bar(x - width / 2, race_measures, width, label="Race")\nrects2 = ax.bar(x + width / 2, sex_measures, width, label="Sex")\n\nax.set_xlabel("Measure")\nax.set_ylabel("Value")\nax.set_title("Distribution Balance Measures of Sex and Race in Adult Dataset")\nax.set_xticks(x)\nax.set_xticklabels(measures_of_interest)\nax.legend()\n\nplt.setp(ax.get_xticklabels(), rotation=20, ha="right", rotation_mode="default")\n\n\ndef autolabel(rects):\n for rect in rects:\n height = 
rect.get_height()\n ax.annotate(\n "{}".format(height),\n xy=(rect.get_x() + rect.get_width() / 2, height),\n xytext=(0, 1), # 1 point vertical offset\n textcoords="offset points",\n ha="center",\n va="bottom",\n )\n\n\nautolabel(rects1)\nautolabel(rects2)\n\nfig.tight_layout()\n\nplt.show()\n')),(0,r.kt)("p",null,(0,r.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_AdultCensusIncome_DistributionMeasures.png",alt:"Distribution Balance Measures of Sex and Race in Adult Dataset"})),(0,r.kt)("h4",{id:"interpret-distribution-balance-measures"},"Interpret Distribution Balance Measures"),(0,r.kt)("p",null,"Race has a JS Distance of 0.5104 while Sex has a JS Distance of 0.1217."),(0,r.kt)("p",null,"Knowing that JS Distance is between ","[0, 1]"," where 0 means perfectly balanced distribution, we can tell that:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},"There is a larger disparity between various races than various sexes in our dataset."),(0,r.kt)("li",{parentName:"ul"},"Race is nowhere close to a perfectly balanced distribution (i.e. some races are seen ALOT more than others in our dataset)."),(0,r.kt)("li",{parentName:"ul"},"Sex is fairly close to a perfectly balanced distribution.")),(0,r.kt)("h3",{id:"calculate-aggregate-balance-measures"},"Calculate ",(0,r.kt)("a",{parentName:"h3",href:"../Data%20Balance%20Analysis"},"Aggregate Balance Measures")),(0,r.kt)("p",null,"Aggregate Balance Measures allow us to obtain a higher notion of inequality. They are calculated on the global set of sensitive columns and don't use the label column."),(0,r.kt)("p",null,"These measures look at distribution of records across all combinations of sensitive columns. 
For example, if Sex and Race are sensitive columns, it shall try to quantify imbalance across all combinations - (Male, Black), (Female, White), (Male, Asian-Pac-Islander), etc."),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},"from synapse.ml.exploratory import AggregateBalanceMeasure\n\naggregate_balance_measures = (\n AggregateBalanceMeasure().setSensitiveCols(cols_of_interest).transform(df)\n)\n\ndisplay(aggregate_balance_measures)\n")),(0,r.kt)("h4",{id:"interpret-aggregate-balance-measures"},"Interpret Aggregate Balance Measures"),(0,r.kt)("p",null,"An Atkinson Index of 0.7779 lets us know that 77.79% of data points need to be foregone to have a more equal share among our features."),(0,r.kt)("p",null,"It lets us know that our dataset is leaning towards maximum inequality, and we should take actionable steps to:"),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},"Upsample data points where the feature value is barely observed."),(0,r.kt)("li",{parentName:"ul"},"Downsample data points where the feature value is observed much more than others.")),(0,r.kt)("h3",{id:"summary"},"Summary"),(0,r.kt)("p",null,"Throughout the course of this sample notebook, we have:"),(0,r.kt)("ol",null,(0,r.kt)("li",{parentName:"ol"},'Chosen "Race" and "Sex" as columns of interest in the Adult Census Income dataset.'),(0,r.kt)("li",{parentName:"ol"},"Done preliminary analysis on our dataset. 
"),(0,r.kt)("li",{parentName:"ol"},"Ran the 3 groups of measures that compose our ",(0,r.kt)("strong",{parentName:"li"},"Data Balance Analysis"),":")),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},(0,r.kt)("strong",{parentName:"li"},"Feature Balance Measures"),(0,r.kt)("ul",{parentName:"li"},(0,r.kt)("li",{parentName:"ul"},'Calculated Feature Balance Measures to see that the highest Statistical Parity is in "Sex": Males see >50k income much more than Females.'),(0,r.kt)("li",{parentName:"ul"},"Visualized Statistical Parity of Races to see that Asian-Pac-Islander sees >50k income much more than Other, in addition to other race combinations."))),(0,r.kt)("li",{parentName:"ul"},(0,r.kt)("strong",{parentName:"li"},"Distribution Balance Measures")," ",(0,r.kt)("ul",{parentName:"li"},(0,r.kt)("li",{parentName:"ul"},'Calculated Distribution Balance Measures to see that "Sex" is much closer to a perfectly balanced distribution than "Race".'),(0,r.kt)("li",{parentName:"ul"},'Visualized various distribution balance measures to compare their values for "Race" and "Sex".'))),(0,r.kt)("li",{parentName:"ul"},(0,r.kt)("strong",{parentName:"li"},"Aggregate Balance Measures"),(0,r.kt)("ul",{parentName:"li"},(0,r.kt)("li",{parentName:"ul"},"Calculated Aggregate Balance Measures to see that we need to forego 77.79% of data points to have a perfectly balanced dataset. We identified that our dataset is leaning towards maximum inequality, and we should take actionable steps to:"),(0,r.kt)("li",{parentName:"ul"},"Upsample data points where the feature value is barely observed."),(0,r.kt)("li",{parentName:"ul"},"Downsample data points where the feature value is observed much more than others.")))),(0,r.kt)("p",null,(0,r.kt)("strong",{parentName:"p"},"In conclusion:")),(0,r.kt)("ul",null,(0,r.kt)("li",{parentName:"ul"},"These measures provide an indicator of disparity on the data, allowing for users to explore potential mitigations before proceeding to train. 
"),(0,r.kt)("li",{parentName:"ul"},'Users can use these measures to set thresholds on their level of "tolerance" for data representation.'),(0,r.kt)("li",{parentName:"ul"},"Production pipelines can use these measures as baseline for models that require frequent retraining on new data. "),(0,r.kt)("li",{parentName:"ul"},"These measures can also be saved as key metadata for the model/service built and added as part of model cards or transparency notes helping drive overall accountability for the ML service built and its performance across different demographics or sensitive attributes.")))}N.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/391cb159.62c40d70.js b/assets/js/391cb159.62c40d70.js new file mode 100644 index 0000000000..99e0bf3d26 --- /dev/null +++ b/assets/js/391cb159.62c40d70.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1867],{3905:function(e,t,a){a.d(t,{Zo:function(){return m},kt:function(){return h}});var n=a(7294);function r(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function i(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function s(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var l=n.createContext({}),p=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):s(s({},t),e)),a},m=function(e){var t=p(e.components);return n.createElement(l.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var 
a=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,m=o(e,["components","mdxType","originalType","parentName"]),u=p(a),h=r,d=u["".concat(l,".").concat(h)]||u[h]||c[h]||i;return a?n.createElement(d,s(s({ref:t},m),{},{components:a})):n.createElement(d,s({ref:t},m))}));function h(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=a.length,s=new Array(i);s[0]=u;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p\n')),(0,i.kt)("p",null,"For streaming mode, only one Dataset is created per partition, so ",(0,i.kt)("em",{parentName:"p"},"useSingleDataMode")," has no effect. It's effectively always true."),(0,i.kt)("h3",{id:"data-sampling"},"Data Sampling"),(0,i.kt)("p",null,"In order for LightGBM algorithm to work, it must first create a set of bin boundaries for optimization. It does this calculation by\nfirst sampling the data before any training or inferencing starts. (",(0,i.kt)("a",{parentName:"p",href:"https://github.com/Microsoft/LightGBM"},"LightGBM docs"),"). The number of\nsamples to use is set using ",(0,i.kt)("em",{parentName:"p"},"binSampleCount"),", which must be a minimal percent of the data or LightGBM rejects it."),(0,i.kt)("p",null,"For ",(0,i.kt)("em",{parentName:"p"},"bulk")," mode, this sampling is automatically done over the entire data, and each executor uses its own partitions to calculate samples for only\na subset of the features. This distributed sampling can have subtle effects since partitioning can affect the calculated bins.\nAlso, all data is sampled no matter what."),(0,i.kt)("p",null,"For ",(0,i.kt)("em",{parentName:"p"},"streaming")," mode, there are more explicit user controls for this sampling, and it's all done from the driver.\nThe ",(0,i.kt)("em",{parentName:"p"},"samplingMode")," property controls the behavior. 
The efficiency of these methods increases from first to last."),(0,i.kt)("ul",null,(0,i.kt)("li",{parentName:"ul"},(0,i.kt)("em",{parentName:"li"},"global")," - Like bulk mode, the random sample is calculated by iterating over entire data (hence data is traversed twice)"),(0,i.kt)("li",{parentName:"ul"},(0,i.kt)("em",{parentName:"li"},"subset")," - (default) Samples only from the first ",(0,i.kt)("em",{parentName:"li"},"samplingSubsetSize")," elements. Assumes this subset is representative."),(0,i.kt)("li",{parentName:"ul"},(0,i.kt)("em",{parentName:"li"},"fixed")," - There's no random sample. The first ",(0,i.kt)("em",{parentName:"li"},"binSampleSize")," rows are used. Assumes randomized data.\nFor large row counts, ",(0,i.kt)("em",{parentName:"li"},"subset")," and ",(0,i.kt)("em",{parentName:"li"},"fixed")," modes can save a first iteration over the entire data.")),(0,i.kt)("h4",{id:"reference-dataset"},"Reference Dataset"),(0,i.kt)("p",null,"The sampling of the data to calculate bin boundaries happens every ",(0,i.kt)("em",{parentName:"p"},"fit")," call.\nIf repeating a fit many times (for example, hyperparameter tuning), this calculation is duplicated effort."),(0,i.kt)("p",null,"For ",(0,i.kt)("em",{parentName:"p"},"streaming")," mode, there's an optimization that a client can set to use the previously calculated bin boundaries. The\nsampling calculation results in a ",(0,i.kt)("em",{parentName:"p"},"reference dataset"),", which can be reused. After a fit, there will be a ",(0,i.kt)("em",{parentName:"p"},"referenceDataset")," property\non the estimator that was calculated and used for that fit. 
If that is set on the next estimator (or you reuse the same one),\nit will use that instead of resampling the data."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},"from synapse.ml.lightgbm import LightGBMClassifier\nclassifier = LightGBMClassifier(learningRate=0.3,\n numIterations=100,\n numLeaves=31)\nmodel1 = classifier.fit(train)\n\nclassifier.learningRate = 0.4\nmodel2 = classifier.fit(train)\n")),(0,i.kt)("p",null,"The 'model2' call to 'fit' doesn't resample the data and uses the same bin boundaries as 'model1'."),(0,i.kt)("p",null,(0,i.kt)("em",{parentName:"p"},"Caution"),": Some parameters actually affect the bin boundary calculation and require the use of a new reference dataset every time.\nThese parameters include ",(0,i.kt)("em",{parentName:"p"},"isEnableSparse"),", ",(0,i.kt)("em",{parentName:"p"},"useMissing"),", and ",(0,i.kt)("em",{parentName:"p"},"zeroAsMissing")," that you can set from SynapseML. If you manually set\nsome parameters with ",(0,i.kt)("em",{parentName:"p"},"passThroughArgs"),", you should look at LightGBM docs to see if they affect bin boundaries. 
If you're setting\nany parameter that affects bin boundaries and reusing the same estimator, you should set referenceDataset to an empty array between calls."),(0,i.kt)("h3",{id:"barrier-execution-mode"},"Barrier Execution Mode"),(0,i.kt)("p",null,"By default LightGBM uses the regular spark paradigm for launching tasks and communicates with the driver to coordinate task execution.\nThe driver thread aggregates all task host:port information and then communicates the full list back to the workers in order for NetworkInit to be called.\nThis procedure requires the driver to know how many tasks there are, and a mismatch between the expected number of tasks and the actual number causes\nthe initialization to deadlock."),(0,i.kt)("p",null,"If you're experiencing network issues, you can try using Spark's ",(0,i.kt)("em",{parentName:"p"},"barrier")," execution mode. SynapseML provides a ",(0,i.kt)("inlineCode",{parentName:"p"},"UseBarrierExecutionMode")," flag,\nto use Apache Spark's ",(0,i.kt)("inlineCode",{parentName:"p"},"barrier()")," stage to ensure all tasks execute at the same time.\nBarrier execution mode changes the logic to aggregate ",(0,i.kt)("inlineCode",{parentName:"p"},"host:port")," information across all tasks in a synchronized way.\nTo use it in scala, you can call setUseBarrierExecutionMode(true), for example:"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre"},"val lgbm = new LightGBMClassifier()\n .setLabelCol(labelColumn)\n .setObjective(binaryObjective)\n .setUseBarrierExecutionMode(true)\n...\n\n")),(0,i.kt)("p",null,"Note: barrier execution mode can also cause complicated issues, so use it only if needed."))}h.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/391cb159.8796e433.js b/assets/js/391cb159.8796e433.js deleted file mode 100644 index 8fd81e6b52..0000000000 --- a/assets/js/391cb159.8796e433.js +++ /dev/null @@ -1 +0,0 @@ -"use 
strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1867],{3905:function(e,t,a){a.d(t,{Zo:function(){return m},kt:function(){return h}});var n=a(7294);function r(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function i(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function s(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var l=n.createContext({}),p=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):s(s({},t),e)),a},m=function(e){var t=p(e.components);return n.createElement(l.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,m=o(e,["components","mdxType","originalType","parentName"]),u=p(a),h=r,d=u["".concat(l,".").concat(h)]||u[h]||c[h]||i;return a?n.createElement(d,s(s({ref:t},m),{},{components:a})):n.createElement(d,s({ref:t},m))}));function h(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=a.length,s=new Array(i);s[0]=u;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p\n')),(0,i.kt)("p",null,"For streaming mode, only one Dataset is created per partition, so ",(0,i.kt)("em",{parentName:"p"},"useSingleDataMode")," has no effect. It's effectively always true."),(0,i.kt)("h3",{id:"data-sampling"},"Data Sampling"),(0,i.kt)("p",null,"In order for LightGBM algorithm to work, it must first create a set of bin boundaries for optimization. 
It does this calculation by\nfirst sampling the data before any training or inferencing starts. (",(0,i.kt)("a",{parentName:"p",href:"https://github.com/Microsoft/LightGBM"},"LightGBM docs"),"). The number of\nsamples to use is set using ",(0,i.kt)("em",{parentName:"p"},"binSampleCount"),", which must be a minimal percent of the data or LightGBM rejects it."),(0,i.kt)("p",null,"For ",(0,i.kt)("em",{parentName:"p"},"bulk")," mode, this sampling is automatically done over the entire data, and each executor uses its own partitions to calculate samples for only\na subset of the features. This distributed sampling can have subtle effects since partitioning can affect the calculated bins.\nAlso, all data is sampled no matter what."),(0,i.kt)("p",null,"For ",(0,i.kt)("em",{parentName:"p"},"streaming")," mode, there are more explicit user controls for this sampling, and it's all done from the driver.\nThe ",(0,i.kt)("em",{parentName:"p"},"samplingMode")," property controls the behavior. The efficiency of these methods increases from first to last."),(0,i.kt)("ul",null,(0,i.kt)("li",{parentName:"ul"},(0,i.kt)("em",{parentName:"li"},"global")," - Like bulk mode, the random sample is calculated by iterating over entire data (hence data is traversed twice)"),(0,i.kt)("li",{parentName:"ul"},(0,i.kt)("em",{parentName:"li"},"subset")," - (default) Samples only from the first ",(0,i.kt)("em",{parentName:"li"},"samplingSubsetSize")," elements. Assumes this subset is representative."),(0,i.kt)("li",{parentName:"ul"},(0,i.kt)("em",{parentName:"li"},"fixed")," - There's no random sample. The first ",(0,i.kt)("em",{parentName:"li"},"binSampleSize")," rows are used. 
Assumes randomized data.\nFor large row counts, ",(0,i.kt)("em",{parentName:"li"},"subset")," and ",(0,i.kt)("em",{parentName:"li"},"fixed")," modes can save a first iteration over the entire data.")),(0,i.kt)("h4",{id:"reference-dataset"},"Reference Dataset"),(0,i.kt)("p",null,"The sampling of the data to calculate bin boundaries happens every ",(0,i.kt)("em",{parentName:"p"},"fit")," call.\nIf repeating a fit many times (for example, hyperparameter tuning), this calculation is duplicated effort."),(0,i.kt)("p",null,"For ",(0,i.kt)("em",{parentName:"p"},"streaming")," mode, there's an optimization that a client can set to use the previously calculated bin boundaries. The\nsampling calculation results in a ",(0,i.kt)("em",{parentName:"p"},"reference dataset"),", which can be reused. After a fit, there will be a ",(0,i.kt)("em",{parentName:"p"},"referenceDataset")," property\non the estimator that was calculated and used for that fit. If that is set on the next estimator (or you reuse the same one),\nit will use that instead of resampling the data."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},"from synapse.ml.lightgbm import LightGBMClassifier\nclassifier = LightGBMClassifier(learningRate=0.3,\n numIterations=100,\n numLeaves=31)\nmodel1 = classifier.fit(train)\n\nclassifier.learningRate = 0.4\nmodel2 = classifier.fit(train)\n")),(0,i.kt)("p",null,"The 'model2' call to 'fit' doesn't resample the data and uses the same bin boundaries as 'model1'."),(0,i.kt)("p",null,(0,i.kt)("em",{parentName:"p"},"Caution"),": Some parameters actually affect the bin boundary calculation and require the use of a new reference dataset every time.\nThese parameters include ",(0,i.kt)("em",{parentName:"p"},"isEnableSparse"),", ",(0,i.kt)("em",{parentName:"p"},"useMissing"),", and ",(0,i.kt)("em",{parentName:"p"},"zeroAsMissing")," that you can set from SynapseML. 
If you manually set\nsome parameters with ",(0,i.kt)("em",{parentName:"p"},"passThroughArgs"),", you should look at LightGBM docs to see if they affect bin boundaries. If you're setting\nany parameter that affects bin boundaries and reusing the same estimator, you should set referenceDataset to an empty array between calls."),(0,i.kt)("h3",{id:"barrier-execution-mode"},"Barrier Execution Mode"),(0,i.kt)("p",null,"By default LightGBM uses the regular spark paradigm for launching tasks and communicates with the driver to coordinate task execution.\nThe driver thread aggregates all task host:port information and then communicates the full list back to the workers in order for NetworkInit to be called.\nThis procedure requires the driver to know how many tasks there are, and a mismatch between the expected number of tasks and the actual number causes\nthe initialization to deadlock."),(0,i.kt)("p",null,"If you're experiencing network issues, you can try using Spark's ",(0,i.kt)("em",{parentName:"p"},"barrier")," execution mode. 
SynapseML provides a ",(0,i.kt)("inlineCode",{parentName:"p"},"UseBarrierExecutionMode")," flag,\nto use Apache Spark's ",(0,i.kt)("inlineCode",{parentName:"p"},"barrier()")," stage to ensure all tasks execute at the same time.\nBarrier execution mode changes the logic to aggregate ",(0,i.kt)("inlineCode",{parentName:"p"},"host:port")," information across all tasks in a synchronized way.\nTo use it in scala, you can call setUseBarrierExecutionMode(true), for example:"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre"},"val lgbm = new LightGBMClassifier()\n .setLabelCol(labelColumn)\n .setObjective(binaryObjective)\n .setUseBarrierExecutionMode(true)\n...\n\n")),(0,i.kt)("p",null,"Note: barrier execution mode can also cause complicated issues, so use it only if needed."))}h.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/3a4f9d93.34ed0cdd.js b/assets/js/3a4f9d93.34ed0cdd.js new file mode 100644 index 0000000000..7f2a493577 --- /dev/null +++ b/assets/js/3a4f9d93.34ed0cdd.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[5487],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return m}});var r=n(7294);function o(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function a(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function i(e){for(var t=1;t=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=p(e.components);return 
r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(n),m=o,f=d["".concat(s,".").concat(m)]||d[m]||u[m]||a;return n?r.createElement(f,i(i({ref:t},c),{},{components:n})):r.createElement(f,i({ref:t},c))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:o,i[1]=l;for(var p=2;p=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(n),m=o,f=d["".concat(s,".").concat(m)]||d[m]||u[m]||a;return n?r.createElement(f,i(i({ref:t},c),{},{components:n})):r.createElement(f,i({ref:t},c))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:o,i[1]=l;for(var p=2;p=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var c=r.createContext({}),i=function(e){var 
n=r.useContext(c),t=n;return e&&(t="function"==typeof e?e(n):o(o({},n),e)),t},s=function(e){var n=i(e.components);return r.createElement(c.Provider,{value:n},e.children)},p={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},m=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,c=e.parentName,s=u(e,["components","mdxType","originalType","parentName"]),m=i(t),f=a,d=m["".concat(c,".").concat(f)]||m[f]||p[f]||l;return t?r.createElement(d,o(o({ref:n},s),{},{components:t})):r.createElement(d,o({ref:n},s))}));function f(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,o=new Array(l);o[0]=m;var u={};for(var c in n)hasOwnProperty.call(n,c)&&(u[c]=n[c]);u.originalType=e,u.mdxType="string"==typeof e?e:a,o[1]=u;for(var i=2;i child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:t.filter(Boolean))?n:[]}(e).map((function(e){var n=e.props;return{value:n.value,label:n.label,attributes:n.attributes,default:n.default}}))}function m(e){var n=e.values,t=e.children;return(0,a.useMemo)((function(){var e=null!=n?n:p(t);return function(e){var n=(0,i.l)(e,(function(e,n){return e.value===n.value}));if(n.length>0)throw new Error('Docusaurus error: Duplicate values "'+n.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[n,t])}function f(e){var n=e.value;return e.tabValues.some((function(e){return e.value===n}))}function d(e){var n=e.queryString,t=void 0!==n&&n,r=e.groupId,l=(0,u.k6)(),o=function(e){var n=e.queryString,t=void 0!==n&&n,r=e.groupId;if("string"==typeof t)return t;if(!1===t)return null;if(!0===t&&!r)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. 
You can also provide an explicit value such as queryString="my-search-param".');return null!=r?r:null}({queryString:t,groupId:r});return[(0,c._X)(o),(0,a.useCallback)((function(e){if(o){var n=new URLSearchParams(l.location.search);n.set(o,e),l.replace(Object.assign({},l.location,{search:n.toString()}))}}),[o,l])]}function b(e){var n,t,r,l,o=e.defaultValue,u=e.queryString,c=void 0!==u&&u,i=e.groupId,p=m(e),b=(0,a.useState)((function(){return function(e){var n,t=e.defaultValue,r=e.tabValues;if(0===r.length)throw new Error("Docusaurus error: the component requires at least one children component");if(t){if(!f({value:t,tabValues:r}))throw new Error('Docusaurus error: The has a defaultValue "'+t+'" but none of its children has the corresponding value. Available values are: '+r.map((function(e){return e.value})).join(", ")+". If you intend to show no default tab, use defaultValue={null} instead.");return t}var a=null!=(n=r.find((function(e){return e.default})))?n:r[0];if(!a)throw new Error("Unexpected error: 0 tabValues");return a.value}({defaultValue:o,tabValues:p})})),v=b[0],y=b[1],h=d({queryString:c,groupId:i}),g=h[0],E=h[1],k=(n=function(e){return e?"docusaurus.tab."+e:null}({groupId:i}.groupId),t=(0,s.Nk)(n),r=t[0],l=t[1],[r,(0,a.useCallback)((function(e){n&&l.set(e)}),[n,l])]),O=k[0],_=k[1],w=function(){var e=null!=g?g:O;return f({value:e,tabValues:p})?e:null}();return(0,a.useLayoutEffect)((function(){w&&y(w)}),[w]),{selectedValue:v,selectValue:(0,a.useCallback)((function(e){if(!f({value:e,tabValues:p}))throw new Error("Can't select invalid tab value="+e);y(e),E(e),_(e)}),[E,_,p]),tabValues:p}}var v=t(2389),y="tabList__CuJ",h="tabItem_LNqP";function g(e){var n=e.className,t=e.block,u=e.selectedValue,c=e.selectValue,i=e.tabValues,s=[],p=(0,o.o5)().blockElementScrollPositionUntilNextRender,m=function(e){var n=e.currentTarget,t=s.indexOf(n),r=i[t].value;r!==u&&(p(n),c(r))},f=function(e){var n,t=null;switch(e.key){case"Enter":m(e);break;case"ArrowRight":var 
r,a=s.indexOf(e.currentTarget)+1;t=null!=(r=s[a])?r:s[0];break;case"ArrowLeft":var l,o=s.indexOf(e.currentTarget)-1;t=null!=(l=s[o])?l:s[s.length-1]}null==(n=t)||n.focus()};return a.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,l.Z)("tabs",{"tabs--block":t},n)},i.map((function(e){var n=e.value,t=e.label,o=e.attributes;return a.createElement("li",(0,r.Z)({role:"tab",tabIndex:u===n?0:-1,"aria-selected":u===n,key:n,ref:function(e){return s.push(e)},onKeyDown:f,onClick:m},o,{className:(0,l.Z)("tabs__item",h,null==o?void 0:o.className,{"tabs__item--active":u===n})}),null!=t?t:n)})))}function E(e){var n=e.lazy,t=e.children,r=e.selectedValue,l=(Array.isArray(t)?t:[t]).filter(Boolean);if(n){var o=l.find((function(e){return e.props.value===r}));return o?(0,a.cloneElement)(o,{className:"margin-top--md"}):null}return a.createElement("div",{className:"margin-top--md"},l.map((function(e,n){return(0,a.cloneElement)(e,{key:n,hidden:e.props.value!==r})})))}function k(e){var n=b(e);return a.createElement("div",{className:(0,l.Z)("tabs-container",y)},a.createElement(g,(0,r.Z)({},e,n)),a.createElement(E,(0,r.Z)({},e,n)))}function O(e){var n=(0,v.Z)();return a.createElement(k,(0,r.Z)({key:String(n)},e))}},1989:function(e,n,t){var r=t(7294),a=t(2263);n.Z=function(e){var n=e.className,t=e.py,l=e.scala,o=e.csharp,u=e.sourceLink,c=(0,a.Z)().siteConfig.customFields.version,i="https://mmlspark.blob.core.windows.net/docs/"+c+"/pyspark/"+t,s="https://mmlspark.blob.core.windows.net/docs/"+c+"/scala/"+l,p="https://mmlspark.blob.core.windows.net/docs/"+c+"/dotnet/"+o;return r.createElement("table",null,r.createElement("tbody",null,r.createElement("tr",null,r.createElement("td",null,r.createElement("strong",null,"Python API: "),r.createElement("a",{href:i},n)),r.createElement("td",null,r.createElement("strong",null,"Scala API: "),r.createElement("a",{href:s},n)),r.createElement("td",null,r.createElement("strong",null,".NET API: 
"),r.createElement("a",{href:p},n)),r.createElement("td",null,r.createElement("strong",null,"Source: "),r.createElement("a",{href:u},n)))))}},9100:function(e,n,t){t.r(n),t.d(n,{assets:function(){return y},contentTitle:function(){return b},default:function(){return E},frontMatter:function(){return d},metadata:function(){return v},toc:function(){return h}});var r=t(3117),a=t(102),l=(t(7294),t(3905)),o=t(4866),u=t(5162),c=t(1989),i=["components"],s=[{value:"ONNXModel",id:"onnxmodel",level:2}],p={toc:s};function m(e){var n=e.components,t=(0,a.Z)(e,i);return(0,l.kt)("wrapper",(0,r.Z)({},p,t,{components:n,mdxType:"MDXLayout"}),(0,l.kt)("h2",{id:"onnxmodel"},"ONNXModel"),(0,l.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(u.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-py"},'from synapse.ml.onnx import ONNXModel\n\nmodel_path = "PUT_YOUR_MODEL_PATH"\nonnx_ml = (ONNXModel()\n .setModelLocation(model_path)\n .setFeedDict({"float_input": "features"})\n .setFetchDict({"prediction": "output_label", "rawProbability": "output_probability"}))\n'))),(0,l.kt)(u.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.onnx._\n\nval model_path = "PUT_YOUR_MODEL_PATH"\nval onnx_ml = (new ONNXModel()\n .setModelLocation(model_path)\n .setFeedDict(Map("float_input" -> "features"))\n .setFetchDict(Map("prediction" -> "output_label", "rawProbability" -> 
"output_probability")))\n')))),(0,l.kt)(c.Z,{className:"ONNXModel",py:"synapse.ml.onnx.html#module-synapse.ml.onnx.ONNXModel",scala:"com/microsoft/azure/synapse/ml/onnx/ONNXModel.html",csharp:"classSynapse_1_1ML_1_1Onnx_1_1ONNXModel.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/deep-learning/src/main/scala/com/microsoft/azure/synapse/ml/onnx/ONNXModel.scala",mdxType:"DocTable"}))}m.isMDXComponent=!0;var f=["components"],d={title:"Deep Learning",sidebar_label:"Deep Learning"},b=void 0,v={unversionedId:"Quick Examples/transformers/transformers_deep_learning",id:"version-0.11.3/Quick Examples/transformers/transformers_deep_learning",title:"Deep Learning",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/transformers/transformers_deep_learning.md",sourceDirName:"Quick Examples/transformers",slug:"/Quick Examples/transformers/transformers_deep_learning",permalink:"/SynapseML/docs/Quick Examples/transformers/transformers_deep_learning",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Deep Learning",sidebar_label:"Deep Learning"}},y={},h=[].concat(s),g={toc:h};function E(e){var n=e.components,t=(0,a.Z)(e,f);return(0,l.kt)("wrapper",(0,r.Z)({},g,t,{components:n,mdxType:"MDXLayout"}),(0,l.kt)(m,{mdxType:"ONNXModel"}))}E.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1103],{3905:function(e,n,t){t.d(n,{Zo:function(){return s},kt:function(){return f}});var r=t(7294);function a(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function l(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);n&&(r=r.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,r)}return t}function o(e){for(var n=1;n=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var 
l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var c=r.createContext({}),i=function(e){var n=r.useContext(c),t=n;return e&&(t="function"==typeof e?e(n):o(o({},n),e)),t},s=function(e){var n=i(e.components);return r.createElement(c.Provider,{value:n},e.children)},p={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},m=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,c=e.parentName,s=u(e,["components","mdxType","originalType","parentName"]),m=i(t),f=a,d=m["".concat(c,".").concat(f)]||m[f]||p[f]||l;return t?r.createElement(d,o(o({ref:n},s),{},{components:t})):r.createElement(d,o({ref:n},s))}));function f(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,o=new Array(l);o[0]=m;var u={};for(var c in n)hasOwnProperty.call(n,c)&&(u[c]=n[c]);u.originalType=e,u.mdxType="string"==typeof e?e:a,o[1]=u;for(var i=2;i child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:t.filter(Boolean))?n:[]}(e).map((function(e){var n=e.props;return{value:n.value,label:n.label,attributes:n.attributes,default:n.default}}))}function m(e){var n=e.values,t=e.children;return(0,a.useMemo)((function(){var e=null!=n?n:p(t);return function(e){var n=(0,i.l)(e,(function(e,n){return e.value===n.value}));if(n.length>0)throw new Error('Docusaurus error: Duplicate values "'+n.map((function(e){return e.value})).join(", ")+'" found in . 
Every value needs to be unique.')}(e),e}),[n,t])}function f(e){var n=e.value;return e.tabValues.some((function(e){return e.value===n}))}function d(e){var n=e.queryString,t=void 0!==n&&n,r=e.groupId,l=(0,u.k6)(),o=function(e){var n=e.queryString,t=void 0!==n&&n,r=e.groupId;if("string"==typeof t)return t;if(!1===t)return null;if(!0===t&&!r)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=r?r:null}({queryString:t,groupId:r});return[(0,c._X)(o),(0,a.useCallback)((function(e){if(o){var n=new URLSearchParams(l.location.search);n.set(o,e),l.replace(Object.assign({},l.location,{search:n.toString()}))}}),[o,l])]}function b(e){var n,t,r,l,o=e.defaultValue,u=e.queryString,c=void 0!==u&&u,i=e.groupId,p=m(e),b=(0,a.useState)((function(){return function(e){var n,t=e.defaultValue,r=e.tabValues;if(0===r.length)throw new Error("Docusaurus error: the component requires at least one children component");if(t){if(!f({value:t,tabValues:r}))throw new Error('Docusaurus error: The has a defaultValue "'+t+'" but none of its children has the corresponding value. Available values are: '+r.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return t}var a=null!=(n=r.find((function(e){return e.default})))?n:r[0];if(!a)throw new Error("Unexpected error: 0 tabValues");return a.value}({defaultValue:o,tabValues:p})})),v=b[0],y=b[1],h=d({queryString:c,groupId:i}),g=h[0],E=h[1],k=(n=function(e){return e?"docusaurus.tab."+e:null}({groupId:i}.groupId),t=(0,s.Nk)(n),r=t[0],l=t[1],[r,(0,a.useCallback)((function(e){n&&l.set(e)}),[n,l])]),O=k[0],_=k[1],w=function(){var e=null!=g?g:O;return f({value:e,tabValues:p})?e:null}();return(0,a.useLayoutEffect)((function(){w&&y(w)}),[w]),{selectedValue:v,selectValue:(0,a.useCallback)((function(e){if(!f({value:e,tabValues:p}))throw new Error("Can't select invalid tab value="+e);y(e),E(e),_(e)}),[E,_,p]),tabValues:p}}var v=t(2389),y="tabList__CuJ",h="tabItem_LNqP";function g(e){var n=e.className,t=e.block,u=e.selectedValue,c=e.selectValue,i=e.tabValues,s=[],p=(0,o.o5)().blockElementScrollPositionUntilNextRender,m=function(e){var n=e.currentTarget,t=s.indexOf(n),r=i[t].value;r!==u&&(p(n),c(r))},f=function(e){var n,t=null;switch(e.key){case"Enter":m(e);break;case"ArrowRight":var r,a=s.indexOf(e.currentTarget)+1;t=null!=(r=s[a])?r:s[0];break;case"ArrowLeft":var l,o=s.indexOf(e.currentTarget)-1;t=null!=(l=s[o])?l:s[s.length-1]}null==(n=t)||n.focus()};return a.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,l.Z)("tabs",{"tabs--block":t},n)},i.map((function(e){var n=e.value,t=e.label,o=e.attributes;return a.createElement("li",(0,r.Z)({role:"tab",tabIndex:u===n?0:-1,"aria-selected":u===n,key:n,ref:function(e){return s.push(e)},onKeyDown:f,onClick:m},o,{className:(0,l.Z)("tabs__item",h,null==o?void 0:o.className,{"tabs__item--active":u===n})}),null!=t?t:n)})))}function E(e){var n=e.lazy,t=e.children,r=e.selectedValue,l=(Array.isArray(t)?t:[t]).filter(Boolean);if(n){var o=l.find((function(e){return e.props.value===r}));return 
o?(0,a.cloneElement)(o,{className:"margin-top--md"}):null}return a.createElement("div",{className:"margin-top--md"},l.map((function(e,n){return(0,a.cloneElement)(e,{key:n,hidden:e.props.value!==r})})))}function k(e){var n=b(e);return a.createElement("div",{className:(0,l.Z)("tabs-container",y)},a.createElement(g,(0,r.Z)({},e,n)),a.createElement(E,(0,r.Z)({},e,n)))}function O(e){var n=(0,v.Z)();return a.createElement(k,(0,r.Z)({key:String(n)},e))}},1989:function(e,n,t){var r=t(7294),a=t(2263);n.Z=function(e){var n=e.className,t=e.py,l=e.scala,o=e.csharp,u=e.sourceLink,c=(0,a.Z)().siteConfig.customFields.version,i="https://mmlspark.blob.core.windows.net/docs/"+c+"/pyspark/"+t,s="https://mmlspark.blob.core.windows.net/docs/"+c+"/scala/"+l,p="https://mmlspark.blob.core.windows.net/docs/"+c+"/dotnet/"+o;return r.createElement("table",null,r.createElement("tbody",null,r.createElement("tr",null,r.createElement("td",null,r.createElement("strong",null,"Python API: "),r.createElement("a",{href:i},n)),r.createElement("td",null,r.createElement("strong",null,"Scala API: "),r.createElement("a",{href:s},n)),r.createElement("td",null,r.createElement("strong",null,".NET API: "),r.createElement("a",{href:p},n)),r.createElement("td",null,r.createElement("strong",null,"Source: "),r.createElement("a",{href:u},n)))))}},8105:function(e,n,t){t.r(n),t.d(n,{assets:function(){return y},contentTitle:function(){return b},default:function(){return E},frontMatter:function(){return d},metadata:function(){return v},toc:function(){return h}});var r=t(3117),a=t(102),l=(t(7294),t(3905)),o=t(4866),u=t(5162),c=t(1989),i=["components"],s=[{value:"ONNXModel",id:"onnxmodel",level:2}],p={toc:s};function m(e){var 
n=e.components,t=(0,a.Z)(e,i);return(0,l.kt)("wrapper",(0,r.Z)({},p,t,{components:n,mdxType:"MDXLayout"}),(0,l.kt)("h2",{id:"onnxmodel"},"ONNXModel"),(0,l.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(u.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-py"},'from synapse.ml.onnx import ONNXModel\n\nmodel_path = "PUT_YOUR_MODEL_PATH"\nonnx_ml = (ONNXModel()\n .setModelLocation(model_path)\n .setFeedDict({"float_input": "features"})\n .setFetchDict({"prediction": "output_label", "rawProbability": "output_probability"}))\n'))),(0,l.kt)(u.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.onnx._\n\nval model_path = "PUT_YOUR_MODEL_PATH"\nval onnx_ml = (new ONNXModel()\n .setModelLocation(model_path)\n .setFeedDict(Map("float_input" -> "features"))\n .setFetchDict(Map("prediction" -> "output_label", "rawProbability" -> "output_probability")))\n')))),(0,l.kt)(c.Z,{className:"ONNXModel",py:"synapse.ml.onnx.html#module-synapse.ml.onnx.ONNXModel",scala:"com/microsoft/azure/synapse/ml/onnx/ONNXModel.html",csharp:"classSynapse_1_1ML_1_1Onnx_1_1ONNXModel.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/deep-learning/src/main/scala/com/microsoft/azure/synapse/ml/onnx/ONNXModel.scala",mdxType:"DocTable"}))}m.isMDXComponent=!0;var f=["components"],d={title:"Deep Learning",sidebar_label:"Deep Learning"},b=void 0,v={unversionedId:"Quick Examples/transformers/transformers_deep_learning",id:"version-0.11.4/Quick Examples/transformers/transformers_deep_learning",title:"Deep Learning",description:"",source:"@site/versioned_docs/version-0.11.4/Quick Examples/transformers/transformers_deep_learning.md",sourceDirName:"Quick Examples/transformers",slug:"/Quick 
Examples/transformers/transformers_deep_learning",permalink:"/SynapseML/docs/Quick Examples/transformers/transformers_deep_learning",draft:!1,tags:[],version:"0.11.4",frontMatter:{title:"Deep Learning",sidebar_label:"Deep Learning"}},y={},h=[].concat(s),g={toc:h};function E(e){var n=e.components,t=(0,a.Z)(e,f);return(0,l.kt)("wrapper",(0,r.Z)({},g,t,{components:n,mdxType:"MDXLayout"}),(0,l.kt)(m,{mdxType:"ONNXModel"}))}E.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/d39aa6d3.baab4dfe.js b/assets/js/3c1f4383.02695154.js similarity index 90% rename from assets/js/d39aa6d3.baab4dfe.js rename to assets/js/3c1f4383.02695154.js index b60c3ea97e..0aad77186f 100644 --- a/assets/js/d39aa6d3.baab4dfe.js +++ b/assets/js/3c1f4383.02695154.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[7930],{3905:function(e,t,n){n.d(t,{Zo:function(){return u},kt:function(){return m}});var r=n(7294);function o(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function l(e){for(var t=1;t=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var c=r.createContext({}),s=function(e){var t=r.useContext(c),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},u=function(e){var t=s(e.components);return r.createElement(c.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},f=r.forwardRef((function(e,t){var 
n=e.components,o=e.mdxType,i=e.originalType,c=e.parentName,u=a(e,["components","mdxType","originalType","parentName"]),f=s(n),m=o,y=f["".concat(c,".").concat(m)]||f[m]||p[m]||i;return n?r.createElement(y,l(l({ref:t},u),{},{components:n})):r.createElement(y,l({ref:t},u))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var i=n.length,l=new Array(i);l[0]=f;var a={};for(var c in t)hasOwnProperty.call(t,c)&&(a[c]=t[c]);a.originalType=e,a.mdxType="string"==typeof e?e:o,l[1]=a;for(var s=2;s=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var c=r.createContext({}),s=function(e){var t=r.useContext(c),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},u=function(e){var t=s(e.components);return r.createElement(c.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},f=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,i=e.originalType,c=e.parentName,u=a(e,["components","mdxType","originalType","parentName"]),f=s(n),m=o,y=f["".concat(c,".").concat(m)]||f[m]||p[m]||i;return n?r.createElement(y,l(l({ref:t},u),{},{components:n})):r.createElement(y,l({ref:t},u))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var i=n.length,l=new Array(i);l[0]=f;var a={};for(var c in t)hasOwnProperty.call(t,c)&&(a[c]=t[c]);a.originalType=e,a.mdxType="string"==typeof e?e:o,l[1]=a;for(var s=2;s=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=p(e.components);return 
r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(n),m=o,f=d["".concat(s,".").concat(m)]||d[m]||u[m]||a;return n?r.createElement(f,i(i({ref:t},c),{},{components:n})):r.createElement(f,i({ref:t},c))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:o,i[1]=l;for(var p=2;p=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(n),m=o,f=d["".concat(s,".").concat(m)]||d[m]||u[m]||a;return n?r.createElement(f,i(i({ref:t},c),{},{components:n})):r.createElement(f,i({ref:t},c))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:o,i[1]=l;for(var p=2;p=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var i=r.createContext({}),p=function(e){var 
n=r.useContext(i),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},u=function(e){var n=p(e.components);return r.createElement(i.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},d=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,i=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),d=p(t),m=a,f=d["".concat(i,".").concat(m)]||d[m]||c[m]||l;return t?r.createElement(f,s(s({ref:n},u),{},{components:t})):r.createElement(f,s({ref:n},u))}));function m(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,s=new Array(l);s[0]=d;var o={};for(var i in n)hasOwnProperty.call(n,i)&&(o[i]=n[i]);o.originalType=e,o.mdxType="string"==typeof e?e:a,s[1]=o;for(var p=2;p [?? x 4]\n# Database: spark_connection\n eruptions waiting eruptions_output waiting_output\n \n 1 3.600 79 3.600 79\n 2 1.800 54 1.800 54\n 3 3.333 74 3.333 74\n 4 2.283 62 2.283 62\n 5 4.533 85 4.533 85\n 6 2.883 55 2.883 55\n 7 4.700 88 4.700 88\n 8 3.600 85 3.600 85\n 9 1.950 51 1.950 51\n 10 4.350 85 4.350 85\n # ... 
with more rows\n...\n")),(0,l.kt)("h2",{id:"azure-databricks"},"Azure Databricks"),(0,l.kt)("p",null,'In Azure Databricks, you can install devtools and the spark package from URL\nand then use spark_connect with method = "databricks":'),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'install.packages("devtools")\ndevtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-0.11.3.zip")\nlibrary(sparklyr)\nlibrary(dplyr)\nsc <- spark_connect(method = "databricks")\nfaithful_df <- copy_to(sc, faithful)\nunfit_model = ml_light_gbmregressor(sc, maxDepth=20, featuresCol="waiting", labelCol="eruptions", numIterations=10, unfit.model=TRUE)\nml_train_regressor(faithful_df, labelCol="eruptions", unfit_model)\n')),(0,l.kt)("h2",{id:"building-from-source"},"Building from Source"),(0,l.kt)("p",null,"Our R bindings are built as part of the ",(0,l.kt)("a",{parentName:"p",href:"../Developer%20Setup"},"normal build\nprocess"),". To get a quick build, start at the root\nof the synapseml directory, and find the generated files. 
For instance,\nto find the R files for deep-learning, run"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-bash"},"sbt packageR\nls ./deep-learning/target/scala-2.12/generated/src/R/synapseml/R\n")),(0,l.kt)("p",null,"You can then run R in a terminal and install the above files directly:"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'...\ndevtools::install_local("./deep-learning/target/scala-2.12/generated/src/R/synapseml/R")\n...\n')))}m.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[5437],{3905:function(e,n,t){t.d(n,{Zo:function(){return u},kt:function(){return m}});var r=t(7294);function a(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function l(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);n&&(r=r.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,r)}return t}function s(e){for(var n=1;n=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var i=r.createContext({}),p=function(e){var n=r.useContext(i),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},u=function(e){var n=p(e.components);return r.createElement(i.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},d=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,i=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),d=p(t),m=a,f=d["".concat(i,".").concat(m)]||d[m]||c[m]||l;return t?r.createElement(f,s(s({ref:n},u),{},{components:t})):r.createElement(f,s({ref:n},u))}));function m(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var 
l=t.length,s=new Array(l);s[0]=d;var o={};for(var i in n)hasOwnProperty.call(n,i)&&(o[i]=n[i]);o.originalType=e,o.mdxType="string"==typeof e?e:a,s[1]=o;for(var p=2;p [?? x 4]\n# Database: spark_connection\n eruptions waiting eruptions_output waiting_output\n \n 1 3.600 79 3.600 79\n 2 1.800 54 1.800 54\n 3 3.333 74 3.333 74\n 4 2.283 62 2.283 62\n 5 4.533 85 4.533 85\n 6 2.883 55 2.883 55\n 7 4.700 88 4.700 88\n 8 3.600 85 3.600 85\n 9 1.950 51 1.950 51\n 10 4.350 85 4.350 85\n # ... with more rows\n...\n")),(0,l.kt)("h2",{id:"azure-databricks"},"Azure Databricks"),(0,l.kt)("p",null,'In Azure Databricks, you can install devtools and the spark package from URL\nand then use spark_connect with method = "databricks":'),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'install.packages("devtools")\ndevtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-0.11.4.zip")\nlibrary(sparklyr)\nlibrary(dplyr)\nsc <- spark_connect(method = "databricks")\nfaithful_df <- copy_to(sc, faithful)\nunfit_model = ml_light_gbmregressor(sc, maxDepth=20, featuresCol="waiting", labelCol="eruptions", numIterations=10, unfit.model=TRUE)\nml_train_regressor(faithful_df, labelCol="eruptions", unfit_model)\n')),(0,l.kt)("h2",{id:"building-from-source"},"Building from Source"),(0,l.kt)("p",null,"Our R bindings are built as part of the ",(0,l.kt)("a",{parentName:"p",href:"../Developer%20Setup"},"normal build\nprocess"),". To get a quick build, start at the root\nof the synapseml directory, and find the generated files. 
For instance,\nto find the R files for deep-learning, run"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-bash"},"sbt packageR\nls ./deep-learning/target/scala-2.12/generated/src/R/synapseml/R\n")),(0,l.kt)("p",null,"You can then run R in a terminal and install the above files directly:"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'...\ndevtools::install_local("./deep-learning/target/scala-2.12/generated/src/R/synapseml/R")\n...\n')))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/47bbce76.1d7531df.js b/assets/js/47bbce76.1d7531df.js deleted file mode 100644 index e8a0296eb8..0000000000 --- a/assets/js/47bbce76.1d7531df.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[464],{3905:function(e,t,n){n.d(t,{Zo:function(){return u},kt:function(){return p}});var r=n(7294);function i(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function s(e){for(var t=1;t=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},u=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},g={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var 
n=e.components,i=e.mdxType,o=e.originalType,l=e.parentName,u=a(e,["components","mdxType","originalType","parentName"]),m=c(n),p=i,f=m["".concat(l,".").concat(p)]||m[p]||g[p]||o;return n?r.createElement(f,s(s({ref:t},u),{},{components:n})):r.createElement(f,s({ref:t},u))}));function p(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var o=n.length,s=new Array(o);s[0]=m;var a={};for(var l in t)hasOwnProperty.call(t,l)&&(a[l]=t[l]);a.originalType=e,a.mdxType="string"==typeof e?e:i,s[1]=a;for(var c=2;c=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},u=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},g={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var n=e.components,i=e.mdxType,o=e.originalType,l=e.parentName,u=a(e,["components","mdxType","originalType","parentName"]),m=c(n),p=i,f=m["".concat(l,".").concat(p)]||m[p]||g[p]||o;return n?r.createElement(f,s(s({ref:t},u),{},{components:n})):r.createElement(f,s({ref:t},u))}));function p(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var o=n.length,s=new Array(o);s[0]=m;var a={};for(var l in t)hasOwnProperty.call(t,l)&&(a[l]=t[l]);a.originalType=e,a.mdxType="string"==typeof e?e:i,s[1]=a;for(var c=2;c=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var c=r.createContext({}),i=function(e){var n=r.useContext(c),t=n;return e&&(t="function"==typeof e?e(n):o(o({},n),e)),t},s=function(e){var n=i(e.components);return 
r.createElement(c.Provider,{value:n},e.children)},p={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},m=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,c=e.parentName,s=u(e,["components","mdxType","originalType","parentName"]),m=i(t),f=a,d=m["".concat(c,".").concat(f)]||m[f]||p[f]||l;return t?r.createElement(d,o(o({ref:n},s),{},{components:t})):r.createElement(d,o({ref:n},s))}));function f(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,o=new Array(l);o[0]=m;var u={};for(var c in n)hasOwnProperty.call(n,c)&&(u[c]=n[c]);u.originalType=e,u.mdxType="string"==typeof e?e:a,o[1]=u;for(var i=2;i child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:t.filter(Boolean))?n:[]}(e).map((function(e){var n=e.props;return{value:n.value,label:n.label,attributes:n.attributes,default:n.default}}))}function m(e){var n=e.values,t=e.children;return(0,a.useMemo)((function(){var e=null!=n?n:p(t);return function(e){var n=(0,i.l)(e,(function(e,n){return e.value===n.value}));if(n.length>0)throw new Error('Docusaurus error: Duplicate values "'+n.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[n,t])}function f(e){var n=e.value;return e.tabValues.some((function(e){return e.value===n}))}function d(e){var n=e.queryString,t=void 0!==n&&n,r=e.groupId,l=(0,u.k6)(),o=function(e){var n=e.queryString,t=void 0!==n&&n,r=e.groupId;if("string"==typeof t)return t;if(!1===t)return null;if(!0===t&&!r)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. 
You can also provide an explicit value such as queryString="my-search-param".');return null!=r?r:null}({queryString:t,groupId:r});return[(0,c._X)(o),(0,a.useCallback)((function(e){if(o){var n=new URLSearchParams(l.location.search);n.set(o,e),l.replace(Object.assign({},l.location,{search:n.toString()}))}}),[o,l])]}function b(e){var n,t,r,l,o=e.defaultValue,u=e.queryString,c=void 0!==u&&u,i=e.groupId,p=m(e),b=(0,a.useState)((function(){return function(e){var n,t=e.defaultValue,r=e.tabValues;if(0===r.length)throw new Error("Docusaurus error: the component requires at least one children component");if(t){if(!f({value:t,tabValues:r}))throw new Error('Docusaurus error: The has a defaultValue "'+t+'" but none of its children has the corresponding value. Available values are: '+r.map((function(e){return e.value})).join(", ")+". If you intend to show no default tab, use defaultValue={null} instead.");return t}var a=null!=(n=r.find((function(e){return e.default})))?n:r[0];if(!a)throw new Error("Unexpected error: 0 tabValues");return a.value}({defaultValue:o,tabValues:p})})),v=b[0],y=b[1],h=d({queryString:c,groupId:i}),g=h[0],E=h[1],k=(n=function(e){return e?"docusaurus.tab."+e:null}({groupId:i}.groupId),t=(0,s.Nk)(n),r=t[0],l=t[1],[r,(0,a.useCallback)((function(e){n&&l.set(e)}),[n,l])]),O=k[0],_=k[1],w=function(){var e=null!=g?g:O;return f({value:e,tabValues:p})?e:null}();return(0,a.useLayoutEffect)((function(){w&&y(w)}),[w]),{selectedValue:v,selectValue:(0,a.useCallback)((function(e){if(!f({value:e,tabValues:p}))throw new Error("Can't select invalid tab value="+e);y(e),E(e),_(e)}),[E,_,p]),tabValues:p}}var v=t(2389),y="tabList__CuJ",h="tabItem_LNqP";function g(e){var n=e.className,t=e.block,u=e.selectedValue,c=e.selectValue,i=e.tabValues,s=[],p=(0,o.o5)().blockElementScrollPositionUntilNextRender,m=function(e){var n=e.currentTarget,t=s.indexOf(n),r=i[t].value;r!==u&&(p(n),c(r))},f=function(e){var n,t=null;switch(e.key){case"Enter":m(e);break;case"ArrowRight":var 
r,a=s.indexOf(e.currentTarget)+1;t=null!=(r=s[a])?r:s[0];break;case"ArrowLeft":var l,o=s.indexOf(e.currentTarget)-1;t=null!=(l=s[o])?l:s[s.length-1]}null==(n=t)||n.focus()};return a.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,l.Z)("tabs",{"tabs--block":t},n)},i.map((function(e){var n=e.value,t=e.label,o=e.attributes;return a.createElement("li",(0,r.Z)({role:"tab",tabIndex:u===n?0:-1,"aria-selected":u===n,key:n,ref:function(e){return s.push(e)},onKeyDown:f,onClick:m},o,{className:(0,l.Z)("tabs__item",h,null==o?void 0:o.className,{"tabs__item--active":u===n})}),null!=t?t:n)})))}function E(e){var n=e.lazy,t=e.children,r=e.selectedValue,l=(Array.isArray(t)?t:[t]).filter(Boolean);if(n){var o=l.find((function(e){return e.props.value===r}));return o?(0,a.cloneElement)(o,{className:"margin-top--md"}):null}return a.createElement("div",{className:"margin-top--md"},l.map((function(e,n){return(0,a.cloneElement)(e,{key:n,hidden:e.props.value!==r})})))}function k(e){var n=b(e);return a.createElement("div",{className:(0,l.Z)("tabs-container",y)},a.createElement(g,(0,r.Z)({},e,n)),a.createElement(E,(0,r.Z)({},e,n)))}function O(e){var n=(0,v.Z)();return a.createElement(k,(0,r.Z)({key:String(n)},e))}},1989:function(e,n,t){var r=t(7294),a=t(2263);n.Z=function(e){var n=e.className,t=e.py,l=e.scala,o=e.csharp,u=e.sourceLink,c=(0,a.Z)().siteConfig.customFields.version,i="https://mmlspark.blob.core.windows.net/docs/"+c+"/pyspark/"+t,s="https://mmlspark.blob.core.windows.net/docs/"+c+"/scala/"+l,p="https://mmlspark.blob.core.windows.net/docs/"+c+"/dotnet/"+o;return r.createElement("table",null,r.createElement("tbody",null,r.createElement("tr",null,r.createElement("td",null,r.createElement("strong",null,"Python API: "),r.createElement("a",{href:i},n)),r.createElement("td",null,r.createElement("strong",null,"Scala API: "),r.createElement("a",{href:s},n)),r.createElement("td",null,r.createElement("strong",null,".NET API: 
"),r.createElement("a",{href:p},n)),r.createElement("td",null,r.createElement("strong",null,"Source: "),r.createElement("a",{href:u},n)))))}},9100:function(e,n,t){t.r(n),t.d(n,{assets:function(){return y},contentTitle:function(){return b},default:function(){return E},frontMatter:function(){return d},metadata:function(){return v},toc:function(){return h}});var r=t(3117),a=t(102),l=(t(7294),t(3905)),o=t(4866),u=t(5162),c=t(1989),i=["components"],s=[{value:"ONNXModel",id:"onnxmodel",level:2}],p={toc:s};function m(e){var n=e.components,t=(0,a.Z)(e,i);return(0,l.kt)("wrapper",(0,r.Z)({},p,t,{components:n,mdxType:"MDXLayout"}),(0,l.kt)("h2",{id:"onnxmodel"},"ONNXModel"),(0,l.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(u.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-py"},'from synapse.ml.onnx import ONNXModel\n\nmodel_path = "PUT_YOUR_MODEL_PATH"\nonnx_ml = (ONNXModel()\n .setModelLocation(model_path)\n .setFeedDict({"float_input": "features"})\n .setFetchDict({"prediction": "output_label", "rawProbability": "output_probability"}))\n'))),(0,l.kt)(u.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.onnx._\n\nval model_path = "PUT_YOUR_MODEL_PATH"\nval onnx_ml = (new ONNXModel()\n .setModelLocation(model_path)\n .setFeedDict(Map("float_input" -> "features"))\n .setFetchDict(Map("prediction" -> "output_label", "rawProbability" -> 
"output_probability")))\n')))),(0,l.kt)(c.Z,{className:"ONNXModel",py:"synapse.ml.onnx.html#module-synapse.ml.onnx.ONNXModel",scala:"com/microsoft/azure/synapse/ml/onnx/ONNXModel.html",csharp:"classSynapse_1_1ML_1_1Onnx_1_1ONNXModel.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/deep-learning/src/main/scala/com/microsoft/azure/synapse/ml/onnx/ONNXModel.scala",mdxType:"DocTable"}))}m.isMDXComponent=!0;var f=["components"],d={title:"Deep Learning",sidebar_label:"Deep Learning"},b=void 0,v={unversionedId:"Quick Examples/transformers/transformers_deep_learning",id:"version-0.11.3/Quick Examples/transformers/transformers_deep_learning",title:"Deep Learning",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/transformers/transformers_deep_learning.md",sourceDirName:"Quick Examples/transformers",slug:"/Quick Examples/transformers/transformers_deep_learning",permalink:"/SynapseML/docs/0.11.3/Quick Examples/transformers/transformers_deep_learning",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Deep Learning",sidebar_label:"Deep Learning"}},y={},h=[].concat(s),g={toc:h};function E(e){var n=e.components,t=(0,a.Z)(e,f);return(0,l.kt)("wrapper",(0,r.Z)({},g,t,{components:n,mdxType:"MDXLayout"}),(0,l.kt)(m,{mdxType:"ONNXModel"}))}E.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/4997ef4f.4c3be39d.js b/assets/js/4997ef4f.4c3be39d.js new file mode 100644 index 0000000000..40d7c6248f --- /dev/null +++ b/assets/js/4997ef4f.4c3be39d.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[4772],{3905:function(e,t,a){a.d(t,{Zo:function(){return d},kt:function(){return k}});var n=a(7294);function r(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function l(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return 
Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function i(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var p=n.createContext({}),m=function(e){var t=n.useContext(p),a=t;return e&&(a="function"==typeof e?e(t):i(i({},t),e)),a},d=function(e){var t=m(e.components);return n.createElement(p.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},s=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,l=e.originalType,p=e.parentName,d=o(e,["components","mdxType","originalType","parentName"]),s=m(a),k=r,g=s["".concat(p,".").concat(k)]||s[k]||u[k]||l;return a?n.createElement(g,i(i({ref:t},d),{},{components:a})):n.createElement(g,i({ref:t},d))}));function k(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var l=a.length,i=new Array(l);i[0]=s;var o={};for(var p in t)hasOwnProperty.call(t,p)&&(o[p]=t[p]);o.originalType=e,o.mdxType="string"==typeof e?e:r,i[1]=o;for(var m=2;mTabularLIME",id:"tabularlime",level:3},{value:"TabularSHAP",id:"tabularshap",level:3},{value:"VectorLIME",id:"vectorlime",level:3},{value:"VectorSHAP",id:"vectorshap",level:3},{value:"ImageLIME",id:"imagelime",level:3},{value:"ImageSHAP",id:"imageshap",level:3},{value:"TextLIME",id:"textlime",level:3},{value:"TextSHAP",id:"textshap",level:3},{value:"Result interpretation",id:"result-interpretation",level:2},{value:"LIME explainers",id:"lime-explainers",level:3},{value:"SHAP explainers",id:"shap-explainers",level:3},{value:"Base value",id:"base-value",level:4}],s={toc:u};function k(e){var t=e.components,a=(0,r.Z)(e,i);return(0,l.kt)("wrapper",(0,n.Z)({},s,a,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h1",{id:"model-interpretation-on-spark"},"Model Interpretation on 
Spark"),(0,l.kt)("h2",{id:"interpretable-machine-learning"},"Interpretable Machine Learning"),(0,l.kt)("p",null,"Interpretable Machine Learning helps developers, data scientists and business stakeholders in the organization gain a comprehensive understanding of their machine learning models. It can also be used to debug models, explain predictions and enable auditing to meet compliance with regulatory requirements."),(0,l.kt)("h2",{id:"why-run-model-interpretation-on-spark"},"Why run model interpretation on Spark"),(0,l.kt)("p",null,"Model-agnostic interpretation methods can be computationally expensive due to the multiple evaluations needed to compute the explanations. Model interpretation on Spark enables users to interpret a black-box model at massive scales with the Apache Spark\u2122 distributed computing ecosystem. Various components support local interpretation for tabular, vector, image and text classification models, with two popular model-agnostic interpretation methods: ",(0,l.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/1602.04938"},"LIME")," and ",(0,l.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/1705.07874"},"Kernel SHAP"),"."),(0,l.kt)("h2",{id:"usage"},"Usage"),(0,l.kt)("p",null,"Both LIME and Kernel SHAP are local interpretation methods. Local interpretation explains why does the model predict certain outcome for a given observation."),(0,l.kt)("p",null,"Both explainers extends from ",(0,l.kt)("inlineCode",{parentName:"p"},"org.apache.spark.ml.Transformer"),". 
After setting up the explainer parameters, simply call the ",(0,l.kt)("inlineCode",{parentName:"p"},"transform")," function on a ",(0,l.kt)("inlineCode",{parentName:"p"},"DataFrame")," of observations to interpret the model behavior on these observations."),(0,l.kt)("p",null,"To see examples of model interpretability on Spark in action, take a look at these sample notebooks:"),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Tabular%20Explainers"},"Tabular Explainers")),(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Image%20Explainers"},"Image Explainers")),(0,l.kt)("li",{parentName:"ul"},(0,l.kt)("a",{parentName:"li",href:"../Text%20Explainers"},"Text Explainers"))),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null}),(0,l.kt)("th",{parentName:"tr",align:null},"Tabular models"),(0,l.kt)("th",{parentName:"tr",align:null},"Vector models"),(0,l.kt)("th",{parentName:"tr",align:null},"Image models"),(0,l.kt)("th",{parentName:"tr",align:null},"Text models"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"LIME explainers"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#tabularlime"},"TabularLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#vectorlime"},"VectorLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#imagelime"},"ImageLIME")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#textlime"},"TextLIME"))),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"Kernel SHAP 
explainers"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#tabularshap"},"TabularSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#vectorshap"},"VectorSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#imageshap"},"ImageSHAP")),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("a",{parentName:"td",href:"#textshap"},"TextSHAP"))))),(0,l.kt)("h3",{id:"common-local-explainer-params"},"Common local explainer params"),(0,l.kt)("p",null,"All local explainers support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"probability"'),(0,l.kt)("td",{parentName:"tr",align:null},'The column name of the prediction target to explain (i.e. the response variable). 
This is usually set to "prediction" for regression models and "probability" for probabilistic classification models.')),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetClasses"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[Int]")),(0,l.kt)("td",{parentName:"tr",align:null},"empty array"),(0,l.kt)("td",{parentName:"tr",align:null},"The indices of the classes for multinomial classification models.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"targetClassesCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The name of the column that specifies the indices of the classes for multinomial classification models.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"outputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The name of the output column for interpretation results.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"model"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Transformer")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The model to be explained.")))),(0,l.kt)("h3",{id:"common-lime-explainer-params"},"Common LIME explainer params"),(0,l.kt)("p",null,"All LIME based explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularlime"},"TabularLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorlime"},"VectorLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#imagelime"},"ImageLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#textlime"},"TextLIME"),") support the following 
params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"regularization"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0"),(0,l.kt)("td",{parentName:"tr",align:null},"Regularization param for the underlying lasso regression.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"kernelWidth"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"sqrt(number of features) * 0.75"),(0,l.kt)("td",{parentName:"tr",align:null},"Kernel width for the exponential kernel.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"numSamples"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Int")),(0,l.kt)("td",{parentName:"tr",align:null},"1000"),(0,l.kt)("td",{parentName:"tr",align:null},"Number of samples to generate.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"metricsCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"r2"'),(0,l.kt)("td",{parentName:"tr",align:null},"Column name for fitting metrics.")))),(0,l.kt)("h3",{id:"common-shap-explainer-params"},"Common SHAP explainer params"),(0,l.kt)("p",null,"All Kernel SHAP based explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularshap"},"TabularSHAP"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorshap"},"VectorSHAP"),", 
",(0,l.kt)("a",{parentName:"p",href:"#imageshap"},"ImageSHAP"),", ",(0,l.kt)("a",{parentName:"p",href:"#textshap"},"TextSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"infWeight"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"1E8"),(0,l.kt)("td",{parentName:"tr",align:null},"The double value to represent infinite weight.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"numSamples"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Int")),(0,l.kt)("td",{parentName:"tr",align:null},"2 * (number of features) + 2048"),(0,l.kt)("td",{parentName:"tr",align:null},"Number of samples to generate.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"metricsCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"r2"'),(0,l.kt)("td",{parentName:"tr",align:null},"Column name for fitting metrics.")))),(0,l.kt)("h3",{id:"tabular-model-explainer-params"},"Tabular model explainer params"),(0,l.kt)("p",null,"All tabular model explainers (",(0,l.kt)("a",{parentName:"p",href:"#tabularlime"},"TabularLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#tabularshap"},"TabularSHAP"),") support the following 
params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCols"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[String]")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input columns to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"backgroundData"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"DataFrame")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"A dataframe containing background data. 
It must contain all the input columns needed by the black-box model.")))),(0,l.kt)("h3",{id:"vector-model-explainer-params"},"Vector model explainer params"),(0,l.kt)("p",null,"All vector model explainers (",(0,l.kt)("a",{parentName:"p",href:"#vectorlime"},"VectorLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#vectorshap"},"VectorSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input vector column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"backgroundData"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"DataFrame")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"A dataframe containing background data. 
It must contain the input vector column needed by the black-box model.")))),(0,l.kt)("h3",{id:"image-model-explainer-params"},"Image model explainer params"),(0,l.kt)("p",null,"All image model explainers (",(0,l.kt)("a",{parentName:"p",href:"#imagelime"},"ImageLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#imageshap"},"ImageSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input image column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"cellSize"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"16"),(0,l.kt)("td",{parentName:"tr",align:null},"Number that controls the size of the super-pixels.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"modifier"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"130"),(0,l.kt)("td",{parentName:"tr",align:null},"Controls the trade-off spatial and color distance of 
super-pixels.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"superpixelCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"superpixels"'),(0,l.kt)("td",{parentName:"tr",align:null},"The column holding the super-pixel decompositions.")))),(0,l.kt)("h3",{id:"text-model-explainer-params"},"Text model explainer params"),(0,l.kt)("p",null,"All text model explainers (",(0,l.kt)("a",{parentName:"p",href:"#textlime"},"TextLIME"),", ",(0,l.kt)("a",{parentName:"p",href:"#textshap"},"TextSHAP"),") support the following params:"),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"inputCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null}),(0,l.kt)("td",{parentName:"tr",align:null},"The names of input text column to the black-box model.")),(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"tokensCol"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"String")),(0,l.kt)("td",{parentName:"tr",align:null},'"tokens"'),(0,l.kt)("td",{parentName:"tr",align:null},"The column holding the text 
tokens.")))),(0,l.kt)("h3",{id:"tabularlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"TabularLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"categoricalFeatures"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Array[String]")),(0,l.kt)("td",{parentName:"tr",align:null},"empty array"),(0,l.kt)("td",{parentName:"tr",align:null},"The name of columns that should be treated as categorical variables.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},"For categorical features, ",(0,l.kt)("inlineCode",{parentName:"p"},"TabularLIME")," creates new samples by drawing samples based on the value distribution from the background dataset. 
For numerical features, it creates new samples by drawing from a normal distribution with mean taken from the target value to be explained, and standard deviation taken from the background dataset.")),(0,l.kt)("h3",{id:"tabularshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"TabularSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"vectorlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"VectorLIME")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"VectorLIME")," assumes all features are numerical, and categorical features are not supported in ",(0,l.kt)("inlineCode",{parentName:"p"},"VectorLIME"),".")),(0,l.kt)("h3",{id:"vectorshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"VectorSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"imagelime"},(0,l.kt)("inlineCode",{parentName:"h3"},"ImageLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"samplingFraction"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0.7"),(0,l.kt)("td",{parentName:"tr",align:null},"The fraction of super-pixels to keep on during sampling.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"ImageLIME")," creates new samples by randomly turning super-pixels on or off with probability of keeping on set to 
",(0,l.kt)("inlineCode",{parentName:"p"},"SamplingFraction"),".")),(0,l.kt)("h3",{id:"imageshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"ImageSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h3",{id:"textlime"},(0,l.kt)("inlineCode",{parentName:"h3"},"TextLIME")),(0,l.kt)("table",null,(0,l.kt)("thead",{parentName:"table"},(0,l.kt)("tr",{parentName:"thead"},(0,l.kt)("th",{parentName:"tr",align:null},"Param"),(0,l.kt)("th",{parentName:"tr",align:null},"Type"),(0,l.kt)("th",{parentName:"tr",align:null},"Default"),(0,l.kt)("th",{parentName:"tr",align:null},"Description"))),(0,l.kt)("tbody",{parentName:"table"},(0,l.kt)("tr",{parentName:"tbody"},(0,l.kt)("td",{parentName:"tr",align:null},"samplingFraction"),(0,l.kt)("td",{parentName:"tr",align:null},(0,l.kt)("inlineCode",{parentName:"td"},"Double")),(0,l.kt)("td",{parentName:"tr",align:null},"0.7"),(0,l.kt)("td",{parentName:"tr",align:null},"The fraction of word tokens to keep on during sampling.")))),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},(0,l.kt)("inlineCode",{parentName:"p"},"TextLIME")," creates new samples by randomly turning word tokens on or off with probability of keeping on set to ",(0,l.kt)("inlineCode",{parentName:"p"},"SamplingFraction"),".")),(0,l.kt)("h3",{id:"textshap"},(0,l.kt)("inlineCode",{parentName:"h3"},"TextSHAP")),(0,l.kt)("p",null,"No additional params are supported."),(0,l.kt)("h2",{id:"result-interpretation"},"Result interpretation"),(0,l.kt)("h3",{id:"lime-explainers"},"LIME explainers"),(0,l.kt)("p",null,"LIME explainers return an array of vectors, and each vector maps to a class being explained. 
Each component of the vector is the coefficient for the corresponding feature, super-pixel, or word token from the local surrogate model."),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},"For categorical variables, super-pixels, or word tokens, the coefficient shows the average change in model outcome if this feature is unknown to the model, if the super-pixel is replaced with background color (black), or if the word token is replaced with empty string."),(0,l.kt)("li",{parentName:"ul"},"For numeric variables, the coefficient shows the change in model outcome if the feature value is incremented by 1 unit.")),(0,l.kt)("h3",{id:"shap-explainers"},"SHAP explainers"),(0,l.kt)("p",null,"SHAP explainers return an array of vectors, and each vector maps to a class being explained. Each vector starts with the ",(0,l.kt)("a",{parentName:"p",href:"#base-value"},"base value"),", and each following component of the vector is the Shapley value for each feature, super-pixel, or token."),(0,l.kt)("p",null,"The base value and Shapley values are additive, and they should add up to the model output for the target observation."),(0,l.kt)("h4",{id:"base-value"},"Base value"),(0,l.kt)("ul",null,(0,l.kt)("li",{parentName:"ul"},"For tabular and vector models, the base value represents the mean outcome of the model for the background dataset."),(0,l.kt)("li",{parentName:"ul"},"For image models, the base value represents the model outcome for a background (all black) image."),(0,l.kt)("li",{parentName:"ul"},"For text models, the base value represents the model outcome for an empty string.")))}k.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/6f479459.bf577ea1.js b/assets/js/49e6864b.6274e2e6.js similarity index 97% rename from assets/js/6f479459.bf577ea1.js rename to assets/js/49e6864b.6274e2e6.js index 249264f03d..83d9421763 100644 --- a/assets/js/6f479459.bf577ea1.js +++ b/assets/js/49e6864b.6274e2e6.js @@ -1 +1 @@ -"use 
strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[4963],{3905:function(e,n,t){t.d(n,{Zo:function(){return m},kt:function(){return g}});var r=t(7294);function a(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function l(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);n&&(r=r.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,r)}return t}function i(e){for(var n=1;n=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var s=r.createContext({}),p=function(e){var n=r.useContext(s),t=n;return e&&(t="function"==typeof e?e(n):i(i({},n),e)),t},m=function(e){var n=p(e.components);return r.createElement(s.Provider,{value:n},e.children)},u={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},c=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,s=e.parentName,m=o(e,["components","mdxType","originalType","parentName"]),c=p(t),g=a,d=c["".concat(s,".").concat(g)]||c[g]||u[g]||l;return t?r.createElement(d,i(i({ref:n},m),{},{components:t})):r.createElement(d,i({ref:n},m))}));function g(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,i=new Array(l);i[0]=c;var o={};for(var s in n)hasOwnProperty.call(n,s)&&(o[s]=n[s]);o.originalType=e,o.mdxType="string"==typeof e?e:a,i[1]=o;for(var p=2;p green_value:\n for (x, y) in sp:\n image_array[y, x, 1] = 255\n image_array[y, x, 3] = 200\n plt.clf()\n plt.imshow(image_array)\n plt.show()\n')),(0,l.kt)("p",null,"Create a dataframe for a testing image, and use the ResNet50 ONNX model to infer the image."),(0,l.kt)("p",null,'The result shows 39.6% probability of "violin" (889), and 38.4% probability of "upright piano" 
(881).'),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.io import *\n\nimage_df = spark.read.image().load(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/explainers/images/david-lusvardi-dWcUncxocQY-unsplash.jpg"\n)\ndisplay(image_df)\n\n# Rotate the image array from BGR into RGB channels for visualization later.\nrow = image_df.select(\n "image.height", "image.width", "image.nChannels", "image.data"\n).head()\nlocals().update(row.asDict())\nrgb_image_array = rotate_color_channel(data, height, width, nChannels)\n\n# Download the ONNX model\nmodelPayload = downloadBytes(\n "https://mmlspark.blob.core.windows.net/publicwasb/ONNXModels/resnet50-v2-7.onnx"\n)\n\nfeaturizer = (\n ImageTransformer(inputCol="image", outputCol="features")\n .resize(224, True)\n .centerCrop(224, 224)\n .normalize(\n mean=[0.485, 0.456, 0.406],\n std=[0.229, 0.224, 0.225],\n color_scale_factor=1 / 255,\n )\n .setTensorElementType(FloatType())\n)\n\nonnx = (\n ONNXModel()\n .setModelPayload(modelPayload)\n .setFeedDict({"data": "features"})\n .setFetchDict({"rawPrediction": "resnetv24_dense0_fwd"})\n .setSoftMaxDict({"rawPrediction": "probability"})\n .setMiniBatchSize(1)\n)\n\nmodel = Pipeline(stages=[featurizer, onnx]).fit(image_df)\n')),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'predicted = (\n model.transform(image_df)\n .withColumn("top2pred", arg_top_k(col("probability"), lit(2)))\n .withColumn("top2prob", vec_slice(col("probability"), col("top2pred")))\n)\n\ndisplay(predicted.select("top2pred", "top2prob"))\n')),(0,l.kt)("p",null,"First we use the LIME image explainer to explain the model's top 2 classes' probabilities."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'lime = (\n ImageLIME()\n .setModel(model)\n .setOutputCol("weights")\n .setInputCol("image")\n .setCellSize(150.0)\n .setModifier(50.0)\n .setNumSamples(500)\n 
.setTargetCol("probability")\n .setTargetClassesCol("top2pred")\n .setSamplingFraction(0.7)\n)\n\nlime_result = (\n lime.transform(predicted)\n .withColumn("weights_violin", col("weights").getItem(0))\n .withColumn("weights_piano", col("weights").getItem(1))\n .cache()\n)\n\ndisplay(lime_result.select(col("weights_violin"), col("weights_piano")))\nlime_row = lime_result.head()\n')),(0,l.kt)("p",null,'We plot the LIME weights for "violin" output and "upright piano" output.'),(0,l.kt)("p",null,"Green areas are superpixels with LIME weights above 95 percentile."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'plot_superpixels(\n rgb_image_array,\n lime_row["superpixels"]["clusters"],\n list(lime_row["weights_violin"]),\n 95,\n)\nplot_superpixels(\n rgb_image_array,\n lime_row["superpixels"]["clusters"],\n list(lime_row["weights_piano"]),\n 95,\n)\n')),(0,l.kt)("p",null,"Your results will look like:"),(0,l.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/explainers/image-lime-20210811.png"}),(0,l.kt)("p",null,"Then we use the Kernel SHAP image explainer to explain the model's top 2 classes' probabilities."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'shap = (\n ImageSHAP()\n .setModel(model)\n .setOutputCol("shaps")\n .setSuperpixelCol("superpixels")\n .setInputCol("image")\n .setCellSize(150.0)\n .setModifier(50.0)\n .setNumSamples(500)\n .setTargetCol("probability")\n .setTargetClassesCol("top2pred")\n)\n\nshap_result = (\n shap.transform(predicted)\n .withColumn("shaps_violin", col("shaps").getItem(0))\n .withColumn("shaps_piano", col("shaps").getItem(1))\n .cache()\n)\n\ndisplay(shap_result.select(col("shaps_violin"), col("shaps_piano")))\nshap_row = shap_result.head()\n')),(0,l.kt)("p",null,'We plot the SHAP values for "piano" output and "cell" output.'),(0,l.kt)("p",null,"Green areas are superpixels with SHAP values above 95 
percentile."),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},"Notice that we drop the base value from the SHAP output before rendering the superpixels. The base value is the model output for the background (all black) image.")),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'plot_superpixels(\n rgb_image_array,\n shap_row["superpixels"]["clusters"],\n list(shap_row["shaps_violin"][1:]),\n 95,\n)\nplot_superpixels(\n rgb_image_array,\n shap_row["superpixels"]["clusters"],\n list(shap_row["shaps_piano"][1:]),\n 95,\n)\n')),(0,l.kt)("p",null,"Your results will look like:"),(0,l.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/explainers/image-shap-20210811.png"}))}g.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2707],{3905:function(e,n,t){t.d(n,{Zo:function(){return m},kt:function(){return g}});var r=t(7294);function a(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function l(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);n&&(r=r.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,r)}return t}function i(e){for(var n=1;n=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var s=r.createContext({}),p=function(e){var n=r.useContext(s),t=n;return e&&(t="function"==typeof e?e(n):i(i({},n),e)),t},m=function(e){var n=p(e.components);return r.createElement(s.Provider,{value:n},e.children)},u={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},c=r.forwardRef((function(e,n){var 
t=e.components,a=e.mdxType,l=e.originalType,s=e.parentName,m=o(e,["components","mdxType","originalType","parentName"]),c=p(t),g=a,d=c["".concat(s,".").concat(g)]||c[g]||u[g]||l;return t?r.createElement(d,i(i({ref:n},m),{},{components:t})):r.createElement(d,i({ref:n},m))}));function g(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,i=new Array(l);i[0]=c;var o={};for(var s in n)hasOwnProperty.call(n,s)&&(o[s]=n[s]);o.originalType=e,o.mdxType="string"==typeof e?e:a,i[1]=o;for(var p=2;p green_value:\n for (x, y) in sp:\n image_array[y, x, 1] = 255\n image_array[y, x, 3] = 200\n plt.clf()\n plt.imshow(image_array)\n plt.show()\n')),(0,l.kt)("p",null,"Create a dataframe for a testing image, and use the ResNet50 ONNX model to infer the image."),(0,l.kt)("p",null,'The result shows 39.6% probability of "violin" (889), and 38.4% probability of "upright piano" (881).'),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.io import *\n\nimage_df = spark.read.image().load(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/explainers/images/david-lusvardi-dWcUncxocQY-unsplash.jpg"\n)\ndisplay(image_df)\n\n# Rotate the image array from BGR into RGB channels for visualization later.\nrow = image_df.select(\n "image.height", "image.width", "image.nChannels", "image.data"\n).head()\nlocals().update(row.asDict())\nrgb_image_array = rotate_color_channel(data, height, width, nChannels)\n\n# Download the ONNX model\nmodelPayload = downloadBytes(\n "https://mmlspark.blob.core.windows.net/publicwasb/ONNXModels/resnet50-v2-7.onnx"\n)\n\nfeaturizer = (\n ImageTransformer(inputCol="image", outputCol="features")\n .resize(224, True)\n .centerCrop(224, 224)\n .normalize(\n mean=[0.485, 0.456, 0.406],\n std=[0.229, 0.224, 0.225],\n color_scale_factor=1 / 255,\n )\n .setTensorElementType(FloatType())\n)\n\nonnx = (\n ONNXModel()\n .setModelPayload(modelPayload)\n .setFeedDict({"data": "features"})\n 
.setFetchDict({"rawPrediction": "resnetv24_dense0_fwd"})\n .setSoftMaxDict({"rawPrediction": "probability"})\n .setMiniBatchSize(1)\n)\n\nmodel = Pipeline(stages=[featurizer, onnx]).fit(image_df)\n')),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'predicted = (\n model.transform(image_df)\n .withColumn("top2pred", arg_top_k(col("probability"), lit(2)))\n .withColumn("top2prob", vec_slice(col("probability"), col("top2pred")))\n)\n\ndisplay(predicted.select("top2pred", "top2prob"))\n')),(0,l.kt)("p",null,"First we use the LIME image explainer to explain the model's top 2 classes' probabilities."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'lime = (\n ImageLIME()\n .setModel(model)\n .setOutputCol("weights")\n .setInputCol("image")\n .setCellSize(150.0)\n .setModifier(50.0)\n .setNumSamples(500)\n .setTargetCol("probability")\n .setTargetClassesCol("top2pred")\n .setSamplingFraction(0.7)\n)\n\nlime_result = (\n lime.transform(predicted)\n .withColumn("weights_violin", col("weights").getItem(0))\n .withColumn("weights_piano", col("weights").getItem(1))\n .cache()\n)\n\ndisplay(lime_result.select(col("weights_violin"), col("weights_piano")))\nlime_row = lime_result.head()\n')),(0,l.kt)("p",null,'We plot the LIME weights for "violin" output and "upright piano" output.'),(0,l.kt)("p",null,"Green areas are superpixels with LIME weights above 95 percentile."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'plot_superpixels(\n rgb_image_array,\n lime_row["superpixels"]["clusters"],\n list(lime_row["weights_violin"]),\n 95,\n)\nplot_superpixels(\n rgb_image_array,\n lime_row["superpixels"]["clusters"],\n list(lime_row["weights_piano"]),\n 95,\n)\n')),(0,l.kt)("p",null,"Your results will look like:"),(0,l.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/explainers/image-lime-20210811.png"}),(0,l.kt)("p",null,"Then we use the Kernel SHAP image 
explainer to explain the model's top 2 classes' probabilities."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'shap = (\n ImageSHAP()\n .setModel(model)\n .setOutputCol("shaps")\n .setSuperpixelCol("superpixels")\n .setInputCol("image")\n .setCellSize(150.0)\n .setModifier(50.0)\n .setNumSamples(500)\n .setTargetCol("probability")\n .setTargetClassesCol("top2pred")\n)\n\nshap_result = (\n shap.transform(predicted)\n .withColumn("shaps_violin", col("shaps").getItem(0))\n .withColumn("shaps_piano", col("shaps").getItem(1))\n .cache()\n)\n\ndisplay(shap_result.select(col("shaps_violin"), col("shaps_piano")))\nshap_row = shap_result.head()\n')),(0,l.kt)("p",null,'We plot the SHAP values for "piano" output and "cell" output.'),(0,l.kt)("p",null,"Green areas are superpixels with SHAP values above 95 percentile."),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},"Notice that we drop the base value from the SHAP output before rendering the superpixels. 
The base value is the model output for the background (all black) image.")),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'plot_superpixels(\n rgb_image_array,\n shap_row["superpixels"]["clusters"],\n list(shap_row["shaps_violin"][1:]),\n 95,\n)\nplot_superpixels(\n rgb_image_array,\n shap_row["superpixels"]["clusters"],\n list(shap_row["shaps_piano"][1:]),\n 95,\n)\n')),(0,l.kt)("p",null,"Your results will look like:"),(0,l.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/explainers/image-shap-20210811.png"}))}g.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/4bbbdfcf.46696f3e.js b/assets/js/4bbbdfcf.46696f3e.js new file mode 100644 index 0000000000..5284a5068a --- /dev/null +++ b/assets/js/4bbbdfcf.46696f3e.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6413],{3905:function(e,t,n){n.d(t,{Zo:function(){return p},kt:function(){return d}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function i(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},u=r.forwardRef((function(e,t){var 
n=e.components,a=e.mdxType,o=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),u=c(n),d=a,h=u["".concat(l,".").concat(d)]||u[d]||m[d]||o;return n?r.createElement(h,i(i({ref:t},p),{},{components:n})):r.createElement(h,i({ref:t},p))}));function d(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,i=new Array(o);i[0]=u;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,i[1]=s;for(var c=2;c= lit(inferenceStartTime))\n .toPandas()\n)\n\nrdf\n')),(0,o.kt)("p",null,"Let's now format the ",(0,o.kt)("inlineCode",{parentName:"p"},"contributors")," column that stores the contribution score from each sensor to the detected anomalies. The next cell formats this data, and splits the contribution score of each sensor into its own column."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'def parse(x):\n if len(x) > 0:\n return dict([item[:2] for item in x])\n else:\n return {"sensor_1": 0, "sensor_2": 0, "sensor_3": 0}\n\n\nrdf["contributors"] = rdf["interpretation"].apply(parse)\nrdf = pd.concat(\n [\n rdf.drop(["contributors"], axis=1),\n pd.json_normalize(rdf["contributors"]).rename(\n columns={\n "sensor_1": "series_1",\n "sensor_2": "series_2",\n "sensor_3": "series_3",\n }\n ),\n ],\n axis=1,\n)\nrdf\n')),(0,o.kt)("p",null,"Great! We now have the contribution scores of sensors 1, 2, and 3 in the ",(0,o.kt)("inlineCode",{parentName:"p"},"series_0"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"series_1"),", and ",(0,o.kt)("inlineCode",{parentName:"p"},"series_2")," columns respectively. "),(0,o.kt)("p",null,"Let's run the next cell to plot the results. The ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity")," parameter in the first line specifies the minimum severity of the anomalies to be plotted. 
"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'minSeverity = 0.1\n\n\n####### Main Figure #######\nplt.figure(figsize=(23, 8))\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_1"],\n color="tab:orange",\n line,\n linewidth=2,\n label="sensor_1",\n)\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_2"],\n color="tab:green",\n line,\n linewidth=2,\n label="sensor_2",\n)\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_3"],\n color="tab:blue",\n line,\n linewidth=2,\n label="sensor_3",\n)\nplt.grid(axis="y")\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.legend()\n\nanoms = list(rdf["severity"] >= minSeverity)\n_, _, ymin, ymax = plt.axis()\nplt.vlines(np.where(anoms), ymin=ymin, ymax=ymax, color="r", alpha=0.8)\n\nplt.legend()\nplt.title(\n "A plot of the values from the three sensors with the detected anomalies highlighted in red."\n)\nplt.show()\n\n####### Severity Figure #######\nplt.figure(figsize=(23, 1))\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.plot(\n rdf["timestamp"],\n rdf["severity"],\n color="black",\n line,\n linewidth=2,\n label="Severity score",\n)\nplt.plot(\n rdf["timestamp"],\n [minSeverity] * len(rdf["severity"]),\n color="red",\n line,\n linewidth=1,\n label="minSeverity",\n)\nplt.grid(axis="y")\nplt.legend()\nplt.ylim([0, 1])\nplt.title("Severity of the detected anomalies")\nplt.show()\n\n####### Contributors Figure #######\nplt.figure(figsize=(23, 1))\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.bar(\n rdf["timestamp"], rdf["series_1"], width=2, color="tab:orange", label="sensor_1"\n)\nplt.bar(\n rdf["timestamp"],\n rdf["series_2"],\n width=2,\n color="tab:green",\n label="sensor_2",\n bottom=rdf["series_1"],\n)\nplt.bar(\n rdf["timestamp"],\n rdf["series_3"],\n width=2,\n color="tab:blue",\n label="sensor_3",\n bottom=rdf["series_1"] + rdf["series_2"],\n)\nplt.grid(axis="y")\nplt.legend()\nplt.ylim([0, 
1])\nplt.title("The contribution of each sensor to the detected anomaly")\nplt.show()\n')),(0,o.kt)("img",{width:"1300",src:"https://mmlspark.blob.core.windows.net/graphics/multivariate-anomaly-detection-plot.png"}),(0,o.kt)("p",null,"The plots show the raw data from the sensors (inside the inference window) in orange, green, and blue. The red vertical lines in the first figure show the detected anomalies that have a severity greater than or equal to ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity"),". "),(0,o.kt)("p",null,"The second plot shows the severity score of all the detected anomalies, with the ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity")," threshold shown in the dotted red line."),(0,o.kt)("p",null,"Finally, the last plot shows the contribution of the data from each sensor to the detected anomalies. It helps us diagnose and understand the most likely cause of each anomaly."))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/6b40ec54.62e05667.js b/assets/js/4cee39d7.09bbfad3.js similarity index 96% rename from assets/js/6b40ec54.62e05667.js rename to assets/js/4cee39d7.09bbfad3.js index 9d24d288f6..108e663055 100644 --- a/assets/js/6b40ec54.62e05667.js +++ b/assets/js/4cee39d7.09bbfad3.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2712],{3905:function(e,t,r){r.d(t,{Zo:function(){return m},kt:function(){return c}});var n=r(7294);function a(e,t,r){return t in e?Object.defineProperty(e,t,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[t]=r,e}function o(e,t){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),r.push.apply(r,n)}return r}function i(e){for(var t=1;t=0||(a[r]=e[r]);return a}(e,t);if(Object.getOwnPropertySymbols){var 
o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(a[r]=e[r])}return a}var l=n.createContext({}),p=function(e){var t=n.useContext(l),r=t;return e&&(r="function"==typeof e?e(t):i(i({},t),e)),r},m=function(e){var t=p(e.components);return n.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},d=n.forwardRef((function(e,t){var r=e.components,a=e.mdxType,o=e.originalType,l=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),d=p(r),c=a,y=d["".concat(l,".").concat(c)]||d[c]||u[c]||o;return r?n.createElement(y,i(i({ref:t},m),{},{components:r})):n.createElement(y,i({ref:t},m))}));function c(e,t){var r=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=r.length,i=new Array(o);i[0]=d;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,i[1]=s;for(var p=2;p=0||(a[r]=e[r]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(a[r]=e[r])}return a}var l=n.createContext({}),p=function(e){var t=n.useContext(l),r=t;return e&&(r="function"==typeof e?e(t):i(i({},t),e)),r},m=function(e){var t=p(e.components);return n.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},d=n.forwardRef((function(e,t){var r=e.components,a=e.mdxType,o=e.originalType,l=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),d=p(r),c=a,y=d["".concat(l,".").concat(c)]||d[c]||u[c]||o;return r?n.createElement(y,i(i({ref:t},m),{},{components:r})):n.createElement(y,i({ref:t},m))}));function c(e,t){var r=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=r.length,i=new Array(o);i[0]=d;var s={};for(var l in 
t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,i[1]=s;for(var p=2;p=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=a.createContext({}),p=function(e){var t=a.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},m=function(e){var t=p(e.components);return a.createElement(s.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,s=e.parentName,m=l(e,["components","mdxType","originalType","parentName"]),d=p(n),u=r,g=d["".concat(s,".").concat(u)]||d[u]||c[u]||i;return n?a.createElement(g,o(o({ref:t},m),{},{components:n})):a.createElement(g,o({ref:t},m))}));function u(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,o=new Array(i);o[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:r,o[1]=l;for(var p=2;p=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=a.createContext({}),p=function(e){var t=a.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},m=function(e){var t=p(e.components);return a.createElement(s.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,s=e.parentName,m=l(e,["components","mdxType","originalType","parentName"]),d=p(n),u=r,g=d["".concat(s,".").concat(u)]||d[u]||c[u]||i;return n?a.createElement(g,o(o({ref:t},m),{},{components:n})):a.createElement(g,o({ref:t},m))}));function u(e,t){var 
n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,o=new Array(i);o[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:r,o[1]=l;for(var p=2;p=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var o=r.createContext({}),u=function(e){var t=r.useContext(o),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},m=function(e){var t=u(e.components);return r.createElement(o.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},p=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,o=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),p=u(n),d=a,h=p["".concat(o,".").concat(d)]||p[d]||c[d]||i;return n?r.createElement(h,l(l({ref:t},m),{},{components:n})):r.createElement(h,l({ref:t},m))}));function d(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,l=new Array(i);l[0]=p;var s={};for(var o in t)hasOwnProperty.call(t,o)&&(s[o]=t[o]);s.originalType=e,s.mdxType="string"==typeof e?e:a,l[1]=s;for(var u=2;u=0||(n[r]=e[r]);return n}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(n[r]=e[r])}return n}var i=a.createContext({}),u=function(e){var t=a.useContext(i),r=t;return e&&(r="function"==typeof e?e(t):s(s({},t),e)),r},c=function(e){var t=u(e.components);return a.createElement(i.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},p=a.forwardRef((function(e,t){var r=e.components,n=e.mdxType,l=e.originalType,i=e.parentName,c=o(e,["components","mdxType","originalType","parentName"]),p=u(r),g=n,f=p["".concat(i,".").concat(g)]||p[g]||m[g]||l;return 
r?a.createElement(f,s(s({ref:t},c),{},{components:r})):a.createElement(f,s({ref:t},c))}));function g(e,t){var r=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var l=r.length,s=new Array(l);s[0]=p;var o={};for(var i in t)hasOwnProperty.call(t,i)&&(o[i]=t[i]);o.originalType=e,o.mdxType="string"==typeof e?e:n,s[1]=o;for(var u=2;u child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:r.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function p(e){var t=e.values,r=e.children;return(0,n.useMemo)((function(){var e=null!=t?t:m(r);return function(e){var t=(0,u.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,r])}function g(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function f(e){var t=e.queryString,r=void 0!==t&&t,a=e.groupId,l=(0,o.k6)(),s=function(e){var t=e.queryString,r=void 0!==t&&t,a=e.groupId;if("string"==typeof r)return r;if(!1===r)return null;if(!0===r&&!a)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. 
You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:r,groupId:a});return[(0,i._X)(s),(0,n.useCallback)((function(e){if(s){var t=new URLSearchParams(l.location.search);t.set(s,e),l.replace(Object.assign({},l.location,{search:t.toString()}))}}),[s,l])]}function b(e){var t,r,a,l,s=e.defaultValue,o=e.queryString,i=void 0!==o&&o,u=e.groupId,m=p(e),b=(0,n.useState)((function(){return function(e){var t,r=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the component requires at least one children component");if(r){if(!g({value:r,tabValues:a}))throw new Error('Docusaurus error: The has a defaultValue "'+r+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". If you intend to show no default tab, use defaultValue={null} instead.");return r}var n=null!=(t=a.find((function(e){return e.default})))?t:a[0];if(!n)throw new Error("Unexpected error: 0 tabValues");return n.value}({defaultValue:s,tabValues:m})})),h=b[0],d=b[1],v=f({queryString:i,groupId:u}),y=v[0],k=v[1],L=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:u}.groupId),r=(0,c.Nk)(t),a=r[0],l=r[1],[a,(0,n.useCallback)((function(e){t&&l.set(e)}),[t,l])]),E=L[0],w=L[1],M=function(){var e=null!=y?y:E;return g({value:e,tabValues:m})?e:null}();return(0,n.useLayoutEffect)((function(){M&&d(M)}),[M]),{selectedValue:h,selectValue:(0,n.useCallback)((function(e){if(!g({value:e,tabValues:m}))throw new Error("Can't select invalid tab value="+e);d(e),k(e),w(e)}),[k,w,m]),tabValues:m}}var h=r(2389),d="tabList__CuJ",v="tabItem_LNqP";function y(e){var t=e.className,r=e.block,o=e.selectedValue,i=e.selectValue,u=e.tabValues,c=[],m=(0,s.o5)().blockElementScrollPositionUntilNextRender,p=function(e){var t=e.currentTarget,r=c.indexOf(t),a=u[r].value;a!==o&&(m(t),i(a))},g=function(e){var t,r=null;switch(e.key){case"Enter":p(e);break;case"ArrowRight":var 
a,n=c.indexOf(e.currentTarget)+1;r=null!=(a=c[n])?a:c[0];break;case"ArrowLeft":var l,s=c.indexOf(e.currentTarget)-1;r=null!=(l=c[s])?l:c[c.length-1]}null==(t=r)||t.focus()};return n.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,l.Z)("tabs",{"tabs--block":r},t)},u.map((function(e){var t=e.value,r=e.label,s=e.attributes;return n.createElement("li",(0,a.Z)({role:"tab",tabIndex:o===t?0:-1,"aria-selected":o===t,key:t,ref:function(e){return c.push(e)},onKeyDown:g,onClick:p},s,{className:(0,l.Z)("tabs__item",v,null==s?void 0:s.className,{"tabs__item--active":o===t})}),null!=r?r:t)})))}function k(e){var t=e.lazy,r=e.children,a=e.selectedValue,l=(Array.isArray(r)?r:[r]).filter(Boolean);if(t){var s=l.find((function(e){return e.props.value===a}));return s?(0,n.cloneElement)(s,{className:"margin-top--md"}):null}return n.createElement("div",{className:"margin-top--md"},l.map((function(e,t){return(0,n.cloneElement)(e,{key:t,hidden:e.props.value!==a})})))}function L(e){var t=b(e);return n.createElement("div",{className:(0,l.Z)("tabs-container",d)},n.createElement(y,(0,a.Z)({},e,t)),n.createElement(k,(0,a.Z)({},e,t)))}function E(e){var t=(0,h.Z)();return n.createElement(L,(0,a.Z)({key:String(t)},e))}},1989:function(e,t,r){var a=r(7294),n=r(2263);t.Z=function(e){var t=e.className,r=e.py,l=e.scala,s=e.csharp,o=e.sourceLink,i=(0,n.Z)().siteConfig.customFields.version,u="https://mmlspark.blob.core.windows.net/docs/"+i+"/pyspark/"+r,c="https://mmlspark.blob.core.windows.net/docs/"+i+"/scala/"+l,m="https://mmlspark.blob.core.windows.net/docs/"+i+"/dotnet/"+s;return a.createElement("table",null,a.createElement("tbody",null,a.createElement("tr",null,a.createElement("td",null,a.createElement("strong",null,"Python API: "),a.createElement("a",{href:u},t)),a.createElement("td",null,a.createElement("strong",null,"Scala API: "),a.createElement("a",{href:c},t)),a.createElement("td",null,a.createElement("strong",null,".NET API: 
"),a.createElement("a",{href:m},t)),a.createElement("td",null,a.createElement("strong",null,"Source: "),a.createElement("a",{href:o},t)))))}},114:function(e,t,r){r.r(t),r.d(t,{assets:function(){return d},contentTitle:function(){return b},default:function(){return k},frontMatter:function(){return f},metadata:function(){return h},toc:function(){return v}});var a=r(3117),n=r(102),l=(r(7294),r(3905)),s=r(4866),o=r(5162),i=r(1989),u=["components"],c=[{value:"LightGBMClassifier",id:"lightgbmclassifier",level:2},{value:"LightGBMRanker",id:"lightgbmranker",level:2},{value:"LightGBMRegressor",id:"lightgbmregressor",level:2}],m={toc:c};function p(e){var t=e.components,r=(0,n.Z)(e,u);return(0,l.kt)("wrapper",(0,a.Z)({},m,r,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h2",{id:"lightgbmclassifier"},"LightGBMClassifier"),(0,l.kt)(s.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.lightgbm import *\n\nlgbmClassifier = (LightGBMClassifier()\n .setFeaturesCol("features")\n .setRawPredictionCol("rawPrediction")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10)\n .setObjective("binary")\n .setLabelCol("labels")\n .setLeafPredictionCol("leafPrediction")\n .setFeaturesShapCol("featuresShap"))\n'))),(0,l.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.lightgbm._\n\nval lgbmClassifier = (new LightGBMClassifier()\n .setFeaturesCol("features")\n .setRawPredictionCol("rawPrediction")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10)\n .setObjective("binary")\n .setLabelCol("labels")\n .setLeafPredictionCol("leafPrediction")\n 
.setFeaturesShapCol("featuresShap"))\n')))),(0,l.kt)(i.Z,{className:"LightGBMClassifier",py:"synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMClassifier",scala:"com/microsoft/azure/synapse/ml/lightgbm/LightGBMClassifier.html",csharp:"classSynapse_1_1ML_1_1Lightgbm_1_1LightGBMClassifier.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/LightGBMClassifier.scala",mdxType:"DocTable"}),(0,l.kt)("h2",{id:"lightgbmranker"},"LightGBMRanker"),(0,l.kt)(s.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.lightgbm import *\n\nlgbmRanker = (LightGBMRanker()\n .setLabelCol("labels")\n .setFeaturesCol("features")\n .setGroupCol("query")\n .setDefaultListenPort(12402)\n .setRepartitionByGroupingColumn(False)\n .setNumLeaves(5)\n .setNumIterations(10))\n'))),(0,l.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.lightgbm._\n\nval lgbmRanker = (new LightGBMRanker()\n .setLabelCol("labels")\n .setFeaturesCol("features")\n .setGroupCol("query")\n .setDefaultListenPort(12402)\n .setRepartitionByGroupingColumn(false)\n .setNumLeaves(5)\n 
.setNumIterations(10))\n')))),(0,l.kt)(i.Z,{className:"LightGBMRanker",py:"synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMRanker",scala:"com/microsoft/azure/synapse/ml/lightgbm/LightGBMRanker.html",csharp:"classSynapse_1_1ML_1_1Lightgbm_1_1LightGBMRanker.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/LightGBMRanker.scala",mdxType:"DocTable"}),(0,l.kt)("h2",{id:"lightgbmregressor"},"LightGBMRegressor"),(0,l.kt)(s.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.lightgbm import *\n\nlgbmRegressor = (LightGBMRegressor()\n .setLabelCol("labels")\n .setFeaturesCol("features")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10))\n'))),(0,l.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.lightgbm._\n\nval lgbmRegressor = (new LightGBMRegressor()\n .setLabelCol("labels")\n .setFeaturesCol("features")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10))\n')))),(0,l.kt)(i.Z,{className:"LightGBMRegressor",py:"synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMRegressor",scala:"com/microsoft/azure/synapse/ml/lightgbm/LightGBMRegressor.html",csharp:"classSynapse_1_1ML_1_1Lightgbm_1_1LightGBMRegressor.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/LightGBMRegressor.scala",mdxType:"DocTable"}))}p.isMDXComponent=!0;var g=["components"],f={title:"Estimators - LightGBM",sidebar_label:"LightGBM",hide_title:!0},b="LightGBM",h={unversionedId:"Quick Examples/estimators/estimators_lightgbm",id:"version-0.11.3/Quick 
Examples/estimators/estimators_lightgbm",title:"Estimators - LightGBM",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/estimators/estimators_lightgbm.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_lightgbm",permalink:"/SynapseML/docs/Quick Examples/estimators/estimators_lightgbm",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Estimators - LightGBM",sidebar_label:"LightGBM",hide_title:!0}},d={},v=[].concat(c),y={toc:v};function k(e){var t=e.components,r=(0,n.Z)(e,g);return(0,l.kt)("wrapper",(0,a.Z)({},y,r,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h1",{id:"lightgbm"},"LightGBM"),(0,l.kt)(p,{mdxType:"LightGBM"}))}k.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[5847],{3905:function(e,t,r){r.d(t,{Zo:function(){return c},kt:function(){return g}});var a=r(7294);function n(e,t,r){return t in e?Object.defineProperty(e,t,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[t]=r,e}function l(e,t){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),r.push.apply(r,a)}return r}function s(e){for(var t=1;t=0||(n[r]=e[r]);return n}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(n[r]=e[r])}return n}var i=a.createContext({}),u=function(e){var t=a.useContext(i),r=t;return e&&(r="function"==typeof e?e(t):s(s({},t),e)),r},c=function(e){var t=u(e.components);return a.createElement(i.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},p=a.forwardRef((function(e,t){var 
r=e.components,n=e.mdxType,l=e.originalType,i=e.parentName,c=o(e,["components","mdxType","originalType","parentName"]),p=u(r),g=n,f=p["".concat(i,".").concat(g)]||p[g]||m[g]||l;return r?a.createElement(f,s(s({ref:t},c),{},{components:r})):a.createElement(f,s({ref:t},c))}));function g(e,t){var r=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var l=r.length,s=new Array(l);s[0]=p;var o={};for(var i in t)hasOwnProperty.call(t,i)&&(o[i]=t[i]);o.originalType=e,o.mdxType="string"==typeof e?e:n,s[1]=o;for(var u=2;u child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:r.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function p(e){var t=e.values,r=e.children;return(0,n.useMemo)((function(){var e=null!=t?t:m(r);return function(e){var t=(0,u.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,r])}function g(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function f(e){var t=e.queryString,r=void 0!==t&&t,a=e.groupId,l=(0,o.k6)(),s=function(e){var t=e.queryString,r=void 0!==t&&t,a=e.groupId;if("string"==typeof r)return r;if(!1===r)return null;if(!0===r&&!a)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. 
You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:r,groupId:a});return[(0,i._X)(s),(0,n.useCallback)((function(e){if(s){var t=new URLSearchParams(l.location.search);t.set(s,e),l.replace(Object.assign({},l.location,{search:t.toString()}))}}),[s,l])]}function b(e){var t,r,a,l,s=e.defaultValue,o=e.queryString,i=void 0!==o&&o,u=e.groupId,m=p(e),b=(0,n.useState)((function(){return function(e){var t,r=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the component requires at least one children component");if(r){if(!g({value:r,tabValues:a}))throw new Error('Docusaurus error: The has a defaultValue "'+r+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". If you intend to show no default tab, use defaultValue={null} instead.");return r}var n=null!=(t=a.find((function(e){return e.default})))?t:a[0];if(!n)throw new Error("Unexpected error: 0 tabValues");return n.value}({defaultValue:s,tabValues:m})})),h=b[0],d=b[1],v=f({queryString:i,groupId:u}),y=v[0],k=v[1],L=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:u}.groupId),r=(0,c.Nk)(t),a=r[0],l=r[1],[a,(0,n.useCallback)((function(e){t&&l.set(e)}),[t,l])]),E=L[0],w=L[1],M=function(){var e=null!=y?y:E;return g({value:e,tabValues:m})?e:null}();return(0,n.useLayoutEffect)((function(){M&&d(M)}),[M]),{selectedValue:h,selectValue:(0,n.useCallback)((function(e){if(!g({value:e,tabValues:m}))throw new Error("Can't select invalid tab value="+e);d(e),k(e),w(e)}),[k,w,m]),tabValues:m}}var h=r(2389),d="tabList__CuJ",v="tabItem_LNqP";function y(e){var t=e.className,r=e.block,o=e.selectedValue,i=e.selectValue,u=e.tabValues,c=[],m=(0,s.o5)().blockElementScrollPositionUntilNextRender,p=function(e){var t=e.currentTarget,r=c.indexOf(t),a=u[r].value;a!==o&&(m(t),i(a))},g=function(e){var t,r=null;switch(e.key){case"Enter":p(e);break;case"ArrowRight":var 
a,n=c.indexOf(e.currentTarget)+1;r=null!=(a=c[n])?a:c[0];break;case"ArrowLeft":var l,s=c.indexOf(e.currentTarget)-1;r=null!=(l=c[s])?l:c[c.length-1]}null==(t=r)||t.focus()};return n.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,l.Z)("tabs",{"tabs--block":r},t)},u.map((function(e){var t=e.value,r=e.label,s=e.attributes;return n.createElement("li",(0,a.Z)({role:"tab",tabIndex:o===t?0:-1,"aria-selected":o===t,key:t,ref:function(e){return c.push(e)},onKeyDown:g,onClick:p},s,{className:(0,l.Z)("tabs__item",v,null==s?void 0:s.className,{"tabs__item--active":o===t})}),null!=r?r:t)})))}function k(e){var t=e.lazy,r=e.children,a=e.selectedValue,l=(Array.isArray(r)?r:[r]).filter(Boolean);if(t){var s=l.find((function(e){return e.props.value===a}));return s?(0,n.cloneElement)(s,{className:"margin-top--md"}):null}return n.createElement("div",{className:"margin-top--md"},l.map((function(e,t){return(0,n.cloneElement)(e,{key:t,hidden:e.props.value!==a})})))}function L(e){var t=b(e);return n.createElement("div",{className:(0,l.Z)("tabs-container",d)},n.createElement(y,(0,a.Z)({},e,t)),n.createElement(k,(0,a.Z)({},e,t)))}function E(e){var t=(0,h.Z)();return n.createElement(L,(0,a.Z)({key:String(t)},e))}},1989:function(e,t,r){var a=r(7294),n=r(2263);t.Z=function(e){var t=e.className,r=e.py,l=e.scala,s=e.csharp,o=e.sourceLink,i=(0,n.Z)().siteConfig.customFields.version,u="https://mmlspark.blob.core.windows.net/docs/"+i+"/pyspark/"+r,c="https://mmlspark.blob.core.windows.net/docs/"+i+"/scala/"+l,m="https://mmlspark.blob.core.windows.net/docs/"+i+"/dotnet/"+s;return a.createElement("table",null,a.createElement("tbody",null,a.createElement("tr",null,a.createElement("td",null,a.createElement("strong",null,"Python API: "),a.createElement("a",{href:u},t)),a.createElement("td",null,a.createElement("strong",null,"Scala API: "),a.createElement("a",{href:c},t)),a.createElement("td",null,a.createElement("strong",null,".NET API: 
"),a.createElement("a",{href:m},t)),a.createElement("td",null,a.createElement("strong",null,"Source: "),a.createElement("a",{href:o},t)))))}},114:function(e,t,r){r.r(t),r.d(t,{assets:function(){return d},contentTitle:function(){return b},default:function(){return k},frontMatter:function(){return f},metadata:function(){return h},toc:function(){return v}});var a=r(3117),n=r(102),l=(r(7294),r(3905)),s=r(4866),o=r(5162),i=r(1989),u=["components"],c=[{value:"LightGBMClassifier",id:"lightgbmclassifier",level:2},{value:"LightGBMRanker",id:"lightgbmranker",level:2},{value:"LightGBMRegressor",id:"lightgbmregressor",level:2}],m={toc:c};function p(e){var t=e.components,r=(0,n.Z)(e,u);return(0,l.kt)("wrapper",(0,a.Z)({},m,r,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h2",{id:"lightgbmclassifier"},"LightGBMClassifier"),(0,l.kt)(s.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.lightgbm import *\n\nlgbmClassifier = (LightGBMClassifier()\n .setFeaturesCol("features")\n .setRawPredictionCol("rawPrediction")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10)\n .setObjective("binary")\n .setLabelCol("labels")\n .setLeafPredictionCol("leafPrediction")\n .setFeaturesShapCol("featuresShap"))\n'))),(0,l.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.lightgbm._\n\nval lgbmClassifier = (new LightGBMClassifier()\n .setFeaturesCol("features")\n .setRawPredictionCol("rawPrediction")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10)\n .setObjective("binary")\n .setLabelCol("labels")\n .setLeafPredictionCol("leafPrediction")\n 
.setFeaturesShapCol("featuresShap"))\n')))),(0,l.kt)(i.Z,{className:"LightGBMClassifier",py:"synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMClassifier",scala:"com/microsoft/azure/synapse/ml/lightgbm/LightGBMClassifier.html",csharp:"classSynapse_1_1ML_1_1Lightgbm_1_1LightGBMClassifier.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/LightGBMClassifier.scala",mdxType:"DocTable"}),(0,l.kt)("h2",{id:"lightgbmranker"},"LightGBMRanker"),(0,l.kt)(s.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.lightgbm import *\n\nlgbmRanker = (LightGBMRanker()\n .setLabelCol("labels")\n .setFeaturesCol("features")\n .setGroupCol("query")\n .setDefaultListenPort(12402)\n .setRepartitionByGroupingColumn(False)\n .setNumLeaves(5)\n .setNumIterations(10))\n'))),(0,l.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.lightgbm._\n\nval lgbmRanker = (new LightGBMRanker()\n .setLabelCol("labels")\n .setFeaturesCol("features")\n .setGroupCol("query")\n .setDefaultListenPort(12402)\n .setRepartitionByGroupingColumn(false)\n .setNumLeaves(5)\n 
.setNumIterations(10))\n')))),(0,l.kt)(i.Z,{className:"LightGBMRanker",py:"synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMRanker",scala:"com/microsoft/azure/synapse/ml/lightgbm/LightGBMRanker.html",csharp:"classSynapse_1_1ML_1_1Lightgbm_1_1LightGBMRanker.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/LightGBMRanker.scala",mdxType:"DocTable"}),(0,l.kt)("h2",{id:"lightgbmregressor"},"LightGBMRegressor"),(0,l.kt)(s.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.lightgbm import *\n\nlgbmRegressor = (LightGBMRegressor()\n .setLabelCol("labels")\n .setFeaturesCol("features")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10))\n'))),(0,l.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.lightgbm._\n\nval lgbmRegressor = (new LightGBMRegressor()\n .setLabelCol("labels")\n .setFeaturesCol("features")\n .setDefaultListenPort(12402)\n .setNumLeaves(5)\n .setNumIterations(10))\n')))),(0,l.kt)(i.Z,{className:"LightGBMRegressor",py:"synapse.ml.lightgbm.html#module-synapse.ml.lightgbm.LightGBMRegressor",scala:"com/microsoft/azure/synapse/ml/lightgbm/LightGBMRegressor.html",csharp:"classSynapse_1_1ML_1_1Lightgbm_1_1LightGBMRegressor.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/LightGBMRegressor.scala",mdxType:"DocTable"}))}p.isMDXComponent=!0;var g=["components"],f={title:"Estimators - LightGBM",sidebar_label:"LightGBM",hide_title:!0},b="LightGBM",h={unversionedId:"Quick Examples/estimators/estimators_lightgbm",id:"version-0.11.3/Quick 
Examples/estimators/estimators_lightgbm",title:"Estimators - LightGBM",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/estimators/estimators_lightgbm.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_lightgbm",permalink:"/SynapseML/docs/0.11.3/Quick Examples/estimators/estimators_lightgbm",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Estimators - LightGBM",sidebar_label:"LightGBM",hide_title:!0}},d={},v=[].concat(c),y={toc:v};function k(e){var t=e.components,r=(0,n.Z)(e,g);return(0,l.kt)("wrapper",(0,a.Z)({},y,r,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h1",{id:"lightgbm"},"LightGBM"),(0,l.kt)(p,{mdxType:"LightGBM"}))}k.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/4e3910af.79c13ea4.js b/assets/js/4e3910af.79c13ea4.js new file mode 100644 index 0000000000..40ec9114e7 --- /dev/null +++ b/assets/js/4e3910af.79c13ea4.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[3544],{3905:function(e,a,r){r.d(a,{Zo:function(){return c},kt:function(){return d}});var t=r(7294);function n(e,a,r){return a in e?Object.defineProperty(e,a,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[a]=r,e}function s(e,a){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var t=Object.getOwnPropertySymbols(e);a&&(t=t.filter((function(a){return Object.getOwnPropertyDescriptor(e,a).enumerable}))),r.push.apply(r,t)}return r}function o(e){for(var a=1;a=0||(n[r]=e[r]);return n}(e,a);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(t=0;t=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(n[r]=e[r])}return n}var i=t.createContext({}),m=function(e){var a=t.useContext(i),r=a;return e&&(r="function"==typeof e?e(a):o(o({},a),e)),r},c=function(e){var a=m(e.components);return t.createElement(i.Provider,{value:a},e.children)},p={inlineCode:"code",wrapper:function(e){var a=e.children;return 
t.createElement(t.Fragment,{},a)}},h=t.forwardRef((function(e,a){var r=e.components,n=e.mdxType,s=e.originalType,i=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),h=m(r),d=n,f=h["".concat(i,".").concat(d)]||h[d]||p[d]||s;return r?t.createElement(f,o(o({ref:a},c),{},{components:r})):t.createElement(f,o({ref:a},c))}));function d(e,a){var r=arguments,n=a&&a.mdxType;if("string"==typeof e||n){var s=r.length,o=new Array(s);o[0]=h;var l={};for(var i in a)hasOwnProperty.call(a,i)&&(l[i]=a[i]);l.originalType=e,l.mdxType="string"==typeof e?e:n,o[1]=l;for(var m=2;m=0||(l[t]=e[t]);return l}(e,n);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(l[t]=e[t])}return l}var s=a.createContext({}),p=function(e){var n=a.useContext(s),t=n;return e&&(t="function"==typeof e?e(n):r(r({},n),e)),t},c=function(e){var n=p(e.components);return a.createElement(s.Provider,{value:n},e.children)},d={inlineCode:"code",wrapper:function(e){var n=e.children;return a.createElement(a.Fragment,{},n)}},u=a.forwardRef((function(e,n){var t=e.components,l=e.mdxType,i=e.originalType,s=e.parentName,c=o(e,["components","mdxType","originalType","parentName"]),u=p(t),m=l,f=u["".concat(s,".").concat(m)]||u[m]||d[m]||i;return t?a.createElement(f,r(r({ref:n},c),{},{components:t})):a.createElement(f,r({ref:n},c))}));function m(e,n){var t=arguments,l=n&&n.mdxType;if("string"==typeof e||l){var i=t.length,r=new Array(i);r[0]=u;var o={};for(var s in n)hasOwnProperty.call(n,s)&&(o[s]=n[s]);o.originalType=e,o.mdxType="string"==typeof e?e:l,r[1]=o;for(var p=2;p= 50K or < 50K based on our features."),(0,i.kt)("hr",null),(0,i.kt)("p",null,"Python dependencies:"),(0,i.kt)("p",null,"matplotlib==3.2.2"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},"from pyspark.ml import Pipeline\nfrom pyspark.ml.classification import GBTClassifier\nfrom pyspark.ml.feature import VectorAssembler, 
StringIndexer, OneHotEncoder\nimport pyspark.sql.functions as F\nfrom pyspark.ml.evaluation import BinaryClassificationEvaluator\nfrom synapse.ml.explainers import ICETransformer\nimport matplotlib.pyplot as plt\nfrom synapse.ml.core.platform import *\n")),(0,i.kt)("h3",{id:"read-and-prepare-the-dataset"},"Read and prepare the dataset"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'df = spark.read.parquet(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet"\n)\ndisplay(df)\n')),(0,i.kt)("h3",{id:"fit-the-model-and-view-the-predictions"},"Fit the model and view the predictions"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'categorical_features = [\n "race",\n "workclass",\n "marital-status",\n "education",\n "occupation",\n "relationship",\n "native-country",\n "sex",\n]\nnumeric_features = [\n "age",\n "education-num",\n "capital-gain",\n "capital-loss",\n "hours-per-week",\n]\n')),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'string_indexer_outputs = [feature + "_idx" for feature in categorical_features]\none_hot_encoder_outputs = [feature + "_enc" for feature in categorical_features]\n\npipeline = Pipeline(\n stages=[\n StringIndexer()\n .setInputCol("income")\n .setOutputCol("label")\n .setStringOrderType("alphabetAsc"),\n StringIndexer()\n .setInputCols(categorical_features)\n .setOutputCols(string_indexer_outputs),\n OneHotEncoder()\n .setInputCols(string_indexer_outputs)\n .setOutputCols(one_hot_encoder_outputs),\n VectorAssembler(\n inputCols=one_hot_encoder_outputs + numeric_features, outputCol="features"\n ),\n GBTClassifier(weightCol="fnlwgt", maxDepth=7, maxIter=100),\n ]\n)\n\nmodel = pipeline.fit(df)\n')),(0,i.kt)("p",null,"Check that model makes sense and has reasonable output. 
For this, we will check the model performance by calculating the ROC-AUC score."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'data = model.transform(df)\ndisplay(data.select("income", "probability", "prediction"))\n')),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'eval_auc = BinaryClassificationEvaluator(\n labelCol="label", rawPredictionCol="prediction"\n)\neval_auc.evaluate(data)\n')),(0,i.kt)("h2",{id:"partial-dependence-plots"},"Partial Dependence Plots"),(0,i.kt)("p",null,"Partial dependence plots (PDP) show the dependence between the target response and a set of input features of interest, marginalizing over the values of all other input features. It can show whether the relationship between the target response and the input feature is linear, smooth, monotonic, or more complex. This is relevant when you want to have an overall understanding of model behavior. E.g. Identifying specific age group has a favorable predictions vs other age groups."),(0,i.kt)("p",null,"If you want to learn more please check out the ",(0,i.kt)("a",{parentName:"p",href:"https://scikit-learn.org/stable/modules/partial_dependence.html#partial-dependence-plots"},"scikit-learn page on partial dependence plots"),"."),(0,i.kt)("h3",{id:"set-up-the-transformer-for-pdp"},"Set up the transformer for PDP"),(0,i.kt)("p",null,"To plot PDP we need to set up the instance of ",(0,i.kt)("inlineCode",{parentName:"p"},"ICETransformer")," first and set the ",(0,i.kt)("inlineCode",{parentName:"p"},"kind")," parameter to ",(0,i.kt)("inlineCode",{parentName:"p"},"average")," and then call the ",(0,i.kt)("inlineCode",{parentName:"p"},"transform")," function. "),(0,i.kt)("p",null,'For the setup we need to pass the pretrained model, specify the target column ("probability" in our case), and pass categorical and numeric feature names.'),(0,i.kt)("p",null,"Categorical and numeric features can be passed as a list of names. 
But we can specify parameters for the features by passing a list of dicts where each dict represents one feature. "),(0,i.kt)("p",null,"For the numeric features a dictionary can look like this:"),(0,i.kt)("p",null,'{"name": "capital-gain", "numSplits": 20, "rangeMin": 0.0, "rangeMax": 10000.0, "outputColName": "capital-gain_dependance"}'),(0,i.kt)("p",null,"Where the required key-value pair is ",(0,i.kt)("inlineCode",{parentName:"p"},"name")," - the name of the numeric feature. Next key-values pairs are optional: ",(0,i.kt)("inlineCode",{parentName:"p"},"numSplits")," - the number of splits for the value range for the numeric feature, ",(0,i.kt)("inlineCode",{parentName:"p"},"rangeMin")," - specifies the min value of the range for the numeric feature, ",(0,i.kt)("inlineCode",{parentName:"p"},"rangeMax")," - specifies the max value of the range for the numeric feature, ",(0,i.kt)("inlineCode",{parentName:"p"},"outputColName")," - the name for output column with explanations for the feature."),(0,i.kt)("p",null,"For the categorical features a dictionary can look like this:"),(0,i.kt)("p",null,'{"name": "marital-status", "numTopValues": 10, "outputColName": "marital-status_dependance"}'),(0,i.kt)("p",null,"Where the required key-value pair is ",(0,i.kt)("inlineCode",{parentName:"p"},"name")," - the name of the numeric feature. 
Next key-values pairs are optional: ",(0,i.kt)("inlineCode",{parentName:"p"},"numTopValues")," - the max number of top-occurring values to be included in the categorical feature, ",(0,i.kt)("inlineCode",{parentName:"p"},"outputColName")," - the name for output column with explanations for the feature."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'pdp = ICETransformer(\n model=model,\n targetCol="probability",\n kind="average",\n targetClasses=[1],\n categoricalFeatures=categorical_features,\n numericFeatures=numeric_features,\n)\n')),(0,i.kt)("p",null,"PDP transformer returns a dataframe of 1 row * {number features to explain} columns. Each column contains a map between the feature's values and the model's average dependence for that feature value."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},"output_pdp = pdp.transform(df)\ndisplay(output_pdp)\n")),(0,i.kt)("h3",{id:"visualization"},"Visualization"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'# Helper functions for visualization\n\n\ndef get_pandas_df_from_column(df, col_name):\n keys_df = df.select(F.explode(F.map_keys(F.col(col_name)))).distinct()\n keys = list(map(lambda row: row[0], keys_df.collect()))\n key_cols = list(map(lambda f: F.col(col_name).getItem(f).alias(str(f)), keys))\n final_cols = key_cols\n pandas_df = df.select(final_cols).toPandas()\n return pandas_df\n\n\ndef plot_dependence_for_categorical(df, col, col_int=True, figsize=(20, 5)):\n dict_values = {}\n col_names = list(df.columns)\n\n for col_name in col_names:\n dict_values[col_name] = df[col_name][0].toArray()[0]\n marklist = sorted(\n dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]\n )\n sortdict = dict(marklist)\n\n fig = plt.figure(figsize=figsize)\n plt.bar(sortdict.keys(), sortdict.values())\n\n plt.xlabel(col, size=13)\n plt.ylabel("Dependence")\n plt.show()\n\n\ndef plot_dependence_for_numeric(df, 
col, col_int=True, figsize=(20, 5)):\n dict_values = {}\n col_names = list(df.columns)\n\n for col_name in col_names:\n dict_values[col_name] = df[col_name][0].toArray()[0]\n marklist = sorted(\n dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]\n )\n sortdict = dict(marklist)\n\n fig = plt.figure(figsize=figsize)\n\n plt.plot(list(sortdict.keys()), list(sortdict.values()))\n\n plt.xlabel(col, size=13)\n plt.ylabel("Dependence")\n plt.ylim(0.0)\n plt.show()\n')),(0,i.kt)("h4",{id:"example-1-age"},'Example 1: "age"'),(0,i.kt)("p",null,"We can observe non-linear dependency. The model predicts that income rapidly grows from 24-46 y.o. age, after 46 y.o. model predictions slightly drops and from 68 y.o. remains stable."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'df_education_num = get_pandas_df_from_column(output_pdp, "age_dependence")\nplot_dependence_for_numeric(df_education_num, "age")\n')),(0,i.kt)("p",null,"Your results will look like:"),(0,i.kt)("p",null,(0,i.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/explainers/pdp_age.png",alt:"pdp_age"})),(0,i.kt)("h4",{id:"example-2-marital-status"},'Example 2: "marital-status"'),(0,i.kt)("p",null,'The model seems to treat "married-cv-spouse" as one category and tend to give a higher average prediction, and all others as a second category with the lower average prediction.'),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'df_occupation = get_pandas_df_from_column(output_pdp, "marital-status_dependence")\nplot_dependence_for_categorical(df_occupation, "marital-status", False, figsize=(30, 5))\n')),(0,i.kt)("p",null,"Your results will look like:\n",(0,i.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/explainers/pdp_marital-status.png",alt:"pdp_marital-status"})),(0,i.kt)("h4",{id:"example-3-capital-gain"},'Example 3: "capital-gain"'),(0,i.kt)("p",null,"In the first 
graph, we run PDP with default parameters. We can see that this representation is not super useful because it is not granular enough. By default the range of numeric features are calculated dynamically from the data."),(0,i.kt)("p",null,"In the second graph, we set rangeMin = 0 and rangeMax = 10000 to visualize more granular interpretations for the feature of interest. Now we can see more clearly how the model made decisions in a smaller region."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'df_education_num = get_pandas_df_from_column(output_pdp, "capital-gain_dependence")\nplot_dependence_for_numeric(df_education_num, "capital-gain_dependence")\n')),(0,i.kt)("p",null,"Your results will look like:"),(0,i.kt)("p",null,(0,i.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/explainers/pdp_capital-gain-first.png",alt:"pdp_capital-gain-first"})),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'pdp_cap_gain = ICETransformer(\n model=model,\n targetCol="probability",\n kind="average",\n targetClasses=[1],\n numericFeatures=[\n {"name": "capital-gain", "numSplits": 20, "rangeMin": 0.0, "rangeMax": 10000.0}\n ],\n numSamples=50,\n)\noutput_pdp_cap_gain = pdp_cap_gain.transform(df)\ndf_education_num_gain = get_pandas_df_from_column(\n output_pdp_cap_gain, "capital-gain_dependence"\n)\nplot_dependence_for_numeric(df_education_num_gain, "capital-gain_dependence")\n')),(0,i.kt)("p",null,"Your results will look like:"),(0,i.kt)("p",null,(0,i.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/explainers/pdp_capital-gain-second.png",alt:"pdp_capital-gain-second"})),(0,i.kt)("h3",{id:"conclusions"},"Conclusions"),(0,i.kt)("p",null,"PDP can be used to show how features influence model predictions on average and help modeler catch unexpected behavior from the model."),(0,i.kt)("h2",{id:"individual-conditional-expectation"},"Individual Conditional 
Expectation"),(0,i.kt)("p",null,"ICE plots display one line per instance that shows how the instance\u2019s prediction changes when a feature values change. Each line represents the predictions for one instance if we vary the feature of interest. This is relevant when you want to observe model prediction for instances individually in more details. "),(0,i.kt)("p",null,"If you want to learn more please check out the ",(0,i.kt)("a",{parentName:"p",href:"https://scikit-learn.org/stable/modules/partial_dependence.html#individual-conditional-expectation-ice-plot"},"scikit-learn page on ICE plots"),"."),(0,i.kt)("h3",{id:"set-up-the-transformer-for-ice"},"Set up the transformer for ICE"),(0,i.kt)("p",null,"To plot ICE we need to set up the instance of ",(0,i.kt)("inlineCode",{parentName:"p"},"ICETransformer")," first and set the ",(0,i.kt)("inlineCode",{parentName:"p"},"kind")," parameter to ",(0,i.kt)("inlineCode",{parentName:"p"},"individual")," and then call the ",(0,i.kt)("inlineCode",{parentName:"p"},"transform"),' function. For the setup we need to pass the pretrained model, specify the target column ("probability" in our case), and pass categorical and numeric feature names. 
For better visualization we set the number of samples to 50.'),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'ice = ICETransformer(\n model=model,\n targetCol="probability",\n targetClasses=[1],\n categoricalFeatures=categorical_features,\n numericFeatures=numeric_features,\n numSamples=50,\n)\n\noutput = ice.transform(df)\n')),(0,i.kt)("h3",{id:"visualization-1"},"Visualization"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'# Helper functions for visualization\nfrom math import pi\n\nfrom collections import defaultdict\n\n\ndef plot_ice_numeric(df, col, col_int=True, figsize=(20, 10)):\n dict_values = defaultdict(list)\n col_names = list(df.columns)\n num_instances = df.shape[0]\n\n instances_y = {}\n i = 0\n\n for col_name in col_names:\n for i in range(num_instances):\n dict_values[i].append(df[col_name][i].toArray()[0])\n\n fig = plt.figure(figsize=figsize)\n for i in range(num_instances):\n plt.plot(col_names, dict_values[i], "k")\n\n plt.xlabel(col, size=13)\n plt.ylabel("Dependence")\n plt.ylim(0.0)\n\n\ndef plot_ice_categorical(df, col, col_int=True, figsize=(20, 10)):\n dict_values = defaultdict(list)\n col_names = list(df.columns)\n num_instances = df.shape[0]\n\n angles = [n / float(df.shape[1]) * 2 * pi for n in range(df.shape[1])]\n angles += angles[:1]\n\n instances_y = {}\n i = 0\n\n for col_name in col_names:\n for i in range(num_instances):\n dict_values[i].append(df[col_name][i].toArray()[0])\n\n fig = plt.figure(figsize=figsize)\n ax = plt.subplot(111, polar=True)\n plt.xticks(angles[:-1], col_names)\n\n for i in range(num_instances):\n values = dict_values[i]\n values += values[:1]\n ax.plot(angles, values, "k")\n ax.fill(angles, values, "teal", alpha=0.1)\n\n plt.xlabel(col, size=13)\n plt.show()\n\n\ndef overlay_ice_with_pdp(df_ice, df_pdp, col, col_int=True, figsize=(20, 5)):\n dict_values = defaultdict(list)\n col_names_ice = list(df_ice.columns)\n num_instances = 
df_ice.shape[0]\n\n instances_y = {}\n i = 0\n\n for col_name in col_names_ice:\n for i in range(num_instances):\n dict_values[i].append(df_ice[col_name][i].toArray()[0])\n\n fig = plt.figure(figsize=figsize)\n for i in range(num_instances):\n plt.plot(col_names_ice, dict_values[i], "k")\n\n dict_values_pdp = {}\n col_names = list(df_pdp.columns)\n\n for col_name in col_names:\n dict_values_pdp[col_name] = df_pdp[col_name][0].toArray()[0]\n marklist = sorted(\n dict_values_pdp.items(), key=lambda x: int(x[0]) if col_int else x[0]\n )\n sortdict = dict(marklist)\n\n plt.plot(col_names_ice, list(sortdict.values()), "r", linewidth=5)\n\n plt.xlabel(col, size=13)\n plt.ylabel("Dependence")\n plt.ylim(0.0)\n plt.show()\n')),(0,i.kt)("h4",{id:"example-1-numeric-feature-age"},'Example 1: Numeric feature: "age"'),(0,i.kt)("p",null,'We can overlay the PDP on top of ICE plots. In the graph, the red line shows the PDP plot for the "age" feature, and the black lines show ICE plots for 50 randomly selected observations. '),(0,i.kt)("p",null,'The visualization shows that all curves in the ICE plot follow a similar course. This means that the PDP (red line) is already a good summary of the relationships between the displayed feature "age" and the model\'s average predictions of "income". 
'),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'age_df_ice = get_pandas_df_from_column(output, "age_dependence")\nage_df_pdp = get_pandas_df_from_column(output_pdp, "age_dependence")\n\noverlay_ice_with_pdp(age_df_ice, age_df_pdp, col="age_dependence", figsize=(30, 10))\n')),(0,i.kt)("p",null,"Your results will look like:\n",(0,i.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/explainers/pdp_age_overlayed.png",alt:"pdp_age_overlayed"})),(0,i.kt)("h4",{id:"example-2-categorical-feature-occupation"},'Example 2: Categorical feature: "occupation"'),(0,i.kt)("p",null,"For visualization of categorical features, we are using a star plot."),(0,i.kt)("ul",null,(0,i.kt)("li",{parentName:"ul"},"The X-axis here is a circle which is split into equal parts, each representing a feature value."),(0,i.kt)("li",{parentName:"ul"},"The Y-coordinate shows the dependence values. Each line represents a sample observation.")),(0,i.kt)("p",null,'Here we can see that "Farming-fishing" drives the least predictions - because values accumulated near the lowest probabilities, but, for example, "Exec-managerial" seems to have one of the highest impacts for model predictions.'),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'occupation_dep = get_pandas_df_from_column(output, "occupation_dependence")\n\nplot_ice_categorical(occupation_dep, "occupation_dependence", figsize=(30, 10))\n')),(0,i.kt)("p",null,"Your results will look like:"),(0,i.kt)("p",null,(0,i.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/explainers/pdp_occupation-star-plot.png",alt:"pdp_occupation-star-plot"})),(0,i.kt)("h3",{id:"conclusions-1"},"Conclusions"),(0,i.kt)("p",null,"ICE plots show model behavior on individual observations. 
Each line represents the prediction from the model if we vary the feature of interest."),(0,i.kt)("h2",{id:"pdp-based-feature-importance"},"PDP-based Feature Importance"),(0,i.kt)("p",null,'Using PDP we can calculate a simple partial dependence-based feature importance measure. We note that a flat PDP indicates that varying the feature does not affect the prediction. The more the PDP varies, the more "important" the feature is. '),(0,i.kt)("p",null,"If you want to learn more please check out ",(0,i.kt)("a",{parentName:"p",href:"https://christophm.github.io/interpretable-ml-book/pdp.html#pdp-based-feature-importance"},"Christoph M's Interpretable ML Book"),"."),(0,i.kt)("h3",{id:"set-up-the-transformer-for-pdp-based-feature-importance"},"Set up the transformer for PDP-based Feature Importance"),(0,i.kt)("p",null,"To plot PDP-based feature importance, we first need to set up the instance of ",(0,i.kt)("inlineCode",{parentName:"p"},"ICETransformer")," by setting the ",(0,i.kt)("inlineCode",{parentName:"p"},"kind")," parameter to ",(0,i.kt)("inlineCode",{parentName:"p"},"feature"),". We can then call the ",(0,i.kt)("inlineCode",{parentName:"p"},"transform")," function. "),(0,i.kt)("p",null,(0,i.kt)("inlineCode",{parentName:"p"},"transform")," returns a two-column table where the first columns are feature importance values and the second are corresponding features names. 
The rows are sorted in descending order by feature importance values."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'pdp_based_imp = ICETransformer(\n model=model,\n targetCol="probability",\n kind="feature",\n targetClasses=[1],\n categoricalFeatures=categorical_features,\n numericFeatures=numeric_features,\n)\n\noutput_pdp_based_imp = pdp_based_imp.transform(df)\ndisplay(output_pdp_based_imp)\n')),(0,i.kt)("h3",{id:"visualization-2"},"Visualization"),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'# Helper functions for visualization\n\n\ndef plot_pdp_based_imp(df, figsize=(35, 5)):\n values_list = list(df.select("pdpBasedDependence").toPandas()["pdpBasedDependence"])\n names = list(df.select("featureNames").toPandas()["featureNames"])\n dependence_values = []\n for vec in values_list:\n dependence_values.append(vec.toArray()[0])\n\n fig = plt.figure(figsize=figsize)\n plt.bar(names, dependence_values)\n\n plt.xlabel("Feature names", size=13)\n plt.ylabel("PDP-based-feature-imporance")\n plt.show()\n')),(0,i.kt)("p",null,"This shows that the features ",(0,i.kt)("inlineCode",{parentName:"p"},"capital-gain")," and ",(0,i.kt)("inlineCode",{parentName:"p"},"education-num")," were the most important for the model, and ",(0,i.kt)("inlineCode",{parentName:"p"},"sex")," and ",(0,i.kt)("inlineCode",{parentName:"p"},"education")," were the least important."),(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},"plot_pdp_based_imp(output_pdp_based_imp)\n")),(0,i.kt)("p",null,"Your results will look like:"),(0,i.kt)("p",null,(0,i.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/explainers/pdp-based-importance.png",alt:"pdp_based-importance"})),(0,i.kt)("h2",{id:"overall-conclusions"},"Overall conclusions"),(0,i.kt)("p",null,"Interpretation methods are very important responsible AI tools."),(0,i.kt)("p",null,"Partial dependence plots (PDP) and 
Individual Conditional Expectation (ICE) plots can be used to visualize and analyze interaction between the target response and a set of input features of interest."),(0,i.kt)("p",null,"PDPs show the dependence of the average prediction when varying each feature. In contrast, ICE shows the dependence for individual samples. The approaches can help give rough estimates of a function's deviation from a baseline. This is important not only to help debug and understand how a model behaves but is a useful step in building responsible AI systems. These methodologies can improve transparency and provide model consumers with an extra level of accountability by model creators."),(0,i.kt)("p",null,"Using examples above we showed how to calculate and visualize such plots at a scalable manner to understand how a classification or regression model makes predictions, which features heavily impact the model, and how model prediction changes when feature value changes."))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/50af03e4.b59afece.js b/assets/js/50af03e4.b59afece.js new file mode 100644 index 0000000000..73d3aa3f8d --- /dev/null +++ b/assets/js/50af03e4.b59afece.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6204],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var r=n(7294);function i(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function a(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var s=r.createContext({}),p=function(e){var 
t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},m=r.forwardRef((function(e,t){var n=e.components,i=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),m=p(n),d=i,f=m["".concat(s,".").concat(d)]||m[d]||u[d]||a;return n?r.createElement(f,o(o({ref:t},c),{},{components:n})):r.createElement(f,o({ref:t},c))}));function d(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var a=n.length,o=new Array(a);o[0]=m;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:i,o[1]=l;for(var p=2;p 3).cast(LongType()))\n .select("label", "text")\n .cache()\n)\n\ndisplay(data)\n')),(0,a.kt)("p",null,"We train a text classification model, and randomly sample 10 rows to explain."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'train, test = data.randomSplit([0.60, 0.40])\n\npipeline = Pipeline(\n stages=[\n TextFeaturizer(\n inputCol="text",\n outputCol="features",\n useStopWordsRemover=True,\n useIDF=True,\n minDocFreq=20,\n numFeatures=1 << 16,\n ),\n LogisticRegression(maxIter=100, regParam=0.005, labelCol="label", featuresCol="features"),\n ]\n)\n\nmodel = pipeline.fit(train)\n\nprediction = model.transform(test)\n\nexplain_instances = prediction.orderBy(rand()).limit(10)\n')),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'def plotConfusionMatrix(df, label, prediction, classLabels):\n from synapse.ml.plot import confusionMatrix\n import matplotlib.pyplot as plt\n\n fig = plt.figure(figsize=(4.5, 4.5))\n confusionMatrix(df, label, prediction, classLabels)\n if running_on_synapse():\n plt.show()\n else:\n display(fig)\n\n\nplotConfusionMatrix(model.transform(test), "label", "prediction", [0, 1])\n')),(0,a.kt)("p",null,"First we use the 
LIME text explainer to explain the model's predicted probability for a given observation."),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'lime = TextLIME(\n model=model,\n outputCol="weights",\n inputCol="text",\n targetCol="probability",\n targetClasses=[1],\n tokensCol="tokens",\n samplingFraction=0.7,\n numSamples=2000,\n)\n\nlime_results = (\n lime.transform(explain_instances)\n .select("tokens", "weights", "r2", "probability", "text")\n .withColumn("probability", vec_access("probability", lit(1)))\n .withColumn("weights", vec2array(col("weights").getItem(0)))\n .withColumn("r2", vec_access("r2", lit(0)))\n .withColumn("tokens_weights", arrays_zip("tokens", "weights"))\n)\n\ndisplay(lime_results.select("probability", "r2", "tokens_weights", "text").orderBy(col("probability").desc()))\n')),(0,a.kt)("p",null,"Then we use the Kernel SHAP text explainer to explain the model's predicted probability for a given observation."),(0,a.kt)("blockquote",null,(0,a.kt)("p",{parentName:"blockquote"},"Notice that we drop the base value from the SHAP output before displaying the SHAP values. 
The base value is the model output for an empty string.")),(0,a.kt)("pre",null,(0,a.kt)("code",{parentName:"pre"},'shap = TextSHAP(\n model=model,\n outputCol="shaps",\n inputCol="text",\n targetCol="probability",\n targetClasses=[1],\n tokensCol="tokens",\n numSamples=5000,\n)\n\nshap_results = (\n shap.transform(explain_instances)\n .select("tokens", "shaps", "r2", "probability", "text")\n .withColumn("probability", vec_access("probability", lit(1)))\n .withColumn("shaps", vec2array(col("shaps").getItem(0)))\n .withColumn("shaps", slice(col("shaps"), lit(2), size(col("shaps"))))\n .withColumn("r2", vec_access("r2", lit(0)))\n .withColumn("tokens_shaps", arrays_zip("tokens", "shaps"))\n)\n\ndisplay(shap_results.select("probability", "r2", "tokens_shaps", "text").orderBy(col("probability").desc()))\n')))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/53a5cb1e.f8f3b51f.js b/assets/js/53a5cb1e.f8f3b51f.js new file mode 100644 index 0000000000..114433a31a --- /dev/null +++ b/assets/js/53a5cb1e.f8f3b51f.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[4548],{3905:function(e,t,n){n.d(t,{Zo:function(){return m},kt:function(){return c}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function s(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var l=a.createContext({}),p=function(e){var t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},m=function(e){var t=p(e.components);return 
a.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},d=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,m=o(e,["components","mdxType","originalType","parentName"]),d=p(n),c=r,k=d["".concat(l,".").concat(c)]||d[c]||u[c]||i;return n?a.createElement(k,s(s({ref:t},m),{},{components:n})):a.createElement(k,s({ref:t},m))}));function c(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,s=new Array(i);s[0]=d;var o={};for(var l in t)hasOwnProperty.call(t,l)&&(o[l]=t[l]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var i=n.createContext({}),u=function(e){var t=n.useContext(i),a=t;return e&&(a="function"==typeof e?e(t):l(l({},t),e)),a},c=function(e){var t=u(e.components);return n.createElement(i.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},m=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,o=e.originalType,i=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=u(a),f=r,b=m["".concat(i,".").concat(f)]||m[f]||p[f]||o;return a?n.createElement(b,l(l({ref:t},c),{},{components:a})):n.createElement(b,l({ref:t},c))}));function f(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=a.length,l=new Array(o);l[0]=m;var s={};for(var i in t)hasOwnProperty.call(t,i)&&(s[i]=t[i]);s.originalType=e,s.mdxType="string"==typeof e?e:r,l[1]=s;for(var u=2;u child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:a.filter(Boolean))?t:[]}(e).map((function(e){var 
t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function m(e){var t=e.values,a=e.children;return(0,r.useMemo)((function(){var e=null!=t?t:p(a);return function(e){var t=(0,u.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,a])}function f(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function b(e){var t=e.queryString,a=void 0!==t&&t,n=e.groupId,o=(0,s.k6)(),l=function(e){var t=e.queryString,a=void 0!==t&&t,n=e.groupId;if("string"==typeof a)return a;if(!1===a)return null;if(!0===a&&!n)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=n?n:null}({queryString:a,groupId:n});return[(0,i._X)(l),(0,r.useCallback)((function(e){if(l){var t=new URLSearchParams(o.location.search);t.set(l,e),o.replace(Object.assign({},o.location,{search:t.toString()}))}}),[l,o])]}function v(e){var t,a,n,o,l=e.defaultValue,s=e.queryString,i=void 0!==s&&s,u=e.groupId,p=m(e),v=(0,r.useState)((function(){return function(e){var t,a=e.defaultValue,n=e.tabValues;if(0===n.length)throw new Error("Docusaurus error: the component requires at least one children component");if(a){if(!f({value:a,tabValues:n}))throw new Error('Docusaurus error: The has a defaultValue "'+a+'" but none of its children has the corresponding value. Available values are: '+n.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return a}var r=null!=(t=n.find((function(e){return e.default})))?t:n[0];if(!r)throw new Error("Unexpected error: 0 tabValues");return r.value}({defaultValue:l,tabValues:p})})),w=v[0],d=v[1],y=b({queryString:i,groupId:u}),h=y[0],g=y[1],k=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:u}.groupId),a=(0,c.Nk)(t),n=a[0],o=a[1],[n,(0,r.useCallback)((function(e){t&&o.set(e)}),[t,o])]),V=k[0],_=k[1],T=function(){var e=null!=h?h:V;return f({value:e,tabValues:p})?e:null}();return(0,r.useLayoutEffect)((function(){T&&d(T)}),[T]),{selectedValue:w,selectValue:(0,r.useCallback)((function(e){if(!f({value:e,tabValues:p}))throw new Error("Can't select invalid tab value="+e);d(e),g(e),_(e)}),[g,_,p]),tabValues:p}}var w=a(2389),d="tabList__CuJ",y="tabItem_LNqP";function h(e){var t=e.className,a=e.block,s=e.selectedValue,i=e.selectValue,u=e.tabValues,c=[],p=(0,l.o5)().blockElementScrollPositionUntilNextRender,m=function(e){var t=e.currentTarget,a=c.indexOf(t),n=u[a].value;n!==s&&(p(t),i(n))},f=function(e){var t,a=null;switch(e.key){case"Enter":m(e);break;case"ArrowRight":var n,r=c.indexOf(e.currentTarget)+1;a=null!=(n=c[r])?n:c[0];break;case"ArrowLeft":var o,l=c.indexOf(e.currentTarget)-1;a=null!=(o=c[l])?o:c[c.length-1]}null==(t=a)||t.focus()};return r.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.Z)("tabs",{"tabs--block":a},t)},u.map((function(e){var t=e.value,a=e.label,l=e.attributes;return r.createElement("li",(0,n.Z)({role:"tab",tabIndex:s===t?0:-1,"aria-selected":s===t,key:t,ref:function(e){return c.push(e)},onKeyDown:f,onClick:m},l,{className:(0,o.Z)("tabs__item",y,null==l?void 0:l.className,{"tabs__item--active":s===t})}),null!=a?a:t)})))}function g(e){var t=e.lazy,a=e.children,n=e.selectedValue,o=(Array.isArray(a)?a:[a]).filter(Boolean);if(t){var l=o.find((function(e){return e.props.value===n}));return 
l?(0,r.cloneElement)(l,{className:"margin-top--md"}):null}return r.createElement("div",{className:"margin-top--md"},o.map((function(e,t){return(0,r.cloneElement)(e,{key:t,hidden:e.props.value!==n})})))}function k(e){var t=v(e);return r.createElement("div",{className:(0,o.Z)("tabs-container",d)},r.createElement(h,(0,n.Z)({},e,t)),r.createElement(g,(0,n.Z)({},e,t)))}function V(e){var t=(0,w.Z)();return r.createElement(k,(0,n.Z)({key:String(t)},e))}},1989:function(e,t,a){var n=a(7294),r=a(2263);t.Z=function(e){var t=e.className,a=e.py,o=e.scala,l=e.csharp,s=e.sourceLink,i=(0,r.Z)().siteConfig.customFields.version,u="https://mmlspark.blob.core.windows.net/docs/"+i+"/pyspark/"+a,c="https://mmlspark.blob.core.windows.net/docs/"+i+"/scala/"+o,p="https://mmlspark.blob.core.windows.net/docs/"+i+"/dotnet/"+l;return n.createElement("table",null,n.createElement("tbody",null,n.createElement("tr",null,n.createElement("td",null,n.createElement("strong",null,"Python API: "),n.createElement("a",{href:u},t)),n.createElement("td",null,n.createElement("strong",null,"Scala API: "),n.createElement("a",{href:c},t)),n.createElement("td",null,n.createElement("strong",null,".NET API: "),n.createElement("a",{href:p},t)),n.createElement("td",null,n.createElement("strong",null,"Source: "),n.createElement("a",{href:s},t)))))}},5947:function(e,t,a){a.r(t),a.d(t,{assets:function(){return d},contentTitle:function(){return v},default:function(){return g},frontMatter:function(){return b},metadata:function(){return w},toc:function(){return y}});var n=a(3117),r=a(102),o=(a(7294),a(3905)),l=a(4866),s=a(5162),i=a(1989),u=["components"],c=[{value:"VectorZipper",id:"vectorzipper",level:2},{value:"VowpalWabbitClassifier",id:"vowpalwabbitclassifier",level:2},{value:"VowpalWabbitFeaturizer",id:"vowpalwabbitfeaturizer",level:2},{value:"VowpalWabbitInteractions",id:"vowpalwabbitinteractions",level:2}],p={toc:c};function m(e){var 
t=e.components,a=(0,r.Z)(e,u);return(0,o.kt)("wrapper",(0,n.Z)({},p,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h2",{id:"vectorzipper"},"VectorZipper"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"},{label:".NET",value:"csharp"}],mdxType:"Tabs"},(0,o.kt)(s.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.vw import *\n\ndf = spark.createDataFrame([\n ("action1_f", "action2_f"),\n ("action1_f", "action2_f"),\n ("action1_f", "action2_f"),\n ("action1_f", "action2_f")\n], ["action1", "action2"])\n\nactionOneFeaturizer = (VowpalWabbitFeaturizer()\n .setInputCols(["action1"])\n .setOutputCol("sequence_one"))\n\nactionTwoFeaturizer = (VowpalWabbitFeaturizer()\n .setInputCols(["action2"])\n .setOutputCol("sequence_two"))\n\nseqDF = actionTwoFeaturizer.transform(actionOneFeaturizer.transform(df))\n\nvectorZipper = (VectorZipper()\n .setInputCols(["sequence_one", "sequence_two"])\n .setOutputCol("out"))\n\nvectorZipper.transform(seqDF).show()\n'))),(0,o.kt)(s.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.vw._\n\nval df = (Seq(\n ("action1_f", "action2_f"),\n ("action1_f", "action2_f"),\n ("action1_f", "action2_f"),\n ("action1_f", "action2_f")\n ).toDF("action1", "action2"))\n\nval actionOneFeaturizer = (new VowpalWabbitFeaturizer()\n .setInputCols(Array("action1"))\n .setOutputCol("sequence_one"))\n\nval actionTwoFeaturizer = (new VowpalWabbitFeaturizer()\n .setInputCols(Array("action2"))\n .setOutputCol("sequence_two"))\n\nval seqDF = actionTwoFeaturizer.transform(actionOneFeaturizer.transform(df))\n\nval vectorZipper = (new VectorZipper()\n .setInputCols(Array("sequence_one", "sequence_two"))\n 
.setOutputCol("out"))\n\nvectorZipper.transform(seqDF).show()\n'))),(0,o.kt)(s.Z,{value:"csharp",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-csharp"},'using System;\nusing System.Collections.Generic;\nusing Synapse.ML.Vw;\nusing Microsoft.Spark.Sql;\nusing Microsoft.Spark.Sql.Types;\n\nnamespace SynapseMLApp\n{\n class Program\n {\n static void Main(string[] args)\n {\n SparkSession spark =\n SparkSession\n .Builder()\n .AppName("Example")\n .GetOrCreate();\n\n DataFrame df = spark.CreateDataFrame(\n new List\n {\n new GenericRow(new object[] {"action1_f", "action2_f"}),\n new GenericRow(new object[] {"action1_f", "action2_f"}),\n new GenericRow(new object[] {"action1_f", "action2_f"}),\n new GenericRow(new object[] {"action1_f", "action2_f"})\n },\n new StructType(new List\n {\n new StructField("action1", new StringType()),\n new StructField("action2", new StringType())\n })\n );\n\n var actionOneFeaturizer = new VowpalWabbitFeaturizer()\n .SetInputCols(new string[]{"action1"})\n .SetOutputCol("sequence_one");\n var actionTwoFeaturizer = new VowpalWabbitFeaturizer()\n .SetInputCols(new string[]{"action2"})\n .SetOutputCol("sequence_two");\n var seqDF = actionTwoFeaturizer.Transform(actionOneFeaturizer.Transform(df));\n\n var vectorZipper = new VectorZipper()\n .SetInputCols(new string[]{"sequence_one", "sequence_two"})\n .SetOutputCol("out");\n vectorZipper.Transform(seqDF).Show();\n\n spark.Stop();\n }\n 
}\n}\n')))),(0,o.kt)(i.Z,{className:"VectorZipper",py:"synapse.ml.vw.html#module-synapse.ml.vw.VectorZipper",scala:"com/microsoft/azure/synapse/ml/vw/VectorZipper.html",csharp:"classSynapse_1_1ML_1_1Vw_1_1VectorZipper.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VectorZipper.scala",mdxType:"DocTable"}),(0,o.kt)("h2",{id:"vowpalwabbitclassifier"},"VowpalWabbitClassifier"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(s.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},"from synapse.ml.vw import *\n\nvw = (VowpalWabbitClassifier()\n .setNumBits(10)\n .setLearningRate(3.1)\n .setPowerT(0)\n .setLabelConversion(False))\n"))),(0,o.kt)(s.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},"import com.microsoft.azure.synapse.ml.vw._\n\nval vw = (new VowpalWabbitClassifier()\n .setNumBits(10)\n .setLearningRate(3.1)\n .setPowerT(0)\n .setLabelConversion(false))\n")))),(0,o.kt)(i.Z,{className:"VowpalWabbitClassifier",py:"synapse.ml.vw.html#module-synapse.ml.vw.VowpalWabbitClassifier",scala:"com/microsoft/azure/synapse/ml/vw/VowpalWabbitClassifier.html",csharp:"classSynapse_1_1ML_1_1Vw_1_1VowpalWabbitClassifier.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VowpalWabbitClassifier.scala",mdxType:"DocTable"}),(0,o.kt)("h2",{id:"vowpalwabbitfeaturizer"},"VowpalWabbitFeaturizer"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(s.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.vw import *\n\nfeaturizer = (VowpalWabbitFeaturizer()\n .setStringSplitInputCols(["in"])\n 
.setPreserveOrderNumBits(2)\n .setNumBits(18)\n .setPrefixStringsWithColumnName(False)\n .setOutputCol("features"))\n'))),(0,o.kt)(s.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.vw._\n\nval featurizer = (new VowpalWabbitFeaturizer()\n .setStringSplitInputCols(Array("in"))\n .setPreserveOrderNumBits(2)\n .setNumBits(18)\n .setPrefixStringsWithColumnName(false)\n .setOutputCol("features"))\n')))),(0,o.kt)(i.Z,{className:"VowpalWabbitFeaturizer",py:"synapse.ml.vw.html#module-synapse.ml.vw.VowpalWabbitFeaturizer",scala:"com/microsoft/azure/synapse/ml/vw/VowpalWabbitFeaturizer.html",csharp:"classSynapse_1_1ML_1_1Vw_1_1VowpalWabbitFeaturizer.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VowpalWabbitFeaturizer.scala",mdxType:"DocTable"}),(0,o.kt)("h2",{id:"vowpalwabbitinteractions"},"VowpalWabbitInteractions"),(0,o.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,o.kt)(s.Z,{value:"py",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.vw import *\n\ninteractions = (VowpalWabbitInteractions()\n .setInputCols(["v1"])\n .setOutputCol("out"))\n'))),(0,o.kt)(s.Z,{value:"scala",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.vw._\nimport org.apache.spark.ml.linalg._\n\ncase class Data(v1: Vector, v2: Vector, v3: Vector)\n\nval df = spark.createDataFrame(Seq(Data(\n Vectors.dense(Array(1.0, 2.0, 3.0)),\n Vectors.sparse(8, Array(5), Array(4.0)),\n Vectors.sparse(11, Array(8, 9), Array(7.0, 8.0))\n)))\n\nval interactions = (new VowpalWabbitInteractions()\n .setInputCols(Array("v1"))\n 
.setOutputCol("out"))\n\ninteractions.transform(df).show()\n')))),(0,o.kt)(i.Z,{className:"VowpalWabbitInteractions",py:"synapse.ml.vw.html#module-synapse.ml.vw.VowpalWabbitInteractions",scala:"com/microsoft/azure/synapse/ml/vw/VowpalWabbitInteractions.html",csharp:"classSynapse_1_1ML_1_1Vw_1_1VowpalWabbitInteractions.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/vw/src/main/scala/com/microsoft/azure/synapse/ml/vw/VowpalWabbitInteractions.scala",mdxType:"DocTable"}))}m.isMDXComponent=!0;var f=["components"],b={title:"Transformers - Vowpal Wabbit",sidebar_label:"Vowpal Wabbit",hide_title:!0},v="Vowpal Wabbit",w={unversionedId:"Quick Examples/transformers/transformers_vw",id:"version-0.11.3/Quick Examples/transformers/transformers_vw",title:"Transformers - Vowpal Wabbit",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/transformers/transformers_vw.md",sourceDirName:"Quick Examples/transformers",slug:"/Quick Examples/transformers/transformers_vw",permalink:"/SynapseML/docs/0.11.3/Quick Examples/transformers/transformers_vw",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Transformers - Vowpal Wabbit",sidebar_label:"Vowpal Wabbit",hide_title:!0}},d={},y=[].concat(c),h={toc:y};function g(e){var t=e.components,a=(0,r.Z)(e,f);return(0,o.kt)("wrapper",(0,n.Z)({},h,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"vowpal-wabbit"},"Vowpal Wabbit"),(0,o.kt)(m,{mdxType:"VW"}))}g.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/57e687e8.9f9aa0a3.js b/assets/js/57e687e8.9f9aa0a3.js new file mode 100644 index 0000000000..71ed0c96b9 --- /dev/null +++ b/assets/js/57e687e8.9f9aa0a3.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[9159],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var a=n(7294);function r(e,t,n){return t in 
e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function i(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var l=a.createContext({}),p=function(e){var t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=p(e.components);return a.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},u=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,l=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),u=p(n),d=r,h=u["".concat(l,".").concat(d)]||u[d]||m[d]||o;return n?a.createElement(h,i(i({ref:t},c),{},{components:n})):a.createElement(h,i({ref:t},c))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,i=new Array(o);i[0]=u;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:r,i[1]=s;for(var p=2;p"),". Assuming that you don't have active containers (including detached\nones), ",(0,o.kt)("inlineCode",{parentName:"p"},"docker system prune")," will remove this untagged image, reclaiming the\nused space."),(0,o.kt)("p",null,"If you've used an explicit version tag, then it will still exist after a new\npull, which means that you can continue using this version. If you\nused an unqualified name first and then a version-tagged one, Docker will fetch\nboth tags. Only the second fetch is fast since it points to content that\nwas already loaded. 
In this case, doing a ",(0,o.kt)("inlineCode",{parentName:"p"},"pull")," when there's a new version\nwill fetch the new ",(0,o.kt)("inlineCode",{parentName:"p"},"latest")," tag and change its meaning to the newer version, but\nthe older version will still be available under its own version tag."),(0,o.kt)("p",null,"Finally, if there are such version-tagged older versions that you want to get\nrid of, you can use ",(0,o.kt)("inlineCode",{parentName:"p"},"docker images")," to check the list of installed images and\ntheir tags, and ",(0,o.kt)("inlineCode",{parentName:"p"},"docker rmi :")," to remove the unwanted ones."),(0,o.kt)("h2",{id:"a-note-about-security"},"A note about security"),(0,o.kt)("p",null,"Executing code in a Docker container can be unsafe if the running user is\n",(0,o.kt)("inlineCode",{parentName:"p"},"root"),". For this reason, the SynapseML image uses a proper username instead. If\nyou still want to run as root (for instance, if you want to ",(0,o.kt)("inlineCode",{parentName:"p"},"apt install")," an\nanother ubuntu package), then you should use ",(0,o.kt)("inlineCode",{parentName:"p"},"--user root"),". This mode can be useful\nwhen combined with ",(0,o.kt)("inlineCode",{parentName:"p"},"docker exec")," to perform administrative work while the image\ncontinues to run as usual."),(0,o.kt)("h2",{id:"further-reading"},"Further reading"),(0,o.kt)("p",null,"This text briefly covers some of the useful things that you can do with the\nSynapseML Docker image (and other images in general). 
You can find much more\ndocumentation ",(0,o.kt)("a",{parentName:"p",href:"https://docs.docker.com/"},"online"),"."))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/c40984d1.b90da9b0.js b/assets/js/589adaf2.0a101f04.js similarity index 97% rename from assets/js/c40984d1.b90da9b0.js rename to assets/js/589adaf2.0a101f04.js index 32d7fadf6a..22240eb2a0 100644 --- a/assets/js/c40984d1.b90da9b0.js +++ b/assets/js/589adaf2.0a101f04.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[7320],{3905:function(e,t,n){n.d(t,{Zo:function(){return i},kt:function(){return p}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function l(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var s=r.createContext({}),c=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},i=function(e){var t=c(e.components);return r.createElement(s.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},f=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,l=e.originalType,s=e.parentName,i=u(e,["components","mdxType","originalType","parentName"]),f=c(n),p=a,d=f["".concat(s,".").concat(p)]||f[p]||m[p]||l;return n?r.createElement(d,o(o({ref:t},i),{},{components:n})):r.createElement(d,o({ref:t},i))}));function p(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var l=n.length,o=new Array(l);o[0]=f;var u={};for(var s in 
t)hasOwnProperty.call(t,s)&&(u[s]=t[s]);u.originalType=e,u.mdxType="string"==typeof e?e:a,o[1]=u;for(var c=2;c child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:n.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function f(e){var t=e.values,n=e.children;return(0,a.useMemo)((function(){var e=null!=t?t:m(n);return function(e){var t=(0,c.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,n])}function p(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function d(e){var t=e.queryString,n=void 0!==t&&t,r=e.groupId,l=(0,u.k6)(),o=function(e){var t=e.queryString,n=void 0!==t&&t,r=e.groupId;if("string"==typeof n)return n;if(!1===n)return null;if(!0===n&&!r)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=r?r:null}({queryString:n,groupId:r});return[(0,s._X)(o),(0,a.useCallback)((function(e){if(o){var t=new URLSearchParams(l.location.search);t.set(o,e),l.replace(Object.assign({},l.location,{search:t.toString()}))}}),[o,l])]}function b(e){var t,n,r,l,o=e.defaultValue,u=e.queryString,s=void 0!==u&&u,c=e.groupId,m=f(e),b=(0,a.useState)((function(){return function(e){var t,n=e.defaultValue,r=e.tabValues;if(0===r.length)throw new Error("Docusaurus error: the component requires at least one children component");if(n){if(!p({value:n,tabValues:r}))throw new Error('Docusaurus error: The has a defaultValue "'+n+'" but none of its children has the corresponding value. 
Available values are: '+r.map((function(e){return e.value})).join(", ")+". If you intend to show no default tab, use defaultValue={null} instead.");return n}var a=null!=(t=r.find((function(e){return e.default})))?t:r[0];if(!a)throw new Error("Unexpected error: 0 tabValues");return a.value}({defaultValue:o,tabValues:m})})),v=b[0],y=b[1],g=d({queryString:s,groupId:c}),h=g[0],E=g[1],T=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:c}.groupId),n=(0,i.Nk)(t),r=n[0],l=n[1],[r,(0,a.useCallback)((function(e){t&&l.set(e)}),[t,l])]),k=T[0],w=T[1],O=function(){var e=null!=h?h:k;return p({value:e,tabValues:m})?e:null}();return(0,a.useLayoutEffect)((function(){O&&y(O)}),[O]),{selectedValue:v,selectValue:(0,a.useCallback)((function(e){if(!p({value:e,tabValues:m}))throw new Error("Can't select invalid tab value="+e);y(e),E(e),w(e)}),[E,w,m]),tabValues:m}}var v=n(2389),y="tabList__CuJ",g="tabItem_LNqP";function h(e){var t=e.className,n=e.block,u=e.selectedValue,s=e.selectValue,c=e.tabValues,i=[],m=(0,o.o5)().blockElementScrollPositionUntilNextRender,f=function(e){var t=e.currentTarget,n=i.indexOf(t),r=c[n].value;r!==u&&(m(t),s(r))},p=function(e){var t,n=null;switch(e.key){case"Enter":f(e);break;case"ArrowRight":var r,a=i.indexOf(e.currentTarget)+1;n=null!=(r=i[a])?r:i[0];break;case"ArrowLeft":var l,o=i.indexOf(e.currentTarget)-1;n=null!=(l=i[o])?l:i[i.length-1]}null==(t=n)||t.focus()};return a.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,l.Z)("tabs",{"tabs--block":n},t)},c.map((function(e){var t=e.value,n=e.label,o=e.attributes;return a.createElement("li",(0,r.Z)({role:"tab",tabIndex:u===t?0:-1,"aria-selected":u===t,key:t,ref:function(e){return i.push(e)},onKeyDown:p,onClick:f},o,{className:(0,l.Z)("tabs__item",g,null==o?void 0:o.className,{"tabs__item--active":u===t})}),null!=n?n:t)})))}function E(e){var t=e.lazy,n=e.children,r=e.selectedValue,l=(Array.isArray(n)?n:[n]).filter(Boolean);if(t){var o=l.find((function(e){return 
e.props.value===r}));return o?(0,a.cloneElement)(o,{className:"margin-top--md"}):null}return a.createElement("div",{className:"margin-top--md"},l.map((function(e,t){return(0,a.cloneElement)(e,{key:t,hidden:e.props.value!==r})})))}function T(e){var t=b(e);return a.createElement("div",{className:(0,l.Z)("tabs-container",y)},a.createElement(h,(0,r.Z)({},e,t)),a.createElement(E,(0,r.Z)({},e,t)))}function k(e){var t=(0,v.Z)();return a.createElement(T,(0,r.Z)({key:String(t)},e))}},1989:function(e,t,n){var r=n(7294),a=n(2263);t.Z=function(e){var t=e.className,n=e.py,l=e.scala,o=e.csharp,u=e.sourceLink,s=(0,a.Z)().siteConfig.customFields.version,c="https://mmlspark.blob.core.windows.net/docs/"+s+"/pyspark/"+n,i="https://mmlspark.blob.core.windows.net/docs/"+s+"/scala/"+l,m="https://mmlspark.blob.core.windows.net/docs/"+s+"/dotnet/"+o;return r.createElement("table",null,r.createElement("tbody",null,r.createElement("tr",null,r.createElement("td",null,r.createElement("strong",null,"Python API: "),r.createElement("a",{href:c},t)),r.createElement("td",null,r.createElement("strong",null,"Scala API: "),r.createElement("a",{href:i},t)),r.createElement("td",null,r.createElement("strong",null,".NET API: "),r.createElement("a",{href:m},t)),r.createElement("td",null,r.createElement("strong",null,"Source: "),r.createElement("a",{href:u},t)))))}},4356:function(e,t,n){n.r(t),n.d(t,{assets:function(){return y},contentTitle:function(){return b},default:function(){return E},frontMatter:function(){return d},metadata:function(){return v},toc:function(){return g}});var r=n(3117),a=n(102),l=(n(7294),n(3905)),o=n(4866),u=n(5162),s=n(1989),c=["components"],i=[{value:"DoubleMLEstimator",id:"doublemlestimator",level:2}],m={toc:i};function f(e){var 
t=e.components,n=(0,a.Z)(e,c);return(0,l.kt)("wrapper",(0,r.Z)({},m,n,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h2",{id:"doublemlestimator"},"DoubleMLEstimator"),(0,l.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(u.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.causal import *\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, BooleanType\n\nschema = StructType([\n StructField("Treatment", BooleanType()),\n StructField("Outcome", BooleanType()),\n StructField("col2", DoubleType()),\n StructField("col3", DoubleType()),\n StructField("col4", DoubleType())\n ])\n\n\ndf = spark.createDataFrame([\n (False, True, 0.30, 0.66, 0.2),\n (True, False, 0.38, 0.53, 1.5),\n (False, True, 0.68, 0.98, 3.2),\n (True, False, 0.15, 0.32, 6.6),\n (False, True, 0.50, 0.65, 2.8),\n (True, True, 0.40, 0.54, 3.7),\n (False, True, 0.78, 0.97, 8.1),\n (True, False, 0.12, 0.32, 10.2),\n (False, True, 0.35, 0.63, 1.8),\n (True, False, 0.45, 0.57, 4.3),\n (False, True, 0.75, 0.97, 7.2),\n (True, True, 0.16, 0.32, 11.7)], schema\n)\n\ndml = (DoubleMLEstimator()\n .setTreatmentCol("Treatment")\n .setTreatmentModel(LogisticRegression())\n .setOutcomeCol("Outcome")\n .setOutcomeModel(LogisticRegression())\n .setMaxIter(20))\n\ndmlModel = dml.fit(df)\ndmlModel.getAvgTreatmentEffect()\ndmlModel.getConfidenceInterval()\n'))),(0,l.kt)(u.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.causal._\nimport org.apache.spark.ml.classification.LogisticRegression\n\nval df = (Seq(\n (false, true, 0.50, 0.60, 0),\n (true, false, 0.40, 0.50, 1),\n (false, true, 0.78, 0.99, 2),\n (true, false, 0.12, 0.34, 3),\n (false, true, 0.50, 0.60, 0),\n (true, false, 
0.40, 0.50, 1),\n (false, true, 0.78, 0.99, 2),\n (true, false, 0.12, 0.34, 3),\n (false, false, 0.50, 0.60, 0),\n (true, true, 0.40, 0.50, 1),\n (false, true, 0.78, 0.99, 2),\n (true, false, 0.12, 0.34, 3))\n .toDF("Treatment", "Outcome", "col2", "col3", "col4"))\n\nval dml = (new DoubleMLEstimator()\n .setTreatmentCol("Treatment")\n .setTreatmentModel(new LogisticRegression())\n .setOutcomeCol("Outcome")\n .setOutcomeModel(new LogisticRegression())\n .setMaxIter(20))\n\nval dmlModel = dml.fit(df)\ndmlModel.getAvgTreatmentEffect\ndmlModel.getConfidenceInterval\n')))),(0,l.kt)(s.Z,{className:"DoubleMLEstimator",py:"synapse.ml.causal.html#module-synapse.ml.causal.DoubleMLEstimator",scala:"com/microsoft/azure/synapse/ml/causal/DoubleMLEstimator.html",csharp:"classSynapse_1_1ML_1_1Causal_1_1DoubleMLEstimator.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/causal/DoubleMLEstimator.scala",mdxType:"DocTable"}))}f.isMDXComponent=!0;var p=["components"],d={title:"Estimators - Causal",sidebar_label:"Causal Inference",hide_title:!0},b="Causal Inference",v={unversionedId:"Quick Examples/estimators/estimators_causal",id:"version-0.11.3/Quick Examples/estimators/estimators_causal",title:"Estimators - Causal",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/estimators/estimators_causal.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_causal",permalink:"/SynapseML/docs/Quick Examples/estimators/estimators_causal",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Estimators - Causal",sidebar_label:"Causal Inference",hide_title:!0}},y={},g=[].concat(i),h={toc:g};function E(e){var t=e.components,n=(0,a.Z)(e,p);return(0,l.kt)("wrapper",(0,r.Z)({},h,n,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h1",{id:"causal-inference"},"Causal Inference"),(0,l.kt)(f,{mdxType:"DoubleMLEstimator"}))}E.isMDXComponent=!0}}]); \ No newline at end of file +"use 
strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6572],{3905:function(e,t,n){n.d(t,{Zo:function(){return i},kt:function(){return p}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function l(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var s=r.createContext({}),c=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},i=function(e){var t=c(e.components);return r.createElement(s.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},f=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,l=e.originalType,s=e.parentName,i=u(e,["components","mdxType","originalType","parentName"]),f=c(n),p=a,d=f["".concat(s,".").concat(p)]||f[p]||m[p]||l;return n?r.createElement(d,o(o({ref:t},i),{},{components:n})):r.createElement(d,o({ref:t},i))}));function p(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var l=n.length,o=new Array(l);o[0]=f;var u={};for(var s in t)hasOwnProperty.call(t,s)&&(u[s]=t[s]);u.originalType=e,u.mdxType="string"==typeof e?e:a,o[1]=u;for(var c=2;c child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:n.filter(Boolean))?t:[]}(e).map((function(e){var t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function f(e){var t=e.values,n=e.children;return(0,a.useMemo)((function(){var e=null!=t?t:m(n);return 
function(e){var t=(0,c.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,n])}function p(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function d(e){var t=e.queryString,n=void 0!==t&&t,r=e.groupId,l=(0,u.k6)(),o=function(e){var t=e.queryString,n=void 0!==t&&t,r=e.groupId;if("string"==typeof n)return n;if(!1===n)return null;if(!0===n&&!r)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=r?r:null}({queryString:n,groupId:r});return[(0,s._X)(o),(0,a.useCallback)((function(e){if(o){var t=new URLSearchParams(l.location.search);t.set(o,e),l.replace(Object.assign({},l.location,{search:t.toString()}))}}),[o,l])]}function b(e){var t,n,r,l,o=e.defaultValue,u=e.queryString,s=void 0!==u&&u,c=e.groupId,m=f(e),b=(0,a.useState)((function(){return function(e){var t,n=e.defaultValue,r=e.tabValues;if(0===r.length)throw new Error("Docusaurus error: the component requires at least one children component");if(n){if(!p({value:n,tabValues:r}))throw new Error('Docusaurus error: The has a defaultValue "'+n+'" but none of its children has the corresponding value. Available values are: '+r.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return n}var a=null!=(t=r.find((function(e){return e.default})))?t:r[0];if(!a)throw new Error("Unexpected error: 0 tabValues");return a.value}({defaultValue:o,tabValues:m})})),v=b[0],y=b[1],g=d({queryString:s,groupId:c}),h=g[0],E=g[1],T=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:c}.groupId),n=(0,i.Nk)(t),r=n[0],l=n[1],[r,(0,a.useCallback)((function(e){t&&l.set(e)}),[t,l])]),k=T[0],w=T[1],O=function(){var e=null!=h?h:k;return p({value:e,tabValues:m})?e:null}();return(0,a.useLayoutEffect)((function(){O&&y(O)}),[O]),{selectedValue:v,selectValue:(0,a.useCallback)((function(e){if(!p({value:e,tabValues:m}))throw new Error("Can't select invalid tab value="+e);y(e),E(e),w(e)}),[E,w,m]),tabValues:m}}var v=n(2389),y="tabList__CuJ",g="tabItem_LNqP";function h(e){var t=e.className,n=e.block,u=e.selectedValue,s=e.selectValue,c=e.tabValues,i=[],m=(0,o.o5)().blockElementScrollPositionUntilNextRender,f=function(e){var t=e.currentTarget,n=i.indexOf(t),r=c[n].value;r!==u&&(m(t),s(r))},p=function(e){var t,n=null;switch(e.key){case"Enter":f(e);break;case"ArrowRight":var r,a=i.indexOf(e.currentTarget)+1;n=null!=(r=i[a])?r:i[0];break;case"ArrowLeft":var l,o=i.indexOf(e.currentTarget)-1;n=null!=(l=i[o])?l:i[i.length-1]}null==(t=n)||t.focus()};return a.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,l.Z)("tabs",{"tabs--block":n},t)},c.map((function(e){var t=e.value,n=e.label,o=e.attributes;return a.createElement("li",(0,r.Z)({role:"tab",tabIndex:u===t?0:-1,"aria-selected":u===t,key:t,ref:function(e){return i.push(e)},onKeyDown:p,onClick:f},o,{className:(0,l.Z)("tabs__item",g,null==o?void 0:o.className,{"tabs__item--active":u===t})}),null!=n?n:t)})))}function E(e){var t=e.lazy,n=e.children,r=e.selectedValue,l=(Array.isArray(n)?n:[n]).filter(Boolean);if(t){var o=l.find((function(e){return e.props.value===r}));return 
o?(0,a.cloneElement)(o,{className:"margin-top--md"}):null}return a.createElement("div",{className:"margin-top--md"},l.map((function(e,t){return(0,a.cloneElement)(e,{key:t,hidden:e.props.value!==r})})))}function T(e){var t=b(e);return a.createElement("div",{className:(0,l.Z)("tabs-container",y)},a.createElement(h,(0,r.Z)({},e,t)),a.createElement(E,(0,r.Z)({},e,t)))}function k(e){var t=(0,v.Z)();return a.createElement(T,(0,r.Z)({key:String(t)},e))}},1989:function(e,t,n){var r=n(7294),a=n(2263);t.Z=function(e){var t=e.className,n=e.py,l=e.scala,o=e.csharp,u=e.sourceLink,s=(0,a.Z)().siteConfig.customFields.version,c="https://mmlspark.blob.core.windows.net/docs/"+s+"/pyspark/"+n,i="https://mmlspark.blob.core.windows.net/docs/"+s+"/scala/"+l,m="https://mmlspark.blob.core.windows.net/docs/"+s+"/dotnet/"+o;return r.createElement("table",null,r.createElement("tbody",null,r.createElement("tr",null,r.createElement("td",null,r.createElement("strong",null,"Python API: "),r.createElement("a",{href:c},t)),r.createElement("td",null,r.createElement("strong",null,"Scala API: "),r.createElement("a",{href:i},t)),r.createElement("td",null,r.createElement("strong",null,".NET API: "),r.createElement("a",{href:m},t)),r.createElement("td",null,r.createElement("strong",null,"Source: "),r.createElement("a",{href:u},t)))))}},1965:function(e,t,n){n.r(t),n.d(t,{assets:function(){return y},contentTitle:function(){return b},default:function(){return E},frontMatter:function(){return d},metadata:function(){return v},toc:function(){return g}});var r=n(3117),a=n(102),l=(n(7294),n(3905)),o=n(4866),u=n(5162),s=n(1989),c=["components"],i=[{value:"DoubleMLEstimator",id:"doublemlestimator",level:2}],m={toc:i};function f(e){var 
t=e.components,n=(0,a.Z)(e,c);return(0,l.kt)("wrapper",(0,r.Z)({},m,n,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h2",{id:"doublemlestimator"},"DoubleMLEstimator"),(0,l.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,l.kt)(u.Z,{value:"py",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.causal import *\nfrom pyspark.ml.classification import LogisticRegression\nfrom pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, BooleanType\n\nschema = StructType([\n StructField("Treatment", BooleanType()),\n StructField("Outcome", BooleanType()),\n StructField("col2", DoubleType()),\n StructField("col3", DoubleType()),\n StructField("col4", DoubleType())\n ])\n\n\ndf = spark.createDataFrame([\n (False, True, 0.30, 0.66, 0.2),\n (True, False, 0.38, 0.53, 1.5),\n (False, True, 0.68, 0.98, 3.2),\n (True, False, 0.15, 0.32, 6.6),\n (False, True, 0.50, 0.65, 2.8),\n (True, True, 0.40, 0.54, 3.7),\n (False, True, 0.78, 0.97, 8.1),\n (True, False, 0.12, 0.32, 10.2),\n (False, True, 0.35, 0.63, 1.8),\n (True, False, 0.45, 0.57, 4.3),\n (False, True, 0.75, 0.97, 7.2),\n (True, True, 0.16, 0.32, 11.7)], schema\n)\n\ndml = (DoubleMLEstimator()\n .setTreatmentCol("Treatment")\n .setTreatmentModel(LogisticRegression())\n .setOutcomeCol("Outcome")\n .setOutcomeModel(LogisticRegression())\n .setMaxIter(20))\n\ndmlModel = dml.fit(df)\ndmlModel.getAvgTreatmentEffect()\ndmlModel.getConfidenceInterval()\n'))),(0,l.kt)(u.Z,{value:"scala",mdxType:"TabItem"},(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.causal._\nimport org.apache.spark.ml.classification.LogisticRegression\n\nval df = (Seq(\n (false, true, 0.50, 0.60, 0),\n (true, false, 0.40, 0.50, 1),\n (false, true, 0.78, 0.99, 2),\n (true, false, 0.12, 0.34, 3),\n (false, true, 0.50, 0.60, 0),\n (true, false, 
0.40, 0.50, 1),\n (false, true, 0.78, 0.99, 2),\n (true, false, 0.12, 0.34, 3),\n (false, false, 0.50, 0.60, 0),\n (true, true, 0.40, 0.50, 1),\n (false, true, 0.78, 0.99, 2),\n (true, false, 0.12, 0.34, 3))\n .toDF("Treatment", "Outcome", "col2", "col3", "col4"))\n\nval dml = (new DoubleMLEstimator()\n .setTreatmentCol("Treatment")\n .setTreatmentModel(new LogisticRegression())\n .setOutcomeCol("Outcome")\n .setOutcomeModel(new LogisticRegression())\n .setMaxIter(20))\n\nval dmlModel = dml.fit(df)\ndmlModel.getAvgTreatmentEffect\ndmlModel.getConfidenceInterval\n')))),(0,l.kt)(s.Z,{className:"DoubleMLEstimator",py:"synapse.ml.causal.html#module-synapse.ml.causal.DoubleMLEstimator",scala:"com/microsoft/azure/synapse/ml/causal/DoubleMLEstimator.html",csharp:"classSynapse_1_1ML_1_1Causal_1_1DoubleMLEstimator.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/causal/DoubleMLEstimator.scala",mdxType:"DocTable"}))}f.isMDXComponent=!0;var p=["components"],d={title:"Estimators - Causal",sidebar_label:"Causal Inference",hide_title:!0},b="Causal Inference",v={unversionedId:"Quick Examples/estimators/estimators_causal",id:"version-0.11.4/Quick Examples/estimators/estimators_causal",title:"Estimators - Causal",description:"",source:"@site/versioned_docs/version-0.11.4/Quick Examples/estimators/estimators_causal.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_causal",permalink:"/SynapseML/docs/Quick Examples/estimators/estimators_causal",draft:!1,tags:[],version:"0.11.4",frontMatter:{title:"Estimators - Causal",sidebar_label:"Causal Inference",hide_title:!0}},y={},g=[].concat(i),h={toc:g};function E(e){var t=e.components,n=(0,a.Z)(e,p);return(0,l.kt)("wrapper",(0,r.Z)({},h,n,{components:t,mdxType:"MDXLayout"}),(0,l.kt)("h1",{id:"causal-inference"},"Causal Inference"),(0,l.kt)(f,{mdxType:"DoubleMLEstimator"}))}E.isMDXComponent=!0}}]); \ No newline at end of file diff 
--git a/assets/js/5962ef6e.12edaced.js b/assets/js/5962ef6e.12edaced.js new file mode 100644 index 0000000000..b93b17184c --- /dev/null +++ b/assets/js/5962ef6e.12edaced.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[2630],{3905:function(e,a,n){n.d(a,{Zo:function(){return c},kt:function(){return d}});var t=n(7294);function s(e,a,n){return a in e?Object.defineProperty(e,a,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[a]=n,e}function r(e,a){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var t=Object.getOwnPropertySymbols(e);a&&(t=t.filter((function(a){return Object.getOwnPropertyDescriptor(e,a).enumerable}))),n.push.apply(n,t)}return n}function o(e){for(var a=1;a=0||(s[n]=e[n]);return s}(e,a);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(t=0;t=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(s[n]=e[n])}return s}var i=t.createContext({}),m=function(e){var a=t.useContext(i),n=a;return e&&(n="function"==typeof e?e(a):o(o({},a),e)),n},c=function(e){var a=m(e.components);return t.createElement(i.Provider,{value:a},e.children)},u={inlineCode:"code",wrapper:function(e){var a=e.children;return t.createElement(t.Fragment,{},a)}},p=t.forwardRef((function(e,a){var n=e.components,s=e.mdxType,r=e.originalType,i=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),p=m(n),d=s,f=p["".concat(i,".").concat(d)]||p[d]||u[d]||r;return n?t.createElement(f,o(o({ref:a},c),{},{components:n})):t.createElement(f,o({ref:a},c))}));function d(e,a){var n=arguments,s=a&&a.mdxType;if("string"==typeof e||s){var r=n.length,o=new Array(r);o[0]=p;var l={};for(var i in a)hasOwnProperty.call(a,i)&&(l[i]=a[i]);l.originalType=e,l.mdxType="string"==typeof e?e:s,o[1]=l;for(var m=2;m child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:n.filter(Boolean))?a:[]}(e).map((function(e){var 
a=e.props;return{value:a.value,label:a.label,attributes:a.attributes,default:a.default}}))}function p(e){var a=e.values,n=e.children;return(0,s.useMemo)((function(){var e=null!=a?a:u(n);return function(e){var a=(0,m.l)(e,(function(e,a){return e.value===a.value}));if(a.length>0)throw new Error('Docusaurus error: Duplicate values "'+a.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[a,n])}function d(e){var a=e.value;return e.tabValues.some((function(e){return e.value===a}))}function f(e){var a=e.queryString,n=void 0!==a&&a,t=e.groupId,r=(0,l.k6)(),o=function(e){var a=e.queryString,n=void 0!==a&&a,t=e.groupId;if("string"==typeof n)return n;if(!1===n)return null;if(!0===n&&!t)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=t?t:null}({queryString:n,groupId:t});return[(0,i._X)(o),(0,s.useCallback)((function(e){if(o){var a=new URLSearchParams(r.location.search);a.set(o,e),r.replace(Object.assign({},r.location,{search:a.toString()}))}}),[o,r])]}function y(e){var a,n,t,r,o=e.defaultValue,l=e.queryString,i=void 0!==l&&l,m=e.groupId,u=p(e),y=(0,s.useState)((function(){return function(e){var a,n=e.defaultValue,t=e.tabValues;if(0===t.length)throw new Error("Docusaurus error: the component requires at least one children component");if(n){if(!d({value:n,tabValues:t}))throw new Error('Docusaurus error: The has a defaultValue "'+n+'" but none of its children has the corresponding value. Available values are: '+t.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return n}var s=null!=(a=t.find((function(e){return e.default})))?a:t[0];if(!s)throw new Error("Unexpected error: 0 tabValues");return s.value}({defaultValue:o,tabValues:u})})),v=y[0],g=y[1],h=f({queryString:i,groupId:m}),k=h[0],b=h[1],T=(a=function(e){return e?"docusaurus.tab."+e:null}({groupId:m}.groupId),n=(0,c.Nk)(a),t=n[0],r=n[1],[t,(0,s.useCallback)((function(e){a&&r.set(e)}),[a,r])]),M=T[0],C=T[1],I=function(){var e=null!=k?k:M;return d({value:e,tabValues:u})?e:null}();return(0,s.useLayoutEffect)((function(){I&&g(I)}),[I]),{selectedValue:v,selectValue:(0,s.useCallback)((function(e){if(!d({value:e,tabValues:u}))throw new Error("Can't select invalid tab value="+e);g(e),b(e),C(e)}),[b,C,u]),tabValues:u}}var v=n(2389),g="tabList__CuJ",h="tabItem_LNqP";function k(e){var a=e.className,n=e.block,l=e.selectedValue,i=e.selectValue,m=e.tabValues,c=[],u=(0,o.o5)().blockElementScrollPositionUntilNextRender,p=function(e){var a=e.currentTarget,n=c.indexOf(a),t=m[n].value;t!==l&&(u(a),i(t))},d=function(e){var a,n=null;switch(e.key){case"Enter":p(e);break;case"ArrowRight":var t,s=c.indexOf(e.currentTarget)+1;n=null!=(t=c[s])?t:c[0];break;case"ArrowLeft":var r,o=c.indexOf(e.currentTarget)-1;n=null!=(r=c[o])?r:c[c.length-1]}null==(a=n)||a.focus()};return s.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,r.Z)("tabs",{"tabs--block":n},a)},m.map((function(e){var a=e.value,n=e.label,o=e.attributes;return s.createElement("li",(0,t.Z)({role:"tab",tabIndex:l===a?0:-1,"aria-selected":l===a,key:a,ref:function(e){return c.push(e)},onKeyDown:d,onClick:p},o,{className:(0,r.Z)("tabs__item",h,null==o?void 0:o.className,{"tabs__item--active":l===a})}),null!=n?n:a)})))}function b(e){var a=e.lazy,n=e.children,t=e.selectedValue,r=(Array.isArray(n)?n:[n]).filter(Boolean);if(a){var o=r.find((function(e){return e.props.value===t}));return 
o?(0,s.cloneElement)(o,{className:"margin-top--md"}):null}return s.createElement("div",{className:"margin-top--md"},r.map((function(e,a){return(0,s.cloneElement)(e,{key:a,hidden:e.props.value!==t})})))}function T(e){var a=y(e);return s.createElement("div",{className:(0,r.Z)("tabs-container",g)},s.createElement(k,(0,t.Z)({},e,a)),s.createElement(b,(0,t.Z)({},e,a)))}function M(e){var a=(0,v.Z)();return s.createElement(T,(0,t.Z)({key:String(a)},e))}},1989:function(e,a,n){var t=n(7294),s=n(2263);a.Z=function(e){var a=e.className,n=e.py,r=e.scala,o=e.csharp,l=e.sourceLink,i=(0,s.Z)().siteConfig.customFields.version,m="https://mmlspark.blob.core.windows.net/docs/"+i+"/pyspark/"+n,c="https://mmlspark.blob.core.windows.net/docs/"+i+"/scala/"+r,u="https://mmlspark.blob.core.windows.net/docs/"+i+"/dotnet/"+o;return t.createElement("table",null,t.createElement("tbody",null,t.createElement("tr",null,t.createElement("td",null,t.createElement("strong",null,"Python API: "),t.createElement("a",{href:m},a)),t.createElement("td",null,t.createElement("strong",null,"Scala API: "),t.createElement("a",{href:c},a)),t.createElement("td",null,t.createElement("strong",null,".NET API: "),t.createElement("a",{href:u},a)),t.createElement("td",null,t.createElement("strong",null,"Source: "),t.createElement("a",{href:l},a)))))}},7505:function(e,a,n){n.r(a),n.d(a,{assets:function(){return B},contentTitle:function(){return V},default:function(){return K},frontMatter:function(){return P},metadata:function(){return A},toc:function(){return H}});var t=n(3117),s=n(102),r=(n(7294),n(3905)),o=n(4866),l=n(5162),i=n(1989),m=["components"],c=[{value:"AutoML",id:"automl",level:2},{value:"FindBestModel",id:"findbestmodel",level:3},{value:"TuneHyperparameters",id:"tunehyperparameters",level:3}],u={toc:c};function p(e){var 
a=e.components,n=(0,s.Z)(e,m);return(0,r.kt)("wrapper",(0,t.Z)({},u,n,{components:a,mdxType:"MDXLayout"}),(0,r.kt)("h2",{id:"automl"},"AutoML"),(0,r.kt)("h3",{id:"findbestmodel"},"FindBestModel"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.automl import *\nfrom synapse.ml.train import *\nfrom pyspark.ml.classification import RandomForestClassifier\n\ndf = (spark.createDataFrame([\n (0, 2, 0.50, 0.60, 0),\n (1, 3, 0.40, 0.50, 1),\n (0, 4, 0.78, 0.99, 2),\n (1, 5, 0.12, 0.34, 3),\n (0, 1, 0.50, 0.60, 0),\n (1, 3, 0.40, 0.50, 1),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3),\n (0, 0, 0.50, 0.60, 0),\n (1, 2, 0.40, 0.50, 1),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3)\n], ["Label", "col1", "col2", "col3", "col4"]))\n\n# mocking models\nrandomForestClassifier = (TrainClassifier()\n .setModel(RandomForestClassifier()\n .setMaxBins(32)\n .setMaxDepth(5)\n .setMinInfoGain(0.0)\n .setMinInstancesPerNode(1)\n .setNumTrees(20)\n .setSubsamplingRate(1.0)\n .setSeed(0))\n .setFeaturesCol("mlfeatures")\n .setLabelCol("Label"))\nmodel = randomForestClassifier.fit(df)\n\nfindBestModel = (FindBestModel()\n .setModels([model, model])\n .setEvaluationMetric("accuracy"))\nbestModel = findBestModel.fit(df)\nbestModel.transform(df).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.automl._\nimport com.microsoft.azure.synapse.ml.train._\nimport spark.implicits._\nimport org.apache.spark.ml.Transformer\n\nval df = (Seq(\n (0, 2, 0.50, 0.60, 0),\n (1, 3, 0.40, 0.50, 1),\n (0, 4, 0.78, 0.99, 2),\n (1, 5, 0.12, 0.34, 3),\n (0, 1, 0.50, 0.60, 0),\n (1, 3, 0.40, 0.50, 1),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3),\n (0, 0, 0.50, 0.60, 
0),\n (1, 2, 0.40, 0.50, 1),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3)\n ).toDF("Label", "col1", "col2", "col3", "col4"))\n\n// mocking models\nval randomForestClassifier = (new TrainClassifier()\n .setModel(\n new RandomForestClassifier()\n .setMaxBins(32)\n .setMaxDepth(5)\n .setMinInfoGain(0.0)\n .setMinInstancesPerNode(1)\n .setNumTrees(20)\n .setSubsamplingRate(1.0)\n .setSeed(0L))\n .setFeaturesCol("mlfeatures")\n .setLabelCol("Label"))\nval model = randomForestClassifier.fit(df)\n\nval findBestModel = (new FindBestModel()\n .setModels(Array(model.asInstanceOf[Transformer], model.asInstanceOf[Transformer]))\n .setEvaluationMetric("accuracy"))\nval bestModel = findBestModel.fit(df)\nbestModel.transform(df).show()\n')))),(0,r.kt)(i.Z,{className:"FindBestModel",py:"synapse.ml.automl.html#module-synapse.ml.automl.FindBestModel",scala:"com/microsoft/azure/synapse/ml/automl/FindBestModel.html",csharp:"classSynapse_1_1ML_1_1Automl_1_1FindBestModel.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/automl/FindBestModel.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"tunehyperparameters"},"TuneHyperparameters"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.automl import *\nfrom synapse.ml.train import *\nfrom pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n\n\ndf = (spark.createDataFrame([\n (0, 1, 1, 1, 1, 1, 1.0, 3, 1, 1),\n (0, 1, 1, 1, 1, 2, 1.0, 1, 1, 1),\n (0, 1, 1, 1, 1, 2, 1.0, 2, 1, 1),\n (0, 1, 2, 3, 1, 2, 1.0, 3, 1, 1),\n (0, 3, 1, 1, 1, 2, 1.0, 3, 1, 1)\n], ["Label", "Clump_Thickness", "Uniformity_of_Cell_Size",\n "Uniformity_of_Cell_Shape", "Marginal_Adhesion", "Single_Epithelial_Cell_Size",\n "Bare_Nuclei", "Bland_Chromatin", 
"Normal_Nucleoli", "Mitoses"]))\n\nlogReg = LogisticRegression()\nrandForest = RandomForestClassifier()\ngbt = GBTClassifier()\nsmlmodels = [logReg, randForest, gbt]\nmmlmodels = [TrainClassifier(model=model, labelCol="Label") for model in smlmodels]\n\nparamBuilder = (HyperparamBuilder()\n .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3))\n .addHyperparam(randForest, randForest.numTrees, DiscreteHyperParam([5,10]))\n .addHyperparam(randForest, randForest.maxDepth, DiscreteHyperParam([3,5]))\n .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8,16))\n .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3,5])))\nsearchSpace = paramBuilder.build()\n# The search space is a list of params to tuples of estimator and hyperparam\nrandomSpace = RandomSpace(searchSpace)\n\nbestModel = TuneHyperparameters(\n evaluationMetric="accuracy", models=mmlmodels, numFolds=2,\n numRuns=len(mmlmodels) * 2, parallelism=2,\n paramSpace=randomSpace.space(), seed=0).fit(df)\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.automl._\nimport com.microsoft.azure.synapse.ml.train._\nimport spark.implicits._\n\nval logReg = new LogisticRegression()\nval randForest = new RandomForestClassifier()\nval gbt = new GBTClassifier()\nval smlmodels = Seq(logReg, randForest, gbt)\nval mmlmodels = smlmodels.map(model => new TrainClassifier().setModel(model).setLabelCol("Label"))\n\nval paramBuilder = new HyperparamBuilder()\n .addHyperparam(logReg.regParam, new DoubleRangeHyperParam(0.1, 0.3))\n .addHyperparam(randForest.numTrees, new DiscreteHyperParam(List(5,10)))\n .addHyperparam(randForest.maxDepth, new DiscreteHyperParam(List(3,5)))\n .addHyperparam(gbt.maxBins, new IntRangeHyperParam(8,16))\n.addHyperparam(gbt.maxDepth, new DiscreteHyperParam(List(3,5)))\nval searchSpace = paramBuilder.build()\nval randomSpace = new RandomSpace(searchSpace)\n\nval dataset: 
DataFrame = Seq(\n (0, 1, 1, 1, 1, 1, 1.0, 3, 1, 1),\n (0, 1, 1, 1, 1, 2, 1.0, 1, 1, 1),\n (0, 1, 1, 1, 1, 2, 1.0, 2, 1, 1),\n (0, 1, 2, 3, 1, 2, 1.0, 3, 1, 1),\n (0, 3, 1, 1, 1, 2, 1.0, 3, 1, 1))\n .toDF("Label", "Clump_Thickness", "Uniformity_of_Cell_Size",\n "Uniformity_of_Cell_Shape", "Marginal_Adhesion", "Single_Epithelial_Cell_Size",\n "Bare_Nuclei", "Bland_Chromatin", "Normal_Nucleoli", "Mitoses")\n\nval tuneHyperparameters = new TuneHyperparameters().setEvaluationMetric("accuracy")\n .setModels(mmlmodels.toArray).setNumFolds(2).setNumRuns(mmlmodels.length * 2)\n .setParallelism(1).setParamSpace(randomSpace).setSeed(0)\ntuneHyperparameters.fit(dataset).show()\n')))),(0,r.kt)(i.Z,{className:"TuneHyperparameters",py:"synapse.ml.automl.html#module-synapse.ml.automl.TuneHyperparameters",scala:"com/microsoft/azure/synapse/ml/automl/TuneHyperparameters.html",csharp:"classSynapse_1_1ML_1_1Automl_1_1TuneHyperparameters.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/automl/TuneHyperparameters.scala",mdxType:"DocTable"}))}p.isMDXComponent=!0;var d=["components"],f=[{value:"Featurize",id:"featurize",level:2},{value:"CleanMissingData",id:"cleanmissingdata",level:3},{value:"CountSelector",id:"countselector",level:3},{value:"Featurize",id:"featurize-1",level:3},{value:"ValueIndexer",id:"valueindexer",level:3},{value:"Featurize Text",id:"featurize-text",level:2},{value:"TextFeaturizer",id:"textfeaturizer",level:3}],y={toc:f};function v(e){var a=e.components,n=(0,s.Z)(e,d);return(0,r.kt)("wrapper",(0,t.Z)({},y,n,{components:a,mdxType:"MDXLayout"}),(0,r.kt)("h2",{id:"featurize"},"Featurize"),(0,r.kt)("h3",{id:"cleanmissingdata"},"CleanMissingData"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from 
synapse.ml.featurize import *\n\ndataset = spark.createDataFrame([\n (0, 2, 0.50, 0.60, 0),\n (1, 3, 0.40, None, None),\n (0, 4, 0.78, 0.99, 2),\n (1, 5, 0.12, 0.34, 3),\n (0, 1, 0.50, 0.60, 0),\n (None, None, None, None, None),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3),\n (0, None, 0.50, 0.60, 0),\n (1, 2, 0.40, 0.50, None),\n (0, 3, None, 0.99, 2),\n (1, 4, 0.12, 0.34, 3)\n], ["col1", "col2", "col3", "col4", "col5"])\n\ncmd = (CleanMissingData()\n .setInputCols(dataset.columns)\n .setOutputCols(dataset.columns)\n .setCleaningMode("Mean"))\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.featurize._\nimport java.lang.{Boolean => JBoolean, Double => JDouble, Integer => JInt}\nimport spark.implicits._\n\ndef createMockDataset: DataFrame = {\n Seq[(JInt, JInt, JDouble, JDouble, JInt)](\n (0, 2, 0.50, 0.60, 0),\n (1, 3, 0.40, null, null),\n (0, 4, 0.78, 0.99, 2),\n (1, 5, 0.12, 0.34, 3),\n (0, 1, 0.50, 0.60, 0),\n (null, null, null, null, null),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3),\n (0, null, 0.50, 0.60, 0),\n (1, 2, 0.40, 0.50, null),\n (0, 3, null, 0.99, 2),\n (1, 4, 0.12, 0.34, 3))\n .toDF("col1", "col2", "col3", "col4", "col5")\n }\n\nval dataset = createMockDataset\nval cmd = (new CleanMissingData()\n .setInputCols(dataset.columns)\n .setOutputCols(dataset.columns)\n 
.setCleaningMode("Mean"))\n')))),(0,r.kt)(i.Z,{className:"CleanMissingData",py:"synapse.ml.featurize.html#module-synapse.ml.featurize.CleanMissingData",scala:"com/microsoft/azure/synapse/ml/featurize/CleanMissingData.html",csharp:"classSynapse_1_1ML_1_1Featurize_1_1CleanMissingData.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/CleanMissingData.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"countselector"},"CountSelector"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.featurize import *\nfrom pyspark.ml.linalg import Vectors\n\ndf = spark.createDataFrame([\n (Vectors.sparse(3, [(0, 1.0), (2, 2.0)]), Vectors.dense(1.0, 0.1, 0)),\n (Vectors.sparse(3, [(0, 1.0), (2, 2.0)]), Vectors.dense(1.0, 0.1, 0))\n], ["col1", "col2"])\n\ncs = CountSelector().setInputCol("col1").setOutputCol("col3")\n\ncs.fit(df).transform(df).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.featurize._\nimport org.apache.spark.ml.linalg.Vectors\nimport spark.implicits._\n\nval df = Seq(\n (Vectors.sparse(3, Seq((0, 1.0), (2, 2.0))), Vectors.dense(1.0, 0.1, 0)),\n (Vectors.sparse(3, Seq((0, 1.0), (2, 2.0))), Vectors.dense(1.0, 0.1, 0))\n ).toDF("col1", "col2")\n\nval cs = (new CountSelector()\n .setInputCol("col1")\n 
.setOutputCol("col3"))\n\ncs.fit(df).transform(df).show()\n')))),(0,r.kt)(i.Z,{className:"CountSelector",py:"synapse.ml.featurize.html#module-synapse.ml.featurize.CountSelector",scala:"com/microsoft/azure/synapse/ml/featurize/CountSelector.html",csharp:"classSynapse_1_1ML_1_1Featurize_1_1CountSelector.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/CountSelector.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"featurize-1"},"Featurize"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.featurize import *\n\ndataset = spark.createDataFrame([\n (0, 2, 0.50, 0.60, "pokemon are everywhere"),\n (1, 3, 0.40, 0.50, "they are in the woods"),\n (0, 4, 0.78, 0.99, "they are in the water"),\n (1, 5, 0.12, 0.34, "they are in the fields"),\n (0, 3, 0.78, 0.99, "pokemon - gotta catch em all")\n], ["Label", "col1", "col2", "col3"])\n\nfeat = (Featurize()\n .setNumFeatures(10)\n .setOutputCol("testColumn")\n .setInputCols(["col1", "col2", "col3"])\n .setOneHotEncodeCategoricals(False))\n\nfeat.fit(dataset).transform(dataset).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.featurize._\nimport spark.implicits._\n\nval dataset = Seq(\n (0, 2, 0.50, 0.60, "pokemon are everywhere"),\n (1, 3, 0.40, 0.50, "they are in the woods"),\n (0, 4, 0.78, 0.99, "they are in the water"),\n (1, 5, 0.12, 0.34, "they are in the fields"),\n (0, 3, 0.78, 0.99, "pokemon - gotta catch em all")).toDF("Label", "col1", "col2", "col3")\n\nval featureColumns = dataset.columns.filter(_ != "Label")\n\nval feat = (new Featurize()\n .setNumFeatures(10)\n .setOutputCol("testColumn")\n 
.setInputCols(featureColumns)\n .setOneHotEncodeCategoricals(false))\n\nfeat.fit(dataset).transform(dataset).show()\n')))),(0,r.kt)(i.Z,{className:"Featurize",py:"synapse.ml.featurize.html#module-synapse.ml.featurize.Featurize",scala:"com/microsoft/azure/synapse/ml/featurize/Featurize.html",csharp:"classSynapse_1_1ML_1_1Featurize_1_1Featurize.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/Featurize.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"valueindexer"},"ValueIndexer"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.featurize import *\n\ndf = spark.createDataFrame([\n (-3, 24, 0.32534, True, "piano"),\n (1, 5, 5.67, False, "piano"),\n (-3, 5, 0.32534, False, "guitar")\n], ["int", "long", "double", "bool", "string"])\n\nvi = ValueIndexer().setInputCol("string").setOutputCol("string_cat")\n\nvi.fit(df).transform(df).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.featurize._\nimport spark.implicits._\n\nval df = Seq[(Int, Long, Double, Boolean, String)](\n (-3, 24L, 0.32534, true, "piano"),\n (1, 5L, 5.67, false, "piano"),\n (-3, 5L, 0.32534, false, "guitar")).toDF("int", "long", "double", "bool", "string")\n\nval vi = new 
ValueIndexer().setInputCol("string").setOutputCol("string_cat")\n\nvi.fit(df).transform(df).show()\n')))),(0,r.kt)(i.Z,{className:"ValueIndexer",py:"synapse.ml.featurize.html#module-synapse.ml.featurize.ValueIndexer",scala:"com/microsoft/azure/synapse/ml/featurize/ValueIndexer.html",csharp:"classSynapse_1_1ML_1_1Featurize_1_1ValueIndexer.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/ValueIndexer.scala",mdxType:"DocTable"}),(0,r.kt)("h2",{id:"featurize-text"},"Featurize Text"),(0,r.kt)("h3",{id:"textfeaturizer"},"TextFeaturizer"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.featurize.text import *\n\ndfRaw = spark.createDataFrame([\n (0, "Hi I"),\n (1, "I wish for snow today"),\n (2, "we Cant go to the park, because of the snow!"),\n (3, "")\n], ["label", "sentence"])\n\ntfRaw = (TextFeaturizer()\n .setInputCol("sentence")\n .setOutputCol("features")\n .setNumFeatures(20))\n\ntfRaw.fit(dfRaw).transform(dfRaw).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.featurize.text._\nimport spark.implicits._\n\nval dfRaw = Seq((0, "Hi I"),\n (1, "I wish for snow today"),\n (2, "we Cant go to the park, because of the snow!"),\n (3, "")).toDF("label", "sentence")\n\nval tfRaw = (new TextFeaturizer()\n .setInputCol("sentence")\n .setOutputCol("features")\n 
.setNumFeatures(20))\n\ntfRaw.fit(dfRaw).transform(dfRaw).show()\n')))),(0,r.kt)(i.Z,{className:"TextFeaturizer",py:"synapse.ml.featurize.text.html#module-synapse.ml.featurize.text.TextFeaturizer",scala:"com/microsoft/azure/synapse/ml/featurize/text/TextFeaturizer.html",csharp:"classSynapse_1_1ML_1_1Featurize_1_1Text_1_1TextFeaturizer.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/featurize/text/TextFeaturizer.scala",mdxType:"DocTable"}))}v.isMDXComponent=!0;var g=["components"],h=[{value:"Isolation Forest",id:"isolation-forest",level:2},{value:"IsolationForest",id:"isolationforest",level:3}],k={toc:h};function b(e){var a=e.components,n=(0,s.Z)(e,g);return(0,r.kt)("wrapper",(0,t.Z)({},k,n,{components:a,mdxType:"MDXLayout"}),(0,r.kt)("h2",{id:"isolation-forest"},"Isolation Forest"),(0,r.kt)("h3",{id:"isolationforest"},"IsolationForest"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.isolationforest import *\n\nisolationForest = (IsolationForest()\n .setNumEstimators(100)\n .setBootstrap(False)\n .setMaxSamples(256)\n .setMaxFeatures(1.0)\n .setFeaturesCol("features")\n .setPredictionCol("predictedLabel")\n .setScoreCol("outlierScore")\n .setContamination(0.02)\n .setContaminationError(0.02 * 0.01)\n .setRandomSeed(1))\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.isolationforest._\nimport spark.implicits._\n\nval isolationForest = (new IsolationForest()\n .setNumEstimators(100)\n .setBootstrap(false)\n .setMaxSamples(256)\n .setMaxFeatures(1.0)\n .setFeaturesCol("features")\n .setPredictionCol("predictedLabel")\n .setScoreCol("outlierScore")\n 
.setContamination(0.02)\n .setContaminationError(0.02 * 0.01)\n .setRandomSeed(1))\n')))),(0,r.kt)(i.Z,{className:"IsolationForest",py:"synapse.ml.isolationforest.html#module-synapse.ml.isolationforest.IsolationForest",scala:"com/microsoft/azure/synapse/ml/isolationforest/IsolationForest.html",csharp:"classSynapse_1_1ML_1_1Isolationforest_1_1IsolationForest.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/isolationforest/IsolationForest.scala",mdxType:"DocTable"}))}b.isMDXComponent=!0;var T=["components"],M=[{value:"NN",id:"nn",level:2},{value:"ConditionalKNN",id:"conditionalknn",level:3},{value:"KNN",id:"knn",level:3}],C={toc:M};function I(e){var a=e.components,n=(0,s.Z)(e,T);return(0,r.kt)("wrapper",(0,t.Z)({},C,n,{components:a,mdxType:"MDXLayout"}),(0,r.kt)("h2",{id:"nn"},"NN"),(0,r.kt)("h3",{id:"conditionalknn"},"ConditionalKNN"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.nn import *\n\ncknn = (ConditionalKNN()\n .setOutputCol("matches")\n .setFeaturesCol("features"))\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.nn._\nimport spark.implicits._\n\nval cknn = (new ConditionalKNN()\n .setOutputCol("matches")\n 
.setFeaturesCol("features"))\n')))),(0,r.kt)(i.Z,{className:"ConditionalKNN",py:"synapse.ml.nn.html#module-synapse.ml.nn.ConditionalKNN",scala:"com/microsoft/azure/synapse/ml/nn/ConditionalKNN.html",csharp:"classSynapse_1_1ML_1_1Nn_1_1ConditionalKNN.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/nn/ConditionalKNN.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"knn"},"KNN"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.nn import *\n\nknn = (KNN()\n .setOutputCol("matches"))\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.nn._\nimport spark.implicits._\n\nval knn = (new KNN()\n .setOutputCol("matches"))\n')))),(0,r.kt)(i.Z,{className:"KNN",py:"synapse.ml.nn.html#module-synapse.ml.nn.KNN",scala:"com/microsoft/azure/synapse/ml/nn/KNN.html",csharp:"classSynapse_1_1ML_1_1Nn_1_1KNN.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/nn/KNN.scala",mdxType:"DocTable"}))}I.isMDXComponent=!0;var _=["components"],w=[{value:"Recommendation",id:"recommendation",level:2},{value:"RecommendationIndexer, RankingEvaluator, RankingAdapter and RankingTrainValidationSplit",id:"recommendationindexer-rankingevaluator-rankingadapter-and-rankingtrainvalidationsplit",level:3},{value:"SAR",id:"sar",level:3}],N={toc:w};function S(e){var a=e.components,n=(0,s.Z)(e,_);return(0,r.kt)("wrapper",(0,t.Z)({},N,n,{components:a,mdxType:"MDXLayout"}),(0,r.kt)("h2",{id:"recommendation"},"Recommendation"),(0,r.kt)("h3",{id:"recommendationindexer-rankingevaluator-rankingadapter-and-rankingtrainvalidationsplit"},"RecommendationIndexer, 
RankingEvaluator, RankingAdapter and RankingTrainValidationSplit"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.recommendation import *\nfrom pyspark.ml.recommendation import ALS\nfrom pyspark.ml.tuning import *\n\nratings = (spark.createDataFrame([\n ("11", "Movie 01", 2),\n ("11", "Movie 03", 1),\n ("11", "Movie 04", 5),\n ("11", "Movie 05", 3),\n ("11", "Movie 06", 4),\n ("11", "Movie 07", 1),\n ("11", "Movie 08", 5),\n ("11", "Movie 09", 3),\n ("22", "Movie 01", 4),\n ("22", "Movie 02", 5),\n ("22", "Movie 03", 1),\n ("22", "Movie 05", 3),\n ("22", "Movie 06", 3),\n ("22", "Movie 07", 5),\n ("22", "Movie 08", 1),\n ("22", "Movie 10", 3),\n ("33", "Movie 01", 4),\n ("33", "Movie 03", 1),\n ("33", "Movie 04", 5),\n ("33", "Movie 05", 3),\n ("33", "Movie 06", 4),\n ("33", "Movie 08", 1),\n ("33", "Movie 09", 5),\n ("33", "Movie 10", 3),\n ("44", "Movie 01", 4),\n ("44", "Movie 02", 5),\n ("44", "Movie 03", 1),\n ("44", "Movie 05", 3),\n ("44", "Movie 06", 4),\n ("44", "Movie 07", 5),\n ("44", "Movie 08", 1),\n ("44", "Movie 10", 3)\n ], ["customerIDOrg", "itemIDOrg", "rating"])\n .dropDuplicates()\n .cache())\n\nrecommendationIndexer = (RecommendationIndexer()\n .setUserInputCol("customerIDOrg")\n .setUserOutputCol("customerID")\n .setItemInputCol("itemIDOrg")\n .setItemOutputCol("itemID")\n .setRatingCol("rating"))\n\ntransformedDf = (recommendationIndexer.fit(ratings)\n .transform(ratings).cache())\n\nals = (ALS()\n .setNumUserBlocks(1)\n .setNumItemBlocks(1)\n .setUserCol("customerID")\n .setItemCol("itemID")\n .setRatingCol("rating")\n .setSeed(0))\n\nevaluator = (RankingEvaluator()\n .setK(3)\n .setNItems(10))\n\nadapter = (RankingAdapter()\n .setK(evaluator.getK())\n 
.setRecommender(als))\n\nadapter.fit(transformedDf).transform(transformedDf).show()\n\nparamGrid = (ParamGridBuilder()\n .addGrid(als.regParam, [1.0])\n .build())\n\ntvRecommendationSplit = (RankingTrainValidationSplit()\n .setEstimator(als)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setTrainRatio(0.8)\n .setUserCol(recommendationIndexer.getUserOutputCol())\n .setItemCol(recommendationIndexer.getItemOutputCol())\n .setRatingCol("rating"))\n\ntvRecommendationSplit.fit(transformedDf).transform(transformedDf).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.recommendation._\nimport org.apache.spark.ml.recommendation.ALS\nimport org.apache.spark.ml.tuning._\nimport spark.implicits._\n\nval ratings = (Seq(\n ("11", "Movie 01", 2),\n ("11", "Movie 03", 1),\n ("11", "Movie 04", 5),\n ("11", "Movie 05", 3),\n ("11", "Movie 06", 4),\n ("11", "Movie 07", 1),\n ("11", "Movie 08", 5),\n ("11", "Movie 09", 3),\n ("22", "Movie 01", 4),\n ("22", "Movie 02", 5),\n ("22", "Movie 03", 1),\n ("22", "Movie 05", 3),\n ("22", "Movie 06", 3),\n ("22", "Movie 07", 5),\n ("22", "Movie 08", 1),\n ("22", "Movie 10", 3),\n ("33", "Movie 01", 4),\n ("33", "Movie 03", 1),\n ("33", "Movie 04", 5),\n ("33", "Movie 05", 3),\n ("33", "Movie 06", 4),\n ("33", "Movie 08", 1),\n ("33", "Movie 09", 5),\n ("33", "Movie 10", 3),\n ("44", "Movie 01", 4),\n ("44", "Movie 02", 5),\n ("44", "Movie 03", 1),\n ("44", "Movie 05", 3),\n ("44", "Movie 06", 4),\n ("44", "Movie 07", 5),\n ("44", "Movie 08", 1),\n ("44", "Movie 10", 3))\n .toDF("customerIDOrg", "itemIDOrg", "rating")\n .dropDuplicates()\n .cache())\n\nval recommendationIndexer = (new RecommendationIndexer()\n .setUserInputCol("customerIDOrg")\n .setUserOutputCol("customerID")\n .setItemInputCol("itemIDOrg")\n .setItemOutputCol("itemID")\n .setRatingCol("rating"))\n\nval transformedDf = 
(recommendationIndexer.fit(ratings)\n .transform(ratings).cache())\n\nval als = (new ALS()\n .setNumUserBlocks(1)\n .setNumItemBlocks(1)\n .setUserCol("customerID")\n .setItemCol("itemID")\n .setRatingCol("rating")\n .setSeed(0))\n\nval evaluator = (new RankingEvaluator()\n .setK(3)\n .setNItems(10))\n\nval adapter = (new RankingAdapter()\n .setK(evaluator.getK)\n .setRecommender(als))\n\nadapter.fit(transformedDf).transform(transformedDf).show()\n\nval paramGrid = (new ParamGridBuilder()\n .addGrid(als.regParam, Array(1.0))\n .build())\n\nval tvRecommendationSplit = (new RankingTrainValidationSplit()\n .setEstimator(als)\n .setEvaluator(evaluator)\n .setEstimatorParamMaps(paramGrid)\n .setTrainRatio(0.8)\n .setUserCol(recommendationIndexer.getUserOutputCol)\n .setItemCol(recommendationIndexer.getItemOutputCol)\n .setRatingCol("rating"))\n\ntvRecommendationSplit.fit(transformedDf).transform(transformedDf).show()\n')))),(0,r.kt)(i.Z,{className:"RecommendationIndexer",py:"synapse.ml.recommendation.html#module-synapse.ml.recommendation.RecommendationIndexer",scala:"com/microsoft/azure/synapse/ml/recommendation/RecommendationIndexer.html",csharp:"classSynapse_1_1ML_1_1Recommendation_1_1RecommendationIndexer.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/recommendation/RecommendationIndexer.scala",mdxType:"DocTable"}),(0,r.kt)(i.Z,{className:"RankingEvaluator",py:"synapse.ml.recommendation.html#module-synapse.ml.recommendation.RankingEvaluator",scala:"com/microsoft/azure/synapse/ml/recommendation/RankingEvaluator.html",csharp:"classSynapse_1_1ML_1_1Recommendation_1_1RankingEvaluator.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/recommendation/RankingEvaluator.scala",mdxType:"DocTable"}),(0,r.kt)(i.Z,{className:"RankingAdapter",py:"synapse.ml.recommendation.html#module-synapse.ml.recommendation.RankingAdapter",scala:"com/micros
oft/azure/synapse/ml/recommendation/RankingAdapter.html",csharp:"classSynapse_1_1ML_1_1Recommendation_1_1RankingAdapter.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/recommendation/RankingAdapter.scala",mdxType:"DocTable"}),(0,r.kt)(i.Z,{className:"RankingTrainValidationSplit",py:"synapse.ml.recommendation.html#module-synapse.ml.recommendation.RankingTrainValidationSplit",scala:"com/microsoft/azure/synapse/ml/recommendation/RankingTrainValidationSplit.html",csharp:"classSynapse_1_1ML_1_1Recommendation_1_1RankingTrainValidationSplit.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/recommendation/RankingTrainValidationSplit.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"sar"},"SAR"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.recommendation import *\n\nratings = (spark.createDataFrame([\n ("11", "Movie 01", 2),\n ("11", "Movie 03", 1),\n ("11", "Movie 04", 5),\n ("11", "Movie 05", 3),\n ("11", "Movie 06", 4),\n ("11", "Movie 07", 1),\n ("11", "Movie 08", 5),\n ("11", "Movie 09", 3),\n ("22", "Movie 01", 4),\n ("22", "Movie 02", 5),\n ("22", "Movie 03", 1),\n ("22", "Movie 05", 3),\n ("22", "Movie 06", 3),\n ("22", "Movie 07", 5),\n ("22", "Movie 08", 1),\n ("22", "Movie 10", 3),\n ("33", "Movie 01", 4),\n ("33", "Movie 03", 1),\n ("33", "Movie 04", 5),\n ("33", "Movie 05", 3),\n ("33", "Movie 06", 4),\n ("33", "Movie 08", 1),\n ("33", "Movie 09", 5),\n ("33", "Movie 10", 3),\n ("44", "Movie 01", 4),\n ("44", "Movie 02", 5),\n ("44", "Movie 03", 1),\n ("44", "Movie 05", 3),\n ("44", "Movie 06", 4),\n ("44", "Movie 07", 5),\n ("44", "Movie 08", 1),\n ("44", "Movie 10", 3)\n ], ["customerIDOrg", "itemIDOrg", 
"rating"])\n .dropDuplicates()\n .cache())\n\nrecommendationIndexer = (RecommendationIndexer()\n .setUserInputCol("customerIDOrg")\n .setUserOutputCol("customerID")\n .setItemInputCol("itemIDOrg")\n .setItemOutputCol("itemID")\n .setRatingCol("rating"))\n\nalgo = (SAR()\n .setUserCol("customerID")\n .setItemCol("itemID")\n .setRatingCol("rating")\n .setTimeCol("timestamp")\n .setSupportThreshold(1)\n .setSimilarityFunction("jacccard")\n .setActivityTimeFormat("EEE MMM dd HH:mm:ss Z yyyy"))\n\nadapter = (RankingAdapter()\n .setK(5)\n .setRecommender(algo))\n\nres1 = recommendationIndexer.fit(ratings).transform(ratings).cache()\n\nadapter.fit(res1).transform(res1).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.recommendation._\nimport spark.implicits._\n\nval ratings = (Seq(\n ("11", "Movie 01", 2),\n ("11", "Movie 03", 1),\n ("11", "Movie 04", 5),\n ("11", "Movie 05", 3),\n ("11", "Movie 06", 4),\n ("11", "Movie 07", 1),\n ("11", "Movie 08", 5),\n ("11", "Movie 09", 3),\n ("22", "Movie 01", 4),\n ("22", "Movie 02", 5),\n ("22", "Movie 03", 1),\n ("22", "Movie 05", 3),\n ("22", "Movie 06", 3),\n ("22", "Movie 07", 5),\n ("22", "Movie 08", 1),\n ("22", "Movie 10", 3),\n ("33", "Movie 01", 4),\n ("33", "Movie 03", 1),\n ("33", "Movie 04", 5),\n ("33", "Movie 05", 3),\n ("33", "Movie 06", 4),\n ("33", "Movie 08", 1),\n ("33", "Movie 09", 5),\n ("33", "Movie 10", 3),\n ("44", "Movie 01", 4),\n ("44", "Movie 02", 5),\n ("44", "Movie 03", 1),\n ("44", "Movie 05", 3),\n ("44", "Movie 06", 4),\n ("44", "Movie 07", 5),\n ("44", "Movie 08", 1),\n ("44", "Movie 10", 3))\n .toDF("customerIDOrg", "itemIDOrg", "rating")\n .dropDuplicates()\n .cache())\n\nval recommendationIndexer = (new RecommendationIndexer()\n .setUserInputCol("customerIDOrg")\n .setUserOutputCol("customerID")\n .setItemInputCol("itemIDOrg")\n .setItemOutputCol("itemID")\n 
.setRatingCol("rating"))\n\nval algo = (new SAR()\n .setUserCol("customerID")\n .setItemCol("itemID")\n .setRatingCol("rating")\n .setTimeCol("timestamp")\n .setSupportThreshold(1)\n .setSimilarityFunction("jacccard")\n .setActivityTimeFormat("EEE MMM dd HH:mm:ss Z yyyy"))\n\nval adapter = (new RankingAdapter()\n .setK(5)\n .setRecommender(algo))\n\nval res1 = recommendationIndexer.fit(ratings).transform(ratings).cache()\n\nadapter.fit(res1).transform(res1).show()\n')))),(0,r.kt)(i.Z,{className:"SAR",py:"synapse.ml.recommendation.html#module-synapse.ml.recommendation.SAR",scala:"com/microsoft/azure/synapse/ml/recommendation/SAR.html",csharp:"classSynapse_1_1ML_1_1Recommendation_1_1SAR.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/recommendation/SAR.scala",mdxType:"DocTable"}))}S.isMDXComponent=!0;var x=["components"],D=[{value:"Stages",id:"stages",level:2},{value:"ClassBalancer",id:"classbalancer",level:3},{value:"MultiColumnAdapter",id:"multicolumnadapter",level:3},{value:"Timer",id:"timer",level:3}],R={toc:D};function z(e){var a=e.components,n=(0,s.Z)(e,x);return(0,r.kt)("wrapper",(0,t.Z)({},R,n,{components:a,mdxType:"MDXLayout"}),(0,r.kt)("h2",{id:"stages"},"Stages"),(0,r.kt)("h3",{id:"classbalancer"},"ClassBalancer"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.stages import *\n\ndf = (spark.createDataFrame([\n (0, 1.0, "Hi I"),\n (1, 1.0, "I wish for snow today"),\n (2, 2.0, "I wish for snow today"),\n (3, 2.0, "I wish for snow today"),\n (4, 2.0, "I wish for snow today"),\n (5, 2.0, "I wish for snow today"),\n (6, 0.0, "I wish for snow today"),\n (7, 1.0, "I wish for snow today"),\n (8, 0.0, "we Cant go to the park, because of the snow!"),\n (9, 2.0, "")\n ], ["index", 
"label", "sentence"]))\n\ncb = ClassBalancer().setInputCol("label")\n\ncb.fit(df).transform(df).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.stages._\n\nval df = Seq(\n (0, 1.0, "Hi I"),\n (1, 1.0, "I wish for snow today"),\n (2, 2.0, "I wish for snow today"),\n (3, 2.0, "I wish for snow today"),\n (4, 2.0, "I wish for snow today"),\n (5, 2.0, "I wish for snow today"),\n (6, 0.0, "I wish for snow today"),\n (7, 1.0, "I wish for snow today"),\n (8, 0.0, "we Cant go to the park, because of the snow!"),\n (9, 2.0, "")).toDF("index", "label", "sentence")\n\nval cb = new ClassBalancer().setInputCol("label")\n\ncb.fit(df).transform(df).show()\n')))),(0,r.kt)(i.Z,{className:"ClassBalancer",py:"synapse.ml.stages.html#module-synapse.ml.stages.ClassBalancer",scala:"com/microsoft/azure/synapse/ml/stages/ClassBalancer.html",csharp:"classSynapse_1_1ML_1_1Stages_1_1ClassBalancer.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/stages/ClassBalancer.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"multicolumnadapter"},"MultiColumnAdapter"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.stages import *\nfrom pyspark.ml.feature import Tokenizer\n\ndf = (spark.createDataFrame([\n (0, "This is a test", "this is one too"),\n (1, "could be a test", "bar"),\n (2, "foo", "bar"),\n (3, "foo", "maybe not")\n ], ["label", "words1", "words2"]))\n\nstage1 = Tokenizer()\nmca = (MultiColumnAdapter()\n .setBaseStage(stage1)\n .setInputCols(["words1", "words2"])\n .setOutputCols(["output1", 
"output2"]))\n\nmca.fit(df).transform(df).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.stages._\nimport org.apache.spark.ml.feature.Tokenizer\n\nval df = (Seq(\n (0, "This is a test", "this is one too"),\n (1, "could be a test", "bar"),\n (2, "foo", "bar"),\n (3, "foo", "maybe not"))\n .toDF("label", "words1", "words2"))\n\nval stage1 = new Tokenizer()\nval mca = (new MultiColumnAdapter()\n .setBaseStage(stage1)\n .setInputCols(Array[String]("words1", "words2"))\n .setOutputCols(Array[String]("output1", "output2")))\n\nmca.fit(df).transform(df).show()\n')))),(0,r.kt)(i.Z,{className:"MultiColumnAdapter",py:"synapse.ml.stages.html#module-synapse.ml.stages.MultiColumnAdapter",scala:"com/microsoft/azure/synapse/ml/stages/MultiColumnAdapter.html",csharp:"classSynapse_1_1ML_1_1Stages_1_1MultiColumnAdapter.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/stages/MultiColumnAdapter.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"timer"},"Timer"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.stages import *\nfrom pyspark.ml.feature import *\n\ndf = (spark.createDataFrame([\n (0, "Hi I"),\n (1, "I wish for snow today"),\n (2, "we Cant go to the park, because of the snow!"),\n (3, "")\n ], ["label", "sentence"]))\n\ntok = (Tokenizer()\n .setInputCol("sentence")\n .setOutputCol("tokens"))\n\ndf2 = Timer().setStage(tok).fit(df).transform(df)\n\ndf3 = HashingTF().setInputCol("tokens").setOutputCol("hash").transform(df2)\n\nidf = IDF().setInputCol("hash").setOutputCol("idf")\ntimer = 
Timer().setStage(idf)\n\ntimer.fit(df3).transform(df3).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.stages._\nimport org.apache.spark.ml.feature._\n\nval df = (Seq(\n (0, "Hi I"),\n (1, "I wish for snow today"),\n (2, "we Cant go to the park, because of the snow!"),\n (3, "")\n ).toDF("label", "sentence"))\n\nval tok = (new Tokenizer()\n .setInputCol("sentence")\n .setOutputCol("tokens"))\n\nval df2 = new Timer().setStage(tok).fit(df).transform(df)\n\nval df3 = new HashingTF().setInputCol("tokens").setOutputCol("hash").transform(df2)\n\nval idf = new IDF().setInputCol("hash").setOutputCol("idf")\nval timer = new Timer().setStage(idf)\n\ntimer.fit(df3).transform(df3).show()\n')))),(0,r.kt)(i.Z,{className:"Timer",py:"synapse.ml.stages.html#module-synapse.ml.stages.Timer",scala:"com/microsoft/azure/synapse/ml/stages/Timer.html",csharp:"classSynapse_1_1ML_1_1Stages_1_1Timer.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/stages/Timer.scala",mdxType:"DocTable"}))}z.isMDXComponent=!0;var F=["components"],L=[{value:"Train",id:"train",level:2},{value:"TrainClassifier",id:"trainclassifier",level:3},{value:"TrainRegressor",id:"trainregressor",level:3}],Z={toc:L};function E(e){var a=e.components,n=(0,s.Z)(e,F);return(0,r.kt)("wrapper",(0,t.Z)({},Z,n,{components:a,mdxType:"MDXLayout"}),(0,r.kt)("h2",{id:"train"},"Train"),(0,r.kt)("h3",{id:"trainclassifier"},"TrainClassifier"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.train import *\nfrom pyspark.ml.classification import LogisticRegression\n\ndf = spark.createDataFrame([\n (0, 2, 0.50, 0.60, 0),\n (1, 3, 0.40, 
0.50, 1),\n (0, 4, 0.78, 0.99, 2),\n (1, 5, 0.12, 0.34, 3),\n (0, 1, 0.50, 0.60, 0),\n (1, 3, 0.40, 0.50, 1),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3),\n (0, 0, 0.50, 0.60, 0),\n (1, 2, 0.40, 0.50, 1),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3)],\n ["Label", "col1", "col2", "col3", "col4"]\n)\n\ntc = (TrainClassifier()\n .setModel(LogisticRegression())\n .setLabelCol("Label"))\n\ntc.fit(df).transform(df).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.train._\nimport org.apache.spark.ml.classification.LogisticRegression\n\nval df = (Seq(\n (0, 2, 0.50, 0.60, 0),\n (1, 3, 0.40, 0.50, 1),\n (0, 4, 0.78, 0.99, 2),\n (1, 5, 0.12, 0.34, 3),\n (0, 1, 0.50, 0.60, 0),\n (1, 3, 0.40, 0.50, 1),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3),\n (0, 0, 0.50, 0.60, 0),\n (1, 2, 0.40, 0.50, 1),\n (0, 3, 0.78, 0.99, 2),\n (1, 4, 0.12, 0.34, 3))\n .toDF("Label", "col1", "col2", "col3", "col4"))\n\nval tc = (new TrainClassifier()\n .setModel(new LogisticRegression())\n .setLabelCol("Label"))\n\ntc.fit(df).transform(df).show()\n')))),(0,r.kt)(i.Z,{className:"TrainClassifier",py:"synapse.ml.train.html#module-synapse.ml.train.TrainClassifier",scala:"com/microsoft/azure/synapse/ml/train/TrainClassifier.html",csharp:"classSynapse_1_1ML_1_1Train_1_1TrainClassifier.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/train/TrainClassifier.scala",mdxType:"DocTable"}),(0,r.kt)("h3",{id:"trainregressor"},"TrainRegressor"),(0,r.kt)(o.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,r.kt)(l.Z,{value:"py",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.train import *\nfrom pyspark.ml.regression import LinearRegression\n\ndataset = 
(spark.createDataFrame([\n (0.0, 2, 0.50, 0.60, 0.0),\n (1.0, 3, 0.40, 0.50, 1.0),\n (2.0, 4, 0.78, 0.99, 2.0),\n (3.0, 5, 0.12, 0.34, 3.0),\n (0.0, 1, 0.50, 0.60, 0.0),\n (1.0, 3, 0.40, 0.50, 1.0),\n (2.0, 3, 0.78, 0.99, 2.0),\n (3.0, 4, 0.12, 0.34, 3.0),\n (0.0, 0, 0.50, 0.60, 0.0),\n (1.0, 2, 0.40, 0.50, 1.0),\n (2.0, 3, 0.78, 0.99, 2.0),\n (3.0, 4, 0.12, 0.34, 3.0)],\n ["label", "col1", "col2", "col3", "col4"]))\n\nlinearRegressor = (LinearRegression()\n .setRegParam(0.3)\n .setElasticNetParam(0.8))\ntrainRegressor = (TrainRegressor()\n .setModel(linearRegressor)\n .setLabelCol("label"))\n\ntrainRegressor.fit(dataset).transform(dataset).show()\n'))),(0,r.kt)(l.Z,{value:"scala",mdxType:"TabItem"},(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.train._\nimport org.apache.spark.ml.regression.LinearRegression\n\nval dataset = (spark.createDataFrame(Seq(\n (0.0, 2, 0.50, 0.60, 0.0),\n (1.0, 3, 0.40, 0.50, 1.0),\n (2.0, 4, 0.78, 0.99, 2.0),\n (3.0, 5, 0.12, 0.34, 3.0),\n (0.0, 1, 0.50, 0.60, 0.0),\n (1.0, 3, 0.40, 0.50, 1.0),\n (2.0, 3, 0.78, 0.99, 2.0),\n (3.0, 4, 0.12, 0.34, 3.0),\n (0.0, 0, 0.50, 0.60, 0.0),\n (1.0, 2, 0.40, 0.50, 1.0),\n (2.0, 3, 0.78, 0.99, 2.0),\n (3.0, 4, 0.12, 0.34, 3.0)))\n .toDF("label", "col1", "col2", "col3", "col4"))\n\nval linearRegressor = (new LinearRegression()\n .setRegParam(0.3)\n .setElasticNetParam(0.8))\nval trainRegressor = (new TrainRegressor()\n .setModel(linearRegressor)\n 
.setLabelCol("label"))\n\ntrainRegressor.fit(dataset).transform(dataset).show()\n')))),(0,r.kt)(i.Z,{className:"TrainRegressor",py:"synapse.ml.train.html#module-synapse.ml.train.TrainRegressor",scala:"com/microsoft/azure/synapse/ml/train/TrainRegressor.html",csharp:"classSynapse_1_1ML_1_1Train_1_1TrainRegressor.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/core/src/main/scala/com/microsoft/azure/synapse/ml/train/TrainRegressor.scala",mdxType:"DocTable"}))}E.isMDXComponent=!0;var O=["components"],P={title:"Estimators - Core",sidebar_label:"Core",hide_title:!0},V=void 0,A={unversionedId:"Quick Examples/estimators/estimators_core",id:"version-0.11.3/Quick Examples/estimators/estimators_core",title:"Estimators - Core",description:"",source:"@site/versioned_docs/version-0.11.3/Quick Examples/estimators/estimators_core.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_core",permalink:"/SynapseML/docs/0.11.3/Quick Examples/estimators/estimators_core",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:"Estimators - Core",sidebar_label:"Core",hide_title:!0}},B={},H=[].concat(c,f,h,M,w,D,L),q={toc:H};function K(e){var a=e.components,n=(0,s.Z)(e,O);return(0,r.kt)("wrapper",(0,t.Z)({},q,n,{components:a,mdxType:"MDXLayout"}),(0,r.kt)(p,{mdxType:"AutoML"}),(0,r.kt)(v,{mdxType:"Featurize"}),(0,r.kt)(b,{mdxType:"IsolationForest"}),(0,r.kt)(I,{mdxType:"NN"}),(0,r.kt)(S,{mdxType:"Recommendation"}),(0,r.kt)(z,{mdxType:"Stages"}),(0,r.kt)(E,{mdxType:"Train"}))}K.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/59ef8022.0528de46.js b/assets/js/59ef8022.0528de46.js new file mode 100644 index 0000000000..ab7d8430c6 --- /dev/null +++ b/assets/js/59ef8022.0528de46.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[8491],{3905:function(a,e,t){t.d(e,{Zo:function(){return o},kt:function(){return k}});var n=t(7294);function s(a,e,t){return e in 
a?Object.defineProperty(a,e,{value:t,enumerable:!0,configurable:!0,writable:!0}):a[e]=t,a}function m(a,e){var t=Object.keys(a);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(a);e&&(n=n.filter((function(e){return Object.getOwnPropertyDescriptor(a,e).enumerable}))),t.push.apply(t,n)}return t}function r(a){for(var e=1;e=0||(s[t]=a[t]);return s}(a,e);if(Object.getOwnPropertySymbols){var m=Object.getOwnPropertySymbols(a);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(a,t)&&(s[t]=a[t])}return s}var l=n.createContext({}),i=function(a){var e=n.useContext(l),t=e;return a&&(t="function"==typeof a?a(e):r(r({},e),a)),t},o=function(a){var e=i(a.components);return n.createElement(l.Provider,{value:e},a.children)},N={inlineCode:"code",wrapper:function(a){var e=a.children;return n.createElement(n.Fragment,{},e)}},c=n.forwardRef((function(a,e){var t=a.components,s=a.mdxType,m=a.originalType,l=a.parentName,o=p(a,["components","mdxType","originalType","parentName"]),c=i(t),k=s,h=c["".concat(l,".").concat(k)]||c[k]||N[k]||m;return t?n.createElement(h,r(r({ref:e},o),{},{components:t})):n.createElement(h,r({ref:e},o))}));function k(a,e){var t=arguments,s=e&&e.mdxType;if("string"==typeof a||s){var m=t.length,r=new Array(m);r[0]=c;var p={};for(var l in e)hasOwnProperty.call(e,l)&&(p[l]=e[l]);p.originalType=a,p.mdxType="string"==typeof a?a:s,r[1]=p;for(var i=2;i50K} to {0, 1} to represent our binary classification label column\ndf = df.withColumn(label, F.when(F.col(label).contains("<=50K"), F.lit(0)).otherwise(F.lit(1)))\n'))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a ",(0,m.kt)("inlineCode",{parentName:"p"},"FeatureBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setLabelCol")," to set the binary label column. 
Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"feature_balance_measures = (\n FeatureBalanceMeasure()\n .setSensitiveCols(features)\n .setLabelCol(label)\n .transform(df)\n)\nfeature_balance_measures.show(truncate=False)\n"))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a ",(0,m.kt)("inlineCode",{parentName:"p"},"DistributionBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features. Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"distribution_balance_measures = (\n DistributionBalanceMeasure()\n .setSensitiveCols(features)\n .transform(df)\n)\ndistribution_balance_measures.show(truncate=False)\n"))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a ",(0,m.kt)("inlineCode",{parentName:"p"},"AggregateBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features. 
Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"aggregate_balance_measures = (\n AggregateBalanceMeasure()\n .setSensitiveCols(features)\n .transform(df)\n)\naggregate_balance_measures.show(truncate=False)\n")))),(0,m.kt)("p",null,"Note: If you're running this notebook in a Spark environment such as Azure Synapse or Databricks, then you can easily visualize the imbalance measures by calling the built-in plotting features ",(0,m.kt)("inlineCode",{parentName:"p"},"display()"),"."),(0,m.kt)("h2",{id:"measure-explanations"},"Measure Explanations"),(0,m.kt)("h3",{id:"feature-balance-measures"},"Feature Balance Measures"),(0,m.kt)("p",null,"Feature Balance Measures allow us to see whether each combination of sensitive feature is receiving the positive outcome (true prediction) at balanced probability."),(0,m.kt)("p",null,"In this context, we define a feature balance measure, called the parity, for label y. 
It is the difference between the association metrics of two different sensitive classes ",(0,m.kt)("span",{parentName:"p",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[x_A, x_B]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),", with respect to the association metric ",(0,m.kt)("span",{parentName:"p",className:"math 
math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"i")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"A(x_i, y)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.31166399999999994em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"i")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),". That is:"),(0,m.kt)("p",null,(0,m.kt)("span",{parentName:"p",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"p"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mo",{parentName:"mrow"},"\u22c5"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentNam
e:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"parity(y \\vert x_A, x_B, A(\\cdot)) \\coloneqq A(x_A, y) - A(x_B, y)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"p"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u22c5"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}}),(0,m.kt)("span",{parentName:"span",className:"mbin"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")")))))),(0,m.kt)("p",null,"Using the dataset, we can see if the various sexes and races are receiving >50k income at equal or unequal rates."),(0,m.kt)("p",null,"Note: Many of these metrics were influenced by this paper ",(0,m.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/2103.03417"},"Measuring Model Biases in the Absence of Ground Truth"),"."),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Association Metric"),(0,m.kt)("th",{parentName:"tr",align:null},"Family"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation/Formula"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Statistical 
Parity"),(0,m.kt)("td",{parentName:"tr",align:null},"Fairness"),(0,m.kt)("td",{parentName:"tr",align:null},"Proportion of each segment of a protected class (gender, for example) that should receive the positive outcome at equal rates."),(0,m.kt)("td",{parentName:"tr",align:null},"Closer to zero means better parity. ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"D"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"Y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"M"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"Y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"DP = P(Y \\vert A = Male) - P(Y \\vert A = 
Female)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.68333em",verticalAlign:"0em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"D"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"Y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10903em"}},"M"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}}),(0,m.kt)("span",{parentName:"span",className:"mbin"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"Y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),"."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Fairness_%28machine_learning%29"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Pointwise Mutual Information (PMI), normalized PMI"),(0,m.kt)("td",{parentName:"tr",align:null},"Entropy"),(0,m.kt)("td",{parentName:"tr",align:null},"The PMI of a pair of feature values (ex: Gender=Male and Gender=Female) quantifies the discrepancy between the probability of their coincidence given their joint distribution and their individual distributions (assuming independence)."),(0,m.kt)("td",{parentName:"tr",align:null},"Range (normalized) ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[-1, 
1]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),". -1 for no co-occurrences. 0 for co-occurrences at random. 1 for complete co-occurrences."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Pointwise_mutual_information"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Sorensen-Dice Coefficient (SDC)"),(0,m.kt)("td",{parentName:"tr",align:null},"Intersection-over-Union"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to gauge the similarity of two samples. 
Related to F1 score."),(0,m.kt)("td",{parentName:"tr",align:null},"Equals twice the number of elements common to both sets divided by the sum of the number of elements in each set."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Jaccard Index"),(0,m.kt)("td",{parentName:"tr",align:null},"Intersection-over-Union"),(0,m.kt)("td",{parentName:"tr",align:null},"Similar to SDC, gauges the similarity and diversity of sample sets."),(0,m.kt)("td",{parentName:"tr",align:null},"Equals the size of the intersection divided by the size of the union of the sample sets."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Jaccard_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Kendall Rank Correlation"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to measure the ordinal association between two measured quantities."),(0,m.kt)("td",{parentName:"tr",align:null},"High when observations have a similar rank and low when observations have a dissimilar rank between the two variables."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Log-Likelihood Ratio"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Calculates the degree to which data supports one variable versus another. 
Log of the likelihood ratio, which gives the probability of correctly predicting the label in ratio to probability of incorrectly predicting label."),(0,m.kt)("td",{parentName:"tr",align:null},"If likelihoods are similar, it should be close to 0."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Likelihood_function#Likelihood_ratio"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"t-test"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to compare the means of two groups (pairwise)."),(0,m.kt)("td",{parentName:"tr",align:null},"Value looked up in t-Distribution tell if statistically significant or not."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Student's_t-test"},"Link"))))),(0,m.kt)("h3",{id:"distribution-balance-measures"},"Distribution Balance Measures"),(0,m.kt)("p",null,"Distribution Balance Measures allow us to compare our data with a reference distribution (currently only uniform distribution is supported as a reference distribution). 
They are calculated per sensitive column and don't depend on the label column."),(0,m.kt)("p",null,"For example, let's assume we have a dataset with nine rows and a Gender column, and we observe that:"),(0,m.kt)("ul",null,(0,m.kt)("li",{parentName:"ul"},'"Male" appears four times'),(0,m.kt)("li",{parentName:"ul"},'"Female" appears three times'),(0,m.kt)("li",{parentName:"ul"},'"Other" appears twice')),(0,m.kt)("p",null,"Assuming the uniform distribution:"),(0,m.kt)("div",{className:"math math-display"},(0,m.kt)("span",{parentName:"div",className:"katex-display"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML",display:"block"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"f"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"c"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"C"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mfrac",{parentName:"mrow"},(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"w"),(0,m.kt)("mi",{parentName:"mrow"},"s")),(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"
},"m"),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"V"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"s")))),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"ReferenceCount \\coloneqq \\frac{numRows}{numFeatureValues}")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8888799999999999em",verticalAlign:"-0.19444em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10764em"}},"f"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"c"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.07153em"}},"C"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"2.04633em",verticalAlign:"-0.686em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mopen nulldelimiter"}),(0,m.kt)("span",{parentName:"span",className:"mfrac"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"1.36033em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.314em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"V"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s"))),(0,m.kt)("span",{parentName:"span",style:{top:"-3.23em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"frac-line",style:{borderBottomWidth:"0.04em"}})),(0,m.kt)("span",{parentName:"span",style:{top:"-3.677em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02691em"}},"w"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.686em"}},(0,m.kt)("span",{parentName:"span"}))))),(0,m.kt)("span",{parentName:"span",className:"mclose nulldelimiter"}))))))),(0,m.kt)("div",{className:"math 
math-display"},(0,m.kt)("span",{parentName:"div",className:"katex-display"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML",display:"block"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"f"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"c"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"b"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"b"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mfrac",{parentName:"mrow"},(0,m.kt)("mn",{parentName:"mfrac"},"1"),(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"V"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"s")))),(0,m.kt)("ann
otation",{parentName:"semantics",encoding:"application/x-tex"},"ReferenceProbability \\coloneqq \\frac{1}{numFeatureValues}")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8888799999999999em",verticalAlign:"-0.19444em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10764em"}},"f"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"c"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"b"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"b"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"2.00744em",verticalAlign:"-0.686em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mopen nulldelimiter"}),(0,m.kt)("span",{parentName:"span",className:"mfrac"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"1.32144em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.314em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"V"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s"))),(0,m.kt)("span",{parentName:"span",style:{top:"-3.23em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"frac-line",style:{borderBottomWidth:"0.04em"}})),(0,m.kt)("span",{parentName:"span",style:{top:"-3.677em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord"},"1")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.686em"}},(0,m.kt)("span",{parentName:"span"}))))),(0,m.kt)("span",{parentName:"span",className:"mclose nulldelimiter"}))))))),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Feature Value"),(0,m.kt)("th",{parentName:"tr",align:null},"Observed Count"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference Count"),(0,m.kt)("th",{parentName:"tr",align:null},"Observed Probability"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference 
Probabiliy"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Male"),(0,m.kt)("td",{parentName:"tr",align:null},"4"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"4/9 = 0.44"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Female"),(0,m.kt)("td",{parentName:"tr",align:null},"3"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Other"),(0,m.kt)("td",{parentName:"tr",align:null},"2"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"2/9 = 0.22"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")))),(0,m.kt)("p",null,"We can use distance measures to find out how far our observed and reference distributions of these feature values are. Some of these distance measures include:"),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Measure"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"KL Divergence"),(0,m.kt)("td",{parentName:"tr",align:null},"Measure of how one probability distribution is different from a second, reference probability distribution. Measure of the information gained when one revises one's beliefs from the prior probability distribution Q to the posterior probability distribution P. 
In other words, it is the amount of information lost when Q is used to approximate P."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means P = Q."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"JS Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"Measuring the similarity between two probability distributions. Symmetrized and smoothed version of the Kullback\u2013Leibler (KL) divergence. Square root of JS Divergence."),(0,m.kt)("td",{parentName:"tr",align:null},"Range ","[0, 1]",". 0 means perfectly same to balanced distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Wasserstein Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"This distance is also known as the earth mover\u2019s distance, since it can be seen as the minimum amount of \u201cwork\u201d required to transform u into v, where \u201cwork\u201d is measured as the amount of distribution weight that must be moved multiplied by the distance it has to be moved."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means P = Q."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Wasserstein_metric"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Infinity Norm Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"Distance between two vectors is the greatest of their differences along any coordinate dimension. Also called Chebyshev distance or chessboard distance."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 
0 means same distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Chebyshev_distance"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Total Variation Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"It is equal to half the L1 (Manhattan) distance between the two distributions. Take the difference between the two proportions in each category, add up the absolute values of all the differences, and then divide the sum by 2."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means same distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Total_variation_distance_of_probability_measures"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Chi-Squared Test"),(0,m.kt)("td",{parentName:"tr",align:null},"The chi-square test tests the null hypothesis that the categorical data has the given frequencies given expected frequencies in each category."),(0,m.kt)("td",{parentName:"tr",align:null},"p-value gives evidence against null-hypothesis that difference in observed and expected frequencies is by random chance."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Chi-squared_test"},"Link"))))),(0,m.kt)("h3",{id:"aggregate-balance-measures"},"Aggregate Balance Measures"),(0,m.kt)("p",null,"Aggregate Balance Measures allow us to obtain a higher notion of inequality. They're calculated on the set of all sensitive columns and don't depend on the label column."),(0,m.kt)("p",null,"These measures look at distribution of records across all combinations of sensitive columns. 
For example, if Sex and Race are specified as sensitive features, it then tries to quantify imbalance across all combinations of the two specified features - (Male, Black), (Female, White), (Male, Asian-Pac-Islander), etc."),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Measure"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Atkinson Index"),(0,m.kt)("td",{parentName:"tr",align:null},"It presents the percentage of total income that a given society would have to forego in order to have more equal shares of income between its citizens. This measure depends on the degree of societal aversion to inequality (a theoretical parameter decided by the researcher). A higher value entails greater social utility or willingness by individuals to accept smaller incomes in exchange for a more equal distribution. 
An important feature of the Atkinson index is that it can be decomposed into within-group and between-group inequality."),(0,m.kt)("td",{parentName:"tr",align:null},"Range ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("mn",{parentName:"mrow"},"0"),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[0, 1]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},"0"),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),". 0 if perfect equality. 1 means maximum inequality. In our case, it is the proportion of records for a sensitive columns\u2019 combination."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Atkinson_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Theil T Index"),(0,m.kt)("td",{parentName:"tr",align:null},'GE(1) = Theil\'s T and is more sensitive to differences at the top of the distribution. 
The Theil index is a statistic used to measure economic inequality. The Theil index measures an entropic "distance" the population is away from the "ideal" egalitarian state of everyone having the same income.'),(0,m.kt)("td",{parentName:"tr",align:null},"If everyone has the same income, then\xa0T_T\xa0equals\xa00. If one person has all the income, then\xa0T_T\xa0gives the result\xa0",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"N"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"ln(N)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10903em"}},"N"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),".\xa00 means equal income and larger values mean higher level of disproportion."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Theil_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Theil L 
Index"),(0,m.kt)("td",{parentName:"tr",align:null},"GE(0) = Theil's L and is more sensitive to differences at the lower end of the distribution. Logarithm of (mean income)/(income i), over all the incomes included in the summation. It is also referred to as the mean log deviation measure. Because a transfer from a larger income to a smaller one will change the smaller income's ratio more than it changes the larger income's ratio, the transfer-principle is satisfied by this index."),(0,m.kt)("td",{parentName:"tr",align:null},"Same interpretation as Theil T Index."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Theil_index"},"Link"))))),(0,m.kt)("h2",{id:"mitigation"},"Mitigation"),(0,m.kt)("p",null,"It will not be a stretch to say that every real-world dataset has caveats, biases, and imbalances. Data collection is costly. Data Imbalance mitigation or de-biasing data is an area of research. There are many techniques available at various stages of ML lifecycle: during pre-processing, in-processing, and post processing. Here we outline a couple of pre-processing techniques -"),(0,m.kt)("h3",{id:"resampling"},"Resampling"),(0,m.kt)("p",null,"Resampling involves under-sampling from majority class and over-sampling from minority class. A na\xefve way to over-sample would be to duplicate records. 
Similarly, to under-sample one could remove records at random."),(0,m.kt)("ul",null,(0,m.kt)("li",{parentName:"ul"},(0,m.kt)("p",{parentName:"li"},"Caveats:"),(0,m.kt)("ol",{parentName:"li"},(0,m.kt)("li",{parentName:"ol"},"Under-sampling may remove valuable information."),(0,m.kt)("li",{parentName:"ol"},"Over-sampling may cause overfitting and poor generalization on test set.")))),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_SamplingBar.png",alt:"Bar chart undersampling and oversampling"})),(0,m.kt)("p",null,"There are smarter techniques to under-sample and over-sample in literature and implemented in Python\u2019s ",(0,m.kt)("a",{parentName:"p",href:"https://imbalanced-learn.org/stable/"},"imbalanced-learn")," package."),(0,m.kt)("p",null,"For example, we can cluster the records of the majority class, and do the under-sampling by removing records from each cluster, thus seeking to preserve information."),(0,m.kt)("p",null,"One technique of under-sampling is use of Tomek Links. Tomek links are pairs of instances that are very close but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process. A similar way to under-sample majority class is using Near-Miss. It first calculates the distance between all the points in the larger class with the points in the smaller class. 
When two points belonging to different classes are very close to each other in the distribution, this algorithm eliminates the datapoint of the larger class thereby trying to balance the distribution."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_TomekLinks.png",alt:"Tomek Links"})),(0,m.kt)("p",null,"In over-sampling, instead of creating exact copies of the minority class records, we can introduce small variations into those copies, creating more diverse synthetic samples. This technique is called SMOTE (Synthetic Minority Oversampling Technique). It randomly picks a point from the minority class and computes the k-nearest neighbors for this point. The synthetic points are added between the chosen point and its neighbors."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_SyntheticSamples.png",alt:"Synthetic Samples"})),(0,m.kt)("h3",{id:"reweighting"},"Reweighting"),(0,m.kt)("p",null,"There is an expected and observed value in each table cell. The weight is the value of expected / observed. Reweighting is easy to extend to multiple features with more than two groups. 
The weights are then incorporated in loss function of model training."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_Reweight.png",alt:"Reweighting"})))}k.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/5b779334.7d6a3e9e.js b/assets/js/5b779334.38fdf924.js similarity index 98% rename from assets/js/5b779334.7d6a3e9e.js rename to assets/js/5b779334.38fdf924.js index a6986f6b37..f5dc1075ae 100644 --- a/assets/js/5b779334.7d6a3e9e.js +++ b/assets/js/5b779334.38fdf924.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6833],{3905:function(e,t,n){n.d(t,{Zo:function(){return p},kt:function(){return m}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),d=c(n),m=a,f=d["".concat(l,".").concat(m)]||d[m]||u[m]||i;return n?r.createElement(f,o(o({ref:t},p),{},{components:n})):r.createElement(f,o({ref:t},p))}));function m(e,t){var 
n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,o=new Array(i);o[0]=d;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,o[1]=s;for(var c=2;c=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),d=c(n),m=a,f=d["".concat(l,".").concat(m)]||d[m]||u[m]||i;return n?r.createElement(f,o(o({ref:t},p),{},{components:n})):r.createElement(f,o({ref:t},p))}));function m(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,o=new Array(i);o[0]=d;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,o[1]=s;for(var c=2;c=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var l=r.createContext({}),p=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},d=function(e){var t=p(e.components);return r.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},c=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,l=e.parentName,d=s(e,["components","mdxType","originalType","parentName"]),c=p(n),u=o,h=c["".concat(l,".").concat(u)]||c[u]||m[u]||a;return 
n?r.createElement(h,i(i({ref:t},d),{},{components:n})):r.createElement(h,i({ref:t},d))}));function u(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=c;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:o,i[1]=s;for(var p=2;p=0||(r[t]=e[t]);return r}(e,n);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(r[t]=e[t])}return r}var s=a.createContext({}),p=function(e){var n=a.useContext(s),t=n;return e&&(t="function"==typeof e?e(n):l(l({},n),e)),t},u=function(e){var n=p(e.components);return a.createElement(s.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return a.createElement(a.Fragment,{},n)}},d=a.forwardRef((function(e,n){var t=e.components,r=e.mdxType,o=e.originalType,s=e.parentName,u=i(e,["components","mdxType","originalType","parentName"]),d=p(t),m=r,h=d["".concat(s,".").concat(m)]||d[m]||c[m]||o;return t?a.createElement(h,l(l({ref:n},u),{},{components:t})):a.createElement(h,l({ref:n},u))}));function m(e,n){var t=arguments,r=n&&n.mdxType;if("string"==typeof e||r){var o=t.length,l=new Array(o);l[0]=d;var i={};for(var s in n)hasOwnProperty.call(n,s)&&(i[s]=n[s]);i.originalType=e,i.mdxType="string"==typeof e?e:r,l[1]=i;for(var p=2;p child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:t.filter(Boolean))?n:[]}(e).map((function(e){var n=e.props;return{value:n.value,label:n.label,attributes:n.attributes,default:n.default}}))}function d(e){var n=e.values,t=e.children;return(0,r.useMemo)((function(){var e=null!=n?n:c(t);return function(e){var n=(0,p.l)(e,(function(e,n){return e.value===n.value}));if(n.length>0)throw new Error('Docusaurus error: Duplicate values "'+n.map((function(e){return e.value})).join(", ")+'" found in . 
Every value needs to be unique.')}(e),e}),[n,t])}function m(e){var n=e.value;return e.tabValues.some((function(e){return e.value===n}))}function h(e){var n=e.queryString,t=void 0!==n&&n,a=e.groupId,o=(0,i.k6)(),l=function(e){var n=e.queryString,t=void 0!==n&&n,a=e.groupId;if("string"==typeof t)return t;if(!1===t)return null;if(!0===t&&!a)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:t,groupId:a});return[(0,s._X)(l),(0,r.useCallback)((function(e){if(l){var n=new URLSearchParams(o.location.search);n.set(l,e),o.replace(Object.assign({},o.location,{search:n.toString()}))}}),[l,o])]}function f(e){var n,t,a,o,l=e.defaultValue,i=e.queryString,s=void 0!==i&&i,p=e.groupId,c=d(e),f=(0,r.useState)((function(){return function(e){var n,t=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the component requires at least one children component");if(t){if(!m({value:t,tabValues:a}))throw new Error('Docusaurus error: The has a defaultValue "'+t+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return t}var r=null!=(n=a.find((function(e){return e.default})))?n:a[0];if(!r)throw new Error("Unexpected error: 0 tabValues");return r.value}({defaultValue:l,tabValues:c})})),k=f[0],y=f[1],v=h({queryString:s,groupId:p}),b=v[0],g=v[1],w=(n=function(e){return e?"docusaurus.tab."+e:null}({groupId:p}.groupId),t=(0,u.Nk)(n),a=t[0],o=t[1],[a,(0,r.useCallback)((function(e){n&&o.set(e)}),[n,o])]),S=w[0],T=w[1],N=function(){var e=null!=b?b:S;return m({value:e,tabValues:c})?e:null}();return(0,r.useLayoutEffect)((function(){N&&y(N)}),[N]),{selectedValue:k,selectValue:(0,r.useCallback)((function(e){if(!m({value:e,tabValues:c}))throw new Error("Can't select invalid tab value="+e);y(e),g(e),T(e)}),[g,T,c]),tabValues:c}}var k=t(2389),y="tabList__CuJ",v="tabItem_LNqP";function b(e){var n=e.className,t=e.block,i=e.selectedValue,s=e.selectValue,p=e.tabValues,u=[],c=(0,l.o5)().blockElementScrollPositionUntilNextRender,d=function(e){var n=e.currentTarget,t=u.indexOf(n),a=p[t].value;a!==i&&(c(n),s(a))},m=function(e){var n,t=null;switch(e.key){case"Enter":d(e);break;case"ArrowRight":var a,r=u.indexOf(e.currentTarget)+1;t=null!=(a=u[r])?a:u[0];break;case"ArrowLeft":var o,l=u.indexOf(e.currentTarget)-1;t=null!=(o=u[l])?o:u[u.length-1]}null==(n=t)||n.focus()};return r.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.Z)("tabs",{"tabs--block":t},n)},p.map((function(e){var n=e.value,t=e.label,l=e.attributes;return r.createElement("li",(0,a.Z)({role:"tab",tabIndex:i===n?0:-1,"aria-selected":i===n,key:n,ref:function(e){return u.push(e)},onKeyDown:m,onClick:d},l,{className:(0,o.Z)("tabs__item",v,null==l?void 0:l.className,{"tabs__item--active":i===n})}),null!=t?t:n)})))}function g(e){var n=e.lazy,t=e.children,a=e.selectedValue,o=(Array.isArray(t)?t:[t]).filter(Boolean);if(n){var l=o.find((function(e){return e.props.value===a}));return 
l?(0,r.cloneElement)(l,{className:"margin-top--md"}):null}return r.createElement("div",{className:"margin-top--md"},o.map((function(e,n){return(0,r.cloneElement)(e,{key:n,hidden:e.props.value!==a})})))}function w(e){var n=f(e);return r.createElement("div",{className:(0,o.Z)("tabs-container",y)},r.createElement(b,(0,a.Z)({},e,n)),r.createElement(g,(0,a.Z)({},e,n)))}function S(e){var n=(0,k.Z)();return r.createElement(w,(0,a.Z)({key:String(n)},e))}},3006:function(e,n,t){t.r(n),t.d(n,{assets:function(){return d},contentTitle:function(){return u},default:function(){return f},frontMatter:function(){return p},metadata:function(){return c},toc:function(){return m}});var a=t(3117),r=t(102),o=(t(7294),t(3905)),l=t(4866),i=t(5162),s=["components"],p={title:".NET setup",hide_title:!0,sidebar_label:".NET setup",description:".NET setup"},u=".NET setup and example for SynapseML",c={unversionedId:"Reference/Dotnet Setup",id:"version-0.11.3/Reference/Dotnet Setup",title:".NET setup",description:".NET setup",source:"@site/versioned_docs/version-0.11.3/Reference/Dotnet Setup.md",sourceDirName:"Reference",slug:"/Reference/Dotnet Setup",permalink:"/SynapseML/docs/Reference/Dotnet Setup",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:".NET setup",hide_title:!0,sidebar_label:".NET setup",description:".NET setup"},sidebar:"docs",previous:{title:"R setup",permalink:"/SynapseML/docs/Reference/R Setup"},next:{title:"Quickstart - LightGBM in Dotnet",permalink:"/SynapseML/docs/Reference/Quickstart - LightGBM in Dotnet"}},d={},m=[{value:"Installation",id:"installation",level:2},{value:"1. Install .NET",id:"1-install-net",level:3},{value:"2. Install Java",id:"2-install-java",level:3},{value:"3. Install Apache Spark",id:"3-install-apache-spark",level:3},{value:"4. Install .NET for Apache Spark",id:"4-install-net-for-apache-spark",level:3},{value:"5. Install WinUtils (Windows Only)",id:"5-install-winutils-windows-only",level:3},{value:"6. 
Set DOTNET_WORKER_DIR and check dependencies",id:"6-set-dotnet_worker_dir-and-check-dependencies",level:3},{value:"Write a .NET for SynapseML App",id:"write-a-net-for-synapseml-app",level:2},{value:"1. Create a console app",id:"1-create-a-console-app",level:3},{value:"2. Install NuGet package",id:"2-install-nuget-package",level:3},{value:"3. Write your app",id:"3-write-your-app",level:3},{value:"4. Run your .NET App",id:"4-run-your-net-app",level:3},{value:"Next",id:"next",level:2}],h={toc:m};function f(e){var n=e.components,t=(0,r.Z)(e,s);return(0,o.kt)("wrapper",(0,a.Z)({},h,t,{components:n,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"net-setup-and-example-for-synapseml"},".NET setup and example for SynapseML"),(0,o.kt)("h2",{id:"installation"},"Installation"),(0,o.kt)("h3",{id:"1-install-net"},"1. Install .NET"),(0,o.kt)("p",null,"To start building .NET apps, you need to download and install the .NET SDK (Software Development Kit)."),(0,o.kt)("p",null,"Download and install the ",(0,o.kt)("a",{parentName:"p",href:"https://dotnet.microsoft.com/en-us/download/dotnet/3.1"},".NET Core SDK"),".\nInstalling the SDK adds the dotnet toolchain to your PATH."),(0,o.kt)("p",null,"Once you've installed the .NET Core SDK, open a new command prompt or terminal. Then run ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet"),"."),(0,o.kt)("p",null,"If the command runs and prints information about how to use dotnet, you can move to the next step.\nIf you receive a ",(0,o.kt)("inlineCode",{parentName:"p"},"'dotnet' is not recognized as an internal or external command")," error, make sure\nyou opened a new command prompt or terminal before running the command."),(0,o.kt)("h3",{id:"2-install-java"},"2. 
Install Java"),(0,o.kt)("p",null,"Install ",(0,o.kt)("a",{parentName:"p",href:"https://www.oracle.com/java/technologies/downloads/#java8"},"Java 8.1")," for Windows and macOS,\nor ",(0,o.kt)("a",{parentName:"p",href:"https://openjdk.org/install/"},"OpenJDK 8")," for Ubuntu."),(0,o.kt)("p",null,"Select the appropriate version for your operating system. For example, select jdk-8u201-windows-x64.exe\nfor a Windows x64 machine or jdk-8u231-macosx-x64.dmg for macOS. Then, use the command java to verify the installation."),(0,o.kt)("h3",{id:"3-install-apache-spark"},"3. Install Apache Spark"),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://spark.apache.org/downloads.html"},"Download and install Apache Spark")," with version >= 3.2.0.\n(SynapseML v0.11.3 only supports spark version >= 3.2.0)"),(0,o.kt)("p",null,"Extract downloaded zipped files (with 7-Zip app on Windows or ",(0,o.kt)("inlineCode",{parentName:"p"},"tar")," on linux) and remember the location of\nextracted files, we take ",(0,o.kt)("inlineCode",{parentName:"p"},"~/bin/spark-3.2.0-bin-hadoop3.2/")," as an example here."),(0,o.kt)("p",null,"Run the following commands to set the environment variables used to locate Apache Spark.\nOn Windows, make sure to run the command prompt in administrator mode."),(0,o.kt)(l.Z,{groupId:"operating-systems",mdxType:"Tabs"},(0,o.kt)(i.Z,{value:"win",label:"Windows",default:!0,mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},' setx /M HADOOP_HOME C:\\bin\\spark-3.2.0-bin-hadoop3.2\\\n setx /M SPARK_HOME C:\\bin\\spark-3.2.0-bin-hadoop3.2\\\n setx /M PATH "%PATH%;%HADOOP_HOME%;%SPARK_HOME%bin" # Warning: Don\'t run this if your path is already long as it will truncate your path to 1024 characters and potentially remove entries!\n'))),(0,o.kt)(i.Z,{value:"linux",label:"Mac/Linux",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},' export SPARK_HOME=~/bin/spark-3.2.0-bin-hadoop3.2/\n export 
PATH="$SPARK_HOME/bin:$PATH"\n source ~/.bashrc\n')))),(0,o.kt)("p",null,"Once you've installed everything and set your environment variables, open a ",(0,o.kt)("strong",{parentName:"p"},"new")," command prompt or terminal and run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-bash"},"spark-submit --version\n")),(0,o.kt)("p",null,"If the command runs and prints version information, you can move to the next step."),(0,o.kt)("p",null,"If you receive a ",(0,o.kt)("inlineCode",{parentName:"p"},"'spark-submit' is not recognized as an internal or external command")," error, make sure you opened a ",(0,o.kt)("strong",{parentName:"p"},"new")," command prompt."),(0,o.kt)("h3",{id:"4-install-net-for-apache-spark"},"4. Install .NET for Apache Spark"),(0,o.kt)("p",null,"Download the ",(0,o.kt)("a",{parentName:"p",href:"https://github.com/dotnet/spark/releases"},"Microsoft.Spark.Worker")," ",(0,o.kt)("strong",{parentName:"p"},"v2.1.1")," release from the .NET for Apache Spark GitHub.\nFor example if you're on a Windows machine and plan to use .NET Core, download the Windows x64 netcoreapp3.1 release."),(0,o.kt)("p",null,"Extract Microsoft.Spark.Worker and remember the location."),(0,o.kt)("h3",{id:"5-install-winutils-windows-only"},"5. Install WinUtils (Windows Only)"),(0,o.kt)("p",null,".NET for Apache Spark requires WinUtils to be installed alongside Apache Spark.\n",(0,o.kt)("a",{parentName:"p",href:"https://github.com/steveloughran/winutils/blob/master/hadoop-3.0.0/bin/winutils.exe"},"Download winutils.exe"),".\nThen, copy WinUtils into C:\\bin\\spark-3.2.0-bin-hadoop3.2\\bin."),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"If you're using a different version of Hadoop, select the version of WinUtils that's compatible with your version of Hadoop. 
You can see the Hadoop version at the end of your Spark install folder name.")),(0,o.kt)("h3",{id:"6-set-dotnet_worker_dir-and-check-dependencies"},"6. Set DOTNET_WORKER_DIR and check dependencies"),(0,o.kt)("p",null,"Run one of the following commands to set the DOTNET_WORKER_DIR environment variable, which is used by .NET apps to locate .NET for Apache Spark\nworker binaries. Make sure to replace with the directory where you downloaded and extracted the Microsoft.Spark.Worker.\nOn Windows, make sure to run the command prompt in administrator mode."),(0,o.kt)(l.Z,{groupId:"operating-systems",mdxType:"Tabs"},(0,o.kt)(i.Z,{value:"win",label:"Windows",default:!0,mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"}," setx /M DOTNET_WORKER_DIR \n"))),(0,o.kt)(i.Z,{value:"linux",label:"Mac/Linux",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"}," export DOTNET_WORKER_DIR=\n")))),(0,o.kt)("p",null,"Finally, double-check that you can run ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet, java, spark-shell")," from your command line before you move to the next section."),(0,o.kt)("h2",{id:"write-a-net-for-synapseml-app"},"Write a .NET for SynapseML App"),(0,o.kt)("h3",{id:"1-create-a-console-app"},"1. Create a console app"),(0,o.kt)("p",null,"In your command prompt or terminal, run the following commands to create a new console application:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet new console -o SynapseMLApp\ncd SynapseMLApp\n")),(0,o.kt)("p",null,"The ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet")," command creates a new application of type console for you. 
The -o parameter creates a directory\nnamed ",(0,o.kt)("inlineCode",{parentName:"p"},"SynapseMLApp")," where your app is stored and populates it with the required files.\nThe ",(0,o.kt)("inlineCode",{parentName:"p"},"cd SynapseMLApp")," command changes the directory to the app directory you created."),(0,o.kt)("h3",{id:"2-install-nuget-package"},"2. Install NuGet package"),(0,o.kt)("p",null,"To use .NET for Apache Spark in an app, install the Microsoft.Spark package.\nIn your command prompt or terminal, run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet add package Microsoft.Spark --version 2.1.1\n")),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"This tutorial uses Microsoft.Spark version 2.1.1 as SynapseML 0.11.3 depends on it.\nChange to corresponding version if necessary.")),(0,o.kt)("p",null,"To use SynapseML features in the app, install SynapseML.X package.\nIn this tutorial, we use SynapseML.Cognitive as an example.\nIn your command prompt or terminal, run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"# Update Nuget Config to include SynapseML Feed\ndotnet nuget add source https://mmlspark.blob.core.windows.net/synapsemlnuget/index.json -n SynapseMLFeed\ndotnet add package SynapseML.Cognitive --version 0.11.3\n")),(0,o.kt)("p",null,"The ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet nuget add")," command adds SynapseML's resolver to the source, so that our package can be found."),(0,o.kt)("h3",{id:"3-write-your-app"},"3. Write your app"),(0,o.kt)("p",null,"Open Program.cs in Visual Studio Code, or any text editor. 
Replace its contents with this code:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-csharp"},'using System;\nusing System.Collections.Generic;\nusing Synapse.ML.Cognitive;\nusing Microsoft.Spark.Sql;\nusing Microsoft.Spark.Sql.Types;\n\nnamespace SynapseMLApp\n{\n class Program\n { static void Main(string[] args)\n {\n // Create Spark session\n SparkSession spark =\n SparkSession\n .Builder()\n .AppName("TextSentimentExample")\n .GetOrCreate();\n\n // Create DataFrame\n DataFrame df = spark.CreateDataFrame(\n new List\n {\n new GenericRow(new object[] {"I am so happy today, its sunny!", "en-US"}),\n new GenericRow(new object[] {"I am frustrated by this rush hour traffic", "en-US"}),\n new GenericRow(new object[] {"The cognitive services on spark aint bad", "en-US"})\n },\n new StructType(new List\n {\n new StructField("text", new StringType()),\n new StructField("language", new StringType())\n })\n );\n\n // Create TextSentiment\n var model = new TextSentiment()\n .SetSubscriptionKey("YOUR_SUBSCRIPTION_KEY")\n .SetLocation("eastus")\n .SetTextCol("text")\n .SetOutputCol("sentiment")\n .SetErrorCol("error")\n .SetLanguageCol("language");\n\n // Transform\n var outputDF = model.Transform(df);\n\n // Display results\n outputDF.Show();\n\n // Stop Spark session\n spark.Stop();\n }\n }\n}\n')),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/dotnet/api/microsoft.spark.sql.sparksession?view=spark-dotnet"},"SparkSession")," is the entrypoint\nof Apache Spark applications, which manages the context and information of your application. A DataFrame is a way of organizing\ndata into a set of named columns."),(0,o.kt)("p",null,"Create a ",(0,o.kt)("a",{parentName:"p",href:"https://mmlspark.blob.core.windows.net/docs/0.11.3/dotnet/classSynapse_1_1ML_1_1Cognitive_1_1TextSentiment.html"},"TextSentiment"),"\ninstance, set corresponding subscription key and other configurations. 
Then, apply transformation to the dataframe,\nwhich analyzes the sentiment based on each row, and stores result into output column."),(0,o.kt)("p",null,"The result of the transformation is stored in another DataFrame. At this point, no operations have taken place because\n.NET for Apache Spark lazily evaluates the data. The operation defined by the call to model.Transform doesn't execute until the Show method is called to display the contents of the transformed DataFrame to the console. Once you no longer need the Spark\nsession, use the Stop method to stop your session."),(0,o.kt)("h3",{id:"4-run-your-net-app"},"4. Run your .NET App"),(0,o.kt)("p",null,"Run the following command to build your application:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet build\n")),(0,o.kt)("p",null,"Navigate to your build output directory. For example, in Windows you could run ",(0,o.kt)("inlineCode",{parentName:"p"},"cd bin\\Debug\\net5.0"),".\nUse the spark-submit command to submit your application to run on Apache Spark."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --packages com.microsoft.azure:synapseml_2.12:0.11.3 --master local microsoft-spark-3-2_2.12-2.1.1.jar dotnet SynapseMLApp.dll\n")),(0,o.kt)("p",null,(0,o.kt)("inlineCode",{parentName:"p"},"--packages com.microsoft.azure:synapseml_2.12:0.11.3")," specifies the dependency on synapseml_2.12 version 0.11.3;\n",(0,o.kt)("inlineCode",{parentName:"p"},"microsoft-spark-3-2_2.12-2.1.1.jar")," specifies Microsoft.Spark version 2.1.1 and Spark version 3.2"),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"This command assumes you have downloaded Apache Spark and added it to your PATH environment variable so that you can use spark-submit.\nOtherwise, you'd have to use the full path (for example, C:\\bin\\apache-spark\\bin\\spark-submit or 
~/spark/bin/spark-submit).")),(0,o.kt)("p",null,"When your app runs, the sentiment analysis result is written to the console."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},"+-----------------------------------------+--------+-----+--------------------------------------------------+\n| text|language|error| sentiment|\n+-----------------------------------------+--------+-----+--------------------------------------------------+\n| I am so happy today, its sunny!| en-US| null|[{positive, null, {0.99, 0.0, 0.0}, [{I am so h...|\n|I am frustrated by this rush hour traffic| en-US| null|[{negative, null, {0.0, 0.0, 0.99}, [{I am frus...|\n| The cognitive services on spark aint bad| en-US| null|[{negative, null, {0.0, 0.01, 0.99}, [{The cogn...|\n+-----------------------------------------+--------+-----+--------------------------------------------------+\n")),(0,o.kt)("p",null,"Congratulations! You successfully authored and ran a .NET for SynapseML app.\nRefer to the ",(0,o.kt)("a",{parentName:"p",href:"https://mmlspark.blob.core.windows.net/docs/0.11.3/dotnet/index.html"},"developer docs")," for API guidance."),(0,o.kt)("h2",{id:"next"},"Next"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"Refer to this ",(0,o.kt)("a",{parentName:"li",href:"https://docs.microsoft.com/en-us/dotnet/spark/tutorials/databricks-deployment"},"tutorial")," for deploying a .NET app to Databricks."),(0,o.kt)("li",{parentName:"ul"},"You could download compatible ",(0,o.kt)("a",{parentName:"li",href:"https://mmlspark.blob.core.windows.net/publicwasb/dotnet/install-worker.sh"},"install-worker.sh"),"\nand ",(0,o.kt)("a",{parentName:"li",href:"https://mmlspark.blob.core.windows.net/publicwasb/dotnet/db-init.sh"},"db-init.sh")," files needed for deployment on Databricks.")))}f.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/603706f6.d42cdc8c.js b/assets/js/603706f6.d42cdc8c.js new file mode 100644 index 0000000000..d8aa948a79 --- /dev/null +++ 
b/assets/js/603706f6.d42cdc8c.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[5149],{3905:function(e,n,t){t.d(n,{Zo:function(){return u},kt:function(){return m}});var a=t(7294);function r(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function o(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);n&&(a=a.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,a)}return t}function l(e){for(var n=1;n=0||(r[t]=e[t]);return r}(e,n);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(r[t]=e[t])}return r}var s=a.createContext({}),p=function(e){var n=a.useContext(s),t=n;return e&&(t="function"==typeof e?e(n):l(l({},n),e)),t},u=function(e){var n=p(e.components);return a.createElement(s.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return a.createElement(a.Fragment,{},n)}},d=a.forwardRef((function(e,n){var t=e.components,r=e.mdxType,o=e.originalType,s=e.parentName,u=i(e,["components","mdxType","originalType","parentName"]),d=p(t),m=r,h=d["".concat(s,".").concat(m)]||d[m]||c[m]||o;return t?a.createElement(h,l(l({ref:n},u),{},{components:t})):a.createElement(h,l({ref:n},u))}));function m(e,n){var t=arguments,r=n&&n.mdxType;if("string"==typeof e||r){var o=t.length,l=new Array(o);l[0]=d;var i={};for(var s in n)hasOwnProperty.call(n,s)&&(i[s]=n[s]);i.originalType=e,i.mdxType="string"==typeof e?e:r,l[1]=i;for(var p=2;p child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:t.filter(Boolean))?n:[]}(e).map((function(e){var n=e.props;return{value:n.value,label:n.label,attributes:n.attributes,default:n.default}}))}function d(e){var 
n=e.values,t=e.children;return(0,r.useMemo)((function(){var e=null!=n?n:c(t);return function(e){var n=(0,p.l)(e,(function(e,n){return e.value===n.value}));if(n.length>0)throw new Error('Docusaurus error: Duplicate values "'+n.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[n,t])}function m(e){var n=e.value;return e.tabValues.some((function(e){return e.value===n}))}function h(e){var n=e.queryString,t=void 0!==n&&n,a=e.groupId,o=(0,i.k6)(),l=function(e){var n=e.queryString,t=void 0!==n&&n,a=e.groupId;if("string"==typeof t)return t;if(!1===t)return null;if(!0===t&&!a)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:t,groupId:a});return[(0,s._X)(l),(0,r.useCallback)((function(e){if(l){var n=new URLSearchParams(o.location.search);n.set(l,e),o.replace(Object.assign({},o.location,{search:n.toString()}))}}),[l,o])]}function f(e){var n,t,a,o,l=e.defaultValue,i=e.queryString,s=void 0!==i&&i,p=e.groupId,c=d(e),f=(0,r.useState)((function(){return function(e){var n,t=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the component requires at least one children component");if(t){if(!m({value:t,tabValues:a}))throw new Error('Docusaurus error: The has a defaultValue "'+t+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return t}var r=null!=(n=a.find((function(e){return e.default})))?n:a[0];if(!r)throw new Error("Unexpected error: 0 tabValues");return r.value}({defaultValue:l,tabValues:c})})),k=f[0],y=f[1],v=h({queryString:s,groupId:p}),b=v[0],g=v[1],w=(n=function(e){return e?"docusaurus.tab."+e:null}({groupId:p}.groupId),t=(0,u.Nk)(n),a=t[0],o=t[1],[a,(0,r.useCallback)((function(e){n&&o.set(e)}),[n,o])]),S=w[0],T=w[1],N=function(){var e=null!=b?b:S;return m({value:e,tabValues:c})?e:null}();return(0,r.useLayoutEffect)((function(){N&&y(N)}),[N]),{selectedValue:k,selectValue:(0,r.useCallback)((function(e){if(!m({value:e,tabValues:c}))throw new Error("Can't select invalid tab value="+e);y(e),g(e),T(e)}),[g,T,c]),tabValues:c}}var k=t(2389),y="tabList__CuJ",v="tabItem_LNqP";function b(e){var n=e.className,t=e.block,i=e.selectedValue,s=e.selectValue,p=e.tabValues,u=[],c=(0,l.o5)().blockElementScrollPositionUntilNextRender,d=function(e){var n=e.currentTarget,t=u.indexOf(n),a=p[t].value;a!==i&&(c(n),s(a))},m=function(e){var n,t=null;switch(e.key){case"Enter":d(e);break;case"ArrowRight":var a,r=u.indexOf(e.currentTarget)+1;t=null!=(a=u[r])?a:u[0];break;case"ArrowLeft":var o,l=u.indexOf(e.currentTarget)-1;t=null!=(o=u[l])?o:u[u.length-1]}null==(n=t)||n.focus()};return r.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.Z)("tabs",{"tabs--block":t},n)},p.map((function(e){var n=e.value,t=e.label,l=e.attributes;return r.createElement("li",(0,a.Z)({role:"tab",tabIndex:i===n?0:-1,"aria-selected":i===n,key:n,ref:function(e){return u.push(e)},onKeyDown:m,onClick:d},l,{className:(0,o.Z)("tabs__item",v,null==l?void 0:l.className,{"tabs__item--active":i===n})}),null!=t?t:n)})))}function g(e){var n=e.lazy,t=e.children,a=e.selectedValue,o=(Array.isArray(t)?t:[t]).filter(Boolean);if(n){var l=o.find((function(e){return e.props.value===a}));return 
l?(0,r.cloneElement)(l,{className:"margin-top--md"}):null}return r.createElement("div",{className:"margin-top--md"},o.map((function(e,n){return(0,r.cloneElement)(e,{key:n,hidden:e.props.value!==a})})))}function w(e){var n=f(e);return r.createElement("div",{className:(0,o.Z)("tabs-container",y)},r.createElement(b,(0,a.Z)({},e,n)),r.createElement(g,(0,a.Z)({},e,n)))}function S(e){var n=(0,k.Z)();return r.createElement(w,(0,a.Z)({key:String(n)},e))}},3006:function(e,n,t){t.r(n),t.d(n,{assets:function(){return d},contentTitle:function(){return u},default:function(){return f},frontMatter:function(){return p},metadata:function(){return c},toc:function(){return m}});var a=t(3117),r=t(102),o=(t(7294),t(3905)),l=t(4866),i=t(5162),s=["components"],p={title:".NET setup",hide_title:!0,sidebar_label:".NET setup",description:".NET setup"},u=".NET setup and example for SynapseML",c={unversionedId:"Reference/Dotnet Setup",id:"version-0.11.3/Reference/Dotnet Setup",title:".NET setup",description:".NET setup",source:"@site/versioned_docs/version-0.11.3/Reference/Dotnet Setup.md",sourceDirName:"Reference",slug:"/Reference/Dotnet Setup",permalink:"/SynapseML/docs/0.11.3/Reference/Dotnet Setup",draft:!1,tags:[],version:"0.11.3",frontMatter:{title:".NET setup",hide_title:!0,sidebar_label:".NET setup",description:".NET setup"},sidebar:"docs",previous:{title:"R setup",permalink:"/SynapseML/docs/0.11.3/Reference/R Setup"},next:{title:"Quickstart - LightGBM in Dotnet",permalink:"/SynapseML/docs/0.11.3/Reference/Quickstart - LightGBM in Dotnet"}},d={},m=[{value:"Installation",id:"installation",level:2},{value:"1. Install .NET",id:"1-install-net",level:3},{value:"2. Install Java",id:"2-install-java",level:3},{value:"3. Install Apache Spark",id:"3-install-apache-spark",level:3},{value:"4. Install .NET for Apache Spark",id:"4-install-net-for-apache-spark",level:3},{value:"5. Install WinUtils (Windows Only)",id:"5-install-winutils-windows-only",level:3},{value:"6. 
Set DOTNET_WORKER_DIR and check dependencies",id:"6-set-dotnet_worker_dir-and-check-dependencies",level:3},{value:"Write a .NET for SynapseML App",id:"write-a-net-for-synapseml-app",level:2},{value:"1. Create a console app",id:"1-create-a-console-app",level:3},{value:"2. Install NuGet package",id:"2-install-nuget-package",level:3},{value:"3. Write your app",id:"3-write-your-app",level:3},{value:"4. Run your .NET App",id:"4-run-your-net-app",level:3},{value:"Next",id:"next",level:2}],h={toc:m};function f(e){var n=e.components,t=(0,r.Z)(e,s);return(0,o.kt)("wrapper",(0,a.Z)({},h,t,{components:n,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"net-setup-and-example-for-synapseml"},".NET setup and example for SynapseML"),(0,o.kt)("h2",{id:"installation"},"Installation"),(0,o.kt)("h3",{id:"1-install-net"},"1. Install .NET"),(0,o.kt)("p",null,"To start building .NET apps, you need to download and install the .NET SDK (Software Development Kit)."),(0,o.kt)("p",null,"Download and install the ",(0,o.kt)("a",{parentName:"p",href:"https://dotnet.microsoft.com/en-us/download/dotnet/3.1"},".NET Core SDK"),".\nInstalling the SDK adds the dotnet toolchain to your PATH."),(0,o.kt)("p",null,"Once you've installed the .NET Core SDK, open a new command prompt or terminal. Then run ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet"),"."),(0,o.kt)("p",null,"If the command runs and prints information about how to use dotnet, you can move to the next step.\nIf you receive a ",(0,o.kt)("inlineCode",{parentName:"p"},"'dotnet' is not recognized as an internal or external command")," error, make sure\nyou opened a new command prompt or terminal before running the command."),(0,o.kt)("h3",{id:"2-install-java"},"2. 
Install Java"),(0,o.kt)("p",null,"Install ",(0,o.kt)("a",{parentName:"p",href:"https://www.oracle.com/java/technologies/downloads/#java8"},"Java 8.1")," for Windows and macOS,\nor ",(0,o.kt)("a",{parentName:"p",href:"https://openjdk.org/install/"},"OpenJDK 8")," for Ubuntu."),(0,o.kt)("p",null,"Select the appropriate version for your operating system. For example, select jdk-8u201-windows-x64.exe\nfor a Windows x64 machine or jdk-8u231-macosx-x64.dmg for macOS. Then, use the command java to verify the installation."),(0,o.kt)("h3",{id:"3-install-apache-spark"},"3. Install Apache Spark"),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://spark.apache.org/downloads.html"},"Download and install Apache Spark")," with version >= 3.2.0.\n(SynapseML v0.11.3 only supports spark version >= 3.2.0)"),(0,o.kt)("p",null,"Extract downloaded zipped files (with 7-Zip app on Windows or ",(0,o.kt)("inlineCode",{parentName:"p"},"tar")," on linux) and remember the location of\nextracted files, we take ",(0,o.kt)("inlineCode",{parentName:"p"},"~/bin/spark-3.2.0-bin-hadoop3.2/")," as an example here."),(0,o.kt)("p",null,"Run the following commands to set the environment variables used to locate Apache Spark.\nOn Windows, make sure to run the command prompt in administrator mode."),(0,o.kt)(l.Z,{groupId:"operating-systems",mdxType:"Tabs"},(0,o.kt)(i.Z,{value:"win",label:"Windows",default:!0,mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},' setx /M HADOOP_HOME C:\\bin\\spark-3.2.0-bin-hadoop3.2\\\n setx /M SPARK_HOME C:\\bin\\spark-3.2.0-bin-hadoop3.2\\\n setx /M PATH "%PATH%;%HADOOP_HOME%;%SPARK_HOME%bin" # Warning: Don\'t run this if your path is already long as it will truncate your path to 1024 characters and potentially remove entries!\n'))),(0,o.kt)(i.Z,{value:"linux",label:"Mac/Linux",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},' export SPARK_HOME=~/bin/spark-3.2.0-bin-hadoop3.2/\n export 
PATH="$SPARK_HOME/bin:$PATH"\n source ~/.bashrc\n')))),(0,o.kt)("p",null,"Once you've installed everything and set your environment variables, open a ",(0,o.kt)("strong",{parentName:"p"},"new")," command prompt or terminal and run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-bash"},"spark-submit --version\n")),(0,o.kt)("p",null,"If the command runs and prints version information, you can move to the next step."),(0,o.kt)("p",null,"If you receive a ",(0,o.kt)("inlineCode",{parentName:"p"},"'spark-submit' is not recognized as an internal or external command")," error, make sure you opened a ",(0,o.kt)("strong",{parentName:"p"},"new")," command prompt."),(0,o.kt)("h3",{id:"4-install-net-for-apache-spark"},"4. Install .NET for Apache Spark"),(0,o.kt)("p",null,"Download the ",(0,o.kt)("a",{parentName:"p",href:"https://github.com/dotnet/spark/releases"},"Microsoft.Spark.Worker")," ",(0,o.kt)("strong",{parentName:"p"},"v2.1.1")," release from the .NET for Apache Spark GitHub.\nFor example if you're on a Windows machine and plan to use .NET Core, download the Windows x64 netcoreapp3.1 release."),(0,o.kt)("p",null,"Extract Microsoft.Spark.Worker and remember the location."),(0,o.kt)("h3",{id:"5-install-winutils-windows-only"},"5. Install WinUtils (Windows Only)"),(0,o.kt)("p",null,".NET for Apache Spark requires WinUtils to be installed alongside Apache Spark.\n",(0,o.kt)("a",{parentName:"p",href:"https://github.com/steveloughran/winutils/blob/master/hadoop-3.0.0/bin/winutils.exe"},"Download winutils.exe"),".\nThen, copy WinUtils into C:\\bin\\spark-3.2.0-bin-hadoop3.2\\bin."),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"If you're using a different version of Hadoop, select the version of WinUtils that's compatible with your version of Hadoop. 
You can see the Hadoop version at the end of your Spark install folder name.")),(0,o.kt)("h3",{id:"6-set-dotnet_worker_dir-and-check-dependencies"},"6. Set DOTNET_WORKER_DIR and check dependencies"),(0,o.kt)("p",null,"Run one of the following commands to set the DOTNET_WORKER_DIR environment variable, which is used by .NET apps to locate .NET for Apache Spark\nworker binaries. Make sure to replace with the directory where you downloaded and extracted the Microsoft.Spark.Worker.\nOn Windows, make sure to run the command prompt in administrator mode."),(0,o.kt)(l.Z,{groupId:"operating-systems",mdxType:"Tabs"},(0,o.kt)(i.Z,{value:"win",label:"Windows",default:!0,mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"}," setx /M DOTNET_WORKER_DIR \n"))),(0,o.kt)(i.Z,{value:"linux",label:"Mac/Linux",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"}," export DOTNET_WORKER_DIR=\n")))),(0,o.kt)("p",null,"Finally, double-check that you can run ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet, java, spark-shell")," from your command line before you move to the next section."),(0,o.kt)("h2",{id:"write-a-net-for-synapseml-app"},"Write a .NET for SynapseML App"),(0,o.kt)("h3",{id:"1-create-a-console-app"},"1. Create a console app"),(0,o.kt)("p",null,"In your command prompt or terminal, run the following commands to create a new console application:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet new console -o SynapseMLApp\ncd SynapseMLApp\n")),(0,o.kt)("p",null,"The ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet")," command creates a new application of type console for you. 
The -o parameter creates a directory\nnamed ",(0,o.kt)("inlineCode",{parentName:"p"},"SynapseMLApp")," where your app is stored and populates it with the required files.\nThe ",(0,o.kt)("inlineCode",{parentName:"p"},"cd SynapseMLApp")," command changes the directory to the app directory you created."),(0,o.kt)("h3",{id:"2-install-nuget-package"},"2. Install NuGet package"),(0,o.kt)("p",null,"To use .NET for Apache Spark in an app, install the Microsoft.Spark package.\nIn your command prompt or terminal, run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet add package Microsoft.Spark --version 2.1.1\n")),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"This tutorial uses Microsoft.Spark version 2.1.1 as SynapseML 0.11.3 depends on it.\nChange to corresponding version if necessary.")),(0,o.kt)("p",null,"To use SynapseML features in the app, install SynapseML.X package.\nIn this tutorial, we use SynapseML.Cognitive as an example.\nIn your command prompt or terminal, run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"# Update Nuget Config to include SynapseML Feed\ndotnet nuget add source https://mmlspark.blob.core.windows.net/synapsemlnuget/index.json -n SynapseMLFeed\ndotnet add package SynapseML.Cognitive --version 0.11.3\n")),(0,o.kt)("p",null,"The ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet nuget add")," command adds SynapseML's resolver to the source, so that our package can be found."),(0,o.kt)("h3",{id:"3-write-your-app"},"3. Write your app"),(0,o.kt)("p",null,"Open Program.cs in Visual Studio Code, or any text editor. 
Replace its contents with this code:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-csharp"},'using System;\nusing System.Collections.Generic;\nusing Synapse.ML.Cognitive;\nusing Microsoft.Spark.Sql;\nusing Microsoft.Spark.Sql.Types;\n\nnamespace SynapseMLApp\n{\n class Program\n { static void Main(string[] args)\n {\n // Create Spark session\n SparkSession spark =\n SparkSession\n .Builder()\n .AppName("TextSentimentExample")\n .GetOrCreate();\n\n // Create DataFrame\n DataFrame df = spark.CreateDataFrame(\n new List\n {\n new GenericRow(new object[] {"I am so happy today, its sunny!", "en-US"}),\n new GenericRow(new object[] {"I am frustrated by this rush hour traffic", "en-US"}),\n new GenericRow(new object[] {"The cognitive services on spark aint bad", "en-US"})\n },\n new StructType(new List\n {\n new StructField("text", new StringType()),\n new StructField("language", new StringType())\n })\n );\n\n // Create TextSentiment\n var model = new TextSentiment()\n .SetSubscriptionKey("YOUR_SUBSCRIPTION_KEY")\n .SetLocation("eastus")\n .SetTextCol("text")\n .SetOutputCol("sentiment")\n .SetErrorCol("error")\n .SetLanguageCol("language");\n\n // Transform\n var outputDF = model.Transform(df);\n\n // Display results\n outputDF.Show();\n\n // Stop Spark session\n spark.Stop();\n }\n }\n}\n')),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/dotnet/api/microsoft.spark.sql.sparksession?view=spark-dotnet"},"SparkSession")," is the entrypoint\nof Apache Spark applications, which manages the context and information of your application. A DataFrame is a way of organizing\ndata into a set of named columns."),(0,o.kt)("p",null,"Create a ",(0,o.kt)("a",{parentName:"p",href:"https://mmlspark.blob.core.windows.net/docs/0.11.3/dotnet/classSynapse_1_1ML_1_1Cognitive_1_1TextSentiment.html"},"TextSentiment"),"\ninstance, set corresponding subscription key and other configurations. 
Then, apply transformation to the dataframe,\nwhich analyzes the sentiment based on each row, and stores result into output column."),(0,o.kt)("p",null,"The result of the transformation is stored in another DataFrame. At this point, no operations have taken place because\n.NET for Apache Spark lazily evaluates the data. The operation defined by the call to model.Transform doesn't execute until the Show method is called to display the contents of the transformed DataFrame to the console. Once you no longer need the Spark\nsession, use the Stop method to stop your session."),(0,o.kt)("h3",{id:"4-run-your-net-app"},"4. Run your .NET App"),(0,o.kt)("p",null,"Run the following command to build your application:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet build\n")),(0,o.kt)("p",null,"Navigate to your build output directory. For example, in Windows you could run ",(0,o.kt)("inlineCode",{parentName:"p"},"cd bin\\Debug\\net5.0"),".\nUse the spark-submit command to submit your application to run on Apache Spark."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --packages com.microsoft.azure:synapseml_2.12:0.11.3 --master local microsoft-spark-3-2_2.12-2.1.1.jar dotnet SynapseMLApp.dll\n")),(0,o.kt)("p",null,(0,o.kt)("inlineCode",{parentName:"p"},"--packages com.microsoft.azure:synapseml_2.12:0.11.3")," specifies the dependency on synapseml_2.12 version 0.11.3;\n",(0,o.kt)("inlineCode",{parentName:"p"},"microsoft-spark-3-2_2.12-2.1.1.jar")," specifies Microsoft.Spark version 2.1.1 and Spark version 3.2"),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"This command assumes you have downloaded Apache Spark and added it to your PATH environment variable so that you can use spark-submit.\nOtherwise, you'd have to use the full path (for example, C:\\bin\\apache-spark\\bin\\spark-submit or 
~/spark/bin/spark-submit).")),(0,o.kt)("p",null,"When your app runs, the sentiment analysis result is written to the console."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},"+-----------------------------------------+--------+-----+--------------------------------------------------+\n| text|language|error| sentiment|\n+-----------------------------------------+--------+-----+--------------------------------------------------+\n| I am so happy today, its sunny!| en-US| null|[{positive, null, {0.99, 0.0, 0.0}, [{I am so h...|\n|I am frustrated by this rush hour traffic| en-US| null|[{negative, null, {0.0, 0.0, 0.99}, [{I am frus...|\n| The cognitive services on spark aint bad| en-US| null|[{negative, null, {0.0, 0.01, 0.99}, [{The cogn...|\n+-----------------------------------------+--------+-----+--------------------------------------------------+\n")),(0,o.kt)("p",null,"Congratulations! You successfully authored and ran a .NET for SynapseML app.\nRefer to the ",(0,o.kt)("a",{parentName:"p",href:"https://mmlspark.blob.core.windows.net/docs/0.11.3/dotnet/index.html"},"developer docs")," for API guidance."),(0,o.kt)("h2",{id:"next"},"Next"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"Refer to this ",(0,o.kt)("a",{parentName:"li",href:"https://docs.microsoft.com/en-us/dotnet/spark/tutorials/databricks-deployment"},"tutorial")," for deploying a .NET app to Databricks."),(0,o.kt)("li",{parentName:"ul"},"You could download compatible ",(0,o.kt)("a",{parentName:"li",href:"https://mmlspark.blob.core.windows.net/publicwasb/dotnet/install-worker.sh"},"install-worker.sh"),"\nand ",(0,o.kt)("a",{parentName:"li",href:"https://mmlspark.blob.core.windows.net/publicwasb/dotnet/db-init.sh"},"db-init.sh")," files needed for deployment on Databricks.")))}f.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/59ef8022.af61e93a.js b/assets/js/60a2189a.ec6b1ac0.js similarity index 99% rename from assets/js/59ef8022.af61e93a.js rename to 
assets/js/60a2189a.ec6b1ac0.js index beb2e89100..ab802c01ee 100644 --- a/assets/js/59ef8022.af61e93a.js +++ b/assets/js/60a2189a.ec6b1ac0.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[8491],{3905:function(a,e,t){t.d(e,{Zo:function(){return o},kt:function(){return k}});var n=t(7294);function s(a,e,t){return e in a?Object.defineProperty(a,e,{value:t,enumerable:!0,configurable:!0,writable:!0}):a[e]=t,a}function m(a,e){var t=Object.keys(a);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(a);e&&(n=n.filter((function(e){return Object.getOwnPropertyDescriptor(a,e).enumerable}))),t.push.apply(t,n)}return t}function r(a){for(var e=1;e=0||(s[t]=a[t]);return s}(a,e);if(Object.getOwnPropertySymbols){var m=Object.getOwnPropertySymbols(a);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(a,t)&&(s[t]=a[t])}return s}var l=n.createContext({}),i=function(a){var e=n.useContext(l),t=e;return a&&(t="function"==typeof a?a(e):r(r({},e),a)),t},o=function(a){var e=i(a.components);return n.createElement(l.Provider,{value:e},a.children)},N={inlineCode:"code",wrapper:function(a){var e=a.children;return n.createElement(n.Fragment,{},e)}},c=n.forwardRef((function(a,e){var t=a.components,s=a.mdxType,m=a.originalType,l=a.parentName,o=p(a,["components","mdxType","originalType","parentName"]),c=i(t),k=s,h=c["".concat(l,".").concat(k)]||c[k]||N[k]||m;return t?n.createElement(h,r(r({ref:e},o),{},{components:t})):n.createElement(h,r({ref:e},o))}));function k(a,e){var t=arguments,s=e&&e.mdxType;if("string"==typeof a||s){var m=t.length,r=new Array(m);r[0]=c;var p={};for(var l in e)hasOwnProperty.call(e,l)&&(p[l]=e[l]);p.originalType=a,p.mdxType="string"==typeof a?a:s,r[1]=p;for(var i=2;i50K} to {0, 1} to represent our binary classification label column\ndf = df.withColumn(label, F.when(F.col(label).contains("<=50K"), F.lit(0)).otherwise(F.lit(1)))\n'))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a 
",(0,m.kt)("inlineCode",{parentName:"p"},"FeatureBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setLabelCol")," to set the binary label column. Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"feature_balance_measures = (\n FeatureBalanceMeasure()\n .setSensitiveCols(features)\n .setLabelCol(label)\n .transform(df)\n)\nfeature_balance_measures.show(truncate=False)\n"))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a ",(0,m.kt)("inlineCode",{parentName:"p"},"DistributionBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features. Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"distribution_balance_measures = (\n DistributionBalanceMeasure()\n .setSensitiveCols(features)\n .transform(df)\n)\ndistribution_balance_measures.show(truncate=False)\n"))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a ",(0,m.kt)("inlineCode",{parentName:"p"},"AggregateBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features. 
Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"aggregate_balance_measures = (\n AggregateBalanceMeasure()\n .setSensitiveCols(features)\n .transform(df)\n)\naggregate_balance_measures.show(truncate=False)\n")))),(0,m.kt)("p",null,"Note: If you're running this notebook in a Spark environment such as Azure Synapse or Databricks, then you can easily visualize the imbalance measures by calling the built-in plotting features ",(0,m.kt)("inlineCode",{parentName:"p"},"display()"),"."),(0,m.kt)("h2",{id:"measure-explanations"},"Measure Explanations"),(0,m.kt)("h3",{id:"feature-balance-measures"},"Feature Balance Measures"),(0,m.kt)("p",null,"Feature Balance Measures allow us to see whether each combination of sensitive feature is receiving the positive outcome (true prediction) at balanced probability."),(0,m.kt)("p",null,"In this context, we define a feature balance measure, called the parity, for label y. 
It is the difference between the association metrics of two different sensitive classes ",(0,m.kt)("span",{parentName:"p",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[x_A, x_B]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),", with respect to the association metric ",(0,m.kt)("span",{parentName:"p",className:"math 
math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"i")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"A(x_i, y)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.31166399999999994em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"i")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),". That is:"),(0,m.kt)("p",null,(0,m.kt)("span",{parentName:"p",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"p"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mo",{parentName:"mrow"},"\u22c5"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentNam
e:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"parity(y \\vert x_A, x_B, A(\\cdot)) \\coloneqq A(x_A, y) - A(x_B, y)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"p"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u22c5"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}}),(0,m.kt)("span",{parentName:"span",className:"mbin"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")")))))),(0,m.kt)("p",null,"Using the dataset, we can see if the various sexes and races are receiving >50k income at equal or unequal rates."),(0,m.kt)("p",null,"Note: Many of these metrics were influenced by this paper ",(0,m.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/2103.03417"},"Measuring Model Biases in the Absence of Ground Truth"),"."),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Association Metric"),(0,m.kt)("th",{parentName:"tr",align:null},"Family"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation/Formula"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Statistical 
Parity"),(0,m.kt)("td",{parentName:"tr",align:null},"Fairness"),(0,m.kt)("td",{parentName:"tr",align:null},"Proportion of each segment of a protected class (gender, for example) that should receive the positive outcome at equal rates."),(0,m.kt)("td",{parentName:"tr",align:null},"Closer to zero means better parity. ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"D"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"Y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"M"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"Y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"DP = P(Y \\vert A = Male) - P(Y \\vert A = 
Female)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.68333em",verticalAlign:"0em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"D"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"Y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10903em"}},"M"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}}),(0,m.kt)("span",{parentName:"span",className:"mbin"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"Y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),"."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Fairness_%28machine_learning%29"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Pointwise Mutual Information (PMI), normalized PMI"),(0,m.kt)("td",{parentName:"tr",align:null},"Entropy"),(0,m.kt)("td",{parentName:"tr",align:null},"The PMI of a pair of feature values (ex: Gender=Male and Gender=Female) quantifies the discrepancy between the probability of their coincidence given their joint distribution and their individual distributions (assuming independence)."),(0,m.kt)("td",{parentName:"tr",align:null},"Range (normalized) ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[-1, 
1]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),". -1 for no co-occurrences. 0 for co-occurrences at random. 1 for complete co-occurrences."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Pointwise_mutual_information"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Sorensen-Dice Coefficient (SDC)"),(0,m.kt)("td",{parentName:"tr",align:null},"Intersection-over-Union"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to gauge the similarity of two samples. 
Related to F1 score."),(0,m.kt)("td",{parentName:"tr",align:null},"Equals twice the number of elements common to both sets divided by the sum of the number of elements in each set."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Jaccard Index"),(0,m.kt)("td",{parentName:"tr",align:null},"Intersection-over-Union"),(0,m.kt)("td",{parentName:"tr",align:null},"Similar to SDC, gauges the similarity and diversity of sample sets."),(0,m.kt)("td",{parentName:"tr",align:null},"Equals the size of the intersection divided by the size of the union of the sample sets."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Jaccard_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Kendall Rank Correlation"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to measure the ordinal association between two measured quantities."),(0,m.kt)("td",{parentName:"tr",align:null},"High when observations have a similar rank and low when observations have a dissimilar rank between the two variables."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Log-Likelihood Ratio"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Calculates the degree to which data supports one variable versus another. 
Log of the likelihood ratio, which gives the probability of correctly predicting the label in ratio to probability of incorrectly predicting label."),(0,m.kt)("td",{parentName:"tr",align:null},"If likelihoods are similar, it should be close to 0."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Likelihood_function#Likelihood_ratio"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"t-test"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to compare the means of two groups (pairwise)."),(0,m.kt)("td",{parentName:"tr",align:null},"Value looked up in t-Distribution tell if statistically significant or not."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Student's_t-test"},"Link"))))),(0,m.kt)("h3",{id:"distribution-balance-measures"},"Distribution Balance Measures"),(0,m.kt)("p",null,"Distribution Balance Measures allow us to compare our data with a reference distribution (currently only uniform distribution is supported as a reference distribution). 
They are calculated per sensitive column and don't depend on the label column."),(0,m.kt)("p",null,"For example, let's assume we have a dataset with nine rows and a Gender column, and we observe that:"),(0,m.kt)("ul",null,(0,m.kt)("li",{parentName:"ul"},'"Male" appears four times'),(0,m.kt)("li",{parentName:"ul"},'"Female" appears three times'),(0,m.kt)("li",{parentName:"ul"},'"Other" appears twice')),(0,m.kt)("p",null,"Assuming the uniform distribution:"),(0,m.kt)("div",{className:"math math-display"},(0,m.kt)("span",{parentName:"div",className:"katex-display"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML",display:"block"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"f"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"c"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"C"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mfrac",{parentName:"mrow"},(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"w"),(0,m.kt)("mi",{parentName:"mrow"},"s")),(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"
},"m"),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"V"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"s")))),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"ReferenceCount \\coloneqq \\frac{numRows}{numFeatureValues}")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8888799999999999em",verticalAlign:"-0.19444em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10764em"}},"f"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"c"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.07153em"}},"C"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"2.04633em",verticalAlign:"-0.686em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mopen nulldelimiter"}),(0,m.kt)("span",{parentName:"span",className:"mfrac"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"1.36033em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.314em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"V"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s"))),(0,m.kt)("span",{parentName:"span",style:{top:"-3.23em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"frac-line",style:{borderBottomWidth:"0.04em"}})),(0,m.kt)("span",{parentName:"span",style:{top:"-3.677em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02691em"}},"w"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.686em"}},(0,m.kt)("span",{parentName:"span"}))))),(0,m.kt)("span",{parentName:"span",className:"mclose nulldelimiter"}))))))),(0,m.kt)("div",{className:"math 
math-display"},(0,m.kt)("span",{parentName:"div",className:"katex-display"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML",display:"block"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"f"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"c"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"b"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"b"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mfrac",{parentName:"mrow"},(0,m.kt)("mn",{parentName:"mfrac"},"1"),(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"V"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"s")))),(0,m.kt)("ann
otation",{parentName:"semantics",encoding:"application/x-tex"},"ReferenceProbability \\coloneqq \\frac{1}{numFeatureValues}")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8888799999999999em",verticalAlign:"-0.19444em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10764em"}},"f"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"c"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"b"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"b"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"2.00744em",verticalAlign:"-0.686em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mopen nulldelimiter"}),(0,m.kt)("span",{parentName:"span",className:"mfrac"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"1.32144em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.314em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"V"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s"))),(0,m.kt)("span",{parentName:"span",style:{top:"-3.23em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"frac-line",style:{borderBottomWidth:"0.04em"}})),(0,m.kt)("span",{parentName:"span",style:{top:"-3.677em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord"},"1")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.686em"}},(0,m.kt)("span",{parentName:"span"}))))),(0,m.kt)("span",{parentName:"span",className:"mclose nulldelimiter"}))))))),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Feature Value"),(0,m.kt)("th",{parentName:"tr",align:null},"Observed Count"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference Count"),(0,m.kt)("th",{parentName:"tr",align:null},"Observed Probability"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference 
Probabiliy"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Male"),(0,m.kt)("td",{parentName:"tr",align:null},"4"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"4/9 = 0.44"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Female"),(0,m.kt)("td",{parentName:"tr",align:null},"3"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Other"),(0,m.kt)("td",{parentName:"tr",align:null},"2"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"2/9 = 0.22"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")))),(0,m.kt)("p",null,"We can use distance measures to find out how far our observed and reference distributions of these feature values are. Some of these distance measures include:"),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Measure"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"KL Divergence"),(0,m.kt)("td",{parentName:"tr",align:null},"Measure of how one probability distribution is different from a second, reference probability distribution. Measure of the information gained when one revises one's beliefs from the prior probability distribution Q to the posterior probability distribution P. 
In other words, it is the amount of information lost when Q is used to approximate P."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means P = Q."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"JS Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"Measuring the similarity between two probability distributions. Symmetrized and smoothed version of the Kullback\u2013Leibler (KL) divergence. Square root of JS Divergence."),(0,m.kt)("td",{parentName:"tr",align:null},"Range ","[0, 1]",". 0 means perfectly same to balanced distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Wasserstein Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"This distance is also known as the earth mover\u2019s distance, since it can be seen as the minimum amount of \u201cwork\u201d required to transform u into v, where \u201cwork\u201d is measured as the amount of distribution weight that must be moved multiplied by the distance it has to be moved."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means P = Q."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Wasserstein_metric"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Infinity Norm Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"Distance between two vectors is the greatest of their differences along any coordinate dimension. Also called Chebyshev distance or chessboard distance."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 
0 means same distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Chebyshev_distance"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Total Variation Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"It is equal to half the L1 (Manhattan) distance between the two distributions. Take the difference between the two proportions in each category, add up the absolute values of all the differences, and then divide the sum by 2."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means same distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Total_variation_distance_of_probability_measures"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Chi-Squared Test"),(0,m.kt)("td",{parentName:"tr",align:null},"The chi-square test tests the null hypothesis that the categorical data has the given frequencies given expected frequencies in each category."),(0,m.kt)("td",{parentName:"tr",align:null},"p-value gives evidence against null-hypothesis that difference in observed and expected frequencies is by random chance."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Chi-squared_test"},"Link"))))),(0,m.kt)("h3",{id:"aggregate-balance-measures"},"Aggregate Balance Measures"),(0,m.kt)("p",null,"Aggregate Balance Measures allow us to obtain a higher notion of inequality. They're calculated on the set of all sensitive columns and don't depend on the label column."),(0,m.kt)("p",null,"These measures look at distribution of records across all combinations of sensitive columns. 
For example, if Sex and Race are specified as sensitive features, it then tries to quantify imbalance across all combinations of the two specified features - (Male, Black), (Female, White), (Male, Asian-Pac-Islander), etc."),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Measure"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Atkinson Index"),(0,m.kt)("td",{parentName:"tr",align:null},"It presents the percentage of total income that a given society would have to forego in order to have more equal shares of income between its citizens. This measure depends on the degree of societal aversion to inequality (a theoretical parameter decided by the researcher). A higher value entails greater social utility or willingness by individuals to accept smaller incomes in exchange for a more equal distribution. 
An important feature of the Atkinson index is that it can be decomposed into within-group and between-group inequality."),(0,m.kt)("td",{parentName:"tr",align:null},"Range ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("mn",{parentName:"mrow"},"0"),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[0, 1]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},"0"),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),". 0 if perfect equality. 1 means maximum inequality. In our case, it is the proportion of records for a sensitive columns\u2019 combination."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Atkinson_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Theil T Index"),(0,m.kt)("td",{parentName:"tr",align:null},'GE(1) = Theil\'s T and is more sensitive to differences at the top of the distribution. 
The Theil index is a statistic used to measure economic inequality. The Theil index measures an entropic "distance" the population is away from the "ideal" egalitarian state of everyone having the same income.'),(0,m.kt)("td",{parentName:"tr",align:null},"If everyone has the same income, then\xa0T_T\xa0equals\xa00. If one person has all the income, then\xa0T_T\xa0gives the result\xa0",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"N"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"ln(N)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10903em"}},"N"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),".\xa00 means equal income and larger values mean higher level of disproportion."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Theil_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Theil L 
Index"),(0,m.kt)("td",{parentName:"tr",align:null},"GE(0) = Theil's L and is more sensitive to differences at the lower end of the distribution. Logarithm of (mean income)/(income i), over all the incomes included in the summation. It is also referred to as the mean log deviation measure. Because a transfer from a larger income to a smaller one will change the smaller income's ratio more than it changes the larger income's ratio, the transfer-principle is satisfied by this index."),(0,m.kt)("td",{parentName:"tr",align:null},"Same interpretation as Theil T Index."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Theil_index"},"Link"))))),(0,m.kt)("h2",{id:"mitigation"},"Mitigation"),(0,m.kt)("p",null,"It will not be a stretch to say that every real-world dataset has caveats, biases, and imbalances. Data collection is costly. Data Imbalance mitigation or de-biasing data is an area of research. There are many techniques available at various stages of ML lifecycle: during pre-processing, in-processing, and post processing. Here we outline a couple of pre-processing techniques -"),(0,m.kt)("h3",{id:"resampling"},"Resampling"),(0,m.kt)("p",null,"Resampling involves under-sampling from majority class and over-sampling from minority class. A na\xefve way to over-sample would be to duplicate records. 
Similarly, to under-sample one could remove records at random."),(0,m.kt)("ul",null,(0,m.kt)("li",{parentName:"ul"},(0,m.kt)("p",{parentName:"li"},"Caveats:"),(0,m.kt)("ol",{parentName:"li"},(0,m.kt)("li",{parentName:"ol"},"Under-sampling may remove valuable information."),(0,m.kt)("li",{parentName:"ol"},"Over-sampling may cause overfitting and poor generalization on test set.")))),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_SamplingBar.png",alt:"Bar chart undersampling and oversampling"})),(0,m.kt)("p",null,"There are smarter techniques to under-sample and over-sample in literature and implemented in Python\u2019s ",(0,m.kt)("a",{parentName:"p",href:"https://imbalanced-learn.org/stable/"},"imbalanced-learn")," package."),(0,m.kt)("p",null,"For example, we can cluster the records of the majority class, and do the under-sampling by removing records from each cluster, thus seeking to preserve information."),(0,m.kt)("p",null,"One technique of under-sampling is use of Tomek Links. Tomek links are pairs of instances that are very close but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process. A similar way to under-sample majority class is using Near-Miss. It first calculates the distance between all the points in the larger class with the points in the smaller class. 
When two points belonging to different classes are very close to each other in the distribution, this algorithm eliminates the datapoint of the larger class thereby trying to balance the distribution."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_TomekLinks.png",alt:"Tomek Links"})),(0,m.kt)("p",null,"In over-sampling, instead of creating exact copies of the minority class records, we can introduce small variations into those copies, creating more diverse synthetic samples. This technique is called SMOTE (Synthetic Minority Oversampling Technique). It randomly picks a point from the minority class and computes the k-nearest neighbors for this point. The synthetic points are added between the chosen point and its neighbors."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_SyntheticSamples.png",alt:"Synthetic Samples"})),(0,m.kt)("h3",{id:"reweighting"},"Reweighting"),(0,m.kt)("p",null,"There is an expected and observed value in each table cell. The weight is the value of expected / observed. Reweighting is easy to extend to multiple features with more than two groups. 
The weights are then incorporated in loss function of model training."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_Reweight.png",alt:"Reweighting"})))}k.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[3953],{3905:function(a,e,t){t.d(e,{Zo:function(){return o},kt:function(){return k}});var n=t(7294);function s(a,e,t){return e in a?Object.defineProperty(a,e,{value:t,enumerable:!0,configurable:!0,writable:!0}):a[e]=t,a}function m(a,e){var t=Object.keys(a);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(a);e&&(n=n.filter((function(e){return Object.getOwnPropertyDescriptor(a,e).enumerable}))),t.push.apply(t,n)}return t}function r(a){for(var e=1;e=0||(s[t]=a[t]);return s}(a,e);if(Object.getOwnPropertySymbols){var m=Object.getOwnPropertySymbols(a);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(a,t)&&(s[t]=a[t])}return s}var l=n.createContext({}),i=function(a){var e=n.useContext(l),t=e;return a&&(t="function"==typeof a?a(e):r(r({},e),a)),t},o=function(a){var e=i(a.components);return n.createElement(l.Provider,{value:e},a.children)},N={inlineCode:"code",wrapper:function(a){var e=a.children;return n.createElement(n.Fragment,{},e)}},c=n.forwardRef((function(a,e){var t=a.components,s=a.mdxType,m=a.originalType,l=a.parentName,o=p(a,["components","mdxType","originalType","parentName"]),c=i(t),k=s,h=c["".concat(l,".").concat(k)]||c[k]||N[k]||m;return t?n.createElement(h,r(r({ref:e},o),{},{components:t})):n.createElement(h,r({ref:e},o))}));function k(a,e){var t=arguments,s=e&&e.mdxType;if("string"==typeof a||s){var m=t.length,r=new Array(m);r[0]=c;var p={};for(var l in e)hasOwnProperty.call(e,l)&&(p[l]=e[l]);p.originalType=a,p.mdxType="string"==typeof a?a:s,r[1]=p;for(var i=2;i50K} to {0, 1} to represent our binary classification label column\ndf = df.withColumn(label, 
F.when(F.col(label).contains("<=50K"), F.lit(0)).otherwise(F.lit(1)))\n'))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a ",(0,m.kt)("inlineCode",{parentName:"p"},"FeatureBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setLabelCol")," to set the binary label column. Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"feature_balance_measures = (\n FeatureBalanceMeasure()\n .setSensitiveCols(features)\n .setLabelCol(label)\n .transform(df)\n)\nfeature_balance_measures.show(truncate=False)\n"))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a ",(0,m.kt)("inlineCode",{parentName:"p"},"DistributionBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features. Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"distribution_balance_measures = (\n DistributionBalanceMeasure()\n .setSensitiveCols(features)\n .transform(df)\n)\ndistribution_balance_measures.show(truncate=False)\n"))),(0,m.kt)("li",{parentName:"ol"},(0,m.kt)("p",{parentName:"li"},"Create a ",(0,m.kt)("inlineCode",{parentName:"p"},"AggregateBalanceMeasure")," transformer and call ",(0,m.kt)("inlineCode",{parentName:"p"},"setSensitiveCols")," to set the list of sensitive features. 
Then, call the ",(0,m.kt)("inlineCode",{parentName:"p"},"transform")," method with your dataset and visualize the resulting dataframe."),(0,m.kt)("p",{parentName:"li"},"For example:"),(0,m.kt)("pre",{parentName:"li"},(0,m.kt)("code",{parentName:"pre",className:"language-python"},"aggregate_balance_measures = (\n AggregateBalanceMeasure()\n .setSensitiveCols(features)\n .transform(df)\n)\naggregate_balance_measures.show(truncate=False)\n")))),(0,m.kt)("p",null,"Note: If you're running this notebook in a Spark environment such as Azure Synapse or Databricks, then you can easily visualize the imbalance measures by calling the built-in plotting features ",(0,m.kt)("inlineCode",{parentName:"p"},"display()"),"."),(0,m.kt)("h2",{id:"measure-explanations"},"Measure Explanations"),(0,m.kt)("h3",{id:"feature-balance-measures"},"Feature Balance Measures"),(0,m.kt)("p",null,"Feature Balance Measures allow us to see whether each combination of sensitive feature is receiving the positive outcome (true prediction) at balanced probability."),(0,m.kt)("p",null,"In this context, we define a feature balance measure, called the parity, for label y. 
It is the difference between the association metrics of two different sensitive classes ",(0,m.kt)("span",{parentName:"p",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[x_A, x_B]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),", with respect to the association metric ",(0,m.kt)("span",{parentName:"p",className:"math 
math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"i")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"A(x_i, y)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.31166399999999994em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"i")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),". That is:"),(0,m.kt)("p",null,(0,m.kt)("span",{parentName:"p",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"p"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mo",{parentName:"mrow"},"\u22c5"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentNam
e:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"A")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("msub",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"msub"},"x"),(0,m.kt)("mi",{parentName:"msub"},"B")),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"parity(y \\vert x_A, x_B, A(\\cdot)) \\coloneqq A(x_A, y) - A(x_B, y)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"p"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u22c5"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}}),(0,m.kt)("span",{parentName:"span",className:"mbin"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"x"),(0,m.kt)("span",{parentName:"span",className:"msupsub"},(0,m.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.32833099999999993em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.5500000000000003em",marginLeft:"0em",marginRight:"0.05em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,m.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.05017em"}},"B")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,m.kt)("span",{parentName:"span"})))))),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")")))))),(0,m.kt)("p",null,"Using the dataset, we can see if the various sexes and races are receiving >50k income at equal or unequal rates."),(0,m.kt)("p",null,"Note: Many of these metrics were influenced by this paper ",(0,m.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/2103.03417"},"Measuring Model Biases in the Absence of Ground Truth"),"."),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Association Metric"),(0,m.kt)("th",{parentName:"tr",align:null},"Family"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation/Formula"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Statistical 
Parity"),(0,m.kt)("td",{parentName:"tr",align:null},"Fairness"),(0,m.kt)("td",{parentName:"tr",align:null},"Proportion of each segment of a protected class (gender, for example) that should receive the positive outcome at equal rates."),(0,m.kt)("td",{parentName:"tr",align:null},"Closer to zero means better parity. ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"D"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"Y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"M"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")"),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"Y"),(0,m.kt)("mi",{parentName:"mrow",mathvariant:"normal"},"\u2223"),(0,m.kt)("mi",{parentName:"mrow"},"A"),(0,m.kt)("mo",{parentName:"mrow"},"="),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"DP = P(Y \\vert A = Male) - P(Y \\vert A = 
Female)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.68333em",verticalAlign:"0em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"D"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"Y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10903em"}},"M"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}}),(0,m.kt)("span",{parentName:"span",className:"mbin"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2222222222222222em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"Y"),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2223"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},"="),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),"."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Fairness_%28machine_learning%29"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Pointwise Mutual Information (PMI), normalized PMI"),(0,m.kt)("td",{parentName:"tr",align:null},"Entropy"),(0,m.kt)("td",{parentName:"tr",align:null},"The PMI of a pair of feature values (ex: Gender=Male and Gender=Female) quantifies the discrepancy between the probability of their coincidence given their joint distribution and their individual distributions (assuming independence)."),(0,m.kt)("td",{parentName:"tr",align:null},"Range (normalized) ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("mo",{parentName:"mrow"},"\u2212"),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[-1, 
1]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},"\u2212"),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),". -1 for no co-occurrences. 0 for co-occurrences at random. 1 for complete co-occurrences."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Pointwise_mutual_information"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Sorensen-Dice Coefficient (SDC)"),(0,m.kt)("td",{parentName:"tr",align:null},"Intersection-over-Union"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to gauge the similarity of two samples. 
Related to F1 score."),(0,m.kt)("td",{parentName:"tr",align:null},"Equals twice the number of elements common to both sets divided by the sum of the number of elements in each set."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Jaccard Index"),(0,m.kt)("td",{parentName:"tr",align:null},"Intersection-over-Union"),(0,m.kt)("td",{parentName:"tr",align:null},"Similar to SDC, gauges the similarity and diversity of sample sets."),(0,m.kt)("td",{parentName:"tr",align:null},"Equals the size of the intersection divided by the size of the union of the sample sets."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Jaccard_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Kendall Rank Correlation"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to measure the ordinal association between two measured quantities."),(0,m.kt)("td",{parentName:"tr",align:null},"High when observations have a similar rank and low when observations have a dissimilar rank between the two variables."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Log-Likelihood Ratio"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Calculates the degree to which data supports one variable versus another. 
Log of the likelihood ratio, which gives the probability of correctly predicting the label in ratio to probability of incorrectly predicting label."),(0,m.kt)("td",{parentName:"tr",align:null},"If likelihoods are similar, it should be close to 0."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Likelihood_function#Likelihood_ratio"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"t-test"),(0,m.kt)("td",{parentName:"tr",align:null},"Correlation and Statistical Tests"),(0,m.kt)("td",{parentName:"tr",align:null},"Used to compare the means of two groups (pairwise)."),(0,m.kt)("td",{parentName:"tr",align:null},"Value looked up in t-Distribution tell if statistically significant or not."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Student's_t-test"},"Link"))))),(0,m.kt)("h3",{id:"distribution-balance-measures"},"Distribution Balance Measures"),(0,m.kt)("p",null,"Distribution Balance Measures allow us to compare our data with a reference distribution (currently only uniform distribution is supported as a reference distribution). 
They are calculated per sensitive column and don't depend on the label column."),(0,m.kt)("p",null,"For example, let's assume we have a dataset with nine rows and a Gender column, and we observe that:"),(0,m.kt)("ul",null,(0,m.kt)("li",{parentName:"ul"},'"Male" appears four times'),(0,m.kt)("li",{parentName:"ul"},'"Female" appears three times'),(0,m.kt)("li",{parentName:"ul"},'"Other" appears twice')),(0,m.kt)("p",null,"Assuming the uniform distribution:"),(0,m.kt)("div",{className:"math math-display"},(0,m.kt)("span",{parentName:"div",className:"katex-display"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML",display:"block"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"f"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"c"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"C"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mfrac",{parentName:"mrow"},(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"w"),(0,m.kt)("mi",{parentName:"mrow"},"s")),(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"
},"m"),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"V"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"s")))),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"ReferenceCount \\coloneqq \\frac{numRows}{numFeatureValues}")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8888799999999999em",verticalAlign:"-0.19444em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10764em"}},"f"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"c"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.07153em"}},"C"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"2.04633em",verticalAlign:"-0.686em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mopen nulldelimiter"}),(0,m.kt)("span",{parentName:"span",className:"mfrac"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"1.36033em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.314em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"V"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s"))),(0,m.kt)("span",{parentName:"span",style:{top:"-3.23em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"frac-line",style:{borderBottomWidth:"0.04em"}})),(0,m.kt)("span",{parentName:"span",style:{top:"-3.677em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02691em"}},"w"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.686em"}},(0,m.kt)("span",{parentName:"span"}))))),(0,m.kt)("span",{parentName:"span",className:"mclose nulldelimiter"}))))))),(0,m.kt)("div",{className:"math 
math-display"},(0,m.kt)("span",{parentName:"div",className:"katex-display"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML",display:"block"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"R"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"f"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"c"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"P"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"o"),(0,m.kt)("mi",{parentName:"mrow"},"b"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"b"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"i"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"y"),(0,m.kt)("mo",{parentName:"mrow"},(0,m.kt)("mi",{parentName:"mo",mathvariant:"normal"},"\u2254")),(0,m.kt)("mfrac",{parentName:"mrow"},(0,m.kt)("mn",{parentName:"mfrac"},"1"),(0,m.kt)("mrow",{parentName:"mfrac"},(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"m"),(0,m.kt)("mi",{parentName:"mrow"},"F"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"t"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"r"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"V"),(0,m.kt)("mi",{parentName:"mrow"},"a"),(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"u"),(0,m.kt)("mi",{parentName:"mrow"},"e"),(0,m.kt)("mi",{parentName:"mrow"},"s")))),(0,m.kt)("ann
otation",{parentName:"semantics",encoding:"application/x-tex"},"ReferenceProbability \\coloneqq \\frac{1}{numFeatureValues}")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8888799999999999em",verticalAlign:"-0.19444em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.00773em"}},"R"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10764em"}},"f"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"c"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"P"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"o"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"b"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"b"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"i"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal",style:{marginRight:"0.03588em"}},"y"),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}}),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mop",style:{position:"relative",top:"-0.03472em"}},":")),(0,m.kt)("span",{parentName:"span",className:"mrel"},(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"-0.06666666666666667em"}})),(0,m.kt)("span",{parentName:"span",className:"mrel"},"=")),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.2777777777777778em"}})),(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"2.00744em",verticalAlign:"-0.686em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mopen nulldelimiter"}),(0,m.kt)("span",{parentName:"span",className:"mfrac"},(0,m.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"1.32144em"}},(0,m.kt)("span",{parentName:"span",style:{top:"-2.314em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"m"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.13889em"}},"F"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"t"),(0,m.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.02778em"}},"r"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.22222em"}},"V"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"a"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"u"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"e"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"s"))),(0,m.kt)("span",{parentName:"span",style:{top:"-3.23em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"frac-line",style:{borderBottomWidth:"0.04em"}})),(0,m.kt)("span",{parentName:"span",style:{top:"-3.677em"}},(0,m.kt)("span",{parentName:"span",className:"pstrut",style:{height:"3em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},(0,m.kt)("span",{parentName:"span",className:"mord"},"1")))),(0,m.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,m.kt)("span",{parentName:"span",className:"vlist-r"},(0,m.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.686em"}},(0,m.kt)("span",{parentName:"span"}))))),(0,m.kt)("span",{parentName:"span",className:"mclose nulldelimiter"}))))))),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Feature Value"),(0,m.kt)("th",{parentName:"tr",align:null},"Observed Count"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference Count"),(0,m.kt)("th",{parentName:"tr",align:null},"Observed Probability"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference 
Probabiliy"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Male"),(0,m.kt)("td",{parentName:"tr",align:null},"4"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"4/9 = 0.44"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Female"),(0,m.kt)("td",{parentName:"tr",align:null},"3"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Other"),(0,m.kt)("td",{parentName:"tr",align:null},"2"),(0,m.kt)("td",{parentName:"tr",align:null},"9/3 = 3"),(0,m.kt)("td",{parentName:"tr",align:null},"2/9 = 0.22"),(0,m.kt)("td",{parentName:"tr",align:null},"3/9 = 0.33")))),(0,m.kt)("p",null,"We can use distance measures to find out how far our observed and reference distributions of these feature values are. Some of these distance measures include:"),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Measure"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"KL Divergence"),(0,m.kt)("td",{parentName:"tr",align:null},"Measure of how one probability distribution is different from a second, reference probability distribution. Measure of the information gained when one revises one's beliefs from the prior probability distribution Q to the posterior probability distribution P. 
In other words, it is the amount of information lost when Q is used to approximate P."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means P = Q."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"JS Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"Measuring the similarity between two probability distributions. Symmetrized and smoothed version of the Kullback\u2013Leibler (KL) divergence. Square root of JS Divergence."),(0,m.kt)("td",{parentName:"tr",align:null},"Range ","[0, 1]",". 0 means perfectly same to balanced distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Jensen%E2%80%93Shannon_divergence"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Wasserstein Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"This distance is also known as the earth mover\u2019s distance, since it can be seen as the minimum amount of \u201cwork\u201d required to transform u into v, where \u201cwork\u201d is measured as the amount of distribution weight that must be moved multiplied by the distance it has to be moved."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means P = Q."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Wasserstein_metric"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Infinity Norm Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"Distance between two vectors is the greatest of their differences along any coordinate dimension. Also called Chebyshev distance or chessboard distance."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 
0 means same distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Chebyshev_distance"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Total Variation Distance"),(0,m.kt)("td",{parentName:"tr",align:null},"It is equal to half the L1 (Manhattan) distance between the two distributions. Take the difference between the two proportions in each category, add up the absolute values of all the differences, and then divide the sum by 2."),(0,m.kt)("td",{parentName:"tr",align:null},"Non-negative. 0 means same distribution."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Total_variation_distance_of_probability_measures"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Chi-Squared Test"),(0,m.kt)("td",{parentName:"tr",align:null},"The chi-square test tests the null hypothesis that the categorical data has the given frequencies given expected frequencies in each category."),(0,m.kt)("td",{parentName:"tr",align:null},"p-value gives evidence against null-hypothesis that difference in observed and expected frequencies is by random chance."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Chi-squared_test"},"Link"))))),(0,m.kt)("h3",{id:"aggregate-balance-measures"},"Aggregate Balance Measures"),(0,m.kt)("p",null,"Aggregate Balance Measures allow us to obtain a higher notion of inequality. They're calculated on the set of all sensitive columns and don't depend on the label column."),(0,m.kt)("p",null,"These measures look at distribution of records across all combinations of sensitive columns. 
For example, if Sex and Race are specified as sensitive features, it then tries to quantify imbalance across all combinations of the two specified features - (Male, Black), (Female, White), (Male, Asian-Pac-Islander), etc."),(0,m.kt)("table",null,(0,m.kt)("thead",{parentName:"table"},(0,m.kt)("tr",{parentName:"thead"},(0,m.kt)("th",{parentName:"tr",align:null},"Measure"),(0,m.kt)("th",{parentName:"tr",align:null},"Description"),(0,m.kt)("th",{parentName:"tr",align:null},"Interpretation"),(0,m.kt)("th",{parentName:"tr",align:null},"Reference"))),(0,m.kt)("tbody",{parentName:"table"},(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Atkinson Index"),(0,m.kt)("td",{parentName:"tr",align:null},"It presents the percentage of total income that a given society would have to forego in order to have more equal shares of income between its citizens. This measure depends on the degree of societal aversion to inequality (a theoretical parameter decided by the researcher). A higher value entails greater social utility or willingness by individuals to accept smaller incomes in exchange for a more equal distribution. 
An important feature of the Atkinson index is that it can be decomposed into within-group and between-group inequality."),(0,m.kt)("td",{parentName:"tr",align:null},"Range ",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"["),(0,m.kt)("mn",{parentName:"mrow"},"0"),(0,m.kt)("mo",{parentName:"mrow",separator:"true"},","),(0,m.kt)("mn",{parentName:"mrow"},"1"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"]")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"[0, 1]")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mopen"},"["),(0,m.kt)("span",{parentName:"span",className:"mord"},"0"),(0,m.kt)("span",{parentName:"span",className:"mpunct"},","),(0,m.kt)("span",{parentName:"span",className:"mspace",style:{marginRight:"0.16666666666666666em"}}),(0,m.kt)("span",{parentName:"span",className:"mord"},"1"),(0,m.kt)("span",{parentName:"span",className:"mclose"},"]"))))),". 0 if perfect equality. 1 means maximum inequality. In our case, it is the proportion of records for a sensitive columns\u2019 combination."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Atkinson_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Theil T Index"),(0,m.kt)("td",{parentName:"tr",align:null},'GE(1) = Theil\'s T and is more sensitive to differences at the top of the distribution. 
The Theil index is a statistic used to measure economic inequality. The Theil index measures an entropic "distance" the population is away from the "ideal" egalitarian state of everyone having the same income.'),(0,m.kt)("td",{parentName:"tr",align:null},"If everyone has the same income, then\xa0T_T\xa0equals\xa00. If one person has all the income, then\xa0T_T\xa0gives the result\xa0",(0,m.kt)("span",{parentName:"td",className:"math math-inline"},(0,m.kt)("span",{parentName:"span",className:"katex"},(0,m.kt)("span",{parentName:"span",className:"katex-mathml"},(0,m.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,m.kt)("semantics",{parentName:"math"},(0,m.kt)("mrow",{parentName:"semantics"},(0,m.kt)("mi",{parentName:"mrow"},"l"),(0,m.kt)("mi",{parentName:"mrow"},"n"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},"("),(0,m.kt)("mi",{parentName:"mrow"},"N"),(0,m.kt)("mo",{parentName:"mrow",stretchy:"false"},")")),(0,m.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"ln(N)")))),(0,m.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,m.kt)("span",{parentName:"span",className:"base"},(0,m.kt)("span",{parentName:"span",className:"strut",style:{height:"1em",verticalAlign:"-0.25em"}}),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.01968em"}},"l"),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal"},"n"),(0,m.kt)("span",{parentName:"span",className:"mopen"},"("),(0,m.kt)("span",{parentName:"span",className:"mord mathnormal",style:{marginRight:"0.10903em"}},"N"),(0,m.kt)("span",{parentName:"span",className:"mclose"},")"))))),".\xa00 means equal income and larger values mean higher level of disproportion."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Theil_index"},"Link"))),(0,m.kt)("tr",{parentName:"tbody"},(0,m.kt)("td",{parentName:"tr",align:null},"Theil L 
Index"),(0,m.kt)("td",{parentName:"tr",align:null},"GE(0) = Theil's L and is more sensitive to differences at the lower end of the distribution. Logarithm of (mean income)/(income i), over all the incomes included in the summation. It is also referred to as the mean log deviation measure. Because a transfer from a larger income to a smaller one will change the smaller income's ratio more than it changes the larger income's ratio, the transfer-principle is satisfied by this index."),(0,m.kt)("td",{parentName:"tr",align:null},"Same interpretation as Theil T Index."),(0,m.kt)("td",{parentName:"tr",align:null},(0,m.kt)("a",{parentName:"td",href:"https://en.wikipedia.org/wiki/Theil_index"},"Link"))))),(0,m.kt)("h2",{id:"mitigation"},"Mitigation"),(0,m.kt)("p",null,"It will not be a stretch to say that every real-world dataset has caveats, biases, and imbalances. Data collection is costly. Data Imbalance mitigation or de-biasing data is an area of research. There are many techniques available at various stages of ML lifecycle: during pre-processing, in-processing, and post processing. Here we outline a couple of pre-processing techniques -"),(0,m.kt)("h3",{id:"resampling"},"Resampling"),(0,m.kt)("p",null,"Resampling involves under-sampling from majority class and over-sampling from minority class. A na\xefve way to over-sample would be to duplicate records. 
Similarly, to under-sample one could remove records at random."),(0,m.kt)("ul",null,(0,m.kt)("li",{parentName:"ul"},(0,m.kt)("p",{parentName:"li"},"Caveats:"),(0,m.kt)("ol",{parentName:"li"},(0,m.kt)("li",{parentName:"ol"},"Under-sampling may remove valuable information."),(0,m.kt)("li",{parentName:"ol"},"Over-sampling may cause overfitting and poor generalization on test set.")))),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_SamplingBar.png",alt:"Bar chart undersampling and oversampling"})),(0,m.kt)("p",null,"There are smarter techniques to under-sample and over-sample in literature and implemented in Python\u2019s ",(0,m.kt)("a",{parentName:"p",href:"https://imbalanced-learn.org/stable/"},"imbalanced-learn")," package."),(0,m.kt)("p",null,"For example, we can cluster the records of the majority class, and do the under-sampling by removing records from each cluster, thus seeking to preserve information."),(0,m.kt)("p",null,"One technique of under-sampling is use of Tomek Links. Tomek links are pairs of instances that are very close but of opposite classes. Removing the instances of the majority class of each pair increases the space between the two classes, facilitating the classification process. A similar way to under-sample majority class is using Near-Miss. It first calculates the distance between all the points in the larger class with the points in the smaller class. 
When two points belonging to different classes are very close to each other in the distribution, this algorithm eliminates the datapoint of the larger class thereby trying to balance the distribution."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_TomekLinks.png",alt:"Tomek Links"})),(0,m.kt)("p",null,"In over-sampling, instead of creating exact copies of the minority class records, we can introduce small variations into those copies, creating more diverse synthetic samples. This technique is called SMOTE (Synthetic Minority Oversampling Technique). It randomly picks a point from the minority class and computes the k-nearest neighbors for this point. The synthetic points are added between the chosen point and its neighbors."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_SyntheticSamples.png",alt:"Synthetic Samples"})),(0,m.kt)("h3",{id:"reweighting"},"Reweighting"),(0,m.kt)("p",null,"There is an expected and observed value in each table cell. The weight is the value of expected / observed. Reweighting is easy to extend to multiple features with more than two groups. 
The weights are then incorporated in loss function of model training."),(0,m.kt)("p",null,(0,m.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/responsible_ai/DataBalanceAnalysis_Reweight.png",alt:"Reweighting"})))}k.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/645abc7d.c2b4ccc7.js b/assets/js/645abc7d.5cf034ec.js similarity index 98% rename from assets/js/645abc7d.c2b4ccc7.js rename to assets/js/645abc7d.5cf034ec.js index 5aadafcb8a..dfb4257d06 100644 --- a/assets/js/645abc7d.c2b4ccc7.js +++ b/assets/js/645abc7d.5cf034ec.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1836],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function i(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var l=a.createContext({}),p=function(e){var t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=p(e.components);return a.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},u=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,l=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),u=p(n),d=r,h=u["".concat(l,".").concat(d)]||u[d]||m[d]||o;return n?a.createElement(h,i(i({ref:t},c),{},{components:n})):a.createElement(h,i({ref:t},c))}));function d(e,t){var 
n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,i=new Array(o);i[0]=u;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:r,i[1]=s;for(var p=2;p"),". Assuming that you don't have active containers (including detached\nones), ",(0,o.kt)("inlineCode",{parentName:"p"},"docker system prune")," will remove this untagged image, reclaiming the\nused space."),(0,o.kt)("p",null,"If you've used an explicit version tag, then it will still exist after a new\npull, which means that you can continue using this version. If you\nused an unqualified name first and then a version-tagged one, Docker will fetch\nboth tags. Only the second fetch is fast since it points to content that\nwas already loaded. In this case, doing a ",(0,o.kt)("inlineCode",{parentName:"p"},"pull")," when there's a new version\nwill fetch the new ",(0,o.kt)("inlineCode",{parentName:"p"},"latest")," tag and change its meaning to the newer version, but\nthe older version will still be available under its own version tag."),(0,o.kt)("p",null,"Finally, if there are such version-tagged older versions that you want to get\nrid of, you can use ",(0,o.kt)("inlineCode",{parentName:"p"},"docker images")," to check the list of installed images and\ntheir tags, and ",(0,o.kt)("inlineCode",{parentName:"p"},"docker rmi :")," to remove the unwanted ones."),(0,o.kt)("h2",{id:"a-note-about-security"},"A note about security"),(0,o.kt)("p",null,"Executing code in a Docker container can be unsafe if the running user is\n",(0,o.kt)("inlineCode",{parentName:"p"},"root"),". For this reason, the SynapseML image uses a proper username instead. If\nyou still want to run as root (for instance, if you want to ",(0,o.kt)("inlineCode",{parentName:"p"},"apt install")," an\nanother ubuntu package), then you should use ",(0,o.kt)("inlineCode",{parentName:"p"},"--user root"),". 
This mode can be useful\nwhen combined with ",(0,o.kt)("inlineCode",{parentName:"p"},"docker exec")," to perform administrative work while the image\ncontinues to run as usual."),(0,o.kt)("h2",{id:"further-reading"},"Further reading"),(0,o.kt)("p",null,"This text briefly covers some of the useful things that you can do with the\nSynapseML Docker image (and other images in general). You can find much more\ndocumentation ",(0,o.kt)("a",{parentName:"p",href:"https://docs.docker.com/"},"online"),"."))}d.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1836],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return d}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function i(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var l=a.createContext({}),p=function(e){var t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=p(e.components);return a.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},u=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,l=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),u=p(n),d=r,h=u["".concat(l,".").concat(d)]||u[d]||m[d]||o;return n?a.createElement(h,i(i({ref:t},c),{},{components:n})):a.createElement(h,i({ref:t},c))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var 
o=n.length,i=new Array(o);i[0]=u;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:r,i[1]=s;for(var p=2;p"),". Assuming that you don't have active containers (including detached\nones), ",(0,o.kt)("inlineCode",{parentName:"p"},"docker system prune")," will remove this untagged image, reclaiming the\nused space."),(0,o.kt)("p",null,"If you've used an explicit version tag, then it will still exist after a new\npull, which means that you can continue using this version. If you\nused an unqualified name first and then a version-tagged one, Docker will fetch\nboth tags. Only the second fetch is fast since it points to content that\nwas already loaded. In this case, doing a ",(0,o.kt)("inlineCode",{parentName:"p"},"pull")," when there's a new version\nwill fetch the new ",(0,o.kt)("inlineCode",{parentName:"p"},"latest")," tag and change its meaning to the newer version, but\nthe older version will still be available under its own version tag."),(0,o.kt)("p",null,"Finally, if there are such version-tagged older versions that you want to get\nrid of, you can use ",(0,o.kt)("inlineCode",{parentName:"p"},"docker images")," to check the list of installed images and\ntheir tags, and ",(0,o.kt)("inlineCode",{parentName:"p"},"docker rmi :")," to remove the unwanted ones."),(0,o.kt)("h2",{id:"a-note-about-security"},"A note about security"),(0,o.kt)("p",null,"Executing code in a Docker container can be unsafe if the running user is\n",(0,o.kt)("inlineCode",{parentName:"p"},"root"),". For this reason, the SynapseML image uses a proper username instead. If\nyou still want to run as root (for instance, if you want to ",(0,o.kt)("inlineCode",{parentName:"p"},"apt install")," an\nanother ubuntu package), then you should use ",(0,o.kt)("inlineCode",{parentName:"p"},"--user root"),". 
This mode can be useful\nwhen combined with ",(0,o.kt)("inlineCode",{parentName:"p"},"docker exec")," to perform administrative work while the image\ncontinues to run as usual."),(0,o.kt)("h2",{id:"further-reading"},"Further reading"),(0,o.kt)("p",null,"This text briefly covers some of the useful things that you can do with the\nSynapseML Docker image (and other images in general). You can find much more\ndocumentation ",(0,o.kt)("a",{parentName:"p",href:"https://docs.docker.com/"},"online"),"."))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/ba9c0924.342ae59f.js b/assets/js/67e06a8b.47c38c8d.js similarity index 95% rename from assets/js/ba9c0924.342ae59f.js rename to assets/js/67e06a8b.47c38c8d.js index 7371650e00..93bd275337 100644 --- a/assets/js/ba9c0924.342ae59f.js +++ b/assets/js/67e06a8b.47c38c8d.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[7218],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return f}});var r=n(7294);function i(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function a(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var 
n=e.components,i=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(n),f=i,m=d["".concat(s,".").concat(f)]||d[f]||u[f]||a;return n?r.createElement(m,o(o({ref:t},c),{},{components:n})):r.createElement(m,o({ref:t},c))}));function f(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var a=n.length,o=new Array(a);o[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:i,o[1]=l;for(var p=2;p=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,i=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(n),f=i,m=d["".concat(s,".").concat(f)]||d[f]||u[f]||a;return n?r.createElement(m,o(o({ref:t},c),{},{components:n})):r.createElement(m,o({ref:t},c))}));function f(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var a=n.length,o=new Array(a);o[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:i,o[1]=l;for(var p=2;p=0||(a[r]=e[r]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(a[r]=e[r])}return a}var l=n.createContext({}),p=function(e){var t=n.useContext(l),r=t;return e&&(r="function"==typeof e?e(t):i(i({},t),e)),r},m=function(e){var t=p(e.components);return 
n.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},d=n.forwardRef((function(e,t){var r=e.components,a=e.mdxType,o=e.originalType,l=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),d=p(r),c=a,y=d["".concat(l,".").concat(c)]||d[c]||u[c]||o;return r?n.createElement(y,i(i({ref:t},m),{},{components:r})):n.createElement(y,i({ref:t},m))}));function c(e,t){var r=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=r.length,i=new Array(o);i[0]=d;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,i[1]=s;for(var p=2;p=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(o=0;o=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=o.createContext({}),p=function(e){var t=o.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},c=function(e){var t=p(e.components);return o.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return o.createElement(o.Fragment,{},t)}},d=o.forwardRef((function(e,t){var n=e.components,a=e.mdxType,r=e.originalType,l=e.parentName,c=i(e,["components","mdxType","originalType","parentName"]),d=p(n),h=a,m=d["".concat(l,".").concat(h)]||d[h]||u[h]||r;return n?o.createElement(m,s(s({ref:t},c),{},{components:n})):o.createElement(m,s({ref:t},c))}));function h(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var r=n.length,s=new Array(r);s[0]=d;var i={};for(var l in t)hasOwnProperty.call(t,l)&&(i[l]=t[l]);i.originalType=e,i.mdxType="string"==typeof e?e:a,s[1]=i;for(var p=2;p North American datacenters\n# eu -> European datacenters\nurl_geo_prefix = "us"\n\n# Upload a geojson with polygons in them\nr = http.post(\n 
f"https://{url_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={maps_key}",\n json={\n "type": "FeatureCollection",\n "features": [\n {\n "type": "Feature",\n "properties": {"geometryId": "test_geometry"},\n "geometry": {\n "type": "Polygon",\n "coordinates": [\n [\n [-122.14290618896484, 47.67856488312544],\n [-122.03956604003906, 47.67856488312544],\n [-122.03956604003906, 47.7483271435476],\n [-122.14290618896484, 47.7483271435476],\n [-122.14290618896484, 47.67856488312544],\n ]\n ],\n },\n }\n ],\n },\n)\n\nlong_running_operation = r.headers.get("location")\ntime.sleep(30) # Sometimes this may take upto 30 seconds\nprint(f"Status Code: {r.status_code}, Long Running Operation: {long_running_operation}")\n# This Operation completes in approximately 5 ~ 15 seconds\nuser_data_id_resource_url = json.loads(\n http.get(f"{long_running_operation}&subscription-key={maps_key}").content\n)["resourceLocation"]\nuser_data_id = json.loads(\n http.get(f"{user_data_id_resource_url}&subscription-key={maps_key}").content\n)["udid"]\n')),(0,r.kt)("h3",{id:"use-the-function-to-check-if-point-is-in-polygon"},"Use the function to check if point is in polygon"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'# Create a dataframe that\'s tied to it\'s column names\ndf = spark.createDataFrame(\n (\n (\n (48.858561, 2.294911),\n (47.639765, -122.127896),\n (47.621028, -122.348170),\n (47.734012, -122.102737),\n )\n ),\n StructType([StructField("lat", DoubleType()), StructField("lon", DoubleType())]),\n)\n\n# Run the Azure Maps geocoder to enhance the data with location data\ncheck_point_in_polygon = (\n CheckPointInPolygon()\n .setSubscriptionKey(maps_key)\n .setGeography(url_geo_prefix)\n .setUserDataIdentifier(user_data_id)\n .setLatitudeCol("lat")\n .setLongitudeCol("lon")\n .setOutputCol("output")\n)\n\n# Show the results of your text query in a table format\ndisplay(\n 
check_point_in_polygon.transform(df)\n .select(\n col("*"),\n col("output.result.pointInPolygons").alias("In Polygon"),\n col("output.result.intersectingGeometries").alias("Intersecting Polygons"),\n )\n .drop("output")\n)\n')),(0,r.kt)("h3",{id:"cleanup"},"Cleanup"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'res = http.delete(\n f"https://{url_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={maps_key}"\n)\n')))}h.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/6b9bdd6f.c75c08c6.js b/assets/js/6b9bdd6f.c75c08c6.js deleted file mode 100644 index 4a8814e01a..0000000000 --- a/assets/js/6b9bdd6f.c75c08c6.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1105],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return h}});var o=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function r(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);t&&(o=o.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,o)}return n}function s(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(o=0;o=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=o.createContext({}),p=function(e){var t=o.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},c=function(e){var t=p(e.components);return o.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return o.createElement(o.Fragment,{},t)}},d=o.forwardRef((function(e,t){var 
n=e.components,a=e.mdxType,r=e.originalType,l=e.parentName,c=i(e,["components","mdxType","originalType","parentName"]),d=p(n),h=a,m=d["".concat(l,".").concat(h)]||d[h]||u[h]||r;return n?o.createElement(m,s(s({ref:t},c),{},{components:n})):o.createElement(m,s({ref:t},c))}));function h(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var r=n.length,s=new Array(r);s[0]=d;var i={};for(var l in t)hasOwnProperty.call(t,l)&&(i[l]=t[l]);i.originalType=e,i.mdxType="string"==typeof e?e:a,s[1]=i;for(var p=2;p North American datacenters\n# eu -> European datacenters\nurl_geo_prefix = "us"\n\n# Upload a geojson with polygons in them\nr = http.post(\n f"https://{url_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={maps_key}",\n json={\n "type": "FeatureCollection",\n "features": [\n {\n "type": "Feature",\n "properties": {"geometryId": "test_geometry"},\n "geometry": {\n "type": "Polygon",\n "coordinates": [\n [\n [-122.14290618896484, 47.67856488312544],\n [-122.03956604003906, 47.67856488312544],\n [-122.03956604003906, 47.7483271435476],\n [-122.14290618896484, 47.7483271435476],\n [-122.14290618896484, 47.67856488312544],\n ]\n ],\n },\n }\n ],\n },\n)\n\nlong_running_operation = r.headers.get("location")\ntime.sleep(30) # Sometimes this may take upto 30 seconds\nprint(f"Status Code: {r.status_code}, Long Running Operation: {long_running_operation}")\n# This Operation completes in approximately 5 ~ 15 seconds\nuser_data_id_resource_url = json.loads(\n http.get(f"{long_running_operation}&subscription-key={maps_key}").content\n)["resourceLocation"]\nuser_data_id = json.loads(\n http.get(f"{user_data_id_resource_url}&subscription-key={maps_key}").content\n)["udid"]\n')),(0,r.kt)("h3",{id:"use-the-function-to-check-if-point-is-in-polygon"},"Use the function to check if point is in polygon"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'# Create a dataframe that\'s tied to it\'s 
column names\ndf = spark.createDataFrame(\n (\n (\n (48.858561, 2.294911),\n (47.639765, -122.127896),\n (47.621028, -122.348170),\n (47.734012, -122.102737),\n )\n ),\n StructType([StructField("lat", DoubleType()), StructField("lon", DoubleType())]),\n)\n\n# Run the Azure Maps geocoder to enhance the data with location data\ncheck_point_in_polygon = (\n CheckPointInPolygon()\n .setSubscriptionKey(maps_key)\n .setGeography(url_geo_prefix)\n .setUserDataIdentifier(user_data_id)\n .setLatitudeCol("lat")\n .setLongitudeCol("lon")\n .setOutputCol("output")\n)\n\n# Show the results of your text query in a table format\ndisplay(\n check_point_in_polygon.transform(df)\n .select(\n col("*"),\n col("output.result.pointInPolygons").alias("In Polygon"),\n col("output.result.intersectingGeometries").alias("Intersecting Polygons"),\n )\n .drop("output")\n)\n')),(0,r.kt)("h3",{id:"cleanup"},"Cleanup"),(0,r.kt)("pre",null,(0,r.kt)("code",{parentName:"pre",className:"language-python"},'res = http.delete(\n f"https://{url_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={maps_key}"\n)\n')))}h.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/4bbbdfcf.1b1b680f.js b/assets/js/6e159789.3cd6c4ef.js similarity index 98% rename from assets/js/4bbbdfcf.1b1b680f.js rename to assets/js/6e159789.3cd6c4ef.js index a03ed3dde0..79554e8447 100644 --- a/assets/js/4bbbdfcf.1b1b680f.js +++ b/assets/js/6e159789.3cd6c4ef.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6413],{3905:function(e,t,n){n.d(t,{Zo:function(){return p},kt:function(){return d}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return 
Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function i(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},u=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,o=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),u=c(n),d=a,h=u["".concat(l,".").concat(d)]||u[d]||m[d]||o;return n?r.createElement(h,i(i({ref:t},p),{},{components:n})):r.createElement(h,i({ref:t},p))}));function d(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,i=new Array(o);i[0]=u;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,i[1]=s;for(var c=2;c= lit(inferenceStartTime))\n .toPandas()\n)\n\nrdf\n')),(0,o.kt)("p",null,"Let's now format the ",(0,o.kt)("inlineCode",{parentName:"p"},"contributors")," column that stores the contribution score from each sensor to the detected anomalies. 
The next cell formats this data, and splits the contribution score of each sensor into its own column."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'def parse(x):\n if len(x) > 0:\n return dict([item[:2] for item in x])\n else:\n return {"sensor_1": 0, "sensor_2": 0, "sensor_3": 0}\n\n\nrdf["contributors"] = rdf["interpretation"].apply(parse)\nrdf = pd.concat(\n [\n rdf.drop(["contributors"], axis=1),\n pd.json_normalize(rdf["contributors"]).rename(\n columns={\n "sensor_1": "series_1",\n "sensor_2": "series_2",\n "sensor_3": "series_3",\n }\n ),\n ],\n axis=1,\n)\nrdf\n')),(0,o.kt)("p",null,"Great! We now have the contribution scores of sensors 1, 2, and 3 in the ",(0,o.kt)("inlineCode",{parentName:"p"},"series_0"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"series_1"),", and ",(0,o.kt)("inlineCode",{parentName:"p"},"series_2")," columns respectively. "),(0,o.kt)("p",null,"Let's run the next cell to plot the results. The ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity")," parameter in the first line specifies the minimum severity of the anomalies to be plotted. 
"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'minSeverity = 0.1\n\n\n####### Main Figure #######\nplt.figure(figsize=(23, 8))\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_1"],\n color="tab:orange",\n line,\n linewidth=2,\n label="sensor_1",\n)\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_2"],\n color="tab:green",\n line,\n linewidth=2,\n label="sensor_2",\n)\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_3"],\n color="tab:blue",\n line,\n linewidth=2,\n label="sensor_3",\n)\nplt.grid(axis="y")\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.legend()\n\nanoms = list(rdf["severity"] >= minSeverity)\n_, _, ymin, ymax = plt.axis()\nplt.vlines(np.where(anoms), ymin=ymin, ymax=ymax, color="r", alpha=0.8)\n\nplt.legend()\nplt.title(\n "A plot of the values from the three sensors with the detected anomalies highlighted in red."\n)\nplt.show()\n\n####### Severity Figure #######\nplt.figure(figsize=(23, 1))\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.plot(\n rdf["timestamp"],\n rdf["severity"],\n color="black",\n line,\n linewidth=2,\n label="Severity score",\n)\nplt.plot(\n rdf["timestamp"],\n [minSeverity] * len(rdf["severity"]),\n color="red",\n line,\n linewidth=1,\n label="minSeverity",\n)\nplt.grid(axis="y")\nplt.legend()\nplt.ylim([0, 1])\nplt.title("Severity of the detected anomalies")\nplt.show()\n\n####### Contributors Figure #######\nplt.figure(figsize=(23, 1))\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.bar(\n rdf["timestamp"], rdf["series_1"], width=2, color="tab:orange", label="sensor_1"\n)\nplt.bar(\n rdf["timestamp"],\n rdf["series_2"],\n width=2,\n color="tab:green",\n label="sensor_2",\n bottom=rdf["series_1"],\n)\nplt.bar(\n rdf["timestamp"],\n rdf["series_3"],\n width=2,\n color="tab:blue",\n label="sensor_3",\n bottom=rdf["series_1"] + rdf["series_2"],\n)\nplt.grid(axis="y")\nplt.legend()\nplt.ylim([0, 
1])\nplt.title("The contribution of each sensor to the detected anomaly")\nplt.show()\n')),(0,o.kt)("img",{width:"1300",src:"https://mmlspark.blob.core.windows.net/graphics/multivariate-anomaly-detection-plot.png"}),(0,o.kt)("p",null,"The plots show the raw data from the sensors (inside the inference window) in orange, green, and blue. The red vertical lines in the first figure show the detected anomalies that have a severity greater than or equal to ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity"),". "),(0,o.kt)("p",null,"The second plot shows the severity score of all the detected anomalies, with the ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity")," threshold shown in the dotted red line."),(0,o.kt)("p",null,"Finally, the last plot shows the contribution of the data from each sensor to the detected anomalies. It helps us diagnose and understand the most likely cause of each anomaly."))}d.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[3611],{3905:function(e,t,n){n.d(t,{Zo:function(){return p},kt:function(){return d}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function i(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return 
r.createElement(r.Fragment,{},t)}},u=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,o=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),u=c(n),d=a,h=u["".concat(l,".").concat(d)]||u[d]||m[d]||o;return n?r.createElement(h,i(i({ref:t},p),{},{components:n})):r.createElement(h,i({ref:t},p))}));function d(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,i=new Array(o);i[0]=u;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,i[1]=s;for(var c=2;c= lit(inferenceStartTime))\n .toPandas()\n)\n\nrdf\n')),(0,o.kt)("p",null,"Let's now format the ",(0,o.kt)("inlineCode",{parentName:"p"},"contributors")," column that stores the contribution score from each sensor to the detected anomalies. The next cell formats this data, and splits the contribution score of each sensor into its own column."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'def parse(x):\n if len(x) > 0:\n return dict([item[:2] for item in x])\n else:\n return {"sensor_1": 0, "sensor_2": 0, "sensor_3": 0}\n\n\nrdf["contributors"] = rdf["interpretation"].apply(parse)\nrdf = pd.concat(\n [\n rdf.drop(["contributors"], axis=1),\n pd.json_normalize(rdf["contributors"]).rename(\n columns={\n "sensor_1": "series_1",\n "sensor_2": "series_2",\n "sensor_3": "series_3",\n }\n ),\n ],\n axis=1,\n)\nrdf\n')),(0,o.kt)("p",null,"Great! We now have the contribution scores of sensors 1, 2, and 3 in the ",(0,o.kt)("inlineCode",{parentName:"p"},"series_0"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"series_1"),", and ",(0,o.kt)("inlineCode",{parentName:"p"},"series_2")," columns respectively. "),(0,o.kt)("p",null,"Let's run the next cell to plot the results. The ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity")," parameter in the first line specifies the minimum severity of the anomalies to be plotted. 
"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'minSeverity = 0.1\n\n\n####### Main Figure #######\nplt.figure(figsize=(23, 8))\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_1"],\n color="tab:orange",\n line,\n linewidth=2,\n label="sensor_1",\n)\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_2"],\n color="tab:green",\n line,\n linewidth=2,\n label="sensor_2",\n)\nplt.plot(\n rdf["timestamp"],\n rdf["sensor_3"],\n color="tab:blue",\n line,\n linewidth=2,\n label="sensor_3",\n)\nplt.grid(axis="y")\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.legend()\n\nanoms = list(rdf["severity"] >= minSeverity)\n_, _, ymin, ymax = plt.axis()\nplt.vlines(np.where(anoms), ymin=ymin, ymax=ymax, color="r", alpha=0.8)\n\nplt.legend()\nplt.title(\n "A plot of the values from the three sensors with the detected anomalies highlighted in red."\n)\nplt.show()\n\n####### Severity Figure #######\nplt.figure(figsize=(23, 1))\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.plot(\n rdf["timestamp"],\n rdf["severity"],\n color="black",\n line,\n linewidth=2,\n label="Severity score",\n)\nplt.plot(\n rdf["timestamp"],\n [minSeverity] * len(rdf["severity"]),\n color="red",\n line,\n linewidth=1,\n label="minSeverity",\n)\nplt.grid(axis="y")\nplt.legend()\nplt.ylim([0, 1])\nplt.title("Severity of the detected anomalies")\nplt.show()\n\n####### Contributors Figure #######\nplt.figure(figsize=(23, 1))\nplt.tick_params(axis="x", which="both", bottom=False, labelbottom=False)\nplt.bar(\n rdf["timestamp"], rdf["series_1"], width=2, color="tab:orange", label="sensor_1"\n)\nplt.bar(\n rdf["timestamp"],\n rdf["series_2"],\n width=2,\n color="tab:green",\n label="sensor_2",\n bottom=rdf["series_1"],\n)\nplt.bar(\n rdf["timestamp"],\n rdf["series_3"],\n width=2,\n color="tab:blue",\n label="sensor_3",\n bottom=rdf["series_1"] + rdf["series_2"],\n)\nplt.grid(axis="y")\nplt.legend()\nplt.ylim([0, 
1])\nplt.title("The contribution of each sensor to the detected anomaly")\nplt.show()\n')),(0,o.kt)("img",{width:"1300",src:"https://mmlspark.blob.core.windows.net/graphics/multivariate-anomaly-detection-plot.png"}),(0,o.kt)("p",null,"The plots show the raw data from the sensors (inside the inference window) in orange, green, and blue. The red vertical lines in the first figure show the detected anomalies that have a severity greater than or equal to ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity"),". "),(0,o.kt)("p",null,"The second plot shows the severity score of all the detected anomalies, with the ",(0,o.kt)("inlineCode",{parentName:"p"},"minSeverity")," threshold shown in the dotted red line."),(0,o.kt)("p",null,"Finally, the last plot shows the contribution of the data from each sensor to the detected anomalies. It helps us diagnose and understand the most likely cause of each anomaly."))}d.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/6f479459.408e979d.js b/assets/js/6f479459.408e979d.js new file mode 100644 index 0000000000..912f58f4ae --- /dev/null +++ b/assets/js/6f479459.408e979d.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[4963],{3905:function(e,n,t){t.d(n,{Zo:function(){return m},kt:function(){return g}});var r=t(7294);function a(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function l(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);n&&(r=r.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,r)}return t}function i(e){for(var n=1;n=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var s=r.createContext({}),p=function(e){var n=r.useContext(s),t=n;return e&&(t="function"==typeof 
e?e(n):i(i({},n),e)),t},m=function(e){var n=p(e.components);return r.createElement(s.Provider,{value:n},e.children)},u={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},c=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,s=e.parentName,m=o(e,["components","mdxType","originalType","parentName"]),c=p(t),g=a,d=c["".concat(s,".").concat(g)]||c[g]||u[g]||l;return t?r.createElement(d,i(i({ref:n},m),{},{components:t})):r.createElement(d,i({ref:n},m))}));function g(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,i=new Array(l);i[0]=c;var o={};for(var s in n)hasOwnProperty.call(n,s)&&(o[s]=n[s]);o.originalType=e,o.mdxType="string"==typeof e?e:a,i[1]=o;for(var p=2;p green_value:\n for (x, y) in sp:\n image_array[y, x, 1] = 255\n image_array[y, x, 3] = 200\n plt.clf()\n plt.imshow(image_array)\n plt.show()\n')),(0,l.kt)("p",null,"Create a dataframe for a testing image, and use the ResNet50 ONNX model to infer the image."),(0,l.kt)("p",null,'The result shows 39.6% probability of "violin" (889), and 38.4% probability of "upright piano" (881).'),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.io import *\n\nimage_df = spark.read.image().load(\n "wasbs://publicwasb@mmlspark.blob.core.windows.net/explainers/images/david-lusvardi-dWcUncxocQY-unsplash.jpg"\n)\ndisplay(image_df)\n\n# Rotate the image array from BGR into RGB channels for visualization later.\nrow = image_df.select(\n "image.height", "image.width", "image.nChannels", "image.data"\n).head()\nlocals().update(row.asDict())\nrgb_image_array = rotate_color_channel(data, height, width, nChannels)\n\n# Download the ONNX model\nmodelPayload = downloadBytes(\n "https://mmlspark.blob.core.windows.net/publicwasb/ONNXModels/resnet50-v2-7.onnx"\n)\n\nfeaturizer = (\n ImageTransformer(inputCol="image", outputCol="features")\n .resize(224, True)\n .centerCrop(224, 224)\n 
.normalize(\n mean=[0.485, 0.456, 0.406],\n std=[0.229, 0.224, 0.225],\n color_scale_factor=1 / 255,\n )\n .setTensorElementType(FloatType())\n)\n\nonnx = (\n ONNXModel()\n .setModelPayload(modelPayload)\n .setFeedDict({"data": "features"})\n .setFetchDict({"rawPrediction": "resnetv24_dense0_fwd"})\n .setSoftMaxDict({"rawPrediction": "probability"})\n .setMiniBatchSize(1)\n)\n\nmodel = Pipeline(stages=[featurizer, onnx]).fit(image_df)\n')),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'predicted = (\n model.transform(image_df)\n .withColumn("top2pred", arg_top_k(col("probability"), lit(2)))\n .withColumn("top2prob", vec_slice(col("probability"), col("top2pred")))\n)\n\ndisplay(predicted.select("top2pred", "top2prob"))\n')),(0,l.kt)("p",null,"First we use the LIME image explainer to explain the model's top 2 classes' probabilities."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'lime = (\n ImageLIME()\n .setModel(model)\n .setOutputCol("weights")\n .setInputCol("image")\n .setCellSize(150.0)\n .setModifier(50.0)\n .setNumSamples(500)\n .setTargetCol("probability")\n .setTargetClassesCol("top2pred")\n .setSamplingFraction(0.7)\n)\n\nlime_result = (\n lime.transform(predicted)\n .withColumn("weights_violin", col("weights").getItem(0))\n .withColumn("weights_piano", col("weights").getItem(1))\n .cache()\n)\n\ndisplay(lime_result.select(col("weights_violin"), col("weights_piano")))\nlime_row = lime_result.head()\n')),(0,l.kt)("p",null,'We plot the LIME weights for "violin" output and "upright piano" output.'),(0,l.kt)("p",null,"Green areas are superpixels with LIME weights above 95 percentile."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'plot_superpixels(\n rgb_image_array,\n lime_row["superpixels"]["clusters"],\n list(lime_row["weights_violin"]),\n 95,\n)\nplot_superpixels(\n rgb_image_array,\n lime_row["superpixels"]["clusters"],\n 
list(lime_row["weights_piano"]),\n 95,\n)\n')),(0,l.kt)("p",null,"Your results will look like:"),(0,l.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/explainers/image-lime-20210811.png"}),(0,l.kt)("p",null,"Then we use the Kernel SHAP image explainer to explain the model's top 2 classes' probabilities."),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'shap = (\n ImageSHAP()\n .setModel(model)\n .setOutputCol("shaps")\n .setSuperpixelCol("superpixels")\n .setInputCol("image")\n .setCellSize(150.0)\n .setModifier(50.0)\n .setNumSamples(500)\n .setTargetCol("probability")\n .setTargetClassesCol("top2pred")\n)\n\nshap_result = (\n shap.transform(predicted)\n .withColumn("shaps_violin", col("shaps").getItem(0))\n .withColumn("shaps_piano", col("shaps").getItem(1))\n .cache()\n)\n\ndisplay(shap_result.select(col("shaps_violin"), col("shaps_piano")))\nshap_row = shap_result.head()\n')),(0,l.kt)("p",null,'We plot the SHAP values for "piano" output and "cell" output.'),(0,l.kt)("p",null,"Green areas are superpixels with SHAP values above 95 percentile."),(0,l.kt)("blockquote",null,(0,l.kt)("p",{parentName:"blockquote"},"Notice that we drop the base value from the SHAP output before rendering the superpixels. 
The base value is the model output for the background (all black) image.")),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-python"},'plot_superpixels(\n rgb_image_array,\n shap_row["superpixels"]["clusters"],\n list(shap_row["shaps_violin"][1:]),\n 95,\n)\nplot_superpixels(\n rgb_image_array,\n shap_row["superpixels"]["clusters"],\n list(shap_row["shaps_piano"][1:]),\n 95,\n)\n')),(0,l.kt)("p",null,"Your results will look like:"),(0,l.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/explainers/image-shap-20210811.png"}))}g.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/6f8cd013.b1c49d50.js b/assets/js/6f8cd013.b1c49d50.js new file mode 100644 index 0000000000..77bb17455a --- /dev/null +++ b/assets/js/6f8cd013.b1c49d50.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[8078],{3905:function(e,n,t){t.d(n,{Zo:function(){return p},kt:function(){return m}});var r=t(7294);function o(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function a(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);n&&(r=r.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,r)}return t}function s(e){for(var n=1;n=0||(o[t]=e[t]);return o}(e,n);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(o[t]=e[t])}return o}var l=r.createContext({}),c=function(e){var n=r.useContext(l),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},p=function(e){var n=c(e.components);return r.createElement(l.Provider,{value:n},e.children)},u={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},d=r.forwardRef((function(e,n){var 
t=e.components,o=e.mdxType,a=e.originalType,l=e.parentName,p=i(e,["components","mdxType","originalType","parentName"]),d=c(t),m=o,h=d["".concat(l,".").concat(m)]||d[m]||u[m]||a;return t?r.createElement(h,s(s({ref:n},p),{},{components:t})):r.createElement(h,s({ref:n},p))}));function m(e,n){var t=arguments,o=n&&n.mdxType;if("string"==typeof e||o){var a=t.length,s=new Array(a);s[0]=d;var i={};for(var l in n)hasOwnProperty.call(n,l)&&(i[l]=n[l]);i.originalType=e,i.mdxType="string"==typeof e?e:o,s[1]=i;for(var c=2;c=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},m=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,o=e.originalType,s=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),d=p(n),c=a,f=d["".concat(s,".").concat(c)]||d[c]||u[c]||o;return n?r.createElement(f,l(l({ref:t},m),{},{components:n})):r.createElement(f,l({ref:t},m))}));function c(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var o=n.length,l=new Array(o);l[0]=d;var i={};for(var s in t)hasOwnProperty.call(t,s)&&(i[s]=t[s]);i.originalType=e,i.mdxType="string"==typeof e?e:a,l[1]=i;for(var p=2;p bytes:\n from onnxmltools.convert import convert_lightgbm\n from onnxconverter_common.data_types import FloatTensorType\n\n initial_types = [("input", FloatTensorType([-1, input_size]))]\n onnx_model = convert_lightgbm(\n lgbm_model, initial_types=initial_types, target_opset=9\n )\n return onnx_model.SerializeToString()\n\n\nbooster_model_str = model.getLightGBMBooster().modelStr().get()\nbooster = 
lgb.Booster(model_str=booster_model_str)\nmodel_payload_ml = convertModel(booster, len(feature_cols))\n')),(0,o.kt)("p",null,"After conversion, load the ONNX payload into an ",(0,o.kt)("inlineCode",{parentName:"p"},"ONNXModel")," and inspect the model inputs and outputs:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.onnx import ONNXModel\n\nonnx_ml = ONNXModel().setModelPayload(model_payload_ml)\n\nprint("Model inputs:" + str(onnx_ml.getModelInputs()))\nprint("Model outputs:" + str(onnx_ml.getModelOutputs()))\n')),(0,o.kt)("p",null,"Map the model input to the input dataframe's column name (FeedDict), and map the output dataframe's column names to the model outputs (FetchDict)."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'onnx_ml = (\n onnx_ml.setDeviceType("CPU")\n .setFeedDict({"input": "features"})\n .setFetchDict({"probability": "probabilities", "prediction": "label"})\n .setMiniBatchSize(5000)\n)\n')),(0,o.kt)("h2",{id:"use-the-model-for-inference"},"Use the model for inference"),(0,o.kt)("p",null,"To perform inference with the model, the following code creates testing data and transforms the data through the ONNX model."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-python"},'from pyspark.ml.feature import VectorAssembler\nimport pandas as pd\nimport numpy as np\n\nn = 1000 * 1000\nm = 95\ntest = np.random.rand(n, m)\ntestPdf = pd.DataFrame(test)\ncols = list(map(str, testPdf.columns))\ntestDf = spark.createDataFrame(testPdf)\ntestDf = testDf.union(testDf).repartition(200)\ntestDf = (\n VectorAssembler()\n .setInputCols(cols)\n .setOutputCol("features")\n .transform(testDf)\n .drop(*cols)\n .cache()\n)\n\ndisplay(onnx_ml.transform(testDf))\n')),(0,o.kt)("p",null,"The output should look similar to the following table, though the values and number of rows may 
differ:"),(0,o.kt)("table",null,(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:null},"Index"),(0,o.kt)("th",{parentName:"tr",align:null},"Features"),(0,o.kt)("th",{parentName:"tr",align:null},"Prediction"),(0,o.kt)("th",{parentName:"tr",align:null},"Probability"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"1"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"type":1,"values":[0.105...')),(0,o.kt)("td",{parentName:"tr",align:null},"0"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"0":0.835...'))),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:null},"2"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"type":1,"values":[0.814...')),(0,o.kt)("td",{parentName:"tr",align:null},"0"),(0,o.kt)("td",{parentName:"tr",align:null},(0,o.kt)("inlineCode",{parentName:"td"},'"{"0":0.658...'))))))}c.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/707d2a35.0cca7dff.js b/assets/js/707d2a35.0cca7dff.js new file mode 100644 index 0000000000..a862757335 --- /dev/null +++ b/assets/js/707d2a35.0cca7dff.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6633],{3905:function(e,r,t){t.d(r,{Zo:function(){return u},kt:function(){return h}});var n=t(7294);function a(e,r,t){return r in e?Object.defineProperty(e,r,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[r]=t,e}function s(e,r){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);r&&(n=n.filter((function(r){return Object.getOwnPropertyDescriptor(e,r).enumerable}))),t.push.apply(t,n)}return t}function i(e){for(var r=1;r=0||(a[t]=e[t]);return a}(e,r);if(Object.getOwnPropertySymbols){var 
s=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var c=n.createContext({}),l=function(e){var r=n.useContext(c),t=r;return e&&(t="function"==typeof e?e(r):i(i({},r),e)),t},u=function(e){var r=l(e.components);return n.createElement(c.Provider,{value:r},e.children)},p={inlineCode:"code",wrapper:function(e){var r=e.children;return n.createElement(n.Fragment,{},r)}},m=n.forwardRef((function(e,r){var t=e.components,a=e.mdxType,s=e.originalType,c=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),m=l(t),h=a,g=m["".concat(c,".").concat(h)]||m[h]||p[h]||s;return t?n.createElement(g,i(i({ref:r},u),{},{components:t})):n.createElement(g,i({ref:r},u))}));function h(e,r){var t=arguments,a=r&&r.mdxType;if("string"==typeof e||a){var s=t.length,i=new Array(s);i[0]=m;var o={};for(var c in r)hasOwnProperty.call(r,c)&&(o[c]=r[c]);o.originalType=e,o.mdxType="string"==typeof e?e:a,i[1]=o;for(var l=2;l"))\n .withColumn("Tags", split(col("Tags"), ",").cast("array"))\n .limit(25)\n)\n')),(0,s.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/CognitiveSearchHyperscale/MetArtworkSamples.png",width:"800"}),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import AnalyzeImage\nfrom synapse.ml.stages import SelectColumns\n\n# define pipeline\ndescribeImage = (\n AnalyzeImage()\n .setSubscriptionKey(cognitive_key)\n .setLocation(cognitive_loc)\n .setImageUrlCol("PrimaryImageUrl")\n .setOutputCol("RawImageDescription")\n .setErrorCol("Errors")\n .setVisualFeatures(\n ["Categories", "Description", "Faces", "ImageType", "Color", "Adult"]\n )\n .setConcurrency(5)\n)\n\ndf2 = (\n describeImage.transform(data)\n .select("*", "RawImageDescription.*")\n .drop("Errors", 
"RawImageDescription")\n)\n')),(0,s.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/CognitiveSearchHyperscale/MetArtworksProcessed.png",width:"800"}),(0,s.kt)("p",null,"Before writing the results to a Search Index, you must define a schema which must specify the name, type, and attributes of each field in your index. Refer ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index"},"Create a basic index in Azure Search")," for more information."),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ndf2.writeToAzureSearch(\n subscriptionKey=azure_search_key,\n actionCol="searchAction",\n serviceName=search_service,\n indexName=search_index,\n keyCol="ObjectID",\n)\n')),(0,s.kt)("p",null,"The Search Index can be queried using the ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/rest/api/searchservice/"},"Azure Search REST API")," by sending GET or POST requests and specifying query parameters that give the criteria for selecting matching documents. 
For more information on querying refer ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/rest/api/searchservice/Search-Documents"},"Query your Azure Search index using the REST API")),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'url = "https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06".format(\n search_service, search_index\n)\nrequests.post(\n url, json={"search": "Glass"}, headers={"api-key": azure_search_key}\n).json()\n')))}h.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/0f113696.cb6254b0.js b/assets/js/70b306ba.449ef253.js similarity index 97% rename from assets/js/0f113696.cb6254b0.js rename to assets/js/70b306ba.449ef253.js index 32cb66c3b5..fd9cdd3aae 100644 --- a/assets/js/0f113696.cb6254b0.js +++ b/assets/js/70b306ba.449ef253.js @@ -1 +1 @@ -"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6030],{3905:function(e,t,a){a.d(t,{Zo:function(){return m},kt:function(){return c}});var n=a(7294);function r(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function o(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function i(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var p=n.createContext({}),s=function(e){var t=n.useContext(p),a=t;return e&&(a="function"==typeof e?e(t):i(i({},t),e)),a},m=function(e){var t=s(e.components);return n.createElement(p.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var 
a=e.components,r=e.mdxType,o=e.originalType,p=e.parentName,m=l(e,["components","mdxType","originalType","parentName"]),u=s(a),c=r,N=u["".concat(p,".").concat(c)]||u[c]||d[c]||o;return a?n.createElement(N,i(i({ref:t},m),{},{components:a})):n.createElement(N,i({ref:t},m))}));function c(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=a.length,i=new Array(o);i[0]=u;var l={};for(var p in t)hasOwnProperty.call(t,p)&&(l[p]=t[p]);l.originalType=e,l.mdxType="string"==typeof e?e:r,i[1]=l;for(var s=2;sModel Slicing',id:"model-slicing",level:2},{value:"Example",id:"example",level:2}],u={toc:d};function c(e){var t=e.components,a=(0,r.Z)(e,i);return(0,o.kt)("wrapper",(0,n.Z)({},u,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"onnx-model-inferencing-on-spark"},"ONNX model inferencing on Spark"),(0,o.kt)("h2",{id:"onnx"},"ONNX"),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://onnx.ai/"},"ONNX")," is an open format to represent both deep learning and traditional machine learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools and choose the combination that is best for them."),(0,o.kt)("p",null,"SynapseML now includes a Spark transformer to bring a trained ONNX model to Apache Spark, so you can run inference on your data with Spark's large-scale data processing power."),(0,o.kt)("h2",{id:"onnxhub"},"ONNXHub"),(0,o.kt)("p",null,'Although you can use your own local model, many popular existing models are provided through the ONNXHub. You can use\na model\'s ONNXHub name (for example "MNIST") and download the bytes of the model, and some metadata about the model. 
You can also list\navailable models, optionally filtering by name or tags.'),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},' // List models\n val hub = new ONNXHub()\n val models = hub.listModels(model = Some("mnist"), tags = Some(Seq("vision")))\n\n // Retrieve and transform with a model\n val info = hub.getModelInfo("resnet50")\n val bytes = hub.load(name)\n val model = new ONNXModel()\n .setModelPayload(bytes)\n .setFeedDict(Map("data" -> "features"))\n .setFetchDict(Map("rawPrediction" -> "resnetv24_dense0_fwd"))\n .setSoftMaxDict(Map("rawPrediction" -> "probability"))\n .setArgMaxDict(Map("rawPrediction" -> "prediction"))\n .setMiniBatchSize(1)\n\n val (probability, _) = model.transform({YOUR_DATAFRAME})\n .select("probability", "prediction")\n .as[(Vector, Double)]\n .head\n')),(0,o.kt)("h2",{id:"usage"},"Usage"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Create a ",(0,o.kt)("inlineCode",{parentName:"p"},"com.microsoft.azure.synapse.ml.onnx.ONNXModel")," object and use ",(0,o.kt)("inlineCode",{parentName:"p"},"setModelLocation")," or ",(0,o.kt)("inlineCode",{parentName:"p"},"setModelPayload")," to load the ONNX model."),(0,o.kt)("p",{parentName:"li"},"For example:"),(0,o.kt)("pre",{parentName:"li"},(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'val onnx = new ONNXModel().setModelLocation("/path/to/model.onnx")\n')),(0,o.kt)("p",{parentName:"li"},"Optionally, create the model from the ONNXHub."),(0,o.kt)("pre",{parentName:"li"},(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'val onnx = new ONNXModel().setModelPayload(hub.load("MNIST"))\n'))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Use ONNX visualization tool (for example, ",(0,o.kt)("a",{parentName:"p",href:"https://netron.app/"},"Netron"),") to inspect the ONNX model's input and output 
nodes."),(0,o.kt)("p",{parentName:"li"},(0,o.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/ONNXModelInputsOutputs.png",alt:"Screenshot that illustrates an ONNX model's input and output nodes"}))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Set the parameters properly to the ",(0,o.kt)("inlineCode",{parentName:"p"},"ONNXModel")," object."),(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("inlineCode",{parentName:"p"},"com.microsoft.azure.synapse.ml.onnx.ONNXModel")," class provides a set of parameters to control the behavior of the inference."),(0,o.kt)("table",{parentName:"li"},(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:"left"},"Parameter"),(0,o.kt)("th",{parentName:"tr",align:"left"},"Description"),(0,o.kt)("th",{parentName:"tr",align:"left"},"Default Value"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"feedDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Map the ONNX model's expected input node names to the input DataFrame's column names. Make sure the input DataFrame's column schema matches with the corresponding input's shape of the ONNX model. For example, an image classification model may have an input node of shape ",(0,o.kt)("inlineCode",{parentName:"td"},"[1, 3, 224, 224]")," with type Float. It's assumed that the first dimension (1) is the batch size. Then the input DataFrame's corresponding column's type should be ",(0,o.kt)("inlineCode",{parentName:"td"},"ArrayType(ArrayType(ArrayType(FloatType)))"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"fetchDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Map the output DataFrame's column names to the ONNX model's output node names. 
NOTE: If you put outputs that are intermediate in the model, transform will automatically slice at those outputs. See the section on ",(0,o.kt)("a",{parentName:"td",href:"#slicing"},"Slicing"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"miniBatcher"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify the MiniBatcher to use."),(0,o.kt)("td",{parentName:"tr",align:"left"},(0,o.kt)("inlineCode",{parentName:"td"},"FixedMiniBatchTransformer")," with batch size 10")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"softMaxDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"A map between output DataFrame columns, where the value column will be computed from taking the softmax of the key column. If the 'rawPrediction' column contains logits outputs, then one can set softMaxDict to ",(0,o.kt)("inlineCode",{parentName:"td"},'Map("rawPrediction" -> "probability")')," to obtain the probability outputs."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"argMaxDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"A map between output DataFrame columns, where the value column will be computed from taking the argmax of the key column. This parameter can be used to convert probability or logits output to the predicted label."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"deviceType"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify a device type the model inference runs on. Supported types are: CPU or CUDA. 
If not specified, auto detection will be used."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"optimizationLevel"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify the ",(0,o.kt)("a",{parentName:"td",href:"https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html#graph-optimization-levels"},"optimization level")," for the ONNX graph optimizations. Supported values are: ",(0,o.kt)("inlineCode",{parentName:"td"},"NO_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"BASIC_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"EXTENDED_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"ALL_OPT"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},(0,o.kt)("inlineCode",{parentName:"td"},"ALL_OPT")))))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Call ",(0,o.kt)("inlineCode",{parentName:"p"},"transform")," method to run inference on the input DataFrame."))),(0,o.kt)("h2",{id:"model-slicing"},(0,o.kt)("a",{name:"slicing"}),"Model Slicing"),(0,o.kt)("p",null,"By default, an ONNX model is treated as a black box with inputs and outputs.\nIf you want to use intermediate nodes of a model, you can slice the model at particular nodes. Slicing will create a new model,\nkeeping only parts of the model that are needed for those nodes. This new model's outputs will be the outputs from\nthe intermediate nodes. You can save the sliced model and use it to transform just like any other ONNXModel."),(0,o.kt)("p",null,"This slicing feature is used implicitly by the ImageFeaturizer, which uses ONNX models. 
The OnnxHub manifest entry for each model\nincludes which intermediate node outputs should be used for featurization, so the ImageFeaturizer will automatically slice at the correct nodes."),(0,o.kt)("p",null,"The below example shows how to perform the slicing manually with a direct ONNXModel."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},' // create a df: Dataframe with image data\n val hub = new ONNXHub()\n val info = hub.getModelInfo("resnet50")\n val bytes = hub.load(name)\n val intermediateOutputName = "resnetv24_pool1_fwd"\n val slicedModel = new ONNXModel()\n .setModelPayload(bytes)\n .setFeedDict(Map("data" -> "features"))\n .setFetchDict(Map("rawFeatures" -> intermediateOutputName)) // automatic slicing based on fetch dictionary\n // -- or --\n // .sliceAtOutput(intermediateOutputName) // manual slicing\n\n val slicedModelDf = slicedModel.transform(df)\n')),(0,o.kt)("h2",{id:"example"},"Example"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"../../Responsible%20AI/Image%20Explainers"},"Image Explainers")),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"../Quickstart%20-%20ONNX%20Model%20Inference"},"Quickstart - ONNX Model Inference"))))}c.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[8377],{3905:function(e,t,a){a.d(t,{Zo:function(){return m},kt:function(){return c}});var n=a(7294);function r(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function o(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function i(e){for(var t=1;t=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var 
o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var p=n.createContext({}),s=function(e){var t=n.useContext(p),a=t;return e&&(a="function"==typeof e?e(t):i(i({},t),e)),a},m=function(e){var t=s(e.components);return n.createElement(p.Provider,{value:t},e.children)},d={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,o=e.originalType,p=e.parentName,m=l(e,["components","mdxType","originalType","parentName"]),u=s(a),c=r,N=u["".concat(p,".").concat(c)]||u[c]||d[c]||o;return a?n.createElement(N,i(i({ref:t},m),{},{components:a})):n.createElement(N,i({ref:t},m))}));function c(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=a.length,i=new Array(o);i[0]=u;var l={};for(var p in t)hasOwnProperty.call(t,p)&&(l[p]=t[p]);l.originalType=e,l.mdxType="string"==typeof e?e:r,i[1]=l;for(var s=2;sModel Slicing',id:"model-slicing",level:2},{value:"Example",id:"example",level:2}],u={toc:d};function c(e){var t=e.components,a=(0,r.Z)(e,i);return(0,o.kt)("wrapper",(0,n.Z)({},u,a,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"onnx-model-inferencing-on-spark"},"ONNX model inferencing on Spark"),(0,o.kt)("h2",{id:"onnx"},"ONNX"),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://onnx.ai/"},"ONNX")," is an open format to represent both deep learning and traditional machine learning models. With ONNX, AI developers can more easily move models between state-of-the-art tools and choose the combination that is best for them."),(0,o.kt)("p",null,"SynapseML now includes a Spark transformer to bring a trained ONNX model to Apache Spark, so you can run inference on your data with Spark's large-scale data processing power."),(0,o.kt)("h2",{id:"onnxhub"},"ONNXHub"),(0,o.kt)("p",null,'Although you can use your own local model, many popular existing models are provided through the ONNXHub. 
You can use\na model\'s ONNXHub name (for example "MNIST") and download the bytes of the model, and some metadata about the model. You can also list\navailable models, optionally filtering by name or tags.'),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},' // List models\n val hub = new ONNXHub()\n val models = hub.listModels(model = Some("mnist"), tags = Some(Seq("vision")))\n\n // Retrieve and transform with a model\n val info = hub.getModelInfo("resnet50")\n val bytes = hub.load(name)\n val model = new ONNXModel()\n .setModelPayload(bytes)\n .setFeedDict(Map("data" -> "features"))\n .setFetchDict(Map("rawPrediction" -> "resnetv24_dense0_fwd"))\n .setSoftMaxDict(Map("rawPrediction" -> "probability"))\n .setArgMaxDict(Map("rawPrediction" -> "prediction"))\n .setMiniBatchSize(1)\n\n val (probability, _) = model.transform({YOUR_DATAFRAME})\n .select("probability", "prediction")\n .as[(Vector, Double)]\n .head\n')),(0,o.kt)("h2",{id:"usage"},"Usage"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Create a ",(0,o.kt)("inlineCode",{parentName:"p"},"com.microsoft.azure.synapse.ml.onnx.ONNXModel")," object and use ",(0,o.kt)("inlineCode",{parentName:"p"},"setModelLocation")," or ",(0,o.kt)("inlineCode",{parentName:"p"},"setModelPayload")," to load the ONNX model."),(0,o.kt)("p",{parentName:"li"},"For example:"),(0,o.kt)("pre",{parentName:"li"},(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'val onnx = new ONNXModel().setModelLocation("/path/to/model.onnx")\n')),(0,o.kt)("p",{parentName:"li"},"Optionally, create the model from the ONNXHub."),(0,o.kt)("pre",{parentName:"li"},(0,o.kt)("code",{parentName:"pre",className:"language-scala"},'val onnx = new ONNXModel().setModelPayload(hub.load("MNIST"))\n'))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Use ONNX visualization tool (for example, ",(0,o.kt)("a",{parentName:"p",href:"https://netron.app/"},"Netron"),") to inspect 
the ONNX model's input and output nodes."),(0,o.kt)("p",{parentName:"li"},(0,o.kt)("img",{parentName:"p",src:"https://mmlspark.blob.core.windows.net/graphics/ONNXModelInputsOutputs.png",alt:"Screenshot that illustrates an ONNX model's input and output nodes"}))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Set the parameters properly to the ",(0,o.kt)("inlineCode",{parentName:"p"},"ONNXModel")," object."),(0,o.kt)("p",{parentName:"li"},"The ",(0,o.kt)("inlineCode",{parentName:"p"},"com.microsoft.azure.synapse.ml.onnx.ONNXModel")," class provides a set of parameters to control the behavior of the inference."),(0,o.kt)("table",{parentName:"li"},(0,o.kt)("thead",{parentName:"table"},(0,o.kt)("tr",{parentName:"thead"},(0,o.kt)("th",{parentName:"tr",align:"left"},"Parameter"),(0,o.kt)("th",{parentName:"tr",align:"left"},"Description"),(0,o.kt)("th",{parentName:"tr",align:"left"},"Default Value"))),(0,o.kt)("tbody",{parentName:"table"},(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"feedDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Map the ONNX model's expected input node names to the input DataFrame's column names. Make sure the input DataFrame's column schema matches with the corresponding input's shape of the ONNX model. For example, an image classification model may have an input node of shape ",(0,o.kt)("inlineCode",{parentName:"td"},"[1, 3, 224, 224]")," with type Float. It's assumed that the first dimension (1) is the batch size. Then the input DataFrame's corresponding column's type should be ",(0,o.kt)("inlineCode",{parentName:"td"},"ArrayType(ArrayType(ArrayType(FloatType)))"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"fetchDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Map the output DataFrame's column names to the ONNX model's output node names. 
NOTE: If you put outputs that are intermediate in the model, transform will automatically slice at those outputs. See the section on ",(0,o.kt)("a",{parentName:"td",href:"#slicing"},"Slicing"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"miniBatcher"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify the MiniBatcher to use."),(0,o.kt)("td",{parentName:"tr",align:"left"},(0,o.kt)("inlineCode",{parentName:"td"},"FixedMiniBatchTransformer")," with batch size 10")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"softMaxDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"A map between output DataFrame columns, where the value column will be computed from taking the softmax of the key column. If the 'rawPrediction' column contains logits outputs, then one can set softMaxDict to ",(0,o.kt)("inlineCode",{parentName:"td"},'Map("rawPrediction" -> "probability")')," to obtain the probability outputs."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"argMaxDict"),(0,o.kt)("td",{parentName:"tr",align:"left"},"A map between output DataFrame columns, where the value column will be computed from taking the argmax of the key column. This parameter can be used to convert probability or logits output to the predicted label."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"deviceType"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify a device type the model inference runs on. Supported types are: CPU or CUDA. 
If not specified, auto detection will be used."),(0,o.kt)("td",{parentName:"tr",align:"left"},"None")),(0,o.kt)("tr",{parentName:"tbody"},(0,o.kt)("td",{parentName:"tr",align:"left"},"optimizationLevel"),(0,o.kt)("td",{parentName:"tr",align:"left"},"Specify the ",(0,o.kt)("a",{parentName:"td",href:"https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html#graph-optimization-levels"},"optimization level")," for the ONNX graph optimizations. Supported values are: ",(0,o.kt)("inlineCode",{parentName:"td"},"NO_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"BASIC_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"EXTENDED_OPT"),", ",(0,o.kt)("inlineCode",{parentName:"td"},"ALL_OPT"),"."),(0,o.kt)("td",{parentName:"tr",align:"left"},(0,o.kt)("inlineCode",{parentName:"td"},"ALL_OPT")))))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("p",{parentName:"li"},"Call ",(0,o.kt)("inlineCode",{parentName:"p"},"transform")," method to run inference on the input DataFrame."))),(0,o.kt)("h2",{id:"model-slicing"},(0,o.kt)("a",{name:"slicing"}),"Model Slicing"),(0,o.kt)("p",null,"By default, an ONNX model is treated as a black box with inputs and outputs.\nIf you want to use intermediate nodes of a model, you can slice the model at particular nodes. Slicing will create a new model,\nkeeping only parts of the model that are needed for those nodes. This new model's outputs will be the outputs from\nthe intermediate nodes. You can save the sliced model and use it to transform just like any other ONNXModel."),(0,o.kt)("p",null,"This slicing feature is used implicitly by the ImageFeaturizer, which uses ONNX models. 
The OnnxHub manifest entry for each model\nincludes which intermediate node outputs should be used for featurization, so the ImageFeaturizer will automatically slice at the correct nodes."),(0,o.kt)("p",null,"The below example shows how to perform the slicing manually with a direct ONNXModel."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-scala"},' // create a df: Dataframe with image data\n val hub = new ONNXHub()\n val info = hub.getModelInfo("resnet50")\n val bytes = hub.load(name)\n val intermediateOutputName = "resnetv24_pool1_fwd"\n val slicedModel = new ONNXModel()\n .setModelPayload(bytes)\n .setFeedDict(Map("data" -> "features"))\n .setFetchDict(Map("rawFeatures" -> intermediateOutputName)) // automatic slicing based on fetch dictionary\n // -- or --\n // .sliceAtOutput(intermediateOutputName) // manual slicing\n\n val slicedModelDf = slicedModel.transform(df)\n')),(0,o.kt)("h2",{id:"example"},"Example"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"../../Responsible%20AI/Image%20Explainers"},"Image Explainers")),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("a",{parentName:"li",href:"../Quickstart%20-%20ONNX%20Model%20Inference"},"Quickstart - ONNX Model Inference"))))}c.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/71042800.1c8e9c69.js b/assets/js/71042800.1c8e9c69.js new file mode 100644 index 0000000000..e4723b786d --- /dev/null +++ b/assets/js/71042800.1c8e9c69.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[9484],{3905:function(e,t,n){n.d(t,{Zo:function(){return p},kt:function(){return m}});var r=n(7294);function a(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function i(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return 
Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function o(e){for(var t=1;t=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),d=c(n),m=a,f=d["".concat(l,".").concat(m)]||d[m]||u[m]||i;return n?r.createElement(f,o(o({ref:t},p),{},{components:n})):r.createElement(f,o({ref:t},p))}));function m(e,t){var n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,o=new Array(i);o[0]=d;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,o[1]=s;for(var c=2;c=0||(a[n]=e[n]);return a}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(a[n]=e[n])}return a}var l=r.createContext({}),c=function(e){var t=r.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},p=function(e){var t=c(e.components);return r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,a=e.mdxType,i=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),d=c(n),m=a,f=d["".concat(l,".").concat(m)]||d[m]||u[m]||i;return n?r.createElement(f,o(o({ref:t},p),{},{components:n})):r.createElement(f,o({ref:t},p))}));function m(e,t){var 
n=arguments,a=t&&t.mdxType;if("string"==typeof e||a){var i=n.length,o=new Array(i);o[0]=d;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:a,o[1]=s;for(var c=2;c=0||(n[r]=e[r]);return n}(e,a);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(t=0;t=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(n[r]=e[r])}return n}var p=t.createContext({}),c=function(e){var a=t.useContext(p),r=a;return e&&(r="function"==typeof e?e(a):o(o({},a),e)),r},i=function(e){var a=c(e.components);return t.createElement(p.Provider,{value:a},e.children)},u={inlineCode:"code",wrapper:function(e){var a=e.children;return t.createElement(t.Fragment,{},a)}},k=t.forwardRef((function(e,a){var r=e.components,n=e.mdxType,s=e.originalType,p=e.parentName,i=l(e,["components","mdxType","originalType","parentName"]),k=c(r),m=n,d=k["".concat(p,".").concat(m)]||k[m]||u[m]||s;return r?t.createElement(d,o(o({ref:a},i),{},{components:r})):t.createElement(d,o({ref:a},i))}));function m(e,a){var r=arguments,n=a&&a.mdxType;if("string"==typeof e||n){var s=r.length,o=new Array(s);o[0]=k;var l={};for(var p in a)hasOwnProperty.call(a,p)&&(l[p]=a[p]);l.originalType=e,l.mdxType="string"==typeof e?e:n,o[1]=l;for(var c=2;c=0||(n[r]=e[r]);return n}(e,a);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(t=0;t=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(n[r]=e[r])}return n}var p=t.createContext({}),c=function(e){var a=t.useContext(p),r=a;return e&&(r="function"==typeof e?e(a):o(o({},a),e)),r},i=function(e){var a=c(e.components);return t.createElement(p.Provider,{value:a},e.children)},u={inlineCode:"code",wrapper:function(e){var a=e.children;return t.createElement(t.Fragment,{},a)}},k=t.forwardRef((function(e,a){var r=e.components,n=e.mdxType,s=e.originalType,p=e.parentName,i=l(e,["components","mdxType","originalType","parentName"]),k=c(r),m=n,d=k["".concat(p,".").concat(m)]||k[m]||u[m]||s;return 
r?t.createElement(d,o(o({ref:a},i),{},{components:r})):t.createElement(d,o({ref:a},i))}));function m(e,a){var r=arguments,n=a&&a.mdxType;if("string"==typeof e||n){var s=r.length,o=new Array(s);o[0]=k;var l={};for(var p in a)hasOwnProperty.call(a,p)&&(l[p]=a[p]);l.originalType=e,l.mdxType="string"==typeof e?e:n,o[1]=l;for(var c=2;c=0||(a[t]=e[t]);return a}(e,r);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var c=n.createContext({}),l=function(e){var r=n.useContext(c),t=r;return e&&(t="function"==typeof e?e(r):i(i({},r),e)),t},u=function(e){var r=l(e.components);return n.createElement(c.Provider,{value:r},e.children)},p={inlineCode:"code",wrapper:function(e){var r=e.children;return n.createElement(n.Fragment,{},r)}},m=n.forwardRef((function(e,r){var t=e.components,a=e.mdxType,s=e.originalType,c=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),m=l(t),h=a,g=m["".concat(c,".").concat(h)]||m[h]||p[h]||s;return t?n.createElement(g,i(i({ref:r},u),{},{components:t})):n.createElement(g,i({ref:r},u))}));function h(e,r){var t=arguments,a=r&&r.mdxType;if("string"==typeof e||a){var s=t.length,i=new Array(s);i[0]=m;var o={};for(var c in r)hasOwnProperty.call(r,c)&&(o[c]=r[c]);o.originalType=e,o.mdxType="string"==typeof e?e:a,i[1]=o;for(var l=2;l"))\n .withColumn("Tags", split(col("Tags"), ",").cast("array"))\n .limit(25)\n)\n')),(0,s.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/CognitiveSearchHyperscale/MetArtworkSamples.png",width:"800"}),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import AnalyzeImage\nfrom synapse.ml.stages import SelectColumns\n\n# define pipeline\ndescribeImage = (\n AnalyzeImage()\n .setSubscriptionKey(cognitive_key)\n .setLocation(cognitive_loc)\n .setImageUrlCol("PrimaryImageUrl")\n .setOutputCol("RawImageDescription")\n 
.setErrorCol("Errors")\n .setVisualFeatures(\n ["Categories", "Description", "Faces", "ImageType", "Color", "Adult"]\n )\n .setConcurrency(5)\n)\n\ndf2 = (\n describeImage.transform(data)\n .select("*", "RawImageDescription.*")\n .drop("Errors", "RawImageDescription")\n)\n')),(0,s.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/CognitiveSearchHyperscale/MetArtworksProcessed.png",width:"800"}),(0,s.kt)("p",null,"Before writing the results to a Search Index, you must define a schema which must specify the name, type, and attributes of each field in your index. Refer ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index"},"Create a basic index in Azure Search")," for more information."),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ndf2.writeToAzureSearch(\n subscriptionKey=azure_search_key,\n actionCol="searchAction",\n serviceName=search_service,\n indexName=search_index,\n keyCol="ObjectID",\n)\n')),(0,s.kt)("p",null,"The Search Index can be queried using the ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/rest/api/searchservice/"},"Azure Search REST API")," by sending GET or POST requests and specifying query parameters that give the criteria for selecting matching documents. 
For more information on querying refer ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/rest/api/searchservice/Search-Documents"},"Query your Azure Search index using the REST API")),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'url = "https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06".format(\n search_service, search_index\n)\nrequests.post(\n url, json={"search": "Glass"}, headers={"api-key": azure_search_key}\n).json()\n')))}h.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[6167],{3905:function(e,r,t){t.d(r,{Zo:function(){return u},kt:function(){return h}});var n=t(7294);function a(e,r,t){return r in e?Object.defineProperty(e,r,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[r]=t,e}function s(e,r){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);r&&(n=n.filter((function(r){return Object.getOwnPropertyDescriptor(e,r).enumerable}))),t.push.apply(t,n)}return t}function i(e){for(var r=1;r=0||(a[t]=e[t]);return a}(e,r);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var c=n.createContext({}),l=function(e){var r=n.useContext(c),t=r;return e&&(t="function"==typeof e?e(r):i(i({},r),e)),t},u=function(e){var r=l(e.components);return n.createElement(c.Provider,{value:r},e.children)},p={inlineCode:"code",wrapper:function(e){var r=e.children;return n.createElement(n.Fragment,{},r)}},m=n.forwardRef((function(e,r){var t=e.components,a=e.mdxType,s=e.originalType,c=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),m=l(t),h=a,g=m["".concat(c,".").concat(h)]||m[h]||p[h]||s;return t?n.createElement(g,i(i({ref:r},u),{},{components:t})):n.createElement(g,i({ref:r},u))}));function h(e,r){var t=arguments,a=r&&r.mdxType;if("string"==typeof 
e||a){var s=t.length,i=new Array(s);i[0]=m;var o={};for(var c in r)hasOwnProperty.call(r,c)&&(o[c]=r[c]);o.originalType=e,o.mdxType="string"==typeof e?e:a,i[1]=o;for(var l=2;l"))\n .withColumn("Tags", split(col("Tags"), ",").cast("array"))\n .limit(25)\n)\n')),(0,s.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/CognitiveSearchHyperscale/MetArtworkSamples.png",width:"800"}),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import AnalyzeImage\nfrom synapse.ml.stages import SelectColumns\n\n# define pipeline\ndescribeImage = (\n AnalyzeImage()\n .setSubscriptionKey(cognitive_key)\n .setLocation(cognitive_loc)\n .setImageUrlCol("PrimaryImageUrl")\n .setOutputCol("RawImageDescription")\n .setErrorCol("Errors")\n .setVisualFeatures(\n ["Categories", "Description", "Faces", "ImageType", "Color", "Adult"]\n )\n .setConcurrency(5)\n)\n\ndf2 = (\n describeImage.transform(data)\n .select("*", "RawImageDescription.*")\n .drop("Errors", "RawImageDescription")\n)\n')),(0,s.kt)("img",{src:"https://mmlspark.blob.core.windows.net/graphics/CognitiveSearchHyperscale/MetArtworksProcessed.png",width:"800"}),(0,s.kt)("p",null,"Before writing the results to a Search Index, you must define a schema which must specify the name, type, and attributes of each field in your index. 
Refer ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/azure/search/search-what-is-an-index"},"Create a basic index in Azure Search")," for more information."),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\ndf2.writeToAzureSearch(\n subscriptionKey=azure_search_key,\n actionCol="searchAction",\n serviceName=search_service,\n indexName=search_index,\n keyCol="ObjectID",\n)\n')),(0,s.kt)("p",null,"The Search Index can be queried using the ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/rest/api/searchservice/"},"Azure Search REST API")," by sending GET or POST requests and specifying query parameters that give the criteria for selecting matching documents. For more information on querying refer ",(0,s.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/rest/api/searchservice/Search-Documents"},"Query your Azure Search index using the REST API")),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'url = "https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06".format(\n search_service, search_index\n)\nrequests.post(\n url, json={"search": "Glass"}, headers={"api-key": azure_search_key}\n).json()\n')))}h.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/767a7177.2c4017ce.js b/assets/js/767a7177.2c4017ce.js new file mode 100644 index 0000000000..bd676855f0 --- /dev/null +++ b/assets/js/767a7177.2c4017ce.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[5040],{3905:function(e,t,r){r.d(t,{Zo:function(){return p},kt:function(){return m}});var n=r(7294);function o(e,t,r){return t in e?Object.defineProperty(e,t,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[t]=r,e}function s(e,t){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return 
Object.getOwnPropertyDescriptor(e,t).enumerable}))),r.push.apply(r,n)}return r}function i(e){for(var t=1;t=0||(o[r]=e[r]);return o}(e,t);if(Object.getOwnPropertySymbols){var s=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(o[r]=e[r])}return o}var c=n.createContext({}),u=function(e){var t=n.useContext(c),r=t;return e&&(r="function"==typeof e?e(t):i(i({},t),e)),r},p=function(e){var t=u(e.components);return n.createElement(c.Provider,{value:t},e.children)},l={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},f=n.forwardRef((function(e,t){var r=e.components,o=e.mdxType,s=e.originalType,c=e.parentName,p=a(e,["components","mdxType","originalType","parentName"]),f=u(r),m=o,v=f["".concat(c,".").concat(m)]||f[m]||l[m]||s;return r?n.createElement(v,i(i({ref:t},p),{},{components:r})):n.createElement(v,i({ref:t},p))}));function m(e,t){var r=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var s=r.length,i=new Array(s);i[0]=f;var a={};for(var c in t)hasOwnProperty.call(t,c)&&(a[c]=t[c]);a.originalType=e,a.mdxType="string"==typeof e?e:o,i[1]=a;for(var u=2;u ",(0,s.kt)("strong",{parentName:"li"},"Go to resource"),". Once at the resource, you can get the key from ",(0,s.kt)("strong",{parentName:"li"},"Resource Management")," > ",(0,s.kt)("strong",{parentName:"li"},"Keys and Endpoint"),". Copy the key and paste it into the notebook. Store keys securely and do not share them. 
")),(0,s.kt)("h2",{id:"cognitive-services"},"Cognitive Services"),(0,s.kt)("p",null,"To set up ",(0,s.kt)("a",{parentName:"p",href:"https://azure.microsoft.com/products/cognitive-services/"},"Cognitive Services")," for use with SynapseML you first need to:"),(0,s.kt)("ul",null,(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("a",{parentName:"li",href:"https://learn.microsoft.com/azure/role-based-access-control/role-assignments-steps"},"Assign yourself the Cognitive Services Contributor role")," to agree to the responsible AI terms and create a resource. "),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("a",{parentName:"li",href:"https://portal.azure.com/#create/Microsoft.CognitiveServicesAllInOne"},"Create an Azure Cognitive multi-service (Decision, Language, Speech, Vision) resource"),". Alternatively, you can follow the steps to ",(0,s.kt)("a",{parentName:"li",href:"https://learn.microsoft.com/en-us/azure/cognitive-services/cognitive-services-apis-create-account?tabs=decision%2Canomaly-detector%2Clanguage-service%2Ccomputer-vision%2Cwindows#create-a-new-azure-cognitive-services-resource"},"create Single-service resource"),". "),(0,s.kt)("li",{parentName:"ul"},"Get your Cognitive Service resource's key. After your resource is successfully deployed, select ",(0,s.kt)("strong",{parentName:"li"},"Next Steps")," > ",(0,s.kt)("strong",{parentName:"li"},"Go to resource"),". Once at the resource, you can get the key from ",(0,s.kt)("strong",{parentName:"li"},"Resource Management")," > ",(0,s.kt)("strong",{parentName:"li"},"Keys and Endpoint"),". Copy the key and paste it into the notebook. 
Store keys securely and do not share them.")))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/77c259dd.748f8aa6.js b/assets/js/77c259dd.748f8aa6.js new file mode 100644 index 0000000000..4087b9e1c3 --- /dev/null +++ b/assets/js/77c259dd.748f8aa6.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[224],{3905:function(e,t,a){a.d(t,{Zo:function(){return m},kt:function(){return h}});var n=a(7294);function s(e,t,a){return t in e?Object.defineProperty(e,t,{value:a,enumerable:!0,configurable:!0,writable:!0}):e[t]=a,e}function o(e,t){var a=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),a.push.apply(a,n)}return a}function r(e){for(var t=1;t=0||(s[a]=e[a]);return s}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(s[a]=e[a])}return s}var l=n.createContext({}),c=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):r(r({},t),e)),a},m=function(e){var t=c(e.components);return n.createElement(l.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},u=n.forwardRef((function(e,t){var a=e.components,s=e.mdxType,o=e.originalType,l=e.parentName,m=i(e,["components","mdxType","originalType","parentName"]),u=c(a),h=s,d=u["".concat(l,".").concat(h)]||u[h]||p[h]||o;return a?n.createElement(d,r(r({ref:t},m),{},{components:a})):n.createElement(d,r({ref:t},m))}));function h(e,t){var a=arguments,s=t&&t.mdxType;if("string"==typeof e||s){var o=a.length,r=new Array(o);r[0]=u;var i={};for(var l in t)hasOwnProperty.call(t,l)&&(i[l]=t[l]);i.originalType=e,i.mdxType="string"==typeof e?e:s,r[1]=i;for(var c=2;c=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var 
l=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var i=a.createContext({}),p=function(e){var t=a.useContext(i),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},u=function(e){var t=p(e.components);return a.createElement(i.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},f=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,l=e.originalType,i=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),f=p(n),m=r,d=f["".concat(i,".").concat(m)]||f[m]||c[m]||l;return n?a.createElement(d,s(s({ref:t},u),{},{components:n})):a.createElement(d,s({ref:t},u))}));function m(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var l=n.length,s=new Array(l);s[0]=f;var o={};for(var i in t)hasOwnProperty.call(t,i)&&(o[i]=t[i]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p=0||(r[t]=e[t]);return r}(e,a);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(r[t]=e[t])}return r}var i=n.createContext({}),p=function(e){var a=n.useContext(i),t=a;return e&&(t="function"==typeof e?e(a):l(l({},a),e)),t},c=function(e){var a=p(e.components);return n.createElement(i.Provider,{value:a},e.children)},u={inlineCode:"code",wrapper:function(e){var a=e.children;return n.createElement(n.Fragment,{},a)}},m=n.forwardRef((function(e,a){var t=e.components,r=e.mdxType,o=e.originalType,i=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=p(t),d=r,f=m["".concat(i,".").concat(d)]||m[d]||u[d]||o;return t?n.createElement(f,l(l({ref:a},c),{},{components:t})):n.createElement(f,l({ref:a},c))}));function d(e,a){var t=arguments,r=a&&a.mdxType;if("string"==typeof e||r){var o=t.length,l=new Array(o);l[0]=m;var s={};for(var i in 
a)hasOwnProperty.call(a,i)&&(s[i]=a[i]);s.originalType=e,s.mdxType="string"==typeof e?e:r,l[1]=s;for(var p=2;p=0||(r[t]=e[t]);return r}(e,a);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(r[t]=e[t])}return r}var i=n.createContext({}),p=function(e){var a=n.useContext(i),t=a;return e&&(t="function"==typeof e?e(a):l(l({},a),e)),t},c=function(e){var a=p(e.components);return n.createElement(i.Provider,{value:a},e.children)},u={inlineCode:"code",wrapper:function(e){var a=e.children;return n.createElement(n.Fragment,{},a)}},m=n.forwardRef((function(e,a){var t=e.components,r=e.mdxType,o=e.originalType,i=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=p(t),d=r,f=m["".concat(i,".").concat(d)]||m[d]||u[d]||o;return t?n.createElement(f,l(l({ref:a},c),{},{components:t})):n.createElement(f,l({ref:a},c))}));function d(e,a){var t=arguments,r=a&&a.mdxType;if("string"==typeof e||r){var o=t.length,l=new Array(o);l[0]=m;var s={};for(var i in a)hasOwnProperty.call(a,i)&&(s[i]=a[i]);s.originalType=e,s.mdxType="string"==typeof e?e:r,l[1]=s;for(var p=2;p=0||(r[t]=e[t]);return r}(e,n);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(r[t]=e[t])}return r}var s=a.createContext({}),p=function(e){var n=a.useContext(s),t=n;return e&&(t="function"==typeof e?e(n):l(l({},n),e)),t},u=function(e){var n=p(e.components);return a.createElement(s.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return a.createElement(a.Fragment,{},n)}},d=a.forwardRef((function(e,n){var t=e.components,r=e.mdxType,o=e.originalType,s=e.parentName,u=i(e,["components","mdxType","originalType","parentName"]),d=p(t),m=r,h=d["".concat(s,".").concat(m)]||d[m]||c[m]||o;return t?a.createElement(h,l(l({ref:n},u),{},{components:t})):a.createElement(h,l({ref:n},u))}));function m(e,n){var 
t=arguments,r=n&&n.mdxType;if("string"==typeof e||r){var o=t.length,l=new Array(o);l[0]=d;var i={};for(var s in n)hasOwnProperty.call(n,s)&&(i[s]=n[s]);i.originalType=e,i.mdxType="string"==typeof e?e:r,l[1]=i;for(var p=2;p child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:t.filter(Boolean))?n:[]}(e).map((function(e){var n=e.props;return{value:n.value,label:n.label,attributes:n.attributes,default:n.default}}))}function d(e){var n=e.values,t=e.children;return(0,r.useMemo)((function(){var e=null!=n?n:c(t);return function(e){var n=(0,p.l)(e,(function(e,n){return e.value===n.value}));if(n.length>0)throw new Error('Docusaurus error: Duplicate values "'+n.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[n,t])}function m(e){var n=e.value;return e.tabValues.some((function(e){return e.value===n}))}function h(e){var n=e.queryString,t=void 0!==n&&n,a=e.groupId,o=(0,i.k6)(),l=function(e){var n=e.queryString,t=void 0!==n&&n,a=e.groupId;if("string"==typeof t)return t;if(!1===t)return null;if(!0===t&&!a)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. 
You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:t,groupId:a});return[(0,s._X)(l),(0,r.useCallback)((function(e){if(l){var n=new URLSearchParams(o.location.search);n.set(l,e),o.replace(Object.assign({},o.location,{search:n.toString()}))}}),[l,o])]}function f(e){var n,t,a,o,l=e.defaultValue,i=e.queryString,s=void 0!==i&&i,p=e.groupId,c=d(e),f=(0,r.useState)((function(){return function(e){var n,t=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the component requires at least one children component");if(t){if(!m({value:t,tabValues:a}))throw new Error('Docusaurus error: The has a defaultValue "'+t+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". If you intend to show no default tab, use defaultValue={null} instead.");return t}var r=null!=(n=a.find((function(e){return e.default})))?n:a[0];if(!r)throw new Error("Unexpected error: 0 tabValues");return r.value}({defaultValue:l,tabValues:c})})),k=f[0],y=f[1],v=h({queryString:s,groupId:p}),b=v[0],g=v[1],w=(n=function(e){return e?"docusaurus.tab."+e:null}({groupId:p}.groupId),t=(0,u.Nk)(n),a=t[0],o=t[1],[a,(0,r.useCallback)((function(e){n&&o.set(e)}),[n,o])]),S=w[0],T=w[1],N=function(){var e=null!=b?b:S;return m({value:e,tabValues:c})?e:null}();return(0,r.useLayoutEffect)((function(){N&&y(N)}),[N]),{selectedValue:k,selectValue:(0,r.useCallback)((function(e){if(!m({value:e,tabValues:c}))throw new Error("Can't select invalid tab value="+e);y(e),g(e),T(e)}),[g,T,c]),tabValues:c}}var k=t(2389),y="tabList__CuJ",v="tabItem_LNqP";function b(e){var n=e.className,t=e.block,i=e.selectedValue,s=e.selectValue,p=e.tabValues,u=[],c=(0,l.o5)().blockElementScrollPositionUntilNextRender,d=function(e){var n=e.currentTarget,t=u.indexOf(n),a=p[t].value;a!==i&&(c(n),s(a))},m=function(e){var n,t=null;switch(e.key){case"Enter":d(e);break;case"ArrowRight":var 
a,r=u.indexOf(e.currentTarget)+1;t=null!=(a=u[r])?a:u[0];break;case"ArrowLeft":var o,l=u.indexOf(e.currentTarget)-1;t=null!=(o=u[l])?o:u[u.length-1]}null==(n=t)||n.focus()};return r.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,o.Z)("tabs",{"tabs--block":t},n)},p.map((function(e){var n=e.value,t=e.label,l=e.attributes;return r.createElement("li",(0,a.Z)({role:"tab",tabIndex:i===n?0:-1,"aria-selected":i===n,key:n,ref:function(e){return u.push(e)},onKeyDown:m,onClick:d},l,{className:(0,o.Z)("tabs__item",v,null==l?void 0:l.className,{"tabs__item--active":i===n})}),null!=t?t:n)})))}function g(e){var n=e.lazy,t=e.children,a=e.selectedValue,o=(Array.isArray(t)?t:[t]).filter(Boolean);if(n){var l=o.find((function(e){return e.props.value===a}));return l?(0,r.cloneElement)(l,{className:"margin-top--md"}):null}return r.createElement("div",{className:"margin-top--md"},o.map((function(e,n){return(0,r.cloneElement)(e,{key:n,hidden:e.props.value!==a})})))}function w(e){var n=f(e);return r.createElement("div",{className:(0,o.Z)("tabs-container",y)},r.createElement(b,(0,a.Z)({},e,n)),r.createElement(g,(0,a.Z)({},e,n)))}function S(e){var n=(0,k.Z)();return r.createElement(w,(0,a.Z)({key:String(n)},e))}},1036:function(e,n,t){t.r(n),t.d(n,{assets:function(){return d},contentTitle:function(){return u},default:function(){return f},frontMatter:function(){return p},metadata:function(){return c},toc:function(){return m}});var a=t(3117),r=t(102),o=(t(7294),t(3905)),l=t(4866),i=t(5162),s=["components"],p={title:".NET setup",hide_title:!0,sidebar_label:".NET setup",description:".NET setup"},u=".NET setup and example for SynapseML",c={unversionedId:"Reference/Dotnet Setup",id:"version-0.11.4/Reference/Dotnet Setup",title:".NET setup",description:".NET setup",source:"@site/versioned_docs/version-0.11.4/Reference/Dotnet Setup.md",sourceDirName:"Reference",slug:"/Reference/Dotnet Setup",permalink:"/SynapseML/docs/Reference/Dotnet 
Setup",draft:!1,tags:[],version:"0.11.4",frontMatter:{title:".NET setup",hide_title:!0,sidebar_label:".NET setup",description:".NET setup"},sidebar:"docs",previous:{title:"R setup",permalink:"/SynapseML/docs/Reference/R Setup"},next:{title:"Quickstart - LightGBM in Dotnet",permalink:"/SynapseML/docs/Reference/Quickstart - LightGBM in Dotnet"}},d={},m=[{value:"Installation",id:"installation",level:2},{value:"1. Install .NET",id:"1-install-net",level:3},{value:"2. Install Java",id:"2-install-java",level:3},{value:"3. Install Apache Spark",id:"3-install-apache-spark",level:3},{value:"4. Install .NET for Apache Spark",id:"4-install-net-for-apache-spark",level:3},{value:"5. Install WinUtils (Windows Only)",id:"5-install-winutils-windows-only",level:3},{value:"6. Set DOTNET_WORKER_DIR and check dependencies",id:"6-set-dotnet_worker_dir-and-check-dependencies",level:3},{value:"Write a .NET for SynapseML App",id:"write-a-net-for-synapseml-app",level:2},{value:"1. Create a console app",id:"1-create-a-console-app",level:3},{value:"2. Install NuGet package",id:"2-install-nuget-package",level:3},{value:"3. Write your app",id:"3-write-your-app",level:3},{value:"4. Run your .NET App",id:"4-run-your-net-app",level:3},{value:"Next",id:"next",level:2}],h={toc:m};function f(e){var n=e.components,t=(0,r.Z)(e,s);return(0,o.kt)("wrapper",(0,a.Z)({},h,t,{components:n,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"net-setup-and-example-for-synapseml"},".NET setup and example for SynapseML"),(0,o.kt)("h2",{id:"installation"},"Installation"),(0,o.kt)("h3",{id:"1-install-net"},"1. 
Install .NET"),(0,o.kt)("p",null,"To start building .NET apps, you need to download and install the .NET SDK (Software Development Kit)."),(0,o.kt)("p",null,"Download and install the ",(0,o.kt)("a",{parentName:"p",href:"https://dotnet.microsoft.com/en-us/download/dotnet/3.1"},".NET Core SDK"),".\nInstalling the SDK adds the dotnet toolchain to your PATH."),(0,o.kt)("p",null,"Once you've installed the .NET Core SDK, open a new command prompt or terminal. Then run ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet"),"."),(0,o.kt)("p",null,"If the command runs and prints information about how to use dotnet, you can move to the next step.\nIf you receive a ",(0,o.kt)("inlineCode",{parentName:"p"},"'dotnet' is not recognized as an internal or external command")," error, make sure\nyou opened a new command prompt or terminal before running the command."),(0,o.kt)("h3",{id:"2-install-java"},"2. Install Java"),(0,o.kt)("p",null,"Install ",(0,o.kt)("a",{parentName:"p",href:"https://www.oracle.com/java/technologies/downloads/#java8"},"Java 8.1")," for Windows and macOS,\nor ",(0,o.kt)("a",{parentName:"p",href:"https://openjdk.org/install/"},"OpenJDK 8")," for Ubuntu."),(0,o.kt)("p",null,"Select the appropriate version for your operating system. For example, select jdk-8u201-windows-x64.exe\nfor a Windows x64 machine or jdk-8u231-macosx-x64.dmg for macOS. Then, use the command java to verify the installation."),(0,o.kt)("h3",{id:"3-install-apache-spark"},"3. 
Install Apache Spark"),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://spark.apache.org/downloads.html"},"Download and install Apache Spark")," with version >= 3.2.0.\n(SynapseML v0.11.4 only supports spark version >= 3.2.0)"),(0,o.kt)("p",null,"Extract downloaded zipped files (with 7-Zip app on Windows or ",(0,o.kt)("inlineCode",{parentName:"p"},"tar")," on linux) and remember the location of\nextracted files, we take ",(0,o.kt)("inlineCode",{parentName:"p"},"~/bin/spark-3.2.0-bin-hadoop3.2/")," as an example here."),(0,o.kt)("p",null,"Run the following commands to set the environment variables used to locate Apache Spark.\nOn Windows, make sure to run the command prompt in administrator mode."),(0,o.kt)(l.Z,{groupId:"operating-systems",mdxType:"Tabs"},(0,o.kt)(i.Z,{value:"win",label:"Windows",default:!0,mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},' setx /M HADOOP_HOME C:\\bin\\spark-3.2.0-bin-hadoop3.2\\\n setx /M SPARK_HOME C:\\bin\\spark-3.2.0-bin-hadoop3.2\\\n setx /M PATH "%PATH%;%HADOOP_HOME%;%SPARK_HOME%bin" # Warning: Don\'t run this if your path is already long as it will truncate your path to 1024 characters and potentially remove entries!\n'))),(0,o.kt)(i.Z,{value:"linux",label:"Mac/Linux",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},' export SPARK_HOME=~/bin/spark-3.2.0-bin-hadoop3.2/\n export PATH="$SPARK_HOME/bin:$PATH"\n source ~/.bashrc\n')))),(0,o.kt)("p",null,"Once you've installed everything and set your environment variables, open a ",(0,o.kt)("strong",{parentName:"p"},"new")," command prompt or terminal and run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-bash"},"spark-submit --version\n")),(0,o.kt)("p",null,"If the command runs and prints version information, you can move to the next step."),(0,o.kt)("p",null,"If you receive a ",(0,o.kt)("inlineCode",{parentName:"p"},"'spark-submit' is not recognized as an internal or 
external command")," error, make sure you opened a ",(0,o.kt)("strong",{parentName:"p"},"new")," command prompt."),(0,o.kt)("h3",{id:"4-install-net-for-apache-spark"},"4. Install .NET for Apache Spark"),(0,o.kt)("p",null,"Download the ",(0,o.kt)("a",{parentName:"p",href:"https://github.com/dotnet/spark/releases"},"Microsoft.Spark.Worker")," ",(0,o.kt)("strong",{parentName:"p"},"v2.1.1")," release from the .NET for Apache Spark GitHub.\nFor example if you're on a Windows machine and plan to use .NET Core, download the Windows x64 netcoreapp3.1 release."),(0,o.kt)("p",null,"Extract Microsoft.Spark.Worker and remember the location."),(0,o.kt)("h3",{id:"5-install-winutils-windows-only"},"5. Install WinUtils (Windows Only)"),(0,o.kt)("p",null,".NET for Apache Spark requires WinUtils to be installed alongside Apache Spark.\n",(0,o.kt)("a",{parentName:"p",href:"https://github.com/steveloughran/winutils/blob/master/hadoop-3.0.0/bin/winutils.exe"},"Download winutils.exe"),".\nThen, copy WinUtils into C:\\bin\\spark-3.2.0-bin-hadoop3.2\\bin."),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"If you're using a different version of Hadoop, select the version of WinUtils that's compatible with your version of Hadoop. You can see the Hadoop version at the end of your Spark install folder name.")),(0,o.kt)("h3",{id:"6-set-dotnet_worker_dir-and-check-dependencies"},"6. Set DOTNET_WORKER_DIR and check dependencies"),(0,o.kt)("p",null,"Run one of the following commands to set the DOTNET_WORKER_DIR environment variable, which is used by .NET apps to locate .NET for Apache Spark\nworker binaries. 
Make sure to replace with the directory where you downloaded and extracted the Microsoft.Spark.Worker.\nOn Windows, make sure to run the command prompt in administrator mode."),(0,o.kt)(l.Z,{groupId:"operating-systems",mdxType:"Tabs"},(0,o.kt)(i.Z,{value:"win",label:"Windows",default:!0,mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"}," setx /M DOTNET_WORKER_DIR \n"))),(0,o.kt)(i.Z,{value:"linux",label:"Mac/Linux",mdxType:"TabItem"},(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"}," export DOTNET_WORKER_DIR=\n")))),(0,o.kt)("p",null,"Finally, double-check that you can run ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet, java, spark-shell")," from your command line before you move to the next section."),(0,o.kt)("h2",{id:"write-a-net-for-synapseml-app"},"Write a .NET for SynapseML App"),(0,o.kt)("h3",{id:"1-create-a-console-app"},"1. Create a console app"),(0,o.kt)("p",null,"In your command prompt or terminal, run the following commands to create a new console application:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet new console -o SynapseMLApp\ncd SynapseMLApp\n")),(0,o.kt)("p",null,"The ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet")," command creates a new application of type console for you. The -o parameter creates a directory\nnamed ",(0,o.kt)("inlineCode",{parentName:"p"},"SynapseMLApp")," where your app is stored and populates it with the required files.\nThe ",(0,o.kt)("inlineCode",{parentName:"p"},"cd SynapseMLApp")," command changes the directory to the app directory you created."),(0,o.kt)("h3",{id:"2-install-nuget-package"},"2. 
Install NuGet package"),(0,o.kt)("p",null,"To use .NET for Apache Spark in an app, install the Microsoft.Spark package.\nIn your command prompt or terminal, run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet add package Microsoft.Spark --version 2.1.1\n")),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"This tutorial uses Microsoft.Spark version 2.1.1 as SynapseML 0.11.4 depends on it.\nChange to corresponding version if necessary.")),(0,o.kt)("p",null,"To use SynapseML features in the app, install SynapseML.X package.\nIn this tutorial, we use SynapseML.Cognitive as an example.\nIn your command prompt or terminal, run the following command:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"# Update Nuget Config to include SynapseML Feed\ndotnet nuget add source https://mmlspark.blob.core.windows.net/synapsemlnuget/index.json -n SynapseMLFeed\ndotnet add package SynapseML.Cognitive --version 0.11.4\n")),(0,o.kt)("p",null,"The ",(0,o.kt)("inlineCode",{parentName:"p"},"dotnet nuget add")," command adds SynapseML's resolver to the source, so that our package can be found."),(0,o.kt)("h3",{id:"3-write-your-app"},"3. Write your app"),(0,o.kt)("p",null,"Open Program.cs in Visual Studio Code, or any text editor. 
Replace its contents with this code:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-csharp"},'using System;\nusing System.Collections.Generic;\nusing Synapse.ML.Cognitive;\nusing Microsoft.Spark.Sql;\nusing Microsoft.Spark.Sql.Types;\n\nnamespace SynapseMLApp\n{\n class Program\n { static void Main(string[] args)\n {\n // Create Spark session\n SparkSession spark =\n SparkSession\n .Builder()\n .AppName("TextSentimentExample")\n .GetOrCreate();\n\n // Create DataFrame\n DataFrame df = spark.CreateDataFrame(\n new List\n {\n new GenericRow(new object[] {"I am so happy today, its sunny!", "en-US"}),\n new GenericRow(new object[] {"I am frustrated by this rush hour traffic", "en-US"}),\n new GenericRow(new object[] {"The cognitive services on spark aint bad", "en-US"})\n },\n new StructType(new List\n {\n new StructField("text", new StringType()),\n new StructField("language", new StringType())\n })\n );\n\n // Create TextSentiment\n var model = new TextSentiment()\n .SetSubscriptionKey("YOUR_SUBSCRIPTION_KEY")\n .SetLocation("eastus")\n .SetTextCol("text")\n .SetOutputCol("sentiment")\n .SetErrorCol("error")\n .SetLanguageCol("language");\n\n // Transform\n var outputDF = model.Transform(df);\n\n // Display results\n outputDF.Show();\n\n // Stop Spark session\n spark.Stop();\n }\n }\n}\n')),(0,o.kt)("p",null,(0,o.kt)("a",{parentName:"p",href:"https://docs.microsoft.com/en-us/dotnet/api/microsoft.spark.sql.sparksession?view=spark-dotnet"},"SparkSession")," is the entrypoint\nof Apache Spark applications, which manages the context and information of your application. A DataFrame is a way of organizing\ndata into a set of named columns."),(0,o.kt)("p",null,"Create a ",(0,o.kt)("a",{parentName:"p",href:"https://mmlspark.blob.core.windows.net/docs/0.11.4/dotnet/classSynapse_1_1ML_1_1Cognitive_1_1TextSentiment.html"},"TextSentiment"),"\ninstance, set corresponding subscription key and other configurations. 
Then, apply transformation to the dataframe,\nwhich analyzes the sentiment based on each row, and stores result into output column."),(0,o.kt)("p",null,"The result of the transformation is stored in another DataFrame. At this point, no operations have taken place because\n.NET for Apache Spark lazily evaluates the data. The operation defined by the call to model.Transform doesn't execute until the Show method is called to display the contents of the transformed DataFrame to the console. Once you no longer need the Spark\nsession, use the Stop method to stop your session."),(0,o.kt)("h3",{id:"4-run-your-net-app"},"4. Run your .NET App"),(0,o.kt)("p",null,"Run the following command to build your application:"),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"dotnet build\n")),(0,o.kt)("p",null,"Navigate to your build output directory. For example, in Windows you could run ",(0,o.kt)("inlineCode",{parentName:"p"},"cd bin\\Debug\\net5.0"),".\nUse the spark-submit command to submit your application to run on Apache Spark."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre",className:"language-powershell"},"spark-submit --class org.apache.spark.deploy.dotnet.DotnetRunner --packages com.microsoft.azure:synapseml_2.12:0.11.4 --master local microsoft-spark-3-2_2.12-2.1.1.jar dotnet SynapseMLApp.dll\n")),(0,o.kt)("p",null,(0,o.kt)("inlineCode",{parentName:"p"},"--packages com.microsoft.azure:synapseml_2.12:0.11.4")," specifies the dependency on synapseml_2.12 version 0.11.4;\n",(0,o.kt)("inlineCode",{parentName:"p"},"microsoft-spark-3-2_2.12-2.1.1.jar")," specifies Microsoft.Spark version 2.1.1 and Spark version 3.2"),(0,o.kt)("admonition",{type:"note"},(0,o.kt)("p",{parentName:"admonition"},"This command assumes you have downloaded Apache Spark and added it to your PATH environment variable so that you can use spark-submit.\nOtherwise, you'd have to use the full path (for example, C:\\bin\\apache-spark\\bin\\spark-submit or 
~/spark/bin/spark-submit).")),(0,o.kt)("p",null,"When your app runs, the sentiment analysis result is written to the console."),(0,o.kt)("pre",null,(0,o.kt)("code",{parentName:"pre"},"+-----------------------------------------+--------+-----+--------------------------------------------------+\n| text|language|error| sentiment|\n+-----------------------------------------+--------+-----+--------------------------------------------------+\n| I am so happy today, its sunny!| en-US| null|[{positive, null, {0.99, 0.0, 0.0}, [{I am so h...|\n|I am frustrated by this rush hour traffic| en-US| null|[{negative, null, {0.0, 0.0, 0.99}, [{I am frus...|\n| The cognitive services on spark aint bad| en-US| null|[{negative, null, {0.0, 0.01, 0.99}, [{The cogn...|\n+-----------------------------------------+--------+-----+--------------------------------------------------+\n")),(0,o.kt)("p",null,"Congratulations! You successfully authored and ran a .NET for SynapseML app.\nRefer to the ",(0,o.kt)("a",{parentName:"p",href:"https://mmlspark.blob.core.windows.net/docs/0.11.4/dotnet/index.html"},"developer docs")," for API guidance."),(0,o.kt)("h2",{id:"next"},"Next"),(0,o.kt)("ul",null,(0,o.kt)("li",{parentName:"ul"},"Refer to this ",(0,o.kt)("a",{parentName:"li",href:"https://docs.microsoft.com/en-us/dotnet/spark/tutorials/databricks-deployment"},"tutorial")," for deploying a .NET app to Databricks."),(0,o.kt)("li",{parentName:"ul"},"You could download compatible ",(0,o.kt)("a",{parentName:"li",href:"https://mmlspark.blob.core.windows.net/publicwasb/dotnet/install-worker.sh"},"install-worker.sh"),"\nand ",(0,o.kt)("a",{parentName:"li",href:"https://mmlspark.blob.core.windows.net/publicwasb/dotnet/db-init.sh"},"db-init.sh")," files needed for deployment on Databricks.")))}f.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/8181e18a.3e65501f.js b/assets/js/8181e18a.3e65501f.js new file mode 100644 index 0000000000..715e785a83 --- /dev/null +++ 
b/assets/js/8181e18a.3e65501f.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[9226],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return m}});var r=n(7294);function o(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function a(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);t&&(r=r.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,r)}return n}function i(e){for(var t=1;t=0||(o[n]=e[n]);return o}(e,t);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(o[n]=e[n])}return o}var s=r.createContext({}),p=function(e){var t=r.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},c=function(e){var t=p(e.components);return r.createElement(s.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},d=r.forwardRef((function(e,t){var n=e.components,o=e.mdxType,a=e.originalType,s=e.parentName,c=l(e,["components","mdxType","originalType","parentName"]),d=p(n),m=o,f=d["".concat(s,".").concat(m)]||d[m]||u[m]||a;return n?r.createElement(f,i(i({ref:t},c),{},{components:n})):r.createElement(f,i({ref:t},c))}));function m(e,t){var n=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var a=n.length,i=new Array(a);i[0]=d;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:o,i[1]=l;for(var p=2;p=0||(n[a]=e[a]);return n}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(n[a]=e[a])}return n}var l=r.createContext({}),c=function(e){var t=r.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):s(s({},t),e)),a},p=function(e){var t=c(e.components);return 
r.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return r.createElement(r.Fragment,{},t)}},f=r.forwardRef((function(e,t){var a=e.components,n=e.mdxType,o=e.originalType,l=e.parentName,p=i(e,["components","mdxType","originalType","parentName"]),f=c(a),m=n,y=f["".concat(l,".").concat(m)]||f[m]||u[m]||o;return a?r.createElement(y,s(s({ref:t},p),{},{components:a})):r.createElement(y,s({ref:t},p))}));function m(e,t){var a=arguments,n=t&&t.mdxType;if("string"==typeof e||n){var o=a.length,s=new Array(o);s[0]=f;var i={};for(var l in t)hasOwnProperty.call(t,l)&&(i[l]=t[l]);i.originalType=e,i.mdxType="string"==typeof e?e:n,s[1]=i;for(var c=2;c=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var i=a.createContext({}),p=function(e){var t=a.useContext(i),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},u=function(e){var t=p(e.components);return a.createElement(i.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},f=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,l=e.originalType,i=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),f=p(n),m=r,d=f["".concat(i,".").concat(m)]||f[m]||c[m]||l;return n?a.createElement(d,s(s({ref:t},u),{},{components:n})):a.createElement(d,s({ref:t},u))}));function m(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var l=n.length,s=new Array(l);s[0]=f;var o={};for(var i in t)hasOwnProperty.call(t,i)&&(o[i]=t[i]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var i=a.createContext({}),p=function(e){var 
t=a.useContext(i),n=t;return e&&(n="function"==typeof e?e(t):s(s({},t),e)),n},u=function(e){var t=p(e.components);return a.createElement(i.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},f=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,l=e.originalType,i=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),f=p(n),m=r,d=f["".concat(i,".").concat(m)]||f[m]||c[m]||l;return n?a.createElement(d,s(s({ref:t},u),{},{components:n})):a.createElement(d,s({ref:t},u))}));function m(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var l=n.length,s=new Array(l);s[0]=f;var o={};for(var i in t)hasOwnProperty.call(t,i)&&(o[i]=t[i]);o.originalType=e,o.mdxType="string"==typeof e?e:r,s[1]=o;for(var p=2;p=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var i=r.createContext({}),p=function(e){var n=r.useContext(i),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},u=function(e){var n=p(e.components);return r.createElement(i.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},d=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,i=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),d=p(t),m=a,f=d["".concat(i,".").concat(m)]||d[m]||c[m]||l;return t?r.createElement(f,s(s({ref:n},u),{},{components:t})):r.createElement(f,s({ref:n},u))}));function m(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var l=t.length,s=new Array(l);s[0]=d;var o={};for(var i in n)hasOwnProperty.call(n,i)&&(o[i]=n[i]);o.originalType=e,o.mdxType="string"==typeof e?e:a,s[1]=o;for(var p=2;p [?? 
x 4]\n# Database: spark_connection\n eruptions waiting eruptions_output waiting_output\n \n 1 3.600 79 3.600 79\n 2 1.800 54 1.800 54\n 3 3.333 74 3.333 74\n 4 2.283 62 2.283 62\n 5 4.533 85 4.533 85\n 6 2.883 55 2.883 55\n 7 4.700 88 4.700 88\n 8 3.600 85 3.600 85\n 9 1.950 51 1.950 51\n 10 4.350 85 4.350 85\n # ... with more rows\n...\n")),(0,l.kt)("h2",{id:"azure-databricks"},"Azure Databricks"),(0,l.kt)("p",null,'In Azure Databricks, you can install devtools and the spark package from URL\nand then use spark_connect with method = "databricks":'),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'install.packages("devtools")\ndevtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-0.11.3.zip")\nlibrary(sparklyr)\nlibrary(dplyr)\nsc <- spark_connect(method = "databricks")\nfaithful_df <- copy_to(sc, faithful)\nunfit_model = ml_light_gbmregressor(sc, maxDepth=20, featuresCol="waiting", labelCol="eruptions", numIterations=10, unfit.model=TRUE)\nml_train_regressor(faithful_df, labelCol="eruptions", unfit_model)\n')),(0,l.kt)("h2",{id:"building-from-source"},"Building from Source"),(0,l.kt)("p",null,"Our R bindings are built as part of the ",(0,l.kt)("a",{parentName:"p",href:"../Developer%20Setup"},"normal build\nprocess"),". To get a quick build, start at the root\nof the synapseml directory, and find the generated files. 
For instance,\nto find the R files for deep-learning, run"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-bash"},"sbt packageR\nls ./deep-learning/target/scala-2.12/generated/src/R/synapseml/R\n")),(0,l.kt)("p",null,"You can then run R in a terminal and install the above files directly:"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'...\ndevtools::install_local("./deep-learning/target/scala-2.12/generated/src/R/synapseml/R")\n...\n')))}m.isMDXComponent=!0}}]); \ No newline at end of file +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[1747],{3905:function(e,n,t){t.d(n,{Zo:function(){return u},kt:function(){return m}});var r=t(7294);function a(e,n,t){return n in e?Object.defineProperty(e,n,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[n]=t,e}function l(e,n){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);n&&(r=r.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),t.push.apply(t,r)}return t}function s(e){for(var n=1;n=0||(a[t]=e[t]);return a}(e,n);if(Object.getOwnPropertySymbols){var l=Object.getOwnPropertySymbols(e);for(r=0;r=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var i=r.createContext({}),p=function(e){var n=r.useContext(i),t=n;return e&&(t="function"==typeof e?e(n):s(s({},n),e)),t},u=function(e){var n=p(e.components);return r.createElement(i.Provider,{value:n},e.children)},c={inlineCode:"code",wrapper:function(e){var n=e.children;return r.createElement(r.Fragment,{},n)}},d=r.forwardRef((function(e,n){var t=e.components,a=e.mdxType,l=e.originalType,i=e.parentName,u=o(e,["components","mdxType","originalType","parentName"]),d=p(t),m=a,f=d["".concat(i,".").concat(m)]||d[m]||c[m]||l;return t?r.createElement(f,s(s({ref:n},u),{},{components:t})):r.createElement(f,s({ref:n},u))}));function m(e,n){var t=arguments,a=n&&n.mdxType;if("string"==typeof e||a){var 
l=t.length,s=new Array(l);s[0]=d;var o={};for(var i in n)hasOwnProperty.call(n,i)&&(o[i]=n[i]);o.originalType=e,o.mdxType="string"==typeof e?e:a,s[1]=o;for(var p=2;p [?? x 4]\n# Database: spark_connection\n eruptions waiting eruptions_output waiting_output\n \n 1 3.600 79 3.600 79\n 2 1.800 54 1.800 54\n 3 3.333 74 3.333 74\n 4 2.283 62 2.283 62\n 5 4.533 85 4.533 85\n 6 2.883 55 2.883 55\n 7 4.700 88 4.700 88\n 8 3.600 85 3.600 85\n 9 1.950 51 1.950 51\n 10 4.350 85 4.350 85\n # ... with more rows\n...\n")),(0,l.kt)("h2",{id:"azure-databricks"},"Azure Databricks"),(0,l.kt)("p",null,'In Azure Databricks, you can install devtools and the spark package from URL\nand then use spark_connect with method = "databricks":'),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'install.packages("devtools")\ndevtools::install_url("https://mmlspark.azureedge.net/rrr/synapseml-0.11.4.zip")\nlibrary(sparklyr)\nlibrary(dplyr)\nsc <- spark_connect(method = "databricks")\nfaithful_df <- copy_to(sc, faithful)\nunfit_model = ml_light_gbmregressor(sc, maxDepth=20, featuresCol="waiting", labelCol="eruptions", numIterations=10, unfit.model=TRUE)\nml_train_regressor(faithful_df, labelCol="eruptions", unfit_model)\n')),(0,l.kt)("h2",{id:"building-from-source"},"Building from Source"),(0,l.kt)("p",null,"Our R bindings are built as part of the ",(0,l.kt)("a",{parentName:"p",href:"../Developer%20Setup"},"normal build\nprocess"),". To get a quick build, start at the root\nof the synapseml directory, and find the generated files. 
For instance,\nto find the R files for deep-learning, run"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-bash"},"sbt packageR\nls ./deep-learning/target/scala-2.12/generated/src/R/synapseml/R\n")),(0,l.kt)("p",null,"You can then run R in a terminal and install the above files directly:"),(0,l.kt)("pre",null,(0,l.kt)("code",{parentName:"pre",className:"language-R"},'...\ndevtools::install_local("./deep-learning/target/scala-2.12/generated/src/R/synapseml/R")\n...\n')))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/86ed1ff2.91286320.js b/assets/js/86ed1ff2.91286320.js new file mode 100644 index 0000000000..8580ab67ab --- /dev/null +++ b/assets/js/86ed1ff2.91286320.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[3090],{3905:function(e,t,n){n.d(t,{Zo:function(){return c},kt:function(){return f}});var a=n(7294);function i(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function r(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function o(e){for(var t=1;t=0||(i[n]=e[n]);return i}(e,t);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(i[n]=e[n])}return i}var l=a.createContext({}),p=function(e){var t=a.useContext(l),n=t;return e&&(n="function"==typeof e?e(t):o(o({},t),e)),n},c=function(e){var t=p(e.components);return a.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},m=a.forwardRef((function(e,t){var 
n=e.components,i=e.mdxType,r=e.originalType,l=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=p(n),f=i,d=m["".concat(l,".").concat(f)]||m[f]||u[f]||r;return n?a.createElement(d,o(o({ref:t},c),{},{components:n})):a.createElement(d,o({ref:t},c))}));function f(e,t){var n=arguments,i=t&&t.mdxType;if("string"==typeof e||i){var r=n.length,o=new Array(r);o[0]=m;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:i,o[1]=s;for(var p=2;p=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var l=n.createContext({}),p=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},m=function(e){var t=p(e.components);return n.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},c=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),c=p(a),d=r,b=c["".concat(l,".").concat(d)]||c[d]||u[d]||i;return a?n.createElement(b,o(o({ref:t},m),{},{components:a})):n.createElement(b,o({ref:t},m))}));function d(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=a.length,o=new Array(i);o[0]=c;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:r,o[1]=s;for(var p=2;p=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var l=n.createContext({}),p=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},m=function(e){var t=p(e.components);return 
n.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},c=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,m=s(e,["components","mdxType","originalType","parentName"]),c=p(a),d=r,b=c["".concat(l,".").concat(d)]||c[d]||u[d]||i;return a?n.createElement(b,o(o({ref:t},m),{},{components:a})):n.createElement(b,o({ref:t},m))}));function d(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=a.length,o=new Array(i);o[0]=c;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:r,o[1]=s;for(var p=2;p=0||(l[n]=e[n]);return l}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(l[n]=e[n])}return l}var p=a.createContext({}),s=function(e){var t=a.useContext(p),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},d=function(e){var t=s(e.components);return a.createElement(p.Provider,{value:t},e.children)},c={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},u=a.forwardRef((function(e,t){var n=e.components,l=e.mdxType,o=e.originalType,p=e.parentName,d=r(e,["components","mdxType","originalType","parentName"]),u=s(n),m=l,h=u["".concat(p,".").concat(m)]||u[m]||c[m]||o;return n?a.createElement(h,i(i({ref:t},d),{},{components:n})):a.createElement(h,i({ref:t},d))}));function m(e,t){var n=arguments,l=t&&t.mdxType;if("string"==typeof e||l){var o=n.length,i=new Array(o);i[0]=u;var r={};for(var p in t)hasOwnProperty.call(t,p)&&(r[p]=t[p]);r.originalType=e,r.mdxType="string"==typeof e?e:l,i[1]=r;for(var s=2;scompile, test:compile and 
it:compile",id:"compile-testcompile-and-itcompile",level:3},{value:"test",id:"test",level:3},{value:"scalastyle",id:"scalastyle",level:3},{value:"test:scalastyle",id:"testscalastyle",level:3},{value:"unidoc",id:"unidoc",level:3},{value:"Python Commands",id:"python-commands",level:2},{value:"createCondaEnv",id:"createcondaenv",level:3},{value:"cleanCondaEnv",id:"cleancondaenv",level:3},{value:"packagePython",id:"packagepython",level:3},{value:"generatePythonDoc",id:"generatepythondoc",level:3},{value:"installPipPackage",id:"installpippackage",level:3},{value:"testPython",id:"testpython",level:3},{value:"Environment + Publishing Commands",id:"environment--publishing-commands",level:2},{value:"getDatasets",id:"getdatasets",level:3},{value:"setup",id:"setup",level:3},{value:"package",id:"package",level:3},{value:"publishBlob",id:"publishblob",level:3},{value:"publishLocal",id:"publishlocal",level:3},{value:"publishDocs",id:"publishdocs",level:3},{value:"publishSigned",id:"publishsigned",level:3},{value:"sonatypeRelease",id:"sonatyperelease",level:3}],u={toc:c};function m(e){var t=e.components,n=(0,l.Z)(e,i);return(0,o.kt)("wrapper",(0,a.Z)({},u,n,{components:t,mdxType:"MDXLayout"}),(0,o.kt)("h1",{id:"synapseml-development-setup"},"SynapseML Development Setup"),(0,o.kt)("ol",null,(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("a",{parentName:"li",href:"https://www.oracle.com/java/technologies/javase/jdk11-archive-downloads.html"},"Install JDK 11"),(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},"You may need an Oracle login to download."))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("a",{parentName:"li",href:"https://www.scala-sbt.org/1.x/docs/Setup.html"},"Install SBT")),(0,o.kt)("li",{parentName:"ol"},"Fork the repository on GitHub",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},"See how to here: ",(0,o.kt)("a",{parentName:"li",href:"https://docs.github.com/en/get-started/quickstart/fork-a-repo"},"Fork a repo - GitHub 
Docs")))),(0,o.kt)("li",{parentName:"ol"},"Clone your fork",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"git clone https://github.com//SynapseML.git")),(0,o.kt)("li",{parentName:"ul"},"This command will automatically add your fork as the default remote, called ",(0,o.kt)("inlineCode",{parentName:"li"},"origin")))),(0,o.kt)("li",{parentName:"ol"},"Add another Git Remote to track the original SynapseML repo. It's recommended to call it ",(0,o.kt)("inlineCode",{parentName:"li"},"upstream"),":",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"git remote add upstream https://github.com/microsoft/SynapseML.git")),(0,o.kt)("li",{parentName:"ul"},"See more about Git remotes here: ",(0,o.kt)("a",{parentName:"li",href:"https://git-scm.com/book/en/v2/Git-Basics-Working-with-Remotes"},"Git - Working with remotes")))),(0,o.kt)("li",{parentName:"ol"},"Go to the directory where you cloned the repo (for instance, ",(0,o.kt)("inlineCode",{parentName:"li"},"SynapseML"),") with ",(0,o.kt)("inlineCode",{parentName:"li"},"cd SynapseML")),(0,o.kt)("li",{parentName:"ol"},"Run sbt to compile and grab datasets",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("inlineCode",{parentName:"li"},"sbt setup")))),(0,o.kt)("li",{parentName:"ol"},(0,o.kt)("a",{parentName:"li",href:"https://www.jetbrains.com/idea/download"},"Install IntelliJ")),(0,o.kt)("li",{parentName:"ol"},"Configure IntelliJ",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},"Install ",(0,o.kt)("a",{parentName:"li",href:"https://plugins.jetbrains.com/plugin/1347-scala"},"Scala plugin")," during initialization"),(0,o.kt)("li",{parentName:"ul"},(0,o.kt)("strong",{parentName:"li"},"OPEN")," the SynapseML directory from IntelliJ"),(0,o.kt)("li",{parentName:"ul"},"If the project doesn't automatically import, click on ",(0,o.kt)("inlineCode",{parentName:"li"},"build.sbt")," and import 
the project"))),(0,o.kt)("li",{parentName:"ol"},"Prepare your Python Environment",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},"Install ",(0,o.kt)("a",{parentName:"li",href:"https://docs.conda.io/en/latest/miniconda.html"},"Miniconda")),(0,o.kt)("li",{parentName:"ul"},"Note: if you want to run conda commands from IntelliJ, you may need to select the option to add conda to PATH during installation."),(0,o.kt)("li",{parentName:"ul"},"Activate the ",(0,o.kt)("inlineCode",{parentName:"li"},"synapseml")," conda environment by running ",(0,o.kt)("inlineCode",{parentName:"li"},"conda env create -f environment.yml")," from the ",(0,o.kt)("inlineCode",{parentName:"li"},"synapseml")," directory.",(0,o.kt)("admonition",{parentName:"li",type:"note"},(0,o.kt)("p",{parentName:"admonition"},"If you're using a Windows machine, remove\n",(0,o.kt)("inlineCode",{parentName:"p"},"horovod")," requirement in the environment.yml file, because horovod installation only\nsupports Linux or macOS. Horovod is used only for namespace ",(0,o.kt)("inlineCode",{parentName:"p"},"synapse.ml.dl"),"."))))),(0,o.kt)("li",{parentName:"ol"},"On Windows, install WinUtils",(0,o.kt)("ul",{parentName:"li"},(0,o.kt)("li",{parentName:"ul"},"Download ",(0,o.kt)("a",{parentName:"li",href:"https://github.com/steveloughran/winutils/blob/master/hadoop-3.0.0/bin/winutils.exe"},"WinUtils.exe")),(0,o.kt)("li",{parentName:"ul"},"Place it in C:\\Program Files\\Hadoop\\bin"),(0,o.kt)("li",{parentName:"ul"},"Add an environment variable HADOOP_HOME with value C:\\Program Files\\Hadoop"),(0,o.kt)("li",{parentName:"ul"},"Append C:\\Program Files\\Hadoop\\bin to PATH environment variable")))),(0,o.kt)("blockquote",null,(0,o.kt)("p",{parentName:"blockquote"},"NOTE"),(0,o.kt)("p",{parentName:"blockquote"},"If you will be regularly contributing to the SynapseML repo, you'll want to keep your fork synced with the\nupstream repository. 
Please read ",(0,o.kt)("a",{parentName:"p",href:"https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork"},"this GitHub doc"),"\nto know more and learn techniques about how to do it.")),(0,o.kt)("h1",{id:"publishing-and-using-build-secrets"},"Publishing and Using Build Secrets"),(0,o.kt)("p",null,"To use secrets in the build, you must be part of the synapsemlkeyvault\nand Azure subscription. If you're MSFT internal and would like to be\nadded, reach out to ",(0,o.kt)("inlineCode",{parentName:"p"},"synapseml-support@microsoft.com")),(0,o.kt)("h1",{id:"sbt-command-guide"},"SBT Command Guide"),(0,o.kt)("h2",{id:"scala-build-commands"},"Scala build commands"),(0,o.kt)("h3",{id:"compile-testcompile-and-itcompile"},(0,o.kt)("inlineCode",{parentName:"h3"},"compile"),", ",(0,o.kt)("inlineCode",{parentName:"h3"},"test:compile")," and ",(0,o.kt)("inlineCode",{parentName:"h3"},"it:compile")),(0,o.kt)("p",null,"Compiles the main, test, and integration test classes respectively"),(0,o.kt)("h3",{id:"test"},(0,o.kt)("inlineCode",{parentName:"h3"},"test")),(0,o.kt)("p",null,"Runs all synapsemltests"),(0,o.kt)("h3",{id:"scalastyle"},(0,o.kt)("inlineCode",{parentName:"h3"},"scalastyle")),(0,o.kt)("p",null,"Runs scalastyle check on main"),(0,o.kt)("h3",{id:"testscalastyle"},(0,o.kt)("inlineCode",{parentName:"h3"},"test:scalastyle")),(0,o.kt)("p",null,"Runs scalastyle check on test"),(0,o.kt)("h3",{id:"unidoc"},(0,o.kt)("inlineCode",{parentName:"h3"},"unidoc")),(0,o.kt)("p",null,"Generates documentation for scala sources"),(0,o.kt)("h2",{id:"python-commands"},"Python Commands"),(0,o.kt)("h3",{id:"createcondaenv"},(0,o.kt)("inlineCode",{parentName:"h3"},"createCondaEnv")),(0,o.kt)("p",null,"Creates a conda environment ",(0,o.kt)("inlineCode",{parentName:"p"},"synapseml")," from ",(0,o.kt)("inlineCode",{parentName:"p"},"environment.yml")," if it doesn't already exist.\nThis env is used for python 
testing.\n",(0,o.kt)("strong",{parentName:"p"},"Activate this env before using python build commands.")),(0,o.kt)("h3",{id:"cleancondaenv"},(0,o.kt)("inlineCode",{parentName:"h3"},"cleanCondaEnv")),(0,o.kt)("p",null,"Removes ",(0,o.kt)("inlineCode",{parentName:"p"},"synapseml")," conda env"),(0,o.kt)("h3",{id:"packagepython"},(0,o.kt)("inlineCode",{parentName:"h3"},"packagePython")),(0,o.kt)("p",null,"Compiles scala, runs python generation scripts, and creates a wheel"),(0,o.kt)("h3",{id:"generatepythondoc"},(0,o.kt)("inlineCode",{parentName:"h3"},"generatePythonDoc")),(0,o.kt)("p",null,"Generates documentation for generated python code"),(0,o.kt)("h3",{id:"installpippackage"},(0,o.kt)("inlineCode",{parentName:"h3"},"installPipPackage")),(0,o.kt)("p",null,"Installs generated python wheel into existing env"),(0,o.kt)("h3",{id:"testpython"},(0,o.kt)("inlineCode",{parentName:"h3"},"testPython")),(0,o.kt)("p",null,"Generates and runs python tests"),(0,o.kt)("h2",{id:"environment--publishing-commands"},"Environment + Publishing Commands"),(0,o.kt)("h3",{id:"getdatasets"},(0,o.kt)("inlineCode",{parentName:"h3"},"getDatasets")),(0,o.kt)("p",null,"Downloads all datasets used in tests to target folder"),(0,o.kt)("h3",{id:"setup"},(0,o.kt)("inlineCode",{parentName:"h3"},"setup")),(0,o.kt)("p",null,"Combination of ",(0,o.kt)("inlineCode",{parentName:"p"},"compile"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"test:compile"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"it:compile"),", ",(0,o.kt)("inlineCode",{parentName:"p"},"getDatasets")),(0,o.kt)("h3",{id:"package"},(0,o.kt)("inlineCode",{parentName:"h3"},"package")),(0,o.kt)("p",null,"Packages the library into a jar"),(0,o.kt)("h3",{id:"publishblob"},(0,o.kt)("inlineCode",{parentName:"h3"},"publishBlob")),(0,o.kt)("p",null,"Publishes Jar to SynapseML's Azure blob-based Maven repo. 
(Requires Keys)"),(0,o.kt)("h3",{id:"publishlocal"},(0,o.kt)("inlineCode",{parentName:"h3"},"publishLocal")),(0,o.kt)("p",null,"Publishes library to the local Maven repo"),(0,o.kt)("h3",{id:"publishdocs"},(0,o.kt)("inlineCode",{parentName:"h3"},"publishDocs")),(0,o.kt)("p",null,"Publishes scala and python doc to SynapseML's Azure storage account. (Requires Keys)"),(0,o.kt)("h3",{id:"publishsigned"},(0,o.kt)("inlineCode",{parentName:"h3"},"publishSigned")),(0,o.kt)("p",null,"Publishes the library to Sonatype staging repo"),(0,o.kt)("h3",{id:"sonatyperelease"},(0,o.kt)("inlineCode",{parentName:"h3"},"sonatypeRelease")),(0,o.kt)("p",null,"Promotes the published Sonatype artifact"))}m.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/8fd0c721.4f56f160.js b/assets/js/8fd0c721.4f56f160.js new file mode 100644 index 0000000000..6c500812e4 --- /dev/null +++ b/assets/js/8fd0c721.4f56f160.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[112],{3905:function(e,t,n){n.d(t,{Zo:function(){return u},kt:function(){return d}});var a=n(7294);function r(e,t,n){return t in e?Object.defineProperty(e,t,{value:n,enumerable:!0,configurable:!0,writable:!0}):e[t]=n,e}function o(e,t){var n=Object.keys(e);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);t&&(a=a.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),n.push.apply(n,a)}return n}function i(e){for(var t=1;t=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var s=a.createContext({}),c=function(e){var t=a.useContext(s),n=t;return e&&(n="function"==typeof e?e(t):i(i({},t),e)),n},u=function(e){var t=c(e.components);return a.createElement(s.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return 
a.createElement(a.Fragment,{},t)}},m=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,s=e.parentName,u=l(e,["components","mdxType","originalType","parentName"]),m=c(n),d=r,f=m["".concat(s,".").concat(d)]||m[d]||p[d]||o;return n?a.createElement(f,i(i({ref:t},u),{},{components:n})):a.createElement(f,i({ref:t},u))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,i=new Array(o);i[0]=m;var l={};for(var s in t)hasOwnProperty.call(t,s)&&(l[s]=t[s]);l.originalType=e,l.mdxType="string"==typeof e?e:r,i[1]=l;for(var c=2;c=0||(r[a]=e[a]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,a)&&(r[a]=e[a])}return r}var l=n.createContext({}),c=function(e){var t=n.useContext(l),a=t;return e&&(a="function"==typeof e?e(t):o(o({},t),e)),a},p=function(e){var t=c(e.components);return n.createElement(l.Provider,{value:t},e.children)},u={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},m=n.forwardRef((function(e,t){var a=e.components,r=e.mdxType,i=e.originalType,l=e.parentName,p=s(e,["components","mdxType","originalType","parentName"]),m=c(a),d=r,h=m["".concat(l,".").concat(d)]||m[d]||u[d]||i;return a?n.createElement(h,o(o({ref:t},p),{},{components:a})):n.createElement(h,o({ref:t},p))}));function d(e,t){var a=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=a.length,o=new Array(i);o[0]=m;var s={};for(var l in t)hasOwnProperty.call(t,l)&&(s[l]=t[l]);s.originalType=e,s.mdxType="string"==typeof e?e:r,o[1]=s;for(var c=2;c=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var o=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var i=a.createContext({}),u=function(e){var t=a.useContext(i),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},c=function(e){var t=u(e.components);return 
a.createElement(i.Provider,{value:t},e.children)},p={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},m=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,o=e.originalType,i=e.parentName,c=s(e,["components","mdxType","originalType","parentName"]),m=u(n),d=r,f=m["".concat(i,".").concat(d)]||m[d]||p[d]||o;return n?a.createElement(f,l(l({ref:t},c),{},{components:n})):a.createElement(f,l({ref:t},c))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var o=n.length,l=new Array(o);l[0]=m;var s={};for(var i in t)hasOwnProperty.call(t,i)&&(s[i]=t[i]);s.originalType=e,s.mdxType="string"==typeof e?e:r,l[1]=s;for(var u=2;u=0||(r[n]=e[n]);return r}(e,t);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(e);for(a=0;a=0||Object.prototype.propertyIsEnumerable.call(e,n)&&(r[n]=e[n])}return r}var u=a.createContext({}),s=function(e){var t=a.useContext(u),n=t;return e&&(n="function"==typeof e?e(t):l(l({},t),e)),n},c=function(e){var t=s(e.components);return a.createElement(u.Provider,{value:t},e.children)},m={inlineCode:"code",wrapper:function(e){var t=e.children;return a.createElement(a.Fragment,{},t)}},p=a.forwardRef((function(e,t){var n=e.components,r=e.mdxType,i=e.originalType,u=e.parentName,c=o(e,["components","mdxType","originalType","parentName"]),p=s(n),d=r,f=p["".concat(u,".").concat(d)]||p[d]||m[d]||i;return n?a.createElement(f,l(l({ref:t},c),{},{components:n})):a.createElement(f,l({ref:t},c))}));function d(e,t){var n=arguments,r=t&&t.mdxType;if("string"==typeof e||r){var i=n.length,l=new Array(i);l[0]=p;var o={};for(var u in t)hasOwnProperty.call(t,u)&&(o[u]=t[u]);o.originalType=e,o.mdxType="string"==typeof e?e:r,l[1]=o;for(var s=2;s child <"+("string"==typeof e.type?e.type:e.type.name)+'>: all children of the component should be , and every should have a unique "value" prop.')})))?void 0:n.filter(Boolean))?t:[]}(e).map((function(e){var 
t=e.props;return{value:t.value,label:t.label,attributes:t.attributes,default:t.default}}))}function p(e){var t=e.values,n=e.children;return(0,r.useMemo)((function(){var e=null!=t?t:m(n);return function(e){var t=(0,s.l)(e,(function(e,t){return e.value===t.value}));if(t.length>0)throw new Error('Docusaurus error: Duplicate values "'+t.map((function(e){return e.value})).join(", ")+'" found in . Every value needs to be unique.')}(e),e}),[t,n])}function d(e){var t=e.value;return e.tabValues.some((function(e){return e.value===t}))}function f(e){var t=e.queryString,n=void 0!==t&&t,a=e.groupId,i=(0,o.k6)(),l=function(e){var t=e.queryString,n=void 0!==t&&t,a=e.groupId;if("string"==typeof n)return n;if(!1===n)return null;if(!0===n&&!a)throw new Error('Docusaurus error: The component groupId prop is required if queryString=true, because this value is used as the search param name. You can also provide an explicit value such as queryString="my-search-param".');return null!=a?a:null}({queryString:n,groupId:a});return[(0,u._X)(l),(0,r.useCallback)((function(e){if(l){var t=new URLSearchParams(i.location.search);t.set(l,e),i.replace(Object.assign({},i.location,{search:t.toString()}))}}),[l,i])]}function v(e){var t,n,a,i,l=e.defaultValue,o=e.queryString,u=void 0!==o&&o,s=e.groupId,m=p(e),v=(0,r.useState)((function(){return function(e){var t,n=e.defaultValue,a=e.tabValues;if(0===a.length)throw new Error("Docusaurus error: the component requires at least one children component");if(n){if(!d({value:n,tabValues:a}))throw new Error('Docusaurus error: The has a defaultValue "'+n+'" but none of its children has the corresponding value. Available values are: '+a.map((function(e){return e.value})).join(", ")+". 
If you intend to show no default tab, use defaultValue={null} instead.");return n}var r=null!=(t=a.find((function(e){return e.default})))?t:a[0];if(!r)throw new Error("Unexpected error: 0 tabValues");return r.value}({defaultValue:l,tabValues:m})})),y=v[0],b=v[1],g=f({queryString:u,groupId:s}),h=g[0],E=g[1],w=(t=function(e){return e?"docusaurus.tab."+e:null}({groupId:s}.groupId),n=(0,c.Nk)(t),a=n[0],i=n[1],[a,(0,r.useCallback)((function(e){t&&i.set(e)}),[t,i])]),T=w[0],k=w[1],S=function(){var e=null!=h?h:T;return d({value:e,tabValues:m})?e:null}();return(0,r.useLayoutEffect)((function(){S&&b(S)}),[S]),{selectedValue:y,selectValue:(0,r.useCallback)((function(e){if(!d({value:e,tabValues:m}))throw new Error("Can't select invalid tab value="+e);b(e),E(e),k(e)}),[E,k,m]),tabValues:m}}var y=n(2389),b="tabList__CuJ",g="tabItem_LNqP";function h(e){var t=e.className,n=e.block,o=e.selectedValue,u=e.selectValue,s=e.tabValues,c=[],m=(0,l.o5)().blockElementScrollPositionUntilNextRender,p=function(e){var t=e.currentTarget,n=c.indexOf(t),a=s[n].value;a!==o&&(m(t),u(a))},d=function(e){var t,n=null;switch(e.key){case"Enter":p(e);break;case"ArrowRight":var a,r=c.indexOf(e.currentTarget)+1;n=null!=(a=c[r])?a:c[0];break;case"ArrowLeft":var i,l=c.indexOf(e.currentTarget)-1;n=null!=(i=c[l])?i:c[c.length-1]}null==(t=n)||t.focus()};return r.createElement("ul",{role:"tablist","aria-orientation":"horizontal",className:(0,i.Z)("tabs",{"tabs--block":n},t)},s.map((function(e){var t=e.value,n=e.label,l=e.attributes;return r.createElement("li",(0,a.Z)({role:"tab",tabIndex:o===t?0:-1,"aria-selected":o===t,key:t,ref:function(e){return c.push(e)},onKeyDown:d,onClick:p},l,{className:(0,i.Z)("tabs__item",g,null==l?void 0:l.className,{"tabs__item--active":o===t})}),null!=n?n:t)})))}function E(e){var t=e.lazy,n=e.children,a=e.selectedValue,i=(Array.isArray(n)?n:[n]).filter(Boolean);if(t){var l=i.find((function(e){return e.props.value===a}));return 
l?(0,r.cloneElement)(l,{className:"margin-top--md"}):null}return r.createElement("div",{className:"margin-top--md"},i.map((function(e,t){return(0,r.cloneElement)(e,{key:t,hidden:e.props.value!==a})})))}function w(e){var t=v(e);return r.createElement("div",{className:(0,i.Z)("tabs-container",b)},r.createElement(h,(0,a.Z)({},e,t)),r.createElement(E,(0,a.Z)({},e,t)))}function T(e){var t=(0,y.Z)();return r.createElement(w,(0,a.Z)({key:String(t)},e))}},1989:function(e,t,n){var a=n(7294),r=n(2263);t.Z=function(e){var t=e.className,n=e.py,i=e.scala,l=e.csharp,o=e.sourceLink,u=(0,r.Z)().siteConfig.customFields.version,s="https://mmlspark.blob.core.windows.net/docs/"+u+"/pyspark/"+n,c="https://mmlspark.blob.core.windows.net/docs/"+u+"/scala/"+i,m="https://mmlspark.blob.core.windows.net/docs/"+u+"/dotnet/"+l;return a.createElement("table",null,a.createElement("tbody",null,a.createElement("tr",null,a.createElement("td",null,a.createElement("strong",null,"Python API: "),a.createElement("a",{href:s},t)),a.createElement("td",null,a.createElement("strong",null,"Scala API: "),a.createElement("a",{href:c},t)),a.createElement("td",null,a.createElement("strong",null,".NET API: "),a.createElement("a",{href:m},t)),a.createElement("td",null,a.createElement("strong",null,"Source: "),a.createElement("a",{href:o},t)))))}},9009:function(e,t,n){n.r(t),n.d(t,{assets:function(){return b},contentTitle:function(){return v},default:function(){return E},frontMatter:function(){return f},metadata:function(){return y},toc:function(){return g}});var a=n(3117),r=n(102),i=(n(7294),n(3905)),l=n(4866),o=n(5162),u=n(1989),s=["components"],c=[{value:"SimpleFitMultivariateAnomaly",id:"simplefitmultivariateanomaly",level:2}],m={toc:c};function p(e){var 
t=e.components,n=(0,r.Z)(e,s);return(0,i.kt)("wrapper",(0,a.Z)({},m,n,{components:t,mdxType:"MDXLayout"}),(0,i.kt)("h2",{id:"simplefitmultivariateanomaly"},"SimpleFitMultivariateAnomaly"),(0,i.kt)(l.Z,{defaultValue:"py",values:[{label:"Python",value:"py"},{label:"Scala",value:"scala"}],mdxType:"Tabs"},(0,i.kt)(o.Z,{value:"py",mdxType:"TabItem"},(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-python"},'from synapse.ml.cognitive import *\n\nanomalyKey = os.environ.get("ANOMALY_API_KEY", getSecret("anomaly-api-key"))\nstartTime = "2021-01-01T00:00:00Z"\nendTime = "2021-01-03T01:59:00Z"\ntimestampColumn = "timestamp"\ninputColumns = ["feature0", "feature1", "feature2"]\nintermediateSaveDir = "wasbs://madtest@anomalydetectiontest.blob.core.windows.net/intermediateData"\n\nsimpleFitMultivariateAnomaly = (SimpleFitMultivariateAnomaly()\n .setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("result")\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setIntermediateSaveDir(intermediateSaveDir)\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .setSlidingWindow(50))\n\n# uncomment below for fitting your own dataframe\n# model = simpleFitMultivariateAnomaly.fit(df)\n# simpleFitMultivariateAnomaly.cleanUpIntermediateData()\n'))),(0,i.kt)(o.Z,{value:"scala",mdxType:"TabItem"},(0,i.kt)("pre",null,(0,i.kt)("code",{parentName:"pre",className:"language-scala"},'import com.microsoft.azure.synapse.ml.cognitive.anomaly.FitMultivariateAnomaly\n\nval startTime: String = "2021-01-01T00:00:00Z"\nval endTime: String = "2021-01-02T12:00:00Z"\nval timestampColumn: String = "timestamp"\nval inputColumns: Array[String] = Array("feature0", "feature1", "feature2")\nval intermediateSaveDir: String = "wasbs://madtest@anomalydetectiontest.blob.core.windows.net/intermediateData"\nval anomalyKey = sys.env.getOrElse("ANOMALY_API_KEY", None)\n\nval simpleFitMultivariateAnomaly = (new SimpleFitMultivariateAnomaly()\n 
.setSubscriptionKey(anomalyKey)\n .setLocation("westus2")\n .setOutputCol("result")\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setIntermediateSaveDir(intermediateSaveDir)\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .setSlidingWindow(50))\n\nval df = (spark.read.format("csv")\n .option("header", True)\n .load("wasbs://datasets@mmlspark.blob.core.windows.net/MAD/mad_example.csv"))\n\nval model = simpleFitMultivariateAnomaly.fit(df)\n\nval result = (model\n .setStartTime(startTime)\n .setEndTime(endTime)\n .setOutputCol("result")\n .setTimestampCol(timestampColumn)\n .setInputCols(inputColumns)\n .transform(df))\n\nresult.show()\n\nsimpleFitMultivariateAnomaly.cleanUpIntermediateData()\nmodel.cleanUpIntermediateData()\n')))),(0,i.kt)(u.Z,{className:"SimpleFitMultivariateAnomaly",py:"synapse.ml.cognitive.html#module-synapse.ml.cognitive.SimpleFitMultivariateAnomaly",scala:"com/microsoft/azure/synapse/ml/cognitive/SimpleFitMultivariateAnomaly.html",csharp:"classSynapse_1_1ML_1_1Cognitive_1_1SimpleFitMultivariateAnomaly.html",sourceLink:"https://github.com/microsoft/SynapseML/blob/master/cognitive/src/main/scala/com/microsoft/azure/synapse/ml/cognitive/MultivariateAnomalyDetection.scala",mdxType:"DocTable"}))}p.isMDXComponent=!0;var d=["components"],f={title:"Estimators - Cognitive",sidebar_label:"Cognitive",hide_title:!0},v=void 0,y={unversionedId:"Quick Examples/estimators/estimators_cognitive",id:"version-0.11.4/Quick Examples/estimators/estimators_cognitive",title:"Estimators - Cognitive",description:"",source:"@site/versioned_docs/version-0.11.4/Quick Examples/estimators/estimators_cognitive.md",sourceDirName:"Quick Examples/estimators",slug:"/Quick Examples/estimators/estimators_cognitive",permalink:"/SynapseML/docs/Quick Examples/estimators/estimators_cognitive",draft:!1,tags:[],version:"0.11.4",frontMatter:{title:"Estimators - Cognitive",sidebar_label:"Cognitive",hide_title:!0}},b={},g=[].concat(c),h={toc:g};function E(e){var 
t=e.components,n=(0,r.Z)(e,d);return(0,i.kt)("wrapper",(0,a.Z)({},h,n,{components:t,mdxType:"MDXLayout"}),(0,i.kt)(p,{mdxType:"MAD"}))}E.isMDXComponent=!0}}]); \ No newline at end of file diff --git a/assets/js/995576e9.7303de31.js b/assets/js/995576e9.7303de31.js new file mode 100644 index 0000000000..bb66aa6a2d --- /dev/null +++ b/assets/js/995576e9.7303de31.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunksynapseml=self.webpackChunksynapseml||[]).push([[3344],{3905:function(t,e,a){a.d(e,{Zo:function(){return s},kt:function(){return k}});var n=a(7294);function r(t,e,a){return e in t?Object.defineProperty(t,e,{value:a,enumerable:!0,configurable:!0,writable:!0}):t[e]=a,t}function i(t,e){var a=Object.keys(t);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(t);e&&(n=n.filter((function(e){return Object.getOwnPropertyDescriptor(t,e).enumerable}))),a.push.apply(a,n)}return a}function m(t){for(var e=1;e=0||(r[a]=t[a]);return r}(t,e);if(Object.getOwnPropertySymbols){var i=Object.getOwnPropertySymbols(t);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(t,a)&&(r[a]=t[a])}return r}var o=n.createContext({}),p=function(t){var e=n.useContext(o),a=e;return t&&(a="function"==typeof t?t(e):m(m({},e),t)),a},s=function(t){var e=p(t.components);return n.createElement(o.Provider,{value:e},t.children)},d={inlineCode:"code",wrapper:function(t){var e=t.children;return n.createElement(n.Fragment,{},e)}},c=n.forwardRef((function(t,e){var a=t.components,r=t.mdxType,i=t.originalType,o=t.parentName,s=l(t,["components","mdxType","originalType","parentName"]),c=p(a),k=r,g=c["".concat(o,".").concat(k)]||c[k]||d[k]||i;return a?n.createElement(g,m(m({ref:e},s),{},{components:a})):n.createElement(g,m({ref:e},s))}));function k(t,e){var a=arguments,r=e&&e.mdxType;if("string"==typeof t||r){var i=a.length,m=new Array(i);m[0]=c;var l={};for(var o in e)hasOwnProperty.call(e,o)&&(l[o]=e[o]);l.originalType=t,l.mdxType="string"==typeof t?t:r,m[1]=l;for(var p=2;p,,