From 22785d97761a4369714d0d4589a0c63d11a0cc31 Mon Sep 17 00:00:00 2001 From: Claudio <122022571+claudio-tw@users.noreply.github.com> Date: Mon, 10 Jul 2023 07:46:30 +0100 Subject: [PATCH] HISEL v0.4.0 - Better user interface (#35) * Comprehensive API for selection - to be used in tw-experimentation * fix hsic search * categorical tests * HISEL - v0.4.0 better user interface --- dist/hisel-0.3.0-py3-none-any.whl | Bin 16264 -> 0 bytes dist/hisel-0.4.0-py3-none-any.whl | Bin 0 -> 16741 bytes hisel/categorical.py | 40 +++- hisel/feature_selection.py | 3 + hisel/kernels.py | 5 +- hisel/select.py | 120 +++++++---- ...rkflow.ipynb => cont_cat_split_demo.ipynb} | 193 +++++++----------- poetry.lock | 151 ++++++++------ pyproject.toml | 3 +- tests/categorical_test.py | 70 +++++-- hiseltest.yml => tests/hiseltest.yml | 10 +- tests/install_and_run.sh | 2 + tests/select_test.py | 20 +- 13 files changed, 360 insertions(+), 257 deletions(-) delete mode 100644 dist/hisel-0.3.0-py3-none-any.whl create mode 100644 dist/hisel-0.4.0-py3-none-any.whl rename notebooks/{selection_workflow.ipynb => cont_cat_split_demo.ipynb} (60%) rename hiseltest.yml => tests/hiseltest.yml (66%) create mode 100644 tests/install_and_run.sh diff --git a/dist/hisel-0.3.0-py3-none-any.whl b/dist/hisel-0.3.0-py3-none-any.whl deleted file mode 100644 index 2e4e1823e7b8d4e46520c739e2b2eb007848560f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16264 zcmZ{LV|1m&_=hNcnK>8e(RxW0?3Z# zy^M&Wm%pQc7@&I)t`3EILQ?#LLQQQ#tk`YYe*{!D7H;EPY@W$7@EXT6nxhXSlPb}I ze<9{#8zwbT!yksB2+0Mt*e;(9A>+h>n@(P!cKuEggifFp&o|qO~{)tpf)Dk}&{) zzZUhc>6;k2nprqFTbUTyey(6aQzm|m9kKIO;|iVXC+9EF4oD@%A7b)iI)o=uh_I`9 zi?Vmx%JLO(H(NX+Q4CU>8~f@+2r{>DT$fn2$a*4SHAm-oJ91l*L2nBP?KW2565?$o zhl^bd>O$Yde`VrdEeX&ipxi*D^_d%6P00x$Q%kL7De&3JJOKh;oK<${$?%SAijWovwM+A)+!4X=vLj<-YOFwAqK?$?pl4KkhCR9h^8zLA7eyOy-H za|*SRsBzl1!RKji=s6Lra4mjK1&3gvpEgrt6VBQ2Q*L%ggU%4VOtXwR3jHo9QeON< z)F+jy{sk&53}B44OxUQ~C&lP>daCR)8`k8l6J5eBbkXBz7{;n73st!i10Tgk^ymcG zUkLTPY?Q%Zzadk)0u#3J%m))IE)ZQNhk{RBe#fq3DN()kW`oxI@>pZqhL*#QF00POU%Pm`GSc+ zNeQdJOZusup{WbxIphaiGwvoob590sQ9=OjSM2kaP(uPsm>GWRz%N6&Y;k(A5136? zd+YUem_j^e#8OqLTOOA9iWHmI6L!77nY!`w*=qh!`-ExP8SlVEd}+i|x}3ttT+ z?WYF@Uv#D7y{$)ysVclRjc7sIHQXd1w(?1yZZ$L%7rGRCY~1^u`MG}3t$_VXp;6RE z6*qz-HB_5AAdI5oF3|4&(pOkME6AaNEJ4bLMgty$CcR`Bwc9gUbiV(Ciu-J;@*e$Y zLZ6U%AqvFL=QWyyy~M_J%LM70Iol7N!4Z-!`Zv~nfn2;5x`8f}L~Czy<} zMZFHfl3SS`5R`VVq5Yj`kDNvC00)jhlJ_?97BeaZ!v;!CJ*TF_w-*tDNH>87Cu}4* z@q(j!M*5suUsKQ4KlEjye`~hW!VU|}7J2pU8kMg0J_)gENIyEo_T6a2xR;DYAQcgX zIpet|G<|wxJf5sSK4&}tDt9u>#%;CA!*Kj3T2*;_rpH9$Q#q%K%Oqdx*8v0LL3G0$ zXD0ro^_Z#FpV!Y&9yB0)=gz)4JMN#u+c)RL-*-19u%yTf=rERqvEhp21{UX4EX(s& z3J#4Z$u@!-YbbiBzZxVQSYKE6V9r>K5q5bLU2fm@MWDvUOrCrWQ+P{%@n~OS@aRM9 zxr=$uV4bsx`E*TcHSgZgLx<3+-T@jUoEle|LU#CY3TTHP23j?$#Vc2aQf>Ape4yV} zQd)tl9?T@JxnsC)AqIWjTBxs4ImCeJgH(2LhPoOw0v~@=5u&BR3VtxgGGpj{zs_K0 zj6^-v^!?*d-rAvvKCNLpP}5J&nMSp1pv#y0EtG6($%)nB5tP3EE)qm|@VCic zox@@dDV)_c18p6`-I2;QY4t%O>}8?gu+U%;i04U#UHV-Qi}8KVw}%HZ4wnF~Un0~Z zy^N6ZgE|!W4wu3s>v=mDNdncjmD-Rk+Uy$rA&!k)xU|gMUqOJgKmz731Uw)zI#vgi zI(hd`#sVz`B4(OQ+c$FvlTbGD$5gQITWeXF3Cldb19!ECplfa`lb-qM5ja#6 zD~`su8nn>^zg68li(;fjfmnTm_@|Sz1qy<~00985p#T8ve>*vIGb2|wXETFOe{E*s zYUN=6>FO%f%7Tm(SzABzmaYOiParzyYW*_-U3py*NW-Miuj5^9-5#K8=xYmW1hht?djbjj>%8kA{{s zp&(5AX9q_gM<^^>VpLrbYv?*@@w$Xc39>_lYJ8w9U&^M2RIC9bMx1SmLhd&cyl-e- zKSJ&M`m<`y8s~tkroRBq(pDzv7KP?yd_iwaR1MpZNg~}9QWc&U3KaY<4}x63%3K!B zt#o#W$z+5YY212V?ogqHz|ywYdtrp=FWuuy=@5DOfvpi+fUf&|+T)}Ba_VLffn@|gMORxb(3fD)kV@ycVmgJb1?yxpAqos;g+jzh3!aCh7NZOn5S3FcCyut4JH#`-U&pc>}eS z^0b{LTHIq5rz32|q;1?N7Yj1Bgf6|`_fD;-UXtH!TG*Gc`1m*RBFI9DM>Bh<=(pT(W=jqnNoAP4cNXb|yTejIm=b>ouGL3jTBcGzn3;7<$M?=n2`0k#S1MQ|Yf$9F7RlslB5%MCBfks>8o3TjCkFMKGA z0yKQGZ%wqjIvwjHJJ_#+o*wwGD5YLguzISl38k3V%@K$orIhwEPyF`>iH}D{EXm6) zIHbM6H9gfIvjp1S)!71uacil8GHg~gr1$1^rBWgC!RiM@t3RGCMg;{&t;e}iM{4P+ z925q{BJf92OVB1~_(o?u_(*boV7T;WnD+`JRtEqd)Qfn2cEKSmX)V`y4NIRUTe=v!}*Krmt=dqhCUT3BHKoE0gnQs3r{TW zyN;;4xbJ)|xOJ_U(hK?{D(t?s=yy?-HvvaaI6M%Po#TBCZZx0;7_C8duS?j$C*I_C z?&K@filWj&^W<;wgcb3k8yw)}X+) zc5n=I`zj`69|$$Wu0?QhUO^!s51jLwg`?mRsOvZx6`h?C3swJxoXZLd`3@n;;3W}s zgZdGp#z0nP45mLm`&4lmiP4yBJd8M?EjFK6u=xOxu$AM>4rXyU#dUJQ|3HPM&nVs&G9-u8Ly; zAc#3nud%l8YvZndVmDj(0P7!$zR{WM6fnD&J2+ z0jxK%L^3Ofj&$(zF}m4IcDS9;p7i|8OtVLyAS)^d=lc6%lIZNA+~Q;-?5cRda~N_G zk6coW99@eC*x_-VTE$CgJ{wZ}A^!|cLg|9p6|+8NPvHmq7qrbUOacc*HJZx{SD2=) zP=tg$F4E>-v&A|z%z`MW8=n__*N+}=`HXq)ltbscLlR-itiQUeulnz6hHvth4Eg46-9)ASV3m7(9p_;3Of0U%Q*pm5XU5~P zoxVO)_@~`5$^}<|`pmk?@c@8N)AQfCs*Rbmy_v1cr|Fr{GWg6>kpmu!2@}Fo*!71N zJFxC|GHs$z*AAmxl~AwfK(+Ji7EueMl!?!aFJYym3TUgECcrpBaHRGfIY0dSu*?k( zu}W8i$3S+p64TJ06_l=Zm6_J|)-Rvsuk<(`S?Ed;WyhsPLG84xBht(3;`Z^{r_OlV z(dlxdhK<>?$bzZGF0`mJJS}0+39XuQke>W}U-?!N3jIGWHsTSsdY7OV3v3l0V${F4 z7zgDH7}SBs!DP(V7M!;epxe}9a8-mQCU4S$gRQKtc=5xGYzW}ti$ivLU?fZoYSjXu zmiV@k)ZD0~bT1y981z{vi04UVdB>z_JSSmgwUxs4hGdw+Q+rXLQ=&3EXvW%Mv-N~+ zs>;#PK15~YfBs^(`S>n-A_aeF`%xrjkci+lfFfQ-QXxq%CWGCJCgoh}IVzosOmCEg zWapBkI6&XrhjDV*+K=!xZScC@3P=>TdZC+gY8pNfU;aoG*fR4A_iV4_E$CKtCL36a z73@58tTav~d*Y0!UXi9&)Msc~4!DMxF_QePG?xTMoz2rILfw$qmtbVx2{%SDR!d!~ zDh3a(G?NWF#uYR~%_=&`ghT2dE4-d~{1EvTP08m&X~jN_zm9dvK^kE{P*F`=PQqq) z6pl1V@3jj9x1~~~SMIFkm)o!A7U8cymM?umd;=xOb6BwKmPctD* z!*E(%pfq9XuW)kEUegS8O=?TwF9q!)qm+gn@@x)Qi(3cDL%7_vIjuzs8cLUD>60c_7~D zZ)rD9PWuKOV6}z=+a`J)ueWoHT$IEW#-AciDuucR zqXd|;`1Wq++eOwKu_WWyaXH-Q z&UF!xsZFRwa}2dV^W5(Z#)t_H(eK**TY2{Z z##;J~ty)84R1gYa-6hcW{PQbuDWn`83T#i2g5p9@=hb;yM2_{Q1*0ytdk~&U@*I8* zhG|JL=+RmJ5Vvj5!?WB>t|+=j42oIPAO|<*mW6k6BGU%c(|L$?woO6Dl%3$N;U!G= z-c;&7@dr*u3~)ZkT z#Mirmu<^v_bfBt!lD$_J+A{DP{?fB-EG?>A<_bZp#M(_yMJonoA);>kCViRDnc;(y zxk!Ngeq`xRYA%3rs;p4g=RB2p?34$OJ+iLH2EqK!w^cf^K@Hp3nK6#$%1DXP>fyi+ zc?IpH9F~*Dhh?;Ni~3tijg1W0i2=g+Pcnwvzj7TMD!mN+|oCw=}!Mpx#hLFxp;8T@|(AoX;cu^Qq z1I^4zyr5W2y{5L9^r7TY&3Wolwpk>hcja~}L7wvvl_q%JS)beaKBir3nG(^ZASRDdXEIOU=ais~|#Y|=#KW6FzwOqdH!BOG>&M#4{N zkp5xok&8UC?LcZ>g6!*|Ehu#UG)~#%rDv>f+=+P%j84V`E6arxCPe+Ye(ey+BEX&w zP*(v~cWMJDB9=KntpFV6~f9fdtkRR5_e z&fqjuVd*jE%nHchg^RUuYgWSZ$}-yna{1GwWv#VBy4|76nr}QL{e7Yli$^!#_FSH4 zRU|WJc_yG60)wx7C7hv=NRN&-sYoR{klIzspeGxhMP zs|#Cix7Bh3D|OQ_JO|Mx=YghFpJJ9pNs8sNQ)|{xN2Z=_dvyuC-=4?Ag4|G#Lkry% z8+1cupfJY~Gt3M{6|ve(sAG3OSKbLmKa{vYmfXZ>M($?3;LjsyUw@U9iW>tb2ISN- zXAU_+@|$~{oiUQ4x9E1b=8g3F^Ko(2d`BP$0`nHhO)}P-MvfSl8Pu>TgBmZoU0T{3 z*s?ghduh%5;DJ!gh3|p1&mWu4<9%4qaBh+|D#n}&5~Na2tqHZXMT!^)cY$kff_vL! zn+`?&)G(ML~gc-Rja!q{CgN$AV59#LMAuaO3%y8y|ye{6(KBV*>bbQ<*(AA#p zJwsAfx!b?Gf4?64?I6f%TVIJjSDIi`)TescduNqd6I^z3U%YZ;AYdS`rKN8$Rn?23 z-k|2nMI&aW_iIgtMp`yN4umsqj7SwndFH^3cfOEsqrBKB01$=kXFNYh=hn!OukXP-;UHiPgYIDW>Vso zM|B?8v_gVX@FR|}jX;EqJqV5GtY9p-Y0E%yhFI~euqaZF9wa_@WVJ;1b0em#er?VR z#xdk|-8E=GMYnrfm?|SSHQU3${XR}%WlR+-dZX1IlyHZJDO?D^3+I?J zYJoh38F0S^M~2*vg%F08cZzAH^=Fjp!4a_H#w6O9;V6@$>FIHh%KtuucgBOmGBYLy zjnN_8e|)sosApJwYQ9YT^4wWTb)y|)&D-Sd5XudzCur@+06WI&C)Dpmy-0ZikROJE_Zzp6chXXLesV2Lb+rIrzsDXcWXl#MoCrSm%Ph;&y}B5JC44$ z>npwW#&zQ@R=-=(Vjns=kYfkZLA_PXPJVDTwj1L2(To)X(B)iS>zlfr!>3mGeqrDw$*(Gp%O#(chjMC&wTb{zn`;StHgE;2%{}5u zs!VdS41crdLwes?-i?k5<<As2hdEAYBR@p{Il&WQBM@ zsa6ZXyS>3YO`?ibq&na=KpUs@$BgCYd%Hexfp%eroYQ5#egBr`#IV;ZEuIgmGR+*& zYY4h9MN|t-Y$RQ?SAaxdfI-o&$tNNBPM+_y-%mk7adFvk|Fg#yQJQRt3%y%a-LXbH z--t0wzoQ+O!C#=RDJ^Yb!9WAR%}S)%Gx5gks*lfosQYS)vTQ}qY^XRKTFh%6mI=n} z*5ga#phwfySW!QjF1d~G67!9JjHR@B1(ko+LDGJt>Aq)sY{phH+*>A5DP2Fppb#-V z@i?+~!@uLvit5N*fgUcYFwnrB;KDut*xb-W7E7*JxuOLMr4A-}$ZFJOe(u*}9fE*w zbM{SMb7!YF4*zZKl2{SjtfSud9dP<@in4^cLioZRMr7PGxf~piPXK}59zOG`PP~n* z^M~yW`l*rv_+V4uIwmldA>nza6mnSvu(r?82Wx(VOC2oVdK}v?gxbHi2>OfsPl}o{=hUJQyK{_LV^e`c6VEJ zV_d_uc~WZQvp{(i8F#KDiKvo3^Mn3o-(J{9;qB#hgJM0`vupjLXXI5|FLa#6gmeWc zHp*zC#}1I`&pNChYiABhand-!Q_y00aV0uJrNv}W+nW_Lt$D^yvh~)V3%+kt)CAL)_NR(X9Z&5JI&2Sr|MEf$dfJJ(IfHL7$c(l3b=e^y{e9hNyVJesS4+vU zPnD}!(Ub#S$E|j~W_DRR=>EmC@qDUoHw5~5Lt~uHpJHfCK70us9zr`{EnVDJi?R-< z%U%?u*E%hK2D4VfA^tqZ&hX#GgrG%?uwz6So%C5p)2A6ZID!`=IoGJ@BTF|6_89iCu0B2PWx+GzUt~C>* zHkcNCR7x?hv=m205_i$5e?9R&SoV#`X&RRXA8N8sa}&J%J}?Y561@LV|HRkHos2H% z+Qfrc+2dC#M*Cp&XRg8@!Qla2mrfYVYZ`GFkOSMQtq6|siCZ0H%RZZ5>g zx9=lPk85=+^m2vr@w;Zo_tz#yzwn7UOd;LiE?=$oX7$`8q^(qhXe-71^_VaH6I~Zw zLs?ww3aCsvp|>!I-lMrE*^Z7h7^yibY7!i{gGn1Sg7k6@!tZ35 zs=}rAZ*`NoW8H&_hud4eJhf<*=zK@GI%D|fYmS**2!Q?+7RFft0EBqR`@0<+@v*j6C^61 z#nxgXH*&Lx4ZlzfbS81_Jz_&1wAz+EQv|Cb-sab5ENH}oY9>heu#t16jw{?2E#zB~u7_G(8(y>d@3;j# zsLhBP4LOw{$&ZpGzsXItjbGLV9Y1B`&(C)5ACJk2;OC5hNJfbTTw3YR$P047F5*&_+_&2}M2(jlbUA5i378o{ zOkbemo;8jp$7Z90B1|zD?vSCZ6gzotl~xV7$01ftZr(fBI~YpU1e#=t zXyP!^Vgr+xd@YZqCmpP$KIMbdG6kGQs#3yCLw#ezNZmv5kR~gk72yWpRV0pkmn~S{ zm=#*_qO4=oLja$;EV3hdTv|oK#4&DR%P6K&grA#%5ZC7(z`=M|tSl@hBsCgv5b7k* z_h?QD;tjGBez6-iX=4BmseKW>1FbiL^dn~hmPw^qgCK{^CD+Eo4|uvdJb(z&WOF_d z1%P&#>)}EOvgj!gLwX}Zf>~8sQ9j1plI-y@hsd#$kn1usBYiD>;HFsPxg5cB>Q`Qt1 zFJ!{^w{Jzx`wIq|x}9?RiEZ4IQQQ+#+|$v4WajUiG-Ns_M(ezvIA-JE%zD(yqK-<1 zmND_#@!#bI0l$3r!x0(rTlWjcC5*rhyq3qx^0`uDmdb^$ddPriPCEMQf>SE0Z_4)~ zW@iq=+#Q%#m!Jyfn;Q3ARDfG+Zd)pETdE8|8cm9yLOU##_%|#m(5m)M# zpuJ`bi8`ym*W!Z#wWwE+oAes?%?X37ksxCM8a~R*se(!h?yOTpa>A458+IuP28A9U z(B{Bk1SGD-v)KS-7bK<(M$hX~6GOGmAAwzAz4Bk)V!lKyV}Y7uCV5Osk1r$)G&L~S z-qI*pEn3zHuZefkj453Cku=8YBu_B**~TVL&NwxoF3s{1XsR>>84^R&FGlf|^vLQ} zr{3F=mW?h%j6V3vv5P}{3b|V&d<9yCRa~os@pVl@@Pf0_t7C@RT}9Ks(Ph*Izq;i6 z8q?M$Xjra_=3$SRG+$BXB>synT{fDZx{+yyU9Twqq_|MgJZkjo05_IXyzI?wq{`PSXkq%8zCfUI77tP$fY;wgGQ{%n`@I zQ)`d4(-HVj{hfbpMNMgq4Wf+bHwurM$>yNFO_Rg??wN);UhM9dWHS+} z7=f{rxaJ>W^;qm#9iDQYF^eTI)*8Y5NkkiJNpV|=S?s!$^b!(e^d3$v1Q^Rfmot`1 zP6pt}q{JQ>MEQ02b!5v=c+p>sc2bc=S;fr-pHaU~1?C+IxgQ++B52{=RY$es4R`}F zR!NqQ`&-)|ax?~^rL zuV%tao4iQ0QOzC~yyAk?c*yhx%X3$xlnwqBdeKe`5z;fauvcrD*PqaThe4CJBaeSW zt9f67&3as2a%h%qX`O6g3b(63Uu=#VfSpkkI+r|jg0&}QCA<%gmsv3nfD{gpfHf&g zI9|TN_R8mia9lvfyGg0hj|&Aa&9*$#n+p{yFo3nV<|?5qp|~Fb*p(L>alB@;uYjO& zubpT4xYsKZbroNwAD{HE_T|?nk0RW>(je|;Pk4HShkL|Z?DsYa14}*sLaNKI>k$d_ zSf2F_JE)^fQIus(YGVc=v=vP?X!8py4)lD}X8T4PU)R1`+Uwu2VLM4*DNa;rjl5Xh zKSOR<*S`>HfZ`D&!9`bzB7Gdzj514u(P;EoZk0i)%NtDn9dQ>k{Kso0H>&c+0jKqw z0?TtT*2{Wi7sTC2j^WBj^``N4`}OVzrM`Y$Tp7w0smz=pWHmY(>HaAtFF}ZK9y&s` z10OO{Hxg--8?ZnPdliH&hf0)2^uX?AQ47t~paARb?jUO=6iZ!Yh&ojD(StNa; zg1G&fEd~KGdVI!1GAFdzW!~BOlQ8v(IPEZ~O+7A;ZaEU8#MqsRI|%Ca*4*G*uXb7e{Y8s&XW2R7<2-Zt%6P+-Z@( zYVQ zLAhB4K2lI9`DP~`mW?{1Y^1y-LC;jY0JjT^;kkaxbTlplvG0kRP$eeQ1v`3zgvI$T z2xk|>>myz%2aL@tfo;oh3Iq2})*`;AxyHIa=Q7ZNt%Cdnzhd5cq0X{l4Z_u=?Q8i4 znaL}AJKx{8v%<+jJu%6;WP$xFr@w^Atq$BLUgVT(UDE|yOXLz4M|-*Afwg!8WKxPm zO6kf>+Y3ohFQhh;O@hY-KKc!zZum&c)!w>=_LR-_uJHMNPN);&It0IQ)AA7!?X{=q z+MeX6QLN62zh0une%eS5MYEwr0Eq$A89i6vIpr^?x7;%zj<;)p{w>}{Rm<5aSE_Ba z)9M21-b}WEPL$W?AG(pK=vnG%u8k?JJA1C=j6 zeOOdSlNOuYJ!M1AuFd*K0>lrj9^Fd^k(5#cU7MS8Ac^4(7k;z`AZ%dXo&KeWE^_`j znOMpL2!JvxZHo*YP%$yR(8kaPvTUD8i*S`KsM08dd&=2D=Gg`H0+7Lr>htq?y`s0T zsnH@!c8c-dmF{-z^z5h`=j0vE>Jzw<(lgoggR<@!Blr?it;Iam4reCmb8(X8>S18k z_H43DPa~K#en3l|6HF~AZoO*pEig)TX}P=@0*vi=#(SZ!gx^EpF7Y@ne>cud{BX)( zrR9z>sVRsR>fT_jkLclIP0(Sm+GgnbTABPf$mik*g62!M(?^ajB|pj3{J}yglSxp9 z#lldgy>8f>HuJ@qMp?G1HNu7-?iEtUWW5Sb{5K^5zYtN2#?oqd=`r7gv-h{Ug!Zj8 zGH-#iBKP~8?Qgo`C^z5B+fqI%3`^(kpOScPxY2s(qA6k{B+t8m}= z&&mYlcZr?11gnZwKcoP!PFzd5bwBP)&iR)@&icmK%A5598*ugwyTx=^Tk0hPdyma(5G|t{BMK=~3Q(*9{46A+UaH$`n!r{8LbQ{6j!)*B zmS01tVT?1Q#z=O=)Tl-HHNw)8qA~*53s;ZoLcn&X(3{MZ(?+qJD>F2CS;#V`>9#pu zo9(FCvnH6`2_eKMGz7(2c0cBAF3F)}jOF0-3W{``>{TL?DtzR51jCAA;p~kbI|I_j zjh!F!JvtjD{f~W!v*f0hcH<5BD?-a&_oxUWDhe-@%#808&4kTbSR3i~09R($BjpY_2v5b_hiFoBct9_LF9 zx5Z-YY%NJ8CSu*owvwiN8$*?fMBg(UF#)!sGC^Npg*z)Cy-*!jS^C?>ymS$kRE@!O z5T?M$EUhk7vNp(U!a#E$*tZ5I>Lgp8R#;Rxb+TZnh)lD?3XDC?;P+A_HTsEX&uP>s zZL5wpg~#TY3WmEV5!bfTj4*_UqrU6W z!)CH28-@!KPqsq*m!|qS)O+ljrIbrJQiziK7&kPoOWjO@z^JRM_YJrCUrot$pK4Xl zr&Yn5E;Qvvdt1rH1L|Pr{~F~OgOiO<^rQLbNe64sSlpPOR}y&{ftT>2f|DSee1mv> zpZM~7iH6aERl*S@L$n(b17qs280ZRqb6fgNo%{9Rq(_dAmIu|t{1|v@W{>iQ%1G(4 zJON~JT#$plEk5$GUU~3$?>aRK^gGfl`mPB!{459Wa-R5#htmaPm{XhbmtkD`CNhCU z@`AmjQvmhVWy?)iZyPrP%C3)?&l0pXv5}k5FGm_RXe7EIz@uc3ISY;W1G;!ekxk(eu4PShLU z^?Qv5!m?Q{8v1HAyL67^bT6fSpk@VKXPI^;~5v?D*pGgW`XKBfEz3RfzNBg4eb@*C)Z8=K3y?mJ2bR{m~Mo z;~f``FT{UkZ#|s6*@q+^!Fxeoo-FGze|CHP65L)^&M-G^)Qo@kxffFk0rL{j`MVXs zfP>gmf`JIs__&%7%cp`<{o3;i?gzbErCAKKuYe4r#gq903UT~MxuSHqQS$Crm813u zjJ2T2Ra>;Xj22g>Sf-WlRQLzQ38j%X;q{J@#-}^q-W9}t!3!p>ZzX?QLqYd@tOOlW z0*6R`FXCF^3)1=c>y{(@Y1&FHaxgBNVqhD8p-azUV?np8rD51KLtRygC>;{fmxk=g z>kdmGGDqvn*4;iS*LWb<8RG@mZAunyX*qTPhM&pFW_klXR~1_S_|NGNhY$x2#5*jK zElb`EnX5vCmJ_+LYx2+@Qhjm8l4^#l8my+WbE_Vtdoi5WfD7n-r!tvXY6>h&TK8ts zuy~SR5lOi-o5bb)4=~+1fqkhITX4TXfVvmV1O5<2Cu);gS%3im@lRWb|3BA9bd2;Y z^o;bTRxYk|R`%u&402+sLZU*dLMbXU)@y=@zFk$C!!p2&BehTkMNo9`WI^PiL*srY1=7n_SuQD|^R?@oxl9Ea z*th@5yU@i+uvXVIR5KxNu5A>D?Io7w(2|450J0Dcw6!QNKgiRd2bLt|M;lG0 zIVx!jLn>Pu@KR+#*xepm?Y*&2MM8a{h{O5EK2h>pIPZ&n$5%;4gceo&UE(D5wQb9{RY+W&@FdC$?UCB}%* z8HO9;Xuq@4r0A;Py#PShWD(wU%rEiMu4R6+*`Yv_!N0VJaF8Fa;PPA|1Ii+^+Sf_h zs}v)%I!0XU2nTwz&KEq9%Q+H1cqH?|O$tr9ORCT14p5L>9vwu!ALByb?PDJZn=@~F zCAAjPE}vcYDAeEx_@fknaS_5m{d1dXeh!@fxZN})#l&RcMtPtE84PQ%AmRoCgwWkSK;OU1i0$zIyQ;ZsqTIO3B)o$6e}Nblk2DC!hB z-5Y(QLVyUOTa(PbJFLyk)xAD4q=BSM;(8>63vm-7uH}R>=}z)D;);3A{W5b4Ki4UG zPXVPhRpS6o`>F^6zdfKfWH%`+<9ZQ>RC$XRF#JxmPuMM)>76X5QA7aEW$GtLe9Igr zp0i7qPfSACng)+D18&2qyOfvj=J6lK^W)6(4-ruGPA#oA+*13ZCKH?9!x znY;Yby!z4ayBo`KF_6kWg{>IVK1|u}gF%}{zoIpP)G)B)Al!8B=vnIe+$8V)CdcQU zB${2E6Q0y^(o=oTP+mqY$9zn_SBkll0=02ZZ3EFSnipxi3Y8tm;E+W)y4EOq(bg|W zhRAj%n2{Fe^k zaLy2s!lWsd?pF}QlF0U?<}oiWP;+pXFOW=HJc_!aUkuB+;&t`6pe53<8cFI5Y<8|o z>>u9Q(@4=)_E-7p->=fWVl-z#Z-c$(iw2f(%P8H-%mIt}p~5g~F5hAv07-`>RnoQb zyDMeINO*%Ry_8|zR-Br&7>0={dhuy)ah*6xW6|wnU0Uz~nlGYdjx32#Mq93LyRfT9 z>t>s)u~jV!5EMjdbOK)TP@7f1d%>efPXgY(hQ;@-Bhc7xg!v13S(VI5P|-aK@@YnU zMGR#-G?aM|OjbT38tXULh{OC|p%i6+fKfsIb2ski0r;;^LC{}c|ED+iPxznRu>S%B z00lu^|AzmwH}+44KU1^qiA_!IqSwbb9}JdFQA|4}>jC&iz|QGZiB{FCCpOQrtA|5?@aH{K8PA5-{O z$${se_&;kD{>EQ^TEu_-0e>ywU+_QCe~OHMqisK{$o_@?Q*!*1<4*PWEpskVi`>_1`Ecg}wHhpQW+aohbAfO;@ zAfSI-)W4>0Z0HOyvvagCHnje{f(1<(yI<@`otITx6H4XQ?Npw~wL$?HT<8`71r&xv z_UcKg;d&Si*Q4U^US>~41r6VEVN}BK!rwx&9^sB_?W~xi@a$}%tJ)d~hV|FG*Fw~w zpZi0Cz4=|xWri>7%Z~6j-z^NdiToUy%2}ZaGz98x$i}8Zh51w_?A0Gx%EnvQ5fn+2*STV~R)FfHRR{=NO?TuiErN zD=iRI?JgpTy2)rK=^CEl0cHiNa3|sRG?;;JA)0q#3w$BIx- zke6;2xGoDruwm&h)s=G!&5!7z3o|k+R6@%~EVmOQjj1=*4o|82;d>>~2#(jE0`pr6 zcdbk@ss12(b&+_qk;3fQtuE&6g?X8 z8FNP-zXdVm9%Xr-{F3AdU_SNvlKO%p|NWVhCmU?ad_llneG4GIXoS6u6Aj9BSZT`? z^3~ip#!|3iin>QBR%Cm7CwYiX_Dj{J=i>c&0Cc=@Qr-{ZE5CGs+DeTN;5hG3Fzu8&iTRI3PpllY(04wmTTmBV+fYE239yrC&XiG zTc2+rbmpAsVX@^znq+ds>d3bM@q#SmXow5S91OYDZzd2O+M1a%7hmPsSYV}T@J+E~ zfHnaOvpr2mP!Detg__SeFv0@}8hG`<8G+xK^jp9RH4M=fyP;`O(y?B?KOhic+F#hy zkFv&16!s$*XWycO4#E{WGKxJIm?;l1$DzSL)Y!BSczk~hlTF@Y?CN}8kXyRTGM1}L zUzT%el`fdAXM79@WctM`sx!05rD1E3+8l?^KLc0-C8e(7&pzQhmC{ZCEw^72=F3km zo{>;Yhd;bj9}O~qBXMH7GmsAg3hduBcL}s#;O>;Na?Mi9ao z&INFVJ2X~!q>DuL*-VV#8sXYm=^Y?#J~qBfK)G=1jNbS*f&ui#Q^k$}<1!)rW<6;U zj+QfA4DTqYwLrI7;nyuDrrs%X(NhX(+SErLiMzKTb)REy_g46J7nOliw6qQWNR9EH z^$OJ^BKmidLiEnG<%I1PPaZv%3=nZA!H%js`H11tAQu!rz#liKyv5*$2P*ypMJ z-9pGbfN1EEH&fDIX6&HQW@UPm>`F;@dB6sny{i12cWH^B!909UH62pJ>Vw=!($)fx z6zeKuCcC*q6f zgt3qOQnTA|KOHh0^n5)@ZgTU_N@D(kmkSFfaob=V%h5z+W;|1Q63#J#0AfP$iI+#->J8t*mf zIts-L7%_kkSTEp7CGn-ht4xRm#Z2^@`8;25uvkxO^Qs?%*P4vj?UNX#yy{amB6-K=JHAsEC=ia@WpaAxhTepC8 z;3aB=&(x9nlwd^Rql|Z&cb`>pC!^}KPas{NQgSv0`0nPA*KbZn(` zvsZBSYI7Ce`NBWC&%= z3-aP31LmVd-3*r}X6&(g22_v3fzoZ;2gUPyt2}emv(bm-C9zd^h|W((gn`539RVgr ze?%J&Pk*hcM;q`j*mlbIo;WNyJ)_I87F}#$p~~=~XT)VT?y6NN(i1lY{Snmx`ZY)Y10 zO5eZH9{ej7d3q^{&>o2EmBWG-?iUw>22!j7=%A(OC*EJao2;lA4S88m&UfLi~-ezv9xK_ zy6Il-IO%Gy@B}d4u!xdpETp4dM#2g;9&r~n@d)@>U{vvV%A(W3m&FV@fjTexGhm4A zP_*&Tn~bM-JNvIvmOaUpmmS4L40@07=G$jY8)bx;aJmgW5>zydAB%F41%}H2Ik-I& zbrgBDTbIYHk=SD$J~oQR`SGOMQRBSRd!9?TB@^zvdxFvlX`?6oMRX@bGtl}i<42n- zVUk$J=UPJ9k#g4WACq1o(Yrq-ImRk!^4hMJ9c`J@2wcOrO6no~9sN+UNpRg^DTgO4w4tc5zB!nf&P*N3 z=vfe=(;s$|X(F8qu1B7txRtalmD6&vpVoN-n=y|}GO7!D z&LFkPMDT=OD$uD~$flI4)Dk=r?Ra(Ct^Qu!SE%KqI`5aEyS1g^4jmzIwxmhh{0{ z>*377H{(|oZc`B~vwUi>w#JPU$H3<2?7AGvt`&fl$!b1IAb!yY=r_qS`O zl`A$Vz~h{dVP6RvNg`u)L%83H`ICa)f&aeR_84R@<-mY|RFHsx;QzDQ%$+QZKNr}! zy1v~WE0W))f?;@SL8?sKK3%H{B#0g-aNs_bme@xB3Y%P(ff>=k982Tw@`klc)s8=8 zSz`oW*Yj!mb@R4Qs5qCRt7turCx6~$X#4|Q6ZDJVK-!O-FSMw4r~#Mj9_AxOUu7w% zC9OR0q0I`=@yWh7(e7w>{2JNDeid|g!+%9B^_YUwRdJ3h#ky_|Ljo|%fTct$Ck)aBNQUqIedszSEZR+cd))O9 zLsw16NEf3lzj2y(g`onoR$mdx(D$4)piC?J^T~5bzrL)U_@usY&o9{*D4YfTZaSzF zOuS|JYJrh$V1O$tdDTwo)cIDEo!6aXOFkli*s`R1>7A&SHm+9pi2H%8CZnTH{1I}8 zgiVy~*M3_K-qbq6H1*;SBQ9+O%DP!rp7QBl)oF#uAOb_#0@UnNIgPfEO-sQC43fAB4vrZSK)0xD{83%sp z{LLSAuszGp-rprjOvu(BdWKz-;NrZ3LO>oQ`!y3s!OdUCelj95D?J*z{u?=$1vJVX zVuJolJlHz*BW8`htjri}e{9x?;xaO$5!rYMv0qzsKCxg~jt6hlXuQnk0jHLb9J??- zq?<)xIl~L75AwCN=$w;)-ZW_775J0So9u`-MWjsIlIcexxm~;0IC?BPsGD}4biImj zJ!G!3eF0DabDnNvZC_I3j$V8{{nnim=)(qGDpfs`?o{qPD8zw2aQ zb-W7SPeOq&Z({Le7IqzJ5a(lbvl;C0+rd3)`5762N3Q@2Dm%ye`(l#FtfAcEL__SV zSiy5xauT;(Qp{`}vj@20aqU{gOKCnUQv4y`bWXz0T7Cu4r{pgDVEcl;@r_Afzo6;P@s`hs=kDvh!T>=PoNi~NY zp~>ueLu#8*tw)6uL<1XzVy%=g&9DI}=9y*@N}|Wa=f#(BQc?x9YhI}kB>~Z-j+|M3 zUE6SV47X7#Ha+_RSLMTF5pNwO?)0T;=MERC-V{%jnXiy^rAV^lQlnrtnwDW{<#jQ8 zpzTv#?NNW1C*+`pab(+K_N_+9i3E{l>D`!w6cnK0o8l`L?&M zfPhE>w%G0+(1Vzy&uvgwi;Vb?mXZIt#%}eIBYP}`aA56MEVdb^;4y$IUPe+ONiQaY z-HR^eSn571oq|Gdn1F2Kl%P03-`t0J{Mg!$n6xx_U2g#_3MaVGO*u7<5RWf^C<MpU;*Lo;GHE~y;o8gj-^(x=p71O$B&Z;dE< zb=Yv2k!3SvKN6sstWrh{8B}5}AGn{@ZH$ghyqgA((j`WOFX{Xt{4Sf6&6mWAV}ghe zch7?&`naQ~fw35$(RkYjWt_@;9UXE>HAk=3QrgL=c>oXxS4Tqx5y_D$GU`!~ zvvSz51jgUPnvqqgHfD#IJIAd4ceGnhcO1V{K}8pd%FyxZ{uw z8;mg>Wu9kr=^D0M0N@rDp%--*Lq@Wc4m@}!bu^ssarS^lT3@TrwHA{rA8|Lo#ptM|b*(yA;$)I7s zj-L2_T2C>vl+g$~%JRoa9$ajMi(93%SAiP5Nu3VcRr4q=GO*e|0m)N;mCsne6+;mG>FOD0r!qvO^!DK zTFWYod$p|Nml-O5$ktmWe)^cVwkARL*}-p|jqFZ4=#9QQtzR%Ce)IAo`$H$9)8u}M zX3`vy|%7}F+3eZg;+$QX)t2{aIYcfb6vgn1yi7VNfzG_g2RK+ z#GgUEFuzJ6iW)(lTbAdz`9k~hoG~`{9r~de%S*!AFwQ6p5AEVt5@e=q>Xg%LhXlV` zjQtbtR#j?OIiNjX)e+8CV$v?A=M)CfS;vD?KT$Uu_Qy*+h!MRg?GGcgYACe&rLfEp zUB7fjQhN39o~*${sFvI1oJ8a0!(J{SD0BHkJpX>u_=!oQTv$yf@zVjtOp>-&S~yGv zd`oUjz+Hb-Y}y6(omKf0Y~2bAzrqAhJ$!1w-S9+!vKbx+T4=kMgsbTUBh(Oph*Rx8 zEovS8(FAg4KkA7sF?oIRo-miWNO%vr2%X>xpCJ(>N=bDE;d&$0AfTyEKfH$AP^=KP zzuhEq*<0kURO}5@>%%N%AMi8NrM=eiN(CsYZ0kb#`?-dcqtl~GP!5y{TGBKEO$IuM zgp7C@F9naUKgF^0oI0#4)rfgb^QjwSSKAbA1*!-~5BdX012mr^{bh)n^d>9V??Mnd zn3+LWoUoW3FG063?A@XzqlQNfe9Q;x8j2q5(}jV7WGH-)du;X>gQ-XPc35)IR{-Tk zRNq{GQWO1|QcXU303o)+2g&>qW5$MzyDr_e8x6qg8d&>MxK57k`@rOva{CFSq55)8 z^fc(NE9WYn1Op?#GNPVPgTNG10Gro5^y(XA;tmjaa<;L%=vpEy)>fd}{ z&$QGg7J$n^p>69HyGsgj&^sxKxz^UbdGQ$?4P51`3SJu?7`dp)-Rw8%RKASiD`f^!d@Clr_4w$&|{}|IJqz zw)BZ8pZ#Yq7$tyGya~f$enRv;DmNyC9R^?QJvsTeJTh){V-YhPJ&FXr!>B5Ho7)#x zwyf}sOX~iMM}Z3_#<&W&^p*w3mxJktQNgT(k!2B}xAMulKWb){Fc_i?u!CPHIM;J}-yzKO4~FNjka4-g`-CqY zGZJ~y+E45C>0a>^3~<^e;lt)YF7>D>fqKiRxP0I2aa_QDr*YXBuqphx zDT{{v|4+liy+Tb}VV4ulcdCjrFcnQ$dW<=v0xEFfVr|@#mGHc>%zB?({v=^pbFGkW zXXvu#I}b^JpJ>?P;m!A5r{`H^$@E#C377`|z$=#^u*xDN#d6uHHEO8C zQ_i+Lx&+>D&tu_0Z>Yy%gzk#1e;qUhmAux!M8WTziqHh`y*%fg3_5AolC{Tj@%VF$G_)6MJ_`Gb#w5L7I|T% zJMuwY7jI=9Pcv!RP<7^_5d-L+uF23y%lgTIa>k4iso*Hh>;rh`3;EW| zi@p4i5Ef67FSf3PQrqu2eW&{xHz%KAlZBssvAeBN1TD$A9Ul^7#q~dE3;KaI8*(7y z!cMUGM)7{zQ+qvGG!X-&#LW-u+^%Vb1f>v0?BN=L2^o738_!w6S#VRAfnyB5#InMn zO4+-Sc-@iJ65Y>@n6Ub^IW8DQk=J$Cp#K!z>TO}F4BOCX4}tLcIDwNfQLN~pGLbd1 zE-YR1rwEELk`2KpM4dI#SZF*}jlq0CO-*g_qH|NalS^5{`h~qBDx##CRQ$f!GXDri zMhBh%pu_uRwP-TLnw$dmE5**qgU&JcFsz7!TjG+}@(cjM5>$ zfAXsn2PKw-D-3Mmf*)Qe$CP0U)Cuf>>n#Kd)K)a4FpRuIR4c77qg)S;fCV=e(fSNW znG{V=kDXM0&Jf-i4-U)Bm>3LZhj9PV;aa1vLGg*{GV#lEXC>8*R+J@glc!xUH<+%V zr9A`O7^{y^zXSCmFX0hq#BV>;m5{*+D{iZ3XO4uZ*Tv?a(}bz!?z~mkE^uaP zfXBcI19v7C_SKk#5FAO&pqRUdiE(RvK#CZNQbaCyeE>8Q`~5=GwctXzWCy1S&lk_u zhBC~Os={x1mwWCjKdZLwy>EZ5^wt~Ijkj3*ZbgrN=;T0&9!LZ8R0bUX;A(6)!0)3O zD+a=lb9#|GGK;oAtm>B)XFF(i6d5$k?~N`QD$)K;byk*SZn@e}H2Zc^BC(Y&DSp)y zcj8-hKT`JNms=?P@|=f?Q(B9x8BmikN*QXU=)N(t-db%&t4%USqiin51)nAbUFn7t z0(?!qUbe?OHR|AL6ToZ9+yi# zGY|E|;L9o^Xl<@J$l1UZoEGlP~3s6}T zuE1W|+(n%TWCrO{xG>$X_yHD3`;@A+KzO$|SSJZo(TY_2y!zSR+Z3EpcIVtEk!6XyqF+X6kjc<1+XP)HS81E-dJ)197twY4l9IF+1zw zb06runxHOQ&@&q-4hI+WnucV6GrRP7)7a_KbTwAgPo_z3;X6fr=O1G!ZC*j+pS6>; z9cjAn*&3U%mJIcjiT|3W7p7l`loo#!-n;JG@n}JHXsSRDpHS$pZ%c4t>jz?LU@ZGZ zu2`v}1sb&uHgU*e)Mqp>OHn#_mhkTK@;D z!w_MnAnn-ho6vc+W7h`(RZZTLJlSEL9}DerfRYoV!(+vonH%eQyi0#O_iO0#g%FsV zPA0fiDwG`xm)#RVIfR#$KQ1z|(!lG7@`fVhrTT-=l-D`ARVGuMm z^CSb$o3$-Vs^df{)(j2zY}JZ~;Hg6?5Yym+dj@CQh{G`Te7Hg^mR>@}uZ>*y34g`= zWvQdhGtT;g(0oj($2kcJB0AaJZPJZ#4b$dHsgBPA=TW5JIgiAnNqWr>`U1Ycu#Y0x z%IgF~yZ_3n^@*I3S82V_b`%rR5ujKvqlp~bN1;FKuzakY*)PRO<%CE^kLJad=m?e; zlR;~LPDHv67jTe?%t=n5CeEI3`(-|6*Sal{xAt8z&B;Wx%Wkmh`9-ro;t2zft};EG zWt1^?D6(Svu(Y&Ast_s%|F;=6PLx zoXwYFXiPqI2?GIA%Wo}B+**^e4yVgj6tveeHGc-HR^7}GH+o^oHa!IqGZr=MWX0KR zS}vpZLcQNcIC;!)_eFLD(g#Pe$j5d=!BCxi{Kxl2>|}7$24DqQhe(O_qX$WL3FIdk zoHOJ}oQH<}dg?oY2w)#Is?=JzGH0Fp1&N^sDn}of=sW#OX}R1LDquDacC~!FzXu;@ z7&`7cU?3phPt^zGKUJ4Kz|qFV+0Gnb;`mRs$0lK{GLsQ0f^ZPYiM;&S~q$P`g_H-*y7o)jVzeacHr@BlM zQ|3>w8fvYY#<>j|3>l=E1JpV9w8bB`#StN~V8PUQ=i6|;W!+CZR&cUS9c2b| z(&IS-Qx84xd9*HIi68`D%y$+X?`oNdX=B%*C%F`NRm@g4>`2*FxWnLdZczm^=`p9m zLXI9V>03rmRyV*|m5m`u7^Gv##HaYvVUk&82 z)JeWCN0sDxbH5_i(3b@>^QD^$Df}&G#NlzRZiQa1P(F4CfO3CrYsLn{uQDMCep#W(D zwK-yEHA^4qj1)-PpdO%`y&rlf!&DV2wRfwN$Q|t(P(0k;^6jZbvqU=w@#^f4f_?Fj z=m8V~2&jh(2ng{%fAhZtz0VWB0UX`fHTIkJuE0<)jg|%Kw&Q(&mLESXGJXg-9TTn4 z*xSP=Tg|kcj3y9|c%E;$7^$av&E8Pc z*h)55jkc26)fK^yFsNoGgHMRuT!M54@0}>_%l6joRA1)m_qaW?7t*aC)LUrGAtF>^ zoPYzVXHAEoseY8@iNP>z=_gAIH);0WWQ)q5#{H^kHSjQolQ>aDe5iA8-$^#AX&L)c z5otbT?p2}*LTMdIbAtR3`XG2qO5+Mps>{q^i`FQ5)~n7w$58KT_j^{p*;&H0+vt4S zKBqxR{Q1(2VKbo)3qp{3+F0Jy#J5zwR$E5agTwStRw<(~D=>B!SLX(JZRJHPb#Fdxj|p3&aicW~3;tzmk-9#$tGx|m8TR^WHxrRzvu}>fpwL<(q0CfO z*H$PP`hv3r)d~bSf}k^jy1sJuNCL}Oca9I%u<6rqGpSJrjVa^!{stAN^%7$ur^ZT0 z*GRhsVgRJx9)U+?RZC~c1Jg_LYaq zL`bt*=nvpZNKnB}g_Dql41e6hcJj4gl!LKB&M#e}mK)YxQT&;-b?+^es?I@(dPX}_Yia^^WL zs^GY&bOX9juaysXLh+aJXSHgqmQJDc^G~>p;safQYW7(SkN3(T&|oQbOlTvEU`bnJ zwYRD6nV_p!esK&!)y0->o#MRts`B;rx_cQENfDKS1P#*w{4kYhi&D%gEofX$Z3cMY z$`3~}s?qu}J!2DMe-uiWQt|a5IO+h<19uNNA&4)LCH+K*2)8zmAwx$$?h zb3w*QlVjwUoA3qa;*rPjfyVF(z%CT+O+%&ma5JSuNfW{QV?z|tIjsVJ1B<;~Sd?`B z{%uFSlU~3G&BV2Hi+gR1dtEgaUtd_a2{XUr8#EHd*yMTlEMx^HO6K9;!EDN}EAu-evJ zFZ?vDA!ZW`bIH@V=xA9&8e9;DelytOw3Z$de2y4O-z@ z!gg@;vqm1c3b}$>tw2a=B{G%w@(UfvlNVxW)VVGpqrc25K+ZcCdG@=HY@+7Q6&B}Q zo*8nB(V1l>`A?K)=i+2cls&ES! z9U}x|M5(;UduMzpr@=BuuQj;D38KO?{59IttKNgI?~=6z$~Vsl`QXm4()ibKh>FlTE4`Yl3$!<2dK+J!33{SQ;h z`4>fZjLYqHpM54Q$j`&qi)L2WsaEloIWVxC!0IKvZ9@SIA`1JgpCWFabaj(k*E=U}sQ*4wJkdKk_Vl)4K??$AuUQJ`{#yHTt}q$k=<}bM)MKSe1f1#%n*9a1_4y)1|mistr1F?O+$!@6@=o07pu;9nGOS_^5I{ zOCeS?6q|B{W-YFOqBF1IpLn^lekIaRaOZU7W_rAWIxn~j^7eobE*~Sgymona_mZdg z;USGn>cIaxMunH9-0mTFOO>LKuOc|JA&KA3>OYN~?rVeidQd%7-$r$+M!Nnrq^X%X zRKHUF=k3_U@IbQ5sAY=QESPo5D)Y6>PRuoD@#|?4N-Ob0u(NUB;#u|rJZgBPS?(x# zzhk}7w21a(i;5Rd4E4vZ%DY1Zzbf60KmRJq-6W3Wfs_S?cAG`k5Ol6;w&ntlplP99&MqPAL0lg z(`LV6kkZZ`icSZ=7$(!{g`*I77bP-#fR0Lz2OHFCa@|`@A>T}AE}ELWw?6ikbE&+r zY%Z~3oai!>r zQR%w6Z;^?ZLys?+`@0+A?++4kp&?9G#$~b{Z@*CMK76wvB}Q%)MG`q|L{X2M7_>G- zdX(^<_v=Z_4fC*YUWA(NU!tC&bO8fW<^lB(L0?_EfpF6N`Xh4;^-D*AG){OT4v~>| zZv7H3N}EE|N^IDIT2xA|^fRj_DXQv-wtaz5xSra5C>hRzw@cUMy}Xb29PL8`HPPTb zVI$Md;LgaYSpYi9$PwHSRJDw{c#iw)Cro9oEAkHzJIgx45#C@1fSn7V%H+{ZHC2-!;IpS;X ztX#4<+cQN!b0FctPT7Ngp5U^9NIiQa*wYSJFJf6vsmAXisN~zq5O^}OwS0wl?O+=|NHP7yt5w{p zrgGiVwQ%^$a{^~f(s&(_f^=k8^^ z2a^)pA2{*6X_~3h)JHC;D)}Km$x0eu#B!LaT2B{jjn6YWg;QnfPHa(U0K}fS6_d?o zrDg{M+ZL&w;@1;E3(>;#5CUI}x!!(tTgN$G+{+(jXPhu)oiW@wbLVU{H*E; zX|kDLW9GJO#3`2gUDU@(E)&EI)r>P-i^>=SJUp2Sv1Qj8UzF&!%ImbBnh|S2Lv^W; zB{|zdxc$CFNL%VlX&Ti|X*q)yhM1o-g_YfV-iN}c!S)I@gv@JmFa2A}_ZcpHPw018 zaZbd>9BuIuSLE=k$k!kDso*PWkbweB2CQTs6RXsiyuqsE zuOb_!kM)!0WAot(bpr%kuO<`Okt@${XRbesRQQvy+(95_P+}sG&nVv*z^$%Uf*koF zxkp*O3O*uD2)Le?P^|qTkmh8Xf?~*zbRk=*U{RuhL;yWvDC=nTP^1sDfxB~P46!JC zdjzOKGLLKwE2K;=K(+AmF_VmVscx@n z0$&9ZqMh8ee=^-LPYR-jHOhz>BiR;Hr55=ti&7IJ(*4*AR}brgz;~uFngB|vqhA~= z(=~Wm$kM0jwm4p!ZK&BZCzxFcA;l-u1;tr*KIW}1$)RP8-_!vOLH(d7u1dYY_ZU^gANU8Pk~zgEJ&#^!ZUOrdu)dk?n4;8_`rof^Mh|7 z=qC`v1Wx*Uj5js>CX11yr6iS@h-EL^N~+Rr6jcf`ea~>%1o(>b1bu-8?yP|HLUl}K z>2D{~(nUB@RR)tm*aAa9YF)5oZ2(|GUtn`)BO^I}$GF=bbKquaGp$Rwo+e$7Ta0fGgQiOdJP8L4VkLI7p9jrZLF=IX+3FKu2 z9>R+X4ubIV4dV5E;>+(P>W2FkafeV0kuJy#j48unU@Q2|ZD}`kuGfQ;ZrNU%Zd4ER zV-P7BJxc4!Bc(_31W?5>0d{)U_$WuZ<$*b#b*dB?cchv0T@!5hnRcG#Jh2rICksZf zCsySz!?^TKWCHQz1-l6+K-5>4EjJ;(ZQO{cJ6>X5OE8wihAu*<_B5(6$aDcfkCHv6 zJY3VwxN}(8q@M0zDFoGVskX<2C13VR#F>HJi70GSqY;|q3(d6*!v_1blcQatd&^FX zJMCXaqON2)(QdpGo5>*}>PMgbSPGuoqBgrYy~%#FKvbESr#3@BK6uqX0#eroPcy6> zYU&?lv5O3_hCME6g8}fGVLA(3~cyiC)_QvA}!~YIXb`9;V5aYvzpk;opM}jrY zl_Qaw3pt(j(GsEU854;w#D8ULIh?rBhb$h(dqG~FDC;(Vc6;;^*j`r7FgI=3jDPoe zFD4fP%}c=K?^J*Q?MI&w41}S@#?%CvKNXzl)t*;yKj_weoy7zs0i_!*9?ut0h~tOL z6{W$Cl6SuUFJGp*!M;qMp6m4;h|*4u|0oos)9R}ecz5KQ>B znfQGT6~pJT5^U%z1Z3iS5!VV|fcA$g)_hLr)3k+Z_+U&H#lRN+LYJ=H`hreZOT(~9 zx|+&YqBJNZZyK^EkGn4d;n|wsHt+UGxyJp$&loSjZ<8}|OUtqSFnx@VH_{sDxvJ3n z$A3URON z=TIgSO-=Cyi`KQ7G$fYfG%O)^W`nrA{{gl;+rKY`ViW!p6u5iA)b9^9m}1C_gcAe^ zNb55e;{W^HjE<3>m7bB_#KOs$&cfEzjzLaLMMzXgMJQR>P~p=Q|NB#^LD3f^mY+ii zR|U19&Je^##fA$ml4>DedRXj7cqlCV@J&^WV0|dmr(K)6T)(}2&H$4^lf9r4eXxZS z$xOf0n8U)rEoJ60Fc4S{Xt6oP?lXlukfDJk7*teQPC08j4Yu5~Y_cj=Q9XL919Nr| zH>}8OylQwBQlNv37?}|FDD(4WuPq+R5ogEAkov}S6D4(T7nlQCZsqFq6k@xvlVwo; z%2$;c{lI6Z@M!A&D>EMcq1eQ(M#nyhskzaX-QL|dWo7&5NWWf4-N-vY;mnH{quoS` zx_7IQ*!8MQxr(j!v4l6dq2-4Ln5`%dQMxnf@6RDz`htxT8eAg*=W0Zndw zfZ(BuGb_?1O`1^winGheLQ)`U(?H&V($;A5d>le0VHy|)j5PCs{Qh0WK&{MAXx{-c zF+oTj4)leSe9T9dIPjkR^99kq=ovSjF8rFs1O5;zSW^GwZ++W@IQYN$b;4L3vWl>> zJpevw#hlR=?6~Iq6e}a~;Ksg#l|?BGrHeF>XbqE<414X<^RIw2gp452djjx=IlyU# zYtb!#TsvxSvM+p=8d%+y)i4v(AqKt=te9INCcHraI>{b9M?^*_I89#{ny+Hb-uuDp z^-oks8iWlOscugp>}T=)X0N;iR_Czu$v&nxi+sUjx$HyneK#^M+=SrdyR7A#L6a~aTZ_fqpK z3UT$*WD(NFbD657Gq@Hb5m(uh!!nY;d?Opj~2EBs5VZFwtejUolZJaE_?nFIm#8fD_WpN9u8-k%g@bf__yzG0!-aio2Q%*1+`i_gbBsXh%~?cme&U2<-<^zO@73q!7dBzMQkDtxWrit zLldLrEK9^Y`zZJ4c*VPfso0T5+BKk~P?-$q2AH%XppwSo9J{yBrI^k#WmIXSaFzBVA}V0!o@$w&qz*(f3eS`KId6)VxFDurH0#?*Oo#vAi7nd7gI)W z_6f=oub*U9--0Hg2`J>bYgx8I`YpLK2phX}LuA@F2_C2rZa~CKl*Rhd+EFBGc`HdW z%xJc1q7``V+F5NUd~y0%=~^V%*Um?(TCU{uu>OKu=Q*yg$OreJ`aKD+n2Iboxn(ut z4c$6R`FONeCWWoaeN`={31NWwmSL8)v%t7u4t+PYvu7b>z?HB9uJrz40HC1>i!s)5 zo_&a*k`law(f%aLIU>lE6Mk5SsrwU`dZ|wFe7l{NGmQ25lxPv)d@JbAzIHBWjRZb8cAE;7%`1Uc`r&U~D52r}5oQ!hJ zJXR?NG=xkZL1XLn{pIofI-ke|9h+T=L-1e zqaffPFaM|U`cL?ut)+^qxE=mi`}&_0e|p3I zqL}|X#XlVr?k0cY|1@>|h1dS_w<-L~apC?a{!fd>U-*en5BfPY-Vzu8~vx^`X|Sq0^VO7Or-zj_>Y|TC-~3Z++Scr=6{3#*xUU{@n<3Q7lj4K|4{tb zvgl8)KNH+vTqw6;1K`#Hsa^=;8Xgx J;QPm?{|D8=hoJxf literal 0 HcmV?d00001 diff --git a/hisel/categorical.py b/hisel/categorical.py index dc93f08..06eee83 100644 --- a/hisel/categorical.py +++ b/hisel/categorical.py @@ -1,9 +1,14 @@ from typing import Optional, Set, Tuple, Callable, Union, List +import itertools +import threading +import sys +import time import numpy as np import pandas as pd from dataclasses import dataclass from sklearn.metrics import adjusted_mutual_info_score from joblib import Parallel, delayed +from tqdm import tqdm from hisel import permutohedron @@ -67,10 +72,12 @@ def select( parallel: bool = False, random_state: Optional[int] = None, ) -> Selection: + print(f'Number of categorical features: {xdf.shape[1]}') xdf = _preprocess_datatypes(xdf) x = xdf.values ydf = _preprocess_datatypes(ydf) allfeatures: List[np.ndarray] = [] + if isinstance(ydf, pd.Series): if ydf.dtypes == float: y = _discretise(ydf.values) @@ -105,6 +112,7 @@ def select( fs = np.concatenate(allfeatures) indexes = np.array(list(set(fs)), dtype=int) features = list(xdf.columns[indexes]) + print(f'Number of selected categorical features: {len(features)}') return Selection(indexes=indexes, features=features) @@ -124,7 +132,7 @@ def search( assert x.dtype == int assert y.dtype == int if num_permutations is None: - num_permutations = 3 * d + num_permutations = d x = x - np.amin(x, axis=0, keepdims=True) y = y - np.amin(y, axis=0, keepdims=True) active_set = set(range(d)) @@ -132,7 +140,7 @@ def search( features = np.array([], dtype=int) imall = .0 n_iter = 0 - while len(active_set) > 1 and n_iter < max_iter: + while len(active_set) > 0 and n_iter < max_iter: active = np.array(list(active_set)) num_active = len(active) num_haar_samples = min( @@ -148,11 +156,11 @@ def search( tries = Parallel(n_jobs=-1)([ delayed(_try_permutation)( ami, x, y, active, list(permutation)) - for permutation in permutations + for permutation in tqdm(permutations) ]) else: tries = [_try_permutation( - ami, x, y, active, list(permutation)) for permutation in permutations] + ami, x, y, active, list(permutation)) for permutation in tqdm(permutations)] im = .0 for im_, sel_ in tries: @@ -169,6 +177,16 @@ def search( features = np.concatenate((features, sel)) active_set = active_set.difference(set(features)) n_iter += 1 + threshold = im_ratio * imall + fwsel = _featurewise_selection( + ami, + x, + y, + threshold + ) + features = np.array(list( + set(features).union(set(fwsel)) + )) return features @@ -209,3 +227,17 @@ def _try_permutation( im = ims[s] selection = sel[:s+1] return im, selection + + +def _featurewise_selection( + metric: Callable[[np.ndarray, np.ndarray], np.ndarray], + x: np.ndarray, + y: np.ndarray, + threshold: float, +) -> List[int]: + sel = [] + for i in range(x.shape[1]): + v = metric(x[:, [i]], y) + if v > threshold: + sel.append(i) + return sel diff --git a/hisel/feature_selection.py b/hisel/feature_selection.py index c37f3d5..75528be 100644 --- a/hisel/feature_selection.py +++ b/hisel/feature_selection.py @@ -78,9 +78,12 @@ def select_features( hsiclasso_parameters = HSICLassoParameters() if categorical_search_parameters is None: categorical_search_parameters = SearchParameters() + + print("\n***Selection of continuous features***") continuous_lasso_selection: LassoSelection = select.select( xdf[continuous_features], ydf, **hsiclasso_parameters) + print("\n***Selection of categorical features***") categorical_search_selection: categorical.Selection = categorical.select( xdf[discrete_features], ydf, **categorical_search_parameters) diff --git a/hisel/kernels.py b/hisel/kernels.py index ba73541..0b63cfd 100644 --- a/hisel/kernels.py +++ b/hisel/kernels.py @@ -2,6 +2,7 @@ from joblib import Parallel, delayed from enum import Enum import numpy as np +from tqdm import tqdm class KernelType(Enum): @@ -233,10 +234,10 @@ def apply_feature_map( l, h, is_multivariate - ) for batch in batches] + ) for batch in tqdm(batches)] else: partial_phis = Parallel(n_jobs=-1)([ - delayed(_run_batch)(kernel_type, batch, l) for batch in batches + delayed(_run_batch)(kernel_type, batch, l) for batch in tqdm(batches) ]) phi: np.ndarray = np.vstack(partial_phis) return phi diff --git a/hisel/select.py b/hisel/select.py index f5e315b..1513f17 100644 --- a/hisel/select.py +++ b/hisel/select.py @@ -1,5 +1,5 @@ # API -from typing import List, Optional, Union +from typing import List, Optional, Union, Tuple from enum import Enum from dataclasses import dataclass import numpy as np @@ -42,22 +42,33 @@ def ksgmi( x: pd.DataFrame, y: Union[pd.DataFrame, pd.Series], threshold: float = .01, -): +) -> Tuple[List[str], pd.Series]: x = _preprocess_datatypes(x) y = _preprocess_datatypes(y) discrete_features = x.dtypes == int mix = x.values if isinstance(y, pd.Series) or (isinstance(y, pd.DataFrame) and y.shape[1] == 1): - miy = np.squeeze(y.values) + miys = np.squeeze(y.values).reshape(-1, 1) else: - miy = np.linalg.norm(y, axis=1) - compute_mi = mutual_info_classif if miy.dtype == int else mutual_info_regression - mis = compute_mi(mix, miy, discrete_features=discrete_features) - mis /= np.max(mis) - isrelevant = mis > threshold - relevant_features = np.arange(x.shape[1])[isrelevant] - print(f'ksg-mi preprocessing: {sum(isrelevant)} features are pre-selected') - return relevant_features, mis + miys = y.values + sel = set() + totmis = np.zeros(x.shape[1], dtype=float) + for j in range(miys.shape[1]): + miy = miys[:, j] + compute_mi = mutual_info_classif if miy.dtype == int else mutual_info_regression + mis = pd.Series( + compute_mi(mix, miy, discrete_features=discrete_features), + index=x.columns) + mis /= np.max(mis) + sel = sel.union(set( + set(mis.loc[mis > threshold].index) + )) + totmis += mis + mutual_infos = pd.Series(totmis, index=x.columns) + relevant_features = list(sel) + print( + f'ksg-mi preprocessing: {len(relevant_features)} features are pre-selected') + return relevant_features, mutual_infos class HSICSelector: @@ -66,6 +77,7 @@ def __init__(self, y: np.ndarray, xfeattype: Optional[FeatureType] = None, yfeattype: Optional[FeatureType] = None, + feature_names: Optional[List[str]] = None, ): assert x.ndim == 2 assert y.ndim == 2 @@ -94,6 +106,11 @@ def __init__(self, self.yfeattype = yfeattype self.xkerneltype = KernelType.DELTA if xfeattype == FeatureType.DISCR else KernelType.RBF self.ykerneltype = KernelType.DELTA if yfeattype == FeatureType.DISCR else KernelType.RBF + if feature_names is None: + self.feature_names = [f'f{f}' for f in range(x.shape[1])] + pass + else: + self.feature_names = feature_names def lasso_path(self): if not hasattr(self, 'lassopaths'): @@ -109,7 +126,7 @@ def lasso_path(self): paths.append(_p) path = np.mean(np.vstack(paths), axis=0) df: pd.DataFrame = pd.DataFrame( - path, columns=[f'f{f}' for f in range(path.shape[1])]) + path, columns=self.feature_names) return df def projection_matrix(self, @@ -208,31 +225,50 @@ def autoselect(self, number_of_epochs: int = 1, threshold: float = .01, device: Optional[str] = None, - ): - curve = self.regularization_curve( - batch_size=batch_size, - minibatch_size=minibatch_size, - number_of_epochs=number_of_epochs, - device=device, + lasso_path: Optional[pd.DataFrame] = None, + ) -> List[str]: + if lasso_path is None: + curve = self.regularization_curve( + batch_size=batch_size, + minibatch_size=minibatch_size, + number_of_epochs=number_of_epochs, + device=device, + ) + lasso_path = self.lasso_path() + return HSICSelector.select_from_lasso_path(lasso_path, threshold) + + @staticmethod + def select_from_lasso_path( + lasso_path: pd.DataFrame, + threshold: float = .01, + ) -> List[str]: + features = list(lasso_path.columns) + curve = np.cumsum(np.sort(lasso_path.iloc[-1, :])[::-1]) + ordered_features = sorted( + features, + key=lambda a: lasso_path[a].values[-1], + reverse=True ) betas = np.diff(curve, prepend=.0) betas /= betas[0] number_of_features = sum(betas > threshold) - return self.ordered_features[:number_of_features] + return ordered_features[:number_of_features] @dataclass class Selection: - preselection: np.ndarray - mis: np.ndarray - _innersel: np.ndarray - hsic_selection: np.ndarray - mi_ordered_features: np.ndarray - hsic_ordered_features: np.ndarray + preselection: List[str] + mis: pd.Series + hsic_selection: List[str] + mi_ordered_features: List[str] + hsic_ordered_features: List[str] lassopaths: pd.DataFrame regcurve: np.ndarray features: List[str] = None + def select_from_lasso_path(self, threshold: float = 0.01): + return HSICSelector.select_from_lasso_path(self.lassopaths, threshold) + def select( x: pd.DataFrame, @@ -249,13 +285,13 @@ def select( if use_preselection: cols, mis = ksgmi(x, y, mi_threshold) else: - cols = np.arange(d) - mis = np.zeros(d) - x_ = x.iloc[:, cols].values + cols = x.columns.tolist() + mis = pd.Series(np.zeros(d), index=cols) + x_ = x.loc[:, cols].values y_ = y.values if y_.ndim == 1: y_ = y_.reshape(-1, 1) - selector = HSICSelector(x_, y_) + selector = HSICSelector(x_, y_, feature_names=cols) innersel_ = selector.autoselect( threshold=hsic_threshold, batch_size=batch_size, @@ -263,25 +299,19 @@ def select( number_of_epochs=number_of_epochs, device=device ) - _innersel = np.array(innersel_) print(f'HSIC has selected {len(innersel_)} features') - hsic_ordered_features = cols[selector.ordered_features] - mi_ordered_features = np.argsort(mis)[::-1] - hsic_selection = cols[_innersel] - paths = selector.lasso_path() - renamecols = { - fd: f"f{cols[int(fd.split('f')[1])]}" - for fd in paths.columns - } - paths.rename( - columns=renamecols, - inplace=True + hsic_ordered_features = list( + np.array(cols)[selector.ordered_features] ) - curve = np.cumsum(np.sort(paths.iloc[-1, :])[::-1]) - features = list(x.columns[hsic_selection]) + preselection: List[str] = cols + mi_ordered_features: List[str] = list( + mis.sort_values(ascending=False).index) + hsic_selection: List[str] = innersel_ + paths: pd.DataFrame = selector.lasso_path() + curve: np.array = np.cumsum(np.sort(paths.iloc[-1, :])[::-1]) + features: List[str] = hsic_selection sel = Selection( - preselection=cols, - _innersel=_innersel, + preselection=preselection, mis=mis, hsic_selection=hsic_selection, mi_ordered_features=mi_ordered_features, diff --git a/notebooks/selection_workflow.ipynb b/notebooks/cont_cat_split_demo.ipynb similarity index 60% rename from notebooks/selection_workflow.ipynb rename to notebooks/cont_cat_split_demo.ipynb index f6c1bdd..48f3d93 100644 --- a/notebooks/selection_workflow.ipynb +++ b/notebooks/cont_cat_split_demo.ipynb @@ -12,7 +12,7 @@ "from scipy.stats import special_ortho_group\n", "import matplotlib.pyplot as plt\n", "\n", - "from hisel import select" + "import hisel" ] }, { @@ -104,7 +104,10 @@ "metadata": {}, "outputs": [], "source": [ - "relevant_features = np.sort(np.concatenate((relevant_cats, relevant_conts)))" + "all_relevant = np.sort(np.concatenate((relevant_cats, relevant_conts)))\n", + "relevant_cat_features = sorted(xdf.iloc[:, relevant_cats].columns.tolist())\n", + "relevant_cont_features = sorted(xdf.iloc[:, relevant_conts].columns.tolist())\n", + "relevant_features = sorted(xdf.iloc[:, all_relevant].columns.tolist())" ] }, { @@ -144,39 +147,31 @@ }, { "cell_type": "markdown", - "id": "c25651c8", + "id": "8d15d80c", "metadata": {}, "source": [ - "## Selection of categorical features" + "## KSG selection" ] }, { "cell_type": "code", "execution_count": null, - "id": "3a32f32d", + "id": "d38785f7", "metadata": {}, "outputs": [], "source": [ - "cat_selection = select.select(\n", - " catdf, \n", - " ydf,\n", - " hsic_threshold=.01,\n", - " batch_size=n,\n", - " minibatch_size=200,\n", - " number_of_epochs=3,\n", - " use_preselection=False,\n", - ")" + "ksgfeatures, ksgmis = hisel.select.ksgmi(xdf, ydf, threshold=.0001)" ] }, { "cell_type": "code", "execution_count": null, - "id": "729d6149", + "id": "3b592457", "metadata": {}, "outputs": [], "source": [ - "expected = sorted(list(relevant_cats))\n", - "selected = sorted(list(cat_selection.hsic_selection))\n", + "expected = sorted(list(relevant_features))\n", + "selected = sorted(list(ksgfeatures))\n", "leftout = sorted(list(set(expected).difference(set(selected))))\n", "print(f'Expected features:\\n{expected}')\n", "print(f'Selected features:\\n{selected}')\n", @@ -185,192 +180,156 @@ }, { "cell_type": "markdown", - "id": "3ebc367b", + "id": "bb293ae3", "metadata": {}, "source": [ - "## Selection of continuous features " + "## Selection" ] }, { "cell_type": "code", "execution_count": null, - "id": "a5f6a028", + "id": "785baa63", "metadata": {}, "outputs": [], "source": [ - "cont_selection = select.select(\n", - " contdf, \n", - " ydf,\n", - " hsic_threshold=.01,\n", - " batch_size=n,\n", - " minibatch_size=200,\n", - " number_of_epochs=3,\n", - " use_preselection=False,\n", + "categorical_search_params = hisel.feature_selection.SearchParameters(\n", + " num_permutations=40,\n", + " im_ratio=.01,\n", + " max_iter=2,\n", + " parallel=True,\n", + " random_state=None,\n", ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "2b28c86b", + "id": "c044adaa", "metadata": {}, "outputs": [], "source": [ - "expected = sorted(list(relevant_conts))\n", - "selected = sorted(list(n_cat + cont_selection.hsic_selection))\n", - "leftout = sorted(list(set(expected).difference(set(selected))))\n", - "of = n_cat + cont_selection.hsic_ordered_features\n", - "impgrade_of_relconts = {n: int(np.squeeze(np.where(of==n))) for n in expected}\n", - "print(f'Expected features:\\n{expected}')\n", - "print(f'Importance grade of relevant features:\\n{impgrade_of_relconts}')\n", - "print(f'Selected features:\\n{selected}')\n", - "print(f'Left-out features:\\n{leftout}')" + "hsiclasso_params = hisel.feature_selection.HSICLassoParameters(\n", + " hsic_threshold=.01,\n", + " batch_size=5000,\n", + " minibatch_size=200,\n", + " number_of_epochs=4,\n", + " use_preselection=False,\n", + " device=None\n", + ")" ] }, { "cell_type": "code", "execution_count": null, - "id": "66ca3f62", + "id": "3ac19521", "metadata": {}, "outputs": [], "source": [ - "curve = cont_selection.regcurve\n", - "plt.plot(np.arange(1, 1+len(curve)), curve)" - ] - }, - { - "cell_type": "markdown", - "id": "202c6f62", - "metadata": {}, - "source": [ - "# HSIC selection " + "selection = hisel.feature_selection.select_features(\n", + " xdf, \n", + " ydf,\n", + " hsiclasso_params,\n", + " categorical_search_params\n", + ")\n", + "hsic_selection = selection['continuous_lasso_selection']\n", + "cat_selection = selection['categorical_search_selection']" ] }, { "cell_type": "code", "execution_count": null, - "id": "ff0ab417", + "id": "b944b1c4", "metadata": {}, "outputs": [], "source": [ - "selection = select.select(\n", - " xdf, \n", - " ydf,\n", - " hsic_threshold=.0075,\n", - " batch_size=n,\n", - " minibatch_size=200,\n", - " number_of_epochs=3,\n", - " use_preselection=False,\n", - ") # This is the longest to run" + "selected_cat_features = sorted(cat_selection.features)\n", + "selected_cont_features = sorted(hsic_selection.features)\n", + "selected_features = sorted(selection['selected_features'])" ] }, { "cell_type": "code", "execution_count": null, - "id": "3b115188", + "id": "61c5b7b2", "metadata": {}, "outputs": [], "source": [ - "expected = sorted(list(relevant_features))\n", - "selected = sorted(list(selection.hsic_selection))\n", - "leftout = sorted(list(set(expected).difference(set(selected))))\n", - "orderedfeats = list(selection.hsic_ordered_features)\n", - "of = selection.hsic_ordered_features\n", - "impgrade_of_relconts = {n:int(np.squeeze(np.where(of==n))) for n in expected}\n", - "print(f'Expected features:\\n{expected}')\n", - "print(f'Importance grade of relevant features:\\n{impgrade_of_relconts}')\n", - "print(f'Selected features:\\n{selected}')\n", - "print(f'Left-out features:\\n{leftout}')\n", - "print(f'Features in decreasing order of importance:\\n{orderedfeats}')" - ] - }, - { - "cell_type": "markdown", - "id": "8d15d80c", - "metadata": {}, - "source": [ - "## KSG selection" + "leftout_cat = sorted(list(\n", + " set(relevant_cat_features).difference(set(selected_cat_features))\n", + "))\n", + "print(f'Relevant cat features:\\n{relevant_cat_features}')\n", + "print(f'Selected cat features:\\n{selected_cat_features}')\n", + "print(f'Left-out cat features:\\n{leftout_cat}')" ] }, { "cell_type": "code", "execution_count": null, - "id": "d38785f7", + "id": "c8089fa2", "metadata": {}, "outputs": [], "source": [ - "ksgfeatures, ksgmis = select.ksgmi(xdf, ydf, threshold=.01)" + "leftout_cont = sorted(list(\n", + " set(relevant_cont_features).difference(set(selected_cont_features))\n", + "))\n", + "print(f'Relevant cont features:\\n{relevant_cont_features}')\n", + "print(f'Selected cont features:\\n{selected_cont_features}')\n", + "print(f'Left-out cont features:\\n{leftout_cont}')" ] }, { "cell_type": "code", "execution_count": null, - "id": "3b592457", + "id": "663973d1", "metadata": {}, "outputs": [], "source": [ - "expected = sorted(list(relevant_features))\n", - "selected = sorted(list(ksgfeatures))\n", - "leftout = sorted(list(set(expected).difference(set(selected))))\n", - "print(f'Expected features:\\n{expected}')\n", - "print(f'Selected features:\\n{selected}')\n", - "print(f'Left-out features:\\n{leftout}')" + "print(f'All relevant features:\\n{relevant_features}')\n", + "print(f'Selected features:\\n{selected_features}')" ] }, { "cell_type": "markdown", - "id": "d0d3ba9a", + "id": "93988107", "metadata": {}, "source": [ - "# HSIC selection with pre-selection " + "You can explore how the selection threshold affects the choice of the continuous features" ] }, { "cell_type": "code", "execution_count": null, - "id": "7dbf25ed", + "id": "7e1df0ea", "metadata": {}, "outputs": [], "source": [ - "selection = select.select(\n", - " xdf, \n", - " ydf,\n", - " mi_threshold=.0,\n", - " hsic_threshold=.0095,\n", - " batch_size=n,\n", - " minibatch_size=200,\n", - " number_of_epochs=3,\n", - " use_preselection=True,\n", - ")" + "hsic_selection.select_from_lasso_path(threshold=.025)" + ] + }, + { + "cell_type": "markdown", + "id": "73d4ad7d", + "metadata": {}, + "source": [ + "You can visualise the regularisation curve used to select the continuous features" ] }, { "cell_type": "code", "execution_count": null, - "id": "f2d662b8", + "id": "3ef1558f", "metadata": {}, "outputs": [], "source": [ - "expected = sorted(list(relevant_features))\n", - "preselected = sorted(list(selection.preselection))\n", - "selected = sorted(list(selection.hsic_selection))\n", - "preleftout = sorted(list(set(expected).difference(set(preselected))))\n", - "leftout = sorted(list(set(expected).difference(set(selected))))\n", - "orderedfeats = list(selection.hsic_ordered_features)\n", - "mi_orderedfeats = list(selection.mi_ordered_features)\n", - "print(f'Expected features:\\n{expected}')\n", - "print(f'Pre-selected features:\\n{preselected}')\n", - "print(f'Pre-leftout features:\\n{preleftout}')\n", - "print(f'Selected features:\\n{selected}')\n", - "print(f'Left-out features:\\n{leftout}')\n", - "print(f'Features in decreasing order of importance:\\n{orderedfeats}')\n", - "print(f'Features in decreasing order of MI:\\n{mi_orderedfeats}')" + "curve = hsic_selection.regcurve\n", + "plt.plot(np.arange(1, 1+len(curve)), curve)" ] }, { "cell_type": "code", "execution_count": null, - "id": "295ffa89", + "id": "33c58011", "metadata": {}, "outputs": [], "source": [] diff --git a/poetry.lock b/poetry.lock index e2176d4..74b2b02 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,88 +1,100 @@ # This file is automatically @generated by Poetry 1.4.1 and should not be changed by hand. +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + [[package]] name = "joblib" -version = "1.2.0" +version = "1.3.0" description = "Lightweight pipelining with Python functions" category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "joblib-1.2.0-py3-none-any.whl", hash = "sha256:091138ed78f800342968c523bdde947e7a305b8594b910a0fea2ab83c3c6d385"}, - {file = "joblib-1.2.0.tar.gz", hash = "sha256:e1cee4a79e4af22881164f218d4311f60074197fb707e082e803b61f6d137018"}, + {file = "joblib-1.3.0-py3-none-any.whl", hash = "sha256:172d56d4c43dd6bcd953bea213018c4084cf754963bbf54b8dae40faea716b98"}, + {file = "joblib-1.3.0.tar.gz", hash = "sha256:0b12a65dc76c530dbd790dd92881f75c40932b4254a7c8e608a868df408ca0a3"}, ] [[package]] name = "numpy" -version = "1.24.3" +version = "1.24.4" description = "Fundamental package for array computing in Python" category = "main" optional = false python-versions = ">=3.8" files = [ - {file = "numpy-1.24.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3c1104d3c036fb81ab923f507536daedc718d0ad5a8707c6061cdfd6d184e570"}, - {file = "numpy-1.24.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:202de8f38fc4a45a3eea4b63e2f376e5f2dc64ef0fa692838e31a808520efaf7"}, - {file = "numpy-1.24.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8535303847b89aa6b0f00aa1dc62867b5a32923e4d1681a35b5eef2d9591a463"}, - {file = "numpy-1.24.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d926b52ba1367f9acb76b0df6ed21f0b16a1ad87c6720a1121674e5cf63e2b6"}, - {file = "numpy-1.24.3-cp310-cp310-win32.whl", hash = "sha256:f21c442fdd2805e91799fbe044a7b999b8571bb0ab0f7850d0cb9641a687092b"}, - {file = "numpy-1.24.3-cp310-cp310-win_amd64.whl", hash = "sha256:ab5f23af8c16022663a652d3b25dcdc272ac3f83c3af4c02eb8b824e6b3ab9d7"}, - {file = "numpy-1.24.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9a7721ec204d3a237225db3e194c25268faf92e19338a35f3a224469cb6039a3"}, - {file = "numpy-1.24.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d6cc757de514c00b24ae8cf5c876af2a7c3df189028d68c0cb4eaa9cd5afc2bf"}, - {file = "numpy-1.24.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76e3f4e85fc5d4fd311f6e9b794d0c00e7002ec122be271f2019d63376f1d385"}, - {file = "numpy-1.24.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a1d3c026f57ceaad42f8231305d4653d5f05dc6332a730ae5c0bea3513de0950"}, - {file = "numpy-1.24.3-cp311-cp311-win32.whl", hash = "sha256:c91c4afd8abc3908e00a44b2672718905b8611503f7ff87390cc0ac3423fb096"}, - {file = "numpy-1.24.3-cp311-cp311-win_amd64.whl", hash = "sha256:5342cf6aad47943286afa6f1609cad9b4266a05e7f2ec408e2cf7aea7ff69d80"}, - {file = "numpy-1.24.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7776ea65423ca6a15255ba1872d82d207bd1e09f6d0894ee4a64678dd2204078"}, - {file = "numpy-1.24.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ae8d0be48d1b6ed82588934aaaa179875e7dc4f3d84da18d7eae6eb3f06c242c"}, - {file = "numpy-1.24.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecde0f8adef7dfdec993fd54b0f78183051b6580f606111a6d789cd14c61ea0c"}, - {file = "numpy-1.24.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4749e053a29364d3452c034827102ee100986903263e89884922ef01a0a6fd2f"}, - {file = "numpy-1.24.3-cp38-cp38-win32.whl", hash = "sha256:d933fabd8f6a319e8530d0de4fcc2e6a61917e0b0c271fded460032db42a0fe4"}, - {file = "numpy-1.24.3-cp38-cp38-win_amd64.whl", hash = "sha256:56e48aec79ae238f6e4395886b5eaed058abb7231fb3361ddd7bfdf4eed54289"}, - {file = "numpy-1.24.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4719d5aefb5189f50887773699eaf94e7d1e02bf36c1a9d353d9f46703758ca4"}, - {file = "numpy-1.24.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0ec87a7084caa559c36e0a2309e4ecb1baa03b687201d0a847c8b0ed476a7187"}, - {file = "numpy-1.24.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea8282b9bcfe2b5e7d491d0bf7f3e2da29700cec05b49e64d6246923329f2b02"}, - {file = "numpy-1.24.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:210461d87fb02a84ef243cac5e814aad2b7f4be953b32cb53327bb49fd77fbb4"}, - {file = "numpy-1.24.3-cp39-cp39-win32.whl", hash = "sha256:784c6da1a07818491b0ffd63c6bbe5a33deaa0e25a20e1b3ea20cf0e43f8046c"}, - {file = "numpy-1.24.3-cp39-cp39-win_amd64.whl", hash = "sha256:d5036197ecae68d7f491fcdb4df90082b0d4960ca6599ba2659957aafced7c17"}, - {file = "numpy-1.24.3-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:352ee00c7f8387b44d19f4cada524586f07379c0d49270f87233983bc5087ca0"}, - {file = "numpy-1.24.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a7d6acc2e7524c9955e5c903160aa4ea083736fde7e91276b0e5d98e6332812"}, - {file = "numpy-1.24.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:35400e6a8d102fd07c71ed7dcadd9eb62ee9a6e84ec159bd48c28235bbb0f8e4"}, - {file = "numpy-1.24.3.tar.gz", hash = "sha256:ab344f1bf21f140adab8e47fdbc7c35a477dc01408791f8ba00d018dd0bc5155"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, ] [[package]] name = "pandas" -version = "2.0.1" +version = "2.0.2" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false python-versions = ">=3.8" files = [ - {file = "pandas-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:70a996a1d2432dadedbb638fe7d921c88b0cc4dd90374eab51bb33dc6c0c2a12"}, - {file = "pandas-2.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:909a72b52175590debbf1d0c9e3e6bce2f1833c80c76d80bd1aa09188be768e5"}, - {file = "pandas-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe7914d8ddb2d54b900cec264c090b88d141a1eed605c9539a187dbc2547f022"}, - {file = "pandas-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a514ae436b23a92366fbad8365807fc0eed15ca219690b3445dcfa33597a5cc"}, - {file = "pandas-2.0.1-cp310-cp310-win32.whl", hash = "sha256:12bd6618e3cc737c5200ecabbbb5eaba8ab645a4b0db508ceeb4004bb10b060e"}, - {file = "pandas-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:2b6fe5f7ce1cba0e74188c8473c9091ead9b293ef0a6794939f8cc7947057abd"}, - {file = "pandas-2.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:00959a04a1d7bbc63d75a768540fb20ecc9e65fd80744c930e23768345a362a7"}, - {file = "pandas-2.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:af2449e9e984dfad39276b885271ba31c5e0204ffd9f21f287a245980b0e4091"}, - {file = "pandas-2.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910df06feaf9935d05247db6de452f6d59820e432c18a2919a92ffcd98f8f79b"}, - {file = "pandas-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa0067f2419f933101bdc6001bcea1d50812afbd367b30943417d67fbb99678"}, - {file = "pandas-2.0.1-cp311-cp311-win32.whl", hash = "sha256:7b8395d335b08bc8b050590da264f94a439b4770ff16bb51798527f1dd840388"}, - {file = "pandas-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:8db5a644d184a38e6ed40feeb12d410d7fcc36648443defe4707022da127fc35"}, - {file = "pandas-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7bbf173d364130334e0159a9a034f573e8b44a05320995127cf676b85fd8ce86"}, - {file = "pandas-2.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6c0853d487b6c868bf107a4b270a823746175b1932093b537b9b76c639fc6f7e"}, - {file = "pandas-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25e23a03f7ad7211ffa30cb181c3e5f6d96a8e4cb22898af462a7333f8a74eb"}, - {file = "pandas-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e09a53a4fe8d6ae2149959a2d02e1ef2f4d2ceb285ac48f74b79798507e468b4"}, - {file = "pandas-2.0.1-cp38-cp38-win32.whl", hash = "sha256:a2564629b3a47b6aa303e024e3d84e850d36746f7e804347f64229f8c87416ea"}, - {file = "pandas-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:03e677c6bc9cfb7f93a8b617d44f6091613a5671ef2944818469be7b42114a00"}, - {file = "pandas-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3d099ecaa5b9e977b55cd43cf842ec13b14afa1cfa51b7e1179d90b38c53ce6a"}, - {file = "pandas-2.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a37ee35a3eb6ce523b2c064af6286c45ea1c7ff882d46e10d0945dbda7572753"}, - {file = "pandas-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:320b180d125c3842c5da5889183b9a43da4ebba375ab2ef938f57bf267a3c684"}, - {file = "pandas-2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18d22cb9043b6c6804529810f492ab09d638ddf625c5dea8529239607295cb59"}, - {file = "pandas-2.0.1-cp39-cp39-win32.whl", hash = "sha256:90d1d365d77d287063c5e339f49b27bd99ef06d10a8843cf00b1a49326d492c1"}, - {file = "pandas-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:99f7192d8b0e6daf8e0d0fd93baa40056684e4b4aaaef9ea78dff34168e1f2f0"}, - {file = "pandas-2.0.1.tar.gz", hash = "sha256:19b8e5270da32b41ebf12f0e7165efa7024492e9513fb46fb631c5022ae5709d"}, + {file = "pandas-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ebb9f1c22ddb828e7fd017ea265a59d80461d5a79154b49a4207bd17514d122"}, + {file = "pandas-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1eb09a242184092f424b2edd06eb2b99d06dc07eeddff9929e8667d4ed44e181"}, + {file = "pandas-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7319b6e68de14e6209460f72a8d1ef13c09fb3d3ef6c37c1e65b35d50b5c145"}, + {file = "pandas-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd46bde7309088481b1cf9c58e3f0e204b9ff9e3244f441accd220dd3365ce7c"}, + {file = "pandas-2.0.2-cp310-cp310-win32.whl", hash = "sha256:51a93d422fbb1bd04b67639ba4b5368dffc26923f3ea32a275d2cc450f1d1c86"}, + {file = "pandas-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:66d00300f188fa5de73f92d5725ced162488f6dc6ad4cecfe4144ca29debe3b8"}, + {file = "pandas-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:02755de164da6827764ceb3bbc5f64b35cb12394b1024fdf88704d0fa06e0e2f"}, + {file = "pandas-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0a1e0576611641acde15c2322228d138258f236d14b749ad9af498ab69089e2d"}, + {file = "pandas-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6b5f14cd24a2ed06e14255ff40fe2ea0cfaef79a8dd68069b7ace74bd6acbba"}, + {file = "pandas-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50e451932b3011b61d2961b4185382c92cc8c6ee4658dcd4f320687bb2d000ee"}, + {file = "pandas-2.0.2-cp311-cp311-win32.whl", hash = "sha256:7b21cb72958fc49ad757685db1919021d99650d7aaba676576c9e88d3889d456"}, + {file = "pandas-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:c4af689352c4fe3d75b2834933ee9d0ccdbf5d7a8a7264f0ce9524e877820c08"}, + {file = "pandas-2.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:69167693cb8f9b3fc060956a5d0a0a8dbfed5f980d9fd2c306fb5b9c855c814c"}, + {file = "pandas-2.0.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:30a89d0fec4263ccbf96f68592fd668939481854d2ff9da709d32a047689393b"}, + {file = "pandas-2.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a18e5c72b989ff0f7197707ceddc99828320d0ca22ab50dd1b9e37db45b010c0"}, + {file = "pandas-2.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7376e13d28eb16752c398ca1d36ccfe52bf7e887067af9a0474de6331dd948d2"}, + {file = "pandas-2.0.2-cp38-cp38-win32.whl", hash = "sha256:6d6d10c2142d11d40d6e6c0a190b1f89f525bcf85564707e31b0a39e3b398e08"}, + {file = "pandas-2.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:e69140bc2d29a8556f55445c15f5794490852af3de0f609a24003ef174528b79"}, + {file = "pandas-2.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b42b120458636a981077cfcfa8568c031b3e8709701315e2bfa866324a83efa8"}, + {file = "pandas-2.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f908a77cbeef9bbd646bd4b81214cbef9ac3dda4181d5092a4aa9797d1bc7774"}, + {file = "pandas-2.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:713f2f70abcdade1ddd68fc91577cb090b3544b07ceba78a12f799355a13ee44"}, + {file = "pandas-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf3f0c361a4270185baa89ec7ab92ecaa355fe783791457077473f974f654df5"}, + {file = "pandas-2.0.2-cp39-cp39-win32.whl", hash = "sha256:598e9020d85a8cdbaa1815eb325a91cfff2bb2b23c1442549b8a3668e36f0f77"}, + {file = "pandas-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:77550c8909ebc23e56a89f91b40ad01b50c42cfbfab49b3393694a50549295ea"}, + {file = "pandas-2.0.2.tar.gz", hash = "sha256:dd5476b6c3fe410ee95926873f377b856dbc4e81a9c605a0dc05aaccc6a7c6c6"}, ] [package.dependencies] @@ -251,6 +263,27 @@ files = [ {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, ] +[[package]] +name = "tqdm" +version = "4.65.0" +description = "Fast, Extensible Progress Meter" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tqdm-4.65.0-py3-none-any.whl", hash = "sha256:c4f53a17fe37e132815abceec022631be8ffe1b9381c2e6e30aa70edc99e9671"}, + {file = "tqdm-4.65.0.tar.gz", hash = "sha256:1871fb68a86b8fb3b59ca4cdd3dcccbc7e6d613eeed31f4c332531977b89beb5"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "tzdata" version = "2023.3" @@ -266,4 +299,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "<3.12,>=3.8" -content-hash = "c877d7dded830f323893792beb440c40504a20ca288ec5dbdd9089982b78cb76" +content-hash = "3ce4b48917467c8bf9536b10eb2e7ee4967bb452f9e945e125b3f74ec9c094c3" diff --git a/pyproject.toml b/pyproject.toml index fe7fb2a..73406d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "hisel" -version = "0.3.0" +version = "0.4.0" description = "" authors = ["claudio "] readme = "README.md" @@ -11,6 +11,7 @@ numpy = ">=1.23" pandas = ">=1.5.3" scipy = ">=1.10" scikit-learn = ">=1.2.0" +tqdm = "*" [build-system] diff --git a/tests/categorical_test.py b/tests/categorical_test.py index 9fb1030..ddeb7d6 100644 --- a/tests/categorical_test.py +++ b/tests/categorical_test.py @@ -14,14 +14,11 @@ def test_discretise(self): np.all(keepdim[:, 0] == flat) ) - def test_search(self): - d = np.random.randint(low=10, high=12) + def generate_data(self): + d = np.random.randint(low=10, high=15) n = np.random.randint(low=4000, high=5000) h = np.random.randint(low=5, high=10) - n_rel = d // 3 - num_permutations = 1 * d - im_ratio = .05 - max_iter = 1 + n_rel = 3 random_state = np.random.randint(low=0, high=100) x = np.random.randint(low=0, high=h, size=(n, d)) @@ -32,6 +29,36 @@ def test_search(self): expected, = np.where(np.abs(a) > 0) t = a.reshape(1, 1, d) y = (t @ np.expand_dims(x, axis=2))[:, 0, 0] + return x, y, expected + + def test_search_and_select(self): + random_state = np.random.randint(low=1, high=50) + x, y, expected = self.generate_data() + selected = self._test_search( + x, y, expected, random_state) + selected, selection = self._test_select( + x, y, expected, selected, random_state) + + def _assert_expected(self, expected, selected): + n_rel = len(expected) + self.assertTrue( + len(selected) >= len(expected) - 1, + 'Too few selected features!' + ) + symdiff = set(selected).symmetric_difference(set(expected)) + threshold = 1 + int(n_rel < 3) + n_rel // 3 + self.assertTrue( + len(symdiff) < threshold, + 'Too large difference between selected and expected' + ) + + def _test_search(self, x, y, expected, random_state=None): + n, d = x.shape + n_rel = len(expected) + num_permutations = 15 + im_ratio = .01 + max_iter = 1 + selected = categorical.search( x, y, num_permutations=num_permutations, @@ -42,13 +69,16 @@ def test_search(self): ) print(f'expected:\n{sorted(expected)}') print(f'selected:\n{sorted(selected)}') - self.assertTrue( - len(selected) >= len(expected) - 1 - ) - self.assertTrue( - len(set(selected).symmetric_difference( - set(expected))) < 1 + n_rel // 3 - ) + self._assert_expected(expected, selected) + return selected + + def _test_select(self, x, y, expected, selected=None, random_state=None): + n, d = x.shape + n_rel = len(expected) + num_permutations = 20 + im_ratio = .01 + max_iter = 1 + xdf = pd.DataFrame(x, columns=[f'f{i}' for i in range(d)]) ydf = pd.Series(y) selection = categorical.select( @@ -60,7 +90,18 @@ def test_search(self): parallel=False, random_state=random_state, ) - print(f'selection:\n{sorted(selection.features)}') + if selected is None: + selected = categorical.search( + x, y, + num_permutations=num_permutations, + im_ratio=im_ratio, + max_iter=max_iter, + parallel=True, + random_state=random_state, + ) + print(f'expected:\n{sorted(expected)}') + print(f'selected:\n{sorted(selected)}') + self._assert_expected(expected, selected) recon = [int(f.replace('f', '')) for f in selection.features] self.assertEqual( set(selection.indexes), @@ -70,6 +111,7 @@ def test_search(self): set(recon), set(selected) ) + return selected, selection if __name__ == '__main__': diff --git a/hiseltest.yml b/tests/hiseltest.yml similarity index 66% rename from hiseltest.yml rename to tests/hiseltest.yml index b5f5692..27a3338 100644 --- a/hiseltest.yml +++ b/tests/hiseltest.yml @@ -3,16 +3,14 @@ channels: - conda-forge - nodefaults dependencies: - - python + - python=3.9 - ipython - ipykernel - - numpy - - pandas - - scipy - - scikit-learn + - numpy<1.22 - matplotlib + - pytest - pip - pip: - - cython - notebook + - pyHSICLasso diff --git a/tests/install_and_run.sh b/tests/install_and_run.sh new file mode 100644 index 0000000..a78e15d --- /dev/null +++ b/tests/install_and_run.sh @@ -0,0 +1,2 @@ +pip install --upgrade --force-reinstall ../dist/hisel-0.4.0-py3-none-any.whl +pytest diff --git a/tests/select_test.py b/tests/select_test.py index 791549f..543996f 100644 --- a/tests/select_test.py +++ b/tests/select_test.py @@ -191,14 +191,13 @@ def _test_selection( len(pyhsiclasso_selection), n_features, ) - self.assertEqual( - set(pyhsiclasso_selection), - set(features), - msg=( - f'\npyhsiclasso_selection: {sorted(pyhsiclasso_selection)}' - f'\nfeatures: {sorted(features)}\n\n' - ) + msg = ( + f'\npyhsiclasso_selection: {sorted(pyhsiclasso_selection)}' + f'\nfeatures: {sorted(features)}\n\n' ) + if not set(pyhsiclasso_selection) == set(features): + print( + f'WARNING: pyHSICLasso did not perform an exact selection:\n{msg}') selector = Selector( x, y, @@ -207,6 +206,7 @@ def _test_selection( ) selection = selector.select( n_features, batch_size=len(x) // 4, minibatch_size=400, number_of_epochs=3, device=device) + print(f'Expected features:\n{sorted(features)}') print( f'hisel selected features:\n{sorted(selection)}') self.assertEqual( @@ -218,7 +218,7 @@ def _test_selection( n_features, ) if SKLEARN_RECON: - miy = np.linalg.norm(y, axis=1) + miy = np.sum(y, axis=1) discrete_features = xfeattype == FeatureType.DISCR if yfeattype == FeatureType.CONT: mi = mutual_info_regression( @@ -264,8 +264,10 @@ def _test_selection( if QUICK_TEST: return # Test autoselection - We do not provide the number of features that should be selected - autoselection = selector.autoselect( + autoselected_features = selector.autoselect( batch_size=len(x) // 4, minibatch_size=400, number_of_epochs=3, threshold=3e-2, device=device) + autoselection = [int(feat.split('f')[-1]) + for feat in autoselected_features] print( f'hisel auto-selected features:\n{sorted(autoselection)}') if yfeattype == FeatureType.CONT: