From 232a6703051ff5e19fa265636e35ac7748a85d92 Mon Sep 17 00:00:00 2001
From: Lorenzo Volpi
Date: Fri, 27 Oct 2023 12:35:25 +0200
Subject: [PATCH] baselines code updated

---
 .vscode/launch.json                           |   9 +
 baselines/__pycache__/atc.cpython-311.pyc     | Bin 0 -> 2227 bytes
 baselines/__pycache__/doc.cpython-311.pyc     | Bin 0 -> 432 bytes
 .../__pycache__/impweight.cpython-311.pyc     | Bin 0 -> 3041 bytes
 baselines/__pycache__/pykliep.cpython-311.pyc | Bin 0 -> 11759 bytes
 baselines/__pycache__/rca.cpython-311.pyc     | Bin 0 -> 970 bytes
 garg22_ATC/ATC_helper.py => baselines/atc.py  |   0
 baselines/densratio/RuLSIF.py                 | 277 +++++++++++++++++
 baselines/densratio/__init__.py               |   7 +
 .../__pycache__/RuLSIF.cpython-311.pyc        | Bin 0 -> 11502 bytes
 .../__pycache__/__init__.cpython-311.pyc      | Bin 0 -> 520 bytes
 .../__pycache__/core.cpython-311.pyc          | Bin 0 -> 2800 bytes
 .../__pycache__/density_ratio.cpython-311.pyc | Bin 0 -> 3082 bytes
 .../__pycache__/helpers.cpython-311.pyc       | Bin 0 -> 2066 bytes
 baselines/densratio/core.py                   |  70 +++++
 baselines/densratio/density_ratio.py          |  88 ++++++
 baselines/densratio/helpers.py                |  36 +++
 {guillory21_doc => baselines}/doc.py          |   0
 baselines/impweight.py                        |  52 ++++
 baselines/models.py                           | 140 +++++++++
 baselines/pykliep.py                          | 219 ++++++++++++++
 {elsahar19_rca => baselines}/rca.py           |   0
 jiang18_trustscore/trustscore.py              | 141 ---------
 jiang18_trustscore/trustscore_evaluation.py   | 286 ------------------
 24 files changed, 898 insertions(+), 427 deletions(-)
 create mode 100644 baselines/__pycache__/atc.cpython-311.pyc
 create mode 100644 baselines/__pycache__/doc.cpython-311.pyc
 create mode 100644 baselines/__pycache__/impweight.cpython-311.pyc
 create mode 100644 baselines/__pycache__/pykliep.cpython-311.pyc
 create mode 100644 baselines/__pycache__/rca.cpython-311.pyc
 rename garg22_ATC/ATC_helper.py => baselines/atc.py (100%)
 create mode 100644 baselines/densratio/RuLSIF.py
 create mode 100644 baselines/densratio/__init__.py
 create mode 100644 baselines/densratio/__pycache__/RuLSIF.cpython-311.pyc
 create mode 100644 baselines/densratio/__pycache__/__init__.cpython-311.pyc
 create mode 100644 baselines/densratio/__pycache__/core.cpython-311.pyc
 create mode 100644 baselines/densratio/__pycache__/density_ratio.cpython-311.pyc
 create mode 100644 baselines/densratio/__pycache__/helpers.cpython-311.pyc
 create mode 100644 baselines/densratio/core.py
 create mode 100644 baselines/densratio/density_ratio.py
 create mode 100644 baselines/densratio/helpers.py
 rename {guillory21_doc => baselines}/doc.py (100%)
 create mode 100644 baselines/impweight.py
 create mode 100644 baselines/models.py
 create mode 100644 baselines/pykliep.py
 rename {elsahar19_rca => baselines}/rca.py (100%)
 delete mode 100644 jiang18_trustscore/trustscore.py
 delete mode 100644 jiang18_trustscore/trustscore_evaluation.py

diff --git a/.vscode/launch.json b/.vscode/launch.json
index bbca6bd..91cb6a4 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -4,6 +4,7 @@
     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
     "version": "0.2.0",
     "configurations": [
+
         {
             "name": "main",
             "type": "python",
@@ -11,6 +12,14 @@
             "program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\quacc\\main.py",
             "console": "integratedTerminal",
             "justMyCode": true
+        },
+        {
+            "name": "models",
+            "type": "python",
+            "request": "launch",
+            "program": "C:\\Users\\Lorenzo Volpi\\source\\tesi\\baselines\\models.py",
+            "console": "integratedTerminal",
+            "justMyCode": true
         }
     ]
 }
\ No newline at end of file
diff --git a/baselines/__pycache__/atc.cpython-311.pyc b/baselines/__pycache__/atc.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9337f4b7d2cb517b54f336def0ce6c2b567854c
Binary files /dev/null and b/baselines/__pycache__/atc.cpython-311.pyc differ
diff --git a/baselines/__pycache__/doc.cpython-311.pyc b/baselines/__pycache__/doc.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9117dc07b6baf7a15040d09db75e57272d1a2748
Binary files /dev/null and b/baselines/__pycache__/doc.cpython-311.pyc differ
diff --git a/baselines/__pycache__/impweight.cpython-311.pyc b/baselines/__pycache__/impweight.cpython-311.pyc
new file mode 100644
Binary files /dev/null and b/baselines/__pycache__/impweight.cpython-311.pyc differ
diff --git a/baselines/__pycache__/pykliep.cpython-311.pyc b/baselines/__pycache__/pykliep.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77fa4260bf0a4c266f675529968e48e61745f3c7
Binary files /dev/null and b/baselines/__pycache__/pykliep.cpython-311.pyc differ
diff --git a/baselines/__pycache__/rca.cpython-311.pyc b/baselines/__pycache__/rca.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb69719e87edf277da54bf3dfa65cf035fec0923
Binary files /dev/null and b/baselines/__pycache__/rca.cpython-311.pyc differ
diff --git a/baselines/densratio/RuLSIF.py b/baselines/densratio/RuLSIF.py
new file mode 100644
--- /dev/null
+++ b/baselines/densratio/RuLSIF.py
@@ -0,0 +1,277 @@
+"""
+densratio.RuLSIF
+~~~~~~~~~~~~~~~~
+
+Estimate the alpha-relative density ratio p(x)/(alpha*p(x) + (1 - alpha)*q(x))
+by RuLSIF (Relative unconstrained Least-Squares Importance Fitting).
+"""
+
+from warnings import warn
+
+from numpy import (
+    array, asarray, asmatrix, diag, diagflat, empty, exp, inf, log, matrix,
+    multiply, ones, power, sum,
+)
+from numpy.linalg import solve
+from numpy.random import randint
+
+from .density_ratio import DensityRatio, KernelInfo
+from .helpers import guvectorize_compute, np_float, to_ndarray
+
+
+def RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num=100, verbose=True):
+    """Estimate the alpha-relative density ratio p(x)/p_alpha(x),
+    where p_alpha(x) = alpha*p(x) + (1 - alpha)*q(x).
+
+    Arguments:
+        x (numpy.ndarray): Sample from p(x).
+        y (numpy.ndarray): Sample from q(x).
+        alpha (float): Mixture ratio.
+        sigma_range (list<float>): Search range of Gaussian kernel bandwidth.
+        lambda_range (list<float>): Search range of regularization parameter.
+        kernel_num (int): Number of kernels. (Default 100)
+        verbose (bool): Indicator to print messages (Default True)
+
+    Returns:
+        densratio.DensityRatio object which has `compute_density_ratio()`.
+
+    References:
+        [1] M. Yamada et al. "Relative Density-Ratio Estimation for Robust
+            Distribution Comparison." NIPS 2011.
+        [2] T. Kanamori, S. Hido, and M. Sugiyama. "A Least-squares Approach
+            to Direct Importance Estimation." JMLR 2009.
+    """
+
+    # Number of samples.
+    nx = x.shape[0]
+    ny = y.shape[0]
+
+    # Number of kernel functions.
+    kernel_num = min(kernel_num, nx)
+
+    # Randomly take a subset of x, to identify centers for the kernels.
+    centers = x[randint(nx, size=kernel_num)]
+
+    if verbose:
+        print("RuLSIF starting...")
+
+    if len(sigma_range) == 1 and len(lambda_range) == 1:
+        sigma = sigma_range[0]
+        lambda_ = lambda_range[0]
+    else:
+        if verbose:
+            print("Searching for the optimal sigma and lambda...")
+
+        # Grid-search cross-validation for optimal kernel and regularization parameters.
+        opt_params = search_sigma_and_lambda(
+            x, y, alpha, centers, sigma_range, lambda_range, verbose
+        )
+        sigma = opt_params["sigma"]
+        lambda_ = opt_params["lambda"]
+
+        if verbose:
+            print(
+                "Found optimal sigma = {:.3f}, lambda = {:.3f}.".format(sigma, lambda_)
+            )
+
+    if verbose:
+        print("Optimizing theta...")
+
+    phi_x = compute_kernel_Gaussian(x, centers, sigma)
+    phi_y = compute_kernel_Gaussian(y, centers, sigma)
+    H = alpha * (phi_x.T.dot(phi_x) / nx) + (1 - alpha) * (phi_y.T.dot(phi_y) / ny)
+    h = phi_x.mean(axis=0).T
+    theta = asarray(solve(H + diag(array(lambda_).repeat(kernel_num)), h)).ravel()
+
+    # No negative coefficients.
+    theta[theta < 0] = 0
+
+    # Compute the alpha-relative density ratio, at the given coordinates.
+    def alpha_density_ratio(coordinates):
+        # Evaluate the kernel at these coordinates, and take the dot-product with the weights.
+        coordinates = to_ndarray(coordinates)
+        phi_x = compute_kernel_Gaussian(coordinates, centers, sigma)
+        alpha_density_ratio = phi_x @ theta
+
+        return alpha_density_ratio
+
+    # Compute the approximate alpha-relative PE-divergence, given samples x and y from the respective distributions.
+    def alpha_PE_divergence(x, y):
+        # This is Y, in Reference 1.
+        x = to_ndarray(x)
+
+        # Obtain alpha-relative density ratio at these points.
+        g_x = alpha_density_ratio(x)
+
+        # This is Y', in Reference 1.
+        y = to_ndarray(y)
+
+        # Obtain alpha-relative density ratio at these points.
+        g_y = alpha_density_ratio(y)
+
+        # Compute the alpha-relative PE-divergence as given in Reference 1.
+        n = x.shape[0]
+        divergence = (
+            -alpha * (g_x @ g_x) / 2 - (1 - alpha) * (g_y @ g_y) / 2 + g_x.sum(axis=0)
+        ) / n - 1.0 / 2
+        return divergence
+
+    # Compute the approximate alpha-relative KL-divergence, given samples x and y from the respective distributions.
+    def alpha_KL_divergence(x, y):
+        # This is Y, in Reference 1.
+        x = to_ndarray(x)
+
+        # Obtain alpha-relative density ratio at these points.
+        g_x = alpha_density_ratio(x)
+
+        # Compute the alpha-relative KL-divergence.
+        n = x.shape[0]
+        divergence = log(g_x).sum(axis=0) / n
+        return divergence
+
+    alpha_PE = alpha_PE_divergence(x, y)
+    alpha_KL = alpha_KL_divergence(x, y)
+
+    if verbose:
+        print("Approximate alpha-relative PE-divergence = {:03.2f}".format(alpha_PE))
+        print("Approximate alpha-relative KL-divergence = {:03.2f}".format(alpha_KL))
+
+    kernel_info = KernelInfo(
+        kernel_type="Gaussian", kernel_num=kernel_num, sigma=sigma, centers=centers
+    )
+    result = DensityRatio(
+        method="RuLSIF",
+        alpha=alpha,
+        theta=theta,
+        lambda_=lambda_,
+        alpha_PE=alpha_PE,
+        alpha_KL=alpha_KL,
+        kernel_info=kernel_info,
+        compute_density_ratio=alpha_density_ratio,
+    )
+
+    if verbose:
+        print("RuLSIF completed.")
+
+    return result
+
+
+# Grid-search cross-validation for the optimal parameters sigma and lambda by leave-one-out cross-validation. See Reference 2.
+def search_sigma_and_lambda(x, y, alpha, centers, sigma_range, lambda_range, verbose):
+    nx = x.shape[0]
+    ny = y.shape[0]
+    n_min = min(nx, ny)
+    kernel_num = centers.shape[0]
+
+    score_new = inf
+    sigma_new = 0
+    lambda_new = 0
+
+    for sigma in sigma_range:
+        phi_x = compute_kernel_Gaussian(x, centers, sigma)  # (nx, kernel_num)
+        phi_y = compute_kernel_Gaussian(y, centers, sigma)  # (ny, kernel_num)
+        H = alpha * (phi_x.T @ phi_x / nx) + (1 - alpha) * (
+            phi_y.T @ phi_y / ny
+        )  # (kernel_num, kernel_num)
+        h = phi_x.mean(axis=0).reshape(-1, 1)  # (kernel_num, 1)
+        phi_x = phi_x[:n_min].T  # (kernel_num, n_min)
+        phi_y = phi_y[:n_min].T  # (kernel_num, n_min)
+
+        for lambda_ in lambda_range:
+            B = H + diag(
+                array(lambda_ * (ny - 1) / ny).repeat(kernel_num)
+            )  # (kernel_num, kernel_num)
+            B_inv_X = solve(B, phi_y)  # (kernel_num, n_min)
+            X_B_inv_X = multiply(phi_y, B_inv_X)  # (kernel_num, n_min)
+            denom = ny * ones(n_min) - ones(kernel_num) @ X_B_inv_X  # (n_min, )
+            B0 = solve(B, h @ ones((1, n_min))) + B_inv_X @ diagflat(
+                h.T @ B_inv_X / denom
+            )  # (kernel_num, n_min)
+            B1 = solve(B, phi_x) + B_inv_X @ diagflat(
+                ones(kernel_num) @ multiply(phi_x, B_inv_X)
+            )  # (kernel_num, n_min)
+            B2 = (ny - 1) * (nx * B0 - B1) / (ny * (nx - 1))  # (kernel_num, n_min)
+            B2[B2 < 0] = 0
+            r_y = multiply(phi_y, B2).sum(axis=0).T  # (n_min, )
+            r_x = multiply(phi_x, B2).sum(axis=0).T  # (n_min, )
+
+            # Squared loss of RuLSIF, without regularization term.
+            # Directly related to the negative of the PE-divergence.
+            score = (r_y @ r_y / 2 - r_x.sum(axis=0)) / n_min
+
+            if verbose:
+                print(
+                    "sigma = %.5f, lambda = %.5f, score = %.5f"
+                    % (sigma, lambda_, score)
+                )
+
+            if score < score_new:
+                score_new = score
+                sigma_new = sigma
+                lambda_new = lambda_
+
+    return {"sigma": sigma_new, "lambda": lambda_new}
+
+
+def _compute_kernel_Gaussian(x_list, y_row, neg_gamma, res) -> None:
+    sq_norm = sum(power(x_list - y_row, 2), 1)
+    multiply(neg_gamma, sq_norm, res)
+    exp(res, res)
+
+
+def _target_numpy_wrapper(x_list, y_list, neg_gamma):
+    res = empty((y_list.shape[0], x_list.shape[0]), np_float)
+    if isinstance(x_list, matrix) or isinstance(y_list, matrix):
+        res = asmatrix(res)
+
+    for j, y_row in enumerate(y_list):
+        # `.T` aligns shapes for matrices, does nothing for 1D ndarray.
+        _compute_kernel_Gaussian(x_list, y_row, neg_gamma, res[j].T)
+
+    return res
+
+
+_compute_functions = {"numpy": _target_numpy_wrapper}
+if guvectorize_compute:
+    _compute_functions.update(
+        {
+            key: guvectorize_compute(key)(_compute_kernel_Gaussian)
+            for key in ("cpu", "parallel")
+        }
+    )
+
+_compute_function = _compute_functions[
+    "cpu" if "cpu" in _compute_functions else "numpy"
+]
+
+
+# Returns a 2D numpy matrix of the kernel evaluated at the gridpoints with coordinates from x_list and y_list.
+def compute_kernel_Gaussian(x_list, y_list, sigma):
+    return _compute_function(x_list, y_list, -0.5 * sigma**-2).T
+
+
+def set_compute_kernel_target(target: str) -> None:
+    global _compute_function
+    if target not in ("numpy", "cpu", "parallel"):
+        raise ValueError(
+            "'target' must be one of the following: 'numpy', 'cpu', or 'parallel'."
+        )
+
+    if target not in _compute_functions:
+        warn("'numba' not available; defaulting to 'numpy'.", ImportWarning)
+        target = "numpy"
+
+    _compute_function = _compute_functions[target]
diff --git a/baselines/densratio/__init__.py b/baselines/densratio/__init__.py
new file mode 100644
index 0000000..2990b7e
--- /dev/null
+++ b/baselines/densratio/__init__.py
@@ -0,0 +1,7 @@
+from warnings import filterwarnings
+
+from .core import densratio
+from .RuLSIF import set_compute_kernel_target
+
+filterwarnings("default", message="'numba'", category=ImportWarning, module="densratio")
+__all__ = ["densratio", "set_compute_kernel_target"]
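For reference, a minimal sketch of how this vendored package is meant to be driven (not part of the patch; the synthetic samples and parameter values are illustrative only):

    import numpy as np
    from baselines.densratio import densratio, set_compute_kernel_target

    set_compute_kernel_target("numpy")  # "cpu"/"parallel" require numba
    x = np.random.normal(1.0, 1.0 / 8, size=(200, 1))  # sample from p(x)
    y = np.random.normal(1.0, 1.0 / 2, size=(200, 1))  # sample from q(x)
    result = densratio(
        x, y, alpha=0, sigma_range=[0.1, 0.3, 1.0], lambda_range=[0.01, 0.1, 1.0], verbose=False
    )
    w = result.compute_density_ratio(x)  # estimated p(x)/q(x) at the x points
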
diff --git a/baselines/densratio/__pycache__/RuLSIF.cpython-311.pyc b/baselines/densratio/__pycache__/RuLSIF.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7070ffe799bab904ad440942debc5ab26302434
Binary files /dev/null and b/baselines/densratio/__pycache__/RuLSIF.cpython-311.pyc differ
diff --git a/baselines/densratio/__pycache__/__init__.cpython-311.pyc b/baselines/densratio/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0de4279048ed62db47d770eb17564d4b25edb8b9
Binary files /dev/null and b/baselines/densratio/__pycache__/__init__.cpython-311.pyc differ
diff --git a/baselines/densratio/__pycache__/core.cpython-311.pyc b/baselines/densratio/__pycache__/core.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a0d9412feb8262bc7ae6f22ed9d307edb07d337
Binary files /dev/null and b/baselines/densratio/__pycache__/core.cpython-311.pyc differ
diff --git a/baselines/densratio/__pycache__/density_ratio.cpython-311.pyc b/baselines/densratio/__pycache__/density_ratio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..70c0b20ea7dde12cbc7dba04b4a1cb0f458557e9
Binary files /dev/null and b/baselines/densratio/__pycache__/density_ratio.cpython-311.pyc differ
diff --git a/baselines/densratio/__pycache__/helpers.cpython-311.pyc b/baselines/densratio/__pycache__/helpers.cpython-311.pyc
new file mode 100644
Binary files /dev/null and b/baselines/densratio/__pycache__/helpers.cpython-311.pyc differ
diff --git a/baselines/densratio/core.py b/baselines/densratio/core.py
new file mode 100644
index 0000000..c221419
--- /dev/null
+++ b/baselines/densratio/core.py
@@ -0,0 +1,70 @@
+"""
+densratio.core
+~~~~~~~~~~~~~~
+
+Estimate Density Ratio p(x)/q(x)
+"""
+
+from numpy import linspace
+
+from .helpers import to_ndarray
+from .RuLSIF import RuLSIF
+
+
+def densratio(
+    x, y, alpha=0, sigma_range="auto", lambda_range="auto", kernel_num=100, verbose=True
+):
+    """Estimate alpha-mixture Density Ratio p(x)/(alpha*p(x) + (1 - alpha)*q(x))
+
+    Arguments:
+        x: sample from p(x).
+        y: sample from q(x).
+        alpha: Default 0 - corresponds to ordinary density ratio.
+        sigma_range: search range of Gaussian kernel bandwidth.
+            Default "auto" means 10^-3, 10^-2, ..., 10^9.
+        lambda_range: search range of regularization parameter for uLSIF.
+            Default "auto" means 10^-3, 10^-2, ..., 10^9.
+        kernel_num: number of kernels. Default 100.
+        verbose: indicator to print messages. Default True.
+
+    Returns:
+        densratio.DensityRatio object which has `compute_density_ratio()`.
+
+    Raises:
+        ValueError: if dimension of x != dimension of y
+
+    Usage::
+        >>> from scipy.stats import norm
+        >>> from densratio import densratio
+
+        >>> x = norm.rvs(size=200, loc=1, scale=1./8)
+        >>> y = norm.rvs(size=200, loc=1, scale=1./2)
+        >>> result = densratio(x, y, alpha=0.7)
+        >>> print(result)
+
+        >>> density_ratio = result.compute_density_ratio(y)
+        >>> print(density_ratio)
+    """
+
+    x = to_ndarray(x)
+    y = to_ndarray(y)
+
+    if x.shape[1] != y.shape[1]:
+        raise ValueError("x and y must have the same dimension.")
+
+    if isinstance(sigma_range, str) and sigma_range != "auto":
+        raise TypeError("Invalid value for sigma_range.")
+
+    if isinstance(lambda_range, str) and lambda_range != "auto":
+        raise TypeError("Invalid value for lambda_range.")
+
+    if sigma_range is None or (isinstance(sigma_range, str) and sigma_range == "auto"):
+        sigma_range = 10 ** linspace(-3, 9, 13)
+
+    if lambda_range is None or (
+        isinstance(lambda_range, str) and lambda_range == "auto"
+    ):
+        lambda_range = 10 ** linspace(-3, 9, 13)
+
+    result = RuLSIF(x, y, alpha, sigma_range, lambda_range, kernel_num, verbose)
+    return result
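One reason to prefer alpha > 0 under covariate shift: the population quantity p(x)/(alpha*p(x) + (1 - alpha)*q(x)) is bounded above by 1/alpha, so the weights cannot blow up where q(x) is near zero. A hedged sketch, not part of the patch; the data and the choice alpha=0.2 are illustrative:

    import numpy as np
    from baselines.densratio import densratio

    rng = np.random.default_rng(0)
    x = rng.normal(0.0, 1.0, size=(300, 1))  # sample from p(x)
    y = rng.normal(1.5, 1.0, size=(300, 1))  # sample from q(x), shifted
    result = densratio(x, y, alpha=0.2, verbose=False)
    w = result.compute_density_ratio(x)
    # The true alpha-relative ratio is <= 1/alpha = 5; the RuLSIF estimate
    # is regularized toward that range rather than guaranteed to respect it.
    print(w.min(), w.max())
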
p),(p),()->(m)", + nopython=True, + target=target, + cache=cache, + ) + + +def is_numeric(x): + return isinstance(x, int) or isinstance(x, float) + + +def to_ndarray(x): + if isinstance(x, ndarray): + if len(x.shape) == 1: + return x.reshape(-1, 1) + else: + return x + elif str(type(x)) == "": + return x.values + elif not x: + raise ValueError("Cannot transform to numpy.matrix.") + else: + return to_ndarray(array(x)) diff --git a/guillory21_doc/doc.py b/baselines/doc.py similarity index 100% rename from guillory21_doc/doc.py rename to baselines/doc.py diff --git a/baselines/impweight.py b/baselines/impweight.py new file mode 100644 index 0000000..83e7f6e --- /dev/null +++ b/baselines/impweight.py @@ -0,0 +1,52 @@ +import numpy as np +from scipy.sparse import issparse, vstack +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KernelDensity + + +def logreg(Xtr, ytr, Xte): + # check "Direct Density Ratio Estimation for + # Large-scale Covariate Shift Adaptation", Eq.28 + + if issparse(Xtr): + X = vstack([Xtr, Xte]) + else: + X = np.concatenate([Xtr, Xte]) + + y = [0] * Xtr.shape[0] + [1] * Xte.shape[0] + + logreg = GridSearchCV( + LogisticRegression(), + param_grid={"C": np.logspace(-3, 3, 7), "class_weight": ["balanced", None]}, + n_jobs=-1, + ) + logreg.fit(X, y) + probs = logreg.predict_proba(Xtr) + prob_train, prob_test = probs[:, 0], probs[:, 1] + prior_train = Xtr.shape[0] + prior_test = Xte.shape[0] + w = (prior_train / prior_test) * (prob_test / prob_train) + return w + + +kdex2_params = {"bandwidth": np.logspace(-1, 1, 20)} + + +def kdex2_lltr(Xtr): + if issparse(Xtr): + Xtr = Xtr.toarray() + return GridSearchCV(KernelDensity(), kdex2_params).fit(Xtr).score_samples(Xtr) + + +def kdex2_weights(Xtr, Xte, log_likelihood_tr): + log_likelihood_te = ( + GridSearchCV(KernelDensity(), kdex2_params).fit(Xte).score_samples(Xtr) + ) + likelihood_tr = np.exp(log_likelihood_tr) + likelihood_te = np.exp(log_likelihood_te) + return likelihood_te / likelihood_tr + + +def get_acc(tr_preds, ytr, w): + return np.sum((1.0 * (tr_preds == ytr)) * w) / np.sum(w) diff --git a/baselines/models.py b/baselines/models.py new file mode 100644 index 0000000..001f02c --- /dev/null +++ b/baselines/models.py @@ -0,0 +1,140 @@ +# import itertools +# from typing import Iterable + +# import quapy as qp +# import quapy.functional as F +# from densratio import densratio +# from quapy.method.aggregative import * +# from quapy.protocol import ( +# AbstractStochasticSeededProtocol, +# OnLabelledCollectionProtocol, +# ) +# from scipy.sparse import issparse, vstack +# from scipy.spatial.distance import cdist +# from scipy.stats import multivariate_normal +# from sklearn.linear_model import LogisticRegression +# from sklearn.model_selection import GridSearchCV +# from sklearn.neighbors import KernelDensity + +import time + +import numpy as np +import sklearn.metrics as metrics +from pykliep import DensityRatioEstimator +from quapy.protocol import APP +from scipy.sparse import issparse, vstack +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KernelDensity + +import baselines.impweight as iw +from baselines.densratio import densratio +from quacc.dataset import Dataset + + +# --------------------------------------------------------------------------------------- +# Methods of "importance weight", e.g., by ratio density estimation (KLIEP, SILF, LogReg) +# 
diff --git a/baselines/models.py b/baselines/models.py
new file mode 100644
index 0000000..001f02c
--- /dev/null
+++ b/baselines/models.py
@@ -0,0 +1,140 @@
+# import itertools
+# from typing import Iterable

+# import quapy as qp
+# import quapy.functional as F
+# from densratio import densratio
+# from quapy.method.aggregative import *
+# from quapy.protocol import (
+#     AbstractStochasticSeededProtocol,
+#     OnLabelledCollectionProtocol,
+# )
+# from scipy.sparse import issparse, vstack
+# from scipy.spatial.distance import cdist
+# from scipy.stats import multivariate_normal
+# from sklearn.linear_model import LogisticRegression
+# from sklearn.model_selection import GridSearchCV
+# from sklearn.neighbors import KernelDensity
+
+import time
+
+import numpy as np
+import sklearn.metrics as metrics
+from quapy.protocol import APP
+from scipy.sparse import issparse, vstack
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KernelDensity
+
+import baselines.impweight as iw
+from baselines.densratio import densratio
+from baselines.pykliep import DensityRatioEstimator
+from quacc.dataset import Dataset
+
+
+# ---------------------------------------------------------------------------------------
+# Methods of "importance weight", e.g., by ratio density estimation (KLIEP, uLSIF, LogReg)
+# ---------------------------------------------------------------------------------------
+class ImportanceWeight:
+    def weights(self, Xtr, ytr, Xte):
+        ...
+
+
+class KLIEP(ImportanceWeight):
+    def __init__(self):
+        pass
+
+    def weights(self, Xtr, ytr, Xte):
+        kliep = DensityRatioEstimator()
+        kliep.fit(Xtr, Xte)
+        return kliep.predict(Xtr)
+
+
+class USILF(ImportanceWeight):
+    def __init__(self, alpha=0.0):
+        self.alpha = alpha
+
+    def weights(self, Xtr, ytr, Xte):
+        dense_ratio_obj = densratio(Xtr, Xte, alpha=self.alpha, verbose=False)
+        return dense_ratio_obj.compute_density_ratio(Xtr)
+
+
+class LogReg(ImportanceWeight):
+    def __init__(self):
+        pass
+
+    def weights(self, Xtr, ytr, Xte):
+        # See "Direct Density Ratio Estimation for
+        # Large-scale Covariate Shift Adaptation", Eq. 28.
+
+        if issparse(Xtr):
+            X = vstack([Xtr, Xte])
+        else:
+            X = np.concatenate([Xtr, Xte])
+
+        y = [0] * Xtr.shape[0] + [1] * Xte.shape[0]
+
+        logreg = GridSearchCV(
+            LogisticRegression(),
+            param_grid={"C": np.logspace(-3, 3, 7), "class_weight": ["balanced", None]},
+            n_jobs=-1,
+        )
+        logreg.fit(X, y)
+        probs = logreg.predict_proba(Xtr)
+        prob_train, prob_test = probs[:, 0], probs[:, 1]
+        prior_train = Xtr.shape[0]
+        prior_test = Xte.shape[0]
+        w = (prior_train / prior_test) * (prob_test / prob_train)
+        return w
+
+
+class KDEx2(ImportanceWeight):
+    def __init__(self):
+        pass
+
+    def weights(self, Xtr, ytr, Xte):
+        params = {"bandwidth": np.logspace(-1, 1, 20)}
+        log_likelihood_tr = (
+            GridSearchCV(KernelDensity(), params).fit(Xtr).score_samples(Xtr)
+        )
+        log_likelihood_te = (
+            GridSearchCV(KernelDensity(), params).fit(Xte).score_samples(Xtr)
+        )
+        likelihood_tr = np.exp(log_likelihood_tr)
+        likelihood_te = np.exp(log_likelihood_te)
+        return likelihood_te / likelihood_tr
+
+
+if __name__ == "__main__":
+    # d = Dataset("rcv1", target="CCAT").get_raw()
+    d = Dataset("imdb", n_prevalences=1).get()[0]
+
+    tstart = time.time()
+    lr = LogisticRegression()
+    lr.fit(*d.train.Xy)
+    val_preds = lr.predict(d.validation.X)
+    protocol = APP(
+        d.test,
+        n_prevalences=21,
+        repeats=1,
+        sample_size=100,
+        return_type="labelled_collection",
+    )
+
+    results = []
+    for sample in protocol():
+        wx = iw.logreg(d.validation.X, d.validation.y, sample.X)
+        test_preds = lr.predict(sample.X)
+        estim_acc = np.sum((1.0 * (val_preds == d.validation.y)) * wx) / np.sum(wx)
+        true_acc = metrics.accuracy_score(sample.y, test_preds)
+        results.append((sample.prevalence(), estim_acc, true_acc))
+
+    tend = time.time()
+
+    for r in results:
+        print(*r)
+
+    print(f"logreg finished [took {tend-tstart:.3f}s]")
+
+    import win11toast
+
+    win11toast.notify("models.py", "Completed")
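All four estimators share the ImportanceWeight interface, so they can be swapped behind the same evaluation loop. A hedged sketch, not part of the patch (synthetic data; it assumes the repository's own environment, since importing baselines.models pulls in quapy and quacc):

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    from baselines.models import KDEx2, KLIEP, LogReg, USILF

    rng = np.random.default_rng(0)
    Xtr = rng.normal(0.0, 1.0, size=(300, 2))
    ytr = (Xtr[:, 0] > 0).astype(int)
    Xte = rng.normal(0.3, 1.0, size=(300, 2))

    clf = LogisticRegression().fit(Xtr, ytr)
    tr_preds = clf.predict(Xtr)
    for method in (KLIEP(), USILF(alpha=0.05), LogReg(), KDEx2()):
        w = method.weights(Xtr, ytr, Xte)
        estim_acc = np.sum((tr_preds == ytr) * w) / np.sum(w)
        print(type(method).__name__, estim_acc)
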
diff --git a/baselines/pykliep.py b/baselines/pykliep.py
new file mode 100644
index 0000000..b9ccedd
--- /dev/null
+++ b/baselines/pykliep.py
@@ -0,0 +1,219 @@
+import warnings
+
+import numpy as np
+from scipy.sparse import csr_matrix
+
+
+class DensityRatioEstimator:
+    """
+    Class to accomplish direct density estimation implementing the original KLIEP
+    algorithm from "Direct Importance Estimation with Model Selection
+    and Its Application to Covariate Shift Adaptation" by Sugiyama et al.
+
+    The training set is distributed via
+        train ~ p(x)
+    and the test set is distributed via
+        test ~ q(x).
+
+    The KLIEP algorithm and its variants approximate w(x) = q(x) / p(x) directly.
+    The predict function returns the estimate of w(x). The function w(x) can serve
+    as sample weights for the training set during training, to modify the
+    expectation that the model's loss function is optimized over, i.e.
+
+        E_{x ~ w(x)p(x)} loss(x) = E_{x ~ q(x)} loss(x).
+
+    Usage:
+        The fit method runs the KLIEP algorithm using LCV and stores the value of J
+        trained on the entire training/test set with the best sigma found.
+        Use the predict method on the training set to obtain the sample weights.
+    """
+
+    def __init__(
+        self,
+        max_iter=5000,
+        num_params=[0.1, 0.2],
+        epsilon=1e-4,
+        cv=3,
+        sigmas=[0.01, 0.1, 0.25, 0.5, 0.75, 1],
+        random_state=None,
+        verbose=0,
+    ):
+        """
+        Direct density-ratio estimation using an inner LCV loop to select the model.
+        Can be used with sklearn cross-validation methods, or with a standard grid search.
+
+        max_iter : Number of iterations to perform.
+        num_params : List of fractions of test-set vectors used to construct the
+                     approximation for inner LCV. Must be floats; the original
+                     paper used 10%, i.e. 0.1.
+        sigmas : List of sigmas to be used in the inner LCV loop.
+        epsilon : Additive factor in the iterative algorithm for numerical stability.
+        """
+        self.max_iter = max_iter
+        self.num_params = num_params
+        self.epsilon = epsilon
+        self.verbose = verbose
+        self.sigmas = sigmas
+        self.cv = cv
+        self.random_state = random_state
+        self._fitted = False
+        self._phi_fitted = False
+
+    def fit(self, X_train, X_test, alpha_0=None):
+        """Uses cross validation to select sigma as in the original paper (LCV).
+        In a break from sklearn convention, y=X_test.
+        The parameter cv corresponds to R in the original paper.
+        Once found, the best sigma is used to train on the full set."""
+
+        # LCV loop, shuffle a copy in place for performance.
+        cv = self.cv
+        chunk = int(X_test.shape[0] / float(cv))
+        if self.random_state is not None:
+            np.random.seed(self.random_state)
+        # if isinstance(X_test, csr_matrix):
+        #     X_test_shuffled = X_test.toarray()
+        # else:
+        #     X_test_shuffled = X_test.copy()
+        X_test_shuffled = X_test.copy()
+
+        np.random.shuffle(X_test_shuffled)
+
+        j_scores = {}
+
+        if type(self.sigmas) != list:
+            self.sigmas = [self.sigmas]
+
+        if type(self.num_params) != list:
+            self.num_params = [self.num_params]
+
+        if len(self.sigmas) * len(self.num_params) > 1:
+            # Inner LCV loop
+            for num_param in self.num_params:
+                for sigma in self.sigmas:
+                    j_scores[(num_param, sigma)] = np.zeros(cv)
+                    for k in range(1, cv + 1):
+                        if self.verbose > 0:
+                            print("Training: sigma: %s R: %s" % (sigma, k))
+                        X_test_fold = X_test_shuffled[(k - 1) * chunk : k * chunk, :]
+                        j_scores[(num_param, sigma)][k - 1] = self._fit(
+                            X_train=X_train,
+                            X_test=X_test_fold,
+                            num_parameters=num_param,
+                            sigma=sigma,
+                        )
+                    j_scores[(num_param, sigma)] = np.mean(j_scores[(num_param, sigma)])
+
+            sorted_scores = sorted(
+                [x for x in j_scores.items() if np.isfinite(x[1])],
+                key=lambda x: x[1],
+                reverse=True,
+            )
+            if len(sorted_scores) == 0:
+                warnings.warn("LCV failed to converge for all values of sigma.")
+                return self
+            self._sigma = sorted_scores[0][0][1]
+            self._num_parameters = sorted_scores[0][0][0]
+            self._j_scores = sorted_scores
+        else:
+            self._sigma = self.sigmas[0]
+            self._num_parameters = self.num_params[0]
+        # Refit on the full set with the best sigma.
+        self._j = self._fit(
+            X_train=X_train,
+            X_test=X_test_shuffled,
+            num_parameters=self._num_parameters,
+            sigma=self._sigma,
+        )
+
+        return self  # Compatibility with sklearn
+
+    def _fit(self, X_train, X_test, num_parameters, sigma, alpha_0=None):
+        """Fits the estimator (w-hat) with the given parameters and returns J."""
+
+        if type(num_parameters) == float:
+            num_parameters = int(X_test.shape[0] * num_parameters)
+
+        self._select_param_vectors(
+            X_test=X_test, sigma=sigma, num_parameters=num_parameters
+        )
+
+        # if isinstance(X_train, csr_matrix):
+        #     X_train = X_train.toarray()
+        X_train = self._reshape_X(X_train)
+        X_test = self._reshape_X(X_test)
+
+        if alpha_0 is None:
+            alpha_0 = np.ones(shape=(num_parameters, 1)) / float(num_parameters)
+
+        self._find_alpha(
+            X_train=X_train,
+            X_test=X_test,
+            num_parameters=num_parameters,
+            epsilon=self.epsilon,
+            alpha_0=alpha_0,
+            sigma=sigma,
+        )
+
+        return self._calculate_j(X_test, sigma=sigma)
+
+    def _calculate_j(self, X_test, sigma):
+        pred = self.predict(X_test, sigma=sigma) + 0.0000001
+        log = np.log(pred).sum()
+        return log / (X_test.shape[0])
+
+    def score(self, X_test):
+        """Return the J score, similar to sklearn's API"""
+        return self._calculate_j(X_test=X_test, sigma=self._sigma)
+
+    @staticmethod
+    def _reshape_X(X):
+        """Reshape input from mxn to mx1xn to take advantage of numpy broadcasting."""
+        if len(X.shape) != 3:
+            return X.reshape((X.shape[0], 1, X.shape[1]))
+        return X
+
+    def _select_param_vectors(self, X_test, sigma, num_parameters):
+        """X_test is the test set. b is the number of parameters."""
+        indices = np.random.choice(X_test.shape[0], size=num_parameters, replace=False)
+        self._test_vectors = X_test[indices, :].copy()
+        self._phi_fitted = True
+
+    def _phi(self, X, sigma=None):
+        if sigma is None:
+            sigma = self._sigma
+
+        if self._phi_fitted:
+            return np.exp(
+                -np.sum((X - self._test_vectors) ** 2, axis=-1) / (2 * sigma**2)
+            )
+        raise Exception("Phi not fitted.")
+
+    def _find_alpha(self, alpha_0, X_train, X_test, num_parameters, sigma, epsilon):
+        A = self._phi(X_test, sigma)
+        b = self._phi(X_train, sigma).sum(axis=0) / X_train.shape[0]
+        b = b.reshape((num_parameters, 1))
+
+        out = alpha_0.copy()
+        for k in range(self.max_iter):
+            mat = np.dot(A, out)
+            mat += 0.000000001
+            out += epsilon * np.dot(np.transpose(A), 1.0 / mat)
+            out += b * (
+                ((1 - np.dot(np.transpose(b), out)) / np.dot(np.transpose(b), b))
+            )
+            out = np.maximum(0, out)
+            out /= np.dot(np.transpose(b), out)
+
+        self._alpha = out
+        self._fitted = True
+
+    def predict(self, X, sigma=None):
+        """Equivalent of w(X) from the original paper."""
+
+        X = self._reshape_X(X)
+        if not self._fitted:
+            raise Exception("Not fitted!")
+        return np.dot(self._phi(X, sigma=sigma), self._alpha).reshape((X.shape[0],))
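A hedged usage sketch for the estimator above (not part of the patch; data is synthetic): fit on train/test features, then evaluate w(x) = q(x)/p(x) at the training points.

    import numpy as np
    from baselines.pykliep import DensityRatioEstimator

    rng = np.random.default_rng(0)
    X_train = rng.normal(0.0, 1.0, size=(400, 2))  # ~ p(x)
    X_test = rng.normal(0.4, 1.0, size=(400, 2))   # ~ q(x)

    kliep = DensityRatioEstimator(sigmas=[0.25, 0.5, 1.0], random_state=0)
    kliep.fit(X_train, X_test)     # inner LCV selects sigma and num_params
    w = kliep.predict(X_train)     # importance weights for the training set
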
diff --git a/elsahar19_rca/rca.py b/baselines/rca.py
similarity index 100%
rename from elsahar19_rca/rca.py
rename to baselines/rca.py
diff --git a/jiang18_trustscore/trustscore.py b/jiang18_trustscore/trustscore.py
deleted file mode 100644
index 9b6d417..0000000
--- a/jiang18_trustscore/trustscore.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2018 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from sklearn.neighbors import KDTree, KNeighborsClassifier
-
-
-class TrustScore:
-    """
-    Trust Score: a measure of classifier uncertainty based on nearest neighbors.
-    """
-
-    def __init__(self, k=10, alpha=0.0, filtering="none", min_dist=1e-12):
-        """
-        k and alpha are the tuning parameters for the filtering,
-        filtering: method of filtering. option are "none", "density",
-        "uncertainty"
-        min_dist: some small number to mitigate possible division by 0.
-        """
-        self.k = k
-        self.filtering = filtering
-        self.alpha = alpha
-        self.min_dist = min_dist
-
-    def filter_by_density(self, X: np.array):
-        """Filter out points with low kNN density.
-
-        Args:
-            X: an array of sample points.
-
-        Returns:
-            A subset of the array without points in the bottom alpha-fraction of
-            original points of kNN density.
-        """
-        kdtree = KDTree(X)
-        knn_radii = kdtree.query(X, k=self.k)[0][:, -1]
-        eps = np.percentile(knn_radii, (1 - self.alpha) * 100)
-        return X[np.where(knn_radii <= eps)[0], :]
-
-    def filter_by_uncertainty(self, X: np.array, y: np.array):
-        """Filter out points with high label disagreement amongst its kNN neighbors.
-
-        Args:
-            X: an array of sample points.
-
-        Returns:
-            A subset of the array without points in the bottom alpha-fraction of
-            samples with highest disagreement amongst its k nearest neighbors.
-        """
-        neigh = KNeighborsClassifier(n_neighbors=self.k)
-        neigh.fit(X, y)
-        confidence = neigh.predict_proba(X)
-        cutoff = np.percentile(confidence, self.alpha * 100)
-        unfiltered_idxs = np.where(confidence >= cutoff)[0]
-        return X[unfiltered_idxs, :], y[unfiltered_idxs]
-
-    def fit(self, X: np.array, y: np.array):
-        """Initialize trust score precomputations with training data.
-
-        WARNING: assumes that the labels are 0-indexed (i.e.
-        0, 1,..., n_labels-1).
-
-        Args:
-            X: an array of sample points.
-            y: corresponding labels.
-        """
-
-        self.n_labels = np.max(y) + 1
-        self.kdtrees = [None] * self.n_labels
-        if self.filtering == "uncertainty":
-            X_filtered, y_filtered = self.filter_by_uncertainty(X, y)
-        for label in range(self.n_labels):
-            if self.filtering == "none":
-                X_to_use = X[np.where(y == label)[0]]
-                self.kdtrees[label] = KDTree(X_to_use)
-            elif self.filtering == "density":
-                X_to_use = self.filter_by_density(X[np.where(y == label)[0]])
-                self.kdtrees[label] = KDTree(X_to_use)
-            elif self.filtering == "uncertainty":
-                X_to_use = X_filtered[np.where(y_filtered == label)[0]]
-                self.kdtrees[label] = KDTree(X_to_use)
-
-            if len(X_to_use) == 0:
-                print(
-                    "Filtered too much or missing examples from a label! Please lower "
-                    "alpha or check data."
-                )
-
-    def get_score(self, X: np.array, y_pred: np.array):
-        """Compute the trust scores.
-
-        Given a set of points, determines the distance to each class.
-
-        Args:
-            X: an array of sample points.
-            y_pred: The predicted labels for these points.
-
-        Returns:
-            The trust score, which is ratio of distance to closest class that was not
-            the predicted class to the distance to the predicted class.
-        """
-        d = np.tile(None, (X.shape[0], self.n_labels))
-        for label_idx in range(self.n_labels):
-            d[:, label_idx] = self.kdtrees[label_idx].query(X, k=2)[0][:, -1]
-
-        sorted_d = np.sort(d, axis=1)
-        d_to_pred = d[range(d.shape[0]), y_pred]
-        d_to_closest_not_pred = np.where(
-            sorted_d[:, 0] != d_to_pred, sorted_d[:, 0], sorted_d[:, 1]
-        )
-        return d_to_closest_not_pred / (d_to_pred + self.min_dist)
-
-
-class KNNConfidence:
-    """Baseline which uses disagreement to kNN classifier."""
-
-    def __init__(self, k=10):
-        self.k = k
-
-    def fit(self, X, y):
-        self.kdtree = KDTree(X)
-        self.y = y
-
-    def get_score(self, X, y_pred):
-        knn_idxs = self.kdtree.query(X, k=self.k)[1]
-        knn_outputs = self.y[knn_idxs]
-        return np.mean(
-            knn_outputs == np.transpose(np.tile(y_pred, (self.k, 1))), axis=1
-        )
diff --git a/jiang18_trustscore/trustscore_evaluation.py b/jiang18_trustscore/trustscore_evaluation.py
deleted file mode 100644
index 78f50ec..0000000
--- a/jiang18_trustscore/trustscore_evaluation.py
+++ /dev/null
@@ -1,286 +0,0 @@
-# Copyright 2018 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from sklearn.model_selection import StratifiedShuffleSplit
-import matplotlib.pyplot as plt
-from sklearn.decomposition import PCA
-import matplotlib.cm as cm
-from sklearn.metrics import precision_recall_curve
-import tensorflow as tf
-
-from sklearn.linear_model import LogisticRegression
-from sklearn.svm import LinearSVC
-from sklearn.ensemble import RandomForestClassifier
-
-
-def run_logistic(X_train, y_train, X_test, y_test, get_training=False):
-  model = LogisticRegression()
-  model.fit(X_train, y_train)
-  y_pred = model.predict(X_test)
-  all_confidence = model.predict_proba(X_test)
-  confidences = all_confidence[range(len(y_pred)), y_pred]
-  if not get_training:
-    return y_pred, confidences
-  y_pred_training = model.predict(X_train)
-  all_confidence_training = model.predict_proba(X_train)
-  confidence_training = all_confidence_training[range(len(y_pred_training)),
-                                                y_pred_training]
-  return y_pred, confidences, y_pred_training, confidence_training
-
-
-def run_linear_svc(X_train, y_train, X_test, y_test, get_training=False):
-  model = LinearSVC()
-  model.fit(X_train, y_train)
-  y_pred = model.predict(X_test)
-  all_confidence = model.decision_function(X_test)
-  confidences = all_confidence[range(len(y_pred)), y_pred]
-  if not get_training:
-    return y_pred, confidences
-  y_pred_training = model.predict(X_train)
-  all_confidence_training = model.decision_function(X_train)
-  confidence_training = all_confidence_training[range(len(y_pred_training)),
-                                                y_pred_training]
-  return y_pred, confidences, y_pred_training, confidence_training
-
-
-def run_random_forest(X_train, y_train, X_test, y_test, get_training=False):
-  model = RandomForestClassifier()
-  model.fit(X_train, y_train)
-  y_pred = model.predict(X_test)
-  all_confidence = model.predict_proba(X_test)
-  confidences = all_confidence[range(len(y_pred)), y_pred]
-  if not get_training:
-    return y_pred, confidences
-  y_pred_training = model.predict(X_train)
-  all_confidence_training = model.predict_proba(X_train)
-  confidence_training = all_confidence_training[range(len(y_pred_training)),
-                                                y_pred_training]
-  return y_pred, confidences, y_pred_training, confidence_training
-
-
-def run_simple_NN(X,
-                  y,
-                  X_test,
-                  y_test,
-                  num_iter=10000,
-                  hidden_units=100,
-                  learning_rate=0.05,
-                  batch_size=100,
-                  display_steps=1000,
-                  n_layers=1,
-                  get_training=False):
-  """Run a NN with a single layer on some data.
-
-  Returns the predicted values as well as the confidences.
-  """
-  n_labels = np.max(y) + 1
-  n_features = X.shape[1]
-
-  x = tf.placeholder(tf.float32, [None, n_features])
-  y_ = tf.placeholder(tf.float32, [None, n_labels])
-
-  def simple_NN(input_placeholder, n_layers):
-
-    W_in = weight_variable([n_features, hidden_units])
-    b_in = bias_variable([hidden_units])
-    W_mid = [
-        weight_variable([hidden_units, hidden_units])
-        for i in range(n_layers - 1)
-    ]
-    b_mid = [bias_variable([hidden_units]) for i in range(n_layers - 1)]
-    W_out = weight_variable([hidden_units, n_labels])
-    b_out = bias_variable([n_labels])
-
-    layers = [tf.nn.relu(tf.matmul(input_placeholder, W_in) + b_in)]
-    for i in range(n_layers - 1):
-      layer = tf.nn.relu(tf.matmul(layers[-1], W_mid[i]) + b_mid[i])
-      layers.append(layer)
-
-    logits = tf.matmul(layers[-1], W_out) + b_out
-    return logits
-
-  NN_logits = simple_NN(x, n_layers)
-
-  cross_entropy = tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=NN_logits))
-  train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)
-  correct_prediction = tf.equal(tf.argmax(NN_logits, 1), tf.argmax(y_, 1))
-  accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
-
-  def one_hot(ns):
-    return np.eye(n_labels)[ns]
-
-  y_onehot = one_hot(y)
-  y_test_onehot = one_hot(y_test)
-
-  with tf.Session() as sess:
-    sess.run(tf.global_variables_initializer())
-    for i in range(num_iter):
-      ns = np.random.randint(0, len(X), size=batch_size)
-      if (i + 1) % display_steps == 0:
-        train_accuracy = accuracy.eval(feed_dict={x: X, y_: y_onehot})
-        test_accuracy = accuracy.eval(feed_dict={x: X_test, y_: y_test_onehot})
-
-        print("step %d, training accuracy %g, test accuracy %g" %
-              (i + 1, train_accuracy, test_accuracy))
-      train_step.run(feed_dict={x: X[ns, :], y_: y_onehot[ns, :]})
-
-    testing_logits = NN_logits.eval(feed_dict={x: X_test})
-    testing_prediction = tf.argmax(NN_logits, 1).eval(feed_dict={x: X_test})
-    NN_softmax = tf.nn.softmax(NN_logits).eval(feed_dict={x: X_test})
-    testing_confidence_raw = tf.reduce_max(NN_softmax,
-                                           1).eval(feed_dict={x: X_test})
-
-    if not get_training:
-      return testing_prediction, testing_confidence_raw
-    training_prediction = tf.argmax(NN_logits, 1).eval(feed_dict={x: X})
-    NN_softmax = tf.nn.softmax(NN_logits).eval(feed_dict={x: X})
-    training_confidence_raw = tf.reduce_max(NN_softmax,
-                                            1).eval(feed_dict={x: X})
-    return testing_prediction, testing_confidence_raw, training_prediction, training_confidence_raw
-
-
-def plot_precision_curve(
-    extra_plot_title,
-    percentile_levels,
-    signal_names,
-    final_TPs,
-    final_stderrs,
-    final_misclassification,
-    model_name="Model",
-    colors=["blue", "darkorange", "brown", "red", "purple"],
-    legend_loc=None,
-    figure_size=None,
-    ylim=None):
-  if figure_size is not None:
-    plt.figure(figsize=figure_size)
-  title = "Precision Curve" if extra_plot_title == "" else extra_plot_title
-  plt.title(title, fontsize=20)
-  colors = colors + list(cm.rainbow(np.linspace(0, 1, len(final_TPs))))
-
-  plt.xlabel("Percentile level", fontsize=18)
-  plt.ylabel("Precision", fontsize=18)
-  for i, signal_name in enumerate(signal_names):
-    ls = "--" if ("Model" in signal_name) else "-"
-    plt.plot(
-        percentile_levels, final_TPs[i], ls, c=colors[i], label=signal_name)
-
-    plt.fill_between(
-        percentile_levels,
-        final_TPs[i] - final_stderrs[i],
-        final_TPs[i] + final_stderrs[i],
-        color=colors[i],
-        alpha=0.1)
-
-  if legend_loc is None:
-    if 0. in percentile_levels:
-      plt.legend(loc="lower right", fontsize=14)
-    else:
-      plt.legend(loc="upper left", fontsize=14)
-  else:
-    if legend_loc == "outside":
-      plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left", fontsize=14)
-    else:
-      plt.legend(loc=legend_loc, fontsize=14)
-  if ylim is not None:
-    plt.ylim(*ylim)
-  model_acc = 100 * (1 - final_misclassification)
-  plt.axvline(x=model_acc, linestyle="dotted", color="black")
-  plt.show()
-
-
-def run_precision_recall_experiment_general(X,
-                                            y,
-                                            n_repeats,
-                                            percentile_levels,
-                                            trainer,
-                                            test_size=0.5,
-                                            extra_plot_title="",
-                                            signals=[],
-                                            signal_names=[],
-                                            predict_when_correct=False,
-                                            skip_print=False):
-
-  def get_stderr(L):
-    return np.std(L) / np.sqrt(len(L))
-
-  all_signal_names = ["Model Confidence"] + signal_names
-  all_TPs = [[[] for p in percentile_levels] for signal in all_signal_names]
-  misclassifications = []
-  sign = 1 if predict_when_correct else -1
-  sss = StratifiedShuffleSplit(
-      n_splits=n_repeats, test_size=test_size, random_state=0)
-  for train_idx, test_idx in sss.split(X, y):
-    X_train = X[train_idx, :]
-    y_train = y[train_idx]
-    X_test = X[test_idx, :]
-    y_test = y[test_idx]
-    testing_prediction, testing_confidence_raw = trainer(
-        X_train, y_train, X_test, y_test)
-    target_points = np.where(
-        testing_prediction == y_test)[0] if predict_when_correct else np.where(
-            testing_prediction != y_test)[0]
-
-    final_signals = [testing_confidence_raw]
-    for signal in signals:
-      signal.fit(X_train, y_train)
-      final_signals.append(signal.get_score(X_test, testing_prediction))
-
-    for p, percentile_level in enumerate(percentile_levels):
-      all_high_confidence_points = [
-          np.where(sign * signal >= np.percentile(sign *
-                                                  signal, percentile_level))[0]
-          for signal in final_signals
-      ]
-
-      if 0 in map(len, all_high_confidence_points):
-        continue
-      TP = [
-          len(np.intersect1d(high_confidence_points, target_points)) /
-          (1. * len(high_confidence_points))
-          for high_confidence_points in all_high_confidence_points
-      ]
-      for i in range(len(all_signal_names)):
-        all_TPs[i][p].append(TP[i])
-    misclassifications.append(len(target_points) / (1. * len(X_test)))
-
-  final_TPs = [[] for signal in all_signal_names]
-  final_stderrs = [[] for signal in all_signal_names]
-  for p, percentile_level in enumerate(percentile_levels):
-    for i in range(len(all_signal_names)):
-      final_TPs[i].append(np.mean(all_TPs[i][p]))
-      final_stderrs[i].append(get_stderr(all_TPs[i][p]))
-
-    if not skip_print:
-      print("Precision at percentile", percentile_level)
-      ss = ""
-      for i, signal_name in enumerate(all_signal_names):
-        ss += (signal_name + (": %.4f " % final_TPs[i][p]))
-      print(ss)
-      print()
-
-  final_misclassification = np.mean(misclassifications)
-
-  if not skip_print:
-    print("Misclassification rate mean/std", np.mean(misclassifications),
-          get_stderr(misclassifications))
-
-  for i in range(len(all_signal_names)):
-    final_TPs[i] = np.array(final_TPs[i])
-    final_stderrs[i] = np.array(final_stderrs[i])
-
-  plot_precision_curve(extra_plot_title, percentile_levels, all_signal_names,
-                       final_TPs, final_stderrs, final_misclassification)
-  return (all_signal_names, final_TPs, final_stderrs, final_misclassification)