From 695dcff635e89a7d70e7b9bfc912d8a348d7004f Mon Sep 17 00:00:00 2001 From: Silvia Corbara Date: Wed, 23 Jan 2019 23:39:58 +0100 Subject: [PATCH] Features sentence lengths --- src/__pycache__/model.cpython-36.pyc | Bin 4551 -> 5081 bytes src/author_identification.py | 3 +- src/author_verification.py | 5 +- .../__pycache__/dante_loader.cpython-36.pyc | Bin 0 -> 1850 bytes src/data/__pycache__/features.cpython-36.pyc | Bin 0 -> 19050 bytes src/data/features.py | 106 ++++++++++++------ 6 files changed, 74 insertions(+), 40 deletions(-) create mode 100644 src/data/__pycache__/dante_loader.cpython-36.pyc create mode 100644 src/data/__pycache__/features.cpython-36.pyc diff --git a/src/__pycache__/model.cpython-36.pyc b/src/__pycache__/model.cpython-36.pyc index c4b8f40eb2c2b2e726df48ee3b6fffbd900a1415..a9ab2bf873bec646c56786415150b50ed0c89bdb 100644 GIT binary patch delta 1735 zcmZuy&2Jk;6rY*>UVAs2GD}fZ=TZd3e$9lhg^XBWfzn%B|L*=Kr z$}5gzfB)Ilwe12SKa)qA0{${saQDkfbeiXQexEc>aEH6_oP3M$G4AaX?!~N8;Nz>L zI`Qp&OK}ZYqJmwQX)YAI zezVtF_j^H_r0VRDg?%vwJB?r*yfnYzht2k2w;%c-?Z3EU|TNV`XsQ1UEj8s(@ z&p}X}L6}3B2T1-@%R{eoq1h3G?PIoO7zu0u0$e?65|at!(?)jKkf{ciF`WS?B$7xP zvo$qhkrJtq7SfTNQj1YOTC1|`ZL^Jl%q*-D6@=jvcs(vA>s)vi<_4KmAf7k}y*Ai! zLpsY~!TSI{X;4lYj8k9fs(g|cxDqQ|<=O`0`d#L09fcd*+@}qlzgXlJx4|{|RCaA> z!{pQ4g`{p2$_FUV0n$0J08(?k)obYKTP$`Ky0@{v6R%Tv)&@k$33&x6Bm=ut*>CK zIeVqF(nig(;WabCGVSv;xoua5dVet_oW+!qP>o4sfci#Pk=Zwhm6*b1R>g-f2YgCU zhlIL^Z(Y56F+_mBBpp19}0re4A0IFl+ zD*D$DP(^Va0SkaYA;m?6e`S9LyG4N1-t)yEX!bU@{L$U5s(CSypzm3zm_v8F#Si8y^UU`M?INy+q8S{j=M-Z3mBe3!1<|j#J0I32J5Z=t%NC% z@0naBT(gdVxhpt=kMQ5xDLN1fle6BX;lW>rhlk=N=wk^{YByJ^RD6_mM~|F12|aNN z;VFdd%b6f>u+wV$;x&v-Aiy675lw{b1@VkW&dnHLnHJ{(P(ZBLQ&9RCEz-Pem*8h- qgSBcg`OW*7%Q6vt=wbG)|K&So7uPADdpXjh0r)6y?K92(lHs5OXIYP&6o96x83ki^mK z=0I2z3FinB)gTav3dD^|ThtRbBr3HMH%?$qz=hIF;mo1$ITVnX)%^B9Gyk_c^JeD9 z-M8!Y{rSAJY8|@#t;g7}>_0_EJ%crV_e#CzpJb4Q+%|(;UkgVef0hOASGO{{E6`$K zoh9o#>L>l;UNBPKXFY4@of>79nq}6rQycU*4Ec6yrS=3n%fQ61Sn4(7xEEnGm6^ZA z{f7F;m@0@co5+pM{7P$~vl55ujxnXaFvqvF#eE~_j(v*Z6)w0pd;~9Y!GtFFV1Jti zAC?S?8LaVTM2CeOSf~L94RkQT1PiqdA`*~^I5?kb9eq(BE}4f0RKSJ8HV;i0hY|2< zC@rXh4<%GKOu!hFQHfz*CI=<8&nq3f>)@}VCub+`{c^C{i2nNWkMGYcyxNG~@~@rz zd2_oFeeyAtuN!g~8t(#P={L;!KrFOYT5IhNsIvV7pH}znnzbKCy)m z%9gF6*G%k0^f?Hy-?Msl!cv?3slA-(6P-R=)bIdgnwyie-PQGtWN9s`omSU#xANz3 zPO*lt*ybTj=9v00AIv|C9r8KC^Mn@=K~^3keVkwsP7rJYEleIJ{F{YS)O;Bc2w5b3 znsA2DR7c#2QTYn$&{*k4-GLRiyRs9j&)r>*QD9s_Fiqfb^N|)FIRmw;Rk_gAQ(i@x zg+HsqKUUGWGc!u9e3WoXjTGDb(#?(HQNA@zJv9RP2l;i`fu(jbzb?BAt^Y-$C9rlT zJWeqo;Szz4vwWRU_y=?~XpBM3pf564s!!e{TF(H?_0-JfVJlN&CHu$?|t6;cw-}8{C4f|j}~Kp zu?wFE{d2VPD+t9D&sd-SMaE~MFZgQA>wC?d>`Ub-`IPm2<*NYyf$FG8wVv{RsAAQ| zNTm1y>uo&8er&(TotzYDK9tnO7ow4V8?Af_QF#^LCMmYrRFqwdJ4|@RE3p(y{ykTs zmWzHR&6A2vM8%i9_A8t|m1a>fXx*OfSRWEd&Du%98+QisF`FQrA$Xu**wbsKS_Ou%7N}ZVDk@Fi<=fUPz zW-j5V7dLL-Jp2ZSl!pgt_ApHj@91)Bi}|5Su=KDrLufWxbq86IC~fxUkKLxxv*Mv1 z%oB?f@|c`_8G`XHmptGxkN9hRS47;rg3%$pi#~xCpfyq1gJKzCz~D`81^mesBBI1I zS+k}O2G;$?fBwGX@Ao_dx|pF;O&j6`KYPJzc&t4vvP=)jWu*&muioF>peIbUHgBH3 z5?dH^Tj!4%rZ#DoZez40bB2d(;b*Rpoqva%G9;4e+d%Wtnz#H3WsIL-wO7kAt9=DR z7C*1wcZYrVV;xrEL@2Oh_KtU;d&Kxs)KL{pr1F<|Jy=@n!8YyIs$=V0uvUdjp+Xfs z6O(Ys>b8xlHvHDES{0uP^Hb$bS~S909#?X$S3a~BjaGpQK4EomyaN`6`TcxX(0^gq z4tDKe*ADVTR-KV|Kp7)P;sC6J-`2jNP`Zt~^R%=@mVA5f-o3%8+_MYonBxOO-f{B8 zjy^?7MmirBD$U2e*ogum$*@YL#>EYa<)hRdxzOl&mJBucoM$QUF6KHnM2eF}Cu-&V zAem!W^){RYbVu&eyeLzfg2Cj(9w9q6Px5JAJjw^ihq1OUS{t}1*JA|sp?2ZQw(R>k za;KzB%Cw6~4u+Xdau=NM7FoTpc{0;ZlIFtm-T%u@OgS+ilkzRxxO@+Si5N*B!A;Br z@9-U@LWET4U_9m>Pg0-9p7{v2FH?gG7PSVaU!bI{=8MqnngH${3Pp8k+u?q1?$^jYJtkc_T26x9&3l#tADbYm40~0IoZ= zw2=ydk5thlrj^^6r(%gUPwu}2g0f{rfVm3Qyg`B}X;?T7&gz@3v-(tA@AleehsMcM zrc2^&67P_B7Xp>4Fjgxkr}}Z}{JBYUYp7l{IP^lZOZqhu*GYUpg35^b5Tf5%1#eIo zl&Swm3hb>SbqmJj>k#Z;QHl_i%c6rAnH!ix$sn8kUaPsa3yX4(&|9G?j5e=gx~VP= oqD?rPl^2etDr=f56Wy8>>Lk;jk!#9L2oQl2qaeBz1@W8z0Hm+FcmMzZ literal 0 HcmV?d00001 diff --git a/src/data/__pycache__/features.cpython-36.pyc b/src/data/__pycache__/features.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0adc9f39ad589f58bb797c93ac846943786979dc GIT binary patch literal 19050 zcmdUX33y!Bb>6%;D+U7)gdlFBAdlN}|V&gOJU<0ehIx^3#VubbO6ZP+d8lDJiyCjWo# zn>Paxl;y_Bmkz#j-o5wT_1tsMIrrTA@TRe`;;t{h0 zB~CL{wNIz&BJMWR%b9A%VOqMGUCvc=8qOe`ujVzJZ5EbCsv{cCHH*um)zRfrwX{4| z9n-XYb9{MQbsNG38IhulN=e3KT(-%CY?n#7M5g3Y*&#b+m+Y23a+&Ov%jF7rgOue; z*(X=Y)pCtoE7!@iROEVjqwJR(WJYe3o8)FWAa9aeL-Y)Nud*z+-F1b(cm!tA-c|hK?@XqRlyjPCNad}W4l85C4dig%7 z%KPP{JR%>Enmj6=%t>8De3_R8S(Jt>$th{dvb3Zv9XTyV0tuxnXXLD$lNDK&$D}7~ z@uQTdoWFCUjr$P4mG`A+#R`ILOOd|Lj3{6+a5 z`Cj=x`HXx&M)(8rgYsEG zQNAQUD?cYcFTWtaC|{OelD{N>S^kRrRrzK4YZ%o@`4#z!{B`+N`8D|)^6T;&@;Bvg z$={a0BY#)^o_tmQzWf9Ehw_{9kK}9ekL91pKb1)Snf!D47xFLVU&+_yU(3Id-;#eT zzb*ex{=NK;oR|L~|55&v{Ac+un9JYA4E&z_SNU)9-{pVE@5>*^H{^fH|B_ebf6M=o zKa~F~fAoSQDS!iT0ck)6kOkxbc|ZX$0w@AT0VTi~U>vXwFag*Om;_t`m;zi1*a6rH z*ag@P*dq@CUk2C;7Y(gz*T^&0oTYP@U?*J0Mmd9;CjFt0s8?r0A>I; z0&W7_3^)LI6W|uWcK~h$+y;0v;4OfIfZG9w0CxcH1iW?O8`bULxCeD4vq*Ux;4bhU zOWqAQ47dky1n_piI{^0r-idksDxU8G+y}THa1`)vzypBy0Nx8Y1~?9Q5b%(!0zV8m z0eByv3V1)@B;XOi2LLs|qW}*u2dD!C-~;9tZmg2l3xLHZQ&rNu0a!wqv_A!CBFs5h z2DA|7T(kilgm;SLGk$uWvuyyulTMYh6au=)!Pz?wqwo+p1j19%oWIQwx9eMcUA4)77YBUgMB z@G-PZ?sy*XafJ81J%!LG053er&Whs7c#gE%X4z{ZQzaec7K6~MpNfk0z|@w#&@|3Pg+>r` zJQMiL9E9!8**3&WJ@e&l*mO}E;Jbi-=8-d&E;9p6NbS4{^#UY+jx)y(N` z!$YZaNNdz1H)vF|e%SDqtJ#L2OQ-9Q7|$)UIGw@`g6^i4H`R@&7(@^=``)!VJmcu_j& z&QxIvsoZ=$r{!VGkQG7Vs~3WE7<6QvsnWjv#W z6gL!m93HA6)}gW~rYJ^J7`(UKXpt-IqhTLtjqlJNA7~4Y9NWY*=yQ_-$w67fi_W1xb=e5e*cypPGbsN&~*rfWFgE+M5F_ z`*WOQkjI-tH_%7mbi-GAaH^5Om4%gr4(ME?1Ny{BlsHI^X`xRf$CTi_gA#mX8sr4D z!D&a625suv(Q)}PCG?5%F&&fv*c$1;YAqF%3v4GeIi`VS1dP)LKzmCRj~yVHv?r{B zed$8(V9_;#5UfOux$_OX>QYAdpa>92LF9uO5aMu@5wch{U<1UoAl@kLbgmo3E+vD{ z=>}rm4f{X=9gBUiG7cf0S37w*L^*Xi-%c3f+TFDAAutV=A`p6lD7%0#+cvt=44OrX zwul$3oY^pDJhW2gLmZNcvH&uh@`Dfd5swfI$3*3J#!-2WDHv4RgM3N*L=7U(FiO-Q z@8&_cc>>18HCwPqSjN;B6h=0(U9i4Z)%Xgsp#RK5kmDs2^F#x4Y%qrx%9^;$#6FyxCy9-z3VC0l)}_8hO?(~=8Di+a(k(D z*lJ-~GD{hT*PRbz4{|J3$sLWdO>j>u%KNQ_W+PaP@-Vw#K`gFLJhoPDwZn3wRSxRy zj(=svHE%&9<{(fd8%=4!+C1!xu+eVS)Ic*Mco}7_u~vM@Yj*v6jDdX4&C}ph6R_#E z7G6%7S>$~6z}*K=J{b5WIC-qmJk#(_9`=J%u=Y+0c-cQ0m^wnCcXD1Oogd6}R-(xT zKit$T2OkuHQcl6i0E(`;3$YEPB3CG>u0~)Jsa%k4S4bkcl1|8zIcnaFygK65`(e8+ zbr(Rbb}J}jIAso`>n->}xg9S0rW`JMt@3>&XBqOKe9RKE47sL+T=By)Osh`YgucM+ zowNiTt~Q#Asb3pTHFuyB700}leVI+r+YpK-NQ~jk?C=IsWL$F((l(GH;hHJD5$r>s z1VNVI-B@=HE3Z^CN0S1i8w(r*1bcxNkol3RHK*q+rPfm>W1pXNPiKzV{=sI?1~B70 z6*qEcZi<|6Q@39&9s;w4_4ab-&LgarZRAp`R|lPDBdpD>)clq-Va?7QYPRcMGq@9h zeu2m-Y6HxQKqmChZLU9>*jy1wpsl1Fb3ek#=(w)9^prDx1_4WsFtrpf91>*BIrr%> z-NV8;oO%(QJaqWaeI=cLMs;bZtUiihsXV$HLrVq7oW&c1dI9pnV7^2klvNLUpr&`IPO} z9rBXpql&wsG;0V5JAlA_<9+UXCxZ>E%+_} zT*ut`Zq~`lGpW^^Uf-x@lEP>XDu1QlR5om%@>`qQQrac022pVk-qhD<@H|uNl(Qo}>6Dytz=X5UEj#0`@sM68`S{~XqQ=sdG>5=K zYRwHXTkDV%ds196HoSvsDr2U5ST(mH{oE~~!*ztUHNBSUfd{5iYuTQ=1X;Y4?xlKI zPd#U6>M5)oDlg3V0dT&HIfsNMEzNPF2Z1UX^L{*{ER3NAKgxNX4n#?mS!y?0m0UDV zVNshm-U2r1VGxa;#csNNwibZ))v=a;cJQuRXkp7#U-Sb@&s=+c9%j&Hn&oW`!TZo^ zFaeZ;0wReD?l=@t*)fYq(+<{Y+3-hAmB^frBLYfg09iop+`Od|s8NNa$*&c9Zm$3- zGXg4&fM(9~?%GH%1GPRvTJ&-VN|u2- za1qp3w1(=)fY{sPYy%ocxdqeib}WUJjg8OtW)npD16HQ2hXX|r3-$n!$o!iGiAzq= znZk3gt9rkUw-?9`<;h)mL#a(JT!RBvWwuHlj5)ngh9=_bS`<=ob;Z`=YROqo{fR`D zIf@qg(wCBA)9B3uNIM7U%c*Po(qSu^J0zW4^B$Cff!96@S6nRK-@8QvT;oXt(87CP zs{vH>5bLM{AxzO@YDBXM;+&a>fzAK|WqM4JrX<3Gf`V^oSOh15Vqt;h4S{gDmje4A zTFZdtGf2sXE(8IT>T{_O?24!JOzIhD-qlGg^wO$$p%$0G$|@ixA5Mk2r92fSC)b;R zm?)Op|yz${4Yt>iE)8}r!>86`1<@UUN5gWK=HEPRU_!ems zvrMuAwC02GS9;5X#Dit8OrxcI7RI4p_PzRI8O~5HnR3H6N@-M{rm56g&<4GvxUEum z-7VJC>dZiqc`NKzu;_LC^7P^TM=EjUhgl;u!gSK5irSz@`so|Z^orKLVU>{KFX3bi zTX>Rs7II>DYAW@tb6KCwodsmWiP1voCtOivRSCi`>+54O-fM-(kR-hGnS*`_L1JhUoY3%s0 zBA#=sQTm+wjPs0(bwh)-zizNfU{dCjb(3F$%F%VRX%~?2$o1Mq*kSvu9ri4mG+eA` zICL0;e$PA`b4*OTr~vH|oNa`QR)q{nBv?|TLz@^YOA=oB9<vcCWdxfl z4!MAzK<5S@M%fM_f2=Ajo2wsPwgjjKd^e=eJ&i||QSJ`zPMe9Pi7;G*5_p3CW zZ#SW7A;?uV;3UN>&-sfmFU$9X@BP-I*KA%Fl<&rV1o{WAIXKJe<-5D)j8B<VM~1hJleOV4h$BGcaw2_o{+=HndTB03FLA$}>a)C&-GISQD2J-1A5Q_z=Zb%YE zzM)AjC$C{Vh2} zp^8cxm{wGBFR2>c+`!O4XpX4Rj56I0y!MouL+0qCj8aioi)zjWHSlV+6o>0=IGpVe z;*P6qoZ707FQF2pMJi8WSi}~%0LWmEH;t#EmOoFToj(FpXNbQNAd!n-Mg+1geeUxr z7hi(QNTp%yW`d0yyN=Y#UCSY_m4&xZpFgr6n?E|FGTh>=6=RuM)H0*J{8AR1ztLFh zAAC3k37VrDX-I-*B@^aVf@USRr35_#37UreJdmJ;C2Tj<4!{QQn@LbB3HnBJ-}eyg zFE{*|GMpH5aF9Y^SR*>NpZmFH{?sI1-y}ba}46K`gb(1EDqCKA8zwy>mFf zSiV@+Z<}5Itw~bymP&(RBmFhpu&>Lr_sgiXZ(iLnSOERWxzDP;t%TwRv61YU_bIPK zaQtaF_+Hm;`Pcd*f>-ahZ)rqBV&2j1^EZ?5NmDtjZ~m|?+CY|XU7DI6s@Yne{s!Lk z<>~&{Gq6Oc7Oha_uH2j;qL)lJ@m8{`jNK$&ODbNgIXK98EV{-L11b9mqSb7RY=ZkS zz$##i8nWW`11zAvfkzpd1?mq=rS0yEj^O8zRX3uk0!8i)2wZx5it2xy>v2@SSH`>VHN_lnH$gJ+ zf>IefceUdLo`d%CpSe3A>~@-d`TqS6?TQ;ED@4(n};j8R5jqk}ydA<>9;iJbZ>1qkLop_kznOZm> z8y8a+rBN;#0byGK&kHMAtKcU>QoeR3xmadC%-lJxA+S6- zipU6fJiW5V>kRUj10?*k2N8N@p;R;L zX}F^t_)N)D&hzk_(qHPDr;tbc1=q)f?O82mlD5&d;Hl>(4}OUt+8*cFJf^7*u4MSw z6Vy8NJ9YKAF)G5XQ;SDC%XSp$BAOTs)MofP-p5Y2l4D*#EHOzqo;-juDfnI>8j>M4 zTERrmS%6P8bk$)1<8mpx&b0ww4~&-B0fDWUs|U$c8cPr0u9+`O# z6k008&N*!3o-BkTVUanXb#~!MdM84OgW>@-IYwC<-F&)r+s`!6GXTEB=M!9aQ0Z(I;3*jY) zze>==kefgSu?pupyJuH``4}_tq@X&=LVaHN1e0fhFr@Ed_)|pW3quFnZv*1(d@+@C zCS8h9Yr7Vm8F$LnTnT%Sx?Bz_LT^Pt&xGOr!!91yWG~Ib1$d#rB)yEdIEKK%3~v;0 zufE3RkySC8K59wfM9wK(6v(Q(GD_oOi6Q41wh@hCsn-T8Jr_AAB4;JqW0yKzbhSXQ zj`D5+Z@9FpZtF;O1j0!YiQgXE+;6pA7FSkLE3x{?it)@{X?WGAK~Qr zC4@Px$!{A%vw(oR9Y0mYnG8NTP*}L2W z0WRof@%aEgC_qY%9~B@a&kqamEby9c<lJ|WIIv393cVumNN*Il z*aIbNqrEZUQg0l1thWt#T)XLdhrooYIq0juukDF-lqYU&8ttJ6Wui{_jt8BFBfjPe zl^UGn$vz|IjlP-&6P5K;3Re+l%E#%u$0ak~Q(jTM8Q{tlstnsn+i^S4WKCS~z%60C z_Ud(Y`WjuR`=-^`*Q|0thqH_Az%R!qYKx6d*$ZKp%ysENQtjM^sxJ$! z*BK~WU+f->RE^ImNV|h-wmmFp)2<( zPh%WHygC+alP&gE`Uf}aix4>e;Lz%&jU9T z9q4ZWPQ*5O(r~;{;VO&Qj^0}18id}3D+CSVLW3IgD>{vQg=z7dL3z60+sf@pg=kST zqtPV$Yc%4=0@E3GTHYy!4HdK<`IH`Ix0D6Zn3^Z_b+^W_3S8^rJ*SM)We# zXMv)Uy=ZCW6~36kX+j*vn?X!%Me}(UWAo<6iROvEKs1{mwNwhV8rUJM)zUD3f-CSc zbyd!}_}}X8Bb$tgnSKq4F-gmY#*rFQUxyJvhvZI}l7gLvbzo0nXRcu!HcsId9862t z{xC7IzY6tu2w|8MOKI~F9Q(uQL>Pv~5)O@?&zwy2ZVYb1q;brzhnLuScVb|A za6f_E1zdZD`;>X?9wRI+jq0_6qK0rhx|Tx-*Jjx|tSj_q4aUmSnE5*JcyBb^Mq_7b zY#lCDeN&JuoTH;l69V(t){6J|B+pF7B__4R_MsBFElS`+GBX>Om}2Wm{rN3&q@SfNTE6lH*UX3ewB@T%>S`H#-qvJW zzlL>d>#mM|FY-@+lh&`@vi0#TT4!xvLv7pe4s!;$1h5mpIm|9yw~m8HjPi1sKn{fC z*mhf25}!o8xHP@aEkF@tE-qEpQ*sG5L6|3&U&q};7_~)vd#W;Jw>^3x4ch}K=M8brfz1>fTK&4V z&fb@N$?@Kn*|F`R=u0*ctJ~vj&?@}q4_Yzls6h z`%d()=v%P^yjgIGY96^t~pPiK;?Stqy03( zW{yb3d7Ghxw@d@kC|B#?22p74r+W75Y8U%O~dhm|GRiDnqJm zKFE-&oF^Fi9MLmGs)3$j=(9xBB2m`1`Xa;Qteg96xcK0!)19u>j4v?%$A~z==6UAc zmgJ6=7c>xe8MT@o@kNe&79)Yn+aXL>1qns+i6`xG%Tivxg}NMW|(9$GjP87dw0i zN;n!+WM|;x;khH+a}ar1y8U?eENNAeRL%vn3Ov$-3e~a&Re5|f3dLK3DjwUQB*zWT zKsEn1%|AYrKf6W#3G=l$|8}+v#bfKvZIKt>XyXdGt^JY>x$|4(#+Tb>DZEsBv16U; zF^|zgRY*Y<<6(sHjW(zP^~v!gzGvw&8WL;palok9%ez&G_xl7R;)3zw3vClYrIM~x zp=H38G71A`b9-OH8@Bhkq4xH@W_zVA+S>*7w*xhd;T>iNFah8!>&z})6|2A9GS0rP zyO`JAs>)oH+z%fP-A{Vin%=x#?if6(a3M{^UcDjBrmF7$OWpNtQd+KxGZk#Bl~~ut zn(H-m-RhsI)*8j9hxqV_S4Iz>|0Db=SKFPyCUrthVwK027*ZFs?nQo<(X&KfBvK{w zE$NPpYQykw&-^^mlSD5O(U8!y3acroB&@pl1%`eR2-iZW4fweaJn9p6Q#~ZfX=!1L zhL?fiHMVNz_&vvtS4)0paJ_W12xt3GQ-q&HAO*o)fLNud9*Ar!xN{J>6yH{4Ps9Sd z1M(isTsB&LfM-$fip4M%Xik-<%d0N=s1J1aO%(2XElvF_^}LsVcmfg^cU z7e>yJ&6FY!{5mOBL?8uqHV$f(+||o37}v0!k=zl4&KdZTc_CJZHjbgG@!kJ?$F;jt zG6)Q4o$mBoK%Qr1tE?45^4wQ&g?cCzz(T z=L-z+JWG#itiT|#hbCZ_QO-hCIAkaEPRv9q;03S(?Xvi9$|8yVu!+ z*bXfENm^$bvhQr3>z$jN8RvRuzq8-H72)fha=KI~jg>N`iPGqfNrW#cZJ(H$*fTM) MFS~bpDPPL`G2jJjnE(I) literal 0 HcmV?d00001 diff --git a/src/data/features.py b/src/data/features.py index 576753f..d30009b 100644 --- a/src/data/features.py +++ b/src/data/features.py @@ -5,40 +5,39 @@ from sklearn.feature_selection import SelectKBest from sklearn.feature_selection import chi2 from sklearn.preprocessing import normalize from scipy.sparse import hstack, csr_matrix, issparse -import collections from nltk.corpus import stopwords -latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'vt', 'cvm', 'per', 'a', 'sed', 'qve', 'qvia', 'ex', 'sic', - 'si', 'etiam', 'idest', 'nam', 'vnde', 'ab', 'vel', 'sicvt', 'ita', 'enim', 'scilicet', 'nec', - 'pro', 'avtem', 'ibi', 'dvm', 'vero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'svb', - 'qvomodo', 'vbi', 'svper', 'iam', 'tam', 'hec', 'post', 'qvasi', 'ergo', 'inde', 'e', 'tvnc', - 'atqve', 'ac', 'sine', 'nisi', 'nvnc', 'qvando', 'ne', 'vsqve', 'sive', 'avt', 'igitvr', 'circa', - 'qvidem', 'svpra', 'ante', 'adhvc', 'sev' , 'apvd', 'olim', 'statim', 'satis', 'ob', 'qvoniam', - 'postea', 'nvnqvam'] +latin_function_words = ['et', 'in', 'de', 'ad', 'non', 'ut', 'cum', 'per', 'a', 'sed', 'que', 'quia', 'ex', 'sic', + 'si', 'etiam', 'idest', 'nam', 'unde', 'ab', 'uel', 'sicut', 'ita', 'enim', 'scilicet', 'nec', + 'pro', 'autem', 'ibi', 'dum', 'uero', 'tamen', 'inter', 'ideo', 'propter', 'contra', 'sub', + 'quomodo', 'ubi', 'super', 'iam', 'tam', 'hec', 'post', 'quasi', 'ergo', 'inde', 'e', 'tunc', + 'atque', 'ac', 'sine', 'nisi', 'nunc', 'quando', 'ne', 'usque', 'siue', 'aut', 'igitur', 'circa', + 'quidem', 'supra', 'ante', 'adhuc', 'seu' , 'apud', 'olim', 'statim', 'satis', 'ob', 'quoniam', + 'postea', 'nunquam'] -latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amvs', 'emvs', 'imvs', 'atis', 'etis', - 'itis', 'ant', 'ent', 'vnt', 'ivnt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atvr', 'etvr', - 'itvr', 'amvr', 'emvr', 'imvr', 'amini', 'emini', 'imini', 'antvr', 'entvr', 'vntvr', 'ivntvr', - 'abam', 'ebam', 'iebam', 'abas', 'ebas', 'iebas', 'abat', 'ebat', 'iebat', 'abamvs', 'ebamvs', - 'iebamvs', 'abatis', 'ebatis', 'iebatis', 'abant', 'ebant', 'iebant', 'abar', 'ebar', 'iebar', - 'abaris', 'ebaris', 'iebaris', 'abatvr', 'ebatvr', 'iebatvr', 'abamvr', 'ebamvr', 'iebamvr', - 'abamini', 'ebamini', 'iebamini', 'abantvr', 'ebantvr', 'iebantvr', 'abo', 'ebo', 'am', 'iam', - 'abis', 'ebis', 'ies', 'abit', 'ebit', 'iet', 'abimvs', 'ebimvs', 'emvs', 'iemvs', 'abitis', - 'ebitis', 'ietis', 'abvnt', 'ebvnt', 'ient', 'abor', 'ebor', 'ar', 'iar', 'aberis', 'eberis', - 'ieris', 'abitvr', 'ebitvr', 'ietvr', 'abimvr', 'ebimvr', 'iemvr', 'abimini', 'ebimini', 'iemini', - 'abvntvr', 'ebvntvr', 'ientvr', 'i', 'isti', 'it', 'imvs', 'istis', 'ervnt', 'em', 'eam', 'eas', - 'ias', 'eat', 'iat', 'eamvs', 'iamvs', 'eatis', 'iatis', 'eant', 'iant', 'er', 'ear', 'earis', - 'iaris', 'eatvr', 'iatvr', 'eamvr', 'iamvr', 'eamini', 'iamini', 'eantvr', 'iantvr', 'rem', 'res', - 'ret', 'remvs', 'retis', 'rent', 'rer', 'reris', 'retvr', 'remvr', 'remini', 'rentvr', 'erim', - 'issem', 'isses', 'isset', 'issemvs', 'issetis', 'issent', 'a', 'ate', 'e', 'ete', 'ite', 'are', - 'ere', 'ire', 'ato', 'eto', 'ito', 'atote', 'etote', 'itote', 'anto', 'ento', 'vnto', 'ivnto', - 'ator', 'etor', 'itor', 'aminor', 'eminor', 'iminor', 'antor', 'entor', 'vntor', 'ivntor', 'ari', - 'eri', 'iri', 'andi', 'ando', 'andvm', 'andvs', 'ande', 'ans', 'antis', 'anti', 'antem', 'antes', - 'antivm', 'antibvs', 'antia', 'esse', 'svm', 'es', 'est', 'svmvs', 'estis', 'svnt', 'eram', 'eras', - 'erat', 'eramvs', 'eratis', 'erant', 'ero', 'eris', 'erit', 'erimvs', 'eritis', 'erint', 'sim', - 'sis', 'sit', 'simvs', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemvs', 'essetis', 'essent', - 'fvi', 'fvisti', 'fvit', 'fvimvs', 'fvistis', 'fvervnt', 'este', 'esto', 'estote', 'svnto'] +latin_conjugations = ['o', 'eo', 'io', 'as', 'es', 'is', 'at', 'et', 'it', 'amus', 'emus', 'imus', 'atis', 'etis', + 'itis', 'ant', 'ent', 'unt', 'iunt', 'or', 'eor', 'ior', 'aris', 'eris', 'iris', 'atur', 'etur', + 'itur', 'amur', 'emur', 'imur', 'amini', 'emini', 'imini', 'antur', 'entur', 'untur', 'iuntur', + 'abam', 'ebam', 'iebam', 'abas', 'ebas', 'iebas', 'abat', 'ebat', 'iebat', 'abamus', 'ebamus', + 'iebamus', 'abatis', 'ebatis', 'iebatis', 'abant', 'ebant', 'iebant', 'abar', 'ebar', 'iebar', + 'abaris', 'ebaris', 'iebaris', 'abatur', 'ebatur', 'iebatur', 'abamur', 'ebamur', 'iebamur', + 'abamini', 'ebamini', 'iebamini', 'abantur', 'ebantur', 'iebantur', 'abo', 'ebo', 'am', 'iam', + 'abis', 'ebis', 'ies', 'abit', 'ebit', 'iet', 'abimus', 'ebimus', 'emus', 'iemus', 'abitis', + 'ebitis', 'ietis', 'abunt', 'ebunt', 'ient', 'abor', 'ebor', 'ar', 'iar', 'aberis', 'eberis', + 'ieris', 'abitur', 'ebitur', 'ietur', 'abimur', 'ebimur', 'iemur', 'abimini', 'ebimini', 'iemini', + 'abuntur', 'ebuntur', 'ientur', 'i', 'isti', 'it', 'imus', 'istis', 'erunt', 'em', 'eam', 'eas', + 'ias', 'eat', 'iat', 'eamus', 'iamus', 'eatis', 'iatis', 'eant', 'iant', 'er', 'ear', 'earis', + 'iaris', 'eatur', 'iatur', 'eamur', 'iamur', 'eamini', 'iamini', 'eantur', 'iantur', 'rem', 'res', + 'ret', 'remus', 'retis', 'rent', 'rer', 'reris', 'retur', 'remur', 'remini', 'rentur', 'erim', + 'issem', 'isses', 'isset', 'issemus', 'issetis', 'issent', 'a', 'ate', 'e', 'ete', 'ite', 'are', + 'ere', 'ire', 'ato', 'eto', 'ito', 'atote', 'etote', 'itote', 'anto', 'ento', 'unto', 'iunto', + 'ator', 'etor', 'itor', 'aminor', 'eminor', 'iminor', 'antor', 'entor', 'untor', 'iuntor', 'ari', + 'eri', 'iri', 'andi', 'ando', 'andum', 'andus', 'ande', 'ans', 'antis', 'anti', 'antem', 'antes', + 'antium', 'antibus', 'antia', 'esse', 'sum', 'es', 'est', 'sumus', 'estis', 'sunt', 'eram', 'eras', + 'erat', 'eramus', 'eratis', 'erant', 'ero', 'eris', 'erit', 'erimus', 'eritis', 'erint', 'sim', + 'sis', 'sit', 'simus', 'sitis', 'sint', 'essem', 'esses', 'esset', 'essemus', 'essetis', 'essent', + 'fui', 'fuisti', 'fuit', 'fuimus', 'fuistis', 'fuerunt', 'este', 'esto', 'estote', 'sunto'] spanish_conjugations = ['o','as','a','amos','áis','an','es','e','emos','éis','en','imos','ís','guir','ger','gir', 'ar', 'er', 'ir', 'é', 'aste', 'ó','asteis','aron','í','iste','ió','isteis','ieron', @@ -167,22 +166,45 @@ def _features_Mendenhall(documents, upto=23): :param documents: a list where each element is the text (string) of a document :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered) """ - features = [] - for text in documents: unmod_tokens = nltk.word_tokenize(text) mod_tokens = ([token.lower() for token in unmod_tokens if any(char.isalpha() for char in token)]) nwords = len(mod_tokens) - tokens_len = [len(token) for token in mod_tokens] - - count = collections.Counter(tokens_len) - features.append([1000.*count[i]/nwords for i in range(1,upto)]) - + tokens_count = [] + for i in range(1, upto): + tokens_count.append(1000.*(sum(j>= i for j in tokens_len))/nwords) + features.append(tokens_count) return np.array(features) +def _features_sentenceLengths(documents, downto=3, upto=70): + """ + Extract features as the length of the sentences, ie. number of words in the sentence. + :param documents: a list where each element is the text (string) of a document + :param downto: minimal length considered + :param upto: maximum length considered + :return: a np.array of shape (D,F) where D is len(documents) and F is len(range of lengths considered) + """ + features = [] + for text in documents: + sentences = [t.strip() for t in nltk.tokenize.sent_tokenize(text) if t.strip()] + nsent = len(sentences) + sent_len = [] + sent_count = [] + for sentence in sentences: + unmod_tokens = nltk.tokenize.word_tokenize(sentence) + mod_tokens = ([token for token in unmod_tokens if any(char.isalpha() for char in token)]) + sent_len.append(len(mod_tokens)) + for i in range(downto, upto): + sent_count.append(1000.*(sum(j>= i for j in sent_len))/nsent) + features.append(sent_count) + return np.array(features) + + + + def _features_tfidf(documents, tfidf_vectorizer=None, min_df = 1, ngrams=(1,1)): """ Extract features as tfidf matrix extracted from the documents @@ -238,6 +260,7 @@ class FeatureExtractor: function_words_freq=None, conjugations_freq=None, features_Mendenhall=True, + features_sentenceLengths=True, wordngrams=False, tfidf_feat_selection_ratio=1., n_wordngrams=(1, 1), @@ -271,6 +294,7 @@ class FeatureExtractor: self.function_words_freq = function_words_freq self.conjugations_freq = conjugations_freq self.features_Mendenhall = features_Mendenhall + self.features_sentenceLengths = features_sentenceLengths self.tfidf = wordngrams self.tfidf_feat_selection_ratio = tfidf_feat_selection_ratio self.wordngrams = n_wordngrams @@ -319,6 +343,10 @@ class FeatureExtractor: X = self._addfeatures(X, _features_Mendenhall(documents)) self._print('adding Mendenhall words features: {} features'.format(X.shape[1])) + if self.features_sentenceLengths: + X = self._addfeatures(X, _features_sentenceLengths(documents)) + self._print('adding sentence lengths features: {} features'.format(X.shape[1])) + # sparse feature extraction functions if self.tfidf: X_features, vectorizer = _features_tfidf(documents, ngrams=self.wordngrams) @@ -384,6 +412,10 @@ class FeatureExtractor: TEST = self._addfeatures(TEST, _features_Mendenhall(test)) self._print('adding Mendenhall words features: {} features'.format(TEST.shape[1])) + if self.features_sentenceLengths: + TEST = self._addfeatures(TEST, _features_sentenceLengths(test)) + self._print('adding sentence lengths features: {} features'.format(TEST.shape[1])) + # sparse feature extraction functions if self.tfidf: ep1_features, _ = _features_tfidf(test, self.tfidf_vectorizer)