From 1387ef2c59acd1d95c5887202b221bd1b155cd73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Moreo=20Fern=C3=A1ndez?= Date: Tue, 22 Jan 2019 19:06:16 +0100 Subject: [PATCH] identification vs attribution, macro-f1 and micro-f1 --- src/author_identification.py | 78 ++++++++++++++++++ src/{dante_eval.py => author_verification.py} | 22 +++-- .../__pycache__/dante_loader.cpython-36.pyc | Bin 1850 -> 0 bytes src/data/__pycache__/features.cpython-36.pyc | Bin 17280 -> 0 bytes src/data/features.py | 25 +++--- src/model.py | 29 +++++-- src/pan2015_eval.py | 1 + 7 files changed, 127 insertions(+), 28 deletions(-) create mode 100644 src/author_identification.py rename src/{dante_eval.py => author_verification.py} (72%) delete mode 100644 src/data/__pycache__/dante_loader.cpython-36.pyc delete mode 100644 src/data/__pycache__/features.cpython-36.pyc diff --git a/src/author_identification.py b/src/author_identification.py new file mode 100644 index 0000000..3df2892 --- /dev/null +++ b/src/author_identification.py @@ -0,0 +1,78 @@ +from sklearn.linear_model import LogisticRegression +from data.dante_loader import load_texts +from data.features import * +from model import AuthorshipVerificator, f1_from_counters +from sklearn.svm import LinearSVC, SVC +from util.color_visualization import color + +# DONE: ngrams should contain punctuation marks according to Sapkota et al. [39] in the PAN 2015 overview +# (More recently, it was shown that character +# n-grams corresponding to word affixes and including punctuation marks are the most +# significant features in cross-topic authorship attribution [57].) #we have cancelled the +# TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection +# TODO: sentence length (Mendenhall-style) ? + + +for epistola in [2]: + if epistola==1: + authors = ['Dante','GiovanniBoccaccio','PierDellaVigna'] + else: + authors = ['Dante', 'BenvenutoDaImola', 'FilippoVillani','GiovanniBoccaccio','GiovanniDelVirgilio', + 'GrazioloBambaglioli','GuidoDaPisa','PietroAlighieri','ZonoDeMagnalis'] + + discarded = 0 + f1_scores = [] + counters = [] + for i,author in enumerate(authors): + print('='*80) + print('Authorship Identification for {} (complete {}/{})'.format(author, i, len(authors))) + print('Corpus of Epistola {}'.format(epistola)) + print('='*80) + path = '../testi_{}'.format(epistola) + if epistola==2: + path+='_with_GuidoDaPisa' + + positive, negative, ep_text = load_texts(path, positive_author=author, unknown_target='EpistolaXIII_{}.txt'.format(epistola)) + if len(positive) < 2: + discarded+=1 + continue + + n_full_docs = len(positive) + len(negative) + + feature_extractor = FeatureExtractor(function_words_freq='latin', + conjugations_freq='latin', + features_Mendenhall=True, + tfidf_feat_selection_ratio=0.1, + wordngrams=False, n_wordngrams=(1, 2), + charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False, + split_documents=True, split_policy=split_by_sentences, window_size=3, + normalize_features=True) + + Xtr,ytr,groups = feature_extractor.fit_transform(positive, negative) + print(ytr) + + ep, ep_fragments = feature_extractor.transform(ep_text, return_fragments=True, window_size=3) + + print('Fitting the Verificator') + av = AuthorshipVerificator(nfolds=10, estimator=LogisticRegression) + av.fit(Xtr,ytr,groups) + + score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True) + # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + f1_scores.append(f1_from_counters(tp, fp, fn, tn)) + counters.append((tp, fp, fn, tn)) + print('F1 for {} = {:.3f}'.format(author,f1_scores[-1])) + + + print('Computing macro- and micro-averages (discarded {}/{})'.format(discarded,len(authors))) + f1_scores = np.array(f1_scores) + counters = np.array(counters) + + macro_f1 = f1_scores.mean() + micro_f1 = f1_from_counters(*counters.sum(axis=0).tolist()) + + print('Macro-F1 = {:.3f}'.format(macro_f1)) + print('Micro-F1 = {:.3f}'.format(micro_f1)) + print() + + diff --git a/src/dante_eval.py b/src/author_verification.py similarity index 72% rename from src/dante_eval.py rename to src/author_verification.py index c2d2b52..53218be 100644 --- a/src/dante_eval.py +++ b/src/author_verification.py @@ -1,7 +1,7 @@ from sklearn.linear_model import LogisticRegression from data.dante_loader import load_texts from data.features import * -from model import AuthorshipVerificator +from model import AuthorshipVerificator, f1_from_counters from sklearn.svm import LinearSVC, SVC from util.color_visualization import color @@ -12,14 +12,16 @@ from util.color_visualization import color # TODO: inspect the impact of chi-squared correlations against positive-only (or positive and negative) correlations for feature selection # TODO: sentence length (Mendenhall-style) ? + for epistola in [1, 2]: + print('Epistola {}'.format(epistola)) print('='*80) path = '../testi_{}'.format(epistola) if epistola==2: path+='_with_GuidoDaPisa' - positive, negative, ep_text = load_texts(path, unknown_target='EpistolaXIII_{}.txt'.format(epistola)) + positive, negative, ep_text = load_texts(path, positive_author='Dante', unknown_target='EpistolaXIII_{}.txt'.format(epistola)) n_full_docs = len(positive) + len(negative) feature_extractor = FeatureExtractor(function_words_freq='latin', @@ -27,7 +29,7 @@ for epistola in [1, 2]: features_Mendenhall=True, tfidf_feat_selection_ratio=0.1, wordngrams=False, n_wordngrams=(1, 2), - charngrams=True, n_charngrams=(3, 4, 5), preserve_punctuation=False, + charngrams=True, n_charngrams=(2, 3, 4), preserve_punctuation=False, split_documents=True, split_policy=split_by_sentences, window_size=3, normalize_features=True) @@ -46,12 +48,14 @@ for epistola in [1, 2]: fulldoc_prob, fragment_probs = av.predict_proba(ep, title) # color(path='../dante_color/epistola{}.html'.format(epistola), texts=ep_fragments, probabilities=fragment_probs, title=title) - score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False) - print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + # score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=False) + # print('LOO[full-and-fragments]={:.3f} +-{:.5f}'.format(score_ave, score_std)) - score_ave, score_std = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True) - print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + score_ave, score_std, tp, fp, fn, tn = av.leave_one_out(Xtr, ytr, groups, test_lowest_index_only=True, counters=True) + # print('LOO[full-docs]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + f1_ = f1_from_counters(tp, fp, fn, tn) + print('F1 = {:.3f}'.format(f1_)) - score_ave, score_std = av.leave_one_out(Xtr, ytr, None) - print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std)) + # score_ave, score_std = av.leave_one_out(Xtr, ytr, None) + # print('LOO[w/o groups]={:.3f} +-{:.5f}'.format(score_ave, score_std)) diff --git a/src/data/__pycache__/dante_loader.cpython-36.pyc b/src/data/__pycache__/dante_loader.cpython-36.pyc deleted file mode 100644 index a7b3e6873f5393a921228e671c5bd1b627ebf1a9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1850 zcmZXU&u<$=6vt;~XMcF@q@fa&6cG#(5pGK)g!rMVLZG%t<&aB2MOKit^^Bdh*SpT_ zq;-?s9&nKmQm_04T)6SqaF`o#>c7wneBU~*0<1O9Z)V>7dhhezhZ`I5;D5Q3$j9CMWcEOtZZYbf)n|2gRGj4M>*pL7{qp(W_z4-X{&BJeSNO^dWW)IWk@QyC0wwNEP1WOM~GlXW7Rd)Z8^N7F3cSXd_D;OQpyXX^W0a_E4Jt&qT1`OWhR=}TJAtFjV zlQnDlU|`*E{O9jGey!&j(8Ua$YT6Jl_}L3y!(;7Xk!5;FE-PJld-eY220dY-wR!XO zmDs|V+d6;DFttgubQ_~3nKL|O3qNy(?EE|Alp&E!-v*kG*1Y9UC}aEttG!x|S?wzj zviN!ZzB}x@AM3CRCqjW8vv<4$-6O`AqK>L)B9*_y>%r1m54LHyRvla4g0(7K3Kgp8 znV5u2R<~_bwc)pR)vEYZn4c8XtWAc@CmDf;~lUl%QDh0<-@ou{QOvgF%)_wEf&<(^$w#~dFR@{W@y zcJwJyGSd05P-#By#ZD9mNrqJ_H7;&YEFY!z$c0AFvt+2j=R8Y+cQMzwAyS+)I#Da< z2gw}6s<+`JpgZbcnipkiQ!tpE*dt`e=1D%yi%0nY`7qYjMQZ~W<$8?3KGZH;*_M4j zNA8rANtt#r$-ywwN$!I4-6E?OHcw{SNzz<+zWaaKi76)rWKzC`8<+1vFcBjOB)Ey0 z;2pk$REUrY9gN4k<4NlC*fSr%_GM~N!J^jS^b3@f)qF9!0MNHJ3J(B6X)@j8E$E&S z$^$?if-iv*S3Z@vbA1JnL>WU8O+(WkGn6}dtC1*UC~pMD@z#9?z&K$=Wo@xr1;BNu zmNrr$@R2H-#I$l7^HeOc=E?n+Kv1@<2ryTnnm0%gB@GLw!C8H?bylB>8{J;p?9e!Q z%5+J*P2wFA??Rwb6~=1iSbqmJj>k#Z;QHl_i%c6rAnU64sl0i26y;gH;7Z&9np|?U)7;Rp~bW>d# oM4NCnD=!>PRn|0BCb~5%)Jdj4BiEFh5Fi34MnQBb3gS2a0W__-?*IS* diff --git a/src/data/__pycache__/features.cpython-36.pyc b/src/data/__pycache__/features.cpython-36.pyc deleted file mode 100644 index e47739ad4ecb9caff219c39a26be95fd753b208e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17280 zcmd6P349#adEd_LiNyj0L5R1g5k*T_nTAN(ksX?*Ws{<#i?&RfqSusHgZ&1;E*HDY zH?trCEqp=NVOu_OIQJntX)MQb5<6|<+^yPNZPPYw({y?yZQ3+U(>6U)w`!Z@|Nmxo zcCmyM|C0QYfWPm(H}8Jm``&lGw|woyM6omV`b%GVIFHPk5$Groog1C$1CH@rAldeqB5a* zR&%m4iT3&Cw&khH6!Ho(CPf*Sl1#{?Y?CS3F4M9@X5=Q>DZ6C1?2)~)Pxi~razJho zM{bpaa+@5ISIO;ihs;V@?vz){VR?<84t-MaYPVSc1%NykD<+!|2PRN_& zq`X<~k+;ZO<&>P3d*zJ0O};_yll$cXc~Bmbv+{O%hdjJ^e`UM8Q_ek+s!Ypyd6!&} zN93ZsTPm3Cd*qTlD({u5JSMKpOHD*PS&&6plDahHvNUB`TGEz|ToEI_1k#lixhmIW zRo3Kj>B+jhPu?#dkPpg-WJ8{i56hGClsqlZ$VcR(@-cZN z)AFtIZSw8%9rB&>UGm-XJ@UO+>G#QV@)`N8e82pF{5APO%%yY$={Y2v8wmV-;uv7KQF%^ zzbL;Xzbt=G{=WPJ`4#zx@{i;n%dg5mk$)=xOny!Nx%|5P3kl_4%D<9-E&oP-Lw-~K zt^7OrE&2EIALO^?Kg#dOb@^TSPx5>6pXI;Ef0f_Ip8hxN#DACnA^%hUK>n9}RsOg9 zA9+dsulzsx|MG|ON1wDL1+V}%APvX>vVa^Q4=4b}07bw!pahrzOait6rU2Uk(|{d- z8Nf||oq%0{-GDuSy?}jy{c;iE&42@dTL2E=R=`2PZGc07R{?GZ+#wBwvw$+-PQa@H zhXJnv%mMBK9042!90R--@H)WP0qzF89`FXh*8`3N-Uv7WcoX0x;LU)00B>3RYGntg z?h#$lTaj}La2oW-9_|I40lW?H4S@Rq_X8dPJczyiRm2Yg&H~;Jcn9ENz&iow0OtYk z0$cz*0=OtWgzpAa0Pg`@0z3+MFQ5u|4B!Ih0X2XCJir2A5wNs)SB1E)0~$Cn#Q9}F z6KUdp8PGzSd(Z}Sklrnp$L}i(+-U>wpRg+2odD294ertk;40GGt!sc)r1v9!7VtQr zhc?{5PXN{d@549u^8J7hEZRy*S$l%T~DU{6bw8E_k(|ZR%^DLDu*= zuZiym-s<@QJ~OqY`Z2^~t+rWqo2XPyhq)y`aBG*tV$Cw+nv1Ee!6E$wfn;w zxh-uBiY~JL3TBV)FrPH~sHB&?TA1myG0E(e6&EwlcxJJUE;(5XYYSFdHB2MmLTwk7 z{dx=8tvc(rSeQdYaN^KLes;xIjQDj_Ph;|xTz#<~te8r+RyQ@5l`&gj1~b7NGcHzu z?qz8OEB3t=N*;5-1eR-{5&dJF2Al%; zM-cQNWG~yM6r|2EoxPkh2Lh{M_fi`mt{^R$MuzDP>qB6W=Tha|*)ZG0h^;X1wHBLo ze<{p^PO#*qwW-I~omM+=>Mh5wwL9LeW!vnDZ~`m)_-j#=mr~-UMx3I_>$FL7ta{ z=(^;a8d8CKX~A=YuJQc2&T2Tl=mjIg-h{sVB7&4vurh$6ZEnL?XkoKFKpo(c-T}m~ zOS%uR1Fdid}g($}{5og?a9xsn!wyqJgIA4aEg%4z5 zp2z~Jvld8`4YAT3+aS(sV`i`}=FS4fTEIjW1z2qs`G|6?&&4B`i(sf-V{Xifdtj_R zXa>9HaisB31Rfa{MY6BjTe&jO+(k0zQZ=sYzGWr_oqX+q01m3HTG>b-gHG0@}A zV;Yzv!gSqJd~mCgK_&#Ig9+$fV*(muC5jwm$GFfK*)b-#@4y73Or4Z~F}Up*(qK$w zGrBG>#)QUbALBs@fTIx)>^8!JQb9Bc&5m&p5dzj3C4l<~DJ~~KH0ey(1?RE?77hMX z_XCg;L+qVrI8~eM)B{Ek5ETq?jF{qbln}CDFdzebX+xAK?sTu~3~e$|kK2uaeb=K1 z4RkG0#Ll>ccweK<%i)t-m-C_x!?#X1Z9K>p9A*ZX2|hUmq&c?HlV&h1a&$x#uyaI~d~6?DHfNXIxlVaA*|V{R+rEW1F&J`k}r3!@lQ zf*8cwI3;W^#7-1)J}rtzjY!-SU*J(#Haa8*8R-Wxf)8LM$2>I3GoB~5nV>iZLPCg7 zYzW0SWK~2CN=F$VlOcwc=@{iRhTMcX=!7vx1ihgSodd-)lRK&3sGMVhV=k*fz^mOYQ|FjDmKWgs6PT%c>(ttj&aJ+Pj3ReupxFn?Ad%V@Vif%9%Ruaf~I zY_lntUOccHP4flU@8)zSI(3-Xn3P+O7x6;B`g49X;&V~!=b%b)<*V8jsTZH~yKJjT z;upFS*2uv%3mtM4O}92_9$czw9=nkk);t0jU5SQ~K5O=%eC;hz)f(MJ&`Z1JJ5UB3 zssUYh(es^lu;dvhSaMs)gfimbJgVxk>IIJ9?R45E@T6>q(~-(C+?6?i-piJ85UENL z1qbmNPEk=9u6!K1K9{3vf^FW4)TT9YYjsUCNC?*P1Xk97dt$>ntvXaWb2e!}1hBz5 zfWIHXA}T*Rvu^dQMrtEvGEw{~`%30qG&d;USs(g%2WzqCj)Yb)GUb#B_RymW@pKbmqW|ZH6!bJY3#(J*Yd7)%*nU2Pj;S3tsVK=RyCJ2#&8fwtNo#} zLHn}T8W~G*myD82T`yp;>$GTIvYc{0%(R-p<*=YlVb)eAh|y~QGaU0;-DQd&FPxyU zF(^qly%sFjcBh;(_o2JWSh5%9D8CE+@37Qu!C;J<`eAX99+_)6eIo|gwNEaQvUaAY zt&%kfn6eJqjx}kU_aMJU^zn};iTqSg(mWE2sdYQRZf$^5>`n2+MCqM0-!f*lhg0)< z^wCk;ME}SH@s4f_H5!QC#hvQda7PqffeZ_st zyE-VV6k1T|YfGN5_zZLHg#`%dQJm$Y9{hJ>RDTLV3Wga`RIn#upgEQyDs+N1%3c1^ zf=hVL$MFG7WdK<~?%G0xCt#xjPLp3R^z2>%TxJYd8UxO(XYKW|UIymw7;(|dA(daW zz@P9v0}LS#X7A-Ov$K~)nbphhNo_8_vXh)c>pTg3XM4GYG)(CXVq(?CNivUNUclzN z5a|-a98d>bDt^nwa7@}Y&?d2%ce}w-+xTHABA^az{o1PbP#GH_dwX1Mfa5T?XxiOQ zgkdFPQ{{NHnWNw!Qei>6g(pPt<9n@~#2;eCqZ- zci4*O4sj>joJA`*KH68|Q;XR&(aJ<0~PWMtE{}byOkbDL?*}w)PfKYuV6@Xk3TTi8) zvKDOJ#6mBvf)`@30a8{5G5ts?$Tjj5l-yhwS0<^VgTH&BEV#A6>3don*M~~xISaVR(2Jn9lQVb`?SQ>!xvLFTSukc}!WAiA_U5^UD&Yewb@T9y2}DJkBZ-KZ-S8BS_`oth0-B*a2oJIurN3c zDkct*>L1OwP=2h^Ri=jXcxEDur**zVvsIGa z_X-&N6h`(bCMjPdg3W+r_YcT6K-u+O2ol;QCr@aX#iZS9qBMIcGLvE>DH6DSBgu~V zcx*Z;)vo6fdWCw|$i_TALs|`TLPN?|s3$F+y+%zH~vA)SXn+g@wQZ8q=lol~$mAYQVg3|5@=U}D zsR9nM!$@9NGTjcm#pH)Wilz$aqUxxA6~q~f7ej4xNHouEwHKd3ok$`VGLXkbn6t3c zkUC_WRg@(r8qq~K#RkU^z+AE}7vR3bgB}=UX-PkuR^g}S1WhiOS~M)wfK@Xp<39L` zJc+m7PIPhYKoL}yAy7oy%YWh8Wzg+3J?Ejr7Y@e?Ja zrS?fJ86J#z_&wVT$~A;M5m%{^I{ik~QZMxN;gN>`j@vYzE33{zJU?I%mmZ2&OmGRahwC}2Nt#aD%?1yQz3&()imd|V1a#n?GI291{)s^JIfmvhi zoc7>H8IGaG4GHx^8A(v> zgZn4k3V78H&6uwGrNbhuglfFnk%dkT78+IfB@7YUa3#w;jjtNDRQ_>2l>Vx{*v{}2LYy!+Roy<%I?edKR~Qyku^4ZCmRYbT*qrK{0+ou*s!`U2u8I>Bb8fn4lK zeME`zmKev)b5f3eeMZC*s;kp7|)-XeRGR*mUQK0NH%ynCJ7_MP1fYSx$ z&wDY%kk+YuHBmU@6JVZTr6&dKraFD?Uf?ol|+F>!3v!-q8 zh9rN+bM}m_wG#3mc6l6>h2D*XUNOL$gi;9KcP~v}6^t4XNe?O>yjt+{@!DK$kCYYAB{mzzk#9S_ZUU^q4>|xT1lwP!Us29 zaX|4N`Swhy2Z;|X00SUSc&=?j8_@7L!T0uQrbmX)LEFbmr{6}A(h)8)c;;$=x?AQ0NShBLsBJ^QKgJ0z zI6<7VR257BA^g7xZFnG4RXBPO!fZvD?d1{XdIf}ect2CsLa&H$tT&FZ*aIG`Vu?->t2xy!3r|M8yf($=LBYkYG*R0OHjcirrcbu6&i7ZRbSf z6g^ozp{{AxI5`)QSu;M^7@|rs88- zj49MS)9TAiUWJlgeU?~I!G@PrBQ3&hjRjr?^mQTjDwA01qJeIL&a zeMW@qR|lrRjeTNwY4~uM5!>(;dS=atuMav@?n4jb%S|WpmnYkCoZ1(Lg__BI#|5XO zE<*4+eBJJsXhnsy z4$r9e-&(y24=0?SdNhaT8xvcEuIPG!PElM9H{E$?uDDc(Q*{|iEZwWoVUO{r1kA%- z$J}S@i+GcQL>*pKy=zXU|E4pzXZ9k~mRutqa_E96B6D69CDfs}M|20*TpS}{!*AC0 zWXHEc5w}=P)*g3?hZ^%ZcgNw$ZC^$6=+PjL9CpeNq}>hT%^kGtyMp@~IY`tvgQu^~ zta!ZY;?|*`gpSn6pe_%yYX$d+5_ep7E*^HU{rA4(D1&2t#s6Zgizfre1`1DEe0KC% z0}lt}E#4?$K-`O|BEPEJ$Rccu*Yus){%p%{OghAfS{Z{T)n8!{FJ^eoQD=6cZs1Qh z?W>V@dqB3cYBiz2IR6*djpDj;@JNjH@ z$PFBB2ZKxAcPL9jy&HK-nAM|iz71k7EDl(Mp-Xt6eudjcrezk`U>Y9KLys z1^Jk8%G>a2*J~~qdK}GnGx#0`-^W15EJY7hBZh=0((p*dX_O4UR0%ZCu^C4;pJDJA zgU>P;Q9%mTD)u|5R(}tLdQp%%n6qB)-`butBU-{-zkQ-3QYWC!bBY~)3EBa40(%!U zM1cVjxj<%MaBCVa=^4D0fW`=wUN41UV9<$(G<1bV+I$3;L(nCWh6d5d!9>qoO7jL4 z_oZoE2C0i121TCtsd{q?okCNoMgfn8$IvtCM5aSfY>eyWQ&CfJqQgB|Y~ax`M~B*k z`K&_=X-t^kL^#Dj2K zX!P6Iw~lTr1~~dlf5Pa8wj6z8i_!OD^v|KUNt9t106PFT0k|vK#;Z0!D_G?|*@hZQ zH~KXd^%d{>KG}}%V&nD=>gvD?(syj6WE$2AFwYhncyj{%vl!i)mv=<=i9SSt99$QTS^r8we4{=@b%bTmy8{0G^!PCJvKUfl8f(8A>uj?S&q8Iw{Ep|xd|n` znjq$QqS6g9-!}nu{RIr^Z_L(dA#wNysXoYy)CN>qyJMi>Cn9W?YDsypHN?=`!HfO7 z*4vV)9lotUhqBh$x9Y&jvf{%3Rf9x-eElxW=keon#}?LKbzOjp@@#bD^Kqn&+T+S^ z`A%U#ET)FfFjwv1DmXB=u&l+P&1^iK?*yyL?88zOd)uFZ-XfZBVnr1Kyncx!0iBuV zdl}Fz6$u7)DN#Boa}Tr8ShQEeN%ly)7A`gT#?-QmWEDL=1A%2e%-Y+M+ObT5V4!wi zt>WoxcR9=~dv2?|&3r#9SH_|%j}<)69cy`f8Hr?c+vTx*TdRtn{Vdj7Zj<}(n zAPMmrztb_K&B|2tSY#;sTam5Ca;11N5tWtVzzbH<%U&*QM3$LvLDK^kqz=){car-7 zC7Z%5gV#Is4dYo!8ew`mox`2&PCPG(!YMq3D&Xl!&MsMnRFpy<%IORl$wf_Dd*st8 zJe)%L+JQ5pf^n1gB5^{~C6GwRoJvQ!)!-u=}CkQ`Cpxh?8WLlT?_c4)sd8Qt=`V~Mm&CW!3~8#}d2r3-#tXZ{c#5V~&k-5Wh&z1%r? zBjJGA7JI~ogpEp{zZ&A?&q$|pv;v`H)ttd{>}K;NzWSKj!uST_hn8dg9JBhZP$&{P;~*l z5DiiAWu(gX8Kd!-l2cf$R`I~N+oUp3s#dRb-Dad3=(BtCIX0&3jfhk!Uy1%!<_&^_ z{QOP?zYEa&nL^rzBpv+A4W&zm3J(-d7E0Oz8fX`do*$YiUUx<_{%I7H4@FPLvd{uM ztGZW}Tl~vSTqw5Y79z>qziNX^W{w^>y-%#{8LgG*$Ag=Q%1!-B& 0: return num / den # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative return 1.0 +def f1(true_labels, predicted_labels): + tp, fp, fn, tn = get_counters(true_labels,predicted_labels) + return f1_from_counters(tp, fp, fn, tn ) + class AuthorshipVerificator: def __init__(self, nfolds=10, - params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'class_weight':['balanced']}, + params = {'C': np.logspace(-4,+4,9), 'class_weight':['balanced',None]}, estimator=SVC): self.nfolds = nfolds self.params = params @@ -70,7 +78,7 @@ class AuthorshipVerificator: return self - def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True): + def leave_one_out(self, X, y, groups=None, test_lowest_index_only=True, counters=False): if groups is None: print('Computing LOO without groups') @@ -85,8 +93,15 @@ class AuthorshipVerificator: scores = cross_val_score(self.estimator, X, y, cv=folds, scoring=make_scorer(f1), n_jobs=-1) print(scores) - - return scores.mean(), scores.std() + if counters and test_lowest_index_only: + yfull_true = y[:len(folds)] + yfull_predict = np.zeros_like(yfull_true) + yfull_predict[scores == 1] = yfull_true[scores == 1] + yfull_predict[scores != 1] = 1-yfull_true[scores != 1] + tp, fp, fn, tn = get_counters(yfull_true, yfull_predict) + return scores.mean(), scores.std(), tp, fp, fn, tn + else: + return scores.mean(), scores.std() def predict(self, test, epistola_name=''): pred = self.estimator.predict(test) diff --git a/src/pan2015_eval.py b/src/pan2015_eval.py index d22c672..7c3b9da 100644 --- a/src/pan2015_eval.py +++ b/src/pan2015_eval.py @@ -32,6 +32,7 @@ def evaluation(y_pred, y_prob, y_true): def doall(problem,pos,neg,test,truth): print('[Start]{}'.format(problem)) feature_extractor = FeatureExtractor(function_words_freq=lang, + conjugations_freq=lang, features_Mendenhall=True, wordngrams=False, tfidf_feat_selection_ratio=0.1, charngrams=True, n_charngrams=[3, 4, 5],