gitea first commit

2020-11-21 18:20:20 +01:00 · 2020-11-21 18:20:20 +01:00 · c25af5e603
parent 4c6c92f6d4
commit c25af5e603
4 changed files with 17 additions and 10 deletions
--- a/src/author_identification_loo.py
+++ b/src/author_identification_loo.py
@@ -14,7 +14,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
                           'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']


-DEBUG_MODE = False
+DEBUG_MODE = True


 def main():
@@ -57,7 +57,8 @@ def main():
        Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)

        print('Fitting the Verificator')
-        params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
+        #params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
+        params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': [1,10,100,1000,0.1,0.01,0.001]}

        slice_charngrams = feature_extractor.feature_range['_cngrams_task']
        slice_wordngrams = feature_extractor.feature_range['_wngrams_task']
@@ -66,8 +67,8 @@ def main():
        else:
            slice_first, slice_second = slice_wordngrams, slice_charngrams
        av = Pipeline([
-            ('featsel_cngrams', RangeFeatureSelector(slice_second, 0.1)),
-            ('featsel_wngrams', RangeFeatureSelector(slice_first, 0.1)),
+            ('featsel_cngrams', RangeFeatureSelector(slice_second, 0.05)),
+            ('featsel_wngrams', RangeFeatureSelector(slice_first, 0.05)),
            ('av', AuthorshipVerificator(C=1, param_grid=params))
        ])

@@ -115,8 +116,9 @@ if __name__ == '__main__':
    parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR',
                        help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
                              f'every author')
-    parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
-                        help='path to the log file where to write the results (default ./results.txt)')
+    parser.add_argument('--log', type=str, metavar='PATH', default=None,
+                        help='path to the log file where to write the results '
+                             '(if not specified, then ./results_{corpuspath.name})')

    args = parser.parse_args()

--- a/src/experiments.sh
+++ b/src/experiments.sh
@@ -9,11 +9,11 @@ if [ ! -d $corpus ]; then
  rm ../MedLatin.zip
 fi

-PY="python3 author_identification.py"
+PY="python3 author_identification_loo.py"
 MedLatin1="../MedLatin/Corpora/MedLatin1"
 MedLatin2="../MedLatin/Corpora/MedLatin2"
 EP1="../MedLatin/Epistle/EpistolaXIII_1.txt"
 EP2="../MedLatin/Epistle/EpistolaXIII_2.txt"

-$PY $MedLatin1 ALL --unknown $EP1 --loo --log ./results_EP1.txt
-$PY $MedLatin2 ALL --unknown $EP2 --loo --log ./results_EP2.txt
+$PY $MedLatin1 ALL --log ./resultsLoo_EP1.txt
+$PY $MedLatin2 ALL --log ./resultsLoo_EP2.txt
--- a/src/model.py
+++ b/src/model.py
@@ -19,7 +19,7 @@ class AuthorshipVerificator(BaseEstimator):
        self.author_name = author_name

    def fit(self, X, y):
-        self.classifier = LogisticRegression(C=self.C, class_weight='balanced')
+        self.classifier = LogisticRegression(C=self.C, class_weight='balanced', solver='lbfgs', max_iter=1000)
        y = np.asarray(y)
        positive_examples = y.sum()
        if positive_examples >= self.nfolds and self.param_grid is not None:
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -0,0 +1,5 @@
+joblib==0.11
+nltk==3.4.5
+numpy==1.18.2
+scikit-learn==0.22.2.post1
+scipy==1.4.1