diff --git a/src/author_identification_loo.py b/src/author_identification_loo.py index 023315a..567d286 100755 --- a/src/author_identification_loo.py +++ b/src/author_identification_loo.py @@ -14,7 +14,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn 'RyccardusDeSanctoGermano', 'ZonoDeMagnalis'] -DEBUG_MODE = False +DEBUG_MODE = True def main(): @@ -57,7 +57,8 @@ def main(): Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative) print('Fitting the Verificator') - params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)} + #params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)} + params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': [1,10,100,1000,0.1,0.01,0.001]} slice_charngrams = feature_extractor.feature_range['_cngrams_task'] slice_wordngrams = feature_extractor.feature_range['_wngrams_task'] @@ -66,8 +67,8 @@ def main(): else: slice_first, slice_second = slice_wordngrams, slice_charngrams av = Pipeline([ - ('featsel_cngrams', RangeFeatureSelector(slice_second, 0.1)), - ('featsel_wngrams', RangeFeatureSelector(slice_first, 0.1)), + ('featsel_cngrams', RangeFeatureSelector(slice_second, 0.05)), + ('featsel_wngrams', RangeFeatureSelector(slice_first, 0.05)), ('av', AuthorshipVerificator(C=1, param_grid=params)) ]) @@ -115,8 +116,9 @@ if __name__ == '__main__': parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR', help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check ' f'every author') - parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt', - help='path to the log file where to write the results (default ./results.txt)') + parser.add_argument('--log', type=str, metavar='PATH', default=None, + help='path to the log file where to write the results ' + '(if not specified, then ./results_{corpuspath.name})') args = parser.parse_args() diff --git a/src/experiments.sh b/src/experiments.sh index f5019c1..ddee056 100755 --- a/src/experiments.sh +++ b/src/experiments.sh @@ -9,11 +9,11 @@ if [ ! -d $corpus ]; then rm ../MedLatin.zip fi -PY="python3 author_identification.py" +PY="python3 author_identification_loo.py" MedLatin1="../MedLatin/Corpora/MedLatin1" MedLatin2="../MedLatin/Corpora/MedLatin2" EP1="../MedLatin/Epistle/EpistolaXIII_1.txt" EP2="../MedLatin/Epistle/EpistolaXIII_2.txt" -$PY $MedLatin1 ALL --unknown $EP1 --loo --log ./results_EP1.txt -$PY $MedLatin2 ALL --unknown $EP2 --loo --log ./results_EP2.txt +$PY $MedLatin1 ALL --log ./resultsLoo_EP1.txt +$PY $MedLatin2 ALL --log ./resultsLoo_EP2.txt diff --git a/src/model.py b/src/model.py index 45b82f0..e768fe1 100755 --- a/src/model.py +++ b/src/model.py @@ -19,7 +19,7 @@ class AuthorshipVerificator(BaseEstimator): self.author_name = author_name def fit(self, X, y): - self.classifier = LogisticRegression(C=self.C, class_weight='balanced') + self.classifier = LogisticRegression(C=self.C, class_weight='balanced', solver='lbfgs', max_iter=1000) y = np.asarray(y) positive_examples = y.sum() if positive_examples >= self.nfolds and self.param_grid is not None: diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..54f1247 --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,5 @@ +joblib==0.11 +nltk==3.4.5 +numpy==1.18.2 +scikit-learn==0.22.2.post1 +scipy==1.4.1