final refinements

2020-11-27 21:04:00 +01:00 · 2020-11-27 21:04:00 +01:00 · 04f0eb17ed
parent 98d9d7800c
commit 04f0eb17ed
6 changed files with 24 additions and 35 deletions
--- a/src/author_identification_loo.py
+++ b/src/author_identification_loo.py
@@ -37,7 +37,7 @@ def main():
            n_full_docs = len(positive) + len(negative)
            print(f'read {n_full_docs} documents from {path}')

-            feature_extractor = FeatureExtractor(**settings.config_loo)
+            feature_extractor = FeatureExtractor(**settings.config_feature_extraction)

            Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
            frange_chgrams = feature_extractor.feature_range['_cngrams_task']
--- a/src/author_identification_unknown.py
+++ b/src/author_identification_unknown.py
@@ -38,8 +38,8 @@ def main():
            n_full_docs = len(positive) + len(negative)
            print(f'read {n_full_docs} documents from {path}')

-            settings.config_unk['feature_selection_ratio'] = args.featsel
-            feature_extractor = FeatureExtractor(**settings.config_unk)
+            settings.config_feature_extraction['feature_selection_ratio'] = args.featsel
+            feature_extractor = FeatureExtractor(**settings.config_feature_extraction)

            Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
            frange_chgrams = feature_extractor.feature_range['_cngrams_task']
--- a/src/experiments.sh
+++ b/src/experiments.sh
@@ -9,11 +9,22 @@ if [ ! -d $corpus ]; then
  rm ../MedLatin.zip
 fi

-PY="python3 author_identification_loo.py"
+PYLOO="python3 author_identification_loo.py"
+PYUNK="python3 author_identification_unknown.py"
+
 MedLatin1="../MedLatin/Corpora/MedLatin1"
 MedLatin2="../MedLatin/Corpora/MedLatin2"
-EP1="../MedLatin/Epistle/EpistolaXIII_1.txt"
-EP2="../MedLatin/Epistle/EpistolaXIII_2.txt"

-$PY $MedLatin1 ALL --log ./resultsLoo_EP1.txt
-$PY $MedLatin2 ALL --log ./resultsLoo_EP2.txt
+EPXIII1="../MedLatin/Epistle/EpistolaXIII_1.txt"
+EPXIII2="../MedLatin/Epistle/EpistolaXIII_2.txt"
+EPXIV="../Epistola_ArigoVII.txt"
+
+for learner in lr svm mnb ; do
+  $PYLOO $MedLatin1 ALL --learner $learner --log ../results/resultsLOO_EP1_$learner.txt
+  $PYLOO $MedLatin2 ALL --learner $learner --log ../results/resultsLOO_EP2_$learner.txt
+
+  $PYUNK $MedLatin1 Dante $EPXIII1 --learner $learner --log ../results/resultsUNK_EP13_1_$learner.txt
+  $PYUNK $MedLatin2 Dante $EPXIII2 --learner $learner --log ../results/resultsUNK_EP13_2_$learner.txt
+  $PYUNK $MedLatin1 Dante $EPXIV --learner $learner --log ../results/resultsUNK_EP14_$learner.txt
+done
+
--- a/src/helpers.py
+++ b/src/helpers.py
@@ -54,9 +54,9 @@ def check_log_loo(args):


 def check_log_unknown(args):
+    args.unknown_name = pathlib.Path(args.unknown).name
    if args.log is None:
        os.makedirs('../results', exist_ok=True)
        assert os.path.exists(args.unknown), f'file {args.unknown} does not exist'
-        args.unknown_name = pathlib.Path(args.unknown).name
        args.log = f'../results/Unknown{args.unknown_name}_Corpus{args.corpus_name}.Author{args.positive}.' \
                   f'fs{args.featsel}.classweight{str(args.class_weight)}.CLS{args.learner}.txt'
--- a/src/model.py
+++ b/src/model.py
@@ -9,7 +9,6 @@ from util.evaluation import f1_metric
 from typing import List, Union


-
 class AuthorshipVerificator(BaseEstimator):

    def __init__(self, nfolds=10, param_grid=None, learner=None, C=1., alpha=0.001, class_weight='balanced',
@@ -24,10 +23,7 @@ class AuthorshipVerificator(BaseEstimator):
        self.feat_selection_slices = feat_selection_slices
        self.feat_selection_ratio = feat_selection_ratio

-    def fit(self, X, y, groups=None, hyperparam_optimization=True):
-        if self.param_grid is None and hyperparam_optimization:
-            raise ValueError('Param grid is None, but hyperparameter optimization is requested')
-
+    def fit(self, X, y, groups=None):
        if self.feat_selection_slices is not None:
            self.fs = MultiRangeFeatureSelector(self.feat_selection_slices, feat_sel=self.feat_selection_ratio)
            X = self.fs.fit(X, y).transform(X)
@@ -37,7 +33,7 @@ class AuthorshipVerificator(BaseEstimator):
                C=self.C, class_weight=self.class_weight, max_iter=1000, random_state=self.random_seed, solver='lbfgs'
            )
        elif self.learner == 'svm':
-            self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight)
+            self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight, max_iter=2500, random_state=self.random_seed)
        elif self.learner == 'mnb':
            self.classifier = MultinomialNB(alpha=self.alpha)

@@ -47,7 +43,7 @@ class AuthorshipVerificator(BaseEstimator):
        if groups is None:
            groups = np.arange(len(y))

-        if hyperparam_optimization and (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1):
+        if (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1):
            folds = list(GroupKFold(n_splits=self.nfolds).split(X, y, groups))
            self.estimator = GridSearchCV(
                self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1_metric), n_jobs=-1,
@@ -135,18 +131,3 @@ class MultiRangeFeatureSelector(BaseEstimator, TransformerMixin):
    def __sort_ranges(self, ranges: List[slice]):
        return np.asarray(ranges)[np.argsort([r.start for r in ranges])[::-1]]

-
-def get_valid_folds(nfolds, X, y, groups, max_trials=100):
-    trials = 0
-    folds = list(GroupKFold(n_splits=nfolds).split(X, y, groups))
-    n_docs = len(y)
-    print(f'different classes={np.unique(y)}; #different documents={len(np.unique(groups))} positives={len(np.unique(groups[y==1]))}')
-    while any(len(np.unique(y[train])) < 2 for train, test in folds):
-        shuffle_index = np.random.permutation(n_docs)
-        X, y, groups = X[shuffle_index], y[shuffle_index], groups[shuffle_index]
-        folds = list(GroupKFold(n_splits=nfolds).split(X, y, groups))
-        print(f'\ttrial{trials}:{[len(np.unique(y[train])) for train, test in folds]}')
-        trials+=1
-        if trials>max_trials:
-            raise ValueError(f'could not meet condition after {max_trials} trials')
-    return folds
--- a/src/settings.py
+++ b/src/settings.py
@@ -46,7 +46,7 @@ param_grid = {
    'mnb': {'alpha': np.logspace(-7,-1,7)}
 }

-config_loo = {
+config_feature_extraction = {
    'function_words_freq': 'latin',
    'conjugations_freq': 'latin',
    'features_Mendenhall': True,
@@ -62,6 +62,3 @@ config_loo = {
    'window_size': 3,
    'normalize_features': True
 }
-
-config_unk = config_loo.copy()
-config_unk['feature_selection_ratio']=0.1