final refinements

This commit is contained in:
Alejandro Moreo Fernandez 2020-11-27 21:04:00 +01:00
parent 98d9d7800c
commit 04f0eb17ed
6 changed files with 24 additions and 35 deletions

View File

@ -37,7 +37,7 @@ def main():
n_full_docs = len(positive) + len(negative)
print(f'read {n_full_docs} documents from {path}')
feature_extractor = FeatureExtractor(**settings.config_loo)
feature_extractor = FeatureExtractor(**settings.config_feature_extraction)
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
frange_chgrams = feature_extractor.feature_range['_cngrams_task']

View File

@ -38,8 +38,8 @@ def main():
n_full_docs = len(positive) + len(negative)
print(f'read {n_full_docs} documents from {path}')
settings.config_unk['feature_selection_ratio'] = args.featsel
feature_extractor = FeatureExtractor(**settings.config_unk)
settings.config_feature_extraction['feature_selection_ratio'] = args.featsel
feature_extractor = FeatureExtractor(**settings.config_feature_extraction)
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
frange_chgrams = feature_extractor.feature_range['_cngrams_task']

View File

@ -9,11 +9,22 @@ if [ ! -d $corpus ]; then
rm ../MedLatin.zip
fi
PY="python3 author_identification_loo.py"
PYLOO="python3 author_identification_loo.py"
PYUNK="python3 author_identification_unknown.py"
MedLatin1="../MedLatin/Corpora/MedLatin1"
MedLatin2="../MedLatin/Corpora/MedLatin2"
EP1="../MedLatin/Epistle/EpistolaXIII_1.txt"
EP2="../MedLatin/Epistle/EpistolaXIII_2.txt"
$PY $MedLatin1 ALL --log ./resultsLoo_EP1.txt
$PY $MedLatin2 ALL --log ./resultsLoo_EP2.txt
EPXIII1="../MedLatin/Epistle/EpistolaXIII_1.txt"
EPXIII2="../MedLatin/Epistle/EpistolaXIII_2.txt"
EPXIV="../Epistola_ArigoVII.txt"
# Every run below writes its log under ../results; create the directory up
# front so a run cannot fail merely because it is missing (no-op if present).
mkdir -p ../results

# Repeat the whole battery of experiments once per learner.
# NOTE: $PYLOO / $PYUNK are intentionally left unquoted: each holds
# "python3 <script>" and must be word-split into interpreter + script.
for learner in lr svm mnb ; do
    # author_identification_loo.py runs on each corpus, for ALL authors.
    $PYLOO "$MedLatin1" ALL --learner "$learner" --log "../results/resultsLOO_EP1_$learner.txt"
    $PYLOO "$MedLatin2" ALL --learner "$learner" --log "../results/resultsLOO_EP2_$learner.txt"

    # author_identification_unknown.py: test each unknown document against Dante.
    $PYUNK "$MedLatin1" Dante "$EPXIII1" --learner "$learner" --log "../results/resultsUNK_EP13_1_$learner.txt"
    $PYUNK "$MedLatin2" Dante "$EPXIII2" --learner "$learner" --log "../results/resultsUNK_EP13_2_$learner.txt"
    $PYUNK "$MedLatin1" Dante "$EPXIV" --learner "$learner" --log "../results/resultsUNK_EP14_$learner.txt"
done

View File

@ -54,9 +54,9 @@ def check_log_loo(args):
def check_log_unknown(args):
args.unknown_name = pathlib.Path(args.unknown).name
if args.log is None:
os.makedirs('../results', exist_ok=True)
assert os.path.exists(args.unknown), f'file {args.unknown} does not exist'
args.unknown_name = pathlib.Path(args.unknown).name
args.log = f'../results/Unknown{args.unknown_name}_Corpus{args.corpus_name}.Author{args.positive}.' \
f'fs{args.featsel}.classweight{str(args.class_weight)}.CLS{args.learner}.txt'

View File

@ -9,7 +9,6 @@ from util.evaluation import f1_metric
from typing import List, Union
class AuthorshipVerificator(BaseEstimator):
def __init__(self, nfolds=10, param_grid=None, learner=None, C=1., alpha=0.001, class_weight='balanced',
@ -24,10 +23,7 @@ class AuthorshipVerificator(BaseEstimator):
self.feat_selection_slices = feat_selection_slices
self.feat_selection_ratio = feat_selection_ratio
def fit(self, X, y, groups=None, hyperparam_optimization=True):
if self.param_grid is None and hyperparam_optimization:
raise ValueError('Param grid is None, but hyperparameter optimization is requested')
def fit(self, X, y, groups=None):
if self.feat_selection_slices is not None:
self.fs = MultiRangeFeatureSelector(self.feat_selection_slices, feat_sel=self.feat_selection_ratio)
X = self.fs.fit(X, y).transform(X)
@ -37,7 +33,7 @@ class AuthorshipVerificator(BaseEstimator):
C=self.C, class_weight=self.class_weight, max_iter=1000, random_state=self.random_seed, solver='lbfgs'
)
elif self.learner == 'svm':
self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight)
self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight, max_iter=2500, random_state=self.random_seed)
elif self.learner == 'mnb':
self.classifier = MultinomialNB(alpha=self.alpha)
@ -47,7 +43,7 @@ class AuthorshipVerificator(BaseEstimator):
if groups is None:
groups = np.arange(len(y))
if hyperparam_optimization and (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1):
if (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1):
folds = list(GroupKFold(n_splits=self.nfolds).split(X, y, groups))
self.estimator = GridSearchCV(
self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1_metric), n_jobs=-1,
@ -135,18 +131,3 @@ class MultiRangeFeatureSelector(BaseEstimator, TransformerMixin):
def __sort_ranges(self, ranges: List[slice]):
    """Return *ranges* as an array ordered by decreasing ``start``.

    The slices are packed into a NumPy array and re-indexed with the
    reversed ascending argsort of their ``start`` attributes.
    """
    starts = [r.start for r in ranges]
    # Ascending argsort, then reversed, yields a descending order.
    descending = np.argsort(starts)[::-1]
    return np.asarray(ranges)[descending]
def get_valid_folds(nfolds, X, y, groups, max_trials=100):
    """Build GroupKFold splits whose training parts all contain two classes.

    The splits are first computed on the data as given. If any training part
    contains fewer than two distinct labels, the samples are randomly permuted
    and the split is recomputed, up to ``max_trials`` times.

    The returned indices always refer to the ``X`` / ``y`` / ``groups`` arrays
    passed in by the caller: splits obtained on a permuted copy are mapped
    back through the permutation before being returned.

    :param nfolds: number of folds handed to ``GroupKFold(n_splits=...)``
    :param X: indexable sample collection (must support array indexing)
    :param y: array of labels, one per sample
    :param groups: array of group ids, one per sample
    :param max_trials: maximum number of random permutations to try
    :return: list of ``(train_indices, test_indices)`` pairs
    :raises ValueError: if no valid split is found within ``max_trials``
        permutations
    """
    splitter = GroupKFold(n_splits=nfolds)
    n_docs = len(y)
    print(f'different classes={np.unique(y)}; #different documents={len(np.unique(groups))} positives={len(np.unique(groups[y==1]))}')

    def _all_trainings_have_two_classes(candidate_folds):
        # Indices in candidate_folds are expressed w.r.t. the caller's arrays.
        return all(len(np.unique(y[train])) >= 2 for train, _ in candidate_folds)

    folds = list(splitter.split(X, y, groups))
    if _all_trainings_have_two_classes(folds):
        return folds

    # NOTE(review): GroupKFold appears to assign groups to folds based on the
    # groups themselves rather than on sample order, so reshuffling samples may
    # rarely change the outcome -- confirm against the scikit-learn docs.
    for trial in range(max_trials):
        perm = np.random.permutation(n_docs)
        shuffled_folds = splitter.split(X[perm], y[perm], groups[perm])
        # Translate positions in the permuted copy back to original positions,
        # so the result can be applied directly to the caller's data.
        folds = [(perm[train], perm[test]) for train, test in shuffled_folds]
        print(f'\ttrial{trial}:{[len(np.unique(y[train])) for train, test in folds]}')
        # Check validity before giving up, so a success on the very last
        # trial is returned instead of raising.
        if _all_trainings_have_two_classes(folds):
            return folds

    raise ValueError(f'could not meet condition after {max_trials} trials')

View File

@ -46,7 +46,7 @@ param_grid = {
'mnb': {'alpha': np.logspace(-7,-1,7)}
}
config_loo = {
config_feature_extraction = {
'function_words_freq': 'latin',
'conjugations_freq': 'latin',
'features_Mendenhall': True,
@ -62,6 +62,3 @@ config_loo = {
'window_size': 3,
'normalize_features': True
}
config_unk = config_loo.copy()
config_unk['feature_selection_ratio']=0.1