final refinements
This commit is contained in:
parent
98d9d7800c
commit
04f0eb17ed
|
@ -37,7 +37,7 @@ def main():
|
|||
n_full_docs = len(positive) + len(negative)
|
||||
print(f'read {n_full_docs} documents from {path}')
|
||||
|
||||
feature_extractor = FeatureExtractor(**settings.config_loo)
|
||||
feature_extractor = FeatureExtractor(**settings.config_feature_extraction)
|
||||
|
||||
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
|
||||
frange_chgrams = feature_extractor.feature_range['_cngrams_task']
|
||||
|
|
|
@ -38,8 +38,8 @@ def main():
|
|||
n_full_docs = len(positive) + len(negative)
|
||||
print(f'read {n_full_docs} documents from {path}')
|
||||
|
||||
settings.config_unk['feature_selection_ratio'] = args.featsel
|
||||
feature_extractor = FeatureExtractor(**settings.config_unk)
|
||||
settings.config_feature_extraction['feature_selection_ratio'] = args.featsel
|
||||
feature_extractor = FeatureExtractor(**settings.config_feature_extraction)
|
||||
|
||||
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
|
||||
frange_chgrams = feature_extractor.feature_range['_cngrams_task']
|
||||
|
|
|
@ -9,11 +9,22 @@ if [ ! -d $corpus ]; then
|
|||
rm ../MedLatin.zip
|
||||
fi
|
||||
|
||||
PY="python3 author_identification_loo.py"
|
||||
PYLOO="python3 author_identification_loo.py"
|
||||
PYUNK="python3 author_identification_unknown.py"
|
||||
|
||||
MedLatin1="../MedLatin/Corpora/MedLatin1"
|
||||
MedLatin2="../MedLatin/Corpora/MedLatin2"
|
||||
EP1="../MedLatin/Epistle/EpistolaXIII_1.txt"
|
||||
EP2="../MedLatin/Epistle/EpistolaXIII_2.txt"
|
||||
|
||||
$PY $MedLatin1 ALL --log ./resultsLoo_EP1.txt
|
||||
$PY $MedLatin2 ALL --log ./resultsLoo_EP2.txt
|
||||
EPXIII1="../MedLatin/Epistle/EpistolaXIII_1.txt"
|
||||
EPXIII2="../MedLatin/Epistle/EpistolaXIII_2.txt"
|
||||
EPXIV="../Epistola_ArigoVII.txt"
|
||||
|
||||
for learner in lr svm mnb ; do
|
||||
$PYLOO $MedLatin1 ALL --learner $learner --log ../results/resultsLOO_EP1_$learner.txt
|
||||
$PYLOO $MedLatin2 ALL --learner $learner --log ../results/resultsLOO_EP2_$learner.txt
|
||||
|
||||
$PYUNK $MedLatin1 Dante $EPXIII1 --learner $learner --log ../results/resultsUNK_EP13_1_$learner.txt
|
||||
$PYUNK $MedLatin2 Dante $EPXIII2 --learner $learner --log ../results/resultsUNK_EP13_2_$learner.txt
|
||||
$PYUNK $MedLatin1 Dante $EPXIV --learner $learner --log ../results/resultsUNK_EP14_$learner.txt
|
||||
done
|
||||
|
||||
|
|
|
@ -54,9 +54,9 @@ def check_log_loo(args):
|
|||
|
||||
|
||||
def check_log_unknown(args):
|
||||
args.unknown_name = pathlib.Path(args.unknown).name
|
||||
if args.log is None:
|
||||
os.makedirs('../results', exist_ok=True)
|
||||
assert os.path.exists(args.unknown), f'file {args.unknown} does not exist'
|
||||
args.unknown_name = pathlib.Path(args.unknown).name
|
||||
args.log = f'../results/Unknown{args.unknown_name}_Corpus{args.corpus_name}.Author{args.positive}.' \
|
||||
f'fs{args.featsel}.classweight{str(args.class_weight)}.CLS{args.learner}.txt'
|
25
src/model.py
25
src/model.py
|
@ -9,7 +9,6 @@ from util.evaluation import f1_metric
|
|||
from typing import List, Union
|
||||
|
||||
|
||||
|
||||
class AuthorshipVerificator(BaseEstimator):
|
||||
|
||||
def __init__(self, nfolds=10, param_grid=None, learner=None, C=1., alpha=0.001, class_weight='balanced',
|
||||
|
@ -24,10 +23,7 @@ class AuthorshipVerificator(BaseEstimator):
|
|||
self.feat_selection_slices = feat_selection_slices
|
||||
self.feat_selection_ratio = feat_selection_ratio
|
||||
|
||||
def fit(self, X, y, groups=None, hyperparam_optimization=True):
|
||||
if self.param_grid is None and hyperparam_optimization:
|
||||
raise ValueError('Param grid is None, but hyperparameter optimization is requested')
|
||||
|
||||
def fit(self, X, y, groups=None):
|
||||
if self.feat_selection_slices is not None:
|
||||
self.fs = MultiRangeFeatureSelector(self.feat_selection_slices, feat_sel=self.feat_selection_ratio)
|
||||
X = self.fs.fit(X, y).transform(X)
|
||||
|
@ -37,7 +33,7 @@ class AuthorshipVerificator(BaseEstimator):
|
|||
C=self.C, class_weight=self.class_weight, max_iter=1000, random_state=self.random_seed, solver='lbfgs'
|
||||
)
|
||||
elif self.learner == 'svm':
|
||||
self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight)
|
||||
self.classifier = LinearSVC(C=self.C, class_weight=self.class_weight, max_iter=2500, random_state=self.random_seed)
|
||||
elif self.learner == 'mnb':
|
||||
self.classifier = MultinomialNB(alpha=self.alpha)
|
||||
|
||||
|
@ -47,7 +43,7 @@ class AuthorshipVerificator(BaseEstimator):
|
|||
if groups is None:
|
||||
groups = np.arange(len(y))
|
||||
|
||||
if hyperparam_optimization and (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1):
|
||||
if (positive_examples >= self.nfolds) and (len(np.unique(groups[y==1])) > 1):
|
||||
folds = list(GroupKFold(n_splits=self.nfolds).split(X, y, groups))
|
||||
self.estimator = GridSearchCV(
|
||||
self.classifier, param_grid=self.param_grid, cv=folds, scoring=make_scorer(f1_metric), n_jobs=-1,
|
||||
|
@ -135,18 +131,3 @@ class MultiRangeFeatureSelector(BaseEstimator, TransformerMixin):
|
|||
def __sort_ranges(self, ranges: List[slice]):
|
||||
return np.asarray(ranges)[np.argsort([r.start for r in ranges])[::-1]]
|
||||
|
||||
|
||||
def get_valid_folds(nfolds, X, y, groups, max_trials=100):
|
||||
trials = 0
|
||||
folds = list(GroupKFold(n_splits=nfolds).split(X, y, groups))
|
||||
n_docs = len(y)
|
||||
print(f'different classes={np.unique(y)}; #different documents={len(np.unique(groups))} positives={len(np.unique(groups[y==1]))}')
|
||||
while any(len(np.unique(y[train])) < 2 for train, test in folds):
|
||||
shuffle_index = np.random.permutation(n_docs)
|
||||
X, y, groups = X[shuffle_index], y[shuffle_index], groups[shuffle_index]
|
||||
folds = list(GroupKFold(n_splits=nfolds).split(X, y, groups))
|
||||
print(f'\ttrial{trials}:{[len(np.unique(y[train])) for train, test in folds]}')
|
||||
trials+=1
|
||||
if trials>max_trials:
|
||||
raise ValueError(f'could not meet condition after {max_trials} trials')
|
||||
return folds
|
||||
|
|
|
@ -46,7 +46,7 @@ param_grid = {
|
|||
'mnb': {'alpha': np.logspace(-7,-1,7)}
|
||||
}
|
||||
|
||||
config_loo = {
|
||||
config_feature_extraction = {
|
||||
'function_words_freq': 'latin',
|
||||
'conjugations_freq': 'latin',
|
||||
'features_Mendenhall': True,
|
||||
|
@ -62,6 +62,3 @@ config_loo = {
|
|||
'window_size': 3,
|
||||
'normalize_features': True
|
||||
}
|
||||
|
||||
config_unk = config_loo.copy()
|
||||
config_unk['feature_selection_ratio']=0.1
|
Loading…
Reference in New Issue