gitea first commit

This commit is contained in:
Alejandro Moreo Fernandez 2020-11-21 18:20:20 +01:00
parent 4c6c92f6d4
commit c25af5e603
4 changed files with 17 additions and 10 deletions

View File

@ -14,7 +14,7 @@ AUTHORS_CORPUS_II = ['Dante', 'BeneFlorentinus', 'BenvenutoDaImola', 'Boncompagn
'RyccardusDeSanctoGermano', 'ZonoDeMagnalis']
DEBUG_MODE = False
DEBUG_MODE = True
def main():
@ -57,7 +57,8 @@ def main():
Xtr, ytr, groups = feature_extractor.fit_transform(positive, negative)
print('Fitting the Verificator')
params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
#params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': np.logspace(-3, +3, 7)}
params = {'C': np.logspace(0, 1, 2)} if DEBUG_MODE else {'C': [1,10,100,1000,0.1,0.01,0.001]}
slice_charngrams = feature_extractor.feature_range['_cngrams_task']
slice_wordngrams = feature_extractor.feature_range['_wngrams_task']
@ -66,8 +67,8 @@ def main():
else:
slice_first, slice_second = slice_wordngrams, slice_charngrams
av = Pipeline([
('featsel_cngrams', RangeFeatureSelector(slice_second, 0.1)),
('featsel_wngrams', RangeFeatureSelector(slice_first, 0.1)),
('featsel_cngrams', RangeFeatureSelector(slice_second, 0.05)),
('featsel_wngrams', RangeFeatureSelector(slice_first, 0.05)),
('av', AuthorshipVerificator(C=1, param_grid=params))
])
@ -115,8 +116,9 @@ if __name__ == '__main__':
parser.add_argument('positive', type=str, default="Dante", metavar='AUTHOR',
help= f'Positive author for the hypothesis (default "Dante"); set to "ALL" to check '
f'every author')
parser.add_argument('--log', type=str, metavar='PATH', default='./results.txt',
help='path to the log file where to write the results (default ./results.txt)')
parser.add_argument('--log', type=str, metavar='PATH', default=None,
help='path to the log file where to write the results '
'(if not specified, then ./results_{corpuspath.name})')
args = parser.parse_args()

View File

@ -9,11 +9,11 @@ if [ ! -d $corpus ]; then
rm ../MedLatin.zip
fi
PY="python3 author_identification.py"
PY="python3 author_identification_loo.py"
MedLatin1="../MedLatin/Corpora/MedLatin1"
MedLatin2="../MedLatin/Corpora/MedLatin2"
EP1="../MedLatin/Epistle/EpistolaXIII_1.txt"
EP2="../MedLatin/Epistle/EpistolaXIII_2.txt"
$PY $MedLatin1 ALL --unknown $EP1 --loo --log ./results_EP1.txt
$PY $MedLatin2 ALL --unknown $EP2 --loo --log ./results_EP2.txt
$PY $MedLatin1 ALL --log ./resultsLoo_EP1.txt
$PY $MedLatin2 ALL --log ./resultsLoo_EP2.txt

View File

@ -19,7 +19,7 @@ class AuthorshipVerificator(BaseEstimator):
self.author_name = author_name
def fit(self, X, y):
self.classifier = LogisticRegression(C=self.C, class_weight='balanced')
self.classifier = LogisticRegression(C=self.C, class_weight='balanced', solver='lbfgs', max_iter=1000)
y = np.asarray(y)
positive_examples = y.sum()
if positive_examples >= self.nfolds and self.param_grid is not None:

5
src/requirements.txt Normal file
View File

@ -0,0 +1,5 @@
joblib==0.11
nltk==3.4.5
numpy==1.18.2
scikit-learn==0.22.2.post1
scipy==1.4.1