huge refactoring, deep learning, and other stuff

parent d249c4801f
commit 22b7ea7e66
@@ -13,6 +13,7 @@ from scipy.sparse import issparse
import itertools
from tqdm import tqdm
import re
from scipy.sparse import csr_matrix


class MultilingualDataset:
|
@@ -68,27 +69,33 @@ class MultilingualDataset:
|
|||
if languages is not None:
|
||||
self.languages_view = languages
|
||||
|
||||
def training(self):
|
||||
return self.lXtr(), self.lYtr()
|
||||
def training(self, mask_numbers=False, target_as_csr=False):
|
||||
return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)
|
||||
|
||||
def test(self):
|
||||
return self.lXte(), self.lYte()
|
||||
def test(self, mask_numbers=False, target_as_csr=False):
|
||||
return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)
|
||||
|
||||
def lXtr(self):
|
||||
return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if
|
||||
lang in self.langs()}
|
||||
# return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
def lXtr(self, mask_numbers=False):
|
||||
proc = lambda x:_mask_numbers(x) if mask_numbers else x
|
||||
# return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
|
||||
def lXte(self):
|
||||
return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if
|
||||
lang in self.langs()}
|
||||
# return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
def lXte(self, mask_numbers=False):
|
||||
proc = lambda x: _mask_numbers(x) if mask_numbers else x
|
||||
# return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
|
||||
def lYtr(self):
|
||||
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
def lYtr(self, as_csr=False):
|
||||
lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
if as_csr:
|
||||
lY = {l:csr_matrix(Y) for l,Y in lY.items()}
|
||||
return lY
|
||||
|
||||
def lYte(self):
|
||||
return {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
def lYte(self, as_csr=False):
|
||||
lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
|
||||
if as_csr:
|
||||
lY = {l:csr_matrix(Y) for l,Y in lY.items()}
|
||||
return lY
|
||||
|
||||
def cat_view(self, Y):
|
||||
if hasattr(self, 'categories_view'):
|
||||
|
|
@@ -107,10 +114,11 @@ class MultilingualDataset:
|
|||
return self.lYtr()[self.langs()[0]].shape[1]
|
||||
|
||||
def show_dimensions(self):
|
||||
def shape(X):
|
||||
return X.shape if hasattr(X, 'shape') else len(X)
|
||||
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
|
||||
if lang not in self.langs(): continue
|
||||
if hasattr(Xtr, 'shape') and hasattr(Xte, 'shape'):
|
||||
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, Xtr.shape, self.cat_view(Ytr).shape, Xte.shape, self.cat_view(Yte).shape))
|
||||
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape))
|
||||
|
||||
def show_category_prevalences(self):
|
||||
#pass
|
||||
|
|
@@ -135,12 +143,24 @@ class MultilingualDataset:
    def set_labels(self, labels):
        self.labels = labels

    def mask_numbers(self, data, number_mask='numbermask'):
        mask = re.compile(r'\b[0-9][0-9.,-]*\b')
        masked = []
        for text in tqdm(data, desc='masking numbers'):
            masked.append(mask.sub(number_mask, text))
        return masked

def _mask_numbers(data):
    mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b')
    mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b')
    mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b')
    mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b')
    mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b')
    masked = []
    for text in tqdm(data, desc='masking numbers'):
        text = ' ' + text
        text = mask_moredigit.sub(' MoreDigitMask', text)
        text = mask_4digit.sub(' FourDigitMask', text)
        text = mask_3digit.sub(' ThreeDigitMask', text)
        text = mask_2digit.sub(' TwoDigitMask', text)
        text = mask_1digit.sub(' OneDigitMask', text)
        masked.append(text.replace('.','').replace(',','').strip())
    return masked
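For reference, a quick sketch of what the new module-level _mask_numbers helper produces; the input string is invented for illustration and is not part of the commit:

    # _mask_numbers(['we sold 1500 units for 3.99 each'])
    # -> ['we sold FourDigitMask units for OneDigitMask each']
    # numbers are bucketed by the digit count of their integer part, then leftover '.' and ',' are stripped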
|
||||
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
|
|
@@ -541,12 +561,120 @@ def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home
|
|||
build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
# Methods to generate full RCV and JRC datasets
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs):
|
||||
|
||||
|
||||
print('fetching the datasets')
|
||||
rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
|
||||
rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test')
|
||||
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
|
||||
|
||||
filter_by_categories(rcv1_train_documents, labels_rcv2)
|
||||
filter_by_categories(rcv1_test_documents, labels_rcv2)
|
||||
filter_by_categories(rcv2_documents, labels_rcv1)
|
||||
|
||||
label_names = get_active_labels(rcv1_train_documents + rcv2_documents)
|
||||
print('Active labels in RCV1/2 {}'.format(len(label_names)))
|
||||
|
||||
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names)))
|
||||
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
|
||||
|
||||
mlb = MultiLabelBinarizer()
|
||||
mlb.fit([label_names])
|
||||
|
||||
all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents
|
||||
lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs}
|
||||
|
||||
def get_ids(doclist):
|
||||
return frozenset([d.id for d in doclist])
|
||||
|
||||
tr_ids = {'en': get_ids(rcv1_train_documents)}
|
||||
te_ids = {'en': get_ids(rcv1_test_documents)}
|
||||
for lang in langs:
|
||||
if lang == 'en': continue
|
||||
tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3)
|
||||
|
||||
dataset = MultilingualDataset()
|
||||
dataset.dataset_name = 'RCV1/2-full'
|
||||
for lang in langs:
|
||||
print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} test documents')
|
||||
analyzer = CountVectorizer(
|
||||
strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
|
||||
).build_analyzer()
|
||||
|
||||
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]])
|
||||
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
|
||||
Xtr = [' '.join(analyzer(d)) for d in Xtr]
|
||||
Xte = [' '.join(analyzer(d)) for d in Xte]
|
||||
Ytr = mlb.transform(Ytr)
|
||||
Yte = mlb.transform(Yte)
|
||||
dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
|
||||
|
||||
dataset.save(outpath)
|
||||
|
||||
|
||||
def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):
|
||||
|
||||
print('fetching the datasets')
|
||||
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
|
||||
training_docs, label_names = fetch_jrcacquis(
|
||||
langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
|
||||
)
|
||||
test_docs, _ = fetch_jrcacquis(
|
||||
langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
|
||||
)
|
||||
|
||||
def _group_by_lang(doc_list, langs):
|
||||
return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}
|
||||
|
||||
training_docs = _group_by_lang(training_docs, langs)
|
||||
test_docs = _group_by_lang(test_docs, langs)
|
||||
|
||||
mlb = MultiLabelBinarizer()
|
||||
mlb.fit([label_names])
|
||||
|
||||
dataset = MultilingualDataset()
|
||||
dataset.dataset_name = 'JRC-Acquis-full'
|
||||
for lang in langs:
|
||||
analyzer = CountVectorizer(
|
||||
strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
|
||||
).build_analyzer()
|
||||
|
||||
Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
|
||||
Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
|
||||
Xtr = [' '.join(analyzer(d)) for d in Xtr]
|
||||
Xte = [' '.join(analyzer(d)) for d in Xte]
|
||||
Ytr = mlb.transform(Ytr)
|
||||
Yte = mlb.transform(Yte)
|
||||
dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
|
||||
|
||||
dataset.save(outpath)
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------------------------------------------------
|
||||
# MAIN BUILDER
|
||||
#-----------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
if __name__=='__main__':
|
||||
import sys
|
||||
RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
|
||||
RCV2_PATH = '../Datasets/RCV2'
|
||||
JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
|
||||
full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
|
||||
# full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
|
||||
sys.exit(0)
|
||||
|
||||
# datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle'
|
||||
# data = MultilingualDataset.load(datasetpath)
|
||||
# data.dataset_name='JRC-Acquis-full'#'RCV1/2-full'
|
||||
# for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
|
||||
# (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
|
||||
# data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
|
||||
# data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle')
|
||||
# sys.exit(0)
|
||||
|
||||
assert len(sys.argv) == 5, "wrong number of arguments; required: " \
|
||||
"<JRC_PATH> <RCV1_PATH> <RCV2_PATH> <WIKI_PATH> "
|
||||
|
|
|
|||
|
|
@@ -204,6 +204,7 @@ class FastTextMUSE(PretrainedEmbeddings):
|
|||
def extract(self, words):
|
||||
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
|
||||
extraction = torch.zeros((len(words), self.dim()))
|
||||
# extraction = torch.empty(len(words), self.dim()).normal_(0, 1)
|
||||
extraction[source_idx] = self.embed.vectors[target_idx]
|
||||
return extraction
|
||||
|
||||
|
|
|
|||
|
|
@@ -254,7 +254,7 @@ class NaivePolylingualClassifier:
|
|||
return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].transform)(lX[lang]) for lang in langs)
|
||||
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def best_params(self):
|
||||
|
|
@@ -297,7 +297,7 @@ class MonolingualClassifier:
|
|||
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
|
||||
error_score=0, verbose=10)
|
||||
|
||||
print('fitting:', self.model)
|
||||
print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}')
|
||||
self.model.fit(X, y)
|
||||
if isinstance(self.model, GridSearchCV):
|
||||
self.best_params_ = self.model.best_params_
|
||||
|
|
|
|||
|
|
@@ -1,6 +1,8 @@
|
|||
import numpy as np
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
#from data.text_preprocessor import NLTKStemTokenizer
|
||||
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain, \
|
||||
gain_ratio, gss
|
||||
from embeddings.embeddings import FastTextMUSE
|
||||
from embeddings.supervised import supervised_embeddings_tfidf, zscores
|
||||
from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
|
||||
|
|
@@ -10,6 +12,9 @@ from joblib import Parallel, delayed
|
|||
from scipy.sparse import issparse, vstack, hstack
|
||||
from transformers.StandardizeTransformer import StandardizeTransformer
|
||||
from util.SIF_embed import remove_pc
|
||||
from sklearn.preprocessing import normalize
|
||||
from sklearn.svm import SVC
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Data Processing
|
||||
|
|
@@ -39,20 +44,65 @@ class TfidfVectorizerMultilingual:
|
|||
else:
|
||||
return self.vectorizer[l].vocabulary_
|
||||
|
||||
def get_analyzer(self, l=None):
|
||||
if l is None:
|
||||
return {l:self.vectorizer[l].build_analyzer() for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].build_analyzer()
|
||||
|
||||
|
||||
class FeatureWeight:
|
||||
|
||||
def __init__(self, weight='tfidf', agg='mean'):
|
||||
assert weight in ['tfidf', 'pmi', 'ig'] or callable(weight), 'weight should be one of "tfidf", "pmi", "ig", or a callable function'
|
||||
assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"'
|
||||
self.weight = weight
|
||||
self.agg = agg
|
||||
self.fitted = False
|
||||
if weight=='pmi':
|
||||
self.weight = pointwise_mutual_information
|
||||
elif weight == 'ig':
|
||||
self.weight = information_gain
|
||||
|
||||
def fit(self, lX, ly):
|
||||
if not self.fitted:
|
||||
if self.weight == 'tfidf':
|
||||
self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()}
|
||||
else:
|
||||
self.lF = {}
|
||||
for l in lX.keys():
|
||||
X, y = lX[l], ly[l]
|
||||
|
||||
print(f'getting supervised cell-matrix lang {l}')
|
||||
tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight)
|
||||
if self.agg == 'max':
|
||||
F = tsr_matrix.max(axis=0)
|
||||
elif self.agg == 'mean':
|
||||
F = tsr_matrix.mean(axis=0)
|
||||
self.lF[l] = F
|
||||
|
||||
self.fitted = True
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()}
|
||||
|
||||
def fit_transform(self, lX, ly):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
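A minimal usage sketch for the new FeatureWeight transformer, assuming lX holds per-language tf-idf csr matrices and ly the corresponding binary label matrices (names illustrative):

    # fw = FeatureWeight(weight='pmi', agg='max')
    # lX_weighted = fw.fit_transform(lX, ly)   # each feature column rescaled by its max-over-classes PMI score
    # the default weight='tfidf' leaves the tf-idf values unchanged (all feature weights are 1)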
|
||||
# ------------------------------------------------------------------
|
||||
# Document Embeddings
|
||||
# ------------------------------------------------------------------
|
||||
class PosteriorProbabilitiesEmbedder:
|
||||
|
||||
def __init__(self, first_tier_learner, first_tier_parameters,
|
||||
n_jobs=-1):
|
||||
def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
|
||||
self.fist_tier_learner = first_tier_learner
|
||||
self.fist_tier_parameters = first_tier_parameters
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner,
|
||||
self.fist_tier_parameters,
|
||||
n_jobs=n_jobs)
|
||||
self.doc_projector = NaivePolylingualClassifier(
|
||||
self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
|
||||
)
|
||||
|
||||
def fit(self, lX, lY, lV=None):
|
||||
print('fitting the projectors... {}'.format(lX.keys()))
|
||||
|
|
@@ -60,8 +110,8 @@ class PosteriorProbabilitiesEmbedder:
|
|||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
print('projecting the documents')
|
||||
lZ = self.doc_projector.predict_proba(lX)
|
||||
lZ = self.predict_proba(lX)
|
||||
lZ = _normalize(lZ, self.l2)
|
||||
return lZ
|
||||
|
||||
def fit_transform(self, lX, ly=None, lV=None):
|
||||
|
|
@@ -70,28 +120,41 @@ class PosteriorProbabilitiesEmbedder:
|
|||
def best_params(self):
|
||||
return self.doc_projector.best_params()
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
return self.doc_projector.predict(lX)
|
||||
|
||||
def predict_proba(self, lX, ly=None):
|
||||
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
|
||||
return self.doc_projector.predict_proba(lX)
|
||||
|
||||
|
||||
class MuseEmbedder:
|
||||
|
||||
def __init__(self, path, n_jobs=-1):
|
||||
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()):
|
||||
self.path=path
|
||||
self.lV = lV
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.featureweight = featureweight
|
||||
|
||||
def fit(self, lX, ly, lV):
|
||||
def fit(self, lX, ly, lV=None):
|
||||
assert lV is not None or self.lV is not None, 'lV not specified'
|
||||
self.langs = sorted(lX.keys())
|
||||
MUSE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(FastTextMUSE)(self.path, lang) for lang in self.langs
|
||||
)
|
||||
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
|
||||
lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
|
||||
self.MUSE = {l:MUSE[i].extract(lWordList[l]).numpy() for i,l in enumerate(self.langs)}
|
||||
self.MUSE = {l: Muse.extract(lWordList[l]).numpy() for l, Muse in self.MUSE.items()}
|
||||
self.featureweight.fit(lX, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
MUSE = self.MUSE
|
||||
lX = self.featureweight.transform(lX)
|
||||
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs
|
||||
)
|
||||
return {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
|
||||
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
|
||||
lMuse = _normalize(lMuse, self.l2)
|
||||
return lMuse
|
||||
|
||||
def fit_transform(self, lX, ly, lV):
|
||||
return self.fit(lX, ly, lV).transform(lX)
|
||||
|
|
@@ -102,9 +165,11 @@ class MuseEmbedder:
|
|||
|
||||
class WordClassEmbedder:
|
||||
|
||||
def __init__(self, n_jobs=-1, max_label_space=300):
|
||||
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()):
|
||||
self.n_jobs = n_jobs
|
||||
self.l2 = l2
|
||||
self.max_label_space=max_label_space
|
||||
self.featureweight = featureweight
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
self.langs = sorted(lX.keys())
|
||||
|
|
@@ -112,53 +177,43 @@ class WordClassEmbedder:
|
|||
delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
|
||||
)
|
||||
self.lWCE = {l:WCE[i] for i,l in enumerate(self.langs)}
|
||||
self.featureweight.fit(lX, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lWCE = self.lWCE
|
||||
lX = self.featureweight.transform(lX)
|
||||
XdotWCE = Parallel(n_jobs=self.n_jobs)(
|
||||
delayed(XdotM)(lX[lang], lWCE[lang]) for lang in self.langs
|
||||
delayed(XdotM)(lX[lang], lWCE[lang]) for lang in self.langs
|
||||
)
|
||||
return {l: XdotWCE[i] for i, l in enumerate(self.langs)}
|
||||
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
|
||||
lwce = _normalize(lwce, self.l2)
|
||||
return lwce
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
|
||||
def word_class_embedding_matrix(X, Y, max_label_space=300):
|
||||
print('computing supervised embeddings...')
|
||||
WCE = supervised_embeddings_tfidf(X, Y)
|
||||
WCE = zscores(WCE, axis=0)
|
||||
|
||||
nC = Y.shape[1]
|
||||
if nC > max_label_space:
|
||||
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
f'Applying PCA(n_components={max_label_space})')
|
||||
pca = PCA(n_components=max_label_space)
|
||||
WCE = pca.fit(WCE).transform(WCE)
|
||||
|
||||
return WCE
|
||||
|
||||
|
||||
def XdotM(X,M):
|
||||
# return X.dot(M)
|
||||
E = X.dot(M)
|
||||
E = remove_pc(E, npc=1)
|
||||
return E
|
||||
|
||||
|
||||
class DocEmbedderList:
|
||||
def __init__(self, *embedder_list):
|
||||
|
||||
def __init__(self, *embedder_list, aggregation='concat'):
|
||||
assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"'
|
||||
if len(embedder_list)==0: embedder_list=[]
|
||||
self.embedders = embedder_list
|
||||
self.aggregation = aggregation
|
||||
|
||||
|
||||
def fit(self, lX, ly, lV):
|
||||
def fit(self, lX, ly, lV=None):
|
||||
for transformer in self.embedders:
|
||||
transformer.fit(lX,ly,lV)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
if self.aggregation == 'concat':
|
||||
return self.transform_concat(lX)
|
||||
elif self.aggregation == 'mean':
|
||||
return self.transform_mean(lX)
|
||||
|
||||
def transform_concat(self, lX):
|
||||
if len(self.embedders)==1:
|
||||
return self.embedders[0].transform(lX)
|
||||
|
||||
|
|
@@ -176,8 +231,27 @@ class DocEmbedderList:
|
|||
hstacker = hstack if some_sparse else np.hstack
|
||||
return {l:hstacker(lZparts[l]) for l in langs}
|
||||
|
||||
def transform_mean(self, lX):
|
||||
if len(self.embedders)==1:
|
||||
return self.embedders[0].transform(lX)
|
||||
|
||||
def fit_transform(self, lX, ly, lV):
|
||||
langs = sorted(lX.keys())
|
||||
|
||||
lZparts = {l: None for l in langs}
|
||||
for transformer in self.embedders:
|
||||
lZ = transformer.transform(lX)
|
||||
for l in langs:
|
||||
Z = lZ[l]
|
||||
if lZparts[l] is None:
|
||||
lZparts[l] = Z
|
||||
else:
|
||||
lZparts[l] += Z
|
||||
|
||||
n_transformers = len(self.embedders)
|
||||
|
||||
return {l:lZparts[l] / n_transformers for l in langs}
|
||||
|
||||
def fit_transform(self, lX, ly, lV=None):
|
||||
return self.fit(lX, ly, lV).transform(lX)
|
||||
|
||||
def best_params(self):
|
||||
|
|
@@ -186,20 +260,55 @@ class DocEmbedderList:
|
|||
def append(self, embedder):
|
||||
self.embedders.append(embedder)
|
||||
|
||||
|
||||
class FeatureSet2Posteriors:
|
||||
def __init__(self, transformer, l2=True, n_jobs=-1):
|
||||
self.transformer = transformer
|
||||
self.l2=l2
|
||||
self.n_jobs = n_jobs
|
||||
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
if lV is None and hasattr(self.transformer, 'lV'):
|
||||
lV = self.transformer.lV
|
||||
lZ = self.transformer.fit_transform(lX, ly, lV)
|
||||
self.prob_classifier.fit(lZ, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lP = self.predict_proba(lX)
|
||||
lP = _normalize(lP, self.l2)
|
||||
return lP
|
||||
|
||||
def fit_transform(self, lX, ly, lV):
|
||||
return self.fit(lX, ly, lV).transform(lX)
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
lZ = self.transformer.transform(lX)
|
||||
return self.prob_classifier.predict(lZ)
|
||||
|
||||
def predict_proba(self, lX, ly=None):
|
||||
lZ = self.transformer.transform(lX)
|
||||
return self.prob_classifier.predict_proba(lZ)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Meta-Classifier
|
||||
# ------------------------------------------------------------------
|
||||
class MetaClassifier:
|
||||
|
||||
def __init__(self, meta_learner, meta_parameters, n_jobs=-1):
|
||||
def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
|
||||
self.n_jobs=n_jobs
|
||||
self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
|
||||
self.standardize_range = standardize_range
|
||||
|
||||
def fit(self, lZ, ly):
|
||||
tinit = time.time()
|
||||
Z, y = self.stack(lZ, ly)
|
||||
self.standardizer = StandardizeTransformer()
|
||||
|
||||
self.standardizer = StandardizeTransformer(range=self.standardize_range)
|
||||
Z = self.standardizer.fit_transform(Z)
|
||||
|
||||
print('fitting the Z-space of shape={}'.format(Z.shape))
|
||||
self.model.fit(Z, y)
|
||||
self.time = time.time() - tinit
|
||||
|
|
@@ -217,6 +326,10 @@ class MetaClassifier:
|
|||
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
|
||||
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
|
||||
|
||||
def predict_proba(self, lZ, ly=None):
|
||||
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
|
||||
return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)
|
||||
|
||||
def best_params(self):
|
||||
return self.model.best_params()
|
||||
|
||||
|
|
@@ -249,3 +362,65 @@ class Funnelling:
|
|||
return {'1st-tier':self.first_tier.best_params(),
|
||||
'meta':self.meta.best_params()}
|
||||
|
||||
|
||||
class Voting:
|
||||
def __init__(self, *prob_classifiers):
|
||||
assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic'
|
||||
self.prob_classifiers = prob_classifiers
|
||||
|
||||
def fit(self, lX, ly, lV=None):
|
||||
for classifier in self.prob_classifiers:
|
||||
classifier.fit(lX, ly, lV)
|
||||
|
||||
def predict(self, lX, ly=None):
|
||||
|
||||
lP = {l:[] for l in lX.keys()}
|
||||
for classifier in self.prob_classifiers:
|
||||
lPi = classifier.predict_proba(lX)
|
||||
for l in lX.keys():
|
||||
lP[l].append(lPi[l])
|
||||
|
||||
lP = {l:np.stack(Plist).mean(axis=0) for l,Plist in lP.items()}
|
||||
ly = {l:P>0.5 for l,P in lP.items()}
|
||||
|
||||
return ly
|
||||
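A minimal sketch of how the new Voting ensemble is meant to be wired (mirroring the new main script added by this commit; any fitted components exposing predict_proba would do):

    # posterior_view = PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True))
    # muse_view = FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV))
    # voter = Voting(posterior_view, muse_view)
    # voter.fit(lXtr, lytr)
    # ly_pred = voter.predict(lXte)   # per-language boolean matrices: averaged posteriors > 0.5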
|
||||
|
||||
# ------------------------------------------------------------------------------
|
||||
# HELPERS
|
||||
# ------------------------------------------------------------------------------
|
||||
|
||||
def load_muse_embeddings(we_path, langs, n_jobs=-1):
|
||||
MUSE = Parallel(n_jobs=n_jobs)(
|
||||
delayed(FastTextMUSE)(we_path, lang) for lang in langs
|
||||
)
|
||||
return {l: MUSE[i] for i, l in enumerate(langs)}
|
||||
|
||||
|
||||
def word_class_embedding_matrix(X, Y, max_label_space=300):
|
||||
print('computing supervised embeddings...')
|
||||
WCE = supervised_embeddings_tfidf(X, Y)
|
||||
WCE = zscores(WCE, axis=0)
|
||||
|
||||
nC = Y.shape[1]
|
||||
if nC > max_label_space:
|
||||
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||
f'Applying PCA(n_components={max_label_space})')
|
||||
pca = PCA(n_components=max_label_space)
|
||||
WCE = pca.fit(WCE).transform(WCE)
|
||||
|
||||
return WCE
|
||||
|
||||
|
||||
def XdotM(X,M):
|
||||
# return X.dot(M)
|
||||
# print(f'X={X.shape}, M={M.shape}')
|
||||
E = X.dot(M)
|
||||
E = remove_pc(E, npc=1)
|
||||
return E
|
||||
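A short note on the intended shapes of XdotM, inferred from its callers (not stated explicitly in the code):

    # X is an (n_docs x vocab) matrix of term weights and M a (vocab x dim) word-embedding matrix;
    # the result is an (n_docs x dim) document matrix whose first principal component has been
    # removed via util.SIF_embed.remove_pc (SIF-style common-component removal)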
|
||||
|
||||
def _normalize(lX, l2=True):
|
||||
return {l: normalize(X) for l, X in lX.items()} if l2 else lX
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@@ -0,0 +1,290 @@
|
|||
import argparse
|
||||
import torch.nn as nn
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from dataset_builder import MultilingualDataset
|
||||
from learning.transformers import load_muse_embeddings
|
||||
from models.lstm_class import RNNMultilingualClassifier
|
||||
from util.csv_log import CSVLog
|
||||
from util.early_stop import EarlyStopping
|
||||
from util.common import *
|
||||
from util.file import create_if_not_exist
|
||||
from time import time
|
||||
from embeddings.pretrained import *
|
||||
from os.path import join
|
||||
from tqdm import tqdm
|
||||
from util.evaluation import evaluate
|
||||
from util.file import get_file_name
|
||||
|
||||
allowed_nets = {'rnn'}
|
||||
|
||||
# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
|
||||
def init_Net(nC, multilingual_index, xavier_uniform=True):
|
||||
net=opt.net
|
||||
assert net in allowed_nets, f'{net} not supported, valid ones are={allowed_nets}'
|
||||
|
||||
# instantiate the required net
|
||||
if net=='rnn':
|
||||
only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised)
|
||||
if only_post:
|
||||
print('working on ONLY POST mode')
|
||||
model = RNNMultilingualClassifier(
|
||||
output_size=nC,
|
||||
hidden_size=opt.hidden,
|
||||
lvocab_size=multilingual_index.l_vocabsize(),
|
||||
learnable_length=opt.learnable,
|
||||
lpretrained=multilingual_index.l_embeddings(),
|
||||
drop_embedding_range=multilingual_index.sup_range,
|
||||
drop_embedding_prop=opt.sup_drop,
|
||||
post_probabilities=opt.posteriors,
|
||||
only_post=only_post
|
||||
)
|
||||
|
||||
# weight initialization
|
||||
if xavier_uniform:
|
||||
for p in model.parameters():
|
||||
if p.dim() > 1 and p.requires_grad:
|
||||
nn.init.xavier_uniform_(p)
|
||||
|
||||
if opt.tunable:
|
||||
# this has to be performed *after* Xavier initialization is done,
|
||||
# otherwise the pretrained embedding parameters will be overridden
|
||||
model.finetune_pretrained()
|
||||
|
||||
return model.cuda()
|
||||
|
||||
|
||||
def set_method_name():
|
||||
method_name = f'{opt.net}(H{opt.hidden})'
|
||||
if opt.pretrained:
|
||||
method_name += f'-Muse'
|
||||
if opt.supervised:
|
||||
method_name += f'-WCE'
|
||||
if opt.posteriors:
|
||||
method_name += f'-Posteriors'
|
||||
if (opt.pretrained or opt.supervised) and opt.tunable:
|
||||
method_name+='-(trainable)'
|
||||
else:
|
||||
method_name += '-(static)'
|
||||
if opt.learnable > 0:
|
||||
method_name += f'-Learnable{opt.learnable}'
|
||||
return method_name
|
||||
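For example, tracing the branches above with the default --hidden 512:

    # --pretrained --supervised --tunable   ->  rnn(H512)-Muse-WCE-(trainable)
    # no flags at all                       ->  rnn(H512)-(static)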
|
||||
|
||||
def init_optimizer(model, lr):
|
||||
return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay)
|
||||
|
||||
|
||||
def init_logfile(method_name, opt):
|
||||
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
|
||||
logfile.set_default('dataset', opt.dataset)
|
||||
logfile.set_default('run', opt.seed)
|
||||
logfile.set_default('method', method_name)
|
||||
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
|
||||
return logfile
|
||||
|
||||
|
||||
# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise
|
||||
def load_pretrained_embeddings(we_path, langs):
|
||||
lpretrained = lpretrained_vocabulary = none_dict(langs)
|
||||
if opt.pretrained:
|
||||
lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
|
||||
lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
|
||||
return lpretrained, lpretrained_vocabulary
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
def main():
|
||||
|
||||
method_name = set_method_name()
|
||||
logfile = init_logfile(method_name, opt)
|
||||
|
||||
# Loading the dataset
|
||||
data = MultilingualDataset.load(opt.dataset)
|
||||
# data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it'])
|
||||
data.show_dimensions()
|
||||
langs = data.langs()
|
||||
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
|
||||
l_test_raw, l_test_target = data.test(target_as_csr=True)
|
||||
|
||||
# Loading the MUSE pretrained embeddings (only if requested)
|
||||
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
|
||||
|
||||
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
|
||||
multilingual_index = MultilingualIndex()
|
||||
multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary)
|
||||
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
|
||||
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
|
||||
if opt.posteriors:
|
||||
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs)
|
||||
else:
|
||||
lPtr, lPva, lPte = None, None, None
|
||||
|
||||
# Model initialization
|
||||
model = init_Net(data.num_categories(), multilingual_index)
|
||||
|
||||
optim = init_optimizer(model, lr=opt.lr)
|
||||
criterion = torch.nn.BCEWithLogitsLoss().cuda()
|
||||
lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
|
||||
batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
|
||||
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
|
||||
|
||||
tinit = time()
|
||||
create_if_not_exist(opt.checkpoint_dir)
|
||||
early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
|
||||
|
||||
l_train_index, l_train_target = multilingual_index.l_train()
|
||||
l_val_index, l_val_target = multilingual_index.l_val()
|
||||
l_test_index = multilingual_index.l_test_index()
|
||||
|
||||
print('-'*80)
|
||||
print('Start training')
|
||||
for epoch in range(1, opt.nepochs + 1):
|
||||
train(model, batcher_train, l_train_index, lPtr, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
|
||||
lr_scheduler.step() # reduces the learning rate
|
||||
|
||||
# validation
|
||||
macrof1 = test(model, batcher_eval, l_val_index, lPva, l_val_target, tinit, epoch, logfile, criterion, 'va')
|
||||
early_stop(macrof1, epoch)
|
||||
if opt.test_each>0:
|
||||
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
|
||||
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
|
||||
if early_stop.STOP:
|
||||
print('[early-stop] STOP')
|
||||
if not opt.plotmode: # with plotmode activated, early-stop is ignored
|
||||
break
|
||||
|
||||
# training is over
|
||||
|
||||
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
|
||||
# stoptime = early_stop.stop_time - tinit
|
||||
# stopepoch = early_stop.best_epoch
|
||||
# logfile.add_row(epoch=stopepoch, measure=f'early-stop', value=early_stop.best_score, timelapse=stoptime)
|
||||
|
||||
if opt.plotmode==False:
|
||||
print('-' * 80)
|
||||
print('Training over. Performing final evaluation')
|
||||
model = early_stop.restore_checkpoint()
|
||||
|
||||
if opt.val_epochs>0:
|
||||
print(f'running last {opt.val_epochs} training epochs on the validation set')
|
||||
for val_epoch in range(1, opt.val_epochs + 1):
|
||||
batcher_train.init_offset()
|
||||
train(model, batcher_train, l_val_index, lPva, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
|
||||
|
||||
# final test
|
||||
print('Training complete: testing')
|
||||
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
|
||||
|
||||
|
||||
def get_lr(optimizer):
|
||||
for param_group in optimizer.param_groups:
|
||||
return param_group['lr']
|
||||
|
||||
|
||||
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
|
||||
loss_history = []
|
||||
model.train()
|
||||
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
|
||||
optim.zero_grad()
|
||||
loss = criterion(model(batch, post, lang), target)
|
||||
loss.backward()
|
||||
clip_gradient(model)
|
||||
optim.step()
|
||||
loss_history.append(loss.item())
|
||||
|
||||
if idx % opt.log_interval == 0:
|
||||
interval_loss = np.mean(loss_history[-opt.log_interval:])
|
||||
print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
|
||||
|
||||
mean_loss = np.mean(loss_history)  # mean training loss over the whole epoch
|
||||
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
|
||||
return mean_loss
|
||||
|
||||
|
||||
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
|
||||
model.eval()
|
||||
langs = sorted(ltest_index.keys())
|
||||
predictions = {l:[] for l in langs}
|
||||
yte_stacked = {l:[] for l in langs}
|
||||
batcher.init_offset()
|
||||
for batch, post, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lyte), desc='evaluation: '):
|
||||
logits = model(batch, post, lang)
|
||||
loss = criterion(logits, target).item()
|
||||
prediction = predict(logits)
|
||||
predictions[lang].append(prediction)
|
||||
yte_stacked[lang].append(target.detach().cpu().numpy())
|
||||
|
||||
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
|
||||
ly_ = {l:np.vstack(predictions[l]) for l in langs}
|
||||
l_eval = evaluate(ly, ly_)
|
||||
metrics = []
|
||||
for lang in langs:
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
if measure_prefix=='te':
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
|
||||
# (config['max_label_space'], classifier.best_components),
|
||||
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
|
||||
# lang, macrof1, microf1, macrok, microk, '')
|
||||
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
|
||||
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
|
||||
|
||||
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend)
|
||||
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend)
|
||||
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend)
|
||||
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend)
|
||||
|
||||
return Mf1
|
||||
|
||||
|
||||
# ----------------------------------------------------------------------------------------------------------------------
|
||||
if __name__ == '__main__':
|
||||
|
||||
parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings')
|
||||
parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
|
||||
parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 50)')
|
||||
parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)')
|
||||
parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)')
|
||||
parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for early-stop (default: 10)')
|
||||
parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode executes a long run in order '
|
||||
'to generate enough data to produce trend plots (test-each should be >0). This mode is '
|
||||
'used to produce plots, and does not perform an evaluation on the test set.')
|
||||
parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)')
|
||||
parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)')
|
||||
parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)')
|
||||
parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)')
|
||||
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
|
||||
parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by '
|
||||
'language used to train the calibrated SVMs (only used if --posteriors is active)')
|
||||
parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status')
|
||||
parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file')
|
||||
# parser.add_argument('--pickle-dir', type=str, default='../pickles', metavar='str', help=f'if set, specifies the path where to '
|
||||
# f'save/load the dataset pickled (set to None if you prefer not to retain the pickle file)')
|
||||
parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)')
|
||||
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints')
|
||||
parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}')
|
||||
parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings')
|
||||
parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings')
|
||||
parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings')
|
||||
parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)')
|
||||
parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the '
|
||||
'validation set once training is over (default 1)')
|
||||
parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str',
|
||||
help=f'path to MUSE pretrained embeddings')
|
||||
parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='larger dimension allowed for the '
|
||||
'feature-label embedding (if larger, then PCA with this number of components is applied) '
|
||||
'(default 300)')
|
||||
parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run')
|
||||
parser.add_argument('--tunable', action='store_true', default=False,
|
||||
help='pretrained embeddings are tunable from the begining (default False, i.e., static)')
|
||||
|
||||
opt = parser.parse_args()
|
||||
|
||||
assert torch.cuda.is_available(), 'CUDA not available'
|
||||
assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0'
|
||||
# if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle')
|
||||
torch.manual_seed(opt.seed)
|
||||
|
||||
main()
|
||||
|
|
@@ -0,0 +1,127 @@
|
|||
import os
|
||||
from dataset_builder import MultilingualDataset
|
||||
# from learning.learners import *
|
||||
from learning.learners import FunnellingMultimodal
|
||||
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
|
||||
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
|
||||
from util.evaluation import *
|
||||
from optparse import OptionParser
|
||||
from util.file import exists
|
||||
from util.results import PolylingualClassificationResults
|
||||
from sklearn.svm import SVC
|
||||
from util.util import get_learner, get_params
|
||||
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
||||
|
||||
parser = OptionParser()
|
||||
|
||||
parser.add_option("-d", "--dataset", dest="dataset",
|
||||
help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='./results/results.csv')
|
||||
|
||||
parser.add_option("-P", "--probs", dest="probs", action='store_true',
|
||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
||||
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
|
||||
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-w", "--we-path", dest="we_path",
|
||||
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
|
||||
|
||||
parser.add_option("-s", "--set_c", dest="set_c",type=float,
|
||||
help="Set the C parameter", default=1)
|
||||
|
||||
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
|
||||
help="Optimize hyperparameters", default=False)
|
||||
|
||||
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
|
||||
help="Number of parallel jobs (default is -1, all)", default=-1)
|
||||
|
||||
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
||||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||
default=300)
|
||||
|
||||
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
|
||||
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
|
||||
# " If set to 0 it will automatically search for the best number of components", default=300)
|
||||
|
||||
# parser.add_option("-a", dest="post_pca",
|
||||
# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
|
||||
# "embedding space", default=False)
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear'):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
|
||||
|
||||
|
||||
def get_params(dense=False):
|
||||
if not op.optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf' if dense else 'linear'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
#######################################################################################################################
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
||||
|
||||
dataset_file = os.path.basename(op.dataset)
|
||||
|
||||
results = PolylingualClassificationResults(op.output)
|
||||
|
||||
data = MultilingualDataset.load(op.dataset)
|
||||
data.show_dimensions()
|
||||
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||
|
||||
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
|
||||
|
||||
print(f'{result_id}')
|
||||
|
||||
# text preprocessing
|
||||
tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
||||
|
||||
lXtr = tfidfvectorizer.fit_transform(lXtr, lytr)
|
||||
lXte = tfidfvectorizer.transform(lXte)
|
||||
lV = tfidfvectorizer.vocabulary()
|
||||
|
||||
classifiers = []
|
||||
if op.probs:
|
||||
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
|
||||
if op.supervised:
|
||||
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
|
||||
if op.pretrained:
|
||||
classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV)))
|
||||
|
||||
classifier = Voting(*classifiers)
|
||||
|
||||
print('# Fitting ...')
|
||||
classifier.fit(lXtr, lytr)
|
||||
|
||||
print('\n# Evaluating ...')
|
||||
l_eval = evaluate_method(classifier, lXte, lyte)
|
||||
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
|
||||
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
|
||||
# (config['max_label_space'], classifier.best_components),
|
||||
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
|
||||
# lang, macrof1, microf1, macrok, microk, '')
|
||||
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
|
||||
|
|
@@ -1,27 +1,19 @@
|
|||
import os
|
||||
from dataset_builder import MultilingualDataset
|
||||
# from learning.learners import *
|
||||
from learning.learners import FunnellingMultimodal
|
||||
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
|
||||
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder
|
||||
from learning.transformers import *
|
||||
from util.evaluation import *
|
||||
from optparse import OptionParser
|
||||
from util.file import exists
|
||||
from util.results import PolylingualClassificationResults
|
||||
from sklearn.svm import SVC
|
||||
from util.util import get_learner, get_params
|
||||
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
|
||||
|
||||
parser = OptionParser()
|
||||
|
||||
parser.add_option("-d", "--dataset", dest="dataset",
|
||||
help="Path to the multilingual dataset processed and stored in .pickle format",
|
||||
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
|
||||
parser = OptionParser(usage="usage: %prog datapath [options]")
|
||||
|
||||
parser.add_option("-o", "--output", dest="output",
|
||||
help="Result file", type=str, default='./results/results.csv')
|
||||
|
||||
parser.add_option("-P", "--probs", dest="probs", action='store_true',
|
||||
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
|
||||
help="Add posterior probabilities to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
||||
|
|
@@ -30,6 +22,16 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
|
|||
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
|
||||
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
|
||||
|
||||
parser.add_option("--nol2", dest="nol2", action='store_true',
|
||||
help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False)
|
||||
|
||||
parser.add_option("--allprob", dest="allprob", action='store_true',
|
||||
help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
|
||||
"embeddings, for which a calibrated classifier is generated, which generates the posteriors", default=False)
|
||||
|
||||
parser.add_option("--feat-weight", dest="feat_weight",
|
||||
help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')
|
||||
|
||||
parser.add_option("-w", "--we-path", dest="we_path",
|
||||
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
|
||||
|
||||
|
|
@@ -46,66 +48,61 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
|
|||
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
|
||||
default=300)
|
||||
|
||||
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
|
||||
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
|
||||
# " If set to 0 it will automatically search for the best number of components", default=300)
|
||||
|
||||
# parser.add_option("-a", dest="post_pca",
|
||||
# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
|
||||
# "embedding space", default=False)
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear'):
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
|
||||
|
||||
|
||||
def get_params(dense=False):
|
||||
if not op.optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf' if dense else 'linear'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
#######################################################################################################################
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
(op, args) = parser.parse_args()
|
||||
|
||||
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
|
||||
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
|
||||
dataset = args[0]
|
||||
assert exists(dataset), 'Unable to find file '+str(dataset)
|
||||
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
|
||||
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
||||
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
|
||||
l2 = not op.nol2
|
||||
|
||||
dataset_file = os.path.basename(op.dataset)
|
||||
dataset_file = os.path.basename(dataset)
|
||||
|
||||
results = PolylingualClassificationResults(op.output)
|
||||
allprob='Prob' if op.allprob else ''
|
||||
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
|
||||
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}'
|
||||
print(f'{result_id}')
|
||||
|
||||
data = MultilingualDataset.load(op.dataset)
|
||||
data = MultilingualDataset.load(dataset)
|
||||
data.show_dimensions()
|
||||
|
||||
lXtr, lytr = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||
|
||||
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
|
||||
|
||||
print(f'{result_id}')
|
||||
|
||||
# text preprocessing
|
||||
tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
||||
|
||||
# document embedding modules
|
||||
doc_embedder = DocEmbedderList()
|
||||
if op.probs:
|
||||
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
|
||||
# feature weighting (for word embeddings average)
|
||||
feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
|
||||
|
||||
# # document embedding modules
|
||||
doc_embedder = DocEmbedderList(aggregation='concat')
|
||||
if op.posteriors:
|
||||
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
|
||||
if op.supervised:
|
||||
doc_embedder.append(WordClassEmbedder(max_label_space=op.max_labels_S))
|
||||
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting)
|
||||
if op.allprob:
|
||||
wce = FeatureSet2Posteriors(wce, l2=l2)
|
||||
doc_embedder.append(wce)
|
||||
if op.pretrained:
|
||||
doc_embedder.append(MuseEmbedder(op.we_path))
|
||||
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting)
|
||||
if op.allprob:
|
||||
muse = FeatureSet2Posteriors(muse, l2=l2)
|
||||
doc_embedder.append(muse)
|
||||
|
||||
# metaclassifier
|
||||
meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True))
|
||||
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
|
||||
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters)
|
||||
|
||||
# ensembling the modules
|
||||
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
|
||||
|
|
@ -123,6 +120,6 @@ if __name__ == '__main__':
        print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
        # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
        #                 (config['max_label_space'], classifier.best_components),
        #                 config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
        #                 config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time,
        #                 lang, macrof1, microf1, macrok, microk, '')
    print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

@ -0,0 +1,47 @@
import torch
import torch.nn as nn
from torch.nn import functional as F


def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'):
    pretrained_embeddings = None
    pretrained_length = 0
    if pretrained is not None:
        pretrained_length = pretrained.shape[1]
        assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size'
        pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length)
        pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False)
        # pretrained_embeddings.to(device)

    learnable_embeddings = None
    if learnable_length > 0:
        learnable_embeddings = nn.Embedding(vocab_size, learnable_length)
        # learnable_embeddings.to(device)

    embedding_length = learnable_length + pretrained_length
    assert embedding_length > 0, '0-size embeddings'

    return pretrained_embeddings, learnable_embeddings, embedding_length

def embed(model, input, lang):
    input_list = []
    if model.lpretrained_embeddings[lang]:
        input_list.append(model.lpretrained_embeddings[lang](input))
    if model.llearnable_embeddings[lang]:
        input_list.append(model.llearnable_embeddings[lang](input))
    return torch.cat(tensors=input_list, dim=2)

def embedding_dropout(input, drop_range, p_drop=0.5, training=True):
    if p_drop > 0 and training and drop_range is not None:
        p = p_drop
        drop_from, drop_to = drop_range
        m = drop_to - drop_from     # length of the supervised embedding
        l = input.shape[2]          # total embedding length
        corr = (1 - p)
        input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p)
        input /= (1 - (p * m / l))

    return input

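# Illustration (not part of the original file): a hedged sketch of what embedding_dropout
# does -- dropout is applied only to the slice [drop_from, drop_to) of the embedding
# dimension (e.g., the supervised/WCE block), and the whole tensor is rescaled so that its
# expected magnitude is preserved. All concrete sizes below are assumptions.
def _embedding_dropout_demo():
    x = torch.randn(2, 5, 10)   # (batch, tokens, embedding); the last 4 dims play the role of the WCE block
    y = embedding_dropout(x.clone(), drop_range=(6, 10), p_drop=0.5, training=True)
    return y.shape              # torch.Size([2, 5, 10])
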
@ -0,0 +1,96 @@
# taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
import torch
import torch.nn as nn
from torch.autograd import Variable
from models.helpers import *


class RNNMultilingualClassifier(nn.Module):

    def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None,
                 drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False):

        super(RNNMultilingualClassifier, self).__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.drop_embedding_range = drop_embedding_range
        self.drop_embedding_prop = drop_embedding_prop
        self.post_probabilities = post_probabilities
        assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'

        self.lpretrained_embeddings = nn.ModuleDict()
        self.llearnable_embeddings = nn.ModuleDict()
        self.embedding_length = None
        self.langs = sorted(lvocab_size.keys())
        self.only_post = only_post

        self.n_layers = 1
        self.n_directions = 1

        self.dropout = nn.Dropout(0.2)

        lstm_out = 256
        ff1 = 512
        ff2 = 256

        lpretrained_embeddings = {}
        llearnable_embeddings = {}
        if only_post==False:
            for l in self.langs:
                pretrained = lpretrained[l] if lpretrained else None
                pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
                    pretrained, lvocab_size[l], learnable_length
                )
                lpretrained_embeddings[l] = pretrained_embeddings
                llearnable_embeddings[l] = learnable_embeddings
                self.embedding_length = embedding_length

            # self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
            self.rnn = nn.GRU(self.embedding_length, hidden_size)
            self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
            self.lpretrained_embeddings.update(lpretrained_embeddings)
            self.llearnable_embeddings.update(llearnable_embeddings)

            self.linear1 = nn.Linear(lstm_out, ff1)
            self.linear2 = nn.Linear(ff1, ff2)

        if only_post:
            self.label = nn.Linear(output_size, output_size)
        elif post_probabilities:
            self.label = nn.Linear(ff2+output_size, output_size)
        else:
            self.label = nn.Linear(ff2, output_size)

    def forward(self, input, post, lang):
        if self.only_post:
            doc_embedding = post
        else:
            doc_embedding = self.transform(input, lang)
            if self.post_probabilities:
                doc_embedding = torch.cat([doc_embedding, post], dim=1)

        logits = self.label(doc_embedding)
        return logits

    def transform(self, input, lang):
        batch_size = input.shape[0]
        input = embed(self, input, lang)
        input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
                                  training=self.training)
        input = input.permute(1, 0, 2)
        h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
        # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
        # output, (_, _) = self.lstm(input, (h_0, c_0))
        output, _ = self.rnn(input, h_0)
        output = output[-1, :, :]
        output = F.relu(self.linear0(output))
        output = self.dropout(F.relu(self.linear1(output)))
        output = self.dropout(F.relu(self.linear2(output)))
        return output

    def finetune_pretrained(self):
        for l in self.langs:
            self.lpretrained_embeddings[l].requires_grad = True
            self.lpretrained_embeddings[l].weight.requires_grad = True

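# Illustration (not part of the original file): a hedged sketch of how the classifier might
# be instantiated for two languages, 10 target classes, no pretrained vectors, and 50
# learnable dimensions per token; all concrete numbers are assumptions.
def _build_toy_model():
    lvocab_size = {'en': 5000, 'it': 4000}
    return RNNMultilingualClassifier(output_size=10, hidden_size=128, lvocab_size=lvocab_size,
                                     learnable_length=50, lpretrained=None, post_probabilities=False)
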
@ -2,15 +2,24 @@ import numpy as np
class StandardizeTransformer:

    def __init__(self, axis=0):
    def __init__(self, axis=0, range=None):
        assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice'
        self.axis = axis
        self.yetfit=False
        self.yetfit = False
        self.range = range

    def fit(self, X):
        print('fitting Standardizer')
        std = np.std(X, axis=self.axis, ddof=1)
        self.std = np.clip(std, 1e-5, None)
        self.mean = np.mean(X, axis=self.axis)
        if self.range is not None:
            ones = np.ones_like(self.std)
            zeros = np.zeros_like(self.mean)
            ones[self.range] = self.std[self.range]
            zeros[self.range] = self.mean[self.range]
            self.std = ones
            self.mean = zeros
        self.yetfit = True
        print('done\n')
        return self

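# Illustration (not part of the original file): with range=slice(10, None) only the columns
# from index 10 onwards get their real mean/std; the leading columns keep mean 0 and std 1,
# so a standard (X - mean) / std transform would leave them untouched (useful, e.g., for
# posterior probabilities already lying in [0, 1]). A hedged sketch:
def _standardizer_demo():
    X = np.random.rand(100, 20)
    return StandardizeTransformer(axis=0, range=slice(10, None)).fit(X)
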
@ -0,0 +1,367 @@
import warnings
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from embeddings.supervised import get_supervised_embeddings
from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from tqdm import tqdm
import torch
from scipy.sparse import vstack, issparse

def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    """
    Indexes a list of string documents, i.e., replaces each word string with its numerical index.
    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] of words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
        because they are contained in a pre-trained embedding set known in advance)
    :param analyzer: the preprocessor in charge of transforming a document string into a chain of string words
    :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
        are not in the original vocab but that are in the known_words
    :return: a list of documents, each converted into a list of numerical indexes
    """
    indexes=[]
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    pbar = tqdm(data, desc='indexing documents')
    for text in pbar:
        words = analyzer(text)
        index = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            else:
                if word in known_words:
                    if word not in out_of_vocabulary:
                        out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:
                    idx = unk_index
                    unk_count += 1
            index.append(idx)
        indexes.append(index)
        knw_count += len(index)
        pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
                             f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
    return indexes

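# Illustration (not part of the original file): indexing two toy documents with a plain
# whitespace analyzer; 'UNK' is the reserved unknown slot, 'muse' is out of the vocabulary
# but "known" (e.g., covered by pretrained embeddings), and 'xyz' is completely unknown.
def _index_demo():
    vocab = {'hello': 0, 'world': 1, 'UNK': 2}
    oov = {}
    idx = index(data=['hello world xyz', 'muse world'], vocab=vocab, known_words={'muse'},
                analyzer=str.split, unk_index=vocab['UNK'], out_of_vocabulary=oov)
    return idx, oov     # ([[0, 1, 2], [3, 1]], {'muse': 3})
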
def define_pad_length(index_list):
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths)+np.std(lengths))


def pad(index_list, pad_index, max_pad_length=None):
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
    return index_list

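# Illustration (not part of the original file): pad() left-pads shorter documents with
# pad_index and truncates longer ones to max_pad_length.
def _pad_demo():
    return pad([[1, 2], [3, 4, 5, 6]], pad_index=0, max_pad_length=3)   # [[0, 1, 2], [3, 4, 5]]
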
class Index:
    def __init__(self, devel_raw, devel_target, test_raw, lang):
        self.lang = lang
        self.devel_raw = devel_raw
        self.devel_target = devel_target
        self.test_raw = test_raw

    def index(self, pretrained_vocabulary, analyzer, vocabulary):
        self.word2index = dict(vocabulary)
        known_words = set(self.word2index.keys())
        if pretrained_vocabulary is not None:
            known_words.update(pretrained_vocabulary)

        self.word2index['UNKTOKEN'] = len(self.word2index)
        self.word2index['PADTOKEN'] = len(self.word2index)
        self.unk_index = self.word2index['UNKTOKEN']
        self.pad_index = self.word2index['PADTOKEN']

        # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
        self.out_of_vocabulary = dict()
        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)

        self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)

        print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')

    def train_val_split(self, val_prop, max_val, seed):
        devel = self.devel_index
        target = self.devel_target
        devel_raw = self.devel_raw

        val_size = int(min(len(devel) * val_prop, max_val))

        self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
            train_test_split(
                devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
            )

        print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')

    def get_word_list(self):
        def extract_word_list(word2index):
            return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]

        word_list = extract_word_list(self.word2index)
        word_list += extract_word_list(self.out_of_vocabulary)
        return word_list

    def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
        print(f'[generating embedding matrix for lang {self.lang}]')

        self.wce_range = None
        embedding_parts = []

        if pretrained is not None:
            print('\t[pretrained-matrix]')
            word_list = self.get_word_list()
            muse_embeddings = pretrained.extract(word_list)
            embedding_parts.append(muse_embeddings)
            del pretrained

        if supervised:
            print('\t[supervised-matrix]')
            F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn')
            num_missing_rows = self.vocabsize - F.shape[0]
            F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
            F = torch.from_numpy(F).float()

            offset = 0
            if embedding_parts:
                offset = embedding_parts[0].shape[1]
            self.wce_range = [offset, offset + F.shape[1]]

            embedding_parts.append(F)

        self.embedding_matrix = torch.cat(embedding_parts, dim=1)

        print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')

def none_dict(langs):
    return {l:None for l in langs}


class MultilingualIndex:
    def __init__(self):  # , add_language_trace=False):
        self.l_index = {}
        self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
        # self.add_language_trace=add_language_trace

    def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
        self.langs = sorted(l_devel_raw.keys())

        # build the vocabularies
        self.l_vectorizer.fit(l_devel_raw)
        l_vocabulary = self.l_vectorizer.vocabulary()
        l_analyzer = self.l_vectorizer.get_analyzer()

        for l in self.langs:
            self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l)
            self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l])

    def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
        for l, index in self.l_index.items():
            index.train_val_split(val_prop, max_val, seed=seed)

    def embedding_matrices(self, lpretrained, supervised):
        lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
        lYtr = self.l_train_target() if supervised else none_dict(self.langs)
        for l, index in self.l_index.items():
            index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
            self.sup_range = index.wce_range

        # experimental... does it make sense to keep track of the language? i.e., to inform the network
        # which language the data came from...
        # if self.add_language_trace and pretrained_embeddings is not None:
        #     print('adding language trace')
        #     lang_trace = torch.zeros(size=(vocabsize, len(self.langs)))
        #     lang_trace[:,i]=1
        #     pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)

    def posterior_probabilities(self, max_training_docs_by_lang=5000):
        # choose a maximum of "max_training_docs_by_lang" documents per language for training the calibrated SVMs
        lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
        lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
        for l in self.langs:
            n_elements = lXtr[l].shape[0]
            if n_elements > max_training_docs_by_lang:
                choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
                lXtr[l] = lXtr[l][choice]
                lYtr[l] = lYtr[l][choice]

        # train the posterior probabilities embedder
        print('[posteriors] training a calibrated SVM')
        learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
        prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
        prob_embedder.fit(lXtr, lYtr)

        # transform the training, validation, and test sets into posterior probabilities
        print('[posteriors] generating posterior probabilities')
        lPtr = prob_embedder.transform(self.get_lXtr())
        lPva = prob_embedder.transform(self.get_lXva())
        lPte = prob_embedder.transform(self.get_lXte())

        print('[posteriors] done')
        return lPtr, lPva, lPte

    def get_lXtr(self):
        if not hasattr(self, 'lXtr'):
            self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()})
        return self.lXtr

    def get_lXva(self):
        if not hasattr(self, 'lXva'):
            self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()})
        return self.lXva

    def get_lXte(self):
        if not hasattr(self, 'lXte'):
            self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
        return self.lXte

    def l_vocabsize(self):
        return {l:index.vocabsize for l,index in self.l_index.items()}

    def l_embeddings(self):
        return {l:index.embedding_matrix for l,index in self.l_index.items()}

    def l_pad(self):
        return {l: index.pad_index for l, index in self.l_index.items()}

    def l_train_index(self):
        return {l: index.train_index for l, index in self.l_index.items()}

    def l_train_target(self):
        return {l: index.train_target for l, index in self.l_index.items()}

    def l_val_index(self):
        return {l: index.val_index for l, index in self.l_index.items()}

    def l_val_target(self):
        return {l: index.val_target for l, index in self.l_index.items()}

    def l_test_index(self):
        return {l: index.test_index for l, index in self.l_index.items()}

    def l_train(self):
        return self.l_train_index(), self.l_train_target()

    def l_val(self):
        return self.l_val_index(), self.l_val_target()

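# Illustration (not part of the original file): a hedged, end-to-end sketch of how
# MultilingualIndex appears meant to be driven. The l_* arguments are language-keyed dicts
# of raw documents/targets and lpretrained maps each language to its MUSE embedding loader;
# all argument names here are assumptions.
def _build_multilingual_index(l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary, lpretrained):
    mindex = MultilingualIndex()
    mindex.index(l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary)
    mindex.train_val_split(val_prop=0.2, max_val=2000, seed=42)
    mindex.embedding_matrices(lpretrained, supervised=True)
    lPtr, lPva, lPte = mindex.posterior_probabilities()
    return mindex, (lPtr, lPva, lPte)
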
class Batch:
    def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
        self.batchsize = batchsize
        self.batches_per_epoch = batches_per_epoch
        self.languages = languages
        self.lpad = lpad
        self.max_pad_length = max_pad_length
        self.init_offset()

    def init_offset(self):
        self.offset = {lang: 0 for lang in self.languages}

    def batchify(self, l_index, l_post, llabels):
        langs = self.languages
        l_num_samples = {l:len(l_index[l]) for l in langs}

        max_samples = max(l_num_samples.values())
        n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
        if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
            n_batches = self.batches_per_epoch

        for b in range(n_batches):
            for lang in langs:
                index, labels = l_index[lang], llabels[lang]
                offset = self.offset[lang]
                if offset >= l_num_samples[lang]:
                    offset = 0
                limit = offset+self.batchsize

                batch_slice = slice(offset, limit)
                batch = index[batch_slice]
                batch_labels = labels[batch_slice].toarray()

                post = None
                if l_post is not None:
                    post = torch.FloatTensor(l_post[lang][batch_slice]).cuda()

                batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)

                batch = torch.LongTensor(batch).cuda()
                target = torch.FloatTensor(batch_labels).cuda()

                self.offset[lang] = limit

                yield batch, post, target, lang

def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500):
    langs = sorted(l_index.keys())
    nsamples = max([len(l_index[l]) for l in langs])
    nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
    for b in range(nbatches):
        for lang in langs:
            index, labels = l_index[lang], llabels[lang]

            if b * batchsize >= len(index):
                continue
            batch = index[b*batchsize:(b+1)*batchsize]
            batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray()
            post = None
            if l_post is not None:
                post = torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda()
            batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length)
            batch = torch.LongTensor(batch)
            target = torch.FloatTensor(batch_labels)
            yield batch.cuda(), post, target.cuda(), lang


def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500):
    nsamples = len(index_list)
    nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
    for b in range(nbatches):
        batch = index_list[b*batchsize:(b+1)*batchsize]
        batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
        batch = torch.LongTensor(batch)
        yield batch.cuda()

def clip_gradient(model, clip_value=1e-1):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)


def predict(logits, classification_type='multilabel'):
    if classification_type == 'multilabel':
        prediction = torch.sigmoid(logits) > 0.5
    elif classification_type == 'singlelabel':
        prediction = torch.argmax(logits, dim=1).view(-1, 1)
    else:
        raise ValueError(f'unknown classification type {classification_type}')

    return prediction.detach().cpu().numpy()


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

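# Illustration (not part of the original file): a hedged sketch of one training epoch driven
# by the Batch generator above. `model` would be, e.g., an RNNMultilingualClassifier,
# `criterion` e.g. nn.BCEWithLogitsLoss(), and the l_train_* dicts language-keyed training
# data; all names here are assumptions, not code from this repository.
def _train_epoch(model, optimizer, criterion, batcher, l_train_index, l_train_post, l_train_target):
    model.train()
    for batch, post, target, lang in batcher.batchify(l_train_index, l_train_post, l_train_target):
        optimizer.zero_grad()
        logits = model(batch, post, lang)
        loss = criterion(logits, target)
        loss.backward()
        clip_gradient(model)
        optimizer.step()
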
@ -0,0 +1,60 @@
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


class CSVLog:

    def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
        self.file = file
        self.autoflush = autoflush
        self.verbose = verbose
        if os.path.exists(file) and not overwrite:
            self.tell('Loading existing file from {}'.format(file))
            self.df = pd.read_csv(file, sep='\t')
            self.columns = sorted(self.df.columns.values.tolist())
        else:
            self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
            assert columns is not None, 'columns cannot be None'
            self.columns = sorted(columns)
            dir = os.path.dirname(self.file)
            if dir and not os.path.exists(dir): os.makedirs(dir)
            self.df = pd.DataFrame(columns=self.columns)
        self.defaults = {}

    def already_calculated(self, **kwargs):
        df = self.df
        if df.shape[0] == 0:
            return False
        if len(kwargs) == 0:
            kwargs = self.defaults
        for key, val in kwargs.items():
            df = df.loc[df[key] == val]
            if df.shape[0] == 0: return False
        return True

    def set_default(self, param, value):
        self.defaults[param] = value

    def add_row(self, **kwargs):
        for key in self.defaults.keys():
            if key not in kwargs:
                kwargs[key] = self.defaults[key]
        columns = sorted(list(kwargs.keys()))
        values = [kwargs[col_i] for col_i in columns]
        s = pd.Series(values, index=self.columns)
        self.df = self.df.append(s, ignore_index=True)
        if self.autoflush: self.flush()
        # self.tell(s.to_string())
        self.tell(kwargs)

    def flush(self):
        self.df.to_csv(self.file, index=False, sep='\t')

    def tell(self, msg):
        if self.verbose: print(msg)

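# Illustration (not part of the original file): a hedged sketch of the intended usage --
# one row per (dataset, language) result, with a default value for the 'method' column.
# All column names and values below are toy assumptions.
def _csvlog_demo(path='./results.csv'):
    log = CSVLog(path, columns=['dataset', 'method', 'lang', 'macrof1'], overwrite=True)
    log.set_default('method', 'funnelling')
    if not log.already_calculated(dataset='toy-corpus', lang='en'):
        log.add_row(dataset='toy-corpus', lang='en', macrof1=0.5)
    return log
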
@ -0,0 +1,53 @@
# adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
import torch
from time import time
from util.file import create_if_not_exist


class EarlyStopping:

    def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
        # set patience to 0 or -1 to avoid stopping, while still keeping track of the best value and model parameters
        self.patience_limit = patience
        self.patience = patience
        self.verbose = verbose
        self.best_score = None
        self.best_epoch = None
        self.stop_time = None
        self.checkpoint = checkpoint
        self.model = model
        self.STOP = False

    def __call__(self, watch_score, epoch):

        if self.STOP: return  # done

        if self.best_score is None or watch_score >= self.best_score:
            self.best_score = watch_score
            self.best_epoch = epoch
            self.stop_time = time()
            if self.checkpoint:
                self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
                torch.save(self.model, self.checkpoint)
            else:
                self.print(f'[early-stop] improved')
            self.patience = self.patience_limit
        else:
            self.patience -= 1
            if self.patience == 0:
                self.STOP = True
                self.print(f'[early-stop] patience exhausted')
            else:
                if self.patience > 0:  # if negative, then early-stop is ignored
                    self.print(f'[early-stop] patience={self.patience}')

    def reinit_counter(self):
        self.STOP = False
        self.patience = self.patience_limit

    def restore_checkpoint(self):
        return torch.load(self.checkpoint)

    def print(self, msg):
        if self.verbose:
            print(msg)

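# Illustration (not part of the original file): a hedged sketch of the early-stop protocol
# inside a training loop; evaluate_fn is a placeholder returning the score to maximize
# (e.g., validation macro-F1), and all names here are assumptions.
def _fit_with_early_stop(model, evaluate_fn, nepochs=100):
    early_stop = EarlyStopping(model, patience=20, checkpoint='./checkpoint.pt')
    for epoch in range(1, nepochs + 1):
        early_stop(evaluate_fn(model), epoch)
        if early_stop.STOP:
            break
    return early_stop.restore_checkpoint()
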
@ -44,7 +44,7 @@ def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, retu
    tinit = time.time()
    print('prediction for test')
    assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
    n_jobs = polylingual_method.n_jobs
    n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1

    if predictor is None:
        predictor = polylingual_method.predict

@ -2,6 +2,7 @@ from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
# from sklearn.externals.six.moves import urllib
import urllib
from pathlib import Path


def download_file(url, archive_filename):

@ -36,4 +37,8 @@ def makedirs_if_not_exist(path):
def create_if_not_exist(path):
    if not exists(path): makedirs(path)

def get_parent_name(path):
    return Path(path).parent

def get_file_name(path):
    return Path(path).name