huge refactoring, deep learning, and other stuff

Alejandro Moreo Fernandez 2020-01-30 17:08:52 +01:00
parent d249c4801f
commit 22b7ea7e66
16 changed files with 1474 additions and 119 deletions


@ -13,6 +13,7 @@ from scipy.sparse import issparse
import itertools
from tqdm import tqdm
import re
from scipy.sparse import csr_matrix
class MultilingualDataset:
@ -68,27 +69,33 @@ class MultilingualDataset:
if languages is not None:
self.languages_view = languages
def training(self):
return self.lXtr(), self.lYtr()
def training(self, mask_numbers=False, target_as_csr=False):
return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)
def test(self):
return self.lXte(), self.lYte()
def test(self, mask_numbers=False, target_as_csr=False):
return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)
def lXtr(self):
return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if
lang in self.langs()}
# return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lXtr(self, mask_numbers=False):
proc = lambda x:_mask_numbers(x) if mask_numbers else x
# return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lXte(self):
return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if
lang in self.langs()}
# return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
def lXte(self, mask_numbers=False):
proc = lambda x: _mask_numbers(x) if mask_numbers else x
# return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}
return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
def lYtr(self):
return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lYtr(self, as_csr=False):
lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
if as_csr:
lY = {l:csr_matrix(Y) for l,Y in lY.items()}
return lY
def lYte(self):
return {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
def lYte(self, as_csr=False):
lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
if as_csr:
lY = {l:csr_matrix(Y) for l,Y in lY.items()}
return lY
def cat_view(self, Y):
if hasattr(self, 'categories_view'):
@ -107,10 +114,11 @@ class MultilingualDataset:
return self.lYtr()[self.langs()[0]].shape[1]
def show_dimensions(self):
def shape(X):
return X.shape if hasattr(X, 'shape') else len(X)
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
if lang not in self.langs(): continue
if hasattr(Xtr, 'shape') and hasattr(Xte, 'shape'):
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, Xtr.shape, self.cat_view(Ytr).shape, Xte.shape, self.cat_view(Yte).shape))
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape))
def show_category_prevalences(self):
#pass
@ -135,12 +143,24 @@ class MultilingualDataset:
def set_labels(self, labels):
self.labels = labels
def mask_numbers(self, data, number_mask='numbermask'):
mask = re.compile(r'\b[0-9][0-9.,-]*\b')
masked = []
for text in tqdm(data, desc='masking numbers'):
masked.append(mask.sub(number_mask, text))
return masked
def _mask_numbers(data):
mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b')
mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b')
mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b')
mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b')
mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b')
masked = []
for text in tqdm(data, desc='masking numbers'):
text = ' ' + text
text = mask_moredigit.sub(' MoreDigitMask', text)
text = mask_4digit.sub(' FourDigitMask', text)
text = mask_3digit.sub(' ThreeDigitMask', text)
text = mask_2digit.sub(' TwoDigitMask', text)
text = mask_1digit.sub(' OneDigitMask', text)
masked.append(text.replace('.','').replace(',','').strip())
return masked
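For illustration, a minimal sketch (not part of the diff) of how the new masking helper and the extended training()/test() signatures are meant to be used; the sample strings are made up and the pickle path is the one used further below:
from dataset_builder import MultilingualDataset, _mask_numbers
# number masking replaces digit runs with length-coded tokens
print(_mask_numbers(['revenue grew 12.5% to 1234 million in 2005', 'page 7 of 120']))
# -> ['revenue grew TwoDigitMask% to FourDigitMask million in FourDigitMask',
#     'page OneDigitMask of ThreeDigitMask']
# the new keyword arguments apply the same masking and return the label matrices as csr_matrix
data = MultilingualDataset.load('../rcv2/rcv1-2_doclist_full_processed.pickle')
lXtr, lYtr = data.training(mask_numbers=True, target_as_csr=True)
lXte, lYte = data.test(mask_numbers=True, target_as_csr=True)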
# ----------------------------------------------------------------------------------------------------------------------
@ -541,12 +561,120 @@ def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home
build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)
# ----------------------------------------------------------------------------------------------------------------------
# Methods to generate full RCV and JRC datasets
# ----------------------------------------------------------------------------------------------------------------------
def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs):
print('fetching the datasets')
rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test')
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
filter_by_categories(rcv1_train_documents, labels_rcv2)
filter_by_categories(rcv1_test_documents, labels_rcv2)
filter_by_categories(rcv2_documents, labels_rcv1)
label_names = get_active_labels(rcv1_train_documents + rcv2_documents)
print('Active labels in RCV1/2 {}'.format(len(label_names)))
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names)))
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents
lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs}
def get_ids(doclist):
return frozenset([d.id for d in doclist])
tr_ids = {'en': get_ids(rcv1_train_documents)}
te_ids = {'en': get_ids(rcv1_test_documents)}
for lang in langs:
if lang == 'en': continue
tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3)
dataset = MultilingualDataset()
dataset.dataset_name = 'RCV1/2-full'
for lang in langs:
print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} test documents')
analyzer = CountVectorizer(
strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
).build_analyzer()
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]])
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
dataset.save(outpath)
def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):
print('fetching the datasets')
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
training_docs, label_names = fetch_jrcacquis(
langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
)
test_docs, _ = fetch_jrcacquis(
langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
)
def _group_by_lang(doc_list, langs):
return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}
training_docs = _group_by_lang(training_docs, langs)
test_docs = _group_by_lang(test_docs, langs)
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
dataset = MultilingualDataset()
dataset.dataset_name = 'JRC-Acquis-full'
for lang in langs:
analyzer = CountVectorizer(
strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
).build_analyzer()
Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
dataset.save(outpath)
#-----------------------------------------------------------------------------------------------------------------------
# MAIN BUILDER
#-----------------------------------------------------------------------------------------------------------------------
if __name__=='__main__':
import sys
RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
RCV2_PATH = '../Datasets/RCV2'
JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
# full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
sys.exit(0)
# datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle'
# data = MultilingualDataset.load(datasetpath)
# data.dataset_name='JRC-Acquis-full'#'RCV1/2-full'
# for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
# (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
# data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
# data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle')
# sys.exit(0)
assert len(sys.argv) == 5, "wrong number of arguments; required: " \
"<JRC_PATH> <RCV1_PATH> <RCV2_PATH> <WIKI_PATH> "


@ -204,6 +204,7 @@ class FastTextMUSE(PretrainedEmbeddings):
def extract(self, words):
source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
extraction = torch.zeros((len(words), self.dim()))
# extraction = torch.empty(len(words), self.dim()).normal_(0, 1)
extraction[source_idx] = self.embed.vectors[target_idx]
return extraction


@ -254,7 +254,7 @@ class NaivePolylingualClassifier:
return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].transform)(lX[lang]) for lang in langs)
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
return {lang: scores[i] for i, lang in enumerate(langs)}
def best_params(self):
@ -297,7 +297,7 @@ class MonolingualClassifier:
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
error_score=0, verbose=10)
print('fitting:', self.model)
print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}')
self.model.fit(X, y)
if isinstance(self.model, GridSearchCV):
self.best_params_ = self.model.best_params_


@ -1,6 +1,8 @@
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
#from data.text_preprocessor import NLTKStemTokenizer
from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain, \
gain_ratio, gss
from embeddings.embeddings import FastTextMUSE
from embeddings.supervised import supervised_embeddings_tfidf, zscores
from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling
@ -10,6 +12,9 @@ from joblib import Parallel, delayed
from scipy.sparse import issparse, vstack, hstack
from transformers.StandardizeTransformer import StandardizeTransformer
from util.SIF_embed import remove_pc
from sklearn.preprocessing import normalize
from sklearn.svm import SVC
from scipy.sparse import csr_matrix
# ------------------------------------------------------------------
# Data Processing
@ -39,20 +44,65 @@ class TfidfVectorizerMultilingual:
else:
return self.vectorizer[l].vocabulary_
def get_analyzer(self, l=None):
if l is None:
return {l:self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
class FeatureWeight:
def __init__(self, weight='tfidf', agg='mean'):
assert weight in ['tfidf', 'pmi', 'ig'] or callable(weight), 'weight should be "tfidf", "pmi", "ig", or a callable function'
assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"'
self.weight = weight
self.agg = agg
self.fitted = False
if weight=='pmi':
self.weight = pointwise_mutual_information
elif weight == 'ig':
self.weight = information_gain
def fit(self, lX, ly):
if not self.fitted:
if self.weight == 'tfidf':
self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()}
else:
self.lF = {}
for l in lX.keys():
X, y = lX[l], ly[l]
print(f'getting supervised cell-matrix lang {l}')
tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight)
if self.agg == 'max':
F = tsr_matrix.max(axis=0)
elif self.agg == 'mean':
F = tsr_matrix.mean(axis=0)
self.lF[l] = F
self.fitted = True
return self
def transform(self, lX):
return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()}
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
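A rough usage sketch of FeatureWeight (not in the diff); lXtr/lytr/lXte stand for the usual per-language tf-idf matrices and label matrices:
# weight='tfidf' keeps unit weights (values unchanged); 'pmi' and 'ig' compute a supervised
# score per term (aggregated over categories with mean or max) and rescale each column by it
feat_weighting = FeatureWeight(weight='ig', agg='mean')
lXtr_weighted = feat_weighting.fit_transform(lXtr, lytr)   # fits once, per language
lXte_weighted = feat_weighting.transform(lXte)             # reuses the fitted per-language weights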
# ------------------------------------------------------------------
# Document Embeddings
# ------------------------------------------------------------------
class PosteriorProbabilitiesEmbedder:
def __init__(self, first_tier_learner, first_tier_parameters,
n_jobs=-1):
def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1):
self.fist_tier_learner = first_tier_learner
self.fist_tier_parameters = first_tier_parameters
self.l2 = l2
self.n_jobs = n_jobs
self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner,
self.fist_tier_parameters,
n_jobs=n_jobs)
self.doc_projector = NaivePolylingualClassifier(
self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs
)
def fit(self, lX, lY, lV=None):
print('fitting the projectors... {}'.format(lX.keys()))
@ -60,8 +110,8 @@ class PosteriorProbabilitiesEmbedder:
return self
def transform(self, lX):
print('projecting the documents')
lZ = self.doc_projector.predict_proba(lX)
lZ = self.predict_proba(lX)
lZ = _normalize(lZ, self.l2)
return lZ
def fit_transform(self, lX, ly=None, lV=None):
@ -70,28 +120,41 @@ class PosteriorProbabilitiesEmbedder:
def best_params(self):
return self.doc_projector.best_params()
def predict(self, lX, ly=None):
return self.doc_projector.predict(lX)
def predict_proba(self, lX, ly=None):
print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} documents')
return self.doc_projector.predict_proba(lX)
class MuseEmbedder:
def __init__(self, path, n_jobs=-1):
def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()):
self.path=path
self.lV = lV
self.l2 = l2
self.n_jobs = n_jobs
self.featureweight = featureweight
def fit(self, lX, ly, lV):
def fit(self, lX, ly, lV=None):
assert lV is not None or self.lV is not None, 'lV not specified'
self.langs = sorted(lX.keys())
MUSE = Parallel(n_jobs=self.n_jobs)(
delayed(FastTextMUSE)(self.path, lang) for lang in self.langs
)
self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs)
lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs}
self.MUSE = {l:MUSE[i].extract(lWordList[l]).numpy() for i,l in enumerate(self.langs)}
self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE.items()}
self.featureweight.fit(lX, ly)
return self
def transform(self, lX):
MUSE = self.MUSE
lX = self.featureweight.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs
)
return {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)}
lMuse = _normalize(lMuse, self.l2)
return lMuse
def fit_transform(self, lX, ly, lV):
return self.fit(lX, ly, lV).transform(lX)
@ -102,9 +165,11 @@ class MuseEmbedder:
class WordClassEmbedder:
def __init__(self, n_jobs=-1, max_label_space=300):
def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()):
self.n_jobs = n_jobs
self.l2 = l2
self.max_label_space=max_label_space
self.featureweight = featureweight
def fit(self, lX, ly, lV=None):
self.langs = sorted(lX.keys())
@ -112,53 +177,43 @@ class WordClassEmbedder:
delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs
)
self.lWCE = {l:WCE[i] for i,l in enumerate(self.langs)}
self.featureweight.fit(lX, ly)
return self
def transform(self, lX):
lWCE = self.lWCE
lX = self.featureweight.transform(lX)
XdotWCE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], lWCE[lang]) for lang in self.langs
delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs
)
return {l: XdotWCE[i] for i, l in enumerate(self.langs)}
lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)}
lwce = _normalize(lwce, self.l2)
return lwce
def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly).transform(lX)
def word_class_embedding_matrix(X, Y, max_label_space=300):
print('computing supervised embeddings...')
WCE = supervised_embeddings_tfidf(X, Y)
WCE = zscores(WCE, axis=0)
nC = Y.shape[1]
if nC > max_label_space:
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
f'Applying PCA(n_components={max_label_space})')
pca = PCA(n_components=max_label_space)
WCE = pca.fit(WCE).transform(WCE)
return WCE
def XdotM(X,M):
# return X.dot(M)
E = X.dot(M)
E = remove_pc(E, npc=1)
return E
class DocEmbedderList:
def __init__(self, *embedder_list):
def __init__(self, *embedder_list, aggregation='concat'):
assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"'
if len(embedder_list)==0: embedder_list=[]
self.embedders = embedder_list
self.aggregation = aggregation
def fit(self, lX, ly, lV):
def fit(self, lX, ly, lV=None):
for transformer in self.embedders:
transformer.fit(lX,ly,lV)
return self
def transform(self, lX):
if self.aggregation == 'concat':
return self.transform_concat(lX)
elif self.aggregation == 'mean':
return self.transform_mean(lX)
def transform_concat(self, lX):
if len(self.embedders)==1:
return self.embedders[0].transform(lX)
@ -176,8 +231,27 @@ class DocEmbedderList:
hstacker = hstack if some_sparse else np.hstack
return {l:hstacker(lZparts[l]) for l in langs}
def transform_mean(self, lX):
if len(self.embedders)==1:
return self.embedders[0].transform(lX)
langs = sorted(lX.keys())
lZparts = {l: None for l in langs}
for transformer in self.embedders:
lZ = transformer.transform(lX)
for l in langs:
Z = lZ[l]
if lZparts[l] is None:
lZparts[l] = Z
else:
lZparts[l] += Z
n_transformers = len(self.embedders)
return {l:lZparts[l] / n_transformers for l in langs}
def fit_transform(self, lX, ly, lV):
def fit_transform(self, lX, ly, lV=None):
return self.fit(lX, ly, lV).transform(lX)
def best_params(self):
@ -186,20 +260,55 @@ class DocEmbedderList:
def append(self, embedder):
self.embedders.append(embedder)
class FeatureSet2Posteriors:
def __init__(self, transformer, l2=True, n_jobs=-1):
self.transformer = transformer
self.l2=l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
def fit(self, lX, ly, lV=None):
if lV is None and hasattr(self.transformer, 'lV'):
lV = self.transformer.lV
lZ = self.transformer.fit_transform(lX, ly, lV)
self.prob_classifier.fit(lZ, ly)
return self
def transform(self, lX):
lP = self.predict_proba(lX)
lP = _normalize(lP, self.l2)
return lP
def fit_transform(self, lX, ly, lV):
return self.fit(lX, ly, lV).transform(lX)
def predict(self, lX, ly=None):
lZ = self.transformer.transform(lX)
return self.prob_classifier.predict(lZ)
def predict_proba(self, lX, ly=None):
lZ = self.transformer.transform(lX)
return self.prob_classifier.predict_proba(lZ)
# ------------------------------------------------------------------
# Meta-Classifier
# ------------------------------------------------------------------
class MetaClassifier:
def __init__(self, meta_learner, meta_parameters, n_jobs=-1):
def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
self.n_jobs=n_jobs
self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
self.standardize_range = standardize_range
def fit(self, lZ, ly):
tinit = time.time()
Z, y = self.stack(lZ, ly)
self.standardizer = StandardizeTransformer()
self.standardizer = StandardizeTransformer(range=self.standardize_range)
Z = self.standardizer.fit_transform(Z)
print('fitting the Z-space of shape={}'.format(Z.shape))
self.model.fit(Z, y)
self.time = time.time() - tinit
@ -217,6 +326,10 @@ class MetaClassifier:
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
def predict_proba(self, lZ, ly=None):
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)
def best_params(self):
return self.model.best_params()
@ -249,3 +362,65 @@ class Funnelling:
return {'1st-tier':self.first_tier.best_params(),
'meta':self.meta.best_params()}
class Voting:
def __init__(self, *prob_classifiers):
assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic'
self.prob_classifiers = prob_classifiers
def fit(self, lX, ly, lV=None):
for classifier in self.prob_classifiers:
classifier.fit(lX, ly, lV)
def predict(self, lX, ly=None):
lP = {l:[] for l in lX.keys()}
for classifier in self.prob_classifiers:
lPi = classifier.predict_proba(lX)
for l in lX.keys():
lP[l].append(lPi[l])
lP = {l:np.stack(Plist).mean(axis=0) for l,Plist in lP.items()}
ly = {l:P>0.5 for l,P in lP.items()}
return ly
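A toy illustration (not from the diff) of the decision rule Voting applies per language: the posteriors of the individual classifiers are averaged and thresholded at 0.5.
import numpy as np
P1 = np.array([[0.9, 0.2], [0.4, 0.6]])   # posteriors of classifier 1 (2 docs, 2 classes)
P2 = np.array([[0.7, 0.4], [0.2, 0.8]])   # posteriors of classifier 2
P = np.stack([P1, P2]).mean(axis=0)       # [[0.8, 0.3], [0.3, 0.7]]
y_pred = P > 0.5                          # [[True, False], [False, True]]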
# ------------------------------------------------------------------------------
# HELPERS
# ------------------------------------------------------------------------------
def load_muse_embeddings(we_path, langs, n_jobs=-1):
MUSE = Parallel(n_jobs=n_jobs)(
delayed(FastTextMUSE)(we_path, lang) for lang in langs
)
return {l: MUSE[i] for i, l in enumerate(langs)}
def word_class_embedding_matrix(X, Y, max_label_space=300):
print('computing supervised embeddings...')
WCE = supervised_embeddings_tfidf(X, Y)
WCE = zscores(WCE, axis=0)
nC = Y.shape[1]
if nC > max_label_space:
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
f'Applying PCA(n_components={max_label_space})')
pca = PCA(n_components=max_label_space)
WCE = pca.fit(WCE).transform(WCE)
return WCE
def XdotM(X,M):
# return X.dot(M)
# print(f'X={X.shape}, M={M.shape}')
E = X.dot(M)
E = remove_pc(E, npc=1)
return E
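A shape-level sketch of what XdotM computes (sizes are hypothetical; remove_pc is the SIF-style removal of the first principal component imported from util.SIF_embed):
import numpy as np
from scipy import sparse
X = sparse.random(1000, 25000, density=0.001, format='csr')   # documents x vocabulary (weighted tf-idf)
M = np.random.randn(25000, 300)                                # vocabulary x embedding dim (MUSE or WCE)
E = XdotM(X, M)   # (1000, 300): per-document weighted sum of word embeddings, 1st principal component removed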
def _normalize(lX, l2=True):
return {l: normalize(X) for l, X in lX.items()} if l2 else lX

src/main_deep_learning.py (new executable file, 290 lines)

@ -0,0 +1,290 @@
import argparse
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
from dataset_builder import MultilingualDataset
from learning.transformers import load_muse_embeddings
from models.lstm_class import RNNMultilingualClassifier
from util.csv_log import CSVLog
from util.early_stop import EarlyStopping
from util.common import *
from util.file import create_if_not_exist
from time import time
from embeddings.pretrained import *
from os.path import join
from tqdm import tqdm
from util.evaluation import evaluate
from util.file import get_file_name
allowed_nets = {'rnn'}
# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested
def init_Net(nC, multilingual_index, xavier_uniform=True):
net=opt.net
assert net in allowed_nets, f'{net} not supported, valid ones are={allowed_nets}'
# instantiate the required net
if net=='rnn':
only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised)
if only_post:
print('working in ONLY-POST mode')
model = RNNMultilingualClassifier(
output_size=nC,
hidden_size=opt.hidden,
lvocab_size=multilingual_index.l_vocabsize(),
learnable_length=opt.learnable,
lpretrained=multilingual_index.l_embeddings(),
drop_embedding_range=multilingual_index.sup_range,
drop_embedding_prop=opt.sup_drop,
post_probabilities=opt.posteriors,
only_post=only_post
)
# weight initialization
if xavier_uniform:
for p in model.parameters():
if p.dim() > 1 and p.requires_grad:
nn.init.xavier_uniform_(p)
if opt.tunable:
# this has to be performed *after* Xavier initialization is done,
# otherwise the pretrained embedding parameters would be overwritten by the random initialization
model.finetune_pretrained()
return model.cuda()
def set_method_name():
method_name = f'{opt.net}(H{opt.hidden})'
if opt.pretrained:
method_name += f'-Muse'
if opt.supervised:
method_name += f'-WCE'
if opt.posteriors:
method_name += f'-Posteriors'
if (opt.pretrained or opt.supervised) and opt.tunable:
method_name+='-(trainable)'
else:
method_name += '-(static)'
if opt.learnable > 0:
method_name += f'-Learnable{opt.learnable}'
return method_name
def init_optimizer(model, lr):
return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay)
def init_logfile(method_name, opt):
logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse'])
logfile.set_default('dataset', opt.dataset)
logfile.set_default('run', opt.seed)
logfile.set_default('method', method_name)
assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated'
return logfile
# loads the MUSE embeddings if requested, or returns per-language None placeholders otherwise
def load_pretrained_embeddings(we_path, langs):
lpretrained = lpretrained_vocabulary = none_dict(langs)
if opt.pretrained:
lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1)
lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs}
return lpretrained, lpretrained_vocabulary
# ----------------------------------------------------------------------------------------------------------------------
def main():
method_name = set_method_name()
logfile = init_logfile(method_name, opt)
# Loading the dataset
data = MultilingualDataset.load(opt.dataset)
# data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it'])
data.show_dimensions()
langs = data.langs()
l_devel_raw, l_devel_target = data.training(target_as_csr=True)
l_test_raw, l_test_target = data.test(target_as_csr=True)
# Loading the MUSE pretrained embeddings (only if requested)
lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs)
# Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs
multilingual_index = MultilingualIndex()
multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary)
multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed)
multilingual_index.embedding_matrices(lpretrained, opt.supervised)
if opt.posteriors:
lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs)
else:
lPtr, lPva, lPte = None, None, None
# Model initialization
model = init_Net(data.num_categories(), multilingual_index)
optim = init_optimizer(model, lr=opt.lr)
criterion = torch.nn.BCEWithLogitsLoss().cuda()
lr_scheduler = StepLR(optim, step_size=25, gamma=0.5)
batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad())
batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad())
tinit = time()
create_if_not_exist(opt.checkpoint_dir)
early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}')
l_train_index, l_train_target = multilingual_index.l_train()
l_val_index, l_val_target = multilingual_index.l_val()
l_test_index = multilingual_index.l_test_index()
print('-'*80)
print('Start training')
for epoch in range(1, opt.nepochs + 1):
train(model, batcher_train, l_train_index, lPtr, l_train_target, tinit, logfile, criterion, optim, epoch, method_name)
lr_scheduler.step() # reduces the learning rate
# validation
macrof1 = test(model, batcher_eval, l_val_index, lPva, l_val_target, tinit, epoch, logfile, criterion, 'va')
early_stop(macrof1, epoch)
if opt.test_each>0:
if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch<opt.nepochs):
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
if early_stop.STOP:
print('[early-stop] STOP')
if not opt.plotmode: # with plotmode activated, early-stop is ignored
break
# training is over
# restores the best model according to the Mf1 of the validation set (only when plotmode==False)
# stoptime = early_stop.stop_time - tinit
# stopepoch = early_stop.best_epoch
# logfile.add_row(epoch=stopepoch, measure=f'early-stop', value=early_stop.best_score, timelapse=stoptime)
if opt.plotmode==False:
print('-' * 80)
print('Training over. Performing final evaluation')
model = early_stop.restore_checkpoint()
if opt.val_epochs>0:
print(f'running last {opt.val_epochs} training epochs on the validation set')
for val_epoch in range(1, opt.val_epochs + 1):
batcher_train.init_offset()
train(model, batcher_train, l_val_index, lPva, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name)
# final test
print('Training complete: testing')
test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te')
def get_lr(optimizer):
for param_group in optimizer.param_groups:
return param_group['lr']
def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name):
loss_history = []
model.train()
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)):
optim.zero_grad()
loss = criterion(model(batch, post, lang), target)
loss.backward()
clip_gradient(model)
optim.step()
loss_history.append(loss.item())
if idx % opt.log_interval == 0:
interval_loss = np.mean(loss_history[-opt.log_interval:])
print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}')
mean_loss = np.mean(interval_loss)
logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit)
return mean_loss
def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix):
model.eval()
langs = sorted(ltest_index.keys())
predictions = {l:[] for l in langs}
yte_stacked = {l:[] for l in langs}
batcher.init_offset()
for batch, post, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lyte), desc='evaluation: '):
logits = model(batch, post, lang)
loss = criterion(logits, target).item()
prediction = predict(logits)
predictions[lang].append(prediction)
yte_stacked[lang].append(target.detach().cpu().numpy())
ly = {l:np.vstack(yte_stacked[l]) for l in langs}
ly_ = {l:np.vstack(predictions[l]) for l in langs}
l_eval = evaluate(ly, ly_)
metrics = []
for lang in langs:
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
if measure_prefix=='te':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0)
print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]')
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend)
# logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend)
return Mf1
# ----------------------------------------------------------------------------------------------------------------------
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings')
parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset')
parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 50)')
parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)')
parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)')
parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for early-stop (default: 10)')
parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode, executes a long run in order '
'to generate enough data to produce trend plots (requires --test-each > 0); this mode is '
'meant for producing plots and does not perform a final evaluation on the test set')
parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)')
parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)')
parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)')
parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)')
parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)')
parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by '
'language used to train the calibrated SVMs (only used if --posteriors is active)')
parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status')
parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file')
# parser.add_argument('--pickle-dir', type=str, default='../pickles', metavar='str', help=f'if set, specifies the path where to '
# f'save/load the dataset pickled (set to None if you prefer not to retain the pickle file)')
parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)')
parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints')
parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}')
parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings')
parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings')
parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings')
parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)')
parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the '
'validation set once training is over (default 1)')
parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str',
help=f'path to MUSE pretrained embeddings')
parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='maximum dimension allowed for the '
'feature-label embedding; if the number of labels is larger, PCA with this number of components '
'is applied (default 300)')
parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run')
parser.add_argument('--tunable', action='store_true', default=False,
help='pretrained embeddings are tunable from the beginning (default False, i.e., static)')
opt = parser.parse_args()
assert torch.cuda.is_available(), 'CUDA not available'
assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0'
# if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle')
torch.manual_seed(opt.seed)
main()
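As a usage example (assuming a CUDA machine and MUSE embeddings under ../embeddings), a run combining all three document views might be launched as:
python src/main_deep_learning.py ../rcv2/rcv1-2_doclist_full_processed.pickle --pretrained --supervised --posteriors --tunable --hidden 512 --log-file ../log/log.csv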


@ -0,0 +1,127 @@
import os
from dataset_builder import MultilingualDataset
# from learning.learners import *
from learning.learners import FunnellingMultimodal
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from sklearn.svm import SVC
from util.util import get_learner, get_params
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-P", "--probs", dest="probs", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False)
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
parser.add_option("-s", "--set_c", dest="set_c",type=float,
help="Set the C parameter", default=1)
parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
help="Optimize hyperparameters", default=False)
parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
help="Number of parallel jobs (default is -1, all)", default=-1)
parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
# " If set to 0 it will automatically search for the best number of components", default=300)
# parser.add_option("-a", dest="post_pca",
# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
# "embedding space", default=False)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
#######################################################################################################################
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
dataset_file = os.path.basename(op.dataset)
results = PolylingualClassificationResults(op.output)
data = MultilingualDataset.load(op.dataset)
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
# text preprocessing
tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
lXtr = tfidfvectorizer.fit_transform(lXtr, lytr)
lXte = tfidfvectorizer.transform(lXte)
lV = tfidfvectorizer.vocabulary()
classifiers = []
if op.probs:
classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
if op.supervised:
classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S)))
if op.pretrained:
classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV)))
classifier = Voting(*classifiers)
print('# Fitting ...')
classifier.fit(lXtr, lytr)
print('\n# Evaluating ...')
l_eval = evaluate_method(classifier, lXte, lyte)
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))


@ -1,27 +1,19 @@
import os
from dataset_builder import MultilingualDataset
# from learning.learners import *
from learning.learners import FunnellingMultimodal
from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \
TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder
from learning.transformers import *
from util.evaluation import *
from optparse import OptionParser
from util.file import exists
from util.results import PolylingualClassificationResults
from sklearn.svm import SVC
from util.util import get_learner, get_params
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
parser = OptionParser()
parser.add_option("-d", "--dataset", dest="dataset",
help="Path to the multilingual dataset processed and stored in .pickle format",
default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle")
parser = OptionParser(usage="usage: %prog datapath [options]")
parser.add_option("-o", "--output", dest="output",
help="Result file", type=str, default='./results/results.csv')
parser.add_option("-P", "--probs", dest="probs", action='store_true',
parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true',
help="Add posterior probabilities to the document embedding representation", default=False)
parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
@ -30,6 +22,16 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true',
parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true',
help="Add pretrained MUSE embeddings to the document embedding representation", default=False)
parser.add_option("--nol2", dest="nol2", action='store_true',
help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False)
parser.add_option("--allprob", dest="allprob", action='store_true',
help="All views are generated as posterior probabilities. This affects the supervised and pretrained "
"embeddings, for which a calibrated classifier is generated, which generates the posteriors", default=False)
parser.add_option("--feat-weight", dest="feat_weight",
help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf')
parser.add_option("-w", "--we-path", dest="we_path",
help="Path to the MUSE polylingual word embeddings", default='../embeddings')
@ -46,66 +48,61 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
help="If smaller than number of target classes, PCA will be applied to supervised matrix. ",
default=300)
# parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix."
# " If set to 0 it will automatically search for the best number of components", default=300)
# parser.add_option("-a", dest="post_pca",
# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
# "embedding space", default=False)
def get_learner(calibrate=False, kernel='linear'):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto')
def get_params(dense=False):
if not op.optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf' if dense else 'linear'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
#######################################################################################################################
if __name__ == '__main__':
(op, args) = parser.parse_args()
assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)'
dataset = args[0]
assert exists(dataset), 'Unable to find file '+str(dataset)
assert not (op.set_c != 1. and op.optimc), 'Parameter C cannot be defined along with optim_c option'
assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed'
l2=(op.nol2==False)
dataset_file = os.path.basename(op.dataset)
dataset_file = os.path.basename(dataset)
results = PolylingualClassificationResults(op.output)
allprob='Prob' if op.allprob else ''
result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \
f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
data = MultilingualDataset.load(op.dataset)
data = MultilingualDataset.load(dataset)
data.show_dimensions()
lXtr, lytr = data.training()
lXte, lyte = data.test()
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}'
print(f'{result_id}')
# text preprocessing
tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# document embedding modules
doc_embedder = DocEmbedderList()
if op.probs:
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None))
# feature weighting (for word embeddings average)
feat_weighting = FeatureWeight(op.feat_weight, agg='mean')
# # document embedding modules
doc_embedder = DocEmbedderList(aggregation='concat')
if op.posteriors:
doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2))
if op.supervised:
doc_embedder.append(WordClassEmbedder(max_label_space=op.max_labels_S))
wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting)
if op.allprob:
wce = FeatureSet2Posteriors(wce, l2=l2)
doc_embedder.append(wce)
if op.pretrained:
doc_embedder.append(MuseEmbedder(op.we_path))
muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting)
if op.allprob:
muse = FeatureSet2Posteriors(muse, l2=l2)
doc_embedder.append(muse)
# metaclassifier
meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True))
meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}]
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters)
# ensembling the modules
classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta)
@ -123,6 +120,6 @@ if __name__ == '__main__':
print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}')
# results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
# (config['max_label_space'], classifier.best_components),
# config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
# config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time,
# lang, macrof1, microf1, macrok, microk, '')
print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))

src/models/helpers.py (new executable file, 47 lines)

@ -0,0 +1,47 @@
import torch
import torch.nn as nn
from torch.nn import functional as F
def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'):
pretrained_embeddings = None
pretrained_length = 0
if pretrained is not None:
pretrained_length = pretrained.shape[1]
assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size'
pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length)
pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False)
# pretrained_embeddings.to(device)
learnable_embeddings = None
if learnable_length > 0:
learnable_embeddings = nn.Embedding(vocab_size, learnable_length)
# learnable_embeddings.to(device)
embedding_length = learnable_length + pretrained_length
assert embedding_length > 0, '0-size embeddings'
return pretrained_embeddings, learnable_embeddings, embedding_length
def embed(model, input, lang):
input_list = []
if model.lpretrained_embeddings[lang]:
input_list.append(model.lpretrained_embeddings[lang](input))
if model.llearnable_embeddings[lang]:
input_list.append(model.llearnable_embeddings[lang](input))
return torch.cat(tensors=input_list, dim=2)
def embedding_dropout(input, drop_range, p_drop=0.5, training=True):
if p_drop > 0 and training and drop_range is not None:
p = p_drop
drop_from, drop_to = drop_range
m = drop_to - drop_from #length of the supervised embedding
l = input.shape[2] #total embedding length
corr = (1 - p)
input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p)
input /= (1 - (p * m / l))
return input
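A rough numerical sketch (not in the diff) of what embedding_dropout does when only the supervised block is dropped:
# hypothetical sizes: embedding length l=600, of which the last m=300 columns are the WCE block
x = torch.randn(8, 20, 600)                                    # batch of 8 documents, 20 tokens each
out = embedding_dropout(x, drop_range=(300, 600), p_drop=0.5, training=True)
# about half of the WCE components are zeroed (corr undoes the 1/(1-p) rescaling of F.dropout),
# and the whole vector is then divided by 1 - p*m/l = 0.75 to compensate for the mass removed in expectation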

src/models/lstm_class.py (new executable file, 96 lines)

@ -0,0 +1,96 @@
#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
import torch
import torch.nn as nn
from torch.autograd import Variable
from models.helpers import *
class RNNMultilingualClassifier(nn.Module):
def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None,
drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False):
super(RNNMultilingualClassifier, self).__init__()
self.output_size = output_size
self.hidden_size = hidden_size
self.drop_embedding_range = drop_embedding_range
self.drop_embedding_prop = drop_embedding_prop
self.post_probabilities = post_probabilities
assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
self.lpretrained_embeddings = nn.ModuleDict()
self.llearnable_embeddings = nn.ModuleDict()
self.embedding_length = None
self.langs = sorted(lvocab_size.keys())
self.only_post = only_post
self.n_layers = 1
self.n_directions = 1
self.dropout = nn.Dropout(0.2)
lstm_out = 256
ff1 = 512
ff2 = 256
lpretrained_embeddings = {}
llearnable_embeddings = {}
if only_post==False:
for l in self.langs:
pretrained = lpretrained[l] if lpretrained else None
pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
pretrained, lvocab_size[l], learnable_length
)
lpretrained_embeddings[l] = pretrained_embeddings
llearnable_embeddings[l] = learnable_embeddings
self.embedding_length = embedding_length
# self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
self.rnn = nn.GRU(self.embedding_length, hidden_size)
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
self.lpretrained_embeddings.update(lpretrained_embeddings)
self.llearnable_embeddings.update(llearnable_embeddings)
self.linear1 = nn.Linear(lstm_out, ff1)
self.linear2 = nn.Linear(ff1, ff2)
if only_post:
self.label = nn.Linear(output_size, output_size)
elif post_probabilities:
self.label = nn.Linear(ff2+output_size, output_size)
else:
self.label = nn.Linear(ff2, output_size)
def forward(self, input, post, lang):
if self.only_post:
doc_embedding = post
else:
doc_embedding = self.transform(input, lang)
if self.post_probabilities:
doc_embedding = torch.cat([doc_embedding, post], dim=1)
logits = self.label(doc_embedding)
return logits
def transform(self, input, lang):
batch_size = input.shape[0]
input = embed(self, input, lang)
input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
training=self.training)
input = input.permute(1, 0, 2)
h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
# c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
# output, (_, _) = self.lstm(input, (h_0, c_0))
output, _ = self.rnn(input, h_0)
output = output[-1,:,:]
output = F.relu(self.linear0(output))
output = self.dropout(F.relu(self.linear1(output)))
output = self.dropout(F.relu(self.linear2(output)))
return output
def finetune_pretrained(self):
for l in self.langs:
self.lpretrained_embeddings[l].requires_grad = True
self.lpretrained_embeddings[l].weight.requires_grad = True
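A shape sketch of a forward pass under hypothetical sizes (73 classes, hidden_size=512, a batch of 16 documents padded to 500 tokens, posterior probabilities enabled):
# input (16, 500) token indices, post (16, 73) posterior probabilities, lang='en'
# embed                    -> (16, 500, embedding_length)
# permute + GRU, last step -> (16, 512)
# linear0/1/2 + ReLU       -> (16, 256)
# concat with post         -> (16, 256 + 73) -> label layer -> logits (16, 73)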


@ -2,15 +2,24 @@ import numpy as np
class StandardizeTransformer:
def __init__(self, axis=0):
def __init__(self, axis=0, range=None):
assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice'
self.axis = axis
self.yetfit=False
self.yetfit = False
self.range = range
def fit(self, X):
print('fitting Standardizer')
std=np.std(X, axis=self.axis, ddof=1)
self.std = np.clip(std, 1e-5, None)
self.mean = np.mean(X, axis=self.axis)
if self.range is not None:
ones = np.ones_like(self.std)
zeros = np.zeros_like(self.mean)
ones[self.range] = self.std[self.range]
zeros[self.range] = self.mean[self.range]
self.std = ones
self.mean = zeros
self.yetfit=True
print('done\n')
return self
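A quick sketch of the effect of the new range argument (assuming, as before, that transform applies (X - mean) / std): only the columns inside the slice get standardized, the others pass through with mean 0 and std 1.
import numpy as np
X = np.hstack([np.random.randn(100, 3) * 10 + 5,   # columns 0-2: raw features to standardize
               np.random.rand(100, 2)])            # columns 3-4: e.g. posterior probabilities, kept as they are
st = StandardizeTransformer(range=slice(0, 3)).fit(X)
# st.std[3:] == 1 and st.mean[3:] == 0, so transform() leaves those columns untouched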

src/util/common.py (new executable file, 367 lines)

@ -0,0 +1,367 @@
import warnings
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from embeddings.supervised import get_supervised_embeddings
from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
from tqdm import tqdm
import torch
from scipy.sparse import vstack, issparse
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
"""
Indexes (i.e., replaces word strings with numerical indexes) a list of string documents
:param data: list of string documents
:param vocab: a fixed mapping [str]->[int] of words to indexes
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
because they are anyway contained in a pre-trained embedding set that we know in advance)
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
are not in the original vocab but that are among the known_words
:return: the list of indexed documents (one list of int indexes per document)
"""
indexes=[]
vocabsize = len(vocab)
unk_count = 0
knw_count = 0
out_count = 0
pbar = tqdm(data, desc=f'indexing documents')
for text in pbar:
words = analyzer(text)
index = []
for word in words:
if word in vocab:
idx = vocab[word]
else:
if word in known_words:
if word not in out_of_vocabulary:
out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
idx = out_of_vocabulary[word]
out_count += 1
else:
idx = unk_index
unk_count += 1
index.append(idx)
indexes.append(index)
knw_count += len(index)
pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
return indexes
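# Hypothetical usage sketch (not in the original commit), only to document the contract:
# in-vocab words keep their index, known but out-of-vocab words receive fresh indexes
# appended after the vocabulary, and everything else maps to unk_index.
#   vocab = {'the': 0, 'cat': 1, 'UNKTOKEN': 2}
#   oov = {}
#   index(['the cat sat on'], vocab, known_words={'sat'}, analyzer=str.split,
#         unk_index=2, out_of_vocabulary=oov)
#   -> [[0, 1, 3, 2]]   and oov == {'sat': 3}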
def define_pad_length(index_list):
lengths = [len(index) for index in index_list]
return int(np.mean(lengths)+np.std(lengths))
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i,indexes in enumerate(index_list):
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
return index_list
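# Illustrative call (not in the original commit): sequences are left-padded with
# pad_index up to pad_length and truncated beyond it (the list is modified in place).
#   pad([[1, 2], [3, 4, 5, 6]], pad_index=0, max_pad_length=3)
#   -> [[0, 1, 2], [3, 4, 5]]
# define_pad_length picks mean+std of the observed lengths, so only the unusually
# long tail of documents gets truncated.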
class Index:
def __init__(self, devel_raw, devel_target, test_raw, lang):
self.lang = lang
self.devel_raw = devel_raw
self.devel_target = devel_target
self.test_raw = test_raw
def index(self, pretrained_vocabulary, analyzer, vocabulary):
self.word2index = dict(vocabulary)
known_words = set(self.word2index.keys())
if pretrained_vocabulary is not None:
known_words.update(pretrained_vocabulary)
self.word2index['UNKTOKEN'] = len(self.word2index)
self.word2index['PADTOKEN'] = len(self.word2index)
self.unk_index = self.word2index['UNKTOKEN']
self.pad_index = self.word2index['PADTOKEN']
# index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
self.out_of_vocabulary = dict()
self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
def train_val_split(self, val_prop, max_val, seed):
devel = self.devel_index
target = self.devel_target
devel_raw = self.devel_raw
val_size = int(min(len(devel) * val_prop, max_val))
self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
train_test_split(
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
)
print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
def get_word_list(self):
def extract_word_list(word2index):
return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])]
word_list = extract_word_list(self.word2index)
word_list += extract_word_list(self.out_of_vocabulary)
return word_list
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
print(f'[generating embedding matrix for lang {self.lang}]')
self.wce_range = None
embedding_parts = []
if pretrained is not None:
print('\t[pretrained-matrix]')
word_list = self.get_word_list()
muse_embeddings = pretrained.extract(word_list)
embedding_parts.append(muse_embeddings)
del pretrained
if supervised:
print('\t[supervised-matrix]')
F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn')
num_missing_rows = self.vocabsize - F.shape[0]
F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
F = torch.from_numpy(F).float()
offset = 0
if embedding_parts:
offset = embedding_parts[0].shape[1]
self.wce_range = [offset, offset + F.shape[1]]
embedding_parts.append(F)
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
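# Layout note (added for clarity, not in the original commit): the embedding matrix
# stacks the pretrained block (e.g. MUSE) and the supervised WCE block column-wise,
# so with 300-dim pretrained vectors and, presumably, one WCE column per class C,
# it has shape (vocabsize, 300 + C) and wce_range == [300, 300 + C]; without a
# pretrained block the WCE columns start at offset 0.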
def none_dict(langs):
return {l:None for l in langs}
class MultilingualIndex:
def __init__(self): #, add_language_trace=False):
self.l_index = {}
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# self.add_language_trace=add_language_trace
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
self.langs = sorted(l_devel_raw.keys())
#build the vocabularies
self.l_vectorizer.fit(l_devel_raw)
l_vocabulary = self.l_vectorizer.vocabulary()
l_analyzer = self.l_vectorizer.get_analyzer()
for l in self.langs:
self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l)
self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l])
def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
for l,index in self.l_index.items():
index.train_val_split(val_prop, max_val, seed=seed)
def embedding_matrices(self, lpretrained, supervised):
lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
lYtr = self.l_train_target() if supervised else none_dict(self.langs)
for l,index in self.l_index.items():
index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
self.sup_range = index.wce_range
# experimental... does it make sense to keep track of the language? i.e., to inform the network from which
# language does the data came from...
# if self.add_language_trace and pretrained_embeddings is not None:
# print('adding language trace')
# lang_trace = torch.zeros(size=(vocabsize, len(self.langs)))
# lang_trace[:,i]=1
# pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1)
def posterior_probabilities(self, max_training_docs_by_lang=5000):
# choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
for l in self.langs:
n_elements = lXtr[l].shape[0]
if n_elements > max_training_docs_by_lang:
choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
lXtr[l] = lXtr[l][choice]
lYtr[l] = lYtr[l][choice]
# train the posterior probabilities embedder
print('[posteriors] training a calibrated SVM')
learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
prob_embedder.fit(lXtr, lYtr)
# transforms the training, validation, and test sets into posterior probabilities
print('[posteriors] generating posterior probabilities')
lPtr = prob_embedder.transform(self.get_lXtr())
lPva = prob_embedder.transform(self.get_lXva())
lPte = prob_embedder.transform(self.get_lXte())
print('[posteriors] done')
return lPtr, lPva, lPte
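# Shape note (added for clarity): lPtr, lPva and lPte are dictionaries lang -> matrix
# of shape (n_docs, n_classes) holding the calibrated posteriors P(c|d); they are the
# 'post' input that the batcher below forwards to the network, where it gets
# concatenated to the document embedding.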
def get_lXtr(self):
if not hasattr(self, 'lXtr'):
self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()})
return self.lXtr
def get_lXva(self):
if not hasattr(self, 'lXva'):
self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()})
return self.lXva
def get_lXte(self):
if not hasattr(self, 'lXte'):
self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
return self.lXte
def l_vocabsize(self):
return {l:index.vocabsize for l,index in self.l_index.items()}
def l_embeddings(self):
return {l:index.embedding_matrix for l,index in self.l_index.items()}
def l_pad(self):
return {l: index.pad_index for l, index in self.l_index.items()}
def l_train_index(self):
return {l: index.train_index for l, index in self.l_index.items()}
def l_train_target(self):
return {l: index.train_target for l, index in self.l_index.items()}
def l_val_index(self):
return {l: index.val_index for l, index in self.l_index.items()}
def l_val_target(self):
return {l: index.val_target for l, index in self.l_index.items()}
def l_test_index(self):
return {l: index.test_index for l, index in self.l_index.items()}
def l_train(self):
return self.l_train_index(), self.l_train_target()
def l_val(self):
return self.l_val_index(), self.l_val_target()
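# Hypothetical driving code (not in the original commit) sketching the intended call
# order; lraw_tr, ly_tr, lraw_te, lvocab and lpretrained stand for user-provided
# dictionaries indexed by language code:
#   mi = MultilingualIndex()
#   mi.index(lraw_tr, ly_tr, lraw_te, l_pretrained_vocabulary=lvocab)  # or none_dict(langs)
#   mi.train_val_split(val_prop=0.2, max_val=2000, seed=42)
#   mi.embedding_matrices(lpretrained, supervised=True)
#   lPtr, lPva, lPte = mi.posterior_probabilities()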
class Batch:
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
self.batchsize = batchsize
self.batches_per_epoch = batches_per_epoch
self.languages = languages
self.lpad=lpad
self.max_pad_length=max_pad_length
self.init_offset()
def init_offset(self):
self.offset = {lang: 0 for lang in self.languages}
def batchify(self, l_index, l_post, llabels):
langs = self.languages
l_num_samples = {l:len(l_index[l]) for l in langs}
max_samples = max(l_num_samples.values())
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
n_batches = self.batches_per_epoch
for b in range(n_batches):
for lang in langs:
index, labels = l_index[lang], llabels[lang]
offset = self.offset[lang]
if offset >= l_num_samples[lang]:
offset = 0
limit = offset+self.batchsize
batch_slice = slice(offset, limit)
batch = index[batch_slice]
batch_labels = labels[batch_slice].toarray()
post = None
if l_post is not None:
post = torch.FloatTensor(l_post[lang][batch_slice]).cuda()
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
batch = torch.LongTensor(batch).cuda()
target = torch.FloatTensor(batch_labels).cuda()
self.offset[lang] = limit
yield batch, post, target, lang
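# Usage sketch (hypothetical, reusing the names from the sketch above): Batch keeps one
# offset per language and wraps around when a language runs out of documents, so each
# epoch visits all languages the same number of times even if corpora differ in size.
#   batcher = Batch(batchsize=64, batches_per_epoch=-1, languages=mi.langs, lpad=mi.l_pad())
#   for batch, post, target, lang in batcher.batchify(mi.l_train_index(), lPtr, mi.l_train_target()):
#       ...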
def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500):
langs = sorted(l_index.keys())
nsamples = max([len(l_index[l]) for l in langs])
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
for lang in langs:
index, labels = l_index[lang], llabels[lang]
if b * batchsize >= len(index):
continue
batch = index[b*batchsize:(b+1)*batchsize]
batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray()
post = None
if l_post is not None:
post = torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda()
batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
target = torch.FloatTensor(batch_labels)
yield batch.cuda(), post, target.cuda(), lang
def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500):
nsamples = len(index_list)
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
batch = index_list[b*batchsize:(b+1)*batchsize]
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
yield batch.cuda()
def clip_gradient(model, clip_value=1e-1):
params = list(filter(lambda p: p.grad is not None, model.parameters()))
for p in params:
p.grad.data.clamp_(-clip_value, clip_value)
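# Typical placement (illustrative): right after backward() and before the optimizer
# step, so every gradient component is bounded to [-clip_value, clip_value].
#   loss.backward()
#   clip_gradient(model, clip_value=1e-1)
#   optimizer.step()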
def predict(logits, classification_type='multilabel'):
if classification_type == 'multilabel':
prediction = torch.sigmoid(logits) > 0.5
elif classification_type == 'singlelabel':
prediction = torch.argmax(logits, dim=1).view(-1, 1)
else:
raise ValueError(f'unknown classification type {classification_type}')
return prediction.detach().cpu().numpy()
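# Example (illustrative): in the multilabel case logits go through a sigmoid and are
# thresholded at 0.5, e.g. predict(torch.tensor([[2.0, -1.0]])) -> array([[ True, False]]);
# in the singlelabel case the argmax index is returned as a (batch, 1) column.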
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)

60
src/util/csv_log.py Executable file
View File

@@ -0,0 +1,60 @@
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
class CSVLog:
def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False):
self.file = file
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file) and not overwrite:
self.tell('Loading existing file from {}'.format(file))
self.df = pd.read_csv(file, sep='\t')
self.columns = sorted(self.df.columns.values.tolist())
else:
self.tell('File {} does not exist or overwrite=True. Creating new frame.'.format(file))
assert columns is not None, 'columns cannot be None'
self.columns = sorted(columns)
dir = os.path.dirname(self.file)
if dir and not os.path.exists(dir): os.makedirs(dir)
self.df = pd.DataFrame(columns=self.columns)
self.defaults={}
def already_calculated(self, **kwargs):
df = self.df
if df.shape[0]==0:
return False
if len(kwargs)==0:
kwargs = self.defaults
for key,val in kwargs.items():
df = df.loc[df[key]==val]
if df.shape[0]==0: return False
return True
def set_default(self, param, value):
self.defaults[param]=value
def add_row(self, **kwargs):
for key in self.defaults.keys():
if key not in kwargs:
kwargs[key]=self.defaults[key]
columns = sorted(list(kwargs.keys()))
values = [kwargs[col_i] for col_i in columns]
s = pd.Series(values, index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
# self.tell(s.to_string())
self.tell(kwargs)
def flush(self):
self.df.to_csv(self.file, index=False, sep='\t')
def tell(self, msg):
if self.verbose: print(msg)
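# Hypothetical usage sketch (not in the original commit); the column names and default
# values below are made up. The defaults mechanism allows skipping runs already logged.
#   log = CSVLog('results.csv', columns=['dataset', 'method', 'lang', 'value'])
#   log.set_default('dataset', 'some-dataset')
#   log.set_default('method', 'some-method')
#   if not log.already_calculated(lang='en'):
#       log.add_row(lang='en', value=0.85)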

53
src/util/early_stop.py Executable file
View File

@@ -0,0 +1,53 @@
#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py
import torch
from time import time
from util.file import create_if_not_exist
class EarlyStopping:
def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'):
# set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters
self.patience_limit = patience
self.patience = patience
self.verbose = verbose
self.best_score = None
self.best_epoch = None
self.stop_time = None
self.checkpoint = checkpoint
self.model = model
self.STOP = False
def __call__(self, watch_score, epoch):
if self.STOP: return #done
if self.best_score is None or watch_score >= self.best_score:
self.best_score = watch_score
self.best_epoch = epoch
self.stop_time = time()
if self.checkpoint:
self.print(f'[early-stop] improved, saving model in {self.checkpoint}')
torch.save(self.model, self.checkpoint)
else:
self.print(f'[early-stop] improved')
self.patience = self.patience_limit
else:
self.patience -= 1
if self.patience == 0:
self.STOP = True
self.print(f'[early-stop] patience exhausted')
else:
if self.patience>0: # if negative, then early-stop is ignored
self.print(f'[early-stop] patience={self.patience}')
def reinit_counter(self):
self.STOP = False
self.patience=self.patience_limit
def restore_checkpoint(self):
return torch.load(self.checkpoint)
def print(self, msg):
if self.verbose:
print(msg)
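# Hypothetical training-loop sketch (not in the original commit); train_one_epoch and
# evaluate_on_validation stand for the caller's own code, and the watched score is
# maximized (e.g. a validation macro-F1).
#   early_stop = EarlyStopping(model, patience=20, checkpoint='../checkpoint/model.pt')
#   for epoch in range(max_epochs):
#       train_one_epoch(model, ...)
#       early_stop(evaluate_on_validation(model, ...), epoch)
#       if early_stop.STOP:
#           break
#   model = early_stop.restore_checkpoint()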

View File

@@ -44,7 +44,7 @@ def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, retu
tinit=time.time()
print('prediction for test')
assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate'
n_jobs = polylingual_method.n_jobs
n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1
if predictor is None:
predictor = polylingual_method.predict

View File

@@ -2,6 +2,7 @@ from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
#from sklearn.externals.six.moves import urllib
import urllib
from pathlib import Path
def download_file(url, archive_filename):
@@ -36,4 +37,8 @@ def makedirs_if_not_exist(path):
def create_if_not_exist(path):
if not exists(path): makedirs(path)
def get_parent_name(path):
return Path(path).parent
def get_file_name(path):
return Path(path).name
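# e.g. (illustrative): get_parent_name('../embeddings/wiki.en.vec') -> Path('../embeddings')
#                      get_file_name('../embeddings/wiki.en.vec') -> 'wiki.en.vec'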