From 22b7ea7e6621bd8aee621a94b8a6a6797aa08146 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Thu, 30 Jan 2020 17:08:52 +0100 Subject: [PATCH] huge refactoring, deep learning, and other stuff --- src/dataset_builder.py | 176 ++++++++-- src/embeddings/__init__.py | 0 src/embeddings/embeddings.py | 1 + src/learning/learners.py | 4 +- src/learning/transformers.py | 265 ++++++++++++--- src/main_deep_learning.py | 290 ++++++++++++++++ src/main_majorityvoting_cls.py | 127 +++++++ src/main_multimodal_cls.py | 87 +++-- src/models/helpers.py | 47 +++ src/models/lstm_class.py | 96 ++++++ src/transformers/StandardizeTransformer.py | 13 +- src/util/common.py | 367 +++++++++++++++++++++ src/util/csv_log.py | 60 ++++ src/util/early_stop.py | 53 +++ src/util/evaluation.py | 2 +- src/util/file.py | 5 + 16 files changed, 1474 insertions(+), 119 deletions(-) create mode 100644 src/embeddings/__init__.py create mode 100755 src/main_deep_learning.py create mode 100644 src/main_majorityvoting_cls.py create mode 100755 src/models/helpers.py create mode 100755 src/models/lstm_class.py create mode 100755 src/util/common.py create mode 100755 src/util/csv_log.py create mode 100755 src/util/early_stop.py diff --git a/src/dataset_builder.py b/src/dataset_builder.py index 9af7b3f..454fea6 100644 --- a/src/dataset_builder.py +++ b/src/dataset_builder.py @@ -13,6 +13,7 @@ from scipy.sparse import issparse import itertools from tqdm import tqdm import re +from scipy.sparse import csr_matrix class MultilingualDataset: @@ -68,27 +69,33 @@ class MultilingualDataset: if languages is not None: self.languages_view = languages - def training(self): - return self.lXtr(), self.lYtr() + def training(self, mask_numbers=False, target_as_csr=False): + return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr) - def test(self): - return self.lXte(), self.lYte() + def test(self, mask_numbers=False, target_as_csr=False): + return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr) - def lXtr(self): - return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if - lang in self.langs()} - # return {lang:self.mask_numbers(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + def lXtr(self, mask_numbers=False): + proc = lambda x:_mask_numbers(x) if mask_numbers else x + # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()} + return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - def lXte(self): - return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if - lang in self.langs()} - # return {lang:self.mask_numbers(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} + def lXte(self, mask_numbers=False): + proc = lambda x: _mask_numbers(x) if mask_numbers else x + # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()} + return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} - def lYtr(self): - return {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + def lYtr(self, as_csr=False): + lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + if as_csr: + lY = {l:csr_matrix(Y) for l,Y in lY.items()} + return lY - def lYte(self): - return {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) 
in self.multiling_dataset.items() if lang in self.langs()} + def lYte(self, as_csr=False): + lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()} + if as_csr: + lY = {l:csr_matrix(Y) for l,Y in lY.items()} + return lY def cat_view(self, Y): if hasattr(self, 'categories_view'): @@ -107,10 +114,11 @@ class MultilingualDataset: return self.lYtr()[self.langs()[0]].shape[1] def show_dimensions(self): + def shape(X): + return X.shape if hasattr(X, 'shape') else len(X) for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): if lang not in self.langs(): continue - if hasattr(Xtr, 'shape') and hasattr(Xte, 'shape'): - print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, Xtr.shape, self.cat_view(Ytr).shape, Xte.shape, self.cat_view(Yte).shape)) + print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) def show_category_prevalences(self): #pass @@ -135,12 +143,24 @@ class MultilingualDataset: def set_labels(self, labels): self.labels = labels - def mask_numbers(self, data, number_mask='numbermask'): - mask = re.compile(r'\b[0-9][0-9.,-]*\b') - masked = [] - for text in tqdm(data, desc='masking numbers'): - masked.append(mask.sub(number_mask, text)) - return masked +def _mask_numbers(data): + mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b') + mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b') + mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b') + mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b') + mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b') + masked = [] + for text in tqdm(data, desc='masking numbers'): + text = ' ' + text + text = mask_moredigit.sub(' MoreDigitMask', text) + text = mask_4digit.sub(' FourDigitMask', text) + text = mask_3digit.sub(' ThreeDigitMask', text) + text = mask_2digit.sub(' TwoDigitMask', text) + text = mask_1digit.sub(' OneDigitMask', text) + masked.append(text.replace('.','').replace(',','').strip()) + return masked + + # ---------------------------------------------------------------------------------------------------------------------- @@ -541,12 +561,120 @@ def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path) +# ---------------------------------------------------------------------------------------------------------------------- +# Methods to generate full RCV and JRC datasets +# ---------------------------------------------------------------------------------------------------------------------- +def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs): + + + print('fetching the datasets') + rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) + + filter_by_categories(rcv1_train_documents, labels_rcv2) + filter_by_categories(rcv1_test_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_train_documents + rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names))) + print('rcv2: {} 
documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents + lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs} + + def get_ids(doclist): + return frozenset([d.id for d in doclist]) + + tr_ids = {'en': get_ids(rcv1_train_documents)} + te_ids = {'en': get_ids(rcv1_test_documents)} + for lang in langs: + if lang == 'en': continue + tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3) + + dataset = MultilingualDataset() + dataset.dataset_name = 'RCV1/2-full' + for lang in langs: + print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents') + analyzer = CountVectorizer( + strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) + ).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]]) + Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]]) + Xtr = [' '.join(analyzer(d)) for d in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) + + dataset.save(outpath) + + +def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300): + + print('fetching the datasets') + cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) + training_docs, label_names = fetch_jrcacquis( + langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat + ) + test_docs, _ = fetch_jrcacquis( + langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force' + ) + + def _group_by_lang(doc_list, langs): + return {lang: [d for d in doc_list if d.lang == lang] for lang in langs} + + training_docs = _group_by_lang(training_docs, langs) + test_docs = _group_by_lang(test_docs, langs) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + dataset = MultilingualDataset() + data.dataset_name = 'JRC-Acquis-full' + for lang in langs: + analyzer = CountVectorizer( + strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) + ).build_analyzer() + + Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang]) + Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang]) + Xtr = [' '.join(analyzer(d)) for d in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte) + + dataset.save(outpath) + + #----------------------------------------------------------------------------------------------------------------------- # MAIN BUILDER #----------------------------------------------------------------------------------------------------------------------- if __name__=='__main__': import sys + RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus' + RCV2_PATH = '../Datasets/RCV2' + JRC_DATAPATH = "../Datasets/JRC_Acquis_v3" + full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en']) + # 
full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300) + sys.exit(0) + + # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle' + # data = MultilingualDataset.load(datasetpath) + # data.dataset_name='JRC-Acquis-full'#'RCV1/2-full' + # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']: + # (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang] + # data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte)) + # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle') + # sys.exit(0) assert len(sys.argv) == 5, "wrong number of arguments; required: " \ " " diff --git a/src/embeddings/__init__.py b/src/embeddings/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/embeddings/embeddings.py b/src/embeddings/embeddings.py index a12c206..e4fdbb3 100644 --- a/src/embeddings/embeddings.py +++ b/src/embeddings/embeddings.py @@ -204,6 +204,7 @@ class FastTextMUSE(PretrainedEmbeddings): def extract(self, words): source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi) extraction = torch.zeros((len(words), self.dim())) + # extraction = torch.empty(len(words), self.dim()).normal_(0, 1) extraction[source_idx] = self.embed.vectors[target_idx] return extraction diff --git a/src/learning/learners.py b/src/learning/learners.py index ee87f2c..95f8c2b 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -254,7 +254,7 @@ class NaivePolylingualClassifier: return {lang:self.model[lang].transform(lX[lang]) for lang in lX.keys()} else: langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].transform)(lX[lang]) for lang in langs) + scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) return {lang: scores[i] for i, lang in enumerate(langs)} def best_params(self): @@ -297,7 +297,7 @@ class MonolingualClassifier: self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, error_score=0, verbose=10) - print('fitting:', self.model) + print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') self.model.fit(X, y) if isinstance(self.model, GridSearchCV): self.best_params_ = self.model.best_params_ diff --git a/src/learning/transformers.py b/src/learning/transformers.py index e5b0da4..190c32d 100644 --- a/src/learning/transformers.py +++ b/src/learning/transformers.py @@ -1,6 +1,8 @@ import numpy as np from sklearn.feature_extraction.text import TfidfVectorizer #from data.text_preprocessor import NLTKStemTokenizer +from data.tsr_function__ import get_tsr_matrix, get_supervised_matrix, pointwise_mutual_information, information_gain, \ + gain_ratio, gss from embeddings.embeddings import FastTextMUSE from embeddings.supervised import supervised_embeddings_tfidf, zscores from learning.learners import NaivePolylingualClassifier, MonolingualClassifier, _joblib_transform_multiling @@ -10,6 +12,9 @@ from joblib import Parallel, delayed from scipy.sparse import issparse, vstack, hstack from transformers.StandardizeTransformer import StandardizeTransformer from util.SIF_embed import remove_pc +from sklearn.preprocessing import normalize +from 
sklearn.svm import SVC +from scipy.sparse import csr_matrix # ------------------------------------------------------------------ # Data Processing @@ -39,20 +44,65 @@ class TfidfVectorizerMultilingual: else: return self.vectorizer[l].vocabulary_ + def get_analyzer(self, l=None): + if l is None: + return {l:self.vectorizer[l].build_analyzer() for l in self.langs} + else: + return self.vectorizer[l].build_analyzer() + + +class FeatureWeight: + + def __init__(self, weight='tfidf', agg='mean'): + assert weight in ['tfidf', 'pmi', 'ig'] or callable(weight), 'weight should either be "tfidf" or a callable function' + assert agg in ['mean', 'max'], 'aggregation function should either be "mean" or "max"' + self.weight = weight + self.agg = agg + self.fitted = False + if weight=='pmi': + self.weight = pointwise_mutual_information + elif weight == 'ig': + self.weight = information_gain + + def fit(self, lX, ly): + if not self.fitted: + if self.weight == 'tfidf': + self.lF = {l: np.ones(X.shape[1]) for l, X in lX.items()} + else: + self.lF = {} + for l in lX.keys(): + X, y = lX[l], ly[l] + + print(f'getting supervised cell-matrix lang {l}') + tsr_matrix = get_tsr_matrix(get_supervised_matrix(X, y), tsr_score_funtion=self.weight) + if self.agg == 'max': + F = tsr_matrix.max(axis=0) + elif self.agg == 'mean': + F = tsr_matrix.mean(axis=0) + self.lF[l] = F + + self.fitted = True + return self + + def transform(self, lX): + return {lang: csr_matrix.multiply(lX[lang], self.lF[lang]) for lang in lX.keys()} + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) # ------------------------------------------------------------------ # Document Embeddings # ------------------------------------------------------------------ class PosteriorProbabilitiesEmbedder: - def __init__(self, first_tier_learner, first_tier_parameters, - n_jobs=-1): + def __init__(self, first_tier_learner, first_tier_parameters=None, l2=True, n_jobs=-1): self.fist_tier_learner = first_tier_learner self.fist_tier_parameters = first_tier_parameters + self.l2 = l2 self.n_jobs = n_jobs - self.doc_projector = NaivePolylingualClassifier(self.fist_tier_learner, - self.fist_tier_parameters, - n_jobs=n_jobs) + self.doc_projector = NaivePolylingualClassifier( + self.fist_tier_learner, self.fist_tier_parameters, n_jobs=n_jobs + ) def fit(self, lX, lY, lV=None): print('fitting the projectors... 
{}'.format(lX.keys())) @@ -60,8 +110,8 @@ class PosteriorProbabilitiesEmbedder: return self def transform(self, lX): - print('projecting the documents') - lZ = self.doc_projector.predict_proba(lX) + lZ = self.predict_proba(lX) + lZ = _normalize(lZ, self.l2) return lZ def fit_transform(self, lX, ly=None, lV=None): @@ -70,28 +120,41 @@ class PosteriorProbabilitiesEmbedder: def best_params(self): return self.doc_projector.best_params() + def predict(self, lX, ly=None): + return self.doc_projector.predict(lX) + + def predict_proba(self, lX, ly=None): + print(f'generating posterior probabilities for {sum([X.shape[0] for X in lX.values()])} the documents') + return self.doc_projector.predict_proba(lX) + class MuseEmbedder: - def __init__(self, path, n_jobs=-1): + def __init__(self, path, lV=None, l2=True, n_jobs=-1, featureweight=FeatureWeight()): self.path=path + self.lV = lV + self.l2 = l2 self.n_jobs = n_jobs + self.featureweight = featureweight - def fit(self, lX, ly, lV): + def fit(self, lX, ly, lV=None): + assert lV is not None or self.lV is not None, 'lV not specified' self.langs = sorted(lX.keys()) - MUSE = Parallel(n_jobs=self.n_jobs)( - delayed(FastTextMUSE)(self.path, lang) for lang in self.langs - ) + self.MUSE = load_muse_embeddings(self.path, self.langs, self.n_jobs) lWordList = {l:self._get_wordlist_from_word2index(lV[l]) for l in self.langs} - self.MUSE = {l:MUSE[i].extract(lWordList[l]).numpy() for i,l in enumerate(self.langs)} + self.MUSE = {l:Muse.extract(lWordList[l]).numpy() for l,Muse in self.MUSE} + self.featureweight.fit(lX, ly) return self def transform(self, lX): MUSE = self.MUSE + lX = self.featureweight.transform(lX) XdotMUSE = Parallel(n_jobs=self.n_jobs)( delayed(XdotM)(lX[lang], MUSE[lang]) for lang in self.langs ) - return {l: XdotMUSE[i] for i, l in enumerate(self.langs)} + lMuse = {l: XdotMUSE[i] for i, l in enumerate(self.langs)} + lMuse = _normalize(lMuse, self.l2) + return lMuse def fit_transform(self, lX, ly, lV): return self.fit(lX, ly, lV).transform(lX) @@ -102,9 +165,11 @@ class MuseEmbedder: class WordClassEmbedder: - def __init__(self, n_jobs=-1, max_label_space=300): + def __init__(self, l2=True, n_jobs=-1, max_label_space=300, featureweight=FeatureWeight()): self.n_jobs = n_jobs + self.l2 = l2 self.max_label_space=max_label_space + self.featureweight = featureweight def fit(self, lX, ly, lV=None): self.langs = sorted(lX.keys()) @@ -112,53 +177,43 @@ class WordClassEmbedder: delayed(word_class_embedding_matrix)(lX[lang], ly[lang], self.max_label_space) for lang in self.langs ) self.lWCE = {l:WCE[i] for i,l in enumerate(self.langs)} + self.featureweight.fit(lX, ly) return self def transform(self, lX): lWCE = self.lWCE + lX = self.featureweight.transform(lX) XdotWCE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], lWCE[lang]) for lang in self.langs + delayed(XdotM)(lX[lang], lWCE[lang])for lang in self.langs ) - return {l: XdotWCE[i] for i, l in enumerate(self.langs)} + lwce = {l: XdotWCE[i] for i, l in enumerate(self.langs)} + lwce = _normalize(lwce, self.l2) + return lwce def fit_transform(self, lX, ly, lV=None): return self.fit(lX, ly).transform(lX) -def word_class_embedding_matrix(X, Y, max_label_space=300): - print('computing supervised embeddings...') - WCE = supervised_embeddings_tfidf(X, Y) - WCE = zscores(WCE, axis=0) - - nC = Y.shape[1] - if nC > max_label_space: - print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' - f'Applying PCA(n_components={max_label_space})') - pca = PCA(n_components=max_label_space) - WCE = pca.fit(WCE).transform(WCE) - - return WCE - - -def XdotM(X,M): - # return X.dot(M) - E = X.dot(M) - E = remove_pc(E, npc=1) - return E - - class DocEmbedderList: - def __init__(self, *embedder_list): + + def __init__(self, *embedder_list, aggregation='concat'): + assert aggregation in {'concat', 'mean'}, 'unknown aggregation mode, valid are "concat" and "mean"' if len(embedder_list)==0: embedder_list=[] self.embedders = embedder_list + self.aggregation = aggregation - - def fit(self, lX, ly, lV): + def fit(self, lX, ly, lV=None): for transformer in self.embedders: transformer.fit(lX,ly,lV) return self def transform(self, lX): + if self.aggregation == 'concat': + return self.transform_concat(lX) + elif self.aggregation == 'mean': + return self.transform_mean(lX) + + def transform_concat(self, lX): if len(self.embedders)==1: return self.embedders[0].transform(lX) @@ -176,8 +231,27 @@ class DocEmbedderList: hstacker = hstack if some_sparse else np.hstack return {l:hstacker(lZparts[l]) for l in langs} + def transform_mean(self, lX): + if len(self.embedders)==1: + return self.embedders[0].transform(lX) - def fit_transform(self, lX, ly, lV): + langs = sorted(lX.keys()) + + lZparts = {l: None for l in langs} + for transformer in self.embedders: + lZ = transformer.transform(lX) + for l in langs: + Z = lZ[l] + if lZparts[l] is None: + lZparts[l] = Z + else: + lZparts[l] += Z + + n_transformers = len(self.embedders) + + return {l:lZparts[l] / n_transformers for l in langs} + + def fit_transform(self, lX, ly, lV=None): return self.fit(lX, ly, lV).transform(lX) def best_params(self): @@ -186,20 +260,55 @@ class DocEmbedderList: def append(self, embedder): self.embedders.append(embedder) + +class FeatureSet2Posteriors: + def __init__(self, transformer, l2=True, n_jobs=-1): + self.transformer = transformer + self.l2=l2 + self.n_jobs = n_jobs + self.prob_classifier = MetaClassifier(SVC(kernel='rbf', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) + + def fit(self, lX, ly, lV=None): + if lV is None and hasattr(self.transformer, 'lV'): + lV = self.transformer.lV + lZ = self.transformer.fit_transform(lX, ly, lV) + self.prob_classifier.fit(lZ, ly) + return self + + def transform(self, lX): + lP = self.predict_proba(lX) + lP = _normalize(lP, self.l2) + return lP + + def fit_transform(self, lX, ly, lV): + return self.fit(lX, ly, lV).transform(lX) + + def predict(self, lX, ly=None): + lZ = self.transformer.transform(lX) + return self.prob_classifier.predict(lZ) + + def predict_proba(self, lX, ly=None): + lZ = self.transformer.transform(lX) + return self.prob_classifier.predict_proba(lZ) + + # ------------------------------------------------------------------ # Meta-Classifier # ------------------------------------------------------------------ class MetaClassifier: - def __init__(self, meta_learner, meta_parameters, n_jobs=-1): + def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): self.n_jobs=n_jobs self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) + self.standardize_range = standardize_range def fit(self, lZ, ly): tinit = time.time() Z, y = self.stack(lZ, ly) - self.standardizer = StandardizeTransformer() + + self.standardizer = StandardizeTransformer(range=self.standardize_range) Z = self.standardizer.fit_transform(Z) + print('fitting the Z-space of shape={}'.format(Z.shape)) self.model.fit(Z, y) 
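+        # (descriptive note) Z vertically stacks the per-language views, so a single monolingual
+        # meta-classifier is trained on the shared, standardized Z-space; the standardizer fitted
+        # here is reused at prediction time. If standardize_range is a slice, only that column
+        # range is standardized and the remaining columns are passed through unchanged.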
self.time = time.time() - tinit @@ -217,6 +326,10 @@ class MetaClassifier: lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) + def predict_proba(self, lZ, ly=None): + lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) + return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) + def best_params(self): return self.model.best_params() @@ -249,3 +362,65 @@ class Funnelling: return {'1st-tier':self.first_tier.best_params(), 'meta':self.meta.best_params()} + +class Voting: + def __init__(self, *prob_classifiers): + assert all([hasattr(p, 'predict_proba') for p in prob_classifiers]), 'not all classifiers are probabilistic' + self.prob_classifiers = prob_classifiers + + def fit(self, lX, ly, lV=None): + for classifier in self.prob_classifiers: + classifier.fit(lX, ly, lV) + + def predict(self, lX, ly=None): + + lP = {l:[] for l in lX.keys()} + for classifier in self.prob_classifiers: + lPi = classifier.predict_proba(lX) + for l in lX.keys(): + lP[l].append(lPi[l]) + + lP = {l:np.stack(Plist).mean(axis=0) for l,Plist in lP.items()} + ly = {l:P>0.5 for l,P in lP.items()} + + return ly + + +# ------------------------------------------------------------------------------ +# HELPERS +# ------------------------------------------------------------------------------ + +def load_muse_embeddings(we_path, langs, n_jobs=-1): + MUSE = Parallel(n_jobs=n_jobs)( + delayed(FastTextMUSE)(we_path, lang) for lang in langs + ) + return {l: MUSE[i] for i, l in enumerate(langs)} + + +def word_class_embedding_matrix(X, Y, max_label_space=300): + print('computing supervised embeddings...') + WCE = supervised_embeddings_tfidf(X, Y) + WCE = zscores(WCE, axis=0) + + nC = Y.shape[1] + if nC > max_label_space: + print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. 
' + f'Applying PCA(n_components={max_label_space})') + pca = PCA(n_components=max_label_space) + WCE = pca.fit(WCE).transform(WCE) + + return WCE + + +def XdotM(X,M): + # return X.dot(M) + # print(f'X={X.shape}, M={M.shape}') + E = X.dot(M) + E = remove_pc(E, npc=1) + return E + + +def _normalize(lX, l2=True): + return {l: normalize(X) for l, X in lX.items()} if l2 else lX + + diff --git a/src/main_deep_learning.py b/src/main_deep_learning.py new file mode 100755 index 0000000..5fc2a94 --- /dev/null +++ b/src/main_deep_learning.py @@ -0,0 +1,290 @@ +import argparse +import torch.nn as nn +from torch.optim.lr_scheduler import StepLR +from dataset_builder import MultilingualDataset +from learning.transformers import load_muse_embeddings +from models.lstm_class import RNNMultilingualClassifier +from util.csv_log import CSVLog +from util.early_stop import EarlyStopping +from util.common import * +from util.file import create_if_not_exist +from time import time +from embeddings.pretrained import * +from os.path import join +from tqdm import tqdm +from util.evaluation import evaluate +from util.file import get_file_name + +allowed_nets = {'rnn'} + +# instantiates the net, initializes the model parameters, and sets embeddings trainable if requested +def init_Net(nC, multilingual_index, xavier_uniform=True): + net=opt.net + assert net in allowed_nets, f'{net} not supported, valid ones are={allowed_nets}' + + # instantiate the required net + if net=='rnn': + only_post = opt.posteriors and (not opt.pretrained) and (not opt.supervised) + if only_post: + print('working on ONLY POST mode') + model = RNNMultilingualClassifier( + output_size=nC, + hidden_size=opt.hidden, + lvocab_size=multilingual_index.l_vocabsize(), + learnable_length=opt.learnable, + lpretrained=multilingual_index.l_embeddings(), + drop_embedding_range=multilingual_index.sup_range, + drop_embedding_prop=opt.sup_drop, + post_probabilities=opt.posteriors, + only_post=only_post + ) + + # weight initialization + if xavier_uniform: + for p in model.parameters(): + if p.dim() > 1 and p.requires_grad: + nn.init.xavier_uniform_(p) + + if opt.tunable: + # this has to be performed *after* Xavier initialization is done, + # otherwise the pretrained embedding parameters will be overrided + model.finetune_pretrained() + + return model.cuda() + + +def set_method_name(): + method_name = f'{opt.net}(H{opt.hidden})' + if opt.pretrained: + method_name += f'-Muse' + if opt.supervised: + method_name += f'-WCE' + if opt.posteriors: + method_name += f'-Posteriors' + if (opt.pretrained or opt.supervised) and opt.tunable: + method_name+='-(trainable)' + else: + method_name += '-(static)' + if opt.learnable > 0: + method_name += f'-Learnable{opt.learnable}' + return method_name + + +def init_optimizer(model, lr): + return torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr, weight_decay=opt.weight_decay) + + +def init_logfile(method_name, opt): + logfile = CSVLog(opt.log_file, ['dataset', 'method', 'epoch', 'measure', 'value', 'run', 'timelapse']) + logfile.set_default('dataset', opt.dataset) + logfile.set_default('run', opt.seed) + logfile.set_default('method', method_name) + assert opt.force or not logfile.already_calculated(), f'results for dataset {opt.dataset} method {method_name} and run {opt.seed} already calculated' + return logfile + + +# loads the MUSE embeddings if requested, or returns empty dictionaries otherwise +def load_pretrained_embeddings(we_path, langs): + lpretrained = lpretrained_vocabulary = none_dict(langs) + 
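+    # (descriptive note) both dictionaries default to {lang: None}; they are only filled in when
+    # --pretrained is set, so callers can uniformly test for None to know whether MUSE embeddings
+    # are available for a given language.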
if opt.pretrained: + lpretrained = load_muse_embeddings(we_path, langs, n_jobs=-1) + lpretrained_vocabulary = {l: lpretrained[l].vocabulary() for l in langs} + return lpretrained, lpretrained_vocabulary + + +# ---------------------------------------------------------------------------------------------------------------------- +def main(): + + method_name = set_method_name() + logfile = init_logfile(method_name, opt) + + # Loading the dataset + data = MultilingualDataset.load(opt.dataset) + # data.set_view(languages=['de', 'fr', 'sv', 'da', 'es', 'it']) + data.show_dimensions() + langs = data.langs() + l_devel_raw, l_devel_target = data.training(target_as_csr=True) + l_test_raw, l_test_target = data.test(target_as_csr=True) + + # Loading the MUSE pretrained embeddings (only if requested) + lpretrained, lpretrained_vocabulary = load_pretrained_embeddings(opt.we_path, langs) + + # Data preparation: indexing / splitting / embedding matrices (pretrained + supervised) / posterior probs + multilingual_index = MultilingualIndex() + multilingual_index.index(l_devel_raw, l_devel_target, l_test_raw, lpretrained_vocabulary) + multilingual_index.train_val_split(val_prop=0.2, max_val=2000, seed=opt.seed) + multilingual_index.embedding_matrices(lpretrained, opt.supervised) + if opt.posteriors: + lPtr, lPva, lPte = multilingual_index.posterior_probabilities(max_training_docs_by_lang=opt.svm_max_docs) + else: + lPtr, lPva, lPte = None, None, None + + # Model initialization + model = init_Net(data.num_categories(), multilingual_index) + + optim = init_optimizer(model, lr=opt.lr) + criterion = torch.nn.BCEWithLogitsLoss().cuda() + lr_scheduler = StepLR(optim, step_size=25, gamma=0.5) + batcher_train = Batch(opt.batch_size, batches_per_epoch=10, languages=langs, lpad=multilingual_index.l_pad()) + batcher_eval = Batch(opt.batch_size, batches_per_epoch=-1, languages=langs, lpad=multilingual_index.l_pad()) + + tinit = time() + create_if_not_exist(opt.checkpoint_dir) + early_stop = EarlyStopping(model, patience=opt.patience, checkpoint=f'{opt.checkpoint_dir}/{method_name}-{get_file_name(opt.dataset)}') + + l_train_index, l_train_target = multilingual_index.l_train() + l_val_index, l_val_target = multilingual_index.l_val() + l_test_index = multilingual_index.l_test_index() + + print('-'*80) + print('Start training') + for epoch in range(1, opt.nepochs + 1): + train(model, batcher_train, l_train_index, lPtr, l_train_target, tinit, logfile, criterion, optim, epoch, method_name) + lr_scheduler.step() # reduces the learning rate + + # validation + macrof1 = test(model, batcher_eval, l_val_index, lPva, l_val_target, tinit, epoch, logfile, criterion, 'va') + early_stop(macrof1, epoch) + if opt.test_each>0: + if (opt.plotmode and (epoch==1 or epoch%opt.test_each==0)) or (not opt.plotmode and epoch%opt.test_each==0 and epoch0: + print(f'running last {opt.val_epochs} training epochs on the validation set') + for val_epoch in range(1, opt.val_epochs + 1): + batcher_train.init_offset() + train(model, batcher_train, l_val_index, lPva, l_val_target, tinit, logfile, criterion, optim, epoch+val_epoch, method_name) + + # final test + print('Training complete: testing') + test(model, batcher_eval, l_test_index, lPte, l_test_target, tinit, epoch, logfile, criterion, 'te') + + +def get_lr(optimizer): + for param_group in optimizer.param_groups: + return param_group['lr'] + + +def train(model, batcher, ltrain_index, ltrain_posteriors, lytr, tinit, logfile, criterion, optim, epoch, method_name): + loss_history = [] + model.train() + 
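+    # (descriptive note) the batcher yields (token-index batch, posterior batch, target, language)
+    # tuples; gradients are clipped before each optimizer step, and the loss reported every
+    # --log-interval steps is the mean over the most recent log_interval batches.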
for idx, (batch, post, target, lang) in enumerate(batcher.batchify(ltrain_index, ltrain_posteriors, lytr)): + optim.zero_grad() + loss = criterion(model(batch, post, lang), target) + loss.backward() + clip_gradient(model) + optim.step() + loss_history.append(loss.item()) + + if idx % opt.log_interval == 0: + interval_loss = np.mean(loss_history[-opt.log_interval:]) + print(f'{opt.dataset} {method_name} Epoch: {epoch}, Step: {idx}, lr={get_lr(optim):.5f}, Training Loss: {interval_loss:.6f}') + + mean_loss = np.mean(interval_loss) + logfile.add_row(epoch=epoch, measure='tr_loss', value=mean_loss, timelapse=time() - tinit) + return mean_loss + + +def test(model, batcher, ltest_index, ltest_posteriors, lyte, tinit, epoch, logfile, criterion, measure_prefix): + model.eval() + langs = sorted(ltest_index.keys()) + predictions = {l:[] for l in langs} + yte_stacked = {l:[] for l in langs} + batcher.init_offset() + for batch, post, target, lang in tqdm(batcher.batchify(ltest_index, ltest_posteriors, lyte), desc='evaluation: '): + logits = model(batch, post, lang) + loss = criterion(logits, target).item() + prediction = predict(logits) + predictions[lang].append(prediction) + yte_stacked[lang].append(target.detach().cpu().numpy()) + + ly = {l:np.vstack(yte_stacked[l]) for l in langs} + ly_ = {l:np.vstack(predictions[l]) for l in langs} + l_eval = evaluate(ly, ly_) + metrics = [] + for lang in langs: + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + if measure_prefix=='te': + print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') + # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], + # (config['max_label_space'], classifier.best_components), + # config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, + # lang, macrof1, microf1, macrok, microk, '') + Mf1, mF1, MK, mk = np.mean(np.array(metrics), axis=0) + print(f'[{measure_prefix}] Averages: MF1, mF1, MK, mK [{Mf1:.5f}, {mF1:.5f}, {MK:.5f}, {mk:.5f}]') + + # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-macro-F1', value=Mf1, timelapse=tend) + # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-micro-F1', value=mf1, timelapse=tend) + # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-accuracy', value=acc, timelapse=tend) + # logfile.add_row(epoch=epoch, measure=f'{measure_prefix}-loss', value=loss, timelapse=tend) + + return Mf1 + + +# ---------------------------------------------------------------------------------------------------------------------- +if __name__ == '__main__': + + parser = argparse.ArgumentParser(description='Neural text classification with Word-Class Embeddings') + parser.add_argument('dataset', type=str, metavar='datasetpath', help=f'path to the pickled dataset') + parser.add_argument('--batch-size', type=int, default=50, metavar='int', help='input batch size (default: 100)') + parser.add_argument('--batch-size-test', type=int, default=250, metavar='int', help='batch size for testing (default: 250)') + parser.add_argument('--nepochs', type=int, default=200, metavar='int', help='number of epochs (default: 200)') + parser.add_argument('--patience', type=int, default=10, metavar='int', help='patience for early-stop (default: 10)') + parser.add_argument('--plotmode', action='store_true', default=False, help='in plot mode executes a long run in order ' + 'to generate enough data to produce trend plots (test-each should be >0. 
This mode is ' + 'used to produce plots, and does not perform an evaluation on the test set.') + parser.add_argument('--hidden', type=int, default=512, metavar='int', help='hidden lstm size (default: 512)') + parser.add_argument('--lr', type=float, default=1e-3, metavar='float', help='learning rate (default: 1e-3)') + parser.add_argument('--weight_decay', type=float, default=0, metavar='float', help='weight decay (default: 0)') + parser.add_argument('--sup-drop', type=float, default=0.5, metavar='[0.0, 1.0]', help='dropout probability for the supervised matrix (default: 0.5)') + parser.add_argument('--seed', type=int, default=1, metavar='int', help='random seed (default: 1)') + parser.add_argument('--svm-max-docs', type=int, default=1000, metavar='int', help='maximum number of documents by ' + 'language used to train the calibrated SVMs (only used if --posteriors is active)') + parser.add_argument('--log-interval', type=int, default=10, metavar='int', help='how many batches to wait before printing training status') + parser.add_argument('--log-file', type=str, default='../log/log.csv', metavar='str', help='path to the log csv file') + # parser.add_argument('--pickle-dir', type=str, default='../pickles', metavar='str', help=f'if set, specifies the path where to ' + # f'save/load the dataset pickled (set to None if you prefer not to retain the pickle file)') + parser.add_argument('--test-each', type=int, default=0, metavar='int', help='how many epochs to wait before invoking test (default: 0, only at the end)') + parser.add_argument('--checkpoint-dir', type=str, default='../checkpoint', metavar='str', help='path to the directory containing checkpoints') + parser.add_argument('--net', type=str, default='rnn', metavar='str', help=f'net, one in {allowed_nets}') + parser.add_argument('--pretrained', action='store_true', default=False, help='use MUSE pretrained embeddings') + parser.add_argument('--supervised', action='store_true', default=False, help='use supervised embeddings') + parser.add_argument('--posteriors', action='store_true', default=False, help='concatenate posterior probabilities to doc embeddings') + parser.add_argument('--learnable', type=int, default=0, metavar='int', help='dimension of the learnable embeddings (default 0)') + parser.add_argument('--val-epochs', type=int, default=1, metavar='int', help='number of training epochs to perform on the ' + 'validation set once training is over (default 1)') + parser.add_argument('--we-path', type=str, default='../embeddings', metavar='str', + help=f'path to MUSE pretrained embeddings') + parser.add_argument('--max-label-space', type=int, default=300, metavar='int', help='larger dimension allowed for the ' + 'feature-label embedding (if larger, then PCA with this number of components is applied ' + '(default 300)') + parser.add_argument('--force', action='store_true', default=False, help='do not check if this experiment has already been run') + parser.add_argument('--tunable', action='store_true', default=False, + help='pretrained embeddings are tunable from the begining (default False, i.e., static)') + + opt = parser.parse_args() + + assert torch.cuda.is_available(), 'CUDA not available' + assert not opt.plotmode or opt.test_each > 0, 'plot mode implies --test-each>0' + # if opt.pickle_dir: opt.pickle_path = join(opt.pickle_dir, f'{opt.dataset}.pickle') + torch.manual_seed(opt.seed) + + main() diff --git a/src/main_majorityvoting_cls.py b/src/main_majorityvoting_cls.py new file mode 100644 index 0000000..607c409 --- /dev/null +++ 
b/src/main_majorityvoting_cls.py @@ -0,0 +1,127 @@ +import os +from dataset_builder import MultilingualDataset +# from learning.learners import * +from learning.learners import FunnellingMultimodal +from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \ + TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder, FeatureSet2Posteriors, Voting +from util.evaluation import * +from optparse import OptionParser +from util.file import exists +from util.results import PolylingualClassificationResults +from sklearn.svm import SVC +from util.util import get_learner, get_params +from sklearn.linear_model import LogisticRegression, LogisticRegressionCV + +parser = OptionParser() + +parser.add_option("-d", "--dataset", dest="dataset", + help="Path to the multilingual dataset processed and stored in .pickle format", + default="../rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") + +parser.add_option("-o", "--output", dest="output", + help="Result file", type=str, default='./results/results.csv') + +parser.add_option("-P", "--probs", dest="probs", action='store_true', + help="Add posterior probabilities to the document embedding representation", default=False) + +parser.add_option("-S", "--supervised", dest="supervised", action='store_true', + help="Add supervised (Word-Class Embeddings) to the document embedding representation", default=False) + +parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', + help="Add pretrained MUSE embeddings to the document embedding representation", default=False) + +parser.add_option("-w", "--we-path", dest="we_path", + help="Path to the MUSE polylingual word embeddings", default='../embeddings') + +parser.add_option("-s", "--set_c", dest="set_c",type=float, + help="Set the C parameter", default=1) + +parser.add_option("-c", "--optimc", dest="optimc", action='store_true', + help="Optimize hyperparameters", default=False) + +parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, + help="Number of parallel jobs (default is -1, all)", default=-1) + +parser.add_option("-p", "--pca", dest="max_labels_S", type=int, + help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", + default=300) + +# parser.add_option("-u", "--upca", dest="max_labels_U", type=int, +# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." +# " If set to 0 it will automatically search for the best number of components", default=300) + +# parser.add_option("-a", dest="post_pca", +# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " +# "embedding space", default=False) + + +def get_learner(calibrate=False, kernel='linear'): + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') + + +def get_params(dense=False): + if not op.optimc: + return None + c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] + kernel = 'rbf' if dense else 'linear' + return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] + +####################################################################################################################### + + +if __name__ == '__main__': + (op, args) = parser.parse_args() + + assert exists(op.dataset), 'Unable to find file '+str(op.dataset) + assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' + assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' + + dataset_file = os.path.basename(op.dataset) + + results = PolylingualClassificationResults(op.output) + + data = MultilingualDataset.load(op.dataset) + data.show_dimensions() + + lXtr, lytr = data.training() + lXte, lyte = data.test() + + meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] + + result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' + + print(f'{result_id}') + + # text preprocessing + tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + lXtr = tfidfvectorizer.fit_transform(lXtr, lytr) + lXte = tfidfvectorizer.transform(lXte) + lV = tfidfvectorizer.vocabulary() + + classifiers = [] + if op.probs: + classifiers.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) + if op.supervised: + classifiers.append(FeatureSet2Posteriors(WordClassEmbedder(max_label_space=op.max_labels_S))) + if op.pretrained: + classifiers.append(FeatureSet2Posteriors(MuseEmbedder(op.we_path, lV=lV))) + + classifier = Voting(*classifiers) + + print('# Fitting ...') + classifier.fit(lXtr, lytr) + + print('\n# Evaluating ...') + l_eval = evaluate_method(classifier, lXte, lyte) + + metrics = [] + for lang in lXte.keys(): + macrof1, microf1, macrok, microk = l_eval[lang] + metrics.append([macrof1, microf1, macrok, microk]) + print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') + # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], + # (config['max_label_space'], classifier.best_components), + # config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, + # lang, macrof1, microf1, macrok, microk, '') + print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/main_multimodal_cls.py b/src/main_multimodal_cls.py index 9378d92..a5224ab 100644 --- a/src/main_multimodal_cls.py +++ b/src/main_multimodal_cls.py @@ -1,27 +1,19 @@ import os from dataset_builder import MultilingualDataset -# from learning.learners import * -from learning.learners import FunnellingMultimodal -from learning.transformers import Funnelling, PosteriorProbabilitiesEmbedder, MetaClassifier, \ - TfidfVectorizerMultilingual, DocEmbedderList, WordClassEmbedder, MuseEmbedder +from learning.transformers import * from util.evaluation import * from optparse import OptionParser from util.file import exists from util.results import PolylingualClassificationResults from sklearn.svm import SVC -from util.util import get_learner, get_params -from sklearn.linear_model import LogisticRegression, LogisticRegressionCV -parser = OptionParser() -parser.add_option("-d", "--dataset", dest="dataset", - help="Path to the multilingual dataset processed and stored in .pickle format", - default="/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle") +parser = OptionParser(usage="usage: %prog datapath [options]") parser.add_option("-o", "--output", dest="output", help="Result file", type=str, default='./results/results.csv') -parser.add_option("-P", "--probs", dest="probs", action='store_true', +parser.add_option("-P", "--posteriors", dest="posteriors", action='store_true', help="Add posterior probabilities to the document embedding representation", 
default=False) parser.add_option("-S", "--supervised", dest="supervised", action='store_true', @@ -30,6 +22,16 @@ parser.add_option("-S", "--supervised", dest="supervised", action='store_true', parser.add_option("-U", "--pretrained", dest="pretrained", action='store_true', help="Add pretrained MUSE embeddings to the document embedding representation", default=False) +parser.add_option("--nol2", dest="nol2", action='store_true', + help="Deactivates l2 normalization as a post-processing for the document embedding views", default=False) + +parser.add_option("--allprob", dest="allprob", action='store_true', + help="All views are generated as posterior probabilities. This affects the supervised and pretrained " + "embeddings, for which a calibrated classifier is generated, which generates the posteriors", default=False) + +parser.add_option("--feat-weight", dest="feat_weight", + help="Term weighting function to weight the averaged embeddings", type=str, default='tfidf') + parser.add_option("-w", "--we-path", dest="we_path", help="Path to the MUSE polylingual word embeddings", default='../embeddings') @@ -46,66 +48,61 @@ parser.add_option("-p", "--pca", dest="max_labels_S", type=int, help="If smaller than number of target classes, PCA will be applied to supervised matrix. ", default=300) -# parser.add_option("-u", "--upca", dest="max_labels_U", type=int, -# help="If smaller than Unsupervised Dimension, PCA will be applied to unsupervised matrix." -# " If set to 0 it will automatically search for the best number of components", default=300) - -# parser.add_option("-a", dest="post_pca", -# help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with " -# "embedding space", default=False) def get_learner(calibrate=False, kernel='linear'): return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, gamma='auto') - -def get_params(dense=False): - if not op.optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' if dense else 'linear' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - ####################################################################################################################### if __name__ == '__main__': (op, args) = parser.parse_args() - assert exists(op.dataset), 'Unable to find file '+str(op.dataset) + assert len(args)==1, 'required argument "datapath" missing (path to the pickled dataset)' + dataset = args[0] + assert exists(dataset), 'Unable to find file '+str(dataset) assert not (op.set_c != 1. 
and op.optimc), 'Parameter C cannot be defined along with optim_c option' - assert op.probs or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' + assert op.posteriors or op.supervised or op.pretrained, 'empty set of document embeddings is not allowed' + l2=(op.nol2==False) - dataset_file = os.path.basename(op.dataset) + dataset_file = os.path.basename(dataset) results = PolylingualClassificationResults(op.output) + allprob='Prob' if op.allprob else '' + result_id = f'{dataset_file}_ProbPost={op.posteriors}_{allprob}WCE={op.supervised}(PCA={op.max_labels_S})_{allprob}' \ + f'MUSE={op.pretrained}_weight={op.feat_weight}_l2={l2}{"_optimC" if op.optimc else ""}' + print(f'{result_id}') - data = MultilingualDataset.load(op.dataset) + data = MultilingualDataset.load(dataset) data.show_dimensions() - lXtr, lytr = data.training() lXte, lyte = data.test() - meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] - - result_id = f'{dataset_file}_Prob{op.probs}_WCE{op.supervised}(PCA{op.max_labels_S})_MUSE{op.pretrained}{"_optimC" if op.optimc else ""}' - - print(f'{result_id}') - # text preprocessing tfidfvectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - # document embedding modules - doc_embedder = DocEmbedderList() - if op.probs: - doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True), first_tier_parameters=None)) + # feature weighting (for word embeddings average) + feat_weighting = FeatureWeight(op.feat_weight, agg='mean') + + # # document embedding modules + doc_embedder = DocEmbedderList(aggregation='concat') + if op.posteriors: + doc_embedder.append(PosteriorProbabilitiesEmbedder(first_tier_learner=get_learner(calibrate=True, kernel='linear'), l2=l2)) if op.supervised: - doc_embedder.append(WordClassEmbedder(max_label_space=op.max_labels_S)) + wce = WordClassEmbedder(max_label_space=op.max_labels_S, l2=l2, featureweight=feat_weighting) + if op.allprob: + wce = FeatureSet2Posteriors(wce, l2=l2) + doc_embedder.append(wce) if op.pretrained: - doc_embedder.append(MuseEmbedder(op.we_path)) + muse = MuseEmbedder(op.we_path, l2=l2, featureweight=feat_weighting) + if op.allprob: + muse = FeatureSet2Posteriors(muse, l2=l2) + doc_embedder.append(muse) # metaclassifier - meta = MetaClassifier(meta_learner=SVC(), meta_parameters=get_params(dense=True)) + meta_parameters = None if op.set_c != -1 else [{'C': [1, 1e3, 1e2, 1e1, 1e-1]}] + meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), meta_parameters=meta_parameters) # ensembling the modules classifier = Funnelling(vectorizer=tfidfvectorizer, first_tier=doc_embedder, meta=meta) @@ -123,6 +120,6 @@ if __name__ == '__main__': print(f'Lang {lang}: macro-F1={macrof1:.3f} micro-F1={microf1:.3f}') # results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'], # (config['max_label_space'], classifier.best_components), - # config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time, + # config['dim_reduction_unsupervised'], op.optimc, dataset.split('/')[-1], classifier.time, # lang, macrof1, microf1, macrok, microk, '') print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0)) diff --git a/src/models/helpers.py b/src/models/helpers.py new file mode 100755 index 0000000..93e5805 --- /dev/null +++ b/src/models/helpers.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + + +def init_embeddings(pretrained, 
vocab_size, learnable_length, device='cuda'): + pretrained_embeddings = None + pretrained_length = 0 + if pretrained is not None: + pretrained_length = pretrained.shape[1] + assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' + pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) + pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) + # pretrained_embeddings.to(device) + + learnable_embeddings = None + if learnable_length > 0: + learnable_embeddings = nn.Embedding(vocab_size, learnable_length) + # learnable_embeddings.to(device) + + embedding_length = learnable_length + pretrained_length + assert embedding_length > 0, '0-size embeddings' + + return pretrained_embeddings, learnable_embeddings, embedding_length + + +def embed(model, input, lang): + input_list = [] + if model.lpretrained_embeddings[lang]: + input_list.append(model.lpretrained_embeddings[lang](input)) + if model.llearnable_embeddings[lang]: + input_list.append(model.llearnable_embeddings[lang](input)) + return torch.cat(tensors=input_list, dim=2) + + +def embedding_dropout(input, drop_range, p_drop=0.5, training=True): + if p_drop > 0 and training and drop_range is not None: + p = p_drop + drop_from, drop_to = drop_range + m = drop_to - drop_from #length of the supervised embedding + l = input.shape[2] #total embedding length + corr = (1 - p) + input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p) + input /= (1 - (p * m / l)) + + return input diff --git a/src/models/lstm_class.py b/src/models/lstm_class.py new file mode 100755 index 0000000..6d2e242 --- /dev/null +++ b/src/models/lstm_class.py @@ -0,0 +1,96 @@ +#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py +import torch +import torch.nn as nn +from torch.autograd import Variable +from models.helpers import * + + +class RNNMultilingualClassifier(nn.Module): + + def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, + drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False): + + super(RNNMultilingualClassifier, self).__init__() + self.output_size = output_size + self.hidden_size = hidden_size + self.drop_embedding_range = drop_embedding_range + self.drop_embedding_prop = drop_embedding_prop + self.post_probabilities = post_probabilities + assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' + + self.lpretrained_embeddings = nn.ModuleDict() + self.llearnable_embeddings = nn.ModuleDict() + self.embedding_length = None + self.langs = sorted(lvocab_size.keys()) + self.only_post = only_post + + self.n_layers = 1 + self.n_directions = 1 + + self.dropout = nn.Dropout(0.2) + + lstm_out = 256 + ff1 = 512 + ff2 = 256 + + lpretrained_embeddings = {} + llearnable_embeddings = {} + if only_post==False: + for l in self.langs: + pretrained = lpretrained[l] if lpretrained else None + pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( + pretrained, lvocab_size[l], learnable_length + ) + lpretrained_embeddings[l] = pretrained_embeddings + llearnable_embeddings[l] = learnable_embeddings + self.embedding_length = embedding_length + + # self.rnn = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) + self.rnn = nn.GRU(self.embedding_length, hidden_size) + self.linear0 = nn.Linear(hidden_size * 
self.n_directions, lstm_out) + self.lpretrained_embeddings.update(lpretrained_embeddings) + self.llearnable_embeddings.update(llearnable_embeddings) + + self.linear1 = nn.Linear(lstm_out, ff1) + self.linear2 = nn.Linear(ff1, ff2) + + if only_post: + self.label = nn.Linear(output_size, output_size) + elif post_probabilities: + self.label = nn.Linear(ff2+output_size, output_size) + else: + self.label = nn.Linear(ff2, output_size) + + + def forward(self, input, post, lang): + if self.only_post: + doc_embedding = post + else: + doc_embedding = self.transform(input, lang) + if self.post_probabilities: + doc_embedding = torch.cat([doc_embedding, post], dim=1) + + logits = self.label(doc_embedding) + return logits + + def transform(self, input, lang): + batch_size = input.shape[0] + input = embed(self, input, lang) + input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + input = input.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) + # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) + # output, (_, _) = self.lstm(input, (h_0, c_0)) + output, _ = self.rnn(input, h_0) + output = output[-1,:,:] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + output = self.dropout(F.relu(self.linear2(output))) + return output + + def finetune_pretrained(self): + for l in self.langs: + self.lpretrained_embeddings[l].requires_grad = True + self.lpretrained_embeddings[l].weight.requires_grad = True + diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py index d0902b5..a46ffb6 100644 --- a/src/transformers/StandardizeTransformer.py +++ b/src/transformers/StandardizeTransformer.py @@ -2,15 +2,24 @@ import numpy as np class StandardizeTransformer: - def __init__(self, axis=0): + def __init__(self, axis=0, range=None): + assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice' self.axis = axis - self.yetfit=False + self.yetfit = False + self.range = range def fit(self, X): print('fitting Standardizer') std=np.std(X, axis=self.axis, ddof=1) self.std = np.clip(std, 1e-5, None) self.mean = np.mean(X, axis=self.axis) + if self.range is not None: + ones = np.ones_like(self.std) + zeros = np.zeros_like(self.mean) + ones[self.range] = self.std[self.range] + zeros[self.range] = self.mean[self.range] + self.std = ones + self.mean = zeros self.yetfit=True print('done\n') return self diff --git a/src/util/common.py b/src/util/common.py new file mode 100755 index 0000000..3bf1386 --- /dev/null +++ b/src/util/common.py @@ -0,0 +1,367 @@ +import warnings +from sklearn.svm import SVC +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.model_selection import train_test_split +from embeddings.supervised import get_supervised_embeddings +from learning.transformers import PosteriorProbabilitiesEmbedder, TfidfVectorizerMultilingual +warnings.filterwarnings("ignore", category=DeprecationWarning) +import numpy as np +from tqdm import tqdm +import torch +from scipy.sparse import vstack, issparse + + +def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): + """ + Index (i.e., replaces word strings with numerical indexes) a list of string documents + :param data: list of string documents + :param vocab: a fixed mapping [str]->[int] of words to indexes + :param 
known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained + because they are anyway contained in a pre-trained embedding set that we know in advance) + :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words + :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep + :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that + are not in the original vocab but that are in the known_words + :return: + """ + indexes=[] + vocabsize = len(vocab) + unk_count = 0 + knw_count = 0 + out_count = 0 + pbar = tqdm(data, desc=f'indexing documents') + for text in pbar: + words = analyzer(text) + index = [] + for word in words: + if word in vocab: + idx = vocab[word] + else: + if word in known_words: + if word not in out_of_vocabulary: + out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary) + idx = out_of_vocabulary[word] + out_count += 1 + else: + idx = unk_index + unk_count += 1 + index.append(idx) + indexes.append(index) + knw_count += len(index) + pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' + f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') + return indexes + + +def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths)+np.std(lengths)) + + +def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i,indexes in enumerate(index_list): + index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length] + return index_list + + +class Index: + def __init__(self, devel_raw, devel_target, test_raw, lang): + self.lang = lang + self.devel_raw = devel_raw + self.devel_target = devel_target + self.test_raw = test_raw + + def index(self, pretrained_vocabulary, analyzer, vocabulary): + self.word2index = dict(vocabulary) + known_words = set(self.word2index.keys()) + if pretrained_vocabulary is not None: + known_words.update(pretrained_vocabulary) + + self.word2index['UNKTOKEN'] = len(self.word2index) + self.word2index['PADTOKEN'] = len(self.word2index) + self.unk_index = self.word2index['UNKTOKEN'] + self.pad_index = self.word2index['PADTOKEN'] + + # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) + self.out_of_vocabulary = dict() + self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) + self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) + + self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) + + print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') + + def train_val_split(self, val_prop, max_val, seed): + devel = self.devel_index + target = self.devel_target + devel_raw = self.devel_raw + + val_size = int(min(len(devel) * val_prop, max_val)) + + self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ + train_test_split( + devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True + ) + + print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') + + def 
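Editor's note: a quick illustration of pad() above, with made-up token indexes and assuming util.common is importable; shorter documents are left-padded with the PAD index and longer ones are truncated to max_pad_length:

from util.common import pad

docs = [[4, 8, 15], [16, 23, 42, 7, 99, 3]]
padded = pad(docs, pad_index=0, max_pad_length=5)
# padded == [[0, 0, 4, 8, 15], [16, 23, 42, 7, 99]]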
get_word_list(self): + def extract_word_list(word2index): + return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])] + + word_list = extract_word_list(self.word2index) + word_list += extract_word_list(self.out_of_vocabulary) + return word_list + + def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None): + print(f'[generating embedding matrix for lang {self.lang}]') + + self.wce_range = None + embedding_parts = [] + + if pretrained is not None: + print('\t[pretrained-matrix]') + word_list = self.get_word_list() + muse_embeddings = pretrained.extract(word_list) + embedding_parts.append(muse_embeddings) + del pretrained + + if supervised: + print('\t[supervised-matrix]') + F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn') + num_missing_rows = self.vocabsize - F.shape[0] + F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) + F = torch.from_numpy(F).float() + + offset = 0 + if embedding_parts: + offset = embedding_parts[0].shape[1] + self.wce_range = [offset, offset + F.shape[1]] + + embedding_parts.append(F) + + self.embedding_matrix = torch.cat(embedding_parts, dim=1) + + print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') + + +def none_dict(langs): + return {l:None for l in langs} + +class MultilingualIndex: + def __init__(self): #, add_language_trace=False): + self.l_index = {} + self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + # self.add_language_trace=add_language_trace + + def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary): + self.langs = sorted(l_devel_raw.keys()) + + #build the vocabularies + self.l_vectorizer.fit(l_devel_raw) + l_vocabulary = self.l_vectorizer.vocabulary() + l_analyzer = self.l_vectorizer.get_analyzer() + + for l in self.langs: + self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l) + self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l]) + + def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): + for l,index in self.l_index.items(): + index.train_val_split(val_prop, max_val, seed=seed) + + def embedding_matrices(self, lpretrained, supervised): + lXtr = self.get_lXtr() if supervised else none_dict(self.langs) + lYtr = self.l_train_target() if supervised else none_dict(self.langs) + for l,index in self.l_index.items(): + index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l]) + self.sup_range = index.wce_range + + # experimental... does it make sense to keep track of the language? i.e., to inform the network from which + # language does the data came from... 
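Editor's note: compose_embedding_matrix() therefore produces, per language, one matrix whose first columns are the pretrained (MUSE) vectors and whose last columns are the word-class embeddings; wce_range (exposed as sup_range) records where the supervised block lives so that embedding_dropout() can target it. A schematic sketch with hypothetical sizes:

import torch

vocabsize, muse_dim, n_classes = 5000, 300, 73       # made-up sizes
muse = torch.rand(vocabsize, muse_dim)                # stands for pretrained.extract(word_list)
wce = torch.rand(vocabsize, n_classes)                # stands for get_supervised_embeddings(Xtr, Ytr)
embedding_matrix = torch.cat([muse, wce], dim=1)      # vocabsize x (300 + 73)
wce_range = [muse_dim, muse_dim + n_classes]          # later used as drop_embedding_range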
+ # if self.add_language_trace and pretrained_embeddings is not None: + # print('adding language trace') + # lang_trace = torch.zeros(size=(vocabsize, len(self.langs))) + # lang_trace[:,i]=1 + # pretrained_embeddings = torch.cat([pretrained_embeddings, lang_trace], dim=1) + + + def posterior_probabilities(self, max_training_docs_by_lang=5000): + # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs + lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()} + lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()} + for l in self.langs: + n_elements = lXtr[l].shape[0] + if n_elements > max_training_docs_by_lang: + choice = np.random.permutation(n_elements)[:max_training_docs_by_lang] + lXtr[l] = lXtr[l][choice] + lYtr[l] = lYtr[l][choice] + + # train the posterior probabilities embedder + print('[posteriors] training a calibrated SVM') + learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto') + prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False) + prob_embedder.fit(lXtr, lYtr) + + # transforms the training, validation, and test sets into posterior probabilities + print('[posteriors] generating posterior probabilities') + lPtr = prob_embedder.transform(self.get_lXtr()) + lPva = prob_embedder.transform(self.get_lXva()) + lPte = prob_embedder.transform(self.get_lXte()) + + print('[posteriors] done') + return lPtr, lPva, lPte + + def get_lXtr(self): + if not hasattr(self, 'lXtr'): + self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()}) + return self.lXtr + + def get_lXva(self): + if not hasattr(self, 'lXva'): + self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()}) + return self.lXva + + def get_lXte(self): + if not hasattr(self, 'lXte'): + self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) + return self.lXte + + def l_vocabsize(self): + return {l:index.vocabsize for l,index in self.l_index.items()} + + def l_embeddings(self): + return {l:index.embedding_matrix for l,index in self.l_index.items()} + + def l_pad(self): + return {l: index.pad_index for l, index in self.l_index.items()} + + def l_train_index(self): + return {l: index.train_index for l, index in self.l_index.items()} + + def l_train_target(self): + return {l: index.train_target for l, index in self.l_index.items()} + + def l_val_index(self): + return {l: index.val_index for l, index in self.l_index.items()} + + def l_val_target(self): + return {l: index.val_target for l, index in self.l_index.items()} + + def l_test_index(self): + return {l: index.test_index for l, index in self.l_index.items()} + + def l_train(self): + return self.l_train_index(), self.l_train_target() + + def l_val(self): + return self.l_val_index(), self.l_val_target() + + + +class Batch: + def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500): + self.batchsize = batchsize + self.batches_per_epoch = batches_per_epoch + self.languages = languages + self.lpad=lpad + self.max_pad_length=max_pad_length + self.init_offset() + + def init_offset(self): + self.offset = {lang: 0 for lang in self.languages} + + def batchify(self, l_index, l_post, llabels): + langs = self.languages + l_num_samples = {l:len(l_index[l]) for l in langs} + + max_samples = max(l_num_samples.values()) + n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0) + if self.batches_per_epoch != -1 and self.batches_per_epoch < 
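Editor's note: posterior_probabilities() above boils down to the following pipeline (sketch only; lXtr and lYtr stand for the per-language tf-idf matrices and label matrices built by this class, possibly subsampled to max_training_docs_by_lang):

from sklearn.svm import SVC
from learning.transformers import PosteriorProbabilitiesEmbedder

svm = SVC(kernel='linear', probability=True, C=1, gamma='auto', random_state=1)
prob_embedder = PosteriorProbabilitiesEmbedder(svm, l2=False)
prob_embedder.fit(lXtr, lYtr)            # one calibrated classifier per language
lPtr = prob_embedder.transform(lXtr)     # {lang: n_docs x n_classes matrix of posteriors}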
n_batches: + n_batches = self.batches_per_epoch + + for b in range(n_batches): + for lang in langs: + index, labels = l_index[lang], llabels[lang] + offset = self.offset[lang] + if offset >= l_num_samples[lang]: + offset = 0 + limit = offset+self.batchsize + + batch_slice = slice(offset, limit) + batch = index[batch_slice] + batch_labels = labels[batch_slice].toarray() + + post = None + if l_post is not None: + post = torch.FloatTensor(l_post[lang][batch_slice]).cuda() + + batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length) + + batch = torch.LongTensor(batch).cuda() + target = torch.FloatTensor(batch_labels).cuda() + + self.offset[lang] = limit + + yield batch, post, target, lang + + +def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500): + langs = sorted(l_index.keys()) + nsamples = max([len(l_index[l]) for l in langs]) + nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) + for b in range(nbatches): + for lang in langs: + index, labels = l_index[lang], llabels[lang] + + if b * batchsize >= len(index): + continue + batch = index[b*batchsize:(b+1)*batchsize] + batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray() + post = None + if l_post is not None: + post = torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda() + batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length) + batch = torch.LongTensor(batch) + target = torch.FloatTensor(batch_labels) + yield batch.cuda(), post, target.cuda(), lang + + +def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500): + nsamples = len(index_list) + nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0) + for b in range(nbatches): + batch = index_list[b*batchsize:(b+1)*batchsize] + batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length) + batch = torch.LongTensor(batch) + yield batch.cuda() + + +def clip_gradient(model, clip_value=1e-1): + params = list(filter(lambda p: p.grad is not None, model.parameters())) + for p in params: + p.grad.data.clamp_(-clip_value, clip_value) + + +def predict(logits, classification_type='multilabel'): + if classification_type == 'multilabel': + prediction = torch.sigmoid(logits) > 0.5 + elif classification_type == 'singlelabel': + prediction = torch.argmax(logits, dim=1).view(-1, 1) + else: + print('unknown classification type') + + return prediction.detach().cpu().numpy() + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + + + + + diff --git a/src/util/csv_log.py b/src/util/csv_log.py new file mode 100755 index 0000000..8c11e36 --- /dev/null +++ b/src/util/csv_log.py @@ -0,0 +1,60 @@ +import os +import pandas as pd +pd.set_option('display.max_rows', 500) +pd.set_option('display.max_columns', 500) +pd.set_option('display.width', 1000) + + +class CSVLog: + + def __init__(self, file, columns=None, autoflush=True, verbose=False, overwrite=False): + self.file = file + self.autoflush = autoflush + self.verbose = verbose + if os.path.exists(file) and not overwrite: + self.tell('Loading existing file from {}'.format(file)) + self.df = pd.read_csv(file, sep='\t') + self.columns = sorted(self.df.columns.values.tolist()) + else: + self.tell('File {} does not exist or overwrite=True. 
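Editor's note: put together, the batching and training helpers above are meant to compose roughly as in the following hypothetical training step (model, optim, batcher and the l_* dictionaries are assumed to exist already):

import torch

criterion = torch.nn.BCEWithLogitsLoss()
for batch, post, target, lang in batcher.batchify(l_train_index, l_post_tr, l_train_target):
    optim.zero_grad()
    logits = model(batch, post, lang)
    loss = criterion(logits, target)
    loss.backward()
    clip_gradient(model)                 # clamp gradients to [-0.1, 0.1] by default
    optim.step()
    y_hat = predict(logits)              # hard multilabel decisions as a numpy array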
Creating new frame.'.format(file)) + assert columns is not None, 'columns cannot be None' + self.columns = sorted(columns) + dir = os.path.dirname(self.file) + if dir and not os.path.exists(dir): os.makedirs(dir) + self.df = pd.DataFrame(columns=self.columns) + self.defaults={} + + def already_calculated(self, **kwargs): + df = self.df + if df.shape[0]==0: + return False + if len(kwargs)==0: + kwargs = self.defaults + for key,val in kwargs.items(): + df = df.loc[df[key]==val] + if df.shape[0]==0: return False + return True + + def set_default(self, param, value): + self.defaults[param]=value + + def add_row(self, **kwargs): + for key in self.defaults.keys(): + if key not in kwargs: + kwargs[key]=self.defaults[key] + colums = sorted(list(kwargs.keys())) + values = [kwargs[col_i] for col_i in colums] + s = pd.Series(values, index=self.columns) + self.df = self.df.append(s, ignore_index=True) + if self.autoflush: self.flush() + # self.tell(s.to_string()) + self.tell(kwargs) + + def flush(self): + self.df.to_csv(self.file, index=False, sep='\t') + + def tell(self, msg): + if self.verbose: print(msg) + + + diff --git a/src/util/early_stop.py b/src/util/early_stop.py new file mode 100755 index 0000000..93544be --- /dev/null +++ b/src/util/early_stop.py @@ -0,0 +1,53 @@ +#adapted from https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py +import torch +from time import time +from util.file import create_if_not_exist + + +class EarlyStopping: + + def __init__(self, model, patience=20, verbose=True, checkpoint='./checkpoint.pt'): + # set patience to 0 or -1 to avoid stopping, but still keeping track of the best value and model parameters + self.patience_limit = patience + self.patience = patience + self.verbose = verbose + self.best_score = None + self.best_epoch = None + self.stop_time = None + self.checkpoint = checkpoint + self.model = model + self.STOP = False + + def __call__(self, watch_score, epoch): + + if self.STOP: return #done + + if self.best_score is None or watch_score >= self.best_score: + self.best_score = watch_score + self.best_epoch = epoch + self.stop_time = time() + if self.checkpoint: + self.print(f'[early-stop] improved, saving model in {self.checkpoint}') + torch.save(self.model, self.checkpoint) + else: + self.print(f'[early-stop] improved') + self.patience = self.patience_limit + else: + self.patience -= 1 + if self.patience == 0: + self.STOP = True + self.print(f'[early-stop] patience exhausted') + else: + if self.patience>0: # if negative, then early-stop is ignored + self.print(f'[early-stop] patience={self.patience}') + + def reinit_counter(self): + self.STOP = False + self.patience=self.patience_limit + + def restore_checkpoint(self): + return torch.load(self.checkpoint) + + def print(self, msg): + if self.verbose: + print(msg) diff --git a/src/util/evaluation.py b/src/util/evaluation.py index 2ad6af6..742ba7b 100644 --- a/src/util/evaluation.py +++ b/src/util/evaluation.py @@ -44,7 +44,7 @@ def evaluate_method(polylingual_method, lX, ly, predictor=None, soft=False, retu tinit=time.time() print('prediction for test') assert set(lX.keys()) == set(ly.keys()), 'inconsistent dictionaries in evaluate' - n_jobs = polylingual_method.n_jobs + n_jobs = polylingual_method.n_jobs if hasattr(polylingual_method, 'n_jobs') else -1 if predictor is None: predictor = polylingual_method.predict diff --git a/src/util/file.py b/src/util/file.py index 511fccf..a3d0a3a 100644 --- a/src/util/file.py +++ b/src/util/file.py @@ -2,6 +2,7 @@ from os import listdir, 
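Editor's note: EarlyStopping above implements a maximize-the-score protocol; it is called once per epoch with a validation metric, STOP flips to True when patience is exhausted, and the best checkpoint can then be reloaded. A hypothetical sketch (train_one_epoch and validation_f1 are placeholders, the checkpoint path is made up):

from util.early_stop import EarlyStopping

early_stop = EarlyStopping(model, patience=10, checkpoint='../checkpoints/model.pt')
for epoch in range(1, 301):
    train_one_epoch(model, optim)        # placeholder
    macrof1 = validation_f1(model)       # placeholder; higher is better
    early_stop(macrof1, epoch)
    if early_stop.STOP:
        break
model = early_stop.restore_checkpoint()  # parameters of the best validation epoch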
makedirs from os.path import isdir, isfile, join, exists, dirname #from sklearn.externals.six.moves import urllib import urllib +from pathlib import Path def download_file(url, archive_filename): @@ -36,4 +37,8 @@ def makedirs_if_not_exist(path): def create_if_not_exist(path): if not exists(path): makedirs(path) +def get_parent_name(path): + return Path(path).parent +def get_file_name(path): + return Path(path).name
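Editor's note: the two pathlib helpers added to util/file.py behave as follows (the path is made up; get_parent_name returns a Path object, get_file_name a plain string):

from util.file import get_parent_name, get_file_name

get_parent_name('../embeddings/wiki.multi.en.vec')   # -> Path('../embeddings')
get_file_name('../embeddings/wiki.multi.en.vec')     # -> 'wiki.multi.en.vec'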