From dd34a96f87c77f9335fd649c3554859ed5c74736 Mon Sep 17 00:00:00 2001
From: andrea
Date: Thu, 12 Dec 2019 11:18:38 +0100
Subject: [PATCH] Save n_components when auto-optimizing PCA; drop unnecessary
 columns from the results CSV

---
 src/FPEC_andrea.py       |  48 ++++++--
 src/data/embeddings.py   |  30 ++---
 src/learning/learners.py | 260 ++++++++++++++++++++-------------------
 src/util/results.py      |   7 +-
 4 files changed, 189 insertions(+), 156 deletions(-)

diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 1618c33..16934df 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -35,16 +35,22 @@ parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
 parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
                   help="Number of parallel jobs (default is -1, all)", default=-1)
 
-parser.add_option("-p", "--pca", dest="max_labels", type=int,
-                  help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it"
-                       " will automatically search for the best number of components", default=300)
+parser.add_option("-p", "--pca", dest="max_labels_S", type=int,
+                  help="If smaller than the number of target classes, PCA will be applied to the supervised matrix. "
+                       "If set to 0, it will automatically search for the best number of components. "
+                       "If set to -1, it will apply PCA to the vertically stacked supervised matrix (PCA dim currently fixed to 50)",
+                  default=300)
 
 parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
-                  help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it"
-                       " will automatically search for the best number of components", default=300)
+                  help="If smaller than the unsupervised dimension, PCA will be applied to the unsupervised matrix."
+                       " If set to 0, it will automatically search for the best number of components", default=300)
 
 parser.add_option("-l", dest="lang", type=str)
 
+parser.add_option("-a", dest="post_pca",
+                  help="If set to True, will apply PCA to the z-space (posterior probabilities stacked along with "
+                       "the embedding space)", default=False)
+
 
 def get_learner(calibrate=False, kernel='linear'):
     return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1,
                class_weight='balanced', gamma='auto')
@@ -73,7 +79,7 @@ if __name__ == '__main__':
 
     data = MultilingualDataset.load(op.dataset)
     data.show_dimensions()
-    data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
+    # data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
     # data.set_view(languages=[op.lang])
     # data.set_view(categories=list(range(10)))
     lXtr, lytr = data.training()
@@ -114,12 +120,34 @@ if __name__ == '__main__':
 
     ##### TODO - config dict is redundant - we have already op argparse ...
     config['reduction'] = 'PCA'
-    config['max_label_space'] = op.max_labels
+    config['max_label_space'] = op.max_labels_S
     config['dim_reduction_unsupervised'] = op.max_labels_U
+    config['post_pca'] = op.post_pca
     # config['plot_covariance_matrices'] = True
 
     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
 
+    PLE_test = False
+    if PLE_test:
+        ple = PolylingualEmbeddingsClassifier(wordembeddings_path='/home/moreo/CLESA/PolylingualEmbeddings',
+                                              learner=get_learner(calibrate=False),
+                                              c_parameters=get_params(dense=False),
+                                              n_jobs=op.n_jobs)
+
+        print('# Fitting ...')
+        ple.fit(lXtr, lytr)
+
+        print('# Evaluating ...')
+        ple_eval = evaluate_method(ple, lXte, lyte)
+
+        metrics = []
+        for lang in lXte.keys():
+            macrof1, microf1, macrok, microk = ple_eval[lang]
+            metrics.append([macrof1, microf1, macrok, microk])
+            print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
+        print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
+
     print(f'### PolyEmbedd_andrea_{_config_id}\n')
     classifier = AndreaCLF(we_path=op.we_path,
                            config=config,
@@ -140,6 +168,8 @@ if __name__ == '__main__':
         macrof1, microf1, macrok, microk = l_eval[lang]
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
-        results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
-                        classifier.time, lang, macrof1, microf1, macrok, microk, '')
+        results.add_row('PolyEmbed_andrea', 'svm', _config_id, config['we_type'],
+                        (config['max_label_space'], classifier.best_components),
+                        config['dim_reduction_unsupervised'], op.optimc, op.dataset.split('/')[-1], classifier.time,
+                        lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index 66e830f..fb1f135 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -48,7 +48,7 @@ class WordEmbeddings:
             print('loading pkl in {}'.format(we_path + '.pkl'))
             (worddim, we) = pickle.load(open(we_path + '.pkl', 'rb'))
         else:
-            word_registry=set()
+            word_registry = set()
             lines = open(we_path).readlines()
             nwords, dims = [int(x) for x in lines[0].split()]
             print('reading we of {} dimensions'.format(dims))
@@ -61,13 +61,13 @@ class WordEmbeddings:
                 word, *vals = line.split()
                 wordp = word_preprocessor(word) if word_preprocessor is not None else word
                 if wordp:
-                    wordp=wordp[0]
+                    wordp = wordp[0]
                     if wordp in word_registry:
                         print('warning: word <{}> generates a duplicate <{}> after preprocessing'.format(word,wordp))
                     elif len(vals) == dims:
                         worddim[wordp] = index
                         we[index, :] = np.array(vals).astype(float)
-                        index+=1
+                        index += 1
                 # else:
                 #     print('warning: word <{}> generates an empty string after preprocessing'.format(word))
             we = we[:index]
@@ -151,7 +151,6 @@ class FastTextWikiNews(Vectors):
 
     def __init__(self, cache, language="en", **kwargs):
         url = self.url_base.format(language)
-        # name = self.path.format(language)
         name = cache + self._name.format(language)
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)
 
@@ -211,16 +210,11 @@ class StorageEmbeddings:
 
     def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
         for lang in docs.keys():
-            nC = self.lang_U[lang].shape[1]
            print(f'# [unsupervised-matrix {type}] for {lang}')
            voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
            self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
-            # if self.lang_U[lang].shape[1] > dim != 0:
-            #     print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
-            #           f' the allowed limit {dim}. Applying PCA(n_components={dim})')
-            #     pca = PCA(n_components=dim)
-            #     self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
            print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
+        nC = self.lang_U[lang].shape[1]
        if max_label_space == 0:
            print(f'Computing optimal number of PCA components along matrices U')
            optimal_n = get_optimal_dim(self.lang_U, 'U')
@@ -228,22 +222,28 @@ class StorageEmbeddings:
         elif max_label_space < nC:
             self.lang_U = run_pca(max_label_space, self.lang_U)
+        return
 
     def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
-        # if max_label_space == 0:
-        #     print('Computing optimal number of PCA components along matrices S...')
-        #     optimal_n = self.get_optimal_supervised_components(docs, labels)
-        #     max_label_space = optimal_n
         for lang in docs.keys():
             # compute supervised matrices S - then apply PCA
-            nC = self.lang_S[lang].shape[1]
             print(f'# [supervised-matrix] for {lang}')
             self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space,
                                                           voc[lang], lang)
+            nC = self.lang_S[lang].shape[1]
             print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
 
         if max_label_space == 0:
+            print(f'Computing optimal number of PCA components along matrices S')
             optimal_n = get_optimal_dim(self.lang_S, 'S')
             self.lang_S = run_pca(optimal_n, self.lang_S)
+        elif max_label_space == -1:
+            print(f'Computing PCA on vertical stacked WCE embeddings')
+            languages = self.lang_S.keys()
+            _temp_stack = np.vstack([self.lang_S[lang] for lang in languages])
+            stacked_pca = PCA(n_components=50)
+            stacked_pca.fit(_temp_stack)
+            for lang in languages:
+                self.lang_S[lang] = stacked_pca.transform(self.lang_S[lang])
         elif max_label_space < nC:
             self.lang_S = run_pca(max_label_space, self.lang_S)
 
diff --git a/src/learning/learners.py b/src/learning/learners.py
index 96e200c..1d119e3 100644
--- a/src/learning/learners.py
+++ b/src/learning/learners.py
@@ -8,7 +8,7 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
-# from sklearn.decomposition import PCA
+from sklearn.decomposition import PCA
 
 
 def _sort_if_sparse(X):
@@ -214,11 +214,6 @@ class NaivePolylingualClassifier:
         models = Parallel(n_jobs=self.n_jobs)\
             (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
-        #
-        # models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
-        #
-        # for model, lang in zip(models, langs):
-        #     model.fit(lX[lang], ly[lang])
 
         self.model = {lang: models[i] for i, lang in enumerate(langs)}
         self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
@@ -329,6 +324,131 @@ class MonolingualClassifier:
         return self.best_params_
 
 
+class AndreaCLF(FunnellingPolylingualClassifier):
+    def __init__(self,
+                 we_path,
+                 config,
+                 first_tier_learner,
+                 meta_learner,
+                 first_tier_parameters=None,
+                 meta_parameters=None,
+                 folded_projections=1,
+                 calmode='cal',
+                 n_jobs=-1):
+
+        super().__init__(first_tier_learner,
+                         meta_learner,
+                         first_tier_parameters,
+                         meta_parameters,
+                         folded_projections,
+                         calmode,
+                         n_jobs)
+
+        self.pca_independent_space = PCA(n_components=50)
+        self.we_path = we_path
+        self.config = config
+        self.lang_word2idx = dict()
+        self.languages = []
+        self.lang_tfidf = {}
+        self.embedding_space = None
+        self.model = None
+        self.time = None
+        self.best_components = None  # if auto-optimizing PCA, stores the optimal number of components
+
+    def vectorize(self, lX, prediction=False):
+        langs = list(lX.keys())
+        print(f'# tfidf-vectorizing docs')
+        if prediction:
+
+            for lang in langs:
+                assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language'
+                tfidf_vectorizer = self.lang_tfidf[lang]
+                lX[lang] = tfidf_vectorizer.transform(lX[lang])
+            return self
+
+        for lang in langs:
+            tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True)
+            self.languages.append(lang)
+            tfidf_vectorizer.fit(lX[lang])
+            lX[lang] = tfidf_vectorizer.transform(lX[lang])
+            self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_
+            self.lang_tfidf[lang] = tfidf_vectorizer
+        return self
+
+    def _get_zspace(self, lXtr, lYtr):
+        print('\nfitting the projectors... {}'.format(list(lXtr.keys())))
+        self.doc_projector.fit(lXtr, lYtr)
+
+        print('\nprojecting the documents')
+        lZ = self._projection(self.doc_projector, lXtr)
+
+        return lZ, lYtr
+
+    def fit(self, lX, ly):
+        tinit = time.time()
+        print('Vectorizing documents...')
+        self.vectorize(lX)
+
+        for lang in self.languages:
+            print(f'{lang}->{lX[lang].shape}')
+
+        Z, zy = self._get_zspace(lX, ly)
+
+        if self.config['supervised'] or self.config['unsupervised']:
+            self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
+            _embedding_space = self.embedding_space.predict(self.config, lX)
+            if self.config['max_label_space'] == 0:
+                if _embedding_space.shape[1] - 300 > 0:
+                    _temp = _embedding_space.shape[1] - 300
+                else:
+                    _temp = _embedding_space.shape[1]
+                self.best_components = _temp
+            # h_stacking posterior probabilities with (U) and/or (S) matrices
+            for lang in self.languages:
+                Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
+
+        # stacking Z space vertically
+        _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
+        _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
+
+        self.standardizer = StandardizeTransformer()
+        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+
+        # todo testing ...
+        if self.config['post_pca']:
+            print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
+            self.pca_independent_space.fit(_vertical_Z)
+            _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
+
+        print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
+        self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
+                                           n_jobs=self.n_jobs)
+        self.model.fit(_vertical_Z, _vertical_Zy)
+        self.time = time.time() - tinit
+        print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
+
+    def predict(self, lX, ly):
+        print('Vectorizing documents')
+        self.vectorize(lX, prediction=True)
+        lZ = self._projection(self.doc_projector, lX)
+
+        if self.config['supervised'] or self.config['unsupervised']:
+            _embedding_space = self.embedding_space.predict(self.config, lX)
+
+            for lang in lX.keys():
+                lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
+
+        for lang in lZ.keys():
+            print(lZ[lang].shape)
+            # todo testing
+            lZ[lang] = self.standardizer.predict(lZ[lang])
+            if self.config['post_pca']:
+                print(f'Applying PCA({"dim ?? TODO"}) to Z-space ...')
TODO"}) to Z-space ...') + lZ[lang] = self.pca_independent_space.transform(lZ[lang]) + + return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) + + class PolylingualEmbeddingsClassifier: """ This classifier creates document embeddings by a tfidf weighted average of polylingual embeddings from the article @@ -395,24 +515,21 @@ class PolylingualEmbeddingsClassifier: langs = list(lX.keys()) WEtr, Ytr = [], [] self.fit_vectorizers(lX) # if already fit, does nothing - _lX = dict() for lang in langs: - _lX[lang] = self.lang_tfidf[lang].transform(lX[lang]) WEtr.append(self.embed(lX[lang], lang)) Ytr.append(ly[lang]) - # TODO @Andrea --> here embeddings should be stacked horizontally! WEtr = np.vstack(WEtr) Ytr = np.vstack(Ytr) self.embed_time = time.time() - tinit print('fitting the WE-space of shape={}'.format(WEtr.shape)) self.model = MonolingualClassifier(base_learner=self.learner, parameters=self.c_parameters, n_jobs=self.n_jobs) - self.model.fit(_lX['da'], ly['da']) + self.model.fit(WEtr, Ytr) self.time = time.time() - tinit return self - def predict(self, lX): + def predict(self, lX, lY): """ :param lX: a dictionary {language_label: [list of preprocessed documents]} """ @@ -427,123 +544,8 @@ class PolylingualEmbeddingsClassifier: """ assert self.model is not None, 'predict called before fit' langs = list(lX.keys()) - # lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory - return _joblib_transform_multiling(self.model.predict_proba, self.lang_tfidf['da'], n_jobs=self.n_jobs) + lWEte = {lang:self.embed(lX[lang], lang) for lang in langs} # parallelizing this may consume too much memory + return _joblib_transform_multiling(self.model.predict_proba, lWEte, n_jobs=self.n_jobs) def best_params(self): - return self.model.best_params() - - -class AndreaCLF(FunnellingPolylingualClassifier): - def __init__(self, - we_path, - config, - first_tier_learner, - meta_learner, - first_tier_parameters=None, - meta_parameters=None, - folded_projections=1, - calmode='cal', - n_jobs=-1): - - super().__init__(first_tier_learner, - meta_learner, - first_tier_parameters, - meta_parameters, - folded_projections, - calmode, - n_jobs) - - self.pca_independent_space = PCA(n_components=100) - self.we_path = we_path - self.config = config - self.lang_word2idx = dict() - self.languages = [] - self.lang_tfidf = {} - self.embedding_space = None - self.model = None - self.time = None - - def vectorize(self, lX, prediction=False): - langs = list(lX.keys()) - print(f'# tfidf-vectorizing docs') - if prediction: - for lang in langs: - assert lang in self.lang_tfidf.keys(), 'no tf-idf for given language' - tfidf_vectorizer = self.lang_tfidf[lang] - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - return self - - for lang in langs: - tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, use_idf=True) - self.languages.append(lang) - tfidf_vectorizer.fit(lX[lang]) - lX[lang] = tfidf_vectorizer.transform(lX[lang]) - self.lang_word2idx[lang] = tfidf_vectorizer.vocabulary_ - self.lang_tfidf[lang] = tfidf_vectorizer - return self - - # @override std class method - def _get_zspace(self, lXtr, lYtr): - print('\nfitting the projectors... 
-        self.doc_projector.fit(lXtr, lYtr)
-
-        print('\nprojecting the documents')
-        lZ = self._projection(self.doc_projector, lXtr)
-
-        return lZ, lYtr
-
-    # @override std class method
-    def fit(self, lX, ly):
-        tinit = time.time()
-        print('Vectorizing documents...')
-        self.vectorize(lX)
-
-        for lang in self.languages:
-            print(f'{lang}->{lX[lang].shape}')
-
-        Z, zy = self._get_zspace(lX, ly)
-
-        if self.config['supervised'] or self.config['unsupervised']:
-            self.embedding_space = StorageEmbeddings(self.we_path).fit(self.config, lX, self.lang_word2idx, ly)
-            _embedding_space = self.embedding_space.predict(self.config, lX)
-            # h_stacking posterior probabilities with (U) and/or (S) matrices
-            for lang in self.languages:
-                Z[lang] = np.hstack((Z[lang], _embedding_space[lang]))
-
-        # stacking Z space vertically
-        _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
-        _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
-
-        # todo testing ...
-        # self.pca_independent_space.fit(_vertical_Z)
-        # _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
-
-        self.standardizer = StandardizeTransformer()
-        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
-
-        print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
-        self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
-                                           n_jobs=self.n_jobs)
-        self.model.fit(_vertical_Z, _vertical_Zy)
-        self.time = time.time() - tinit
-        print(f'\nTotal training time elapsed: {round((self.time/60), 2)} min')
-
-    def predict(self, lX, ly):
-        print('Vectorizing documents')
-        self.vectorize(lX, prediction=True)
-        lZ = self._projection(self.doc_projector, lX)
-
-        if self.config['supervised'] or self.config['unsupervised']:
-            _embedding_space = self.embedding_space.predict(self.config, lX)
-
-            for lang in lX.keys():
-                lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
-
-        for lang in lZ.keys():
-            print(lZ[lang].shape)
-            # todo testing
-            # lZ[lang] = self.pca_independent_space.transform(lZ[lang])
-            lZ[lang] = self.standardizer.predict(lZ[lang])
-
-        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
+        return self.model.best_params()
\ No newline at end of file
diff --git a/src/util/results.py b/src/util/results.py
index 7c25bec..a889e6d 100644
--- a/src/util/results.py
+++ b/src/util/results.py
@@ -5,7 +5,8 @@ import numpy as np
 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['method', 'learner', 'embed', 'embed_type', 'pca_s', 'pca_u', 'optimp', 'dataset', 'time',
+                        'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):
@@ -20,8 +21,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()
 
-    def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([method, learner, embed, embed_type, pca_s, pca_u, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())
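
Note: the new max_label_space == -1 branch in _add_emebeddings_supervised fits a single PCA on the vertically stacked per-language WCE matrices and then projects every language with it, so all languages land in the same 50-dimensional space. Below is a minimal standalone sketch of that idea only; the dictionary of random matrices and their shapes are made-up stand-ins for the real per-language supervised matrices.

# Sketch of the stacked-PCA reduction used when max_label_space == -1.
# Toy shapes only: each value stands in for a per-language WCE matrix (rows x categories).
import numpy as np
from sklearn.decomposition import PCA

lang_S = {
    'en': np.random.rand(1000, 73),
    'it': np.random.rand(800, 73),
    'pt': np.random.rand(600, 73),
}

# Fit one PCA on the vertical stack so every language shares the same projection ...
stacked = np.vstack([lang_S[lang] for lang in lang_S])
stacked_pca = PCA(n_components=50)
stacked_pca.fit(stacked)

# ... then transform each language's matrix with it.
lang_S = {lang: stacked_pca.transform(S) for lang, S in lang_S.items()}
for lang, S in lang_S.items():
    print(lang, S.shape)  # every language now has 50 columns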
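
Note: the results CSV loses the 'id' column and gains 'pca_s' and 'pca_u'; FPEC_andrea.py now logs the supervised setting as the tuple (config['max_label_space'], classifier.best_components), i.e. the requested value together with the number of components actually chosen when auto-optimizing, while 'pca_u' records the unsupervised reduction setting. A hypothetical call against the new add_row() signature is sketched below; every value is a placeholder, and the import assumes src/ is on the path.

# Illustrative only: how a row is logged with the new schema (all values are placeholders).
from util.results import PolylingualClassificationResults

results = PolylingualClassificationResults('results.csv')
results.add_row(method='PolyEmbed_andrea', learner='svm',
                embed='config_id_placeholder', embed_type='wiki',
                pca_s=(0, 173),        # (requested max_label_space, best_components found)
                pca_u=300,             # unsupervised reduction setting
                optimp=True, dataset='dataset_placeholder.pickle', time=0.0, lang='en',
                macrof1=0.0, microf1=0.0, macrok=0.0, microk=0.0, notes='')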