From 499c6018c0e468770432c2d89ac459b1f14e6e23 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Fri, 29 Nov 2019 18:14:14 +0100 Subject: [PATCH] standardization --- src/FPEC_andrea.py | 9 +++--- src/data/reader/wikipedia_tools.py | 4 +-- src/dataset_builder.py | 5 +++- src/learning/learners.py | 32 +++++++++++++--------- src/transformers/StandardizeTransformer.py | 23 ++++++++++++++++ 5 files changed, 53 insertions(+), 20 deletions(-) create mode 100644 src/transformers/StandardizeTransformer.py diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py index ed203ce..4decdf6 100644 --- a/src/FPEC_andrea.py +++ b/src/FPEC_andrea.py @@ -20,7 +20,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed", help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none') parser.add_option("-w", "--we-path", dest="we_path", - help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings/') + help="Path to the polylingual word embeddings", default='../embeddings/') parser.add_option("-s", "--set_c", dest="set_c",type=float, help="Set the C parameter", default=1) @@ -33,7 +33,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int, def get_learner(calibrate=False, kernel='linear'): - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1) + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced') def get_params(dense=False): # TODO kernel function could be useful for meta-classifier @@ -60,6 +60,7 @@ if __name__ == '__main__': data = MultilingualDataset.load(op.dataset) data.show_dimensions() + # data.set_view(languages=['en','it'], categories=list(range(10))) lXtr, lytr = data.training() lXte, lyte = data.test() @@ -96,8 +97,8 @@ if __name__ == '__main__': classifier = AndreaCLF(op.we_path, config, first_tier_learner=get_learner(calibrate=True), - meta_learner=get_learner(calibrate=False), - first_tier_parameters=get_params(dense=True), + meta_learner=get_learner(calibrate=False, kernel='rbf'), + first_tier_parameters=get_params(dense=False), meta_parameters=get_params(dense=True), n_jobs=op.n_jobs) diff --git a/src/data/reader/wikipedia_tools.py b/src/data/reader/wikipedia_tools.py index f30816c..83e11e3 100644 --- a/src/data/reader/wikipedia_tools.py +++ b/src/data/reader/wikipedia_tools.py @@ -1,9 +1,9 @@ from __future__ import print_function -import ijson +# import ijson +# from ijson.common import ObjectBuilder import os, sys from os.path import join from bz2 import BZ2File -from ijson.common import ObjectBuilder import pickle from util.file import list_dirs, list_files, makedirs_if_not_exist from itertools import islice diff --git a/src/dataset_builder.py b/src/dataset_builder.py index c28fbcf..3f6732c 100644 --- a/src/dataset_builder.py +++ b/src/dataset_builder.py @@ -5,7 +5,6 @@ from sklearn.preprocessing import MultiLabelBinarizer from data.reader.jrcacquis_reader import * from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy -from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents import pickle import numpy as np @@ -357,6 +356,9 @@ def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, # Dataset Generators # 
---------------------------------------------------------------------------------------------------------------------- def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0): + from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample + + """ Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. @@ -439,6 +441,7 @@ def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs, train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0): + from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample """ Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. diff --git a/src/learning/learners.py b/src/learning/learners.py index 77895ce..5a8f07e 100644 --- a/src/learning/learners.py +++ b/src/learning/learners.py @@ -8,6 +8,9 @@ from sklearn.model_selection import KFold from joblib import Parallel, delayed from sklearn.feature_extraction.text import TfidfVectorizer +from data.supervised import zscores +from transformers.StandardizeTransformer import StandardizeTransformer + def _sort_if_sparse(X): if issparse(X) and not X.has_sorted_indices: @@ -210,13 +213,13 @@ class NaivePolylingualClassifier: for lang in langs: _sort_if_sparse(lX[lang]) - # models = Parallel(n_jobs=self.n_jobs)\ - # (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs) - - models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs] - - for model, lang in zip(models, langs): - model.fit(lX[lang], ly[lang]) + models = Parallel(n_jobs=self.n_jobs)\ + (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs) + # + # models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs] + # + # for model, lang in zip(models, langs): + # model.fit(lX[lang], ly[lang]) self.model = {lang: models[i] for i, lang in enumerate(langs)} self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs} @@ -537,7 +540,7 @@ class AndreaCLF(FunnellingPolylingualClassifier): self.vectorize(lX) for lang in self.languages: - print(lX[lang].shape) + print(f'{lang}->{lX[lang].shape}') Z, zy = self._get_zspace(lX, ly) @@ -552,12 +555,15 @@ class AndreaCLF(FunnellingPolylingualClassifier): for lang in list(lX.keys()): Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang])) Z = Z_embedded - del Z_embedded + # stacking Z space vertically _vertical_Z = np.vstack([Z[lang] for lang in self.languages]) _vertical_Zy = np.vstack([zy[lang] for lang in self.languages]) + self.standardizer = StandardizeTransformer() + _vertical_Z = self.standardizer.fit_predict(_vertical_Z) + print('fitting the Z-space of shape={}'.format(_vertical_Z.shape)) self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters, n_jobs=self.n_jobs) @@ -578,10 +584,10 @@ class AndreaCLF(FunnellingPolylingualClassifier): Z_embedded = dict() for lang in 
lX.keys():
             Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
-            print(Z_embedded[lang].shape)
-
-        return _joblib_transform_multiling(self.model.predict, Z_embedded, n_jobs=self.n_jobs)
-
+            lZ = Z_embedded
 
+        for lang in lZ.keys():
+            print(lZ[lang].shape)
+            lZ[lang] = self.standardizer.predict(lZ[lang])
+        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)

diff --git a/src/transformers/StandardizeTransformer.py b/src/transformers/StandardizeTransformer.py
new file mode 100644
index 0000000..381d6c1
--- /dev/null
+++ b/src/transformers/StandardizeTransformer.py
@@ -0,0 +1,23 @@
+import numpy as np
+
+class StandardizeTransformer:
+
+    def __init__(self, axis=0):
+        self.axis = axis
+        self.yetfit = False
+
+    def fit(self, X):
+        print('fitting Standardizer')
+        std = np.std(X, axis=self.axis, ddof=1)
+        self.std = np.clip(std, 1e-5, None)
+        self.mean = np.mean(X, axis=self.axis)
+        self.yetfit = True
+        print('done')
+        return self
+
+    def predict(self, X):
+        assert self.yetfit, 'transform called before fit'
+        return (X - self.mean) / self.std
+
+    def fit_predict(self, X):
+        return self.fit(X).predict(X)
\ No newline at end of file
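
Note: the hunks above fit the new StandardizeTransformer on the vertically stacked training Z-space and then reuse its statistics on each language's projections at prediction time. Below is a minimal usage sketch of that flow, assuming the class added by this patch (with src/ on the import path); the names Z_train and z_test_lang are illustrative stand-ins, not identifiers from the patch.

    import numpy as np
    from transformers.StandardizeTransformer import StandardizeTransformer

    # illustrative stand-ins for np.vstack([Z[lang] for lang in languages])
    # and for one language's Z-space projections at test time
    Z_train = np.random.rand(100, 20)
    z_test_lang = np.random.rand(10, 20)

    standardizer = StandardizeTransformer()            # column-wise (axis=0) standardization
    Z_train = standardizer.fit_predict(Z_train)        # learn mean/std on training data, then scale it
    z_test_lang = standardizer.predict(z_test_lang)    # reuse the training statistics on test data

Reusing the training mean/std at prediction time (rather than re-fitting per language) keeps the meta-classifier's input space consistent between fit and predict.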