standardization

Alejandro Moreo Fernandez 2019-11-29 18:14:14 +01:00
parent 601da33836
commit 499c6018c0
5 changed files with 53 additions and 20 deletions

View File

@@ -20,7 +20,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed",
                  help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
parser.add_option("-w", "--we-path", dest="we_path",
-                 help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings/')
+                 help="Path to the polylingual word embeddings", default='../embeddings/')
parser.add_option("-s", "--set_c", dest="set_c", type=float,
                  help="Set the C parameter", default=1)
@@ -33,7 +33,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs", type=int,
def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1)
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')

def get_params(dense=False):  # TODO: a kernel function could be useful for the meta-classifier
@@ -60,6 +60,7 @@ if __name__ == '__main__':
    data = MultilingualDataset.load(op.dataset)
    data.show_dimensions()
+   # data.set_view(languages=['en', 'it'], categories=list(range(10)))

    lXtr, lytr = data.training()
    lXte, lyte = data.test()
@@ -96,8 +97,8 @@ if __name__ == '__main__':
    classifier = AndreaCLF(op.we_path,
                           config,
                           first_tier_learner=get_learner(calibrate=True),
-                          meta_learner=get_learner(calibrate=False),
-                          first_tier_parameters=get_params(dense=True),
+                          meta_learner=get_learner(calibrate=False, kernel='rbf'),
+                          first_tier_parameters=get_params(dense=False),
                           meta_parameters=get_params(dense=True),
                           n_jobs=op.n_jobs)

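The learner changes above give every SVM class_weight='balanced', switch the meta-classifier to an RBF kernel, and switch the first tier to the sparse (dense=False) parameter grid. A minimal sketch, using only what is visible in this diff (op.set_c replaced by a plain argument so the snippet is self-contained), of how the two tiers end up configured:

from sklearn.svm import SVC

def get_learner(calibrate=False, kernel='linear', C=1.0):
    # class_weight='balanced' reweights each class inversely to its frequency,
    # which helps with skewed category distributions.
    return SVC(kernel=kernel, probability=calibrate, cache_size=1000,
               C=C, random_state=1, class_weight='balanced')

first_tier_learner = get_learner(calibrate=True)             # calibrated linear SVMs, one per language
meta_learner = get_learner(calibrate=False, kernel='rbf')    # RBF SVM fitted on the stacked Z-space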
View File

@@ -1,9 +1,9 @@
from __future__ import print_function
-import ijson
+# import ijson
+# from ijson.common import ObjectBuilder
import os, sys
from os.path import join
from bz2 import BZ2File
-from ijson.common import ObjectBuilder
import pickle
from util.file import list_dirs, list_files, makedirs_if_not_exist
from itertools import islice

View File

@@ -5,7 +5,6 @@ from sklearn.preprocessing import MultiLabelBinarizer
from data.reader.jrcacquis_reader import *
from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
-from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
import pickle
import numpy as np
@@ -357,6 +356,9 @@ def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years,
# Dataset Generators
# ----------------------------------------------------------------------------------------------------------------------
def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
+    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
    """
    Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
    "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
@@ -439,6 +441,7 @@ def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test
def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
                         train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
+    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
    """
    Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
    "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.

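Both hunks above move the wikipedia_tools import from module level into the functions that use it, the same motivation behind commenting out the ijson imports in the previous file: the module now imports cleanly even when that dependency chain is unavailable, and the cost is only paid when wikipedia matrices are actually built. A minimal, self-contained sketch of the deferred-import pattern (standard-library stand-in, illustrative name only):

def count_lines_compressed(path):
    # Deferred import: bz2 stands in here for a heavy or optional dependency
    # such as ijson; it is only loaded when this function is actually called,
    # so the enclosing module imports without it.
    from bz2 import BZ2File
    with BZ2File(path) as f:
        return sum(1 for _ in f)

One design note: in the hunks above the deferred import sits before the function docstring, which demotes the docstring to an ordinary string expression; placing it after the docstring would keep help() output intact.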
View File

@@ -8,6 +8,9 @@ from sklearn.model_selection import KFold
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import TfidfVectorizer
+from data.supervised import zscores
+from transformers.StandardizeTransformer import StandardizeTransformer

def _sort_if_sparse(X):
    if issparse(X) and not X.has_sorted_indices:
@@ -210,13 +213,13 @@ class NaivePolylingualClassifier:
        for lang in langs:
            _sort_if_sparse(lX[lang])

-       # models = Parallel(n_jobs=self.n_jobs)\
-       #     (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for lang in langs)
-       models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
-       for model, lang in zip(models, langs):
-           model.fit(lX[lang], ly[lang])
+       models = Parallel(n_jobs=self.n_jobs)\
+           (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for lang in langs)
+       # models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
+       # for model, lang in zip(models, langs):
+       #     model.fit(lX[lang], ly[lang])

        self.model = {lang: models[i] for i, lang in enumerate(langs)}
        self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs}
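The hunk above re-enables the joblib-parallel fit of one monolingual classifier per language, replacing the sequential loop. A minimal runnable sketch of the same pattern with placeholder data and estimators (scikit-learn in place of the project's MonolingualClassifier):

import numpy as np
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression

def fit_one(X, y):
    # each language gets its own, independently fitted classifier
    return LogisticRegression(max_iter=1000).fit(X, y)

lX = {'en': np.random.rand(20, 5), 'it': np.random.rand(20, 5)}
ly = {'en': np.random.randint(0, 2, 20), 'it': np.random.randint(0, 2, 20)}

langs = sorted(lX.keys())
models = Parallel(n_jobs=2)(delayed(fit_one)(lX[lang], ly[lang]) for lang in langs)
model = {lang: models[i] for i, lang in enumerate(langs)}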
@@ -537,7 +540,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
        self.vectorize(lX)

        for lang in self.languages:
-           print(lX[lang].shape)
+           print(f'{lang}->{lX[lang].shape}')

        Z, zy = self._get_zspace(lX, ly)
@@ -552,12 +555,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
            for lang in list(lX.keys()):
                Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
            Z = Z_embedded
+           del Z_embedded

        # stacking Z space vertically
        _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
        _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

+       self.standardizer = StandardizeTransformer()
+       _vertical_Z = self.standardizer.fit_predict(_vertical_Z)

        print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
        self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
                                           n_jobs=self.n_jobs)
@@ -578,10 +584,10 @@ class AndreaCLF(FunnellingPolylingualClassifier):
            Z_embedded = dict()
            for lang in lX.keys():
                Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
-               print(Z_embedded[lang].shape)
-           return _joblib_transform_multiling(self.model.predict, Z_embedded, n_jobs=self.n_jobs)
+           lZ = Z_embedded

        for lang in lZ.keys():
            print(lZ[lang].shape)
+           lZ[lang] = self.standardizer.predict(lZ[lang])

        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
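Taken together, the fit and predict hunks establish a simple contract: the standardizer is fitted once on the vertically stacked training Z-space, and the same mean/std is then applied to each language block at prediction time. A small numpy sketch of that contract (synthetic data, same clipping as the new transformer):

import numpy as np

rng = np.random.default_rng(0)
Z = {'en': rng.normal(5, 2, (10, 4)), 'it': rng.normal(-3, 0.5, (8, 4))}

stacked = np.vstack([Z[lang] for lang in sorted(Z)])    # fit-time: stack all languages
mean = stacked.mean(axis=0)
std = np.clip(stacked.std(axis=0, ddof=1), 1e-5, None)

Z_std = {lang: (Z[lang] - mean) / std for lang in Z}    # predict-time: shared statistics, applied per language
print({lang: Z_std[lang].shape for lang in Z_std})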

View File

@@ -0,0 +1,23 @@
+import numpy as np
+
+
+class StandardizeTransformer:
+
+    def __init__(self, axis=0):
+        self.axis = axis
+        self.yetfit = False
+
+    def fit(self, X):
+        print('fitting Standardizer')
+        std = np.std(X, axis=self.axis, ddof=1)
+        self.std = np.clip(std, 1e-5, None)  # guard against division by ~0 on (near-)constant features
+        self.mean = np.mean(X, axis=self.axis)
+        self.yetfit = True
+        print('done')
+        return self
+
+    def predict(self, X):
+        if not self.yetfit:
+            raise ValueError('transform called before fit')
+        return (X - self.mean) / self.std
+
+    def fit_predict(self, X):
+        return self.fit(X).predict(X)
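A quick usage sketch of the new transformer, assuming the module is importable as in the learner file above (from transformers.StandardizeTransformer import StandardizeTransformer); the matrices are synthetic stand-ins for the Z-space:

import numpy as np
from transformers.StandardizeTransformer import StandardizeTransformer

Z_train = np.random.rand(100, 8) * 10 + 3   # stand-in for the stacked training Z-space
Z_test = np.random.rand(20, 8) * 10 + 3     # stand-in for a per-language Z at prediction time

standardizer = StandardizeTransformer()
Z_train_std = standardizer.fit_predict(Z_train)   # column-wise zero mean, unit variance
Z_test_std = standardizer.predict(Z_test)         # reuses the training mean/std

print(Z_train_std.mean(axis=0).round(3))
print(Z_train_std.std(axis=0, ddof=1).round(3))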