standardization

Alejandro Moreo Fernandez 2019-11-29 18:14:14 +01:00
parent 601da33836
commit 499c6018c0
5 changed files with 53 additions and 20 deletions

View File

@@ -20,7 +20,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed",
                   help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings/')
+                  help="Path to the polylingual word embeddings", default='../embeddings/')
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
@@ -33,7 +33,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1)
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')

 def get_params(dense=False): # TODO kernel function could be useful for meta-classifier
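Switching the SVC to class_weight='balanced' reweights each class inversely to its frequency in the training set, which is relevant when category frequencies are skewed. A minimal sketch of the heuristic scikit-learn applies, with made-up toy labels:

    # Sketch of scikit-learn's 'balanced' heuristic: weight(c) = n_samples / (n_classes * count(c)).
    import numpy as np
    from sklearn.utils.class_weight import compute_class_weight

    y = np.array([0, 0, 0, 0, 1])  # toy, heavily imbalanced labels
    print(compute_class_weight('balanced', classes=np.array([0, 1]), y=y))
    # -> [0.625 2.5  ]  i.e. 5/(2*4) for the frequent class, 5/(2*1) for the rare one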
@@ -60,6 +60,7 @@ if __name__ == '__main__':
     data = MultilingualDataset.load(op.dataset)
     data.show_dimensions()
+    # data.set_view(languages=['en','it'], categories=list(range(10)))
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -96,8 +97,8 @@ if __name__ == '__main__':
     classifier = AndreaCLF(op.we_path,
                            config,
                            first_tier_learner=get_learner(calibrate=True),
-                           meta_learner=get_learner(calibrate=False),
-                           first_tier_parameters=get_params(dense=True),
+                           meta_learner=get_learner(calibrate=False, kernel='rbf'),
+                           first_tier_parameters=get_params(dense=False),
                            meta_parameters=get_params(dense=True),
                            n_jobs=op.n_jobs)

View File

@@ -1,9 +1,9 @@
 from __future__ import print_function
-import ijson
+# import ijson
+# from ijson.common import ObjectBuilder
 import os, sys
 from os.path import join
 from bz2 import BZ2File
-from ijson.common import ObjectBuilder
 import pickle
 from util.file import list_dirs, list_files, makedirs_if_not_exist
 from itertools import islice

View File

@@ -5,7 +5,6 @@ from sklearn.preprocessing import MultiLabelBinarizer
 from data.reader.jrcacquis_reader import *
 from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
 from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
-from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
 from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
 import pickle
 import numpy as np
@@ -357,6 +356,9 @@ def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years,
 # Dataset Generators
 # ----------------------------------------------------------------------------------------------------------------------
 def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
+    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
     """
     Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
     "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
@@ -439,6 +441,7 @@ def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
 def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
                          train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
+    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
     """
     Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
     "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.

View File

@@ -8,6 +8,9 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from data.supervised import zscores
+from transformers.StandardizeTransformer import StandardizeTransformer
+
+
 def _sort_if_sparse(X):
     if issparse(X) and not X.has_sorted_indices:
@@ -210,13 +213,13 @@ class NaivePolylingualClassifier:
         for lang in langs:
             _sort_if_sparse(lX[lang])
-        # models = Parallel(n_jobs=self.n_jobs)\
-        #     (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
-        models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
-        for model, lang in zip(models, langs):
-            model.fit(lX[lang], ly[lang])
+        models = Parallel(n_jobs=self.n_jobs)\
+            (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
+        #
+        # models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
+        #
+        # for model, lang in zip(models, langs):
+        #     model.fit(lX[lang], ly[lang])
         self.model = {lang: models[i] for i, lang in enumerate(langs)}
         self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
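This restores joblib's Parallel/delayed idiom: each language's classifier is fit independently, so the fits can be dispatched to separate worker processes. A runnable toy sketch of the same pattern, where a plain SVC and random data stand in for the project's MonolingualClassifier and per-language matrices:

    import numpy as np
    from joblib import Parallel, delayed
    from sklearn.svm import SVC

    langs = ['en', 'it']
    lX = {l: np.random.rand(20, 5) for l in langs}        # toy per-language features
    ly = {l: np.random.randint(0, 2, 20) for l in langs}  # toy per-language labels

    # delayed(clf.fit) ships the unfitted estimator to a worker; fit returns it fitted
    models = Parallel(n_jobs=2)(delayed(SVC().fit)(lX[lang], ly[lang]) for lang in langs)
    model_per_lang = {lang: models[i] for i, lang in enumerate(langs)}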
@@ -537,7 +540,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.vectorize(lX)
         for lang in self.languages:
-            print(lX[lang].shape)
+            print(f'{lang}->{lX[lang].shape}')

         Z, zy = self._get_zspace(lX, ly)
@@ -552,12 +555,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             for lang in list(lX.keys()):
                 Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
             Z = Z_embedded
+            del Z_embedded

         # stacking Z space vertically
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
+        self.standardizer = StandardizeTransformer()
+        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)

         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
                                            n_jobs=self.n_jobs)
@@ -578,10 +584,10 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             Z_embedded = dict()
             for lang in lX.keys():
                 Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
-                print(Z_embedded[lang].shape)
-        return _joblib_transform_multiling(self.model.predict, Z_embedded, n_jobs=self.n_jobs)
+            lZ = Z_embedded
+
+        for lang in lZ.keys():
+            print(lZ[lang].shape)
+            lZ[lang] = self.standardizer.predict(lZ[lang])
+
+        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)

View File

@@ -0,0 +1,23 @@
+import numpy as np
+
+
+class StandardizeTransformer:
+    def __init__(self, axis=0):
+        self.axis = axis
+        self.yetfit = False
+
+    def fit(self, X):
+        print('fitting Standardizer')
+        std = np.std(X, axis=self.axis, ddof=1)
+        self.std = np.clip(std, 1e-5, None)  # avoid division by ~0 on near-constant columns
+        self.mean = np.mean(X, axis=self.axis)
+        self.yetfit = True
+        print('done')
+        return self
+
+    def predict(self, X):
+        assert self.yetfit, 'transform called before fit'
+        return (X - self.mean) / self.std
+
+    def fit_predict(self, X):
+        return self.fit(X).predict(X)
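A short usage sketch mirroring how AndreaCLF uses this transformer: the mean and standard deviation are estimated once on the vertically stacked Z-space, then the same statistics are reused per language at prediction time. Language set and shapes are toy values:

    import numpy as np

    # StandardizeTransformer is the class defined in the new file above
    Z = {'en': np.random.rand(10, 4), 'it': np.random.rand(8, 4)}       # toy Z-space posteriors
    standardizer = StandardizeTransformer()
    vertical_Z = standardizer.fit_predict(np.vstack(list(Z.values())))  # zero mean, unit variance per column
    Z_std = {lang: standardizer.predict(Z[lang]) for lang in Z}         # same mean/std applied per language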