standardization

parent 601da33836
commit 499c6018c0

@@ -20,7 +20,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed",
                   help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')

 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings/')
+                  help="Path to the polylingual word embeddings", default='../embeddings/')

 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)

@@ -33,7 +33,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,


 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1)
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')


 def get_params(dense=False): # TODO kernel function could be useful for meta-classifier

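For reference, scikit-learn's class_weight='balanced' rescales the C penalty of each class inversely to its frequency in the training labels, which matters when positive examples of a topic category are rare. A minimal sketch of the weights it derives, recomputed by hand on toy labels:

    import numpy as np

    # class_weight='balanced' uses w_c = n_samples / (n_classes * count_c)
    y = np.array([0, 0, 0, 0, 1])                        # toy, imbalanced binary labels
    classes, counts = np.unique(y, return_counts=True)
    weights = y.size / (classes.size * counts)           # -> [0.625, 2.5]
    print(dict(zip(classes.tolist(), weights.tolist()))) # {0: 0.625, 1: 2.5}
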
@@ -60,6 +60,7 @@ if __name__ == '__main__':
     data = MultilingualDataset.load(op.dataset)
     data.show_dimensions()

+    # data.set_view(languages=['en','it'], categories=list(range(10)))
     lXtr, lytr = data.training()
     lXte, lyte = data.test()

@@ -96,8 +97,8 @@ if __name__ == '__main__':
     classifier = AndreaCLF(op.we_path,
                            config,
                            first_tier_learner=get_learner(calibrate=True),
-                           meta_learner=get_learner(calibrate=False),
-                           first_tier_parameters=get_params(dense=True),
+                           meta_learner=get_learner(calibrate=False, kernel='rbf'),
+                           first_tier_parameters=get_params(dense=False),
                            meta_parameters=get_params(dense=True),
                            n_jobs=op.n_jobs)

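For context, switching the meta-learner to kernel='rbf' means the meta SVM now compares documents in the stacked posterior space with a Gaussian similarity rather than a dot product. A quick sketch checking scikit-learn's RBF kernel against the formula K(x, x') = exp(-gamma * ||x - x'||^2), with toy values:

    import numpy as np
    from sklearn.metrics.pairwise import rbf_kernel

    x = np.array([[0.2, 0.9]])      # toy rows of a posterior/Z space
    z = np.array([[0.7, 0.1]])
    gamma = 0.5
    manual = np.exp(-gamma * np.sum((x - z) ** 2))
    print(rbf_kernel(x, z, gamma=gamma)[0, 0], manual)   # both ~0.641
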
@@ -1,9 +1,9 @@
 from __future__ import print_function
-import ijson
+# import ijson
+# from ijson.common import ObjectBuilder
 import os, sys
 from os.path import join
 from bz2 import BZ2File
-from ijson.common import ObjectBuilder
 import pickle
 from util.file import list_dirs, list_files, makedirs_if_not_exist
 from itertools import islice

@@ -5,7 +5,6 @@ from sklearn.preprocessing import MultiLabelBinarizer
 from data.reader.jrcacquis_reader import *
 from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
 from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
-from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
 from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
 import pickle
 import numpy as np

@@ -357,6 +356,9 @@ def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years,
 # Dataset Generators
 # ----------------------------------------------------------------------------------------------------------------------
 def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
+    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
+
+
     """
     Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
     "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.

@@ -439,6 +441,7 @@ def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test

 def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
                          train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
+    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
     """
     Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
     "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.

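Moving the wikipedia_tools import from module level into the two dataset generators defers it until those functions actually run, so importing the dataset-builder module no longer drags in the Wikipedia reader at import time. A minimal sketch of the pattern, with a hypothetical optional dependency standing in for the real module:

    def build_wiki_matrices(max_wiki=5000):
        # deferred import: only resolved when Wikipedia data is actually requested,
        # so merely importing the enclosing module never touches this dependency
        from heavy_wiki_reader import fetch_articles   # hypothetical module
        return fetch_articles(limit=max_wiki)
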
@@ -8,6 +8,9 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer

+from data.supervised import zscores
+from transformers.StandardizeTransformer import StandardizeTransformer
+

 def _sort_if_sparse(X):
     if issparse(X) and not X.has_sorted_indices:

@@ -210,13 +213,13 @@ class NaivePolylingualClassifier:
         for lang in langs:
             _sort_if_sparse(lX[lang])

-        # models = Parallel(n_jobs=self.n_jobs)\
-        #     (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
-
-        models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
-
-        for model, lang in zip(models, langs):
-            model.fit(lX[lang], ly[lang])
+        models = Parallel(n_jobs=self.n_jobs)\
+            (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
+        #
+        # models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
+        #
+        # for model, lang in zip(models, langs):
+        #     model.fit(lX[lang], ly[lang])

         self.model = {lang: models[i] for i, lang in enumerate(langs)}
         self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}

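This re-enables the joblib path: each per-language fit is dispatched with delayed() and Parallel collects the fitted models in the same order as the langs iterable, so the index-based dict rebuild below stays valid. A self-contained toy sketch of that pattern, where fit_one is a hypothetical stand-in for MonolingualClassifier(...).fit:

    from joblib import Parallel, delayed

    def fit_one(lang, X, y):
        # stand-in for MonolingualClassifier(...).fit(X, y); returns a fitted "model"
        return f'{lang}-model(n={len(X)})'

    lX = {'en': [[1], [2], [3]], 'it': [[4], [5]]}
    ly = {'en': [0, 1, 1], 'it': [1, 0]}
    langs = list(lX.keys())

    # one delayed call per language; results preserve the order of the input iterable
    models = Parallel(n_jobs=2)(delayed(fit_one)(l, lX[l], ly[l]) for l in langs)
    model = {lang: models[i] for i, lang in enumerate(langs)}
    print(model)   # {'en': 'en-model(n=3)', 'it': 'it-model(n=2)'}
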
@@ -537,7 +540,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.vectorize(lX)

         for lang in self.languages:
-            print(lX[lang].shape)
+            print(f'{lang}->{lX[lang].shape}')

         Z, zy = self._get_zspace(lX, ly)

@@ -552,12 +555,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             for lang in list(lX.keys()):
                 Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
             Z = Z_embedded
-            del Z_embedded
+

         # stacking Z space vertically
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

+        self.standardizer = StandardizeTransformer()
+        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+
         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
                                            n_jobs=self.n_jobs)

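This appears to be the standardization the commit title refers to: the per-language Z spaces are stacked row-wise, the standardizer learns column means and standard deviations on that training stack, and the predict path below reuses the same statistics on each language's test projection. A toy sketch of that train/test asymmetry (shapes and values are illustrative):

    import numpy as np

    Z = {'en': np.array([[0.9, 0.1], [0.8, 0.3]]),
         'it': np.array([[0.2, 0.7]])}

    vertical_Z = np.vstack([Z[lang] for lang in Z])        # (3, 2) training stack
    mean = vertical_Z.mean(axis=0)
    std = np.clip(vertical_Z.std(axis=0, ddof=1), 1e-5, None)

    train_std = (vertical_Z - mean) / std                  # what fit_predict returns
    Z_test = np.array([[0.6, 0.5]])
    test_std = (Z_test - mean) / std                       # predict reuses train mean/std
    print(train_std.round(2), test_std.round(2))
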
@@ -578,10 +584,10 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             Z_embedded = dict()
             for lang in lX.keys():
                 Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
-                print(Z_embedded[lang].shape)
+            lZ = Z_embedded

-            return _joblib_transform_multiling(self.model.predict, Z_embedded, n_jobs=self.n_jobs)
-
         for lang in lZ.keys():
             print(lZ[lang].shape)
+            lZ[lang] = self.standardizer.predict(lZ[lang])

         return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)

@@ -0,0 +1,23 @@
+import numpy as np
+
+class StandardizeTransformer:
+
+    def __init__(self, axis=0):
+        self.axis = axis
+        self.yetfit = False
+
+    def fit(self, X):
+        print('fitting Standardizer')
+        std = np.std(X, axis=self.axis, ddof=1)
+        self.std = np.clip(std, 1e-5, None)   # guard against zero-variance columns
+        self.mean = np.mean(X, axis=self.axis)
+        self.yetfit = True
+        print('done')
+        return self
+
+    def predict(self, X):
+        assert self.yetfit, 'transform called before fit'
+        return (X - self.mean) / self.std
+
+    def fit_predict(self, X):
+        return self.fit(X).predict(X)
