standardization
This commit is contained in:
parent 601da33836
commit 499c6018c0
@@ -20,7 +20,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed",
                   help="Set the embedding to be used [none, pretrained, supervised, both]", type=str, default='none')
 
 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/embeddings/')
+                  help="Path to the polylingual word embeddings", default='../embeddings/')
 
 parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)
@@ -33,7 +33,7 @@ parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
 
 
 def get_learner(calibrate=False, kernel='linear'):
-    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1)
+    return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced')
 
 
 def get_params(dense=False): # TODO kernel function could be useful for meta-classifier
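Side note on the change above: with class_weight='balanced', scikit-learn reweights each class by n_samples / (n_classes * count(class)), which counteracts label imbalance by scaling C per class. A minimal standalone sketch with toy data (not part of the commit):

```python
# Illustration only: how the 'balanced' heuristic weights an imbalanced binary problem.
import numpy as np
from sklearn.svm import SVC

y = np.array([0, 0, 0, 0, 1, 1])                         # toy, imbalanced labels
weights = len(y) / (len(np.unique(y)) * np.bincount(y))  # -> [0.75, 1.5]
print(dict(zip(np.unique(y), weights)))

# SVC computes the same weights internally and scales C for each class accordingly.
clf = SVC(kernel='linear', C=1, class_weight='balanced')
```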
@@ -60,6 +60,7 @@ if __name__ == '__main__':
     data = MultilingualDataset.load(op.dataset)
     data.show_dimensions()
+
     # data.set_view(languages=['en','it'], categories=list(range(10)))
     lXtr, lytr = data.training()
     lXte, lyte = data.test()
@@ -96,8 +97,8 @@ if __name__ == '__main__':
     classifier = AndreaCLF(op.we_path,
                            config,
                            first_tier_learner=get_learner(calibrate=True),
-                           meta_learner=get_learner(calibrate=False),
-                           first_tier_parameters=get_params(dense=True),
+                           meta_learner=get_learner(calibrate=False, kernel='rbf'),
+                           first_tier_parameters=get_params(dense=False),
                            meta_parameters=get_params(dense=True),
                            n_jobs=op.n_jobs)
 
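For context, a rough single-label sketch of the two-tier setup these arguments configure (illustrative only, with toy data and assumed names; the real classes live in this repo): calibrated first-tier SVMs map each language's documents to a shared space of posterior probabilities, and the meta-learner, here switched to an RBF kernel, is trained on that stacked space.

```python
# Illustrative sketch of the first-tier -> meta-learner pipeline (not repo code).
import numpy as np
from sklearn.svm import SVC

rng = np.random.RandomState(0)
langs = ['en', 'it']
lX = {l: rng.rand(30, 20) for l in langs}        # toy per-language feature matrices
ly = {l: rng.randint(0, 2, 30) for l in langs}   # toy per-language labels

first_tier = {l: SVC(kernel='linear', probability=True).fit(lX[l], ly[l]) for l in langs}
Z = {l: first_tier[l].predict_proba(lX[l]) for l in langs}   # language-independent posterior space

meta = SVC(kernel='rbf')
meta.fit(np.vstack([Z[l] for l in langs]), np.concatenate([ly[l] for l in langs]))
```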
@@ -1,9 +1,9 @@
 from __future__ import print_function
-import ijson
+# import ijson
+# from ijson.common import ObjectBuilder
 import os, sys
 from os.path import join
 from bz2 import BZ2File
-from ijson.common import ObjectBuilder
 import pickle
 from util.file import list_dirs, list_files, makedirs_if_not_exist
 from itertools import islice
@@ -5,7 +5,6 @@ from sklearn.preprocessing import MultiLabelBinarizer
 from data.reader.jrcacquis_reader import *
 from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
 from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy
-from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
 from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
 import pickle
 import numpy as np
@@ -357,6 +356,9 @@ def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years,
 # Dataset Generators
 # ----------------------------------------------------------------------------------------------------------------------
 def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
+    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
+
+
     """
     Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
     "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
@ -439,6 +441,7 @@ def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test
|
|||
|
||||
def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
|
||||
train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
|
||||
from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
|
||||
"""
|
||||
Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
|
||||
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
|
||||
|
|
|
@@ -8,6 +8,9 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 
+from data.supervised import zscores
+from transformers.StandardizeTransformer import StandardizeTransformer
+
 
 def _sort_if_sparse(X):
     if issparse(X) and not X.has_sorted_indices:
@@ -210,13 +213,13 @@ class NaivePolylingualClassifier:
         for lang in langs:
             _sort_if_sparse(lX[lang])
 
-        # models = Parallel(n_jobs=self.n_jobs)\
-        #     (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
-
-        models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
-
-        for model, lang in zip(models, langs):
-            model.fit(lX[lang], ly[lang])
+        models = Parallel(n_jobs=self.n_jobs)\
+            (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]),ly[lang]) for lang in langs)
+        #
+        # models = [MonolingualClassifier(self.base_learner, parameters=self.parameters) for lang in langs]
+        #
+        # for model, lang in zip(models, langs):
+        #     model.fit(lX[lang], ly[lang])
 
         self.model = {lang: models[i] for i, lang in enumerate(langs)}
         self.empty_categories = {lang:self.model[lang].empty_categories for lang in langs}
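The hunk above switches the per-language fits from a sequential loop back to joblib. A self-contained sketch of the same pattern with toy data and names of my own (not repo code): each delayed call carries its own estimator instance, the fits run in separate workers, and joblib returns the fitted models in the order of the input generator, so the language-to-model mapping stays correct.

```python
# Illustration of the Parallel/delayed fitting pattern used above.
import numpy as np
from joblib import Parallel, delayed
from sklearn.svm import LinearSVC

langs = ['en', 'it', 'fr']
rng = np.random.RandomState(0)
lX = {l: rng.rand(40, 10) for l in langs}       # toy per-language matrices
ly = {l: rng.randint(0, 2, 40) for l in langs}  # toy binary labels

models = Parallel(n_jobs=2)(
    delayed(LinearSVC().fit)(lX[l], ly[l]) for l in langs   # one fresh estimator per language
)
model = {l: models[i] for i, l in enumerate(langs)}          # joblib preserves input order
```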
@@ -537,7 +540,7 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         self.vectorize(lX)
 
         for lang in self.languages:
-            print(lX[lang].shape)
+            print(f'{lang}->{lX[lang].shape}')
 
         Z, zy = self._get_zspace(lX, ly)
 
@@ -552,12 +555,15 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             for lang in list(lX.keys()):
                 Z_embedded[lang] = np.hstack((Z[lang], l_weighted_em[lang]))
             Z = Z_embedded
             del Z_embedded
 
         # stacking Z space vertically
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])
 
+        self.standardizer = StandardizeTransformer()
+        _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
+
         print('fitting the Z-space of shape={}'.format(_vertical_Z.shape))
         self.model = MonolingualClassifier(base_learner=self.meta_learner, parameters=self.meta_parameters,
                                            n_jobs=self.n_jobs)
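What the two added lines amount to, sketched with toy data under the same assumptions as the new StandardizeTransformer added below: the stacked first-tier outputs are z-scored column-wise, and the fitted mean/std are kept on self.standardizer so the same scaling can be reapplied at prediction time.

```python
# Illustration of the column-wise standardization applied to the stacked Z space (toy data).
import numpy as np

vertical_Z = np.random.rand(100, 10)                       # stacked per-language Z blocks
mean = vertical_Z.mean(axis=0)
std = np.clip(vertical_Z.std(axis=0, ddof=1), 1e-5, None)  # clipped, as in StandardizeTransformer
vertical_Z_std = (vertical_Z - mean) / std                 # ~zero mean, unit variance per column
```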
@@ -578,10 +584,10 @@ class AndreaCLF(FunnellingPolylingualClassifier):
             Z_embedded = dict()
             for lang in lX.keys():
                 Z_embedded[lang] = np.hstack((lZ[lang], l_weighted_em[lang]))
-                print(Z_embedded[lang].shape)
-            return _joblib_transform_multiling(self.model.predict, Z_embedded, n_jobs=self.n_jobs)
+            lZ = Z_embedded
 
         for lang in lZ.keys():
             print(lZ[lang].shape)
+            lZ[lang] = self.standardizer.predict(lZ[lang])
 
         return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
@@ -0,0 +1,23 @@
+import numpy as np
+
+class StandardizeTransformer:
+
+    def __init__(self, axis=0):
+        self.axis = axis
+        self.yetfit=False
+
+    def fit(self, X):
+        print('fitting Standardizer')
+        std=np.std(X, axis=self.axis, ddof=1)
+        self.std = np.clip(std, 1e-5, None)
+        self.mean = np.mean(X, axis=self.axis)
+        self.yetfit=True
+        print('done')
+        return self
+
+    def predict(self, X):
+        assert self.yetfit, 'transform called before fit'
+        return (X - self.mean) / self.std
+
+    def fit_predict(self, X):
+        return self.fit(X).predict(X)
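A short usage sketch of the new transformer with toy data (not part of the commit; the import path matches the one added above). fit/predict mimic fit/transform of sklearn's StandardScaler, with ddof=1 and the standard deviation clipped away from zero, and the training statistics are reused on test-language data.

```python
# Hypothetical usage of StandardizeTransformer (toy data).
import numpy as np
from transformers.StandardizeTransformer import StandardizeTransformer

Z_train, Z_test = np.random.rand(50, 8), np.random.rand(20, 8)

standardizer = StandardizeTransformer(axis=0)
Z_train_std = standardizer.fit_predict(Z_train)   # estimate column-wise mean/std, then scale
Z_test_std = standardizer.predict(Z_test)         # reuse the training statistics

print(Z_train_std.mean(axis=0).round(2))          # approximately 0 per column
print(Z_train_std.std(axis=0, ddof=1).round(2))   # approximately 1 per column
```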