refactored pca methods
commit 9fa1899a7f
parent 509289b268
@@ -1,4 +1,4 @@
-import os, sys
+import os
 from dataset_builder import MultilingualDataset
 from learning.learners import *
 from util.evaluation import *
@@ -21,7 +21,7 @@ parser.add_option("-e", "--mode-embed", dest="mode_embed",
                   help="Set the embedding to be used [none, unsupervised, supervised, both]", type=str, default='none')

 parser.add_option("-w", "--we-path", dest="we_path",
-                  help="Path to the polylingual word embeddings", default='../embeddings/')
+                  help="Path to the polylingual word embeddings", default='/home/andreapdr/CLESA/')

 parser.add_option('-t', "--we-type", dest="we_type", help="Aligned embeddings to use [FastText, MUSE]", type=str,
                   default='MUSE')
@@ -30,11 +30,21 @@ parser.add_option("-s", "--set_c", dest="set_c",type=float,
                   help="Set the C parameter", default=1)

 parser.add_option("-c", "--optimc", dest="optimc", action='store_true',
-                  help="Optimices hyperparameters", default=False)
+                  help="Optimize hyperparameters", default=False)

 parser.add_option("-j", "--n_jobs", dest="n_jobs",type=int,
                   help="Number of parallel jobs (default is -1, all)", default=-1)

+parser.add_option("-p", "--pca", dest="max_labels", type=int,
+                  help="If less than number of target classes, will apply PCA to supervised matrix. If set to 0 it"
+                       " will automatically search for the best number of components", default=300)
+
+parser.add_option("-u", "--upca", dest="max_labels_U", type=int,
+                  help="If smaller than Unsupervised Dimension, will apply PCA to unsupervised matrix. If set to 0 it"
+                       " will automatically search for the best number of components", default=300)
+
+parser.add_option("-l", dest="lang", type=str)
+

 def get_learner(calibrate=False, kernel='linear'):
     return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=op.set_c, random_state=1, class_weight='balanced', gamma='auto')
@@ -51,7 +61,6 @@ def get_params(dense=False):

-
 if __name__ == '__main__':

     (op, args) = parser.parse_args()

     assert exists(op.dataset), 'Unable to find file '+str(op.dataset)
@@ -64,8 +73,9 @@ if __name__ == '__main__':
     data = MultilingualDataset.load(op.dataset)
     data.show_dimensions()

-    # data.set_view(languages=['en','it'], categories=list(range(10)))
-    # data.set_view(languages=['en','it'])
+    data.set_view(languages=['en','it', 'pt', 'sv'], categories=list(range(10)))
+    # data.set_view(languages=[op.lang])
+    # data.set_view(categories=list(range(10)))
     lXtr, lytr = data.training()
     lXte, lyte = data.test()

@@ -104,7 +114,9 @@ if __name__ == '__main__':

     ##### TODO - config dict is redundant - we have already op argparse ...
     config['reduction'] = 'PCA'
-    config['max_label_space'] = 300
+    config['max_label_space'] = op.max_labels
+    config['dim_reduction_unsupervised'] = op.max_labels_U
+    # config['plot_covariance_matrices'] = True

     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')

@@ -129,5 +141,5 @@ if __name__ == '__main__':
         metrics.append([macrof1, microf1, macrok, microk])
         print('Lang %s: macro-F1=%.3f micro-F1=%.3f' % (lang, macrof1, microf1))
         results.add_row(result_id, 'PolyEmbed_andrea', 'svm', _config_id, config['we_type'], op.optimc, op.dataset.split('/')[-1],
-                        'not_binary', 'not_ablation', classifier.time, lang, macrof1, microf1, macrok, microk, '')
+                        classifier.time, lang, macrof1, microf1, macrok, microk, '')
     print('Averages: MF1, mF1, MK, mK', np.mean(np.array(metrics), axis=0))
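
Note: the two new options drive the refactored PCA reduction, -p/--pca for the supervised (WCE) matrix and -u/--upca for the unsupervised (MUSE/FastText) matrix, with 0 meaning "search for the best number of components automatically". A minimal invocation sketch; the script name (main.py) and the -d dataset flag are assumptions, not shown in this diff:

    python main.py -d <dataset.pickle> -e both -p 50 -u 300 -c    # cap the WCE space at 50 and the MUSE space at 300 components, optimize C
    python main.py -d <dataset.pickle> -e both -p 0 -u 0          # 0 = search for the optimal number of components
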
@@ -5,7 +5,9 @@ from torchtext.vocab import Vectors
 import torch
 from abc import ABC, abstractmethod
 from data.supervised import get_supervised_embeddings
-
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from util.decompositions import *

 class PretrainedEmbeddings(ABC):

@@ -110,10 +112,10 @@ class WordEmbeddings:
         # vocabulary is a set of terms to be kept
         active_vocabulary = sorted([w for w in vocabulary if w in self.worddim])
         lost = len(vocabulary)-len(active_vocabulary)
-        if lost>0: #some termr are missing, so it will be replaced by UNK
+        if lost > 0: #some terms are missing, so it will be replaced by UNK
             print('warning: missing {} terms for lang {}'.format(lost, self.lang))
         self.we = self.get_vectors(active_vocabulary)
-        assert self.we.shape[0]==len(active_vocabulary)
+        assert self.we.shape[0] == len(active_vocabulary)
         self.dimword={i:w for i,w in enumerate(active_vocabulary)}
         self.worddim={w:i for i,w in enumerate(active_vocabulary)}
         return self
@@ -153,7 +155,6 @@ class FastTextWikiNews(Vectors):
         url = self.url_base.format(language)
         # name = self.path.format(language)
         name = cache + self._name.format(language)
-        # print(f'\n\nFASTEXTWIKI-NEW CLASS:\nurl = {url}\nname = {name}\ncache {cache}\nlanguage = {language}')
         super(FastTextWikiNews, self).__init__(name, cache=cache, url=url, **kwargs)

@@ -171,15 +172,17 @@ class EmbeddingsAligned(Vectors):
     def vocabulary(self):
         return set(self.stoi.keys())

-    def dim(self):
-        return self.dim
-
     def extract(self, words):
         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.stoi)
         extraction = torch.zeros((len(words), self.dim))
         extraction[source_idx] = self.vectors[target_idx]
         return extraction

+    def reduce(self, dim):
+        pca = PCA(n_components=dim)
+        self.vectors = pca.fit_transform(self.vectors)
+        return
+

 class FastTextMUSE(PretrainedEmbeddings):

@@ -209,26 +212,44 @@ class StorageEmbeddings:
         self.lang_U = dict()
         self.lang_S = dict()

-    def _add_embeddings_unsupervised(self, type, docs, vocs):
+    def _add_embeddings_unsupervised(self, type, docs, vocs, max_label_space=300):
         for lang in docs.keys():
+            nC = self.lang_U[lang].shape[1]
             print(f'# [unsupervised-matrix {type}] for {lang}')
             voc = np.asarray(list(zip(*sorted(vocs[lang].items(), key=lambda x: x[1])))[0])
             self.lang_U[lang] = EmbeddingsAligned(type, self.path, lang, voc).vectors
+            # if self.lang_U[lang].shape[1] > dim != 0:
+            #     print(f'unsupervised matrix has more dimensions ({self.lang_U[lang].shape[1]}) than'
+            #           f' the allowed limit {dim}. Applying PCA(n_components={dim})')
+            #     pca = PCA(n_components=dim)
+            #     self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
             print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
+        if max_label_space == 0:
+            print(f'Computing optimal number of PCA components along matrices U')
+            optimal_n = get_optimal_dim(self.lang_U, 'U')
+            self.lang_U = run_pca(optimal_n, self.lang_U)
+        elif max_label_space < nC:
+            self.lang_U = run_pca(max_label_space, self.lang_U)
+
         return

     def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
-        _optimal = dict()
-        # TODO testing optimal max_label_space
-        if max_label_space == 'optimal':
-            print('Computing optimal number of PCA components ...')
-            optimal_n = self.get_optimal_supervised_components(docs, labels)
-            max_label_space = optimal_n
-
-        for lang in docs.keys():
+        # if max_label_space == 0:
+        # print('Computing optimal number of PCA components along matrices S...')
+        # optimal_n = self.get_optimal_supervised_components(docs, labels)
+        # max_label_space = optimal_n
+        for lang in docs.keys(): # compute supervised matrices S - then apply PCA
+            nC = self.lang_S[lang].shape[1]
             print(f'# [supervised-matrix] for {lang}')
             self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
             print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')

+        if max_label_space == 0:
+            optimal_n = get_optimal_dim(self.lang_S, 'S')
+            self.lang_S = run_pca(optimal_n, self.lang_S)
+        elif max_label_space < nC:
+            self.lang_S = run_pca(max_label_space, self.lang_S)
+
         return

     def _concatenate_embeddings(self, docs):
@@ -239,7 +260,7 @@ class StorageEmbeddings:

     def fit(self, config, docs, vocs, labels):
         if config['unsupervised']:
-            self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
+            self._add_embeddings_unsupervised(config['we_type'], docs, vocs, config['dim_reduction_unsupervised'])
         if config['supervised']:
             self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
         return self
@@ -257,28 +278,58 @@ class StorageEmbeddings:
             _r[lang] = docs[lang].dot(self.lang_U[lang])
         return _r

-    def get_optimal_supervised_components(self, docs, labels):
-        import matplotlib.pyplot as plt
-        _idx = []
-
-        plt.figure(figsize=(15, 10))
-        plt.title(f'WCE Explained Variance')
-        plt.xlabel('Number of Components')
-        plt.ylabel('Variance (%)')
-
-        for lang in docs.keys():
-            _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
-            _r = np.cumsum(_r)
-            plt.plot(_r, label=lang)
-            for i in range(len(_r)-1, 1, -1):
-                # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ...
-                delta = _r[i] - _r[i-1]
-                if delta > 0:
-                    _idx.append(i)
-                    break
-        best_n = int(sum(_idx)/len(_idx))
-        plt.vlines(best_n, 0, 1, colors='r', label='optimal N')
-        plt.legend()
-        plt.show()
-        return best_n
+    # @staticmethod
+    # def get_optimal_supervised_components(docs, labels):
+    #     optimal_n = get_optimal_dim(docs, 'S')
+    #     return optimal_n
+    #     _idx = []
+    #
+    #     plt.figure(figsize=(15, 10))
+    #     plt.title(f'WCE Explained Variance')
+    #     plt.xlabel('Number of Components')
+    #     plt.ylabel('Variance (%)')
+    #
+    #     for lang in docs.keys():
+    #         _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space=0).tolist()
+    #         _r = np.cumsum(_r)
+    #         plt.plot(_r, label=lang)
+    #         for i in range(len(_r)-1, 1, -1):
+    #             delta = _r[i] - _r[i-1]
+    #             if delta > 0:
+    #                 _idx.append(i)
+    #                 break
+    #     best_n = max(_idx)
+    #     plt.axvline(best_n, color='r', label='optimal N')
+    #     plt.legend()
+    #     plt.show()
+    #     return best_n
+    #
+    # def get_optimal_unsupervised_components(self, type):
+    #     _idx = []
+    #
+    #     plt.figure(figsize=(15, 10))
+    #     plt.title(f'Unsupervised Embeddings {type} Explained Variance')
+    #     plt.xlabel('Number of Components')
+    #     plt.ylabel('Variance (%)')
+    #
+    #     for lang in self.lang_U.keys():
+    #         pca = PCA(n_components=self.lang_U[lang].shape[1])
+    #         pca.fit(self.lang_U[lang])
+    #         _r = pca.explained_variance_ratio_
+    #         _r = np.cumsum(_r)
+    #         plt.plot(_r, label=lang)
+    #         for i in range(len(_r) - 1, 1, -1):
+    #             delta = _r[i] - _r[i - 1]
+    #             if delta > 0:
+    #                 _idx.append(i)
+    #                 break
+    #     best_n = max(_idx)
+    #     plt.axvline(best_n, color='r', label='optimal N')
+    #     plt.legend()
+    #     plt.show()
+    #
+    #     for lang in self.lang_U.keys():
+    #         pca = PCA(n_components=best_n)
+    #         self.lang_U[lang] = pca.fit_transform(self.lang_U[lang])
+    #     return
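
For context, a rough sketch of how the refactored reduction is driven end to end; the StorageEmbeddings constructor argument and the docs/vocs/labels dictionaries are assumptions, only the config keys and the fit()/predict() signatures come from this diff:

    # hypothetical wiring; constructor args and input dicts are placeholders
    config = {'unsupervised': True, 'supervised': True, 'we_type': 'MUSE',
              'reduction': 'PCA',
              'max_label_space': op.max_labels,                 # -p/--pca
              'dim_reduction_unsupervised': op.max_labels_U}    # -u/--upca
    embedding_space = StorageEmbeddings(op.we_path).fit(config, docs, vocs, labels)
    projected = embedding_space.predict(config, docs)
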
@@ -1,5 +1,5 @@
 from data.tsr_function__ import get_supervised_matrix, get_tsr_matrix, information_gain, chi_square
-from sklearn.decomposition import PCA, TruncatedSVD
+from sklearn.decomposition import PCA
 from sklearn.manifold import TSNE
 import numpy as np
@@ -41,15 +41,9 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=


 def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
-    if max_label_space == 'optimal':
-        max_label_space = 0
-
     if max_label_space != 0:
         print('computing supervised embeddings...')

     nC = Y.shape[1]
-    if nC==2 and binary_structural_problems > nC:
-        raise ValueError('not implemented in this branch')
-
     if method=='ppmi':
         F = supervised_embeddings_ppmi(X, Y)
@@ -64,8 +58,7 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la
         F = zscores(F, axis=0)

     # Dumping F-matrix for further studies
-    # TODO im not sure if voc.keys and F matrix indices are "aligned" correctly
-    dump_it = True
+    dump_it = False
     if dump_it:
         with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
             np.savetxt(outfile, F, delimiter='\t')
@@ -73,34 +66,32 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, la
             for token in voc.keys():
                 outfile.write(token+'\n')

-    if nC > max_label_space:
-        # TODO testing optimal max_label_space
-        if reduction == 'PCA':
-            if max_label_space == 0:
-                pca = PCA(n_components=Y.shape[1])
-                pca = pca.fit(F)
-                return pca.explained_variance_ratio_
-
-            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-                  f'Applying PCA(n_components={max_label_space})')
-            pca = PCA(n_components=max_label_space)
-            pca = pca.fit(F)
-            F = pca.fit_transform(F)
-        elif reduction == 'TSNE':
-            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-                  f'Applying t-SNE(n_components={max_label_space})')
-            tsne = TSNE(n_components=max_label_space)
-            F = tsne.fit_transform(F)
-        elif reduction == 'tSVD':
-            print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
-                  f'Applying truncatedSVD(n_components={max_label_space})')
-            tSVD = TruncatedSVD(n_components=max_label_space)
-            F = tSVD.fit_transform(F)
-
     return F

+    # if nC >= max_label_space:
+    #     if reduction == 'PCA':
+    #         if max_label_space == 0:
+    #             pca = PCA(n_components=Y.shape[1])
+    #             pca = pca.fit(F)
+    #             return pca.explained_variance_ratio_
+    #
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying PCA(n_components={max_label_space})')
+    #         pca = PCA(n_components=max_label_space)
+    #         F = pca.fit_transform(F)
+    #     elif reduction == 'TSNE':
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying t-SNE(n_components={max_label_space})')
+    #         tsne = TSNE(n_components=max_label_space)
+    #         F = tsne.fit_transform(F)
+    #     elif reduction == 'tSVD':
+    #         print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
+    #               f'Applying truncatedSVD(n_components={max_label_space})')
+    #         tSVD = TruncatedSVD(n_components=max_label_space)
+    #         F = tSVD.fit_transform(F)
+    #
+    #     return F
@@ -8,6 +8,7 @@ from sklearn.model_selection import KFold
 from joblib import Parallel, delayed
 from sklearn.feature_extraction.text import TfidfVectorizer
 from transformers.StandardizeTransformer import StandardizeTransformer
+from sklearn.decomposition import PCA


 def _sort_if_sparse(X):
@@ -453,13 +454,12 @@ class AndreaCLF(FunnellingPolylingualClassifier):
                          calmode,
                          n_jobs)

+        self.pca_independent_space = PCA(n_components=100)
         self.we_path = we_path
         self.config = config
         self.lang_word2idx = dict()
         self.languages = []
         self.lang_tfidf = {}
-        # self.word_embeddings = {}
-        # self.supervised_embeddings = {}
         self.embedding_space = None
         self.model = None
         self.time = None
@@ -515,6 +515,10 @@ class AndreaCLF(FunnellingPolylingualClassifier):
         _vertical_Z = np.vstack([Z[lang] for lang in self.languages])
         _vertical_Zy = np.vstack([zy[lang] for lang in self.languages])

+        # todo testing ...
+        # self.pca_independent_space.fit(_vertical_Z)
+        # _vertical_Z = self.pca_independent_space.transform(_vertical_Z)
+
         self.standardizer = StandardizeTransformer()
         _vertical_Z = self.standardizer.fit_predict(_vertical_Z)
@@ -532,17 +536,14 @@ class AndreaCLF(FunnellingPolylingualClassifier):

         if self.config['supervised'] or self.config['unsupervised']:
             _embedding_space = self.embedding_space.predict(self.config, lX)
-            # l_weighted_em = self.embed(lX, ly,
-            #                            unsupervised=self.config['unsupervised'],
-            #                            supervised=self.config['supervised'],
-            #                            prediction=True)
-            # Z_embedded = dict()
             for lang in lX.keys():
                 lZ[lang] = np.hstack((lZ[lang], _embedding_space[lang]))
-            # lZ = Z_embedded

         for lang in lZ.keys():
             print(lZ[lang].shape)
+            # todo testing
+            # lZ[lang] = self.pca_independent_space.transform(lZ[lang])
             lZ[lang] = self.standardizer.predict(lZ[lang])

         return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
@@ -0,0 +1,49 @@
+from sklearn.decomposition import PCA
+import numpy as np
+import matplotlib.pyplot as plt
+
+def run_pca(dim, X):
+    """
+    :param dim: number of pca components to keep
+    :param X: dictionary str(lang): matrix
+    :return: dict lang: reduced matrix
+    """
+    r = dict()
+    pca = PCA(n_components=dim)
+    for lang in X.keys():
+        r[lang] = pca.fit_transform(X[lang])
+    return r
+
+
+def get_optimal_dim(X, embed_type):
+    """
+    :param X: dict str(lang) : csr_matrix of embeddings unsupervised or supervised
+    :param embed_type: (str) embedding matrix type: S or U (WCE supervised or U unsupervised MUSE/FASTTEXT)
+    :return:
+    """
+    _idx = []
+
+    plt.figure(figsize=(15, 10))
+    if embed_type == 'U':
+        plt.title(f'Unsupervised Embeddings {"TODO"} Explained Variance')
+    else:
+        plt.title(f'WCE Explained Variance')
+    plt.xlabel('Number of Components')
+    plt.ylabel('Variance (%)')
+
+    for lang in X.keys():
+        pca = PCA(n_components=X[lang].shape[1])
+        pca.fit(X[lang])
+        _r = pca.explained_variance_ratio_
+        _r = np.cumsum(_r)
+        plt.plot(_r, label=lang)
+        for i in range(len(_r) - 1, 1, -1):
+            delta = _r[i] - _r[i - 1]
+            if delta > 0:
+                _idx.append(i)
+                break
+    best_n = max(_idx)
+    plt.axvline(best_n, color='r', label='optimal N')
+    plt.legend()
+    plt.show()
+    return best_n
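
The new helpers operate on per-language dictionaries of matrices: get_optimal_dim plots the cumulative explained variance for each language and returns the largest component index that still adds variance, and run_pca then reduces every matrix to the chosen dimensionality. A small usage sketch with toy dense matrices; the shapes are arbitrary and only the module path from the import above is assumed:

    import numpy as np
    from util.decompositions import run_pca, get_optimal_dim

    X = {'en': np.random.rand(500, 300), 'it': np.random.rand(500, 300)}
    best_n = get_optimal_dim(X, 'U')   # shows the variance plot and returns the suggested number of components
    X_red = run_pca(best_n, X)         # dict of matrices, each reduced to best_n components
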
@@ -5,7 +5,7 @@ import numpy as np
 class PolylingualClassificationResults:
     def __init__(self, file, autoflush=True, verbose=False):
         self.file = file
-        self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'binary', 'languages', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
+        self.columns = ['id', 'method', 'learner', 'embed', 'embed_type', 'optimp', 'dataset', 'time', 'lang', 'macrof1', 'microf1', 'macrok', 'microk', 'notes']
         self.autoflush = autoflush
         self.verbose = verbose
         if os.path.exists(file):
@@ -20,8 +20,8 @@ class PolylingualClassificationResults:
     def already_calculated(self, id):
         return (self.df['id'] == id).any()

-    def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
-        s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, binary, ablation_lang, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
+    def add_row(self, id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
+        s = pd.Series([id, method, learner, embed, embed_type, optimp, dataset, time, lang, macrof1, microf1, macrok, microk, notes], index=self.columns)
         self.df = self.df.append(s, ignore_index=True)
         if self.autoflush: self.flush()
         self.tell(s.to_string())