Plot variance explained by PCA for every language
This commit is contained in:
parent
ba1a72ff94
commit
509289b268
|
|
@ -104,7 +104,7 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
##### TODO - config dict is redundant - we already have the argparse options in op ...
|
##### TODO - config dict is redundant - we already have the argparse options in op ...
|
||||||
config['reduction'] = 'PCA'
|
config['reduction'] = 'PCA'
|
||||||
config['max_label_space'] = 'optimal'
|
config['max_label_space'] = 300
|
||||||
|
|
||||||
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
|
result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -217,7 +217,7 @@ class StorageEmbeddings:
|
||||||
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
|
print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space):
|
def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
|
||||||
_optimal = dict()
|
_optimal = dict()
|
||||||
# TODO testing optimal max_label_space
|
# TODO testing optimal max_label_space
|
||||||
if max_label_space == 'optimal':
|
if max_label_space == 'optimal':
|
||||||
|
|
@ -227,7 +227,7 @@ class StorageEmbeddings:
|
||||||
|
|
||||||
for lang in docs.keys():
|
for lang in docs.keys():
|
||||||
print(f'# [supervised-matrix] for {lang}')
|
print(f'# [supervised-matrix] for {lang}')
|
||||||
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang)
|
self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
|
||||||
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
|
print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
@ -241,7 +241,7 @@ class StorageEmbeddings:
|
||||||
if config['unsupervised']:
|
if config['unsupervised']:
|
||||||
self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
|
self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
|
||||||
if config['supervised']:
|
if config['supervised']:
|
||||||
self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'])
|
self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def predict(self, config, docs):
|
def predict(self, config, docs):
|
||||||
|
|
@ -269,10 +269,11 @@ class StorageEmbeddings:
|
||||||
|
|
||||||
for lang in docs.keys():
|
for lang in docs.keys():
|
||||||
_r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
|
_r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
|
||||||
plt.plot(np.cumsum(_r), label=lang)
|
_r = np.cumsum(_r)
|
||||||
|
plt.plot(_r, label=lang)
|
||||||
for i in range(len(_r)-1, 1, -1):
|
for i in range(len(_r)-1, 1, -1):
|
||||||
# todo: if n_components (therefore #n labels) is not big enough every value will be smaller than the next one ...
|
# todo: if n_components (therefore #n labels) is not big enough every value will be smaller than the next one ...
|
||||||
delta = _r[i-1] - _r[i]
|
delta = _r[i] - _r[i-1]
|
||||||
if delta > 0:
|
if delta > 0:
|
||||||
_idx.append(i)
|
_idx.append(i)
|
||||||
break
|
break
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
|
||||||
return F
|
return F
|
||||||
|
|
||||||
|
|
||||||
def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
|
def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
|
||||||
if max_label_space == 'optimal':
|
if max_label_space == 'optimal':
|
||||||
max_label_space = 0
|
max_label_space = 0
|
||||||
|
|
||||||
|
|
@ -63,6 +63,18 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None',
|
||||||
if dozscore:
|
if dozscore:
|
||||||
F = zscores(F, axis=0)
|
F = zscores(F, axis=0)
|
||||||
|
|
||||||
|
# Dumping F-matrix for further studies
|
||||||
|
# TODO: I'm not sure whether voc.keys() and the F matrix indices are "aligned" correctly
|
||||||
|
dump_it = True
|
||||||
|
if dump_it:
|
||||||
|
with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
|
||||||
|
np.savetxt(outfile, F, delimiter='\t')
|
||||||
|
with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile:
|
||||||
|
for token in voc.keys():
|
||||||
|
outfile.write(token+'\n')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if nC > max_label_space:
|
if nC > max_label_space:
|
||||||
# TODO testing optimal max_label_space
|
# TODO testing optimal max_label_space
|
||||||
if reduction == 'PCA':
|
if reduction == 'PCA':
|
||||||
|
|
@ -75,15 +87,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None',
|
||||||
f'Applying PCA(n_components={max_label_space})')
|
f'Applying PCA(n_components={max_label_space})')
|
||||||
pca = PCA(n_components=max_label_space)
|
pca = PCA(n_components=max_label_space)
|
||||||
pca = pca.fit(F)
|
pca = pca.fit(F)
|
||||||
########################################################
|
|
||||||
# import matplotlib.pyplot as plt
|
|
||||||
# plt.figure()
|
|
||||||
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
|
|
||||||
# plt.xlabel('Number of Components')
|
|
||||||
# plt.ylabel('Variance (%)') #
|
|
||||||
# plt.title(f'WCE Explained Variance {lang}')
|
|
||||||
# plt.show()
|
|
||||||
########################################################
|
|
||||||
F = pca.fit_transform(F)
|
F = pca.fit_transform(F)
|
||||||
elif reduction == 'TSNE':
|
elif reduction == 'TSNE':
|
||||||
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue