diff --git a/src/FPEC_andrea.py b/src/FPEC_andrea.py
index 137e6cc..185bcc2 100644
--- a/src/FPEC_andrea.py
+++ b/src/FPEC_andrea.py
@@ -104,7 +104,7 @@ if __name__ == '__main__':
     ##### TODO - config dict is redundant - we have already op argparse ...
     config['reduction'] = 'PCA'
-    config['max_label_space'] = 'optimal'
+    config['max_label_space'] = 300

     result_id = dataset_file + 'PolyEmbedd_andrea_' + _config_id + ('_optimC' if op.optimc else '')
diff --git a/src/data/embeddings.py b/src/data/embeddings.py
index b5b253a..8005dad 100644
--- a/src/data/embeddings.py
+++ b/src/data/embeddings.py
@@ -217,7 +217,7 @@ class StorageEmbeddings:
             print(f'Matrix U (weighted sum) of shape {self.lang_U[lang].shape}\n')
         return

-    def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space):
+    def _add_emebeddings_supervised(self, docs, labels, reduction, max_label_space, voc):
         _optimal = dict()
         # TODO testing optimal max_label_space
         if max_label_space == 'optimal':
@@ -227,7 +227,7 @@ class StorageEmbeddings:
         for lang in docs.keys():
             print(f'# [supervised-matrix] for {lang}')
-            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, lang)
+            self.lang_S[lang] = get_supervised_embeddings(docs[lang], labels[lang], reduction, max_label_space, voc[lang], lang)
             print(f'[embedding matrix done] of shape={self.lang_S[lang].shape}\n')
         return

@@ -241,7 +241,7 @@ class StorageEmbeddings:
         if config['unsupervised']:
             self._add_embeddings_unsupervised(config['we_type'], docs, vocs)
         if config['supervised']:
-            self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'])
+            self._add_emebeddings_supervised(docs, labels, config['reduction'], config['max_label_space'], vocs)
         return self

     def predict(self, config, docs):
@@ -269,10 +269,11 @@ class StorageEmbeddings:
         for lang in docs.keys():
             _r = get_supervised_embeddings(docs[lang], labels[lang], reduction='PCA', max_label_space='optimal').tolist()
-            plt.plot(np.cumsum(_r), label=lang)
+            _r = np.cumsum(_r)
+            plt.plot(_r, label=lang)
             for i in range(len(_r)-1, 1, -1):
                 # todo: if n_components (therfore #n labels) is not big enough every value will be smaller than the next one ...
-                delta = _r[i-1] - _r[i]
+                delta = _r[i] - _r[i-1]
                 if delta > 0:
                     _idx.append(i)
                     break
diff --git a/src/data/supervised.py b/src/data/supervised.py
index 02f8c84..d8e1f7d 100755
--- a/src/data/supervised.py
+++ b/src/data/supervised.py
@@ -40,7 +40,7 @@ def supervised_embeddings_tsr(X,Y, tsr_function=information_gain, max_documents=
     return F


-def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
+def get_supervised_embeddings(X, Y, reduction, max_label_space=300, voc=None, lang='None', binary_structural_problems=-1, method='dotn', dozscore=True):
     if max_label_space == 'optimal':
         max_label_space = 0

@@ -63,6 +63,18 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None',
     if dozscore:
         F = zscores(F, axis=0)

+    # Dumping F-matrix for further studies
+    # TODO im not sure if voc.keys and F matrix indices are "aligned" correctly
+    dump_it = True
+    if dump_it:
+        with open(f'/home/andreapdr/funneling_pdr/src/dumps/WCE_{lang}.tsv', 'w') as outfile:
+            np.savetxt(outfile, F, delimiter='\t')
+        with open(f'/home/andreapdr/funneling_pdr/src/dumps/dict_WCE_{lang}.tsv', 'w') as outfile:
+            for token in voc.keys():
+                outfile.write(token+'\n')
+
+
+
     if nC > max_label_space:
         # TODO testing optimal max_label_space
         if reduction == 'PCA':
@@ -75,15 +87,6 @@ def get_supervised_embeddings(X, Y, reduction, max_label_space=300, lang='None',
                   f'Applying PCA(n_components={max_label_space})')
            pca = PCA(n_components=max_label_space)
            pca = pca.fit(F)
-            ########################################################
-            # import matplotlib.pyplot as plt
-            # plt.plot(np.cumsum(pca.explained_variance_ratio_))
-            # plt.xlabel('Number of Components')
-            # plt.ylabel('Variance (%)') #
-            # plt.title(f'WCE Explained Variance {lang}')
-            # plt.show()
-            ########################################################
             F = pca.fit_transform(F)
         elif reduction == 'TSNE':
             print(f'supervised matrix has more dimensions ({nC}) than the allowed limit {max_label_space}. '