# gFun/refactor/util/common.py

import numpy as np
import torch
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from util.embeddings_manager import supervised_embeddings_tfidf
class TfidfVectorizerMultilingual:
def __init__(self, **kwargs):
self.kwargs = kwargs
def fit(self, lX, ly=None):
self.langs = sorted(lX.keys())
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
return self
def transform(self, lX):
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
def fit_transform(self, lX, ly=None):
return self.fit(lX, ly).transform(lX)
def vocabulary(self, l=None):
if l is None:
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
else:
return self.vectorizer[l].vocabulary_
def get_analyzer(self, l=None):
if l is None:
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
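# Usage sketch (hypothetical toy corpora; `lX` maps language codes to lists of raw texts):
#   lX = {'en': ['the cat sat', 'dogs bark'], 'it': ['il gatto dorme']}
#   vec = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
#   lV = vec.fit_transform(lX)  # {'en': sparse tf-idf matrix, 'it': sparse tf-idf matrix}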
def _normalize(lX, l2=True):
    """L2-normalize each language's document matrix row-wise; pass through unchanged if l2 is False."""
    return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX
def none_dict(langs):
    """Return a {language: None} placeholder dictionary."""
    return {l: None for l in langs}
class MultilingualIndex:
def __init__(self):
"""
        Container of per-language (monolingual) Index objects, kept aligned via a shared
        multilingual tf-idf vectorizer.
"""
self.l_index = {}
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
    def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None):
        """
        Fit the multilingual tf-idf vectorizer on the development data and build one
        monolingual Index per language.
        """
self.langs = sorted(l_devel_raw.keys())
self.l_vectorizer.fit(l_devel_raw)
l_vocabulary = self.l_vectorizer.vocabulary()
l_analyzer = self.l_vectorizer.get_analyzer()
if l_pretrained_vocabulary is None:
l_pretrained_vocabulary = none_dict(self.langs)
for lang in self.langs:
# Init monolingual Index
self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], lang)
# call to index() function of monolingual Index
self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang])
def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
for l,index in self.l_index.items():
index.train_val_split(val_prop, max_val, seed=seed)
def embedding_matrices(self, lpretrained, supervised):
"""
        Extracts from the pretrained embeddings the words found in the training set; then, for each
        language, calls the respective monolingual index to build its embedding matrix (if supervised,
        Word-Class Embeddings are concatenated to the unsupervised vectors).
        :param lpretrained: dict {lang: matrix of word-embeddings}
        :param supervised: bool, whether to deploy Word-Class Embeddings (WCE) or not
        :return: self
"""
lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
lYtr = self.l_train_target() if supervised else none_dict(self.langs)
lWordList = self.get_wordlist()
lExtracted = lpretrained.extract(lWordList)
        for lang, index in self.l_index.items():
            # if supervised, concatenate the pretrained (unsupervised) matrix with the
            # supervised word-class embedding matrix along the feature axis
            index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang])
            self.sup_range = index.wce_range  # identical across languages
return self
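    # Expected contract of `lpretrained` (inferred from the usage above, not guaranteed):
    #   lExtracted = lpretrained.extract({'en': ['cat', 'dog', ...], ...})
    #   lExtracted['en']  # -> torch tensor of shape (len(wordlist), embedding_dim)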
def get_wordlist(self):
wordlist = {}
for lang, index in self.l_index.items():
wordlist[lang] = index.get_word_list()
return wordlist
    def get_raw_lXtr(self):
        return {lang: self.l_index[lang].train_raw for lang in self.langs}
    def get_raw_lXva(self):
        return {lang: self.l_index[lang].val_raw for lang in self.langs}
    def get_raw_lXte(self):
        return {lang: self.l_index[lang].test_raw for lang in self.langs}
def get_lXtr(self):
if not hasattr(self, 'lXtr'):
self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()})
return self.lXtr
def get_lXva(self):
if not hasattr(self, 'lXva'):
self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()})
return self.lXva
def get_lXte(self):
if not hasattr(self, 'lXte'):
self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
return self.lXte
def get_target_dim(self):
return self.l_index[self.langs[0]].devel_target.shape[1]
def l_vocabsize(self):
return {l:index.vocabsize for l,index in self.l_index.items()}
def l_embeddings(self):
return {l:index.embedding_matrix for l,index in self.l_index.items()}
def l_pad(self):
return {l: index.pad_index for l, index in self.l_index.items()}
def l_train_index(self):
return {l: index.train_index for l, index in self.l_index.items()}
def l_train_raw_index(self):
return {l: index.train_raw for l, index in self.l_index.items()}
def l_train_target(self):
return {l: index.train_target for l, index in self.l_index.items()}
def l_val_index(self):
return {l: index.val_index for l, index in self.l_index.items()}
def l_val_raw_index(self):
return {l: index.val_raw for l, index in self.l_index.items()}
def l_val_target(self):
return {l: index.val_target for l, index in self.l_index.items()}
def l_test_target(self):
return {l: index.test_target for l, index in self.l_index.items()}
def l_test_index(self):
return {l: index.test_index for l, index in self.l_index.items()}
    def l_test_raw(self):
        # TODO: return the raw (untokenized) test documents
        raise NotImplementedError('MultilingualIndex.l_test_raw is not implemented yet')
def l_devel_index(self):
return {l: index.devel_index for l, index in self.l_index.items()}
def l_devel_target(self):
return {l: index.devel_target for l, index in self.l_index.items()}
def l_train(self):
return self.l_train_index(), self.l_train_target()
def l_val(self):
return self.l_val_index(), self.l_val_target()
def l_test(self):
return self.l_test_index(), self.l_test_target()
def l_train_raw(self):
return self.l_train_raw_index(), self.l_train_target()
def l_val_raw(self):
return self.l_val_raw_index(), self.l_val_target()
def get_l_pad_index(self):
return {l: index.get_pad_index() for l, index in self.l_index.items()}
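# Typical pipeline sketch (hypothetical data dicts keyed by language; `lpretrained` is assumed
# to expose an extract(wordlists) method, as used by embedding_matrices above):
#   mi = MultilingualIndex()
#   mi.index(l_devel_raw, l_devel_target, l_test_raw, l_test_target)
#   mi.train_val_split(val_prop=0.2, max_val=2000, seed=42)
#   mi.embedding_matrices(lpretrained, supervised=True)
#   lXtr, lYtr = mi.l_train()  # per-language indexed documents and targets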
class Index:
def __init__(self, devel_raw, devel_target, test_raw, test_target, lang):
"""
        Monolingual index: takes care of tokenizing the raw data, converting strings to ids, and
        splitting the development data into training and validation.
        :param devel_raw: list of strings, raw development (training) texts
        :param devel_target: matrix-like, target labels of the development documents
        :param test_raw: list of strings, raw test texts
        :param test_target: matrix-like, target labels of the test documents
        :param lang: str, the language of the documents
"""
self.lang = lang
self.devel_raw = devel_raw
self.devel_target = devel_target
self.test_raw = test_raw
self.test_target = test_target
def index(self, pretrained_vocabulary, analyzer, vocabulary):
self.word2index = dict(vocabulary)
known_words = set(self.word2index.keys())
if pretrained_vocabulary is not None:
known_words.update(pretrained_vocabulary)
self.word2index['UNKTOKEN'] = len(self.word2index)
self.word2index['PADTOKEN'] = len(self.word2index)
self.unk_index = self.word2index['UNKTOKEN']
self.pad_index = self.word2index['PADTOKEN']
        # index documents and keep track of test terms outside the development vocabulary
        # that are nevertheless covered by the pretrained embeddings (e.g., MUSE), if available
self.out_of_vocabulary = dict()
self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
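    # Illustrative vocabulary layout after index() (hypothetical 3-word tf-idf vocabulary):
    #   word2index = {'cat': 0, 'dog': 1, 'sun': 2, 'UNKTOKEN': 3, 'PADTOKEN': 4}
    #   out_of_vocabulary then grows from id 5 upwards for words outside word2index that the
    #   pretrained embeddings cover, and self.vocabsize counts both dictionaries.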
def get_pad_index(self):
return self.pad_index
def train_val_split(self, val_prop, max_val, seed):
devel = self.devel_index
target = self.devel_target
devel_raw = self.devel_raw
val_size = int(min(len(devel) * val_prop, max_val))
self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
train_test_split(
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True)
print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
def get_word_list(self):
def extract_word_list(word2index):
return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]
word_list = extract_word_list(self.word2index)
word_list += extract_word_list(self.out_of_vocabulary)
return word_list
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
print(f'[generating embedding matrix for lang {self.lang}]')
self.wce_range = None
embedding_parts = []
if pretrained is not None:
print('\t[pretrained-matrix]')
embedding_parts.append(pretrained)
del pretrained
        if supervised:
            print('\t[supervised-matrix]')
            F = supervised_embeddings_tfidf(Xtr, Ytr)
            # pad with zero rows up to the full vocabulary size (OOV and UNK/PAD entries)
            num_missing_rows = self.vocabsize - F.shape[0]
            F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
            F = torch.from_numpy(F).float()
            offset = 0
            if embedding_parts:
                offset = embedding_parts[0].shape[1]
            # columns [offset, offset + n_classes) of the composed matrix hold the WCE
            self.wce_range = [offset, offset + F.shape[1]]
            embedding_parts.append(F)
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
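    # Shape sketch (hypothetical numbers): with a 50,000-entry vocabulary, 300-d pretrained
    # vectors and 73 classes, the composed matrix is 50,000 x 373 and wce_range == [300, 373].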
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
"""
    Indexes (i.e., replaces word strings with numerical ids) a list of string documents.
    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] of words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab,
    can be retained because they are contained in a pre-trained embedding set known in advance)
    :param analyzer: the preprocessor in charge of transforming a document string into a sequence of word strings
    :param unk_index: the index of the 'unknown token', i.e., a symbol standing in for all words that cannot be kept
    :param out_of_vocabulary: an incremental mapping [str]->[int] indexing the words that are not in the original
    vocab but are among the known_words
    :return: list (one per document) of lists of int ids
"""
    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    pbar = tqdm(data, desc='indexing')
for text in pbar:
words = analyzer(text)
index = []
for word in words:
if word in vocab:
idx = vocab[word]
else:
if word in known_words:
if word not in out_of_vocabulary:
out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
idx = out_of_vocabulary[word]
out_count += 1
else:
idx = unk_index
unk_count += 1
index.append(idx)
indexes.append(index)
knw_count += len(index)
# pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
# f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
return indexes
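# Worked sketch of index() (hypothetical inputs; str.split stands in for the analyzer):
#   vocab = {'cat': 0, 'sat': 1, 'UNKTOKEN': 2, 'PADTOKEN': 3}
#   known = {'cat', 'sat', 'mat'}; oov = {}
#   index(['the cat sat on the mat'], vocab, known, str.split, unk_index=2, out_of_vocabulary=oov)
#   -> [[2, 0, 1, 2, 2, 4]]  # 'the'/'on' map to UNK; 'mat' gets the new out-of-vocabulary id 4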
def is_true(tensor, device):
    """Element-wise indicator of tensor == 1, returned as a float tensor."""
    return torch.where(tensor == 1, torch.ones(1, device=device), torch.zeros(1, device=device))
def is_false(tensor, device):
    """Element-wise indicator of tensor == 0, returned as a float tensor."""
    return torch.where(tensor == 0, torch.ones(1, device=device), torch.zeros(1, device=device))
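# Usage sketch (hypothetical): binary indicator masks over a {0, 1} label tensor.
#   y = torch.tensor([1., 0., 1.])
#   is_true(y, 'cpu')   # -> tensor([1., 0., 1.])
#   is_false(y, 'cpu')  # -> tensor([0., 1., 0.])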