# gFun/src/util/common.py
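"""
Shared utilities for the gFun experiments: document indexing and padding, per-language TF-IDF
vectorization, embedding-matrix composition (pretrained MUSE and supervised word-class embeddings),
mBERT document-embedding extraction, and batching/training helpers.
"""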
import subprocess
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from embeddings.supervised import get_supervised_embeddings
import numpy as np
from tqdm import tqdm
import torch
warnings.filterwarnings("ignore", category=DeprecationWarning)
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
"""
Index (i.e., replaces word strings with numerical indexes) a list of string documents
:param data: list of string documents
:param vocab: a fixed mapping [str]->[int] of words to indexes
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
because they are anyway contained in a pre-trained embedding set that we know in advance)
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
are not in the original vocab but that are in the known_words
    :return: the indexed documents, i.e., a list of lists of int word indexes (one list per document)
"""
indexes=[]
vocabsize = len(vocab)
unk_count = 0
knw_count = 0
out_count = 0
    pbar = tqdm(data, desc='indexing documents')
for text in pbar:
words = analyzer(text)
        indexed_doc = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            elif word in known_words:
                if word not in out_of_vocabulary:
                    out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                idx = out_of_vocabulary[word]
                out_count += 1
            else:
                idx = unk_index
                unk_count += 1
            indexed_doc.append(idx)
        indexes.append(indexed_doc)
        knw_count += len(indexed_doc)
        denom = max(knw_count, 1)  # guards against division by zero when the first documents are empty
        pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100. * unk_count / denom):.2f}%]'
                             f'[out = {out_count}/{knw_count}={(100. * out_count / denom):.2f}%]')
return indexes
def define_pad_length(index_list):
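    """Heuristic for the padding length: the mean document length plus one standard deviation."""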
lengths = [len(index) for index in index_list]
return int(np.mean(lengths)+np.std(lengths))
def pad(index_list, pad_index, max_pad_length=None):
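    """Left-pads each index list with pad_index up to the longest document (capped at max_pad_length)
    and truncates longer documents; the input list is modified in place and also returned."""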
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i,indexes in enumerate(index_list):
index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
return index_list
class Index:
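    """Monolingual index: stores the raw development and test documents of one language and exposes
    their numerical (indexed) versions, the train/validation split, and the language-specific
    embedding matrix (pretrained and/or supervised)."""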
def __init__(self, devel_raw, devel_target, test_raw, lang):
self.lang = lang
self.devel_raw = devel_raw
self.devel_target = devel_target
self.test_raw = test_raw
def index(self, pretrained_vocabulary, analyzer, vocabulary):
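        """Builds the word->index mapping (adding UNK and PAD tokens) and indexes the development and
        test documents, tracking out-of-vocabulary words that are covered by the pretrained vocabulary."""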
self.word2index = dict(vocabulary)
known_words = set(self.word2index.keys())
if pretrained_vocabulary is not None:
known_words.update(pretrained_vocabulary)
self.word2index['UNKTOKEN'] = len(self.word2index)
self.word2index['PADTOKEN'] = len(self.word2index)
self.unk_index = self.word2index['UNKTOKEN']
self.pad_index = self.word2index['PADTOKEN']
# index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
self.out_of_vocabulary = dict()
self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
def train_val_split(self, val_prop, max_val, seed):
devel = self.devel_index
target = self.devel_target
devel_raw = self.devel_raw
val_size = int(min(len(devel) * val_prop, max_val))
self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
train_test_split(
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
)
print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
def get_word_list(self):
def extract_word_list(word2index):
return [w for w,i in sorted(word2index.items(), key=lambda x: x[1])]
word_list = extract_word_list(self.word2index)
word_list += extract_word_list(self.out_of_vocabulary)
return word_list
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
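        """Builds the language embedding matrix by horizontally concatenating the pretrained word
        embeddings (e.g., MUSE) and, if requested, the supervised word-class embeddings computed from
        Xtr/Ytr; the column range occupied by the supervised part is stored in self.wce_range."""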
print(f'[generating embedding matrix for lang {self.lang}]')
self.wce_range = None
embedding_parts = []
if pretrained is not None:
print('\t[pretrained-matrix]')
word_list = self.get_word_list()
muse_embeddings = pretrained.extract(word_list)
embedding_parts.append(muse_embeddings)
del pretrained
if supervised:
print('\t[supervised-matrix]')
F = get_supervised_embeddings(Xtr, Ytr, reduction=None, method='dotn')
num_missing_rows = self.vocabsize - F.shape[0]
F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
F = torch.from_numpy(F).float()
offset = 0
if embedding_parts:
offset = embedding_parts[0].shape[1]
self.wce_range = [offset, offset + F.shape[1]]
embedding_parts.append(F)
make_dumps = False
if make_dumps:
print(f'Dumping Embedding Matrices ...')
import pickle
with open(f'../dumps/dump_{self.lang}_rcv.pkl', 'wb') as outfile:
pickle.dump((self.lang, embedding_parts, self.word2index), outfile)
with open(f'../dumps/corpus_{self.lang}_rcv.pkl', 'wb') as outfile2:
pickle.dump((self.lang, self.devel_raw, self.devel_target), outfile2)
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
def none_dict(langs):
return {l:None for l in langs}
class MultilingualIndex:
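    """Collection of per-language Index objects behind a single interface. Typical usage (sketch):
    call index(...) to build the vocabularies, then train_val_split(...), then embedding_matrices(...)
    or bert_embeddings(...), and access the per-language views through the l_*() getters."""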
def __init__(self): #, add_language_trace=False):
self.l_index = {}
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
# self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True, max_features=25000)
        # self.add_language_trace = add_language_trace
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary):
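        """Fits one TF-IDF vectorizer per language on the development data and indexes the development
        and test documents of each language with the resulting vocabulary and analyzer."""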
self.langs = sorted(l_devel_raw.keys())
#build the vocabularies
self.l_vectorizer.fit(l_devel_raw)
l_vocabulary = self.l_vectorizer.vocabulary()
l_analyzer = self.l_vectorizer.get_analyzer()
for l in self.langs:
self.l_index[l] = Index(l_devel_raw[l], l_devel_target[l], l_test_raw[l], l)
self.l_index[l].index(l_pretrained_vocabulary[l], l_analyzer[l], l_vocabulary[l])
def get_indexed(self, l_texts, pretrained_vocabulary=None):
        assert len(self.l_index) != 0, 'Cannot index new data before calling index() on the MultilingualIndex!'
l_indexed = {}
for l, texts in l_texts.items():
if l in self.langs:
word2index = self.l_index[l].word2index
known_words = set(word2index.keys())
                if pretrained_vocabulary is not None and pretrained_vocabulary.get(l) is not None:
                    known_words.update(pretrained_vocabulary[l])
l_indexed[l] = index(texts,
vocab=word2index,
known_words=known_words,
analyzer=self.l_vectorizer.get_analyzer(l),
unk_index=word2index['UNKTOKEN'],
out_of_vocabulary=dict())
return l_indexed
def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
for l,index in self.l_index.items():
index.train_val_split(val_prop, max_val, seed=seed)
def embedding_matrices(self, lpretrained, supervised):
lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
lYtr = self.l_train_target() if supervised else none_dict(self.langs)
for l,index in self.l_index.items():
index.compose_embedding_matrix(lpretrained[l], supervised, lXtr[l], lYtr[l])
self.sup_range = index.wce_range
# TODO circular import with transformers --> when generating posterior prob, we import PosteriorProbabilitiesEmbedder which is defined in transformers
# def posterior_probabilities(self, max_training_docs_by_lang=5000, store_posteriors=False, stored_post=False):
# # choose a maximum of "max_training_docs_by_lang" for training the calibrated SVMs
# timeit = time.time()
# lXtr = {l:Xtr for l,Xtr in self.get_lXtr().items()}
# lYtr = {l:Ytr for l,Ytr in self.l_train_target().items()}
# if not stored_post:
# for l in self.langs:
# n_elements = lXtr[l].shape[0]
# if n_elements > max_training_docs_by_lang:
# choice = np.random.permutation(n_elements)[:max_training_docs_by_lang]
# lXtr[l] = lXtr[l][choice]
# lYtr[l] = lYtr[l][choice]
#
# # train the posterior probabilities embedder
# print('[posteriors] training a calibrated SVM')
# learner = SVC(kernel='linear', probability=True, cache_size=1000, C=1, random_state=1, gamma='auto')
# prob_embedder = PosteriorProbabilitiesEmbedder(learner, l2=False)
# prob_embedder.fit(lXtr, lYtr)
#
# # transforms the training, validation, and test sets into posterior probabilities
# print('[posteriors] generating posterior probabilities')
# lPtr = prob_embedder.transform(self.get_lXtr())
# lPva = prob_embedder.transform(self.get_lXva())
# lPte = prob_embedder.transform(self.get_lXte())
# # NB: Check splits indices !
# if store_posteriors:
# import pickle
# with open('../dumps/posteriors_fulljrc.pkl', 'wb') as outfile:
# pickle.dump([lPtr, lPva, lPte], outfile)
# print(f'Successfully dumped posteriors!')
# else:
# import pickle
# with open('../dumps/posteriors_fulljrc.pkl', 'rb') as infile:
# lPtr, lPva, lPte = pickle.load(infile)
# print(f'Successfully loaded stored posteriors!')
# print(f'[posteriors] done in {time.time() - timeit}')
# return lPtr, lPva, lPte
def bert_embeddings(self, bert_path, max_len=512, batch_size=64, stored_embeddings=False):
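        """Extracts mBERT document embeddings (the [CLS] vector of the last hidden layer) for the
        training, validation, and test sets of every language, then frees the GPU memory used by the model."""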
show_gpu('GPU memory before initializing mBert model:')
# TODO: load dumped embeddings?
from experiment_scripts.main_mbert_extractor import do_tokenization, ExtractorDataset, DataLoader
from transformers import BertConfig, BertForSequenceClassification
print('[mBERT] generating mBERT doc embeddings')
lXtr_raw = self.get_raw_lXtr()
lXva_raw = self.get_raw_lXva()
lXte_raw = self.get_raw_lXte()
print('# Tokenizing datasets')
l_tokenized_tr = do_tokenization(lXtr_raw, max_len=max_len, verbose=False)
tr_dataset = ExtractorDataset(l_tokenized_tr)
tr_lang_ids = tr_dataset.lang_ids
tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=False)
l_tokenized_va = do_tokenization(lXva_raw, max_len=max_len, verbose=False)
va_dataset = ExtractorDataset(l_tokenized_va)
va_lang_ids = va_dataset.lang_ids
va_dataloader = DataLoader(va_dataset, batch_size=batch_size, shuffle=False)
l_tokenized_te = do_tokenization(lXte_raw, max_len=max_len, verbose=False)
te_dataset = ExtractorDataset(l_tokenized_te)
te_lang_ids = te_dataset.lang_ids
te_dataloader = DataLoader(te_dataset, batch_size=batch_size, shuffle=False)
num_labels = self.l_index[self.langs[0]].val_target.shape[1]
config = BertConfig.from_pretrained('bert-base-multilingual-cased', output_hidden_states=True,
num_labels=num_labels)
model = BertForSequenceClassification.from_pretrained(bert_path,
config=config).cuda()
print('# Extracting document embeddings')
tr_bert_embeddings, id2lang_tr = self.do_bert_embeddings(model, tr_dataloader, tr_lang_ids, verbose=False)
va_bert_embeddings, id2lang_va = self.do_bert_embeddings(model, va_dataloader, va_lang_ids, verbose=False)
te_bert_embeddings, id2lang_te = self.do_bert_embeddings(model, te_dataloader, te_lang_ids, verbose=False)
        show_gpu('GPU memory after running the mBert model:')
# Freeing GPU's memory
import gc
del model, tr_dataloader, va_dataloader, te_dataloader
gc.collect()
torch.cuda.empty_cache()
show_gpu('GPU memory after clearing cache:')
return tr_bert_embeddings, va_bert_embeddings, te_bert_embeddings
@staticmethod
def do_bert_embeddings(model, data, lang_ids, verbose=True):
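        """Runs the model in inference mode over the dataloader and stacks, per language, the [CLS]
        embeddings of the last hidden layer; returns the {language: embedding matrix} dict and the
        id->language mapping."""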
if verbose:
print('# Feature Extractor Mode...')
all_batch_embeddings = {}
id2lang = {v: k for k, v in lang_ids.items()}
with torch.no_grad():
for batch, lang_idx in data:
out = model(batch.cuda())
last_hidden_state = out[1][-1]
batch_embeddings = last_hidden_state[:, 0, :]
for i, l_idx in enumerate(lang_idx.numpy()):
if id2lang[l_idx] not in all_batch_embeddings.keys():
all_batch_embeddings[id2lang[l_idx]] = batch_embeddings[i].detach().cpu().numpy()
else:
all_batch_embeddings[id2lang[l_idx]] = np.vstack((all_batch_embeddings[id2lang[l_idx]],
batch_embeddings[i].detach().cpu().numpy()))
return all_batch_embeddings, id2lang
    def get_raw_lXtr(self):
        lXtr_raw = {k: [] for k in self.langs}
        for lang in self.langs:
            lXtr_raw[lang] = self.l_index[lang].train_raw
        return lXtr_raw
def get_raw_lXva(self):
lXva_raw = {k: [] for k in self.langs}
for lang in self.langs:
lXva_raw[lang] = self.l_index[lang].val_raw
return lXva_raw
def get_raw_lXte(self):
lXte_raw = {k: [] for k in self.langs}
for lang in self.langs:
lXte_raw[lang] = self.l_index[lang].test_raw
return lXte_raw
def get_lXtr(self):
if not hasattr(self, 'lXtr'):
self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()})
return self.lXtr
def get_lXva(self):
if not hasattr(self, 'lXva'):
self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()})
return self.lXva
def get_lXte(self):
if not hasattr(self, 'lXte'):
self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
return self.lXte
def l_vocabsize(self):
return {l:index.vocabsize for l,index in self.l_index.items()}
def l_embeddings(self):
return {l:index.embedding_matrix for l,index in self.l_index.items()}
def l_pad(self):
return {l: index.pad_index for l, index in self.l_index.items()}
def l_train_index(self):
return {l: index.train_index for l, index in self.l_index.items()}
def l_train_target(self):
return {l: index.train_target for l, index in self.l_index.items()}
def l_val_index(self):
return {l: index.val_index for l, index in self.l_index.items()}
def l_val_target(self):
return {l: index.val_target for l, index in self.l_index.items()}
def l_test_index(self):
return {l: index.test_index for l, index in self.l_index.items()}
def l_devel_index(self):
return {l: index.devel_index for l, index in self.l_index.items()}
def l_devel_target(self):
return {l: index.devel_target for l, index in self.l_index.items()}
def l_train(self):
return self.l_train_index(), self.l_train_target()
def l_val(self):
return self.l_val_index(), self.l_val_target()
class Batch:
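    """Stateful batcher that cycles through the languages at every step, keeping a per-language offset
    that wraps around so that smaller languages are re-iterated until the requested number of batches
    per epoch has been produced."""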
def __init__(self, batchsize, batches_per_epoch, languages, lpad, max_pad_length=500):
self.batchsize = batchsize
self.batches_per_epoch = batches_per_epoch
self.languages = languages
self.lpad=lpad
self.max_pad_length=max_pad_length
self.init_offset()
def init_offset(self):
self.offset = {lang: 0 for lang in self.languages}
def batchify(self, l_index, l_post, l_bert, llabels):
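        """Yields (batch, posteriors, bert_embeddings, target, lang) tuples, one language at a time per
        step; posteriors and bert_embeddings are None when the corresponding view is not available."""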
langs = self.languages
l_num_samples = {l:len(l_index[l]) for l in langs}
max_samples = max(l_num_samples.values())
n_batches = max_samples // self.batchsize + 1 * (max_samples % self.batchsize > 0)
if self.batches_per_epoch != -1 and self.batches_per_epoch < n_batches:
n_batches = self.batches_per_epoch
for b in range(n_batches):
for lang in langs:
index, labels = l_index[lang], llabels[lang]
offset = self.offset[lang]
if offset >= l_num_samples[lang]:
offset = 0
limit = offset+self.batchsize
batch_slice = slice(offset, limit)
batch = index[batch_slice]
batch_labels = labels[batch_slice].toarray()
post = None
if l_post is not None:
post = torch.FloatTensor(l_post[lang][batch_slice]).cuda()
bert_emb = None
if l_bert is not None:
bert_emb = torch.FloatTensor(l_bert[lang][batch_slice]).cuda()
batch = pad(batch, pad_index=self.lpad[lang], max_pad_length=self.max_pad_length)
batch = torch.LongTensor(batch).cuda()
target = torch.FloatTensor(batch_labels).cuda()
self.offset[lang] = limit
yield batch, post, bert_emb, target, lang
def batchify(l_index, l_post, llabels, batchsize, lpad, max_pad_length=500):
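    """Stateless variant of Batch.batchify: iterates each language sequentially (without wrapping
    around) and yields (batch, posteriors, target, lang) tuples."""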
langs = sorted(l_index.keys())
nsamples = max([len(l_index[l]) for l in langs])
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
for lang in langs:
index, labels = l_index[lang], llabels[lang]
if b * batchsize >= len(index):
continue
batch = index[b*batchsize:(b+1)*batchsize]
batch_labels = labels[b*batchsize:(b+1)*batchsize].toarray()
post = None
if l_post is not None:
post = torch.FloatTensor(l_post[lang][b*batchsize:(b+1)*batchsize]).cuda()
batch = pad(batch, pad_index=lpad[lang], max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
target = torch.FloatTensor(batch_labels)
yield batch.cuda(), post, target.cuda(), lang
def batchify_unlabelled(index_list, batchsize, pad_index, max_pad_length=500):
nsamples = len(index_list)
nbatches = nsamples // batchsize + 1*(nsamples%batchsize>0)
for b in range(nbatches):
batch = index_list[b*batchsize:(b+1)*batchsize]
batch = pad(batch, pad_index=pad_index, max_pad_length=max_pad_length)
batch = torch.LongTensor(batch)
yield batch.cuda()
def clip_gradient(model, clip_value=1e-1):
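    """Clamps every parameter gradient of the model to the range [-clip_value, clip_value] in place."""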
params = list(filter(lambda p: p.grad is not None, model.parameters()))
for p in params:
p.grad.data.clamp_(-clip_value, clip_value)
def predict(logits, classification_type='multilabel'):
    """Turns logits into hard predictions: thresholded sigmoid (0.5) for multilabel, argmax for singlelabel."""
    if classification_type == 'multilabel':
        prediction = torch.sigmoid(logits) > 0.5
    elif classification_type == 'singlelabel':
        prediction = torch.argmax(logits, dim=1).view(-1, 1)
    else:
        raise ValueError(f'unknown classification type: {classification_type}')
    return prediction.detach().cpu().numpy()
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
def show_gpu(msg):
"""
ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
"""
def query(field):
return (subprocess.check_output(
['nvidia-smi', f'--query-gpu={field}',
'--format=csv,nounits,noheader'],
encoding='utf-8'))
def to_int(result):
return int(result.strip().split('\n')[0])
used = to_int(query('memory.used'))
total = to_int(query('memory.total'))
pct = used / total
    print('\n' + msg, f'{100 * pct:2.1f}% ({used} MiB out of {total} MiB)')
class TfidfVectorizerMultilingual:
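    """Thin wrapper that fits one sklearn TfidfVectorizer per language; lX is a {language: list of documents} dict."""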
def __init__(self, **kwargs):
self.kwargs = kwargs
def fit(self, lX, ly=None):
self.langs = sorted(lX.keys())
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
return self
def transform(self, lX):
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
def fit_transform(self, lX, ly=None):
return self.fit(lX, ly).transform(lX)
def vocabulary(self, l=None):
if l is None:
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
else:
return self.vectorizer[l].vocabulary_
def get_analyzer(self, l=None):
if l is None:
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
def get_learner(calibrate=False, kernel='linear', C=1):
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
def get_params(optimc=False):
if not optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
def get_method_name(dataset, posteriors, supervised, pretrained, mbert, gru,
gruMUSE, gruWCE, agg, allprob):
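    """Builds a short method identifier from the enabled components (X=posteriors, W=supervised,
    M=pretrained, B=mBERT, G=GRU) plus optional suffixes, together with a compact dataset identifier."""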
_id = '-'
_id_conf = [posteriors, supervised, pretrained, mbert, gru]
_id_name = ['X', 'W', 'M', 'B', 'G']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id if not gruMUSE else _id + '_muse'
_id = _id if not gruWCE else _id + '_wce'
_id = _id if not agg else _id + '_mean'
_id = _id if not allprob else _id + '_allprob'
_dataset_path = dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
return _id, dataset_id