fixed common after problematic merge

andrea 2021-02-05 11:22:30 +01:00
parent 1ac850630b
commit a1c4247e17
1 changed file with 3 additions and 98 deletions

@@ -1,103 +1,8 @@
 import numpy as np
 import torch
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import normalize
+from sklearn.feature_extraction.text import TfidfVectorizer
-def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
-    """
-    Index (i.e., replace word strings with numerical indexes) a list of string documents
-    :param data: list of string documents
-    :param vocab: a fixed mapping [str]->[int] of words to indexes
-    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be
-        retained because they are anyway contained in a pre-trained embedding set that we know in advance)
-    :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
-    :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
-    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words
-        that are not in the original vocab but that are in the known_words
-    :return: the list of indexed documents
-    """
-    indexes = []
-    vocabsize = len(vocab)
-    unk_count = 0
-    knw_count = 0
-    out_count = 0
-    # pbar = tqdm(data, desc=f'indexing documents')
-    # for text in pbar:
-    for text in data:
-        words = analyzer(text)
-        index = []
-        for word in words:
-            if word in vocab:
-                idx = vocab[word]
-            else:
-                if word in known_words:
-                    if word not in out_of_vocabulary:
-                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
-                    idx = out_of_vocabulary[word]
-                    out_count += 1
-                else:
-                    idx = unk_index
-                    unk_count += 1
-            index.append(idx)
-        indexes.append(index)
-        knw_count += len(index)
-        # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
-        #                      f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
-    return indexes
-
-
-def define_pad_length(index_list):
-    lengths = [len(index) for index in index_list]
-    return int(np.mean(lengths) + np.std(lengths))
-
-
-def pad(index_list, pad_index, max_pad_length=None):
-    pad_length = np.max([len(index) for index in index_list])
-    if max_pad_length is not None:
-        pad_length = min(pad_length, max_pad_length)
-    for i, indexes in enumerate(index_list):
-        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
-    return index_list
-
-
-class Index:
-    def __init__(self, devel_raw, devel_target, test_raw, lang):
-        self.lang = lang
-        self.devel_raw = devel_raw
-        self.devel_target = devel_target
-        self.test_raw = test_raw
-
-    def index(self, pretrained_vocabulary, analyzer, vocabulary):
-        self.word2index = dict(vocabulary)
-        known_words = set(self.word2index.keys())
-        if pretrained_vocabulary is not None:
-            known_words.update(pretrained_vocabulary)
-
-        self.word2index['UNKTOKEN'] = len(self.word2index)
-        self.word2index['PADTOKEN'] = len(self.word2index)
-        self.unk_index = self.word2index['UNKTOKEN']
-        self.pad_index = self.word2index['PADTOKEN']
-
-        # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
-        self.out_of_vocabulary = dict()
-        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
-        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
-
-        self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
-        print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
-
-    def train_val_split(self, val_prop, max_val, seed):
-        devel = self.devel_index
-        target = self.devel_target
-        devel_raw = self.devel_raw
-        val_size = int(min(len(devel) * val_prop, max_val))
-        self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
-            train_test_split(
-                devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
-            )
 from src.util.embeddings_manager import supervised_embeddings_tfidf
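
Note: the helpers removed by this commit were self-contained. Below is a minimal sketch of how index, define_pad_length and pad fit together; the CountVectorizer analyzer, the toy documents and the word 'meowed' standing in for a pretrained-embedding term are illustrative assumptions, not part of the commit.

    from sklearn.feature_extraction.text import CountVectorizer

    train_docs = ['the cat sat', 'the dog barked']
    test_docs = ['the cat meowed']

    vectorizer = CountVectorizer().fit(train_docs)
    analyzer = vectorizer.build_analyzer()       # str -> list of word strings
    word2index = dict(vectorizer.vocabulary_)    # fixed mapping word -> int
    word2index['UNKTOKEN'] = len(word2index)     # id for words we cannot keep
    word2index['PADTOKEN'] = len(word2index)     # id used for left-padding

    # 'meowed' is outside the training vocabulary; pretend a pretrained
    # embedding set covers it so it is retained as an out-of-vocabulary index
    known_words = set(word2index) | {'meowed'}

    out_of_vocabulary = {}
    test_index = index(test_docs, word2index, known_words, analyzer,
                       unk_index=word2index['UNKTOKEN'],
                       out_of_vocabulary=out_of_vocabulary)
    # out_of_vocabulary == {'meowed': len(word2index)}; unseen words without an
    # embedding would have been mapped to UNKTOKEN instead

    # cap the pad length at mean+std of the document lengths, then left-pad
    padded = pad(test_index, pad_index=word2index['PADTOKEN'],
                 max_pad_length=define_pad_length(test_index))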
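
Note: a similar sketch for the removed Index class, which wraps the same indexing step and then carves a validation split out of the development set (toy inputs again assumed, not part of the commit):

    from sklearn.feature_extraction.text import CountVectorizer

    train_docs = ['the cat sat', 'the dog barked', 'a bird sang']
    test_docs = ['the cat meowed']
    vectorizer = CountVectorizer().fit(train_docs)

    idx = Index(devel_raw=train_docs, devel_target=[0, 1, 0], test_raw=test_docs, lang='en')
    idx.index(pretrained_vocabulary={'meowed'},
              analyzer=vectorizer.build_analyzer(),
              vocabulary=vectorizer.vocabulary_)
    # val_size = int(min(3 * 0.34, 1)) = 1 document is held out for validation
    idx.train_val_split(val_prop=0.34, max_val=1, seed=42)
    # idx.train_index / idx.val_index now hold the indexed train/validation splits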