fixed common after a problematic merge
This commit is contained in:
parent
1ac850630b
commit
a1c4247e17
@@ -1,103 +1,8 @@
import warnings

import numpy as np
import torch

warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    """
    Indexes (i.e., replaces word strings with numerical indexes) a list of string documents.

    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] of words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be
        retained because they are contained in a pre-trained embedding set that we know in advance)
    :param analyzer: the preprocessor in charge of transforming a document string into a chain of string words
    :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those
        words that are not in the original vocab but that are in the known_words
    :return: the list of documents, each converted into a list of word indexes
    """
    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    # pbar = tqdm(data, desc=f'indexing documents')
    # for text in pbar:
    for text in data:
        words = analyzer(text)
        index = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            else:
                if word in known_words:
                    # first occurrence of a known out-of-vocabulary word: assign it a fresh index
                    if word not in out_of_vocabulary:
                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:
                    idx = unk_index
                    unk_count += 1
            index.append(idx)
        indexes.append(index)
        knw_count += len(index)
        # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
        #                      f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
    return indexes
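
# Hedged usage sketch for index(); the toy vocabulary, analyzer, and document
# below are illustrative assumptions, not part of this module:
#
#   vocab = {'the': 0, 'cat': 1, 'UNKTOKEN': 2, 'PADTOKEN': 3}
#   oov = {}
#   indexed = index(['the cat saw the dog'], vocab, known_words={'dog'},
#                   analyzer=str.split, unk_index=2, out_of_vocabulary=oov)
#   # -> [[0, 1, 2, 0, 4]]: 'saw' falls back to unk_index, while 'dog' (known,
#   # e.g., via pretrained embeddings) gets the fresh index 4, so oov == {'dog': 4}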

def define_pad_length(index_list):
    # heuristic: pad to the mean document length plus one standard deviation,
    # so that most documents fit without padding everything to the maximum
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))
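
# Worked example (hypothetical lengths): for documents of lengths [4, 6, 8, 10, 12],
# the mean is 8.0 and the (population) std is ~2.83, so define_pad_length
# returns int(8.0 + 2.83) = 10.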

def pad(index_list, pad_index, max_pad_length=None):
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    # left-pad short documents with pad_index; truncate long ones to pad_length
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
    return index_list
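
# Hedged usage sketch for pad() on toy data (not from this module):
#
#   pad([[5, 6, 7], [8]], pad_index=0, max_pad_length=2)
#   # -> [[5, 6], [0, 8]]: the long document is truncated on the right,
#   # the short one is left-padded with pad_index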

class Index:
    def __init__(self, devel_raw, devel_target, test_raw, lang):
        self.lang = lang
        self.devel_raw = devel_raw
        self.devel_target = devel_target
        self.test_raw = test_raw

    def index(self, pretrained_vocabulary, analyzer, vocabulary):
        self.word2index = dict(vocabulary)
        known_words = set(self.word2index.keys())
        if pretrained_vocabulary is not None:
            known_words.update(pretrained_vocabulary)

        self.word2index['UNKTOKEN'] = len(self.word2index)
        self.word2index['PADTOKEN'] = len(self.word2index)
        self.unk_index = self.word2index['UNKTOKEN']
        self.pad_index = self.word2index['PADTOKEN']

        # index documents and keep track of test terms outside the development
        # vocabulary that are in MUSE (if available)
        self.out_of_vocabulary = dict()
        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)

        self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)

        print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')

    def train_val_split(self, val_prop, max_val, seed):
        devel = self.devel_index
        target = self.devel_target
        devel_raw = self.devel_raw

        # cap the validation size at max_val documents
        val_size = int(min(len(devel) * val_prop, max_val))

        self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
            train_test_split(
                devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
            )

from src.util.embeddings_manager import supervised_embeddings_tfidf
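
# Hedged end-to-end sketch (illustrative; the corpus, targets, and the
# CountVectorizer-based analyzer are hypothetical assumptions):
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   vec = CountVectorizer().fit(['the cat sat', 'the dog barked'])
#   ix = Index(devel_raw=['the cat sat'], devel_target=[0], test_raw=['the dog ran'], lang='en')
#   ix.index(pretrained_vocabulary=None, analyzer=vec.build_analyzer(), vocabulary=vec.vocabulary_)
#   padded_test = pad(ix.test_index, pad_index=ix.pad_index,
#                     max_pad_length=define_pad_length(ix.devel_index))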