fixed common after problematic merge
This commit is contained in:
parent 1ac850630b
commit a1c4247e17
@@ -1,103 +1,8 @@
import warnings

import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from src.util.embeddings_manager import supervised_embeddings_tfidf

warnings.filterwarnings("ignore", category=DeprecationWarning)

def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    """
    Indexes (i.e., replaces word strings with numerical indexes) a list of string documents.

    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] of words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be
        retained because they are contained in a pre-trained embedding set that is known in advance)
    :param analyzer: the preprocessor in charge of transforming a document string into a sequence of word strings
    :param unk_index: the index of the 'unknown token', i.e., a symbol that stands in for all words that cannot
        be retained
    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes, covering all those words
        that are not in the original vocab but are among the known_words
    :return: a list of lists of indexes (one inner list of word indexes per document)
    """
    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    # pbar = tqdm(data, desc=f'indexing documents')
    # for text in pbar:
    for text in data:
        words = analyzer(text)
        index = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            else:
                if word in known_words:
                    # known but out-of-vocabulary: assign a fresh index past the end of the vocab
                    if word not in out_of_vocabulary:
                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:
                    idx = unk_index
                    unk_count += 1
            index.append(idx)
        indexes.append(index)
        knw_count += len(index)
        # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
        #                      f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
    return indexes
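
# A minimal usage sketch of index() with a toy vocabulary and a whitespace
# analyzer (all names and values below are made up for illustration):
#
#   vocab = {'the': 0, 'cat': 1, 'UNKTOKEN': 2, 'PADTOKEN': 3}
#   oov = {}
#   index(['the cat sat', 'the dog'], vocab, known_words={'sat'}, analyzer=str.split,
#         unk_index=vocab['UNKTOKEN'], out_of_vocabulary=oov)
#   # -> [[0, 1, 4], [0, 2]]
#   # 'sat' is not in vocab but is known, so it receives the fresh id 4 (len(vocab) + 0)
#   # and is recorded in oov; 'dog' is neither in vocab nor known, so it maps to UNKTOKEN.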


def define_pad_length(index_list):
    # Heuristic pad length: mean document length plus one standard deviation.
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))


def pad(index_list, pad_index, max_pad_length=None):
    # Pad (or truncate) every document to a common length; padding is prepended.
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
    return index_list
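
# A sketch combining define_pad_length() and pad() (toy values for illustration):
#
#   docs = [[5, 6], [5, 6, 7, 8]]
#   define_pad_length(docs)   # lengths are [2, 4]: int(mean 3.0 + std 1.0) = 4
#   pad(docs, pad_index=0, max_pad_length=4)
#   # -> [[0, 0, 5, 6], [5, 6, 7, 8]]
#   # padding is prepended; documents longer than the pad length are truncated.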


class Index:
    def __init__(self, devel_raw, devel_target, test_raw, lang):
        self.lang = lang
        self.devel_raw = devel_raw
        self.devel_target = devel_target
        self.test_raw = test_raw

    def index(self, pretrained_vocabulary, analyzer, vocabulary):
        self.word2index = dict(vocabulary)
        known_words = set(self.word2index.keys())
        if pretrained_vocabulary is not None:
            known_words.update(pretrained_vocabulary)

        self.word2index['UNKTOKEN'] = len(self.word2index)
        self.word2index['PADTOKEN'] = len(self.word2index)
        self.unk_index = self.word2index['UNKTOKEN']
        self.pad_index = self.word2index['PADTOKEN']

        # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
        self.out_of_vocabulary = dict()
        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)

        self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)

        print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')

    def train_val_split(self, val_prop, max_val, seed):
        devel = self.devel_index
        target = self.devel_target
        devel_raw = self.devel_raw

        val_size = int(min(len(devel) * val_prop, max_val))

        self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
            train_test_split(
                devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
            )
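
# An end-to-end sketch of the Index class (hypothetical data; in a typical setup the
# analyzer and vocabulary would come from a fitted TfidfVectorizer, via build_analyzer()
# and vocabulary_, and pretrained_vocabulary from a MUSE word list):
#
#   idx = Index(devel_raw, devel_target, test_raw, lang='en')
#   idx.index(pretrained_vocabulary=muse_words, analyzer=analyzer, vocabulary=vectorizer.vocabulary_)
#   idx.train_val_split(val_prop=0.2, max_val=2000, seed=42)
#   idx.train_index = pad(idx.train_index, pad_index=idx.pad_index,
#                         max_pad_length=define_pad_length(idx.train_index))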