From a1c4247e17e9831ac1c864cede726796e1598eb5 Mon Sep 17 00:00:00 2001
From: andrea
Date: Fri, 5 Feb 2021 11:22:30 +0100
Subject: [PATCH] fixed common after problematic merge

---
 src/util/common.py | 101 ++-------------------------------------------
 1 file changed, 3 insertions(+), 98 deletions(-)

diff --git a/src/util/common.py b/src/util/common.py
index 25f7b5f..9f44273 100644
--- a/src/util/common.py
+++ b/src/util/common.py
@@ -1,103 +1,8 @@
 import numpy as np
 import torch
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-
-
-def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
-    """
-    Index (i.e., replaces word strings with numerical indexes) a list of string documents
-    :param data: list of string documents
-    :param vocab: a fixed mapping [str]->[int] of words to indexes
-    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
-        because they are anyway contained in a pre-trained embedding set that we know in advance)
-    :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
-    :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
-    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
-        are not in the original vocab but that are in the known_words
-    :return:
-    """
-    indexes=[]
-    vocabsize = len(vocab)
-    unk_count = 0
-    knw_count = 0
-    out_count = 0
-    # pbar = tqdm(data, desc=f'indexing documents')
-    # for text in pbar:
-    for text in data:
-        words = analyzer(text)
-        index = []
-        for word in words:
-            if word in vocab:
-                idx = vocab[word]
-            else:
-                if word in known_words:
-                    if word not in out_of_vocabulary:
-                        out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary)
-                    idx = out_of_vocabulary[word]
-                    out_count += 1
-                else:
-                    idx = unk_index
-                    unk_count += 1
-            index.append(idx)
-        indexes.append(index)
-        knw_count += len(index)
-        # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
-        #                      f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
-    return indexes
-
-
-def define_pad_length(index_list):
-    lengths = [len(index) for index in index_list]
-    return int(np.mean(lengths)+np.std(lengths))
-
-
-def pad(index_list, pad_index, max_pad_length=None):
-    pad_length = np.max([len(index) for index in index_list])
-    if max_pad_length is not None:
-        pad_length = min(pad_length, max_pad_length)
-    for i,indexes in enumerate(index_list):
-        index_list[i] = [pad_index]*(pad_length-len(indexes)) + indexes[:pad_length]
-    return index_list
-
-
-class Index:
-    def __init__(self, devel_raw, devel_target, test_raw, lang):
-        self.lang = lang
-        self.devel_raw = devel_raw
-        self.devel_target = devel_target
-        self.test_raw = test_raw
-
-    def index(self, pretrained_vocabulary, analyzer, vocabulary):
-        self.word2index = dict(vocabulary)
-        known_words = set(self.word2index.keys())
-        if pretrained_vocabulary is not None:
-            known_words.update(pretrained_vocabulary)
-
-        self.word2index['UNKTOKEN'] = len(self.word2index)
-        self.word2index['PADTOKEN'] = len(self.word2index)
-        self.unk_index = self.word2index['UNKTOKEN']
-        self.pad_index = self.word2index['PADTOKEN']
-
-        # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
-        self.out_of_vocabulary = dict()
-        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
-        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
-
-        self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
-
-        print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
-
-    def train_val_split(self, val_prop, max_val, seed):
-        devel = self.devel_index
-        target = self.devel_target
-        devel_raw = self.devel_raw
-
-        val_size = int(min(len(devel) * val_prop, max_val))
-
-        self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
-            train_test_split(
-                devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
-            )
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import normalize
 
 
 from src.util.embeddings_manager import supervised_embeddings_tfidf