fixed common after problematic merge
This commit is contained in:
parent 1ac850630b
commit a1c4247e17
@@ -1,103 +1,8 @@
import warnings

import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from src.util.embeddings_manager import supervised_embeddings_tfidf

warnings.filterwarnings("ignore", category=DeprecationWarning)

def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    """
    Indexes (i.e., replaces word strings with numerical indexes) a list of string documents.

    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] of words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be
        retained because they are contained in a pre-trained embedding set that is known in advance)
    :param analyzer: the preprocessor in charge of transforming a document string into a sequence of word strings
    :param unk_index: the index of the 'unknown token', i.e., a symbol that stands in for all words that cannot
        be retained
    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes, covering all those words
        that are not in the original vocab but are among the known_words
    :return: a list of lists of indexes (one inner list of word indexes per document)
    """
    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    # pbar = tqdm(data, desc=f'indexing documents')
    # for text in pbar:
    for text in data:
        words = analyzer(text)
        index = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            else:
                if word in known_words:
                    # known but out-of-vocabulary: assign a fresh index past the end of the vocab
                    if word not in out_of_vocabulary:
                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:
                    idx = unk_index
                    unk_count += 1
            index.append(idx)
        indexes.append(index)
        knw_count += len(index)
        # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
        #                      f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
    return indexes
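
# A minimal usage sketch of index() with a toy vocabulary and a whitespace
# analyzer (all names and values below are made up for illustration):
#
#   vocab = {'the': 0, 'cat': 1, 'UNKTOKEN': 2, 'PADTOKEN': 3}
#   oov = {}
#   index(['the cat sat', 'the dog'], vocab, known_words={'sat'}, analyzer=str.split,
#         unk_index=vocab['UNKTOKEN'], out_of_vocabulary=oov)
#   # -> [[0, 1, 4], [0, 2]]
#   # 'sat' is not in vocab but is known, so it receives the fresh id 4 (len(vocab) + 0)
#   # and is recorded in oov; 'dog' is neither in vocab nor known, so it maps to UNKTOKEN.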


def define_pad_length(index_list):
    # Heuristic pad length: mean document length plus one standard deviation.
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))


def pad(index_list, pad_index, max_pad_length=None):
    # Pad (or truncate) every document to a common length; padding is prepended.
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
    return index_list
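
# A sketch combining define_pad_length() and pad() (toy values for illustration):
#
#   docs = [[5, 6], [5, 6, 7, 8]]
#   define_pad_length(docs)   # lengths are [2, 4]: int(mean 3.0 + std 1.0) = 4
#   pad(docs, pad_index=0, max_pad_length=4)
#   # -> [[0, 0, 5, 6], [5, 6, 7, 8]]
#   # padding is prepended; documents longer than the pad length are truncated.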


class Index:
    def __init__(self, devel_raw, devel_target, test_raw, lang):
        self.lang = lang
        self.devel_raw = devel_raw
        self.devel_target = devel_target
        self.test_raw = test_raw

    def index(self, pretrained_vocabulary, analyzer, vocabulary):
        self.word2index = dict(vocabulary)
        known_words = set(self.word2index.keys())
        if pretrained_vocabulary is not None:
            known_words.update(pretrained_vocabulary)

        self.word2index['UNKTOKEN'] = len(self.word2index)
        self.word2index['PADTOKEN'] = len(self.word2index)
        self.unk_index = self.word2index['UNKTOKEN']
        self.pad_index = self.word2index['PADTOKEN']

        # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
        self.out_of_vocabulary = dict()
        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)

        self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)

        print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')

    def train_val_split(self, val_prop, max_val, seed):
        devel = self.devel_index
        target = self.devel_target
        devel_raw = self.devel_raw

        val_size = int(min(len(devel) * val_prop, max_val))

        self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
            train_test_split(
                devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
            )
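
# An end-to-end sketch of the Index class (hypothetical data; in a typical setup the
# analyzer and vocabulary would come from a fitted TfidfVectorizer, via build_analyzer()
# and vocabulary_, and pretrained_vocabulary from a MUSE word list):
#
#   idx = Index(devel_raw, devel_target, test_raw, lang='en')
#   idx.index(pretrained_vocabulary=muse_words, analyzer=analyzer, vocabulary=vectorizer.vocabulary_)
#   idx.train_val_split(val_prop=0.2, max_val=2000, seed=42)
#   idx.train_index = pad(idx.train_index, pad_index=idx.pad_index,
#                         max_pad_length=define_pad_length(idx.train_index))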