fixed common after a problematic merge
This commit is contained in:
parent
1ac850630b
commit
a1c4247e17
@@ -1,103 +1,8 @@
import warnings

import numpy as np
import torch

warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
    """
    Indexes (i.e., replaces word strings with numerical indexes) a list of string documents.

    :param data: list of string documents
    :param vocab: a fixed mapping [str]->[int] of words to indexes
    :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be
        retained because they are contained in a pre-trained embedding set that we know in advance)
    :param analyzer: the preprocessor in charge of transforming a document string into a chain of string words
    :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
    :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those
        words that are not in the original vocab but that are in the known_words
    :return: the list of documents, each converted into a list of word indexes
    """
    indexes = []
    vocabsize = len(vocab)
    unk_count = 0
    knw_count = 0
    out_count = 0
    # pbar = tqdm(data, desc=f'indexing documents')
    # for text in pbar:
    for text in data:
        words = analyzer(text)
        index = []
        for word in words:
            if word in vocab:
                idx = vocab[word]
            else:
                if word in known_words:
                    # first occurrence of a known out-of-vocabulary word: assign it a fresh index
                    if word not in out_of_vocabulary:
                        out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
                    idx = out_of_vocabulary[word]
                    out_count += 1
                else:
                    idx = unk_index
                    unk_count += 1
            index.append(idx)
        indexes.append(index)
        knw_count += len(index)
        # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
        #                      f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
    return indexes
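
# Hedged usage sketch for index(); the toy vocabulary, analyzer, and document
# below are illustrative assumptions, not part of this module:
#
#   vocab = {'the': 0, 'cat': 1, 'UNKTOKEN': 2, 'PADTOKEN': 3}
#   oov = {}
#   indexed = index(['the cat saw the dog'], vocab, known_words={'dog'},
#                   analyzer=str.split, unk_index=2, out_of_vocabulary=oov)
#   # -> [[0, 1, 2, 0, 4]]: 'saw' falls back to unk_index, while 'dog' (known,
#   # e.g., via pretrained embeddings) gets the fresh index 4, so oov == {'dog': 4}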

def define_pad_length(index_list):
    # heuristic: pad to the mean document length plus one standard deviation,
    # so that most documents fit without padding everything to the maximum
    lengths = [len(index) for index in index_list]
    return int(np.mean(lengths) + np.std(lengths))
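
# Worked example (hypothetical lengths): for documents of lengths [4, 6, 8, 10, 12],
# the mean is 8.0 and the (population) std is ~2.83, so define_pad_length
# returns int(8.0 + 2.83) = 10.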

def pad(index_list, pad_index, max_pad_length=None):
    pad_length = np.max([len(index) for index in index_list])
    if max_pad_length is not None:
        pad_length = min(pad_length, max_pad_length)
    # left-pad short documents with pad_index; truncate long ones to pad_length
    for i, indexes in enumerate(index_list):
        index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
    return index_list
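
# Hedged usage sketch for pad() on toy data (not from this module):
#
#   pad([[5, 6, 7], [8]], pad_index=0, max_pad_length=2)
#   # -> [[5, 6], [0, 8]]: the long document is truncated on the right,
#   # the short one is left-padded with pad_index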

class Index:
    def __init__(self, devel_raw, devel_target, test_raw, lang):
        self.lang = lang
        self.devel_raw = devel_raw
        self.devel_target = devel_target
        self.test_raw = test_raw

    def index(self, pretrained_vocabulary, analyzer, vocabulary):
        self.word2index = dict(vocabulary)
        known_words = set(self.word2index.keys())
        if pretrained_vocabulary is not None:
            known_words.update(pretrained_vocabulary)

        self.word2index['UNKTOKEN'] = len(self.word2index)
        self.word2index['PADTOKEN'] = len(self.word2index)
        self.unk_index = self.word2index['UNKTOKEN']
        self.pad_index = self.word2index['PADTOKEN']

        # index documents and keep track of test terms outside the development
        # vocabulary that are in MUSE (if available)
        self.out_of_vocabulary = dict()
        self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)
        self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary)

        self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)

        print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')

    def train_val_split(self, val_prop, max_val, seed):
        devel = self.devel_index
        target = self.devel_target
        devel_raw = self.devel_raw

        # cap the validation size at max_val documents
        val_size = int(min(len(devel) * val_prop, max_val))

        self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
            train_test_split(
                devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True
            )

from src.util.embeddings_manager import supervised_embeddings_tfidf
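
# Hedged end-to-end sketch (illustrative; the corpus, targets, and the
# CountVectorizer-based analyzer are hypothetical assumptions):
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   vec = CountVectorizer().fit(['the cat sat', 'the dog barked'])
#   ix = Index(devel_raw=['the cat sat'], devel_target=[0], test_raw=['the dog ran'], lang='en')
#   ix.index(pretrained_vocabulary=None, analyzer=vec.build_analyzer(), vocabulary=vec.vocabulary_)
#   padded_test = pad(ix.test_index, pad_index=ix.pad_index,
#                     max_pad_length=define_pad_length(ix.devel_index))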