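"""Wrappers around pretrained word embeddings: GloVe (via torchtext) and word2vec (via gensim).

Everything below the imports is currently commented out. The disabled classes follow the
same pattern: given a list of words, build an embedding matrix whose rows are taken from
the pretrained vectors, leaving zero rows for out-of-vocabulary words.
"""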
from abc import ABC, abstractmethod

import torch
import torchtext
# import gensim
# import os
import numpy as np

# class KeyedVectors:
#
#     def __init__(self, word2index, weights):
#         assert len(word2index) == weights.shape[0], 'vocabulary size and number of weight rows do not match'
#         index2word = {i: w for w, i in word2index.items()}
#         assert len([i for i in range(len(index2word)) if i not in index2word]) == 0, 'gaps in indexing not allowed'
#         self.word2index = word2index
#         self.index2word = index2word
#         self.weights = weights
#
#     def extract(self, words):
#         dim = self.weights.shape[1]
#         v_size = len(words)
#
#         source_idx, target_idx = [], []
#         for i, word in enumerate(words):
#             if word not in self.word2index: continue
#             j = self.word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#
#         extraction = np.zeros((v_size, dim))
#         extraction[np.asarray(source_idx, dtype=int)] = self.weights[np.asarray(target_idx, dtype=int)]
#
#         return extraction

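# The commented-out extract() above maps each requested word to its row in the weight
# matrix and leaves an all-zero row for out-of-vocabulary words. A minimal runnable
# sketch of that pattern on toy data (the names toy_word2index/toy_weights and the
# function itself are illustrative, not part of the original module):
def _keyed_extraction_demo():
    toy_word2index = {'the': 0, 'cat': 1, 'sat': 2}
    toy_weights = np.arange(9, dtype=float).reshape(3, 3)  # 3 words x 3 dimensions
    words = ['cat', 'unseen-token', 'the']                  # 'unseen-token' is out of vocabulary

    source_idx, target_idx = [], []
    for i, word in enumerate(words):
        if word not in toy_word2index:
            continue                                         # OOV positions keep a zero row
        source_idx.append(i)
        target_idx.append(toy_word2index[word])

    extraction = np.zeros((len(words), toy_weights.shape[1]))
    extraction[np.asarray(source_idx, dtype=int)] = toy_weights[np.asarray(target_idx, dtype=int)]
    return extraction                                        # row 1 is all zeros
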
# class PretrainedEmbeddings(ABC):
#
#     def __init__(self):
#         super().__init__()
#
#     @abstractmethod
#     def vocabulary(self): pass
#
#     @abstractmethod
#     def dim(self): pass
#
#     @classmethod
#     def reindex(cls, words, word2index):
#         source_idx, target_idx = [], []
#         for i, word in enumerate(words):
#             if word not in word2index: continue
#             j = word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#         source_idx = np.asarray(source_idx, dtype=int)
#         target_idx = np.asarray(target_idx, dtype=int)
#         return source_idx, target_idx

# class GloVe(PretrainedEmbeddings):
#
#     def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
#         super().__init__()
#         print('Loading GloVe pretrained vectors from torchtext')
#         self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.embed.stoi.keys())
#
#     def dim(self):
#         return self.embed.dim
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
#         extraction = torch.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         return extraction

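# A rough usage sketch for the GloVe wrapper above, written against torchtext.vocab.GloVe
# directly so it does not depend on the commented-out class. It assumes the '6B' vectors
# at 50 dimensions (torchtext downloads and caches them on first use); the function and
# variable names are illustrative only.
def _glove_matrix_demo(words=('the', 'cat', 'unseen-token')):
    glove = torchtext.vocab.GloVe(name='6B', dim=50, cache='./vectors_cache', max_vectors=100000)
    matrix = torch.zeros((len(words), glove.dim))
    for i, word in enumerate(words):
        if word in glove.stoi:                               # OOV words keep a zero vector
            matrix[i] = glove.vectors[glove.stoi[word]]
    # the extracted matrix can then seed a frozen embedding layer
    return torch.nn.Embedding.from_pretrained(matrix, freeze=True)
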
# class Word2Vec(PretrainedEmbeddings):
#
#     def __init__(self, path, limit=None):
#         super().__init__()
#         print(f'Loading word2vec pretrained vectors from {path}')
#         assert os.path.exists(path), f'pre-trained keyed vectors not found in {path}'
#         self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
#         self.word2index = {w: i for i, w in enumerate(self.embed.index2word)}  # index2word is gensim<4; gensim>=4 exposes index_to_key
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.word2index.keys())
#
#     def dim(self):
#         return self.embed.vector_size
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
#         extraction = np.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         extraction = torch.from_numpy(extraction).float()
#         return extraction

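# A rough usage sketch for the Word2Vec wrapper above, calling gensim directly so it does
# not depend on the commented-out class. The path below is a placeholder, gensim is
# assumed to be installed, and the vocabulary lookup uses kv.key_to_index (gensim >= 4;
# older versions expose kv.vocab / kv.index2word instead).
def _word2vec_matrix_demo(path='GoogleNews-vectors-negative300.bin', words=('king', 'queen')):
    from gensim.models import KeyedVectors as GensimKeyedVectors  # local import: the module-level one is commented out
    kv = GensimKeyedVectors.load_word2vec_format(path, binary=True, limit=500000)
    matrix = np.zeros((len(words), kv.vector_size))
    for i, word in enumerate(words):
        if word in kv.key_to_index:                          # OOV words keep a zero vector
            matrix[i] = kv[word]
    return torch.from_numpy(matrix).float()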