# gFun/src/embeddings/pretrained.py

from abc import ABC, abstractmethod
import torch, torchtext
# import gensim
# import os
import numpy as np


# class KeyedVectors:
#
#     def __init__(self, word2index, weights):
#         assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
#         index2word = {i:w for w,i in word2index.items()}
#         assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
#         self.word2index = word2index
#         self.index2word = index2word
#         self.weights = weights
#
#     def extract(self, words):
#         dim = self.weights.shape[1]
#         v_size = len(words)
#
#         source_idx, target_idx = [], []
#         for i,word in enumerate(words):
#             if word not in self.word2index: continue
#             j = self.word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#
#         extraction = np.zeros((v_size, dim))
#         extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
#
#         return extraction


# class PretrainedEmbeddings(ABC):
#
#     def __init__(self):
#         super().__init__()
#
#     @abstractmethod
#     def vocabulary(self): pass
#
#     @abstractmethod
#     def dim(self): pass
#
#     @classmethod
#     def reindex(cls, words, word2index):
#         source_idx, target_idx = [], []
#         for i, word in enumerate(words):
#             if word not in word2index: continue
#             j = word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#         source_idx = np.asarray(source_idx)
#         target_idx = np.asarray(target_idx)
#         return source_idx, target_idx


# class GloVe(PretrainedEmbeddings):
#
#     def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
#         super().__init__()
#         print(f'Loading GloVe pretrained vectors from torchtext')
#         self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.embed.stoi.keys())
#
#     def dim(self):
#         return self.embed.dim
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
#         extraction = torch.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         return extraction


# class Word2Vec(PretrainedEmbeddings):
#
#     def __init__(self, path, limit=None):
#         super().__init__()
#         print(f'Loading word2vec pretrained vectors from {path}')
#         assert os.path.exists(path), f'pre-trained keyed vectors not found in {path}'
#         self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
#         self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.word2index.keys())
#
#     def dim(self):
#         return self.embed.vector_size
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
#         extraction = np.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         extraction = torch.from_numpy(extraction).float()
#         return extraction