# gFun/src/embeddings/pretrained.py
#
# 103 lines
# 3.3 KiB
# Python

from abc import ABC, abstractmethod
import torch, torchtext
# import gensim
# import os
import numpy as np
# NOTE: disabled code -- the KeyedVectors class below is commented out and is
# not defined when this module is imported.
# class KeyedVectors:
#
#     def __init__(self, word2index, weights):
#         assert len(word2index)==weights.shape[0], 'wrong number of dimensions'
#         index2word = {i:w for w,i in word2index.items()}
#         assert len([i for i in range(len(index2word)) if i not in index2word])==0, 'gaps in indexing not allowed'
#         self.word2index = word2index
#         self.index2word = index2word
#         self.weights = weights
#
#     def extract(self, words):
#         dim = self.weights.shape[1]
#         v_size = len(words)
#
#         source_idx, target_idx = [], []
#         for i,word in enumerate(words):
#             if word not in self.word2index: continue
#             j = self.word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#
#         extraction = np.zeros((v_size, dim))
#         extraction[np.asarray(source_idx)] = self.weights[np.asarray(target_idx)]
#
#         return extraction
# NOTE: disabled code -- the PretrainedEmbeddings base class below is commented
# out and is not defined when this module is imported.
# class PretrainedEmbeddings(ABC):
#
#     def __init__(self):
#         super().__init__()
#
#     @abstractmethod
#     def vocabulary(self): pass
#
#     @abstractmethod
#     def dim(self): pass
#
#     @classmethod
#     def reindex(cls, words, word2index):
#         source_idx, target_idx = [], []
#         for i, word in enumerate(words):
#             if word not in word2index: continue
#             j = word2index[word]
#             source_idx.append(i)
#             target_idx.append(j)
#         source_idx = np.asarray(source_idx)
#         target_idx = np.asarray(target_idx)
#         return source_idx, target_idx
# NOTE: disabled code -- the GloVe class below is commented out and is not
# defined when this module is imported.
# class GloVe(PretrainedEmbeddings):
#
#     def __init__(self, setname='840B', path='./vectors_cache', max_vectors=None):
#         super().__init__()
#         print(f'Loading GloVe pretrained vectors from torchtext')
#         self.embed = torchtext.vocab.GloVe(setname, cache=path, max_vectors=max_vectors)
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.embed.stoi.keys())
#
#     def dim(self):
#         return self.embed.dim
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.embed.stoi)
#         extraction = torch.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         return extraction
# NOTE: disabled code -- the Word2Vec class below is commented out and is not
# defined when this module is imported.
# NOTE(review): if this is ever re-enabled, it needs the `gensim` and `os`
# imports, which are currently commented out at the top of the file. Also, the
# assert in __init__ uses `print(...)` as its message expression; print returns
# None, so the AssertionError itself would carry no message.
# class Word2Vec(PretrainedEmbeddings):
#
#     def __init__(self, path, limit=None):
#         super().__init__()
#         print(f'Loading word2vec pretrained vectors from {path}')
#         assert os.path.exists(path), print(f'pre-trained keyed vectors not found in {path}')
#         self.embed = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
#         self.word2index={w:i for i,w in enumerate(self.embed.index2word)}
#         print('Done')
#
#     def vocabulary(self):
#         return set(self.word2index.keys())
#
#     def dim(self):
#         return self.embed.vector_size
#
#     def extract(self, words):
#         source_idx, target_idx = PretrainedEmbeddings.reindex(words, self.word2index)
#         extraction = np.zeros((len(words), self.dim()))
#         extraction[source_idx] = self.embed.vectors[target_idx]
#         extraction = torch.from_numpy(extraction).float()
#         return extraction