merged with refactor
parent a5912a22a9
commit ca179aca23
@@ -1,222 +0,0 @@
import numpy as np
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

N_WORKERS = 8


class RecurrentDataset(Dataset):
    def __init__(self, lX, ly, lPad_index):
        """
        :param lX: dict {lang_id : np.ndarray} of index-encoded documents
        :param ly: dict {lang_id : np.ndarray} of target label vectors
        :param lPad_index: dict {lang_id : pad index} used when padding batches
        """
        self.lX = []
        self.ly = []
        self.lOffset = {}
        self.lPad_index = lPad_index

        for lang, data in lX.items():
            offset = [len(self.lX)]
            self.lX.extend(data)
            offset.append(len(self.lX))
            self.lOffset[lang] = offset

        for lang, target in ly.items():
            self.ly.extend(target)

    def __len__(self):
        return len(self.lX)

    def __getitem__(self, index):
        X = self.lX[index]
        y = self.ly[index]
        return X, y, index, self._get_lang(index)

    def _get_lang(self, index):
        for lang, l_range in self.lOffset.items():
            if index in range(l_range[0], l_range[1]):
                return lang

    def collate_fn(self, data):
        """
        Pads the batch and checks the language consistency of its items; groups the items sampled from the
        Dataset class into a dict {lang : lang_batch}.
        :param data: list of (X, y, index, lang) tuples as returned by __getitem__
        :return: a pair of dicts (lX_batch, ly_batch) indexed by language
        """
        lX_batch = {}
        ly_batch = {}
        current_lang = data[0][-1]
        for d in data:
            if d[-1] == current_lang:
                if current_lang not in lX_batch.keys():
                    lX_batch[current_lang] = []
                    ly_batch[current_lang] = []
                lX_batch[current_lang].append(d[0])
                ly_batch[current_lang].append(d[1])
            else:
                current_lang = d[-1]
                lX_batch[current_lang] = []
                ly_batch[current_lang] = []
                lX_batch[current_lang].append(d[0])
                ly_batch[current_lang].append(d[1])

        for lang in lX_batch.keys():
            lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang],
                                      max_pad_length=self.define_pad_length(lX_batch[lang]))
            lX_batch[lang] = torch.LongTensor(lX_batch[lang])
            ly_batch[lang] = torch.FloatTensor(ly_batch[lang])

        return lX_batch, ly_batch

    @staticmethod
    def define_pad_length(index_list):
        lengths = [len(index) for index in index_list]
        return int(np.mean(lengths) + np.std(lengths))

    @staticmethod
    def pad(index_list, pad_index, max_pad_length=None):
        pad_length = np.max([len(index) for index in index_list])
        if max_pad_length is not None:
            pad_length = min(pad_length, max_pad_length)
        for i, indexes in enumerate(index_list):
            index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
        return index_list
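
# Illustrative sketch (not part of the original module): how collate_fn groups a toy batch by
# language and pads it. The token ids, labels and pad indices below are made-up values.
def _example_collate_usage():
    toy_lX = {'en': [[5, 6, 7], [8, 9]], 'it': [[4, 3, 2, 1]]}      # token-index lists per language
    toy_ly = {'en': [[1.0, 0.0], [0.0, 1.0]], 'it': [[1.0, 1.0]]}   # one binary label vector per document
    toy_pad = {'en': 0, 'it': 0}                                    # language-specific pad index
    ds = RecurrentDataset(toy_lX, toy_ly, lPad_index=toy_pad)
    batch = [ds[i] for i in range(len(ds))]                         # items are (X, y, index, lang)
    lX_batch, ly_batch = ds.collate_fn(batch)
    # lX_batch maps each language to a LongTensor whose rows are left-padded with the pad index
    # up to min(max length, mean+std of the batch lengths); ly_batch maps to FloatTensors.
    return {lang: X.shape for lang, X in lX_batch.items()}, ly_batch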


class RecurrentDataModule(pl.LightningDataModule):
    """
    Pytorch Lightning Datamodule to be deployed with RecurrentGen.
    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
    """
    def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1):
        """
        Init RecurrentDataModule.
        :param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
        indexed by language code.
        :param batchsize: int, number of samples per batch.
        :param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading).
        """
        self.multilingualIndex = multilingualIndex
        self.batchsize = batchsize
        self.n_jobs = n_jobs
        super().__init__()

    def prepare_data(self, *args, **kwargs):
        pass

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            l_train_index, l_train_target = self.multilingualIndex.l_train()
            # Debug settings: reducing number of samples
            l_train_index = {l: train[:5] for l, train in l_train_index.items()}
            l_train_target = {l: target[:5] for l, target in l_train_target.items()}

            self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
                                                     lPad_index=self.multilingualIndex.l_pad())

            l_val_index, l_val_target = self.multilingualIndex.l_val()
            # Debug settings: reducing number of samples
            l_val_index = {l: val[:5] for l, val in l_val_index.items()}
            l_val_target = {l: target[:5] for l, target in l_val_target.items()}

            self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
                                                lPad_index=self.multilingualIndex.l_pad())
        if stage == 'test' or stage is None:
            l_test_index, l_test_target = self.multilingualIndex.l_test()
            # Debug settings: reducing number of samples
            l_test_index = {l: test[:5] for l, test in l_test_index.items()}
            l_test_target = {l: target[:5] for l, target in l_test_target.items()}

            self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
                                                 lPad_index=self.multilingualIndex.l_pad())

    def train_dataloader(self):
        return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
                          collate_fn=self.training_dataset.collate_fn)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
                          collate_fn=self.val_dataset.collate_fn)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
                          collate_fn=self.test_dataset.collate_fn)


def tokenize(l_raw, max_len):
    """
    Runs Bert tokenization on a dict {lang: list of samples}.
    :param l_raw: dict {lang: list of raw text samples}
    :param max_len: int, truncation/padding length in tokens
    :return: dict {lang: list of input_ids}
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    l_tokenized = {}
    for lang in l_raw.keys():
        output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length')
        l_tokenized[lang] = output_tokenizer['input_ids']
    return l_tokenized
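
# Illustrative sketch (not part of the original module): what tokenize() returns for a tiny
# made-up input. Calling it requires the 'bert-base-multilingual-cased' weights to be available.
def _example_tokenize_usage():
    l_raw = {'en': ['a short document'], 'it': ['un breve documento']}
    l_ids = tokenize(l_raw, max_len=8)
    # l_ids['en'] is a list containing one list of 8 token ids (truncated/padded to max_len),
    # i.e., suitable as the lX argument of RecurrentDataset.
    return l_ids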


class BertDataModule(RecurrentDataModule):
    """
    Pytorch Lightning Datamodule to be deployed with BertGen.
    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
    """
    def __init__(self, multilingualIndex, batchsize=64, max_len=512):
        """
        Init BertDataModule.
        :param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
        indexed by language code.
        :param batchsize: int, number of samples per batch.
        :param max_len: int, max number of tokens per document. Absolute cap is 512.
        """
        super().__init__(multilingualIndex, batchsize)
        self.max_len = max_len

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            l_train_raw, l_train_target = self.multilingualIndex.l_train_raw()
            # Debug settings: reducing number of samples
            l_train_raw = {l: train[:5] for l, train in l_train_raw.items()}
            l_train_target = {l: target[:5] for l, target in l_train_target.items()}

            l_train_index = tokenize(l_train_raw, max_len=self.max_len)
            self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
                                                     lPad_index=self.multilingualIndex.l_pad())

            l_val_raw, l_val_target = self.multilingualIndex.l_val_raw()
            # Debug settings: reducing number of samples
            l_val_raw = {l: val[:5] for l, val in l_val_raw.items()}
            l_val_target = {l: target[:5] for l, target in l_val_target.items()}

            l_val_index = tokenize(l_val_raw, max_len=self.max_len)
            self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
                                                lPad_index=self.multilingualIndex.l_pad())

        if stage == 'test' or stage is None:
            l_test_raw, l_test_target = self.multilingualIndex.l_test_raw()
            # Debug settings: reducing number of samples
            l_test_raw = {l: test[:5] for l, test in l_test_raw.items()}
            l_test_target = {l: target[:5] for l, target in l_test_target.items()}

            l_test_index = tokenize(l_test_raw, max_len=self.max_len)
            self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
                                                 lPad_index=self.multilingualIndex.l_pad())

    def train_dataloader(self):
        """
        NB: setting num_workers to a value > 0 will cause "OSError: [Errno 24] Too many open files"
        :return:
        """
        return DataLoader(self.training_dataset, batch_size=self.batchsize)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batchsize)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batchsize)
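
# Illustrative sketch (not part of the original module): how these datamodules are meant to be
# plugged into a pytorch_lightning Trainer. `model` and `multilingualIndex` are placeholders for
# objects defined elsewhere in the project (e.g., a RecurrentGen LightningModule and the
# MultilingualIndex); the trainer settings are arbitrary.
def _example_fit(model, multilingualIndex):
    datamodule = RecurrentDataModule(multilingualIndex, batchsize=64, n_jobs=N_WORKERS)
    trainer = pl.Trainer(max_epochs=1)
    trainer.fit(model, datamodule=datamodule)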
@@ -1,712 +0,0 @@
import itertools
import pickle
import re
from os.path import exists

import numpy as np
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from data.reader.jrcacquis_reader import *
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents


class MultilingualDataset:
    """
    A multilingual dataset is a dictionary of training and test documents indexed by language code.
    Train and test sets are represented as tuples of the type (X, Y, ids), where X is a matrix representation of the
    documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
    labels of each document, and ids is a list of document-identifiers from the original collection.
    """

    def __init__(self):
        self.dataset_name = ""
        self.multiling_dataset = {}

    def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
        self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))

    def save(self, file):
        self.sort_indexes()
        pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL)
        return self

    def __getitem__(self, item):
        if item in self.langs():
            return self.multiling_dataset[item]
        return None

    @classmethod
    def load(cls, file):
        data = pickle.load(open(file, 'rb'))
        data.sort_indexes()
        return data

    @classmethod
    def load_ids(cls, file):
        data = pickle.load(open(file, 'rb'))
        tr_ids = {lang: tr_ids for (lang, ((_, _, tr_ids), (_, _, _))) in data.multiling_dataset.items()}
        te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()}
        return tr_ids, te_ids

    def sort_indexes(self):
        for (lang, ((Xtr, _, _), (Xte, _, _))) in self.multiling_dataset.items():
            if issparse(Xtr): Xtr.sort_indices()
            if issparse(Xte): Xte.sort_indices()

    def set_view(self, categories=None, languages=None):
        if categories is not None:
            if isinstance(categories, int):
                categories = np.array([categories])
            elif isinstance(categories, list):
                categories = np.array(categories)
            self.categories_view = categories
        if languages is not None:
            self.languages_view = languages

    def training(self, mask_numbers=False, target_as_csr=False):
        return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)

    def test(self, mask_numbers=False, target_as_csr=False):
        return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)

    def lXtr(self, mask_numbers=False):
        proc = lambda x: _mask_numbers(x) if mask_numbers else x
        return {lang: proc(Xtr) for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}

    def lXte(self, mask_numbers=False):
        proc = lambda x: _mask_numbers(x) if mask_numbers else x
        return {lang: proc(Xte) for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}

    def lYtr(self, as_csr=False):
        lY = {lang: self.cat_view(Ytr) for (lang, ((_, Ytr, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
        if as_csr:
            lY = {l: csr_matrix(Y) for l, Y in lY.items()}
        return lY

    def lYte(self, as_csr=False):
        lY = {lang: self.cat_view(Yte) for (lang, (_, (_, Yte, _))) in self.multiling_dataset.items() if lang in self.langs()}
        if as_csr:
            lY = {l: csr_matrix(Y) for l, Y in lY.items()}
        return lY

    def cat_view(self, Y):
        if hasattr(self, 'categories_view'):
            return Y[:, self.categories_view]
        else:
            return Y

    def langs(self):
        if hasattr(self, 'languages_view'):
            langs = self.languages_view
        else:
            langs = sorted(self.multiling_dataset.keys())
        return langs

    def num_categories(self):
        return self.lYtr()[self.langs()[0]].shape[1]

    def show_dimensions(self):
        def shape(X):
            return X.shape if hasattr(X, 'shape') else len(X)
        for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
            if lang not in self.langs(): continue
            print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape))

    def show_category_prevalences(self):
        nC = self.num_categories()
        accum_tr = np.zeros(nC, dtype=int)
        accum_te = np.zeros(nC, dtype=int)
        in_langs = np.zeros(nC, dtype=int)  # count languages with at least one positive example (per category)
        for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
            if lang not in self.langs(): continue
            prev_train = np.sum(self.cat_view(Ytr), axis=0)
            prev_test = np.sum(self.cat_view(Yte), axis=0)
            accum_tr += prev_train
            accum_te += prev_test
            in_langs += (prev_train > 0) * 1
            print(lang + '-train', prev_train)
            print(lang + '-test', prev_test)
        print('all-train', accum_tr)
        print('all-test', accum_te)

        return accum_tr, accum_te, in_langs

    def set_labels(self, labels):
        self.labels = labels
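
# Illustrative sketch (not part of the original class): building, saving and reloading a
# two-language dataset. The matrices, labels, ids and the temporary path below are made up.
def _example_multilingual_dataset(tmp_file='/tmp/toy_dataset.pickle'):
    toy = MultilingualDataset()
    toy.dataset_name = 'toy'
    Xtr, Xte = csr_matrix(np.eye(2)), csr_matrix(np.eye(2))
    Ytr, Yte = np.array([[1, 0], [0, 1]]), np.array([[0, 1], [1, 0]])
    toy.add('en', Xtr, Ytr, Xte, Yte, tr_ids=[0, 1], te_ids=[2, 3])
    toy.add('it', Xtr, Ytr, Xte, Yte, tr_ids=[4, 5], te_ids=[6, 7])
    toy.save(tmp_file)
    reloaded = MultilingualDataset.load(tmp_file)
    # langs() -> ['en', 'it']; lXtr()/lYtr() -> dicts {lang: matrix} restricted to the current view
    return reloaded.langs(), reloaded.lXtr(), reloaded.lYtr()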


def _mask_numbers(data):
    mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b')
    mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b')
    mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b')
    mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b')
    mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b')
    masked = []
    for text in tqdm(data, desc='masking numbers'):
        text = ' ' + text
        text = mask_moredigit.sub(' MoreDigitMask', text)
        text = mask_4digit.sub(' FourDigitMask', text)
        text = mask_3digit.sub(' ThreeDigitMask', text)
        text = mask_2digit.sub(' TwoDigitMask', text)
        text = mask_1digit.sub(' OneDigitMask', text)
        masked.append(text.replace('.', '').replace(',', '').strip())
    return masked
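
# Illustrative sketch (not part of the original module): what the masking produces on a made-up
# sentence (digit runs are replaced by placeholder tokens, then '.' and ',' are dropped).
def _example_mask_numbers():
    return _mask_numbers(['the directive 2006 fixes a quota of 25 tonnes'])
    # -> ['the directive FourDigitMask fixes a quota of TwoDigitMask tonnes']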


# ----------------------------------------------------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------------------------------------------------
def get_active_labels(doclist):
    cat_list = set()
    for d in doclist:
        cat_list.update(d.categories)
    return list(cat_list)


def filter_by_categories(doclist, keep_categories):
    catset = frozenset(keep_categories)
    for d in doclist:
        d.categories = list(set(d.categories).intersection(catset))


def __years_to_str(years):
    if isinstance(years, list):
        if len(years) > 1:
            return str(years[0]) + '-' + str(years[-1])
        return str(years[0])
    return str(years)


# ----------------------------------------------------------------------------------------------------------------------
# Matrix builders
# ----------------------------------------------------------------------------------------------------------------------
def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True):
    """
    Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
    i.e., each language-specific matrix lies in a dedicated feature space.
    :param dataset_name: the name of the dataset (str)
    :param langs: list of languages (str)
    :param training_docs: map {lang: doc-list} where each doc is a tuple (text, categories, id)
    :param test_docs: map {lang: doc-list} where each doc is a tuple (text, categories, id)
    :param label_names: list of names of labels (str)
    :param wiki_docs: doc-list (optional); if specified, all wiki docs are projected into the feature spaces built for the languages
    :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
    :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
    by language the processed wikipedia documents in their respective language-specific feature spaces
    """

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    lW = {}

    multilingual_dataset = MultilingualDataset()
    multilingual_dataset.dataset_name = dataset_name
    multilingual_dataset.set_labels(mlb.classes_)
    for lang in langs:
        print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
              (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang))

        tr_data, tr_labels, IDtr = zip(*training_docs[lang])
        te_data, te_labels, IDte = zip(*test_docs[lang])

        if preprocess:
            tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True,
                                    tokenizer=NLTKStemTokenizer(lang, verbose=True),
                                    stop_words=stopwords.words(NLTK_LANGMAP[lang]))
        else:
            tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)

        Xtr = tfidf.fit_transform(tr_data)
        Xte = tfidf.transform(te_data)
        if wiki_docs:
            lW[lang] = tfidf.transform(wiki_docs[lang])

        Ytr = mlb.transform(tr_labels)
        Yte = mlb.transform(te_labels)

        multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    multilingual_dataset.show_dimensions()
    multilingual_dataset.show_category_prevalences()

    if wiki_docs:
        return multilingual_dataset, lW
    else:
        return multilingual_dataset


# creates a MultilingualDataset whose matrices share a single juxtaposed feature space
def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True):
    """
    Builds the document-by-term weighted matrices for each language. Representations are not independent of each other,
    since all of them lie in the same juxtaposed feature space.
    :param dataset_name: the name of the dataset (str)
    :param langs: list of languages (str)
    :param training_docs: map {lang: doc-list} where each doc is a tuple (text, categories, id)
    :param test_docs: map {lang: doc-list} where each doc is a tuple (text, categories, id)
    :param label_names: list of names of labels (str)
    :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
    :return: a MultilingualDataset
    """

    multiling_dataset = MultilingualDataset()
    multiling_dataset.dataset_name = dataset_name

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    multiling_dataset.set_labels(mlb.classes_)

    tr_data_stack = []
    for lang in langs:
        print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang))
        tr_data, tr_labels, tr_ID = zip(*training_docs[lang])
        te_data, te_labels, te_ID = zip(*test_docs[lang])
        if preprocess:
            tr_data = preprocess_documents(tr_data, lang)
            te_data = preprocess_documents(te_data, lang)
        tr_data_stack.extend(tr_data)
        multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID)

    tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
    tfidf.fit(tr_data_stack)

    for lang in langs:
        print("\nweighting documents for language <%s>" % (lang))
        (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang]
        Xtr = tfidf.transform(tr_data)
        Xte = tfidf.transform(te_data)
        Ytr = mlb.transform(tr_labels)
        Yte = mlb.transform(te_labels)
        multiling_dataset.add(lang, Xtr, Ytr, Xte, Yte, tr_ID, te_ID)

    multiling_dataset.show_dimensions()
    return multiling_dataset
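
# Illustrative sketch (not part of the original module): the expected call shape of the two
# builders. The doc-lists and label names are placeholders produced by the fetch_* readers;
# note that real-sized collections are expected (the vectorizers use min_df=3).
def _example_matrix_builders(training_docs, test_docs, label_names):
    langs = list(training_docs.keys())
    # one tf-idf feature space per language:
    independent = build_independent_matrices('toy', langs, training_docs, test_docs, label_names, preprocess=False)
    # a single shared (juxtaposed) tf-idf feature space fit on all languages at once:
    juxtaposed = build_juxtaposed_matrices('toy', langs, training_docs, test_docs, label_names, preprocess=False)
    return independent, juxtaposed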


# ----------------------------------------------------------------------------------------------------------------------
# Methods to recover the original documents from the MultilingualDataset's ids
# ----------------------------------------------------------------------------------------------------------------------
"""
This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent
article 'Word Translation without Parallel Data'; basically, it takes one of the splits, retrieves the RCV documents
from the doc ids, and then pickles an object (tr_docs, te_docs, label_names) in the outpath
"""
def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath):

    tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
    assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
    langs = list(tr_ids.keys())

    print('fetching the datasets')
    rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
    rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])

    filter_by_categories(rcv1_documents, labels_rcv2)
    filter_by_categories(rcv2_documents, labels_rcv1)

    label_names = get_active_labels(rcv1_documents + rcv2_documents)
    print('Active labels in RCV1/2 {}'.format(len(label_names)))

    print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
    print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))

    all_docs = rcv1_documents + rcv2_documents
    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    for lang in langs:
        analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
                                   stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()

        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]])
        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    dataset.save(outpath)


"""
Same thing, but for JRC-Acquis
"""
def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath):

    tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
    assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
    langs = list(tr_ids.keys())

    print('fetching the datasets')

    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
                                                 cat_filter=cat_list, cat_threshold=1, parallel=None,
                                                 most_frequent=most_common_cat)
    test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
                                   parallel='force')

    def filter_by_id(doclist, ids):
        ids_set = frozenset(itertools.chain.from_iterable(ids.values()))
        return [x for x in doclist if (x.parallel_id + '__' + x.id) in ids_set]

    training_docs = filter_by_id(training_docs, tr_ids)
    test_docs = filter_by_id(test_docs, te_ids)

    print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names)))

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    for lang in langs:
        analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
                                   stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()

        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs if d.lang == lang])
        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs if d.lang == lang])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    dataset.save(outpath)


# ----------------------------------------------------------------------------------------------------------------------
# Dataset Generators
# ----------------------------------------------------------------------------------------------------------------------
def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample

    """
    Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
    "feature-juxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
    In all cases, training documents are strictly non-parallel, and test documents are strictly parallel.
    :param jrc_data_home: path to the raw JRC-Acquis documents (they will be downloaded if not found), and the path
    where all splits will be generated
    :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
    :param langs: the list of languages to consider (as defined in data/languages.py)
    :param train_years: a list of ints containing the years to be considered as training documents
    :param test_years: a list of ints containing the years to be considered as test documents
    :param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all"
    (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the
    leaf concepts in the taxonomy). See inspect_eurovoc in data/reader/jrcacquis_reader.py for more details
    :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all
    :param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
    :param run: a numeric label naming the random split (useful to keep track of different runs)
    :return: None
    """

    name = 'JRCacquis'
    run = '_run' + str(run)
    config_name = 'jrc_nltk_' + __years_to_str(train_years) + \
                  'vs' + __years_to_str(test_years) + \
                  '_' + cat_policy + \
                  ('_top' + str(most_common_cat) if most_common_cat != -1 else '') + \
                  '_noparallel_processed'

    indep_path = join(jrc_data_home, config_name + run + '.pickle')
    upper_path = join(jrc_data_home, config_name + run + '_upper.pickle')
    yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle')
    wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle')
    wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle')

    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
                                                 cat_filter=cat_list, cat_threshold=1, parallel=None,
                                                 most_frequent=most_common_cat)
    test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
                                   parallel='force')

    print('Generating feature-independent dataset...')
    training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs)

    def _group_by_lang(doc_list, langs):
        return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang]
                for lang in langs}

    training_docs = _group_by_lang(training_docs, langs)
    training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs)
    test_docs = _group_by_lang(test_docs, langs)
    if not exists(indep_path):
        wiki_docs = None
        if max_wiki > 0:
            if not exists(wiki_docs_path):
                wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
                wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
                pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
            else:
                wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
                wiki_docs = random_wiki_sample(wiki_docs, max_wiki)

        if wiki_docs:
            lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs)
            pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
        else:
            lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names)

        lang_data.save(indep_path)

    print('Generating upper-bound (English-only) dataset...')
    if not exists(upper_path):
        training_docs_eng_only = {'en': training_docs['en']}
        test_docs_eng_only = {'en': test_docs['en']}
        build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path)

    print('Generating juxtaposed dataset...')
    if not exists(yuxta_path):
        build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path)


def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
                         train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
    """
    Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
    "feature-juxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.

    :param outpath: path where all splits will be dumped
    :param rcv1_data_home: path to the RCV1-v2 dataset (English only)
    :param rcv2_data_home: path to the RCV2 dataset (all languages other than English)
    :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
    :param langs: the list of languages to consider (as defined in data/languages.py)
    :param train_for_lang: maximum number of training documents per language
    :param test_for_lang: maximum number of test documents per language
    :param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
    :param preprocess: whether or not to apply language-specific preprocessing (stopword removal and stemming)
    :param run: a numeric label naming the random split (useful to keep track of different runs)
    :return: None
    """

    assert 'en' in langs, 'English is not among the requested languages, but is needed for some datasets'
    assert len(langs) > 1, 'the multilingual dataset cannot be built with only one language'
    assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])), \
        "languages not in the RCV1-v2/RCV2 scope, or not valid for NLTK's processing"

    name = 'RCV1/2'
    run = '_run' + str(run)
    config_name = 'rcv1-2_nltk_trByLang' + str(train_for_lang) + '_teByLang' + str(test_for_lang) + \
                  ('_processed' if preprocess else '_raw')

    indep_path = join(outpath, config_name + run + '.pickle')
    upper_path = join(outpath, config_name + run + '_upper.pickle')
    yuxta_path = join(outpath, config_name + run + '_yuxtaposed.pickle')
    wiki_path = join(outpath, config_name + run + '.wiki.pickle')
    wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle')

    print('fetching the datasets')
    rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
    rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
    filter_by_categories(rcv1_documents, labels_rcv2)
    filter_by_categories(rcv2_documents, labels_rcv1)

    label_names = get_active_labels(rcv1_documents + rcv2_documents)
    print('Active labels in RCV1/2 {}'.format(len(label_names)))

    print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
    print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))

    lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs}

    # for the upper bound there are no parallel versions, so for the English case we take as many documents as there
    # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases
    print('Generating upper-bound (English-only) dataset...')
    train, test = train_test_split(lang_docs['en'], train_size=train_for_lang * len(langs), test_size=test_for_lang, shuffle=True)
    train_lang_doc_map = {'en': [(d.text, d.categories, d.id) for d in train]}
    test_lang_doc_map = {'en': [(d.text, d.categories, d.id) for d in test]}
    build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path)

    train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang]
    for lang in langs:
        if lang == 'en': continue  # already split
        test_take = min(test_for_lang, len(lang_docs[lang]) - train_for_lang)
        train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True)
        train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train]
        test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test]

    print('Generating feature-independent dataset...')
    wiki_docs = None
    if max_wiki > 0:
        if not exists(wiki_docs_path):
            wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
            wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
            pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
        else:
            wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
            wiki_docs = random_wiki_sample(wiki_docs, max_wiki)

    if wiki_docs:
        lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
        pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
    else:
        lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)

    lang_data.save(indep_path)

    print('Generating juxtaposed dataset...')
    build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)


# ----------------------------------------------------------------------------------------------------------------------
# Methods to generate full RCV and JRC datasets
# ----------------------------------------------------------------------------------------------------------------------
def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs):

    print('fetching the datasets')
    rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
    rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test')
    rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])

    filter_by_categories(rcv1_train_documents, labels_rcv2)
    filter_by_categories(rcv1_test_documents, labels_rcv2)
    filter_by_categories(rcv2_documents, labels_rcv1)

    label_names = get_active_labels(rcv1_train_documents + rcv2_documents)
    print('Active labels in RCV1/2 {}'.format(len(label_names)))

    print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names)))
    print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents
    lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs}

    def get_ids(doclist):
        return frozenset([d.id for d in doclist])

    tr_ids = {'en': get_ids(rcv1_train_documents)}
    te_ids = {'en': get_ids(rcv1_test_documents)}
    for lang in langs:
        if lang == 'en': continue
        tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3)

    dataset = MultilingualDataset()
    dataset.dataset_name = 'RCV1/2-full'
    for lang in langs:
        print(f'processing {lang} with {len(tr_ids[lang])} training and {len(te_ids[lang])} test documents')
        analyzer = CountVectorizer(
            strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
        ).build_analyzer()

        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]])
        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)

    dataset.save(outpath)


def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):

    print('fetching the datasets')
    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(
        langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
    )
    test_docs, _ = fetch_jrcacquis(
        langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
    )

    def _group_by_lang(doc_list, langs):
        return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}

    training_docs = _group_by_lang(training_docs, langs)
    test_docs = _group_by_lang(test_docs, langs)

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    dataset.dataset_name = 'JRC-Acquis-full'
    for lang in langs:
        analyzer = CountVectorizer(
            strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
        ).build_analyzer()

        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)

    dataset.save(outpath)


# ----------------------------------------------------------------------------------------------------------------------
# MAIN BUILDER
# ----------------------------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    import sys
    RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
    RCV2_PATH = '../Datasets/RCV2'
    JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
    full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
    # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
    sys.exit(0)

    # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle'  # '../rcv2/rcv1-2_doclist_full_processed.pickle'
    # data = MultilingualDataset.load(datasetpath)
    # data.dataset_name = 'JRC-Acquis-full'  # 'RCV1/2-full'
    # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
    #     (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
    #     data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
    # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')  # '../rcv2/rcv1-2_doclist_full_processed_2.pickle'
    # sys.exit(0)

    assert len(sys.argv) == 5, "wrong number of arguments; required: " \
                               "<JRC_PATH> <RCV1_PATH> <RCV2_PATH> <WIKI_PATH>"

    JRC_DATAPATH = sys.argv[1]  # "../Datasets/JRC_Acquis_v3"
    RCV1_PATH = sys.argv[2]  # '../Datasets/RCV1-v2/unprocessed_corpus'
    RCV2_PATH = sys.argv[3]  # '../Datasets/RCV2'
    WIKI_DATAPATH = sys.argv[4]  # "../Datasets/Wikipedia/multilingual_docs_JRC_NLTK"

    langs = lang_set['JRC_NLTK']
    max_wiki = 5000

    for run in range(0, 10):
        print('Building JRC-Acquis datasets run', run)
        prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs,
                             train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki,
                             cat_policy='all', most_common_cat=300, run=run)

        print('Building RCV1-v2/2 datasets run', run)
        prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'],
                             train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run)

        # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE
        # (make sure you have not modified the above parameters, or adapt the following paths accordingly...)
        # datasetpath = join(RCV2_PATH, 'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run))
        # outpath = datasetpath.replace('_nltk_', '_doclist_')
        # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath)

        # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run))
        # outpath = datasetpath.replace('_nltk_', '_doclist_')
        # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath)
@@ -1,42 +0,0 @@
"""
|
||||
bg = Bulgarian
|
||||
cs = Czech
|
||||
da = Danish
|
||||
de = German
|
||||
el = Greek
|
||||
en = English
|
||||
es = Spanish
|
||||
et = Estonian
|
||||
fi = Finnish
|
||||
fr = French
|
||||
hu = Hungarian
|
||||
it = Italian
|
||||
lt = Lithuanian
|
||||
lv = Latvian
|
||||
nl = Dutch
|
||||
mt = Maltese
|
||||
pl = Polish
|
||||
pt = Portuguese
|
||||
ro = Romanian
|
||||
sk = Slovak
|
||||
sl = Slovene
|
||||
sv = Swedish
|
||||
"""
|
||||
|
||||
NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german',
|
||||
'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'}
|
||||
|
||||
|
||||
#top 10 languages in wikipedia order by the number of articles
|
||||
#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro']
|
||||
|
||||
#all languages in JRC-acquis v3
|
||||
JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv']
|
||||
JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues'
|
||||
|
||||
RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl']
|
||||
RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl']
|
||||
|
||||
lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS,
|
||||
'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS}
@@ -1,324 +0,0 @@
from __future__ import print_function

import os
import pickle
import sys
import tarfile
import xml.etree.ElementTree as ET
import zipfile
from collections import Counter
from os.path import join
from random import shuffle

import rdflib
from rdflib.namespace import RDF, SKOS
from sklearn.datasets import get_data_home

from data.languages import JRC_LANGS
from data.languages import lang_set
from util.file import download_file, list_dirs, list_files

"""
JRC-Acquis' nomenclature:
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""

class JRCAcquis_Document:
    def __init__(self, id, name, lang, year, head, body, categories):
        self.id = id
        self.parallel_id = name
        self.lang = lang
        self.year = year
        self.text = body if not head else head + "\n" + body
        self.categories = categories


# this is a workaround... for some reason, acute accents are encoded in a non-standard manner in titles;
# however, the title often appears as the first paragraph of the text/body (with standard encoding),
# so it might be preferable not to read the header at all (as done here by default)
def _proc_acute(text):
    for ch in ['a', 'e', 'i', 'o', 'u']:
        text = text.replace('%' + ch + 'acute%', ch)
    return text


def parse_document(file, year, head=False):
    root = ET.parse(file).getroot()

    doc_name = root.attrib['n']  # e.g., '22006A0211(01)'
    doc_lang = root.attrib['lang']  # e.g., 'es'
    doc_id = root.attrib['id']  # e.g., 'jrc22006A0211_01-es'
    doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
    doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
    doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])

    def raise_if_empty(field, from_file):
        if isinstance(field, str):
            if not field.strip():
                raise ValueError("Empty field in file %s" % from_file)

    raise_if_empty(doc_name, file)
    raise_if_empty(doc_lang, file)
    raise_if_empty(doc_id, file)
    if head: raise_if_empty(doc_head, file)
    raise_if_empty(doc_body, file)

    return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)


# removes documents without a counterpart in all other languages
def _force_parallel(doclist, langs):
    n_langs = len(langs)
    par_id_count = Counter([d.parallel_id for d in doclist])
    parallel_doc_ids = set([id for id, count in par_id_count.items() if count == n_langs])
    return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]


def random_sampling_avoiding_parallel(doclist):
    random_order = list(range(len(doclist)))
    shuffle(random_order)
    sampled_request = []
    parallel_ids = set()
    for ind in random_order:
        pid = doclist[ind].parallel_id
        if pid not in parallel_ids:
            sampled_request.append(doclist[ind])
            parallel_ids.add(pid)
    print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
    return sampled_request
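
# Illustrative sketch (not part of the original module): with two language versions of the same
# parallel document, the sampler keeps exactly one of them. All field values below are made up.
def _example_avoid_parallel():
    d_en = JRCAcquis_Document(id='doc1-en', name='22006A0211(01)', lang='en', year=2006,
                              head='', body='some text', categories=['c1'])
    d_it = JRCAcquis_Document(id='doc1-it', name='22006A0211(01)', lang='it', year=2006,
                              head='', body='del testo', categories=['c1'])
    sampled = random_sampling_avoiding_parallel([d_en, d_it])
    return sampled  # a list containing one of the two documents, chosen at random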


# filters out documents which do not contain any category in the cat_filter list, and removes all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
    if not isinstance(cat_filter, frozenset):
        cat_filter = frozenset(cat_filter)
    filtered = []
    for doc in doclist:
        doc.categories = list(cat_filter & set(doc.categories))
        if doc.categories:
            doc.categories.sort()
            filtered.append(doc)
    print("filtered out %d documents without categories in the filter list" % (len(doclist) - len(filtered)))
    return filtered


# filters out categories with fewer than cat_threshold documents (and the documents left with none of the surviving categories)
def _filter_by_frequency(doclist, cat_threshold):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.items() if count > cat_threshold]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories


# selects the most_frequent most common categories (and filters out the documents left with none of them)
def _most_common(doclist, most_frequent):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.most_common(most_frequent)]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories
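
# Illustrative sketch (not part of the original module): how the two frequency filters are meant to
# be combined; `doclist` is any list of JRCAcquis_Document objects and the thresholds are arbitrary.
# Note that the filters modify doc.categories in place; fetch_jrcacquis below chains them in this order.
def _example_category_filtering(doclist):
    # keep only categories occurring in more than 1 document (and drop docs left without labels)
    docs_freq, cats_freq = _filter_by_frequency(doclist, cat_threshold=1)
    # then keep only the 10 most frequent of the surviving categories
    docs_top, cats_top = _most_common(docs_freq, most_frequent=10)
    return docs_top, cats_top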


def _get_categories(request):
    final_cats = set()
    for d in request:
        final_cats.update(d.categories)
    return list(final_cats)


def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0,
                    parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):

    assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
    if not langs:
        langs = JRC_LANGS
    else:
        if isinstance(langs, str): langs = [langs]
        for l in langs:
            if l not in JRC_LANGS:
                raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    for l in langs:
        file_name = 'jrc-' + l + '.tgz'
        archive_path = join(data_path, file_name)

        if not os.path.exists(archive_path):
            print("downloading language-specific dataset (once and for all) into %s" % data_path)
            DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
            download_file(DOWNLOAD_URL, archive_path)
            print("untarring dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        documents_dir = join(data_path, l)

        print("Reading documents...")
        read = 0
        for dir in list_dirs(documents_dir):
            year = int(dir)
            if years is None or year in years:
                year_dir = join(documents_dir, dir)
                pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
                if os.path.exists(pickle_name):
                    print("loading from file %s" % pickle_name)
                    l_y_documents = pickle.load(open(pickle_name, "rb"))
                    read += len(l_y_documents)
                else:
                    l_y_documents = []
                    all_documents = list_files(year_dir)
                    empty = 0
                    for i, doc_file in enumerate(all_documents):
                        try:
                            jrc_doc = parse_document(join(year_dir, doc_file), year)
                        except ValueError:
                            jrc_doc = None

                        if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                            l_y_documents.append(jrc_doc)
                        else: empty += 1
                        if len(all_documents) > 50 and ((i + 1) % (len(all_documents) / 50) == 0):
                            print('\r\tfrom %s: completed %d%%' % (year_dir, int((i + 1) * 100.0 / len(all_documents))), end='')
                        read += 1
                    print('\r\tfrom %s: completed 100%%, read %d documents (discarded %d without categories or with empty fields)\n' % (year_dir, i + 1, empty), end='')
                    print("\t\t(Pickling object for future runs in %s)" % pickle_name)
                    pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
                request += l_y_documents
        print("Read %d documents for language %s\n" % (read, l))
        total_read += read
    print("Read %d documents in total" % (total_read))

    if parallel == 'force':
        request = _force_parallel(request, langs)
    elif parallel == 'avoid':
        request = random_sampling_avoiding_parallel(request)

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats
|
||||
|
||||
def print_cat_analysis(request):
|
||||
cat_count = Counter()
|
||||
for d in request:
|
||||
cat_count.update(d.categories)
|
||||
print("Number of active categories: {}".format(len(cat_count)))
|
||||
print(cat_count.most_common())
|
||||
|
||||
# inspects the Eurovoc thesaurus in order to select a subset of categories
|
||||
# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented
|
||||
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
|
||||
eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
|
||||
select="broadest"):
|
||||
|
||||
fullpath_pickle = join(data_path, select+'_concepts.pickle')
|
||||
if os.path.exists(fullpath_pickle):
|
||||
print("Pickled object found in %s. Loading it." % fullpath_pickle)
|
||||
return pickle.load(open(fullpath_pickle,'rb'))
|
||||
|
||||
fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
|
||||
if not os.path.exists(fullpath):
|
||||
print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
|
||||
download_file(eurovoc_url, fullpath)
|
||||
print("Unzipping file...")
|
||||
zipped = zipfile.ZipFile(data_path + '.zip', 'r')
|
||||
zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
|
||||
zipped.close()
|
||||
|
||||
print("Parsing %s" %fullpath)
|
||||
g = rdflib.Graph()
|
||||
g.parse(location=fullpath, format="application/rdf+xml")
|
||||
|
||||
if select == "all":
|
||||
print("Selecting all concepts")
|
||||
all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
|
||||
all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
|
||||
all_concepts.sort()
|
||||
selected_concepts = all_concepts
|
||||
elif select=="broadest":
|
||||
print("Selecting broadest concepts (those without any other broader concept linked to it)")
|
||||
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
|
||||
narrower_concepts = set(g.subjects(SKOS.broader, None))
|
||||
broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
|
||||
broadest_concepts.sort()
|
||||
selected_concepts = broadest_concepts
|
||||
elif select=="leaves":
|
||||
print("Selecting leaves concepts (those not linked as broader of any other concept)")
|
||||
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
|
||||
broad_concepts = set(g.objects(None, SKOS.broader))
|
||||
leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
|
||||
leave_concepts.sort()
|
||||
selected_concepts = leave_concepts
|
||||
else:
|
||||
raise ValueError("Selection policy %s is not currently supported" % select)
|
||||
|
||||
print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
|
||||
print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
|
||||
pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
return selected_concepts
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
def single_label_fragment(doclist):
|
||||
single = [d for d in doclist if len(d.categories) < 2]
|
||||
final_categories = set([d.categories[0] for d in single if d.categories])
|
||||
print('{} single-label documents ({} categories) from the original {} documents'.format(len(single),
|
||||
len(final_categories),
|
||||
len(doclist)))
|
||||
return single, list(final_categories)
|
||||
|
||||
train_years = list(range(1986, 2006))
|
||||
test_years = [2006]
|
||||
cat_policy = 'leaves'
|
||||
most_common_cat = 300
|
||||
# JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
|
||||
JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
|
||||
langs = lang_set['JRC_NLTK']
|
||||
cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
|
||||
sys.exit()
|
||||
|
||||
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat)
|
||||
test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force')
|
||||
|
||||
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
|
||||
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
|
||||
|
||||
training_docs, label_names = single_label_fragment(training_docs)
|
||||
test_docs, label_namestest = single_label_fragment(test_docs)
|
||||
|
||||
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
|
||||
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
|
||||
|
||||
|
|
@ -1,222 +0,0 @@
|
|||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from os.path import join, exists
|
||||
from zipfile import ZipFile
|
||||
|
||||
import numpy as np
|
||||
|
||||
from util.file import download_file_if_not_exists
|
||||
from util.file import list_files
|
||||
|
||||
"""
|
||||
RCV2's Nomenclature:
|
||||
ru = Russian
|
||||
da = Danish
|
||||
de = German
|
||||
es = Spanish
|
||||
lat = Spanish Latin-American (it actually also appears as 'es' in the collection)
|
||||
fr = French
|
||||
it = Italian
|
||||
nl = Dutch
|
||||
pt = Portuguese
|
||||
sv = Swedish
|
||||
ja = Japanese
|
||||
htw = Chinese
|
||||
no = Norwegian
|
||||
"""
|
||||
|
||||
RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
|
||||
RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files'
|
||||
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"
|
||||
RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html"
|
||||
|
||||
rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
|
||||
'lyrl2004_tokens_test_pt1.dat.gz',
|
||||
'lyrl2004_tokens_test_pt2.dat.gz',
|
||||
'lyrl2004_tokens_test_pt3.dat.gz']
|
||||
|
||||
rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']
|
||||
|
||||
rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'
|
||||
|
||||
RCV2_LANG_DIR = {'ru':'REUTE000',
|
||||
'de':'REUTE00A',
|
||||
'fr':'REUTE00B',
|
||||
'sv':'REUTE001',
|
||||
'no':'REUTE002',
|
||||
'da':'REUTE003',
|
||||
'pt':'REUTE004',
|
||||
'it':'REUTE005',
|
||||
'es':'REUTE006',
|
||||
'lat':'REUTE007',
|
||||
'jp':'REUTE008',
|
||||
'htw':'REUTE009',
|
||||
'nl':'REUTERS_'}
|
||||
|
||||
|
||||
class RCV_Document:
|
||||
|
||||
def __init__(self, id, text, categories, date='', lang=None):
|
||||
self.id = id
|
||||
self.date = date
|
||||
self.lang = lang
|
||||
self.text = text
|
||||
self.categories = categories
|
||||
|
||||
|
||||
class ExpectedLanguageException(Exception): pass
|
||||
class IDRangeException(Exception): pass
|
||||
|
||||
|
||||
nwords = []
|
||||
|
||||
def parse_document(xml_content, assert_lang=None, valid_id_range=None):
|
||||
root = ET.fromstring(xml_content)
|
||||
if assert_lang:
|
||||
if assert_lang not in root.attrib.values():
|
||||
if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp'
|
||||
raise ExpectedLanguageException('error: document of a different language')
|
||||
|
||||
doc_id = root.attrib['itemid']
|
||||
if valid_id_range is not None:
|
||||
if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
|
||||
raise IDRangeException
|
||||
|
||||
doc_categories = [cat.attrib['code'] for cat in
|
||||
root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]
|
||||
|
||||
doc_date = root.attrib['date']
|
||||
doc_title = root.find('.//title').text
|
||||
doc_headline = root.find('.//headline').text
|
||||
doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])
|
||||
|
||||
if not doc_body:
|
||||
raise ValueError('Empty document')
|
||||
|
||||
if doc_title is None: doc_title = ''
|
||||
if doc_headline is None or doc_headline in doc_title: doc_headline = ''
|
||||
text = '\n'.join([doc_title, doc_headline, doc_body]).strip()
|
||||
|
||||
text_length = len(text.split())
|
||||
global nwords
|
||||
nwords.append(text_length)
|
||||
|
||||
return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang)
|
||||
|
||||
|
||||
def fetch_RCV1(data_path, split='all'):
|
||||
|
||||
assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"'
|
||||
|
||||
request = []
|
||||
labels = set()
|
||||
read_documents = 0
|
||||
lang = 'en'
|
||||
|
||||
training_documents = 23149
|
||||
test_documents = 781265
|
||||
|
||||
if split == 'all':
|
||||
split_range = (2286, 810596)
|
||||
expected = training_documents+test_documents
|
||||
elif split == 'train':
|
||||
split_range = (2286, 26150)
|
||||
expected = training_documents
|
||||
else:
|
||||
split_range = (26151, 810596)
|
||||
expected = test_documents
|
||||
|
||||
global nwords
|
||||
nwords=[]
|
||||
for part in list_files(data_path):
|
||||
if not re.match(r'\d+\.zip', part): continue
|
||||
target_file = join(data_path, part)
|
||||
assert exists(target_file), \
|
||||
"You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\
|
||||
" w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information."
|
||||
zipfile = ZipFile(target_file)
|
||||
for xmlfile in zipfile.namelist():
|
||||
xmlcontent = zipfile.open(xmlfile).read()
|
||||
try:
|
||||
doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range)
|
||||
labels.update(doc.categories)
|
||||
request.append(doc)
|
||||
read_documents += 1
|
||||
except ValueError:
|
||||
print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang))
|
||||
except (IDRangeException, ExpectedLanguageException) as e:
|
||||
pass
|
||||
print('\r[{}] read {} documents'.format(part, len(request)), end='')
|
||||
if read_documents == expected: break
|
||||
if read_documents == expected: break
|
||||
print()
|
||||
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
|
||||
return request, list(labels)
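
# A minimal usage sketch of fetch_RCV1 (the local path is an assumption, for illustration only):
def _fetch_rcv1_example(data_path='../Datasets/RCV1-v2/unprocessed_corpus'):
    # data_path must already contain the RCV1 zip parts obtained under the corpus licence
    # (they cannot be downloaded here); split='train' reads only the 23,149 training documents.
    return fetch_RCV1(data_path, split='train')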
|
||||
|
||||
|
||||
def fetch_RCV2(data_path, languages=None):
|
||||
|
||||
if not languages:
|
||||
languages = list(RCV2_LANG_DIR.keys())
|
||||
else:
|
||||
assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope'
|
||||
|
||||
request = []
|
||||
labels = set()
|
||||
global nwords
|
||||
nwords=[]
|
||||
for lang in languages:
|
||||
path = join(data_path, RCV2_LANG_DIR[lang])
|
||||
lang_docs_read = 0
|
||||
for part in list_files(path):
|
||||
target_file = join(path, part)
|
||||
assert exists(target_file), \
|
||||
"You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\
|
||||
" w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information."
|
||||
zipfile = ZipFile(target_file)
|
||||
for xmlfile in zipfile.namelist():
|
||||
xmlcontent = zipfile.open(xmlfile).read()
|
||||
try:
|
||||
doc = parse_document(xmlcontent, assert_lang=lang)
|
||||
labels.update(doc.categories)
|
||||
request.append(doc)
|
||||
lang_docs_read += 1
|
||||
except ValueError:
|
||||
print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang))
|
||||
except (IDRangeException, ExpectedLanguageException) as e:
|
||||
pass
|
||||
print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='')
|
||||
print()
|
||||
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
|
||||
return request, list(labels)
|
||||
|
||||
|
||||
def fetch_topic_hierarchy(path, topics='all'):
|
||||
assert topics in ['all', 'leaves']
|
||||
|
||||
download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
|
||||
hierarchy = {}
|
||||
for line in open(path, 'rt'):
|
||||
parts = line.strip().split()
|
||||
parent,child = parts[1],parts[3]
|
||||
if parent not in hierarchy:
|
||||
hierarchy[parent]=[]
|
||||
hierarchy[parent].append(child)
|
||||
|
||||
del hierarchy['None']
|
||||
del hierarchy['Root']
|
||||
print(hierarchy)
|
||||
|
||||
if topics=='all':
|
||||
topics = set(hierarchy.keys())
|
||||
for parent in hierarchy.keys():
|
||||
topics.update(hierarchy[parent])
|
||||
return list(topics)
|
||||
elif topics=='leaves':
|
||||
parents = set(hierarchy.keys())
|
||||
childs = set()
|
||||
for parent in hierarchy.keys():
|
||||
childs.update(hierarchy[parent])
|
||||
return list(childs.difference(parents))
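
# A minimal usage sketch (the local path is an assumption; the hierarchy file is downloaded
# there from RCV1_TOPICHIER_URL if it is missing):
def _topic_hierarchy_example(path='../Datasets/RCV1-v2/rcv1.topics.hier.orig'):
    # keep only the leaf topic codes of the RCV1 hierarchy
    return fetch_topic_hierarchy(path, topics='leaves')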
|
||||
|
||||
|
|
@ -1,307 +0,0 @@
|
|||
from __future__ import print_function
|
||||
|
||||
# import ijson
|
||||
# from ijson.common import ObjectBuilder
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
from bz2 import BZ2File
|
||||
from itertools import islice
|
||||
from os.path import join
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
import numpy as np
|
||||
|
||||
from util.file import list_dirs, list_files
|
||||
|
||||
policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
|
||||
|
||||
"""
|
||||
This file contains a set of tools for processing the Wikipedia multilingual documents.
|
||||
In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/)
|
||||
and have processed each document to clean their texts with one of the tools:
|
||||
- https://github.com/aesuli/wikipediatools (Python 2)
|
||||
- https://github.com/aesuli/wikipedia-extractor (Python 3)
|
||||
It is also assumed you have dowloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
|
||||
|
||||
This tools help you in:
|
||||
- Processes the huge json file as a stream, and create a multilingual map of corresponding titles for each language.
|
||||
Set the policy = "IN_ALL_LANGS" will extract only titles which appear in all (AND) languages, whereas "IN_ANY_LANG"
|
||||
extracts all titles appearing in at least one (OR) language (warning: this will creates a huge dictionary).
|
||||
Note: This version is quite slow. Although it is run once for all, you might be prefer to take a look at "Wikidata in BigQuery".
|
||||
- Processes the huge json file as a stream a creates a simplified file which occupies much less and is far faster to be processed.
|
||||
- Use the multilingual map to extract, from the clean text versions, individual xml documents containing all
|
||||
language-specific versions from the document.
|
||||
- Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents,
|
||||
in a way that the i-th element from any list refers to the same element in the respective language.
|
||||
"""
|
||||
|
||||
def _doc_generator(text_path, langs):
|
||||
dotspace = re.compile(r'\.(?!\s)')
|
||||
for l,lang in enumerate(langs):
|
||||
print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
|
||||
lang_dir = join(text_path, lang)
|
||||
split_dirs = list_dirs(lang_dir)
|
||||
for sd,split_dir in enumerate(split_dirs):
|
||||
print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
|
||||
split_files = list_files(join(lang_dir, split_dir))
|
||||
for sf,split_file in enumerate(split_files):
|
||||
print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
|
||||
with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi:
|
||||
while True:
|
||||
doc_lines = list(islice(fi, 3))
|
||||
if doc_lines:
|
||||
# some sentences are not followed by a space after the dot
|
||||
doc_lines[1] = dotspace.sub('. ', doc_lines[1])
|
||||
# [workaround] the '&nbsp;' HTML entity was left untreated by the extractor; replace it with a space before re-escaping the line
|
||||
doc_lines[1] = escape(doc_lines[1].replace(" ", " "))
|
||||
yield doc_lines, lang
|
||||
else: break
|
||||
|
||||
def _extract_title(doc_lines):
|
||||
m = re.search('title="(.+?)"', doc_lines[0])
|
||||
if m: return m.group(1).decode('utf-8')
|
||||
else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0])
|
||||
|
||||
def _create_doc(target_file, id, doc, lang):
|
||||
doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang)
|
||||
with open(target_file, 'w') as fo:
|
||||
fo.write('<multidoc id="%s">\n'%id)
|
||||
[fo.write(line) for line in doc]
|
||||
fo.write('</multidoc>')
|
||||
|
||||
def _append_doc(target_file, doc, lang):
|
||||
doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
|
||||
with open(target_file, 'r', buffering=1024*1024) as fi:
|
||||
lines = fi.readlines()
|
||||
if doc[0] in lines[1::3]:
|
||||
return
|
||||
lines[-1:-1]=doc
|
||||
with open(target_file, 'w', buffering=1024*1024) as fo:
|
||||
[fo.write(line) for line in lines]
|
||||
|
||||
def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
|
||||
if not os.path.exists(out_path):
|
||||
os.makedirs(out_path)
|
||||
for lang in langs:
|
||||
if lang not in inv_dict:
|
||||
raise ValueError("Lang %s is not in the dictionary" % lang)
|
||||
|
||||
docs_created = len(list_files(out_path))
|
||||
print("%d multilingual documents found." % docs_created)
|
||||
for doc,lang in _doc_generator(text_path, langs):
|
||||
title = _extract_title(doc)
|
||||
|
||||
if title in inv_dict[lang]:
|
||||
#pass
|
||||
ids = inv_dict[lang][title]
|
||||
for id in ids:
|
||||
target_file = join(out_path, id) + ".xml"
|
||||
if os.path.exists(target_file):
|
||||
_append_doc(target_file, doc, lang)
|
||||
else:
|
||||
_create_doc(target_file, id, doc, lang)
|
||||
docs_created+=1
|
||||
else:
|
||||
if not re.match('[A-Za-z]+', title):
|
||||
print("Title <%s> for lang <%s> not in dictionary" % (title, lang))
|
||||
|
||||
|
||||
|
||||
def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True):
|
||||
simplified_file = join(data_dir,filename)
|
||||
|
||||
if policy not in policies:
|
||||
raise ValueError("Policy %s not supported." % policy)
|
||||
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
|
||||
|
||||
lang_prefix = list(langs)
|
||||
lang_prefix.sort()
|
||||
pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy
|
||||
pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle")
|
||||
pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle")
|
||||
if os.path.exists(pickle_invdict):
|
||||
if return_both and os.path.exists(pickle_dict):
|
||||
print("Pickled files found in %s. Loading both (direct and inverse dictionaries)." % data_dir)
|
||||
return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
|
||||
elif return_both==False:
|
||||
print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
|
||||
return pickle.load(open(pickle_invdict, 'rb'))
|
||||
|
||||
multiling_titles = {}
|
||||
inv_dict = {lang:{} for lang in langs}
|
||||
|
||||
def process_entry(line):
|
||||
parts = line.strip().split('\t')
|
||||
id = parts[0]
|
||||
if id in multiling_titles:
|
||||
raise ValueError("id <%s> already indexed" % id)
|
||||
|
||||
titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:]))
|
||||
for lang in list(titles.keys()):  # copy the keys: entries are deleted while iterating
|
||||
if lang not in langs:
|
||||
del titles[lang]
|
||||
|
||||
if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\
|
||||
or (policy == "IN_ANY_LANG" and len(titles) > 0):
|
||||
multiling_titles[id] = titles
|
||||
for lang, title in titles.items():
|
||||
if title in inv_dict[lang]:
|
||||
inv_dict[lang][title].append(id)
|
||||
else:
    inv_dict[lang][title] = [id]
|
||||
|
||||
with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
|
||||
completed = 0
|
||||
try:
|
||||
for line in fi:
|
||||
process_entry(line)
|
||||
completed += 1
|
||||
if completed % 10 == 0:
|
||||
print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="")
|
||||
print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
|
||||
except EOFError:
|
||||
print("\nUnexpected file ending... saving anyway")
|
||||
|
||||
print("Pickling dictionaries in %s" % data_dir)
|
||||
pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
print("Done")
|
||||
|
||||
return (multiling_titles, inv_dict) if return_both else inv_dict
|
||||
|
||||
|
||||
# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2
|
||||
def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"):
|
||||
latest_all_json_file = join(data_dir,json_file)
|
||||
|
||||
if policy not in policies:
|
||||
raise ValueError("Policy %s not supported." % policy)
|
||||
|
||||
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
|
||||
|
||||
lang_prefix = list(langs)
|
||||
lang_prefix.sort()
|
||||
simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." + policy)
|
||||
|
||||
def process_entry(last, fo):
|
||||
global written
|
||||
id = last["id"]
|
||||
titles = None
|
||||
if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()):
|
||||
titles = {lang: last["labels"][lang]["value"] for lang in langs}
|
||||
elif policy == "IN_ANY_LANG":
|
||||
titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]}
|
||||
|
||||
if titles:
|
||||
fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8'))
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
written = 0
|
||||
with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \
|
||||
BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo:
|
||||
builder = ObjectBuilder()
|
||||
completed = 0
|
||||
for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16):
|
||||
builder.event(event, value)
|
||||
if len(builder.value)>1:
|
||||
if process_entry(builder.value.pop(0), fo): written += 1
|
||||
completed += 1
|
||||
print("\rCompleted %d\ttitles %d" % (completed,written), end="")
|
||||
print("")
|
||||
|
||||
#process the last entry
|
||||
process_entry(builder.value.pop(0), fo)
|
||||
|
||||
return simple_titles_path
|
||||
|
||||
"""
|
||||
Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the
|
||||
specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language-
|
||||
specific version of the same document. Documents are forced to contain version in all specified languages and to contain
|
||||
a minimum number of words; otherwise it is discarded.
|
||||
"""
|
||||
class MinWordsNotReached(Exception): pass
|
||||
class WrongDocumentFormat(Exception): pass
|
||||
|
||||
def _load_multilang_doc(path, langs, min_words=100):
|
||||
import xml.etree.ElementTree as ET
|
||||
from xml.etree.ElementTree import Element, ParseError
|
||||
try:
|
||||
root = ET.parse(path).getroot()
|
||||
doc = {}
|
||||
for lang in langs:
|
||||
doc_body = root.find('.//doc[@lang="' + lang + '"]')
|
||||
if isinstance(doc_body, Element):
|
||||
n_words = len(doc_body.text.split(' '))
|
||||
if n_words >= min_words:
|
||||
doc[lang] = doc_body.text
|
||||
else:
|
||||
raise MinWordsNotReached
|
||||
else:
|
||||
raise WrongDocumentFormat
|
||||
except ParseError:
|
||||
raise WrongDocumentFormat
|
||||
return doc
|
||||
|
||||
# returns the multilingual documents mapped by language; reading statistics are printed to stdout
|
||||
def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
|
||||
if pickle_name and os.path.exists(pickle_name):
|
||||
print("unpickling %s" % pickle_name)
|
||||
return pickle.load(open(pickle_name, 'rb'))
|
||||
|
||||
multi_docs = list_files(wiki_multi_path)
|
||||
mling_documents = {l:[] for l in langs}
|
||||
valid_documents = 0
|
||||
minwords_exception = 0
|
||||
wrongdoc_exception = 0
|
||||
for d,multi_doc in enumerate(multi_docs):
|
||||
print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
|
||||
(d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="")
|
||||
doc_path = join(wiki_multi_path, multi_doc)
|
||||
try:
|
||||
m_doc = _load_multilang_doc(doc_path, langs, min_words)
|
||||
valid_documents += 1
|
||||
for l in langs:
|
||||
mling_documents[l].append(m_doc[l])
|
||||
except MinWordsNotReached:
|
||||
minwords_exception += 1
|
||||
if deletions: os.remove(doc_path)
|
||||
except WrongDocumentFormat:
|
||||
wrongdoc_exception += 1
|
||||
if deletions: os.remove(doc_path)
|
||||
if max_documents>0 and valid_documents>=max_documents:
|
||||
break
|
||||
|
||||
if pickle_name:
|
||||
print("Pickling wikipedia documents object in %s" % pickle_name)
|
||||
pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
return mling_documents
|
||||
|
||||
def random_wiki_sample(l_wiki, max_documents):
|
||||
if max_documents == 0: return None
|
||||
langs = list(l_wiki.keys())
|
||||
assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned'
|
||||
ndocs_per_lang = len(l_wiki[langs[0]])
|
||||
if ndocs_per_lang > max_documents:
|
||||
sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False))
|
||||
for lang in langs:
|
||||
l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel]
|
||||
return l_wiki
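
# A minimal usage sketch (the path and language codes are assumptions): load the language-aligned
# documents produced by extract_multilingual_documents() in wikipedia_tools.py and subsample them.
def _wikipedia_fetch_example(wiki_multi_path, langs=('en', 'it')):
    l_wiki = fetch_wikipedia_multilingual(wiki_multi_path, list(langs), min_words=100)
    return random_wiki_sample(l_wiki, max_documents=1000)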
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
wikipedia_home = "../Datasets/Wikipedia"
|
||||
|
||||
from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs
|
||||
langs = frozenset(langs)
|
||||
|
||||
simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2")
|
||||
_, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS')
|
||||
extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'),
|
||||
out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK'))
|
||||
|
||||
|
|
@ -1,34 +0,0 @@
|
|||
from nltk import word_tokenize
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.stem import SnowballStemmer
|
||||
|
||||
from data.languages import NLTK_LANGMAP
|
||||
|
||||
|
||||
def preprocess_documents(documents, lang):
|
||||
tokens = NLTKStemTokenizer(lang, verbose=True)
|
||||
sw = stopwords.words(NLTK_LANGMAP[lang])
|
||||
return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents]
|
||||
|
||||
|
||||
class NLTKStemTokenizer(object):
|
||||
|
||||
def __init__(self, lang, verbose=False):
|
||||
if lang not in NLTK_LANGMAP:
|
||||
raise ValueError('Language %s is not supported in NLTK' % lang)
|
||||
self.verbose=verbose
|
||||
self.called = 0
|
||||
self.wnl = SnowballStemmer(NLTK_LANGMAP[lang])
|
||||
self.cache = {}
|
||||
|
||||
def __call__(self, doc):
|
||||
self.called += 1
|
||||
if self.verbose:
|
||||
print("\r\t\t[documents processed %d]" % (self.called), end="")
|
||||
tokens = word_tokenize(doc)
|
||||
stems = []
|
||||
for t in tokens:
|
||||
if t not in self.cache:
|
||||
self.cache[t] = self.wnl.stem(t)
|
||||
stems.append(self.cache[t])
|
||||
return stems
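
if __name__ == '__main__':
    # Minimal sketch (assumes 'en' is a key of NLTK_LANGMAP and that the NLTK 'punkt' and
    # 'stopwords' resources are installed): stopword-filter and stem two toy documents.
    docs = ['The cats are running in the garden.', 'A cat ran across the gardens.']
    print(preprocess_documents(docs, lang='en'))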
|
|
@ -1,271 +0,0 @@
|
|||
import math
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from scipy.sparse import csr_matrix, csc_matrix
|
||||
from scipy.stats import t
|
||||
|
||||
|
||||
def get_probs(tpr, fpr, pc):
|
||||
# tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
|
||||
# fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
|
||||
pnc = 1.0 - pc
|
||||
tp = tpr * pc
|
||||
fn = pc - tp
|
||||
fp = fpr * pnc
|
||||
tn = pnc - fp
|
||||
return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)
|
||||
|
||||
|
||||
def apply_tsr(tpr, fpr, pc, tsr):
|
||||
cell = get_probs(tpr, fpr, pc)
|
||||
return tsr(cell)
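
def _tsr_usage_example():
    # Minimal sketch with illustrative numbers: a term occurring in 80% of the positive
    # documents (tpr) and 10% of the negative ones (fpr), for a category with 5% prevalence,
    # is first turned into a probabilistic contingency table and then scored with one of the
    # TSR functions defined below (here, information_gain).
    cell = get_probs(tpr=0.8, fpr=0.1, pc=0.05)   # tp=0.04, fn=0.01, fp=0.095, tn=0.855
    return apply_tsr(tpr=0.8, fpr=0.1, pc=0.05, tsr=information_gain), cell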
|
||||
|
||||
|
||||
def positive_information_gain(cell):
|
||||
if cell.tpr() < cell.fpr():
|
||||
return 0.0
|
||||
else:
|
||||
return information_gain(cell)
|
||||
|
||||
|
||||
def posneg_information_gain(cell):
|
||||
ig = information_gain(cell)
|
||||
if cell.tpr() < cell.fpr():
|
||||
return -ig
|
||||
else:
|
||||
return ig
|
||||
|
||||
|
||||
def __ig_factor(p_tc, p_t, p_c):
|
||||
den = p_t * p_c
|
||||
if den != 0.0 and p_tc != 0:
|
||||
return p_tc * math.log(p_tc / den, 2)
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
|
||||
def information_gain(cell):
|
||||
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
|
||||
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
|
||||
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
|
||||
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
|
||||
|
||||
|
||||
def information_gain_mod(cell):
|
||||
return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
|
||||
- (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))
|
||||
|
||||
|
||||
def pointwise_mutual_information(cell):
|
||||
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
|
||||
|
||||
|
||||
def gain_ratio(cell):
|
||||
pc = cell.p_c()
|
||||
pnc = 1.0 - pc
|
||||
norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
|
||||
return information_gain(cell) / (-norm)
|
||||
|
||||
|
||||
def chi_square(cell):
|
||||
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
|
||||
if den==0.0: return 0.0
|
||||
num = gss(cell)**2
|
||||
return num / den
|
||||
|
||||
|
||||
def relevance_frequency(cell):
|
||||
a = cell.tp
|
||||
c = cell.fp
|
||||
if c == 0: c = 1
|
||||
return math.log(2.0 + (a * 1.0 / c), 2)
|
||||
|
||||
|
||||
def idf(cell):
|
||||
if cell.p_f()>0:
|
||||
return math.log(1.0 / cell.p_f())
|
||||
return 0.0
|
||||
|
||||
|
||||
def gss(cell):
|
||||
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
|
||||
|
||||
|
||||
def conf_interval(xt, n):
|
||||
if n>30:
|
||||
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
|
||||
else:
|
||||
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
|
||||
p = (xt + 0.5 * z2) / (n + z2)
|
||||
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
|
||||
return p, amplitude
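
def _conf_interval_example():
    # Minimal sketch: the Wilson-style estimate used by conf_weight below. With 5 hits out
    # of 100 trials (n > 30, so the fixed z^2 constant is used) the centre is ~0.067 and the
    # half-width ~0.047, i.e., an interval of roughly [0.020, 0.114].
    p, amplitude = conf_interval(xt=5, n=100)
    return p - amplitude, p + amplitude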
|
||||
|
||||
def strength(minPosRelFreq, minPos, maxNeg):
|
||||
if minPos > maxNeg:
|
||||
return math.log(2.0 * minPosRelFreq, 2.0)
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
|
||||
# set cancel_features=True to allow some features to be weighted 0 (as in the original article);
# however, on some extremely imbalanced datasets this caused all documents to become all-zero vectors
|
||||
def conf_weight(cell, cancel_features=False):
|
||||
c = cell.get_c()
|
||||
not_c = cell.get_not_c()
|
||||
tp = cell.tp
|
||||
fp = cell.fp
|
||||
|
||||
pos_p, pos_amp = conf_interval(tp, c)
|
||||
neg_p, neg_amp = conf_interval(fp, not_c)
|
||||
|
||||
min_pos = pos_p-pos_amp
|
||||
max_neg = neg_p+neg_amp
|
||||
den = (min_pos + max_neg)
|
||||
minpos_relfreq = min_pos / (den if den != 0 else 1)
|
||||
|
||||
str_tplus = strength(minpos_relfreq, min_pos, max_neg)
|
||||
|
||||
if str_tplus == 0 and not cancel_features:
|
||||
return 1e-20
|
||||
|
||||
return str_tplus
|
||||
|
||||
|
||||
class ContTable:
|
||||
|
||||
def __init__(self, tp=0, tn=0, fp=0, fn=0):
|
||||
self.tp=tp
|
||||
self.tn=tn
|
||||
self.fp=fp
|
||||
self.fn=fn
|
||||
|
||||
def get_d(self): return self.tp + self.tn + self.fp + self.fn
|
||||
|
||||
def get_c(self): return self.tp + self.fn
|
||||
|
||||
def get_not_c(self): return self.tn + self.fp
|
||||
|
||||
def get_f(self): return self.tp + self.fp
|
||||
|
||||
def get_not_f(self): return self.tn + self.fn
|
||||
|
||||
def p_c(self): return (1.0*self.get_c())/self.get_d()
|
||||
|
||||
def p_not_c(self): return 1.0-self.p_c()
|
||||
|
||||
def p_f(self): return (1.0*self.get_f())/self.get_d()
|
||||
|
||||
def p_not_f(self): return 1.0-self.p_f()
|
||||
|
||||
def p_tp(self): return (1.0*self.tp) / self.get_d()
|
||||
|
||||
def p_tn(self): return (1.0*self.tn) / self.get_d()
|
||||
|
||||
def p_fp(self): return (1.0*self.fp) / self.get_d()
|
||||
|
||||
def p_fn(self): return (1.0*self.fn) / self.get_d()
|
||||
|
||||
def tpr(self):
|
||||
c = 1.0*self.get_c()
|
||||
return self.tp / c if c > 0.0 else 0.0
|
||||
|
||||
def fpr(self):
|
||||
_c = 1.0*self.get_not_c()
|
||||
return self.fp / _c if _c > 0.0 else 0.0
|
||||
|
||||
|
||||
def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
|
||||
print(f'[selecting {k} terms]')
|
||||
nC = Y.shape[1]
|
||||
FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
|
||||
best_features_idx = np.argsort(-FC, axis=0).flatten()
|
||||
tsr_values = FC.flatten()
|
||||
selected_indexes_set = set()
|
||||
selected_indexes = list()
|
||||
selected_value = list()
|
||||
from_category = list()
|
||||
round_robin = iter(best_features_idx)
|
||||
values_iter = iter(tsr_values)
|
||||
round=0
|
||||
while len(selected_indexes) < k:
|
||||
term_idx = next(round_robin)
|
||||
term_val = next(values_iter)
|
||||
if term_idx not in selected_indexes_set:
|
||||
selected_indexes_set.add(term_idx)
|
||||
selected_indexes.append(term_idx)
|
||||
selected_value.append(term_val)
|
||||
from_category.append(round)
|
||||
round = (round + 1) % nC
|
||||
return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
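
def _round_robin_example():
    # Minimal sketch on toy data: 4 documents, 5 terms, 2 categories. X is the binary
    # document-term matrix, Y the binary label matrix; the 2 best terms are picked by
    # alternating between the categories (round robin).
    X = csr_matrix(np.array([[1, 0, 1, 0, 0],
                             [1, 0, 0, 1, 0],
                             [0, 1, 0, 0, 1],
                             [0, 1, 0, 0, 0]]))
    Y = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])
    selected_idx, selected_scores, from_cat = round_robin_selection(X, Y, k=2)
    return selected_idx, selected_scores, from_cat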
|
||||
|
||||
|
||||
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
|
||||
tp_ = len(positive_document_indexes & feature_document_indexes)
|
||||
fp_ = len(feature_document_indexes - positive_document_indexes)
|
||||
fn_ = len(positive_document_indexes - feature_document_indexes)
|
||||
tn_ = nD - (tp_ + fp_ + fn_)
|
||||
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
|
||||
|
||||
|
||||
def category_tables(feature_sets, category_sets, c, nD, nF):
|
||||
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
|
||||
|
||||
|
||||
"""
|
||||
Computes the nC x nF supervised matrix M, where M[c,f] is the 4-cell contingency table for feature f and class c.
Complexity is O(nF x nC x log(S)), where S is the sparsity factor.
|
||||
"""
|
||||
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
|
||||
nD, nF = coocurrence_matrix.shape
|
||||
nD2, nC = label_matrix.shape
|
||||
|
||||
if nD != nD2:
|
||||
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
|
||||
(coocurrence_matrix.shape,label_matrix.shape))
|
||||
|
||||
def nonzero_set(matrix, col):
|
||||
return set(matrix[:, col].nonzero()[0])
|
||||
|
||||
if isinstance(coocurrence_matrix, csr_matrix):
|
||||
coocurrence_matrix = csc_matrix(coocurrence_matrix)
|
||||
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
|
||||
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
|
||||
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
|
||||
return np.array(cell_matrix)
|
||||
|
||||
# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
|
||||
def get_tsr_matrix(cell_matrix, tsr_score_function):
    nC, nF = cell_matrix.shape
    tsr_matrix = [[tsr_score_function(cell_matrix[c, f]) for f in range(nF)] for c in range(nC)]
|
||||
return np.array(tsr_matrix)
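
def _tsr_matrix_example():
    # Minimal sketch: build the nC x nF matrix of contingency cells for a toy problem and
    # score every (category, feature) pair with chi-square.
    X = csr_matrix(np.eye(3))                       # 3 documents, 3 features
    Y = np.array([[1, 0], [1, 0], [0, 1]])          # 2 categories
    cells = get_supervised_matrix(X, Y, n_jobs=1)   # (2, 3) array of ContTable cells
    return get_tsr_matrix(cells, chi_square)        # (2, 3) array of chi-square scores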
|
||||
|
||||
|
||||
""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can
|
||||
take as input any real-valued feature column (e.g., tf-idf weights).
|
||||
feat is the feature vector, and c is a binary classification vector.
|
||||
This implementation covers only the binary case, while the formula is defined for multiclass
|
||||
single-label scenarios, for which the version [2] might be preferred.
|
||||
[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012.
|
||||
[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725.
|
||||
"""
|
||||
def fisher_score_binary(feat, c):
|
||||
neg = np.ones_like(c) - c
|
||||
|
||||
npos = np.sum(c)
|
||||
nneg = np.sum(neg)
|
||||
|
||||
mupos = np.mean(feat[c == 1])
|
||||
muneg = np.mean(feat[neg == 1])
|
||||
mu = np.mean(feat)
|
||||
|
||||
stdpos = np.std(feat[c == 1])
|
||||
stdneg = np.std(feat[neg == 1])
|
||||
|
||||
num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
|
||||
den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)
|
||||
|
||||
if den>0:
|
||||
return num / den
|
||||
else:
|
||||
return num
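
def _fisher_score_example():
    # Minimal sketch: a real-valued feature column (e.g., tfidf weights) scored against a
    # binary class vector; well-separated positive/negative values yield a high score.
    feat = np.array([0.9, 0.8, 0.85, 0.1, 0.2, 0.15])
    c = np.array([1, 1, 1, 0, 0, 0])
    return fisher_score_binary(feat, c)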
|
|
@ -1,124 +0,0 @@
|
|||
from models.learners import *
|
||||
from util.common import _normalize
|
||||
from view_generators import VanillaFunGen
|
||||
|
||||
|
||||
class DocEmbedderList:
|
||||
"""
|
||||
Wrapper class that calls fit and transform on every initialized embedder. Every ViewGenerator should be
wrapped by this class so that the overall architecture can be trained seamlessly.
|
||||
"""
|
||||
def __init__(self, embedder_list, probabilistic=True):
|
||||
"""
|
||||
Init the DocEmbedderList.
|
||||
:param embedder_list: list of embedders to be deployed
|
||||
:param probabilistic: whether or not to recast the view generators' outputs into vectors of posterior probabilities
|
||||
"""
|
||||
assert len(embedder_list) != 0, 'Embedder list cannot be empty!'
|
||||
self.embedders = embedder_list
|
||||
self.probabilistic = probabilistic
|
||||
if probabilistic:
|
||||
_tmp = []
|
||||
for embedder in self.embedders:
|
||||
if isinstance(embedder, VanillaFunGen):
|
||||
_tmp.append(embedder)
|
||||
else:
|
||||
_tmp.append(FeatureSet2Posteriors(embedder))
|
||||
self.embedders = _tmp
|
||||
|
||||
def fit(self, lX, ly):
|
||||
"""
|
||||
Fit all the ViewGenerators contained by DocEmbedderList.
|
||||
:param lX:
|
||||
:param ly:
|
||||
:return: self
|
||||
"""
|
||||
for embedder in self.embedders:
|
||||
embedder.fit(lX, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
"""
|
||||
Project documents by means of every ViewGenerator. The projections are then averaged together and returned.
|
||||
:param lX:
|
||||
:return: common latent space (averaged).
|
||||
"""
|
||||
langs = sorted(lX.keys())
|
||||
lZparts = {lang: None for lang in langs}
|
||||
|
||||
for embedder in self.embedders:
|
||||
lZ = embedder.transform(lX)
|
||||
for lang in langs:
|
||||
Z = lZ[lang]
|
||||
if lZparts[lang] is None:
|
||||
lZparts[lang] = Z
|
||||
else:
|
||||
lZparts[lang] += Z
|
||||
n_embedders = len(self.embedders)
|
||||
return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces
|
||||
|
||||
def fit_transform(self, lX, ly):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
|
||||
class FeatureSet2Posteriors:
|
||||
"""
|
||||
Takes care of recasting the features output by an embedder into vectors of posterior probabilities by means of
a multiclass SVM.
|
||||
"""
|
||||
def __init__(self, embedder, l2=True, n_jobs=-1):
|
||||
"""
|
||||
Init the class.
|
||||
:param embedder: ViewGen, a view generator that does not natively output posterior probabilities.
|
||||
:param l2: bool, whether to apply or not L2 normalization to the projection
|
||||
:param n_jobs: int, number of concurrent workers.
|
||||
"""
|
||||
self.embedder = embedder
|
||||
self.l2 = l2
|
||||
self.n_jobs = n_jobs
|
||||
self.prob_classifier = MetaClassifier(
|
||||
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
|
||||
|
||||
def fit(self, lX, ly):
|
||||
lZ = self.embedder.fit_transform(lX, ly)
|
||||
self.prob_classifier.fit(lZ, ly)
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
lP = self.predict_proba(lX)
|
||||
lP = _normalize(lP, self.l2)
|
||||
return lP
|
||||
|
||||
def fit_transform(self, lX, ly):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def predict(self, lX):
|
||||
lZ = self.embedder.transform(lX)
|
||||
return self.prob_classifier.predict(lZ)
|
||||
|
||||
def predict_proba(self, lX):
|
||||
lZ = self.embedder.transform(lX)
|
||||
return self.prob_classifier.predict_proba(lZ)
|
||||
|
||||
|
||||
class Funnelling:
|
||||
"""
|
||||
Funnelling Architecture. It is composed of two tiers. The first tier is a set of heterogeneous document embedders.
The second tier (i.e., the meta-classifier) performs the classification on the common latent space computed by
the first-tier learners.
|
||||
"""
|
||||
def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1):
|
||||
self.first_tier = first_tier
|
||||
self.meta = meta_classifier
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
print('## Fitting first-tier learners!')
|
||||
lZ = self.first_tier.fit_transform(lX, ly)
|
||||
print('## Fitting meta-learner!')
|
||||
self.meta.fit(lZ, ly)
|
||||
|
||||
def predict(self, lX):
|
||||
lZ = self.first_tier.transform(lX)
|
||||
ly = self.meta.predict(lZ)
|
||||
return ly
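
def _funnelling_usage_sketch(lX_tr, ly_tr, lX_te):
    # Minimal usage sketch (mirrors refactor/main.py below): a single posterior-probability
    # view generator feeding an SVM meta-classifier. lX_*/ly_* are dicts indexed by language
    # code, as produced by MultilingualDataset.training()/test().
    embedders = DocEmbedderList([VanillaFunGen(base_learner=get_learner(calibrate=True))])
    meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'))
    gfun = Funnelling(first_tier=embedders, meta_classifier=meta)
    gfun.fit(lX_tr, ly_tr)
    return gfun.predict(lX_te)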
|
refactor/main.py
|
@ -1,167 +0,0 @@
|
|||
from argparse import ArgumentParser
|
||||
|
||||
from data.dataset_builder import MultilingualDataset
|
||||
from funnelling import *
|
||||
from util.common import MultilingualIndex, get_params, get_method_name
|
||||
from util.evaluation import evaluate
|
||||
from util.results_csv import CSVlog
|
||||
from view_generators import *
|
||||
|
||||
|
||||
def main(args):
|
||||
assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \
|
||||
'empty set of document embeddings is not allowed!'
|
||||
|
||||
print('Running generalized funnelling...')
|
||||
|
||||
data = MultilingualDataset.load(args.dataset)
|
||||
data.set_view(languages=['it', 'fr'])
|
||||
data.show_dimensions()
|
||||
lX, ly = data.training()
|
||||
lXte, lyte = data.test()
|
||||
|
||||
# Init multilingualIndex - mandatory when deploying Neural View Generators...
|
||||
if args.gru_embedder or args.bert_embedder:
|
||||
multilingualIndex = MultilingualIndex()
|
||||
lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir)
|
||||
multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())
|
||||
|
||||
# Init ViewGenerators and append them to embedder_list
|
||||
embedder_list = []
|
||||
if args.post_embedder:
|
||||
posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
|
||||
embedder_list.append(posteriorEmbedder)
|
||||
|
||||
if args.muse_embedder:
|
||||
museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs)
|
||||
embedder_list.append(museEmbedder)
|
||||
|
||||
if args.wce_embedder:
|
||||
wceEmbedder = WordClassGen(n_jobs=args.n_jobs)
|
||||
embedder_list.append(wceEmbedder)
|
||||
|
||||
if args.gru_embedder:
|
||||
rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
|
||||
nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs)
|
||||
embedder_list.append(rnnEmbedder)
|
||||
|
||||
if args.bert_embedder:
|
||||
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs)
|
||||
embedder_list.append(bertEmbedder)
|
||||
|
||||
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
|
||||
docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True)
|
||||
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
|
||||
meta_parameters=get_params(optimc=args.optimc))
|
||||
|
||||
# Init Funnelling Architecture
|
||||
gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta)
|
||||
|
||||
# Training ---------------------------------------
|
||||
print('\n[Training Generalized Funnelling]')
|
||||
time_init = time()
|
||||
time_tr = time()
|
||||
gfun.fit(lX, ly)
|
||||
time_tr = round(time() - time_tr, 3)
|
||||
print(f'Training completed in {time_tr} seconds!')
|
||||
|
||||
# Testing ----------------------------------------
|
||||
print('\n[Testing Generalized Funnelling]')
|
||||
time_te = time()
|
||||
ly_ = gfun.predict(lXte)
|
||||
l_eval = evaluate(ly_true=lyte, ly_pred=ly_)
|
||||
time_te = round(time() - time_te, 3)
|
||||
print(f'Testing completed in {time_te} seconds!')
|
||||
|
||||
# Logging ---------------------------------------
|
||||
print('\n[Results]')
|
||||
results = CSVlog(args.csv_dir)
|
||||
metrics = []
|
||||
for lang in lXte.keys():
|
||||
macrof1, microf1, macrok, microk = l_eval[lang]
|
||||
metrics.append([macrof1, microf1, macrok, microk])
|
||||
print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}')
|
||||
if results is not None:
|
||||
_id, _dataset = get_method_name(args)
|
||||
results.add_row(method='gfun',
|
||||
setting=_id,
|
||||
optimc=args.optimc,
|
||||
sif='True',
|
||||
zscore='True',
|
||||
l2='True',
|
||||
dataset=_dataset,
|
||||
time_tr=time_tr,
|
||||
time_te=time_te,
|
||||
lang=lang,
|
||||
macrof1=macrof1,
|
||||
microf1=microf1,
|
||||
macrok=macrok,
|
||||
microk=microk,
|
||||
notes='')
|
||||
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
|
||||
|
||||
overall_time = round(time() - time_init, 3)
|
||||
exit(f'\nExecuted in: {overall_time} seconds!')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani')
|
||||
|
||||
parser.add_argument('dataset', help='Path to the dataset')
|
||||
|
||||
parser.add_argument('-o', '--output', dest='csv_dir',
|
||||
help='Result file (default ../csv_log/gfun_results.csv)', type=str,
|
||||
default='csv_logs/gfun/gfun_results.csv')
|
||||
|
||||
parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
|
||||
help='deploy posterior probabilities embedder to compute document embeddings',
|
||||
default=False)
|
||||
|
||||
parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true',
|
||||
help='deploy (supervised) Word-Class embedder to the compute document embeddings',
|
||||
default=False)
|
||||
|
||||
parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true',
|
||||
help='deploy (pretrained) MUSE embedder to compute document embeddings',
|
||||
default=False)
|
||||
|
||||
parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true',
|
||||
help='deploy multilingual Bert to compute document embeddings',
|
||||
default=False)
|
||||
|
||||
parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true',
|
||||
help='deploy a GRU in order to compute document embeddings',
|
||||
default=False)
|
||||
|
||||
parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true',
|
||||
help='Optimize SVMs C hyperparameter',
|
||||
default=False)
|
||||
|
||||
parser.add_argument('-n', '--nepochs', dest='nepochs', type=int,
|
||||
help='Number of max epochs to train Recurrent embedder (i.e., -g)')
|
||||
|
||||
parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int,
|
||||
help='Number of parallel jobs (default is -1, all)',
|
||||
default=-1)
|
||||
|
||||
parser.add_argument('--muse_dir', dest='muse_dir', type=str,
|
||||
help='Path to the MUSE polylingual word embeddings (default ../embeddings)',
|
||||
default='../embeddings')
|
||||
|
||||
parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
|
||||
help='Deploy WCE embedding as embedding layer of the GRU View Generator',
|
||||
default=False)
|
||||
|
||||
parser.add_argument('--gru_dir', dest='gru_dir', type=str,
|
||||
help='Set the path to a pretrained GRU model (i.e., -g view generator)',
|
||||
default=None)
|
||||
|
||||
parser.add_argument('--bert_dir', dest='bert_dir', type=str,
|
||||
help='Set the path to a pretrained mBERT model (i.e., -b view generator)',
|
||||
default=None)
|
||||
|
||||
parser.add_argument('--gpus', help='specifies how many GPUs to use per node',
|
||||
default=None)
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
|
@ -1,51 +0,0 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
def init_embeddings(pretrained, vocab_size, learnable_length):
|
||||
"""
|
||||
Compute the embedding matrices (a frozen pretrained one and/or a learnable one).
:param pretrained: torch.Tensor of shape (vocab_size, pretrained_dim) with pretrained word embeddings, or None
:param vocab_size: int, size of the vocabulary
:param learnable_length: int, length of the learnable (trainable) embedding assigned to each word
:return: tuple (pretrained_embeddings, learnable_embeddings, embedding_length)
|
||||
"""
|
||||
pretrained_embeddings = None
|
||||
pretrained_length = 0
|
||||
if pretrained is not None:
|
||||
pretrained_length = pretrained.shape[1]
|
||||
assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size'
|
||||
pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length)
|
||||
# requires_grad=False sets the embedding layer as NOT trainable
|
||||
pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False)
|
||||
|
||||
learnable_embeddings = None
|
||||
if learnable_length > 0:
|
||||
learnable_embeddings = nn.Embedding(vocab_size, learnable_length)
|
||||
|
||||
embedding_length = learnable_length + pretrained_length
|
||||
assert embedding_length > 0, '0-size embeddings'
|
||||
return pretrained_embeddings, learnable_embeddings, embedding_length
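
def _init_embeddings_example():
    # Minimal sketch: a 10-word vocabulary with 300-dimensional frozen pretrained vectors
    # plus a 50-dimensional learnable embedding per word; embed() below concatenates both.
    pretrained = torch.randn(10, 300)
    pretrained_emb, learnable_emb, dim = init_embeddings(pretrained, vocab_size=10, learnable_length=50)
    assert dim == 300 + 50
    return pretrained_emb, learnable_emb, dim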
|
||||
|
||||
|
||||
def embed(model, input, lang):
|
||||
input_list = []
|
||||
if model.lpretrained_embeddings[lang]:
|
||||
input_list.append(model.lpretrained_embeddings[lang](input))
|
||||
if model.llearnable_embeddings[lang]:
|
||||
input_list.append(model.llearnable_embeddings[lang](input))
|
||||
return torch.cat(tensors=input_list, dim=2)
|
||||
|
||||
|
||||
def embedding_dropout(input, drop_range, p_drop=0.5, training=True):
|
||||
if p_drop > 0 and training and drop_range is not None:
|
||||
p = p_drop
|
||||
drop_from, drop_to = drop_range
|
||||
m = drop_to - drop_from #length of the supervised embedding
|
||||
l = input.shape[2] #total embedding length
|
||||
corr = (1 - p)
|
||||
input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p)
|
||||
input /= (1 - (p * m / l))
|
||||
|
||||
return input
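
def _embedding_dropout_example():
    # Minimal sketch: a batch of 2 sequences of 5 tokens with 350-dimensional embeddings,
    # where only the last 50 dimensions (e.g., the supervised/WCE block) are dropped with
    # p=0.5 at training time, and the whole embedding is rescaled accordingly.
    x = torch.randn(2, 5, 350)
    return embedding_dropout(x, drop_range=(300, 350), p_drop=0.5, training=True)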
|
|
@ -1,224 +0,0 @@
|
|||
import time
|
||||
|
||||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
from scipy.sparse import issparse
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.multiclass import OneVsRestClassifier
|
||||
from sklearn.svm import SVC
|
||||
|
||||
from util.standardizer import StandardizeTransformer
|
||||
|
||||
|
||||
def get_learner(calibrate=False, kernel='linear', C=1):
|
||||
"""
|
||||
Instantiate a scikit-learn Support Vector Classifier.
|
||||
:param calibrate: boolean, whether to return posterior probabilities or not
|
||||
:param kernel: string,kernel to be applied to the SVC
|
||||
:param C: float, regularization parameter
|
||||
:return: Support Vector Classifier
|
||||
"""
|
||||
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
|
||||
|
||||
|
||||
def _sort_if_sparse(X):
|
||||
if issparse(X) and not X.has_sorted_indices:
|
||||
X.sort_indices()
|
||||
|
||||
|
||||
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
|
||||
if n_jobs == 1:
|
||||
return {lang: transformer(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs)
|
||||
return {lang: transformations[i] for i, lang in enumerate(langs)}
|
||||
|
||||
|
||||
class TrivialRejector:
|
||||
def fit(self, X, y):
|
||||
self.cats = y.shape[1]
|
||||
return self
|
||||
|
||||
def decision_function(self, X): return np.zeros((X.shape[0], self.cats))
|
||||
|
||||
def predict(self, X): return np.zeros((X.shape[0], self.cats))
|
||||
|
||||
def predict_proba(self, X): return np.zeros((X.shape[0], self.cats))
|
||||
|
||||
def best_params(self): return {}
|
||||
|
||||
|
||||
class NaivePolylingualClassifier:
|
||||
"""
|
||||
A set of independent MonolingualClassifiers, one per language.
|
||||
"""
|
||||
|
||||
def __init__(self, base_learner, parameters=None, n_jobs=-1):
|
||||
self.base_learner = base_learner
|
||||
self.parameters = parameters
|
||||
self.model = None
|
||||
self.n_jobs = n_jobs
|
||||
|
||||
def fit(self, lX, ly):
|
||||
"""
|
||||
trains the independent monolingual classifiers
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:param ly: a dictionary {language_label: y np.array}
|
||||
:return: self
|
||||
"""
|
||||
tinit = time.time()
|
||||
assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit'
|
||||
langs = list(lX.keys())
|
||||
for lang in langs:
|
||||
_sort_if_sparse(lX[lang])
|
||||
|
||||
models = Parallel(n_jobs=self.n_jobs)\
|
||||
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for
|
||||
lang in langs)
|
||||
|
||||
self.model = {lang: models[i] for i, lang in enumerate(langs)}
|
||||
self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs}
|
||||
self.time = time.time() - tinit
|
||||
return self
|
||||
|
||||
def decision_function(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of classification scores for each class
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def predict_proba(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of probabilities that each document belongs to each class
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict_proba'
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
|
||||
delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def predict(self, lX):
|
||||
"""
|
||||
:param lX: a dictionary {language_label: X csr-matrix}
|
||||
:return: a dictionary of predictions
|
||||
"""
|
||||
assert self.model is not None, 'predict called before fit'
|
||||
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
|
||||
if self.n_jobs == 1:
|
||||
return {lang: self.model[lang].predict(lX[lang]) for lang in lX.keys()}
|
||||
else:
|
||||
langs = list(lX.keys())
|
||||
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
|
||||
return {lang: scores[i] for i, lang in enumerate(langs)}
|
||||
|
||||
def best_params(self):
|
||||
return {lang: model.best_params() for lang, model in self.model.items()}
|
||||
|
||||
|
||||
class MonolingualClassifier:

    def __init__(self, base_learner, parameters=None, n_jobs=-1):
        self.learner = base_learner
        self.parameters = parameters
        self.model = None
        self.n_jobs = n_jobs
        self.best_params_ = None

    def fit(self, X, y):
        if X.shape[0] == 0:
            print('Warning: X has 0 elements, a trivial rejector will be created')
            self.model = TrivialRejector().fit(X, y)
            self.empty_categories = np.arange(y.shape[1])
            return self

        tinit = time.time()
        _sort_if_sparse(X)
        self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten()

        # multi-label format
        if len(y.shape) == 2:
            if self.parameters is not None:
                self.parameters = [{'estimator__' + key: params[key] for key in params.keys()}
                                   for params in self.parameters]
            self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
        else:
            self.model = self.learner
            raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in '
                                      'the labels across languages')

        # parameter optimization?
        if self.parameters:
            print('debug: optimizing parameters:', self.parameters)
            self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
                                      error_score=0, verbose=10)

        print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}')
        self.model.fit(X, y)
        if isinstance(self.model, GridSearchCV):
            self.best_params_ = self.model.best_params_
            print('best parameters: ', self.best_params_)
        self.time = time.time() - tinit
        return self

    def decision_function(self, X):
        assert self.model is not None, 'decision_function called before fit'
        _sort_if_sparse(X)
        return self.model.decision_function(X)

    def predict_proba(self, X):
        assert self.model is not None, 'predict_proba called before fit'
        assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model'
        _sort_if_sparse(X)
        return self.model.predict_proba(X)

    def predict(self, X):
        assert self.model is not None, 'predict called before fit'
        _sort_if_sparse(X)
        return self.model.predict(X)

    def best_params(self):
        return self.best_params_

class MetaClassifier:

    def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
        self.n_jobs = n_jobs
        self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
        self.standardize_range = standardize_range

    def fit(self, lZ, ly):
        tinit = time.time()
        Z, y = self.stack(lZ, ly)

        self.standardizer = StandardizeTransformer(range=self.standardize_range)
        Z = self.standardizer.fit_transform(Z)

        print('fitting the Z-space of shape={}'.format(Z.shape))
        self.model.fit(Z, y)
        self.time = time.time() - tinit

    def stack(self, lZ, ly=None):
        langs = list(lZ.keys())
        Z = np.vstack([lZ[lang] for lang in langs])
        if ly is not None:
            y = np.vstack([ly[lang] for lang in langs])
            return Z, y
        else:
            return Z

    def predict(self, lZ):
        lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
        return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)

    def predict_proba(self, lZ):
        lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
        return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)
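
# A minimal sketch (not part of the original module) illustrating what MetaClassifier.stack()
# does with the language-indexed Z-spaces produced by the first-tier classifiers. The toy shapes
# and language codes below are illustrative assumptions only.
def _demo_meta_stacking():
    import numpy as np
    n_classes = 3
    lZ = {'en': np.random.rand(5, n_classes), 'it': np.random.rand(4, n_classes)}  # posterior matrices
    ly = {'en': np.random.randint(0, 2, (5, n_classes)), 'it': np.random.randint(0, 2, (4, n_classes))}
    langs = list(lZ.keys())
    Z = np.vstack([lZ[lang] for lang in langs])  # (9, 3): documents of all languages share one Z-space
    y = np.vstack([ly[lang] for lang in langs])  # (9, 3): labels stacked in the same order
    return Z, y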
|
|
@ -1,113 +0,0 @@
|
|||
#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
|
||||
from torch.autograd import Variable
|
||||
|
||||
from models.helpers import *
|
||||
|
||||
|
||||
class RNNMultilingualClassifier(nn.Module):
|
||||
|
||||
def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None,
|
||||
drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False,
|
||||
bert_embeddings=False):
|
||||
|
||||
super(RNNMultilingualClassifier, self).__init__()
|
||||
self.output_size = output_size
|
||||
self.hidden_size = hidden_size
|
||||
self.drop_embedding_range = drop_embedding_range
|
||||
self.drop_embedding_prop = drop_embedding_prop
|
||||
self.post_probabilities = post_probabilities
|
||||
self.bert_embeddings = bert_embeddings
|
||||
assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
|
||||
|
||||
self.lpretrained_embeddings = nn.ModuleDict()
|
||||
self.llearnable_embeddings = nn.ModuleDict()
|
||||
self.embedding_length = None
|
||||
self.langs = sorted(lvocab_size.keys())
|
||||
self.only_post = only_post
|
||||
|
||||
self.n_layers = 1
|
||||
self.n_directions = 1
|
||||
|
||||
self.dropout = nn.Dropout(0.6)
|
||||
|
||||
lstm_out = 256
|
||||
ff1 = 512
|
||||
ff2 = 256
|
||||
|
||||
lpretrained_embeddings = {}
|
||||
llearnable_embeddings = {}
|
||||
if not only_post:
|
||||
for l in self.langs:
|
||||
pretrained = lpretrained[l] if lpretrained else None
|
||||
pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
|
||||
pretrained, lvocab_size[l], learnable_length
|
||||
)
|
||||
lpretrained_embeddings[l] = pretrained_embeddings
|
||||
llearnable_embeddings[l] = learnable_embeddings
|
||||
self.embedding_length = embedding_length
|
||||
|
||||
# self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
|
||||
self.rnn = nn.GRU(self.embedding_length, hidden_size)
|
||||
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
|
||||
self.lpretrained_embeddings.update(lpretrained_embeddings)
|
||||
self.llearnable_embeddings.update(llearnable_embeddings)
|
||||
|
||||
self.linear1 = nn.Linear(lstm_out, ff1)
|
||||
self.linear2 = nn.Linear(ff1, ff2)
|
||||
|
||||
if only_post:
|
||||
self.label = nn.Linear(output_size, output_size)
|
||||
elif post_probabilities and not bert_embeddings:
|
||||
self.label = nn.Linear(ff2 + output_size, output_size)
|
||||
elif bert_embeddings and not post_probabilities:
|
||||
self.label = nn.Linear(ff2 + 768, output_size)
|
||||
elif post_probabilities and bert_embeddings:
|
||||
self.label = nn.Linear(ff2 + output_size + 768, output_size)
|
||||
else:
|
||||
self.label = nn.Linear(ff2, output_size)
|
||||
|
||||
def forward(self, input, post, bert_embed, lang):
|
||||
if self.only_post:
|
||||
doc_embedding = post
|
||||
else:
|
||||
doc_embedding = self.transform(input, lang)
|
||||
if self.post_probabilities:
|
||||
doc_embedding = torch.cat([doc_embedding, post], dim=1)
|
||||
if self.bert_embeddings:
|
||||
doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1)
|
||||
|
||||
logits = self.label(doc_embedding)
|
||||
return logits
|
||||
|
||||
def transform(self, input, lang):
|
||||
batch_size = input.shape[0]
|
||||
input = embed(self, input, lang)
|
||||
input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
|
||||
training=self.training)
|
||||
input = input.permute(1, 0, 2)
|
||||
h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
|
||||
# c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
|
||||
# output, (_, _) = self.lstm(input, (h_0, c_0))
|
||||
output, _ = self.rnn(input, h_0)
|
||||
output = output[-1, :, :]
|
||||
output = F.relu(self.linear0(output))
|
||||
output = self.dropout(F.relu(self.linear1(output)))
|
||||
output = self.dropout(F.relu(self.linear2(output)))
|
||||
return output
|
||||
|
||||
def finetune_pretrained(self):
|
||||
for l in self.langs:
|
||||
# unfreeze the pretrained embeddings: requires_grad must be set on the weight Parameter
self.lpretrained_embeddings[l].weight.requires_grad = True
|
||||
|
||||
def get_embeddings(self, input, lang):
|
||||
batch_size = input.shape[0]
|
||||
input = embed(self, input, lang)
|
||||
input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
|
||||
training=self.training)
|
||||
input = input.permute(1, 0, 2)
|
||||
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda())
|
||||
output, _ = self.rnn(input, h_0)
|
||||
output = output[-1, :, :]
|
||||
return output.cpu().detach().numpy()
|
||||
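
# A minimal sketch (not part of the original file): how the input of the classification layer is
# assembled in forward() when posterior probabilities and mBERT [CLS] embeddings are both enabled.
# All sizes below are illustrative assumptions.
def _demo_label_input_composition():
    import torch
    batch, ff2, n_classes, bert_dim = 4, 256, 10, 768
    doc_embedding = torch.rand(batch, ff2)        # output of the GRU + feed-forward stack
    post = torch.rand(batch, n_classes)           # first-tier posterior probabilities
    bert_embed = torch.rand(batch, bert_dim)      # mBERT document embeddings
    label_input = torch.cat([doc_embedding, post, bert_embed], dim=1)  # (4, 256 + 10 + 768)
    # this matches the nn.Linear(ff2 + output_size + 768, output_size) branch of the constructor
    return label_input.shape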
|
|
@ -1,183 +0,0 @@
|
|||
import pytorch_lightning as pl
|
||||
import torch
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from transformers import BertForSequenceClassification, AdamW
|
||||
|
||||
from util.common import define_pad_length, pad
|
||||
from util.pl_metrics import CustomF1, CustomK
|
||||
|
||||
|
||||
class BertModel(pl.LightningModule):
|
||||
|
||||
def __init__(self, output_size, stored_path, gpus=None):
|
||||
"""
|
||||
Init Bert model.
|
||||
:param output_size:
|
||||
:param stored_path:
|
||||
:param gpus:
|
||||
"""
|
||||
super().__init__()
|
||||
self.loss = torch.nn.BCEWithLogitsLoss()
|
||||
self.gpus = gpus
|
||||
self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
|
||||
# Language specific metrics to compute metrics at epoch level
|
||||
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
|
||||
|
||||
if stored_path:
|
||||
self.bert = BertForSequenceClassification.from_pretrained(stored_path,
|
||||
num_labels=output_size,
|
||||
output_hidden_states=True)
|
||||
else:
|
||||
self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased',
|
||||
num_labels=output_size,
|
||||
output_hidden_states=True)
|
||||
self.save_hyperparameters()
|
||||
|
||||
def forward(self, X):
|
||||
logits = self.bert(X)
|
||||
return logits
|
||||
|
||||
def training_step(self, train_batch, batch_idx):
|
||||
X, y, _, batch_langs = train_batch
|
||||
X = torch.cat(X).view([X[0].shape[0], len(X)])
|
||||
y = y.type(torch.FloatTensor)
|
||||
y = y.to('cuda' if self.gpus else 'cpu')
|
||||
logits, _ = self.forward(X)
|
||||
loss = self.loss(logits, y)
|
||||
# Squashing logits through Sigmoid in order to get confidence score
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, y)
|
||||
macroF1 = self.macroF1(predictions, y)
|
||||
microK = self.microK(predictions, y)
|
||||
macroK = self.macroK(predictions, y)
|
||||
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
lX, ly = self._reconstruct_dict(predictions, y, batch_langs)
|
||||
return {'loss': loss, 'pred': lX, 'target': ly}
|
||||
|
||||
def training_epoch_end(self, outputs):
|
||||
langs = []
|
||||
for output in outputs:
|
||||
langs.extend(list(output['pred'].keys()))
|
||||
langs = set(langs)
|
||||
# outputs is a list of n dicts (one per training step), each holding the batch predictions and targets.
# here we save epoch-level metric values and compute them specifically for each language
|
||||
res_macroF1 = {lang: [] for lang in langs}
|
||||
res_microF1 = {lang: [] for lang in langs}
|
||||
res_macroK = {lang: [] for lang in langs}
|
||||
res_microK = {lang: [] for lang in langs}
|
||||
for output in outputs:
|
||||
lX, ly = output['pred'], output['target']
|
||||
for lang in lX.keys():
|
||||
X, y = lX[lang], ly[lang]
|
||||
lang_macroF1 = self.lang_macroF1(X, y)
|
||||
lang_microF1 = self.lang_microF1(X, y)
|
||||
lang_macroK = self.lang_macroK(X, y)
|
||||
lang_microK = self.lang_microK(X, y)
|
||||
|
||||
res_macroF1[lang].append(lang_macroF1)
|
||||
res_microF1[lang].append(lang_microF1)
|
||||
res_macroK[lang].append(lang_macroK)
|
||||
res_microK[lang].append(lang_microK)
|
||||
for lang in langs:
|
||||
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
|
||||
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
|
||||
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
|
||||
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
|
||||
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
|
||||
|
||||
def validation_step(self, val_batch, batch_idx):
|
||||
X, y, _, batch_langs = val_batch
|
||||
X = torch.cat(X).view([X[0].shape[0], len(X)])
|
||||
y = y.type(torch.FloatTensor)
|
||||
y = y.to('cuda' if self.gpus else 'cpu')
|
||||
logits, _ = self.forward(X)
|
||||
loss = self.loss(logits, y)
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, y)
|
||||
macroF1 = self.macroF1(predictions, y)
|
||||
microK = self.microK(predictions, y)
|
||||
macroK = self.macroK(predictions, y)
|
||||
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
return {'loss': loss}
|
||||
|
||||
def test_step(self, test_batch, batch_idx):
|
||||
X, y, _, batch_langs = test_batch
|
||||
X = torch.cat(X).view([X[0].shape[0], len(X)])
|
||||
y = y.type(torch.FloatTensor)
|
||||
y = y.to('cuda' if self.gpus else 'cpu')
|
||||
logits, _ = self.forward(X)
|
||||
loss = self.loss(logits, y)
|
||||
# Squashing logits through Sigmoid in order to get confidence score
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, y)
|
||||
macroF1 = self.macroF1(predictions, y)
|
||||
microK = self.microK(predictions, y)
|
||||
macroK = self.macroK(predictions, y)
|
||||
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
return
|
||||
|
||||
def configure_optimizers(self, lr=3e-5, weight_decay=0.01):
|
||||
no_decay = ['bias', 'LayerNorm.weight']
|
||||
optimizer_grouped_parameters = [
|
||||
{'params': [p for n, p in self.bert.named_parameters()
|
||||
if not any(nd in n for nd in no_decay)],
|
||||
'weight_decay': weight_decay},
|
||||
{'params': [p for n, p in self.bert.named_parameters()
            if any(nd in n for nd in no_decay)],
 'weight_decay': 0.0}  # parameters in no_decay (bias, LayerNorm) are excluded from weight decay
|
||||
]
|
||||
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
|
||||
scheduler = StepLR(optimizer, step_size=25, gamma=0.1)
|
||||
return [optimizer], [scheduler]
|
||||
|
||||
def encode(self, lX, batch_size=64):
|
||||
with torch.no_grad():
|
||||
l_embed = {lang: [] for lang in lX.keys()}
|
||||
for lang in sorted(lX.keys()):
|
||||
for i in range(0, len(lX[lang]), batch_size):
|
||||
if i + batch_size > len(lX[lang]):
|
||||
batch = lX[lang][i:len(lX[lang])]
|
||||
else:
|
||||
batch = lX[lang][i:i + batch_size]
|
||||
max_pad_len = define_pad_length(batch)
|
||||
batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len)
|
||||
batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
|
||||
_, output = self.forward(batch)
|
||||
doc_embeds = output[-1][:, 0, :]
|
||||
l_embed[lang].append(doc_embeds.cpu())
|
||||
for k, v in l_embed.items():
|
||||
l_embed[k] = torch.cat(v, dim=0).numpy()
|
||||
return l_embed
|
||||
|
||||
@staticmethod
|
||||
def _reconstruct_dict(predictions, y, batch_langs):
|
||||
reconstructed_x = {lang: [] for lang in set(batch_langs)}
|
||||
reconstructed_y = {lang: [] for lang in set(batch_langs)}
|
||||
for i, pred in enumerate(predictions):
|
||||
reconstructed_x[batch_langs[i]].append(pred)
|
||||
reconstructed_y[batch_langs[i]].append(y[i])
|
||||
for k, v in reconstructed_x.items():
|
||||
reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1])
|
||||
for k, v in reconstructed_y.items():
|
||||
reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1])
|
||||
return reconstructed_x, reconstructed_y
|
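
# A minimal sketch (not part of the original file) of the chunking + padding pattern used by
# encode(): each language's documents are processed in fixed-size chunks, and every chunk is
# left-padded/truncated with the same policy as util.common.define_pad_length and pad.
# The token ids and pad index below are illustrative assumptions.
def _demo_encode_batching(batch_size=2, pad_index=0):
    import numpy as np
    docs = [[5, 3, 8], [2], [7, 7, 7, 7], [1, 4]]          # token-id lists of uneven length
    batches = []
    for i in range(0, len(docs), batch_size):
        batch = docs[i:i + batch_size]                      # slicing past the end yields a shorter last chunk
        lengths = [len(d) for d in batch]
        pad_len = int(np.mean(lengths) + np.std(lengths))   # same policy as define_pad_length
        batch = [[pad_index] * (pad_len - len(d)) + d[:pad_len] for d in batch]  # left pad / truncate
        batches.append(batch)
    return batches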
|
@ -1,266 +0,0 @@
|
|||
# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html
|
||||
import pytorch_lightning as pl
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from torch.autograd import Variable
|
||||
from torch.optim.lr_scheduler import StepLR
|
||||
from transformers import AdamW
|
||||
|
||||
from models.helpers import init_embeddings
|
||||
from util.common import define_pad_length, pad
|
||||
from util.pl_metrics import CustomF1, CustomK
|
||||
|
||||
|
||||
class RecurrentModel(pl.LightningModule):
|
||||
def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length,
|
||||
drop_embedding_range, drop_embedding_prop, gpus=None):
|
||||
"""
|
||||
Init RNN model.
|
||||
:param lPretrained:
|
||||
:param langs:
|
||||
:param output_size:
|
||||
:param hidden_size:
|
||||
:param lVocab_size:
|
||||
:param learnable_length:
|
||||
:param drop_embedding_range:
|
||||
:param drop_embedding_prop:
|
||||
:param gpus:
|
||||
"""
|
||||
super().__init__()
|
||||
self.gpus = gpus
|
||||
self.langs = langs
|
||||
self.lVocab_size = lVocab_size
|
||||
self.learnable_length = learnable_length
|
||||
self.output_size = output_size
|
||||
self.hidden_size = hidden_size
|
||||
self.drop_embedding_range = drop_embedding_range
|
||||
self.drop_embedding_prop = drop_embedding_prop
|
||||
self.loss = torch.nn.BCEWithLogitsLoss()
|
||||
|
||||
self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
|
||||
# Language specific metrics to compute metrics at epoch level
|
||||
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
|
||||
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
|
||||
self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
|
||||
|
||||
self.lPretrained_embeddings = nn.ModuleDict()
|
||||
self.lLearnable_embeddings = nn.ModuleDict()
|
||||
|
||||
self.n_layers = 1
|
||||
self.n_directions = 1
|
||||
self.dropout = nn.Dropout(0.6)
|
||||
|
||||
lstm_out = 256
|
||||
ff1 = 512
|
||||
ff2 = 256
|
||||
|
||||
lpretrained_embeddings = {}
|
||||
llearnable_embeddings = {}
|
||||
|
||||
for lang in self.langs:
|
||||
pretrained = lPretrained[lang] if lPretrained else None
|
||||
pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
|
||||
pretrained, self.lVocab_size[lang], self.learnable_length)
|
||||
lpretrained_embeddings[lang] = pretrained_embeddings
|
||||
llearnable_embeddings[lang] = learnable_embeddings
|
||||
self.embedding_length = embedding_length
|
||||
|
||||
self.lPretrained_embeddings.update(lpretrained_embeddings)
|
||||
self.lLearnable_embeddings.update(llearnable_embeddings)
|
||||
|
||||
self.rnn = nn.GRU(self.embedding_length, hidden_size)
|
||||
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
|
||||
self.linear1 = nn.Linear(lstm_out, ff1)
|
||||
self.linear2 = nn.Linear(ff1, ff2)
|
||||
self.label = nn.Linear(ff2, self.output_size)
|
||||
|
||||
# TODO: lPretrained is set to None before save_hyperparameters(); keeping the full pretrained
# matrices among the hyperparameters would (presumably) be stored in the checkpoint too, making
# the first validation step (where the checkpoint is saved) far too slow.
lPretrained = None
|
||||
self.save_hyperparameters()
|
||||
|
||||
def forward(self, lX):
|
||||
l_embed = []
|
||||
for lang in sorted(lX.keys()):
|
||||
doc_embedding = self.transform(lX[lang], lang)
|
||||
l_embed.append(doc_embedding)
|
||||
embed = torch.cat(l_embed, dim=0)
|
||||
logits = self.label(embed)
|
||||
return logits
|
||||
|
||||
def transform(self, X, lang):
|
||||
batch_size = X.shape[0]
|
||||
X = self.embed(X, lang)
|
||||
X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
|
||||
training=self.training)
|
||||
X = X.permute(1, 0, 2)
|
||||
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).to(self.device))
|
||||
output, _ = self.rnn(X, h_0)
|
||||
output = output[-1, :, :]
|
||||
output = F.relu(self.linear0(output))
|
||||
output = self.dropout(F.relu(self.linear1(output)))
|
||||
output = self.dropout(F.relu(self.linear2(output)))
|
||||
return output
|
||||
|
||||
def encode(self, lX, l_pad, batch_size=128):
|
||||
"""
|
||||
Returns encoded data (i.e, RNN hidden state at second feed-forward layer - linear1). Dimensionality is 512.
|
||||
:param lX:
|
||||
:param l_pad:
|
||||
:param batch_size:
|
||||
:return:
|
||||
"""
|
||||
with torch.no_grad():
|
||||
l_embed = {lang: [] for lang in lX.keys()}
|
||||
for lang in sorted(lX.keys()):
|
||||
for i in range(0, len(lX[lang]), batch_size):
|
||||
if i+batch_size > len(lX[lang]):
|
||||
batch = lX[lang][i:len(lX[lang])]
|
||||
else:
|
||||
batch = lX[lang][i:i+batch_size]
|
||||
max_pad_len = define_pad_length(batch)
|
||||
batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len)
|
||||
X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
|
||||
_batch_size = X.shape[0]
|
||||
X = self.embed(X, lang)
|
||||
X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
|
||||
training=self.training)
|
||||
X = X.permute(1, 0, 2)
|
||||
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device))
|
||||
output, _ = self.rnn(X, h_0)
|
||||
output = output[-1, :, :]
|
||||
output = F.relu(self.linear0(output))
|
||||
output = self.dropout(F.relu(self.linear1(output)))
|
||||
l_embed[lang].append(output.cpu())
|
||||
for k, v in l_embed.items():
|
||||
l_embed[k] = torch.cat(v, dim=0).numpy()
|
||||
return l_embed
|
||||
|
||||
def training_step(self, train_batch, batch_idx):
|
||||
lX, ly = train_batch
|
||||
logits = self.forward(lX)
|
||||
_ly = []
|
||||
for lang in sorted(lX.keys()):
|
||||
_ly.append(ly[lang])
|
||||
y = torch.cat(_ly, dim=0)
|
||||
loss = self.loss(logits, y)
|
||||
# Squashing logits through Sigmoid in order to get confidence score
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, y)
|
||||
macroF1 = self.macroF1(predictions, y)
|
||||
microK = self.microK(predictions, y)
|
||||
macroK = self.macroK(predictions, y)
|
||||
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
|
||||
re_lX = self._reconstruct_dict(predictions, ly)
|
||||
return {'loss': loss, 'pred': re_lX, 'target': ly}
|
||||
|
||||
def training_epoch_end(self, outputs):
|
||||
# outputs is a list of n dicts (one per training step), each holding the batch predictions and targets.
# here we save epoch-level metric values and compute them specifically for each language
|
||||
res_macroF1 = {lang: [] for lang in self.langs}
|
||||
res_microF1 = {lang: [] for lang in self.langs}
|
||||
res_macroK = {lang: [] for lang in self.langs}
|
||||
res_microK = {lang: [] for lang in self.langs}
|
||||
for output in outputs:
|
||||
lX, ly = output['pred'], output['target']
|
||||
for lang in lX.keys():
|
||||
X, y = lX[lang], ly[lang]
|
||||
lang_macroF1 = self.lang_macroF1(X, y)
|
||||
lang_microF1 = self.lang_microF1(X, y)
|
||||
lang_macroK = self.lang_macroK(X, y)
|
||||
lang_microK = self.lang_microK(X, y)
|
||||
|
||||
res_macroF1[lang].append(lang_macroF1)
|
||||
res_microF1[lang].append(lang_microF1)
|
||||
res_macroK[lang].append(lang_macroK)
|
||||
res_microK[lang].append(lang_microK)
|
||||
for lang in self.langs:
|
||||
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
|
||||
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
|
||||
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
|
||||
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
|
||||
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
|
||||
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
|
||||
|
||||
def validation_step(self, val_batch, batch_idx):
|
||||
lX, ly = val_batch
|
||||
logits = self.forward(lX)
|
||||
_ly = []
|
||||
for lang in sorted(lX.keys()):
|
||||
_ly.append(ly[lang])
|
||||
ly = torch.cat(_ly, dim=0)
|
||||
loss = self.loss(logits, ly)
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, ly)
|
||||
macroF1 = self.macroF1(predictions, ly)
|
||||
microK = self.microK(predictions, ly)
|
||||
macroK = self.macroK(predictions, ly)
|
||||
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
return {'loss': loss}
|
||||
|
||||
def test_step(self, test_batch, batch_idx):
|
||||
lX, ly = test_batch
|
||||
logits = self.forward(lX)
|
||||
_ly = []
|
||||
for lang in sorted(lX.keys()):
|
||||
_ly.append(ly[lang])
|
||||
ly = torch.cat(_ly, dim=0)
|
||||
predictions = torch.sigmoid(logits) > 0.5
|
||||
microF1 = self.microF1(predictions, ly)
|
||||
macroF1 = self.macroF1(predictions, ly)
|
||||
microK = self.microK(predictions, ly)
|
||||
macroK = self.macroK(predictions, ly)
|
||||
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
|
||||
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
|
||||
return
|
||||
|
||||
def embed(self, X, lang):
|
||||
input_list = []
|
||||
if self.lPretrained_embeddings[lang]:
|
||||
input_list.append(self.lPretrained_embeddings[lang](X))
|
||||
if self.lLearnable_embeddings[lang]:
|
||||
input_list.append(self.lLearnable_embeddings[lang](X))
|
||||
return torch.cat(tensors=input_list, dim=2)
|
||||
|
||||
def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True):
|
||||
if p_drop > 0 and training and drop_range is not None:
|
||||
p = p_drop
|
||||
drop_from, drop_to = drop_range
|
||||
m = drop_to - drop_from # length of the supervised embedding
|
||||
l = X.shape[2] # total embedding length
|
||||
corr = (1 - p)
|
||||
X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p)
|
||||
X /= (1 - (p * m / l))
|
||||
return X
|
||||
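
# A minimal sketch (not part of the original file) of what embedding_dropout() computes: dropout is
# applied only to the supervised (WCE) slice [drop_from, drop_to), the inverted-dropout scaling is
# undone by the (1 - p) factor, and the whole vector is then rescaled by 1 / (1 - p*m/l) to
# compensate, in expectation, for the zeroed fraction of the total embedding length.
# The dimensions below are illustrative assumptions.
def _demo_embedding_dropout_rescaling():
    import torch
    import torch.nn.functional as F
    p, drop_from, drop_to, l = 0.5, 250, 300, 300      # WCE slice of width m=50 in an l=300 embedding
    X = torch.ones(1000, 1, l)                         # (batch, seq, embedding_length), all ones
    X[:, :, drop_from:drop_to] = (1 - p) * F.dropout(X[:, :, drop_from:drop_to], p=p)
    X /= (1 - p * (drop_to - drop_from) / l)
    # on average each embedding keeps roughly its original total mass of 300
    return X.sum(dim=2).mean().item()                  # ≈ 300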
|
||||
def configure_optimizers(self):
|
||||
optimizer = AdamW(self.parameters(), lr=1e-3)
|
||||
scheduler = StepLR(optimizer, step_size=25, gamma=0.5)
|
||||
return [optimizer], [scheduler]
|
||||
|
||||
@staticmethod
|
||||
def _reconstruct_dict(X, ly):
|
||||
reconstructed = {}
|
||||
_start = 0
|
||||
for lang in sorted(ly.keys()):
|
||||
lang_batchsize = len(ly[lang])
|
||||
reconstructed[lang] = X[_start:_start+lang_batchsize]
|
||||
_start += lang_batchsize
|
||||
return reconstructed
|
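
# A minimal sketch (not part of the original file) of _reconstruct_dict(): forward() concatenates
# the per-language batches in sorted-language order, so the stacked predictions can be split back
# per language using the per-language batch sizes. Sizes and language codes are illustrative.
def _demo_reconstruct_dict():
    import torch
    ly = {'en': torch.zeros(3, 10), 'it': torch.zeros(2, 10)}   # 3 English + 2 Italian targets
    X = torch.arange(5).unsqueeze(1).repeat(1, 10)              # stand-in for the stacked predictions
    reconstructed, _start = {}, 0
    for lang in sorted(ly.keys()):
        lang_batchsize = len(ly[lang])
        reconstructed[lang] = X[_start:_start + lang_batchsize]
        _start += lang_batchsize
    return {lang: t.shape for lang, t in reconstructed.items()}  # {'en': (3, 10), 'it': (2, 10)}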
|
@ -1,12 +0,0 @@
|
|||
transformers==2.11.0
|
||||
pandas==0.25.3
|
||||
numpy==1.17.4
|
||||
joblib==0.14.0
|
||||
tqdm==4.50.2
|
||||
pytorch_lightning==1.1.2
|
||||
torch==1.3.1
|
||||
nltk==3.4.5
|
||||
scipy==1.3.3
|
||||
rdflib==4.2.2
|
||||
torchtext==0.4.0
|
||||
scikit_learn==0.24.1
|
|
@ -1,6 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
for i in {0..10..1}
|
||||
do
|
||||
python main.py --gpus 0
|
||||
done
|
|
@ -1,59 +0,0 @@
|
|||
import numpy as np
|
||||
from sklearn.decomposition import TruncatedSVD
|
||||
|
||||
|
||||
def get_weighted_average(We, x, w):
|
||||
"""
|
||||
Compute the weighted average vectors
|
||||
:param We: We[i,:] is the vector for word i
|
||||
:param x: x[i, :] are the indices of the words in sentence i
|
||||
:param w: w[i, :] are the weights for the words in sentence i
|
||||
:return: emb[i, :] are the weighted average vector for sentence i
|
||||
"""
|
||||
n_samples = x.shape[0]
|
||||
emb = np.zeros((n_samples, We.shape[1]))
|
||||
for i in range(n_samples):
|
||||
emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:])
|
||||
return emb
|
||||
|
||||
|
||||
def compute_pc(X,npc=1):
|
||||
"""
|
||||
Compute the principal components.
|
||||
:param X: X[i,:] is a data point
|
||||
:param npc: number of principal components to remove
|
||||
:return: component_[i,:] is the i-th pc
|
||||
"""
|
||||
svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
|
||||
svd.fit(X)
|
||||
return svd.components_
|
||||
|
||||
|
||||
def remove_pc(X, npc=1):
|
||||
"""
|
||||
Remove the projection on the principal components
|
||||
:param X: X[i,:] is a data point
|
||||
:param npc: number of principal components to remove
|
||||
:return: XX[i, :] is the data point after removing its projection
|
||||
"""
|
||||
pc = compute_pc(X, npc)
|
||||
if npc == 1:
|
||||
XX = X - X.dot(pc.transpose()) * pc
|
||||
else:
|
||||
XX = X - X.dot(pc.transpose()).dot(pc)
|
||||
return XX
|
||||
|
||||
|
||||
def SIF_embedding(We, x, w, params):
|
||||
"""
|
||||
Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component
|
||||
:param We: We[i,:] is the vector for word i
|
||||
:param x: x[i, :] are the indices of the words in the i-th sentence
|
||||
:param w: w[i, :] are the weights for the words in the i-th sentence
|
||||
:param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
|
||||
:return: emb, emb[i, :] is the embedding for sentence i
|
||||
"""
|
||||
emb = get_weighted_average(We, x, w)
|
||||
if params.rmpc > 0:
|
||||
emb = remove_pc(emb, params.rmpc)
|
||||
return emb
|
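
# A minimal usage sketch (not part of the original file). Array shapes and the params object are
# illustrative assumptions: 5 words with 4-dimensional vectors, 2 sentences of 3 word positions each.
def _demo_sif_embedding():
    import numpy as np
    from types import SimpleNamespace
    We = np.random.rand(5, 4)                          # We[i, :] is the vector of word i
    x = np.array([[0, 1, 2], [3, 4, 4]])               # word indices per sentence
    w = np.array([[0.5, 0.3, 0.2], [0.6, 0.4, 0.0]])   # SIF weights per word position
    emb = SIF_embedding(We, x, w, SimpleNamespace(rmpc=1))
    return emb.shape                                    # (2, 4)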
|
@ -1,384 +0,0 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import normalize
|
||||
|
||||
from util.embeddings_manager import supervised_embeddings_tfidf
|
||||
|
||||
|
||||
class TfidfVectorizerMultilingual:
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self.kwargs = kwargs
|
||||
|
||||
def fit(self, lX, ly=None):
|
||||
self.langs = sorted(lX.keys())
|
||||
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
|
||||
return self
|
||||
|
||||
def transform(self, lX):
|
||||
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
|
||||
|
||||
def fit_transform(self, lX, ly=None):
|
||||
return self.fit(lX, ly).transform(lX)
|
||||
|
||||
def vocabulary(self, l=None):
|
||||
if l is None:
|
||||
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].vocabulary_
|
||||
|
||||
def get_analyzer(self, l=None):
|
||||
if l is None:
|
||||
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
|
||||
else:
|
||||
return self.vectorizer[l].build_analyzer()
|
||||
|
||||
|
||||
def _normalize(lX, l2=True):
|
||||
return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX
|
||||
|
||||
|
||||
def none_dict(langs):
|
||||
return {l: None for l in langs}
|
||||
|
||||
|
||||
class MultilingualIndex:
|
||||
def __init__(self):
|
||||
"""
|
||||
Class that contains monolingual Indexes
|
||||
"""
|
||||
self.l_index = {}
|
||||
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
|
||||
|
||||
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None):
|
||||
self.langs = sorted(l_devel_raw.keys())
|
||||
self.l_vectorizer.fit(l_devel_raw)
|
||||
l_vocabulary = self.l_vectorizer.vocabulary()
|
||||
l_analyzer = self.l_vectorizer.get_analyzer()
|
||||
if l_pretrained_vocabulary is None:
|
||||
l_pretrained_vocabulary = none_dict(self.langs)
|
||||
|
||||
for lang in self.langs:
|
||||
# Init monolingual Index
|
||||
self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang],
|
||||
lang)
|
||||
# call to index() function of monolingual Index
|
||||
self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang])
|
||||
|
||||
def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
|
||||
for l, index in self.l_index.items():
|
||||
index.train_val_split(val_prop, max_val, seed=seed)
|
||||
|
||||
def embedding_matrices(self, lpretrained, supervised):
|
||||
"""
|
||||
Extract from pretrained embeddings words that are found in the training dataset, then for each language
|
||||
calls the respective monolingual index and build the embedding matrix (if supervised, WCE are concatenated
|
||||
to the unsupervised vectors).
|
||||
:param lpretrained: dict {lang : matrix of word-embeddings }
|
||||
:param supervised: bool, whether to deploy Word-Class Embeddings or not
|
||||
:return: self
|
||||
"""
|
||||
lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
|
||||
lYtr = self.l_train_target() if supervised else none_dict(self.langs)
|
||||
lWordList = self.get_wordlist()
|
||||
lExtracted = lpretrained.extract(lWordList)
|
||||
for lang, index in self.l_index.items():
|
||||
# if supervised concatenate embedding matrices of pretrained unsupervised
|
||||
# and supervised word-class embeddings
|
||||
index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang])
|
||||
self.sup_range = index.wce_range
|
||||
return self
|
||||
|
||||
def get_wordlist(self):
|
||||
wordlist = {}
|
||||
for lang, index in self.l_index.items():
|
||||
wordlist[lang] = index.get_word_list()
|
||||
return wordlist
|
||||
|
||||
def get_raw_lXtr(self):
    lXtr_raw = {k: [] for k in self.langs}
    for lang in self.langs:
        lXtr_raw[lang] = self.l_index[lang].train_raw
    return lXtr_raw
|
||||
|
||||
def get_raw_lXva(self):
|
||||
lXva_raw = {k: [] for k in self.langs}
|
||||
for lang in self.langs:
|
||||
lXva_raw[lang] = self.l_index[lang].val_raw
|
||||
|
||||
return lXva_raw
|
||||
|
||||
def get_raw_lXte(self):
|
||||
lXte_raw = {k: [] for k in self.langs}
|
||||
for lang in self.langs:
|
||||
lXte_raw[lang] = self.l_index[lang].test_raw
|
||||
|
||||
return lXte_raw
|
||||
|
||||
def get_lXtr(self):
|
||||
if not hasattr(self, 'lXtr'):
|
||||
self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()})
|
||||
return self.lXtr
|
||||
|
||||
def get_lXva(self):
|
||||
if not hasattr(self, 'lXva'):
|
||||
self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()})
|
||||
return self.lXva
|
||||
|
||||
def get_lXte(self):
|
||||
if not hasattr(self, 'lXte'):
|
||||
self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
|
||||
return self.lXte
|
||||
|
||||
def get_target_dim(self):
|
||||
return self.l_index[self.langs[0]].devel_target.shape[1]
|
||||
|
||||
def l_vocabsize(self):
|
||||
return {l: index.vocabsize for l, index in self.l_index.items()}
|
||||
|
||||
def l_embeddings(self):
|
||||
return {l: index.embedding_matrix for l, index in self.l_index.items()}
|
||||
|
||||
def l_pad(self):
|
||||
return {l: index.pad_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_train_index(self):
|
||||
return {l: index.train_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_train_raw_index(self):
|
||||
return {l: index.train_raw for l, index in self.l_index.items()}
|
||||
|
||||
def l_train_target(self):
|
||||
return {l: index.train_target for l, index in self.l_index.items()}
|
||||
|
||||
def l_val_index(self):
|
||||
return {l: index.val_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_val_raw_index(self):
|
||||
return {l: index.val_raw for l, index in self.l_index.items()}
|
||||
|
||||
def l_test_raw_index(self):
|
||||
return {l: index.test_raw for l, index in self.l_index.items()}
|
||||
|
||||
def l_devel_raw_index(self):
|
||||
return {l: index.devel_raw for l, index in self.l_index.items()}
|
||||
|
||||
def l_val_target(self):
|
||||
return {l: index.val_target for l, index in self.l_index.items()}
|
||||
|
||||
def l_test_target(self):
|
||||
return {l: index.test_target for l, index in self.l_index.items()}
|
||||
|
||||
def l_test_index(self):
|
||||
return {l: index.test_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_devel_index(self):
|
||||
return {l: index.devel_index for l, index in self.l_index.items()}
|
||||
|
||||
def l_devel_target(self):
|
||||
return {l: index.devel_target for l, index in self.l_index.items()}
|
||||
|
||||
def l_train(self):
|
||||
return self.l_train_index(), self.l_train_target()
|
||||
|
||||
def l_val(self):
|
||||
return self.l_val_index(), self.l_val_target()
|
||||
|
||||
def l_test(self):
|
||||
return self.l_test_index(), self.l_test_target()
|
||||
|
||||
def l_train_raw(self):
|
||||
return self.l_train_raw_index(), self.l_train_target()
|
||||
|
||||
def l_val_raw(self):
|
||||
return self.l_val_raw_index(), self.l_val_target()
|
||||
|
||||
def l_test_raw(self):
|
||||
return self.l_test_raw_index(), self.l_test_target()
|
||||
|
||||
def l_devel_raw(self):
|
||||
return self.l_devel_raw_index(), self.l_devel_target()
|
||||
|
||||
def get_l_pad_index(self):
|
||||
return {l: index.get_pad_index() for l, index in self.l_index.items()}
|
||||
|
||||
|
||||
class Index:
|
||||
def __init__(self, devel_raw, devel_target, test_raw, test_target, lang):
|
||||
"""
|
||||
Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into
|
||||
training and validation.
|
||||
:param devel_raw: list of strings, list of raw training texts
|
||||
:param devel_target:
|
||||
:param test_raw: list of strings, list of raw test texts
|
||||
:param lang: list, list of languages contained in the dataset
|
||||
"""
|
||||
self.lang = lang
|
||||
self.devel_raw = devel_raw
|
||||
self.devel_target = devel_target
|
||||
self.test_raw = test_raw
|
||||
self.test_target = test_target
|
||||
|
||||
def index(self, pretrained_vocabulary, analyzer, vocabulary):
|
||||
self.word2index = dict(vocabulary)
|
||||
known_words = set(self.word2index.keys())
|
||||
if pretrained_vocabulary is not None:
|
||||
known_words.update(pretrained_vocabulary)
|
||||
|
||||
self.word2index['UNKTOKEN'] = len(self.word2index)
|
||||
self.word2index['PADTOKEN'] = len(self.word2index)
|
||||
self.unk_index = self.word2index['UNKTOKEN']
|
||||
self.pad_index = self.word2index['PADTOKEN']
|
||||
|
||||
# index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
|
||||
self.out_of_vocabulary = dict()
|
||||
self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index,
|
||||
self.out_of_vocabulary)
|
||||
self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index,
|
||||
self.out_of_vocabulary)
|
||||
|
||||
self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
|
||||
|
||||
print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
|
||||
|
||||
def get_pad_index(self):
|
||||
return self.pad_index
|
||||
|
||||
def train_val_split(self, val_prop, max_val, seed):
|
||||
devel = self.devel_index
|
||||
target = self.devel_target
|
||||
devel_raw = self.devel_raw
|
||||
|
||||
val_size = int(min(len(devel) * val_prop, max_val))
|
||||
|
||||
self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
|
||||
train_test_split(
|
||||
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True)
|
||||
|
||||
print(
|
||||
f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
|
||||
|
||||
def get_word_list(self):
|
||||
def extract_word_list(word2index):
|
||||
return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]
|
||||
|
||||
word_list = extract_word_list(self.word2index)
|
||||
word_list += extract_word_list(self.out_of_vocabulary)
|
||||
return word_list
|
||||
|
||||
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
|
||||
print(f'[generating embedding matrix for lang {self.lang}]')
|
||||
|
||||
self.wce_range = None
|
||||
embedding_parts = []
|
||||
|
||||
if pretrained is not None:
|
||||
print('\t[pretrained-matrix]')
|
||||
embedding_parts.append(pretrained)
|
||||
del pretrained
|
||||
|
||||
if supervised:
|
||||
print('\t[supervised-matrix]')
|
||||
F = supervised_embeddings_tfidf(Xtr, Ytr)
|
||||
num_missing_rows = self.vocabsize - F.shape[0]
|
||||
F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
|
||||
F = torch.from_numpy(F).float()
|
||||
|
||||
offset = 0
|
||||
if embedding_parts:
|
||||
offset = embedding_parts[0].shape[1]
|
||||
self.wce_range = [offset, offset + F.shape[1]]
|
||||
embedding_parts.append(F)
|
||||
|
||||
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
|
||||
|
||||
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
|
||||
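
# A minimal sketch (not part of the original file) of compose_embedding_matrix(): the pretrained
# (e.g., MUSE) matrix and the word-class embedding matrix are concatenated column-wise, and
# wce_range records which columns hold the supervised slice. Dimensions are illustrative assumptions.
def _demo_compose_embedding_matrix():
    import torch
    vocabsize, muse_dim, n_classes = 1000, 300, 73
    pretrained = torch.rand(vocabsize, muse_dim)
    F = torch.rand(vocabsize, n_classes)                  # word-class embeddings (one column per class)
    embedding_matrix = torch.cat([pretrained, F], dim=1)  # (1000, 373)
    wce_range = [muse_dim, muse_dim + n_classes]          # columns [300, 373) are the supervised part
    return embedding_matrix.shape, wce_range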
|
||||
|
||||
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
|
||||
"""
|
||||
Index (i.e., replaces word strings with numerical indexes) a list of string documents
|
||||
:param data: list of string documents
|
||||
:param vocab: a fixed mapping [str]->[int] of words to indexes
|
||||
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
|
||||
because they are anyway contained in a pre-trained embedding set that we know in advance)
|
||||
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
|
||||
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
|
||||
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
|
||||
are not in the original vocab but that are in the known_words
|
||||
:return:
|
||||
"""
|
||||
indexes = []
|
||||
vocabsize = len(vocab)
|
||||
unk_count = 0
|
||||
knw_count = 0
|
||||
out_count = 0
|
||||
# pbar = tqdm(data, desc=f'indexing')
|
||||
for text in data:
|
||||
words = analyzer(text)
|
||||
index = []
|
||||
for word in words:
|
||||
if word in vocab:
|
||||
idx = vocab[word]
|
||||
else:
|
||||
if word in known_words:
|
||||
if word not in out_of_vocabulary:
|
||||
out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
|
||||
idx = out_of_vocabulary[word]
|
||||
out_count += 1
|
||||
else:
|
||||
idx = unk_index
|
||||
unk_count += 1
|
||||
index.append(idx)
|
||||
indexes.append(index)
|
||||
knw_count += len(index)
|
||||
# pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
|
||||
# f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
|
||||
return indexes
|
||||
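
# A minimal sketch (not part of the original file) of the indexing policy implemented by index():
# in-vocabulary words keep their vectorizer id, known-but-out-of-vocabulary words (e.g., covered by
# a pretrained embedding set) get fresh ids appended after the vocabulary, and everything else maps
# to the unknown token. The toy vocabulary and analyzer below are illustrative assumptions.
def _demo_indexing_policy():
    vocab = {'dog': 0, 'cat': 1}
    known_words = {'dog', 'cat', 'horse'}      # 'horse' has a pretrained embedding but no tf-idf id
    unk_index, out_of_vocabulary = 99, {}
    analyzer = str.split
    indexes = []
    for text in ['dog horse', 'cat unicorn']:
        row = []
        for word in analyzer(text):
            if word in vocab:
                idx = vocab[word]
            elif word in known_words:
                idx = out_of_vocabulary.setdefault(word, len(vocab) + len(out_of_vocabulary))
            else:
                idx = unk_index
            row.append(idx)
        indexes.append(row)
    return indexes, out_of_vocabulary          # ([[0, 2], [1, 99]], {'horse': 2})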
|
||||
|
||||
def is_true(tensor, device):
|
||||
return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))
|
||||
|
||||
|
||||
def is_false(tensor, device):
|
||||
return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))
|
||||
|
||||
|
||||
def define_pad_length(index_list):
|
||||
lengths = [len(index) for index in index_list]
|
||||
return int(np.mean(lengths) + np.std(lengths))
|
||||
|
||||
|
||||
def pad(index_list, pad_index, max_pad_length=None):
|
||||
pad_length = np.max([len(index) for index in index_list])
|
||||
if max_pad_length is not None:
|
||||
pad_length = min(pad_length, max_pad_length)
|
||||
for i, indexes in enumerate(index_list):
|
||||
index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
|
||||
return index_list
|
||||
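
# A minimal sketch (not part of the original file) of define_pad_length() and pad(): the target
# length is mean + std of the document lengths, shorter documents are left-padded and longer ones
# truncated. The toy token ids and pad index below are illustrative assumptions.
def _demo_padding():
    import numpy as np
    docs = [[1, 2, 3, 4], [5, 6], [7, 8, 9, 10, 11, 12]]
    lengths = [len(d) for d in docs]
    pad_length = int(np.mean(lengths) + np.std(lengths))     # 4 + ~1.63 -> 5
    pad_index = 0
    padded = [[pad_index] * (pad_length - len(d)) + d[:pad_length] for d in docs]
    return padded   # [[0, 1, 2, 3, 4], [0, 0, 0, 5, 6], [7, 8, 9, 10, 11]]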
|
||||
|
||||
def get_params(optimc=False):
|
||||
if not optimc:
|
||||
return None
|
||||
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
|
||||
kernel = 'rbf'
|
||||
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
|
||||
|
||||
|
||||
def get_method_name(args):
|
||||
_id = ''
|
||||
_id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder]
|
||||
_id_name = ['X', 'W', 'M', 'B', 'G']
|
||||
for i, conf in enumerate(_id_conf):
|
||||
if conf:
|
||||
_id += _id_name[i]
|
||||
_id = _id if not args.gru_wce else _id + '_wce'
|
||||
_dataset_path = args.dataset.split('/')[-1].split('_')
|
||||
dataset_id = _dataset_path[0] + _dataset_path[-1]
|
||||
return _id, dataset_id
|
|
@ -1,104 +0,0 @@
|
|||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torchtext.vocab import Vectors
|
||||
|
||||
from util.SIF_embed import remove_pc
|
||||
|
||||
|
||||
class PretrainedEmbeddings(ABC):
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
@abstractmethod
|
||||
def vocabulary(self): pass
|
||||
|
||||
@abstractmethod
|
||||
def dim(self): pass
|
||||
|
||||
@classmethod
|
||||
def reindex(cls, words, word2index):
|
||||
if isinstance(words, dict):
|
||||
words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0]
|
||||
|
||||
source_idx, target_idx = [], []
|
||||
for i, word in enumerate(words):
|
||||
if word not in word2index:
|
||||
continue
|
||||
j = word2index[word]
|
||||
source_idx.append(i)
|
||||
target_idx.append(j)
|
||||
source_idx = np.asarray(source_idx)
|
||||
target_idx = np.asarray(target_idx)
|
||||
return source_idx, target_idx
|
||||
|
||||
|
||||
class MuseLoader:
|
||||
def __init__(self, langs, cache):
|
||||
self.langs = langs
|
||||
self.lEmbed = {}
|
||||
self.lExtracted = {}
|
||||
for lang in self.langs:
|
||||
print(f'Loading vectors for {lang}...')
|
||||
self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache)
|
||||
|
||||
def dim(self):
|
||||
return self.lEmbed[list(self.lEmbed.keys())[0]].dim
|
||||
|
||||
def vocabulary(self):
|
||||
return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs}
|
||||
|
||||
def extract(self, lVoc):
|
||||
"""
|
||||
Reindex pretrained loaded embedding in order to match indexes assigned by scikit vectorizer. Such indexes
|
||||
are consistent with those used by Word Class Embeddings (since we deploy the same vectorizer)
|
||||
:param lVoc: dict {lang : {word : id}}
|
||||
:return: torch embedding matrix of extracted embeddings i.e., words in lVoc
|
||||
"""
|
||||
for lang, words in lVoc.items():
|
||||
print(f'Extracting words for lang {lang}...')
|
||||
# words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0]
|
||||
source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi)
|
||||
extraction = torch.zeros((len(words), self.dim()))
|
||||
extraction[source_id] = self.lEmbed[lang].vectors[target_id]
|
||||
self.lExtracted[lang] = extraction
|
||||
return self.lExtracted
|
||||
|
||||
def get_lEmbeddings(self):
|
||||
return {lang: self.lEmbed[lang].vectors for lang in self.langs}
|
||||
|
||||
|
||||
def XdotM(X, M, sif):
|
||||
E = X.dot(M)
|
||||
if sif:
|
||||
E = remove_pc(E, npc=1)
|
||||
return E
|
||||
|
||||
|
||||
def wce_matrix(X, Y):
|
||||
wce = supervised_embeddings_tfidf(X, Y)
|
||||
wce = zscores(wce, axis=0)
|
||||
return wce
|
||||
|
||||
|
||||
def supervised_embeddings_tfidf(X, Y):
|
||||
tfidf_norm = X.sum(axis=0)
|
||||
tfidf_norm[tfidf_norm == 0] = 1
|
||||
F = (X.T).dot(Y) / tfidf_norm.T
|
||||
return F
|
||||
|
||||
|
||||
def zscores(X, axis=0):
|
||||
"""
|
||||
scipy.stats.zscores does not avoid division by 0, which can indeed occur
|
||||
:param X:
|
||||
:param axis:
|
||||
:return:
|
||||
"""
|
||||
std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None)
|
||||
mean = np.mean(X, axis=axis)
|
||||
return (X - mean) / std
|
||||
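
# A minimal sketch (not part of the original file) of supervised_embeddings_tfidf(): each term is
# represented by its tf-idf mass per class, normalized by the term's total tf-idf mass, yielding a
# |V| x |C| word-class embedding matrix. The toy dense matrices below are illustrative assumptions
# (the real X is a sparse csr-matrix).
def _demo_wce():
    import numpy as np
    X = np.array([[1.0, 0.0],      # 3 documents x 2 terms (tf-idf weights)
                  [0.5, 2.0],
                  [0.0, 1.0]])
    Y = np.array([[1, 0],          # 3 documents x 2 classes
                  [0, 1],
                  [0, 1]])
    tfidf_norm = X.sum(axis=0)     # total tf-idf mass per term: [1.5, 3.0]
    tfidf_norm[tfidf_norm == 0] = 1
    F = X.T.dot(Y) / tfidf_norm[:, None]   # (2 terms x 2 classes)
    return F                        # [[0.667, 0.333], [0.0, 1.0]]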
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
import numpy as np
|
||||
from joblib import Parallel, delayed
|
||||
|
||||
from util.metrics import *
|
||||
|
||||
|
||||
def evaluation_metrics(y, y_):
|
||||
if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label
|
||||
raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
|
||||
else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers
|
||||
return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
|
||||
|
||||
|
||||
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
|
||||
if n_jobs == 1:
|
||||
return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
|
||||
else:
|
||||
langs = list(ly_true.keys())
|
||||
evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
|
||||
return {lang: evals[i] for i, lang in enumerate(langs)}
|
|
@ -1,50 +0,0 @@
|
|||
import urllib
|
||||
from os import listdir, makedirs
|
||||
from os.path import isdir, isfile, join, exists, dirname
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def download_file(url, archive_filename):
|
||||
def progress(blocknum, bs, size):
|
||||
total_sz_mb = '%.2f MB' % (size / 1e6)
|
||||
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
|
||||
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
|
||||
print("Downloading %s" % url)
|
||||
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
|
||||
print("")
|
||||
|
||||
|
||||
def download_file_if_not_exists(url, archive_path):
|
||||
if exists(archive_path): return
|
||||
makedirs_if_not_exist(dirname(archive_path))
|
||||
download_file(url,archive_path)
|
||||
|
||||
|
||||
def ls(dir, typecheck):
|
||||
el = [f for f in listdir(dir) if typecheck(join(dir, f))]
|
||||
el.sort()
|
||||
return el
|
||||
|
||||
|
||||
def list_dirs(dir):
|
||||
return ls(dir, typecheck=isdir)
|
||||
|
||||
|
||||
def list_files(dir):
|
||||
return ls(dir, typecheck=isfile)
|
||||
|
||||
|
||||
def makedirs_if_not_exist(path):
|
||||
if not exists(path): makedirs(path)
|
||||
|
||||
|
||||
def create_if_not_exist(path):
|
||||
if not exists(path): makedirs(path)
|
||||
|
||||
|
||||
def get_parent_name(path):
|
||||
return Path(path).parent
|
||||
|
||||
|
||||
def get_file_name(path):
|
||||
return Path(path).name
|
|
@ -1,152 +0,0 @@
|
|||
import numpy as np
|
||||
|
||||
|
||||
class ContTable:
|
||||
def __init__(self, tp=0, tn=0, fp=0, fn=0):
|
||||
self.tp = tp
|
||||
self.tn = tn
|
||||
self.fp = fp
|
||||
self.fn = fn
|
||||
|
||||
def get_d(self): return self.tp + self.tn + self.fp + self.fn
|
||||
|
||||
def get_c(self): return self.tp + self.fn
|
||||
|
||||
def get_not_c(self): return self.tn + self.fp
|
||||
|
||||
def get_f(self): return self.tp + self.fp
|
||||
|
||||
def get_not_f(self): return self.tn + self.fn
|
||||
|
||||
def p_c(self): return (1.0*self.get_c())/self.get_d()
|
||||
|
||||
def p_not_c(self): return 1.0-self.p_c()
|
||||
|
||||
def p_f(self): return (1.0*self.get_f())/self.get_d()
|
||||
|
||||
def p_not_f(self): return 1.0-self.p_f()
|
||||
|
||||
def p_tp(self): return (1.0*self.tp) / self.get_d()
|
||||
|
||||
def p_tn(self): return (1.0*self.tn) / self.get_d()
|
||||
|
||||
def p_fp(self): return (1.0*self.fp) / self.get_d()
|
||||
|
||||
def p_fn(self): return (1.0*self.fn) / self.get_d()
|
||||
|
||||
def tpr(self):
|
||||
c = 1.0*self.get_c()
|
||||
return self.tp / c if c > 0.0 else 0.0
|
||||
|
||||
def fpr(self):
|
||||
_c = 1.0*self.get_not_c()
|
||||
return self.fp / _c if _c > 0.0 else 0.0
|
||||
|
||||
def __add__(self, other):
|
||||
return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn)
|
||||
|
||||
|
||||
def accuracy(cell):
|
||||
return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)
|
||||
|
||||
|
||||
def f1(cell):
|
||||
num = 2.0 * cell.tp
|
||||
den = 2.0 * cell.tp + cell.fp + cell.fn
|
||||
if den > 0:
|
||||
return num / den
|
||||
# we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
|
||||
return 1.0
|
||||
|
||||
|
||||
def K(cell):
|
||||
specificity, recall = 0., 0.
|
||||
|
||||
AN = cell.tn + cell.fp
|
||||
if AN != 0:
|
||||
specificity = cell.tn*1. / AN
|
||||
|
||||
AP = cell.tp + cell.fn
|
||||
if AP != 0:
|
||||
recall = cell.tp*1. / AP
|
||||
|
||||
if AP == 0:
|
||||
return 2. * specificity - 1.
|
||||
elif AN == 0:
|
||||
return 2. * recall - 1.
|
||||
else:
|
||||
return specificity + recall - 1.
|
||||
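
# A minimal worked example (not part of the original file) of the K metric: with both positive and
# negative examples present, K = specificity + recall - 1, ranging in [-1, 1] (0 for a trivial or
# random classifier). The counts below are illustrative assumptions.
def _demo_K():
    cell = ContTable(tp=8, fn=2, tn=5, fp=5)           # 10 positives, 10 negatives
    recall = cell.tp / (cell.tp + cell.fn)             # 0.8
    specificity = cell.tn / (cell.tn + cell.fp)        # 0.5
    return specificity + recall - 1.                   # 0.3, the same value returned by K(cell)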
|
||||
|
||||
# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
|
||||
# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
|
||||
def __check_consistency_and_adapt(true_labels, predictions):
|
||||
if predictions.ndim == 1:
|
||||
return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
|
||||
if true_labels.ndim == 1:
|
||||
return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions)
|
||||
if true_labels.shape != predictions.shape:
|
||||
raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
|
||||
% (true_labels.shape, predictions.shape))
|
||||
_, nC = true_labels.shape
|
||||
return true_labels, predictions, nC
|
||||
|
||||
|
||||
# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior
# probabilities with respect to the true binary labels
|
||||
# true_labels and posterior_probabilities are two vectors of shape (number_documents,)
|
||||
def soft_single_metric_statistics(true_labels, posterior_probabilities):
|
||||
assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels."
|
||||
tp = np.sum(posterior_probabilities[true_labels == 1])
|
||||
fn = np.sum(1. - posterior_probabilities[true_labels == 1])
|
||||
fp = np.sum(posterior_probabilities[true_labels == 0])
|
||||
tn = np.sum(1. - posterior_probabilities[true_labels == 0])
|
||||
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
|
||||
|
||||
|
||||
# computes the (hard) counters tp, fp, fn, and tn from the true and predicted vectors of hard decisions
|
||||
# true_labels and predicted_labels are two vectors of shape (number_documents,)
|
||||
def hard_single_metric_statistics(true_labels, predicted_labels):
|
||||
assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
|
||||
nd = len(true_labels)
|
||||
tp = np.sum(predicted_labels[true_labels == 1])
|
||||
fp = np.sum(predicted_labels[true_labels == 0])
|
||||
fn = np.sum(true_labels[predicted_labels == 0])
|
||||
tn = nd - (tp+fp+fn)
|
||||
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
|
||||
|
||||
|
||||
def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
    true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
    return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)])


def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
    true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)

    accum = ContTable()
    for c in range(nC):
        other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
        accum = accum + other

    return metric(accum)

# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, f1)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
    return macro_average(true_labels, predicted_labels, K)


# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
    return micro_average(true_labels, predicted_labels, K)

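# --- illustrative example (not part of the original module) ---
# A minimal sketch of calling the label-matrix wrappers above. The matrices are in
# sklearn MultiLabelBinarizer format (documents x classes, 0/1 entries): macro-averaging
# scores each class independently and then averages the per-class scores, while
# micro-averaging pools the per-class contingency tables before applying the metric.
def _example_macro_vs_micro():
    y_true = np.array([[1, 0, 0],
                       [1, 0, 0],
                       [0, 1, 0],
                       [0, 0, 1]])
    y_pred = np.array([[1, 0, 0],
                       [1, 0, 0],
                       [0, 0, 0],
                       [0, 0, 1]])
    print(macroF1(y_true, y_pred), microF1(y_true, y_pred))  # ~0.67 vs ~0.86 on this toy case
    print(macroK(y_true, y_pred), microK(y_true, y_pred))
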
@ -1,141 +0,0 @@
import torch
from pytorch_lightning.metrics import Metric

from util.common import is_false, is_true


def _update(pred, target, device):
    assert pred.shape == target.shape
    # preparing preds and targets for count
    true_pred = is_true(pred, device)
    false_pred = is_false(pred, device)
    true_target = is_true(target, device)
    false_target = is_false(target, device)

    tp = torch.sum(true_pred * true_target, dim=0)
    tn = torch.sum(false_pred * false_target, dim=0)
    fp = torch.sum(true_pred * false_target, dim=0)
    # for binarized 0/1 targets true_target equals target; using the masked version keeps
    # the four counters consistent with one another
    fn = torch.sum(false_pred * true_target, dim=0)
    return tp, tn, fp, fn

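# --- illustrative note (not part of the original module) ---
# is_true / is_false live in util.common and are not shown in this diff. A minimal
# sketch of the behaviour _update relies on, assuming they simply build 0/1 masks on
# the right device from already-binarized predictions and targets:
#
#   def is_true(t, device):
#       return (t == 1).float().to('cuda' if device else 'cpu')
#
#   def is_false(t, device):
#       return (t == 0).float().to('cuda' if device else 'cpu')
#
# This is only an assumption made for readability; the actual helpers may differ.
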
class CustomF1(Metric):
    def __init__(self, num_classes, device, average='micro'):
        """
        Custom F1 metric.
        Scikit-learn provides a full set of evaluation metrics, but it treats the degenerate cases differently:
        when the numbers of true positives, false positives, and false negatives all amount to 0, the affected
        metrics (precision, recall, and thus F1) are set to 0 in scikit-learn.
        We adhere instead to the common practice of outputting 1 in this case, since the classifier has correctly
        classified all examples as negatives.
        :param num_classes: int, number of target classes.
        :param device: whether to run the computation on cuda or cpu.
        :param average: str, either 'micro' or 'macro'.
        """
        super().__init__()
        self.num_classes = num_classes
        self.average = average
        self.device = 'cuda' if device else 'cpu'
        self.add_state('true_positive', default=torch.zeros(self.num_classes))
        self.add_state('true_negative', default=torch.zeros(self.num_classes))
        self.add_state('false_positive', default=torch.zeros(self.num_classes))
        self.add_state('false_negative', default=torch.zeros(self.num_classes))

    def update(self, preds, target):
        true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device)

        self.true_positive += true_positive
        self.true_negative += true_negative
        self.false_positive += false_positive
        self.false_negative += false_negative

    def compute(self):
        if self.average == 'micro':
            num = 2.0 * self.true_positive.sum()
            den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum()
            if den > 0:
                return (num / den).to(self.device)
            return torch.FloatTensor([1.]).to(self.device)
        if self.average == 'macro':
            class_specific = []
            for i in range(self.num_classes):
                class_tp = self.true_positive[i]
                class_fp = self.false_positive[i]
                class_fn = self.false_negative[i]
                num = 2.0 * class_tp
                den = 2.0 * class_tp + class_fp + class_fn
                if den > 0:
                    class_specific.append(num / den)
                else:
                    class_specific.append(1.)
            average = torch.sum(torch.Tensor(class_specific)) / self.num_classes
            return average.to(self.device)

class CustomK(Metric):
    def __init__(self, num_classes, device, average='micro'):
        """
        K metric. https://dl.acm.org/doi/10.1145/2808194.2809449
        :param num_classes: int, number of target classes.
        :param device: whether to run the computation on cuda or cpu.
        :param average: str, either 'micro' or 'macro'.
        """
        super().__init__()
        self.num_classes = num_classes
        self.average = average
        self.device = 'cuda' if device else 'cpu'
        self.add_state('true_positive', default=torch.zeros(self.num_classes))
        self.add_state('true_negative', default=torch.zeros(self.num_classes))
        self.add_state('false_positive', default=torch.zeros(self.num_classes))
        self.add_state('false_negative', default=torch.zeros(self.num_classes))

    def update(self, preds, target):
        true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device)

        self.true_positive += true_positive
        self.true_negative += true_negative
        self.false_positive += false_positive
        self.false_negative += false_negative

    def compute(self):
        if self.average == 'micro':
            specificity, recall = 0., 0.
            absolute_negatives = self.true_negative.sum() + self.false_positive.sum()
            if absolute_negatives != 0:
                specificity = self.true_negative.sum() / absolute_negatives
            absolute_positives = self.true_positive.sum() + self.false_negative.sum()
            if absolute_positives != 0:
                recall = self.true_positive.sum() / absolute_positives

            if absolute_positives == 0:
                return 2. * specificity - 1
            elif absolute_negatives == 0:
                return 2. * recall - 1
            else:
                return specificity + recall - 1

        if self.average == 'macro':
            class_specific = []
            for i in range(self.num_classes):
                class_tp = self.true_positive[i]
                class_tn = self.true_negative[i]
                class_fp = self.false_positive[i]
                class_fn = self.false_negative[i]

                specificity, recall = 0., 0.
                absolute_negatives = class_tn + class_fp
                if absolute_negatives != 0:
                    specificity = class_tn / absolute_negatives
                absolute_positives = class_tp + class_fn
                if absolute_positives != 0:
                    recall = class_tp / absolute_positives

                if absolute_positives == 0:
                    class_specific.append(2. * specificity - 1)
                elif absolute_negatives == 0:
                    class_specific.append(2. * recall - 1)
                else:
                    class_specific.append(specificity + recall - 1)
            average = torch.sum(torch.Tensor(class_specific)) / self.num_classes
            return average.to(self.device)

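# --- illustrative example (not part of the original module) ---
# A minimal sketch of how these Lightning metrics are meant to be driven, assuming
# preds and target are already binarized 0/1 tensors of shape (batch, num_classes)
# (which is what _update expects via is_true/is_false). Typically update() is called
# once per batch inside the LightningModule and compute() at epoch end.
def _example_custom_metrics():
    preds = torch.tensor([[1., 0., 0.],
                          [1., 1., 0.],
                          [0., 0., 1.]])
    target = torch.tensor([[1., 0., 0.],
                           [1., 0., 0.],
                           [0., 1., 1.]])
    microf1 = CustomF1(num_classes=3, device=False, average='micro')
    macrok = CustomK(num_classes=3, device=False, average='macro')
    microf1.update(preds, target)
    macrok.update(preds, target)
    print(microf1.compute(), macrok.compute())
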
@ -1,53 +0,0 @@
import os

import numpy as np
import pandas as pd


class CSVlog:
    def __init__(self, file, autoflush=True, verbose=False):
        self.file = file
        self.columns = ['method',
                        'setting',
                        'optimc',
                        'sif',
                        'zscore',
                        'l2',
                        'dataset',
                        'time_tr',
                        'time_te',
                        'lang',
                        'macrof1',
                        'microf1',
                        'macrok',
                        'microk',
                        'notes']
        self.autoflush = autoflush
        self.verbose = verbose
        if os.path.exists(file):
            self.tell('Loading existing file from {}'.format(file))
            self.df = pd.read_csv(file, sep='\t')
        else:
            self.tell('File {} does not exist. Creating new frame.'.format(file))
            dir = os.path.dirname(self.file)
            if dir and not os.path.exists(dir):
                os.makedirs(dir)
            self.df = pd.DataFrame(columns=self.columns)

    def already_calculated(self, id):
        # note: the frame created from scratch here defines no 'id' column, so this lookup
        # assumes one is present in a previously saved file loaded at init time
        return (self.df['id'] == id).any()

    def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
                macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
        s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
                       macrof1, microf1, macrok, microk, notes],
                      index=self.columns)
        self.df = self.df.append(s, ignore_index=True)
        if self.autoflush:
            self.flush()
        self.tell(s.to_string())

    def flush(self):
        self.df.to_csv(self.file, index=False, sep='\t')

    def tell(self, msg):
        if self.verbose:
            print(msg)

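# --- illustrative example (not part of the original module) ---
# A minimal sketch of how the logger is typically used: one row per (method, dataset,
# language) evaluation, written to a tab-separated file. The path and values below are
# made up for the example.
def _example_csvlog():
    csv = CSVlog('csv_logs/example_results.csv', verbose=True)
    csv.add_row(method='gfun-x', setting='full', optimc=True, sif=False, zscore=True, l2=True,
                dataset='toy-dataset', time_tr=12.3, time_te=0.7, lang='en',
                macrof1=0.71, microf1=0.78, macrok=0.55, microk=0.60, notes='example run')
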
@ -1,36 +0,0 @@
import numpy as np


class StandardizeTransformer:
    def __init__(self, axis=0, range=None):
        """
        Applies z-score standardization (zero mean, unit variance) along the given axis.
        :param axis: int, axis along which the mean and standard deviation are computed.
        :param range: slice or None; if a slice is given, only the columns it selects are standardized.
        """
        assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice'
        self.axis = axis
        self.yetfit = False
        self.range = range

    def fit(self, X):
        print('Applying z-score standardization...')
        std = np.std(X, axis=self.axis, ddof=1)
        self.std = np.clip(std, 1e-5, None)
        self.mean = np.mean(X, axis=self.axis)
        if self.range is not None:
            # standardize only the columns selected by range; the others get mean=0 and std=1,
            # i.e., they pass through transform() unchanged
            ones = np.ones_like(self.std)
            zeros = np.zeros_like(self.mean)
            ones[self.range] = self.std[self.range]
            zeros[self.range] = self.mean[self.range]
            self.std = ones
            self.mean = zeros
        self.yetfit = True
        return self

    def transform(self, X):
        assert self.yetfit, 'transform called before fit'
        return (X - self.mean) / self.std

    def fit_transform(self, X):
        return self.fit(X).transform(X)

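# --- illustrative example (not part of the original module) ---
# A minimal sketch of fit/transform on a toy matrix. With range=slice(2, 4) only
# columns 2 and 3 are standardized; the remaining columns pass through unchanged.
def _example_standardize():
    X = np.random.rand(10, 5)
    Z = StandardizeTransformer(axis=0).fit_transform(X)
    print(Z.mean(axis=0).round(6), Z.std(axis=0, ddof=1).round(6))  # ~0 means, ~1 stds

    Z_partial = StandardizeTransformer(axis=0, range=slice(2, 4)).fit_transform(X)
    print(np.allclose(Z_partial[:, :2], X[:, :2]))  # untouched columns are unchanged -> True
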
@ -1,375 +0,0 @@
"""
This module contains the view generators that take care of computing the view-specific document embeddings:

- VanillaFunGen (-x): casts document representations encoded via TFIDF into posterior probabilities by means of SVMs.

- WordClassGen (-w): generates document representation via Word-Class-Embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.

- MuseGen (-m): generates document representation via MUSE embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.

- RecurrentGen (-g): generates document embedding by means of a Gated Recurrent Unit. The model can be
initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
Output dimension is (n_docs, 512).

- BertGen (-b): generates document embedding via the mBERT model.
"""
from abc import ABC, abstractmethod
from time import time

from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger

from data.datamodule import RecurrentDataModule, BertDataModule, tokenize
from models.learners import *
from models.pl_bert import BertModel
from models.pl_gru import RecurrentModel
from util.common import TfidfVectorizerMultilingual, _normalize
from util.embeddings_manager import MuseLoader, XdotM, wce_matrix

class ViewGen(ABC):
    """
    Abstract class for ViewGenerator implementations. Every ViewGen should implement these three methods in order to
    be seamlessly integrated in the overall architecture.
    """
    @abstractmethod
    def fit(self, lX, ly):
        pass

    @abstractmethod
    def transform(self, lX):
        pass

    @abstractmethod
    def fit_transform(self, lX, ly):
        pass

class VanillaFunGen(ViewGen):
    """
    View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
    Sebastiani in DOI: https://doi.org/10.1145/3326065
    """
    def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
        """
        Init Posterior Probabilities embedder (i.e., VanillaFunGen).
        :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to
        return posterior probabilities.
        :param first_tier_parameters: parameters to be passed to the first-tier learners; if None, the learners
        are deployed with their default parameters.
        :param n_jobs: integer, number of concurrent workers.
        """
        super().__init__()
        self.learners = base_learner
        self.first_tier_parameters = first_tier_parameters
        self.n_jobs = n_jobs
        self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners,
                                                        parameters=self.first_tier_parameters, n_jobs=self.n_jobs)
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

    def fit(self, lX, lY):
        print('# Fitting VanillaFunGen (X)...')
        lX = self.vectorizer.fit_transform(lX)
        self.doc_projector.fit(lX, lY)
        return self

    def transform(self, lX):
        """
        (1) Vectorizes documents; (2) projects them according to the learners' SVMs; finally, (3) applies L2
        normalization to the projection and returns it.
        :param lX: dict {lang: indexed documents}
        :return: document projection to the common latent space.
        """
        lX = self.vectorizer.transform(lX)
        lZ = self.doc_projector.predict_proba(lX)
        lZ = _normalize(lZ, l2=True)
        return lZ

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)

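# --- illustrative sketch (not part of the original module) ---
# The funnelling first tier boils down to: per language, fit a TFIDF vectorizer and a
# probabilistic classifier, then represent each document by its vector of posterior
# probabilities (one dimension per class), L2-normalized. The sketch below shows that
# idea with plain scikit-learn components; the real implementation relies on
# NaivePolylingualClassifier and TfidfVectorizerMultilingual defined in this project.
def _sketch_posterior_view(lX_raw, ly):
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.preprocessing import normalize
    from sklearn.svm import LinearSVC

    lZ = {}
    for lang, docs in lX_raw.items():
        tfidf = TfidfVectorizer(sublinear_tf=True, use_idf=True)
        X = tfidf.fit_transform(docs)
        clf = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC()))  # SVM with probability outputs
        clf.fit(X, ly[lang])                                            # ly[lang]: binary label matrix
        lZ[lang] = normalize(clf.predict_proba(X), norm='l2')           # posterior-probability view
    return lZ
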
class MuseGen(ViewGen):
    """
    View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word
    embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
    """
    def __init__(self, muse_dir='../embeddings', n_jobs=-1):
        """
        Init the MuseGen.
        :param muse_dir: string, path to folder containing MUSE embeddings.
        :param n_jobs: int, number of concurrent workers.
        """
        super().__init__()
        self.muse_dir = muse_dir
        self.n_jobs = n_jobs
        self.langs = None
        self.lMuse = None
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

    def fit(self, lX, ly):
        """
        (1) Vectorizes documents; (2) loads MUSE embeddings for the words encountered while vectorizing.
        :param lX: dict {lang: indexed documents}
        :param ly: dict {lang: target vectors}
        :return: self.
        """
        print('# Fitting MuseGen (M)...')
        self.vectorizer.fit(lX)
        self.langs = sorted(lX.keys())
        self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir)
        lVoc = self.vectorizer.vocabulary()
        self.lMuse = self.lMuse.extract(lVoc)  # overwriting lMuse with dict {lang: embed_matrix} with only known words
        # TODO: featureweight.fit
        return self

    def transform(self, lX):
        """
        (1) Vectorizes documents; (2) computes the weighted sum of MUSE embeddings at document level;
        finally, (3) applies L2 normalization to the embedding and returns it.
        :param lX: dict {lang: indexed documents}
        :return: document projection to the common latent space.
        """
        lX = self.vectorizer.transform(lX)
        XdotMUSE = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs)
        lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)}
        lZ = _normalize(lZ, l2=True)
        return lZ

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)

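# --- illustrative sketch (not part of the original module) ---
# XdotM (imported from util.embeddings_manager) is where the "weighted sum of the
# document's constituent embeddings" happens. In its simplest form this is a matrix
# product between the (n_docs, vocab_size) weighted document-term matrix and the
# (vocab_size, embed_dim) word-embedding matrix; the sif=True option additionally
# applies SIF-style post-processing, which is not reproduced here. A minimal,
# assumption-laden sketch of the plain case:
def _sketch_weighted_sum_embeddings(X_tfidf, word_embeddings):
    """
    :param X_tfidf: scipy sparse or np.ndarray of shape (n_docs, vocab_size), e.g. TFIDF weights.
    :param word_embeddings: np.ndarray of shape (vocab_size, embed_dim); row i embeds word i.
    :return: np.ndarray of shape (n_docs, embed_dim), one embedding per document.
    """
    import numpy as np
    return np.asarray(X_tfidf @ word_embeddings)
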
class WordClassGen(ViewGen):
    """
    View Generator (w): generates document representation via Word-Class-Embeddings.
    Document embeddings are obtained via weighted sum of document's constituent embeddings.
    """
    def __init__(self, n_jobs=-1):
        """
        Init WordClassGen.
        :param n_jobs: int, number of concurrent workers.
        """
        super().__init__()
        self.n_jobs = n_jobs
        self.langs = None
        self.lWce = None
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

    def fit(self, lX, ly):
        """
        (1) Vectorizes documents; (2) computes the Word-Class Embedding matrices from the vectorized documents
        and the target labels.
        :param lX: dict {lang: indexed documents}
        :param ly: dict {lang: target vectors}
        :return: self.
        """
        print('# Fitting WordClassGen (W)...')
        lX = self.vectorizer.fit_transform(lX)
        self.langs = sorted(lX.keys())
        wce = Parallel(n_jobs=self.n_jobs)(
            delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs)
        self.lWce = {l: wce[i] for i, l in enumerate(self.langs)}
        # TODO: featureweight.fit()
        return self

    def transform(self, lX):
        """
        (1) Vectorizes documents; (2) computes the weighted sum of Word-Class Embeddings at document level;
        finally, (3) applies L2 normalization to the embedding and returns it.
        :param lX: dict {lang: indexed documents}
        :return: document projection to the common latent space.
        """
        lX = self.vectorizer.transform(lX)
        XdotWce = Parallel(n_jobs=self.n_jobs)(
            delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs)
        lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)}
        lWce = _normalize(lWce, l2=True)
        return lWce

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)

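# --- illustrative sketch (not part of the original module) ---
# wce_matrix (imported from util.embeddings_manager) builds the Word-Class Embeddings:
# each word is embedded by how it correlates with the target classes, so the matrix has
# shape (vocab_size, n_classes). The sketch below uses a simple normalized term-class
# co-occurrence as the correlation measure; the actual implementation may use a
# different correlation function, so treat this purely as an illustration of the shape
# and of the idea.
def _sketch_wce(X, Y):
    """
    :param X: np.ndarray of shape (n_docs, vocab_size), term weights (e.g., TFIDF).
    :param Y: np.ndarray of shape (n_docs, n_classes), binary label matrix.
    :return: np.ndarray of shape (vocab_size, n_classes).
    """
    import numpy as np
    counts = np.asarray(X.T @ Y, dtype=float)  # term-class co-occurrence mass
    return counts / np.maximum(counts.sum(axis=1, keepdims=True), 1e-9)  # row-normalize per term
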
class RecurrentGen(ViewGen):
    """
    View Generator (g): generates document embedding by means of a Gated Recurrent Unit (GRU). The model can be
    initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
    Output dimension is (n_docs, 512). The training happens end-to-end. At inference time, the model returns
    the network internal state at the second feed-forward layer. Training metrics are logged via TensorBoard.
    """
    def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
                 gpus=0, n_jobs=-1, stored_path=None):
        """
        Init RecurrentGen.
        :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
        indexed by language code.
        :param pretrained_embeddings: dict {lang: tensor of embeddings}, it contains the pretrained embeddings to use
        as embedding layer.
        :param wce: Bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, the supervised
        embeddings are concatenated to the deployed pretrained embeddings. WCE dimensionality is equal to
        the number of target classes.
        :param batch_size: int, number of samples in a batch.
        :param nepochs: int, number of max epochs to train the model.
        :param gpus: int, specifies how many GPUs to use per node. If False, computation will take place on cpu.
        :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading).
        :param stored_path: str, path to a pretrained model. If None, the model will be trained from scratch.
        """
        super().__init__()
        self.multilingualIndex = multilingualIndex
        self.langs = multilingualIndex.langs
        self.batch_size = batch_size
        self.gpus = gpus
        self.n_jobs = n_jobs
        self.stored_path = stored_path
        self.nepochs = nepochs

        # EMBEDDINGS to be deployed
        self.pretrained = pretrained_embeddings
        self.wce = wce

        self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
        self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
        self.model = self._init_model()
        self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
        # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev')

    def _init_model(self):
        lpretrained = self.multilingualIndex.l_embeddings()
        if self.stored_path:
            return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained)
        langs = self.multilingualIndex.langs
        output_size = self.multilingualIndex.get_target_dim()
        hidden_size = 512
        lvocab_size = self.multilingualIndex.l_vocabsize()
        learnable_length = 0
        return RecurrentModel(
            lPretrained=lpretrained,
            langs=langs,
            output_size=output_size,
            hidden_size=hidden_size,
            lVocab_size=lvocab_size,
            learnable_length=learnable_length,
            drop_embedding_range=self.multilingualIndex.sup_range,
            drop_embedding_prop=0.5,
            gpus=self.gpus
        )

    def fit(self, lX, ly):
        """
        Trains the neural network end-to-end.
        lX and ly are not directly used: the data is rather taken from the multilingual index passed at
        instantiation time, through the Dataset object (RecurrentDataset) wrapped in the RecurrentDataModule.
        :param lX: dict {lang: indexed documents}
        :param ly: dict {lang: target vectors}
        :return: self.
        """
        print('# Fitting RecurrentGen (G)...')
        recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs)
        trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs,
                          checkpoint_callback=False)

        # vanilla_torch_model = torch.load(
        #     '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle')
        # self.model.linear0 = vanilla_torch_model.linear0
        # self.model.linear1 = vanilla_torch_model.linear1
        # self.model.linear2 = vanilla_torch_model.linear2
        # self.model.rnn = vanilla_torch_model.rnn

        trainer.fit(self.model, datamodule=recurrentDataModule)
        trainer.test(self.model, datamodule=recurrentDataModule)
        return self

    def transform(self, lX):
        """
        Projects documents to the common latent space. Output dimensionality is 512.
        :param lX: dict {lang: indexed documents}
        :return: documents projected to the common latent space.
        """
        l_pad = self.multilingualIndex.l_pad()
        data = self.multilingualIndex.l_devel_index()
        self.model.to('cuda' if self.gpus else 'cpu')
        self.model.eval()
        time_init = time()
        l_embeds = self.model.encode(data, l_pad, batch_size=256)
        transform_time = round(time() - time_init, 3)
        print(f'Executed! Transform took: {transform_time}s')
        return l_embeds

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)

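# --- illustrative example (not part of the original module) ---
# A rough sketch of how this view generator is wired together, assuming a
# MultilingualIndex has already been built from the dataset and MUSE vectors have been
# loaded (both handled elsewhere in this project); the argument names and preparation
# steps are simplified for the example.
def _example_recurrent_gen(multilingual_index, muse_embeddings, lX, ly):
    gru_view = RecurrentGen(multilingual_index,
                            pretrained_embeddings=muse_embeddings,
                            wce=True,        # concatenate Word-Class Embeddings to the pretrained ones
                            batch_size=256,
                            nepochs=25,
                            gpus=1)
    lZ = gru_view.fit_transform(lX, ly)      # dict {lang: (n_docs, 512) document embeddings}
    return lZ
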
class BertGen(ViewGen):
    """
    View Generator (b): generates document embedding via the mBERT model. The training happens end-to-end.
    At inference time, the model returns the network internal state at the last original layer (i.e., the 12th).
    Document embeddings are the hidden state associated with the "start" ([CLS]) token. Training metrics are
    logged via TensorBoard.
    """
    def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None):
        """
        Init BertGen.
        :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
        indexed by language code.
        :param batch_size: int, number of samples per batch.
        :param nepochs: int, number of max epochs to train the model.
        :param gpus: int, specifies how many GPUs to use per node. If False, computation will take place on cpu.
        :param n_jobs: int, number of concurrent workers.
        :param stored_path: str, path to a pretrained model. If None, the model will be trained from scratch.
        """
        super().__init__()
        self.multilingualIndex = multilingualIndex
        self.nepochs = nepochs
        self.gpus = gpus
        self.batch_size = batch_size
        self.n_jobs = n_jobs
        self.stored_path = stored_path
        self.model = self._init_model()
        self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)

    def _init_model(self):
        output_size = self.multilingualIndex.get_target_dim()
        return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus)

    def fit(self, lX, ly):
        """
        Trains the neural network end-to-end.
        lX and ly are not directly used: the data is rather taken from the multilingual index passed at
        instantiation time, through the Dataset object wrapped in the BertDataModule.
        :param lX: dict {lang: indexed documents}
        :param ly: dict {lang: target vectors}
        :return: self.
        """
        print('# Fitting BertGen (B)...')
        self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
        bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512)
        trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus,
                          logger=self.logger, checkpoint_callback=False)
        trainer.fit(self.model, datamodule=bertDataModule)
        trainer.test(self.model, datamodule=bertDataModule)
        return self

    def transform(self, lX):
        """
        Projects documents to the common latent space. Output dimensionality is 768.
        :param lX: dict {lang: indexed documents}
        :return: documents projected to the common latent space.
        """
        data = self.multilingualIndex.l_devel_raw_index()
        data = tokenize(data, max_len=512)
        self.model.to('cuda' if self.gpus else 'cpu')
        self.model.eval()
        time_init = time()
        l_embeds = self.model.encode(data, batch_size=64)
        transform_time = round(time() - time_init, 3)
        print(f'Executed! Transform took: {transform_time}s')
        return l_embeds

    def fit_transform(self, lX, ly):
        # we can assume that the data has already been indexed for transform() since fit() is called first
        return self.fit(lX, ly).transform(lX)

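# --- illustrative sketch (not part of the original module) ---
# The "state associated with the start token" mentioned above is, in Hugging Face terms,
# the last-layer hidden state of the [CLS] token. BertModel (models.pl_bert) wraps this
# behaviour; the sketch below shows the bare idea with the transformers library directly,
# using bert-base-multilingual-cased as an assumed checkpoint.
def _sketch_cls_embedding(docs):
    import torch
    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = AutoModel.from_pretrained('bert-base-multilingual-cased')
    model.eval()
    with torch.no_grad():
        batch = tokenizer(docs, padding=True, truncation=True, max_length=512, return_tensors='pt')
        out = model(**batch)
    return out.last_hidden_state[:, 0, :]  # (n_docs, 768): [CLS] hidden state per document
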