merged with refactor

This commit is contained in:
andrea 2021-01-26 15:12:28 +01:00
parent a5912a22a9
commit ca179aca23
29 changed files with 0 additions and 4654 deletions

View File

@@ -1,222 +0,0 @@
import numpy as np
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
N_WORKERS = 8
class RecurrentDataset(Dataset):
def __init__(self, lX, ly, lPad_index):
"""
:param lX: dict {lang_id : np.ndarray}
:param ly:
"""
self.lX = []
self.ly = []
self.lOffset = {}
self.lPad_index = lPad_index
for lang, data in lX.items():
offset = [len(self.lX)]
self.lX.extend(data)
offset.append(len(self.lX))
self.lOffset[lang] = offset
for lang, target in ly.items():
self.ly.extend(target)
def __len__(self):
return len(self.lX)
def __getitem__(self, index):
X = self.lX[index]
y = self.ly[index]
return X, y, index, self._get_lang(index)
def _get_lang(self, index):
for lang, l_range in self.lOffset.items():
if l_range[0] <= index < l_range[1]:
return lang
def collate_fn(self, data):
"""
Takes care of padding the batch and also check consistency of batch languages. Groups into dict {lang : lang_batch}
items sampled from the Dataset class.
:param data:
:return:
"""
lX_batch = {}
ly_batch = {}
# group items by language; lists are created lazily so that languages may appear in any order within the batch
for d in data:
current_lang = d[-1]
if current_lang not in lX_batch:
lX_batch[current_lang] = []
ly_batch[current_lang] = []
lX_batch[current_lang].append(d[0])
ly_batch[current_lang].append(d[1])
for lang in lX_batch.keys():
lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang],
max_pad_length=self.define_pad_length(lX_batch[lang]))
lX_batch[lang] = torch.LongTensor(lX_batch[lang])
ly_batch[lang] = torch.FloatTensor(ly_batch[lang])
return lX_batch, ly_batch
@staticmethod
def define_pad_length(index_list):
lengths = [len(index) for index in index_list]
return int(np.mean(lengths) + np.std(lengths))
@staticmethod
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i, indexes in enumerate(index_list):
index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
return index_list
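# Illustrative sketch (not part of the original module): how the two static helpers of
# RecurrentDataset interact. The token indices and the pad index (0) are made-up values.
def _pad_usage_example():
    batch = [[5, 8, 2], [7, 1], [3, 4, 9, 6, 2]]
    pad_to = RecurrentDataset.define_pad_length(batch)  # mean batch length + one standard deviation
    padded = RecurrentDataset.pad(batch, pad_index=0, max_pad_length=pad_to)
    # every sequence is now left-padded (or truncated) to the same length, so it can be
    # stacked into a single torch.LongTensor, as collate_fn does
    return torch.LongTensor(padded)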
class RecurrentDataModule(pl.LightningDataModule):
"""
Pytorch Lightning Datamodule to be deployed with RecurrentGen.
https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
"""
def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1):
"""
Init RecurrentDataModule.
:param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
indexed by language code.
:param batchsize: int, number of samples per batch.
:param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading).
"""
self.multilingualIndex = multilingualIndex
self.batchsize = batchsize
self.n_jobs = n_jobs
super().__init__()
def prepare_data(self, *args, **kwargs):
pass
def setup(self, stage=None):
if stage == 'fit' or stage is None:
l_train_index, l_train_target = self.multilingualIndex.l_train()
# Debug settings: reducing number of samples
l_train_index = {l: train[:5] for l, train in l_train_index.items()}
l_train_target = {l: target[:5] for l, target in l_train_target.items()}
self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
lPad_index=self.multilingualIndex.l_pad())
l_val_index, l_val_target = self.multilingualIndex.l_val()
# Debug settings: reducing number of samples
l_val_index = {l: train[:5] for l, train in l_val_index.items()}
l_val_target = {l: target[:5] for l, target in l_val_target.items()}
self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
lPad_index=self.multilingualIndex.l_pad())
if stage == 'test' or stage is None:
l_test_index, l_test_target = self.multilingualIndex.l_test()
# Debug settings: reducing number of samples
l_test_index = {l: train[:5] for l, train in l_test_index.items()}
l_test_target = {l: target[:5] for l, target in l_test_target.items()}
self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
lPad_index=self.multilingualIndex.l_pad())
def train_dataloader(self):
return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
collate_fn=self.training_dataset.collate_fn)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
collate_fn=self.val_dataset.collate_fn)
def test_dataloader(self):
return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs,
collate_fn=self.test_dataset.collate_fn)
def tokenize(l_raw, max_len):
"""
run Bert tokenization on dict {lang: list of samples}.
:param l_raw:
:param max_len:
:return:
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
l_tokenized = {}
for lang in l_raw.keys():
output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length')
l_tokenized[lang] = output_tokenizer['input_ids']
return l_tokenized
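# Minimal usage sketch for tokenize() (illustrative only; the two sentences are made up).
# It returns, per language, the input_ids produced by the multilingual BERT tokenizer,
# already truncated/padded to max_len and ready to be wrapped into a RecurrentDataset.
def _tokenize_usage_example():
    l_raw = {'en': ['a short English document'], 'it': ['un breve documento italiano']}
    l_tokenized = tokenize(l_raw, max_len=16)
    # l_tokenized['en'][0] is a list of 16 token ids ([CLS] ... [SEP] + padding)
    return l_tokenized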
class BertDataModule(RecurrentDataModule):
"""
Pytorch Lightning Datamodule to be deployed with BertGen.
https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html
"""
def __init__(self, multilingualIndex, batchsize=64, max_len=512):
"""
Init BertDataModule.
:param multilingualIndex: MultilingualIndex, a dictionary of training and test documents
indexed by language code.
:param batchsize: int, number of samples per batch.
:param max_len: int, max number of tokens per document. The absolute cap for BERT is 512.
"""
super().__init__(multilingualIndex, batchsize)
self.max_len = max_len
def setup(self, stage=None):
if stage == 'fit' or stage is None:
l_train_raw, l_train_target = self.multilingualIndex.l_train_raw()
# Debug settings: reducing number of samples
l_train_raw = {l: train[:5] for l, train in l_train_raw.items()}
l_train_target = {l: target[:5] for l, target in l_train_target.items()}
l_train_index = tokenize(l_train_raw, max_len=self.max_len)
self.training_dataset = RecurrentDataset(l_train_index, l_train_target,
lPad_index=self.multilingualIndex.l_pad())
l_val_raw, l_val_target = self.multilingualIndex.l_val_raw()
# Debug settings: reducing number of samples
l_val_raw = {l: train[:5] for l, train in l_val_raw.items()}
l_val_target = {l: target[:5] for l, target in l_val_target.items()}
l_val_index = tokenize(l_val_raw, max_len=self.max_len)
self.val_dataset = RecurrentDataset(l_val_index, l_val_target,
lPad_index=self.multilingualIndex.l_pad())
if stage == 'test' or stage is None:
l_test_raw, l_test_target = self.multilingualIndex.l_test_raw()
# Debug settings: reducing number of samples
l_test_raw = {l: train[:5] for l, train in l_test_raw.items()}
l_test_target = {l: target[:5] for l, target in l_test_target.items()}
l_test_index = tokenize(l_test_raw, max_len=self.max_len)
self.test_dataset = RecurrentDataset(l_test_index, l_test_target,
lPad_index=self.multilingualIndex.l_pad())
def train_dataloader(self):
"""
NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files"
:return:
"""
return DataLoader(self.training_dataset, batch_size=self.batchsize)
def val_dataloader(self):
return DataLoader(self.val_dataset, batch_size=self.batchsize)
def test_dataloader(self):
return DataLoader(self.test_dataset, batch_size=self.batchsize)
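# Hedged usage sketch (illustrative only): wiring the datamodules above into a pl.Trainer.
# `multilingual_index` and `model` are placeholders for objects built elsewhere in the project
# (a MultilingualIndex and a LightningModule such as RecurrentGen/BertGen); they are assumptions here.
def _datamodule_usage_example(multilingual_index, model):
    datamodule = RecurrentDataModule(multilingual_index, batchsize=64, n_jobs=N_WORKERS)
    trainer = pl.Trainer(max_epochs=1)
    trainer.fit(model, datamodule=datamodule)
    trainer.test(model, datamodule=datamodule)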

View File

@@ -1,712 +0,0 @@
import itertools
import pickle
import re
from os.path import exists
import numpy as np
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from data.reader.jrcacquis_reader import *
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents
class MultilingualDataset:
"""
A multilingual dataset is a dictionary of training and test documents indexed by language code.
Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the
documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
labels of each document, and ids is a list of document-identifiers from the original collection.
"""
def __init__(self):
self.dataset_name = ""
self.multiling_dataset = {}
def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))
def save(self, file):
self.sort_indexes()
pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL)
return self
def __getitem__(self, item):
if item in self.langs():
return self.multiling_dataset[item]
return None
@classmethod
def load(cls, file):
data = pickle.load(open(file, 'rb'))
data.sort_indexes()
return data
@classmethod
def load_ids(cls, file):
data = pickle.load(open(file, 'rb'))
tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()}
te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()}
return tr_ids, te_ids
def sort_indexes(self):
for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items():
if issparse(Xtr): Xtr.sort_indices()
if issparse(Xte): Xte.sort_indices()
def set_view(self, categories=None, languages=None):
if categories is not None:
if isinstance(categories, int):
categories = np.array([categories])
elif isinstance(categories, list):
categories = np.array(categories)
self.categories_view = categories
if languages is not None:
self.languages_view = languages
def training(self, mask_numbers=False, target_as_csr=False):
return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)
def test(self, mask_numbers=False, target_as_csr=False):
return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)
def lXtr(self, mask_numbers=False):
proc = lambda x:_mask_numbers(x) if mask_numbers else x
# return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
def lXte(self, mask_numbers=False):
proc = lambda x: _mask_numbers(x) if mask_numbers else x
# return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}
return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}
def lYtr(self, as_csr=False):
lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
if as_csr:
lY = {l:csr_matrix(Y) for l,Y in lY.items()}
return lY
def lYte(self, as_csr=False):
lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
if as_csr:
lY = {l:csr_matrix(Y) for l,Y in lY.items()}
return lY
def cat_view(self, Y):
if hasattr(self, 'categories_view'):
return Y[:,self.categories_view]
else:
return Y
def langs(self):
if hasattr(self, 'languages_view'):
langs = self.languages_view
else:
langs = sorted(self.multiling_dataset.keys())
return langs
def num_categories(self):
return self.lYtr()[self.langs()[0]].shape[1]
def show_dimensions(self):
def shape(X):
return X.shape if hasattr(X, 'shape') else len(X)
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
if lang not in self.langs(): continue
print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape))
def show_category_prevalences(self):
nC = self.num_categories()
accum_tr = np.zeros(nC, dtype=int)
accum_te = np.zeros(nC, dtype=int)
in_langs = np.zeros(nC, dtype=int) # count languages with at least one positive example (per category)
for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
if lang not in self.langs(): continue
prev_train = np.sum(self.cat_view(Ytr), axis=0)
prev_test = np.sum(self.cat_view(Yte), axis=0)
accum_tr += prev_train
accum_te += prev_test
in_langs += (prev_train>0)*1
print(lang+'-train', prev_train)
print(lang+'-test', prev_test)
print('all-train', accum_tr)
print('all-test', accum_te)
return accum_tr, accum_te, in_langs
def set_labels(self, labels):
self.labels = labels
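# Minimal usage sketch (illustrative; the path is a placeholder): loading a pickled
# MultilingualDataset and inspecting its language-indexed training and test matrices.
def _multilingual_dataset_usage_example(dataset_path='some_dataset.pickle'):
    data = MultilingualDataset.load(dataset_path)
    data.show_dimensions()
    lXtr, lYtr = data.training()  # dicts {lang: X} and {lang: Y}
    lXte, lYte = data.test()
    return lXtr, lYtr, lXte, lYte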
def _mask_numbers(data):
mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b')
mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b')
mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b')
mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b')
mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b')
masked = []
for text in tqdm(data, desc='masking numbers'):
text = ' ' + text
text = mask_moredigit.sub(' MoreDigitMask', text)
text = mask_4digit.sub(' FourDigitMask', text)
text = mask_3digit.sub(' ThreeDigitMask', text)
text = mask_2digit.sub(' TwoDigitMask', text)
text = mask_1digit.sub(' OneDigitMask', text)
masked.append(text.replace('.','').replace(',','').strip())
return masked
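# Illustrative sketch of what _mask_numbers does (the sentence is made up): digit sequences are
# replaced by placeholder tokens according to their length, so that the vocabulary is not
# flooded with numeric tokens.
def _mask_numbers_example():
    masked = _mask_numbers(['the directive 2006 was amended 3 times'])
    # -> ['the directive FourDigitMask was amended OneDigitMask times']
    return masked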
# ----------------------------------------------------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------------------------------------------------
def get_active_labels(doclist):
cat_list = set()
for d in doclist:
cat_list.update(d.categories)
return list(cat_list)
def filter_by_categories(doclist, keep_categories):
catset = frozenset(keep_categories)
for d in doclist:
d.categories = list(set(d.categories).intersection(catset))
def __years_to_str(years):
if isinstance(years, list):
if len(years) > 1:
return str(years[0])+'-'+str(years[-1])
return str(years[0])
return str(years)
# ----------------------------------------------------------------------------------------------------------------------
# Matrix builders
# ----------------------------------------------------------------------------------------------------------------------
def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True):
"""
Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
i.e., each language-specific matrix lies in a dedicated feature space.
:param dataset_name: the name of the dataset (str)
:param langs: list of languages (str)
:param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param label_names: list of names of labels (str)
:param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages
:param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
:return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
by language the processed wikipedia documents in their respective language-specific feature spaces
"""
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
lW = {}
multilingual_dataset = MultilingualDataset()
multilingual_dataset.dataset_name = dataset_name
multilingual_dataset.set_labels(mlb.classes_)
for lang in langs:
print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
(len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang))
tr_data, tr_labels, IDtr = zip(*training_docs[lang])
te_data, te_labels, IDte = zip(*test_docs[lang])
if preprocess:
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True,
tokenizer=NLTKStemTokenizer(lang, verbose=True),
stop_words=stopwords.words(NLTK_LANGMAP[lang]))
else:
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
Xtr = tfidf.fit_transform(tr_data)
Xte = tfidf.transform(te_data)
if wiki_docs:
lW[lang] = tfidf.transform(wiki_docs[lang])
Ytr = mlb.transform(tr_labels)
Yte = mlb.transform(te_labels)
multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
multilingual_dataset.show_dimensions()
multilingual_dataset.show_category_prevalences()
if wiki_docs:
return multilingual_dataset, lW
else:
return multilingual_dataset
# creates a MultilingualDataset where all matrices share a single juxtaposed feature space
def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True):
"""
Builds the document-by-term weighted matrices for each language. Representations are not independent of each other,
since all of them lie in the same juxtaposed feature space.
:param dataset_name: the name of the dataset (str)
:param langs: list of languages (str)
:param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
:param label_names: list of names of labels (str)
:param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
:return: a MultilingualDataset
"""
multiling_dataset = MultilingualDataset()
multiling_dataset.dataset_name = dataset_name
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
multiling_dataset.set_labels(mlb.classes_)
tr_data_stack = []
for lang in langs:
print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang))
tr_data, tr_labels, tr_ID = zip(*training_docs[lang])
te_data, te_labels, te_ID = zip(*test_docs[lang])
if preprocess:
tr_data = preprocess_documents(tr_data, lang)
te_data = preprocess_documents(te_data, lang)
tr_data_stack.extend(tr_data)
multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID)
tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
tfidf.fit(tr_data_stack)
for lang in langs:
print("\nweighting documents for language <%s>" % (lang))
(tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang]
Xtr = tfidf.transform(tr_data)
Xte = tfidf.transform(te_data)
Ytr = mlb.transform(tr_labels)
Yte = mlb.transform(te_labels)
multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID)
multiling_dataset.show_dimensions()
return multiling_dataset
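# Hedged sketch (illustrative; the document maps are placeholders): the two builders above differ
# only in the feature space -- build_independent_matrices fits one TfidfVectorizer per language,
# whereas build_juxtaposed_matrices fits a single vectorizer on all training documents at once.
def _matrix_builders_example(training_docs, test_docs, label_names, langs=('en', 'it')):
    independent = build_independent_matrices('toy', list(langs), training_docs, test_docs, label_names)
    juxtaposed = build_juxtaposed_matrices('toy', list(langs), training_docs, test_docs, label_names)
    return independent, juxtaposed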
# ----------------------------------------------------------------------------------------------------------------------
# Methods to recover the original documents from the MultilingualDataset's ids
# ----------------------------------------------------------------------------------------------------------------------
"""
This method has been added a posteriori to create document embeddings using the polylingual embeddings of the
article 'Word Translation without Parallel Data'; it takes one of the splits, retrieves the RCV documents
from the doc ids, and then pickles an object (tr_docs, te_docs, label_names) into outpath.
"""
def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath):
tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
langs = list(tr_ids.keys())
print('fetching the datasets')
rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
filter_by_categories(rcv1_documents, labels_rcv2)
filter_by_categories(rcv2_documents, labels_rcv1)
label_names = get_active_labels(rcv1_documents + rcv2_documents)
print('Active labels in RCV1/2 {}'.format(len(label_names)))
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
all_docs = rcv1_documents + rcv2_documents
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
dataset = MultilingualDataset()
for lang in langs:
analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]])
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
dataset.save(outpath)
"""
Same thing but for JRC-Acquis
"""
def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath):
tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
langs = list(tr_ids.keys())
print('fetching the datasets')
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
cat_filter=cat_list, cat_threshold=1, parallel=None,
most_frequent=most_common_cat)
test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
parallel='force')
def filter_by_id(doclist, ids):
ids_set = frozenset(itertools.chain.from_iterable(ids.values()))
return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set]
training_docs = filter_by_id(training_docs, tr_ids)
test_docs = filter_by_id(test_docs, te_ids)
print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names)))
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
dataset = MultilingualDataset()
for lang in langs:
analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang])
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)
dataset.save(outpath)
# ----------------------------------------------------------------------------------------------------------------------
# Dataset Generators
# ----------------------------------------------------------------------------------------------------------------------
def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
"""
Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
In all cases, training documents are strictly non-parallel, and test documents are strictly parallel
:param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where
all splits will be generated
:param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
:param langs: the list of languages to consider (as defined in data/languages.py)
:param train_years: a list of ints containing the years to be considered as training documents
:param test_years: a list of ints containing the years to be considered as test documents
:param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all"
(select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the
leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details
:param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all
:param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
:param run: a numeric label naming the random split (useful to keep track of different runs)
:return: None
"""
name = 'JRCacquis'
run = '_run' + str(run)
config_name = 'jrc_nltk_' + __years_to_str(train_years) + \
'vs' + __years_to_str(test_years) + \
'_' + cat_policy + \
('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \
'_noparallel_processed'
indep_path = join(jrc_data_home, config_name + run + '.pickle')
upper_path = join(jrc_data_home, config_name + run + '_upper.pickle')
yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle')
wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle')
wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle')
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
cat_filter=cat_list, cat_threshold=1, parallel=None,
most_frequent=most_common_cat)
test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
parallel='force')
print('Generating feature-independent dataset...')
training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs)
def _group_by_lang(doc_list, langs):
return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang]
for lang in langs}
training_docs = _group_by_lang(training_docs, langs)
training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs)
test_docs = _group_by_lang(test_docs, langs)
if not exists(indep_path):
wiki_docs=None
if max_wiki>0:
if not exists(wiki_docs_path):
wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
if wiki_docs:
lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs)
pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names)
lang_data.save(indep_path)
print('Generating upper-bound (English-only) dataset...')
if not exists(upper_path):
training_docs_eng_only = {'en':training_docs['en']}
test_docs_eng_only = {'en':test_docs['en']}
build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path)
print('Generating juxtaposed dataset...')
if not exists(yuxta_path):
build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path)
def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
"""
Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
"feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
:param outpath: path where all splits will be dumped
:param rcv1_data_home: path to the RCV1-v2 dataset (English only)
:param rcv2_data_home: path to the RCV2 dataset (all languages other than English)
:param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
:param langs: the list of languages to consider (as defined in data/languages.py)
:param train_for_lang: maximum number of training documents per language
:param test_for_lang: maximum number of test documents per language
:param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
:param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming)
:param run: a numeric label naming the random split (useful to keep track of different runs)
:return: None
"""
assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets'
assert len(langs)>1, 'the multilingual dataset cannot be built with only one language'
assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \
"languages not in RCV1-v2/RCV2 scope or not valid for NLTK's processing"
name = 'RCV1/2'
run = '_run' + str(run)
config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\
('_processed' if preprocess else '_raw')
indep_path = join(outpath, config_name + run + '.pickle')
upper_path = join(outpath, config_name + run +'_upper.pickle')
yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle')
wiki_path = join(outpath, config_name + run + '.wiki.pickle')
wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle')
print('fetching the datasets')
rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en'])
filter_by_categories(rcv1_documents, labels_rcv2)
filter_by_categories(rcv2_documents, labels_rcv1)
label_names = get_active_labels(rcv1_documents+rcv2_documents)
print('Active labels in RCV1/2 {}'.format(len(label_names)))
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs}
# for the upper bound there are no parallel versions, so for the English case we take as many training documents
# as the multilingual case would have in total; afterwards, only the first train_for_lang of them are kept for the
# other datasets
print('Generating upper-bound (English-only) dataset...')
train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True)
train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]}
test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]}
build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path)
train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang]
for lang in langs:
if lang=='en': continue # already split
test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang)
train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True)
train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train]
test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test]
print('Generating feature-independent dataset...')
wiki_docs=None
if max_wiki>0:
if not exists(wiki_docs_path):
wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
if wiki_docs:
lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
lang_data.save(indep_path)
print('Generating juxtaposed dataset...')
build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)
# ----------------------------------------------------------------------------------------------------------------------
# Methods to generate full RCV and JRC datasets
# ----------------------------------------------------------------------------------------------------------------------
def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs):
print('fetching the datasets')
rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test')
rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])
filter_by_categories(rcv1_train_documents, labels_rcv2)
filter_by_categories(rcv1_test_documents, labels_rcv2)
filter_by_categories(rcv2_documents, labels_rcv1)
label_names = get_active_labels(rcv1_train_documents + rcv2_documents)
print('Active labels in RCV1/2 {}'.format(len(label_names)))
print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names)))
print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents
lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs}
def get_ids(doclist):
return frozenset([d.id for d in doclist])
tr_ids = {'en': get_ids(rcv1_train_documents)}
te_ids = {'en': get_ids(rcv1_test_documents)}
for lang in langs:
if lang == 'en': continue
tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3)
dataset = MultilingualDataset()
dataset.dataset_name = 'RCV1/2-full'
for lang in langs:
print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} test documents')
analyzer = CountVectorizer(
strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
).build_analyzer()
Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]])
Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
dataset.save(outpath)
def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):
print('fetching the datasets')
cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
training_docs, label_names = fetch_jrcacquis(
langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
)
test_docs, _ = fetch_jrcacquis(
langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
)
def _group_by_lang(doc_list, langs):
return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}
training_docs = _group_by_lang(training_docs, langs)
test_docs = _group_by_lang(test_docs, langs)
mlb = MultiLabelBinarizer()
mlb.fit([label_names])
dataset = MultilingualDataset()
dataset.dataset_name = 'JRC-Acquis-full'
for lang in langs:
analyzer = CountVectorizer(
strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
).build_analyzer()
Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
Xtr = [' '.join(analyzer(d)) for d in Xtr]
Xte = [' '.join(analyzer(d)) for d in Xte]
Ytr = mlb.transform(Ytr)
Yte = mlb.transform(Yte)
dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
dataset.save(outpath)
#-----------------------------------------------------------------------------------------------------------------------
# MAIN BUILDER
#-----------------------------------------------------------------------------------------------------------------------
if __name__=='__main__':
import sys
RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
RCV2_PATH = '../Datasets/RCV2'
JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
# full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
sys.exit(0)
# datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle'
# data = MultilingualDataset.load(datasetpath)
# data.dataset_name='JRC-Acquis-full'#'RCV1/2-full'
# for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
# (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
# data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
# data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle')
# sys.exit(0)
assert len(sys.argv) == 5, "wrong number of arguments; required: " \
"<JRC_PATH> <RCV1_PATH> <RCV2_PATH> <WIKI_PATH> "
JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3"
RCV1_PATH = sys.argv[2] #'../Datasets/RCV1-v2/unprocessed_corpus'
RCV2_PATH = sys.argv[3] #'../Datasets/RCV2'
WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK"
langs = lang_set['JRC_NLTK']
max_wiki = 5000
for run in range(0,10):
print('Building JRC-Acquis datasets run', run)
prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs,
train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki,
cat_policy='all', most_common_cat=300, run=run)
print('Building RCV1-v2/2 datasets run', run)
prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'],
train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run)
# uncomment this code if you want to retrieve the original documents to generate the data splits for PLE
# (make sure you have not modified the above parameters, or adapt the following paths accordingly...)
# datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run))
# outpath = datasetpath.replace('_nltk_','_doclist_')
# retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath)
# datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run))
# outpath = datasetpath.replace('_nltk_', '_doclist_')
# retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath)

View File

@@ -1,42 +0,0 @@
"""
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""
NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german',
'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'}
# top 10 languages in Wikipedia, ordered by number of articles
# LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro']
# all languages in JRC-Acquis v3
JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv']
JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # Romanian removed due to incompatibility issues
RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl']
RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl']
lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS,
'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS}
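# Minimal usage sketch (illustrative): NLTK_LANGMAP maps ISO codes to the language names expected
# by NLTK's stopword lists and stemmers, while lang_set groups the language lists used across the
# project (requires the NLTK stopwords corpus to be downloaded).
def _language_maps_example():
    from nltk.corpus import stopwords
    for lang in lang_set['RCV2_NLTK']:
        sw = stopwords.words(NLTK_LANGMAP[lang])  # e.g., 'de' -> 'german'
        print(lang, len(sw))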

View File

@@ -1,324 +0,0 @@
from __future__ import print_function
import os
import pickle
import sys
import tarfile
import xml.etree.ElementTree as ET
import zipfile
from collections import Counter
from os.path import join
from random import shuffle
import rdflib
from rdflib.namespace import RDF, SKOS
from sklearn.datasets import get_data_home
from data.languages import JRC_LANGS
from data.languages import lang_set
from util.file import download_file, list_dirs, list_files
"""
JRC Acquis' Nomenclature:
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""
class JRCAcquis_Document:
def __init__(self, id, name, lang, year, head, body, categories):
self.id = id
self.parallel_id = name
self.lang = lang
self.year = year
self.text = body if not head else head + "\n" + body
self.categories = categories
# this is a workaround: for some reason, acute accents are encoded in a non-standard manner in titles;
# however, the title often appears as the first paragraph of the text/body (with standard encoding),
# so it might be preferable not to read the header at all (as is the default here)
def _proc_acute(text):
for ch in ['a','e','i','o','u']:
text = text.replace('%'+ch+'acute%',ch)
return text
def parse_document(file, year, head=False):
root = ET.parse(file).getroot()
doc_name = root.attrib['n'] # e.g., '22006A0211(01)'
doc_lang = root.attrib['lang'] # e.g., 'es'
doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es'
doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])
def raise_if_empty(field, from_file):
if isinstance(field, str):
if not field.strip():
raise ValueError("Empty field in file %s" % from_file)
raise_if_empty(doc_name, file)
raise_if_empty(doc_lang, file)
raise_if_empty(doc_id, file)
if head: raise_if_empty(doc_head, file)
raise_if_empty(doc_body, file)
return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)
# removes documents without a counterpart in all other languages
def _force_parallel(doclist, langs):
n_langs = len(langs)
par_id_count = Counter([d.parallel_id for d in doclist])
parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs])
return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]
def random_sampling_avoiding_parallel(doclist):
random_order = list(range(len(doclist)))
shuffle(random_order)
sampled_request = []
parallel_ids = set()
for ind in random_order:
pid = doclist[ind].parallel_id
if pid not in parallel_ids:
sampled_request.append(doclist[ind])
parallel_ids.add(pid)
print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
return sampled_request
# filters out documents which do not contain any category in the cat_filter list, and removes all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
if not isinstance(cat_filter, frozenset):
cat_filter = frozenset(cat_filter)
filtered = []
for doc in doclist:
doc.categories = list(cat_filter & set(doc.categories))
if doc.categories:
doc.categories.sort()
filtered.append(doc)
print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered)))
return filtered
# filters out categories with fewer than cat_threshold documents (documents left without categories are discarded)
def _filter_by_frequency(doclist, cat_threshold):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
# selects the most_frequent most common categories (documents left without categories are discarded)
def _most_common(doclist, most_frequent):
cat_count = Counter()
for d in doclist:
cat_count.update(d.categories)
freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)]
freq_categories.sort()
return _filter_by_category(doclist, freq_categories), freq_categories
def _get_categories(request):
final_cats = set()
for d in request:
final_cats.update(d.categories)
return list(final_cats)
def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0,
parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
if not langs:
langs = JRC_LANGS
else:
if isinstance(langs, str): langs = [langs]
for l in langs:
if l not in JRC_LANGS:
raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)
if not data_path:
data_path = get_data_home()
if not os.path.exists(data_path):
os.mkdir(data_path)
request = []
total_read = 0
for l in langs:
file_name = 'jrc-'+l+'.tgz'
archive_path = join(data_path, file_name)
if not os.path.exists(archive_path):
print("downloading language-specific dataset (once and for all) into %s" % data_path)
DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
download_file(DOWNLOAD_URL, archive_path)
print("untarring dataset...")
tarfile.open(archive_path, 'r:gz').extractall(data_path)
documents_dir = join(data_path, l)
print("Reading documents...")
read = 0
for dir in list_dirs(documents_dir):
year = int(dir)
if years is None or year in years:
year_dir = join(documents_dir,dir)
pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
if os.path.exists(pickle_name):
print("loading from file %s" % pickle_name)
l_y_documents = pickle.load(open(pickle_name, "rb"))
read += len(l_y_documents)
else:
l_y_documents = []
all_documents = list_files(year_dir)
empty = 0
for i,doc_file in enumerate(all_documents):
try:
jrc_doc = parse_document(join(year_dir, doc_file), year)
except ValueError:
jrc_doc = None
if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
l_y_documents.append(jrc_doc)
else: empty += 1
if len(all_documents) > 50 and (i + 1) % (len(all_documents) // 50) == 0:
print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='')
read+=1
print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='')
print("\t\t(Pickling object for future runs in %s)" % pickle_name)
pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
request += l_y_documents
print("Read %d documents for language %s\n" % (read, l))
total_read += read
print("Read %d documents in total" % (total_read))
if parallel=='force':
request = _force_parallel(request, langs)
elif parallel == 'avoid':
request = random_sampling_avoiding_parallel(request)
final_cats = _get_categories(request)
if cat_filter:
request = _filter_by_category(request, cat_filter)
final_cats = _get_categories(request)
if cat_threshold > 0:
request, final_cats = _filter_by_frequency(request, cat_threshold)
if most_frequent != -1 and len(final_cats) > most_frequent:
request, final_cats = _most_common(request, most_frequent)
return request, final_cats
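# Hedged usage sketch (illustrative; the data path is a placeholder): fetching a small English/Italian
# JRC-Acquis split restricted to the broadest EuroVoc concepts, with strictly parallel test documents.
def _fetch_jrcacquis_example(data_path='../Datasets/JRC_Acquis_v3'):
    cat_list = inspect_eurovoc(data_path, select='broadest')
    tr_docs, label_names = fetch_jrcacquis(langs=['en', 'it'], data_path=data_path,
                                           years=list(range(2004, 2006)), cat_filter=cat_list,
                                           cat_threshold=1, parallel=None)
    te_docs, _ = fetch_jrcacquis(langs=['en', 'it'], data_path=data_path, years=[2006],
                                 cat_filter=label_names, parallel='force')
    print_cat_analysis(tr_docs)
    return tr_docs, te_docs, label_names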
def print_cat_analysis(request):
cat_count = Counter()
for d in request:
cat_count.update(d.categories)
print("Number of active categories: {}".format(len(cat_count)))
print(cat_count.most_common())
# inspects the EuroVoc thesaurus in order to select a subset of categories
# currently, the policies 'all' (all concepts), 'broadest' (concepts with no broader concept), and 'leaves'
# (concepts that are not broader than any other concept) are implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
select="broadest"):
fullpath_pickle = join(data_path, select+'_concepts.pickle')
if os.path.exists(fullpath_pickle):
print("Pickled object found in %s. Loading it." % fullpath_pickle)
return pickle.load(open(fullpath_pickle,'rb'))
fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
if not os.path.exists(fullpath):
print("Path %s does not exist. Trying to download the SKOS EuroVoc file from %s" % (fullpath, eurovoc_url))
zip_path = fullpath + '.zip'
download_file(eurovoc_url, zip_path)
print("Unzipping file...")
zipped = zipfile.ZipFile(zip_path, 'r')
zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
zipped.close()
print("Parsing %s" %fullpath)
g = rdflib.Graph()
g.parse(location=fullpath, format="application/rdf+xml")
if select == "all":
print("Selecting all concepts")
all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
all_concepts.sort()
selected_concepts = all_concepts
elif select=="broadest":
print("Selecting broadest concepts (those without any other broader concept linked to it)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
narrower_concepts = set(g.subjects(SKOS.broader, None))
broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
broadest_concepts.sort()
selected_concepts = broadest_concepts
elif select=="leaves":
print("Selecting leaves concepts (those not linked as broader of any other concept)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
broad_concepts = set(g.objects(None, SKOS.broader))
leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
leave_concepts.sort()
selected_concepts = leave_concepts
else:
raise ValueError("Selection policy %s is not currently supported" % select)
print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
return selected_concepts
if __name__ == '__main__':
def single_label_fragment(doclist):
single = [d for d in doclist if len(d.categories) < 2]
final_categories = set([d.categories[0] for d in single if d.categories])
print('{} single-label documents ({} categories) from the original {} documents'.format(len(single),
len(final_categories),
len(doclist)))
return single, list(final_categories)
train_years = list(range(1986, 2006))
test_years = [2006]
cat_policy = 'leaves'
most_common_cat = 300
# JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
langs = lang_set['JRC_NLTK']
cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
sys.exit()
training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat)
test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force')
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
training_docs, label_names = single_label_fragment(training_docs)
test_docs, label_namestest = single_label_fragment(test_docs)
print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))

View File

@@ -1,222 +0,0 @@
import re
import xml.etree.ElementTree as ET
from os.path import join, exists
from zipfile import ZipFile
import numpy as np
from util.file import download_file_if_not_exists
from util.file import list_files
"""
RCV2's Nomenclature:
ru = Russian
da = Danish
de = German
es = Spanish
lat = Latin-American Spanish (actually also labelled 'es' in the collection)
fr = French
it = Italian
nl = Dutch
pt = Portuguese
sv = Swedish
ja = Japanese
htw = Chinese
no = Norwegian
"""
RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig"
RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files'
RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/"
RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html"
rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz',
'lyrl2004_tokens_test_pt1.dat.gz',
'lyrl2004_tokens_test_pt2.dat.gz',
'lyrl2004_tokens_test_pt3.dat.gz']
rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz']
rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz'
RCV2_LANG_DIR = {'ru':'REUTE000',
'de':'REUTE00A',
'fr':'REUTE00B',
'sv':'REUTE001',
'no':'REUTE002',
'da':'REUTE003',
'pt':'REUTE004',
'it':'REUTE005',
'es':'REUTE006',
'lat':'REUTE007',
'jp':'REUTE008',
'htw':'REUTE009',
'nl':'REUTERS_'}
class RCV_Document:
def __init__(self, id, text, categories, date='', lang=None):
self.id = id
self.date = date
self.lang = lang
self.text = text
self.categories = categories
class ExpectedLanguageException(Exception): pass
class IDRangeException(Exception): pass
nwords = []
def parse_document(xml_content, assert_lang=None, valid_id_range=None):
root = ET.fromstring(xml_content)
if assert_lang:
if assert_lang not in root.attrib.values():
if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp'
raise ExpectedLanguageException('error: document of a different language')
doc_id = root.attrib['itemid']
if valid_id_range is not None:
if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]:
raise IDRangeException
doc_categories = [cat.attrib['code'] for cat in
root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')]
doc_date = root.attrib['date']
doc_title = root.find('.//title').text
doc_headline = root.find('.//headline').text
doc_body = '\n'.join([p.text for p in root.findall('.//text/p')])
if not doc_body:
raise ValueError('Empty document')
if doc_title is None: doc_title = ''
if doc_headline is None or doc_headline in doc_title: doc_headline = ''
text = '\n'.join([doc_title, doc_headline, doc_body]).strip()
text_length = len(text.split())
global nwords
nwords.append(text_length)
return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang)
def fetch_RCV1(data_path, split='all'):
assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"'
request = []
labels = set()
read_documents = 0
lang = 'en'
training_documents = 23149
test_documents = 781265
if split == 'all':
split_range = (2286, 810596)
expected = training_documents+test_documents
elif split == 'train':
split_range = (2286, 26150)
expected = training_documents
else:
split_range = (26151, 810596)
expected = test_documents
global nwords
nwords=[]
for part in list_files(data_path):
if not re.match(r'\d+\.zip', part): continue
target_file = join(data_path, part)
assert exists(target_file), \
"You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\
" w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information."
zipfile = ZipFile(target_file)
for xmlfile in zipfile.namelist():
xmlcontent = zipfile.open(xmlfile).read()
try:
doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range)
labels.update(doc.categories)
request.append(doc)
read_documents += 1
except ValueError:
print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang))
except (IDRangeException, ExpectedLanguageException) as e:
pass
print('\r[{}] read {} documents'.format(part, len(request)), end='')
if read_documents == expected: break
if read_documents == expected: break
print()
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
return request, list(labels)
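# (See the combined RCV1/RCV2 usage sketch after fetch_RCV2 below.)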
def fetch_RCV2(data_path, languages=None):
if not languages:
languages = list(RCV2_LANG_DIR.keys())
else:
assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope'
request = []
labels = set()
global nwords
nwords=[]
for lang in languages:
path = join(data_path, RCV2_LANG_DIR[lang])
lang_docs_read = 0
for part in list_files(path):
target_file = join(path, part)
assert exists(target_file), \
"You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\
" w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information."
zipfile = ZipFile(target_file)
for xmlfile in zipfile.namelist():
xmlcontent = zipfile.open(xmlfile).read()
try:
doc = parse_document(xmlcontent, assert_lang=lang)
labels.update(doc.categories)
request.append(doc)
lang_docs_read += 1
except ValueError:
                    print('\n\tskipping empty document {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile))
except (IDRangeException, ExpectedLanguageException) as e:
pass
print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='')
print()
print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords)))
return request, list(labels)
def fetch_topic_hierarchy(path, topics='all'):
assert topics in ['all', 'leaves']
download_file_if_not_exists(RCV1_TOPICHIER_URL, path)
hierarchy = {}
for line in open(path, 'rt'):
parts = line.strip().split()
parent,child = parts[1],parts[3]
if parent not in hierarchy:
hierarchy[parent]=[]
hierarchy[parent].append(child)
del hierarchy['None']
del hierarchy['Root']
print(hierarchy)
if topics=='all':
topics = set(hierarchy.keys())
for parent in hierarchy.keys():
topics.update(hierarchy[parent])
return list(topics)
elif topics=='leaves':
parents = set(hierarchy.keys())
childs = set()
for parent in hierarchy.keys():
childs.update(hierarchy[parent])
return list(childs.difference(parents))

View File

@ -1,307 +0,0 @@
from __future__ import print_function
import ijson
from ijson.common import ObjectBuilder
import os
import pickle
import re
from bz2 import BZ2File
from itertools import islice
from os.path import join
from xml.sax.saxutils import escape
import numpy as np
from util.file import list_dirs, list_files
policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
"""
This file contains a set of tools for processing the Wikipedia multilingual documents.
In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/)
and have processed each document to clean its text with one of the following tools:
- https://github.com/aesuli/wikipediatools (Python 2)
- https://github.com/aesuli/wikipedia-extractor (Python 3)
It is also assumed you have downloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
These tools help you to:
- Process the huge json file as a stream and create a multilingual map of corresponding titles for each language.
  Setting policy = "IN_ALL_LANGS" extracts only titles which appear in all (AND) languages, whereas "IN_ANY_LANG"
  extracts all titles appearing in at least one (OR) language (warning: this will create a huge dictionary).
  Note: this version is quite slow. Although it only needs to be run once, you might prefer to take a look at "Wikidata in BigQuery".
- Process the huge json file as a stream and create a simplified file which occupies much less space and is far faster to process.
- Use the multilingual map to extract, from the clean text versions, individual xml documents containing all
  language-specific versions of the document.
- Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents,
  so that the i-th element of any list refers to the same document in the respective language.
"""
def _doc_generator(text_path, langs):
dotspace = re.compile(r'\.(?!\s)')
for l,lang in enumerate(langs):
print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
lang_dir = join(text_path, lang)
split_dirs = list_dirs(lang_dir)
for sd,split_dir in enumerate(split_dirs):
print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
split_files = list_files(join(lang_dir, split_dir))
for sf,split_file in enumerate(split_files):
print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi:
while True:
doc_lines = list(islice(fi, 3))
if doc_lines:
# some sentences are not followed by a space after the dot
doc_lines[1] = dotspace.sub('. ', doc_lines[1])
                        # [workaround] the &nbsp; html entity was left untreated by the extractor; replace it with a space before escaping
doc_lines[1] = escape(doc_lines[1].replace("&nbsp;", " "))
yield doc_lines, lang
else: break
def _extract_title(doc_lines):
m = re.search('title="(.+?)"', doc_lines[0])
if m: return m.group(1).decode('utf-8')
else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0])
def _create_doc(target_file, id, doc, lang):
doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang)
with open(target_file, 'w') as fo:
fo.write('<multidoc id="%s">\n'%id)
[fo.write(line) for line in doc]
fo.write('</multidoc>')
def _append_doc(target_file, doc, lang):
doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
with open(target_file, 'r', buffering=1024*1024) as fi:
lines = fi.readlines()
if doc[0] in lines[1::3]:
return
lines[-1:-1]=doc
with open(target_file, 'w', buffering=1024*1024) as fo:
[fo.write(line) for line in lines]
def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
if not os.path.exists(out_path):
os.makedirs(out_path)
for lang in langs:
if lang not in inv_dict:
raise ValueError("Lang %s is not in the dictionary" % lang)
docs_created = len(list_files(out_path))
print("%d multilingual documents found." % docs_created)
for doc,lang in _doc_generator(text_path, langs):
title = _extract_title(doc)
if title in inv_dict[lang]:
#pass
ids = inv_dict[lang][title]
for id in ids:
target_file = join(out_path, id) + ".xml"
if os.path.exists(target_file):
_append_doc(target_file, doc, lang)
else:
_create_doc(target_file, id, doc, lang)
docs_created+=1
else:
if not re.match('[A-Za-z]+', title):
print("Title <%s> for lang <%s> not in dictionary" % (title, lang))
def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True):
simplified_file = join(data_dir,filename)
if policy not in policies:
raise ValueError("Policy %s not supported." % policy)
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
lang_prefix = list(langs)
lang_prefix.sort()
pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy
pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle")
pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle")
if os.path.exists(pickle_invdict):
if return_both and os.path.exists(pickle_dict):
print("Pickled files found in %s. Loading both (direct and inverse dictionaries)." % data_dir)
return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
        elif not return_both:
print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
return pickle.load(open(pickle_invdict, 'rb'))
multiling_titles = {}
inv_dict = {lang:{} for lang in langs}
def process_entry(line):
parts = line.strip().split('\t')
id = parts[0]
if id in multiling_titles:
raise ValueError("id <%s> already indexed" % id)
titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:]))
        for lang in list(titles.keys()):  # copy the keys, since entries may be deleted while iterating
if lang not in langs:
del titles[lang]
if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\
or (policy == "IN_ANY_LANG" and len(titles) > 0):
multiling_titles[id] = titles
for lang, title in titles.items():
                if title in inv_dict[lang]:
                    inv_dict[lang][title].append(id)
                else:
                    inv_dict[lang][title] = [id]
with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
completed = 0
try:
for line in fi:
process_entry(line)
completed += 1
if completed % 10 == 0:
print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="")
print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
except EOFError:
print("\nUnexpected file ending... saving anyway")
print("Pickling dictionaries in %s" % data_dir)
pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
print("Done")
return (multiling_titles, inv_dict) if return_both else inv_dict
# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2
def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"):
latest_all_json_file = join(data_dir,json_file)
if policy not in policies:
raise ValueError("Policy %s not supported." % policy)
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
lang_prefix = list(langs)
lang_prefix.sort()
simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." + policy)
def process_entry(last, fo):
global written
id = last["id"]
titles = None
if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()):
titles = {lang: last["labels"][lang]["value"] for lang in langs}
elif policy == "IN_ANY_LANG":
titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]}
if titles:
fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8'))
return True
else:
return False
written = 0
with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \
BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo:
builder = ObjectBuilder()
completed = 0
for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16):
builder.event(event, value)
if len(builder.value)>1:
if process_entry(builder.value.pop(0), fo): written += 1
completed += 1
print("\rCompleted %d\ttitles %d" % (completed,written), end="")
print("")
        # process the last entry
        process_entry(builder.value.pop(0), fo)
return simple_titles_path
"""
Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the
specified languages, a list containing all its documents, so that the i-th element of any list refers to the language-
specific version of the same document. Documents are required to have a version in all specified languages and to contain
a minimum number of words; otherwise they are discarded.
"""
class MinWordsNotReached(Exception): pass
class WrongDocumentFormat(Exception): pass
def _load_multilang_doc(path, langs, min_words=100):
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, ParseError
try:
root = ET.parse(path).getroot()
doc = {}
for lang in langs:
doc_body = root.find('.//doc[@lang="' + lang + '"]')
if isinstance(doc_body, Element):
n_words = len(doc_body.text.split(' '))
if n_words >= min_words:
doc[lang] = doc_body.text
else:
raise MinWordsNotReached
else:
raise WrongDocumentFormat
except ParseError:
raise WrongDocumentFormat
return doc
# returns the multilingual documents mapped by language (the i-th entries across languages are aligned)
def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
if pickle_name and os.path.exists(pickle_name):
print("unpickling %s" % pickle_name)
return pickle.load(open(pickle_name, 'rb'))
multi_docs = list_files(wiki_multi_path)
mling_documents = {l:[] for l in langs}
valid_documents = 0
minwords_exception = 0
wrongdoc_exception = 0
for d,multi_doc in enumerate(multi_docs):
print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
(d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="")
doc_path = join(wiki_multi_path, multi_doc)
try:
m_doc = _load_multilang_doc(doc_path, langs, min_words)
valid_documents += 1
for l in langs:
mling_documents[l].append(m_doc[l])
except MinWordsNotReached:
minwords_exception += 1
if deletions: os.remove(doc_path)
except WrongDocumentFormat:
wrongdoc_exception += 1
if deletions: os.remove(doc_path)
if max_documents>0 and valid_documents>=max_documents:
break
if pickle_name:
print("Pickling wikipedia documents object in %s" % pickle_name)
pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
return mling_documents
def random_wiki_sample(l_wiki, max_documents):
if max_documents == 0: return None
langs = list(l_wiki.keys())
assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned'
ndocs_per_lang = len(l_wiki[langs[0]])
if ndocs_per_lang > max_documents:
sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False))
for lang in langs:
l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel]
return l_wiki
if __name__ == "__main__":
wikipedia_home = "../Datasets/Wikipedia"
from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs
langs = frozenset(langs)
simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2")
_, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS')
extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'),
out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK'))

View File

@ -1,34 +0,0 @@
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from data.languages import NLTK_LANGMAP
def preprocess_documents(documents, lang):
tokens = NLTKStemTokenizer(lang, verbose=True)
sw = stopwords.words(NLTK_LANGMAP[lang])
return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents]
class NLTKStemTokenizer(object):
def __init__(self, lang, verbose=False):
if lang not in NLTK_LANGMAP:
raise ValueError('Language %s is not supported in NLTK' % lang)
self.verbose=verbose
self.called = 0
self.wnl = SnowballStemmer(NLTK_LANGMAP[lang])
self.cache = {}
def __call__(self, doc):
self.called += 1
if self.verbose:
print("\r\t\t[documents processed %d]" % (self.called), end="")
tokens = word_tokenize(doc)
stems = []
for t in tokens:
if t not in self.cache:
self.cache[t] = self.wnl.stem(t)
stems.append(self.cache[t])
return stems

View File

@ -1,271 +0,0 @@
import math
import numpy as np
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix, csc_matrix
from scipy.stats import t
def get_probs(tpr, fpr, pc):
# tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn))
# fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn))
pnc = 1.0 - pc
tp = tpr * pc
fn = pc - tp
fp = fpr * pnc
tn = pnc - fp
return ContTable(tp=tp, fn=fn, fp=fp, tn=tn)
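# Worked example (illustrative values): with tpr=0.8, fpr=0.1 and a class prior pc=0.3,
# get_probs returns ContTable(tp=0.24, fn=0.06, fp=0.07, tn=0.63), whose cells sum to 1.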
def apply_tsr(tpr, fpr, pc, tsr):
cell = get_probs(tpr, fpr, pc)
return tsr(cell)
def positive_information_gain(cell):
if cell.tpr() < cell.fpr():
return 0.0
else:
return information_gain(cell)
def posneg_information_gain(cell):
ig = information_gain(cell)
if cell.tpr() < cell.fpr():
return -ig
else:
return ig
def __ig_factor(p_tc, p_t, p_c):
den = p_t * p_c
if den != 0.0 and p_tc != 0:
return p_tc * math.log(p_tc / den, 2)
else:
return 0.0
def information_gain(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \
__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \
__ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())
def information_gain_mod(cell):
return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \
- (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()))
def pointwise_mutual_information(cell):
return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c())
def gain_ratio(cell):
pc = cell.p_c()
pnc = 1.0 - pc
norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2)
return information_gain(cell) / (-norm)
def chi_square(cell):
den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c()
if den==0.0: return 0.0
num = gss(cell)**2
return num / den
def relevance_frequency(cell):
a = cell.tp
c = cell.fp
if c == 0: c = 1
return math.log(2.0 + (a * 1.0 / c), 2)
def idf(cell):
if cell.p_f()>0:
return math.log(1.0 / cell.p_f())
return 0.0
def gss(cell):
return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn()
def conf_interval(xt, n):
if n>30:
z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2
else:
z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2
p = (xt + 0.5 * z2) / (n + z2)
amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2))
return p, amplitude
def strength(minPosRelFreq, minPos, maxNeg):
if minPos > maxNeg:
return math.log(2.0 * minPosRelFreq, 2.0)
else:
return 0.0
# set cancel_features=True to allow some features to be weighted as 0 (as in the original article);
# however, for some extremely imbalanced datasets this caused all documents to be zeroed out
def conf_weight(cell, cancel_features=False):
c = cell.get_c()
not_c = cell.get_not_c()
tp = cell.tp
fp = cell.fp
pos_p, pos_amp = conf_interval(tp, c)
neg_p, neg_amp = conf_interval(fp, not_c)
min_pos = pos_p-pos_amp
max_neg = neg_p+neg_amp
den = (min_pos + max_neg)
minpos_relfreq = min_pos / (den if den != 0 else 1)
    str_tplus = strength(minpos_relfreq, min_pos, max_neg)
    if str_tplus == 0 and not cancel_features:
        return 1e-20
    return str_tplus
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp=tp
self.tn=tn
self.fp=fp
self.fn=fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def round_robin_selection(X, Y, k, tsr_function=positive_information_gain):
    print(f'[selecting {k} terms]')
nC = Y.shape[1]
FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T
best_features_idx = np.argsort(-FC, axis=0).flatten()
tsr_values = FC.flatten()
selected_indexes_set = set()
selected_indexes = list()
selected_value = list()
from_category = list()
round_robin = iter(best_features_idx)
values_iter = iter(tsr_values)
round=0
while len(selected_indexes) < k:
term_idx = next(round_robin)
term_val = next(values_iter)
if term_idx not in selected_indexes_set:
selected_indexes_set.add(term_idx)
selected_indexes.append(term_idx)
selected_value.append(term_val)
from_category.append(round)
round = (round + 1) % nC
return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category)
def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD):
tp_ = len(positive_document_indexes & feature_document_indexes)
fp_ = len(feature_document_indexes - positive_document_indexes)
fn_ = len(positive_document_indexes - feature_document_indexes)
tn_ = nD - (tp_ + fp_ + fn_)
return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_)
def category_tables(feature_sets, category_sets, c, nD, nF):
return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)]
"""
Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c.
Efficiency O(nF x nC x log(S)) where S is the sparse factor
"""
def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1):
nD, nF = coocurrence_matrix.shape
nD2, nC = label_matrix.shape
if nD != nD2:
raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' %
(coocurrence_matrix.shape,label_matrix.shape))
def nonzero_set(matrix, col):
return set(matrix[:, col].nonzero()[0])
if isinstance(coocurrence_matrix, csr_matrix):
coocurrence_matrix = csc_matrix(coocurrence_matrix)
feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)]
category_sets = [nonzero_set(label_matrix, c) for c in range(nC)]
cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC))
return np.array(cell_matrix)
# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f
def get_tsr_matrix(cell_matrix, tsr_score_function):
    nC, nF = cell_matrix.shape
    tsr_matrix = [[tsr_score_function(cell_matrix[c, f]) for f in range(nF)] for c in range(nC)]
return np.array(tsr_matrix)
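# Minimal usage sketch (illustrative; the toy matrices are made up): scoring 3 features
# against 2 classes with information gain.
#
#   X = csr_matrix(np.array([[1, 0, 1], [0, 1, 1]]))   # 2 documents x 3 features
#   Y = np.array([[1, 0], [0, 1]])                     # 2 documents x 2 classes
#   cells = get_supervised_matrix(X, Y, n_jobs=1)      # (2, 3) matrix of ContTable cells
#   scores = get_tsr_matrix(cells, information_gain)   # (2, 3) matrix of real-valued TSR scores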
""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can
take as input any real-valued feature column (e.g., tf-idf weights).
feat is the feature vector, and c is a binary classification vector.
This implementation covers only the binary case, while the formula is defined for multiclass
single-label scenarios, for which the version [2] might be preferred.
[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012.
[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725.
"""
def fisher_score_binary(feat, c):
neg = np.ones_like(c) - c
npos = np.sum(c)
nneg = np.sum(neg)
mupos = np.mean(feat[c == 1])
muneg = np.mean(feat[neg == 1])
mu = np.mean(feat)
stdpos = np.std(feat[c == 1])
stdneg = np.std(feat[neg == 1])
num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2)
den = npos * (stdpos ** 2) + nneg * (stdneg ** 2)
if den>0:
return num / den
else:
return num
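# Worked example (illustrative): feat = np.array([1., 2., 3., 4.]) with c = np.array([1, 1, 0, 0])
# gives mupos=1.5, muneg=3.5, mu=2.5 and stdpos=stdneg=0.5, hence
# fisher_score_binary(feat, c) = (2*1 + 2*1) / (2*0.25 + 2*0.25) = 4.0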

View File

@ -1,124 +0,0 @@
from models.learners import *
from util.common import _normalize
from view_generators import VanillaFunGen
class DocEmbedderList:
"""
    Class that takes care of calling the fit and transform functions of every initialized embedder. Every ViewGenerator
    should be wrapped by this class in order to seamlessly train the overall architecture.
"""
def __init__(self, embedder_list, probabilistic=True):
"""
Init the DocEmbedderList.
:param embedder_list: list of embedders to be deployed
        :param probabilistic: whether to recast the view generators' output to vectors of posterior probabilities or not
"""
assert len(embedder_list) != 0, 'Embedder list cannot be empty!'
self.embedders = embedder_list
self.probabilistic = probabilistic
if probabilistic:
_tmp = []
for embedder in self.embedders:
if isinstance(embedder, VanillaFunGen):
_tmp.append(embedder)
else:
_tmp.append(FeatureSet2Posteriors(embedder))
self.embedders = _tmp
def fit(self, lX, ly):
"""
Fit all the ViewGenerators contained by DocEmbedderList.
:param lX:
:param ly:
:return: self
"""
for embedder in self.embedders:
embedder.fit(lX, ly)
return self
def transform(self, lX):
"""
        Project documents by means of every ViewGenerator. Projections are then averaged and returned.
:param lX:
:return: common latent space (averaged).
"""
langs = sorted(lX.keys())
lZparts = {lang: None for lang in langs}
for embedder in self.embedders:
lZ = embedder.transform(lX)
for lang in langs:
Z = lZ[lang]
if lZparts[lang] is None:
lZparts[lang] = Z
else:
lZparts[lang] += Z
n_embedders = len(self.embedders)
return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
class FeatureSet2Posteriors:
"""
    Takes care of recasting the features output by the embedders to vectors of posterior probabilities by means of
    a multiclass SVM.
"""
def __init__(self, embedder, l2=True, n_jobs=-1):
"""
Init the class.
        :param embedder: ViewGen, a view generator which does not natively output posterior probabilities.
        :param l2: bool, whether to apply L2 normalization to the projection or not
:param n_jobs: int, number of concurrent workers.
"""
self.embedder = embedder
self.l2 = l2
self.n_jobs = n_jobs
self.prob_classifier = MetaClassifier(
SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs)
def fit(self, lX, ly):
lZ = self.embedder.fit_transform(lX, ly)
self.prob_classifier.fit(lZ, ly)
return self
def transform(self, lX):
lP = self.predict_proba(lX)
lP = _normalize(lP, self.l2)
return lP
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
def predict(self, lX):
lZ = self.embedder.transform(lX)
return self.prob_classifier.predict(lZ)
def predict_proba(self, lX):
lZ = self.embedder.transform(lX)
return self.prob_classifier.predict_proba(lZ)
class Funnelling:
"""
    Funnelling Architecture. It is composed of two tiers. The first tier is a set of heterogeneous document embedders.
    The second tier (i.e., the meta-classifier) performs the classification over the common latent space computed by
    the first-tier learners.
"""
def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1):
self.first_tier = first_tier
self.meta = meta_classifier
self.n_jobs = n_jobs
def fit(self, lX, ly):
print('## Fitting first-tier learners!')
lZ = self.first_tier.fit_transform(lX, ly)
print('## Fitting meta-learner!')
self.meta.fit(lZ, ly)
def predict(self, lX):
lZ = self.first_tier.transform(lX)
ly = self.meta.predict(lZ)
return ly
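# Minimal usage sketch (mirrors the pipeline in main.py; lX/ly are {language: data} dictionaries
# as returned by MultilingualDataset.training()):
#
#   embedders = DocEmbedderList(embedder_list=[VanillaFunGen(base_learner=get_learner(calibrate=True))])
#   meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'))
#   gfun = Funnelling(first_tier=embedders, meta_classifier=meta)
#   gfun.fit(lX, ly)
#   ly_pred = gfun.predict(lXte)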

View File

@ -1,167 +0,0 @@
from argparse import ArgumentParser
from data.dataset_builder import MultilingualDataset
from funnelling import *
from util.common import MultilingualIndex, get_params, get_method_name
from util.evaluation import evaluate
from util.results_csv import CSVlog
from view_generators import *
def main(args):
assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \
'empty set of document embeddings is not allowed!'
print('Running generalized funnelling...')
data = MultilingualDataset.load(args.dataset)
data.set_view(languages=['it', 'fr'])
data.show_dimensions()
lX, ly = data.training()
lXte, lyte = data.test()
# Init multilingualIndex - mandatory when deploying Neural View Generators...
if args.gru_embedder or args.bert_embedder:
multilingualIndex = MultilingualIndex()
lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir)
multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary())
# Init ViewGenerators and append them to embedder_list
embedder_list = []
if args.post_embedder:
posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs)
embedder_list.append(posteriorEmbedder)
if args.muse_embedder:
museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs)
embedder_list.append(museEmbedder)
if args.wce_embedder:
wceEmbedder = WordClassGen(n_jobs=args.n_jobs)
embedder_list.append(wceEmbedder)
if args.gru_embedder:
rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256,
nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs)
embedder_list.append(rnnEmbedder)
if args.bert_embedder:
bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs)
embedder_list.append(bertEmbedder)
# Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier
docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True)
meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'),
meta_parameters=get_params(optimc=args.optimc))
# Init Funnelling Architecture
gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta)
# Training ---------------------------------------
print('\n[Training Generalized Funnelling]')
time_init = time()
time_tr = time()
gfun.fit(lX, ly)
time_tr = round(time() - time_tr, 3)
print(f'Training completed in {time_tr} seconds!')
# Testing ----------------------------------------
print('\n[Testing Generalized Funnelling]')
time_te = time()
ly_ = gfun.predict(lXte)
l_eval = evaluate(ly_true=lyte, ly_pred=ly_)
time_te = round(time() - time_te, 3)
print(f'Testing completed in {time_te} seconds!')
# Logging ---------------------------------------
print('\n[Results]')
results = CSVlog(args.csv_dir)
metrics = []
for lang in lXte.keys():
macrof1, microf1, macrok, microk = l_eval[lang]
metrics.append([macrof1, microf1, macrok, microk])
print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}')
if results is not None:
_id, _dataset = get_method_name(args)
results.add_row(method='gfun',
setting=_id,
optimc=args.optimc,
sif='True',
zscore='True',
l2='True',
dataset=_dataset,
time_tr=time_tr,
time_te=time_te,
lang=lang,
macrof1=macrof1,
microf1=microf1,
macrok=macrok,
microk=microk,
notes='')
print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3))
overall_time = round(time() - time_init, 3)
exit(f'\nExecuted in: {overall_time} seconds!')
if __name__ == '__main__':
parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. Sebastiani')
parser.add_argument('dataset', help='Path to the dataset')
parser.add_argument('-o', '--output', dest='csv_dir',
                        help='Result file (default csv_logs/gfun/gfun_results.csv)', type=str,
                        default='csv_logs/gfun/gfun_results.csv')
parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true',
help='deploy posterior probabilities embedder to compute document embeddings',
default=False)
parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true',
                        help='deploy (supervised) Word-Class embedder to compute document embeddings',
default=False)
parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true',
help='deploy (pretrained) MUSE embedder to compute document embeddings',
default=False)
parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true',
help='deploy multilingual Bert to compute document embeddings',
default=False)
parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true',
help='deploy a GRU in order to compute document embeddings',
default=False)
parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true',
help='Optimize SVMs C hyperparameter',
default=False)
    parser.add_argument('-n', '--nepochs', dest='nepochs', type=int,
                        help='Number of max epochs to train the Recurrent embedder (i.e., -g)')
parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int,
help='Number of parallel jobs (default is -1, all)',
default=-1)
parser.add_argument('--muse_dir', dest='muse_dir', type=str,
help='Path to the MUSE polylingual word embeddings (default ../embeddings)',
default='../embeddings')
parser.add_argument('--gru_wce', dest='gru_wce', action='store_true',
help='Deploy WCE embedding as embedding layer of the GRU View Generator',
default=False)
parser.add_argument('--gru_dir', dest='gru_dir', type=str,
help='Set the path to a pretrained GRU model (i.e., -g view generator)',
default=None)
parser.add_argument('--bert_dir', dest='bert_dir', type=str,
help='Set the path to a pretrained mBERT model (i.e., -b view generator)',
default=None)
parser.add_argument('--gpus', help='specifies how many GPUs to use per node',
default=None)
args = parser.parse_args()
main(args)
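# Example invocation (illustrative; the dataset path is a placeholder for a pickled MultilingualDataset):
#   python <this_script>.py <path-to-multilingual-dataset.pickle> -x -m -c -j 8
# deploys the posterior-probabilities (-x) and MUSE (-m) view generators, optimizes the SVM
# C hyperparameter (-c) and uses 8 parallel jobs (-j).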

View File

@ -1,51 +0,0 @@
import torch
import torch.nn as nn
from torch.nn import functional as F
def init_embeddings(pretrained, vocab_size, learnable_length):
"""
Compute the embedding matrix
    :param pretrained: torch.Tensor of shape (vocab_size, dim) with pre-trained vectors, or None
    :param vocab_size: int, size of the vocabulary
    :param learnable_length: int, dimensionality of the additional learnable (randomly initialized) embeddings
    :return: (pretrained_embeddings, learnable_embeddings, embedding_length)
"""
pretrained_embeddings = None
pretrained_length = 0
if pretrained is not None:
pretrained_length = pretrained.shape[1]
assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size'
pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length)
# requires_grad=False sets the embedding layer as NOT trainable
pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False)
learnable_embeddings = None
if learnable_length > 0:
learnable_embeddings = nn.Embedding(vocab_size, learnable_length)
embedding_length = learnable_length + pretrained_length
assert embedding_length > 0, '0-size embeddings'
return pretrained_embeddings, learnable_embeddings, embedding_length
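# Minimal usage sketch (hypothetical sizes): a 100-term vocabulary with 300-dimensional
# pre-trained vectors plus 50 learnable dimensions per token.
#
#   pre = torch.randn(100, 300)
#   pretrained_emb, learnable_emb, dim = init_embeddings(pre, vocab_size=100, learnable_length=50)
#   # dim == 350; pretrained_emb is frozen, learnable_emb is trainable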
def embed(model, input, lang):
input_list = []
if model.lpretrained_embeddings[lang]:
input_list.append(model.lpretrained_embeddings[lang](input))
if model.llearnable_embeddings[lang]:
input_list.append(model.llearnable_embeddings[lang](input))
return torch.cat(tensors=input_list, dim=2)
def embedding_dropout(input, drop_range, p_drop=0.5, training=True):
if p_drop > 0 and training and drop_range is not None:
p = p_drop
drop_from, drop_to = drop_range
m = drop_to - drop_from #length of the supervised embedding
l = input.shape[2] #total embedding length
corr = (1 - p)
input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p)
input /= (1 - (p * m / l))
return input
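# Example (illustrative): with a 350-dimensional embedding (l=350) whose last 50 dimensions are the
# supervised (WCE) part, drop_range=(300, 350) and p_drop=0.5 zero out only that slice (m=50) and
# rescale the whole vector by 1/(1 - 0.5*50/350) to preserve its expected magnitude.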

View File

@ -1,224 +0,0 @@
import time
import numpy as np
from joblib import Parallel, delayed
from scipy.sparse import issparse
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from util.standardizer import StandardizeTransformer
def get_learner(calibrate=False, kernel='linear', C=1):
"""
    Instantiate a scikit-learn Support Vector Classifier.
    :param calibrate: boolean, whether to enable posterior probability estimates or not
    :param kernel: string, kernel to be applied to the SVC
    :param C: float, regularization parameter
    :return: Support Vector Classifier
"""
return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False)
def _sort_if_sparse(X):
if issparse(X) and not X.has_sorted_indices:
X.sort_indices()
def _joblib_transform_multiling(transformer, lX, n_jobs=-1):
if n_jobs == 1:
return {lang: transformer(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs)
return {lang: transformations[i] for i, lang in enumerate(langs)}
class TrivialRejector:
def fit(self, X, y):
self.cats = y.shape[1]
return self
def decision_function(self, X): return np.zeros((X.shape[0], self.cats))
def predict(self, X): return np.zeros((X.shape[0], self.cats))
def predict_proba(self, X): return np.zeros((X.shape[0], self.cats))
def best_params(self): return {}
class NaivePolylingualClassifier:
"""
    A set of independent MonolingualClassifiers, one per language.
"""
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.base_learner = base_learner
self.parameters = parameters
self.model = None
self.n_jobs = n_jobs
def fit(self, lX, ly):
"""
trains the independent monolingual classifiers
:param lX: a dictionary {language_label: X csr-matrix}
:param ly: a dictionary {language_label: y np.array}
:return: self
"""
tinit = time.time()
assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit'
langs = list(lX.keys())
for lang in langs:
_sort_if_sparse(lX[lang])
models = Parallel(n_jobs=self.n_jobs)\
(delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for
lang in langs)
self.model = {lang: models[i] for i, lang in enumerate(langs)}
self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs}
self.time = time.time() - tinit
return self
def decision_function(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of classification scores for each class
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs)
return {lang: scores[i] for i, lang in enumerate(langs)}
def predict_proba(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of probabilities that each document belongs to each class
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function'
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)(
delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs)
return {lang: scores[i] for i, lang in enumerate(langs)}
def predict(self, lX):
"""
:param lX: a dictionary {language_label: X csr-matrix}
:return: a dictionary of predictions
"""
assert self.model is not None, 'predict called before fit'
assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict'
if self.n_jobs == 1:
            return {lang: self.model[lang].predict(lX[lang]) for lang in lX.keys()}
else:
langs = list(lX.keys())
scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs)
return {lang: scores[i] for i, lang in enumerate(langs)}
def best_params(self):
return {lang: model.best_params() for lang, model in self.model.items()}
class MonolingualClassifier:
def __init__(self, base_learner, parameters=None, n_jobs=-1):
self.learner = base_learner
self.parameters = parameters
self.model = None
self.n_jobs = n_jobs
self.best_params_ = None
def fit(self, X, y):
if X.shape[0] == 0:
print('Warning: X has 0 elements, a trivial rejector will be created')
self.model = TrivialRejector().fit(X, y)
self.empty_categories = np.arange(y.shape[1])
return self
tinit = time.time()
_sort_if_sparse(X)
self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten()
# multi-class format
if len(y.shape) == 2:
if self.parameters is not None:
self.parameters = [{'estimator__' + key: params[key] for key in params.keys()}
for params in self.parameters]
self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs)
else:
self.model = self.learner
raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in '
'the labels across languages')
# parameter optimization?
if self.parameters:
print('debug: optimizing parameters:', self.parameters)
self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs,
error_score=0, verbose=10)
print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}')
self.model.fit(X, y)
if isinstance(self.model, GridSearchCV):
self.best_params_ = self.model.best_params_
print('best parameters: ', self.best_params_)
self.time = time.time() - tinit
return self
def decision_function(self, X):
assert self.model is not None, 'predict called before fit'
_sort_if_sparse(X)
return self.model.decision_function(X)
def predict_proba(self, X):
assert self.model is not None, 'predict called before fit'
assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model'
_sort_if_sparse(X)
return self.model.predict_proba(X)
def predict(self, X):
assert self.model is not None, 'predict called before fit'
_sort_if_sparse(X)
return self.model.predict(X)
def best_params(self):
return self.best_params_
class MetaClassifier:
def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None):
self.n_jobs = n_jobs
self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs)
self.standardize_range = standardize_range
def fit(self, lZ, ly):
tinit = time.time()
Z, y = self.stack(lZ, ly)
self.standardizer = StandardizeTransformer(range=self.standardize_range)
Z = self.standardizer.fit_transform(Z)
print('fitting the Z-space of shape={}'.format(Z.shape))
self.model.fit(Z, y)
self.time = time.time() - tinit
def stack(self, lZ, ly=None):
langs = list(lZ.keys())
Z = np.vstack([lZ[lang] for lang in langs])
if ly is not None:
y = np.vstack([ly[lang] for lang in langs])
return Z, y
else:
return Z
def predict(self, lZ):
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs)
def predict_proba(self, lZ):
lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs)
return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs)

View File

@ -1,113 +0,0 @@
#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py
from torch.autograd import Variable
from models.helpers import *
class RNNMultilingualClassifier(nn.Module):
def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None,
drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False,
bert_embeddings=False):
super(RNNMultilingualClassifier, self).__init__()
self.output_size = output_size
self.hidden_size = hidden_size
self.drop_embedding_range = drop_embedding_range
self.drop_embedding_prop = drop_embedding_prop
self.post_probabilities = post_probabilities
self.bert_embeddings = bert_embeddings
assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range'
self.lpretrained_embeddings = nn.ModuleDict()
self.llearnable_embeddings = nn.ModuleDict()
self.embedding_length = None
self.langs = sorted(lvocab_size.keys())
self.only_post = only_post
self.n_layers = 1
self.n_directions = 1
self.dropout = nn.Dropout(0.6)
lstm_out = 256
ff1 = 512
ff2 = 256
lpretrained_embeddings = {}
llearnable_embeddings = {}
        if not only_post:
for l in self.langs:
pretrained = lpretrained[l] if lpretrained else None
pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
pretrained, lvocab_size[l], learnable_length
)
lpretrained_embeddings[l] = pretrained_embeddings
llearnable_embeddings[l] = learnable_embeddings
self.embedding_length = embedding_length
# self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2))
self.rnn = nn.GRU(self.embedding_length, hidden_size)
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
self.lpretrained_embeddings.update(lpretrained_embeddings)
self.llearnable_embeddings.update(llearnable_embeddings)
self.linear1 = nn.Linear(lstm_out, ff1)
self.linear2 = nn.Linear(ff1, ff2)
if only_post:
self.label = nn.Linear(output_size, output_size)
elif post_probabilities and not bert_embeddings:
self.label = nn.Linear(ff2 + output_size, output_size)
elif bert_embeddings and not post_probabilities:
self.label = nn.Linear(ff2 + 768, output_size)
elif post_probabilities and bert_embeddings:
self.label = nn.Linear(ff2 + output_size + 768, output_size)
else:
self.label = nn.Linear(ff2, output_size)
def forward(self, input, post, bert_embed, lang):
if self.only_post:
doc_embedding = post
else:
doc_embedding = self.transform(input, lang)
if self.post_probabilities:
doc_embedding = torch.cat([doc_embedding, post], dim=1)
if self.bert_embeddings:
doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1)
logits = self.label(doc_embedding)
return logits
def transform(self, input, lang):
batch_size = input.shape[0]
input = embed(self, input, lang)
input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
training=self.training)
input = input.permute(1, 0, 2)
h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
# c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda())
# output, (_, _) = self.lstm(input, (h_0, c_0))
output, _ = self.rnn(input, h_0)
output = output[-1, :, :]
output = F.relu(self.linear0(output))
output = self.dropout(F.relu(self.linear1(output)))
output = self.dropout(F.relu(self.linear2(output)))
return output
def finetune_pretrained(self):
for l in self.langs:
self.lpretrained_embeddings[l].requires_grad = True
self.lpretrained_embeddings[l].weight.requires_grad = True
def get_embeddings(self, input, lang):
batch_size = input.shape[0]
input = embed(self, input, lang)
input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
training=self.training)
input = input.permute(1, 0, 2)
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda())
output, _ = self.rnn(input, h_0)
output = output[-1, :, :]
return output.cpu().detach().numpy()

View File

@ -1,183 +0,0 @@
import pytorch_lightning as pl
import torch
from torch.optim.lr_scheduler import StepLR
from transformers import BertForSequenceClassification, AdamW
from util.common import define_pad_length, pad
from util.pl_metrics import CustomF1, CustomK
class BertModel(pl.LightningModule):
def __init__(self, output_size, stored_path, gpus=None):
"""
Init Bert model.
:param output_size:
:param stored_path:
:param gpus:
"""
super().__init__()
self.loss = torch.nn.BCEWithLogitsLoss()
self.gpus = gpus
self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics to compute metrics at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
        self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
        self.lang_microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
if stored_path:
self.bert = BertForSequenceClassification.from_pretrained(stored_path,
num_labels=output_size,
output_hidden_states=True)
else:
self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased',
num_labels=output_size,
output_hidden_states=True)
self.save_hyperparameters()
    def forward(self, X):
        # the model is loaded with output_hidden_states=True, so the call returns a (logits, hidden_states) tuple
        logits = self.bert(X)
        return logits
def training_step(self, train_batch, batch_idx):
X, y, _, batch_langs = train_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
# Squashing logits through Sigmoid in order to get confidence score
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
lX, ly = self._reconstruct_dict(predictions, y, batch_langs)
return {'loss': loss, 'pred': lX, 'target': ly}
def training_epoch_end(self, outputs):
langs = []
for output in outputs:
langs.extend(list(output['pred'].keys()))
langs = set(langs)
        # outputs is a list of n dicts, where n is the number of epoch steps;
        # here we store epoch-level metric values and compute them specifically for each language
res_macroF1 = {lang: [] for lang in langs}
res_microF1 = {lang: [] for lang in langs}
res_macroK = {lang: [] for lang in langs}
res_microK = {lang: [] for lang in langs}
for output in outputs:
lX, ly = output['pred'], output['target']
for lang in lX.keys():
X, y = lX[lang], ly[lang]
lang_macroF1 = self.lang_macroF1(X, y)
lang_microF1 = self.lang_microF1(X, y)
lang_macroK = self.lang_macroK(X, y)
lang_microK = self.lang_microK(X, y)
res_macroF1[lang].append(lang_macroF1)
res_microF1[lang].append(lang_microF1)
res_macroK[lang].append(lang_macroK)
res_microK[lang].append(lang_microK)
for lang in langs:
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
def validation_step(self, val_batch, batch_idx):
X, y, _, batch_langs = val_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return {'loss': loss}
def test_step(self, test_batch, batch_idx):
X, y, _, batch_langs = test_batch
X = torch.cat(X).view([X[0].shape[0], len(X)])
y = y.type(torch.FloatTensor)
y = y.to('cuda' if self.gpus else 'cpu')
logits, _ = self.forward(X)
loss = self.loss(logits, y)
# Squashing logits through Sigmoid in order to get confidence score
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return
def configure_optimizers(self, lr=3e-5, weight_decay=0.01):
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in self.bert.named_parameters()
if not any(nd in n for nd in no_decay)],
'weight_decay': weight_decay},
            {'params': [p for n, p in self.bert.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}  # no weight decay for bias and LayerNorm parameters
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
scheduler = StepLR(optimizer, step_size=25, gamma=0.1)
return [optimizer], [scheduler]
def encode(self, lX, batch_size=64):
with torch.no_grad():
l_embed = {lang: [] for lang in lX.keys()}
for lang in sorted(lX.keys()):
for i in range(0, len(lX[lang]), batch_size):
if i + batch_size > len(lX[lang]):
batch = lX[lang][i:len(lX[lang])]
else:
batch = lX[lang][i:i + batch_size]
max_pad_len = define_pad_length(batch)
batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len)
batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
_, output = self.forward(batch)
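                    # output holds the hidden states; the [CLS] token of the last layer is used as the document embedding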
doc_embeds = output[-1][:, 0, :]
l_embed[lang].append(doc_embeds.cpu())
for k, v in l_embed.items():
l_embed[k] = torch.cat(v, dim=0).numpy()
return l_embed
@staticmethod
def _reconstruct_dict(predictions, y, batch_langs):
reconstructed_x = {lang: [] for lang in set(batch_langs)}
reconstructed_y = {lang: [] for lang in set(batch_langs)}
for i, pred in enumerate(predictions):
reconstructed_x[batch_langs[i]].append(pred)
reconstructed_y[batch_langs[i]].append(y[i])
for k, v in reconstructed_x.items():
reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1])
for k, v in reconstructed_y.items():
reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1])
return reconstructed_x, reconstructed_y

View File

@ -1,266 +0,0 @@
# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from torch import nn
from torch.autograd import Variable
from torch.optim.lr_scheduler import StepLR
from transformers import AdamW
from models.helpers import init_embeddings
from util.common import define_pad_length, pad
from util.pl_metrics import CustomF1, CustomK
class RecurrentModel(pl.LightningModule):
def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length,
drop_embedding_range, drop_embedding_prop, gpus=None):
"""
Init RNN model.
:param lPretrained:
:param langs:
:param output_size:
:param hidden_size:
:param lVocab_size:
:param learnable_length:
:param drop_embedding_range:
:param drop_embedding_prop:
:param gpus:
"""
super().__init__()
self.gpus = gpus
self.langs = langs
self.lVocab_size = lVocab_size
self.learnable_length = learnable_length
self.output_size = output_size
self.hidden_size = hidden_size
self.drop_embedding_range = drop_embedding_range
self.drop_embedding_prop = drop_embedding_prop
self.loss = torch.nn.BCEWithLogitsLoss()
self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
# Language specific metrics to compute metrics at epoch level
self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus)
self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus)
        self.lang_macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus)
        self.lang_microK = CustomK(num_classes=output_size, average='micro', device=self.gpus)
self.lPretrained_embeddings = nn.ModuleDict()
self.lLearnable_embeddings = nn.ModuleDict()
self.n_layers = 1
self.n_directions = 1
self.dropout = nn.Dropout(0.6)
lstm_out = 256
ff1 = 512
ff2 = 256
lpretrained_embeddings = {}
llearnable_embeddings = {}
for lang in self.langs:
pretrained = lPretrained[lang] if lPretrained else None
pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings(
pretrained, self.lVocab_size[lang], self.learnable_length)
lpretrained_embeddings[lang] = pretrained_embeddings
llearnable_embeddings[lang] = learnable_embeddings
self.embedding_length = embedding_length
self.lPretrained_embeddings.update(lpretrained_embeddings)
self.lLearnable_embeddings.update(llearnable_embeddings)
self.rnn = nn.GRU(self.embedding_length, hidden_size)
self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out)
self.linear1 = nn.Linear(lstm_out, ff1)
self.linear2 = nn.Linear(ff1, ff2)
self.label = nn.Linear(ff2, self.output_size)
        # TODO: lPretrained is set to None here; keeping its original value would "bug" the first validation
        # step (the checkpoint would also store the pretrained matrix, making the saving process too slow)
lPretrained = None
self.save_hyperparameters()
def forward(self, lX):
l_embed = []
for lang in sorted(lX.keys()):
doc_embedding = self.transform(lX[lang], lang)
l_embed.append(doc_embedding)
embed = torch.cat(l_embed, dim=0)
logits = self.label(embed)
return logits
def transform(self, X, lang):
batch_size = X.shape[0]
X = self.embed(X, lang)
X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
training=self.training)
X = X.permute(1, 0, 2)
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).to(self.device))
output, _ = self.rnn(X, h_0)
output = output[-1, :, :]
output = F.relu(self.linear0(output))
output = self.dropout(F.relu(self.linear1(output)))
output = self.dropout(F.relu(self.linear2(output)))
return output
def encode(self, lX, l_pad, batch_size=128):
"""
        Returns the encoded data (i.e., the output of the second feed-forward layer, linear1). Dimensionality is 512.
:param lX:
:param l_pad:
:param batch_size:
:return:
"""
with torch.no_grad():
l_embed = {lang: [] for lang in lX.keys()}
for lang in sorted(lX.keys()):
for i in range(0, len(lX[lang]), batch_size):
if i+batch_size > len(lX[lang]):
batch = lX[lang][i:len(lX[lang])]
else:
batch = lX[lang][i:i+batch_size]
max_pad_len = define_pad_length(batch)
batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len)
X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu')
_batch_size = X.shape[0]
X = self.embed(X, lang)
X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop,
training=self.training)
X = X.permute(1, 0, 2)
h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device))
output, _ = self.rnn(X, h_0)
output = output[-1, :, :]
output = F.relu(self.linear0(output))
output = self.dropout(F.relu(self.linear1(output)))
l_embed[lang].append(output.cpu())
for k, v in l_embed.items():
l_embed[k] = torch.cat(v, dim=0).numpy()
return l_embed
def training_step(self, train_batch, batch_idx):
lX, ly = train_batch
logits = self.forward(lX)
_ly = []
for lang in sorted(lX.keys()):
_ly.append(ly[lang])
y = torch.cat(_ly, dim=0)
loss = self.loss(logits, y)
# Squashing logits through Sigmoid in order to get confidence score
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, y)
macroF1 = self.macroF1(predictions, y)
microK = self.microK(predictions, y)
macroK = self.macroK(predictions, y)
self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True)
re_lX = self._reconstruct_dict(predictions, ly)
return {'loss': loss, 'pred': re_lX, 'target': ly}
def training_epoch_end(self, outputs):
        # `outputs` is a list of n dicts, one per training step in the epoch.
        # Here we recompute the metrics at the epoch level, separately for each language.
res_macroF1 = {lang: [] for lang in self.langs}
res_microF1 = {lang: [] for lang in self.langs}
res_macroK = {lang: [] for lang in self.langs}
res_microK = {lang: [] for lang in self.langs}
for output in outputs:
lX, ly = output['pred'], output['target']
for lang in lX.keys():
X, y = lX[lang], ly[lang]
lang_macroF1 = self.lang_macroF1(X, y)
lang_microF1 = self.lang_microF1(X, y)
lang_macroK = self.lang_macroK(X, y)
lang_microK = self.lang_microK(X, y)
res_macroF1[lang].append(lang_macroF1)
res_microF1[lang].append(lang_microF1)
res_macroK[lang].append(lang_macroK)
res_microK[lang].append(lang_microK)
for lang in self.langs:
avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang]))
avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang]))
avg_macroK = torch.mean(torch.Tensor(res_macroK[lang]))
avg_microK = torch.mean(torch.Tensor(res_microK[lang]))
self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch)
self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch)
def validation_step(self, val_batch, batch_idx):
lX, ly = val_batch
logits = self.forward(lX)
_ly = []
for lang in sorted(lX.keys()):
_ly.append(ly[lang])
ly = torch.cat(_ly, dim=0)
loss = self.loss(logits, ly)
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, ly)
macroF1 = self.macroF1(predictions, ly)
microK = self.microK(predictions, ly)
macroK = self.macroK(predictions, ly)
self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return {'loss': loss}
def test_step(self, test_batch, batch_idx):
lX, ly = test_batch
logits = self.forward(lX)
_ly = []
for lang in sorted(lX.keys()):
_ly.append(ly[lang])
ly = torch.cat(_ly, dim=0)
predictions = torch.sigmoid(logits) > 0.5
microF1 = self.microF1(predictions, ly)
macroF1 = self.macroF1(predictions, ly)
microK = self.microK(predictions, ly)
macroK = self.macroK(predictions, ly)
self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True)
self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return
def embed(self, X, lang):
input_list = []
if self.lPretrained_embeddings[lang]:
input_list.append(self.lPretrained_embeddings[lang](X))
if self.lLearnable_embeddings[lang]:
input_list.append(self.lLearnable_embeddings[lang](X))
return torch.cat(tensors=input_list, dim=2)
def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True):
if p_drop > 0 and training and drop_range is not None:
p = p_drop
drop_from, drop_to = drop_range
m = drop_to - drop_from # length of the supervised embedding
l = X.shape[2] # total embedding length
            corr = (1 - p)
            # F.dropout rescales the surviving values by 1/(1-p); multiplying by (1-p) undoes that scaling,
            # so the dropped cells are simply zeroed out
            X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p)
            # rescale the whole embedding to compensate for the expected mass dropped from the supervised range
            X /= (1 - (p * m / l))
return X
def configure_optimizers(self):
optimizer = AdamW(self.parameters(), lr=1e-3)
scheduler = StepLR(optimizer, step_size=25, gamma=0.5)
return [optimizer], [scheduler]
@staticmethod
def _reconstruct_dict(X, ly):
reconstructed = {}
_start = 0
for lang in sorted(ly.keys()):
lang_batchsize = len(ly[lang])
reconstructed[lang] = X[_start:_start+lang_batchsize]
_start += lang_batchsize
return reconstructed
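# Hedged sketch (not part of the original file): illustrates what _reconstruct_dict() above does, i.e., slicing a
# batch-level tensor of stacked predictions back into per-language chunks ordered by sorted language code.
# `_reconstruct_dict_example` is an illustration-only helper; it assumes the enclosing class is named
# RecurrentModel (as imported in view_generators.py) and uses toy tensors only.
def _reconstruct_dict_example():
    preds = torch.arange(5)                               # 5 stacked per-document predictions
    ly = {'en': torch.zeros(3), 'it': torch.zeros(2)}     # 3 'en' targets followed by 2 'it' targets
    # -> {'en': preds[0:3], 'it': preds[3:5]}
    return RecurrentModel._reconstruct_dict(preds, ly)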

View File

@ -1,12 +0,0 @@
transformers==2.11.0
pandas==0.25.3
numpy==1.17.4
joblib==0.14.0
tqdm==4.50.2
pytorch_lightning==1.1.2
torch==1.3.1
nltk==3.4.5
scipy==1.3.3
rdflib==4.2.2
torchtext==0.4.0
scikit_learn==0.24.1

View File

@ -1,6 +0,0 @@
#!/usr/bin/env bash
for i in {0..10..1}
do
python main.py --gpus 0
done

View File

@ -1,59 +0,0 @@
import numpy as np
from sklearn.decomposition import TruncatedSVD
def get_weighted_average(We, x, w):
"""
Compute the weighted average vectors
:param We: We[i,:] is the vector for word i
:param x: x[i, :] are the indices of the words in sentence i
:param w: w[i, :] are the weights for the words in sentence i
:return: emb[i, :] are the weighted average vector for sentence i
"""
n_samples = x.shape[0]
emb = np.zeros((n_samples, We.shape[1]))
for i in range(n_samples):
emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:])
return emb
def compute_pc(X,npc=1):
"""
Compute the principal components.
:param X: X[i,:] is a data point
    :param npc: number of principal components to compute
:return: component_[i,:] is the i-th pc
"""
svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
svd.fit(X)
return svd.components_
def remove_pc(X, npc=1):
"""
Remove the projection on the principal components
:param X: X[i,:] is a data point
:param npc: number of principal components to remove
:return: XX[i, :] is the data point after removing its projection
"""
pc = compute_pc(X, npc)
if npc == 1:
XX = X - X.dot(pc.transpose()) * pc
else:
XX = X - X.dot(pc.transpose()).dot(pc)
return XX
def SIF_embedding(We, x, w, params):
"""
    Compute the sentence embeddings using the weighted average, optionally removing the projection on the first principal component(s)
:param We: We[i,:] is the vector for word i
:param x: x[i, :] are the indices of the words in the i-th sentence
:param w: w[i, :] are the weights for the words in the i-th sentence
:param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
:return: emb, emb[i, :] is the embedding for sentence i
"""
emb = get_weighted_average(We, x, w)
if params.rmpc > 0:
emb = remove_pc(emb, params.rmpc)
return emb
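# Hedged usage sketch (not part of the original file): exercises the SIF pipeline above on toy data.
# Shapes follow the docstrings (We: vocab x dim, x: word indices per sentence, w: word weights);
# `params` only needs an `rmpc` attribute, provided here through a SimpleNamespace. `_sif_example`
# is an illustration-only helper.
def _sif_example():
    from types import SimpleNamespace
    We = np.random.rand(10, 4)                       # 10 word vectors of dimension 4
    x = np.array([[0, 1, 2], [3, 4, 5]])             # 2 sentences, 3 word indices each
    w = np.ones_like(x, dtype=float)                 # uniform word weights
    return SIF_embedding(We, x, w, SimpleNamespace(rmpc=1))   # array of shape (2, 4)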

View File

@ -1,384 +0,0 @@
import numpy as np
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from util.embeddings_manager import supervised_embeddings_tfidf
class TfidfVectorizerMultilingual:
def __init__(self, **kwargs):
self.kwargs = kwargs
def fit(self, lX, ly=None):
self.langs = sorted(lX.keys())
self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs}
return self
def transform(self, lX):
return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs}
def fit_transform(self, lX, ly=None):
return self.fit(lX, ly).transform(lX)
def vocabulary(self, l=None):
if l is None:
return {l: self.vectorizer[l].vocabulary_ for l in self.langs}
else:
return self.vectorizer[l].vocabulary_
def get_analyzer(self, l=None):
if l is None:
return {l: self.vectorizer[l].build_analyzer() for l in self.langs}
else:
return self.vectorizer[l].build_analyzer()
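# Hedged usage sketch (not part of the original file): shows the intended call pattern of
# TfidfVectorizerMultilingual on toy raw documents keyed by language code. `_tfidf_multilingual_example`
# is an illustration-only helper.
def _tfidf_multilingual_example():
    toy_lX = {'en': ['a small example document', 'another english text'],
              'it': ['un piccolo documento di esempio', 'un altro testo italiano']}
    vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
    l_tfidf = vectorizer.fit_transform(toy_lX)       # dict {lang: sparse tf-idf matrix}
    return {lang: X.shape for lang, X in l_tfidf.items()}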
def _normalize(lX, l2=True):
return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX
def none_dict(langs):
return {l: None for l in langs}
class MultilingualIndex:
def __init__(self):
"""
Class that contains monolingual Indexes
"""
self.l_index = {}
self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None):
self.langs = sorted(l_devel_raw.keys())
self.l_vectorizer.fit(l_devel_raw)
l_vocabulary = self.l_vectorizer.vocabulary()
l_analyzer = self.l_vectorizer.get_analyzer()
if l_pretrained_vocabulary is None:
l_pretrained_vocabulary = none_dict(self.langs)
for lang in self.langs:
# Init monolingual Index
self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang],
lang)
# call to index() function of monolingual Index
self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang])
def train_val_split(self, val_prop=0.2, max_val=2000, seed=42):
for l, index in self.l_index.items():
index.train_val_split(val_prop, max_val, seed=seed)
def embedding_matrices(self, lpretrained, supervised):
"""
        Extracts from the pretrained embeddings the words that are found in the training dataset; then, for each
        language, calls the respective monolingual index to build the embedding matrix (if supervised, WCEs are
        concatenated to the unsupervised vectors).
:param lpretrained: dict {lang : matrix of word-embeddings }
:param supervised: bool, whether to deploy Word-Class Embeddings or not
:return: self
"""
lXtr = self.get_lXtr() if supervised else none_dict(self.langs)
lYtr = self.l_train_target() if supervised else none_dict(self.langs)
lWordList = self.get_wordlist()
lExtracted = lpretrained.extract(lWordList)
for lang, index in self.l_index.items():
# if supervised concatenate embedding matrices of pretrained unsupervised
# and supervised word-class embeddings
index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang])
self.sup_range = index.wce_range
return self
def get_wordlist(self):
wordlist = {}
for lang, index in self.l_index.items():
wordlist[lang] = index.get_word_list()
return wordlist
def get_raw_lXtr(self):
        lXtr_raw = {k: [] for k in self.langs}
        for lang in self.langs:
            lXtr_raw[lang] = self.l_index[lang].train_raw
        return lXtr_raw
def get_raw_lXva(self):
lXva_raw = {k: [] for k in self.langs}
for lang in self.langs:
lXva_raw[lang] = self.l_index[lang].val_raw
return lXva_raw
def get_raw_lXte(self):
lXte_raw = {k: [] for k in self.langs}
for lang in self.langs:
lXte_raw[lang] = self.l_index[lang].test_raw
return lXte_raw
def get_lXtr(self):
if not hasattr(self, 'lXtr'):
self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()})
return self.lXtr
def get_lXva(self):
if not hasattr(self, 'lXva'):
self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()})
return self.lXva
def get_lXte(self):
if not hasattr(self, 'lXte'):
self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()})
return self.lXte
def get_target_dim(self):
return self.l_index[self.langs[0]].devel_target.shape[1]
def l_vocabsize(self):
return {l: index.vocabsize for l, index in self.l_index.items()}
def l_embeddings(self):
return {l: index.embedding_matrix for l, index in self.l_index.items()}
def l_pad(self):
return {l: index.pad_index for l, index in self.l_index.items()}
def l_train_index(self):
return {l: index.train_index for l, index in self.l_index.items()}
def l_train_raw_index(self):
return {l: index.train_raw for l, index in self.l_index.items()}
def l_train_target(self):
return {l: index.train_target for l, index in self.l_index.items()}
def l_val_index(self):
return {l: index.val_index for l, index in self.l_index.items()}
def l_val_raw_index(self):
return {l: index.val_raw for l, index in self.l_index.items()}
def l_test_raw_index(self):
return {l: index.test_raw for l, index in self.l_index.items()}
def l_devel_raw_index(self):
return {l: index.devel_raw for l, index in self.l_index.items()}
def l_val_target(self):
return {l: index.val_target for l, index in self.l_index.items()}
def l_test_target(self):
return {l: index.test_target for l, index in self.l_index.items()}
def l_test_index(self):
return {l: index.test_index for l, index in self.l_index.items()}
def l_devel_index(self):
return {l: index.devel_index for l, index in self.l_index.items()}
def l_devel_target(self):
return {l: index.devel_target for l, index in self.l_index.items()}
def l_train(self):
return self.l_train_index(), self.l_train_target()
def l_val(self):
return self.l_val_index(), self.l_val_target()
def l_test(self):
return self.l_test_index(), self.l_test_target()
def l_train_raw(self):
return self.l_train_raw_index(), self.l_train_target()
def l_val_raw(self):
return self.l_val_raw_index(), self.l_val_target()
def l_test_raw(self):
return self.l_test_raw_index(), self.l_test_target()
def l_devel_raw(self):
return self.l_devel_raw_index(), self.l_devel_target()
def get_l_pad_index(self):
return {l: index.get_pad_index() for l, index in self.l_index.items()}
class Index:
def __init__(self, devel_raw, devel_target, test_raw, test_target, lang):
"""
Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into
training and validation.
        :param devel_raw: list of strings, raw development (training) texts
        :param devel_target: target label matrix of the development texts
        :param test_raw: list of strings, raw test texts
        :param test_target: target label matrix of the test texts
        :param lang: str, language code of this monolingual index
        """
self.lang = lang
self.devel_raw = devel_raw
self.devel_target = devel_target
self.test_raw = test_raw
self.test_target = test_target
def index(self, pretrained_vocabulary, analyzer, vocabulary):
self.word2index = dict(vocabulary)
known_words = set(self.word2index.keys())
if pretrained_vocabulary is not None:
known_words.update(pretrained_vocabulary)
self.word2index['UNKTOKEN'] = len(self.word2index)
self.word2index['PADTOKEN'] = len(self.word2index)
self.unk_index = self.word2index['UNKTOKEN']
self.pad_index = self.word2index['PADTOKEN']
# index documents and keep track of test terms outside the development vocabulary that are in Muse (if available)
self.out_of_vocabulary = dict()
self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index,
self.out_of_vocabulary)
self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index,
self.out_of_vocabulary)
self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary)
print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}')
def get_pad_index(self):
return self.pad_index
def train_val_split(self, val_prop, max_val, seed):
devel = self.devel_index
target = self.devel_target
devel_raw = self.devel_raw
val_size = int(min(len(devel) * val_prop, max_val))
self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \
train_test_split(
devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True)
print(
f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}')
def get_word_list(self):
def extract_word_list(word2index):
return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])]
word_list = extract_word_list(self.word2index)
word_list += extract_word_list(self.out_of_vocabulary)
return word_list
def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None):
print(f'[generating embedding matrix for lang {self.lang}]')
self.wce_range = None
embedding_parts = []
if pretrained is not None:
print('\t[pretrained-matrix]')
embedding_parts.append(pretrained)
del pretrained
if supervised:
print('\t[supervised-matrix]')
F = supervised_embeddings_tfidf(Xtr, Ytr)
num_missing_rows = self.vocabsize - F.shape[0]
F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1]))))
F = torch.from_numpy(F).float()
offset = 0
if embedding_parts:
offset = embedding_parts[0].shape[1]
self.wce_range = [offset, offset + F.shape[1]]
embedding_parts.append(F)
self.embedding_matrix = torch.cat(embedding_parts, dim=1)
print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]')
def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary):
"""
Index (i.e., replaces word strings with numerical indexes) a list of string documents
:param data: list of string documents
:param vocab: a fixed mapping [str]->[int] of words to indexes
:param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained
because they are anyway contained in a pre-trained embedding set that we know in advance)
:param analyzer: the preprocessor in charge of transforming the document string into a chain of string words
:param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep
:param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that
are not in the original vocab but that are in the known_words
:return:
"""
indexes = []
vocabsize = len(vocab)
unk_count = 0
knw_count = 0
out_count = 0
# pbar = tqdm(data, desc=f'indexing')
for text in data:
words = analyzer(text)
index = []
for word in words:
if word in vocab:
idx = vocab[word]
else:
if word in known_words:
if word not in out_of_vocabulary:
out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary)
idx = out_of_vocabulary[word]
out_count += 1
else:
idx = unk_index
unk_count += 1
index.append(idx)
indexes.append(index)
knw_count += len(index)
# pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]'
# f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]')
return indexes
def is_true(tensor, device):
return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))
def is_false(tensor, device):
return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device))
def define_pad_length(index_list):
lengths = [len(index) for index in index_list]
return int(np.mean(lengths) + np.std(lengths))
def pad(index_list, pad_index, max_pad_length=None):
pad_length = np.max([len(index) for index in index_list])
if max_pad_length is not None:
pad_length = min(pad_length, max_pad_length)
for i, indexes in enumerate(index_list):
index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length]
return index_list
def get_params(optimc=False):
if not optimc:
return None
c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1]
kernel = 'rbf'
return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}]
def get_method_name(args):
_id = ''
_id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, args.gru_embedder]
_id_name = ['X', 'W', 'M', 'B', 'G']
for i, conf in enumerate(_id_conf):
if conf:
_id += _id_name[i]
_id = _id if not args.gru_wce else _id + '_wce'
_dataset_path = args.dataset.split('/')[-1].split('_')
dataset_id = _dataset_path[0] + _dataset_path[-1]
return _id, dataset_id
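# Hedged usage sketch (not part of the original file): shows the left-padding policy implemented by
# define_pad_length() and pad() above, where the pad length is int(mean + std) of the document lengths,
# capped by the longest document actually present. `_padding_example` is an illustration-only helper.
def _padding_example():
    docs = [[1, 2, 3], [4, 5], [6]]
    pad_len = define_pad_length(docs)        # int(mean([3, 2, 1]) + std([3, 2, 1])) == 2
    # documents are left-padded with the pad index and truncated to pad_len:
    # -> [[1, 2], [4, 5], [0, 6]]  (note: pad() also modifies `docs` in place)
    return pad(docs, pad_index=0, max_pad_length=pad_len)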

View File

@ -1,104 +0,0 @@
from abc import ABC, abstractmethod
import numpy as np
import torch
from torchtext.vocab import Vectors
from util.SIF_embed import remove_pc
class PretrainedEmbeddings(ABC):
def __init__(self):
super().__init__()
@abstractmethod
def vocabulary(self): pass
@abstractmethod
def dim(self): pass
@classmethod
def reindex(cls, words, word2index):
if isinstance(words, dict):
words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0]
source_idx, target_idx = [], []
for i, word in enumerate(words):
if word not in word2index:
continue
j = word2index[word]
source_idx.append(i)
target_idx.append(j)
source_idx = np.asarray(source_idx)
target_idx = np.asarray(target_idx)
return source_idx, target_idx
class MuseLoader:
def __init__(self, langs, cache):
self.langs = langs
self.lEmbed = {}
self.lExtracted = {}
for lang in self.langs:
print(f'Loading vectors for {lang}...')
self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache)
def dim(self):
return self.lEmbed[list(self.lEmbed.keys())[0]].dim
def vocabulary(self):
return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs}
def extract(self, lVoc):
"""
        Reindexes the loaded pretrained embeddings so that they match the indexes assigned by the scikit-learn
        vectorizer. Such indexes are consistent with those used by the Word-Class Embeddings (since we deploy the same vectorizer).
:param lVoc: dict {lang : {word : id}}
:return: torch embedding matrix of extracted embeddings i.e., words in lVoc
"""
for lang, words in lVoc.items():
print(f'Extracting words for lang {lang}...')
# words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0]
source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi)
extraction = torch.zeros((len(words), self.dim()))
extraction[source_id] = self.lEmbed[lang].vectors[target_id]
self.lExtracted[lang] = extraction
return self.lExtracted
def get_lEmbeddings(self):
return {lang: self.lEmbed[lang].vectors for lang in self.langs}
def XdotM(X, M, sif):
E = X.dot(M)
if sif:
E = remove_pc(E, npc=1)
return E
def wce_matrix(X, Y):
wce = supervised_embeddings_tfidf(X, Y)
wce = zscores(wce, axis=0)
return wce
def supervised_embeddings_tfidf(X, Y):
tfidf_norm = X.sum(axis=0)
tfidf_norm[tfidf_norm == 0] = 1
F = (X.T).dot(Y) / tfidf_norm.T
return F
def zscores(X, axis=0):
"""
    scipy.stats.zscore does not guard against division by 0, which can indeed occur
:param X:
:param axis:
:return:
"""
std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None)
mean = np.mean(X, axis=axis)
return (X - mean) / std
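# Hedged sketch (not part of the original file): a toy computation of the word-class correlation matrix
# produced by supervised_embeddings_tfidf(), i.e., F = X^T Y scaled by the per-term tf-idf mass, with one
# row per term and one column per class. `_wce_example` is an illustration-only helper; X is sparse,
# mirroring the output of the tf-idf vectorizer.
def _wce_example():
    from scipy.sparse import csr_matrix
    X = csr_matrix(np.array([[1., 0.], [1., 1.]]))   # 2 documents x 2 terms (tf-idf weights)
    Y = np.array([[1, 0], [0, 1]])                   # 2 documents x 2 classes
    return supervised_embeddings_tfidf(X, Y)         # ~[[0.5, 0.5], [0., 1.]]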

View File

@ -1,20 +0,0 @@
import numpy as np
from joblib import Parallel, delayed
from util.metrics import *
def evaluation_metrics(y, y_):
if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label
raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro')
else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers
return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_)
def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1):
if n_jobs == 1:
return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()}
else:
langs = list(ly_true.keys())
evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs)
return {lang: evals[i] for i, lang in enumerate(langs)}

View File

@ -1,50 +0,0 @@
import urllib
from os import listdir, makedirs
from os.path import isdir, isfile, join, exists, dirname
from pathlib import Path
def download_file(url, archive_filename):
def progress(blocknum, bs, size):
total_sz_mb = '%.2f MB' % (size / 1e6)
current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6)
print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='')
print("Downloading %s" % url)
urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress)
print("")
def download_file_if_not_exists(url, archive_path):
if exists(archive_path): return
makedirs_if_not_exist(dirname(archive_path))
download_file(url,archive_path)
def ls(dir, typecheck):
el = [f for f in listdir(dir) if typecheck(join(dir, f))]
el.sort()
return el
def list_dirs(dir):
return ls(dir, typecheck=isdir)
def list_files(dir):
return ls(dir, typecheck=isfile)
def makedirs_if_not_exist(path):
if not exists(path): makedirs(path)
def create_if_not_exist(path):
if not exists(path): makedirs(path)
def get_parent_name(path):
return Path(path).parent
def get_file_name(path):
return Path(path).name

View File

@ -1,152 +0,0 @@
import numpy as np
class ContTable:
def __init__(self, tp=0, tn=0, fp=0, fn=0):
self.tp = tp
self.tn = tn
self.fp = fp
self.fn = fn
def get_d(self): return self.tp + self.tn + self.fp + self.fn
def get_c(self): return self.tp + self.fn
def get_not_c(self): return self.tn + self.fp
def get_f(self): return self.tp + self.fp
def get_not_f(self): return self.tn + self.fn
def p_c(self): return (1.0*self.get_c())/self.get_d()
def p_not_c(self): return 1.0-self.p_c()
def p_f(self): return (1.0*self.get_f())/self.get_d()
def p_not_f(self): return 1.0-self.p_f()
def p_tp(self): return (1.0*self.tp) / self.get_d()
def p_tn(self): return (1.0*self.tn) / self.get_d()
def p_fp(self): return (1.0*self.fp) / self.get_d()
def p_fn(self): return (1.0*self.fn) / self.get_d()
def tpr(self):
c = 1.0*self.get_c()
return self.tp / c if c > 0.0 else 0.0
def fpr(self):
_c = 1.0*self.get_not_c()
return self.fp / _c if _c > 0.0 else 0.0
def __add__(self, other):
return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn)
def accuracy(cell):
return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn)
def f1(cell):
num = 2.0 * cell.tp
den = 2.0 * cell.tp + cell.fp + cell.fn
if den > 0:
return num / den
# we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative
return 1.0
def K(cell):
specificity, recall = 0., 0.
AN = cell.tn + cell.fp
if AN != 0:
specificity = cell.tn*1. / AN
AP = cell.tp + cell.fn
if AP != 0:
recall = cell.tp*1. / AP
if AP == 0:
return 2. * specificity - 1.
elif AN == 0:
return 2. * recall - 1.
else:
return specificity + recall - 1.
# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared
# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions.
def __check_consistency_and_adapt(true_labels, predictions):
if predictions.ndim == 1:
return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1))
if true_labels.ndim == 1:
return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions)
if true_labels.shape != predictions.shape:
raise ValueError("True and predicted label matrices shapes are inconsistent %s %s."
% (true_labels.shape, predictions.shape))
_, nC = true_labels.shape
return true_labels, predictions, nC
# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior
# probabilities with respect to the true binary labels
# true_labels and posterior_probabilities are two vectors of shape (number_documents,)
def soft_single_metric_statistics(true_labels, posterior_probabilities):
assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels."
tp = np.sum(posterior_probabilities[true_labels == 1])
fn = np.sum(1. - posterior_probabilities[true_labels == 1])
fp = np.sum(posterior_probabilities[true_labels == 0])
tn = np.sum(1. - posterior_probabilities[true_labels == 0])
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
# computes the (hard) counters tp, fp, fn, and tn from true and predicted vectors of hard decisions
# true_labels and predicted_labels are two vectors of shape (number_documents,)
def hard_single_metric_statistics(true_labels, predicted_labels):
assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels."
nd = len(true_labels)
tp = np.sum(predicted_labels[true_labels == 1])
fp = np.sum(predicted_labels[true_labels == 0])
fn = np.sum(true_labels[predicted_labels == 0])
tn = nd - (tp+fp+fn)
return ContTable(tp=tp, tn=tn, fp=fp, fn=fn)
def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)])
def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics):
true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels)
accum = ContTable()
for c in range(nC):
other = metric_statistics(true_labels[:, c], predicted_labels[:, c])
accum = accum + other
return metric(accum)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroF1(true_labels, predicted_labels):
return macro_average(true_labels, predicted_labels, f1)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microF1(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, f1)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def macroK(true_labels, predicted_labels):
return macro_average(true_labels, predicted_labels, K)
# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format
def microK(true_labels, predicted_labels):
return micro_average(true_labels, predicted_labels, K)
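# Hedged usage sketch (not part of the original file): the entry points above expect true and predicted
# label matrices in sklearn.preprocessing.MultiLabelBinarizer format, i.e., (n_documents, n_classes)
# binary matrices. `_metrics_example` is an illustration-only helper on toy labels.
def _metrics_example():
    y_true = np.array([[1, 0], [1, 1], [0, 1]])
    y_pred = np.array([[1, 0], [0, 1], [0, 1]])
    # F1 lies in [0, 1]; K lies in [-1, 1]
    return macroF1(y_true, y_pred), microF1(y_true, y_pred), macroK(y_true, y_pred), microK(y_true, y_pred)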

View File

@ -1,141 +0,0 @@
import torch
from pytorch_lightning.metrics import Metric
from util.common import is_false, is_true
def _update(pred, target, device):
assert pred.shape == target.shape
# preparing preds and targets for count
true_pred = is_true(pred, device)
false_pred = is_false(pred, device)
true_target = is_true(target, device)
false_target = is_false(target, device)
tp = torch.sum(true_pred * true_target, dim=0)
tn = torch.sum(false_pred * false_target, dim=0)
fp = torch.sum(true_pred * false_target, dim=0)
    fn = torch.sum(false_pred * true_target, dim=0)
return tp, tn, fp, fn
class CustomF1(Metric):
def __init__(self, num_classes, device, average='micro'):
"""
Custom F1 metric.
Scikit learn provides a full set of evaluation metrics, but they treat special cases differently.
I.e., when the number of true positives, false positives, and false negatives amount to 0, all
affected metrics (precision, recall, and thus f1) output 0 in Scikit learn.
We adhere to the common practice of outputting 1 in this case since the classifier has correctly
classified all examples as negatives.
:param num_classes:
:param device:
:param average:
"""
super().__init__()
self.num_classes = num_classes
self.average = average
self.device = 'cuda' if device else 'cpu'
self.add_state('true_positive', default=torch.zeros(self.num_classes))
self.add_state('true_negative', default=torch.zeros(self.num_classes))
self.add_state('false_positive', default=torch.zeros(self.num_classes))
self.add_state('false_negative', default=torch.zeros(self.num_classes))
def update(self, preds, target):
true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device)
self.true_positive += true_positive
self.true_negative += true_negative
self.false_positive += false_positive
self.false_negative += false_negative
def compute(self):
if self.average == 'micro':
num = 2.0 * self.true_positive.sum()
den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum()
if den > 0:
return (num / den).to(self.device)
return torch.FloatTensor([1.]).to(self.device)
if self.average == 'macro':
class_specific = []
for i in range(self.num_classes):
class_tp = self.true_positive[i]
class_tn = self.true_negative[i]
class_fp = self.false_positive[i]
class_fn = self.false_negative[i]
num = 2.0 * class_tp
den = 2.0 * class_tp + class_fp + class_fn
if den > 0:
class_specific.append(num / den)
else:
class_specific.append(1.)
average = torch.sum(torch.Tensor(class_specific))/self.num_classes
return average.to(self.device)
class CustomK(Metric):
def __init__(self, num_classes, device, average='micro'):
"""
K metric. https://dl.acm.org/doi/10.1145/2808194.2809449
:param num_classes:
:param device:
:param average:
"""
super().__init__()
self.num_classes = num_classes
self.average = average
self.device = 'cuda' if device else 'cpu'
self.add_state('true_positive', default=torch.zeros(self.num_classes))
self.add_state('true_negative', default=torch.zeros(self.num_classes))
self.add_state('false_positive', default=torch.zeros(self.num_classes))
self.add_state('false_negative', default=torch.zeros(self.num_classes))
def update(self, preds, target):
true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device)
self.true_positive += true_positive
self.true_negative += true_negative
self.false_positive += false_positive
self.false_negative += false_negative
def compute(self):
if self.average == 'micro':
specificity, recall = 0., 0.
absolute_negatives = self.true_negative.sum() + self.false_positive.sum()
if absolute_negatives != 0:
specificity = self.true_negative.sum()/absolute_negatives
absolute_positives = self.true_positive.sum() + self.false_negative.sum()
if absolute_positives != 0:
recall = self.true_positive.sum()/absolute_positives
if absolute_positives == 0:
return 2. * specificity - 1
elif absolute_negatives == 0:
return 2. * recall - 1
else:
return specificity + recall - 1
if self.average == 'macro':
class_specific = []
for i in range(self.num_classes):
class_tp = self.true_positive[i]
class_tn = self.true_negative[i]
class_fp = self.false_positive[i]
class_fn = self.false_negative[i]
specificity, recall = 0., 0.
absolute_negatives = class_tn + class_fp
if absolute_negatives != 0:
specificity = class_tn / absolute_negatives
absolute_positives = class_tp + class_fn
if absolute_positives != 0:
recall = class_tp / absolute_positives
if absolute_positives == 0:
class_specific.append(2. * specificity - 1)
elif absolute_negatives == 0:
class_specific.append(2. * recall - 1)
else:
class_specific.append(specificity + recall - 1)
average = torch.sum(torch.Tensor(class_specific)) / self.num_classes
return average.to(self.device)
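# Hedged usage sketch (not part of the original file): both metrics follow the pytorch_lightning Metric
# protocol, accumulating state over repeated update() calls (here triggered via __call__) and reducing it
# with compute(). `_custom_metrics_example` is an illustration-only helper; device=False keeps state on cpu.
def _custom_metrics_example():
    f1 = CustomF1(num_classes=2, device=False, average='macro')
    preds = torch.tensor([[1, 0], [0, 1]])
    target = torch.tensor([[1, 0], [1, 1]])
    return f1(preds, target)    # ~0.83 on this toy batch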

View File

@ -1,53 +0,0 @@
import os
import numpy as np
import pandas as pd
class CSVlog:
def __init__(self, file, autoflush=True, verbose=False):
self.file = file
self.columns = ['method',
'setting',
'optimc',
'sif',
'zscore',
'l2',
'dataset',
'time_tr',
'time_te',
'lang',
'macrof1',
'microf1',
'macrok',
'microk',
'notes']
self.autoflush = autoflush
self.verbose = verbose
if os.path.exists(file):
self.tell('Loading existing file from {}'.format(file))
self.df = pd.read_csv(file, sep='\t')
else:
self.tell('File {} does not exist. Creating new frame.'.format(file))
dir = os.path.dirname(self.file)
if dir and not os.path.exists(dir): os.makedirs(dir)
self.df = pd.DataFrame(columns=self.columns)
def already_calculated(self, id):
return (self.df['id'] == id).any()
def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''):
s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang,
macrof1, microf1, macrok, microk, notes],
index=self.columns)
self.df = self.df.append(s, ignore_index=True)
if self.autoflush: self.flush()
self.tell(s.to_string())
def flush(self):
self.df.to_csv(self.file, index=False, sep='\t')
def tell(self, msg):
if self.verbose:
print(msg)

View File

@ -1,36 +0,0 @@
import numpy as np
class StandardizeTransformer:
def __init__(self, axis=0, range=None):
"""
:param axis:
:param range:
"""
assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice'
self.axis = axis
self.yetfit = False
self.range = range
def fit(self, X):
print('Applying z-score standardization...')
std=np.std(X, axis=self.axis, ddof=1)
self.std = np.clip(std, 1e-5, None)
self.mean = np.mean(X, axis=self.axis)
if self.range is not None:
ones = np.ones_like(self.std)
zeros = np.zeros_like(self.mean)
ones[self.range] = self.std[self.range]
zeros[self.range] = self.mean[self.range]
self.std = ones
self.mean = zeros
self.yetfit=True
return self
def transform(self, X):
        assert self.yetfit, 'transform called before fit'
return (X - self.mean) / self.std
def fit_transform(self, X):
return self.fit(X).transform(X)
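# Hedged usage sketch (not part of the original file): standardizes only the columns selected by `range`,
# leaving the remaining columns untouched. `_standardize_example` is an illustration-only helper.
def _standardize_example():
    X = np.array([[1., 10.], [3., 30.], [5., 50.]])
    zscorer = StandardizeTransformer(axis=0, range=slice(0, 1))   # standardize column 0 only
    return zscorer.fit_transform(X)    # column 0 -> [-1, 0, 1], column 1 unchanged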

View File

@ -1,375 +0,0 @@
"""
This module contains the view generators that take care of computing the view-specific document embeddings:
- VanillaFunGen (-x): casts document representations encoded via TFIDF into posterior probabilities by means of SVMs.
- WordClassGen (-w): generates document representations via Word-Class Embeddings.
  Document embeddings are obtained via a weighted sum of the documents' constituent embeddings.
- MuseGen (-m): generates document representations via MUSE embeddings.
  Document embeddings are obtained via a weighted sum of the documents' constituent embeddings.
- RecurrentGen (-g): generates document embeddings by means of a Gated Recurrent Unit. The model can be
  initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
  Output dimension is (n_docs, 512).
- BertGen (-b): generates document embeddings via the mBERT model.
"""
from abc import ABC, abstractmethod
from time import time
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from data.datamodule import RecurrentDataModule, BertDataModule, tokenize
from models.learners import *
from models.pl_bert import BertModel
from models.pl_gru import RecurrentModel
from util.common import TfidfVectorizerMultilingual, _normalize
from util.embeddings_manager import MuseLoader, XdotM, wce_matrix
class ViewGen(ABC):
"""
Abstract class for ViewGenerators implementations. Every ViewGen should implement these three methods in order to
be seamlessly integrated in the overall architecture.
"""
@abstractmethod
def fit(self, lX, ly):
pass
@abstractmethod
def transform(self, lX):
pass
@abstractmethod
def fit_transform(self, lX, ly):
pass
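# Hedged sketch (not part of the original file): the minimal surface a new view generator must expose to be
# plugged into the architecture, namely the three abstract methods declared above. `IdentityGen` is a
# hypothetical, illustration-only class that returns its (already vectorized) input unchanged.
class IdentityGen(ViewGen):
    """Toy view generator: returns the input representations as-is."""
    def fit(self, lX, ly):
        return self
    def transform(self, lX):
        return lX
    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)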
class VanillaFunGen(ViewGen):
"""
View Generator (x): original funnelling architecture proposed by Moreo, Esuli and
Sebastiani in DOI: https://doi.org/10.1145/3326065
"""
def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1):
"""
Init Posterior Probabilities embedder (i.e., VanillaFunGen)
        :param base_learner: naive monolingual learner to be deployed as first-tier learner. Should be able to
         return posterior probabilities.
        :param first_tier_parameters: parameters for the first-tier learners (None to use the learners' defaults)
        :param n_jobs: int, number of concurrent workers
"""
super().__init__()
self.learners = base_learner
self.first_tier_parameters = first_tier_parameters
self.n_jobs = n_jobs
self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners,
parameters=self.first_tier_parameters, n_jobs=self.n_jobs)
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def fit(self, lX, lY):
print('# Fitting VanillaFunGen (X)...')
lX = self.vectorizer.fit_transform(lX)
self.doc_projector.fit(lX, lY)
return self
def transform(self, lX):
"""
        (1) Vectorizes documents; (2) projects them according to the first-tier learners (SVMs); (3) applies L2
        normalization to the projection and returns it.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
lZ = self.doc_projector.predict_proba(lX)
lZ = _normalize(lZ, l2=True)
return lZ
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
class MuseGen(ViewGen):
"""
View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word
embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings.
"""
def __init__(self, muse_dir='../embeddings', n_jobs=-1):
"""
Init the MuseGen.
:param muse_dir: string, path to folder containing muse embeddings
:param n_jobs: int, number of concurrent workers
"""
super().__init__()
self.muse_dir = muse_dir
self.n_jobs = n_jobs
self.langs = None
self.lMuse = None
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def fit(self, lX, ly):
"""
(1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting MuseGen (M)...')
self.vectorizer.fit(lX)
self.langs = sorted(lX.keys())
self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir)
lVoc = self.vectorizer.vocabulary()
self.lMuse = self.lMuse.extract(lVoc) # overwriting lMuse with dict {lang : embed_matrix} with only known words
# TODO: featureweight.fit
return self
def transform(self, lX):
"""
        (1) Vectorizes documents; (2) computes the weighted sum of the MUSE embeddings at the document level;
        (3) applies L2 normalization to the resulting embeddings and returns them.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
XdotMUSE = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs)
lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)}
lZ = _normalize(lZ, l2=True)
return lZ
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
class WordClassGen(ViewGen):
"""
View Generator (w): generates document representation via Word-Class-Embeddings.
Document embeddings are obtained via weighted sum of document's constituent embeddings.
"""
def __init__(self, n_jobs=-1):
"""
Init WordClassGen.
:param n_jobs: int, number of concurrent workers
"""
super().__init__()
self.n_jobs = n_jobs
self.langs = None
self.lWce = None
self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)
def fit(self, lX, ly):
"""
        (1) Vectorizes documents; (2) computes the Word-Class Embeddings from the vectorized documents and their labels.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting WordClassGen (W)...')
lX = self.vectorizer.fit_transform(lX)
self.langs = sorted(lX.keys())
wce = Parallel(n_jobs=self.n_jobs)(
delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs)
self.lWce = {l: wce[i] for i, l in enumerate(self.langs)}
# TODO: featureweight.fit()
return self
def transform(self, lX):
"""
        (1) Vectorizes documents; (2) computes the weighted sum of the Word-Class Embeddings at the document level;
        (3) applies L2 normalization to the resulting embeddings and returns them.
:param lX: dict {lang: indexed documents}
:return: document projection to the common latent space.
"""
lX = self.vectorizer.transform(lX)
XdotWce = Parallel(n_jobs=self.n_jobs)(
delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs)
lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)}
lWce = _normalize(lWce, l2=True)
return lWce
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
class RecurrentGen(ViewGen):
"""
    View Generator (G): generates document embeddings by means of a Gated Recurrent Unit. The model can be
    initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.).
Output dimension is (n_docs, 512). The training will happen end-to-end. At inference time, the model returns
the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50,
gpus=0, n_jobs=-1, stored_path=None):
"""
Init RecurrentGen.
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
indexed by language code.
:param pretrained_embeddings: dict {lang: tensor of embeddings}, it contains the pretrained embeddings to use
as embedding layer.
        :param wce: bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, the supervised
         embeddings are concatenated to the deployed pretrained embeddings. WCE dimensionality is equal to
         the number of target classes.
:param batch_size: int, number of samples in a batch.
:param nepochs: int, number of max epochs to train the model.
:param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu.
:param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading).
:param stored_path: str, path to a pretrained model. If None the model will be trained from scratch.
"""
super().__init__()
self.multilingualIndex = multilingualIndex
self.langs = multilingualIndex.langs
self.batch_size = batch_size
self.gpus = gpus
self.n_jobs = n_jobs
self.stored_path = stored_path
self.nepochs = nepochs
# EMBEDDINGS to be deployed
self.pretrained = pretrained_embeddings
self.wce = wce
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce)
self.model = self._init_model()
self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False)
# self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev')
def _init_model(self):
if self.stored_path:
lpretrained = self.multilingualIndex.l_embeddings()
return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained)
else:
lpretrained = self.multilingualIndex.l_embeddings()
langs = self.multilingualIndex.langs
output_size = self.multilingualIndex.get_target_dim()
hidden_size = 512
lvocab_size = self.multilingualIndex.l_vocabsize()
learnable_length = 0
return RecurrentModel(
lPretrained=lpretrained,
langs=langs,
output_size=output_size,
hidden_size=hidden_size,
lVocab_size=lvocab_size,
learnable_length=learnable_length,
drop_embedding_range=self.multilingualIndex.sup_range,
drop_embedding_prop=0.5,
gpus=self.gpus
)
def fit(self, lX, ly):
"""
Train the Neural Network end-to-end.
        lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation
        of the Dataset object (RecurrentDataset) within the RecurrentDataModule class.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
print('# Fitting RecurrentGen (G)...')
recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs)
trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs,
checkpoint_callback=False)
# vanilla_torch_model = torch.load(
# '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle')
# self.model.linear0 = vanilla_torch_model.linear0
# self.model.linear1 = vanilla_torch_model.linear1
# self.model.linear2 = vanilla_torch_model.linear2
# self.model.rnn = vanilla_torch_model.rnn
trainer.fit(self.model, datamodule=recurrentDataModule)
trainer.test(self.model, datamodule=recurrentDataModule)
return self
def transform(self, lX):
"""
Project documents to the common latent space. Output dimensionality is 512.
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
l_pad = self.multilingualIndex.l_pad()
data = self.multilingualIndex.l_devel_index()
self.model.to('cuda' if self.gpus else 'cpu')
self.model.eval()
time_init = time()
l_embeds = self.model.encode(data, l_pad, batch_size=256)
transform_time = round(time() - time_init, 3)
print(f'Executed! Transform took: {transform_time}')
return l_embeds
def fit_transform(self, lX, ly):
return self.fit(lX, ly).transform(lX)
class BertGen(ViewGen):
"""
View Generator (b): generates document embedding via Bert model. The training happens end-to-end.
At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document
embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard.
"""
def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None):
"""
Init Bert model
:param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents
indexed by language code.
:param batch_size: int, number of samples per batch.
:param nepochs: int, number of max epochs to train the model.
:param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu.
:param n_jobs: int, number of concurrent workers.
:param stored_path: str, path to a pretrained model. If None the model will be trained from scratch.
"""
super().__init__()
self.multilingualIndex = multilingualIndex
self.nepochs = nepochs
self.gpus = gpus
self.batch_size = batch_size
self.n_jobs = n_jobs
self.stored_path = stored_path
self.model = self._init_model()
self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False)
def _init_model(self):
output_size = self.multilingualIndex.get_target_dim()
return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus)
def fit(self, lX, ly):
"""
Train the Neural Network end-to-end.
        lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation
        of the Dataset object within the BertDataModule class.
:param lX: dict {lang: indexed documents}
:param ly: dict {lang: target vectors}
:return: self.
"""
        print('# Fitting BertGen (B)...')
self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1)
bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512)
trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus,
logger=self.logger, checkpoint_callback=False)
trainer.fit(self.model, datamodule=bertDataModule)
trainer.test(self.model, datamodule=bertDataModule)
return self
def transform(self, lX):
"""
Project documents to the common latent space. Output dimensionality is 768.
:param lX: dict {lang: indexed documents}
:return: documents projected to the common latent space.
"""
data = self.multilingualIndex.l_devel_raw_index()
data = tokenize(data, max_len=512)
self.model.to('cuda' if self.gpus else 'cpu')
self.model.eval()
time_init = time()
        l_embeds = self.model.encode(data, batch_size=64)
transform_time = round(time() - time_init, 3)
print(f'Executed! Transform took: {transform_time}')
        return l_embeds
def fit_transform(self, lX, ly):
# we can assume that we have already indexed data for transform() since we are first calling fit()
return self.fit(lX, ly).transform(lX)