diff --git a/refactor/data/__init__.py b/refactor/data/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py deleted file mode 100644 index da6ec92..0000000 --- a/refactor/data/datamodule.py +++ /dev/null @@ -1,222 +0,0 @@ -import numpy as np -import pytorch_lightning as pl -import torch -from torch.utils.data import Dataset, DataLoader -from transformers import BertTokenizer - -N_WORKERS = 8 - - -class RecurrentDataset(Dataset): - def __init__(self, lX, ly, lPad_index): - """ - :param lX: dict {lang_id : np.ndarray} - :param ly: - """ - self.lX = [] - self.ly = [] - self.lOffset = {} - self.lPad_index = lPad_index - - for lang, data in lX.items(): - offset = [len(self.lX)] - self.lX.extend(data) - offset.append(len(self.lX)) - self.lOffset[lang] = offset - - for lang, target in ly.items(): - self.ly.extend(target) - - def __len__(self): - return len(self.lX) - - def __getitem__(self, index): - X = self.lX[index] - y = self.ly[index] - return X, y, index, self._get_lang(index) - - def _get_lang(self, index): - for lang, l_range in self.lOffset.items(): - if index in range(l_range[0], l_range[1]): - return lang - - def collate_fn(self, data): - """ - Takes care of padding the batch and also check consistency of batch languages. Groups into dict {lang : lang_batch} - items sampled from the Dataset class. - :param data: - :return: - """ - lX_batch = {} - ly_batch = {} - current_lang = data[0][-1] - for d in data: - if d[-1] == current_lang: - if current_lang not in lX_batch.keys(): - lX_batch[current_lang] = [] - ly_batch[current_lang] = [] - lX_batch[current_lang].append(d[0]) - ly_batch[current_lang].append(d[1]) - else: - current_lang = d[-1] - lX_batch[current_lang] = [] - ly_batch[current_lang] = [] - lX_batch[current_lang].append(d[0]) - ly_batch[current_lang].append(d[1]) - - for lang in lX_batch.keys(): - lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang], - max_pad_length=self.define_pad_length(lX_batch[lang])) - lX_batch[lang] = torch.LongTensor(lX_batch[lang]) - ly_batch[lang] = torch.FloatTensor(ly_batch[lang]) - - return lX_batch, ly_batch - - @staticmethod - def define_pad_length(index_list): - lengths = [len(index) for index in index_list] - return int(np.mean(lengths) + np.std(lengths)) - - @staticmethod - def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i, indexes in enumerate(index_list): - index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] - return index_list - - -class RecurrentDataModule(pl.LightningDataModule): - """ - Pytorch Lightning Datamodule to be deployed with RecurrentGen. - https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html - """ - def __init__(self, multilingualIndex, batchsize=64, n_jobs=-1): - """ - Init RecurrentDataModule. - :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents - indexed by language code. - :param batchsize: int, number of sample per batch. - :param n_jobs: int, number of concurrent workers to be deployed (i.e., parallelizing data loading). 
- """ - self.multilingualIndex = multilingualIndex - self.batchsize = batchsize - self.n_jobs = n_jobs - super().__init__() - - def prepare_data(self, *args, **kwargs): - pass - - def setup(self, stage=None): - if stage == 'fit' or stage is None: - l_train_index, l_train_target = self.multilingualIndex.l_train() - # Debug settings: reducing number of samples - l_train_index = {l: train[:5] for l, train in l_train_index.items()} - l_train_target = {l: target[:5] for l, target in l_train_target.items()} - - self.training_dataset = RecurrentDataset(l_train_index, l_train_target, - lPad_index=self.multilingualIndex.l_pad()) - - l_val_index, l_val_target = self.multilingualIndex.l_val() - # Debug settings: reducing number of samples - l_val_index = {l: train[:5] for l, train in l_val_index.items()} - l_val_target = {l: target[:5] for l, target in l_val_target.items()} - - self.val_dataset = RecurrentDataset(l_val_index, l_val_target, - lPad_index=self.multilingualIndex.l_pad()) - if stage == 'test' or stage is None: - l_test_index, l_test_target = self.multilingualIndex.l_test() - # Debug settings: reducing number of samples - l_test_index = {l: train[:5] for l, train in l_test_index.items()} - l_test_target = {l: target[:5] for l, target in l_test_target.items()} - - self.test_dataset = RecurrentDataset(l_test_index, l_test_target, - lPad_index=self.multilingualIndex.l_pad()) - - def train_dataloader(self): - return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, - collate_fn=self.training_dataset.collate_fn) - - def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, - collate_fn=self.val_dataset.collate_fn) - - def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=self.n_jobs, - collate_fn=self.test_dataset.collate_fn) - - -def tokenize(l_raw, max_len): - """ - run Bert tokenization on dict {lang: list of samples}. - :param l_raw: - :param max_len: - :return: - """ - tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') - l_tokenized = {} - for lang in l_raw.keys(): - output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length') - l_tokenized[lang] = output_tokenizer['input_ids'] - return l_tokenized - - -class BertDataModule(RecurrentDataModule): - """ - Pytorch Lightning Datamodule to be deployed with BertGen. - https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html - """ - def __init__(self, multilingualIndex, batchsize=64, max_len=512): - """ - Init BertDataModule. - :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents - indexed by language code. - :param batchsize: int, number of sample per batch. - :param max_len: int, max number of token per document. Absolute cap is 512. 
- """ - super().__init__(multilingualIndex, batchsize) - self.max_len = max_len - - def setup(self, stage=None): - if stage == 'fit' or stage is None: - l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() - # Debug settings: reducing number of samples - l_train_raw = {l: train[:5] for l, train in l_train_raw.items()} - l_train_target = {l: target[:5] for l, target in l_train_target.items()} - - l_train_index = tokenize(l_train_raw, max_len=self.max_len) - self.training_dataset = RecurrentDataset(l_train_index, l_train_target, - lPad_index=self.multilingualIndex.l_pad()) - - l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() - # Debug settings: reducing number of samples - l_val_raw = {l: train[:5] for l, train in l_val_raw.items()} - l_val_target = {l: target[:5] for l, target in l_val_target.items()} - - l_val_index = tokenize(l_val_raw, max_len=self.max_len) - self.val_dataset = RecurrentDataset(l_val_index, l_val_target, - lPad_index=self.multilingualIndex.l_pad()) - - if stage == 'test' or stage is None: - l_test_raw, l_test_target = self.multilingualIndex.l_test_raw() - # Debug settings: reducing number of samples - l_test_raw = {l: train[:5] for l, train in l_test_raw.items()} - l_test_target = {l: target[:5] for l, target in l_test_target.items()} - - l_test_index = tokenize(l_test_raw, max_len=self.max_len) - self.test_dataset = RecurrentDataset(l_test_index, l_test_target, - lPad_index=self.multilingualIndex.l_pad()) - - def train_dataloader(self): - """ - NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files" - :return: - """ - return DataLoader(self.training_dataset, batch_size=self.batchsize) - - def val_dataloader(self): - return DataLoader(self.val_dataset, batch_size=self.batchsize) - - def test_dataloader(self): - return DataLoader(self.test_dataset, batch_size=self.batchsize) diff --git a/refactor/data/dataset_builder.py b/refactor/data/dataset_builder.py deleted file mode 100644 index 0e91316..0000000 --- a/refactor/data/dataset_builder.py +++ /dev/null @@ -1,712 +0,0 @@ -import itertools -import pickle -import re -from os.path import exists - -import numpy as np -from nltk.corpus import stopwords -from scipy.sparse import csr_matrix -from scipy.sparse import issparse -from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import MultiLabelBinarizer -from tqdm import tqdm - -from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING -from data.reader.jrcacquis_reader import * -from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2 -from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents - - -class MultilingualDataset: - """ - A multilingual dataset is a dictionary of training and test documents indexed by language code. - Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the - documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the - labels of each document, and ids is a list of document-identifiers from the original collection. 
- """ - - def __init__(self): - self.dataset_name = "" - self.multiling_dataset = {} - - def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None): - self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids)) - - def save(self, file): - self.sort_indexes() - pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL) - return self - - def __getitem__(self, item): - if item in self.langs(): - return self.multiling_dataset[item] - return None - - @classmethod - def load(cls, file): - data = pickle.load(open(file, 'rb')) - data.sort_indexes() - return data - - @classmethod - def load_ids(cls, file): - data = pickle.load(open(file, 'rb')) - tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()} - te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()} - return tr_ids, te_ids - - def sort_indexes(self): - for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items(): - if issparse(Xtr): Xtr.sort_indices() - if issparse(Xte): Xte.sort_indices() - - def set_view(self, categories=None, languages=None): - if categories is not None: - if isinstance(categories, int): - categories = np.array([categories]) - elif isinstance(categories, list): - categories = np.array(categories) - self.categories_view = categories - if languages is not None: - self.languages_view = languages - - def training(self, mask_numbers=False, target_as_csr=False): - return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr) - - def test(self, mask_numbers=False, target_as_csr=False): - return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr) - - def lXtr(self, mask_numbers=False): - proc = lambda x:_mask_numbers(x) if mask_numbers else x - # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()} - return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - - def lXte(self, mask_numbers=False): - proc = lambda x: _mask_numbers(x) if mask_numbers else x - # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()} - return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} - - def lYtr(self, as_csr=False): - lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} - if as_csr: - lY = {l:csr_matrix(Y) for l,Y in lY.items()} - return lY - - def lYte(self, as_csr=False): - lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()} - if as_csr: - lY = {l:csr_matrix(Y) for l,Y in lY.items()} - return lY - - def cat_view(self, Y): - if hasattr(self, 'categories_view'): - return Y[:,self.categories_view] - else: - return Y - - def langs(self): - if hasattr(self, 'languages_view'): - langs = self.languages_view - else: - langs = sorted(self.multiling_dataset.keys()) - return langs - - def num_categories(self): - return self.lYtr()[self.langs()[0]].shape[1] - - def show_dimensions(self): - def shape(X): - return X.shape if hasattr(X, 'shape') else len(X) - for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): - if lang not in self.langs(): continue - print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) - - def show_category_prevalences(self): - nC = self.num_categories() - accum_tr = np.zeros(nC, 
dtype=np.int) - accum_te = np.zeros(nC, dtype=np.int) - in_langs = np.zeros(nC, dtype=np.int) # count languages with at least one positive example (per category) - for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): - if lang not in self.langs(): continue - prev_train = np.sum(self.cat_view(Ytr), axis=0) - prev_test = np.sum(self.cat_view(Yte), axis=0) - accum_tr += prev_train - accum_te += prev_test - in_langs += (prev_train>0)*1 - print(lang+'-train', prev_train) - print(lang+'-test', prev_test) - print('all-train', accum_tr) - print('all-test', accum_te) - - return accum_tr, accum_te, in_langs - - def set_labels(self, labels): - self.labels = labels - -def _mask_numbers(data): - mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b') - mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b') - mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b') - mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b') - mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b') - masked = [] - for text in tqdm(data, desc='masking numbers'): - text = ' ' + text - text = mask_moredigit.sub(' MoreDigitMask', text) - text = mask_4digit.sub(' FourDigitMask', text) - text = mask_3digit.sub(' ThreeDigitMask', text) - text = mask_2digit.sub(' TwoDigitMask', text) - text = mask_1digit.sub(' OneDigitMask', text) - masked.append(text.replace('.','').replace(',','').strip()) - return masked - - - - -# ---------------------------------------------------------------------------------------------------------------------- -# Helpers -# ---------------------------------------------------------------------------------------------------------------------- -def get_active_labels(doclist): - cat_list = set() - for d in doclist: - cat_list.update(d.categories) - return list(cat_list) - -def filter_by_categories(doclist, keep_categories): - catset = frozenset(keep_categories) - for d in doclist: - d.categories = list(set(d.categories).intersection(catset)) - -def __years_to_str(years): - if isinstance(years, list): - if len(years) > 1: - return str(years[0])+'-'+str(years[-1]) - return str(years[0]) - return str(years) - - -# ---------------------------------------------------------------------------------------------------------------------- -# Matrix builders -# ---------------------------------------------------------------------------------------------------------------------- -def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True): - """ - Builds the document-by-term weighted matrices for each language. Representations are independent of each other, - i.e., each language-specific matrix lies in a dedicate feature space. - :param dataset_name: the name of the dataset (str) - :param langs: list of languages (str) - :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param label_names: list of names of labels (str) - :param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages - :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) - :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes - by language the processed wikipedia documents in their respective language-specific feature spaces - """ - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - lW = {} - - multilingual_dataset = MultilingualDataset() - multilingual_dataset.dataset_name = dataset_name - multilingual_dataset.set_labels(mlb.classes_) - for lang in langs: - print("\nprocessing %d training, %d test, %d wiki for language <%s>" % - (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang)) - - tr_data, tr_labels, IDtr = zip(*training_docs[lang]) - te_data, te_labels, IDte = zip(*test_docs[lang]) - - if preprocess: - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True, - tokenizer=NLTKStemTokenizer(lang, verbose=True), - stop_words=stopwords.words(NLTK_LANGMAP[lang])) - else: - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) - - Xtr = tfidf.fit_transform(tr_data) - Xte = tfidf.transform(te_data) - if wiki_docs: - lW[lang] = tfidf.transform(wiki_docs[lang]) - - Ytr = mlb.transform(tr_labels) - Yte = mlb.transform(te_labels) - - multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - multilingual_dataset.show_dimensions() - multilingual_dataset.show_category_prevalences() - - if wiki_docs: - return multilingual_dataset, lW - else: - return multilingual_dataset - - -# creates a MultilingualDataset where matrices shares a single yuxtaposed feature space -def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True): - """ - Builds the document-by-term weighted matrices for each language. Representations are not independent of each other, - since all of them lie on the same yuxtaposed feature space. - :param dataset_name: the name of the dataset (str) - :param langs: list of languages (str) - :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) - :param label_names: list of names of labels (str) - :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) - :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes - by language the processed wikipedia documents in their respective language-specific feature spaces - """ - - multiling_dataset = MultilingualDataset() - multiling_dataset.dataset_name = dataset_name - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - multiling_dataset.set_labels(mlb.classes_) - - tr_data_stack = [] - for lang in langs: - print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang)) - tr_data, tr_labels, tr_ID = zip(*training_docs[lang]) - te_data, te_labels, te_ID = zip(*test_docs[lang]) - if preprocess: - tr_data = preprocess_documents(tr_data, lang) - te_data = preprocess_documents(te_data, lang) - tr_data_stack.extend(tr_data) - multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID) - - tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) - tfidf.fit(tr_data_stack) - - for lang in langs: - print("\nweighting documents for language <%s>" % (lang)) - (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang] - Xtr = tfidf.transform(tr_data) - Xte = tfidf.transform(te_data) - Ytr = mlb.transform(tr_labels) - Yte = mlb.transform(te_labels) - multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID) - - multiling_dataset.show_dimensions() - return multiling_dataset - - -# ---------------------------------------------------------------------------------------------------------------------- -# Methods to recover the original documents from the MultilingualDataset's ids -# ---------------------------------------------------------------------------------------------------------------------- -""" -This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent -article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents -from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath -""" -def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath): - - tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) - assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' - langs = list(tr_ids.keys()) - - print('fetching the datasets') - rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) - - filter_by_categories(rcv1_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_documents + rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - all_docs = rcv1_documents + rcv2_documents - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - for lang in langs: - analyzer = CountVectorizer(strip_accents='unicode', min_df=3, - stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]]) - Xtr = [' '.join(analyzer(d)) for d 
in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - dataset.save(outpath) - -""" -Same thing but for JRC-Acquis -""" -def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath): - - tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) - assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' - langs = list(tr_ids.keys()) - - print('fetching the datasets') - - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, - cat_filter=cat_list, cat_threshold=1, parallel=None, - most_frequent=most_common_cat) - test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, - parallel='force') - - def filter_by_id(doclist, ids): - ids_set = frozenset(itertools.chain.from_iterable(ids.values())) - return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set] - - training_docs = filter_by_id(training_docs, tr_ids) - test_docs = filter_by_id(test_docs, te_ids) - - print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names))) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - dataset = MultilingualDataset() - for lang in langs: - analyzer = CountVectorizer(strip_accents='unicode', min_df=3, - stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang]) - Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang]) - Xtr = [' '.join(analyzer(d)) for d in Xtr] - Xte = [' '.join(analyzer(d)) for d in Xte] - Ytr = mlb.transform(Ytr) - Yte = mlb.transform(Yte) - dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) - - dataset.save(outpath) - -# ---------------------------------------------------------------------------------------------------------------------- -# Dataset Generators -# ---------------------------------------------------------------------------------------------------------------------- -def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0): - from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample - - - """ - Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the - "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. - In all cases, training documents are strictly non-parallel, and test documents are strictly parallel - :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where - all splits will be generated - :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) - :param langs: the list of languages to consider (as defined in data/languages.py) - :param train_years: a list of ints containing the years to be considered as training documents - :param test_years: a list of ints containing the years to be considered as test documents - :param cat_policy: a string indicating which category selection policy to apply. 
Valid policies are, e.g., "all" - (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the - leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details - :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all - :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) - :param run: a numeric label naming the random split (useful to keep track of different runs) - :return: None - """ - - name = 'JRCacquis' - run = '_run' + str(run) - config_name = 'jrc_nltk_' + __years_to_str(train_years) + \ - 'vs' + __years_to_str(test_years) + \ - '_' + cat_policy + \ - ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \ - '_noparallel_processed' - - indep_path = join(jrc_data_home, config_name + run + '.pickle') - upper_path = join(jrc_data_home, config_name + run + '_upper.pickle') - yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle') - wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle') - wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle') - - cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, - cat_filter=cat_list, cat_threshold=1, parallel=None, - most_frequent=most_common_cat) - test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, - parallel='force') - - print('Generating feature-independent dataset...') - training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs) - - def _group_by_lang(doc_list, langs): - return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang] - for lang in langs} - - training_docs = _group_by_lang(training_docs, langs) - training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs) - test_docs = _group_by_lang(test_docs, langs) - if not exists(indep_path): - wiki_docs=None - if max_wiki>0: - if not exists(wiki_docs_path): - wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - - if wiki_docs: - lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs) - pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names) - - lang_data.save(indep_path) - - print('Generating upper-bound (English-only) dataset...') - if not exists(upper_path): - training_docs_eng_only = {'en':training_docs['en']} - test_docs_eng_only = {'en':test_docs['en']} - build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path) - - print('Generating yuxtaposed dataset...') - if not exists(yuxta_path): - build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path) - - -def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs, - train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0): - from 
data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample - """ - Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the - "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. - - :param outpath: path where all splits will be dumped - :param rcv1_data_home: path to the RCV1-v2 dataset (English only) - :param rcv2_data_home: path to the RCV2 dataset (all languages other than English) - :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) - :param langs: the list of languages to consider (as defined in data/languages.py) - :param train_for_lang: maximum number of training documents per language - :param test_for_lang: maximum number of test documents per language - :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) - :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming) - :param run: a numeric label naming the random split (useful to keep track of different runs) - :return: None - """ - - assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets' - assert len(langs)>1, 'the multilingual dataset cannot be built with only one dataset' - assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \ - "languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing" - - name = 'RCV1/2' - run = '_run' + str(run) - config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\ - ('_processed' if preprocess else '_raw') - - indep_path = join(outpath, config_name + run + '.pickle') - upper_path = join(outpath, config_name + run +'_upper.pickle') - yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle') - wiki_path = join(outpath, config_name + run + '.wiki.pickle') - wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle') - - print('fetching the datasets') - rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en']) - filter_by_categories(rcv1_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_documents+rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs} - - # for the upper bound there are no parallel versions, so for the English case, we take as many documents as there - # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases - print('Generating upper-bound (English-only) dataset...') - train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True) - train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]} - test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]} - build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path) - - train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang] - for lang in langs: - if lang=='en': 
continue # already split - test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang) - train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True) - train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train] - test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test] - - print('Generating feature-independent dataset...') - wiki_docs=None - if max_wiki>0: - if not exists(wiki_docs_path): - wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) - wiki_docs = random_wiki_sample(wiki_docs, max_wiki) - - if wiki_docs: - lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) - pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) - else: - lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) - - lang_data.save(indep_path) - - print('Generating yuxtaposed dataset...') - build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path) - - -# ---------------------------------------------------------------------------------------------------------------------- -# Methods to generate full RCV and JRC datasets -# ---------------------------------------------------------------------------------------------------------------------- -def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs): - - - print('fetching the datasets') - rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') - rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test') - rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) - - filter_by_categories(rcv1_train_documents, labels_rcv2) - filter_by_categories(rcv1_test_documents, labels_rcv2) - filter_by_categories(rcv2_documents, labels_rcv1) - - label_names = get_active_labels(rcv1_train_documents + rcv2_documents) - print('Active labels in RCV1/2 {}'.format(len(label_names))) - - print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names))) - print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) - - mlb = MultiLabelBinarizer() - mlb.fit([label_names]) - - all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents - lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs} - - def get_ids(doclist): - return frozenset([d.id for d in doclist]) - - tr_ids = {'en': get_ids(rcv1_train_documents)} - te_ids = {'en': get_ids(rcv1_test_documents)} - for lang in langs: - if lang == 'en': continue - tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3) - - dataset = MultilingualDataset() - dataset.dataset_name = 'RCV1/2-full' - for lang in langs: - print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents') - analyzer = CountVectorizer( - strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) - ).build_analyzer() - - Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in 
tr_ids[lang]])
-        Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
-        Xtr = [' '.join(analyzer(d)) for d in Xtr]
-        Xte = [' '.join(analyzer(d)) for d in Xte]
-        Ytr = mlb.transform(Ytr)
-        Yte = mlb.transform(Yte)
-        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
-
-    dataset.save(outpath)
-
-
-def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):
-
-    print('fetching the datasets')
-    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
-    training_docs, label_names = fetch_jrcacquis(
-        langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
-    )
-    test_docs, _ = fetch_jrcacquis(
-        langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
-    )
-
-    def _group_by_lang(doc_list, langs):
-        return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}
-
-    training_docs = _group_by_lang(training_docs, langs)
-    test_docs = _group_by_lang(test_docs, langs)
-
-    mlb = MultiLabelBinarizer()
-    mlb.fit([label_names])
-
-    dataset = MultilingualDataset()
-    dataset.dataset_name = 'JRC-Acquis-full'
-    for lang in langs:
-        analyzer = CountVectorizer(
-            strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
-        ).build_analyzer()
-
-        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
-        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
-        Xtr = [' '.join(analyzer(d)) for d in Xtr]
-        Xte = [' '.join(analyzer(d)) for d in Xte]
-        Ytr = mlb.transform(Ytr)
-        Yte = mlb.transform(Yte)
-        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
-
-    dataset.save(outpath)
-
-
-#-----------------------------------------------------------------------------------------------------------------------
-# MAIN BUILDER
-#-----------------------------------------------------------------------------------------------------------------------
-
-if __name__=='__main__':
-    import sys
-    RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
-    RCV2_PATH = '../Datasets/RCV2'
-    JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
-    full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
-    # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
-    sys.exit(0)
-
-    # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle'
-    # data = MultilingualDataset.load(datasetpath)
-    # data.dataset_name='JRC-Acquis-full'#'RCV1/2-full'
-    # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
-    #     (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
-    #     data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
-    # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle')
-    # sys.exit(0)
-
-    assert len(sys.argv) == 5, "wrong number of arguments; required: " \
-                               " "
-
-    JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3"
-    RCV1_PATH = sys.argv[2]
#'../Datasets/RCV1-v2/unprocessed_corpus' - RCV2_PATH = sys.argv[3] #'../Datasets/RCV2' - WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK" - - langs = lang_set['JRC_NLTK'] - max_wiki = 5000 - - for run in range(0,10): - print('Building JRC-Acquis datasets run', run) - prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs, - train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki, - cat_policy='all', most_common_cat=300, run=run) - - print('Building RCV1-v2/2 datasets run', run) - prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'], - train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run) - - # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE - # (make sure you have not modified the above parameters, or adapt the following paths accordingly...) - # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run)) - # outpath = datasetpath.replace('_nltk_','_doclist_') - # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath) - - # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run)) - # outpath = datasetpath.replace('_nltk_', '_doclist_') - # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath) - - - diff --git a/refactor/data/languages.py b/refactor/data/languages.py deleted file mode 100644 index 2d03d8e..0000000 --- a/refactor/data/languages.py +++ /dev/null @@ -1,42 +0,0 @@ -""" -bg = Bulgarian -cs = Czech -da = Danish -de = German -el = Greek -en = English -es = Spanish -et = Estonian -fi = Finnish -fr = French -hu = Hungarian -it = Italian -lt = Lithuanian -lv = Latvian -nl = Dutch -mt = Maltese -pl = Polish -pt = Portuguese -ro = Romanian -sk = Slovak -sl = Slovene -sv = Swedish -""" - -NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german', - 'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'} - - -#top 10 languages in wikipedia order by the number of articles -#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro'] - -#all languages in JRC-acquis v3 -JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv'] -JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues' - -RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl'] -RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl'] - -lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS, - 'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS} - diff --git a/refactor/data/reader/__init__.py b/refactor/data/reader/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/refactor/data/reader/jrcacquis_reader.py b/refactor/data/reader/jrcacquis_reader.py deleted file mode 100644 index e911996..0000000 --- a/refactor/data/reader/jrcacquis_reader.py +++ /dev/null @@ -1,324 +0,0 @@ -from __future__ import print_function - -import os -import pickle -import sys -import tarfile -import xml.etree.ElementTree as ET -import zipfile -from collections 
import Counter -from os.path import join -from random import shuffle - -import rdflib -from rdflib.namespace import RDF, SKOS -from sklearn.datasets import get_data_home - -from data.languages import JRC_LANGS -from data.languages import lang_set -from util.file import download_file, list_dirs, list_files - -""" -JRC Acquis' Nomenclature: -bg = Bulgarian -cs = Czech -da = Danish -de = German -el = Greek -en = English -es = Spanish -et = Estonian -fi = Finnish -fr = French -hu = Hungarian -it = Italian -lt = Lithuanian -lv = Latvian -nl = Dutch -mt = Maltese -pl = Polish -pt = Portuguese -ro = Romanian -sk = Slovak -sl = Slovene -sv = Swedish -""" - -class JRCAcquis_Document: - def __init__(self, id, name, lang, year, head, body, categories): - self.id = id - self.parallel_id = name - self.lang = lang - self.year = year - self.text = body if not head else head + "\n" + body - self.categories = categories - -# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles -# however, it seems that the title is often appearing as the first paragraph in the text/body (with -# standard codification), so it might be preferable not to read the header after all (as here by default) -def _proc_acute(text): - for ch in ['a','e','i','o','u']: - text = text.replace('%'+ch+'acute%',ch) - return text - -def parse_document(file, year, head=False): - root = ET.parse(file).getroot() - - doc_name = root.attrib['n'] # e.g., '22006A0211(01)' - doc_lang = root.attrib['lang'] # e.g., 'es' - doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es' - doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')] - doc_head = _proc_acute(root.find('.//text/body/head').text) if head else '' - doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')]) - - def raise_if_empty(field, from_file): - if isinstance(field, str): - if not field.strip(): - raise ValueError("Empty field in file %s" % from_file) - - raise_if_empty(doc_name, file) - raise_if_empty(doc_lang, file) - raise_if_empty(doc_id, file) - if head: raise_if_empty(doc_head, file) - raise_if_empty(doc_body, file) - - return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories) - -# removes documents without a counterpart in all other languages -def _force_parallel(doclist, langs): - n_langs = len(langs) - par_id_count = Counter([d.parallel_id for d in doclist]) - parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs]) - return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids] - -def random_sampling_avoiding_parallel(doclist): - random_order = list(range(len(doclist))) - shuffle(random_order) - sampled_request = [] - parallel_ids = set() - for ind in random_order: - pid = doclist[ind].parallel_id - if pid not in parallel_ids: - sampled_request.append(doclist[ind]) - parallel_ids.add(pid) - print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request))) - return sampled_request - - -#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter -def _filter_by_category(doclist, cat_filter): - if not isinstance(cat_filter, frozenset): - cat_filter = frozenset(cat_filter) - filtered = [] - for doc in doclist: - doc.categories = list(cat_filter & set(doc.categories)) - if doc.categories: - doc.categories.sort() - 
filtered.append(doc) - print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered))) - return filtered - -#filters out categories with less than cat_threshold documents (and filters documents containing those categories) -def _filter_by_frequency(doclist, cat_threshold): - cat_count = Counter() - for d in doclist: - cat_count.update(d.categories) - - freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold] - freq_categories.sort() - return _filter_by_category(doclist, freq_categories), freq_categories - -#select top most_frequent categories (and filters documents containing those categories) -def _most_common(doclist, most_frequent): - cat_count = Counter() - for d in doclist: - cat_count.update(d.categories) - - freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)] - freq_categories.sort() - return _filter_by_category(doclist, freq_categories), freq_categories - -def _get_categories(request): - final_cats = set() - for d in request: - final_cats.update(d.categories) - return list(final_cats) - -def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0, - parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'): - - assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported' - if not langs: - langs = JRC_LANGS - else: - if isinstance(langs, str): langs = [langs] - for l in langs: - if l not in JRC_LANGS: - raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l) - - if not data_path: - data_path = get_data_home() - - if not os.path.exists(data_path): - os.mkdir(data_path) - - request = [] - total_read = 0 - for l in langs: - file_name = 'jrc-'+l+'.tgz' - archive_path = join(data_path, file_name) - - if not os.path.exists(archive_path): - print("downloading language-specific dataset (once and for all) into %s" % data_path) - DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name) - download_file(DOWNLOAD_URL, archive_path) - print("untarring dataset...") - tarfile.open(archive_path, 'r:gz').extractall(data_path) - - documents_dir = join(data_path, l) - - print("Reading documents...") - read = 0 - for dir in list_dirs(documents_dir): - year = int(dir) - if years==None or year in years: - year_dir = join(documents_dir,dir) - pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle') - if os.path.exists(pickle_name): - print("loading from file %s" % pickle_name) - l_y_documents = pickle.load(open(pickle_name, "rb")) - read += len(l_y_documents) - else: - l_y_documents = [] - all_documents = list_files(year_dir) - empty = 0 - for i,doc_file in enumerate(all_documents): - try: - jrc_doc = parse_document(join(year_dir, doc_file), year) - except ValueError: - jrc_doc = None - - if jrc_doc and (not ignore_unclassified or jrc_doc.categories): - l_y_documents.append(jrc_doc) - else: empty += 1 - if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0): - print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='') - read+=1 - print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='') - print("\t\t(Pickling object for future runs in %s)" % pickle_name) - pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) - request += l_y_documents - print("Read %d documents for language %s\n" % (read, l)) - total_read += 
read - print("Read %d documents in total" % (total_read)) - - if parallel=='force': - request = _force_parallel(request, langs) - elif parallel == 'avoid': - request = random_sampling_avoiding_parallel(request) - - final_cats = _get_categories(request) - - if cat_filter: - request = _filter_by_category(request, cat_filter) - final_cats = _get_categories(request) - if cat_threshold > 0: - request, final_cats = _filter_by_frequency(request, cat_threshold) - if most_frequent != -1 and len(final_cats) > most_frequent: - request, final_cats = _most_common(request, most_frequent) - - return request, final_cats - -def print_cat_analysis(request): - cat_count = Counter() - for d in request: - cat_count.update(d.categories) - print("Number of active categories: {}".format(len(cat_count))) - print(cat_count.most_common()) - -# inspects the Eurovoc thesaurus in order to select a subset of categories -# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented -def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf', - eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip", - select="broadest"): - - fullpath_pickle = join(data_path, select+'_concepts.pickle') - if os.path.exists(fullpath_pickle): - print("Pickled object found in %s. Loading it." % fullpath_pickle) - return pickle.load(open(fullpath_pickle,'rb')) - - fullpath = join(data_path, eurovoc_skos_core_concepts_filename) - if not os.path.exists(fullpath): - print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url)) - download_file(eurovoc_url, fullpath) - print("Unzipping file...") - zipped = zipfile.ZipFile(data_path + '.zip', 'r') - zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path) - zipped.close() - - print("Parsing %s" %fullpath) - g = rdflib.Graph() - g.parse(location=fullpath, format="application/rdf+xml") - - if select == "all": - print("Selecting all concepts") - all_concepts = list(g.subjects(RDF.type, SKOS.Concept)) - all_concepts = [c.toPython().split('/')[-1] for c in all_concepts] - all_concepts.sort() - selected_concepts = all_concepts - elif select=="broadest": - print("Selecting broadest concepts (those without any other broader concept linked to it)") - all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) - narrower_concepts = set(g.subjects(SKOS.broader, None)) - broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)] - broadest_concepts.sort() - selected_concepts = broadest_concepts - elif select=="leaves": - print("Selecting leaves concepts (those not linked as broader of any other concept)") - all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) - broad_concepts = set(g.objects(None, SKOS.broader)) - leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)] - leave_concepts.sort() - selected_concepts = leave_concepts - else: - raise ValueError("Selection policy %s is not currently supported" % select) - - print("%d %s concepts found" % (len(selected_concepts), leave_concepts)) - print("Pickling concept list for faster further requests in %s" % fullpath_pickle) - pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL) - - return selected_concepts - -if __name__ == '__main__': - - def single_label_fragment(doclist): - single = [d for d in doclist if len(d.categories) < 2] - final_categories = 
set([d.categories[0] if d.categories else [] for d in single]) - print('{} single-label documents ({} categories) from the original {} documents'.format(len(single), - len(final_categories), - len(doclist))) - return single, list(final_categories) - - train_years = list(range(1986, 2006)) - test_years = [2006] - cat_policy = 'leaves' - most_common_cat = 300 - # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3" - JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3" - langs = lang_set['JRC_NLTK'] - cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy) - sys.exit() - - training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat) - test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force') - - print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) - print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) - - training_docs, label_names = single_label_fragment(training_docs) - test_docs, label_namestest = single_label_fragment(test_docs) - - print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) - print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) - - diff --git a/refactor/data/reader/rcv_reader.py b/refactor/data/reader/rcv_reader.py deleted file mode 100644 index b3db098..0000000 --- a/refactor/data/reader/rcv_reader.py +++ /dev/null @@ -1,222 +0,0 @@ -import re -import xml.etree.ElementTree as ET -from os.path import join, exists -from zipfile import ZipFile - -import numpy as np - -from util.file import download_file_if_not_exists -from util.file import list_files - -""" -RCV2's Nomenclature: -ru = Russian -da = Danish -de = German -es = Spanish -lat = Spanish Latin-American (actually is also 'es' in the collection) -fr = French -it = Italian -nl = Dutch -pt = Portuguese -sv = Swedish -ja = Japanese -htw = Chinese -no = Norwegian -""" - -RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig" -RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files' -RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/" -RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html" - -rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz', - 'lyrl2004_tokens_test_pt1.dat.gz', - 'lyrl2004_tokens_test_pt2.dat.gz', - 'lyrl2004_tokens_test_pt3.dat.gz'] - -rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz'] - -rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz' - -RCV2_LANG_DIR = {'ru':'REUTE000', - 'de':'REUTE00A', - 'fr':'REUTE00B', - 'sv':'REUTE001', - 'no':'REUTE002', - 'da':'REUTE003', - 'pt':'REUTE004', - 'it':'REUTE005', - 'es':'REUTE006', - 'lat':'REUTE007', - 'jp':'REUTE008', - 'htw':'REUTE009', - 'nl':'REUTERS_'} - - -class RCV_Document: - - def __init__(self, id, text, categories, date='', lang=None): - self.id = id - self.date = date - self.lang = lang - self.text = text - self.categories = categories - - -class ExpectedLanguageException(Exception): pass -class IDRangeException(Exception): pass - - -nwords = [] - -def parse_document(xml_content, assert_lang=None, valid_id_range=None): - root = ET.fromstring(xml_content) - if assert_lang: - if assert_lang not in root.attrib.values(): - if assert_lang 
!= 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp' - raise ExpectedLanguageException('error: document of a different language') - - doc_id = root.attrib['itemid'] - if valid_id_range is not None: - if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]: - raise IDRangeException - - doc_categories = [cat.attrib['code'] for cat in - root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')] - - doc_date = root.attrib['date'] - doc_title = root.find('.//title').text - doc_headline = root.find('.//headline').text - doc_body = '\n'.join([p.text for p in root.findall('.//text/p')]) - - if not doc_body: - raise ValueError('Empty document') - - if doc_title is None: doc_title = '' - if doc_headline is None or doc_headline in doc_title: doc_headline = '' - text = '\n'.join([doc_title, doc_headline, doc_body]).strip() - - text_length = len(text.split()) - global nwords - nwords.append(text_length) - - return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang) - - -def fetch_RCV1(data_path, split='all'): - - assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"' - - request = [] - labels = set() - read_documents = 0 - lang = 'en' - - training_documents = 23149 - test_documents = 781265 - - if split == 'all': - split_range = (2286, 810596) - expected = training_documents+test_documents - elif split == 'train': - split_range = (2286, 26150) - expected = training_documents - else: - split_range = (26151, 810596) - expected = test_documents - - global nwords - nwords=[] - for part in list_files(data_path): - if not re.match('\d+\.zip', part): continue - target_file = join(data_path, part) - assert exists(target_file), \ - "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\ - " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information." - zipfile = ZipFile(target_file) - for xmlfile in zipfile.namelist(): - xmlcontent = zipfile.open(xmlfile).read() - try: - doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range) - labels.update(doc.categories) - request.append(doc) - read_documents += 1 - except ValueError: - print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang)) - except (IDRangeException, ExpectedLanguageException) as e: - pass - print('\r[{}] read {} documents'.format(part, len(request)), end='') - if read_documents == expected: break - if read_documents == expected: break - print() - print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) - return request, list(labels) - - -def fetch_RCV2(data_path, languages=None): - - if not languages: - languages = list(RCV2_LANG_DIR.keys()) - else: - assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope' - - request = [] - labels = set() - global nwords - nwords=[] - for lang in languages: - path = join(data_path, RCV2_LANG_DIR[lang]) - lang_docs_read = 0 - for part in list_files(path): - target_file = join(path, part) - assert exists(target_file), \ - "You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\ - " w/o a formal permission. Please, refer to " + RCV2_BASE_URL + " for more information." 
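            # every RCV2 language directory (e.g., REUTE00B for French) contains zip archives of XML news
            # items; each member is parsed with parse_document(assert_lang=lang), which raises
            # ExpectedLanguageException (the document is then silently skipped) whenever the XML language
            # attribute does not match the expected language of the folder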
- zipfile = ZipFile(target_file) - for xmlfile in zipfile.namelist(): - xmlcontent = zipfile.open(xmlfile).read() - try: - doc = parse_document(xmlcontent, assert_lang=lang) - labels.update(doc.categories) - request.append(doc) - lang_docs_read += 1 - except ValueError: - print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang)) - except (IDRangeException, ExpectedLanguageException) as e: - pass - print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='') - print() - print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) - return request, list(labels) - - -def fetch_topic_hierarchy(path, topics='all'): - assert topics in ['all', 'leaves'] - - download_file_if_not_exists(RCV1_TOPICHIER_URL, path) - hierarchy = {} - for line in open(path, 'rt'): - parts = line.strip().split() - parent,child = parts[1],parts[3] - if parent not in hierarchy: - hierarchy[parent]=[] - hierarchy[parent].append(child) - - del hierarchy['None'] - del hierarchy['Root'] - print(hierarchy) - - if topics=='all': - topics = set(hierarchy.keys()) - for parent in hierarchy.keys(): - topics.update(hierarchy[parent]) - return list(topics) - elif topics=='leaves': - parents = set(hierarchy.keys()) - childs = set() - for parent in hierarchy.keys(): - childs.update(hierarchy[parent]) - return list(childs.difference(parents)) - - diff --git a/refactor/data/reader/wikipedia_tools.py b/refactor/data/reader/wikipedia_tools.py deleted file mode 100644 index 9558fb6..0000000 --- a/refactor/data/reader/wikipedia_tools.py +++ /dev/null @@ -1,307 +0,0 @@ -from __future__ import print_function - -# import ijson -# from ijson.common import ObjectBuilder -import os -import pickle -import re -from bz2 import BZ2File -from itertools import islice -from os.path import join -from xml.sax.saxutils import escape - -import numpy as np - -from util.file import list_dirs, list_files - -policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] - -""" -This file contains a set of tools for processing the Wikipedia multilingual documents. -In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/) -and have processed each document to clean their texts with one of the tools: - - https://github.com/aesuli/wikipediatools (Python 2) - - https://github.com/aesuli/wikipedia-extractor (Python 3) -It is also assumed you have dowloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) - -This tools help you in: - - Processes the huge json file as a stream, and create a multilingual map of corresponding titles for each language. - Set the policy = "IN_ALL_LANGS" will extract only titles which appear in all (AND) languages, whereas "IN_ANY_LANG" - extracts all titles appearing in at least one (OR) language (warning: this will creates a huge dictionary). - Note: This version is quite slow. Although it is run once for all, you might be prefer to take a look at "Wikidata in BigQuery". - - Processes the huge json file as a stream a creates a simplified file which occupies much less and is far faster to be processed. - - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all - language-specific versions from the document. 
- - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents, - in a way that the i-th element from any list refers to the same element in the respective language. -""" - -def _doc_generator(text_path, langs): - dotspace = re.compile(r'\.(?!\s)') - for l,lang in enumerate(langs): - print("Processing language <%s> (%d/%d)" % (lang, l, len(langs))) - lang_dir = join(text_path, lang) - split_dirs = list_dirs(lang_dir) - for sd,split_dir in enumerate(split_dirs): - print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs))) - split_files = list_files(join(lang_dir, split_dir)) - for sf,split_file in enumerate(split_files): - print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files))) - with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi: - while True: - doc_lines = list(islice(fi, 3)) - if doc_lines: - # some sentences are not followed by a space after the dot - doc_lines[1] = dotspace.sub('. ', doc_lines[1]) - # [workaround] I found   html symbol was not treated, and unescaping it now might not help... - doc_lines[1] = escape(doc_lines[1].replace(" ", " ")) - yield doc_lines, lang - else: break - -def _extract_title(doc_lines): - m = re.search('title="(.+?)"', doc_lines[0]) - if m: return m.group(1).decode('utf-8') - else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0]) - -def _create_doc(target_file, id, doc, lang): - doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang) - with open(target_file, 'w') as fo: - fo.write('\n'%id) - [fo.write(line) for line in doc] - fo.write('') - -def _append_doc(target_file, doc, lang): - doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang) - with open(target_file, 'r', buffering=1024*1024) as fi: - lines = fi.readlines() - if doc[0] in lines[1::3]: - return - lines[-1:-1]=doc - with open(target_file, 'w', buffering=1024*1024) as fo: - [fo.write(line) for line in lines] - -def extract_multilingual_documents(inv_dict, langs, text_path, out_path): - if not os.path.exists(out_path): - os.makedirs(out_path) - for lang in langs: - if lang not in inv_dict: - raise ValueError("Lang %s is not in the dictionary" % lang) - - docs_created = len(list_files(out_path)) - print("%d multilingual documents found." % docs_created) - for doc,lang in _doc_generator(text_path, langs): - title = _extract_title(doc) - - if title in inv_dict[lang]: - #pass - ids = inv_dict[lang][title] - for id in ids: - target_file = join(out_path, id) + ".xml" - if os.path.exists(target_file): - _append_doc(target_file, doc, lang) - else: - _create_doc(target_file, id, doc, lang) - docs_created+=1 - else: - if not re.match('[A-Za-z]+', title): - print("Title <%s> for lang <%s> not in dictionary" % (title, lang)) - - - -def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True): - simplified_file = join(data_dir,filename) - - if policy not in policies: - raise ValueError("Policy %s not supported." % policy) - print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) - - lang_prefix = list(langs) - lang_prefix.sort() - pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy - pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle") - pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle") - if os.path.exists(pickle_invdict): - if return_both and os.path.exists(pickle_dict): - print("Pickled files found in %s. 
Loading both (direct and inverse dictionaries)." % data_dir) - return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb')) - elif return_both==False: - print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict) - return pickle.load(open(pickle_invdict, 'rb')) - - multiling_titles = {} - inv_dict = {lang:{} for lang in langs} - - def process_entry(line): - parts = line.strip().split('\t') - id = parts[0] - if id in multiling_titles: - raise ValueError("id <%s> already indexed" % id) - - titles = dict(((lang_title[:lang_title.find(':')],lang_title[lang_title.find(':')+1:].decode('utf-8')) for lang_title in parts[1:])) - for lang in titles.keys(): - if lang not in langs: - del titles[lang] - - if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\ - or (policy == "IN_ANY_LANG" and len(titles) > 0): - multiling_titles[id] = titles - for lang, title in titles.items(): - if title in inv_dict[lang]: - inv_dict[lang][title].append(id) - inv_dict[lang][title] = [id] - - with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi: - completed = 0 - try: - for line in fi: - process_entry(line) - completed += 1 - if completed % 10 == 0: - print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="") - print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n") - except EOFError: - print("\nUnexpected file ending... saving anyway") - - print("Pickling dictionaries in %s" % data_dir) - pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL) - pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL) - print("Done") - - return (multiling_titles, inv_dict) if return_both else inv_dict - - -# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 -def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"): - latest_all_json_file = join(data_dir,json_file) - - if policy not in policies: - raise ValueError("Policy %s not supported." % policy) - - print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) - - lang_prefix = list(langs) - lang_prefix.sort() - simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." 
+ policy) - - def process_entry(last, fo): - global written - id = last["id"] - titles = None - if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()): - titles = {lang: last["labels"][lang]["value"] for lang in langs} - elif policy == "IN_ANY_LANG": - titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]} - - if titles: - fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8')) - return True - else: - return False - - written = 0 - with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \ - BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo: - builder = ObjectBuilder() - completed = 0 - for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16): - builder.event(event, value) - if len(builder.value)>1: - if process_entry(builder.value.pop(0), fo): written += 1 - completed += 1 - print("\rCompleted %d\ttitles %d" % (completed,written), end="") - print("") - - #process the last entry - process_entry(builder.value.pop(0)) - - return simple_titles_path - -""" -Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the -specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language- -specific version of the same document. Documents are forced to contain version in all specified languages and to contain -a minimum number of words; otherwise it is discarded. -""" -class MinWordsNotReached(Exception): pass -class WrongDocumentFormat(Exception): pass - -def _load_multilang_doc(path, langs, min_words=100): - import xml.etree.ElementTree as ET - from xml.etree.ElementTree import Element, ParseError - try: - root = ET.parse(path).getroot() - doc = {} - for lang in langs: - doc_body = root.find('.//doc[@lang="' + lang + '"]') - if isinstance(doc_body, Element): - n_words = len(doc_body.text.split(' ')) - if n_words >= min_words: - doc[lang] = doc_body.text - else: - raise MinWordsNotReached - else: - raise WrongDocumentFormat - except ParseError: - raise WrongDocumentFormat - return doc - -#returns the multilingual documents mapped by language, and a counter with the number of documents readed -def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None): - if pickle_name and os.path.exists(pickle_name): - print("unpickling %s" % pickle_name) - return pickle.load(open(pickle_name, 'rb')) - - multi_docs = list_files(wiki_multi_path) - mling_documents = {l:[] for l in langs} - valid_documents = 0 - minwords_exception = 0 - wrongdoc_exception = 0 - for d,multi_doc in enumerate(multi_docs): - print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" % - (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="") - doc_path = join(wiki_multi_path, multi_doc) - try: - m_doc = _load_multilang_doc(doc_path, langs, min_words) - valid_documents += 1 - for l in langs: - mling_documents[l].append(m_doc[l]) - except MinWordsNotReached: - minwords_exception += 1 - if deletions: os.remove(doc_path) - except WrongDocumentFormat: - wrongdoc_exception += 1 - if deletions: os.remove(doc_path) - if max_documents>0 and valid_documents>=max_documents: - break - - if pickle_name: - print("Pickling wikipedia documents object in %s" % pickle_name) - pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) - - return mling_documents - 
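# A minimal usage sketch, not part of the original module (path and languages are hypothetical): the
# language-aligned dictionary returned by fetch_wikipedia_multilingual can be subsampled with
# random_wiki_sample (defined just below) without breaking the cross-language alignment.
#
#     l_wiki = fetch_wikipedia_multilingual('../Datasets/Wikipedia/multilingual_docs_JRC_NLTK',
#                                           langs=['en', 'it'], min_words=100)
#     l_wiki = random_wiki_sample(l_wiki, max_documents=5000)
#     assert len(l_wiki['en']) == len(l_wiki['it'])  # i-th elements are versions of the same page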
-def random_wiki_sample(l_wiki, max_documents): - if max_documents == 0: return None - langs = list(l_wiki.keys()) - assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned' - ndocs_per_lang = len(l_wiki[langs[0]]) - if ndocs_per_lang > max_documents: - sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False)) - for lang in langs: - l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel] - return l_wiki - - -if __name__ == "__main__": - - wikipedia_home = "../Datasets/Wikipedia" - - from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs - langs = frozenset(langs) - - simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2") - _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS') - extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'), - out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK')) - - diff --git a/refactor/data/text_preprocessor.py b/refactor/data/text_preprocessor.py deleted file mode 100644 index fcfddba..0000000 --- a/refactor/data/text_preprocessor.py +++ /dev/null @@ -1,34 +0,0 @@ -from nltk import word_tokenize -from nltk.corpus import stopwords -from nltk.stem import SnowballStemmer - -from data.languages import NLTK_LANGMAP - - -def preprocess_documents(documents, lang): - tokens = NLTKStemTokenizer(lang, verbose=True) - sw = stopwords.words(NLTK_LANGMAP[lang]) - return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents] - - -class NLTKStemTokenizer(object): - - def __init__(self, lang, verbose=False): - if lang not in NLTK_LANGMAP: - raise ValueError('Language %s is not supported in NLTK' % lang) - self.verbose=verbose - self.called = 0 - self.wnl = SnowballStemmer(NLTK_LANGMAP[lang]) - self.cache = {} - - def __call__(self, doc): - self.called += 1 - if self.verbose: - print("\r\t\t[documents processed %d]" % (self.called), end="") - tokens = word_tokenize(doc) - stems = [] - for t in tokens: - if t not in self.cache: - self.cache[t] = self.wnl.stem(t) - stems.append(self.cache[t]) - return stems \ No newline at end of file diff --git a/refactor/data/tsr_function__.py b/refactor/data/tsr_function__.py deleted file mode 100755 index c458029..0000000 --- a/refactor/data/tsr_function__.py +++ /dev/null @@ -1,271 +0,0 @@ -import math - -import numpy as np -from joblib import Parallel, delayed -from scipy.sparse import csr_matrix, csc_matrix -from scipy.stats import t - - -def get_probs(tpr, fpr, pc): - # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn)) - # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn)) - pnc = 1.0 - pc - tp = tpr * pc - fn = pc - tp - fp = fpr * pnc - tn = pnc - fp - return ContTable(tp=tp, fn=fn, fp=fp, tn=tn) - - -def apply_tsr(tpr, fpr, pc, tsr): - cell = get_probs(tpr, fpr, pc) - return tsr(cell) - - -def positive_information_gain(cell): - if cell.tpr() < cell.fpr(): - return 0.0 - else: - return information_gain(cell) - - -def posneg_information_gain(cell): - ig = information_gain(cell) - if cell.tpr() < cell.fpr(): - return -ig - else: - return ig - - -def __ig_factor(p_tc, p_t, p_c): - den = p_t * p_c - if den != 0.0 and p_tc != 0: - return p_tc * math.log(p_tc / den, 2) - else: - return 0.0 - - -def information_gain(cell): - return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ - __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ - 
__ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c()) + \ - __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) - - -def information_gain_mod(cell): - return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \ - - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c())) - - -def pointwise_mutual_information(cell): - return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) - - -def gain_ratio(cell): - pc = cell.p_c() - pnc = 1.0 - pc - norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2) - return information_gain(cell) / (-norm) - - -def chi_square(cell): - den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() - if den==0.0: return 0.0 - num = gss(cell)**2 - return num / den - - -def relevance_frequency(cell): - a = cell.tp - c = cell.fp - if c == 0: c = 1 - return math.log(2.0 + (a * 1.0 / c), 2) - - -def idf(cell): - if cell.p_f()>0: - return math.log(1.0 / cell.p_f()) - return 0.0 - - -def gss(cell): - return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() - - -def conf_interval(xt, n): - if n>30: - z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 - else: - z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 - p = (xt + 0.5 * z2) / (n + z2) - amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) - return p, amplitude - -def strength(minPosRelFreq, minPos, maxNeg): - if minPos > maxNeg: - return math.log(2.0 * minPosRelFreq, 2.0) - else: - return 0.0 - - -#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) -#however, for some extremely imbalanced dataset caused all documents to be 0 -def conf_weight(cell, cancel_features=False): - c = cell.get_c() - not_c = cell.get_not_c() - tp = cell.tp - fp = cell.fp - - pos_p, pos_amp = conf_interval(tp, c) - neg_p, neg_amp = conf_interval(fp, not_c) - - min_pos = pos_p-pos_amp - max_neg = neg_p+neg_amp - den = (min_pos + max_neg) - minpos_relfreq = min_pos / (den if den != 0 else 1) - - str_tplus = strength(minpos_relfreq, min_pos, max_neg); - - if str_tplus == 0 and not cancel_features: - return 1e-20 - - return str_tplus; - - -class ContTable: - - def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp=tp - self.tn=tn - self.fp=fp - self.fn=fn - - def get_d(self): return self.tp + self.tn + self.fp + self.fn - - def get_c(self): return self.tp + self.fn - - def get_not_c(self): return self.tn + self.fp - - def get_f(self): return self.tp + self.fp - - def get_not_f(self): return self.tn + self.fn - - def p_c(self): return (1.0*self.get_c())/self.get_d() - - def p_not_c(self): return 1.0-self.p_c() - - def p_f(self): return (1.0*self.get_f())/self.get_d() - - def p_not_f(self): return 1.0-self.p_f() - - def p_tp(self): return (1.0*self.tp) / self.get_d() - - def p_tn(self): return (1.0*self.tn) / self.get_d() - - def p_fp(self): return (1.0*self.fp) / self.get_d() - - def p_fn(self): return (1.0*self.fn) / self.get_d() - - def tpr(self): - c = 1.0*self.get_c() - return self.tp / c if c > 0.0 else 0.0 - - def fpr(self): - _c = 1.0*self.get_not_c() - return self.fp / _c if _c > 0.0 else 0.0 - - -def round_robin_selection(X, Y, k, tsr_function=positive_information_gain): - print(f'[selectiong {k} terms]') - nC = Y.shape[1] - FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T - best_features_idx = np.argsort(-FC, axis=0).flatten() - tsr_values = FC.flatten() - selected_indexes_set = set() - selected_indexes = list() - selected_value = list() - from_category = list() - 
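    # best_features_idx interleaves, rank after rank, the top-scoring feature index of every category
    # (row-major flatten of the column-wise argsort), so the loop below picks terms in a round-robin
    # over the nC categories, skipping features that have already been selected, until k terms are chosen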
round_robin = iter(best_features_idx) - values_iter = iter(tsr_values) - round=0 - while len(selected_indexes) < k: - term_idx = next(round_robin) - term_val = next(values_iter) - if term_idx not in selected_indexes_set: - selected_indexes_set.add(term_idx) - selected_indexes.append(term_idx) - selected_value.append(term_val) - from_category.append(round) - round = (round + 1) % nC - return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category) - - -def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): - tp_ = len(positive_document_indexes & feature_document_indexes) - fp_ = len(feature_document_indexes - positive_document_indexes) - fn_ = len(positive_document_indexes - feature_document_indexes) - tn_ = nD - (tp_ + fp_ + fn_) - return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) - - -def category_tables(feature_sets, category_sets, c, nD, nF): - return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] - - -""" -Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. -Efficiency O(nF x nC x log(S)) where S is the sparse factor -""" -def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): - nD, nF = coocurrence_matrix.shape - nD2, nC = label_matrix.shape - - if nD != nD2: - raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % - (coocurrence_matrix.shape,label_matrix.shape)) - - def nonzero_set(matrix, col): - return set(matrix[:, col].nonzero()[0]) - - if isinstance(coocurrence_matrix, csr_matrix): - coocurrence_matrix = csc_matrix(coocurrence_matrix) - feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] - category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] - cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)) - return np.array(cell_matrix) - -# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f -def get_tsr_matrix(cell_matrix, tsr_score_funtion): - nC,nF = cell_matrix.shape - tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] - return np.array(tsr_matrix) - - -""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can -take as input any real-valued feature column (e.g., tf-idf weights). -feat is the feature vector, and c is a binary classification vector. -This implementation covers only the binary case, while the formula is defined for multiclass -single-label scenarios, for which the version [2] might be preferred. -[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012. -[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725. 
-""" -def fisher_score_binary(feat, c): - neg = np.ones_like(c) - c - - npos = np.sum(c) - nneg = np.sum(neg) - - mupos = np.mean(feat[c == 1]) - muneg = np.mean(feat[neg == 1]) - mu = np.mean(feat) - - stdpos = np.std(feat[c == 1]) - stdneg = np.std(feat[neg == 1]) - - num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2) - den = npos * (stdpos ** 2) + nneg * (stdneg ** 2) - - if den>0: - return num / den - else: - return num diff --git a/refactor/funnelling.py b/refactor/funnelling.py deleted file mode 100644 index 812a937..0000000 --- a/refactor/funnelling.py +++ /dev/null @@ -1,124 +0,0 @@ -from models.learners import * -from util.common import _normalize -from view_generators import VanillaFunGen - - -class DocEmbedderList: - """ - Class that takes care of calling fit and transform function for every init embedder. Every ViewGenerator should be - contained by this class in order to seamlessly train the overall architecture. - """ - def __init__(self, embedder_list, probabilistic=True): - """ - Init the DocEmbedderList. - :param embedder_list: list of embedders to be deployed - :param probabilistic: whether to recast view generators output to vectors of posterior probabilities or not - """ - assert len(embedder_list) != 0, 'Embedder list cannot be empty!' - self.embedders = embedder_list - self.probabilistic = probabilistic - if probabilistic: - _tmp = [] - for embedder in self.embedders: - if isinstance(embedder, VanillaFunGen): - _tmp.append(embedder) - else: - _tmp.append(FeatureSet2Posteriors(embedder)) - self.embedders = _tmp - - def fit(self, lX, ly): - """ - Fit all the ViewGenerators contained by DocEmbedderList. - :param lX: - :param ly: - :return: self - """ - for embedder in self.embedders: - embedder.fit(lX, ly) - return self - - def transform(self, lX): - """ - Project documents by means of every ViewGenerators. Projections are then averaged together and returned. - :param lX: - :return: common latent space (averaged). - """ - langs = sorted(lX.keys()) - lZparts = {lang: None for lang in langs} - - for embedder in self.embedders: - lZ = embedder.transform(lX) - for lang in langs: - Z = lZ[lang] - if lZparts[lang] is None: - lZparts[lang] = Z - else: - lZparts[lang] += Z - n_embedders = len(self.embedders) - return {lang: lZparts[lang]/n_embedders for lang in langs} # Averaging feature spaces - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class FeatureSet2Posteriors: - """ - Takes care of recasting features outputted by the embedders to vecotrs of posterior probabilities by means of - a multiclass SVM. - """ - def __init__(self, embedder, l2=True, n_jobs=-1): - """ - Init the class. - :param embedder: ViewGen, view generators which does not natively outputs posterior probabilities. - :param l2: bool, whether to apply or not L2 normalization to the projection - :param n_jobs: int, number of concurrent workers. 
- """ - self.embedder = embedder - self.l2 = l2 - self.n_jobs = n_jobs - self.prob_classifier = MetaClassifier( - SVC(kernel='rbf', gamma='auto', probability=True, cache_size=1000, random_state=1), n_jobs=n_jobs) - - def fit(self, lX, ly): - lZ = self.embedder.fit_transform(lX, ly) - self.prob_classifier.fit(lZ, ly) - return self - - def transform(self, lX): - lP = self.predict_proba(lX) - lP = _normalize(lP, self.l2) - return lP - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - def predict(self, lX): - lZ = self.embedder.transform(lX) - return self.prob_classifier.predict(lZ) - - def predict_proba(self, lX): - lZ = self.embedder.transform(lX) - return self.prob_classifier.predict_proba(lZ) - - -class Funnelling: - """ - Funnelling Architecture. It is composed by two tiers. The first-tier is a set of heterogeneous document embedders. - The second-tier (i.e., the metaclassifier), operates the classification of the common latent space computed by - the first-tier learners. - """ - def __init__(self, first_tier: DocEmbedderList, meta_classifier: MetaClassifier, n_jobs=-1): - self.first_tier = first_tier - self.meta = meta_classifier - self.n_jobs = n_jobs - - def fit(self, lX, ly): - print('## Fitting first-tier learners!') - lZ = self.first_tier.fit_transform(lX, ly) - print('## Fitting meta-learner!') - self.meta.fit(lZ, ly) - - def predict(self, lX): - lZ = self.first_tier.transform(lX) - ly = self.meta.predict(lZ) - return ly diff --git a/refactor/main.py b/refactor/main.py deleted file mode 100644 index ebc43a3..0000000 --- a/refactor/main.py +++ /dev/null @@ -1,167 +0,0 @@ -from argparse import ArgumentParser - -from data.dataset_builder import MultilingualDataset -from funnelling import * -from util.common import MultilingualIndex, get_params, get_method_name -from util.evaluation import evaluate -from util.results_csv import CSVlog -from view_generators import * - - -def main(args): - assert args.post_embedder or args.muse_embedder or args.wce_embedder or args.gru_embedder or args.bert_embedder, \ - 'empty set of document embeddings is not allowed!' - - print('Running generalized funnelling...') - - data = MultilingualDataset.load(args.dataset) - data.set_view(languages=['it', 'fr']) - data.show_dimensions() - lX, ly = data.training() - lXte, lyte = data.test() - - # Init multilingualIndex - mandatory when deploying Neural View Generators... 
- if args.gru_embedder or args.bert_embedder: - multilingualIndex = MultilingualIndex() - lMuse = MuseLoader(langs=sorted(lX.keys()), cache=args.muse_dir) - multilingualIndex.index(lX, ly, lXte, lyte, l_pretrained_vocabulary=lMuse.vocabulary()) - - # Init ViewGenerators and append them to embedder_list - embedder_list = [] - if args.post_embedder: - posteriorEmbedder = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=args.n_jobs) - embedder_list.append(posteriorEmbedder) - - if args.muse_embedder: - museEmbedder = MuseGen(muse_dir=args.muse_dir, n_jobs=args.n_jobs) - embedder_list.append(museEmbedder) - - if args.wce_embedder: - wceEmbedder = WordClassGen(n_jobs=args.n_jobs) - embedder_list.append(wceEmbedder) - - if args.gru_embedder: - rnnEmbedder = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=args.gru_wce, batch_size=256, - nepochs=args.nepochs, gpus=args.gpus, n_jobs=args.n_jobs) - embedder_list.append(rnnEmbedder) - - if args.bert_embedder: - bertEmbedder = BertGen(multilingualIndex, batch_size=4, nepochs=10, gpus=args.gpus, n_jobs=args.n_jobs) - embedder_list.append(bertEmbedder) - - # Init DocEmbedderList (i.e., first-tier learners or view generators) and metaclassifier - docEmbedders = DocEmbedderList(embedder_list=embedder_list, probabilistic=True) - meta = MetaClassifier(meta_learner=get_learner(calibrate=False, kernel='rbf'), - meta_parameters=get_params(optimc=args.optimc)) - - # Init Funnelling Architecture - gfun = Funnelling(first_tier=docEmbedders, meta_classifier=meta) - - # Training --------------------------------------- - print('\n[Training Generalized Funnelling]') - time_init = time() - time_tr = time() - gfun.fit(lX, ly) - time_tr = round(time() - time_tr, 3) - print(f'Training completed in {time_tr} seconds!') - - # Testing ---------------------------------------- - print('\n[Testing Generalized Funnelling]') - time_te = time() - ly_ = gfun.predict(lXte) - l_eval = evaluate(ly_true=lyte, ly_pred=ly_) - time_te = round(time() - time_te, 3) - print(f'Testing completed in {time_te} seconds!') - - # Logging --------------------------------------- - print('\n[Results]') - results = CSVlog(args.csv_dir) - metrics = [] - for lang in lXte.keys(): - macrof1, microf1, macrok, microk = l_eval[lang] - metrics.append([macrof1, microf1, macrok, microk]) - print(f'Lang {lang}: macro-F1 = {macrof1:.3f} micro-F1 = {microf1:.3f}') - if results is not None: - _id, _dataset = get_method_name(args) - results.add_row(method='gfun', - setting=_id, - optimc=args.optimc, - sif='True', - zscore='True', - l2='True', - dataset=_dataset, - time_tr=time_tr, - time_te=time_te, - lang=lang, - macrof1=macrof1, - microf1=microf1, - macrok=macrok, - microk=microk, - notes='') - print('Averages: MF1, mF1, MK, mK', np.round(np.mean(np.array(metrics), axis=0), 3)) - - overall_time = round(time() - time_init, 3) - exit(f'\nExecuted in: {overall_time} seconds!') - - -if __name__ == '__main__': - parser = ArgumentParser(description='Run generalized funnelling, A. Moreo, A. Pedrotti and F. 
Sebastiani') - - parser.add_argument('dataset', help='Path to the dataset') - - parser.add_argument('-o', '--output', dest='csv_dir', - help='Result file (default ../csv_log/gfun_results.csv)', type=str, - default='csv_logs/gfun/gfun_results.csv') - - parser.add_argument('-x', '--post_embedder', dest='post_embedder', action='store_true', - help='deploy posterior probabilities embedder to compute document embeddings', - default=False) - - parser.add_argument('-w', '--wce_embedder', dest='wce_embedder', action='store_true', - help='deploy (supervised) Word-Class embedder to the compute document embeddings', - default=False) - - parser.add_argument('-m', '--muse_embedder', dest='muse_embedder', action='store_true', - help='deploy (pretrained) MUSE embedder to compute document embeddings', - default=False) - - parser.add_argument('-b', '--bert_embedder', dest='bert_embedder', action='store_true', - help='deploy multilingual Bert to compute document embeddings', - default=False) - - parser.add_argument('-g', '--gru_embedder', dest='gru_embedder', action='store_true', - help='deploy a GRU in order to compute document embeddings', - default=False) - - parser.add_argument('-c', '--c_optimize', dest='optimc', action='store_true', - help='Optimize SVMs C hyperparameter', - default=False) - - parser.add_argument('-n', '--nepochs', dest='nepochs', type=str, - help='Number of max epochs to train Recurrent embedder (i.e., -g)') - - parser.add_argument('-j', '--n_jobs', dest='n_jobs', type=int, - help='Number of parallel jobs (default is -1, all)', - default=-1) - - parser.add_argument('--muse_dir', dest='muse_dir', type=str, - help='Path to the MUSE polylingual word embeddings (default ../embeddings)', - default='../embeddings') - - parser.add_argument('--gru_wce', dest='gru_wce', action='store_true', - help='Deploy WCE embedding as embedding layer of the GRU View Generator', - default=False) - - parser.add_argument('--gru_dir', dest='gru_dir', type=str, - help='Set the path to a pretrained GRU model (i.e., -g view generator)', - default=None) - - parser.add_argument('--bert_dir', dest='bert_dir', type=str, - help='Set the path to a pretrained mBERT model (i.e., -b view generator)', - default=None) - - parser.add_argument('--gpus', help='specifies how many GPUs to use per node', - default=None) - - args = parser.parse_args() - main(args) diff --git a/refactor/models/helpers.py b/refactor/models/helpers.py deleted file mode 100755 index b466f28..0000000 --- a/refactor/models/helpers.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -import torch.nn as nn -from torch.nn import functional as F - - -def init_embeddings(pretrained, vocab_size, learnable_length): - """ - Compute the embedding matrix - :param pretrained: - :param vocab_size: - :param learnable_length: - :return: - """ - pretrained_embeddings = None - pretrained_length = 0 - if pretrained is not None: - pretrained_length = pretrained.shape[1] - assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' - pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) - # requires_grad=False sets the embedding layer as NOT trainable - pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) - - learnable_embeddings = None - if learnable_length > 0: - learnable_embeddings = nn.Embedding(vocab_size, learnable_length) - - embedding_length = learnable_length + pretrained_length - assert embedding_length > 0, '0-size embeddings' - return pretrained_embeddings, learnable_embeddings, 
embedding_length - - -def embed(model, input, lang): - input_list = [] - if model.lpretrained_embeddings[lang]: - input_list.append(model.lpretrained_embeddings[lang](input)) - if model.llearnable_embeddings[lang]: - input_list.append(model.llearnable_embeddings[lang](input)) - return torch.cat(tensors=input_list, dim=2) - - -def embedding_dropout(input, drop_range, p_drop=0.5, training=True): - if p_drop > 0 and training and drop_range is not None: - p = p_drop - drop_from, drop_to = drop_range - m = drop_to - drop_from #length of the supervised embedding - l = input.shape[2] #total embedding length - corr = (1 - p) - input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p) - input /= (1 - (p * m / l)) - - return input diff --git a/refactor/models/learners.py b/refactor/models/learners.py deleted file mode 100644 index 2654109..0000000 --- a/refactor/models/learners.py +++ /dev/null @@ -1,224 +0,0 @@ -import time - -import numpy as np -from joblib import Parallel, delayed -from scipy.sparse import issparse -from sklearn.model_selection import GridSearchCV -from sklearn.multiclass import OneVsRestClassifier -from sklearn.svm import SVC - -from util.standardizer import StandardizeTransformer - - -def get_learner(calibrate=False, kernel='linear', C=1): - """ - instantiate scikit Support Vector Classifier - :param calibrate: boolean, whether to return posterior probabilities or not - :param kernel: string,kernel to be applied to the SVC - :param C: int or dict {'C': list of integer}, Regularization parameter - :return: Support Vector Classifier - """ - return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) - - -def _sort_if_sparse(X): - if issparse(X) and not X.has_sorted_indices: - X.sort_indices() - - -def _joblib_transform_multiling(transformer, lX, n_jobs=-1): - if n_jobs == 1: - return {lang: transformer(lX[lang]) for lang in lX.keys()} - else: - langs = list(lX.keys()) - transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs) - return {lang: transformations[i] for i, lang in enumerate(langs)} - - -class TrivialRejector: - def fit(self, X, y): - self.cats = y.shape[1] - return self - - def decision_function(self, X): return np.zeros((X.shape[0], self.cats)) - - def predict(self, X): return np.zeros((X.shape[0], self.cats)) - - def predict_proba(self, X): return np.zeros((X.shape[0], self.cats)) - - def best_params(self): return {} - - -class NaivePolylingualClassifier: - """ - Is a mere set of independet MonolingualClassifiers - """ - - def __init__(self, base_learner, parameters=None, n_jobs=-1): - self.base_learner = base_learner - self.parameters = parameters - self.model = None - self.n_jobs = n_jobs - - def fit(self, lX, ly): - """ - trains the independent monolingual classifiers - :param lX: a dictionary {language_label: X csr-matrix} - :param ly: a dictionary {language_label: y np.array} - :return: self - """ - tinit = time.time() - assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit' - langs = list(lX.keys()) - for lang in langs: - _sort_if_sparse(lX[lang]) - - models = Parallel(n_jobs=self.n_jobs)\ - (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for - lang in langs) - - self.model = {lang: models[i] for i, lang in enumerate(langs)} - self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs} - self.time = time.time() - tinit - return self - - 
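    # the methods below dispatch to the per-language monolingual models through joblib and return a
    # dictionary {language_label: output}; note that the n_jobs == 1 branch of predict() calls
    # transform(), which MonolingualClassifier does not define (predict() is presumably intended)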
def decision_function(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of classification scores for each class - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs) - return {lang: scores[i] for i, lang in enumerate(langs)} - - def predict_proba(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of probabilities that each document belongs to each class - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' - langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( - delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) - return {lang: scores[i] for i, lang in enumerate(langs)} - - def predict(self, lX): - """ - :param lX: a dictionary {language_label: X csr-matrix} - :return: a dictionary of predictions - """ - assert self.model is not None, 'predict called before fit' - assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict' - if self.n_jobs == 1: - return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()} - else: - langs = list(lX.keys()) - scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) - return {lang: scores[i] for i, lang in enumerate(langs)} - - def best_params(self): - return {lang: model.best_params() for lang, model in self.model.items()} - - -class MonolingualClassifier: - - def __init__(self, base_learner, parameters=None, n_jobs=-1): - self.learner = base_learner - self.parameters = parameters - self.model = None - self.n_jobs = n_jobs - self.best_params_ = None - - def fit(self, X, y): - if X.shape[0] == 0: - print('Warning: X has 0 elements, a trivial rejector will be created') - self.model = TrivialRejector().fit(X, y) - self.empty_categories = np.arange(y.shape[1]) - return self - - tinit = time.time() - _sort_if_sparse(X) - self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten() - # multi-class format - if len(y.shape) == 2: - if self.parameters is not None: - self.parameters = [{'estimator__' + key: params[key] for key in params.keys()} - for params in self.parameters] - self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs) - else: - self.model = self.learner - raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in ' - 'the labels across languages') - - # parameter optimization? 
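        # if a parameter grid was provided, wrap the model in a 5-fold GridSearchCV that refits the best
        # configuration on the whole training set (for multilabel data the grid keys have already been
        # rewritten above with the 'estimator__' prefix required by OneVsRestClassifier)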
- if self.parameters: - print('debug: optimizing parameters:', self.parameters) - self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, - error_score=0, verbose=10) - - print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') - self.model.fit(X, y) - if isinstance(self.model, GridSearchCV): - self.best_params_ = self.model.best_params_ - print('best parameters: ', self.best_params_) - self.time = time.time() - tinit - return self - - def decision_function(self, X): - assert self.model is not None, 'predict called before fit' - _sort_if_sparse(X) - return self.model.decision_function(X) - - def predict_proba(self, X): - assert self.model is not None, 'predict called before fit' - assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model' - _sort_if_sparse(X) - return self.model.predict_proba(X) - - def predict(self, X): - assert self.model is not None, 'predict called before fit' - _sort_if_sparse(X) - return self.model.predict(X) - - def best_params(self): - return self.best_params_ - - -class MetaClassifier: - - def __init__(self, meta_learner, meta_parameters=None, n_jobs=-1, standardize_range=None): - self.n_jobs = n_jobs - self.model = MonolingualClassifier(base_learner=meta_learner, parameters=meta_parameters, n_jobs=n_jobs) - self.standardize_range = standardize_range - - def fit(self, lZ, ly): - tinit = time.time() - Z, y = self.stack(lZ, ly) - - self.standardizer = StandardizeTransformer(range=self.standardize_range) - Z = self.standardizer.fit_transform(Z) - - print('fitting the Z-space of shape={}'.format(Z.shape)) - self.model.fit(Z, y) - self.time = time.time() - tinit - - def stack(self, lZ, ly=None): - langs = list(lZ.keys()) - Z = np.vstack([lZ[lang] for lang in langs]) - if ly is not None: - y = np.vstack([ly[lang] for lang in langs]) - return Z, y - else: - return Z - - def predict(self, lZ): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict, lZ, n_jobs=self.n_jobs) - - def predict_proba(self, lZ): - lZ = _joblib_transform_multiling(self.standardizer.transform, lZ, n_jobs=self.n_jobs) - return _joblib_transform_multiling(self.model.predict_proba, lZ, n_jobs=self.n_jobs) - diff --git a/refactor/models/lstm_class.py b/refactor/models/lstm_class.py deleted file mode 100755 index 7f2cf59..0000000 --- a/refactor/models/lstm_class.py +++ /dev/null @@ -1,113 +0,0 @@ -#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py -from torch.autograd import Variable - -from models.helpers import * - - -class RNNMultilingualClassifier(nn.Module): - - def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, - drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False, - bert_embeddings=False): - - super(RNNMultilingualClassifier, self).__init__() - self.output_size = output_size - self.hidden_size = hidden_size - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - self.post_probabilities = post_probabilities - self.bert_embeddings = bert_embeddings - assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: wrong range' - - self.lpretrained_embeddings = nn.ModuleDict() - self.llearnable_embeddings = nn.ModuleDict() - self.embedding_length = None - self.langs = sorted(lvocab_size.keys()) - self.only_post = 
only_post - - self.n_layers = 1 - self.n_directions = 1 - - self.dropout = nn.Dropout(0.6) - - lstm_out = 256 - ff1 = 512 - ff2 = 256 - - lpretrained_embeddings = {} - llearnable_embeddings = {} - if only_post==False: - for l in self.langs: - pretrained = lpretrained[l] if lpretrained else None - pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( - pretrained, lvocab_size[l], learnable_length - ) - lpretrained_embeddings[l] = pretrained_embeddings - llearnable_embeddings[l] = learnable_embeddings - self.embedding_length = embedding_length - - # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) - self.rnn = nn.GRU(self.embedding_length, hidden_size) - self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) - self.lpretrained_embeddings.update(lpretrained_embeddings) - self.llearnable_embeddings.update(llearnable_embeddings) - - self.linear1 = nn.Linear(lstm_out, ff1) - self.linear2 = nn.Linear(ff1, ff2) - - if only_post: - self.label = nn.Linear(output_size, output_size) - elif post_probabilities and not bert_embeddings: - self.label = nn.Linear(ff2 + output_size, output_size) - elif bert_embeddings and not post_probabilities: - self.label = nn.Linear(ff2 + 768, output_size) - elif post_probabilities and bert_embeddings: - self.label = nn.Linear(ff2 + output_size + 768, output_size) - else: - self.label = nn.Linear(ff2, output_size) - - def forward(self, input, post, bert_embed, lang): - if self.only_post: - doc_embedding = post - else: - doc_embedding = self.transform(input, lang) - if self.post_probabilities: - doc_embedding = torch.cat([doc_embedding, post], dim=1) - if self.bert_embeddings: - doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1) - - logits = self.label(doc_embedding) - return logits - - def transform(self, input, lang): - batch_size = input.shape[0] - input = embed(self, input, lang) - input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - input = input.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) - # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) - # output, (_, _) = self.lstm(input, (h_0, c_0)) - output, _ = self.rnn(input, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - output = self.dropout(F.relu(self.linear2(output))) - return output - - def finetune_pretrained(self): - for l in self.langs: - self.lpretrained_embeddings[l].requires_grad = True - self.lpretrained_embeddings[l].weight.requires_grad = True - - def get_embeddings(self, input, lang): - batch_size = input.shape[0] - input = embed(self, input, lang) - input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - input = input.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda()) - output, _ = self.rnn(input, h_0) - output = output[-1, :, :] - return output.cpu().detach().numpy() - diff --git a/refactor/models/pl_bert.py b/refactor/models/pl_bert.py deleted file mode 100644 index afb28b5..0000000 --- a/refactor/models/pl_bert.py +++ /dev/null @@ -1,183 +0,0 @@ -import pytorch_lightning as pl -import torch -from torch.optim.lr_scheduler 
import StepLR -from transformers import BertForSequenceClassification, AdamW - -from util.common import define_pad_length, pad -from util.pl_metrics import CustomF1, CustomK - - -class BertModel(pl.LightningModule): - - def __init__(self, output_size, stored_path, gpus=None): - """ - Init Bert model. - :param output_size: - :param stored_path: - :param gpus: - """ - super().__init__() - self.loss = torch.nn.BCEWithLogitsLoss() - self.gpus = gpus - self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) - self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) - # Language specific metrics to compute metrics at epoch level - self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - - if stored_path: - self.bert = BertForSequenceClassification.from_pretrained(stored_path, - num_labels=output_size, - output_hidden_states=True) - else: - self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', - num_labels=output_size, - output_hidden_states=True) - self.save_hyperparameters() - - def forward(self, X): - logits = self.bert(X) - return logits - - def training_step(self, train_batch, batch_idx): - X, y, _, batch_langs = train_batch - X = torch.cat(X).view([X[0].shape[0], len(X)]) - y = y.type(torch.FloatTensor) - y = y.to('cuda' if self.gpus else 'cpu') - logits, _ = self.forward(X) - loss = self.loss(logits, y) - # Squashing logits through Sigmoid in order to get confidence score - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, y) - macroF1 = self.macroF1(predictions, y) - microK = self.microK(predictions, y) - macroK = self.macroK(predictions, y) - self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - lX, ly = self._reconstruct_dict(predictions, y, batch_langs) - return {'loss': loss, 'pred': lX, 'target': ly} - - def training_epoch_end(self, outputs): - langs = [] - for output in outputs: - langs.extend(list(output['pred'].keys())) - langs = set(langs) - # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. 
- # here we save epoch level metric values and compute them specifically for each language - res_macroF1 = {lang: [] for lang in langs} - res_microF1 = {lang: [] for lang in langs} - res_macroK = {lang: [] for lang in langs} - res_microK = {lang: [] for lang in langs} - for output in outputs: - lX, ly = output['pred'], output['target'] - for lang in lX.keys(): - X, y = lX[lang], ly[lang] - lang_macroF1 = self.lang_macroF1(X, y) - lang_microF1 = self.lang_microF1(X, y) - lang_macroK = self.lang_macroK(X, y) - lang_microK = self.lang_microK(X, y) - - res_macroF1[lang].append(lang_macroF1) - res_microF1[lang].append(lang_microF1) - res_macroK[lang].append(lang_macroK) - res_microK[lang].append(lang_microK) - for lang in langs: - avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) - avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) - avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) - avg_microK = torch.mean(torch.Tensor(res_microK[lang])) - self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) - - def validation_step(self, val_batch, batch_idx): - X, y, _, batch_langs = val_batch - X = torch.cat(X).view([X[0].shape[0], len(X)]) - y = y.type(torch.FloatTensor) - y = y.to('cuda' if self.gpus else 'cpu') - logits, _ = self.forward(X) - loss = self.loss(logits, y) - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, y) - macroF1 = self.macroF1(predictions, y) - microK = self.microK(predictions, y) - macroK = self.macroK(predictions, y) - self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def test_step(self, test_batch, batch_idx): - X, y, _, batch_langs = test_batch - X = torch.cat(X).view([X[0].shape[0], len(X)]) - y = y.type(torch.FloatTensor) - y = y.to('cuda' if self.gpus else 'cpu') - logits, _ = self.forward(X) - loss = self.loss(logits, y) - # Squashing logits through Sigmoid in order to get confidence score - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, y) - macroF1 = self.macroF1(predictions, y) - microK = self.microK(predictions, y) - macroK = self.macroK(predictions, y) - self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - return - - def configure_optimizers(self, lr=3e-5, weight_decay=0.01): - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in self.bert.named_parameters() - if not any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay}, - 
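            # note: both parameter groups receive the same weight_decay here; the usual AdamW fine-tuning
            # recipe sets weight_decay=0.0 for the parameters matched by no_decay (biases and LayerNorm weights)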
{'params': [p for n, p in self.bert.named_parameters() - if any(nd in n for nd in no_decay)], - 'weight_decay': weight_decay} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr) - scheduler = StepLR(optimizer, step_size=25, gamma=0.1) - return [optimizer], [scheduler] - - def encode(self, lX, batch_size=64): - with torch.no_grad(): - l_embed = {lang: [] for lang in lX.keys()} - for lang in sorted(lX.keys()): - for i in range(0, len(lX[lang]), batch_size): - if i + batch_size > len(lX[lang]): - batch = lX[lang][i:len(lX[lang])] - else: - batch = lX[lang][i:i + batch_size] - max_pad_len = define_pad_length(batch) - batch = pad(batch, pad_index=self.bert.config.pad_token_id, max_pad_length=max_pad_len) - batch = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') - _, output = self.forward(batch) - doc_embeds = output[-1][:, 0, :] - l_embed[lang].append(doc_embeds.cpu()) - for k, v in l_embed.items(): - l_embed[k] = torch.cat(v, dim=0).numpy() - return l_embed - - @staticmethod - def _reconstruct_dict(predictions, y, batch_langs): - reconstructed_x = {lang: [] for lang in set(batch_langs)} - reconstructed_y = {lang: [] for lang in set(batch_langs)} - for i, pred in enumerate(predictions): - reconstructed_x[batch_langs[i]].append(pred) - reconstructed_y[batch_langs[i]].append(y[i]) - for k, v in reconstructed_x.items(): - reconstructed_x[k] = torch.cat(v).view(-1, predictions.shape[1]) - for k, v in reconstructed_y.items(): - reconstructed_y[k] = torch.cat(v).view(-1, predictions.shape[1]) - return reconstructed_x, reconstructed_y diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py deleted file mode 100644 index afb12e6..0000000 --- a/refactor/models/pl_gru.py +++ /dev/null @@ -1,266 +0,0 @@ -# Lightning modules, see https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html -import pytorch_lightning as pl -import torch -import torch.nn.functional as F -from torch import nn -from torch.autograd import Variable -from torch.optim.lr_scheduler import StepLR -from transformers import AdamW - -from models.helpers import init_embeddings -from util.common import define_pad_length, pad -from util.pl_metrics import CustomF1, CustomK - - -class RecurrentModel(pl.LightningModule): - def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, - drop_embedding_range, drop_embedding_prop, gpus=None): - """ - Init RNN model. 
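
A remark on configure_optimizers above: both parameter groups are given the same weight_decay, so the no_decay list currently has no effect. The usual fine-tuning recipe exempts biases and LayerNorm weights from decay; a sketch of that pattern, reusing the AdamW already imported from transformers (the helper name and bert_model variable are illustrative):

from transformers import AdamW

def grouped_parameters(model, weight_decay=0.01, no_decay=('bias', 'LayerNorm.weight')):
    # Split parameters into a decayed group and an exempt group (biases, LayerNorm weights).
    decay, exempt = [], []
    for name, param in model.named_parameters():
        (exempt if any(nd in name for nd in no_decay) else decay).append(param)
    return [
        {'params': decay, 'weight_decay': weight_decay},
        {'params': exempt, 'weight_decay': 0.0},
    ]

# optimizer = AdamW(grouped_parameters(bert_model), lr=3e-5)
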
- :param lPretrained: - :param langs: - :param output_size: - :param hidden_size: - :param lVocab_size: - :param learnable_length: - :param drop_embedding_range: - :param drop_embedding_prop: - :param gpus: - """ - super().__init__() - self.gpus = gpus - self.langs = langs - self.lVocab_size = lVocab_size - self.learnable_length = learnable_length - self.output_size = output_size - self.hidden_size = hidden_size - self.drop_embedding_range = drop_embedding_range - self.drop_embedding_prop = drop_embedding_prop - self.loss = torch.nn.BCEWithLogitsLoss() - - self.microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.microK = CustomK(num_classes=output_size, average='micro', device=self.gpus) - self.macroK = CustomK(num_classes=output_size, average='macro', device=self.gpus) - # Language specific metrics to compute metrics at epoch level - self.lang_macroF1 = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.lang_microF1 = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - self.lang_macroK = CustomF1(num_classes=output_size, average='macro', device=self.gpus) - self.lang_microK = CustomF1(num_classes=output_size, average='micro', device=self.gpus) - - self.lPretrained_embeddings = nn.ModuleDict() - self.lLearnable_embeddings = nn.ModuleDict() - - self.n_layers = 1 - self.n_directions = 1 - self.dropout = nn.Dropout(0.6) - - lstm_out = 256 - ff1 = 512 - ff2 = 256 - - lpretrained_embeddings = {} - llearnable_embeddings = {} - - for lang in self.langs: - pretrained = lPretrained[lang] if lPretrained else None - pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( - pretrained, self.lVocab_size[lang], self.learnable_length) - lpretrained_embeddings[lang] = pretrained_embeddings - llearnable_embeddings[lang] = learnable_embeddings - self.embedding_length = embedding_length - - self.lPretrained_embeddings.update(lpretrained_embeddings) - self.lLearnable_embeddings.update(llearnable_embeddings) - - self.rnn = nn.GRU(self.embedding_length, hidden_size) - self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) - self.linear1 = nn.Linear(lstm_out, ff1) - self.linear2 = nn.Linear(ff1, ff2) - self.label = nn.Linear(ff2, self.output_size) - - # TODO: setting lPretrained to None, letting it to its original value will "bug" first validation - # step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) - lPretrained = None - self.save_hyperparameters() - - def forward(self, lX): - l_embed = [] - for lang in sorted(lX.keys()): - doc_embedding = self.transform(lX[lang], lang) - l_embed.append(doc_embedding) - embed = torch.cat(l_embed, dim=0) - logits = self.label(embed) - return logits - - def transform(self, X, lang): - batch_size = X.shape[0] - X = self.embed(X, lang) - X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - X = X.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).to(self.device)) - output, _ = self.rnn(X, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - output = self.dropout(F.relu(self.linear2(output))) - return output - - def encode(self, lX, l_pad, batch_size=128): - """ - Returns encoded data (i.e, RNN hidden state at second feed-forward 
layer - linear1). Dimensionality is 512. - :param lX: - :param l_pad: - :param batch_size: - :return: - """ - with torch.no_grad(): - l_embed = {lang: [] for lang in lX.keys()} - for lang in sorted(lX.keys()): - for i in range(0, len(lX[lang]), batch_size): - if i+batch_size > len(lX[lang]): - batch = lX[lang][i:len(lX[lang])] - else: - batch = lX[lang][i:i+batch_size] - max_pad_len = define_pad_length(batch) - batch = pad(batch, pad_index=l_pad[lang], max_pad_length=max_pad_len) - X = torch.LongTensor(batch).to('cuda' if self.gpus else 'cpu') - _batch_size = X.shape[0] - X = self.embed(X, lang) - X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, - training=self.training) - X = X.permute(1, 0, 2) - h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, _batch_size, self.hidden_size).to(self.device)) - output, _ = self.rnn(X, h_0) - output = output[-1, :, :] - output = F.relu(self.linear0(output)) - output = self.dropout(F.relu(self.linear1(output))) - l_embed[lang].append(output.cpu()) - for k, v in l_embed.items(): - l_embed[k] = torch.cat(v, dim=0).numpy() - return l_embed - - def training_step(self, train_batch, batch_idx): - lX, ly = train_batch - logits = self.forward(lX) - _ly = [] - for lang in sorted(lX.keys()): - _ly.append(ly[lang]) - y = torch.cat(_ly, dim=0) - loss = self.loss(logits, y) - # Squashing logits through Sigmoid in order to get confidence score - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, y) - macroF1 = self.macroF1(predictions, y) - microK = self.microK(predictions, y) - macroK = self.macroK(predictions, y) - self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroF1', macroF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microF1', microF1, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-macroK', macroK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - self.log('train-microK', microK, on_step=True, on_epoch=True, prog_bar=False, logger=True) - re_lX = self._reconstruct_dict(predictions, ly) - return {'loss': loss, 'pred': re_lX, 'target': ly} - - def training_epoch_end(self, outputs): - # outputs is a of n dicts of m elements, where n is equal to the number of epoch steps and m is batchsize. 
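
encode() above batches plain lists of token ids and relies on the pad-to-(mean + std) heuristic from util.common; a small usage example with toy values:

from util.common import define_pad_length, pad

batch = [[5, 6], [7, 8, 9, 10, 11, 12]]            # token-id lists of length 2 and 6
max_len = define_pad_length(batch)                 # int(mean + std) = int(4 + 2) = 6
padded = pad(batch, pad_index=0, max_pad_length=max_len)
# documents are left-padded with pad_index and truncated to the computed length:
# [[0, 0, 0, 0, 5, 6], [7, 8, 9, 10, 11, 12]]
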
- # here we save epoch level metric values and compute them specifically for each language - res_macroF1 = {lang: [] for lang in self.langs} - res_microF1 = {lang: [] for lang in self.langs} - res_macroK = {lang: [] for lang in self.langs} - res_microK = {lang: [] for lang in self.langs} - for output in outputs: - lX, ly = output['pred'], output['target'] - for lang in lX.keys(): - X, y = lX[lang], ly[lang] - lang_macroF1 = self.lang_macroF1(X, y) - lang_microF1 = self.lang_microF1(X, y) - lang_macroK = self.lang_macroK(X, y) - lang_microK = self.lang_microK(X, y) - - res_macroF1[lang].append(lang_macroF1) - res_microF1[lang].append(lang_microF1) - res_macroK[lang].append(lang_macroK) - res_microK[lang].append(lang_microK) - for lang in self.langs: - avg_macroF1 = torch.mean(torch.Tensor(res_macroF1[lang])) - avg_microF1 = torch.mean(torch.Tensor(res_microF1[lang])) - avg_macroK = torch.mean(torch.Tensor(res_macroK[lang])) - avg_microK = torch.mean(torch.Tensor(res_microK[lang])) - self.logger.experiment.add_scalars('train-langs-macroF1', {f'{lang}': avg_macroF1}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-microF1', {f'{lang}': avg_microF1}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-macroK', {f'{lang}': avg_macroK}, self.current_epoch) - self.logger.experiment.add_scalars('train-langs-microK', {f'{lang}': avg_microK}, self.current_epoch) - - def validation_step(self, val_batch, batch_idx): - lX, ly = val_batch - logits = self.forward(lX) - _ly = [] - for lang in sorted(lX.keys()): - _ly.append(ly[lang]) - ly = torch.cat(_ly, dim=0) - loss = self.loss(logits, ly) - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, ly) - macroF1 = self.macroF1(predictions, ly) - microK = self.microK(predictions, ly) - macroK = self.macroK(predictions, ly) - self.log('val-loss', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('val-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microF1', microF1, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('val-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - return {'loss': loss} - - def test_step(self, test_batch, batch_idx): - lX, ly = test_batch - logits = self.forward(lX) - _ly = [] - for lang in sorted(lX.keys()): - _ly.append(ly[lang]) - ly = torch.cat(_ly, dim=0) - predictions = torch.sigmoid(logits) > 0.5 - microF1 = self.microF1(predictions, ly) - macroF1 = self.macroF1(predictions, ly) - microK = self.microK(predictions, ly) - macroK = self.macroK(predictions, ly) - self.log('test-macroF1', macroF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-microF1', microF1, on_step=False, on_epoch=True, prog_bar=False, logger=True) - self.log('test-macroK', macroK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - self.log('test-microK', microK, on_step=False, on_epoch=True, prog_bar=True, logger=True) - return - - def embed(self, X, lang): - input_list = [] - if self.lPretrained_embeddings[lang]: - input_list.append(self.lPretrained_embeddings[lang](X)) - if self.lLearnable_embeddings[lang]: - input_list.append(self.lLearnable_embeddings[lang](X)) - return torch.cat(tensors=input_list, dim=2) - - def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True): - if p_drop > 0 and training and drop_range is not None: - p = p_drop - 
drop_from, drop_to = drop_range - m = drop_to - drop_from # length of the supervised embedding - l = X.shape[2] # total embedding length - corr = (1 - p) - X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p) - X /= (1 - (p * m / l)) - return X - - def configure_optimizers(self): - optimizer = AdamW(self.parameters(), lr=1e-3) - scheduler = StepLR(optimizer, step_size=25, gamma=0.5) - return [optimizer], [scheduler] - - @staticmethod - def _reconstruct_dict(X, ly): - reconstructed = {} - _start = 0 - for lang in sorted(ly.keys()): - lang_batchsize = len(ly[lang]) - reconstructed[lang] = X[_start:_start+lang_batchsize] - _start += lang_batchsize - return reconstructed diff --git a/refactor/requirements.txt b/refactor/requirements.txt deleted file mode 100644 index 4546a4a..0000000 --- a/refactor/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -transformers==2.11.0 -pandas==0.25.3 -numpy==1.17.4 -joblib==0.14.0 -tqdm==4.50.2 -pytorch_lightning==1.1.2 -torch==1.3.1 -nltk==3.4.5 -scipy==1.3.3 -rdflib==4.2.2 -torchtext==0.4.0 -scikit_learn==0.24.1 diff --git a/refactor/run.sh b/refactor/run.sh deleted file mode 100644 index 04365f9..0000000 --- a/refactor/run.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -for i in {0..10..1} -do - python main.py --gpus 0 -done \ No newline at end of file diff --git a/refactor/util/SIF_embed.py b/refactor/util/SIF_embed.py deleted file mode 100644 index 4a3d712..0000000 --- a/refactor/util/SIF_embed.py +++ /dev/null @@ -1,59 +0,0 @@ -import numpy as np -from sklearn.decomposition import TruncatedSVD - - -def get_weighted_average(We, x, w): - """ - Compute the weighted average vectors - :param We: We[i,:] is the vector for word i - :param x: x[i, :] are the indices of the words in sentence i - :param w: w[i, :] are the weights for the words in sentence i - :return: emb[i, :] are the weighted average vector for sentence i - """ - n_samples = x.shape[0] - emb = np.zeros((n_samples, We.shape[1])) - for i in range(n_samples): - emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) - return emb - - -def compute_pc(X,npc=1): - """ - Compute the principal components. 
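
The embedding_dropout used by RecurrentModel above zeroes units only inside the supervised (WCE) slice of the embedding and then rescales the whole vector. A commented restatement of that arithmetic on a toy tensor (function and variable names here are illustrative):

import torch
import torch.nn.functional as F

def partial_embedding_dropout(X, drop_range, p=0.5):
    drop_from, drop_to = drop_range
    m = drop_to - drop_from              # width of the supervised (WCE) slice
    l = X.shape[2]                       # total embedding length
    # F.dropout rescales survivors by 1/(1-p); multiplying by (1-p) undoes that,
    # so the slice is only zeroed, never inflated.
    X[:, :, drop_from:drop_to] = (1 - p) * F.dropout(X[:, :, drop_from:drop_to], p=p, training=True)
    # Renormalize the full embedding so its expected magnitude matches the no-dropout case.
    return X / (1 - p * m / l)

emb = torch.randn(2, 4, 10)              # toy (batch, seq_len, embed_dim) tensor
out = partial_embedding_dropout(emb.clone(), drop_range=(6, 10), p=0.5)
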
- :param X: X[i,:] is a data point - :param npc: number of principal components to remove - :return: component_[i,:] is the i-th pc - """ - svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) - svd.fit(X) - return svd.components_ - - -def remove_pc(X, npc=1): - """ - Remove the projection on the principal components - :param X: X[i,:] is a data point - :param npc: number of principal components to remove - :return: XX[i, :] is the data point after removing its projection - """ - pc = compute_pc(X, npc) - if npc == 1: - XX = X - X.dot(pc.transpose()) * pc - else: - XX = X - X.dot(pc.transpose()).dot(pc) - return XX - - -def SIF_embedding(We, x, w, params): - """ - Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component - :param We: We[i,:] is the vector for word i - :param x: x[i, :] are the indices of the words in the i-th sentence - :param w: w[i, :] are the weights for the words in the i-th sentence - :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component - :return: emb, emb[i, :] is the embedding for sentence i - """ - emb = get_weighted_average(We, x, w) - if params.rmpc > 0: - emb = remove_pc(emb, params.rmpc) - return emb \ No newline at end of file diff --git a/refactor/util/common.py b/refactor/util/common.py deleted file mode 100644 index 61ac52f..0000000 --- a/refactor/util/common.py +++ /dev/null @@ -1,384 +0,0 @@ -import numpy as np -import torch -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import normalize - -from util.embeddings_manager import supervised_embeddings_tfidf - - -class TfidfVectorizerMultilingual: - - def __init__(self, **kwargs): - self.kwargs = kwargs - - def fit(self, lX, ly=None): - self.langs = sorted(lX.keys()) - self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} - return self - - def transform(self, lX): - return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} - - def fit_transform(self, lX, ly=None): - return self.fit(lX, ly).transform(lX) - - def vocabulary(self, l=None): - if l is None: - return {l: self.vectorizer[l].vocabulary_ for l in self.langs} - else: - return self.vectorizer[l].vocabulary_ - - def get_analyzer(self, l=None): - if l is None: - return {l: self.vectorizer[l].build_analyzer() for l in self.langs} - else: - return self.vectorizer[l].build_analyzer() - - -def _normalize(lX, l2=True): - return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX - - -def none_dict(langs): - return {l: None for l in langs} - - -class MultilingualIndex: - def __init__(self): - """ - Class that contains monolingual Indexes - """ - self.l_index = {} - self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - def index(self, l_devel_raw, l_devel_target, l_test_raw, l_test_target, l_pretrained_vocabulary=None): - self.langs = sorted(l_devel_raw.keys()) - self.l_vectorizer.fit(l_devel_raw) - l_vocabulary = self.l_vectorizer.vocabulary() - l_analyzer = self.l_vectorizer.get_analyzer() - if l_pretrained_vocabulary is None: - l_pretrained_vocabulary = none_dict(self.langs) - - for lang in self.langs: - # Init monolingual Index - self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], l_test_target[lang], - lang) - # call to index() function of monolingual Index - self.l_index[lang].index(l_pretrained_vocabulary[lang], 
l_analyzer[lang], l_vocabulary[lang]) - - def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): - for l, index in self.l_index.items(): - index.train_val_split(val_prop, max_val, seed=seed) - - def embedding_matrices(self, lpretrained, supervised): - """ - Extract from pretrained embeddings words that are found in the training dataset, then for each language - calls the respective monolingual index and build the embedding matrix (if supervised, WCE are concatenated - to the unsupervised vectors). - :param lpretrained: dict {lang : matrix of word-embeddings } - :param supervised: bool, whether to deploy Word-Class Embeddings or not - :return: self - """ - lXtr = self.get_lXtr() if supervised else none_dict(self.langs) - lYtr = self.l_train_target() if supervised else none_dict(self.langs) - lWordList = self.get_wordlist() - lExtracted = lpretrained.extract(lWordList) - for lang, index in self.l_index.items(): - # if supervised concatenate embedding matrices of pretrained unsupervised - # and supervised word-class embeddings - index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang]) - self.sup_range = index.wce_range - return self - - def get_wordlist(self): - wordlist = {} - for lang, index in self.l_index.items(): - wordlist[lang] = index.get_word_list() - return wordlist - - def get_raw_lXtr(self): - lXtr_raw = {k: [] for k in self.langs} - lYtr_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXtr_raw[lang] = self.l_index[lang].train_raw - lYtr_raw[lang] = self.l_index[lang].train_raw - return lXtr_raw - - def get_raw_lXva(self): - lXva_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXva_raw[lang] = self.l_index[lang].val_raw - - return lXva_raw - - def get_raw_lXte(self): - lXte_raw = {k: [] for k in self.langs} - for lang in self.langs: - lXte_raw[lang] = self.l_index[lang].test_raw - - return lXte_raw - - def get_lXtr(self): - if not hasattr(self, 'lXtr'): - self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()}) - return self.lXtr - - def get_lXva(self): - if not hasattr(self, 'lXva'): - self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()}) - return self.lXva - - def get_lXte(self): - if not hasattr(self, 'lXte'): - self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) - return self.lXte - - def get_target_dim(self): - return self.l_index[self.langs[0]].devel_target.shape[1] - - def l_vocabsize(self): - return {l: index.vocabsize for l, index in self.l_index.items()} - - def l_embeddings(self): - return {l: index.embedding_matrix for l, index in self.l_index.items()} - - def l_pad(self): - return {l: index.pad_index for l, index in self.l_index.items()} - - def l_train_index(self): - return {l: index.train_index for l, index in self.l_index.items()} - - def l_train_raw_index(self): - return {l: index.train_raw for l, index in self.l_index.items()} - - def l_train_target(self): - return {l: index.train_target for l, index in self.l_index.items()} - - def l_val_index(self): - return {l: index.val_index for l, index in self.l_index.items()} - - def l_val_raw_index(self): - return {l: index.val_raw for l, index in self.l_index.items()} - - def l_test_raw_index(self): - return {l: index.test_raw for l, index in self.l_index.items()} - - def l_devel_raw_index(self): - return {l: index.devel_raw for l, index in self.l_index.items()} - - def l_val_target(self): - return {l: index.val_target for l, 
index in self.l_index.items()} - - def l_test_target(self): - return {l: index.test_target for l, index in self.l_index.items()} - - def l_test_index(self): - return {l: index.test_index for l, index in self.l_index.items()} - - def l_devel_index(self): - return {l: index.devel_index for l, index in self.l_index.items()} - - def l_devel_target(self): - return {l: index.devel_target for l, index in self.l_index.items()} - - def l_train(self): - return self.l_train_index(), self.l_train_target() - - def l_val(self): - return self.l_val_index(), self.l_val_target() - - def l_test(self): - return self.l_test_index(), self.l_test_target() - - def l_train_raw(self): - return self.l_train_raw_index(), self.l_train_target() - - def l_val_raw(self): - return self.l_val_raw_index(), self.l_val_target() - - def l_test_raw(self): - return self.l_test_raw_index(), self.l_test_target() - - def l_devel_raw(self): - return self.l_devel_raw_index(), self.l_devel_target() - - def get_l_pad_index(self): - return {l: index.get_pad_index() for l, index in self.l_index.items()} - - -class Index: - def __init__(self, devel_raw, devel_target, test_raw, test_target, lang): - """ - Monolingual Index, takes care of tokenizing raw data, converting strings to ids, splitting the data into - training and validation. - :param devel_raw: list of strings, list of raw training texts - :param devel_target: - :param test_raw: list of strings, list of raw test texts - :param lang: list, list of languages contained in the dataset - """ - self.lang = lang - self.devel_raw = devel_raw - self.devel_target = devel_target - self.test_raw = test_raw - self.test_target = test_target - - def index(self, pretrained_vocabulary, analyzer, vocabulary): - self.word2index = dict(vocabulary) - known_words = set(self.word2index.keys()) - if pretrained_vocabulary is not None: - known_words.update(pretrained_vocabulary) - - self.word2index['UNKTOKEN'] = len(self.word2index) - self.word2index['PADTOKEN'] = len(self.word2index) - self.unk_index = self.word2index['UNKTOKEN'] - self.pad_index = self.word2index['PADTOKEN'] - - # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) - self.out_of_vocabulary = dict() - self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, - self.out_of_vocabulary) - self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, - self.out_of_vocabulary) - - self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) - - print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') - - def get_pad_index(self): - return self.pad_index - - def train_val_split(self, val_prop, max_val, seed): - devel = self.devel_index - target = self.devel_target - devel_raw = self.devel_raw - - val_size = int(min(len(devel) * val_prop, max_val)) - - self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ - train_test_split( - devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True) - - print( - f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') - - def get_word_list(self): - def extract_word_list(word2index): - return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])] - - word_list = extract_word_list(self.word2index) - word_list += extract_word_list(self.out_of_vocabulary) - return word_list - - def compose_embedding_matrix(self, 
pretrained, supervised, Xtr=None, Ytr=None): - print(f'[generating embedding matrix for lang {self.lang}]') - - self.wce_range = None - embedding_parts = [] - - if pretrained is not None: - print('\t[pretrained-matrix]') - embedding_parts.append(pretrained) - del pretrained - - if supervised: - print('\t[supervised-matrix]') - F = supervised_embeddings_tfidf(Xtr, Ytr) - num_missing_rows = self.vocabsize - F.shape[0] - F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) - F = torch.from_numpy(F).float() - - offset = 0 - if embedding_parts: - offset = embedding_parts[0].shape[1] - self.wce_range = [offset, offset + F.shape[1]] - embedding_parts.append(F) - - self.embedding_matrix = torch.cat(embedding_parts, dim=1) - - print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') - - -def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): - """ - Index (i.e., replaces word strings with numerical indexes) a list of string documents - :param data: list of string documents - :param vocab: a fixed mapping [str]->[int] of words to indexes - :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained - because they are anyway contained in a pre-trained embedding set that we know in advance) - :param analyzer: the preprocessor in charge of transforming the document string into a chain of string words - :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep - :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that - are not in the original vocab but that are in the known_words - :return: - """ - indexes = [] - vocabsize = len(vocab) - unk_count = 0 - knw_count = 0 - out_count = 0 - # pbar = tqdm(data, desc=f'indexing') - for text in data: - words = analyzer(text) - index = [] - for word in words: - if word in vocab: - idx = vocab[word] - else: - if word in known_words: - if word not in out_of_vocabulary: - out_of_vocabulary[word] = vocabsize + len(out_of_vocabulary) - idx = out_of_vocabulary[word] - out_count += 1 - else: - idx = unk_index - unk_count += 1 - index.append(idx) - indexes.append(index) - knw_count += len(index) - # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' - # f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') - return indexes - - -def is_true(tensor, device): - return torch.where(tensor == 1, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) - - -def is_false(tensor, device): - return torch.where(tensor == 0, torch.Tensor([1]).to(device), torch.Tensor([0]).to(device)) - - -def define_pad_length(index_list): - lengths = [len(index) for index in index_list] - return int(np.mean(lengths) + np.std(lengths)) - - -def pad(index_list, pad_index, max_pad_length=None): - pad_length = np.max([len(index) for index in index_list]) - if max_pad_length is not None: - pad_length = min(pad_length, max_pad_length) - for i, indexes in enumerate(index_list): - index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] - return index_list - - -def get_params(optimc=False): - if not optimc: - return None - c_range = [1e4, 1e3, 1e2, 1e1, 1, 1e-1] - kernel = 'rbf' - return [{'kernel': [kernel], 'C': c_range, 'gamma':['auto']}] - - -def get_method_name(args): - _id = '' - _id_conf = [args.post_embedder, args.wce_embedder, args.muse_embedder, args.bert_embedder, 
args.gru_embedder] - _id_name = ['X', 'W', 'M', 'B', 'G'] - for i, conf in enumerate(_id_conf): - if conf: - _id += _id_name[i] - _id = _id if not args.gru_wce else _id + '_wce' - _dataset_path = args.dataset.split('/')[-1].split('_') - dataset_id = _dataset_path[0] + _dataset_path[-1] - return _id, dataset_id diff --git a/refactor/util/embeddings_manager.py b/refactor/util/embeddings_manager.py deleted file mode 100644 index 1d708fa..0000000 --- a/refactor/util/embeddings_manager.py +++ /dev/null @@ -1,104 +0,0 @@ -from abc import ABC, abstractmethod - -import numpy as np -import torch -from torchtext.vocab import Vectors - -from util.SIF_embed import remove_pc - - -class PretrainedEmbeddings(ABC): - - def __init__(self): - super().__init__() - - @abstractmethod - def vocabulary(self): pass - - @abstractmethod - def dim(self): pass - - @classmethod - def reindex(cls, words, word2index): - if isinstance(words, dict): - words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] - - source_idx, target_idx = [], [] - for i, word in enumerate(words): - if word not in word2index: - continue - j = word2index[word] - source_idx.append(i) - target_idx.append(j) - source_idx = np.asarray(source_idx) - target_idx = np.asarray(target_idx) - return source_idx, target_idx - - -class MuseLoader: - def __init__(self, langs, cache): - self.langs = langs - self.lEmbed = {} - self.lExtracted = {} - for lang in self.langs: - print(f'Loading vectors for {lang}...') - self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache) - - def dim(self): - return self.lEmbed[list(self.lEmbed.keys())[0]].dim - - def vocabulary(self): - return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs} - - def extract(self, lVoc): - """ - Reindex pretrained loaded embedding in order to match indexes assigned by scikit vectorizer. 
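
reindex() above aligns the vectorizer vocabulary with the rows of a pretrained embedding matrix; a toy example (made-up words and indices):

from util.embeddings_manager import PretrainedEmbeddings

l_voc = {'casa': 0, 'perro': 1, 'oov-word': 2}     # word -> column id from the TF-IDF vectorizer
stoi = {'casa': 10, 'perro': 42}                   # word -> row id in the pretrained .vec file

source_idx, target_idx = PretrainedEmbeddings.reindex(l_voc, stoi)
# source_idx = [0, 1]   -> vectorizer positions that do have a pretrained vector
# target_idx = [10, 42] -> the matching rows of the pretrained matrix
# extract() then copies vectors[target_idx] into rows source_idx of a zero matrix,
# so words without a pretrained vector keep an all-zero row.
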
Such indexes - are consistent with those used by Word Class Embeddings (since we deploy the same vectorizer) - :param lVoc: dict {lang : {word : id}} - :return: torch embedding matrix of extracted embeddings i.e., words in lVoc - """ - for lang, words in lVoc.items(): - print(f'Extracting words for lang {lang}...') - # words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0] - source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi) - extraction = torch.zeros((len(words), self.dim())) - extraction[source_id] = self.lEmbed[lang].vectors[target_id] - self.lExtracted[lang] = extraction - return self.lExtracted - - def get_lEmbeddings(self): - return {lang: self.lEmbed[lang].vectors for lang in self.langs} - - -def XdotM(X, M, sif): - E = X.dot(M) - if sif: - E = remove_pc(E, npc=1) - return E - - -def wce_matrix(X, Y): - wce = supervised_embeddings_tfidf(X, Y) - wce = zscores(wce, axis=0) - return wce - - -def supervised_embeddings_tfidf(X, Y): - tfidf_norm = X.sum(axis=0) - tfidf_norm[tfidf_norm == 0] = 1 - F = (X.T).dot(Y) / tfidf_norm.T - return F - - -def zscores(X, axis=0): - """ - scipy.stats.zscores does not avoid division by 0, which can indeed occur - :param X: - :param axis: - :return: - """ - std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None) - mean = np.mean(X, axis=axis) - return (X - mean) / std - - diff --git a/refactor/util/evaluation.py b/refactor/util/evaluation.py deleted file mode 100644 index 010d0e9..0000000 --- a/refactor/util/evaluation.py +++ /dev/null @@ -1,20 +0,0 @@ -import numpy as np -from joblib import Parallel, delayed - -from util.metrics import * - - -def evaluation_metrics(y, y_): - if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label - raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') - else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers - return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) - - -def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): - if n_jobs == 1: - return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()} - else: - langs = list(ly_true.keys()) - evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) - return {lang: evals[i] for i, lang in enumerate(langs)} diff --git a/refactor/util/file.py b/refactor/util/file.py deleted file mode 100644 index 8754f5a..0000000 --- a/refactor/util/file.py +++ /dev/null @@ -1,50 +0,0 @@ -import urllib -from os import listdir, makedirs -from os.path import isdir, isfile, join, exists, dirname -from pathlib import Path - - -def download_file(url, archive_filename): - def progress(blocknum, bs, size): - total_sz_mb = '%.2f MB' % (size / 1e6) - current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) - print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') - print("Downloading %s" % url) - urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) - print("") - - -def download_file_if_not_exists(url, archive_path): - if exists(archive_path): return - makedirs_if_not_exist(dirname(archive_path)) - download_file(url,archive_path) - - -def ls(dir, typecheck): - el = [f for f in listdir(dir) if typecheck(join(dir, f))] - el.sort() - return el - - -def list_dirs(dir): - return ls(dir, typecheck=isdir) - - -def list_files(dir): - return ls(dir, typecheck=isfile) - - -def makedirs_if_not_exist(path): - if not exists(path): 
makedirs(path) - - -def create_if_not_exist(path): - if not exists(path): makedirs(path) - - -def get_parent_name(path): - return Path(path).parent - - -def get_file_name(path): - return Path(path).name diff --git a/refactor/util/metrics.py b/refactor/util/metrics.py deleted file mode 100644 index 7a6079e..0000000 --- a/refactor/util/metrics.py +++ /dev/null @@ -1,152 +0,0 @@ -import numpy as np - - -class ContTable: - def __init__(self, tp=0, tn=0, fp=0, fn=0): - self.tp = tp - self.tn = tn - self.fp = fp - self.fn = fn - - def get_d(self): return self.tp + self.tn + self.fp + self.fn - - def get_c(self): return self.tp + self.fn - - def get_not_c(self): return self.tn + self.fp - - def get_f(self): return self.tp + self.fp - - def get_not_f(self): return self.tn + self.fn - - def p_c(self): return (1.0*self.get_c())/self.get_d() - - def p_not_c(self): return 1.0-self.p_c() - - def p_f(self): return (1.0*self.get_f())/self.get_d() - - def p_not_f(self): return 1.0-self.p_f() - - def p_tp(self): return (1.0*self.tp) / self.get_d() - - def p_tn(self): return (1.0*self.tn) / self.get_d() - - def p_fp(self): return (1.0*self.fp) / self.get_d() - - def p_fn(self): return (1.0*self.fn) / self.get_d() - - def tpr(self): - c = 1.0*self.get_c() - return self.tp / c if c > 0.0 else 0.0 - - def fpr(self): - _c = 1.0*self.get_not_c() - return self.fp / _c if _c > 0.0 else 0.0 - - def __add__(self, other): - return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn) - - -def accuracy(cell): - return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn) - - -def f1(cell): - num = 2.0 * cell.tp - den = 2.0 * cell.tp + cell.fp + cell.fn - if den > 0: - return num / den - # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative - return 1.0 - - -def K(cell): - specificity, recall = 0., 0. - - AN = cell.tn + cell.fp - if AN != 0: - specificity = cell.tn*1. / AN - - AP = cell.tp + cell.fn - if AP != 0: - recall = cell.tp*1. / AP - - if AP == 0: - return 2. * specificity - 1. - elif AN == 0: - return 2. * recall - 1. - else: - return specificity + recall - 1. - - -# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared -# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. -def __check_consistency_and_adapt(true_labels, predictions): - if predictions.ndim == 1: - return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) - if true_labels.ndim == 1: - return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions) - if true_labels.shape != predictions.shape: - raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." - % (true_labels.shape, predictions.shape)) - _, nC = true_labels.shape - return true_labels, predictions, nC - - -# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses for the posterioir -# probabilitiesfron with respect to the true binary labels -# true_labels and posterior_probabilities are two vectors of shape (number_documents,) -def soft_single_metric_statistics(true_labels, posterior_probabilities): - assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels." - tp = np.sum(posterior_probabilities[true_labels == 1]) - fn = np.sum(1. 
- posterior_probabilities[true_labels == 1]) - fp = np.sum(posterior_probabilities[true_labels == 0]) - tn = np.sum(1. - posterior_probabilities[true_labels == 0]) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) - - -# computes the (hard) counters tp, fp, fn, and tn fron a true and predicted vectors of hard decisions -# true_labels and predicted_labels are two vectors of shape (number_documents,) -def hard_single_metric_statistics(true_labels, predicted_labels): - assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels." - nd = len(true_labels) - tp = np.sum(predicted_labels[true_labels == 1]) - fp = np.sum(predicted_labels[true_labels == 0]) - fn = np.sum(true_labels[predicted_labels == 0]) - tn = nd - (tp+fp+fn) - return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) - - -def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): - true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) - return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]) - - -def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): - true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) - - accum = ContTable() - for c in range(nC): - other = metric_statistics(true_labels[:, c], predicted_labels[:, c]) - accum = accum + other - - return metric(accum) - - -# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroF1(true_labels, predicted_labels): - return macro_average(true_labels, predicted_labels, f1) - - -# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def microF1(true_labels, predicted_labels): - return micro_average(true_labels, predicted_labels, f1) - - -# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def macroK(true_labels, predicted_labels): - return macro_average(true_labels, predicted_labels, K) - - -# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format -def microK(true_labels, predicted_labels): - return micro_average(true_labels, predicted_labels, K) diff --git a/refactor/util/pl_metrics.py b/refactor/util/pl_metrics.py deleted file mode 100644 index bf8aa99..0000000 --- a/refactor/util/pl_metrics.py +++ /dev/null @@ -1,141 +0,0 @@ -import torch -from pytorch_lightning.metrics import Metric - -from util.common import is_false, is_true - - -def _update(pred, target, device): - assert pred.shape == target.shape - # preparing preds and targets for count - true_pred = is_true(pred, device) - false_pred = is_false(pred, device) - true_target = is_true(target, device) - false_target = is_false(target, device) - - tp = torch.sum(true_pred * true_target, dim=0) - tn = torch.sum(false_pred * false_target, dim=0) - fp = torch.sum(true_pred * false_target, dim=0) - fn = torch.sum(false_pred * target, dim=0) - return tp, tn, fp, fn - - -class CustomF1(Metric): - def __init__(self, num_classes, device, average='micro'): - """ - Custom F1 metric. - Scikit learn provides a full set of evaluation metrics, but they treat special cases differently. - I.e., when the number of true positives, false positives, and false negatives amount to 0, all - affected metrics (precision, recall, and thus f1) output 0 in Scikit learn. 
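
A quick sanity check of the hard-count metrics defined above, with hand-verified values on a toy multilabel matrix:

import numpy as np
from util.metrics import macroF1, microF1

y_true = np.array([[1, 0], [0, 1], [1, 1]])
y_pred = np.array([[1, 0], [0, 0], [1, 1]])

# class 0: tp=2, fp=0, fn=0 -> F1 = 1.0 ; class 1: tp=1, fp=0, fn=1 -> F1 = 2/3
print(macroF1(y_true, y_pred))   # (1.0 + 2/3) / 2 = 0.833...
# pooled counts: tp=3, fp=0, fn=1 -> F1 = 6/7
print(microF1(y_true, y_pred))   # 0.857...
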
- We adhere to the common practice of outputting 1 in this case since the classifier has correctly - classified all examples as negatives. - :param num_classes: - :param device: - :param average: - """ - super().__init__() - self.num_classes = num_classes - self.average = average - self.device = 'cuda' if device else 'cpu' - self.add_state('true_positive', default=torch.zeros(self.num_classes)) - self.add_state('true_negative', default=torch.zeros(self.num_classes)) - self.add_state('false_positive', default=torch.zeros(self.num_classes)) - self.add_state('false_negative', default=torch.zeros(self.num_classes)) - - def update(self, preds, target): - true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) - - self.true_positive += true_positive - self.true_negative += true_negative - self.false_positive += false_positive - self.false_negative += false_negative - - def compute(self): - if self.average == 'micro': - num = 2.0 * self.true_positive.sum() - den = 2.0 * self.true_positive.sum() + self.false_positive.sum() + self.false_negative.sum() - if den > 0: - return (num / den).to(self.device) - return torch.FloatTensor([1.]).to(self.device) - if self.average == 'macro': - class_specific = [] - for i in range(self.num_classes): - class_tp = self.true_positive[i] - class_tn = self.true_negative[i] - class_fp = self.false_positive[i] - class_fn = self.false_negative[i] - num = 2.0 * class_tp - den = 2.0 * class_tp + class_fp + class_fn - if den > 0: - class_specific.append(num / den) - else: - class_specific.append(1.) - average = torch.sum(torch.Tensor(class_specific))/self.num_classes - return average.to(self.device) - - -class CustomK(Metric): - def __init__(self, num_classes, device, average='micro'): - """ - K metric. https://dl.acm.org/doi/10.1145/2808194.2809449 - :param num_classes: - :param device: - :param average: - """ - super().__init__() - self.num_classes = num_classes - self.average = average - self.device = 'cuda' if device else 'cpu' - self.add_state('true_positive', default=torch.zeros(self.num_classes)) - self.add_state('true_negative', default=torch.zeros(self.num_classes)) - self.add_state('false_positive', default=torch.zeros(self.num_classes)) - self.add_state('false_negative', default=torch.zeros(self.num_classes)) - - def update(self, preds, target): - true_positive, true_negative, false_positive, false_negative = _update(preds, target, self.device) - - self.true_positive += true_positive - self.true_negative += true_negative - self.false_positive += false_positive - self.false_negative += false_negative - - def compute(self): - if self.average == 'micro': - specificity, recall = 0., 0. - absolute_negatives = self.true_negative.sum() + self.false_positive.sum() - if absolute_negatives != 0: - specificity = self.true_negative.sum()/absolute_negatives - absolute_positives = self.true_positive.sum() + self.false_negative.sum() - if absolute_positives != 0: - recall = self.true_positive.sum()/absolute_positives - - if absolute_positives == 0: - return 2. * specificity - 1 - elif absolute_negatives == 0: - return 2. * recall - 1 - else: - return specificity + recall - 1 - - if self.average == 'macro': - class_specific = [] - for i in range(self.num_classes): - class_tp = self.true_positive[i] - class_tn = self.true_negative[i] - class_fp = self.false_positive[i] - class_fn = self.false_negative[i] - - specificity, recall = 0., 0. 
- absolute_negatives = class_tn + class_fp - if absolute_negatives != 0: - specificity = class_tn / absolute_negatives - absolute_positives = class_tp + class_fn - if absolute_positives != 0: - recall = class_tp / absolute_positives - - if absolute_positives == 0: - class_specific.append(2. * specificity - 1) - elif absolute_negatives == 0: - class_specific.append(2. * recall - 1) - else: - class_specific.append(specificity + recall - 1) - average = torch.sum(torch.Tensor(class_specific)) / self.num_classes - return average.to(self.device) diff --git a/refactor/util/results_csv.py b/refactor/util/results_csv.py deleted file mode 100644 index be0ff84..0000000 --- a/refactor/util/results_csv.py +++ /dev/null @@ -1,53 +0,0 @@ -import os - -import numpy as np -import pandas as pd - - -class CSVlog: - def __init__(self, file, autoflush=True, verbose=False): - self.file = file - self.columns = ['method', - 'setting', - 'optimc', - 'sif', - 'zscore', - 'l2', - 'dataset', - 'time_tr', - 'time_te', - 'lang', - 'macrof1', - 'microf1', - 'macrok', - 'microk', - 'notes'] - self.autoflush = autoflush - self.verbose = verbose - if os.path.exists(file): - self.tell('Loading existing file from {}'.format(file)) - self.df = pd.read_csv(file, sep='\t') - else: - self.tell('File {} does not exist. Creating new frame.'.format(file)) - dir = os.path.dirname(self.file) - if dir and not os.path.exists(dir): os.makedirs(dir) - self.df = pd.DataFrame(columns=self.columns) - - def already_calculated(self, id): - return (self.df['id'] == id).any() - - def add_row(self, method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, - macrof1, microf1, macrok=np.nan, microk=np.nan, notes=''): - s = pd.Series([method, setting, optimc, sif, zscore, l2, dataset, time_tr, time_te, lang, - macrof1, microf1, macrok, microk, notes], - index=self.columns) - self.df = self.df.append(s, ignore_index=True) - if self.autoflush: self.flush() - self.tell(s.to_string()) - - def flush(self): - self.df.to_csv(self.file, index=False, sep='\t') - - def tell(self, msg): - if self.verbose: - print(msg) diff --git a/refactor/util/standardizer.py b/refactor/util/standardizer.py deleted file mode 100644 index 429bccd..0000000 --- a/refactor/util/standardizer.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np - - -class StandardizeTransformer: - def __init__(self, axis=0, range=None): - """ - - :param axis: - :param range: - """ - assert range is None or isinstance(range, slice), 'wrong format for range, should either be None or a slice' - self.axis = axis - self.yetfit = False - self.range = range - - def fit(self, X): - print('Applying z-score standardization...') - std=np.std(X, axis=self.axis, ddof=1) - self.std = np.clip(std, 1e-5, None) - self.mean = np.mean(X, axis=self.axis) - if self.range is not None: - ones = np.ones_like(self.std) - zeros = np.zeros_like(self.mean) - ones[self.range] = self.std[self.range] - zeros[self.range] = self.mean[self.range] - self.std = ones - self.mean = zeros - self.yetfit=True - return self - - def transform(self, X): - if not self.yetfit: 'transform called before fit' - return (X - self.mean) / self.std - - def fit_transform(self, X): - return self.fit(X).transform(X) \ No newline at end of file diff --git a/refactor/view_generators.py b/refactor/view_generators.py deleted file mode 100644 index 384ec76..0000000 --- a/refactor/view_generators.py +++ /dev/null @@ -1,375 +0,0 @@ -""" -This module contains the view generators that take care of computing the view specific document 
embeddings: - -- VanillaFunGen (-x) cast document representations encoded via TFIDF into posterior probabilities by means of SVM. - -- WordClassGen (-w): generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - -- MuseGen (-m): generates document representation via MUSE embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - -- RecurrentGen (-g): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). - Output dimension is (n_docs, 512). - -- View generator (-b): generates document embedding via mBERT model. -""" -from abc import ABC, abstractmethod -from time import time - -from pytorch_lightning import Trainer -from pytorch_lightning.loggers import TensorBoardLogger - -from data.datamodule import RecurrentDataModule, BertDataModule, tokenize -from models.learners import * -from models.pl_bert import BertModel -from models.pl_gru import RecurrentModel -from util.common import TfidfVectorizerMultilingual, _normalize -from util.embeddings_manager import MuseLoader, XdotM, wce_matrix - - -class ViewGen(ABC): - """ - Abstract class for ViewGenerators implementations. Every ViewGen should implement these three methods in order to - be seamlessly integrated in the overall architecture. - """ - @abstractmethod - def fit(self, lX, ly): - pass - - @abstractmethod - def transform(self, lX): - pass - - @abstractmethod - def fit_transform(self, lX, ly): - pass - - -class VanillaFunGen(ViewGen): - """ - View Generator (x): original funnelling architecture proposed by Moreo, Esuli and - Sebastiani in DOI: https://doi.org/10.1145/3326065 - """ - def __init__(self, base_learner, first_tier_parameters=None, n_jobs=-1): - """ - Init Posterior Probabilities embedder (i.e., VanillaFunGen) - :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to - return posterior probabilities. - :param base_learner: - :param n_jobs: integer, number of concurrent workers - """ - super().__init__() - self.learners = base_learner - self.first_tier_parameters = first_tier_parameters - self.n_jobs = n_jobs - self.doc_projector = NaivePolylingualClassifier(base_learner=self.learners, - parameters=self.first_tier_parameters, n_jobs=self.n_jobs) - self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - def fit(self, lX, lY): - print('# Fitting VanillaFunGen (X)...') - lX = self.vectorizer.fit_transform(lX) - self.doc_projector.fit(lX, lY) - return self - - def transform(self, lX): - """ - (1) Vectorize documents; (2) Project them according to the learners SVMs, finally (3) Apply L2 normalization - to the projection and returns it. - :param lX: dict {lang: indexed documents} - :return: document projection to the common latent space. - """ - lX = self.vectorizer.transform(lX) - lZ = self.doc_projector.predict_proba(lX) - lZ = _normalize(lZ, l2=True) - return lZ - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class MuseGen(ViewGen): - """ - View Generator (m): generates document representation via MUSE embeddings (Fasttext multilingual word - embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - def __init__(self, muse_dir='../embeddings', n_jobs=-1): - """ - Init the MuseGen. 
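
Every generator in this module follows the same fit/transform/fit_transform contract declared by ViewGen; as an illustration only, a minimal custom view generator (hypothetical class, placed alongside the ones above) could be written as:

from util.common import TfidfVectorizerMultilingual, _normalize
from view_generators import ViewGen

class IdentityGen(ViewGen):
    """Toy view generator: the L2-normalized TF-IDF vectors themselves are the 'view'."""
    def __init__(self):
        self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True)

    def fit(self, lX, ly):
        self.vectorizer.fit(lX)
        return self

    def transform(self, lX):
        return _normalize(self.vectorizer.transform(lX), l2=True)

    def fit_transform(self, lX, ly):
        return self.fit(lX, ly).transform(lX)
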
- :param muse_dir: string, path to folder containing muse embeddings - :param n_jobs: int, number of concurrent workers - """ - super().__init__() - self.muse_dir = muse_dir - self.n_jobs = n_jobs - self.langs = None - self.lMuse = None - self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - def fit(self, lX, ly): - """ - (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. - :param lX: dict {lang: indexed documents} - :param ly: dict {lang: target vectors} - :return: self. - """ - print('# Fitting MuseGen (M)...') - self.vectorizer.fit(lX) - self.langs = sorted(lX.keys()) - self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir) - lVoc = self.vectorizer.vocabulary() - self.lMuse = self.lMuse.extract(lVoc) # overwriting lMuse with dict {lang : embed_matrix} with only known words - # TODO: featureweight.fit - return self - - def transform(self, lX): - """ - (1) Vectorize documents; (2) computes the weighted sum of MUSE embeddings found at document level, - finally (3) Apply L2 normalization embedding and returns it. - :param lX: dict {lang: indexed documents} - :return: document projection to the common latent space. - """ - lX = self.vectorizer.transform(lX) - XdotMUSE = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) - lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)} - lZ = _normalize(lZ, l2=True) - return lZ - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class WordClassGen(ViewGen): - """ - View Generator (w): generates document representation via Word-Class-Embeddings. - Document embeddings are obtained via weighted sum of document's constituent embeddings. - """ - def __init__(self, n_jobs=-1): - """ - Init WordClassGen. - :param n_jobs: int, number of concurrent workers - """ - super().__init__() - self.n_jobs = n_jobs - self.langs = None - self.lWce = None - self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) - - def fit(self, lX, ly): - """ - (1) Vectorize documents; (2) Load muse embeddings for words encountered while vectorizing. - :param lX: dict {lang: indexed documents} - :param ly: dict {lang: target vectors} - :return: self. - """ - print('# Fitting WordClassGen (W)...') - lX = self.vectorizer.fit_transform(lX) - self.langs = sorted(lX.keys()) - wce = Parallel(n_jobs=self.n_jobs)( - delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs) - self.lWce = {l: wce[i] for i, l in enumerate(self.langs)} - # TODO: featureweight.fit() - return self - - def transform(self, lX): - """ - (1) Vectorize documents; (2) computes the weighted sum of Word-Class Embeddings found at document level, - finally (3) Apply L2 normalization embedding and returns it. - :param lX: dict {lang: indexed documents} - :return: document projection to the common latent space. - """ - lX = self.vectorizer.transform(lX) - XdotWce = Parallel(n_jobs=self.n_jobs)( - delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs) - lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)} - lWce = _normalize(lWce, l2=True) - return lWce - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class RecurrentGen(ViewGen): - """ - View Generator (G): generates document embedding by means of a Gated Recurrent Units. The model can be - initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, ecc.,). - Output dimension is (n_docs, 512). 
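
Both MuseGen and WordClassGen reduce each document to a weighted sum of word vectors: a TF-IDF row times an embedding matrix, followed by the SIF cleanup (removal of the first principal component). The core operation, sketched on toy dense matrices:

import numpy as np
from util.SIF_embed import remove_pc

rng = np.random.default_rng(0)
X = rng.random((5, 100))        # toy TF-IDF matrix: 5 documents over a 100-word vocabulary
M = rng.random((100, 300))      # toy embedding matrix: one 300-d vector per vocabulary word

E = X.dot(M)                    # weighted sum of word embeddings, one row per document
E = remove_pc(E, npc=1)         # SIF: remove the projection on the first principal component
print(E.shape)                  # (5, 300)
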
The training will happen end-to-end. At inference time, the model returns - the network internal state at the second feed-forward layer level. Training metrics are logged via TensorBoard. - """ - def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, nepochs=50, - gpus=0, n_jobs=-1, stored_path=None): - """ - Init RecurrentGen. - :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents - indexed by language code. - :param pretrained_embeddings: dict {lang: tensor of embeddings}, it contains the pretrained embeddings to use - as embedding layer. - :param wce: Bool, whether to deploy Word-Class Embeddings (as proposed by A. Moreo). If True, supervised - embeddings are concatenated to the deployed supervised embeddings. WCE dimensionality is equal to - the number of target classes. - :param batch_size: int, number of samples in a batch. - :param nepochs: int, number of max epochs to train the model. - :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. - :param n_jobs: int, number of concurrent workers (i.e., parallelizing data loading). - :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. - """ - super().__init__() - self.multilingualIndex = multilingualIndex - self.langs = multilingualIndex.langs - self.batch_size = batch_size - self.gpus = gpus - self.n_jobs = n_jobs - self.stored_path = stored_path - self.nepochs = nepochs - - # EMBEDDINGS to be deployed - self.pretrained = pretrained_embeddings - self.wce = wce - - self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) - self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) - self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='rnn', default_hp_metric=False) - # self.logger = CSVLogger(save_dir='csv_logs', name='rnn_dev') - - def _init_model(self): - if self.stored_path: - lpretrained = self.multilingualIndex.l_embeddings() - return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained) - else: - lpretrained = self.multilingualIndex.l_embeddings() - langs = self.multilingualIndex.langs - output_size = self.multilingualIndex.get_target_dim() - hidden_size = 512 - lvocab_size = self.multilingualIndex.l_vocabsize() - learnable_length = 0 - return RecurrentModel( - lPretrained=lpretrained, - langs=langs, - output_size=output_size, - hidden_size=hidden_size, - lVocab_size=lvocab_size, - learnable_length=learnable_length, - drop_embedding_range=self.multilingualIndex.sup_range, - drop_embedding_prop=0.5, - gpus=self.gpus - ) - - def fit(self, lX, ly): - """ - Train the Neural Network end-to-end. - lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation - of the Dataset object (RecurrentDataset) in the GfunDataModule class. - :param lX: dict {lang: indexed documents} - :param ly: dict {lang: target vectors} - :return: self. 
- """ - print('# Fitting RecurrentGen (G)...') - recurrentDataModule = RecurrentDataModule(self.multilingualIndex, batchsize=self.batch_size, n_jobs=self.n_jobs) - trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=self.nepochs, - checkpoint_callback=False) - - # vanilla_torch_model = torch.load( - # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') - # self.model.linear0 = vanilla_torch_model.linear0 - # self.model.linear1 = vanilla_torch_model.linear1 - # self.model.linear2 = vanilla_torch_model.linear2 - # self.model.rnn = vanilla_torch_model.rnn - - trainer.fit(self.model, datamodule=recurrentDataModule) - trainer.test(self.model, datamodule=recurrentDataModule) - return self - - def transform(self, lX): - """ - Project documents to the common latent space. Output dimensionality is 512. - :param lX: dict {lang: indexed documents} - :return: documents projected to the common latent space. - """ - l_pad = self.multilingualIndex.l_pad() - data = self.multilingualIndex.l_devel_index() - self.model.to('cuda' if self.gpus else 'cpu') - self.model.eval() - time_init = time() - l_embeds = self.model.encode(data, l_pad, batch_size=256) - transform_time = round(time() - time_init, 3) - print(f'Executed! Transform took: {transform_time}') - return l_embeds - - def fit_transform(self, lX, ly): - return self.fit(lX, ly).transform(lX) - - -class BertGen(ViewGen): - """ - View Generator (b): generates document embedding via Bert model. The training happens end-to-end. - At inference time, the model returns the network internal state at the last original layer (i.e. 12th). Document - embeddings are the state associated with the "start" token. Training metrics are logged via TensorBoard. - """ - def __init__(self, multilingualIndex, batch_size=128, nepochs=50, gpus=0, n_jobs=-1, stored_path=None): - """ - Init Bert model - :param multilingualIndex: MultilingualIndex, it is a dictionary of training and test documents - indexed by language code. - :param batch_size: int, number of samples per batch. - :param nepochs: int, number of max epochs to train the model. - :param gpus: int, specifies how many GPUs to use per node. If False computation will take place on cpu. - :param n_jobs: int, number of concurrent workers. - :param stored_path: str, path to a pretrained model. If None the model will be trained from scratch. - """ - super().__init__() - self.multilingualIndex = multilingualIndex - self.nepochs = nepochs - self.gpus = gpus - self.batch_size = batch_size - self.n_jobs = n_jobs - self.stored_path = stored_path - self.model = self._init_model() - self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert', default_hp_metric=False) - - def _init_model(self): - output_size = self.multilingualIndex.get_target_dim() - return BertModel(output_size=output_size, stored_path=self.stored_path, gpus=self.gpus) - - def fit(self, lX, ly): - """ - Train the Neural Network end-to-end. - lX and ly are not directly used. We rather get them from the multilingual index used in the instantiation - of the Dataset object (RecurrentDataset) in the GfunDataModule class. - :param lX: dict {lang: indexed documents} - :param ly: dict {lang: target vectors} - :return: self. 
- """ - print('# Fitting BertGen (M)...') - self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) - bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) - trainer = Trainer(gradient_clip_val=1e-1, max_epochs=self.nepochs, gpus=self.gpus, - logger=self.logger, checkpoint_callback=False) - trainer.fit(self.model, datamodule=bertDataModule) - trainer.test(self.model, datamodule=bertDataModule) - return self - - def transform(self, lX): - """ - Project documents to the common latent space. Output dimensionality is 768. - :param lX: dict {lang: indexed documents} - :return: documents projected to the common latent space. - """ - data = self.multilingualIndex.l_devel_raw_index() - data = tokenize(data, max_len=512) - self.model.to('cuda' if self.gpus else 'cpu') - self.model.eval() - time_init = time() - l_emebds = self.model.encode(data, batch_size=64) - transform_time = round(time() - time_init, 3) - print(f'Executed! Transform took: {transform_time}') - return l_emebds - - def fit_transform(self, lX, ly): - # we can assume that we have already indexed data for transform() since we are first calling fit() - return self.fit(lX, ly).transform(lX) - -