diff --git a/refactor/data/__init__.py b/refactor/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/refactor/data/datamodule.py b/refactor/data/datamodule.py new file mode 100644 index 0000000..bbb7cc1 --- /dev/null +++ b/refactor/data/datamodule.py @@ -0,0 +1,178 @@ +import torch +from torch.utils.data import Dataset, DataLoader +import numpy as np +import pytorch_lightning as pl +from transformers import BertTokenizer + +N_WORKERS = 8 + + +class RecurrentDataset(Dataset): + def __init__(self, lX, ly, lPad_index): + """ + :param lX: dict {lang_id : np.ndarray} + :param ly: + """ + self.lX = [] + self.ly = [] + self.lOffset = {} + self.lPad_index = lPad_index + + for lang, data in lX.items(): + offset = [len(self.lX)] + self.lX.extend(data) + offset.append(len(self.lX)) + self.lOffset[lang] = offset + + for lang, target in ly.items(): + self.ly.extend(target) + + def __len__(self): + return len(self.lX) + + def __getitem__(self, index): + X = self.lX[index] + y = self.ly[index] + return X, y, index, self._get_lang(index) + + def _get_lang(self, index): + for lang, l_range in self.lOffset.items(): + if index in range(l_range[0], l_range[1]): + return lang + + def collate_fn(self, data): + """ + Takes care of padding the batch and also check consistency of batch languages. Groups into dict {lang : lang_batch} + items sampled from the Dataset class. + :param data: + :return: + """ + lX_batch = {} + ly_batch = {} + current_lang = data[0][-1] + for d in data: + if d[-1] == current_lang: + if current_lang not in lX_batch.keys(): + lX_batch[current_lang] = [] + ly_batch[current_lang] = [] + lX_batch[current_lang].append(d[0]) + ly_batch[current_lang].append(d[1]) + else: + current_lang = d[-1] + lX_batch[current_lang] = [] + ly_batch[current_lang] = [] + lX_batch[current_lang].append(d[0]) + ly_batch[current_lang].append(d[1]) + + for lang in lX_batch.keys(): + # TODO: double check padding function (too many left pad tokens?) 
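+            # self.pad left-pads every sequence to the longest one in the batch (capped at
+            # max_pad_length=70) and truncates longer ones; e.g., with pad_index=0 and a pad
+            # length of 5, the sequence [7, 8, 9] becomes [0, 0, 7, 8, 9].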
+ lX_batch[lang] = self.pad(lX_batch[lang], pad_index=self.lPad_index[lang], max_pad_length=70) + # max_pad_length=self.define_pad_length(lX_batch[lang])) + lX_batch[lang] = torch.LongTensor(lX_batch[lang]) + ly_batch[lang] = torch.FloatTensor(ly_batch[lang]) + + return lX_batch, ly_batch + + @staticmethod + def define_pad_length(index_list): + lengths = [len(index) for index in index_list] + return int(np.mean(lengths) + np.std(lengths)) + + @staticmethod + def pad(index_list, pad_index, max_pad_length=None): + pad_length = np.max([len(index) for index in index_list]) + if max_pad_length is not None: + pad_length = min(pad_length, max_pad_length) + for i, indexes in enumerate(index_list): + index_list[i] = [pad_index] * (pad_length - len(indexes)) + indexes[:pad_length] + return index_list + + +class GfunDataModule(pl.LightningDataModule): + def __init__(self, multilingualIndex, batchsize=64): + """ + Pytorch-lightning DataModule: https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html + :param multilingualIndex: + :param batchsize: + """ + self.multilingualIndex = multilingualIndex + self.batchsize = batchsize + super().__init__() + + def prepare_data(self, *args, **kwargs): + pass + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == 'fit' or stage is None: + l_train_index, l_train_target = self.multilingualIndex.l_train() + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, + lPad_index=self.multilingualIndex.l_pad()) + l_val_index, l_val_target = self.multilingualIndex.l_val() + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + # Assign test dataset for use in dataloader(s) + if stage == 'test' or stage is None: + l_test_index, l_test_target = self.multilingualIndex.l_val() + self.test_dataset = RecurrentDataset(l_test_index, l_test_target, + lPad_index=self.multilingualIndex.l_pad()) + + def train_dataloader(self): + return DataLoader(self.training_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.training_dataset.collate_fn) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.val_dataset.collate_fn) + + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=self.batchsize, num_workers=N_WORKERS, + collate_fn=self.test_dataset.collate_fn) + + +class BertDataModule(GfunDataModule): + def __init__(self, multilingualIndex, batchsize=64, max_len=512): + super().__init__(multilingualIndex, batchsize) + self.max_len = max_len + + def setup(self, stage=None): + # Assign train/val datasets for use in dataloaders + if stage == 'fit' or stage is None: + l_train_raw, l_train_target = self.multilingualIndex.l_train_raw() + l_train_index = self.tokenize(l_train_raw, max_len=self.max_len) + self.training_dataset = RecurrentDataset(l_train_index, l_train_target, + lPad_index=self.multilingualIndex.l_pad()) + l_val_raw, l_val_target = self.multilingualIndex.l_val_raw() + l_val_index = self.tokenize(l_val_raw, max_len=self.max_len) + self.val_dataset = RecurrentDataset(l_val_index, l_val_target, + lPad_index=self.multilingualIndex.l_pad()) + # Assign test dataset for use in dataloader(s) + # TODO + if stage == 'test' or stage is None: + l_val_raw, l_val_target = self.multilingualIndex.l_test_raw() + l_val_index = self.tokenize(l_val_raw) + self.test_dataset = RecurrentDataset(l_val_index, l_val_target, + 
lPad_index=self.multilingualIndex.l_pad()) + + @staticmethod + def tokenize(l_raw, max_len): + # TODO: check BertTokenizerFast https://huggingface.co/transformers/model_doc/bert.html#berttokenizerfast + tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased') + l_tokenized = {} + for lang in l_raw.keys(): + output_tokenizer = tokenizer(l_raw[lang], truncation=True, max_length=max_len, padding='max_length') + l_tokenized[lang] = output_tokenizer['input_ids'] + return l_tokenized + + def train_dataloader(self): + """ + NB: Setting n_workers to > 0 will cause "OSError: [Errno 24] Too many open files" + :return: + """ + return DataLoader(self.training_dataset, batch_size=self.batchsize) + + def val_dataloader(self): + return DataLoader(self.val_dataset, batch_size=self.batchsize) + + def test_dataloader(self): + return DataLoader(self.test_dataset, batch_size=self.batchsize) diff --git a/refactor/data/dataset_builder.py b/refactor/data/dataset_builder.py new file mode 100644 index 0000000..b9650c7 --- /dev/null +++ b/refactor/data/dataset_builder.py @@ -0,0 +1,710 @@ +from os.path import join, exists +from nltk.corpus import stopwords +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer +from sklearn.preprocessing import MultiLabelBinarizer +from data.reader.jrcacquis_reader import * +from data.languages import lang_set, NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING +from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2, fetch_topic_hierarchy +from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents +import pickle +import numpy as np +from sklearn.model_selection import train_test_split +from scipy.sparse import issparse +import itertools +from tqdm import tqdm +import re +from scipy.sparse import csr_matrix + + +class MultilingualDataset: + """ + A multilingual dataset is a dictionary of training and test documents indexed by language code. + Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the + documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the + labels of each document, and ids is a list of document-identifiers from the original collection. 
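+
+    A minimal usage sketch (illustrative only; the pickle path below is a placeholder):
+
+        data = MultilingualDataset.load('rcv1-2_dataset.pickle')
+        lXtr, lYtr = data.training()   # {lang: Xtr_lang}, {lang: Ytr_lang}
+        lXte, lYte = data.test()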
+ """ + + def __init__(self): + self.dataset_name = "" + self.multiling_dataset = {} + + def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None): + self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids)) + + def save(self, file): + self.sort_indexes() + pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL) + return self + + def __getitem__(self, item): + if item in self.langs(): + return self.multiling_dataset[item] + return None + + @classmethod + def load(cls, file): + data = pickle.load(open(file, 'rb')) + data.sort_indexes() + return data + + @classmethod + def load_ids(cls, file): + data = pickle.load(open(file, 'rb')) + tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()} + te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()} + return tr_ids, te_ids + + def sort_indexes(self): + for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items(): + if issparse(Xtr): Xtr.sort_indices() + if issparse(Xte): Xte.sort_indices() + + def set_view(self, categories=None, languages=None): + if categories is not None: + if isinstance(categories, int): + categories = np.array([categories]) + elif isinstance(categories, list): + categories = np.array(categories) + self.categories_view = categories + if languages is not None: + self.languages_view = languages + + def training(self, mask_numbers=False, target_as_csr=False): + return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr) + + def test(self, mask_numbers=False, target_as_csr=False): + return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr) + + def lXtr(self, mask_numbers=False): + proc = lambda x:_mask_numbers(x) if mask_numbers else x + # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()} + return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + + def lXte(self, mask_numbers=False): + proc = lambda x: _mask_numbers(x) if mask_numbers else x + # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()} + return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()} + + def lYtr(self, as_csr=False): + lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()} + if as_csr: + lY = {l:csr_matrix(Y) for l,Y in lY.items()} + return lY + + def lYte(self, as_csr=False): + lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()} + if as_csr: + lY = {l:csr_matrix(Y) for l,Y in lY.items()} + return lY + + def cat_view(self, Y): + if hasattr(self, 'categories_view'): + return Y[:,self.categories_view] + else: + return Y + + def langs(self): + if hasattr(self, 'languages_view'): + langs = self.languages_view + else: + langs = sorted(self.multiling_dataset.keys()) + return langs + + def num_categories(self): + return self.lYtr()[self.langs()[0]].shape[1] + + def show_dimensions(self): + def shape(X): + return X.shape if hasattr(X, 'shape') else len(X) + for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): + if lang not in self.langs(): continue + print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape)) + + def show_category_prevalences(self): + nC = self.num_categories() + accum_tr = np.zeros(nC, 
dtype=np.int) + accum_te = np.zeros(nC, dtype=np.int) + in_langs = np.zeros(nC, dtype=np.int) # count languages with at least one positive example (per category) + for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items(): + if lang not in self.langs(): continue + prev_train = np.sum(self.cat_view(Ytr), axis=0) + prev_test = np.sum(self.cat_view(Yte), axis=0) + accum_tr += prev_train + accum_te += prev_test + in_langs += (prev_train>0)*1 + print(lang+'-train', prev_train) + print(lang+'-test', prev_test) + print('all-train', accum_tr) + print('all-test', accum_te) + + return accum_tr, accum_te, in_langs + + def set_labels(self, labels): + self.labels = labels + +def _mask_numbers(data): + mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b') + mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b') + mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b') + mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b') + mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b') + masked = [] + for text in tqdm(data, desc='masking numbers'): + text = ' ' + text + text = mask_moredigit.sub(' MoreDigitMask', text) + text = mask_4digit.sub(' FourDigitMask', text) + text = mask_3digit.sub(' ThreeDigitMask', text) + text = mask_2digit.sub(' TwoDigitMask', text) + text = mask_1digit.sub(' OneDigitMask', text) + masked.append(text.replace('.','').replace(',','').strip()) + return masked + + + + +# ---------------------------------------------------------------------------------------------------------------------- +# Helpers +# ---------------------------------------------------------------------------------------------------------------------- +def get_active_labels(doclist): + cat_list = set() + for d in doclist: + cat_list.update(d.categories) + return list(cat_list) + +def filter_by_categories(doclist, keep_categories): + catset = frozenset(keep_categories) + for d in doclist: + d.categories = list(set(d.categories).intersection(catset)) + +def __years_to_str(years): + if isinstance(years, list): + if len(years) > 1: + return str(years[0])+'-'+str(years[-1]) + return str(years[0]) + return str(years) + + +# ---------------------------------------------------------------------------------------------------------------------- +# Matrix builders +# ---------------------------------------------------------------------------------------------------------------------- +def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True): + """ + Builds the document-by-term weighted matrices for each language. Representations are independent of each other, + i.e., each language-specific matrix lies in a dedicate feature space. + :param dataset_name: the name of the dataset (str) + :param langs: list of languages (str) + :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param label_names: list of names of labels (str) + :param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages + :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) + :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes + by language the processed wikipedia documents in their respective language-specific feature spaces + """ + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + lW = {} + + multilingual_dataset = MultilingualDataset() + multilingual_dataset.dataset_name = dataset_name + multilingual_dataset.set_labels(mlb.classes_) + for lang in langs: + print("\nprocessing %d training, %d test, %d wiki for language <%s>" % + (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang)) + + tr_data, tr_labels, IDtr = zip(*training_docs[lang]) + te_data, te_labels, IDte = zip(*test_docs[lang]) + + if preprocess: + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True, + tokenizer=NLTKStemTokenizer(lang, verbose=True), + stop_words=stopwords.words(NLTK_LANGMAP[lang])) + else: + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) + + Xtr = tfidf.fit_transform(tr_data) + Xte = tfidf.transform(te_data) + if wiki_docs: + lW[lang] = tfidf.transform(wiki_docs[lang]) + + Ytr = mlb.transform(tr_labels) + Yte = mlb.transform(te_labels) + + multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + multilingual_dataset.show_dimensions() + multilingual_dataset.show_category_prevalences() + + if wiki_docs: + return multilingual_dataset, lW + else: + return multilingual_dataset + + +# creates a MultilingualDataset where matrices shares a single yuxtaposed feature space +def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True): + """ + Builds the document-by-term weighted matrices for each language. Representations are not independent of each other, + since all of them lie on the same yuxtaposed feature space. + :param dataset_name: the name of the dataset (str) + :param langs: list of languages (str) + :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id) + :param label_names: list of names of labels (str) + :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming) + :return: a MultilingualDataset. 
If wiki_docs has been specified, a dictionary lW is also returned, which indexes + by language the processed wikipedia documents in their respective language-specific feature spaces + """ + + multiling_dataset = MultilingualDataset() + multiling_dataset.dataset_name = dataset_name + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + multiling_dataset.set_labels(mlb.classes_) + + tr_data_stack = [] + for lang in langs: + print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang)) + tr_data, tr_labels, tr_ID = zip(*training_docs[lang]) + te_data, te_labels, te_ID = zip(*test_docs[lang]) + if preprocess: + tr_data = preprocess_documents(tr_data, lang) + te_data = preprocess_documents(te_data, lang) + tr_data_stack.extend(tr_data) + multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID) + + tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True) + tfidf.fit(tr_data_stack) + + for lang in langs: + print("\nweighting documents for language <%s>" % (lang)) + (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang] + Xtr = tfidf.transform(tr_data) + Xte = tfidf.transform(te_data) + Ytr = mlb.transform(tr_labels) + Yte = mlb.transform(te_labels) + multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID) + + multiling_dataset.show_dimensions() + return multiling_dataset + + +# ---------------------------------------------------------------------------------------------------------------------- +# Methods to recover the original documents from the MultilingualDataset's ids +# ---------------------------------------------------------------------------------------------------------------------- +""" +This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent +article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents +from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath +""" +def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath): + + tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) + assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' + langs = list(tr_ids.keys()) + + print('fetching the datasets') + rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) + + filter_by_categories(rcv1_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_documents + rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + all_docs = rcv1_documents + rcv2_documents + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + dataset = MultilingualDataset() + for lang in langs: + analyzer = CountVectorizer(strip_accents='unicode', min_df=3, + stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]]) + Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]]) + Xtr = [' '.join(analyzer(d)) for d 
in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + dataset.save(outpath) + +""" +Same thing but for JRC-Acquis +""" +def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath): + + tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath) + assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te' + langs = list(tr_ids.keys()) + + print('fetching the datasets') + + cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, + cat_filter=cat_list, cat_threshold=1, parallel=None, + most_frequent=most_common_cat) + test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, + parallel='force') + + def filter_by_id(doclist, ids): + ids_set = frozenset(itertools.chain.from_iterable(ids.values())) + return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set] + + training_docs = filter_by_id(training_docs, tr_ids) + test_docs = filter_by_id(test_docs, te_ids) + + print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names))) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + dataset = MultilingualDataset() + for lang in langs: + analyzer = CountVectorizer(strip_accents='unicode', min_df=3, + stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang]) + Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang]) + Xtr = [' '.join(analyzer(d)) for d in Xtr] + Xte = [' '.join(analyzer(d)) for d in Xte] + Ytr = mlb.transform(Ytr) + Yte = mlb.transform(Yte) + dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte) + + dataset.save(outpath) + +# ---------------------------------------------------------------------------------------------------------------------- +# Dataset Generators +# ---------------------------------------------------------------------------------------------------------------------- +def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0): + from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample + + + """ + Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the + "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. + In all cases, training documents are strictly non-parallel, and test documents are strictly parallel + :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where + all splits will be generated + :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) + :param langs: the list of languages to consider (as defined in data/languages.py) + :param train_years: a list of ints containing the years to be considered as training documents + :param test_years: a list of ints containing the years to be considered as test documents + :param cat_policy: a string indicating which category selection policy to apply. 
Valid policies are, e.g., "all" + (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the + leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details + :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all + :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) + :param run: a numeric label naming the random split (useful to keep track of different runs) + :return: None + """ + + name = 'JRCacquis' + run = '_run' + str(run) + config_name = 'jrc_nltk_' + __years_to_str(train_years) + \ + 'vs' + __years_to_str(test_years) + \ + '_' + cat_policy + \ + ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \ + '_noparallel_processed' + + indep_path = join(jrc_data_home, config_name + run + '.pickle') + upper_path = join(jrc_data_home, config_name + run + '_upper.pickle') + yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle') + wiki_path = join(jrc_data_home, config_name + run + '.wiki.pickle') + wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle') + + cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy) + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years, + cat_filter=cat_list, cat_threshold=1, parallel=None, + most_frequent=most_common_cat) + test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, + parallel='force') + + print('Generating feature-independent dataset...') + training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs) + + def _group_by_lang(doc_list, langs): + return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang] + for lang in langs} + + training_docs = _group_by_lang(training_docs, langs) + training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs) + test_docs = _group_by_lang(test_docs, langs) + if not exists(indep_path): + wiki_docs=None + if max_wiki>0: + if not exists(wiki_docs_path): + wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + + if wiki_docs: + lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs) + pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names) + + lang_data.save(indep_path) + + print('Generating upper-bound (English-only) dataset...') + if not exists(upper_path): + training_docs_eng_only = {'en':training_docs['en']} + test_docs_eng_only = {'en':test_docs['en']} + build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path) + + print('Generating yuxtaposed dataset...') + if not exists(yuxta_path): + build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path) + + +def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs, + train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0): + from 
data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample + """ + Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the + "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices. + + :param outpath: path where all splits will be dumped + :param rcv1_data_home: path to the RCV1-v2 dataset (English only) + :param rcv2_data_home: path to the RCV2 dataset (all languages other than English) + :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py) + :param langs: the list of languages to consider (as defined in data/languages.py) + :param train_for_lang: maximum number of training documents per language + :param test_for_lang: maximum number of test documents per language + :param max_wiki: the maximum number of wikipedia documents to consider (default 5000) + :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming) + :param run: a numeric label naming the random split (useful to keep track of different runs) + :return: None + """ + + assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets' + assert len(langs)>1, 'the multilingual dataset cannot be built with only one dataset' + assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \ + "languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing" + + name = 'RCV1/2' + run = '_run' + str(run) + config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\ + ('_processed' if preprocess else '_raw') + + indep_path = join(outpath, config_name + run + '.pickle') + upper_path = join(outpath, config_name + run +'_upper.pickle') + yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle') + wiki_path = join(outpath, config_name + run + '.wiki.pickle') + wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle') + + print('fetching the datasets') + rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en']) + filter_by_categories(rcv1_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_documents+rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs} + + # for the upper bound there are no parallel versions, so for the English case, we take as many documents as there + # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases + print('Generating upper-bound (English-only) dataset...') + train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True) + train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]} + test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]} + build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path) + + train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang] + for lang in langs: + if lang=='en': 
continue # already split + test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang) + train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True) + train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train] + test_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in test] + + print('Generating feature-independent dataset...') + wiki_docs=None + if max_wiki>0: + if not exists(wiki_docs_path): + wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + wiki_docs = pickle.load(open(wiki_docs_path, 'rb')) + wiki_docs = random_wiki_sample(wiki_docs, max_wiki) + + if wiki_docs: + lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) + pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL) + else: + lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess) + + lang_data.save(indep_path) + + print('Generating yuxtaposed dataset...') + build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path) + + +# ---------------------------------------------------------------------------------------------------------------------- +# Methods to generate full RCV and JRC datasets +# ---------------------------------------------------------------------------------------------------------------------- +def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs): + + + print('fetching the datasets') + rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train') + rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test') + rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en']) + + filter_by_categories(rcv1_train_documents, labels_rcv2) + filter_by_categories(rcv1_test_documents, labels_rcv2) + filter_by_categories(rcv2_documents, labels_rcv1) + + label_names = get_active_labels(rcv1_train_documents + rcv2_documents) + print('Active labels in RCV1/2 {}'.format(len(label_names))) + + print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names))) + print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents])) + + mlb = MultiLabelBinarizer() + mlb.fit([label_names]) + + all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents + lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs} + + def get_ids(doclist): + return frozenset([d.id for d in doclist]) + + tr_ids = {'en': get_ids(rcv1_train_documents)} + te_ids = {'en': get_ids(rcv1_test_documents)} + for lang in langs: + if lang == 'en': continue + tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3) + + dataset = MultilingualDataset() + dataset.dataset_name = 'RCV1/2-full' + for lang in langs: + print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents') + analyzer = CountVectorizer( + strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang]) + ).build_analyzer() + + Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in 
tr_ids[lang]])
+        Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
+        Xtr = [' '.join(analyzer(d)) for d in Xtr]
+        Xte = [' '.join(analyzer(d)) for d in Xte]
+        Ytr = mlb.transform(Ytr)
+        Yte = mlb.transform(Yte)
+        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
+
+    dataset.save(outpath)
+
+
+def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):
+
+    print('fetching the datasets')
+    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
+    training_docs, label_names = fetch_jrcacquis(
+        langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
+    )
+    test_docs, _ = fetch_jrcacquis(
+        langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
+    )
+
+    def _group_by_lang(doc_list, langs):
+        return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}
+
+    training_docs = _group_by_lang(training_docs, langs)
+    test_docs = _group_by_lang(test_docs, langs)
+
+    mlb = MultiLabelBinarizer()
+    mlb.fit([label_names])
+
+    dataset = MultilingualDataset()
+    dataset.dataset_name = 'JRC-Acquis-full'
+    for lang in langs:
+        analyzer = CountVectorizer(
+            strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
+        ).build_analyzer()
+
+        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
+        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
+        Xtr = [' '.join(analyzer(d)) for d in Xtr]
+        Xte = [' '.join(analyzer(d)) for d in Xte]
+        Ytr = mlb.transform(Ytr)
+        Yte = mlb.transform(Yte)
+        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)
+
+    dataset.save(outpath)
+
+
+#-----------------------------------------------------------------------------------------------------------------------
+# MAIN BUILDER
+#-----------------------------------------------------------------------------------------------------------------------
+
+if __name__=='__main__':
+    import sys
+    RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
+    RCV2_PATH = '../Datasets/RCV2'
+    JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
+    full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
+    # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
+    sys.exit(0)
+
+    # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle'  # '../rcv2/rcv1-2_doclist_full_processed.pickle'
+    # data = MultilingualDataset.load(datasetpath)
+    # data.dataset_name = 'JRC-Acquis-full'  # 'RCV1/2-full'
+    # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
+    #     (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
+    #     data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
+    # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')  # '../rcv2/rcv1-2_doclist_full_processed_2.pickle'
+    # sys.exit(0)
+
+    assert len(sys.argv) == 5, "wrong number of arguments; required: " \
+                               "<JRC_DATAPATH> <RCV1_PATH> <RCV2_PATH> <WIKI_DATAPATH>"
+
+    JRC_DATAPATH = sys.argv[1]  # "../Datasets/JRC_Acquis_v3"
+    RCV1_PATH = sys.argv[2]
#'../Datasets/RCV1-v2/unprocessed_corpus' + RCV2_PATH = sys.argv[3] #'../Datasets/RCV2' + WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK" + + langs = lang_set['JRC_NLTK'] + max_wiki = 5000 + + for run in range(0,10): + print('Building JRC-Acquis datasets run', run) + prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs, + train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki, + cat_policy='all', most_common_cat=300, run=run) + + print('Building RCV1-v2/2 datasets run', run) + prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'], + train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run) + + # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE + # (make sure you have not modified the above parameters, or adapt the following paths accordingly...) + # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run)) + # outpath = datasetpath.replace('_nltk_','_doclist_') + # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath) + + # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run)) + # outpath = datasetpath.replace('_nltk_', '_doclist_') + # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath) + + + diff --git a/refactor/data/languages.py b/refactor/data/languages.py new file mode 100644 index 0000000..2d03d8e --- /dev/null +++ b/refactor/data/languages.py @@ -0,0 +1,42 @@ +""" +bg = Bulgarian +cs = Czech +da = Danish +de = German +el = Greek +en = English +es = Spanish +et = Estonian +fi = Finnish +fr = French +hu = Hungarian +it = Italian +lt = Lithuanian +lv = Latvian +nl = Dutch +mt = Maltese +pl = Polish +pt = Portuguese +ro = Romanian +sk = Slovak +sl = Slovene +sv = Swedish +""" + +NLTK_LANGMAP = {'da': 'danish', 'nl': 'dutch', 'en': 'english', 'fi': 'finnish', 'fr': 'french', 'de': 'german', + 'hu': 'hungarian', 'it': 'italian', 'pt': 'portuguese', 'ro': 'romanian', 'es': 'spanish', 'sv': 'swedish'} + + +#top 10 languages in wikipedia order by the number of articles +#LANGS_10_MOST_WIKI = ['en','fr','sv','de','es','it','pt','nl','pl','ro'] + +#all languages in JRC-acquis v3 +JRC_LANGS = ['bg','cs','da','de','el','en','es','et','fi','fr','hu','it','lt','lv','mt','nl','pl','pt','ro','sk','sl','sv'] +JRC_LANGS_WITH_NLTK_STEMMING = ['da', 'nl', 'en', 'fi', 'fr', 'de', 'hu', 'it', 'pt', 'es', 'sv'] # 'romanian deleted for incompatibility issues' + +RCV2_LANGS = ['ru', 'de', 'fr', 'sv', 'no', 'da', 'pt', 'it', 'es', 'jp', 'htw', 'nl'] +RCV2_LANGS_WITH_NLTK_STEMMING = ['de', 'fr', 'sv', 'da', 'pt', 'it', 'es', 'nl'] + +lang_set = {'JRC_NLTK':JRC_LANGS_WITH_NLTK_STEMMING, 'JRC':JRC_LANGS, + 'RCV2_NLTK':RCV2_LANGS_WITH_NLTK_STEMMING, 'RCV2':RCV2_LANGS} + diff --git a/refactor/data/reader/__init__.py b/refactor/data/reader/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/refactor/data/reader/jrcacquis_reader.py b/refactor/data/reader/jrcacquis_reader.py new file mode 100644 index 0000000..c0441ed --- /dev/null +++ b/refactor/data/reader/jrcacquis_reader.py @@ -0,0 +1,321 @@ +from __future__ import print_function +import os, sys +from os.path import join +import tarfile +import xml.etree.ElementTree as ET +from sklearn.datasets import get_data_home 
+import pickle +from util.file import download_file, list_dirs, list_files +import rdflib +from rdflib.namespace import RDF, SKOS +from rdflib import URIRef +import zipfile +from data.languages import JRC_LANGS +from collections import Counter +from random import shuffle +from data.languages import lang_set + +""" +JRC Acquis' Nomenclature: +bg = Bulgarian +cs = Czech +da = Danish +de = German +el = Greek +en = English +es = Spanish +et = Estonian +fi = Finnish +fr = French +hu = Hungarian +it = Italian +lt = Lithuanian +lv = Latvian +nl = Dutch +mt = Maltese +pl = Polish +pt = Portuguese +ro = Romanian +sk = Slovak +sl = Slovene +sv = Swedish +""" + +class JRCAcquis_Document: + def __init__(self, id, name, lang, year, head, body, categories): + self.id = id + self.parallel_id = name + self.lang = lang + self.year = year + self.text = body if not head else head + "\n" + body + self.categories = categories + +# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles +# however, it seems that the title is often appearing as the first paragraph in the text/body (with +# standard codification), so it might be preferable not to read the header after all (as here by default) +def _proc_acute(text): + for ch in ['a','e','i','o','u']: + text = text.replace('%'+ch+'acute%',ch) + return text + +def parse_document(file, year, head=False): + root = ET.parse(file).getroot() + + doc_name = root.attrib['n'] # e.g., '22006A0211(01)' + doc_lang = root.attrib['lang'] # e.g., 'es' + doc_id = root.attrib['id'] # e.g., 'jrc22006A0211_01-es' + doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')] + doc_head = _proc_acute(root.find('.//text/body/head').text) if head else '' + doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')]) + + def raise_if_empty(field, from_file): + if isinstance(field, str): + if not field.strip(): + raise ValueError("Empty field in file %s" % from_file) + + raise_if_empty(doc_name, file) + raise_if_empty(doc_lang, file) + raise_if_empty(doc_id, file) + if head: raise_if_empty(doc_head, file) + raise_if_empty(doc_body, file) + + return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories) + +# removes documents without a counterpart in all other languages +def _force_parallel(doclist, langs): + n_langs = len(langs) + par_id_count = Counter([d.parallel_id for d in doclist]) + parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs]) + return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids] + +def random_sampling_avoiding_parallel(doclist): + random_order = list(range(len(doclist))) + shuffle(random_order) + sampled_request = [] + parallel_ids = set() + for ind in random_order: + pid = doclist[ind].parallel_id + if pid not in parallel_ids: + sampled_request.append(doclist[ind]) + parallel_ids.add(pid) + print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request))) + return sampled_request + + +#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter +def _filter_by_category(doclist, cat_filter): + if not isinstance(cat_filter, frozenset): + cat_filter = frozenset(cat_filter) + filtered = [] + for doc in doclist: + doc.categories = list(cat_filter & set(doc.categories)) + if doc.categories: + doc.categories.sort() + 
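+            # only documents that still have at least one category after the intersection are kept;
+            # sorting the surviving labels keeps their order deterministic across runs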
filtered.append(doc) + print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered))) + return filtered + +#filters out categories with less than cat_threshold documents (and filters documents containing those categories) +def _filter_by_frequency(doclist, cat_threshold): + cat_count = Counter() + for d in doclist: + cat_count.update(d.categories) + + freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold] + freq_categories.sort() + return _filter_by_category(doclist, freq_categories), freq_categories + +#select top most_frequent categories (and filters documents containing those categories) +def _most_common(doclist, most_frequent): + cat_count = Counter() + for d in doclist: + cat_count.update(d.categories) + + freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)] + freq_categories.sort() + return _filter_by_category(doclist, freq_categories), freq_categories + +def _get_categories(request): + final_cats = set() + for d in request: + final_cats.update(d.categories) + return list(final_cats) + +def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0, + parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'): + + assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported' + if not langs: + langs = JRC_LANGS + else: + if isinstance(langs, str): langs = [langs] + for l in langs: + if l not in JRC_LANGS: + raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l) + + if not data_path: + data_path = get_data_home() + + if not os.path.exists(data_path): + os.mkdir(data_path) + + request = [] + total_read = 0 + for l in langs: + file_name = 'jrc-'+l+'.tgz' + archive_path = join(data_path, file_name) + + if not os.path.exists(archive_path): + print("downloading language-specific dataset (once and for all) into %s" % data_path) + DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name) + download_file(DOWNLOAD_URL, archive_path) + print("untarring dataset...") + tarfile.open(archive_path, 'r:gz').extractall(data_path) + + documents_dir = join(data_path, l) + + print("Reading documents...") + read = 0 + for dir in list_dirs(documents_dir): + year = int(dir) + if years==None or year in years: + year_dir = join(documents_dir,dir) + pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle') + if os.path.exists(pickle_name): + print("loading from file %s" % pickle_name) + l_y_documents = pickle.load(open(pickle_name, "rb")) + read += len(l_y_documents) + else: + l_y_documents = [] + all_documents = list_files(year_dir) + empty = 0 + for i,doc_file in enumerate(all_documents): + try: + jrc_doc = parse_document(join(year_dir, doc_file), year) + except ValueError: + jrc_doc = None + + if jrc_doc and (not ignore_unclassified or jrc_doc.categories): + l_y_documents.append(jrc_doc) + else: empty += 1 + if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0): + print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='') + read+=1 + print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='') + print("\t\t(Pickling object for future runs in %s)" % pickle_name) + pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) + request += l_y_documents + print("Read %d documents for language %s\n" % (read, l)) + total_read += 
read + print("Read %d documents in total" % (total_read)) + + if parallel=='force': + request = _force_parallel(request, langs) + elif parallel == 'avoid': + request = random_sampling_avoiding_parallel(request) + + final_cats = _get_categories(request) + + if cat_filter: + request = _filter_by_category(request, cat_filter) + final_cats = _get_categories(request) + if cat_threshold > 0: + request, final_cats = _filter_by_frequency(request, cat_threshold) + if most_frequent != -1 and len(final_cats) > most_frequent: + request, final_cats = _most_common(request, most_frequent) + + return request, final_cats + +def print_cat_analysis(request): + cat_count = Counter() + for d in request: + cat_count.update(d.categories) + print("Number of active categories: {}".format(len(cat_count))) + print(cat_count.most_common()) + +# inspects the Eurovoc thesaurus in order to select a subset of categories +# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented +def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf', + eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip", + select="broadest"): + + fullpath_pickle = join(data_path, select+'_concepts.pickle') + if os.path.exists(fullpath_pickle): + print("Pickled object found in %s. Loading it." % fullpath_pickle) + return pickle.load(open(fullpath_pickle,'rb')) + + fullpath = join(data_path, eurovoc_skos_core_concepts_filename) + if not os.path.exists(fullpath): + print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url)) + download_file(eurovoc_url, fullpath) + print("Unzipping file...") + zipped = zipfile.ZipFile(data_path + '.zip', 'r') + zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path) + zipped.close() + + print("Parsing %s" %fullpath) + g = rdflib.Graph() + g.parse(location=fullpath, format="application/rdf+xml") + + if select == "all": + print("Selecting all concepts") + all_concepts = list(g.subjects(RDF.type, SKOS.Concept)) + all_concepts = [c.toPython().split('/')[-1] for c in all_concepts] + all_concepts.sort() + selected_concepts = all_concepts + elif select=="broadest": + print("Selecting broadest concepts (those without any other broader concept linked to it)") + all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) + narrower_concepts = set(g.subjects(SKOS.broader, None)) + broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)] + broadest_concepts.sort() + selected_concepts = broadest_concepts + elif select=="leaves": + print("Selecting leaves concepts (those not linked as broader of any other concept)") + all_concepts = set(g.subjects(RDF.type, SKOS.Concept)) + broad_concepts = set(g.objects(None, SKOS.broader)) + leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)] + leave_concepts.sort() + selected_concepts = leave_concepts + else: + raise ValueError("Selection policy %s is not currently supported" % select) + + print("%d %s concepts found" % (len(selected_concepts), leave_concepts)) + print("Pickling concept list for faster further requests in %s" % fullpath_pickle) + pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL) + + return selected_concepts + +if __name__ == '__main__': + + def single_label_fragment(doclist): + single = [d for d in doclist if len(d.categories) < 2] + final_categories = 
set([d.categories[0] if d.categories else [] for d in single]) + print('{} single-label documents ({} categories) from the original {} documents'.format(len(single), + len(final_categories), + len(doclist))) + return single, list(final_categories) + + train_years = list(range(1986, 2006)) + test_years = [2006] + cat_policy = 'leaves' + most_common_cat = 300 + # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3" + JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3" + langs = lang_set['JRC_NLTK'] + cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy) + sys.exit() + + training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat) + test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force') + + print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) + print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) + + training_docs, label_names = single_label_fragment(training_docs) + test_docs, label_namestest = single_label_fragment(test_docs) + + print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names))) + print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest))) + + diff --git a/refactor/data/reader/rcv_reader.py b/refactor/data/reader/rcv_reader.py new file mode 100644 index 0000000..cd4b416 --- /dev/null +++ b/refactor/data/reader/rcv_reader.py @@ -0,0 +1,225 @@ +from zipfile import ZipFile +import xml.etree.ElementTree as ET +from data.languages import RCV2_LANGS_WITH_NLTK_STEMMING, RCV2_LANGS +from util.file import list_files +from sklearn.datasets import get_data_home +import gzip +from os.path import join, exists +from util.file import download_file_if_not_exists +import re +from collections import Counter +import numpy as np +import sys + +""" +RCV2's Nomenclature: +ru = Russian +da = Danish +de = German +es = Spanish +lat = Spanish Latin-American (actually is also 'es' in the collection) +fr = French +it = Italian +nl = Dutch +pt = Portuguese +sv = Swedish +ja = Japanese +htw = Chinese +no = Norwegian +""" + +RCV1_TOPICHIER_URL = "http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a02-orig-topics-hierarchy/rcv1.topics.hier.orig" +RCV1PROC_BASE_URL= 'http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/a12-token-files' +RCV1_BASE_URL = "http://www.daviddlewis.com/resources/testcollections/rcv1/" +RCV2_BASE_URL = "http://trec.nist.gov/data/reuters/reuters.html" + +rcv1_test_data_gz = ['lyrl2004_tokens_test_pt0.dat.gz', + 'lyrl2004_tokens_test_pt1.dat.gz', + 'lyrl2004_tokens_test_pt2.dat.gz', + 'lyrl2004_tokens_test_pt3.dat.gz'] + +rcv1_train_data_gz = ['lyrl2004_tokens_train.dat.gz'] + +rcv1_doc_cats_data_gz = 'rcv1-v2.topics.qrels.gz' + +RCV2_LANG_DIR = {'ru':'REUTE000', + 'de':'REUTE00A', + 'fr':'REUTE00B', + 'sv':'REUTE001', + 'no':'REUTE002', + 'da':'REUTE003', + 'pt':'REUTE004', + 'it':'REUTE005', + 'es':'REUTE006', + 'lat':'REUTE007', + 'jp':'REUTE008', + 'htw':'REUTE009', + 'nl':'REUTERS_'} + + +class RCV_Document: + + def __init__(self, id, text, categories, date='', lang=None): + self.id = id + self.date = date + self.lang = lang + self.text = text + self.categories = categories + + +class ExpectedLanguageException(Exception): pass +class IDRangeException(Exception): pass + + +nwords = [] + +def 
parse_document(xml_content, assert_lang=None, valid_id_range=None): + root = ET.fromstring(xml_content) + if assert_lang: + if assert_lang not in root.attrib.values(): + if assert_lang != 'jp' or 'ja' not in root.attrib.values(): # some documents are attributed to 'ja', others to 'jp' + raise ExpectedLanguageException('error: document of a different language') + + doc_id = root.attrib['itemid'] + if valid_id_range is not None: + if not valid_id_range[0] <= int(doc_id) <= valid_id_range[1]: + raise IDRangeException + + doc_categories = [cat.attrib['code'] for cat in + root.findall('.//metadata/codes[@class="bip:topics:1.0"]/code')] + + doc_date = root.attrib['date'] + doc_title = root.find('.//title').text + doc_headline = root.find('.//headline').text + doc_body = '\n'.join([p.text for p in root.findall('.//text/p')]) + + if not doc_body: + raise ValueError('Empty document') + + if doc_title is None: doc_title = '' + if doc_headline is None or doc_headline in doc_title: doc_headline = '' + text = '\n'.join([doc_title, doc_headline, doc_body]).strip() + + text_length = len(text.split()) + global nwords + nwords.append(text_length) + + return RCV_Document(id=doc_id, text=text, categories=doc_categories, date=doc_date, lang=assert_lang) + + +def fetch_RCV1(data_path, split='all'): + + assert split in ['train', 'test', 'all'], 'split should be "train", "test", or "all"' + + request = [] + labels = set() + read_documents = 0 + lang = 'en' + + training_documents = 23149 + test_documents = 781265 + + if split == 'all': + split_range = (2286, 810596) + expected = training_documents+test_documents + elif split == 'train': + split_range = (2286, 26150) + expected = training_documents + else: + split_range = (26151, 810596) + expected = test_documents + + global nwords + nwords=[] + for part in list_files(data_path): + if not re.match('\d+\.zip', part): continue + target_file = join(data_path, part) + assert exists(target_file), \ + "You don't seem to have the file "+part+" in " + data_path + ", and the RCV1 corpus can not be downloaded"+\ + " w/o a formal permission. Please, refer to " + RCV1_BASE_URL + " for more information." + zipfile = ZipFile(target_file) + for xmlfile in zipfile.namelist(): + xmlcontent = zipfile.open(xmlfile).read() + try: + doc = parse_document(xmlcontent, assert_lang=lang, valid_id_range=split_range) + labels.update(doc.categories) + request.append(doc) + read_documents += 1 + except ValueError: + print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(part+'/'+xmlfile, lang)) + except (IDRangeException, ExpectedLanguageException) as e: + pass + print('\r[{}] read {} documents'.format(part, len(request)), end='') + if read_documents == expected: break + if read_documents == expected: break + print() + print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) + return request, list(labels) + + +def fetch_RCV2(data_path, languages=None): + + if not languages: + languages = list(RCV2_LANG_DIR.keys()) + else: + assert set(languages).issubset(set(RCV2_LANG_DIR.keys())), 'languages not in scope' + + request = [] + labels = set() + global nwords + nwords=[] + for lang in languages: + path = join(data_path, RCV2_LANG_DIR[lang]) + lang_docs_read = 0 + for part in list_files(path): + target_file = join(path, part) + assert exists(target_file), \ + "You don't seem to have the file "+part+" in " + path + ", and the RCV2 corpus can not be downloaded"+\ + " w/o a formal permission. 
Please, refer to " + RCV2_BASE_URL + " for more information." + zipfile = ZipFile(target_file) + for xmlfile in zipfile.namelist(): + xmlcontent = zipfile.open(xmlfile).read() + try: + doc = parse_document(xmlcontent, assert_lang=lang) + labels.update(doc.categories) + request.append(doc) + lang_docs_read += 1 + except ValueError: + print('\n\tskipping document {} with inconsistent language label: expected language {}'.format(RCV2_LANG_DIR[lang]+'/'+part+'/'+xmlfile, lang)) + except (IDRangeException, ExpectedLanguageException) as e: + pass + print('\r[{}] read {} documents, {} for language {}'.format(RCV2_LANG_DIR[lang]+'/'+part, len(request), lang_docs_read, lang), end='') + print() + print('ave:{} std {} min {} max {}'.format(np.mean(nwords), np.std(nwords), np.min(nwords), np.max(nwords))) + return request, list(labels) + + +def fetch_topic_hierarchy(path, topics='all'): + assert topics in ['all', 'leaves'] + + download_file_if_not_exists(RCV1_TOPICHIER_URL, path) + hierarchy = {} + for line in open(path, 'rt'): + parts = line.strip().split() + parent,child = parts[1],parts[3] + if parent not in hierarchy: + hierarchy[parent]=[] + hierarchy[parent].append(child) + + del hierarchy['None'] + del hierarchy['Root'] + print(hierarchy) + + if topics=='all': + topics = set(hierarchy.keys()) + for parent in hierarchy.keys(): + topics.update(hierarchy[parent]) + return list(topics) + elif topics=='leaves': + parents = set(hierarchy.keys()) + childs = set() + for parent in hierarchy.keys(): + childs.update(hierarchy[parent]) + return list(childs.difference(parents)) + + diff --git a/refactor/data/reader/wikipedia_tools.py b/refactor/data/reader/wikipedia_tools.py new file mode 100644 index 0000000..83e11e3 --- /dev/null +++ b/refactor/data/reader/wikipedia_tools.py @@ -0,0 +1,304 @@ +from __future__ import print_function +# import ijson +# from ijson.common import ObjectBuilder +import os, sys +from os.path import join +from bz2 import BZ2File +import pickle +from util.file import list_dirs, list_files, makedirs_if_not_exist +from itertools import islice +import re +from xml.sax.saxutils import escape +import numpy as np + +policies = ["IN_ALL_LANGS", "IN_ANY_LANG"] + +""" +This file contains a set of tools for processing the Wikipedia multilingual documents. +In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/) +and have processed each document to clean their texts with one of the tools: + - https://github.com/aesuli/wikipediatools (Python 2) + - https://github.com/aesuli/wikipedia-extractor (Python 3) +It is also assumed you have dowloaded the all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) + +This tools help you in: + - Processes the huge json file as a stream, and create a multilingual map of corresponding titles for each language. + Set the policy = "IN_ALL_LANGS" will extract only titles which appear in all (AND) languages, whereas "IN_ANY_LANG" + extracts all titles appearing in at least one (OR) language (warning: this will creates a huge dictionary). + Note: This version is quite slow. Although it is run once for all, you might be prefer to take a look at "Wikidata in BigQuery". + - Processes the huge json file as a stream a creates a simplified file which occupies much less and is far faster to be processed. + - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all + language-specific versions from the document. 
+ - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents, + in a way that the i-th element from any list refers to the same element in the respective language. +""" + +def _doc_generator(text_path, langs): + dotspace = re.compile(r'\.(?!\s)') + for l,lang in enumerate(langs): + print("Processing language <%s> (%d/%d)" % (lang, l, len(langs))) + lang_dir = join(text_path, lang) + split_dirs = list_dirs(lang_dir) + for sd,split_dir in enumerate(split_dirs): + print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs))) + split_files = list_files(join(lang_dir, split_dir)) + for sf,split_file in enumerate(split_files): + print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files))) + with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi: + while True: + doc_lines = list(islice(fi, 3)) + if doc_lines: + # some sentences are not followed by a space after the dot + doc_lines[1] = dotspace.sub('. ', doc_lines[1]) + # [workaround] I found   html symbol was not treated, and unescaping it now might not help... + doc_lines[1] = escape(doc_lines[1].replace(" ", " ")) + yield doc_lines, lang + else: break + +def _extract_title(doc_lines): + m = re.search('title="(.+?)"', doc_lines[0]) + if m: return m.group(1).decode('utf-8') + else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0]) + +def _create_doc(target_file, id, doc, lang): + doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang) + with open(target_file, 'w') as fo: + fo.write('\n'%id) + [fo.write(line) for line in doc] + fo.write('') + +def _append_doc(target_file, doc, lang): + doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang) + with open(target_file, 'r', buffering=1024*1024) as fi: + lines = fi.readlines() + if doc[0] in lines[1::3]: + return + lines[-1:-1]=doc + with open(target_file, 'w', buffering=1024*1024) as fo: + [fo.write(line) for line in lines] + +def extract_multilingual_documents(inv_dict, langs, text_path, out_path): + if not os.path.exists(out_path): + os.makedirs(out_path) + for lang in langs: + if lang not in inv_dict: + raise ValueError("Lang %s is not in the dictionary" % lang) + + docs_created = len(list_files(out_path)) + print("%d multilingual documents found." % docs_created) + for doc,lang in _doc_generator(text_path, langs): + title = _extract_title(doc) + + if title in inv_dict[lang]: + #pass + ids = inv_dict[lang][title] + for id in ids: + target_file = join(out_path, id) + ".xml" + if os.path.exists(target_file): + _append_doc(target_file, doc, lang) + else: + _create_doc(target_file, id, doc, lang) + docs_created+=1 + else: + if not re.match('[A-Za-z]+', title): + print("Title <%s> for lang <%s> not in dictionary" % (title, lang)) + + + +def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True): + simplified_file = join(data_dir,filename) + + if policy not in policies: + raise ValueError("Policy %s not supported." % policy) + print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs))) + + lang_prefix = list(langs) + lang_prefix.sort() + pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy + pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle") + pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle") + if os.path.exists(pickle_invdict): + if return_both and os.path.exists(pickle_dict): + print("Pickled files found in %s. 
Loading both (direct and inverse dictionaries)." % data_dir)
+            return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
+        elif return_both==False:
+            print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
+            return pickle.load(open(pickle_invdict, 'rb'))
+
+    multiling_titles = {}
+    inv_dict = {lang:{} for lang in langs}
+
+    def process_entry(line):
+        # the bz2 stream yields bytes: decode once here instead of decoding each title below
+        parts = line.decode('utf-8').strip().split('\t')
+        id = parts[0]
+        if id in multiling_titles:
+            raise ValueError("id <%s> already indexed" % id)
+
+        titles = dict(((lang_title[:lang_title.find(':')], lang_title[lang_title.find(':')+1:]) for lang_title in parts[1:]))
+        for lang in list(titles.keys()):  # iterate over a copy: deleting while iterating raises RuntimeError in Python 3
+            if lang not in langs:
+                del titles[lang]
+
+        if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\
+                or (policy == "IN_ANY_LANG" and len(titles) > 0):
+            multiling_titles[id] = titles
+            for lang, title in titles.items():
+                if title in inv_dict[lang]:
+                    inv_dict[lang][title].append(id)
+                else:
+                    inv_dict[lang][title] = [id]
+
+    with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
+        completed = 0
+        try:
+            for line in fi:
+                process_entry(line)
+                completed += 1
+                if completed % 10 == 0:
+                    print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="")
+            print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
+        except EOFError:
+            print("\nUnexpected file ending... saving anyway")
+
+    print("Pickling dictionaries in %s" % data_dir)
+    pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL)
+    pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
+    print("Done")
+
+    return (multiling_titles, inv_dict) if return_both else inv_dict
+
+
+# in https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2
+def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2"):
+    latest_all_json_file = join(data_dir, json_file)
+
+    if policy not in policies:
+        raise ValueError("Policy %s not supported." % policy)
+
+    print("extracting multilingual titles with policy %s (%s)" % (policy, ' '.join(langs)))
+
+    lang_prefix = list(langs)
+    lang_prefix.sort()
+    simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "."
+ policy) + + def process_entry(last, fo): + global written + id = last["id"] + titles = None + if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()): + titles = {lang: last["labels"][lang]["value"] for lang in langs} + elif policy == "IN_ANY_LANG": + titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]} + + if titles: + fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8')) + return True + else: + return False + + written = 0 + with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \ + BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo: + builder = ObjectBuilder() + completed = 0 + for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16): + builder.event(event, value) + if len(builder.value)>1: + if process_entry(builder.value.pop(0), fo): written += 1 + completed += 1 + print("\rCompleted %d\ttitles %d" % (completed,written), end="") + print("") + + #process the last entry + process_entry(builder.value.pop(0)) + + return simple_titles_path + +""" +Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the +specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language- +specific version of the same document. Documents are forced to contain version in all specified languages and to contain +a minimum number of words; otherwise it is discarded. +""" +class MinWordsNotReached(Exception): pass +class WrongDocumentFormat(Exception): pass + +def _load_multilang_doc(path, langs, min_words=100): + import xml.etree.ElementTree as ET + from xml.etree.ElementTree import Element, ParseError + try: + root = ET.parse(path).getroot() + doc = {} + for lang in langs: + doc_body = root.find('.//doc[@lang="' + lang + '"]') + if isinstance(doc_body, Element): + n_words = len(doc_body.text.split(' ')) + if n_words >= min_words: + doc[lang] = doc_body.text + else: + raise MinWordsNotReached + else: + raise WrongDocumentFormat + except ParseError: + raise WrongDocumentFormat + return doc + +#returns the multilingual documents mapped by language, and a counter with the number of documents readed +def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None): + if pickle_name and os.path.exists(pickle_name): + print("unpickling %s" % pickle_name) + return pickle.load(open(pickle_name, 'rb')) + + multi_docs = list_files(wiki_multi_path) + mling_documents = {l:[] for l in langs} + valid_documents = 0 + minwords_exception = 0 + wrongdoc_exception = 0 + for d,multi_doc in enumerate(multi_docs): + print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" % + (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="") + doc_path = join(wiki_multi_path, multi_doc) + try: + m_doc = _load_multilang_doc(doc_path, langs, min_words) + valid_documents += 1 + for l in langs: + mling_documents[l].append(m_doc[l]) + except MinWordsNotReached: + minwords_exception += 1 + if deletions: os.remove(doc_path) + except WrongDocumentFormat: + wrongdoc_exception += 1 + if deletions: os.remove(doc_path) + if max_documents>0 and valid_documents>=max_documents: + break + + if pickle_name: + print("Pickling wikipedia documents object in %s" % pickle_name) + pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) + + return mling_documents + 
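The docstring above states an alignment invariant that downstream code relies on: every per-language list returned by `fetch_wikipedia_multilingual` has the same length, and the i-th element of each list is the same article in each language. A minimal usage sketch follows; the path and language pair are hypothetical, and it assumes the `refactor/` directory is importable so that `data.reader.wikipedia_tools` resolves.

```python
# Usage sketch only (hypothetical path and languages, not part of this diff).
from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample

langs = ['en', 'it']
l_wiki = fetch_wikipedia_multilingual('/path/to/multilingual_docs', langs, min_words=100)

# every language list has the same length, and position i is the same article in all languages
assert len(set(len(l_wiki[lang]) for lang in langs)) == 1

l_wiki = random_wiki_sample(l_wiki, max_documents=5000)  # subsampling preserves the alignment
first_article_en, first_article_it = l_wiki['en'][0], l_wiki['it'][0]
```

`random_wiki_sample` (defined just below) selects the same random positions from every language list, so subsampling never breaks this alignment.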
+def random_wiki_sample(l_wiki, max_documents): + if max_documents == 0: return None + langs = list(l_wiki.keys()) + assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned' + ndocs_per_lang = len(l_wiki[langs[0]]) + if ndocs_per_lang > max_documents: + sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False)) + for lang in langs: + l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel] + return l_wiki + + +if __name__ == "__main__": + + wikipedia_home = "../Datasets/Wikipedia" + + from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs + langs = frozenset(langs) + + simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2") + _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS') + extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'), + out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK')) + + diff --git a/refactor/data/text_preprocessor.py b/refactor/data/text_preprocessor.py new file mode 100644 index 0000000..1a6e3ae --- /dev/null +++ b/refactor/data/text_preprocessor.py @@ -0,0 +1,33 @@ +from nltk.corpus import stopwords +from data.languages import NLTK_LANGMAP +from nltk import word_tokenize +from nltk.stem import SnowballStemmer + + +def preprocess_documents(documents, lang): + tokens = NLTKStemTokenizer(lang, verbose=True) + sw = stopwords.words(NLTK_LANGMAP[lang]) + return [' '.join([w for w in tokens(doc) if w not in sw]) for doc in documents] + + +class NLTKStemTokenizer(object): + + def __init__(self, lang, verbose=False): + if lang not in NLTK_LANGMAP: + raise ValueError('Language %s is not supported in NLTK' % lang) + self.verbose=verbose + self.called = 0 + self.wnl = SnowballStemmer(NLTK_LANGMAP[lang]) + self.cache = {} + + def __call__(self, doc): + self.called += 1 + if self.verbose: + print("\r\t\t[documents processed %d]" % (self.called), end="") + tokens = word_tokenize(doc) + stems = [] + for t in tokens: + if t not in self.cache: + self.cache[t] = self.wnl.stem(t) + stems.append(self.cache[t]) + return stems \ No newline at end of file diff --git a/refactor/data/tsr_function__.py b/refactor/data/tsr_function__.py new file mode 100755 index 0000000..0af8690 --- /dev/null +++ b/refactor/data/tsr_function__.py @@ -0,0 +1,270 @@ +import math +import numpy as np +from scipy.stats import t +from joblib import Parallel, delayed +from scipy.sparse import csr_matrix, csc_matrix + + +def get_probs(tpr, fpr, pc): + # tpr = p(t|c) = p(tp)/p(c) = p(tp)/(p(tp)+p(fn)) + # fpr = p(t|_c) = p(fp)/p(_c) = p(fp)/(p(fp)+p(tn)) + pnc = 1.0 - pc + tp = tpr * pc + fn = pc - tp + fp = fpr * pnc + tn = pnc - fp + return ContTable(tp=tp, fn=fn, fp=fp, tn=tn) + + +def apply_tsr(tpr, fpr, pc, tsr): + cell = get_probs(tpr, fpr, pc) + return tsr(cell) + + +def positive_information_gain(cell): + if cell.tpr() < cell.fpr(): + return 0.0 + else: + return information_gain(cell) + + +def posneg_information_gain(cell): + ig = information_gain(cell) + if cell.tpr() < cell.fpr(): + return -ig + else: + return ig + + +def __ig_factor(p_tc, p_t, p_c): + den = p_t * p_c + if den != 0.0 and p_tc != 0: + return p_tc * math.log(p_tc / den, 2) + else: + return 0.0 + + +def information_gain(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + \ + __ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) +\ + __ig_factor(cell.p_fn(), 
cell.p_not_f(), cell.p_c()) + \ + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c()) + + +def information_gain_mod(cell): + return (__ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + __ig_factor(cell.p_tn(), cell.p_not_f(), cell.p_not_c())) \ + - (__ig_factor(cell.p_fp(), cell.p_f(), cell.p_not_c()) + __ig_factor(cell.p_fn(), cell.p_not_f(), cell.p_c())) + + +def pointwise_mutual_information(cell): + return __ig_factor(cell.p_tp(), cell.p_f(), cell.p_c()) + + +def gain_ratio(cell): + pc = cell.p_c() + pnc = 1.0 - pc + norm = pc * math.log(pc, 2) + pnc * math.log(pnc, 2) + return information_gain(cell) / (-norm) + + +def chi_square(cell): + den = cell.p_f() * cell.p_not_f() * cell.p_c() * cell.p_not_c() + if den==0.0: return 0.0 + num = gss(cell)**2 + return num / den + + +def relevance_frequency(cell): + a = cell.tp + c = cell.fp + if c == 0: c = 1 + return math.log(2.0 + (a * 1.0 / c), 2) + + +def idf(cell): + if cell.p_f()>0: + return math.log(1.0 / cell.p_f()) + return 0.0 + + +def gss(cell): + return cell.p_tp()*cell.p_tn() - cell.p_fp()*cell.p_fn() + + +def conf_interval(xt, n): + if n>30: + z2 = 3.84145882069 # norm.ppf(0.5+0.95/2.0)**2 + else: + z2 = t.ppf(0.5 + 0.95 / 2.0, df=max(n-1,1)) ** 2 + p = (xt + 0.5 * z2) / (n + z2) + amplitude = 0.5 * z2 * math.sqrt((p * (1.0 - p)) / (n + z2)) + return p, amplitude + +def strength(minPosRelFreq, minPos, maxNeg): + if minPos > maxNeg: + return math.log(2.0 * minPosRelFreq, 2.0) + else: + return 0.0 + + +#set cancel_features=True to allow some features to be weighted as 0 (as in the original article) +#however, for some extremely imbalanced dataset caused all documents to be 0 +def conf_weight(cell, cancel_features=False): + c = cell.get_c() + not_c = cell.get_not_c() + tp = cell.tp + fp = cell.fp + + pos_p, pos_amp = conf_interval(tp, c) + neg_p, neg_amp = conf_interval(fp, not_c) + + min_pos = pos_p-pos_amp + max_neg = neg_p+neg_amp + den = (min_pos + max_neg) + minpos_relfreq = min_pos / (den if den != 0 else 1) + + str_tplus = strength(minpos_relfreq, min_pos, max_neg); + + if str_tplus == 0 and not cancel_features: + return 1e-20 + + return str_tplus; + + +class ContTable: + + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp=tp + self.tn=tn + self.fp=fp + self.fn=fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + +def round_robin_selection(X, Y, k, tsr_function=positive_information_gain): + print(f'[selectiong {k} terms]') + nC = Y.shape[1] + FC = get_tsr_matrix(get_supervised_matrix(X, Y), tsr_function).T + best_features_idx = np.argsort(-FC, axis=0).flatten() + tsr_values = FC.flatten() + selected_indexes_set = set() + selected_indexes = list() + selected_value = list() + from_category = list() + round_robin = 
iter(best_features_idx) + values_iter = iter(tsr_values) + round=0 + while len(selected_indexes) < k: + term_idx = next(round_robin) + term_val = next(values_iter) + if term_idx not in selected_indexes_set: + selected_indexes_set.add(term_idx) + selected_indexes.append(term_idx) + selected_value.append(term_val) + from_category.append(round) + round = (round + 1) % nC + return np.asarray(selected_indexes, dtype=int), np.asarray(selected_value, dtype=float), np.asarray(from_category) + + +def feature_label_contingency_table(positive_document_indexes, feature_document_indexes, nD): + tp_ = len(positive_document_indexes & feature_document_indexes) + fp_ = len(feature_document_indexes - positive_document_indexes) + fn_ = len(positive_document_indexes - feature_document_indexes) + tn_ = nD - (tp_ + fp_ + fn_) + return ContTable(tp=tp_, tn=tn_, fp=fp_, fn=fn_) + + +def category_tables(feature_sets, category_sets, c, nD, nF): + return [feature_label_contingency_table(category_sets[c], feature_sets[f], nD) for f in range(nF)] + + +""" +Computes the nC x nF supervised matrix M where Mcf is the 4-cell contingency table for feature f and class c. +Efficiency O(nF x nC x log(S)) where S is the sparse factor +""" +def get_supervised_matrix(coocurrence_matrix, label_matrix, n_jobs=-1): + nD, nF = coocurrence_matrix.shape + nD2, nC = label_matrix.shape + + if nD != nD2: + raise ValueError('Number of rows in coocurrence matrix shape %s and label matrix shape %s is not consistent' % + (coocurrence_matrix.shape,label_matrix.shape)) + + def nonzero_set(matrix, col): + return set(matrix[:, col].nonzero()[0]) + + if isinstance(coocurrence_matrix, csr_matrix): + coocurrence_matrix = csc_matrix(coocurrence_matrix) + feature_sets = [nonzero_set(coocurrence_matrix, f) for f in range(nF)] + category_sets = [nonzero_set(label_matrix, c) for c in range(nC)] + cell_matrix = Parallel(n_jobs=n_jobs, backend="threading")(delayed(category_tables)(feature_sets, category_sets, c, nD, nF) for c in range(nC)) + return np.array(cell_matrix) + +# obtains the matrix T where Tcf=tsr(f,c) is the tsr score for category c and feature f +def get_tsr_matrix(cell_matrix, tsr_score_funtion): + nC,nF = cell_matrix.shape + tsr_matrix = [[tsr_score_funtion(cell_matrix[c,f]) for f in range(nF)] for c in range(nC)] + return np.array(tsr_matrix) + + +""" The Fisher-score [1] is not computed on the 4-cell contingency table, but can +take as input any real-valued feature column (e.g., tf-idf weights). +feat is the feature vector, and c is a binary classification vector. +This implementation covers only the binary case, while the formula is defined for multiclass +single-label scenarios, for which the version [2] might be preferred. +[1] R.O. Duda, P.E. Hart, and D.G. Stork. Pattern classification. Wiley-interscience, 2012. +[2] Gu, Q., Li, Z., & Han, J. (2012). Generalized fisher score for feature selection. arXiv preprint arXiv:1202.3725. 
+""" +def fisher_score_binary(feat, c): + neg = np.ones_like(c) - c + + npos = np.sum(c) + nneg = np.sum(neg) + + mupos = np.mean(feat[c == 1]) + muneg = np.mean(feat[neg == 1]) + mu = np.mean(feat) + + stdpos = np.std(feat[c == 1]) + stdneg = np.std(feat[neg == 1]) + + num = npos * ((mupos - mu) ** 2) + nneg * ((muneg - mu) ** 2) + den = npos * (stdpos ** 2) + nneg * (stdneg ** 2) + + if den>0: + return num / den + else: + return num diff --git a/refactor/debug_notebook.ipynb b/refactor/debug_notebook.ipynb new file mode 100644 index 0000000..f574694 --- /dev/null +++ b/refactor/debug_notebook.ipynb @@ -0,0 +1,36 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/refactor/devel_ideas.py b/refactor/devel_ideas.py new file mode 100644 index 0000000..bf5690a --- /dev/null +++ b/refactor/devel_ideas.py @@ -0,0 +1,95 @@ +class CustomMetrics(Metric): + def __init__( + self, + num_classes: int, + beta: float = 1.0, + threshold: float = 0.5, + average: str = "micro", + multilabel: bool = False, + compute_on_step: bool = True, + dist_sync_on_step: bool = False, + process_group: Optional[Any] = None, + ): + super().__init__( + compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + ) + + self.num_classes = num_classes + self.beta = beta + self.threshold = threshold + self.average = average + self.multilabel = multilabel + + allowed_average = ("micro", "macro", "weighted", None) + if self.average not in allowed_average: + raise ValueError('Argument `average` expected to be one of the following:' + f' {allowed_average} but got {self.average}') + + self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.add_state("actual_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + + def update(self, preds: torch.Tensor, target: torch.Tensor): + """ + Update state with predictions and targets. + + Args: + preds: Predictions from model + target: Ground truth values + """ + true_positives, predicted_positives, actual_positives = _fbeta_update( + preds, target, self.num_classes, self.threshold, self.multilabel + ) + + self.true_positives += true_positives + self.predicted_positives += predicted_positives + self.actual_positives += actual_positives + + def compute(self): + """ + Computes metrics over state. 
+ """ + return _fbeta_compute(self.true_positives, self.predicted_positives, + self.actual_positives, self.beta, self.average) + + +def _fbeta_update( + preds: torch.Tensor, + target: torch.Tensor, + num_classes: int, + threshold: float = 0.5, + multilabel: bool = False +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + preds, target = _input_format_classification_one_hot( + num_classes, preds, target, threshold, multilabel + ) + true_positives = torch.sum(preds * target, dim=1) + predicted_positives = torch.sum(preds, dim=1) + actual_positives = torch.sum(target, dim=1) + return true_positives, predicted_positives, actual_positives + + +def _fbeta_compute( + true_positives: torch.Tensor, + predicted_positives: torch.Tensor, + actual_positives: torch.Tensor, + beta: float = 1.0, + average: str = "micro" +) -> torch.Tensor: + if average == "micro": + precision = true_positives.sum().float() / predicted_positives.sum() + recall = true_positives.sum().float() / actual_positives.sum() + else: + precision = true_positives.float() / predicted_positives + recall = true_positives.float() / actual_positives + + num = (1 + beta ** 2) * precision * recall + denom = beta ** 2 * precision + recall + new_num = 2 * true_positives + new_fp = predicted_positives - true_positives + new_fn = actual_positives - true_positives + new_den = 2 * true_positives + new_fp + new_fn + if new_den.sum() == 0: + # whats is the correct return type ? TODO + return 1. + return class_reduce(num, denom, weights=actual_positives, class_reduction=average) diff --git a/refactor/main.py b/refactor/main.py new file mode 100644 index 0000000..76c5e54 --- /dev/null +++ b/refactor/main.py @@ -0,0 +1,47 @@ +from argparse import ArgumentParser +from util.embeddings_manager import MuseLoader +from view_generators import RecurrentGen, BertGen +from data.dataset_builder import MultilingualDataset +from util.common import MultilingualIndex + + +def main(args): + N_JOBS = 8 + print('Running...') + + # _DATASET = '/homenfs/a.pedrotti1/datasets/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' + # EMBEDDINGS_PATH = '/homenfs/a.pedrotti1/embeddings/MUSE' + + _DATASET = '/home/moreo/CLESA/rcv2/rcv1-2_doclist_trByLang1000_teByLang1000_processed_run0.pickle' + EMBEDDINGS_PATH = '/home/andreapdr/funneling_pdr/embeddings' + data = MultilingualDataset.load(_DATASET) + # data.set_view(languages=['it']) + lX, ly = data.training() + lXte, lyte = data.test() + + # Init multilingualIndex - mandatory when deploying Neural View Generators... 
+ multilingualIndex = MultilingualIndex() + # lMuse = MuseLoader(langs=sorted(lX.keys()), cache=) + lMuse = MuseLoader(langs=sorted(lX.keys()), cache=EMBEDDINGS_PATH) + multilingualIndex.index(lX, ly, lXte, l_pretrained_vocabulary=lMuse.vocabulary()) + + # gFun = VanillaFunGen(base_learner=get_learner(calibrate=True), n_jobs=N_JOBS) + # gFun = MuseGen(muse_dir='/home/andreapdr/funneling_pdr/embeddings', n_jobs=N_JOBS) + # gFun = WordClassGen(n_jobs=N_JOBS) + gFun = RecurrentGen(multilingualIndex, pretrained_embeddings=lMuse, wce=False, gpus=args.gpus, n_jobs=N_JOBS, + stored_path='/home/andreapdr/gfun_refactor/tb_logs/gfun_rnn_dev/version_19/checkpoints/epoch=0-step=14.ckpt') + # gFun = BertGen(multilingualIndex, gpus=args.gpus, batch_size=128, n_jobs=N_JOBS) + + gFun.fit(lX, ly) + + # print('Projecting...') + # y_ = gFun.transform(lX) + + exit('Executed!') + + +if __name__ == '__main__': + parser = ArgumentParser() + parser.add_argument('--gpus', default=None) + args = parser.parse_args() + main(args) diff --git a/refactor/models/helpers.py b/refactor/models/helpers.py new file mode 100755 index 0000000..93e5805 --- /dev/null +++ b/refactor/models/helpers.py @@ -0,0 +1,47 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + + + +def init_embeddings(pretrained, vocab_size, learnable_length, device='cuda'): + pretrained_embeddings = None + pretrained_length = 0 + if pretrained is not None: + pretrained_length = pretrained.shape[1] + assert pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' + pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) + pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) + # pretrained_embeddings.to(device) + + learnable_embeddings = None + if learnable_length > 0: + learnable_embeddings = nn.Embedding(vocab_size, learnable_length) + # learnable_embeddings.to(device) + + embedding_length = learnable_length + pretrained_length + assert embedding_length > 0, '0-size embeddings' + + return pretrained_embeddings, learnable_embeddings, embedding_length + + +def embed(model, input, lang): + input_list = [] + if model.lpretrained_embeddings[lang]: + input_list.append(model.lpretrained_embeddings[lang](input)) + if model.llearnable_embeddings[lang]: + input_list.append(model.llearnable_embeddings[lang](input)) + return torch.cat(tensors=input_list, dim=2) + + +def embedding_dropout(input, drop_range, p_drop=0.5, training=True): + if p_drop > 0 and training and drop_range is not None: + p = p_drop + drop_from, drop_to = drop_range + m = drop_to - drop_from #length of the supervised embedding + l = input.shape[2] #total embedding length + corr = (1 - p) + input[:, :, drop_from:drop_to] = corr * F.dropout(input[:, :, drop_from:drop_to], p=p) + input /= (1 - (p * m / l)) + + return input diff --git a/refactor/models/learners.py b/refactor/models/learners.py new file mode 100644 index 0000000..fcd4249 --- /dev/null +++ b/refactor/models/learners.py @@ -0,0 +1,185 @@ +import numpy as np +import time +from scipy.sparse import issparse +from sklearn.multiclass import OneVsRestClassifier +from sklearn.model_selection import GridSearchCV +from sklearn.svm import SVC +from joblib import Parallel, delayed + + +def get_learner(calibrate=False, kernel='linear', C=1): + """ + instantiate scikit Support Vector Classifier + :param calibrate: boolean, whether to return posterior probabilities or not + :param kernel: string,kernel to be applied to the SVC + :param C: int or dict 
{'C': list of integer}, Regularization parameter + :return: Support Vector Classifier + """ + return SVC(kernel=kernel, probability=calibrate, cache_size=1000, C=C, random_state=1, gamma='auto', verbose=False) + + +def _sort_if_sparse(X): + if issparse(X) and not X.has_sorted_indices: + X.sort_indices() + + +def _joblib_transform_multiling(transformer, lX, n_jobs=-1): + if n_jobs == 1: + return {lang: transformer(lX[lang]) for lang in lX.keys()} + else: + langs = list(lX.keys()) + transformations = Parallel(n_jobs=n_jobs)(delayed(transformer)(lX[lang]) for lang in langs) + return {lang: transformations[i] for i, lang in enumerate(langs)} + + +class TrivialRejector: + def fit(self, X, y): + self.cats = y.shape[1] + return self + + def decision_function(self, X): return np.zeros((X.shape[0], self.cats)) + + def predict(self, X): return np.zeros((X.shape[0], self.cats)) + + def predict_proba(self, X): return np.zeros((X.shape[0], self.cats)) + + def best_params(self): return {} + + +class NaivePolylingualClassifier: + """ + Is a mere set of independet MonolingualClassifiers + """ + + def __init__(self, base_learner, parameters=None, n_jobs=-1): + self.base_learner = base_learner + self.parameters = parameters + self.model = None + self.n_jobs = n_jobs + + def fit(self, lX, ly): + """ + trains the independent monolingual classifiers + :param lX: a dictionary {language_label: X csr-matrix} + :param ly: a dictionary {language_label: y np.array} + :return: self + """ + tinit = time.time() + assert set(lX.keys()) == set(ly.keys()), 'inconsistent language mappings in fit' + langs = list(lX.keys()) + for lang in langs: + _sort_if_sparse(lX[lang]) + + models = Parallel(n_jobs=self.n_jobs)\ + (delayed(MonolingualClassifier(self.base_learner, parameters=self.parameters).fit)((lX[lang]), ly[lang]) for + lang in langs) + + self.model = {lang: models[i] for i, lang in enumerate(langs)} + self.empty_categories = {lang: self.model[lang].empty_categories for lang in langs} + self.time = time.time() - tinit + return self + + def decision_function(self, lX): + """ + :param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of classification scores for each class + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' + langs = list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].decision_function)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def predict_proba(self, lX): + """ + :param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of probabilities that each document belongs to each class + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in decision function' + langs = list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs, max_nbytes=None)( + delayed(self.model[lang].predict_proba)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def predict(self, lX): + """ + :param lX: a dictionary {language_label: X csr-matrix} + :return: a dictionary of predictions + """ + assert self.model is not None, 'predict called before fit' + assert set(lX.keys()).issubset(set(self.model.keys())), 'unknown languages requested in predict' + if self.n_jobs == 1: + return {lang: self.model[lang].transform(lX[lang]) for lang in lX.keys()} + else: + langs = 
list(lX.keys()) + scores = Parallel(n_jobs=self.n_jobs)(delayed(self.model[lang].predict)(lX[lang]) for lang in langs) + return {lang: scores[i] for i, lang in enumerate(langs)} + + def best_params(self): + return {lang: model.best_params() for lang, model in self.model.items()} + + +class MonolingualClassifier: + + def __init__(self, base_learner, parameters=None, n_jobs=-1): + self.learner = base_learner + self.parameters = parameters + self.model = None + self.n_jobs = n_jobs + self.best_params_ = None + + def fit(self, X, y): + if X.shape[0] == 0: + print('Warning: X has 0 elements, a trivial rejector will be created') + self.model = TrivialRejector().fit(X, y) + self.empty_categories = np.arange(y.shape[1]) + return self + + tinit = time.time() + _sort_if_sparse(X) + self.empty_categories = np.argwhere(np.sum(y, axis=0) == 0).flatten() + # multi-class format + if len(y.shape) == 2: + if self.parameters is not None: + self.parameters = [{'estimator__' + key: params[key] for key in params.keys()} + for params in self.parameters] + self.model = OneVsRestClassifier(self.learner, n_jobs=self.n_jobs) + else: + self.model = self.learner + raise NotImplementedError('not working as a base-classifier for funneling if there are gaps in ' + 'the labels across languages') + + # parameter optimization? + if self.parameters: + print('debug: optimizing parameters:', self.parameters) + self.model = GridSearchCV(self.model, param_grid=self.parameters, refit=True, cv=5, n_jobs=self.n_jobs, + error_score=0, verbose=10) + + # print(f'fitting: {self.model} on matrices of shape X={X.shape} Y={y.shape}') + print(f'fitting: Mono-lingual Classifier on matrices of shape X={X.shape} Y={y.shape}') + self.model.fit(X, y) + if isinstance(self.model, GridSearchCV): + self.best_params_ = self.model.best_params_ + print('best parameters: ', self.best_params_) + self.time = time.time() - tinit + return self + + def decision_function(self, X): + assert self.model is not None, 'predict called before fit' + _sort_if_sparse(X) + return self.model.decision_function(X) + + def predict_proba(self, X): + assert self.model is not None, 'predict called before fit' + assert hasattr(self.model, 'predict_proba'), 'the probability predictions are not enabled in this model' + _sort_if_sparse(X) + return self.model.predict_proba(X) + + def predict(self, X): + assert self.model is not None, 'predict called before fit' + _sort_if_sparse(X) + return self.model.predict(X) + + def best_params(self): + return self.best_params_ diff --git a/refactor/models/lstm_class.py b/refactor/models/lstm_class.py new file mode 100755 index 0000000..98424f1 --- /dev/null +++ b/refactor/models/lstm_class.py @@ -0,0 +1,114 @@ +#taken from https://github.com/prakashpandey9/Text-Classification-Pytorch/blob/master/models/LSTM.py +import torch +import torch.nn as nn +from torch.autograd import Variable +from models.helpers import * + + +class RNNMultilingualClassifier(nn.Module): + + def __init__(self, output_size, hidden_size, lvocab_size, learnable_length, lpretrained=None, + drop_embedding_range=None, drop_embedding_prop=0, post_probabilities=True, only_post=False, + bert_embeddings=False): + + super(RNNMultilingualClassifier, self).__init__() + self.output_size = output_size + self.hidden_size = hidden_size + self.drop_embedding_range = drop_embedding_range + self.drop_embedding_prop = drop_embedding_prop + self.post_probabilities = post_probabilities + self.bert_embeddings = bert_embeddings + assert 0 <= drop_embedding_prop <= 1, 'drop_embedding_prop: 
wrong range' + + self.lpretrained_embeddings = nn.ModuleDict() + self.llearnable_embeddings = nn.ModuleDict() + self.embedding_length = None + self.langs = sorted(lvocab_size.keys()) + self.only_post = only_post + + self.n_layers = 1 + self.n_directions = 1 + + self.dropout = nn.Dropout(0.6) + + lstm_out = 256 + ff1 = 512 + ff2 = 256 + + lpretrained_embeddings = {} + llearnable_embeddings = {} + if only_post==False: + for l in self.langs: + pretrained = lpretrained[l] if lpretrained else None + pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( + pretrained, lvocab_size[l], learnable_length + ) + lpretrained_embeddings[l] = pretrained_embeddings + llearnable_embeddings[l] = learnable_embeddings + self.embedding_length = embedding_length + + # self.lstm = nn.LSTM(self.embedding_length, hidden_size, dropout=0.2 if self.n_layers>1 else 0, num_layers=self.n_layers, bidirectional=(self.n_directions==2)) + self.rnn = nn.GRU(self.embedding_length, hidden_size) + self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) + self.lpretrained_embeddings.update(lpretrained_embeddings) + self.llearnable_embeddings.update(llearnable_embeddings) + + self.linear1 = nn.Linear(lstm_out, ff1) + self.linear2 = nn.Linear(ff1, ff2) + + if only_post: + self.label = nn.Linear(output_size, output_size) + elif post_probabilities and not bert_embeddings: + self.label = nn.Linear(ff2 + output_size, output_size) + elif bert_embeddings and not post_probabilities: + self.label = nn.Linear(ff2 + 768, output_size) + elif post_probabilities and bert_embeddings: + self.label = nn.Linear(ff2 + output_size + 768, output_size) + else: + self.label = nn.Linear(ff2, output_size) + + def forward(self, input, post, bert_embed, lang): + if self.only_post: + doc_embedding = post + else: + doc_embedding = self.transform(input, lang) + if self.post_probabilities: + doc_embedding = torch.cat([doc_embedding, post], dim=1) + if self.bert_embeddings: + doc_embedding = torch.cat([doc_embedding, bert_embed], dim=1) + + logits = self.label(doc_embedding) + return logits + + def transform(self, input, lang): + batch_size = input.shape[0] + input = embed(self, input, lang) + input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + input = input.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) + # c_0 = Variable(torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size).cuda()) + # output, (_, _) = self.lstm(input, (h_0, c_0)) + output, _ = self.rnn(input, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + output = self.dropout(F.relu(self.linear2(output))) + return output + + def finetune_pretrained(self): + for l in self.langs: + self.lpretrained_embeddings[l].requires_grad = True + self.lpretrained_embeddings[l].weight.requires_grad = True + + def get_embeddings(self, input, lang): + batch_size = input.shape[0] + input = embed(self, input, lang) + input = embedding_dropout(input, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + input = input.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, self.hidden_size).cuda()) + output, _ = self.rnn(input, h_0) + output = output[-1, :, :] + return output.cpu().detach().numpy() + diff --git a/refactor/models/pl_bert.py 
b/refactor/models/pl_bert.py new file mode 100644 index 0000000..4561004 --- /dev/null +++ b/refactor/models/pl_bert.py @@ -0,0 +1,64 @@ +import torch +import pytorch_lightning as pl +from torch.optim.lr_scheduler import StepLR +from transformers import BertForSequenceClassification, BertTokenizer, AdamW, BertConfig +from pytorch_lightning.metrics import F1, Accuracy, Metric + + +class BertModel(pl.LightningModule): + + def __init__(self, output_size, stored_path): + super().__init__() + self.loss = torch.nn.BCEWithLogitsLoss() + if stored_path: + self.bert = BertForSequenceClassification.from_pretrained(stored_path, + num_labels=output_size, + output_hidden_states=True) + else: + self.bert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', + num_labels=output_size, + output_hidden_states=True) + self.accuracy = Accuracy() + self.save_hyperparameters() + + def forward(self, X): + logits = self.bert(X) + return logits + + def training_step(self, train_batch, batch_idx): + X, y, _, batch_langs = train_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.cuda.FloatTensor) + logits, _ = self.forward(X) + loss = self.loss(logits, y) + predictions = torch.sigmoid(logits) > 0.5 + accuracy = self.accuracy(predictions, y) + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return loss + + def validation_step(self, val_batch, batch_idx): + X, y, _, batch_langs = val_batch + X = torch.cat(X).view([X[0].shape[0], len(X)]) + y = y.type(torch.cuda.FloatTensor) + logits, _ = self.forward(X) + loss = self.loss(logits, y) + predictions = torch.sigmoid(logits) > 0.5 + accuracy = self.accuracy(predictions, y) + self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return + + def configure_optimizers(self, lr=3e-5, weight_decay=0.01): + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in self.bert.named_parameters() + if not any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay}, + {'params': [p for n, p in self.bert.named_parameters() + if any(nd in n for nd in no_decay)], + 'weight_decay': weight_decay} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=lr) + scheduler = StepLR(optimizer, step_size=25, gamma=0.1) + return [optimizer], [scheduler] diff --git a/refactor/models/pl_gru.py b/refactor/models/pl_gru.py new file mode 100644 index 0000000..7987156 --- /dev/null +++ b/refactor/models/pl_gru.py @@ -0,0 +1,312 @@ +import torch +from torch import nn +from torch.optim import Adam +from transformers import AdamW +import torch.nn.functional as F +from torch.autograd import Variable +import pytorch_lightning as pl +from pytorch_lightning.metrics import F1, Accuracy, Metric +from torch.optim.lr_scheduler import StepLR + +from util.evaluation import evaluate +from typing import Any, Optional, Tuple +from pytorch_lightning.metrics.utils import _input_format_classification_one_hot, class_reduce +import numpy as np + + +def init_embeddings(pretrained, vocab_size, learnable_length): + """ + Compute the embedding matrix + :param pretrained: + :param vocab_size: + :param learnable_length: + :return: + """ + pretrained_embeddings = None + pretrained_length = 0 + if pretrained is not None: + pretrained_length = pretrained.shape[1] + assert 
pretrained.shape[0] == vocab_size, 'pre-trained matrix does not match with the vocabulary size' + pretrained_embeddings = nn.Embedding(vocab_size, pretrained_length) + # requires_grad=False sets the embedding layer as NOT trainable + pretrained_embeddings.weight = nn.Parameter(pretrained, requires_grad=False) + + learnable_embeddings = None + if learnable_length > 0: + learnable_embeddings = nn.Embedding(vocab_size, learnable_length) + + embedding_length = learnable_length + pretrained_length + assert embedding_length > 0, '0-size embeddings' + return pretrained_embeddings, learnable_embeddings, embedding_length + + +class RecurrentModel(pl.LightningModule): + """ + Check out for logging insight https://www.learnopencv.com/tensorboard-with-pytorch-lightning/ + """ + + def __init__(self, lPretrained, langs, output_size, hidden_size, lVocab_size, learnable_length, + drop_embedding_range, drop_embedding_prop, lMuse_debug=None, multilingual_index_debug=None): + super().__init__() + self.langs = langs + self.lVocab_size = lVocab_size + self.learnable_length = learnable_length + self.output_size = output_size + self.hidden_size = hidden_size + self.drop_embedding_range = drop_embedding_range + self.drop_embedding_prop = drop_embedding_prop + self.loss = torch.nn.BCEWithLogitsLoss() + self.microf1 = F1(num_classes=output_size, multilabel=True, average='micro') + self.macrof1 = F1(num_classes=output_size, multilabel=True, average='macro') + self.accuracy = Accuracy() + self.customMetrics = CustomMetrics(num_classes=output_size, multilabel=True, average='micro') + + self.lPretrained_embeddings = nn.ModuleDict() + self.lLearnable_embeddings = nn.ModuleDict() + + self.n_layers = 1 + self.n_directions = 1 + self.dropout = nn.Dropout(0.6) + + # TODO: debug setting + self.lMuse = lMuse_debug + self.multilingual_index_debug = multilingual_index_debug + + lstm_out = 256 + ff1 = 512 + ff2 = 256 + + lpretrained_embeddings = {} + llearnable_embeddings = {} + + for lang in self.langs: + pretrained = lPretrained[lang] if lPretrained else None + pretrained_embeddings, learnable_embeddings, embedding_length = init_embeddings( + pretrained, self.lVocab_size[lang], self.learnable_length) + lpretrained_embeddings[lang] = pretrained_embeddings + llearnable_embeddings[lang] = learnable_embeddings + self.embedding_length = embedding_length + + self.lPretrained_embeddings.update(lpretrained_embeddings) + self.lLearnable_embeddings.update(llearnable_embeddings) + + self.rnn = nn.GRU(self.embedding_length, hidden_size) + self.linear0 = nn.Linear(hidden_size * self.n_directions, lstm_out) + self.linear1 = nn.Linear(lstm_out, ff1) + self.linear2 = nn.Linear(ff1, ff2) + self.label = nn.Linear(ff2, self.output_size) + + lPretrained = None # TODO: setting lPretrained to None, letting it to its original value will bug first + # validation step (i.e., checkpoint will store also its ++ value, I guess, making the saving process too slow) + self.save_hyperparameters() + + def forward(self, lX): + _tmp = [] + for lang in sorted(lX.keys()): + doc_embedding = self.transform(lX[lang], lang) + _tmp.append(doc_embedding) + embed = torch.cat(_tmp, dim=0) + logits = self.label(embed) + return logits + + def transform(self, X, lang): + batch_size = X.shape[0] + X = self.embed(X, lang) + X = self.embedding_dropout(X, drop_range=self.drop_embedding_range, p_drop=self.drop_embedding_prop, + training=self.training) + X = X.permute(1, 0, 2) + h_0 = Variable(torch.zeros(self.n_layers * self.n_directions, batch_size, 
self.hidden_size).to(self.device)) + output, _ = self.rnn(X, h_0) + output = output[-1, :, :] + output = F.relu(self.linear0(output)) + output = self.dropout(F.relu(self.linear1(output))) + output = self.dropout(F.relu(self.linear2(output))) + return output + + def training_step(self, train_batch, batch_idx): + # TODO: double check StepLR scheduler... + lX, ly = train_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + loss = self.loss(logits, ly) + + # Squashing logits through Sigmoid in order to get confidence score + predictions = torch.sigmoid(logits) > 0.5 + + # microf1 = self.microf1(predictions, ly) + # macrof1 = self.macrof1(predictions, ly) + accuracy = self.accuracy(predictions, ly) + # l_pred = {lang: predictions.detach().cpu().numpy()} + # l_labels = {lang: ly.detach().cpu().numpy()} + # l_eval = evaluate(l_labels, l_pred, n_jobs=1) + + self.log('train-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('train-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return loss + + def validation_step(self, val_batch, batch_idx): + lX, ly = val_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + loss = self.loss(logits, ly) + predictions = torch.sigmoid(logits) > 0.5 + # microf1 = self.microf1(predictions, ly) + # macrof1 = self.macrof1(predictions, ly) + accuracy = self.accuracy(predictions, ly) + + # l_pred = {lang: predictions.detach().cpu().numpy()} + # l_labels = {lang: y.detach().cpu().numpy()} + # l_eval = evaluate(l_labels, l_pred, n_jobs=1) + + self.log('val-loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log('val-accuracy', accuracy, on_step=True, on_epoch=True, prog_bar=False, logger=True) + return + + def test_step(self, test_batch, batch_idx): + lX, ly = test_batch + logits = self.forward(lX) + _ly = [] + for lang in sorted(lX.keys()): + _ly.append(ly[lang]) + ly = torch.cat(_ly, dim=0) + predictions = torch.sigmoid(logits) > 0.5 + accuracy = self.accuracy(predictions, ly) + custom_metric = self.customMetrics(logits, ly) # TODO + self.log('test-accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=False, logger=True) + self.log('test-custom', custom_metric, on_step=False, on_epoch=True, prog_bar=False, logger=True) + return {'pred': predictions, 'target': ly} + + def test_epoch_end(self, outputs): + # all_pred = torch.vstack([out['pred'] for out in outputs]) # TODO + # all_y = torch.vstack([out['target'] for out in outputs]) # TODO + # r = eval(all_y, all_pred) + # print(r) + # X = torch.cat(X).view([X[0].shape[0], len(X)]) + return + + def embed(self, X, lang): + input_list = [] + if self.lPretrained_embeddings[lang]: + input_list.append(self.lPretrained_embeddings[lang](X)) + if self.lLearnable_embeddings[lang]: + input_list.append(self.lLearnable_embeddings[lang](X)) + return torch.cat(tensors=input_list, dim=2) + + def embedding_dropout(self, X, drop_range, p_drop=0.5, training=True): + if p_drop > 0 and training and drop_range is not None: + p = p_drop + drop_from, drop_to = drop_range + m = drop_to - drop_from # length of the supervised embedding + l = X.shape[2] # total embedding length + corr = (1 - p) + X[:, :, drop_from:drop_to] = corr * F.dropout(X[:, :, drop_from:drop_to], p=p) + X /= (1 - (p * m / l)) + return X + + def configure_optimizers(self): + optimizer = AdamW(self.parameters(), lr=1e-3) + scheduler = 
StepLR(optimizer, step_size=25, gamma=0.5) + return [optimizer], [scheduler] + + +class CustomMetrics(Metric): + def __init__( + self, + num_classes: int, + beta: float = 1.0, + threshold: float = 0.5, + average: str = "micro", + multilabel: bool = False, + compute_on_step: bool = True, + dist_sync_on_step: bool = False, + process_group: Optional[Any] = None, + ): + super().__init__( + compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group, + ) + + self.num_classes = num_classes + self.beta = beta + self.threshold = threshold + self.average = average + self.multilabel = multilabel + + allowed_average = ("micro", "macro", "weighted", None) + if self.average not in allowed_average: + raise ValueError('Argument `average` expected to be one of the following:' + f' {allowed_average} but got {self.average}') + + self.add_state("true_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.add_state("predicted_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + self.add_state("actual_positives", default=torch.zeros(num_classes), dist_reduce_fx="sum") + + def update(self, preds: torch.Tensor, target: torch.Tensor): + """ + Update state with predictions and targets. + + Args: + preds: Predictions from model + target: Ground truth values + """ + true_positives, predicted_positives, actual_positives = _fbeta_update( + preds, target, self.num_classes, self.threshold, self.multilabel + ) + + self.true_positives += true_positives + self.predicted_positives += predicted_positives + self.actual_positives += actual_positives + + def compute(self): + """ + Computes metrics over state. + """ + return _fbeta_compute(self.true_positives, self.predicted_positives, + self.actual_positives, self.beta, self.average) + + +def _fbeta_update( + preds: torch.Tensor, + target: torch.Tensor, + num_classes: int, + threshold: float = 0.5, + multilabel: bool = False +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + preds, target = _input_format_classification_one_hot( + num_classes, preds, target, threshold, multilabel + ) + true_positives = torch.sum(preds * target, dim=1) + predicted_positives = torch.sum(preds, dim=1) + actual_positives = torch.sum(target, dim=1) + return true_positives, predicted_positives, actual_positives + + +def _fbeta_compute( + true_positives: torch.Tensor, + predicted_positives: torch.Tensor, + actual_positives: torch.Tensor, + beta: float = 1.0, + average: str = "micro" +) -> torch.Tensor: + if average == "micro": + precision = true_positives.sum().float() / predicted_positives.sum() + recall = true_positives.sum().float() / actual_positives.sum() + else: + precision = true_positives.float() / predicted_positives + recall = true_positives.float() / actual_positives + + num = (1 + beta ** 2) * precision * recall + denom = beta ** 2 * precision + recall + new_num = 2 * true_positives + new_fp = predicted_positives - true_positives + new_fn = actual_positives - true_positives + new_den = 2 * true_positives + new_fp + new_fn + if new_den.sum() == 0: + # whats is the correct return type ? TODO + return 1. 
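+    # class_reduce (pytorch_lightning.metrics.utils) performs the num/denom division and the
+    # 'micro'/'macro'/'weighted' averaging, weighting classes by their support (actual_positives)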
+ return class_reduce(num, denom, weights=actual_positives, class_reduction=average) diff --git a/refactor/run.sh b/refactor/run.sh new file mode 100644 index 0000000..04365f9 --- /dev/null +++ b/refactor/run.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +for i in {0..10..1} +do + python main.py --gpus 0 +done \ No newline at end of file diff --git a/refactor/util/SIF_embed.py b/refactor/util/SIF_embed.py new file mode 100644 index 0000000..cfe096e --- /dev/null +++ b/refactor/util/SIF_embed.py @@ -0,0 +1,56 @@ +import numpy as np +from sklearn.decomposition import TruncatedSVD + +def get_weighted_average(We, x, w): + """ + Compute the weighted average vectors + :param We: We[i,:] is the vector for word i + :param x: x[i, :] are the indices of the words in sentence i + :param w: w[i, :] are the weights for the words in sentence i + :return: emb[i, :] are the weighted average vector for sentence i + """ + n_samples = x.shape[0] + emb = np.zeros((n_samples, We.shape[1])) + for i in range(n_samples): + emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:]) + return emb + +def compute_pc(X,npc=1): + """ + Compute the principal components. + :param X: X[i,:] is a data point + :param npc: number of principal components to remove + :return: component_[i,:] is the i-th pc + """ + svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) + svd.fit(X) + return svd.components_ + +def remove_pc(X, npc=1): + """ + Remove the projection on the principal components + :param X: X[i,:] is a data point + :param npc: number of principal components to remove + :return: XX[i, :] is the data point after removing its projection + """ + pc = compute_pc(X, npc) + if npc==1: + XX = X - X.dot(pc.transpose()) * pc + else: + XX = X - X.dot(pc.transpose()).dot(pc) + return XX + + +def SIF_embedding(We, x, w, params): + """ + Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component + :param We: We[i,:] is the vector for word i + :param x: x[i, :] are the indices of the words in the i-th sentence + :param w: w[i, :] are the weights for the words in the i-th sentence + :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component + :return: emb, emb[i, :] is the embedding for sentence i + """ + emb = get_weighted_average(We, x, w) + if params.rmpc > 0: + emb = remove_pc(emb, params.rmpc) + return emb \ No newline at end of file diff --git a/refactor/util/common.py b/refactor/util/common.py new file mode 100644 index 0000000..7792b1c --- /dev/null +++ b/refactor/util/common.py @@ -0,0 +1,322 @@ +import numpy as np +import torch +from tqdm import tqdm +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.preprocessing import normalize +from sklearn.model_selection import train_test_split +from util.embeddings_manager import supervised_embeddings_tfidf + + +class TfidfVectorizerMultilingual: + + def __init__(self, **kwargs): + self.kwargs = kwargs + + def fit(self, lX, ly=None): + self.langs = sorted(lX.keys()) + self.vectorizer = {l: TfidfVectorizer(**self.kwargs).fit(lX[l]) for l in self.langs} + return self + + def transform(self, lX): + return {l: self.vectorizer[l].transform(lX[l]) for l in self.langs} + + def fit_transform(self, lX, ly=None): + return self.fit(lX, ly).transform(lX) + + def vocabulary(self, l=None): + if l is None: + return {l: self.vectorizer[l].vocabulary_ for l in self.langs} + else: + return self.vectorizer[l].vocabulary_ + + def get_analyzer(self, 
l=None): + if l is None: + return {l: self.vectorizer[l].build_analyzer() for l in self.langs} + else: + return self.vectorizer[l].build_analyzer() + + +def _normalize(lX, l2=True): + return {lang: normalize(X) for lang, X in lX.items()} if l2 else lX + + +def none_dict(langs): + return {l:None for l in langs} + + +class MultilingualIndex: + def __init__(self): + """ + Class that contains monolingual Indexes + """ + self.l_index = {} + self.l_vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def index(self, l_devel_raw, l_devel_target, l_test_raw, l_pretrained_vocabulary=None): + self.langs = sorted(l_devel_raw.keys()) + self.l_vectorizer.fit(l_devel_raw) + l_vocabulary = self.l_vectorizer.vocabulary() + l_analyzer = self.l_vectorizer.get_analyzer() + if l_pretrained_vocabulary is None: + l_pretrained_vocabulary = none_dict(self.langs) + + for lang in self.langs: + # Init monolingual Index + self.l_index[lang] = Index(l_devel_raw[lang], l_devel_target[lang], l_test_raw[lang], lang) + # call to index() function of monolingual Index + self.l_index[lang].index(l_pretrained_vocabulary[lang], l_analyzer[lang], l_vocabulary[lang]) + + def train_val_split(self, val_prop=0.2, max_val=2000, seed=42): + for l,index in self.l_index.items(): + index.train_val_split(val_prop, max_val, seed=seed) + + def embedding_matrices(self, lpretrained, supervised): + """ + Extract from pretrained embeddings words that are found in the training dataset, then for each language + calls the respective monolingual index and build the embedding matrix (if supervised, WCE are concatenated + to the unsupervised vectors). + :param lpretrained: dict {lang : matrix of word-embeddings } + :param supervised: bool, whether to deploy Word-Class Embeddings or not + :return: self + """ + lXtr = self.get_lXtr() if supervised else none_dict(self.langs) + lYtr = self.l_train_target() if supervised else none_dict(self.langs) + lWordList = self.get_wordlist() + lExtracted = lpretrained.extract(lWordList) + for lang, index in self.l_index.items(): + # if supervised concatenate embedding matrices of pretrained unsupervised + # and supervised word-class embeddings + index.compose_embedding_matrix(lExtracted[lang], supervised, lXtr[lang], lYtr[lang]) + self.sup_range = index.wce_range + return self + + def get_wordlist(self): + wordlist = {} + for lang, index in self.l_index.items(): + wordlist[lang] = index.get_word_list() + return wordlist + + def get_raw_lXtr(self): + lXtr_raw = {k:[] for k in self.langs} + lYtr_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXtr_raw[lang] = self.l_index[lang].train_raw + lYtr_raw[lang] = self.l_index[lang].train_raw + return lXtr_raw + + def get_raw_lXva(self): + lXva_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXva_raw[lang] = self.l_index[lang].val_raw + + return lXva_raw + + def get_raw_lXte(self): + lXte_raw = {k: [] for k in self.langs} + for lang in self.langs: + lXte_raw[lang] = self.l_index[lang].test_raw + + return lXte_raw + + def get_lXtr(self): + if not hasattr(self, 'lXtr'): + self.lXtr = self.l_vectorizer.transform({l: index.train_raw for l, index in self.l_index.items()}) + return self.lXtr + + def get_lXva(self): + if not hasattr(self, 'lXva'): + self.lXva = self.l_vectorizer.transform({l: index.val_raw for l, index in self.l_index.items()}) + return self.lXva + + def get_lXte(self): + if not hasattr(self, 'lXte'): + self.lXte = self.l_vectorizer.transform({l: index.test_raw for l, index in self.l_index.items()}) + return 
self.lXte + + def get_target_dim(self): + return self.l_index[self.langs[0]].devel_target.shape[1] + + def l_vocabsize(self): + return {l:index.vocabsize for l,index in self.l_index.items()} + + def l_embeddings(self): + return {l:index.embedding_matrix for l,index in self.l_index.items()} + + def l_pad(self): + return {l: index.pad_index for l, index in self.l_index.items()} + + def l_train_index(self): + return {l: index.train_index for l, index in self.l_index.items()} + + def l_train_raw_index(self): + return {l: index.train_raw for l, index in self.l_index.items()} + + def l_train_target(self): + return {l: index.train_target for l, index in self.l_index.items()} + + def l_val_index(self): + return {l: index.val_index for l, index in self.l_index.items()} + + def l_val_raw_index(self): + return {l: index.val_raw for l, index in self.l_index.items()} + + def l_val_target(self): + return {l: index.val_target for l, index in self.l_index.items()} + + def l_test_index(self): + return {l: index.test_index for l, index in self.l_index.items()} + + def l_test_raw(self): + # TODO: implement MultilingualIndex method to return RAW test data + raise NotImplementedError('MultilingualIndex.l_test_raw is not implemented yet') + + def l_devel_index(self): + return {l: index.devel_index for l, index in self.l_index.items()} + + def l_devel_target(self): + return {l: index.devel_target for l, index in self.l_index.items()} + + def l_train(self): + return self.l_train_index(), self.l_train_target() + + def l_val(self): + return self.l_val_index(), self.l_val_target() + + def l_train_raw(self): + return self.l_train_raw_index(), self.l_train_target() + + def l_val_raw(self): + return self.l_val_raw_index(), self.l_val_target() + + def get_l_pad_index(self): + return {l: index.get_pad_index() for l, index in self.l_index.items()} + + +class Index: + def __init__(self, devel_raw, devel_target, test_raw, lang): + """ + Monolingual Index: takes care of tokenizing raw data, converting strings to ids, and splitting the data into + training and validation.
+ :param devel_raw: list of strings, list of raw training texts + :param devel_target: + :param test_raw: list of strings, list of raw test texts + :param lang: list, list of languages contained in the dataset + """ + self.lang = lang + self.devel_raw = devel_raw + self.devel_target = devel_target + self.test_raw = test_raw + + def index(self, pretrained_vocabulary, analyzer, vocabulary): + self.word2index = dict(vocabulary) + known_words = set(self.word2index.keys()) + if pretrained_vocabulary is not None: + known_words.update(pretrained_vocabulary) + + self.word2index['UNKTOKEN'] = len(self.word2index) + self.word2index['PADTOKEN'] = len(self.word2index) + self.unk_index = self.word2index['UNKTOKEN'] + self.pad_index = self.word2index['PADTOKEN'] + + # index documents and keep track of test terms outside the development vocabulary that are in Muse (if available) + self.out_of_vocabulary = dict() + self.devel_index = index(self.devel_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) + self.test_index = index(self.test_raw, self.word2index, known_words, analyzer, self.unk_index, self.out_of_vocabulary) + + self.vocabsize = len(self.word2index) + len(self.out_of_vocabulary) + + print(f'[indexing complete for lang {self.lang}] vocabulary-size={self.vocabsize}') + + def get_pad_index(self): + return self.pad_index + + def train_val_split(self, val_prop, max_val, seed): + devel = self.devel_index + target = self.devel_target + devel_raw = self.devel_raw + + val_size = int(min(len(devel) * val_prop, max_val)) + + self.train_index, self.val_index, self.train_target, self.val_target, self.train_raw, self.val_raw = \ + train_test_split( + devel, target, devel_raw, test_size=val_size, random_state=seed, shuffle=True) + + print(f'split lang {self.lang}: train={len(self.train_index)} val={len(self.val_index)} test={len(self.test_index)}') + + def get_word_list(self): + def extract_word_list(word2index): + return [w for w, i in sorted(word2index.items(), key=lambda x: x[1])] + + word_list = extract_word_list(self.word2index) + word_list += extract_word_list(self.out_of_vocabulary) + return word_list + + def compose_embedding_matrix(self, pretrained, supervised, Xtr=None, Ytr=None): + print(f'[generating embedding matrix for lang {self.lang}]') + + self.wce_range = None + embedding_parts = [] + + if pretrained is not None: + print('\t[pretrained-matrix]') + embedding_parts.append(pretrained) + del pretrained + + if supervised: + print('\t[supervised-matrix]') + F = supervised_embeddings_tfidf(Xtr, Ytr) + num_missing_rows = self.vocabsize - F.shape[0] + F = np.vstack((F, np.zeros(shape=(num_missing_rows, F.shape[1])))) + F = torch.from_numpy(F).float() + + offset = 0 + if embedding_parts: + offset = embedding_parts[0].shape[1] + self.wce_range = [offset, offset + F.shape[1]] + embedding_parts.append(F) + + self.embedding_matrix = torch.cat(embedding_parts, dim=1) + + print(f'[embedding matrix for lang {self.lang} has shape {self.embedding_matrix.shape}]') + + +def index(data, vocab, known_words, analyzer, unk_index, out_of_vocabulary): + """ + Index (i.e., replaces word strings with numerical indexes) a list of string documents + :param data: list of string documents + :param vocab: a fixed mapping [str]->[int] of words to indexes + :param known_words: a set of known words (e.g., words that, despite not being included in the vocab, can be retained + because they are anyway contained in a pre-trained embedding set that we know in advance) + :param analyzer: the 
preprocessor in charge of transforming the document string into a chain of string words + :param unk_index: the index of the 'unknown token', i.e., a symbol that characterizes all words that we cannot keep + :param out_of_vocabulary: an incremental mapping [str]->[int] of words to indexes that will index all those words that + are not in the original vocab but that are in the known_words + :return: + """ + indexes=[] + vocabsize = len(vocab) + unk_count = 0 + knw_count = 0 + out_count = 0 + pbar = tqdm(data, desc=f'indexing') + for text in pbar: + words = analyzer(text) + index = [] + for word in words: + if word in vocab: + idx = vocab[word] + else: + if word in known_words: + if word not in out_of_vocabulary: + out_of_vocabulary[word] = vocabsize+len(out_of_vocabulary) + idx = out_of_vocabulary[word] + out_count += 1 + else: + idx = unk_index + unk_count += 1 + index.append(idx) + indexes.append(index) + knw_count += len(index) + # pbar.set_description(f'[unk = {unk_count}/{knw_count}={(100.*unk_count/knw_count):.2f}%]' + # f'[out = {out_count}/{knw_count}={(100.*out_count/knw_count):.2f}%]') + return indexes diff --git a/refactor/util/embeddings_manager.py b/refactor/util/embeddings_manager.py new file mode 100644 index 0000000..c0aca54 --- /dev/null +++ b/refactor/util/embeddings_manager.py @@ -0,0 +1,102 @@ +from torchtext.vocab import Vectors +import torch +from abc import ABC, abstractmethod +import numpy as np +from util.SIF_embed import remove_pc + + +class PretrainedEmbeddings(ABC): + + def __init__(self): + super().__init__() + + @abstractmethod + def vocabulary(self): pass + + @abstractmethod + def dim(self): pass + + @classmethod + def reindex(cls, words, word2index): + if isinstance(words, dict): + words = list(zip(*sorted(words.items(), key=lambda x: x[1])))[0] + + source_idx, target_idx = [], [] + for i, word in enumerate(words): + if word not in word2index: + continue + j = word2index[word] + source_idx.append(i) + target_idx.append(j) + source_idx = np.asarray(source_idx) + target_idx = np.asarray(target_idx) + return source_idx, target_idx + + +class MuseLoader: + def __init__(self, langs, cache): + self.langs = langs + self.lEmbed = {} + self.lExtracted = {} + for lang in self.langs: + print(f'Loading vectors for {lang}...') + self.lEmbed[lang] = Vectors(f'wiki.multi.{lang}.vec', cache) + + def dim(self): + return self.lEmbed[list(self.lEmbed.keys())[0]].dim + + def vocabulary(self): + return {lang: set(self.lEmbed[lang].stoi.keys()) for lang in self.langs} + + def extract(self, lVoc): + """ + Reindex pretrained loaded embedding in order to match indexes assigned by scikit vectorizer. 
Such indexes + are consistent with those used by Word Class Embeddings (since we deploy the same vectorizer) + :param lVoc: dict {lang : {word : id}} + :return: torch embedding matrix of extracted embeddings i.e., words in lVoc + """ + for lang, words in lVoc.items(): + print(f'Extracting words for lang {lang}...') + # words = list(zip(*sorted(lVoc[lang].items(), key=lambda x: x[1])))[0] + source_id, target_id = PretrainedEmbeddings.reindex(words, self.lEmbed[lang].stoi) + extraction = torch.zeros((len(words), self.dim())) + extraction[source_id] = self.lEmbed[lang].vectors[target_id] + self.lExtracted[lang] = extraction + return self.lExtracted + + def get_lEmbeddings(self): + return {lang: self.lEmbed[lang].vectors for lang in self.langs} + + +def XdotM(X, M, sif): + E = X.dot(M) + if sif: + E = remove_pc(E, npc=1) + return E + + +def wce_matrix(X, Y): + wce = supervised_embeddings_tfidf(X, Y) + wce = zscores(wce, axis=0) + return wce + + +def supervised_embeddings_tfidf(X, Y): + tfidf_norm = X.sum(axis=0) + tfidf_norm[tfidf_norm == 0] = 1 + F = (X.T).dot(Y) / tfidf_norm.T + return F + + +def zscores(X, axis=0): + """ + scipy.stats.zscores does not avoid division by 0, which can indeed occur + :param X: + :param axis: + :return: + """ + std = np.clip(np.std(X, ddof=1, axis=axis), 1e-5, None) + mean = np.mean(X, axis=axis) + return (X - mean) / std + + diff --git a/refactor/util/evaluation.py b/refactor/util/evaluation.py new file mode 100644 index 0000000..03c1792 --- /dev/null +++ b/refactor/util/evaluation.py @@ -0,0 +1,19 @@ +from joblib import Parallel, delayed +from util.metrics import * +import numpy as np + + +def evaluation_metrics(y, y_): + if len(y.shape) == len(y_.shape) == 1 and len(np.unique(y)) > 2: # single-label + raise NotImplementedError() # return f1_score(y,y_,average='macro'), f1_score(y,y_,average='micro') + else: # the metrics I implemented assume multiclass multilabel classification as binary classifiers + return macroF1(y, y_), microF1(y, y_), macroK(y, y_), microK(y, y_) + + +def evaluate(ly_true, ly_pred, metrics=evaluation_metrics, n_jobs=-1): + if n_jobs == 1: + return {lang: metrics(ly_true[lang], ly_pred[lang]) for lang in ly_true.keys()} + else: + langs = list(ly_true.keys()) + evals = Parallel(n_jobs=n_jobs)(delayed(metrics)(ly_true[lang], ly_pred[lang]) for lang in langs) + return {lang: evals[i] for i, lang in enumerate(langs)} diff --git a/refactor/util/file.py b/refactor/util/file.py new file mode 100644 index 0000000..a3d0a3a --- /dev/null +++ b/refactor/util/file.py @@ -0,0 +1,44 @@ +from os import listdir, makedirs +from os.path import isdir, isfile, join, exists, dirname +#from sklearn.externals.six.moves import urllib +import urllib +from pathlib import Path + + +def download_file(url, archive_filename): + def progress(blocknum, bs, size): + total_sz_mb = '%.2f MB' % (size / 1e6) + current_sz_mb = '%.2f MB' % ((blocknum * bs) / 1e6) + print('\rdownloaded %s / %s' % (current_sz_mb, total_sz_mb), end='') + print("Downloading %s" % url) + urllib.request.urlretrieve(url, filename=archive_filename, reporthook=progress) + print("") + +def download_file_if_not_exists(url, archive_path): + if exists(archive_path): return + makedirs_if_not_exist(dirname(archive_path)) + download_file(url,archive_path) + + +def ls(dir, typecheck): + el = [f for f in listdir(dir) if typecheck(join(dir, f))] + el.sort() + return el + +def list_dirs(dir): + return ls(dir, typecheck=isdir) + +def list_files(dir): + return ls(dir, typecheck=isfile) + +def 
makedirs_if_not_exist(path): + if not exists(path): makedirs(path) + +def create_if_not_exist(path): + if not exists(path): makedirs(path) + +def get_parent_name(path): + return Path(path).parent + +def get_file_name(path): + return Path(path).name diff --git a/refactor/util/metrics.py b/refactor/util/metrics.py new file mode 100644 index 0000000..7a6079e --- /dev/null +++ b/refactor/util/metrics.py @@ -0,0 +1,152 @@ +import numpy as np + + +class ContTable: + def __init__(self, tp=0, tn=0, fp=0, fn=0): + self.tp = tp + self.tn = tn + self.fp = fp + self.fn = fn + + def get_d(self): return self.tp + self.tn + self.fp + self.fn + + def get_c(self): return self.tp + self.fn + + def get_not_c(self): return self.tn + self.fp + + def get_f(self): return self.tp + self.fp + + def get_not_f(self): return self.tn + self.fn + + def p_c(self): return (1.0*self.get_c())/self.get_d() + + def p_not_c(self): return 1.0-self.p_c() + + def p_f(self): return (1.0*self.get_f())/self.get_d() + + def p_not_f(self): return 1.0-self.p_f() + + def p_tp(self): return (1.0*self.tp) / self.get_d() + + def p_tn(self): return (1.0*self.tn) / self.get_d() + + def p_fp(self): return (1.0*self.fp) / self.get_d() + + def p_fn(self): return (1.0*self.fn) / self.get_d() + + def tpr(self): + c = 1.0*self.get_c() + return self.tp / c if c > 0.0 else 0.0 + + def fpr(self): + _c = 1.0*self.get_not_c() + return self.fp / _c if _c > 0.0 else 0.0 + + def __add__(self, other): + return ContTable(tp=self.tp + other.tp, tn=self.tn + other.tn, fp=self.fp + other.fp, fn=self.fn + other.fn) + + +def accuracy(cell): + return (cell.tp + cell.tn)*1.0 / (cell.tp + cell.fp + cell.fn + cell.tn) + + +def f1(cell): + num = 2.0 * cell.tp + den = 2.0 * cell.tp + cell.fp + cell.fn + if den > 0: + return num / den + # we define f1 to be 1 if den==0 since the classifier has correctly classified all instances as negative + return 1.0 + + +def K(cell): + specificity, recall = 0., 0. + + AN = cell.tn + cell.fp + if AN != 0: + specificity = cell.tn*1. / AN + + AP = cell.tp + cell.fn + if AP != 0: + recall = cell.tp*1. / AP + + if AP == 0: + return 2. * specificity - 1. + elif AN == 0: + return 2. * recall - 1. + else: + return specificity + recall - 1. + + +# if the classifier is single class, then the prediction is a vector of shape=(nD,) which causes issues when compared +# to the true labels (of shape=(nD,1)). This method increases the dimensions of the predictions. +def __check_consistency_and_adapt(true_labels, predictions): + if predictions.ndim == 1: + return __check_consistency_and_adapt(true_labels, np.expand_dims(predictions, axis=1)) + if true_labels.ndim == 1: + return __check_consistency_and_adapt(np.expand_dims(true_labels, axis=1), predictions) + if true_labels.shape != predictions.shape: + raise ValueError("True and predicted label matrices shapes are inconsistent %s %s." + % (true_labels.shape, predictions.shape)) + _, nC = true_labels.shape + return true_labels, predictions, nC + + +# computes the (soft) contingency table where tp, fp, fn, and tn are the cumulative masses of the posterior +# probabilities with respect to the true binary labels +# true_labels and posterior_probabilities are two vectors of shape (number_documents,) +def soft_single_metric_statistics(true_labels, posterior_probabilities): + assert len(true_labels) == len(posterior_probabilities), "Format not consistent between true and predicted labels." + tp = np.sum(posterior_probabilities[true_labels == 1]) + fn = np.sum(1.
- posterior_probabilities[true_labels == 1]) + fp = np.sum(posterior_probabilities[true_labels == 0]) + tn = np.sum(1. - posterior_probabilities[true_labels == 0]) + return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) + + +# computes the (hard) counters tp, fp, fn, and tn from true and predicted vectors of hard decisions +# true_labels and predicted_labels are two vectors of shape (number_documents,) +def hard_single_metric_statistics(true_labels, predicted_labels): + assert len(true_labels) == len(predicted_labels), "Format not consistent between true and predicted labels." + nd = len(true_labels) + tp = np.sum(predicted_labels[true_labels == 1]) + fp = np.sum(predicted_labels[true_labels == 0]) + fn = np.sum(true_labels[predicted_labels == 0]) + tn = nd - (tp+fp+fn) + return ContTable(tp=tp, tn=tn, fp=fp, fn=fn) + + +def macro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): + true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) + return np.mean([metric(metric_statistics(true_labels[:, c], predicted_labels[:, c])) for c in range(nC)]) + + +def micro_average(true_labels, predicted_labels, metric, metric_statistics=hard_single_metric_statistics): + true_labels, predicted_labels, nC = __check_consistency_and_adapt(true_labels, predicted_labels) + + accum = ContTable() + for c in range(nC): + other = metric_statistics(true_labels[:, c], predicted_labels[:, c]) + accum = accum + other + + return metric(accum) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def macroF1(true_labels, predicted_labels): + return macro_average(true_labels, predicted_labels, f1) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def microF1(true_labels, predicted_labels): + return micro_average(true_labels, predicted_labels, f1) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def macroK(true_labels, predicted_labels): + return macro_average(true_labels, predicted_labels, K) + + +# true_labels and predicted_labels are two matrices in sklearn.preprocessing.MultiLabelBinarizer format +def microK(true_labels, predicted_labels): + return micro_average(true_labels, predicted_labels, K) diff --git a/refactor/view_generators.py b/refactor/view_generators.py new file mode 100644 index 0000000..0ea3323 --- /dev/null +++ b/refactor/view_generators.py @@ -0,0 +1,258 @@ +""" +This module contains the view generators that take care of computing the view-specific document embeddings: + +- VanillaFunGen (-X): casts document representations encoded via TFIDF into posterior probabilities by means of SVMs. + +- WordClassGen (-W): generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. + +- MuseGen (-M): generates document representation via MUSE embeddings (Fasttext multilingual word embeddings). + Document embeddings are obtained via weighted sum of document's constituent embeddings. + +- RecurrentGen (-G): generates document embeddings by means of Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.). + Output dimension is (n_docs, 512). + +- BertGen (-B): generates document embeddings via a multilingual BERT (mBERT) model.
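+
+A minimal usage sketch (hypothetical input names: `lX` is assumed to be a dict {lang: list of raw documents}
+and `ly` a dict {lang: binary label matrix}; the scikit-learn SVC below is just one possible first-tier learner):
+
+    from sklearn.svm import SVC
+
+    posteriors = VanillaFunGen(base_learner=SVC(probability=True)).fit_transform(lX, ly)
+    muse_view = MuseGen(muse_dir='../embeddings').fit_transform(lX, ly)
+    wce_view = WordClassGen().fit_transform(lX, ly)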
+""" +from abc import ABC, abstractmethod +from models.learners import * +from util.embeddings_manager import MuseLoader, XdotM, wce_matrix +from util.common import TfidfVectorizerMultilingual, _normalize +from models.pl_gru import RecurrentModel +from models.pl_bert import BertModel +from models.lstm_class import RNNMultilingualClassifier +from pytorch_lightning import Trainer +from data.datamodule import GfunDataModule, BertDataModule +from pytorch_lightning.loggers import TensorBoardLogger +import torch + + +class ViewGen(ABC): + @abstractmethod + def fit(self, lX, ly): + pass + + @abstractmethod + def transform(self, lX): + pass + + @abstractmethod + def fit_transform(self, lX, ly): + pass + + +class VanillaFunGen(ViewGen): + def __init__(self, base_learner, n_jobs=-1): + """ + Original funnelling architecture proposed by Moreo, Esuli and Sebastiani in DOI: https://doi.org/10.1145/3326065 + :param base_learner: naive monolingual learners to be deployed as first-tier learners. Should be able to + return posterior probabilities. + :param n_jobs: integer, number of concurrent workers + """ + super().__init__() + self.learners = base_learner + self.n_jobs = n_jobs + self.doc_projector = NaivePolylingualClassifier(self.learners) + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, lY): + lX = self.vectorizer.fit_transform(lX) + self.doc_projector.fit(lX, lY) + return self + + def transform(self, lX): + lX = self.vectorizer.transform(lX) + lZ = self.doc_projector.predict_proba(lX) + return lZ + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class MuseGen(ViewGen): + def __init__(self, muse_dir='../embeddings', n_jobs=-1): + """ + generates document representation via MUSE embeddings (Fasttext multilingual word + embeddings). Document embeddings are obtained via weighted sum of document's constituent embeddings. + :param muse_dir: string, path to folder containing muse embeddings + :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.muse_dir = muse_dir + self.n_jobs = n_jobs + self.langs = None + self.lMuse = None + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, ly): + self.vectorizer.fit(lX) + self.langs = sorted(lX.keys()) + self.lMuse = MuseLoader(langs=self.langs, cache=self.muse_dir) + lVoc = self.vectorizer.vocabulary() + self.lMuse = self.lMuse.extract(lVoc) # overwriting lMuse with dict {lang : embed_matrix} with only known words + # TODO: featureweight.fit + return self + + def transform(self, lX): + lX = self.vectorizer.transform(lX) + XdotMUSE = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], self.lMuse[lang], sif=True) for lang in self.langs) + lZ = {lang: XdotMUSE[i] for i, lang in enumerate(self.langs)} + lZ = _normalize(lZ, l2=True) + return lZ + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class WordClassGen(ViewGen): + + def __init__(self, n_jobs=-1): + """ + generates document representation via Word-Class-Embeddings. + Document embeddings are obtained via weighted sum of document's constituent embeddings. 
+ :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.n_jobs = n_jobs + self.langs = None + self.lWce = None + self.vectorizer = TfidfVectorizerMultilingual(sublinear_tf=True, use_idf=True) + + def fit(self, lX, ly): + lX = self.vectorizer.fit_transform(lX) + self.langs = sorted(lX.keys()) + wce = Parallel(n_jobs=self.n_jobs)( + delayed(wce_matrix)(lX[lang], ly[lang]) for lang in self.langs) + self.lWce = {l: wce[i] for i, l in enumerate(self.langs)} + # TODO: featureweight.fit() + return self + + def transform(self, lX): + lX = self.vectorizer.transform(lX) + XdotWce = Parallel(n_jobs=self.n_jobs)( + delayed(XdotM)(lX[lang], self.lWce[lang], sif=True) for lang in self.langs) + lWce = {l: XdotWce[i] for i, l in enumerate(self.langs)} + lWce = _normalize(lWce, l2=True) + return lWce + + def fit_transform(self, lX, ly): + return self.fit(lX, ly).transform(lX) + + +class RecurrentGen(ViewGen): + # TODO: save model https://forums.pytorchlightning.ai/t/how-to-save-hparams-when-not-provided-as-argument-apparently-assigning-to-hparams-is-not-recomended/339/5 + # Problem: we are passing lPretrained to init the RecurrentModel -> incredibly slow at saving (checkpoint). + # If we do not save it, it is impossible to init RecurrentModel by calling RecurrentModel.load_from_checkpoint() + def __init__(self, multilingualIndex, pretrained_embeddings, wce, batch_size=512, gpus=0, n_jobs=-1, stored_path=None): + """ + Generates document embeddings by means of Gated Recurrent Units. The model can be + initialized with different (multilingual/aligned) word representations (e.g., MUSE, WCE, etc.). + Output dimension is (n_docs, 512). + :param multilingualIndex: MultilingualIndex containing the indexed data and the embedding matrices + :param pretrained_embeddings: loader exposing the pretrained (aligned) word embeddings (e.g., a MuseLoader instance) + :param wce: bool, whether to deploy Word-Class Embeddings or not + :param gpus: gpus to be used, as expected by the pytorch-lightning Trainer + :param n_jobs: int, number of concurrent workers + """ + super().__init__() + self.multilingualIndex = multilingualIndex + self.langs = multilingualIndex.langs + self.batch_size = batch_size + self.gpus = gpus + self.n_jobs = n_jobs + self.stored_path = stored_path + + # EMBEDDINGS to be deployed + self.pretrained = pretrained_embeddings + self.wce = wce + + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + self.multilingualIndex.embedding_matrices(self.pretrained, supervised=self.wce) + self.model = self._init_model() + # hp_tuning with Tensorboard: check https://www.tensorflow.org/tensorboard/hyperparameter_tuning_with_hparams + # however, default_hp_metric is set to False at the moment! + self.logger = TensorBoardLogger(save_dir='tb_logs', name='gfun_rnn_dev', default_hp_metric=False) + + def _init_model(self): + if self.stored_path: + lpretrained = self.multilingualIndex.l_embeddings() + return RecurrentModel.load_from_checkpoint(self.stored_path, lPretrained=lpretrained) + else: + lpretrained = self.multilingualIndex.l_embeddings() + langs = self.multilingualIndex.langs + output_size = self.multilingualIndex.get_target_dim() + hidden_size = 512 + lvocab_size = self.multilingualIndex.l_vocabsize() + learnable_length = 0 + return RecurrentModel( + lPretrained=lpretrained, + langs=langs, + output_size=output_size, + hidden_size=hidden_size, + lVocab_size=lvocab_size, + learnable_length=learnable_length, + drop_embedding_range=self.multilingualIndex.sup_range, + drop_embedding_prop=0.5 + ) + + def fit(self, lX, ly): + """ + lX and ly are not directly used here; we rather get the data from the multilingual index used in the instantiation + of the Dataset object (RecurrentDataset) in the GfunDataModule class.
+ :param lX: + :param ly: + :return: + """ + recurrentDataModule = GfunDataModule(self.multilingualIndex, batchsize=self.batch_size) + trainer = Trainer(gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger, max_epochs=50) + + # vanilla_torch_model = torch.load( + # '/home/andreapdr/funneling_pdr/checkpoint/gru_viewgen_-jrc_doclist_1958-2005vs2006_all_top300_noparallel_processed_run0.pickle') + # self.model.linear0 = vanilla_torch_model.linear0 + # self.model.linear1 = vanilla_torch_model.linear1 + # self.model.linear2 = vanilla_torch_model.linear2 + # self.model.rnn = vanilla_torch_model.rnn + + trainer.fit(self.model, datamodule=recurrentDataModule) + trainer.test(self.model, datamodule=recurrentDataModule) + return self + + def transform(self, lX): + pass + + def fit_transform(self, lX, ly): + pass + + +class BertGen(ViewGen): + + def __init__(self, multilingualIndex, batch_size=128, gpus=0, n_jobs=-1, stored_path=None): + super().__init__() + self.multilingualIndex = multilingualIndex + self.gpus = gpus + self.batch_size = batch_size + self.n_jobs = n_jobs + self.stored_path = stored_path + self.logger = TensorBoardLogger(save_dir='tb_logs', name='bert_dev', default_hp_metric=False) + self.model = self._init_model() + self.multilingualIndex.train_val_split(val_prop=0.2, max_val=2000, seed=1) + + def _init_model(self): + output_size = self.multilingualIndex.get_target_dim() + return BertModel(output_size=output_size, stored_path=self.stored_path) + + def fit(self, lX, ly): + bertDataModule = BertDataModule(self.multilingualIndex, batchsize=self.batch_size, max_len=512) + trainer = Trainer(default_root_dir='checkpoints/bert/', gradient_clip_val=1e-1, gpus=self.gpus, logger=self.logger) + trainer.fit(self.model, bertDataModule) + # trainer.test(self.model, bertDataModule) + pass + + def transform(self, lX): + pass + + def fit_transform(self, lX, ly): + pass + + diff --git a/test.py b/test.py deleted file mode 100644 index 3fbc4f8..0000000 --- a/test.py +++ /dev/null @@ -1 +0,0 @@ -# preparing refactor
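For reference, a minimal sketch of how the evaluation helpers introduced in refactor/util/evaluation.py are meant to be consumed; ly_true and ly_pred are hypothetical placeholders for per-language binary label matrices (sklearn MultiLabelBinarizer format) and hard predictions of the same shape:

    from util.evaluation import evaluate

    # ly_true / ly_pred: dicts {lang: np.ndarray of shape (n_docs, n_classes)} with 0/1 entries
    l_eval = evaluate(ly_true, ly_pred, n_jobs=-1)
    for lang, (macro_f1, micro_f1, macro_k, micro_k) in l_eval.items():
        print(f'{lang}: macro-F1={macro_f1:.3f} micro-F1={micro_f1:.3f} macro-K={macro_k:.3f} micro-K={micro_k:.3f}')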