gFun/refactor/data/dataset_builder.py

import itertools
import pickle
import re
from os.path import exists

import numpy as np
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from scipy.sparse import issparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm

from data.languages import NLTK_LANGMAP, RCV2_LANGS_WITH_NLTK_STEMMING
from data.reader.jrcacquis_reader import *
from data.reader.rcv_reader import fetch_RCV1, fetch_RCV2
from data.text_preprocessor import NLTKStemTokenizer, preprocess_documents


class MultilingualDataset:
    """
    A multilingual dataset is a dictionary of training and test documents indexed by language code.
    Train and test sets are represented as tuples of the type (X,Y,ids), where X is a matrix representation of the
    documents (e.g., a document-by-term sparse csr_matrix), Y is a document-by-label binary np.array indicating the
    labels of each document, and ids is a list of document-identifiers from the original collection.
    """

    def __init__(self):
        self.dataset_name = ""
        self.multiling_dataset = {}

    def add(self, lang, Xtr, Ytr, Xte, Yte, tr_ids=None, te_ids=None):
        self.multiling_dataset[lang] = ((Xtr, Ytr, tr_ids), (Xte, Yte, te_ids))

    def save(self, file):
        self.sort_indexes()
        pickle.dump(self, open(file, 'wb'), pickle.HIGHEST_PROTOCOL)
        return self

    def __getitem__(self, item):
        if item in self.langs():
            return self.multiling_dataset[item]
        return None

    @classmethod
    def load(cls, file):
        data = pickle.load(open(file, 'rb'))
        data.sort_indexes()
        return data

    @classmethod
    def load_ids(cls, file):
        data = pickle.load(open(file, 'rb'))
        tr_ids = {lang:tr_ids for (lang,((_,_,tr_ids), (_,_,_))) in data.multiling_dataset.items()}
        te_ids = {lang: te_ids for (lang, ((_, _, _), (_, _, te_ids))) in data.multiling_dataset.items()}
        return tr_ids, te_ids

    def sort_indexes(self):
        for (lang, ((Xtr,_,_),(Xte,_,_))) in self.multiling_dataset.items():
            if issparse(Xtr): Xtr.sort_indices()
            if issparse(Xte): Xte.sort_indices()

    def set_view(self, categories=None, languages=None):
        if categories is not None:
            if isinstance(categories, int):
                categories = np.array([categories])
            elif isinstance(categories, list):
                categories = np.array(categories)
            self.categories_view = categories
        if languages is not None:
            self.languages_view = languages

    def training(self, mask_numbers=False, target_as_csr=False):
        return self.lXtr(mask_numbers), self.lYtr(as_csr=target_as_csr)

    def test(self, mask_numbers=False, target_as_csr=False):
        return self.lXte(mask_numbers), self.lYte(as_csr=target_as_csr)

    def lXtr(self, mask_numbers=False):
        proc = lambda x:_mask_numbers(x) if mask_numbers else x
        # return {lang: Xtr for (lang, ((Xtr, _, _), _)) in self.multiling_dataset.items() if lang in self.langs()}
        return {lang:proc(Xtr) for (lang, ((Xtr,_,_),_)) in self.multiling_dataset.items() if lang in self.langs()}

    def lXte(self, mask_numbers=False):
        proc = lambda x: _mask_numbers(x) if mask_numbers else x
        # return {lang: Xte for (lang, (_, (Xte, _, _))) in self.multiling_dataset.items() if lang in self.langs()}
        return {lang:proc(Xte) for (lang, (_,(Xte,_,_))) in self.multiling_dataset.items() if lang in self.langs()}

    def lYtr(self, as_csr=False):
        lY = {lang:self.cat_view(Ytr) for (lang, ((_,Ytr,_),_)) in self.multiling_dataset.items() if lang in self.langs()}
        if as_csr:
            lY = {l:csr_matrix(Y) for l,Y in lY.items()}
        return lY

    def lYte(self, as_csr=False):
        lY = {lang:self.cat_view(Yte) for (lang, (_,(_,Yte,_))) in self.multiling_dataset.items() if lang in self.langs()}
        if as_csr:
            lY = {l:csr_matrix(Y) for l,Y in lY.items()}
        return lY

    def cat_view(self, Y):
        if hasattr(self, 'categories_view'):
            return Y[:,self.categories_view]
        else:
            return Y

    def langs(self):
        if hasattr(self, 'languages_view'):
            langs = self.languages_view
        else:
            langs = sorted(self.multiling_dataset.keys())
        return langs

    def num_categories(self):
        return self.lYtr()[self.langs()[0]].shape[1]

    def show_dimensions(self):
        def shape(X):
            return X.shape if hasattr(X, 'shape') else len(X)
        for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
            if lang not in self.langs(): continue
            print("Lang {}, Xtr={}, ytr={}, Xte={}, yte={}".format(lang, shape(Xtr), self.cat_view(Ytr).shape, shape(Xte), self.cat_view(Yte).shape))

    def show_category_prevalences(self):
        nC = self.num_categories()
        accum_tr = np.zeros(nC, dtype=np.int)
        accum_te = np.zeros(nC, dtype=np.int)
        in_langs = np.zeros(nC, dtype=np.int)   # count languages with at least one positive example (per category)
        for (lang, ((Xtr, Ytr, IDtr), (Xte, Yte, IDte))) in self.multiling_dataset.items():
            if lang not in self.langs(): continue
            prev_train = np.sum(self.cat_view(Ytr), axis=0)
            prev_test = np.sum(self.cat_view(Yte), axis=0)
            accum_tr += prev_train
            accum_te += prev_test
            in_langs += (prev_train>0)*1
            print(lang+'-train', prev_train)
            print(lang+'-test', prev_test)
        print('all-train', accum_tr)
        print('all-test', accum_te)

        return accum_tr, accum_te, in_langs

    def set_labels(self, labels):
        self.labels = labels

def _mask_numbers(data):
    mask_moredigit = re.compile(r'\s[\+-]?\d{5,}([\.,]\d*)*\b')
    mask_4digit = re.compile(r'\s[\+-]?\d{4}([\.,]\d*)*\b')
    mask_3digit = re.compile(r'\s[\+-]?\d{3}([\.,]\d*)*\b')
    mask_2digit = re.compile(r'\s[\+-]?\d{2}([\.,]\d*)*\b')
    mask_1digit = re.compile(r'\s[\+-]?\d{1}([\.,]\d*)*\b')
    masked = []
    for text in tqdm(data, desc='masking numbers'):
        text = ' ' + text
        text = mask_moredigit.sub(' MoreDigitMask', text)
        text = mask_4digit.sub(' FourDigitMask', text)
        text = mask_3digit.sub(' ThreeDigitMask', text)
        text = mask_2digit.sub(' TwoDigitMask', text)
        text = mask_1digit.sub(' OneDigitMask', text)
        masked.append(text.replace('.','').replace(',','').strip())
    return masked


# ----------------------------------------------------------------------------------------------------------------------
# Helpers
# ----------------------------------------------------------------------------------------------------------------------
def get_active_labels(doclist):
    cat_list = set()
    for d in doclist:
        cat_list.update(d.categories)
    return list(cat_list)

def filter_by_categories(doclist, keep_categories):
    catset = frozenset(keep_categories)
    for d in doclist:
        d.categories = list(set(d.categories).intersection(catset))

def __years_to_str(years):
    if isinstance(years, list):
        if len(years) > 1:
            return str(years[0])+'-'+str(years[-1])
        return str(years[0])
    return str(years)


# ----------------------------------------------------------------------------------------------------------------------
# Matrix builders
# ----------------------------------------------------------------------------------------------------------------------
def build_independent_matrices(dataset_name, langs, training_docs, test_docs, label_names, wiki_docs=[], preprocess=True):
    """
    Builds the document-by-term weighted matrices for each language. Representations are independent of each other,
    i.e., each language-specific matrix lies in a dedicate feature space.
    :param dataset_name: the name of the dataset (str)
    :param langs: list of languages (str)
    :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param label_names: list of names of labels (str)
    :param wiki_docs: doc-list (optional), if specified, project all wiki docs in the feature spaces built for the languages
    :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
    :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
    by language the processed wikipedia documents in their respective language-specific feature spaces
    """

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    lW = {}

    multilingual_dataset = MultilingualDataset()
    multilingual_dataset.dataset_name = dataset_name
    multilingual_dataset.set_labels(mlb.classes_)
    for lang in langs:
        print("\nprocessing %d training, %d test, %d wiki for language <%s>" %
              (len(training_docs[lang]), len(test_docs[lang]), len(wiki_docs[lang]) if wiki_docs else 0, lang))

        tr_data, tr_labels, IDtr = zip(*training_docs[lang])
        te_data, te_labels, IDte = zip(*test_docs[lang])

        if preprocess:
            tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True,
                                    tokenizer=NLTKStemTokenizer(lang, verbose=True),
                                    stop_words=stopwords.words(NLTK_LANGMAP[lang]))
        else:
            tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)

        Xtr = tfidf.fit_transform(tr_data)
        Xte = tfidf.transform(te_data)
        if wiki_docs:
            lW[lang] = tfidf.transform(wiki_docs[lang])

        Ytr = mlb.transform(tr_labels)
        Yte = mlb.transform(te_labels)

        multilingual_dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    multilingual_dataset.show_dimensions()
    multilingual_dataset.show_category_prevalences()

    if wiki_docs:
        return multilingual_dataset, lW
    else:
        return multilingual_dataset


# creates a MultilingualDataset where matrices shares a single yuxtaposed feature space
def build_juxtaposed_matrices(dataset_name, langs, training_docs, test_docs, label_names, preprocess=True):
    """
    Builds the document-by-term weighted matrices for each language. Representations are not independent of each other,
    since all of them lie on the same yuxtaposed feature space.
    :param dataset_name: the name of the dataset (str)
    :param langs: list of languages (str)
    :param training_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param test_docs: map {lang:doc-list} where each doc is a tuple (text, categories, id)
    :param label_names: list of names of labels (str)
    :param preprocess: whether or not to apply language-specific text preprocessing (stopword removal and stemming)
    :return: a MultilingualDataset. If wiki_docs has been specified, a dictionary lW is also returned, which indexes
    by language the processed wikipedia documents in their respective language-specific feature spaces
    """

    multiling_dataset = MultilingualDataset()
    multiling_dataset.dataset_name = dataset_name

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    multiling_dataset.set_labels(mlb.classes_)

    tr_data_stack = []
    for lang in langs:
        print("\nprocessing %d training and %d test for language <%s>" % (len(training_docs[lang]), len(test_docs[lang]), lang))
        tr_data, tr_labels, tr_ID = zip(*training_docs[lang])
        te_data, te_labels, te_ID = zip(*test_docs[lang])
        if preprocess:
            tr_data = preprocess_documents(tr_data, lang)
            te_data = preprocess_documents(te_data, lang)
        tr_data_stack.extend(tr_data)
        multiling_dataset.add(lang, tr_data, tr_labels, te_data, te_labels, tr_ID, te_ID)

    tfidf = TfidfVectorizer(strip_accents='unicode', min_df=3, sublinear_tf=True)
    tfidf.fit(tr_data_stack)

    for lang in langs:
        print("\nweighting documents for language <%s>" % (lang))
        (tr_data, tr_labels, tr_ID), (te_data, te_labels, te_ID) = multiling_dataset[lang]
        Xtr = tfidf.transform(tr_data)
        Xte = tfidf.transform(te_data)
        Ytr = mlb.transform(tr_labels)
        Yte = mlb.transform(te_labels)
        multiling_dataset.add(lang,Xtr,Ytr,Xte,Yte,tr_ID,te_ID)

    multiling_dataset.show_dimensions()
    return multiling_dataset


# ----------------------------------------------------------------------------------------------------------------------
# Methods to recover the original documents from the MultilingualDataset's ids
# ----------------------------------------------------------------------------------------------------------------------
"""
This method has been added a posteriori, to create document embeddings using the polylingual embeddings of the recent
article 'Word Translation without Parallel Data'; basically, it takes one of the splits and retrieves the RCV documents
from the doc ids and then pickles an object (tr_docs, te_docs, label_names) in the outpath
"""
def retrieve_rcv_documents_from_dataset(datasetpath, rcv1_data_home, rcv2_data_home, outpath):

    tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
    assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
    langs = list(tr_ids.keys())

    print('fetching the datasets')
    rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
    rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])

    filter_by_categories(rcv1_documents, labels_rcv2)
    filter_by_categories(rcv2_documents, labels_rcv1)

    label_names = get_active_labels(rcv1_documents + rcv2_documents)
    print('Active labels in RCV1/2 {}'.format(len(label_names)))

    print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
    print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))

    all_docs = rcv1_documents + rcv2_documents
    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    for lang in langs:
        analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
                                   stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()

        Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in tr_ids[lang]])
        Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in all_docs if d.lang == lang and d.id in te_ids[lang]])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    dataset.save(outpath)

"""
Same thing but for JRC-Acquis
"""
def retrieve_jrc_documents_from_dataset(datasetpath, jrc_data_home, train_years, test_years, cat_policy, most_common_cat, outpath):

    tr_ids, te_ids = MultilingualDataset.load_ids(datasetpath)
    assert tr_ids.keys() == te_ids.keys(), 'inconsistent keys tr vs te'
    langs = list(tr_ids.keys())

    print('fetching the datasets')

    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
                                                 cat_filter=cat_list, cat_threshold=1, parallel=None,
                                                 most_frequent=most_common_cat)
    test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
                                   parallel='force')

    def filter_by_id(doclist, ids):
        ids_set = frozenset(itertools.chain.from_iterable(ids.values()))
        return [x for x in doclist if (x.parallel_id+'__'+x.id) in ids_set]

    training_docs = filter_by_id(training_docs, tr_ids)
    test_docs = filter_by_id(test_docs, te_ids)

    print('jrc: {} train, {} test, {} categories'.format(len(training_docs), len(test_docs), len(label_names)))

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    for lang in langs:
        analyzer = CountVectorizer(strip_accents='unicode', min_df=3,
                                   stop_words=stopwords.words(NLTK_LANGMAP[lang])).build_analyzer()

        Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in training_docs if d.lang == lang])
        Xte,Yte,IDte = zip(*[(d.text,d.categories,d.parallel_id+'__'+d.id) for d in test_docs if d.lang == lang])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, Xtr, Ytr, Xte, Yte, IDtr, IDte)

    dataset.save(outpath)

# ----------------------------------------------------------------------------------------------------------------------
# Dataset Generators
# ----------------------------------------------------------------------------------------------------------------------
def prepare_jrc_datasets(jrc_data_home, wiki_data_home, langs, train_years, test_years, cat_policy, most_common_cat=-1, max_wiki=5000, run=0):
    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample


    """
    Prepare all datasets for JRC-Acquis. The datasets include the "feature-independent" version, the
    "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.
    In all cases, training documents are strictly non-parallel, and test documents are strictly parallel
    :param jrc_data_home: path to the raw JRC-Acquis documents (it will be downloaded if not found), and the path where
    all splits will be generated
    :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
    :param langs: the list of languages to consider (as defined in data/languages.py)
    :param train_years: a list of ints containing the years to be considered as training documents
    :param test_years: a list of ints containing the  years to be considered as test documents
    :param cat_policy: a string indicating which category selection policy to apply. Valid policies are, e.g., "all"
    (select all categories), "broadest" (select only the broadest concepts in the taxonomy), or "leaves" (select the
    leaves concepts in the taxonomy). See inspect_eurovoc from data/reader/jrcacquis_reader.py for more details
    :param most_common_cat: the maximum number of most common categories to consider, or -1 to keep them all
    :param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
    :param run: a numeric label naming the random split (useful to keep track of different runs)
    :return: None
    """

    name = 'JRCacquis'
    run = '_run' + str(run)
    config_name = 'jrc_nltk_' + __years_to_str(train_years) + \
                  'vs' + __years_to_str(test_years) + \
                  '_' + cat_policy + \
                  ('_top' + str(most_common_cat) if most_common_cat!=-1 else '') + \
                  '_noparallel_processed'

    indep_path = join(jrc_data_home, config_name + run + '.pickle')
    upper_path = join(jrc_data_home, config_name + run + '_upper.pickle')
    yuxta_path = join(jrc_data_home, config_name + run + '_yuxtaposed.pickle')
    wiki_path  = join(jrc_data_home, config_name + run + '.wiki.pickle')
    wiki_docs_path = join(jrc_data_home, config_name + '.wiki.raw.pickle')

    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=train_years,
                                                 cat_filter=cat_list, cat_threshold=1, parallel=None,
                                                 most_frequent=most_common_cat)
    test_docs, _ = fetch_jrcacquis(langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names,
                                   parallel='force')

    print('Generating feature-independent dataset...')
    training_docs_no_parallel = random_sampling_avoiding_parallel(training_docs)

    def _group_by_lang(doc_list, langs):
        return {lang: [(d.text, d.categories, d.parallel_id + '__' + d.id) for d in doc_list if d.lang == lang]
                for lang in langs}

    training_docs = _group_by_lang(training_docs, langs)
    training_docs_no_parallel = _group_by_lang(training_docs_no_parallel, langs)
    test_docs = _group_by_lang(test_docs, langs)
    if not exists(indep_path):
        wiki_docs=None
        if max_wiki>0:
            if not exists(wiki_docs_path):
                wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
                wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
                pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
            else:
                wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
            wiki_docs = random_wiki_sample(wiki_docs, max_wiki)

        if wiki_docs:
            lang_data, wiki_docs = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names, wiki_docs)
            pickle.dump(wiki_docs, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
        else:
            lang_data = build_independent_matrices(name, langs, training_docs_no_parallel, test_docs, label_names)

        lang_data.save(indep_path)

    print('Generating upper-bound (English-only) dataset...')
    if not exists(upper_path):
        training_docs_eng_only = {'en':training_docs['en']}
        test_docs_eng_only = {'en':test_docs['en']}
        build_independent_matrices(name, ['en'], training_docs_eng_only, test_docs_eng_only, label_names).save(upper_path)

    print('Generating yuxtaposed dataset...')
    if not exists(yuxta_path):
        build_juxtaposed_matrices(name, langs, training_docs_no_parallel, test_docs, label_names).save(yuxta_path)


def prepare_rcv_datasets(outpath, rcv1_data_home, rcv2_data_home, wiki_data_home, langs,
                         train_for_lang=1000, test_for_lang=1000, max_wiki=5000, preprocess=True, run=0):
    from data.reader.wikipedia_tools import fetch_wikipedia_multilingual, random_wiki_sample
    """
        Prepare all datasets for RCV1/RCV2. The datasets include the "feature-independent" version, the
        "feature-yuxtaposed" version, the monolingual version for the UpperBound, and the processed wikipedia matrices.

        :param outpath: path where all splits will be dumped
        :param rcv1_data_home: path to the RCV1-v2 dataset (English only)
        :param rcv2_data_home: path to the RCV2 dataset (all languages other than English)
        :param wiki_data_home: path to the wikipedia dump (see data/readers/wikipedia_tools.py)
        :param langs: the list of languages to consider (as defined in data/languages.py)
        :param train_for_lang: maximum number of training documents per language
        :param test_for_lang:  maximum number of test documents per language
        :param max_wiki: the maximum number of wikipedia documents to consider (default 5000)
        :param preprocess: whether or not to apply language-specific preprocessing (stopwords removal and stemming)
        :param run: a numeric label naming the random split (useful to keep track of different runs)
        :return: None
        """

    assert 'en' in langs, 'English is not in requested languages, but is needed for some datasets'
    assert len(langs)>1, 'the multilingual dataset cannot be built with only one dataset'
    assert not preprocess or set(langs).issubset(set(RCV2_LANGS_WITH_NLTK_STEMMING+['en'])), \
        "languages not in RCV1-v2/RCV2 scope or not in valid for NLTK's processing"

    name = 'RCV1/2'
    run = '_run' + str(run)
    config_name = 'rcv1-2_nltk_trByLang'+str(train_for_lang)+'_teByLang'+str(test_for_lang)+\
                  ('_processed' if preprocess else '_raw')

    indep_path = join(outpath, config_name + run + '.pickle')
    upper_path = join(outpath, config_name + run +'_upper.pickle')
    yuxta_path = join(outpath, config_name + run +'_yuxtaposed.pickle')
    wiki_path = join(outpath, config_name + run + '.wiki.pickle')
    wiki_docs_path = join(outpath, config_name + '.wiki.raw.pickle')

    print('fetching the datasets')
    rcv1_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
    rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l!='en'])
    filter_by_categories(rcv1_documents, labels_rcv2)
    filter_by_categories(rcv2_documents, labels_rcv1)

    label_names = get_active_labels(rcv1_documents+rcv2_documents)
    print('Active labels in RCV1/2 {}'.format(len(label_names)))

    print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_documents), 0, len(label_names)))
    print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))

    lang_docs = {lang: [d for d in rcv1_documents + rcv2_documents if d.lang == lang] for lang in langs}

    # for the upper bound there are no parallel versions, so for the English case, we take as many documents as there
    # would be in the multilingual case -- then we will extract from them only train_for_lang for the other cases
    print('Generating upper-bound (English-only) dataset...')
    train, test = train_test_split(lang_docs['en'], train_size=train_for_lang*len(langs), test_size=test_for_lang, shuffle=True)
    train_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in train]}
    test_lang_doc_map = {'en':[(d.text, d.categories, d.id) for d in test]}
    build_independent_matrices(name, ['en'], train_lang_doc_map, test_lang_doc_map, label_names).save(upper_path)

    train_lang_doc_map['en'] = train_lang_doc_map['en'][:train_for_lang]
    for lang in langs:
        if lang=='en': continue # already split
        test_take = min(test_for_lang, len(lang_docs[lang])-train_for_lang)
        train, test = train_test_split(lang_docs[lang], train_size=train_for_lang, test_size=test_take, shuffle=True)
        train_lang_doc_map[lang] = [(d.text, d.categories, d.id) for d in train]
        test_lang_doc_map[lang]  = [(d.text, d.categories, d.id) for d in test]

    print('Generating feature-independent dataset...')
    wiki_docs=None
    if max_wiki>0:
        if not exists(wiki_docs_path):
            wiki_docs = fetch_wikipedia_multilingual(wiki_data_home, langs, min_words=50, deletions=False)
            wiki_docs = random_wiki_sample(wiki_docs, max_wiki)
            pickle.dump(wiki_docs, open(wiki_docs_path, 'wb'), pickle.HIGHEST_PROTOCOL)
        else:
            wiki_docs = pickle.load(open(wiki_docs_path, 'rb'))
        wiki_docs = random_wiki_sample(wiki_docs, max_wiki)

    if wiki_docs:
        lang_data, wiki_docs_matrix = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)
        pickle.dump(wiki_docs_matrix, open(wiki_path, 'wb'), pickle.HIGHEST_PROTOCOL)
    else:
        lang_data = build_independent_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, wiki_docs, preprocess)

    lang_data.save(indep_path)

    print('Generating yuxtaposed dataset...')
    build_juxtaposed_matrices(name, langs, train_lang_doc_map, test_lang_doc_map, label_names, preprocess).save(yuxta_path)


# ----------------------------------------------------------------------------------------------------------------------
# Methods to generate full RCV and JRC datasets
# ----------------------------------------------------------------------------------------------------------------------
def full_rcv_(rcv1_data_home, rcv2_data_home, outpath, langs):


    print('fetching the datasets')
    rcv1_train_documents, labels_rcv1 = fetch_RCV1(rcv1_data_home, split='train')
    rcv1_test_documents, labels_rcv1_test = fetch_RCV1(rcv1_data_home, split='test')
    rcv2_documents, labels_rcv2 = fetch_RCV2(rcv2_data_home, [l for l in langs if l != 'en'])

    filter_by_categories(rcv1_train_documents, labels_rcv2)
    filter_by_categories(rcv1_test_documents, labels_rcv2)
    filter_by_categories(rcv2_documents, labels_rcv1)

    label_names = get_active_labels(rcv1_train_documents + rcv2_documents)
    print('Active labels in RCV1/2 {}'.format(len(label_names)))

    print('rcv1: {} train, {} test, {} categories'.format(len(rcv1_train_documents), len(rcv1_test_documents), len(label_names)))
    print('rcv2: {} documents'.format(len(rcv2_documents)), Counter([doc.lang for doc in rcv2_documents]))

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    all_docs = rcv1_train_documents + rcv1_test_documents + rcv2_documents
    lang_docs = {lang: [d for d in all_docs if d.lang == lang] for lang in langs}

    def get_ids(doclist):
        return frozenset([d.id for d in doclist])

    tr_ids = {'en': get_ids(rcv1_train_documents)}
    te_ids = {'en': get_ids(rcv1_test_documents)}
    for lang in langs:
        if lang == 'en': continue
        tr_ids[lang], te_ids[lang] = train_test_split([d.id for d in lang_docs[lang]], test_size=.3)

    dataset = MultilingualDataset()
    dataset.dataset_name = 'RCV1/2-full'
    for lang in langs:
        print(f'processing {lang} with {len(tr_ids[lang])} training documents and {len(te_ids[lang])} documents')
        analyzer = CountVectorizer(
            strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
        ).build_analyzer()

        Xtr,Ytr,IDtr = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in tr_ids[lang]])
        Xte,Yte,IDte = zip(*[(d.text,d.categories,d.id) for d in lang_docs[lang] if d.id in te_ids[lang]])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)

    dataset.save(outpath)


def full_jrc_(jrc_data_home, langs, train_years, test_years, outpath, cat_policy='all', most_common_cat=300):

    print('fetching the datasets')
    cat_list = inspect_eurovoc(jrc_data_home, select=cat_policy)
    training_docs, label_names = fetch_jrcacquis(
        langs=langs, data_path=jrc_data_home, years=train_years, cat_filter=cat_list, cat_threshold=1, parallel=None, most_frequent=most_common_cat
    )
    test_docs, _ = fetch_jrcacquis(
        langs=langs, data_path=jrc_data_home, years=test_years, cat_filter=label_names, parallel='force'
    )

    def _group_by_lang(doc_list, langs):
        return {lang: [d for d in doc_list if d.lang == lang] for lang in langs}

    training_docs = _group_by_lang(training_docs, langs)
    test_docs = _group_by_lang(test_docs, langs)

    mlb = MultiLabelBinarizer()
    mlb.fit([label_names])

    dataset = MultilingualDataset()
    data.dataset_name = 'JRC-Acquis-full'
    for lang in langs:
        analyzer = CountVectorizer(
            strip_accents='unicode', min_df=3, stop_words=stopwords.words(NLTK_LANGMAP[lang])
        ).build_analyzer()

        Xtr, Ytr, IDtr = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in training_docs[lang] if d.lang == lang])
        Xte, Yte, IDte = zip(*[(d.text, d.categories, d.parallel_id + '__' + d.id) for d in test_docs[lang] if d.lang == lang])
        Xtr = [' '.join(analyzer(d)) for d in Xtr]
        Xte = [' '.join(analyzer(d)) for d in Xte]
        Ytr = mlb.transform(Ytr)
        Yte = mlb.transform(Yte)
        dataset.add(lang, _mask_numbers(Xtr), Ytr, _mask_numbers(Xte), Yte, IDtr, IDte)

    dataset.save(outpath)


#-----------------------------------------------------------------------------------------------------------------------
# MAIN BUILDER
#-----------------------------------------------------------------------------------------------------------------------

if __name__=='__main__':
    import sys
    RCV1_PATH = '../Datasets/RCV1-v2/unprocessed_corpus'
    RCV2_PATH = '../Datasets/RCV2'
    JRC_DATAPATH = "../Datasets/JRC_Acquis_v3"
    full_rcv_(RCV1_PATH, RCV2_PATH, outpath='../rcv2/rcv1-2_doclist_full_processed.pickle', langs=RCV2_LANGS_WITH_NLTK_STEMMING + ['en'])
    # full_jrc_(JRC_DATAPATH, lang_set['JRC_NLTK'], train_years=list(range(1958, 2006)), test_years=[2006], outpath='../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle', cat_policy='all', most_common_cat=300)
    sys.exit(0)

    # datasetpath = '../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle' # '../rcv2/rcv1-2_doclist_full_processed.pickle'
    # data = MultilingualDataset.load(datasetpath)
    # data.dataset_name='JRC-Acquis-full'#'RCV1/2-full'
    # for lang in RCV2_LANGS_WITH_NLTK_STEMMING + ['en']:
    #     (Xtr, ytr, idtr), (Xte, yte, idte) = data.multiling_dataset[lang]
    #     data.multiling_dataset[lang] = ((_mask_numbers(Xtr), ytr, idtr), (_mask_numbers(Xte), yte, idte))
    # data.save('../jrc_acquis/jrc_doclist_1958-2005vs2006_all_top300_full_processed.pickle')#'../rcv2/rcv1-2_doclist_full_processed_2.pickle')
    # sys.exit(0)

    assert len(sys.argv) == 5, "wrong number of arguments; required: " \
                               "<JRC_PATH> <RCV1_PATH> <RCV2_PATH> <WIKI_PATH> "

    JRC_DATAPATH = sys.argv[1] # "../Datasets/JRC_Acquis_v3"
    RCV1_PATH = sys.argv[2] #'../Datasets/RCV1-v2/unprocessed_corpus'
    RCV2_PATH = sys.argv[3] #'../Datasets/RCV2'
    WIKI_DATAPATH = sys.argv[4] #"../Datasets/Wikipedia/multilingual_docs_JRC_NLTK"

    langs = lang_set['JRC_NLTK']
    max_wiki = 5000

    for run in range(0,10):
        print('Building JRC-Acquis datasets run', run)
        prepare_jrc_datasets(JRC_DATAPATH, WIKI_DATAPATH, langs,
                             train_years=list(range(1958, 2006)), test_years=[2006], max_wiki=max_wiki,
                             cat_policy='all', most_common_cat=300, run=run)

        print('Building RCV1-v2/2 datasets run', run)
        prepare_rcv_datasets(RCV2_PATH, RCV1_PATH, RCV2_PATH, WIKI_DATAPATH, RCV2_LANGS_WITH_NLTK_STEMMING + ['en'],
                             train_for_lang=1000, test_for_lang=1000, max_wiki=max_wiki, run=run)

        # uncomment this code if you want to retrieve the original documents to generate the data splits for PLE
        # (make sure you have not modified the above parameters, or adapt the following paths accordingly...)
        # datasetpath = join(RCV2_PATH,'rcv1-2_nltk_trByLang1000_teByLang1000_processed_run{}.pickle'.format(run))
        # outpath = datasetpath.replace('_nltk_','_doclist_')
        # retrieve_rcv_documents_from_dataset(datasetpath, RCV1_PATH, RCV2_PATH, outpath)

        # datasetpath = join(JRC_DATAPATH, 'jrc_nltk_1958-2005vs2006_all_top300_noparallel_processed_run{}.pickle'.format(run))
        # outpath = datasetpath.replace('_nltk_', '_doclist_')
        # retrieve_jrc_documents_from_dataset(datasetpath, JRC_DATAPATH, train_years=list(range(1958, 2006)), test_years=[2006], cat_policy='all', most_common_cat=300, outpath=outpath)