gFun/refactor/data/reader/jrcacquis_reader.py

from __future__ import print_function

import os
import pickle
import sys
import tarfile
import xml.etree.ElementTree as ET
import zipfile
from collections import Counter
from os.path import join
from random import shuffle

import rdflib
from rdflib.namespace import RDF, SKOS
from sklearn.datasets import get_data_home

from data.languages import JRC_LANGS
from data.languages import lang_set
from util.file import download_file, list_dirs, list_files

"""
JRC Acquis' Nomenclature:
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""

class JRCAcquis_Document:
    def __init__(self, id, name, lang, year, head, body, categories):
        self.id = id
        self.parallel_id = name
        self.lang = lang
        self.year = year
        self.text = body if not head else head + "\n" + body
        self.categories = categories

# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles
# however, it seems that the title is often appearing as the first paragraph in the text/body (with
# standard codification), so it might be preferable not to read the header after all (as here by default)
def _proc_acute(text):
    for ch in ['a','e','i','o','u']:
        text = text.replace('%'+ch+'acute%',ch)
    return text

def parse_document(file, year, head=False):
    root = ET.parse(file).getroot()

    doc_name = root.attrib['n'] # e.g., '22006A0211(01)'
    doc_lang = root.attrib['lang'] # e.g., 'es'
    doc_id   = root.attrib['id'] # e.g., 'jrc22006A0211_01-es'
    doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
    doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
    doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])

    def raise_if_empty(field, from_file):
        if isinstance(field, str):
            if not field.strip():
                raise ValueError("Empty field in file %s" % from_file)

    raise_if_empty(doc_name, file)
    raise_if_empty(doc_lang, file)
    raise_if_empty(doc_id, file)
    if head: raise_if_empty(doc_head, file)
    raise_if_empty(doc_body, file)

    return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)

# removes documents without a counterpart in all other languages
def _force_parallel(doclist, langs):
    n_langs = len(langs)
    par_id_count = Counter([d.parallel_id for d in doclist])
    parallel_doc_ids = set([id for id,count in par_id_count.items() if count==n_langs])
    return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]

def random_sampling_avoiding_parallel(doclist):
    random_order = list(range(len(doclist)))
    shuffle(random_order)
    sampled_request = []
    parallel_ids = set()
    for ind in random_order:
        pid = doclist[ind].parallel_id
        if pid not in parallel_ids:
            sampled_request.append(doclist[ind])
            parallel_ids.add(pid)
    print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
    return sampled_request


#filters out documents which do not contain any category in the cat_filter list, and filter all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
    if not isinstance(cat_filter, frozenset):
        cat_filter = frozenset(cat_filter)
    filtered = []
    for doc in doclist:
        doc.categories = list(cat_filter & set(doc.categories))
        if doc.categories:
            doc.categories.sort()
            filtered.append(doc)
    print("filtered %d documents out without categories in the filter list" % (len(doclist) - len(filtered)))
    return filtered

#filters out categories with less than cat_threshold documents (and filters documents containing those categories)
def _filter_by_frequency(doclist, cat_threshold):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat,count in cat_count.items() if count>cat_threshold]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories

#select top most_frequent categories (and filters documents containing those categories)
def _most_common(doclist, most_frequent):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat,count in cat_count.most_common(most_frequent)]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories

def _get_categories(request):
    final_cats = set()
    for d in request:
        final_cats.update(d.categories)
    return list(final_cats)

def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0,
                    parallel=None, most_frequent=-1, DOWNLOAD_URL_BASE ='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):

    assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'
    if not langs:
        langs = JRC_LANGS
    else:
        if isinstance(langs, str): langs = [langs]
        for l in langs:
            if l not in JRC_LANGS:
                raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    for l in langs:
        file_name = 'jrc-'+l+'.tgz'
        archive_path = join(data_path, file_name)

        if not os.path.exists(archive_path):
            print("downloading language-specific dataset (once and for all) into %s" % data_path)
            DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
            download_file(DOWNLOAD_URL, archive_path)
            print("untarring dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        documents_dir = join(data_path, l)

        print("Reading documents...")
        read = 0
        for dir in list_dirs(documents_dir):
            year = int(dir)
            if years==None or year in years:
                year_dir = join(documents_dir,dir)
                pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
                if os.path.exists(pickle_name):
                    print("loading from file %s" % pickle_name)
                    l_y_documents = pickle.load(open(pickle_name, "rb"))
                    read += len(l_y_documents)
                else:
                    l_y_documents = []
                    all_documents = list_files(year_dir)
                    empty = 0
                    for i,doc_file in enumerate(all_documents):
                        try:
                            jrc_doc = parse_document(join(year_dir, doc_file), year)
                        except ValueError:
                            jrc_doc = None

                        if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                            l_y_documents.append(jrc_doc)
                        else: empty += 1
                        if len(all_documents)>50 and ((i+1) % (len(all_documents)/50) == 0):
                            print('\r\tfrom %s: completed %d%%' % (year_dir, int((i+1)*100.0/len(all_documents))), end='')
                        read+=1
                    print('\r\tfrom %s: completed 100%% read %d documents (discarded %d without categories or empty fields)\n' % (year_dir, i+1, empty), end='')
                    print("\t\t(Pickling object for future runs in %s)" % pickle_name)
                    pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
                request += l_y_documents
        print("Read %d documents for language %s\n" % (read, l))
        total_read += read
    print("Read %d documents in total" % (total_read))

    if parallel=='force':
        request = _force_parallel(request, langs)
    elif parallel == 'avoid':
        request = random_sampling_avoiding_parallel(request)

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats

def print_cat_analysis(request):
    cat_count = Counter()
    for d in request:
        cat_count.update(d.categories)
    print("Number of active categories: {}".format(len(cat_count)))
    print(cat_count.most_common())

# inspects the Eurovoc thesaurus in order to select a subset of categories
# currently, only 'broadest' policy (i.e., take all categories with no parent category), and 'all' is implemented
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
                    eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
                    select="broadest"):

    fullpath_pickle = join(data_path, select+'_concepts.pickle')
    if os.path.exists(fullpath_pickle):
        print("Pickled object found in %s. Loading it." % fullpath_pickle)
        return pickle.load(open(fullpath_pickle,'rb'))

    fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
    if not os.path.exists(fullpath):
        print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
        download_file(eurovoc_url, fullpath)
        print("Unzipping file...")
        zipped = zipfile.ZipFile(data_path + '.zip', 'r')
        zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
        zipped.close()

    print("Parsing %s" %fullpath)
    g = rdflib.Graph()
    g.parse(location=fullpath, format="application/rdf+xml")

    if select == "all":
        print("Selecting all concepts")
        all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
        all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
        all_concepts.sort()
        selected_concepts = all_concepts
    elif select=="broadest":
        print("Selecting broadest concepts (those without any other broader concept linked to it)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        narrower_concepts = set(g.subjects(SKOS.broader, None))
        broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
        broadest_concepts.sort()
        selected_concepts = broadest_concepts
    elif select=="leaves":
        print("Selecting leaves concepts (those not linked as broader of any other concept)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        broad_concepts = set(g.objects(None, SKOS.broader))
        leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
        leave_concepts.sort()
        selected_concepts = leave_concepts
    else:
        raise ValueError("Selection policy %s is not currently supported" % select)

    print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
    print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
    pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)

    return selected_concepts

if __name__ == '__main__':

    def single_label_fragment(doclist):
        single = [d for d in doclist if len(d.categories) < 2]
        final_categories = set([d.categories[0] if d.categories else [] for d in single])
        print('{} single-label documents ({} categories) from the original {} documents'.format(len(single),
                                                                                                len(final_categories),
                                                                                                len(doclist)))
        return single, list(final_categories)

    train_years = list(range(1986, 2006))
    test_years = [2006]
    cat_policy = 'leaves'
    most_common_cat = 300
    # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
    JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
    langs = lang_set['JRC_NLTK']
    cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)
    sys.exit()

    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,cat_filter=cat_list, cat_threshold=1, parallel=None,most_frequent=most_common_cat)
    test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years, cat_filter=label_names,parallel='force')

    print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
    print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))

    training_docs, label_names = single_label_fragment(training_docs)
    test_docs, label_namestest = single_label_fragment(test_docs)

    print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
    print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))