# gFun/refactor/data/reader/jrcacquis_reader.py

from __future__ import print_function
import os, sys
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
from util.file import download_file, list_dirs, list_files
import rdflib
from rdflib.namespace import RDF, SKOS
from rdflib import URIRef
import zipfile
from data.languages import JRC_LANGS
from collections import Counter
from random import shuffle
from data.languages import lang_set
"""
JRC Acquis' Nomenclature:
bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
nl = Dutch
mt = Maltese
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""

class JRCAcquis_Document:
    def __init__(self, id, name, lang, year, head, body, categories):
        self.id = id
        self.parallel_id = name
        self.lang = lang
        self.year = year
        self.text = body if not head else head + "\n" + body
        self.categories = categories

# this is a workaround... for some reason, acutes are codified in a non-standard manner in titles;
# however, the title often appears as the first paragraph of the text/body as well (with standard
# codification), so it might be preferable not to read the header at all (as here by default)
def _proc_acute(text):
    for ch in ['a', 'e', 'i', 'o', 'u']:
        text = text.replace('%' + ch + 'acute%', ch)
    return text
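
# Illustrative example for _proc_acute (made-up input): _proc_acute('Decisi%oacute%n del Consejo')
# returns 'Decision del Consejo', i.e., each non-standard '%<vowel>acute%' escape collapses to the
# plain vowel.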

def parse_document(file, year, head=False):
    root = ET.parse(file).getroot()

    doc_name = root.attrib['n']     # e.g., '22006A0211(01)'
    doc_lang = root.attrib['lang']  # e.g., 'es'
    doc_id   = root.attrib['id']    # e.g., 'jrc22006A0211_01-es'
    doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
    doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
    doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])

    def raise_if_empty(field, from_file):
        if isinstance(field, str):
            if not field.strip():
                raise ValueError("Empty field in file %s" % from_file)

    raise_if_empty(doc_name, file)
    raise_if_empty(doc_lang, file)
    raise_if_empty(doc_id, file)
    if head: raise_if_empty(doc_head, file)
    raise_if_empty(doc_body, file)

    return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)
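
# Sketch of the document layout implied by the queries above (nesting inferred from the XPaths;
# the root element name and all values are illustrative only):
#   <TEI id="jrc22006A0211_01-es" n="22006A0211(01)" lang="es">
#     <teiHeader><profileDesc><textClass>
#       <classCode scheme="eurovoc">1309</classCode> ...
#     </textClass></profileDesc></teiHeader>
#     <text><body>
#       <head>title (possibly with %oacute%-style escapes)</head>
#       <div type="body"><p>paragraph 1</p><p>paragraph 2</p> ...</div>
#     </body></text>
#   </TEI>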

# removes documents without a counterpart in all other languages
def _force_parallel(doclist, langs):
    n_langs = len(langs)
    par_id_count = Counter([d.parallel_id for d in doclist])
    parallel_doc_ids = set([id for id, count in par_id_count.items() if count == n_langs])
    return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]
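
# Note: parallel_id is the language-independent document name (e.g., '22006A0211(01)' from the example
# in parse_document), shared by all translations of the same act; a parallel_id occurring len(langs)
# times therefore means the act is available in every requested language.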

def random_sampling_avoiding_parallel(doclist):
    random_order = list(range(len(doclist)))
    shuffle(random_order)
    sampled_request = []
    parallel_ids = set()
    for ind in random_order:
        pid = doclist[ind].parallel_id
        if pid not in parallel_ids:
            sampled_request.append(doclist[ind])
            parallel_ids.add(pid)
    print('random_sampling_no_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
    return sampled_request
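
# In other words, random_sampling_avoiding_parallel keeps exactly one (randomly chosen) language
# version per parallel_id, i.e., the opposite policy to _force_parallel above.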

# filters out documents that do not contain any category in the cat_filter list, and removes from the
# remaining documents all labels that are not in cat_filter
def _filter_by_category(doclist, cat_filter):
    if not isinstance(cat_filter, frozenset):
        cat_filter = frozenset(cat_filter)
    filtered = []
    for doc in doclist:
        doc.categories = list(cat_filter & set(doc.categories))
        if doc.categories:
            doc.categories.sort()
            filtered.append(doc)
    print("filtered out %d documents without any category in the filter list" % (len(doclist) - len(filtered)))
    return filtered
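
# Illustrative behaviour of _filter_by_category (made-up category codes): with cat_filter=['1309'],
# a document labelled ['1309', '9999'] is kept with categories ['1309'], while a document labelled
# only ['9999'] is dropped.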

# filters out categories with cat_threshold documents or fewer (and drops documents left without any
# surviving category)
def _filter_by_frequency(doclist, cat_threshold):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)
    freq_categories = [cat for cat, count in cat_count.items() if count > cat_threshold]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories

# selects the top most_frequent categories by document frequency (and drops documents left without
# any of them)
def _most_common(doclist, most_frequent):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)
    freq_categories = [cat for cat, count in cat_count.most_common(most_frequent)]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories

def _get_categories(request):
    final_cats = set()
    for d in request:
        final_cats.update(d.categories)
    return list(final_cats)

def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None,
                    cat_threshold=0, parallel=None, most_frequent=-1,
                    DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):

    assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'

    if not langs:
        langs = JRC_LANGS
    else:
        if isinstance(langs, str): langs = [langs]
        for l in langs:
            if l not in JRC_LANGS:
                raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)

    if not data_path:
        data_path = get_data_home()
    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    for l in langs:
        file_name = 'jrc-' + l + '.tgz'
        archive_path = join(data_path, file_name)

        if not os.path.exists(archive_path):
            print("downloading language-specific dataset (once and for all) into %s" % data_path)
            DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
            download_file(DOWNLOAD_URL, archive_path)
            print("untarring dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        documents_dir = join(data_path, l)

        print("Reading documents...")
        read = 0
        for dir in list_dirs(documents_dir):
            year = int(dir)
            if years is None or year in years:
                year_dir = join(documents_dir, dir)
                pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
                if os.path.exists(pickle_name):
                    print("loading from file %s" % pickle_name)
                    l_y_documents = pickle.load(open(pickle_name, "rb"))
                    read += len(l_y_documents)
                else:
                    l_y_documents = []
                    all_documents = list_files(year_dir)
                    empty = 0
                    for i, doc_file in enumerate(all_documents):
                        try:
                            jrc_doc = parse_document(join(year_dir, doc_file), year)
                        except ValueError:
                            jrc_doc = None

                        if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                            l_y_documents.append(jrc_doc)
                        else:
                            empty += 1
                        # integer division: with float division the modulo test would almost never be 0
                        if len(all_documents) > 50 and ((i + 1) % (len(all_documents) // 50) == 0):
                            print('\r\tfrom %s: completed %d%%' % (year_dir, int((i + 1) * 100.0 / len(all_documents))), end='')
                        read += 1
                    print('\r\tfrom %s: completed 100%%, read %d documents (discarded %d without categories or with empty fields)\n'
                          % (year_dir, i + 1, empty), end='')
                    print("\t\t(Pickling object for future runs in %s)" % pickle_name)
                    pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
                request += l_y_documents
        print("Read %d documents for language %s\n" % (read, l))
        total_read += read
    print("Read %d documents in total" % (total_read))

    if parallel == 'force':
        request = _force_parallel(request, langs)
    elif parallel == 'avoid':
        request = random_sampling_avoiding_parallel(request)

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats
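
# Illustrative usage of fetch_jrcacquis (the path and parameter values are assumptions, not defaults
# of this module):
#   docs, cats = fetch_jrcacquis(langs=['en', 'it'], data_path='/tmp/jrc_acquis', years=[2005],
#                                cat_threshold=1, parallel='force', most_frequent=300)
#   print('%d documents, %d EuroVoc categories' % (len(docs), len(cats)))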

def print_cat_analysis(request):
    cat_count = Counter()
    for d in request:
        cat_count.update(d.categories)
    print("Number of active categories: {}".format(len(cat_count)))
    print(cat_count.most_common())

# inspects the EuroVoc thesaurus in order to select a subset of categories
# currently implemented policies: 'all', 'broadest' (concepts with no broader concept), and
# 'leaves' (concepts that are not the broader concept of any other)
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
                    eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
                    select="broadest"):

    fullpath_pickle = join(data_path, select + '_concepts.pickle')
    if os.path.exists(fullpath_pickle):
        print("Pickled object found in %s. Loading it." % fullpath_pickle)
        return pickle.load(open(fullpath_pickle, 'rb'))

    fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
    if not os.path.exists(fullpath):
        print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
        zip_path = fullpath + '.zip'
        download_file(eurovoc_url, zip_path)
        print("Unzipping file...")
        zipped = zipfile.ZipFile(zip_path, 'r')
        zipped.extract("eurovoc_in_skos_core_concepts.rdf", data_path)
        zipped.close()
print("Parsing %s" %fullpath)
g = rdflib.Graph()
g.parse(location=fullpath, format="application/rdf+xml")
if select == "all":
print("Selecting all concepts")
all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
all_concepts.sort()
selected_concepts = all_concepts
elif select=="broadest":
print("Selecting broadest concepts (those without any other broader concept linked to it)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
narrower_concepts = set(g.subjects(SKOS.broader, None))
broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
broadest_concepts.sort()
selected_concepts = broadest_concepts
elif select=="leaves":
print("Selecting leaves concepts (those not linked as broader of any other concept)")
all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
broad_concepts = set(g.objects(None, SKOS.broader))
leave_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
leave_concepts.sort()
selected_concepts = leave_concepts
else:
raise ValueError("Selection policy %s is not currently supported" % select)
print("%d %s concepts found" % (len(selected_concepts), leave_concepts))
print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)
return selected_concepts
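
# Illustrative usage of inspect_eurovoc (the path is an assumption): the returned identifiers are the
# trailing segment of the EuroVoc concept URIs and can be passed directly to fetch_jrcacquis as
# cat_filter, as done in the __main__ block below:
#   leaf_cats = inspect_eurovoc('/tmp/jrc_acquis', select='leaves')
#   docs, cats = fetch_jrcacquis(langs=['en'], data_path='/tmp/jrc_acquis', cat_filter=leaf_cats)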

if __name__ == '__main__':

    def single_label_fragment(doclist):
        single = [d for d in doclist if len(d.categories) < 2]
        final_categories = set([d.categories[0] for d in single if d.categories])
        print('{} single-label documents ({} categories) from the original {} documents'.format(
            len(single), len(final_categories), len(doclist)))
        return single, list(final_categories)

    train_years = list(range(1986, 2006))
    test_years = [2006]
    cat_policy = 'leaves'
    most_common_cat = 300
    # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
    JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
    langs = lang_set['JRC_NLTK']

    cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)

    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,
                                                 cat_filter=cat_list, cat_threshold=1, parallel=None,
                                                 most_frequent=most_common_cat)
    test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years,
                                                 cat_filter=label_names, parallel='force')

    print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
    print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))

    training_docs, label_names = single_label_fragment(training_docs)
    test_docs, label_namestest = single_label_fragment(test_docs)

    print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
    print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))