from __future__ import print_function
import os
from os.path import join
import tarfile
import xml.etree.ElementTree as ET
from sklearn.datasets import get_data_home
import pickle
from util.file import download_file, list_dirs, list_files
import rdflib
from rdflib.namespace import RDF, SKOS
import zipfile
from data.languages import JRC_LANGS
from data.languages import lang_set
from collections import Counter
from random import shuffle

"""
JRC-Acquis' language nomenclature:

bg = Bulgarian
cs = Czech
da = Danish
de = German
el = Greek
en = English
es = Spanish
et = Estonian
fi = Finnish
fr = French
hu = Hungarian
it = Italian
lt = Lithuanian
lv = Latvian
mt = Maltese
nl = Dutch
pl = Polish
pt = Portuguese
ro = Romanian
sk = Slovak
sl = Slovene
sv = Swedish
"""

class JRCAcquis_Document:
    def __init__(self, id, name, lang, year, head, body, categories):
        self.id = id
        self.parallel_id = name
        self.lang = lang
        self.year = year
        self.text = body if not head else head + "\n" + body
        self.categories = categories

# This is a workaround: for some reason, acute accents are encoded in a
# non-standard manner in titles. However, the title often appears as the first
# paragraph of the text/body (with standard encoding), so it may be preferable
# not to read the header at all (as is done here by default).
def _proc_acute(text):
    for ch in ['a', 'e', 'i', 'o', 'u']:
        text = text.replace('%' + ch + 'acute%', ch)
    return text

def parse_document(file, year, head=False):
    root = ET.parse(file).getroot()

    doc_name = root.attrib['n']     # e.g., '22006A0211(01)'
    doc_lang = root.attrib['lang']  # e.g., 'es'
    doc_id = root.attrib['id']      # e.g., 'jrc22006A0211_01-es'
    doc_categories = [cat.text for cat in root.findall('.//teiHeader/profileDesc/textClass/classCode[@scheme="eurovoc"]')]
    doc_head = _proc_acute(root.find('.//text/body/head').text) if head else ''
    doc_body = '\n'.join([p.text for p in root.findall('.//text/body/div[@type="body"]/p')])

    def raise_if_empty(field, from_file):
        if isinstance(field, str):
            if not field.strip():
                raise ValueError("Empty field in file %s" % from_file)

    raise_if_empty(doc_name, file)
    raise_if_empty(doc_lang, file)
    raise_if_empty(doc_id, file)
    if head: raise_if_empty(doc_head, file)
    raise_if_empty(doc_body, file)

    return JRCAcquis_Document(id=doc_id, name=doc_name, lang=doc_lang, year=year, head=doc_head, body=doc_body, categories=doc_categories)

# removes documents without a counterpart in all other languages
def _force_parallel(doclist, langs):
    n_langs = len(langs)
    par_id_count = Counter([d.parallel_id for d in doclist])
    parallel_doc_ids = set([id for id, count in par_id_count.items() if count == n_langs])
    return [doc for doc in doclist if doc.parallel_id in parallel_doc_ids]

# keeps at most one randomly chosen version of each parallel document
def random_sampling_avoiding_parallel(doclist):
    random_order = list(range(len(doclist)))
    shuffle(random_order)
    sampled_request = []
    parallel_ids = set()
    for ind in random_order:
        pid = doclist[ind].parallel_id
        if pid not in parallel_ids:
            sampled_request.append(doclist[ind])
            parallel_ids.add(pid)
    print('random_sampling_avoiding_parallel:: from {} documents to {} documents'.format(len(doclist), len(sampled_request)))
    return sampled_request

# filters out documents that do not contain any category in the cat_filter list,
# and removes from the remaining documents all labels not in cat_filter
def _filter_by_category(doclist, cat_filter):
    if not isinstance(cat_filter, frozenset):
        cat_filter = frozenset(cat_filter)
    filtered = []
    for doc in doclist:
        doc.categories = list(cat_filter & set(doc.categories))
        if doc.categories:
            doc.categories.sort()
            filtered.append(doc)
    print("filtered out %d documents without categories in the filter list" % (len(doclist) - len(filtered)))
    return filtered
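# Hedged usage sketch: parsing one JRC-Acquis XML file with parse_document.
# The file path below is hypothetical (the corpus stores one TEI-encoded XML
# file per document under <data_path>/<lang>/<year>/); it only illustrates the
# expected call and the fields exposed by JRCAcquis_Document:
#
#   doc = parse_document('/tmp/jrc/es/2006/jrc22006A0211_01-es.xml', year=2006)
#   print(doc.id, doc.parallel_id, doc.lang)  # 'jrc22006A0211_01-es', '22006A0211(01)', 'es'
#   print(doc.categories)                     # EuroVoc class codes, possibly empty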
# filters out categories with cat_threshold documents or fewer
# (and removes those categories from the documents)
def _filter_by_frequency(doclist, cat_threshold):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.items() if count > cat_threshold]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories

# selects the most_frequent most common categories
# (and removes all other categories from the documents)
def _most_common(doclist, most_frequent):
    cat_count = Counter()
    for d in doclist:
        cat_count.update(d.categories)

    freq_categories = [cat for cat, count in cat_count.most_common(most_frequent)]
    freq_categories.sort()
    return _filter_by_category(doclist, freq_categories), freq_categories

def _get_categories(request):
    final_cats = set()
    for d in request:
        final_cats.update(d.categories)
    return list(final_cats)

def fetch_jrcacquis(langs=None, data_path=None, years=None, ignore_unclassified=True, cat_filter=None, cat_threshold=0,
                    parallel=None, most_frequent=-1,
                    DOWNLOAD_URL_BASE='http://optima.jrc.it/Acquis/JRC-Acquis.3.0/corpus/'):
    assert parallel in [None, 'force', 'avoid'], 'parallel mode not supported'

    if not langs:
        langs = JRC_LANGS
    else:
        if isinstance(langs, str): langs = [langs]
        for l in langs:
            if l not in JRC_LANGS:
                raise ValueError('Language %s is not among the valid languages in JRC-Acquis v3' % l)

    if not data_path:
        data_path = get_data_home()

    if not os.path.exists(data_path):
        os.mkdir(data_path)

    request = []
    total_read = 0
    for l in langs:
        file_name = 'jrc-' + l + '.tgz'
        archive_path = join(data_path, file_name)

        if not os.path.exists(archive_path):
            print("downloading language-specific dataset (once and for all) into %s" % data_path)
            DOWNLOAD_URL = join(DOWNLOAD_URL_BASE, file_name)
            download_file(DOWNLOAD_URL, archive_path)
            print("untarring dataset...")
            tarfile.open(archive_path, 'r:gz').extractall(data_path)

        documents_dir = join(data_path, l)

        print("Reading documents...")
        read = 0
        for dir in list_dirs(documents_dir):
            year = int(dir)
            if years is None or year in years:
                year_dir = join(documents_dir, dir)
                pickle_name = join(data_path, 'jrc_' + l + '_' + dir + '.pickle')
                if os.path.exists(pickle_name):
                    print("loading from file %s" % pickle_name)
                    l_y_documents = pickle.load(open(pickle_name, "rb"))
                    read += len(l_y_documents)
                else:
                    l_y_documents = []
                    all_documents = list_files(year_dir)
                    empty = 0
                    for i, doc_file in enumerate(all_documents):
                        try:
                            jrc_doc = parse_document(join(year_dir, doc_file), year)
                        except ValueError:
                            jrc_doc = None

                        if jrc_doc and (not ignore_unclassified or jrc_doc.categories):
                            l_y_documents.append(jrc_doc)
                        else:
                            empty += 1
                        # integer division: print progress roughly every 2% of the files
                        if len(all_documents) > 50 and ((i + 1) % (len(all_documents) // 50) == 0):
                            print('\r\tfrom %s: completed %d%%' % (year_dir, int((i + 1) * 100.0 / len(all_documents))), end='')
                        read += 1
                    print('\r\tfrom %s: completed 100%%; read %d documents (discarded %d without categories or empty fields)\n'
                          % (year_dir, i + 1, empty), end='')
                    print("\t\t(Pickling object for future runs in %s)" % pickle_name)
                    pickle.dump(l_y_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
                request += l_y_documents
        print("Read %d documents for language %s\n" % (read, l))
        total_read += read
    print("Read %d documents in total" % (total_read))

    if parallel == 'force':
        request = _force_parallel(request, langs)
    elif parallel == 'avoid':
        request = random_sampling_avoiding_parallel(request)

    final_cats = _get_categories(request)

    if cat_filter:
        request = _filter_by_category(request, cat_filter)
        final_cats = _get_categories(request)
    if cat_threshold > 0:
        request, final_cats = _filter_by_frequency(request, cat_threshold)
    if most_frequent != -1 and len(final_cats) > most_frequent:
        request, final_cats = _most_common(request, most_frequent)

    return request, final_cats
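# Hedged usage sketch of fetch_jrcacquis: the data path and year list below are
# illustrative, and the call assumes the JRC-Acquis v3 archives are still
# reachable at DOWNLOAD_URL_BASE (or have already been downloaded into data_path):
#
#   docs, cats = fetch_jrcacquis(langs=['en', 'it'],
#                                data_path='/tmp/jrc',
#                                years=[2004, 2005],
#                                cat_threshold=30,   # keep categories with more than 30 documents
#                                parallel='force')   # keep only fully parallel documents
#   print(len(docs), 'documents,', len(cats), 'categories')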
def print_cat_analysis(request):
    cat_count = Counter()
    for d in request:
        cat_count.update(d.categories)
    print("Number of active categories: {}".format(len(cat_count)))
    print(cat_count.most_common())

# inspects the EuroVoc thesaurus in order to select a subset of categories;
# the implemented selection policies are 'all', 'broadest' (categories with no
# broader concept), and 'leaves' (categories that are not broader than any other)
def inspect_eurovoc(data_path, eurovoc_skos_core_concepts_filename='eurovoc_in_skos_core_concepts.rdf',
                    eurovoc_url="http://publications.europa.eu/mdr/resource/thesaurus/eurovoc-20160630-0/skos/eurovoc_in_skos_core_concepts.zip",
                    select="broadest"):

    fullpath_pickle = join(data_path, select + '_concepts.pickle')
    if os.path.exists(fullpath_pickle):
        print("Pickled object found in %s. Loading it." % fullpath_pickle)
        return pickle.load(open(fullpath_pickle, 'rb'))

    fullpath = join(data_path, eurovoc_skos_core_concepts_filename)
    if not os.path.exists(fullpath):
        print("Path %s does not exist. Trying to download the skos EuroVoc file from %s" % (data_path, eurovoc_url))
        # the URL points to a zip archive; download it and extract the rdf file
        fullpath_zip = join(data_path, 'eurovoc_in_skos_core_concepts.zip')
        download_file(eurovoc_url, fullpath_zip)
        print("Unzipping file...")
        zipped = zipfile.ZipFile(fullpath_zip, 'r')
        zipped.extract(eurovoc_skos_core_concepts_filename, data_path)
        zipped.close()

    print("Parsing %s" % fullpath)
    g = rdflib.Graph()
    g.parse(location=fullpath, format="application/rdf+xml")

    if select == "all":
        print("Selecting all concepts")
        all_concepts = list(g.subjects(RDF.type, SKOS.Concept))
        all_concepts = [c.toPython().split('/')[-1] for c in all_concepts]
        all_concepts.sort()
        selected_concepts = all_concepts
    elif select == "broadest":
        print("Selecting broadest concepts (those without any broader concept linked to them)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        narrower_concepts = set(g.subjects(SKOS.broader, None))
        broadest_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - narrower_concepts)]
        broadest_concepts.sort()
        selected_concepts = broadest_concepts
    elif select == "leaves":
        print("Selecting leaf concepts (those not linked as broader of any other concept)")
        all_concepts = set(g.subjects(RDF.type, SKOS.Concept))
        broad_concepts = set(g.objects(None, SKOS.broader))
        leaf_concepts = [c.toPython().split('/')[-1] for c in (all_concepts - broad_concepts)]
        leaf_concepts.sort()
        selected_concepts = leaf_concepts
    else:
        raise ValueError("Selection policy %s is not currently supported" % select)

    print("%d %s concepts found" % (len(selected_concepts), select))
    print("Pickling concept list for faster further requests in %s" % fullpath_pickle)
    pickle.dump(selected_concepts, open(fullpath_pickle, 'wb'), pickle.HIGHEST_PROTOCOL)

    return selected_concepts
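# Hedged usage sketch combining inspect_eurovoc and fetch_jrcacquis: restrict
# the label space to the broadest EuroVoc concepts. The data path is illustrative:
#
#   broad = inspect_eurovoc('/tmp/jrc', select='broadest')
#   docs, cats = fetch_jrcacquis(langs='en', data_path='/tmp/jrc',
#                                years=[2005], cat_filter=broad)
#   print_cat_analysis(docs)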
if __name__ == '__main__':

    # keeps documents with at most one category; documents with no category are
    # kept but contribute no label
    def single_label_fragment(doclist):
        single = [d for d in doclist if len(d.categories) < 2]
        final_categories = set([d.categories[0] for d in single if d.categories])
        print('{} single-label documents ({} categories) from the original {} documents'
              .format(len(single), len(final_categories), len(doclist)))
        return single, list(final_categories)

    train_years = list(range(1986, 2006))
    test_years = [2006]
    cat_policy = 'leaves'
    most_common_cat = 300
    # JRC_DATAPATH = "/media/moreo/1TB Volume/Datasets/JRC_Acquis_v3"
    JRC_DATAPATH = "/storage/andrea/FUNNELING/data/JRC_Acquis_v3"
    langs = lang_set['JRC_NLTK']

    cat_list = inspect_eurovoc(JRC_DATAPATH, select=cat_policy)

    training_docs, label_names = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=train_years,
                                                 cat_filter=cat_list, cat_threshold=1, parallel=None,
                                                 most_frequent=most_common_cat)
    test_docs, label_namestest = fetch_jrcacquis(langs=langs, data_path=JRC_DATAPATH, years=test_years,
                                                 cat_filter=label_names, parallel='force')

    print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
    print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))

    training_docs, label_names = single_label_fragment(training_docs)
    test_docs, label_namestest = single_label_fragment(test_docs)

    print('JRC-train: {} documents, {} labels'.format(len(training_docs), len(label_names)))
    print('JRC-test: {} documents, {} labels'.format(len(test_docs), len(label_namestest)))
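# Hedged follow-up sketch: the single-label fragment can be flattened into
# parallel text/label lists for downstream single-label classification; the
# names below are local to this example:
#
#   texts = [d.text for d in training_docs]
#   labels = [d.categories[0] for d in training_docs]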