# gFun/refactor/data/reader/wikipedia_tools.py
from __future__ import print_function
import ijson
from ijson.common import ObjectBuilder
import os
import pickle
import re
from bz2 import BZ2File
from itertools import islice
from os.path import join
from xml.sax.saxutils import escape
import numpy as np
from util.file import list_dirs, list_files
policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]
"""
This file contains a set of tools for processing Wikipedia multilingual documents.
In what follows, it is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/)
and cleaned the text of each document with one of the following tools:
- https://github.com/aesuli/wikipediatools (Python 2)
- https://github.com/aesuli/wikipedia-extractor (Python 3)
It is also assumed that you have downloaded the all-entities json file
(e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2).
These tools help you to:
- Process the huge json file as a stream and create a multilingual map of corresponding titles for each language.
  The policy "IN_ALL_LANGS" extracts only titles that appear in all (AND) languages, whereas "IN_ANY_LANG"
  extracts all titles appearing in at least one (OR) language (warning: this creates a huge dictionary).
  Note: this version is quite slow. Although it only needs to be run once, you might prefer to take a look at
  "Wikidata in BigQuery".
- Process the huge json file as a stream and create a simplified file which occupies much less space and is far
  faster to process.
- Use the multilingual map to extract, from the clean text versions, individual xml documents containing all
  language-specific versions of the same document.
- Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents,
  in such a way that the i-th element of any list refers to the same document in the respective language.
"""
def _doc_generator(text_path, langs):
dotspace = re.compile(r'\.(?!\s)')
for l,lang in enumerate(langs):
print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
lang_dir = join(text_path, lang)
split_dirs = list_dirs(lang_dir)
for sd,split_dir in enumerate(split_dirs):
print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
split_files = list_files(join(lang_dir, split_dir))
for sf,split_file in enumerate(split_files):
print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi:
                    while True:
                        # BZ2File yields bytes; decode so the regex and escape below work on str
                        doc_lines = [line.decode('utf-8') for line in islice(fi, 3)]
                        if doc_lines:
                            # some sentences are not followed by a space after the dot
                            doc_lines[1] = dotspace.sub('. ', doc_lines[1])
                            # [workaround] the &nbsp; html entity is not handled by the extractor, and unescaping it now might not help...
                            doc_lines[1] = escape(doc_lines[1].replace("&nbsp;", " "))
                            yield doc_lines, lang
                        else:
                            break
def _extract_title(doc_lines):
m = re.search('title="(.+?)"', doc_lines[0])
    if m: return m.group(1)
else: raise ValueError("Error in xml format: document head is %s" % doc_lines[0])
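# _create_doc and _append_doc build one xml file per Wikidata id, grouping all language-specific versions of the same
# article. Roughly, the resulting file looks like this (sketch; "Q42" is just an example id):
#   <multidoc id="Q42">
#   <doc id="..." url="..." title="..." lang="en">
#   ...article text...
#   </doc>
#   <doc id="..." url="..." title="..." lang="it">
#   ...article text...
#   </doc>
#   </multidoc>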
def _create_doc(target_file, id, doc, lang):
doc[0] = doc[0][:-2] + (' lang="%s">\n'%lang)
with open(target_file, 'w') as fo:
fo.write('<multidoc id="%s">\n'%id)
        fo.writelines(doc)
fo.write('</multidoc>')
def _append_doc(target_file, doc, lang):
doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
    with open(target_file, 'r', buffering=1024*1024) as fi:
        lines = fi.readlines()
    if doc[0] in lines[1::3]:  # this language version is already in the file
        return
    lines[-1:-1] = doc  # insert the new language version right before the closing </multidoc> tag
    with open(target_file, 'w', buffering=1024*1024) as fo:
        fo.writelines(lines)
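# extract_multilingual_documents expects the inverse dictionary produced by
# extract_multilingual_titles_from_simplefile, i.e., roughly:
#   inv_dict = {lang: {title: [wikidata_id, ...]}}
# so that each clean-text document can be routed to the xml file(s) of the Wikidata entity it belongs to.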
def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
if not os.path.exists(out_path):
os.makedirs(out_path)
for lang in langs:
if lang not in inv_dict:
raise ValueError("Lang %s is not in the dictionary" % lang)
docs_created = len(list_files(out_path))
print("%d multilingual documents found." % docs_created)
for doc,lang in _doc_generator(text_path, langs):
title = _extract_title(doc)
if title in inv_dict[lang]:
ids = inv_dict[lang][title]
for id in ids:
target_file = join(out_path, id) + ".xml"
if os.path.exists(target_file):
_append_doc(target_file, doc, lang)
else:
_create_doc(target_file, id, doc, lang)
docs_created+=1
else:
if not re.match('[A-Za-z]+', title):
print("Title <%s> for lang <%s> not in dictionary" % (title, lang))
def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True):
simplified_file = join(data_dir,filename)
if policy not in policies:
raise ValueError("Policy %s not supported." % policy)
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
lang_prefix = list(langs)
lang_prefix.sort()
pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy
pickle_dict = join(data_dir, pickle_prefix+".multi_dict.pickle")
pickle_invdict = join(data_dir, pickle_prefix+".multi_invdict.pickle")
if os.path.exists(pickle_invdict):
if return_both and os.path.exists(pickle_dict):
print("Pickled files found in %s. Loading both (direct and inverse dictionaries)." % data_dir)
return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
        elif not return_both:
print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
return pickle.load(open(pickle_invdict, 'rb'))
multiling_titles = {}
inv_dict = {lang:{} for lang in langs}
    def process_entry(line):
        parts = line.decode('utf-8').strip().split('\t')
        id = parts[0]
        if id in multiling_titles:
            raise ValueError("id <%s> already indexed" % id)
        titles = dict((lang_title[:lang_title.find(':')], lang_title[lang_title.find(':')+1:]) for lang_title in parts[1:])
        # keep only the requested languages
        for lang in list(titles.keys()):
            if lang not in langs:
                del titles[lang]
        if (policy == "IN_ALL_LANGS" and len(titles) == len(langs))\
                or (policy == "IN_ANY_LANG" and len(titles) > 0):
            multiling_titles[id] = titles
            for lang, title in titles.items():
                if title in inv_dict[lang]:
                    inv_dict[lang][title].append(id)
                else:
                    inv_dict[lang][title] = [id]
with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
completed = 0
try:
for line in fi:
process_entry(line)
completed += 1
if completed % 10 == 0:
print("\rCompleted %d\ttitles %d" % (completed,len(multiling_titles)), end="")
print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
except EOFError:
print("\nUnexpected file ending... saving anyway")
print("Pickling dictionaries in %s" % data_dir)
pickle.dump(multiling_titles, open(pickle_dict,'wb'), pickle.HIGHEST_PROTOCOL)
pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
print("Done")
return (multiling_titles, inv_dict) if return_both else inv_dict
# simplifies the huge all-entities json file (e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)
# into a compact id-to-titles mapping, keeping only the languages of interest
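# The json dump is a single huge array of entities, so it is parsed as a stream: low-level events from
# ijson.basic_parse are fed into an ObjectBuilder, and as soon as the array under construction holds more than one
# element, the first (a fully built entity) is popped and processed while parsing continues.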
def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file = "latest-all.json.bz2"):
latest_all_json_file = join(data_dir,json_file)
if policy not in policies:
raise ValueError("Policy %s not supported." % policy)
print("extracting multilingual titles with policy %s (%s)" % (policy,' '.join(langs)))
lang_prefix = list(langs)
lang_prefix.sort()
simple_titles_path = join(data_dir, "extraction_" + "_".join(lang_prefix) + "." + policy)
    def process_entry(last, fo):
id = last["id"]
titles = None
if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()):
titles = {lang: last["labels"][lang]["value"] for lang in langs}
elif policy == "IN_ANY_LANG":
titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]}
if titles:
fo.write((id+'\t'+'\t'.join([lang+':'+titles[lang] for lang in titles.keys()])+'\n').encode('utf-8'))
return True
else:
return False
written = 0
with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \
BZ2File(join(data_dir,simple_titles_path+".simple.bz2"),'w') as fo:
builder = ObjectBuilder()
completed = 0
for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16):
builder.event(event, value)
if len(builder.value)>1:
if process_entry(builder.value.pop(0), fo): written += 1
completed += 1
print("\rCompleted %d\ttitles %d" % (completed,written), end="")
print("")
        # process the last (still pending) entry
        if builder.value:
            if process_entry(builder.value.pop(0), fo):
                written += 1
    return simple_titles_path
"""
Reads all multi-lingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of the
specified languages, a list contanining all its documents, so that the i-th element of any list refers to the language-
specific version of the same document. Documents are forced to contain version in all specified languages and to contain
a minimum number of words; otherwise it is discarded.
"""
class MinWordsNotReached(Exception): pass
class WrongDocumentFormat(Exception): pass
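# These exceptions signal documents that fetch_wikipedia_multilingual should skip (and optionally delete):
# MinWordsNotReached for articles shorter than min_words, WrongDocumentFormat for files missing one of the requested
# languages or that cannot be parsed as xml.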
def _load_multilang_doc(path, langs, min_words=100):
import xml.etree.ElementTree as ET
from xml.etree.ElementTree import Element, ParseError
try:
root = ET.parse(path).getroot()
doc = {}
for lang in langs:
doc_body = root.find('.//doc[@lang="' + lang + '"]')
if isinstance(doc_body, Element):
n_words = len(doc_body.text.split(' '))
if n_words >= min_words:
doc[lang] = doc_body.text
else:
raise MinWordsNotReached
else:
raise WrongDocumentFormat
except ParseError:
raise WrongDocumentFormat
return doc
# returns the multilingual documents mapped by language; progress counters are only printed, not returned
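# The returned structure is roughly (for illustration, with langs={'en', 'it'}):
#   {'en': [doc_0_en, doc_1_en, ...], 'it': [doc_0_it, doc_1_it, ...]}
# where the i-th element of every list is the same document in the corresponding language.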
def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None):
if pickle_name and os.path.exists(pickle_name):
print("unpickling %s" % pickle_name)
return pickle.load(open(pickle_name, 'rb'))
multi_docs = list_files(wiki_multi_path)
mling_documents = {l:[] for l in langs}
valid_documents = 0
minwords_exception = 0
wrongdoc_exception = 0
for d,multi_doc in enumerate(multi_docs):
print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" %
(d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="")
doc_path = join(wiki_multi_path, multi_doc)
try:
m_doc = _load_multilang_doc(doc_path, langs, min_words)
valid_documents += 1
for l in langs:
mling_documents[l].append(m_doc[l])
except MinWordsNotReached:
minwords_exception += 1
if deletions: os.remove(doc_path)
except WrongDocumentFormat:
wrongdoc_exception += 1
if deletions: os.remove(doc_path)
if max_documents>0 and valid_documents>=max_documents:
break
if pickle_name:
print("Pickling wikipedia documents object in %s" % pickle_name)
pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL)
return mling_documents
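# random_wiki_sample subsamples the aligned corpus down to max_documents, drawing one random set of indices and
# applying it to every language list so that the cross-lingual alignment is preserved.
# Usage sketch (paths and languages are illustrative only):
#   docs = fetch_wikipedia_multilingual('/path/to/multilingual_docs', ['en', 'it'], min_words=100)
#   docs = random_wiki_sample(docs, max_documents=5000)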
def random_wiki_sample(l_wiki, max_documents):
if max_documents == 0: return None
langs = list(l_wiki.keys())
assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned'
ndocs_per_lang = len(l_wiki[langs[0]])
if ndocs_per_lang > max_documents:
sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False))
for lang in langs:
l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel]
return l_wiki
if __name__ == "__main__":
wikipedia_home = "../Datasets/Wikipedia"
from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs
langs = frozenset(langs)
simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2")
_, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS')
extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'),
out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK'))