from __future__ import print_function
# Note: this module targets Python 2 (it relies on the bytes/str equivalence when reading the bz2 dumps).
import ijson
from ijson.common import ObjectBuilder  # ijson is required by simplify_json_file
import os
from os.path import join
from bz2 import BZ2File
import pickle
from util.file import list_dirs, list_files
from itertools import islice
import re
from xml.sax.saxutils import escape
import numpy as np

policies = ["IN_ALL_LANGS", "IN_ANY_LANG"]

"""
This file contains a set of tools for processing Wikipedia multilingual documents.

It is assumed that you have already downloaded a Wikipedia dump (https://dumps.wikimedia.org/) and cleaned the
text of each document with one of the following tools:
  - https://github.com/aesuli/wikipediatools (Python 2)
  - https://github.com/aesuli/wikipedia-extractor (Python 3)
It is also assumed that you have downloaded the all-entities json file
(e.g., https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2)

These tools help you to:
  - Process the huge json file as a stream and create a multilingual map of corresponding titles for each language.
    Setting policy="IN_ALL_LANGS" extracts only titles that appear in all (AND) languages, whereas "IN_ANY_LANG"
    extracts all titles appearing in at least one (OR) language (warning: this creates a huge dictionary).
    Note: this version is quite slow. Although it only needs to be run once, you might prefer to take a look at
    "Wikidata in BigQuery" instead.
  - Process the huge json file as a stream and create a simplified file which occupies much less space and is far
    faster to process.
  - Use the multilingual map to extract, from the clean text versions, individual xml documents containing all
    language-specific versions of each document.
  - Fetch the multilingual documents to create, for each of the specified languages, a list containing all documents,
    in such a way that the i-th element of any list refers to the same document in the respective language.
"""

def _doc_generator(text_path, langs):
    """Yields (doc_lines, lang) pairs; each clean-text document is assumed to span exactly three lines:
    the <doc ...> head, the text body, and the closing tag."""
    dotspace = re.compile(r'\.(?!\s)')
    for l, lang in enumerate(langs):
        print("Processing language <%s> (%d/%d)" % (lang, l, len(langs)))
        lang_dir = join(text_path, lang)
        split_dirs = list_dirs(lang_dir)
        for sd, split_dir in enumerate(split_dirs):
            print("\tprocessing split_dir <%s> (%d/%d)" % (split_dir, sd, len(split_dirs)))
            split_files = list_files(join(lang_dir, split_dir))
            for sf, split_file in enumerate(split_files):
                print("\t\tprocessing split_file <%s> (%d/%d)" % (split_file, sf, len(split_files)))
                with BZ2File(join(lang_dir, split_dir, split_file), 'r', buffering=1024*1024) as fi:
                    while True:
                        doc_lines = list(islice(fi, 3))
                        if doc_lines:
                            # some sentences are not followed by a space after the dot
                            doc_lines[1] = dotspace.sub('. ', doc_lines[1])
                            # [workaround] I found the &nbsp; html symbol was not treated, and unescaping it now might not help...
                            doc_lines[1] = escape(doc_lines[1].replace("&nbsp;", " "))
                            yield doc_lines, lang
                        else:
                            break

def _extract_title(doc_lines):
    m = re.search('title="(.+?)"', doc_lines[0])
    if m:
        return m.group(1).decode('utf-8')
    else:
        raise ValueError("Error in xml format: document head is %s" % doc_lines[0])

def _create_doc(target_file, id, doc, lang):
    doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
    with open(target_file, 'w') as fo:
        # the name of the wrapper element is arbitrary: _load_multilang_doc only relies on the
        # inner <doc lang="..."> elements
        fo.write('<multidoc id="%s">\n' % id)
        fo.writelines(doc)
        fo.write('</multidoc>')

def _append_doc(target_file, doc, lang):
    doc[0] = doc[0][:-2] + (' lang="%s">\n' % lang)
    with open(target_file, 'r', buffering=1024*1024) as fi:
        lines = fi.readlines()
    if doc[0] in lines[1::3]:  # this language-specific version is already in the file
        return
    lines[-1:-1] = doc  # insert the new version right before the closing tag
    with open(target_file, 'w', buffering=1024*1024) as fo:
        fo.writelines(lines)

def extract_multilingual_documents(inv_dict, langs, text_path, out_path):
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    for lang in langs:
        if lang not in inv_dict:
            raise ValueError("Lang %s is not in the dictionary" % lang)

    docs_created = len(list_files(out_path))
    print("%d multilingual documents found." % docs_created)
    for doc, lang in _doc_generator(text_path, langs):
        title = _extract_title(doc)
        if title in inv_dict[lang]:
            ids = inv_dict[lang][title]
            for id in ids:
                target_file = join(out_path, id) + ".xml"
                if os.path.exists(target_file):
                    _append_doc(target_file, doc, lang)
                else:
                    _create_doc(target_file, id, doc, lang)
                    docs_created += 1
        else:
            # report misses only for titles that do not start with a Latin letter (reduces log noise)
            if not re.match('[A-Za-z]+', title):
                print("Title <%s> for lang <%s> not in dictionary" % (title, lang))

def extract_multilingual_titles_from_simplefile(data_dir, filename, langs, policy="IN_ALL_LANGS", return_both=True):
    simplified_file = join(data_dir, filename)
    if policy not in policies:
        raise ValueError("Policy %s not supported." % policy)
    print("extracting multilingual titles with policy %s (%s)" % (policy, ' '.join(langs)))

    lang_prefix = list(langs)
    lang_prefix.sort()
    pickle_prefix = "extraction_" + "_".join(lang_prefix) + "." + policy
    pickle_dict = join(data_dir, pickle_prefix + ".multi_dict.pickle")
    pickle_invdict = join(data_dir, pickle_prefix + ".multi_invdict.pickle")
    if os.path.exists(pickle_invdict):
        if return_both and os.path.exists(pickle_dict):
            print("Pickled files found in %s. Loading both (direct and inverse dictionaries)." % data_dir)
            return pickle.load(open(pickle_dict, 'rb')), pickle.load(open(pickle_invdict, 'rb'))
        elif not return_both:
            print("Pickled file found in %s. Loading inverse dictionary only." % pickle_invdict)
            return pickle.load(open(pickle_invdict, 'rb'))

    multiling_titles = {}
    inv_dict = {lang: {} for lang in langs}

    def process_entry(line):
        parts = line.strip().split('\t')
        id = parts[0]
        if id in multiling_titles:
            raise ValueError("id <%s> already indexed" % id)

        titles = dict((lang_title[:lang_title.find(':')], lang_title[lang_title.find(':')+1:].decode('utf-8'))
                      for lang_title in parts[1:])
        for lang in list(titles.keys()):
            if lang not in langs:
                del titles[lang]

        if (policy == "IN_ALL_LANGS" and len(titles) == len(langs)) \
                or (policy == "IN_ANY_LANG" and len(titles) > 0):
            multiling_titles[id] = titles
            for lang, title in titles.items():
                if title in inv_dict[lang]:
                    inv_dict[lang][title].append(id)
                else:
                    inv_dict[lang][title] = [id]

    with BZ2File(simplified_file, 'r', buffering=1024*1024*16) as fi:
        completed = 0
        try:
            for line in fi:
                process_entry(line)
                completed += 1
                if completed % 10 == 0:
                    print("\rCompleted %d\ttitles %d" % (completed, len(multiling_titles)), end="")
            print("\rCompleted %d\t\ttitles %d" % (completed, len(multiling_titles)), end="\n")
        except EOFError:
            print("\nUnexpected file ending... saving anyway")

    print("Pickling dictionaries in %s" % data_dir)
    pickle.dump(multiling_titles, open(pickle_dict, 'wb'), pickle.HIGHEST_PROTOCOL)
    pickle.dump(inv_dict, open(pickle_invdict, 'wb'), pickle.HIGHEST_PROTOCOL)
    print("Done")

    return (multiling_titles, inv_dict) if return_both else inv_dict

# processes the all-entities dump (https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2) as a
# stream and creates a simplified (tab-separated) version that is much smaller and far faster to process
def simplify_json_file(data_dir, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2"):
    latest_all_json_file = join(data_dir, json_file)

    if policy not in policies:
        raise ValueError("Policy %s not supported." % policy)
    print("extracting multilingual titles with policy %s (%s)" % (policy, ' '.join(langs)))

    lang_prefix = list(langs)
    lang_prefix.sort()
    # name of the simplified file, relative to data_dir; this is also the value returned, so that it can be
    # passed directly as the filename argument of extract_multilingual_titles_from_simplefile
    simple_titles_path = "extraction_" + "_".join(lang_prefix) + "." + policy + ".simple.bz2"

    def process_entry(last, fo):
        id = last["id"]
        titles = None
        if policy == "IN_ALL_LANGS" and langs.issubset(last["labels"].keys()):
            titles = {lang: last["labels"][lang]["value"] for lang in langs}
        elif policy == "IN_ANY_LANG":
            titles = {lang: last["labels"][lang]["value"] for lang in langs if lang in last["labels"]}

        if titles:
            fo.write((id + '\t' + '\t'.join([lang + ':' + titles[lang] for lang in titles.keys()]) + '\n').encode('utf-8'))
            return True
        else:
            return False

    written = 0
    with BZ2File(latest_all_json_file, 'r', buffering=1024*1024*16) as fi, \
            BZ2File(join(data_dir, simple_titles_path), 'w') as fo:
        builder = ObjectBuilder()
        completed = 0
        for event, value in ijson.basic_parse(fi, buf_size=1024*1024*16):
            builder.event(event, value)
            if len(builder.value) > 1:
                if process_entry(builder.value.pop(0), fo):
                    written += 1
                completed += 1
                print("\rCompleted %d\ttitles %d" % (completed, written), end="")
        print("")
        # process the last entry
        process_entry(builder.value.pop(0), fo)

    return simple_titles_path

"""
Reads all multilingual documents in a folder (see wikipedia_tools.py to generate them) and generates, for each of
the specified languages, a list containing all its documents, so that the i-th element of any list refers to the
language-specific version of the same document. Each document is required to have a version in all specified
languages and to contain a minimum number of words; otherwise it is discarded.
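
For example, assuming the multilingual documents have been generated under wiki_multi_path, a call like
    docs = fetch_wikipedia_multilingual(wiki_multi_path, ['en', 'it'], min_words=100)
returns a dictionary of aligned lists {'en': [...], 'it': [...]}, in which docs['en'][i] and docs['it'][i] are the
English and Italian versions of the same Wikipedia page (the two languages here are purely illustrative).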
""" class MinWordsNotReached(Exception): pass class WrongDocumentFormat(Exception): pass def _load_multilang_doc(path, langs, min_words=100): import xml.etree.ElementTree as ET from xml.etree.ElementTree import Element, ParseError try: root = ET.parse(path).getroot() doc = {} for lang in langs: doc_body = root.find('.//doc[@lang="' + lang + '"]') if isinstance(doc_body, Element): n_words = len(doc_body.text.split(' ')) if n_words >= min_words: doc[lang] = doc_body.text else: raise MinWordsNotReached else: raise WrongDocumentFormat except ParseError: raise WrongDocumentFormat return doc #returns the multilingual documents mapped by language, and a counter with the number of documents readed def fetch_wikipedia_multilingual(wiki_multi_path, langs, min_words=100, deletions=False, max_documents=-1, pickle_name=None): if pickle_name and os.path.exists(pickle_name): print("unpickling %s" % pickle_name) return pickle.load(open(pickle_name, 'rb')) multi_docs = list_files(wiki_multi_path) mling_documents = {l:[] for l in langs} valid_documents = 0 minwords_exception = 0 wrongdoc_exception = 0 for d,multi_doc in enumerate(multi_docs): print("\rProcessed %d/%d documents, valid %d/%d, few_words=%d, few_langs=%d" % (d, len(multi_docs), valid_documents, len(multi_docs), minwords_exception, wrongdoc_exception),end="") doc_path = join(wiki_multi_path, multi_doc) try: m_doc = _load_multilang_doc(doc_path, langs, min_words) valid_documents += 1 for l in langs: mling_documents[l].append(m_doc[l]) except MinWordsNotReached: minwords_exception += 1 if deletions: os.remove(doc_path) except WrongDocumentFormat: wrongdoc_exception += 1 if deletions: os.remove(doc_path) if max_documents>0 and valid_documents>=max_documents: break if pickle_name: print("Pickling wikipedia documents object in %s" % pickle_name) pickle.dump(mling_documents, open(pickle_name, 'wb'), pickle.HIGHEST_PROTOCOL) return mling_documents def random_wiki_sample(l_wiki, max_documents): if max_documents == 0: return None langs = list(l_wiki.keys()) assert len(np.unique([len(l_wiki[l]) for l in langs])) == 1, 'documents across languages do not seem to be aligned' ndocs_per_lang = len(l_wiki[langs[0]]) if ndocs_per_lang > max_documents: sel = set(np.random.choice(list(range(ndocs_per_lang)), max_documents, replace=False)) for lang in langs: l_wiki[lang] = [d for i, d in enumerate(l_wiki[lang]) if i in sel] return l_wiki if __name__ == "__main__": wikipedia_home = "../Datasets/Wikipedia" from data.languages import JRC_LANGS_WITH_NLTK_STEMMING as langs langs = frozenset(langs) simple_titles_path = simplify_json_file(wikipedia_home, langs, policy="IN_ALL_LANGS", json_file="latest-all.json.bz2") _, inv_dict = extract_multilingual_titles_from_simplefile(wikipedia_home, simple_titles_path, langs, policy='IN_ALL_LANGS') extract_multilingual_documents(inv_dict, langs, join(wikipedia_home,'text'), out_path=join(wikipedia_home, 'multilingual_docs_JRC_NLTK'))