# eventExtractionHDN/preprocessing/xml_parser.py

from xml.etree import ElementTree
from xml.etree.ElementTree import parse
from pprint import pprint
import pandas as pd

# Relative location of the commentary files.
XML_PATH = '../commentaries/'
# Absolute path of the XML corpus; it is parsed as soon as this module runs.
abs_path = '/home/andrea/sviluppo/hdn/commentaries/rime-final.xml'

# TODO: check correct encoding  (e.g., ' mia proveden&#231;a')
_utf8_parser = ElementTree.XMLParser(encoding='utf-8')
corpus = ElementTree.parse(abs_path, parser=_utf8_parser)
# Container element holding the individual <Nota> entries; `find` yields None
# when the document has no <Notes> child under its root.
commentaries = corpus.find('Notes')


def remove_whitespaces(text):
    """Return *text* without leading and trailing whitespace.

    ``None`` is passed through unchanged, so the text of an empty XML element
    can be handed in directly.
    """
    return None if text is None else text.strip()

def clear_comment(comment):
    """Reduce a serialized ``<Body>`` element to its bare, single-line content.

    Args:
        comment: The string form of a ``<Body>`` element, e.g. the result of
            ``ElementTree.tostring(body, encoding='unicode')``.

    Returns:
        The input with the ``<Body>``/``</Body>`` tags and all line breaks
        removed, stripped of leading and trailing whitespace.
    """
    # '\\n' is the two-character sequence backslash + n (as found in the repr
    # of a bytes serialization); '\n' is the real newline that a unicode
    # serialization contains. Both are removed so the result is always a
    # single line, whichever way the element was serialized.
    for unwanted in ('<Body>', '</Body>', '\\n', '\n'):
        comment = comment.replace(unwanted, '')
    return comment.strip()

def _find_child(parent, tag):
    """Return the direct child *tag* of *parent*, or None if either is missing."""
    return None if parent is None else parent.find(tag)


def _text_or_empty(element):
    """Return the stripped text of *element*, or '_EMPTY' when it is absent."""
    if element is None:
        return '_EMPTY'
    return remove_whitespaces(element.text)


# One row per <Citazione>: the note's text and cleaned comment body, followed
# by the quotation fields. Notes without any <Citazione> produce no rows.
final_res = []
for comment in commentaries.iterfind('Nota'):
    body = comment.find('Body')
    if body is None:
        comment_body = '_EMPTY'
    else:
        # Serialize the whole element so that inline markup inside <Body> is
        # kept; clear_comment then drops the enclosing tags.
        comment_body = clear_comment(ElementTree.tostring(body, encoding='unicode'))
    note_fields = [_text_or_empty(comment.find('Testo')), comment_body]

    for quotation in comment.iterfind('Citazione'):
        # Either container may be missing; _find_child then yields None for
        # everything below it instead of raising AttributeError.
        info_quoted_work = quotation.find('InfoOperaCitata')
        author_details = _find_child(info_quoted_work, 'Autore')
        # Order must match the column list passed to the DataFrame.
        quotation_elements = [
            quotation.find('Frammento'),
            quotation.find('Tipo'),
            _find_child(info_quoted_work, 'OperaURI'),
            _find_child(info_quoted_work, 'TitoloOpera'),
            _find_child(author_details, 'Name'),
            _find_child(author_details, 'URI'),
            _find_child(info_quoted_work, 'Area'),
            _find_child(info_quoted_work, 'TipoOpera'),
        ]
        final_res.append(
            note_fields + [_text_or_empty(element) for element in quotation_elements]
        )

# Column names, in the same order as the fields of each row in final_res.
_columns = [
    'text', 'comment', 'fragment', 'quot_type',
    'quot_uri', 'quot_title', 'quot_author',
    'author_uri', 'quot_theme', 'quot_work_type',
]
df_out = pd.DataFrame(final_res, columns=_columns)

# Write the table without the positional index column.
df_out.to_csv('../commentaries/data_parsed/rime_DF.csv', index=False)