eventExtractionHDN/preprocessing/xml_parser.py

66 lines
2.6 KiB
Python

from xml.etree import ElementTree
from xml.etree.ElementTree import parse
from pprint import pprint
import pandas as pd
# Relative location of the commentaries directory (not referenced by the
# code visible in this file).
XML_PATH = '../commentaries/'

# Absolute location of the corpus file that is actually parsed below.
abs_path = '/home/andrea/sviluppo/hdn/commentaries/rime-final.xml'

# Parse the whole corpus with an explicit UTF-8 parser.
# TODO: check correct encoding (e.g., ' mia provedença')
corpus = ElementTree.parse(
    abs_path,
    parser=ElementTree.XMLParser(encoding='utf-8'),
)

# <Notes> child of the document root; None when the root has no such child.
commentaries = corpus.getroot().find('Notes')
def remove_whitespaces(text):
    """Trim leading and trailing whitespace from *text*.

    Returns None when *text* is None; otherwise returns the string with
    whitespace removed from both ends (inner whitespace is left untouched).
    """
    return None if text is None else text.strip()
def clear_comment(comment):
    """Strip the ``<Body>`` wrapper from a serialized comment string.

    Every occurrence of the literal ``<Body>`` and ``</Body>`` tags and of
    the two-character sequence backslash + ``n`` is removed, then the
    result is trimmed with ``remove_whitespaces``.

    NOTE(review): only the escaped two-character sequence is removed; real
    newline characters inside the text are kept -- confirm this is intended.
    """
    # Markers are removed one after another, in this exact order.
    for marker in ('<Body>', '</Body>', '\\n'):
        comment = comment.replace(marker, '')
    return remove_whitespaces(comment)
def _find_in(parent, tag):
    """Return the direct child *tag* of *parent*, or None if *parent* is None."""
    # Compare against None explicitly: an Element without children is falsy.
    return None if parent is None else parent.find(tag)


def _text_or_empty(elem):
    """Return the trimmed text of *elem*, or '_EMPTY' when the element is absent."""
    return '_EMPTY' if elem is None else remove_whitespaces(elem.text)


# One output row per (note, quotation) pair. A <Nota> without any
# <Citazione> child contributes no rows.
final_res = []
for comment in commentaries.iterfind('Nota'):
    body = comment.find('Body')
    # Columns shared by every quotation of this note: the commented text
    # and the serialized comment body with its <Body> wrapper removed.
    note_fields = [
        remove_whitespaces(comment.find('Testo').text),
        # ?? This is not the way I did it the first time ...
        clear_comment(ElementTree.tostring(body, encoding='unicode')),
    ]

    for quotation in comment.iterfind('Citazione'):
        # Either container may be missing; _find_in then yields None for
        # all of its children instead of raising AttributeError, so the
        # row is filled with '_EMPTY' exactly like a missing leaf element.
        work_info = quotation.find('InfoOperaCitata')
        author = _find_in(work_info, 'Autore')

        # Order must match the DataFrame columns declared below.
        quotation_elems = [
            quotation.find('Frammento'),       # fragment
            quotation.find('Tipo'),            # quot_type
            _find_in(work_info, 'OperaURI'),   # quot_uri
            _find_in(work_info, 'TitoloOpera'),  # quot_title
            _find_in(author, 'Name'),          # quot_author
            _find_in(author, 'URI'),           # author_uri
            _find_in(work_info, 'Area'),       # quot_theme
            _find_in(work_info, 'TipoOpera'),  # quot_work_type
        ]
        final_res.append(
            note_fields + [_text_or_empty(elem) for elem in quotation_elems]
        )

df_out = pd.DataFrame(final_res, columns=['text', 'comment', 'fragment', 'quot_type',
                                          'quot_uri', 'quot_title', 'quot_author',
                                          'author_uri', 'quot_theme', 'quot_work_type'])
df_out.to_csv('../commentaries/data_parsed/rime_DF.csv', index=False)