66 lines
2.6 KiB
Python
66 lines
2.6 KiB
Python
from xml.etree import ElementTree
|
|
from xml.etree.ElementTree import parse
|
|
from pprint import pprint
|
|
import pandas as pd
|
|
|
|
# Relative directory of the commentaries; not referenced in the visible code.
XML_PATH = '../commentaries/'
# Absolute location of the XML file that is actually parsed below.
abs_path = '/home/andrea/sviluppo/hdn/commentaries/rime-final.xml'

# Parse the corpus with a parser that is explicitly told to use UTF-8.
# TODO: check correct encoding (e.g., ' mia provedença')
corpus = parse(
    abs_path,
    parser=ElementTree.XMLParser(encoding='utf-8'),
)
# First 'Notes' child of the document root (None when there is no such child).
commentaries = corpus.find('Notes')
|
|
|
|
|
|
|
|
def remove_whitespaces(text):
    """Strip leading and trailing whitespace from *text*.

    Returns ``None`` when *text* is ``None``; otherwise returns the string
    with whitespace removed from both ends.
    """
    return None if text is None else text.strip()
|
|
|
|
def clear_comment(comment):
    """Remove the ``<Body>`` wrapper tags from a serialized comment.

    Deletes every ``<Body>`` and ``</Body>`` occurrence and every literal
    backslash-``n`` sequence, then trims the result with
    ``remove_whitespaces``.
    """
    # NOTE(review): '\\n' is a backslash followed by 'n', not a newline
    # character -- confirm this is the sequence that should be removed.
    for token in ('<Body>', '</Body>', '\\n'):
        comment = comment.replace(token, '')
    return remove_whitespaces(comment)
|
|
|
|
def _find_child(parent, tag):
    """Return the first child of *parent* named *tag*, or ``None``.

    A ``None`` parent yields ``None`` instead of raising AttributeError, so
    lookups below an absent wrapper element can be chained safely.
    """
    return None if parent is None else parent.find(tag)


def _quotation_fields(quotation):
    """Collect the eight quotation columns from a ``Citazione`` element.

    Elements that are missing -- including everything below a missing
    ``InfoOperaCitata`` or ``Autore`` -- are reported as ``'_EMPTY'``.
    Elements that are present contribute their text passed through
    ``remove_whitespaces``. The order of the returned list matches the
    quotation columns of the output DataFrame.
    """
    info_quoted_work = quotation.find('InfoOperaCitata')
    author_details = _find_child(info_quoted_work, 'Autore')
    elements = [
        quotation.find('Frammento'),
        quotation.find('Tipo'),
        _find_child(info_quoted_work, 'OperaURI'),
        _find_child(info_quoted_work, 'TitoloOpera'),
        _find_child(author_details, 'Name'),
        _find_child(author_details, 'URI'),
        _find_child(info_quoted_work, 'Area'),
        _find_child(info_quoted_work, 'TipoOpera'),
    ]
    return ['_EMPTY' if elem is None else remove_whitespaces(elem.text)
            for elem in elements]


final_res = []
for comment in commentaries.iterfind('Nota'):
    # Columns shared by every quotation of this note: the commented text and
    # the serialized <Body> element with its wrapper tags removed.
    body = comment.find('Body')
    _tmp = [
        remove_whitespaces(comment.find('Testo').text),
        # ?? Original author's note: this is not how the body was extracted
        # the first time.
        clear_comment(ElementTree.tostring(body, encoding='unicode')),
    ]
    # One output row per direct <Citazione> child; a note without any
    # quotation contributes no rows.
    for quotation in comment.iterfind('Citazione'):
        final_res.append(_tmp + _quotation_fields(quotation))

df_out = pd.DataFrame(
    final_res,
    columns=[
        'text', 'comment', 'fragment', 'quot_type',
        'quot_uri', 'quot_title', 'quot_author',
        'author_uri', 'quot_theme', 'quot_work_type',
    ],
)

df_out.to_csv('../commentaries/data_parsed/rime_DF.csv', index=False)