"""Flatten the quotations of an XML commentary file into a CSV table.

Every ``Citazione`` element found directly inside a ``Nota`` becomes one row.
The row repeats the note's ``Testo`` and serialized ``Body`` and then lists
the quotation's own fields (fragment, type, quoted work and author details).
"""

from pprint import pprint
from xml.etree import ElementTree
from xml.etree.ElementTree import parse

import pandas as pd

XML_PATH = '../commentaries/'
abs_path = '/home/andrea/sviluppo/hdn/commentaries/rime-final.xml'

# Placeholder stored in a cell when the corresponding XML element is absent.
MISSING = '_EMPTY'

# Column order of the output table; must match the field order produced by
# ``parse_notes`` (note text, note body, then the eight quotation fields).
COLUMNS = ['text', 'comment', 'fragment', 'quot_type', 'quot_uri',
           'quot_title', 'quot_author', 'author_uri', 'quot_theme',
           'quot_work_type']


def remove_whitespaces(text):
    """Return *text* stripped of leading and trailing whitespace.

    ``None`` is passed through unchanged, so elements that exist but carry
    no text yield ``None``.
    """
    if text is None:
        return None
    return text.strip()


def clear_comment(comment):
    """Clean up the serialized body of a note.

    Removes every occurrence of the two-character sequence backslash + ``n``
    and strips surrounding whitespace. The enclosing ``<Body>`` tags produced
    by ``ElementTree.tostring`` are left in place: the original code replaced
    the empty string with the empty string, which changed nothing.
    """
    # NOTE(review): this targets a literal backslash followed by "n", not a
    # newline character - confirm whether real newlines were meant.
    comment = comment.replace('\\n', '')
    return remove_whitespaces(comment)


def _find(parent, tag):
    """Return the first direct child *tag* of *parent*; None if either is absent."""
    if parent is None:
        return None
    return parent.find(tag)


def _text_or_missing(element):
    """Return the stripped text of *element*, or ``MISSING`` if it is None."""
    if element is None:
        return MISSING
    return remove_whitespaces(element.text)


def _quotation_fields(quotation):
    """Return the eight quotation cells of one ``Citazione`` element.

    A missing ``InfoOperaCitata`` or ``Autore`` container is treated like any
    other missing element: all of its fields become ``MISSING`` instead of
    raising ``AttributeError``.
    """
    work = quotation.find('InfoOperaCitata')
    author = _find(work, 'Autore')
    elements = [
        quotation.find('Frammento'),   # fragment
        quotation.find('Tipo'),        # quot_type
        _find(work, 'OperaURI'),       # quot_uri
        _find(work, 'TitoloOpera'),    # quot_title
        _find(author, 'Name'),         # quot_author
        _find(author, 'URI'),          # author_uri
        _find(work, 'Area'),           # quot_theme
        _find(work, 'TipoOpera'),      # quot_work_type
    ]
    return [_text_or_missing(element) for element in elements]


def parse_notes(notes):
    """Build the table rows for all ``Nota`` children of *notes*.

    Args:
        notes: element whose direct ``Nota`` children are processed.

    Returns:
        A list of rows (lists of ten cells, ordered as ``COLUMNS``), one per
        ``Citazione`` directly inside a ``Nota``. Notes without quotations
        contribute no rows.
    """
    rows = []
    for note in notes.iterfind('Nota'):
        text = _text_or_missing(note.find('Testo'))
        body = note.find('Body')
        if body is None:
            comment = MISSING
        else:
            # The body is kept as serialized XML so inline markup survives.
            comment = clear_comment(
                ElementTree.tostring(body, encoding='unicode'))
        note_cells = [text, comment]
        for quotation in note.iterfind('Citazione'):
            rows.append(note_cells + _quotation_fields(quotation))
    return rows


def main():
    """Parse the commentary file at ``abs_path`` and write the CSV table.

    Raises:
        ValueError: if the document has no ``Notes`` element.
    """
    # TODO: check correct encoding (e.g., ' mia provedença')
    corpus = parse(abs_path, parser=ElementTree.XMLParser(encoding='utf-8'))
    commentaries = corpus.find('Notes')
    if commentaries is None:
        raise ValueError('no <Notes> element found in {}'.format(abs_path))
    final_res = parse_notes(commentaries)
    df_out = pd.DataFrame(final_res, columns=COLUMNS)
    df_out.to_csv('../commentaries/data_parsed/rime_DF.csv', index=False)


if __name__ == '__main__':
    main()