16 KiB
16 KiB
In [1]:
import ast
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# importing useful Python utility libraries we'll need
from collections import Counter, defaultdict
import itertools
In [2]:
import xml.etree.ElementTree as ET
In [3]:
#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
In [4]:
#root = tree.getroot()
In [5]:
from bs4 import BeautifulSoup
In [6]:
def read_tei(tei_file):
    """Parse a TEI XML file into a BeautifulSoup tree.

    Parameters
    ----------
    tei_file : str
        Path to the TEI XML file.

    Returns
    -------
    BeautifulSoup
        The parsed document tree.

    Raises
    ------
    RuntimeError
        If parsing did not produce a soup object.
    """
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
    # The original version raised *after* `return`, which was unreachable;
    # keep the guard but make it actually effective.
    if soup is None:
        raise RuntimeError('Cannot generate a soup from the input')
    return soup
In [7]:
def elem_to_text(elem, default=''):
    """Return the whitespace-normalized text of a BeautifulSoup element.

    Parameters
    ----------
    elem : bs4 element or None
        Typically the result of a `find(...)` call, which is None on no match.
    default : str
        Value returned when `elem` is None.

    Returns
    -------
    str
        `elem.getText(separator=' ', strip=True)`, or `default` for None.
    """
    # Use an explicit None check: bs4 tags define truthiness via their child
    # count, so a present-but-empty tag would be treated as missing by `if elem:`.
    if elem is not None:
        return elem.getText(separator=' ', strip=True)
    return default
In [8]:
from dataclasses import dataclass
@dataclass
class Person:
    """Author name record extracted from a TEI header by `TEIFile.authors`."""
    # Name components; callers fill them with '' when absent in the source XML.
    firstname: str
    middlename: str
    surname: str
In [59]:
class TEIFile(object):
    """Accessor for a TEI-encoded text parsed with BeautifulSoup.

    Exposes the document's title, authors, bibliography, plain text and the
    `<lm>` lemma annotations as properties. Tuple shapes returned by the
    lemma properties are unchanged from the original implementation.
    """

    def __init__(self, filename):
        self.filename = filename        # source path, kept for provenance
        self.soup = read_tei(filename)  # parsed document tree
        self._text = None               # lazy cache for `text`
        self._title = ''                # lazy cache for `title`
        self._abstract = ''             # reserved; not populated here

    @property
    def title(self):
        """Document title, or "na" when the TEI has no <title> element (cached)."""
        if not self._title:
            if not self.soup.title:
                self._title = "na"
            else:
                self._title = self.soup.title.getText().replace('\n', '').strip()
        return self._title

    @property
    def authors(self):
        """All header authors as `Person` records.

        <author> elements without a (non-empty) <persName> are skipped.
        """
        result = []
        for author in self.soup.find_all('author'):
            persname = author.persname
            # truthiness check kept on purpose: it also skips empty persName tags
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def bibliography(self):
        """Whitespace-normalized text of every <bibl> entry."""
        result = []
        for bibl in self.soup.find_all('bibl'):
            if not bibl:
                continue
            entry = elem_to_text(bibl).replace('\n', '').strip()
            entry = entry.replace(' .', '.')  # remove space before full stop
            result.append(" ".join(entry.split()))  # collapse internal runs of spaces
        return result

    @property
    def text(self):
        """Plain text of all untyped <div1> sections, joined by spaces (cached)."""
        if not self._text:
            divs_text = []
            for div in self.soup.body.find_all("div1"):
                # No `type` attribute means the div is neither an appendix
                # nor references: plain text we want to keep.
                if not div.get("type"):
                    divs_text.append(div.get_text(separator=' ', strip=True))
            self._text = " ".join(divs_text)
        return self._text

    @staticmethod
    def _lemma_attr(lm):
        """Value of an <lm> tag's 'lemma' attribute, or 'non_spec' when absent."""
        lemma = lm.get('lemma')
        return lemma if lemma is not None else "non_spec"

    @staticmethod
    def _canto_of(lm):
        """Heading of the <div1> (canto) enclosing `lm`, or None when not found.

        The first child of a <div1> is assumed to be the canto heading text
        node — TODO confirm against the TEI schema used for this corpus.
        """
        for parent in lm.parents:
            if parent.name == 'div1':
                return parent.contents[0]
        return None

    @property
    def orderedlemma(self):
        """(text, category, italian lemma, canto, verse#, position-in-verse) per <lm>.

        Verse numbers increase across the whole document (the counter is not
        reset per canto); positions restart at 1 within each verse.
        """
        ordr_lms = []
        verse_no = 0
        for div in self.soup.body.find_all("div1"):
            for verso in div.find_all('l'):
                verse_no += 1
                for pos, lm in enumerate(verso.find_all("lm"), start=1):
                    lm_text = elem_to_text(lm).strip()
                    ctg = lm.get('catg')
                    lemma = self._lemma_attr(lm)
                    canto = self._canto_of(lm)
                    ordr_lms.append((
                        " ".join(lm_text.split()),
                        " ".join(ctg.split()),
                        " ".join(lemma.split()),
                        canto.replace('\n', '').strip(),
                        verse_no,
                        pos,
                    ))
        return ordr_lms

    @property
    def lemma(self):
        """(text, category, italian lemma, canto) for every <lm> in the body."""
        lms_text = []
        for lm in self.soup.body.find_all("lm"):
            lm_text = elem_to_text(lm).strip()
            ctg = lm.get('catg')
            lemma = self._lemma_attr(lm)
            canto = self._canto_of(lm)
            lms_text.append((
                " ".join(lm_text.split()),
                " ".join(ctg.split()),
                " ".join(lemma.split()),
                canto.replace('\n', '').strip(),
            ))
        return lms_text

    @property
    def categ_lemma(self):
        """Grammatical category ('catg' attribute) of every <lm> in the body."""
        return [" ".join(lm.get('catg').strip().split())
                for lm in self.soup.body.find_all("lm")]

    @property
    def lemma_lemma(self):
        """Italian lemma of every <lm>; 'non_spec' when the attribute is missing.

        Uses a truthiness fallback (not an explicit None check) so that an
        empty 'lemma' attribute is also mapped to the placeholder, matching
        the original behavior of this property.
        """
        lemmas_text = []
        for lm in self.soup.body.find_all("lm"):
            lemma_text = lm.get('lemma') or 'non_spec'
            lemmas_text.append(" ".join(lemma_text.strip().split()))
        return lemmas_text
In [60]:
def tei_to_csv_entry(tei_file):
    """Parse `tei_file` and return its three lemma tables.

    Note: despite the historical name, this returns a tuple of Python lists
    (`orderedlemma`, `categ_lemma`, `lemma_lemma`), not a CSV row.

    Parameters
    ----------
    tei_file : str
        Path to the TEI XML file.
    """
    tei = TEIFile(tei_file)
    print(f"Handled {tei_file}")  # progress feedback when processing many files
    return tei.orderedlemma, tei.categ_lemma, tei.lemma_lemma
Provo a vedere se il parser funziona
Dovrebbe arrivare sino al termine 'oscura'
In [61]:
# Smoke test: parse the Inferno markup and print annotated lemmas until the
# first one starting with 'oscura' (it should get that far without errors).
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
bbs = tei.orderedlemma
for entry in bbs:  # renamed from `re`, which shadows the stdlib module name
    print(entry, end="\n" * 2)
    if entry[0].startswith('oscura'):
        print('...')
        break
Carico il testo e creo una tabella
Faccio il parsing del testo e creo una tabella con 6 colonne: *Lemma, Categoria, LemmaItaliano, Canto, Verso, PosizioneLemmaNelVerso*
In [63]:
# Parse the Inferno file and keep the (orderedlemma, categ_lemma, lemma_lemma) tables.
mytesto = tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
In [66]:
# Build the lemma table: one row per <lm>, in document order.
data = [mytesto[0]]  # kept for compatibility with cells that may reference it
# (removed `dfObj`, an unused duplicate DataFrame construction)
testo_tabella = pd.DataFrame(
    data[0],
    columns=['Lemma', 'Categoria', 'LemmaItaliano', 'Canto', 'Verso',
             'PosizioneLemmaNelVerso'])
# Non-null count per column — quick sanity check of table completeness.
testo_tabella.count()
Out[66]:
In [67]:
# Preview the first 10 rows of the lemma table.
testo_tabella.head(10)
Out[67]: