sshoc-skosmapping/Progetto_Lett.ipynb at 51983970771586baa98740a4feaefdf6f67d8748

36 KiB

Raw Blame History

Test per Parsing e generazione IRI¶

In [ ]:

import ast
import sys
import numpy as np
import pandas as pd
import rdflib
import matplotlib.pyplot as plt
# importing useful Python utility libraries we'll need
from collections import Counter, defaultdict
import itertools

In [ ]:

#from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \
#                           PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \
#                           VOID, XMLNS, XSD
from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \
                            RDF, RDFS, SKOS,  \
                           XMLNS, XSD
from rdflib import Namespace
from rdflib import URIRef, BNode, Literal
n = Namespace("http://hdn.dantenetwork.it/resource/work/commedia/cantica/")

In [ ]:

import xml.etree.ElementTree as ET

In [ ]:

#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')

In [ ]:

#root = tree.getroot()

In [ ]:

from bs4 import BeautifulSoup

In [ ]:

def read_tei(tei_file):
    """Parse the file at *tei_file* and return it as a BeautifulSoup tree.

    The file is opened in text mode with the platform default encoding and
    handed to BeautifulSoup with the ``lxml`` parser. Any error raised by
    ``open`` or by the parser propagates to the caller.

    NOTE(review): no explicit ``encoding`` is passed to ``open``; confirm the
    input files match the platform default before relying on this elsewhere.
    """
    with open(tei_file, 'r') as tei:
        # The original had a `raise RuntimeError` after this return; it could
        # never execute, so it has been dropped.
        return BeautifulSoup(tei, 'lxml')

In [ ]:

def elem_to_text(elem, default=''):
    """Return the text of *elem*, or *default* when *elem* is falsy.

    The text is obtained with ``elem.getText(separator=' ', strip=True)``.
    """
    if not elem:
        return default
    return elem.getText(separator=' ', strip=True)

In [ ]:

from dataclasses import dataclass

@dataclass
class Person:
    """Name parts of one author, as extracted from a TEI header."""

    # Text of the first <forename> found under the author's <persname>.
    firstname: str
    # Text of the <forename type="middle"> element ('' when absent).
    middlename: str
    # Text of the <surname> element ('' when absent).
    surname: str

Parser¶

Provo a creare un parser.

Un estratto dal file inferno.xml:

<div1>  <head>Canto 1</head>
<lg type="canto">
    <l>
      <LM lemma="il" catg="rdms">Nel</LM>
      <LM lemma="in mezzo di" catg="eilaksl">mezzo</LM>
      <LM lemma="il" catg="rdms">del</LM>
      <LM lemma="cammino" catg="sm2ms">cammin</LM>
      <LM lemma="di" catg="epskg">di</LM>
      <LM lemma="nostro" catg="as1fs">nostra</LM>
      <LM lemma="vita" catg="sf1fs">vita</LM>
    </l>
    ...
    ...
    <l>
      <LM lemma="che" catg="pr">che</LM>
      <LM1>
        <LM lemma="il" catg="rdms">nel</LM> 
        <LM lemma="in" catg="epaksl">nel</LM>
      </LM1>
      <LM lemma="pensiero" catg="sm2ms">pensier</LM>
      <LM lemma="rinnovare" catg="vta1ips3">rinova</LM>
      <LM lemma="la" catg="rdfs">la</LM>
      <LM lemma="paura" catg="sf1fs">paura</LM>!
    </l>
    <l>
      ...

Il tag \<div1> individua la porzione di file di un Canto, il tag \<l> individua un verso, il tag \<LM> individua una forma flessa, ciascuna forma flessa ha 1 o 2 attributi. All'interno di un verso può essere presente il tag \<LM1> che ha come content più elementi \<LM>, ciascuno di essi contiene la stessa forma flessa ma differenti valori per gli attributi 'catg' e 'lemma'.

Per questa implementazione uso la libreria Python Beautiful Soup.

In [ ]:

class TEIFile(object):
    """A parsed TEI/XML file exposing its text, inflected forms and IRIs.

    The document is loaded through ``read_tei`` and queried with lower-case
    tag names (``div1``, ``l``, ``lm``, ``lm1``). ``idres`` is embedded in
    every generated IRI right after ``.../commedia/cantica/``.

    All public accessors are properties; most of them re-walk the document on
    every access (only ``title`` and ``text`` cache their result).

    NOTE(review): verse numbers are counted across the whole file and are not
    reset at each ``div1``; confirm this is the intended numbering for the
    ``.../verso/<n>`` IRIs.
    """

    # Placeholder used when an <lm> element lacks a 'catg' or 'lemma' attribute.
    _NOT_SPECIFIED = "non_spec"
    # Common prefix of every verse / inflected-form IRI built by this class.
    _CANTICA_IRI = "http://hdn.dantenetwork.it/resource/work/commedia/cantica/"
    # (spaced, tight) pairs applied in this order when rebuilding a verse string.
    _PUNCTUATION_FIXES = (
        (" ,", ","),
        (" .", "."),
        (" !", "!"),
        (" ?", "?"),
        ("l' ", "l'"),
    )

    def __init__(self, filename, idres=0):
        """Parse *filename* and prepare an empty RDF graph.

        Args:
            filename: path of the TEI/XML file, passed to ``read_tei``.
            idres: identifier inserted in the generated IRIs.
        """
        self.g = rdflib.Graph()
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self.idres = idres
        self.InFor = URIRef("http://example.org/word/InflectedForm")
        self._title = ''
        self._abstract = ''

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _squash(value):
        """Collapse every run of whitespace in *value* into one space."""
        return " ".join(value.split())

    @classmethod
    def _attr_or_placeholder(cls, tag, attr_name):
        """Return the whitespace-normalised attribute, or 'non_spec' if absent."""
        raw = tag.get(attr_name)
        if raw is None:
            raw = cls._NOT_SPECIFIED
        return cls._squash(raw)

    def _verse_iri(self, canto, verse_no):
        """Build the IRI of verse *verse_no* from the raw canto heading."""
        return (self._CANTICA_IRI + str(self.idres) + "/"
                + "/".join(canto.lower().split())
                + "/verso/" + str(verse_no))

    def _form_statements(self, form, canto, verse_no, position):
        """Return the six strings describing one inflected form.

        Order: form IRI, type, component-of, position, expression, occurrence.
        """
        verse_iri = self._verse_iri(canto, verse_no)
        form_iri = verse_iri + "#" + str(position)
        return [
            form_iri,
            form_iri + ' rdf:type InflectedForm . ',
            form_iri + " http://erlangen-crm.org/current/P148_is_component_of " + verse_iri,
            form_iri + ' isInPosition ' + str(position),
            form_iri + ' hasExpression "' + form + '"^^xsd:string .',
            form_iri + ' isOccurrenceOf ulem . ',
        ]

    def _walk_forms(self, collected):
        """Yield one record per ``<lm>`` element, merging LM1 alternatives.

        *collected* is the caller's result list; each of its items must carry
        the normalised form at index 0, and the caller is expected to append
        its own item after every yielded record. When the current ``<lm>`` has
        an ``lm1`` ancestor and the last collected item has the same form, that
        item is popped (and reported in ``replaced``) and the position counter
        is stepped back, so the alternatives share one position.

        Yields:
            ``(lm, form, canto, verse_no, position, replaced)`` where *canto*
            is the raw first child of the enclosing ``div1``.

        NOTE(review): the merge is decided by comparing text with the previous
        item only, so it is a heuristic rather than a check on the LM1 element.
        """
        verse_no = 0
        for div in self.soup.body.find_all("div1"):
            for verso in div.find_all('l'):
                verse_no += 1
                position = 0
                for lm in verso.find_all("lm"):
                    position += 1
                    form = self._squash(elem_to_text(lm).strip())
                    replaced = []
                    for parent in lm.parents:
                        if parent.name == 'div1':
                            canto = parent.contents[0]
                        # `collected` may still be empty on the very first
                        # element; guard before looking at its last item.
                        if (parent.name == 'lm1' and collected
                                and collected[-1][0] == form):
                            position -= 1
                            replaced.append(collected.pop())
                    yield lm, form, canto, verse_no, position, replaced

    # ------------------------------------------------------------------
    # Header metadata
    # ------------------------------------------------------------------
    @property
    def title(self):
        """Document title with newlines removed, or ``"na"`` if there is none."""
        if not self._title:
            if not self.soup.title:
                self._title = "na"
            else:
                self._title = self.soup.title.getText().replace('\n', '').strip()
        return self._title

    @property
    def authors(self):
        """List of ``Person`` built from every ``<author>`` with a ``<persname>``."""
        people = []
        for author in self.soup.find_all('author'):
            persname = author.persname
            if not persname:
                continue
            people.append(Person(
                elem_to_text(persname.find("forename")),
                elem_to_text(persname.find("forename", type="middle")),
                elem_to_text(persname.surname),
            ))
        return people

    @property
    def bibliography(self):
        """Whitespace-normalised text of every ``<bibl>`` element."""
        entries = []
        for bibl in self.soup.find_all('bibl'):
            if not bibl:
                continue
            entry = elem_to_text(bibl).replace('\n', '').strip()
            entry = entry.replace(' .', '.')
            entries.append(self._squash(entry))
        return entries

    @property
    def text(self):
        """Plain text of all ``div1`` elements without a ``type`` attribute.

        The result is cached after the first non-empty computation.
        """
        if not self._text:
            self._text = " ".join(
                div.get_text(separator=' ', strip=True)
                for div in self.soup.body.find_all("div1")
                if not div.get("type")
            )
        return self._text

    # ------------------------------------------------------------------
    # Inflected forms
    # ------------------------------------------------------------------
    @property
    def orderedlemma(self):
        """Inflected forms in document order.

        Returns a list of tuples ``(form, categories, lemmas, canto, verse_no,
        position)``. ``categories`` and ``lemmas`` are lists: they hold more
        than one value when LM1 alternatives were merged (current alternative
        first, then the ones it replaced).
        """
        rows = []
        for lm, form, canto, verse_no, position, replaced in self._walk_forms(rows):
            categories = [self._attr_or_placeholder(lm, 'catg')]
            lemmas = [self._attr_or_placeholder(lm, 'lemma')]
            for previous in replaced:
                categories = categories + previous[1]
                lemmas = lemmas + previous[2]
            rows.append((form, categories, lemmas,
                         canto.replace('\n', '').strip(), verse_no, position))
        return rows

    # IRI of each inflected form (plain strings)
    @property
    def IRIff(self):
        """IRIs and textual statements for every inflected form.

        Returns a list of tuples ``(form, canto, verse_no, position,
        statements)`` where ``statements`` is the list produced by
        ``_form_statements``.
        """
        rows = []
        for _lm, form, canto, verse_no, position, _replaced in self._walk_forms(rows):
            rows.append((form, canto.replace('\n', '').strip(), verse_no, position,
                         self._form_statements(form, canto, verse_no, position)))
        return rows

    # IRI of each inflected form (RDF graph)
    @property
    def IRIffRDF(self):
        """Add an ``rdf:type`` triple per inflected form to ``self.g`` and return it.

        For each form IRI any existing triples with that subject are removed
        from ``self.g`` before the type triple is added. The same graph object
        is returned on every access.
        """
        seen = []
        for _lm, form, canto, verse_no, position, _replaced in self._walk_forms(seen):
            form_iri = URIRef(self._verse_iri(canto, verse_no) + "#" + str(position))
            self.g.remove((form_iri, None, None))
            self.g.add((form_iri, RDF.type, self.InFor))
            # Only the form is needed for the LM1 merge check of the next item.
            seen.append((form,))
        return self.g

    # IRI of each verse
    @property
    def IRIverso(self):
        """Verse text and verse description for every ``<l>`` element.

        Returns a list of tuples ``(verse_no, verse_text, statement)``. In the
        verse text every ``lm1`` child is reduced to its first word, and the
        spacing before punctuation is tightened.
        """
        rows = []
        verse_no = 0
        for div in self.soup.body.find_all("div1"):
            for verso in div.find_all('l'):
                verse_no += 1
                verso_text = elem_to_text(verso).strip()

                # Collect the text of the <LM1> children first, then collapse
                # each one to its first word inside the verse text.
                lm1_texts = [elem_to_text(child).strip()
                             for child in verso.children
                             if child.name == 'lm1']
                for lm1_text in lm1_texts:
                    verso_text = verso_text.replace(lm1_text,
                                                    lm1_text.partition(' ')[0])

                for ancestor in verso.parents:
                    if ancestor.name == 'div1':
                        canto = ancestor.contents[0]

                for spaced, tight in self._PUNCTUATION_FIXES:
                    verso_text = verso_text.replace(spaced, tight)

                statement = (
                    self._verse_iri(canto, verse_no)
                    + '\n a efrbroo:F2_Expression ,\n rdfs:Resource ; \nhttp://erlangen-crm.org/current/P190_has_symbolic_content "'
                    + verso_text.strip()
                    + '"^^xsd:string ;\n http://erlangen-crm.org/current/P3_has_note'
                    + ' "' + str(verse_no)
                    + '"^^xsd:int ;\n http://hdn.dantenetwork.it/resource/has_number "'
                    + str(verse_no) + '"^^xsd:int .'
                )
                rows.append((verse_no, verso_text.strip(), statement))
        return rows

    # test
    @property
    def ff_ea(self):
        """Flat list of ``(form, category, lemma, canto)`` for every ``<lm>``.

        No LM1 merging is done here. A missing ``catg`` or ``lemma`` attribute
        is reported as ``"non_spec"``.
        """
        rows = []
        for lm in self.soup.body.find_all("lm"):
            form = elem_to_text(lm).strip()
            category = self._attr_or_placeholder(lm, 'catg')
            lemma = self._attr_or_placeholder(lm, 'lemma')
            for parent in lm.parents:
                if parent.name == 'div1':
                    canto = parent.contents[0]
                    break
            rows.append((self._squash(form), category, lemma,
                         canto.replace('\n', '').strip()))
        return rows

    @property
    def categ_lemma(self):
        """Normalised ``catg`` of every ``<lm>`` (``"non_spec"`` when absent)."""
        return [self._attr_or_placeholder(lm, 'catg')
                for lm in self.soup.body.find_all("lm")]

    @property
    def lemma_lemma(self):
        """Normalised ``lemma`` of every ``<lm>`` (``"non_spec"`` when absent or empty)."""
        return [self._squash(lm.get('lemma') or self._NOT_SPECIFIED)
                for lm in self.soup.body.find_all("lm")]

In [ ]:

def tei_to_csv_entry(tei_file, idres=0):
    """Parse *tei_file* and return all the tables derived from it.

    Args:
        tei_file: path of the TEI/XML file to parse.
        idres: identifier forwarded to ``TEIFile`` and embedded in the IRIs.

    Returns:
        A 6-tuple ``(orderedlemma, IRIverso, IRIff, IRIffRDF, categ_lemma,
        lemma_lemma)`` read from the ``TEIFile`` properties, in that order.

    A "Handled <path>" line is printed once the file has been loaded.
    """
    tei = TEIFile(tei_file, idres)
    print(f"Handled {tei_file}")
    return (
        tei.orderedlemma,
        tei.IRIverso,
        tei.IRIff,
        tei.IRIffRDF,
        tei.categ_lemma,
        tei.lemma_lemma,
    )

Provo a vedere se il parser funziona¶

Dovrebbe arrivare sino al termine 'oscura', controllare!

In [ ]:

# Smoke test: print the parsed forms until the first one starting with 'oscura'.
tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)
bbs = tei.ff_ea
for entry in bbs:
    print(entry, end="\n\n")
    if entry[0].startswith('oscura'):
        print('...')
        break

In [ ]:

import pprint

# Populate the RDF graph of inflected forms and print its size.
g1 = tei.IRIffRDF
print(len(g1))

# Dump every item yielded by iterating the graph.
for triple in g1:
    pprint.pprint(triple)

Elaboro il file inferno.xml¶

Eseguo il parsing del testo presente nel file e creo una tabella con le seguenti colonne: forma flessa, categoria, lemma, canto, verso, posizione forma flessa nel verso

In [ ]:

# Parse the Inferno file with id 1; mytesto is the tuple returned by tei_to_csv_entry.
mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)

In [ ]:

# Table of inflected forms built from the first element of mytesto.
data = [mytesto[0]]
dfObj = pd.DataFrame(data=data[0])
testo_tabella = pd.DataFrame(
    data=data[0],
    columns=['FormaFlessa', 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso'],
)
testo_tabella.count()

In [ ]:

# Show the last 10 rows of the inflected-form table.
testo_tabella.tail(10)

Creo una tabella con gli IRI dei versi per la cantica Inferno¶

La tabella contiene il numero del verso, il verso e l'IRI del verso.
Per l'IRI del verso mi son basato su quanto riportato nel file Commedia.rdf, un esempio è il seguente:

http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/9/verso/106
a efrbroo:F2_Expression , rdfs:Resource ;
http://erlangen-crm.org/current/P190_has_symbolic_content
"Per li tre gradi sù di buona voglia"^^xsd:string ;
http://erlangen-crm.org/current/P3_has_note
"106"^^xsd:int ;
http://hdn.dantenetwork.it/resource/has_number
"106"^^xsd:int .

In [ ]:

# Verse table built from the second element of mytesto.
data_IRI_versi_inf = [mytesto[1]]
df_IRI_versi_inf = pd.DataFrame(
    data=data_IRI_versi_inf[0],
    columns=['NumeroVerso', 'Verso', 'IRIVerso'],
)
df_IRI_versi_inf.count()

In [ ]:

# Preview the first rows, setting a 400px width property on the IRIVerso column.
df_IRI_versi_inf.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})

Creo una tabella con gli IRI delle FF¶

Algoritmo definito nella sezione 4 del documento

In [ ]:

# Inflected-form IRI table built from the third element of mytesto.
data_IRI_ff_inf = [mytesto[2]]
df_IRI_ff_inf = pd.DataFrame(
    data=data_IRI_ff_inf[0],
    columns=['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset', 'IRIFF'],
)
df_IRI_ff_inf.count()

In [ ]:

# Preview the last rows, setting a 400px width property on the IRIFF column.
df_IRI_ff_inf.tail().style.set_properties(subset=['IRIFF'], **{'width': '400px'})

Forse non tutti sanno che...¶

Nota: i risultati delle prossime elaborazioni considerano diverse tra loro due parole anche se differiscono solo per la presenza di maiuscole/minuscole

In [ ]:

# Keep only the columns needed for the word statistics.
df_inf_per_test = df_IRI_ff_inf[['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset']]

# Frequency of every form longer than 3 characters, most frequent first.
longer_than_three = df_inf_per_test['FormaFlessa'].str.len() > 3
df_num_ff = df_inf_per_test[longer_than_three]['FormaFlessa'].value_counts()

print("Le 10 parole (più lunghe di 3 caratteri) usate con maggiore frequenza nella prima Cantica sono:", end="\n\n")
print(f"{'Parola':<10}{'Frequenza'}")
df_num_ff.head(10)

In [ ]:

# One row per verse number, with the list of its forms in the 'parole' column.
forms_by_verse = df_inf_per_test.groupby('NumeroVerso')['FormaFlessa']
test_inf_versi = forms_by_verse.apply(list).reset_index(name='parole')

# Count every form across all verses.
parole_counter = Counter(itertools.chain.from_iterable(test_inf_versi['parole']))
print(f'\nCi sono {len(parole_counter)} parole diverse nella prima Cantica.\n')

In [ ]:

# Report the 10 most frequent forms regardless of their length.
print("\nLe 10 parole più frequenti nella prima Cantica, indipendentemente dalla lunghezza in caratteri, sono: \n")
print(f"{'Parola':<30}Frequenza\n")
for parola, frequenza in parole_counter.most_common(10):
    print(f'{parola:<30}{frequenza}')

Nel risultato della cella qui sotto si vede che alcune parole hanno il segno di punteggiatura, nella creazione degli IRI dovremmo toglierlo?

In [ ]:

# Take the last 30 entries of the frequency ranking (the least frequent forms).
least_common_parole = parole_counter.most_common()[-30:]
print("\nAlcune parole che compaiono una sola volta nella prima Cantica: \n")
print(f"{'Parola':<30}Frequenza\n")
for parola, frequenza in least_common_parole:
    print(f'{parola:<30}{frequenza}')

In [ ]:

#Frequenza delle parole palindrome
def is_palindrome(s):
    """Return True when *s* equals its own reversal (``s[::-1]``)."""
    mirrored = s[::-1]
    return s == mirrored

# Print every palindromic form of at least 2 characters with its frequency.
for parola, frequenza in parole_counter.most_common():
    if len(parola) <= 1 or not is_palindrome(parola):
        continue
    print(f'{parola:<30}{frequenza}')

In [ ]:

#test_versi_1=test_inf_versi['parole']
#for tve in test_versi_1:
#    if(is_palindrome(("".join(tve)))):
#       print ("".join(tve))
    #print ((" ".join(tve)[::-1]))

In [ ]:

# Ordered pairs of forms (each longer than 3 characters) sharing a verse.
cooccurrences = []
for parole in test_inf_versi['parole']:
    cooccurrences.extend(
        pair
        for pair in itertools.combinations(parole, 2)
        if len(pair[0]) > 3 and len(pair[1]) > 3
    )

# Count how often each pair occurs.
parole_co_counter = Counter(cooccurrences)

In [ ]:

# Report the 20 most frequent co-occurring pairs.
print("La frequenza delle co-occorrenze di due parole (non necessariamente consecutive e formate da almeno 4 caratteri) \nin uno stesso verso della prima Cantica", '\n')
print('{:<50}{}'.format('Co-ooccorrenza', 'Frequenza\n'))
for (prima, seconda), frequenza in parole_co_counter.most_common(20):
    parole = '[' + prima + ' , ' + seconda + ']'
    print(f'{parole:<50}{frequenza}')
print('\n')

Cominciamo a lavorare con RDF¶

In [ ]:

#g.parse("/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DaMa/Commedia.rdf", format="nt")

Elaborazione del file purgatorio.xml¶

Eseguo il parsing del testo presente nel file e creo una tabella simile alla precedente

In [ ]:

#TEST IGNORARE
#tei_purgatorio = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)
#bbs_pu=tei_purgatorio.IRIverso
#for repu in bbs_pu:
#    print (repu, end="\n"*2)
#    if (repu[0].startswith('che')):
#        print('...')
#        break

In [ ]:

# Parse the Purgatorio file with id 2; the result is the tuple returned by tei_to_csv_entry.
parsed_purgatorio=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)

In [ ]:

# Table of inflected forms built from the first element of parsed_purgatorio.
data_purgatorio = [parsed_purgatorio[0]]
testo_purgatorio_tabella = pd.DataFrame(
    data=data_purgatorio[0],
    columns=['FormaFlessa', 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso'],
)
testo_purgatorio_tabella.count()

In [ ]:

# Show the last rows of the Purgatorio inflected-form table.
testo_purgatorio_tabella.tail()

Creazione di una tabella con gli IRI dei versi per la cantica Purgatorio¶

La tabella contiene il numero del verso, il verso e l'IRI del verso.

In [ ]:

# Verse table built from the second element of parsed_purgatorio.
data_IRI_versi_pur = [parsed_purgatorio[1]]
df_IRI_versi_pur = pd.DataFrame(
    data=data_IRI_versi_pur[0],
    columns=['NumeroVerso', 'Verso', 'IRIVerso'],
)
df_IRI_versi_pur.count()

In [ ]:

# Preview the first rows, setting a 400px width property on the IRIVerso column.
df_IRI_versi_pur.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})

In [ ]:

# Inflected-form IRI table built from the third element of parsed_purgatorio.
data_IRI_ff_pur = [parsed_purgatorio[2]]
df_IRI_ff_pur = pd.DataFrame(
    data=data_IRI_ff_pur[0],
    columns=['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset', 'IRIFF'],
)
df_IRI_ff_pur.count()

In [ ]:

# Preview the last rows, setting a 400px width property on the IRIFF column.
df_IRI_ff_pur.tail().style.set_properties(subset=['IRIFF'], **{'width': '400px'})

Elaborazione del file paradiso.xml¶

Eseguo il parsing del testo presente nel file e creo una tabella simile alle precedenti

In [ ]:

# Parse the Paradiso file with id 3; the result is the tuple returned by tei_to_csv_entry.
parsed_paradiso=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/paradiso.xml', 3)

In [ ]:

# Table of inflected forms built from the first element of parsed_paradiso.
data_paradiso = [parsed_paradiso[0]]
testo_paradiso_tabella = pd.DataFrame(
    data=data_paradiso[0],
    columns=['FormaFlessa', 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso'],
)
testo_paradiso_tabella.count()

In [ ]:

# Show the first 21 rows of the Paradiso inflected-form table.
testo_paradiso_tabella.head(21)

Creazione di una tabella con gli IRI dei versi per la cantica Paradiso¶

La tabella contiene il numero del verso, il verso e l'IRI del verso.

In [ ]:

# Verse table built from the second element of parsed_paradiso.
data_IRI_versi_par = [parsed_paradiso[1]]
df_IRI_versi_par = pd.DataFrame(
    data=data_IRI_versi_par[0],
    columns=['NumeroVerso', 'Verso', 'IRIVerso'],
)
df_IRI_versi_par.count()

In [ ]:

# Preview the first rows, setting a 400px width property on the IRIVerso column.
df_IRI_versi_par.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})

36 KiB Raw Blame History

Test per Parsing e generazione IRI¶

Parser¶

Provo a vedere se il parser funziona¶

Elaboro il file inferno.xml¶

Creo una tabella con gli IRI dei versi per la cantica Inferno¶

Creo una tabella con gli IRI delle FF¶

Forse non tutti sanno che...¶

Cominciamo a lavorare con RDF¶

Elaborazione del file purgatorio.xml¶

Creazione di una tabella con gli IRI dei versi per la cantica Purgatorio¶

Elaborazione del file paradiso.xml¶

Creazione di una tabella con gli IRI dei versi per la cantica Paradiso¶

36 KiB

Raw Blame History