sshoc-skosmapping/Progetto_Lett.ipynb

16 KiB

In [1]:
import ast
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# importing useful Python utility libraries we'll need
from collections import Counter, defaultdict
import itertools
In [2]:
import xml.etree.ElementTree as ET
In [3]:
#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
In [4]:
#root = tree.getroot()
In [5]:
from bs4 import BeautifulSoup
In [6]:
def read_tei(tei_file):
    """Parse a TEI XML file and return its BeautifulSoup tree.

    Parameters
    ----------
    tei_file : str
        Path to the TEI XML file to parse.

    Returns
    -------
    BeautifulSoup
        Document tree built with the 'lxml' parser.
    """
    # BUGFIX: the original `raise RuntimeError(...)` after the `return` was
    # unreachable dead code and has been removed; the `with` block already
    # guarantees the file handle is closed. Encoding is pinned to UTF-8 so
    # parsing does not depend on the platform's default locale encoding.
    with open(tei_file, 'r', encoding='utf-8') as tei:
        return BeautifulSoup(tei, 'lxml')
In [7]:
def elem_to_text(elem, default=''):
    """Return the whitespace-normalized text of a bs4 element.

    Parameters
    ----------
    elem : bs4 element or None
        Element whose text is wanted (e.g. the result of `find(...)`).
    default : str
        Value returned when `elem` is None.

    Returns
    -------
    str
        `elem.getText(separator=' ', strip=True)` or `default`.
    """
    # BUGFIX: test against None explicitly. bs4 tags define truthiness by
    # their contents, so an *empty* tag is falsy and `if elem:` would wrongly
    # fall through to the default instead of returning its (empty) text.
    if elem is not None:
        return elem.getText(separator=' ', strip=True)
    return default
In [8]:
from dataclasses import dataclass

@dataclass
class Person:
    """Author record extracted from a TEI header <persName> element."""
    firstname: str   # text of <forename> (may be '' when absent)
    middlename: str  # text of <forename type="middle"> (may be '' when absent)
    surname: str     # text of <surname> (may be '' when absent)
In [59]:
class TEIFile(object):
    """Lazy wrapper around a TEI XML file.

    Exposes the parsed document as properties: `title`, `authors`,
    `bibliography`, `text` (plain body text) and several lemma listings
    (`orderedlemma`, `lemma`, `categ_lemma`, `lemma_lemma`) built from the
    `<lm>` elements of the DanteSearch grammatical markup.
    """

    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None       # cache for the `text` property
        self._title = ''        # cache for the `title` property
        self._abstract = ''     # kept for compatibility; not populated here

    @property
    def title(self):
        """Document title, or "na" when the TEI has no <title> element."""
        if not self._title:
            if not self.soup.title:
                self._title = "na"
            else:
                self._title = self.soup.title.getText().replace('\n', '').strip()
        return self._title

    @property
    def authors(self):
        """All authors in the document as Person records.

        <author> elements without a <persName> child are skipped.
        """
        result = []
        for author in self.soup.find_all('author'):
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def bibliography(self):
        """Whitespace-normalized text of every <bibl> element."""
        result = []
        for bibl in self.soup.find_all('bibl'):
            if not bibl:
                continue
            entry = elem_to_text(bibl).replace('\n', '').strip()
            entry = entry.replace(' .', '.')
            result.append(" ".join(entry.split()))
        return result

    @property
    def text(self):
        """Plain text of all untyped <div1> elements in the body (cached)."""
        if not self._text:
            divs_text = []
            for div in self.soup.body.find_all("div1"):
                # div is neither an appendix nor references, just plain text.
                if not div.get("type"):
                    divs_text.append(div.get_text(separator=' ', strip=True))
            self._text = " ".join(divs_text)
        return self._text

    @staticmethod
    def _lemma_attrs(lm):
        """Return (text, catg, lemma) for one <lm>, whitespace-normalized.

        BUGFIX: a missing 'catg' attribute used to raise AttributeError
        (None.split()); it now defaults to "non_spec", consistent with how
        a missing 'lemma' attribute was already handled.
        """
        lm_text = " ".join(elem_to_text(lm).strip().split())
        ctg = lm.get('catg')
        ctg = " ".join(ctg.split()) if ctg is not None else "non_spec"
        lemma = lm.get('lemma')
        lemma = " ".join(lemma.split()) if lemma is not None else "non_spec"
        return lm_text, ctg, lemma

    @staticmethod
    def _canto_of(lm):
        """Label of the enclosing <div1> (its first child node, e.g. "Canto 1").

        Returns '' when the element has no <div1> ancestor (the original code
        would have reused a stale value or raised NameError in that case).
        """
        for parent in lm.parents:
            if parent.name == 'div1':
                return parent.contents[0].replace('\n', '').strip()
        return ''

    @property
    def orderedlemma(self):
        """(text, catg, lemma, canto, verse_no, pos_in_verse) per <lm>.

        `verse_no` counts <l> elements cumulatively across all <div1>
        sections; `pos_in_verse` restarts at 1 for each verse.
        """
        ordr_lms = []
        verse_no = 0
        for div in self.soup.body.find_all("div1"):
            for verso in div.find_all('l'):
                verse_no += 1
                for pos, lm in enumerate(verso.find_all("lm"), start=1):
                    lm_text, ctg, lemma = self._lemma_attrs(lm)
                    ordr_lms.append(
                        (lm_text, ctg, lemma, self._canto_of(lm), verse_no, pos))
        return ordr_lms

    @property
    def lemma(self):
        """(text, catg, lemma, canto) tuples for every <lm> in the body."""
        lms_text = []
        for lm in self.soup.body.find_all("lm"):
            lm_text, ctg, lemma = self._lemma_attrs(lm)
            lms_text.append((lm_text, ctg, lemma, self._canto_of(lm)))
        return lms_text

    @property
    def categ_lemma(self):
        """Normalized 'catg' attribute of every <lm> ("non_spec" if absent)."""
        return [self._lemma_attrs(lm)[1]
                for lm in self.soup.body.find_all("lm")]

    @property
    def lemma_lemma(self):
        """Normalized 'lemma' attribute of every <lm> ("non_spec" if absent)."""
        return [self._lemma_attrs(lm)[2]
                for lm in self.soup.body.find_all("lm")]
In [60]:
def tei_to_csv_entry(tei_file):
    """Parse one TEI file and return its lemma listings.

    Parameters
    ----------
    tei_file : str
        Path to the TEI XML file.

    Returns
    -------
    tuple
        (orderedlemma, categ_lemma, lemma_lemma) as produced by TEIFile.
        Prints one progress line per handled file as a side effect.
    """
    tei = TEIFile(tei_file)
    print(f"Handled {tei_file}")
    # NOTE: removed the unused `base_name = tei_file` local from the original.
    return tei.orderedlemma, tei.categ_lemma, tei.lemma_lemma

Provo a vedere se il parser funziona

Dovrebbe arrivare sino al termine 'oscura'

In [61]:
tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
bbs = tei.orderedlemma
# Print each (lemma, catg, lemma_it, canto, verse, pos) tuple, stopping at
# the first lemma that begins with 'oscura'.
for entry in bbs:
    print(entry, end="\n" * 2)
    if entry[0].startswith('oscura'):
        print('...')
        break
('Nel', 'rdms', 'il', 'Canto 1', 1, 1)

('mezzo', 'eilaksl', 'in mezzo di', 'Canto 1', 1, 2)

('del', 'rdms', 'il', 'Canto 1', 1, 3)

('cammin', 'sm2ms', 'cammino', 'Canto 1', 1, 4)

('di', 'epskg', 'di', 'Canto 1', 1, 5)

('nostra', 'as1fs', 'nostro', 'Canto 1', 1, 6)

('vita', 'sf1fs', 'vita', 'Canto 1', 1, 7)

('mi', 'pf1sypr', 'mi', 'Canto 1', 2, 1)

('ritrovai', 'vta+1irs1', 'ritrovare', 'Canto 1', 2, 2)

('per', 'epskpl', 'per', 'Canto 1', 2, 3)

('una', 'rifs', 'una', 'Canto 1', 2, 4)

('selva', 'sf1fs', 'selva', 'Canto 1', 2, 5)

('oscura', 'a1fs', 'oscuro', 'Canto 1', 2, 6)

...

Carico il testo e creo una tabella

faccio il parsing del testo e creo una tabella con 6 colonne: *Lemma, Categoria, LemmaItaliano, Canto, Verso, PosizioneLemmaNelVerso*

In [63]:
# Parse the Inferno TEI file: mytesto is the 3-tuple
# (orderedlemma, categ_lemma, lemma_lemma) returned by tei_to_csv_entry.
mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml
In [66]:
# Build a table with one row per lemma occurrence from the orderedlemma
# listing (mytesto[0]).
# NOTE: removed the unused intermediate `dfObj` DataFrame that the original
# built and immediately discarded.
data = [mytesto[0]]
testo_tabella = pd.DataFrame(
    data[0],
    columns=['Lemma', 'Categoria', 'LemmaItaliano', 'Canto', 'Verso',
             'PosizioneLemmaNelVerso'],
)
# Non-null count per column — displayed as the cell output.
testo_tabella.count()
Out[66]:
Lemma                     34280
Categoria                 34280
LemmaItaliano             34280
Canto                     34280
Verso                     34280
PosizioneLemmaNelVerso    34280
dtype: int64
In [67]:
# Preview the first 10 rows of the lemma table.
testo_tabella.head(10)
Out[67]:
Lemma Categoria LemmaItaliano Canto Verso PosizioneLemmaNelVerso
0 Nel rdms il Canto 1 1 1
1 mezzo eilaksl in mezzo di Canto 1 1 2
2 del rdms il Canto 1 1 3
3 cammin sm2ms cammino Canto 1 1 4
4 di epskg di Canto 1 1 5
5 nostra as1fs nostro Canto 1 1 6
6 vita sf1fs vita Canto 1 1 7
7 mi pf1sypr mi Canto 1 2 1
8 ritrovai vta+1irs1 ritrovare Canto 1 2 2
9 per epskpl per Canto 1 2 3