sshoc-skosmapping/Progetto_Lett.ipynb

16 KiB

In [1]:
import ast
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# importing useful Python utility libraries we'll need
from collections import Counter, defaultdict
import itertools
In [2]:
import xml.etree.ElementTree as ET
In [3]:
#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
In [4]:
#root = tree.getroot()
In [5]:
from bs4 import BeautifulSoup
In [6]:
def read_tei(tei_file):
    """Parse a TEI XML file and return its BeautifulSoup tree.

    Parameters
    ----------
    tei_file : str
        Path to the TEI XML file to parse.

    Returns
    -------
    BeautifulSoup
        Document tree built with the 'lxml' parser.
    """
    # BUGFIX: the original `raise RuntimeError(...)` after the `return` was
    # unreachable dead code and has been removed; the `with` block already
    # guarantees the file handle is closed. Encoding is pinned to UTF-8 so
    # parsing does not depend on the platform's default locale encoding.
    with open(tei_file, 'r', encoding='utf-8') as tei:
        return BeautifulSoup(tei, 'lxml')
In [7]:
def elem_to_text(elem, default=''):
    """Return the whitespace-normalized text of a bs4 element.

    Parameters
    ----------
    elem : bs4 element or None
        Element whose text is wanted (e.g. the result of `find(...)`).
    default : str
        Value returned when `elem` is None.

    Returns
    -------
    str
        `elem.getText(separator=' ', strip=True)` or `default`.
    """
    # BUGFIX: test against None explicitly. bs4 tags define truthiness by
    # their contents, so an *empty* tag is falsy and `if elem:` would wrongly
    # fall through to the default instead of returning its (empty) text.
    if elem is not None:
        return elem.getText(separator=' ', strip=True)
    return default
In [8]:
from dataclasses import dataclass

@dataclass
class Person:
    """Author record extracted from a TEI header <persName> element."""
    firstname: str   # text of <forename> (may be '' when absent)
    middlename: str  # text of <forename type="middle"> (may be '' when absent)
    surname: str     # text of <surname> (may be '' when absent)
In [59]:
class TEIFile(object):
    """Lazy wrapper around a TEI XML file.

    Exposes the parsed document as properties: `title`, `authors`,
    `bibliography`, `text` (plain body text) and several lemma listings
    (`orderedlemma`, `lemma`, `categ_lemma`, `lemma_lemma`) built from the
    `<lm>` elements of the DanteSearch grammatical markup.
    """

    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None       # cache for the `text` property
        self._title = ''        # cache for the `title` property
        self._abstract = ''     # kept for compatibility; not populated here

    @property
    def title(self):
        """Document title, or "na" when the TEI has no <title> element."""
        if not self._title:
            if not self.soup.title:
                self._title = "na"
            else:
                self._title = self.soup.title.getText().replace('\n', '').strip()
        return self._title

    @property
    def authors(self):
        """All authors in the document as Person records.

        <author> elements without a <persName> child are skipped.
        """
        result = []
        for author in self.soup.find_all('author'):
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def bibliography(self):
        """Whitespace-normalized text of every <bibl> element."""
        result = []
        for bibl in self.soup.find_all('bibl'):
            if not bibl:
                continue
            entry = elem_to_text(bibl).replace('\n', '').strip()
            entry = entry.replace(' .', '.')
            result.append(" ".join(entry.split()))
        return result

    @property
    def text(self):
        """Plain text of all untyped <div1> elements in the body (cached)."""
        if not self._text:
            divs_text = []
            for div in self.soup.body.find_all("div1"):
                # div is neither an appendix nor references, just plain text.
                if not div.get("type"):
                    divs_text.append(div.get_text(separator=' ', strip=True))
            self._text = " ".join(divs_text)
        return self._text

    @staticmethod
    def _lemma_attrs(lm):
        """Return (text, catg, lemma) for one <lm>, whitespace-normalized.

        BUGFIX: a missing 'catg' attribute used to raise AttributeError
        (None.split()); it now defaults to "non_spec", consistent with how
        a missing 'lemma' attribute was already handled.
        """
        lm_text = " ".join(elem_to_text(lm).strip().split())
        ctg = lm.get('catg')
        ctg = " ".join(ctg.split()) if ctg is not None else "non_spec"
        lemma = lm.get('lemma')
        lemma = " ".join(lemma.split()) if lemma is not None else "non_spec"
        return lm_text, ctg, lemma

    @staticmethod
    def _canto_of(lm):
        """Label of the enclosing <div1> (its first child node, e.g. "Canto 1").

        Returns '' when the element has no <div1> ancestor (the original code
        would have reused a stale value or raised NameError in that case).
        """
        for parent in lm.parents:
            if parent.name == 'div1':
                return parent.contents[0].replace('\n', '').strip()
        return ''

    @property
    def orderedlemma(self):
        """(text, catg, lemma, canto, verse_no, pos_in_verse) per <lm>.

        `verse_no` counts <l> elements cumulatively across all <div1>
        sections; `pos_in_verse` restarts at 1 for each verse.
        """
        ordr_lms = []
        verse_no = 0
        for div in self.soup.body.find_all("div1"):
            for verso in div.find_all('l'):
                verse_no += 1
                for pos, lm in enumerate(verso.find_all("lm"), start=1):
                    lm_text, ctg, lemma = self._lemma_attrs(lm)
                    ordr_lms.append(
                        (lm_text, ctg, lemma, self._canto_of(lm), verse_no, pos))
        return ordr_lms

    @property
    def lemma(self):
        """(text, catg, lemma, canto) tuples for every <lm> in the body."""
        lms_text = []
        for lm in self.soup.body.find_all("lm"):
            lm_text, ctg, lemma = self._lemma_attrs(lm)
            lms_text.append((lm_text, ctg, lemma, self._canto_of(lm)))
        return lms_text

    @property
    def categ_lemma(self):
        """Normalized 'catg' attribute of every <lm> ("non_spec" if absent)."""
        return [self._lemma_attrs(lm)[1]
                for lm in self.soup.body.find_all("lm")]

    @property
    def lemma_lemma(self):
        """Normalized 'lemma' attribute of every <lm> ("non_spec" if absent)."""
        return [self._lemma_attrs(lm)[2]
                for lm in self.soup.body.find_all("lm")]
In [60]:
def tei_to_csv_entry(tei_file):
    """Parse one TEI file and return its lemma listings.

    Parameters
    ----------
    tei_file : str
        Path to the TEI XML file.

    Returns
    -------
    tuple
        (orderedlemma, categ_lemma, lemma_lemma) as produced by TEIFile.
        Prints one progress line per handled file as a side effect.
    """
    tei = TEIFile(tei_file)
    print(f"Handled {tei_file}")
    # NOTE: removed the unused `base_name = tei_file` local from the original.
    return tei.orderedlemma, tei.categ_lemma, tei.lemma_lemma

Provo a vedere se il parser funziona

Dovrebbe arrivare sino al termine 'oscura'

In [61]:
tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
bbs = tei.orderedlemma
# Print each (lemma, catg, lemma_it, canto, verse, pos) tuple, stopping at
# the first lemma that begins with 'oscura'.
for entry in bbs:
    print(entry, end="\n" * 2)
    if entry[0].startswith('oscura'):
        print('...')
        break
('Nel', 'rdms', 'il', 'Canto 1', 1, 1)

('mezzo', 'eilaksl', 'in mezzo di', 'Canto 1', 1, 2)

('del', 'rdms', 'il', 'Canto 1', 1, 3)

('cammin', 'sm2ms', 'cammino', 'Canto 1', 1, 4)

('di', 'epskg', 'di', 'Canto 1', 1, 5)

('nostra', 'as1fs', 'nostro', 'Canto 1', 1, 6)

('vita', 'sf1fs', 'vita', 'Canto 1', 1, 7)

('mi', 'pf1sypr', 'mi', 'Canto 1', 2, 1)

('ritrovai', 'vta+1irs1', 'ritrovare', 'Canto 1', 2, 2)

('per', 'epskpl', 'per', 'Canto 1', 2, 3)

('una', 'rifs', 'una', 'Canto 1', 2, 4)

('selva', 'sf1fs', 'selva', 'Canto 1', 2, 5)

('oscura', 'a1fs', 'oscuro', 'Canto 1', 2, 6)

...

Carico il testo e creo una tabella

faccio il parsing del testo e creo una tabella con 6 colonne: *Lemma, Categoria, LemmaItaliano, Canto, Verso, PosizioneLemmaNelVerso*

In [63]:
# Parse the Inferno TEI file: mytesto is the 3-tuple
# (orderedlemma, categ_lemma, lemma_lemma) returned by tei_to_csv_entry.
mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')
Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml
In [66]:
# Build a table with one row per lemma occurrence from the orderedlemma
# listing (mytesto[0]).
# NOTE: removed the unused intermediate `dfObj` DataFrame that the original
# built and immediately discarded.
data = [mytesto[0]]
testo_tabella = pd.DataFrame(
    data[0],
    columns=['Lemma', 'Categoria', 'LemmaItaliano', 'Canto', 'Verso',
             'PosizioneLemmaNelVerso'],
)
# Non-null count per column — displayed as the cell output.
testo_tabella.count()
Out[66]:
Lemma                     34280
Categoria                 34280
LemmaItaliano             34280
Canto                     34280
Verso                     34280
PosizioneLemmaNelVerso    34280
dtype: int64
In [67]:
# Preview the first 10 rows of the lemma table.
testo_tabella.head(10)
Out[67]:
Lemma Categoria LemmaItaliano Canto Verso PosizioneLemmaNelVerso
0 Nel rdms il Canto 1 1 1
1 mezzo eilaksl in mezzo di Canto 1 1 2
2 del rdms il Canto 1 1 3
3 cammin sm2ms cammino Canto 1 1 4
4 di epskg di Canto 1 1 5
5 nostra as1fs nostro Canto 1 1 6
6 vita sf1fs vita Canto 1 1 7
7 mi pf1sypr mi Canto 1 2 1
8 ritrovai vta+1irs1 ritrovare Canto 1 2 2
9 per epskpl per Canto 1 2 3