sshoc-skosmapping/Progetto_Lett.ipynb

1003 lines
36 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Test per Parsing e generazione IRI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"import rdflib\n",
"import matplotlib.pyplot as plt\n",
"# importing useful Python utility libraries we'll need\n",
"from collections import Counter, defaultdict\n",
"import itertools"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \\\n",
"# PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \\\n",
"# VOID, XMLNS, XSD\n",
"from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \\\n",
" RDF, RDFS, SKOS, \\\n",
" XMLNS, XSD\n",
"from rdflib import Namespace\n",
"from rdflib import URIRef, BNode, Literal\n",
"n = Namespace(\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import xml.etree.ElementTree as ET"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#root = tree.getroot()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def read_tei(tei_file):\n",
" with open(tei_file, 'r') as tei:\n",
" soup = BeautifulSoup(tei, 'lxml')\n",
" return soup\n",
" raise RuntimeError('Cannot generate a soup from the input')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def elem_to_text(elem, default=''):\n",
" if elem:\n",
" return elem.getText(separator=' ', strip=True)\n",
" else:\n",
" return default"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"\n",
"@dataclass\n",
"class Person:\n",
" firstname: str\n",
" middlename: str\n",
" surname: str"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parser\n",
"\n",
"Provo a creare un parser.\n",
"\n",
"Un estratto dal file inferno.xml:\n",
"\n",
"~~~~\n",
"<div1> <head>Canto 1</head>\n",
"<lg type=\"canto\">\n",
" <l>\n",
" <LM lemma=\"il\" catg=\"rdms\">Nel</LM>\n",
" <LM lemma=\"in mezzo di\" catg=\"eilaksl\">mezzo</LM>\n",
" <LM lemma=\"il\" catg=\"rdms\">del</LM>\n",
" <LM lemma=\"cammino\" catg=\"sm2ms\">cammin</LM>\n",
" <LM lemma=\"di\" catg=\"epskg\">di</LM>\n",
" <LM lemma=\"nostro\" catg=\"as1fs\">nostra</LM>\n",
" <LM lemma=\"vita\" catg=\"sf1fs\">vita</LM>\n",
" </l>\n",
" ...\n",
" ...\n",
" <l>\n",
" <LM lemma=\"che\" catg=\"pr\">che</LM>\n",
" <LM1>\n",
" \t <LM lemma=\"il\" catg=\"rdms\">nel</LM> \n",
" \t <LM lemma=\"in\" catg=\"epaksl\">nel</LM>\n",
" </LM1>\n",
" <LM lemma=\"pensiero\" catg=\"sm2ms\">pensier</LM>\n",
" <LM lemma=\"rinnovare\" catg=\"vta1ips3\">rinova</LM>\n",
" <LM lemma=\"la\" catg=\"rdfs\">la</LM>\n",
" <LM lemma=\"paura\" catg=\"sf1fs\">paura</LM>!\n",
" </l>\n",
" <l>\n",
" ...\n",
"~~~~\n",
"\n",
" \n",
"Il tag \\<div1\\> individua la porzione di file di un *Canto*, il tag \\<l\\> individua un verso, il tag \\<LM\\> individua una *forma flessa*, ciascuna forma flessa ha 1 o 2 attributi.\n",
"All'interno di un verso può essere presente il tag \\<LM1\\> che ha come content più elementi \\<LM\\>, ciascuno di essi contiene la stessa forma flessa ma differenti valori per gli attributi 'catg' e 'lemma'.\n",
"\n",
"per questa implementazione uso la libreria Python [Beatiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class TEIFile(object):\n",
" def __init__(self, filename, idres=0):\n",
" self.g = rdflib.Graph()\n",
" self.filename = filename\n",
" self.soup = read_tei(filename)\n",
" self._text = None\n",
" self.idres=idres;\n",
" self.InFor = URIRef(\"http://example.org/word/InflectedForm\")\n",
" # self._lemmas = None\n",
" # self._lemma_lemmas = None\n",
" # self._categ_lemmas = None\n",
" self._title = ''\n",
" self._abstract = ''\n",
"\n",
" \n",
" @property\n",
" def title(self):\n",
" if not self._title:\n",
" if not self.soup.title:\n",
" self._title = \"na\"\n",
" else:\n",
" self._title = self.soup.title.getText().replace('\\n','').strip()\n",
" return self._title\n",
"\n",
" \n",
" @property\n",
" def authors(self):\n",
" #authors_in_header = self.soup.analytic.find_all('author')\n",
" authors_in_header = self.soup.find_all('author')\n",
"\n",
" result = []\n",
" for author in authors_in_header:\n",
" persname = author.persname\n",
" if not persname:\n",
" continue\n",
" firstname = elem_to_text(persname.find(\"forename\"))#, type=\"first\"))\n",
" middlename = elem_to_text(persname.find(\"forename\", type=\"middle\"))\n",
" surname = elem_to_text(persname.surname)\n",
" person = Person(firstname, middlename, surname)\n",
" result.append(person)\n",
" return result\n",
" \n",
" @property\n",
" def bibliography(self):\n",
" bibliography = self.soup.find_all('bibl')\n",
" result = []\n",
" for bibl in bibliography:\n",
" if not bibl:\n",
" continue\n",
" #if (elem_to_text(bibl).startswith(\"Enter your references here\")):\n",
" # continue\n",
" my_bibl_tmp=elem_to_text(bibl).replace('\\n','').strip()\n",
" my_bibl_tmp=my_bibl_tmp.replace(' .', '.')\n",
" result.append(\" \".join(my_bibl_tmp.split()))\n",
" return result\n",
"\n",
"\n",
" @property\n",
" def text(self):\n",
" if not self._text:\n",
" divs_text = []\n",
" for div in self.soup.body.find_all(\"div1\"):\n",
" # div is neither an appendix nor references, just plain text.\n",
" if not div.get(\"type\"):\n",
" div_text = div.get_text(separator=' ', strip=True)\n",
" divs_text.append(div_text)\n",
"\n",
" plain_text = \" \".join(divs_text)\n",
" self._text = plain_text\n",
" return self._text\n",
" \n",
" @property\n",
" def orderedlemma(self):\n",
" ordr_lms = []\n",
" i=0\n",
" for div in self.soup.body.find_all(\"div1\"):\n",
" for verso in div.find_all('l'):\n",
" i=i+1;\n",
" j=0;\n",
" for lm in verso.find_all(\"lm\"):\n",
" lstctg=[];\n",
" lstlms=[];\n",
" j=j+1;\n",
" lm_text=elem_to_text(lm).strip();\n",
" #ctg=lm.get('catg');\n",
" if (lm.get('catg')!=None):\n",
" ctg=lm.get('catg');\n",
" else:\n",
" ctg=\"non_spec\";\n",
" \n",
" lstctg.append(\" \".join(ctg.split())); \n",
" \n",
" if (lm.get('lemma')!=None):\n",
" lemma=lm.get('lemma');\n",
" else:\n",
" lemma=\"non_spec\";\n",
" lstlms.append(\" \".join(lemma.split())); \n",
" for parent in lm.parents:\n",
" if (parent.name=='div1'):\n",
" canto = parent.contents[0];\n",
" if (parent.name=='lm1' and ordr_lms[-1][0]==\" \".join(lm_text.split())):\n",
" j=j-1;\n",
" lstctg=lstctg+ordr_lms[-1][1];\n",
" lstlms=lstlms+ordr_lms[-1][2];\n",
" ordr_lms.pop();\n",
" \n",
" ordr_lms.append((\" \".join(lm_text.split()), lstctg, lstlms, canto.replace('\\n','').strip(), i, j));\n",
" \n",
" \n",
" # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n",
" # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n",
" \n",
" \n",
" return ordr_lms\n",
" \n",
" ##IRI forma flessa\n",
" @property\n",
" def IRIff(self):\n",
" iriffs = []\n",
" i=0\n",
" for div in self.soup.body.find_all(\"div1\"):\n",
" for verso in div.find_all('l'):\n",
" i=i+1;\n",
" j=0;\n",
" for lm in verso.find_all(\"lm\"):\n",
" lstctg=[];\n",
" lstlms=[];\n",
" lstiri=[];\n",
" j=j+1;\n",
" lm_text=elem_to_text(lm).strip();\n",
" #ctg=lm.get('catg');\n",
" if (lm.get('catg')!=None):\n",
" ctg=lm.get('catg');\n",
" else:\n",
" ctg=\"non_spec\";\n",
" \n",
" lstctg.append(\" \".join(ctg.split())); \n",
" \n",
" if (lm.get('lemma')!=None):\n",
" lemma=lm.get('lemma');\n",
" else:\n",
" lemma=\"non_spec\";\n",
" lstlms.append(\" \".join(lemma.split())); \n",
" for parent in lm.parents:\n",
" if (parent.name=='div1'):\n",
" canto = parent.contents[0];\n",
" if (parent.name=='lm1' and iriffs[-1][0]==\" \".join(lm_text.split())):\n",
" j=j-1;\n",
" #lstctg=lstctg+iriffs[-1][1];\n",
" #lstlms=lstlms+iriffs[-1][2];\n",
" iriffs.pop();\n",
" IRIff_text= \"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j);\n",
" IRIff_text_type= IRIff_text +' rdf:type InflectedForm . '\n",
" IRIff_text_pos= IRIff_text +' isInPosition '+str(j);\n",
" IRIff_text_exp= IRIff_text +' hasExpression \"'+(\" \".join(lm_text.split()))+'\"^^xsd:string .' ;\n",
" IRIff_text_oo= IRIff_text +' isOccurrenceOf ulem . ';\n",
" IRIff_text_co= IRIff_text +\" http://erlangen-crm.org/current/P148_is_component_of http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n",
" lstiri.append(IRIff_text);\n",
" lstiri.append(IRIff_text_type);\n",
" lstiri.append(IRIff_text_co);\n",
" lstiri.append(IRIff_text_pos);\n",
" lstiri.append(IRIff_text_exp);\n",
" lstiri.append(IRIff_text_oo);\n",
" iriffs.append((\" \".join(lm_text.split()), canto.replace('\\n','').strip(), i, j, lstiri));\n",
" \n",
" \n",
" # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n",
" # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n",
" \n",
" \n",
" return iriffs\n",
" \n",
" #IRI forma flessa RDF\n",
" @property\n",
" def IRIffRDF(self):\n",
" iriffs = []\n",
" i=0\n",
" for div in self.soup.body.find_all(\"div1\"):\n",
" for verso in div.find_all('l'):\n",
" i=i+1;\n",
" j=0;\n",
" for lm in verso.find_all(\"lm\"):\n",
" lstctg=[];\n",
" lstlms=[];\n",
" lstiri=[];\n",
" j=j+1;\n",
" lm_text=elem_to_text(lm).strip();\n",
" #ctg=lm.get('catg');\n",
" if (lm.get('catg')!=None):\n",
" ctg=lm.get('catg');\n",
" else:\n",
" ctg=\"non_spec\";\n",
" \n",
" lstctg.append(\" \".join(ctg.split())); \n",
" \n",
" if (lm.get('lemma')!=None):\n",
" lemma=lm.get('lemma');\n",
" else:\n",
" lemma=\"non_spec\";\n",
" lstlms.append(\" \".join(lemma.split())); \n",
" for parent in lm.parents:\n",
" if (parent.name=='div1'):\n",
" canto = parent.contents[0];\n",
" if (parent.name=='lm1' and iriffs[-1][0]==\" \".join(lm_text.split())):\n",
" j=j-1;\n",
" #lstctg=lstctg+iriffs[-1][1];\n",
" #lstlms=lstlms+iriffs[-1][2];\n",
" iriffs.pop();\n",
" #g.add((bob, RDF.type, FOAF.Person))\n",
" #bob = URIRef(\"http://example.org/people/Bob\")\n",
" IRIff_text= URIRef(\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j));\n",
" self.g.remove((IRIff_text, None, None))\n",
" self.g.add((IRIff_text, RDF.type, self.InFor))\n",
" \n",
" IRIff_text= \"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j);\n",
" IRIff_text_type= IRIff_text +' rdf:type InflectedForm . '\n",
" IRIff_text_pos= IRIff_text +' isInPosition '+str(j);\n",
" IRIff_text_exp= IRIff_text +' hasExpression \"'+(\" \".join(lm_text.split()))+'\"^^xsd:string .' ;\n",
" IRIff_text_oo= IRIff_text +' isOccurrenceOf ulem . ';\n",
" IRIff_text_co= IRIff_text +\" http://erlangen-crm.org/current/P148_is_component_of http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n",
" lstiri.append(IRIff_text);\n",
" lstiri.append(IRIff_text_type);\n",
" lstiri.append(IRIff_text_co);\n",
" lstiri.append(IRIff_text_pos);\n",
" lstiri.append(IRIff_text_exp);\n",
" lstiri.append(IRIff_text_oo);\n",
" iriffs.append((\" \".join(lm_text.split()), canto.replace('\\n','').strip(), i, j, lstiri));\n",
" \n",
" \n",
" # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n",
" # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n",
" \n",
" \n",
" return self.g\n",
" \n",
" #IRI del verso\n",
" @property\n",
" def IRIverso(self):\n",
" iris = []\n",
" i=0\n",
" for div in self.soup.body.find_all(\"div1\"):\n",
" islm1=False;\n",
" for verso in div.find_all('l'):\n",
" i=i+1;\n",
" lm1_text=[];\n",
" verso_text=elem_to_text(verso).strip();\n",
" for child in verso.children: #Manage <LM1> elements\n",
" if (child.name=='lm1'):\n",
" islm1=True;\n",
" lm1_text.append(elem_to_text(child).strip());\n",
" # print (lm1_text);\n",
" \n",
" if(islm1):\n",
" islm1=False;\n",
" for lm1str in lm1_text:\n",
" replace_str=lm1str.partition(' ')[0];\n",
" verso_text=verso_text.replace(lm1str, replace_str);\n",
" \n",
" for vparent in verso.parents:\n",
" if (vparent.name=='div1'):\n",
" canto = vparent.contents[0];\n",
" #\" \".join(verso_text.split())).strip()\n",
" verso_text=verso_text.replace(\" ,\", \",\");\n",
" verso_text=verso_text.replace(\" .\", \".\");\n",
" verso_text=verso_text.replace(\" !\", \"!\");\n",
" verso_text=verso_text.replace(\" ?\", \"?\");\n",
" verso_text=verso_text.replace(\"l' \", \"l'\");\n",
" iri_verso=\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n",
" iri_verso=iri_verso+'\\n a efrbroo:F2_Expression ,\\n rdfs:Resource ; \\nhttp://erlangen-crm.org/current/P190_has_symbolic_content \"';\n",
" iri_verso=iri_verso+verso_text.strip()+ '\"^^xsd:string ;\\n http://erlangen-crm.org/current/P3_has_note';\n",
" iri_verso=iri_verso+' \"'+str(i)+'\"^^xsd:int ;\\n http://hdn.dantenetwork.it/resource/has_number \"'+str(i)+'\"^^xsd:int .'\n",
" \n",
" iris.append((i, verso_text.strip(), iri_verso));\n",
" \n",
" \n",
" return iris\n",
" #IRI del verso\n",
" \n",
" \n",
" #test\n",
" @property\n",
" def ff_ea(self):\n",
" lms_text = []\n",
" lms_tupl=()\n",
" for lm in self.soup.body.find_all(\"lm\"):\n",
" lm_text=elem_to_text(lm).strip()\n",
" ctg=lm.get('catg');\n",
" if (lm.get('lemma')!=None):\n",
" lemma=lm.get('lemma');\n",
" else:\n",
" lemma=\"non_spec\";\n",
" #lm_text=lm_text+\", \"+ctg+\", \"+lemma;\n",
" for parent in lm.parents:\n",
" if (parent.name=='div1'):\n",
" canto = parent.contents[0]\n",
" break;\n",
" lms_text.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip())); \n",
" return lms_text\n",
" \n",
" @property\n",
" def categ_lemma(self):\n",
" ctgs_text = []\n",
" for lm in self.soup.body.find_all(\"lm\"):\n",
" ctg_text=lm.get('catg').strip();\n",
" ctgs_text.append(\" \".join(ctg_text.split()))\n",
" return ctgs_text\n",
" \n",
" @property\n",
" def lemma_lemma(self):\n",
" lemmas_text = []\n",
" for lm in self.soup.body.find_all(\"lm\"):\n",
" if (lm.get('lemma')):\n",
" lemma_text=lm.get('lemma').strip();\n",
" else:\n",
" lemma_text='non_spec';\n",
" lemmas_text.append(\" \".join(lemma_text.split()))\n",
" return lemmas_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tei_to_csv_entry(tei_file, idres=0):\n",
" tei = TEIFile(tei_file, idres)\n",
" print(f\"Handled {tei_file}\")\n",
" base_name = tei_file\n",
" return tei.orderedlemma, tei.IRIverso, tei.IRIff, tei.IRIffRDF, tei.categ_lemma, tei.lemma_lemma #, tei.abstract"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Provo a vedere se il parser funziona\n",
"Dovrebbe arrivare sino al termine 'oscuro', controllare!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)\n",
"bbs=tei.ff_ea\n",
"for re in bbs:\n",
" print (re, end=\"\\n\"*2)\n",
" if (re[0].startswith('oscura')):\n",
" print('...')\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"g1=tei.IRIffRDF\n",
" \n",
"print(len(g1)) # prints 2\n",
"\n",
"import pprint\n",
"for stmt in g1:\n",
" pprint.pprint(stmt)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Elaboro il file *inferno.xml*\n",
"Eseguo il parsing del testo presente nel file e creo una tabella con le seguenti colonne: *forma flessa, categoria, lemma, canto, verso, pposizione forma flessa nel verso*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = [mytesto[0]]\n",
"#data[0]\n",
"dfObj = pd.DataFrame(data[0]) \n",
"testo_tabella=pd.DataFrame(data[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n",
"testo_tabella.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"testo_tabella.tail(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Creo una tabella con gli IRI dei versi per la cantica *Inferno*\n",
"\n",
"La abella contiene il numero del verso, il verso e l'IRI del verso. \n",
"Per l'IRI del verso mi son basato su quanto riportato nel file *Commedia.rdf*, un esempio è il seguente: \n",
"\n",
"> <http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/9/verso/106> \n",
"> a efrbroo:F2_Expression , rdfs:Resource ; \n",
"> <http://erlangen-crm.org/current/P190_has_symbolic_content> \n",
"> \"Per li tre gradi sù di buona voglia\"^^xsd:string ; \n",
"> <http://erlangen-crm.org/current/P3_has_note> \n",
"> \"106\"^^xsd:int ; \n",
"> <http://hdn.dantenetwork.it/resource/has_number> \n",
"> \"106\"^^xsd:int . \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_IRI_versi_inf = [mytesto[1]]\n",
"#data_IRI_versi\n",
"df_IRI_versi_inf=pd.DataFrame(data_IRI_versi_inf[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n",
"df_IRI_versi_inf.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_IRI_versi_inf.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Creo una tabella con gli IRI delle FF\n",
"\n",
"Algoritmo definito nella sezione 4 del documento"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_IRI_ff_inf = [mytesto[2]]\n",
"#data_IRI_versi\n",
"df_IRI_ff_inf=pd.DataFrame(data_IRI_ff_inf[0], columns = ['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset' , 'IRIFF']) \n",
"df_IRI_ff_inf.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_IRI_ff_inf.tail().style.set_properties(subset=['IRIFF'], **{'width': '400px'})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Forse non tutti sanno che... \n",
"\n",
"\n",
"*Nota: i risultati delle prossime elaborazioni considerano diverse tra loro due parole parole anche se differiscono per la presenza di maiuscole/minuscole*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_inf_per_test=df_IRI_ff_inf[['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset']]\n",
"df_num_ff=df_inf_per_test[df_inf_per_test['FormaFlessa'].str.len()>3]['FormaFlessa'].value_counts()\n",
"print(\"Le 10 parole (più lunghe di 3 caratteri) usate con maggiore frequenza nella prima Cantica sono:\", end=\"\\n\"*2)\n",
"print('{:<10}{}'.format('Parola', 'Frequenza'))\n",
"df_num_ff.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"test_inf_versi=df_inf_per_test.groupby('NumeroVerso')['FormaFlessa'].apply(list).reset_index(name='parole')\n",
"#test_inf_versi.head()\n",
"parole_counter = Counter(itertools.chain(*test_inf_versi['parole']))\n",
"print('\\nCi sono {} parole diverse nella prima Cantica.\\n'.format(len(parole_counter)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"\\nLe 10 parole più frequenti nella prima Cantica, indipendentemente dalla lunghezza in caratteri, sono: \\n\")\n",
"print('{:<30}Frequenza\\n'.format(\"Parola\"))\n",
"for k, v in parole_counter.most_common(10):\n",
" print(f'{k:<30}{v}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nel risultato della cella qui sotto si vede che alcune parole hanno il segno di punteggiatura, nella creazione degli IRI dovremmo toglierlo?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"least_common_parole = parole_counter.most_common()[-30:]\n",
"print(\"\\nAlcune parole che compaiono una sola volta nella prima Cantica: \\n\")\n",
"print('{:<30}Frequenza\\n'.format(\"Parola\"))\n",
"for lk, lv in least_common_parole:\n",
" print(f'{lk:<30}{lv}')\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Frequenza delle parole palindrome\n",
"def is_palindrome(s):\n",
" return s==s[::-1]\n",
"\n",
"for k, v in parole_counter.most_common():\n",
" if(len(k)>1 and is_palindrome(k)):\n",
" print(f'{k:<30}{v}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#test_versi_1=test_inf_versi['parole']\n",
"#for tve in test_versi_1:\n",
"# if(is_palindrome((\"\".join(tve)))):\n",
"# print (\"\".join(tve))\n",
" #print ((\" \".join(tve)[::-1]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cooccurrences = []\n",
"\n",
"for parole in test_inf_versi['parole']:\n",
" parole_pairs = itertools.combinations(parole, 2)\n",
" for pair in parole_pairs:\n",
" if(len(pair[0])>3 and len(pair[1])>3):\n",
" cooccurrences.append(tuple((pair)))\n",
" # cooccurrences.append(tuple(sorted(pair)))\n",
"\n",
"# Conto la frequenza di ciascuna cooccorrenza\n",
"parole_co_counter = Counter(cooccurrences)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"La frequenza delle co-occorrenze di due parole (non necessariamente consecutive e formate da almeno 4 caratteri) \\nin uno stesso verso della prima Cantica\", '\\n')\n",
"print('{:<50}{}'.format('Co-ooccorrenza', 'Frequenza\\n'))\n",
"for k, v in parole_co_counter.most_common(20):\n",
" parole = '['+k[0] + ' , ' + k[1]+']'\n",
" print(f'{parole:<50}{v}')\n",
"print('\\n')\n",
"#print('\\nMedia:')\n",
"#print(np.median(list(parole_co_counter.values())))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Cominciamo a lavorare con RDF"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"#g.parse(\"/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DaMa/Commedia.rdf\", format=\"nt\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Elaborazione del file *purgatorio.xml*\n",
"Eseguo il parsing del testo presente nel file e creo una tabella simile alla precedente"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#TEST IGNORARE\n",
"#tei_purgatorio = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)\n",
"#bbs_pu=tei_purgatorio.IRIverso\n",
"#for repu in bbs_pu:\n",
"# print (repu, end=\"\\n\"*2)\n",
"# if (repu[0].startswith('che')):\n",
"# print('...')\n",
"# break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"parsed_purgatorio=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_purgatorio = [parsed_purgatorio[0]]\n",
"#dfObj_purgatorio = pd.DataFrame(data_purgatorio[0]) \n",
"testo_purgatorio_tabella=pd.DataFrame(data_purgatorio[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n",
"testo_purgatorio_tabella.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"testo_purgatorio_tabella.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Creazione di una tabella con gli IRI dei versi per la cantica *Purgatorio*\n",
"\n",
"La tabella contiene il numero del verso, il verso e l'IRI del verso. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_IRI_versi_pur = [parsed_purgatorio[1]]\n",
"#data_IRI_versi\n",
"df_IRI_versi_pur=pd.DataFrame(data_IRI_versi_pur[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n",
"df_IRI_versi_pur.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_IRI_versi_pur.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_IRI_ff_pur = [parsed_purgatorio[2]]\n",
"#data_IRI_versi\n",
"df_IRI_ff_pur=pd.DataFrame(data_IRI_ff_pur[0], columns = ['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset' , 'IRIFF']) \n",
"df_IRI_ff_pur.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_IRI_ff_pur.tail().style.set_properties(subset=['IRIFF'], **{'width': '400px'})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Elaborazione del file paradiso.xml\n",
"Eseguo il parsing del testo presente nel file e creo una tabella simile alle precedenti"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"parsed_paradiso=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/paradiso.xml', 3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_paradiso = [parsed_paradiso[0]]\n",
"testo_paradiso_tabella=pd.DataFrame(data_paradiso[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n",
"testo_paradiso_tabella.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"testo_paradiso_tabella.head(21)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Creazione di una tabella con gli IRI dei versi per la cantica Paradiso\n",
"La tabella contiene il numero del verso, il verso e l'IRI del verso."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data_IRI_versi_par = [parsed_paradiso[1]]\n",
"#data_IRI_versi\n",
"df_IRI_versi_par=pd.DataFrame(data_IRI_versi_par[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n",
"df_IRI_versi_par.count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_IRI_versi_par.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}