1003 lines
36 KiB
Plaintext
1003 lines
36 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Test per Parsing e generazione IRI"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import ast\n",
|
|
"import sys\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"import rdflib\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"# importing useful Python utility libraries we'll need\n",
|
|
"from collections import Counter, defaultdict\n",
|
|
"import itertools"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \\\n",
|
|
"# PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \\\n",
|
|
"# VOID, XMLNS, XSD\n",
|
|
"from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \\\n",
|
|
" RDF, RDFS, SKOS, \\\n",
|
|
" XMLNS, XSD\n",
|
|
"from rdflib import Namespace\n",
|
|
"from rdflib import URIRef, BNode, Literal\n",
|
|
"n = Namespace(\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import xml.etree.ElementTree as ET"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#root = tree.getroot()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from bs4 import BeautifulSoup"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def read_tei(tei_file):\n",
|
|
" with open(tei_file, 'r') as tei:\n",
|
|
" soup = BeautifulSoup(tei, 'lxml')\n",
|
|
" return soup\n",
|
|
" raise RuntimeError('Cannot generate a soup from the input')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def elem_to_text(elem, default=''):\n",
|
|
" if elem:\n",
|
|
" return elem.getText(separator=' ', strip=True)\n",
|
|
" else:\n",
|
|
" return default"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from dataclasses import dataclass\n",
|
|
"\n",
|
|
"@dataclass\n",
|
|
"class Person:\n",
|
|
" firstname: str\n",
|
|
" middlename: str\n",
|
|
" surname: str"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Parser\n",
|
|
"\n",
|
|
"Provo a creare un parser.\n",
|
|
"\n",
|
|
"Un estratto dal file inferno.xml:\n",
|
|
"\n",
|
|
"~~~~\n",
|
|
"<div1> <head>Canto 1</head>\n",
|
|
"<lg type=\"canto\">\n",
|
|
" <l>\n",
|
|
" <LM lemma=\"il\" catg=\"rdms\">Nel</LM>\n",
|
|
" <LM lemma=\"in mezzo di\" catg=\"eilaksl\">mezzo</LM>\n",
|
|
" <LM lemma=\"il\" catg=\"rdms\">del</LM>\n",
|
|
" <LM lemma=\"cammino\" catg=\"sm2ms\">cammin</LM>\n",
|
|
" <LM lemma=\"di\" catg=\"epskg\">di</LM>\n",
|
|
" <LM lemma=\"nostro\" catg=\"as1fs\">nostra</LM>\n",
|
|
" <LM lemma=\"vita\" catg=\"sf1fs\">vita</LM>\n",
|
|
" </l>\n",
|
|
" ...\n",
|
|
" ...\n",
|
|
" <l>\n",
|
|
" <LM lemma=\"che\" catg=\"pr\">che</LM>\n",
|
|
" <LM1>\n",
|
|
" \t <LM lemma=\"il\" catg=\"rdms\">nel</LM> \n",
|
|
" \t <LM lemma=\"in\" catg=\"epaksl\">nel</LM>\n",
|
|
" </LM1>\n",
|
|
" <LM lemma=\"pensiero\" catg=\"sm2ms\">pensier</LM>\n",
|
|
" <LM lemma=\"rinnovare\" catg=\"vta1ips3\">rinova</LM>\n",
|
|
" <LM lemma=\"la\" catg=\"rdfs\">la</LM>\n",
|
|
" <LM lemma=\"paura\" catg=\"sf1fs\">paura</LM>!\n",
|
|
" </l>\n",
|
|
" <l>\n",
|
|
" ...\n",
|
|
"~~~~\n",
|
|
"\n",
|
|
" \n",
|
|
"Il tag \\<div1\\> individua la porzione di file di un *Canto*, il tag \\<l\\> individua un verso, il tag \\<LM\\> individua una *forma flessa*, ciascuna forma flessa ha 1 o 2 attributi.\n",
|
|
"All'interno di un verso può essere presente il tag \\<LM1\\> che ha come content più elementi \\<LM\\>, ciascuno di essi contiene la stessa forma flessa ma differenti valori per gli attributi 'catg' e 'lemma'.\n",
|
|
"\n",
|
|
"Per questa implementazione uso la libreria Python [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"class TEIFile(object):\n",
|
|
" def __init__(self, filename, idres=0):\n",
|
|
" self.g = rdflib.Graph()\n",
|
|
" self.filename = filename\n",
|
|
" self.soup = read_tei(filename)\n",
|
|
" self._text = None\n",
|
|
" self.idres=idres;\n",
|
|
" self.InFor = URIRef(\"http://example.org/word/InflectedForm\")\n",
|
|
" # self._lemmas = None\n",
|
|
" # self._lemma_lemmas = None\n",
|
|
" # self._categ_lemmas = None\n",
|
|
" self._title = ''\n",
|
|
" self._abstract = ''\n",
|
|
"\n",
|
|
" \n",
|
|
" @property\n",
|
|
" def title(self):\n",
|
|
" if not self._title:\n",
|
|
" if not self.soup.title:\n",
|
|
" self._title = \"na\"\n",
|
|
" else:\n",
|
|
" self._title = self.soup.title.getText().replace('\\n','').strip()\n",
|
|
" return self._title\n",
|
|
"\n",
|
|
" \n",
|
|
" @property\n",
|
|
" def authors(self):\n",
|
|
" #authors_in_header = self.soup.analytic.find_all('author')\n",
|
|
" authors_in_header = self.soup.find_all('author')\n",
|
|
"\n",
|
|
" result = []\n",
|
|
" for author in authors_in_header:\n",
|
|
" persname = author.persname\n",
|
|
" if not persname:\n",
|
|
" continue\n",
|
|
" firstname = elem_to_text(persname.find(\"forename\"))#, type=\"first\"))\n",
|
|
" middlename = elem_to_text(persname.find(\"forename\", type=\"middle\"))\n",
|
|
" surname = elem_to_text(persname.surname)\n",
|
|
" person = Person(firstname, middlename, surname)\n",
|
|
" result.append(person)\n",
|
|
" return result\n",
|
|
" \n",
|
|
" @property\n",
|
|
" def bibliography(self):\n",
|
|
" bibliography = self.soup.find_all('bibl')\n",
|
|
" result = []\n",
|
|
" for bibl in bibliography:\n",
|
|
" if not bibl:\n",
|
|
" continue\n",
|
|
" #if (elem_to_text(bibl).startswith(\"Enter your references here\")):\n",
|
|
" # continue\n",
|
|
" my_bibl_tmp=elem_to_text(bibl).replace('\\n','').strip()\n",
|
|
" my_bibl_tmp=my_bibl_tmp.replace(' .', '.')\n",
|
|
" result.append(\" \".join(my_bibl_tmp.split()))\n",
|
|
" return result\n",
|
|
"\n",
|
|
"\n",
|
|
" @property\n",
|
|
" def text(self):\n",
|
|
" if not self._text:\n",
|
|
" divs_text = []\n",
|
|
" for div in self.soup.body.find_all(\"div1\"):\n",
|
|
" # div is neither an appendix nor references, just plain text.\n",
|
|
" if not div.get(\"type\"):\n",
|
|
" div_text = div.get_text(separator=' ', strip=True)\n",
|
|
" divs_text.append(div_text)\n",
|
|
"\n",
|
|
" plain_text = \" \".join(divs_text)\n",
|
|
" self._text = plain_text\n",
|
|
" return self._text\n",
|
|
" \n",
|
|
" @property\n",
|
|
" def orderedlemma(self):\n",
|
|
" ordr_lms = []\n",
|
|
" i=0\n",
|
|
" for div in self.soup.body.find_all(\"div1\"):\n",
|
|
" for verso in div.find_all('l'):\n",
|
|
" i=i+1;\n",
|
|
" j=0;\n",
|
|
" for lm in verso.find_all(\"lm\"):\n",
|
|
" lstctg=[];\n",
|
|
" lstlms=[];\n",
|
|
" j=j+1;\n",
|
|
" lm_text=elem_to_text(lm).strip();\n",
|
|
" #ctg=lm.get('catg');\n",
|
|
" if (lm.get('catg')!=None):\n",
|
|
" ctg=lm.get('catg');\n",
|
|
" else:\n",
|
|
" ctg=\"non_spec\";\n",
|
|
" \n",
|
|
" lstctg.append(\" \".join(ctg.split())); \n",
|
|
" \n",
|
|
" if (lm.get('lemma')!=None):\n",
|
|
" lemma=lm.get('lemma');\n",
|
|
" else:\n",
|
|
" lemma=\"non_spec\";\n",
|
|
" lstlms.append(\" \".join(lemma.split())); \n",
|
|
" for parent in lm.parents:\n",
|
|
" if (parent.name=='div1'):\n",
|
|
" canto = parent.contents[0];\n",
|
|
" if (parent.name=='lm1' and ordr_lms[-1][0]==\" \".join(lm_text.split())):\n",
|
|
" j=j-1;\n",
|
|
" lstctg=lstctg+ordr_lms[-1][1];\n",
|
|
" lstlms=lstlms+ordr_lms[-1][2];\n",
|
|
" ordr_lms.pop();\n",
|
|
" \n",
|
|
" ordr_lms.append((\" \".join(lm_text.split()), lstctg, lstlms, canto.replace('\\n','').strip(), i, j));\n",
|
|
" \n",
|
|
" \n",
|
|
" # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n",
|
|
" # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n",
|
|
" \n",
|
|
" \n",
|
|
" return ordr_lms\n",
|
|
" \n",
|
|
" ##IRI forma flessa\n",
|
|
" @property\n",
|
|
" def IRIff(self):\n",
|
|
" iriffs = []\n",
|
|
" i=0\n",
|
|
" for div in self.soup.body.find_all(\"div1\"):\n",
|
|
" for verso in div.find_all('l'):\n",
|
|
" i=i+1;\n",
|
|
" j=0;\n",
|
|
" for lm in verso.find_all(\"lm\"):\n",
|
|
" lstctg=[];\n",
|
|
" lstlms=[];\n",
|
|
" lstiri=[];\n",
|
|
" j=j+1;\n",
|
|
" lm_text=elem_to_text(lm).strip();\n",
|
|
" #ctg=lm.get('catg');\n",
|
|
" if (lm.get('catg')!=None):\n",
|
|
" ctg=lm.get('catg');\n",
|
|
" else:\n",
|
|
" ctg=\"non_spec\";\n",
|
|
" \n",
|
|
" lstctg.append(\" \".join(ctg.split())); \n",
|
|
" \n",
|
|
" if (lm.get('lemma')!=None):\n",
|
|
" lemma=lm.get('lemma');\n",
|
|
" else:\n",
|
|
" lemma=\"non_spec\";\n",
|
|
" lstlms.append(\" \".join(lemma.split())); \n",
|
|
" for parent in lm.parents:\n",
|
|
" if (parent.name=='div1'):\n",
|
|
" canto = parent.contents[0];\n",
|
|
" if (parent.name=='lm1' and iriffs[-1][0]==\" \".join(lm_text.split())):\n",
|
|
" j=j-1;\n",
|
|
" #lstctg=lstctg+iriffs[-1][1];\n",
|
|
" #lstlms=lstlms+iriffs[-1][2];\n",
|
|
" iriffs.pop();\n",
|
|
" IRIff_text= \"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j);\n",
|
|
" IRIff_text_type= IRIff_text +' rdf:type InflectedForm . '\n",
|
|
" IRIff_text_pos= IRIff_text +' isInPosition '+str(j);\n",
|
|
" IRIff_text_exp= IRIff_text +' hasExpression \"'+(\" \".join(lm_text.split()))+'\"^^xsd:string .' ;\n",
|
|
" IRIff_text_oo= IRIff_text +' isOccurrenceOf ulem . ';\n",
|
|
" IRIff_text_co= IRIff_text +\" http://erlangen-crm.org/current/P148_is_component_of http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n",
|
|
" lstiri.append(IRIff_text);\n",
|
|
" lstiri.append(IRIff_text_type);\n",
|
|
" lstiri.append(IRIff_text_co);\n",
|
|
" lstiri.append(IRIff_text_pos);\n",
|
|
" lstiri.append(IRIff_text_exp);\n",
|
|
" lstiri.append(IRIff_text_oo);\n",
|
|
" iriffs.append((\" \".join(lm_text.split()), canto.replace('\\n','').strip(), i, j, lstiri));\n",
|
|
" \n",
|
|
" \n",
|
|
" # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n",
|
|
" # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n",
|
|
" \n",
|
|
" \n",
|
|
" return iriffs\n",
|
|
" \n",
|
|
" #IRI forma flessa RDF\n",
|
|
" @property\n",
|
|
" def IRIffRDF(self):\n",
|
|
" iriffs = []\n",
|
|
" i=0\n",
|
|
" for div in self.soup.body.find_all(\"div1\"):\n",
|
|
" for verso in div.find_all('l'):\n",
|
|
" i=i+1;\n",
|
|
" j=0;\n",
|
|
" for lm in verso.find_all(\"lm\"):\n",
|
|
" lstctg=[];\n",
|
|
" lstlms=[];\n",
|
|
" lstiri=[];\n",
|
|
" j=j+1;\n",
|
|
" lm_text=elem_to_text(lm).strip();\n",
|
|
" #ctg=lm.get('catg');\n",
|
|
" if (lm.get('catg')!=None):\n",
|
|
" ctg=lm.get('catg');\n",
|
|
" else:\n",
|
|
" ctg=\"non_spec\";\n",
|
|
" \n",
|
|
" lstctg.append(\" \".join(ctg.split())); \n",
|
|
" \n",
|
|
" if (lm.get('lemma')!=None):\n",
|
|
" lemma=lm.get('lemma');\n",
|
|
" else:\n",
|
|
" lemma=\"non_spec\";\n",
|
|
" lstlms.append(\" \".join(lemma.split())); \n",
|
|
" for parent in lm.parents:\n",
|
|
" if (parent.name=='div1'):\n",
|
|
" canto = parent.contents[0];\n",
|
|
" if (parent.name=='lm1' and iriffs[-1][0]==\" \".join(lm_text.split())):\n",
|
|
" j=j-1;\n",
|
|
" #lstctg=lstctg+iriffs[-1][1];\n",
|
|
" #lstlms=lstlms+iriffs[-1][2];\n",
|
|
" iriffs.pop();\n",
|
|
" #g.add((bob, RDF.type, FOAF.Person))\n",
|
|
" #bob = URIRef(\"http://example.org/people/Bob\")\n",
|
|
" IRIff_text= URIRef(\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j));\n",
|
|
" self.g.remove((IRIff_text, None, None))\n",
|
|
" self.g.add((IRIff_text, RDF.type, self.InFor))\n",
|
|
" \n",
|
|
" IRIff_text= \"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j);\n",
|
|
" IRIff_text_type= IRIff_text +' rdf:type InflectedForm . '\n",
|
|
" IRIff_text_pos= IRIff_text +' isInPosition '+str(j);\n",
|
|
" IRIff_text_exp= IRIff_text +' hasExpression \"'+(\" \".join(lm_text.split()))+'\"^^xsd:string .' ;\n",
|
|
" IRIff_text_oo= IRIff_text +' isOccurrenceOf ulem . ';\n",
|
|
" IRIff_text_co= IRIff_text +\" http://erlangen-crm.org/current/P148_is_component_of http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n",
|
|
" lstiri.append(IRIff_text);\n",
|
|
" lstiri.append(IRIff_text_type);\n",
|
|
" lstiri.append(IRIff_text_co);\n",
|
|
" lstiri.append(IRIff_text_pos);\n",
|
|
" lstiri.append(IRIff_text_exp);\n",
|
|
" lstiri.append(IRIff_text_oo);\n",
|
|
" iriffs.append((\" \".join(lm_text.split()), canto.replace('\\n','').strip(), i, j, lstiri));\n",
|
|
" \n",
|
|
" \n",
|
|
" # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n",
|
|
" # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n",
|
|
" \n",
|
|
" \n",
|
|
" return self.g\n",
|
|
" \n",
|
|
" #IRI del verso\n",
|
|
" @property\n",
|
|
" def IRIverso(self):\n",
|
|
" iris = []\n",
|
|
" i=0\n",
|
|
" for div in self.soup.body.find_all(\"div1\"):\n",
|
|
" islm1=False;\n",
|
|
" for verso in div.find_all('l'):\n",
|
|
" i=i+1;\n",
|
|
" lm1_text=[];\n",
|
|
" verso_text=elem_to_text(verso).strip();\n",
|
|
" for child in verso.children: #Manage <LM1> elements\n",
|
|
" if (child.name=='lm1'):\n",
|
|
" islm1=True;\n",
|
|
" lm1_text.append(elem_to_text(child).strip());\n",
|
|
" # print (lm1_text);\n",
|
|
" \n",
|
|
" if(islm1):\n",
|
|
" islm1=False;\n",
|
|
" for lm1str in lm1_text:\n",
|
|
" replace_str=lm1str.partition(' ')[0];\n",
|
|
" verso_text=verso_text.replace(lm1str, replace_str);\n",
|
|
" \n",
|
|
" for vparent in verso.parents:\n",
|
|
" if (vparent.name=='div1'):\n",
|
|
" canto = vparent.contents[0];\n",
|
|
" #\" \".join(verso_text.split())).strip()\n",
|
|
" verso_text=verso_text.replace(\" ,\", \",\");\n",
|
|
" verso_text=verso_text.replace(\" .\", \".\");\n",
|
|
" verso_text=verso_text.replace(\" !\", \"!\");\n",
|
|
" verso_text=verso_text.replace(\" ?\", \"?\");\n",
|
|
" verso_text=verso_text.replace(\"l' \", \"l'\");\n",
|
|
" iri_verso=\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n",
|
|
" iri_verso=iri_verso+'\\n a efrbroo:F2_Expression ,\\n rdfs:Resource ; \\nhttp://erlangen-crm.org/current/P190_has_symbolic_content \"';\n",
|
|
" iri_verso=iri_verso+verso_text.strip()+ '\"^^xsd:string ;\\n http://erlangen-crm.org/current/P3_has_note';\n",
|
|
" iri_verso=iri_verso+' \"'+str(i)+'\"^^xsd:int ;\\n http://hdn.dantenetwork.it/resource/has_number \"'+str(i)+'\"^^xsd:int .'\n",
|
|
" \n",
|
|
" iris.append((i, verso_text.strip(), iri_verso));\n",
|
|
" \n",
|
|
" \n",
|
|
" return iris\n",
|
|
" #IRI del verso\n",
|
|
" \n",
|
|
" \n",
|
|
" #test\n",
|
|
" @property\n",
|
|
" def ff_ea(self):\n",
|
|
" lms_text = []\n",
|
|
" lms_tupl=()\n",
|
|
" for lm in self.soup.body.find_all(\"lm\"):\n",
|
|
" lm_text=elem_to_text(lm).strip()\n",
|
|
" ctg=lm.get('catg');\n",
|
|
" if (lm.get('lemma')!=None):\n",
|
|
" lemma=lm.get('lemma');\n",
|
|
" else:\n",
|
|
" lemma=\"non_spec\";\n",
|
|
" #lm_text=lm_text+\", \"+ctg+\", \"+lemma;\n",
|
|
" for parent in lm.parents:\n",
|
|
" if (parent.name=='div1'):\n",
|
|
" canto = parent.contents[0]\n",
|
|
" break;\n",
|
|
" lms_text.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip())); \n",
|
|
" return lms_text\n",
|
|
" \n",
|
|
" @property\n",
|
|
" def categ_lemma(self):\n",
|
|
" ctgs_text = []\n",
|
|
" for lm in self.soup.body.find_all(\"lm\"):\n",
|
|
" ctg_text=lm.get('catg').strip();\n",
|
|
" ctgs_text.append(\" \".join(ctg_text.split()))\n",
|
|
" return ctgs_text\n",
|
|
" \n",
|
|
" @property\n",
|
|
" def lemma_lemma(self):\n",
|
|
" lemmas_text = []\n",
|
|
" for lm in self.soup.body.find_all(\"lm\"):\n",
|
|
" if (lm.get('lemma')):\n",
|
|
" lemma_text=lm.get('lemma').strip();\n",
|
|
" else:\n",
|
|
" lemma_text='non_spec';\n",
|
|
" lemmas_text.append(\" \".join(lemma_text.split()))\n",
|
|
" return lemmas_text"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def tei_to_csv_entry(tei_file, idres=0):\n",
|
|
" tei = TEIFile(tei_file, idres)\n",
|
|
" print(f\"Handled {tei_file}\")\n",
|
|
" base_name = tei_file\n",
|
|
" return tei.orderedlemma, tei.IRIverso, tei.IRIff, tei.IRIffRDF, tei.categ_lemma, tei.lemma_lemma #, tei.abstract"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Provo a vedere se il parser funziona\n",
|
|
"Dovrebbe arrivare sino al termine 'oscura', controllare!"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)\n",
|
|
"bbs=tei.ff_ea\n",
|
|
"for re in bbs:\n",
|
|
" print (re, end=\"\\n\"*2)\n",
|
|
" if (re[0].startswith('oscura')):\n",
|
|
" print('...')\n",
|
|
" break"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"g1=tei.IRIffRDF\n",
|
|
" \n",
|
|
"print(len(g1)) # prints 2\n",
|
|
"\n",
|
|
"import pprint\n",
|
|
"for stmt in g1:\n",
|
|
" pprint.pprint(stmt)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Elaboro il file *inferno.xml*\n",
|
|
"Eseguo il parsing del testo presente nel file e creo una tabella con le seguenti colonne: *forma flessa, categoria, lemma, canto, verso, posizione forma flessa nel verso*"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data = [mytesto[0]]\n",
|
|
"#data[0]\n",
|
|
"dfObj = pd.DataFrame(data[0]) \n",
|
|
"testo_tabella=pd.DataFrame(data[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n",
|
|
"testo_tabella.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"testo_tabella.tail(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Creo una tabella con gli IRI dei versi per la cantica *Inferno*\n",
|
|
"\n",
|
|
"La tabella contiene il numero del verso, il verso e l'IRI del verso. \n",
|
|
"Per l'IRI del verso mi son basato su quanto riportato nel file *Commedia.rdf*, un esempio è il seguente: \n",
|
|
"\n",
|
|
"> <http://hdn.dantenetwork.it/resource/work/commedia/cantica/2/canto/9/verso/106> \n",
|
|
"> a efrbroo:F2_Expression , rdfs:Resource ; \n",
|
|
"> <http://erlangen-crm.org/current/P190_has_symbolic_content> \n",
|
|
"> \"Per li tre gradi sù di buona voglia\"^^xsd:string ; \n",
|
|
"> <http://erlangen-crm.org/current/P3_has_note> \n",
|
|
"> \"106\"^^xsd:int ; \n",
|
|
"> <http://hdn.dantenetwork.it/resource/has_number> \n",
|
|
"> \"106\"^^xsd:int . \n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_IRI_versi_inf = [mytesto[1]]\n",
|
|
"#data_IRI_versi\n",
|
|
"df_IRI_versi_inf=pd.DataFrame(data_IRI_versi_inf[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n",
|
|
"df_IRI_versi_inf.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_IRI_versi_inf.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Creo una tabella con gli IRI delle FF\n",
|
|
"\n",
|
|
"Algoritmo definito nella sezione 4 del documento"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_IRI_ff_inf = [mytesto[2]]\n",
|
|
"#data_IRI_versi\n",
|
|
"df_IRI_ff_inf=pd.DataFrame(data_IRI_ff_inf[0], columns = ['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset' , 'IRIFF']) \n",
|
|
"df_IRI_ff_inf.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_IRI_ff_inf.tail().style.set_properties(subset=['IRIFF'], **{'width': '400px'})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Forse non tutti sanno che... \n",
|
|
"\n",
|
|
"\n",
|
|
"*Nota: i risultati delle prossime elaborazioni considerano diverse tra loro due parole anche se differiscono solo per la presenza di maiuscole/minuscole*"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_inf_per_test=df_IRI_ff_inf[['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset']]\n",
|
|
"df_num_ff=df_inf_per_test[df_inf_per_test['FormaFlessa'].str.len()>3]['FormaFlessa'].value_counts()\n",
|
|
"print(\"Le 10 parole (più lunghe di 3 caratteri) usate con maggiore frequenza nella prima Cantica sono:\", end=\"\\n\"*2)\n",
|
|
"print('{:<10}{}'.format('Parola', 'Frequenza'))\n",
|
|
"df_num_ff.head(10)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"test_inf_versi=df_inf_per_test.groupby('NumeroVerso')['FormaFlessa'].apply(list).reset_index(name='parole')\n",
|
|
"#test_inf_versi.head()\n",
|
|
"parole_counter = Counter(itertools.chain(*test_inf_versi['parole']))\n",
|
|
"print('\\nCi sono {} parole diverse nella prima Cantica.\\n'.format(len(parole_counter)))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(\"\\nLe 10 parole più frequenti nella prima Cantica, indipendentemente dalla lunghezza in caratteri, sono: \\n\")\n",
|
|
"print('{:<30}Frequenza\\n'.format(\"Parola\"))\n",
|
|
"for k, v in parole_counter.most_common(10):\n",
|
|
" print(f'{k:<30}{v}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"Nel risultato della cella qui sotto si vede che alcune parole includono il segno di punteggiatura: nella creazione degli IRI dovremmo toglierlo?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"least_common_parole = parole_counter.most_common()[-30:]\n",
|
|
"print(\"\\nAlcune parole che compaiono una sola volta nella prima Cantica: \\n\")\n",
|
|
"print('{:<30}Frequenza\\n'.format(\"Parola\"))\n",
|
|
"for lk, lv in least_common_parole:\n",
|
|
" print(f'{lk:<30}{lv}')\n",
|
|
" \n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#Frequenza delle parole palindrome\n",
|
|
"def is_palindrome(s):\n",
|
|
" return s==s[::-1]\n",
|
|
"\n",
|
|
"for k, v in parole_counter.most_common():\n",
|
|
" if(len(k)>1 and is_palindrome(k)):\n",
|
|
" print(f'{k:<30}{v}')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#test_versi_1=test_inf_versi['parole']\n",
|
|
"#for tve in test_versi_1:\n",
|
|
"# if(is_palindrome((\"\".join(tve)))):\n",
|
|
"# print (\"\".join(tve))\n",
|
|
" #print ((\" \".join(tve)[::-1]))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cooccurrences = []\n",
|
|
"\n",
|
|
"for parole in test_inf_versi['parole']:\n",
|
|
" parole_pairs = itertools.combinations(parole, 2)\n",
|
|
" for pair in parole_pairs:\n",
|
|
" if(len(pair[0])>3 and len(pair[1])>3):\n",
|
|
" cooccurrences.append(tuple((pair)))\n",
|
|
" # cooccurrences.append(tuple(sorted(pair)))\n",
|
|
"\n",
|
|
"# Conto la frequenza di ciascuna cooccorrenza\n",
|
|
"parole_co_counter = Counter(cooccurrences)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"print(\"La frequenza delle co-occorrenze di due parole (non necessariamente consecutive e formate da almeno 4 caratteri) \\nin uno stesso verso della prima Cantica\", '\\n')\n",
|
|
"print('{:<50}{}'.format('Co-ooccorrenza', 'Frequenza\\n'))\n",
|
|
"for k, v in parole_co_counter.most_common(20):\n",
|
|
" parole = '['+k[0] + ' , ' + k[1]+']'\n",
|
|
" print(f'{parole:<50}{v}')\n",
|
|
"print('\\n')\n",
|
|
"#print('\\nMedia:')\n",
|
|
"#print(np.median(list(parole_co_counter.values())))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Cominciamo a lavorare con RDF"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"\n",
|
|
"#g.parse(\"/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DaMa/Commedia.rdf\", format=\"nt\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Elaborazione del file *purgatorio.xml*\n",
|
|
"Eseguo il parsing del testo presente nel file e creo una tabella simile alla precedente"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#TEST IGNORARE\n",
|
|
"#tei_purgatorio = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)\n",
|
|
"#bbs_pu=tei_purgatorio.IRIverso\n",
|
|
"#for repu in bbs_pu:\n",
|
|
"# print (repu, end=\"\\n\"*2)\n",
|
|
"# if (repu[0].startswith('che')):\n",
|
|
"# print('...')\n",
|
|
"# break"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"parsed_purgatorio=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_purgatorio = [parsed_purgatorio[0]]\n",
|
|
"#dfObj_purgatorio = pd.DataFrame(data_purgatorio[0]) \n",
|
|
"testo_purgatorio_tabella=pd.DataFrame(data_purgatorio[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n",
|
|
"testo_purgatorio_tabella.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"testo_purgatorio_tabella.tail()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Creazione di una tabella con gli IRI dei versi per la cantica *Purgatorio*\n",
|
|
"\n",
|
|
"La tabella contiene il numero del verso, il verso e l'IRI del verso. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_IRI_versi_pur = [parsed_purgatorio[1]]\n",
|
|
"#data_IRI_versi\n",
|
|
"df_IRI_versi_pur=pd.DataFrame(data_IRI_versi_pur[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n",
|
|
"df_IRI_versi_pur.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_IRI_versi_pur.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_IRI_ff_pur = [parsed_purgatorio[2]]\n",
|
|
"#data_IRI_versi\n",
|
|
"df_IRI_ff_pur=pd.DataFrame(data_IRI_ff_pur[0], columns = ['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset' , 'IRIFF']) \n",
|
|
"df_IRI_ff_pur.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_IRI_ff_pur.tail().style.set_properties(subset=['IRIFF'], **{'width': '400px'})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Elaborazione del file paradiso.xml\n",
|
|
"Eseguo il parsing del testo presente nel file e creo una tabella simile alle precedenti"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"parsed_paradiso=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/paradiso.xml', 3)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_paradiso = [parsed_paradiso[0]]\n",
|
|
"testo_paradiso_tabella=pd.DataFrame(data_paradiso[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n",
|
|
"testo_paradiso_tabella.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"testo_paradiso_tabella.head(21)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### Creazione di una tabella con gli IRI dei versi per la cantica Paradiso\n",
|
|
"La tabella contiene il numero del verso, il verso e l'IRI del verso."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data_IRI_versi_par = [parsed_paradiso[1]]\n",
|
|
"#data_IRI_versi\n",
|
|
"df_IRI_versi_par=pd.DataFrame(data_IRI_versi_par[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n",
|
|
"df_IRI_versi_par.count()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df_IRI_versi_par.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|