{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Test per Parsing e generazione IRI" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import ast\n", "import sys\n", "import numpy as np\n", "import pandas as pd\n", "import rdflib\n", "import matplotlib.pyplot as plt\n", "# importing useful Python utility libraries we'll need\n", "from collections import Counter, defaultdict\n", "import itertools" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \\\n", "# PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \\\n", "# VOID, XMLNS, XSD\n", "from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \\\n", " RDF, RDFS, SKOS, \\\n", " XMLNS, XSD\n", "from rdflib import Namespace\n", "from rdflib import URIRef, BNode, Literal\n", "n = Namespace(\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import xml.etree.ElementTree as ET" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#root = tree.getroot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def read_tei(tei_file):\n", " with open(tei_file, 'r') as tei:\n", " soup = BeautifulSoup(tei, 'lxml')\n", " return soup\n", " raise RuntimeError('Cannot generate a soup from the input')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def 
def elem_to_text(elem, default=''):
    """Return the flattened text of a BeautifulSoup element.

    Parameters:
        elem: a bs4 Tag (or None, e.g. the result of a failed `find`).
        default: value returned when `elem` is None.

    Returns:
        The element's text with children joined by a single space and
        surrounding whitespace stripped, or `default` if `elem` is None.

    Bug fix: the original guard was `if elem:`.  BeautifulSoup Tag
    truthiness is based on `len(tag)` (its number of children), so an
    element that exists but is empty was falsy and silently mapped to
    `default`.  Testing `is None` distinguishes "tag missing" from
    "tag present but empty".
    """
    if elem is None:
        return default
    return elem.getText(separator=' ', strip=True)
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "class TEIFile(object):\n", " def __init__(self, filename, idres=0):\n", " self.g = rdflib.Graph()\n", " self.filename = filename\n", " self.soup = read_tei(filename)\n", " self._text = None\n", " self.idres=idres;\n", " self.InFor = URIRef(\"http://example.org/word/InflectedForm\")\n", " # self._lemmas = None\n", " # self._lemma_lemmas = None\n", " # self._categ_lemmas = None\n", " self._title = ''\n", " self._abstract = ''\n", "\n", " \n", " @property\n", " def title(self):\n", " if not self._title:\n", " if not self.soup.title:\n", " self._title = \"na\"\n", " else:\n", " self._title = self.soup.title.getText().replace('\\n','').strip()\n", " return self._title\n", "\n", " \n", " @property\n", " def authors(self):\n", " #authors_in_header = self.soup.analytic.find_all('author')\n", " authors_in_header = self.soup.find_all('author')\n", "\n", " result = []\n", " for author in authors_in_header:\n", " persname = author.persname\n", " if not persname:\n", " continue\n", " firstname = elem_to_text(persname.find(\"forename\"))#, type=\"first\"))\n", " middlename = elem_to_text(persname.find(\"forename\", type=\"middle\"))\n", " surname = elem_to_text(persname.surname)\n", " person = Person(firstname, middlename, surname)\n", " result.append(person)\n", " return result\n", " \n", " @property\n", " def bibliography(self):\n", " bibliography = self.soup.find_all('bibl')\n", " result = []\n", " for bibl in bibliography:\n", " if not bibl:\n", " continue\n", " #if (elem_to_text(bibl).startswith(\"Enter your references here\")):\n", " # continue\n", " my_bibl_tmp=elem_to_text(bibl).replace('\\n','').strip()\n", " my_bibl_tmp=my_bibl_tmp.replace(' .', '.')\n", " result.append(\" \".join(my_bibl_tmp.split()))\n", " return result\n", "\n", "\n", " @property\n", " def text(self):\n", " if not self._text:\n", " divs_text = []\n", " for div in self.soup.body.find_all(\"div1\"):\n", " 
# div is neither an appendix nor references, just plain text.\n", " if not div.get(\"type\"):\n", " div_text = div.get_text(separator=' ', strip=True)\n", " divs_text.append(div_text)\n", "\n", " plain_text = \" \".join(divs_text)\n", " self._text = plain_text\n", " return self._text\n", " \n", " @property\n", " def orderedlemma(self):\n", " ordr_lms = []\n", " i=0\n", " for div in self.soup.body.find_all(\"div1\"):\n", " for verso in div.find_all('l'):\n", " i=i+1;\n", " j=0;\n", " for lm in verso.find_all(\"lm\"):\n", " lstctg=[];\n", " lstlms=[];\n", " j=j+1;\n", " lm_text=elem_to_text(lm).strip();\n", " #ctg=lm.get('catg');\n", " if (lm.get('catg')!=None):\n", " ctg=lm.get('catg');\n", " else:\n", " ctg=\"non_spec\";\n", " \n", " lstctg.append(\" \".join(ctg.split())); \n", " \n", " if (lm.get('lemma')!=None):\n", " lemma=lm.get('lemma');\n", " else:\n", " lemma=\"non_spec\";\n", " lstlms.append(\" \".join(lemma.split())); \n", " for parent in lm.parents:\n", " if (parent.name=='div1'):\n", " canto = parent.contents[0];\n", " if (parent.name=='lm1' and ordr_lms[-1][0]==\" \".join(lm_text.split())):\n", " j=j-1;\n", " lstctg=lstctg+ordr_lms[-1][1];\n", " lstlms=lstlms+ordr_lms[-1][2];\n", " ordr_lms.pop();\n", " \n", " ordr_lms.append((\" \".join(lm_text.split()), lstctg, lstlms, canto.replace('\\n','').strip(), i, j));\n", " \n", " \n", " # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n", " # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n", " \n", " \n", " return ordr_lms\n", " \n", " ##IRI forma flessa\n", " @property\n", " def IRIff(self):\n", " iriffs = []\n", " i=0\n", " for div in self.soup.body.find_all(\"div1\"):\n", " for verso in div.find_all('l'):\n", " i=i+1;\n", " j=0;\n", " for lm in verso.find_all(\"lm\"):\n", " lstctg=[];\n", " lstlms=[];\n", " lstiri=[];\n", " j=j+1;\n", " 
lm_text=elem_to_text(lm).strip();\n", " #ctg=lm.get('catg');\n", " if (lm.get('catg')!=None):\n", " ctg=lm.get('catg');\n", " else:\n", " ctg=\"non_spec\";\n", " \n", " lstctg.append(\" \".join(ctg.split())); \n", " \n", " if (lm.get('lemma')!=None):\n", " lemma=lm.get('lemma');\n", " else:\n", " lemma=\"non_spec\";\n", " lstlms.append(\" \".join(lemma.split())); \n", " for parent in lm.parents:\n", " if (parent.name=='div1'):\n", " canto = parent.contents[0];\n", " if (parent.name=='lm1' and iriffs[-1][0]==\" \".join(lm_text.split())):\n", " j=j-1;\n", " #lstctg=lstctg+iriffs[-1][1];\n", " #lstlms=lstlms+iriffs[-1][2];\n", " iriffs.pop();\n", " IRIff_text= \"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j);\n", " IRIff_text_type= IRIff_text +' rdf:type InflectedForm . '\n", " IRIff_text_pos= IRIff_text +' isInPosition '+str(j);\n", " IRIff_text_exp= IRIff_text +' hasExpression \"'+(\" \".join(lm_text.split()))+'\"^^xsd:string .' ;\n", " IRIff_text_oo= IRIff_text +' isOccurrenceOf ulem . 
';\n", " IRIff_text_co= IRIff_text +\" http://erlangen-crm.org/current/P148_is_component_of http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n", " lstiri.append(IRIff_text);\n", " lstiri.append(IRIff_text_type);\n", " lstiri.append(IRIff_text_co);\n", " lstiri.append(IRIff_text_pos);\n", " lstiri.append(IRIff_text_exp);\n", " lstiri.append(IRIff_text_oo);\n", " iriffs.append((\" \".join(lm_text.split()), canto.replace('\\n','').strip(), i, j, lstiri));\n", " \n", " \n", " # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n", " # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n", " \n", " \n", " return iriffs\n", " \n", " #IRI forma flessa RDF\n", " @property\n", " def IRIffRDF(self):\n", " iriffs = []\n", " i=0\n", " for div in self.soup.body.find_all(\"div1\"):\n", " for verso in div.find_all('l'):\n", " i=i+1;\n", " j=0;\n", " for lm in verso.find_all(\"lm\"):\n", " lstctg=[];\n", " lstlms=[];\n", " lstiri=[];\n", " j=j+1;\n", " lm_text=elem_to_text(lm).strip();\n", " #ctg=lm.get('catg');\n", " if (lm.get('catg')!=None):\n", " ctg=lm.get('catg');\n", " else:\n", " ctg=\"non_spec\";\n", " \n", " lstctg.append(\" \".join(ctg.split())); \n", " \n", " if (lm.get('lemma')!=None):\n", " lemma=lm.get('lemma');\n", " else:\n", " lemma=\"non_spec\";\n", " lstlms.append(\" \".join(lemma.split())); \n", " for parent in lm.parents:\n", " if (parent.name=='div1'):\n", " canto = parent.contents[0];\n", " if (parent.name=='lm1' and iriffs[-1][0]==\" \".join(lm_text.split())):\n", " j=j-1;\n", " #lstctg=lstctg+iriffs[-1][1];\n", " #lstlms=lstlms+iriffs[-1][2];\n", " iriffs.pop();\n", " #g.add((bob, RDF.type, FOAF.Person))\n", " #bob = URIRef(\"http://example.org/people/Bob\")\n", " IRIff_text= 
URIRef(\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j));\n", " self.g.remove((IRIff_text, None, None))\n", " self.g.add((IRIff_text, RDF.type, self.InFor))\n", " \n", " IRIff_text= \"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i)+\"#\"+str(j);\n", " IRIff_text_type= IRIff_text +' rdf:type InflectedForm . '\n", " IRIff_text_pos= IRIff_text +' isInPosition '+str(j);\n", " IRIff_text_exp= IRIff_text +' hasExpression \"'+(\" \".join(lm_text.split()))+'\"^^xsd:string .' ;\n", " IRIff_text_oo= IRIff_text +' isOccurrenceOf ulem . ';\n", " IRIff_text_co= IRIff_text +\" http://erlangen-crm.org/current/P148_is_component_of http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n", " lstiri.append(IRIff_text);\n", " lstiri.append(IRIff_text_type);\n", " lstiri.append(IRIff_text_co);\n", " lstiri.append(IRIff_text_pos);\n", " lstiri.append(IRIff_text_exp);\n", " lstiri.append(IRIff_text_oo);\n", " iriffs.append((\" \".join(lm_text.split()), canto.replace('\\n','').strip(), i, j, lstiri));\n", " \n", " \n", " # ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j, \"hdn:Works/Commedia/Cantica/1/\"+str(i),\n", " # \"hdn:Works/Commedia/Cantica/1/\"+str(i)+\"/#\"+str(j)));\n", " \n", " \n", " return self.g\n", " \n", " #IRI del verso\n", " @property\n", " def IRIverso(self):\n", " iris = []\n", " i=0\n", " for div in self.soup.body.find_all(\"div1\"):\n", " islm1=False;\n", " for verso in div.find_all('l'):\n", " i=i+1;\n", " lm1_text=[];\n", " verso_text=elem_to_text(verso).strip();\n", " for child in verso.children: #Manage elements\n", " if (child.name=='lm1'):\n", " islm1=True;\n", " 
lm1_text.append(elem_to_text(child).strip());\n", " # print (lm1_text);\n", " \n", " if(islm1):\n", " islm1=False;\n", " for lm1str in lm1_text:\n", " replace_str=lm1str.partition(' ')[0];\n", " verso_text=verso_text.replace(lm1str, replace_str);\n", " \n", " for vparent in verso.parents:\n", " if (vparent.name=='div1'):\n", " canto = vparent.contents[0];\n", " #\" \".join(verso_text.split())).strip()\n", " verso_text=verso_text.replace(\" ,\", \",\");\n", " verso_text=verso_text.replace(\" .\", \".\");\n", " verso_text=verso_text.replace(\" !\", \"!\");\n", " verso_text=verso_text.replace(\" ?\", \"?\");\n", " verso_text=verso_text.replace(\"l' \", \"l'\");\n", " iri_verso=\"http://hdn.dantenetwork.it/resource/work/commedia/cantica/\"+str(self.idres)+\"/\"+\"/\".join(canto.lower().split())+\"/verso/\"+str(i);\n", " iri_verso=iri_verso+'\\n a efrbroo:F2_Expression ,\\n rdfs:Resource ; \\nhttp://erlangen-crm.org/current/P190_has_symbolic_content \"';\n", " iri_verso=iri_verso+verso_text.strip()+ '\"^^xsd:string ;\\n http://erlangen-crm.org/current/P3_has_note';\n", " iri_verso=iri_verso+' \"'+str(i)+'\"^^xsd:int ;\\n http://hdn.dantenetwork.it/resource/has_number \"'+str(i)+'\"^^xsd:int .'\n", " \n", " iris.append((i, verso_text.strip(), iri_verso));\n", " \n", " \n", " return iris\n", " #IRI del verso\n", " \n", " \n", " #test\n", " @property\n", " def ff_ea(self):\n", " lms_text = []\n", " lms_tupl=()\n", " for lm in self.soup.body.find_all(\"lm\"):\n", " lm_text=elem_to_text(lm).strip()\n", " ctg=lm.get('catg');\n", " if (lm.get('lemma')!=None):\n", " lemma=lm.get('lemma');\n", " else:\n", " lemma=\"non_spec\";\n", " #lm_text=lm_text+\", \"+ctg+\", \"+lemma;\n", " for parent in lm.parents:\n", " if (parent.name=='div1'):\n", " canto = parent.contents[0]\n", " break;\n", " lms_text.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip())); \n", " return lms_text\n", " \n", " @property\n", " def 
categ_lemma(self):\n", " ctgs_text = []\n", " for lm in self.soup.body.find_all(\"lm\"):\n", " ctg_text=lm.get('catg').strip();\n", " ctgs_text.append(\" \".join(ctg_text.split()))\n", " return ctgs_text\n", " \n", " @property\n", " def lemma_lemma(self):\n", " lemmas_text = []\n", " for lm in self.soup.body.find_all(\"lm\"):\n", " if (lm.get('lemma')):\n", " lemma_text=lm.get('lemma').strip();\n", " else:\n", " lemma_text='non_spec';\n", " lemmas_text.append(\" \".join(lemma_text.split()))\n", " return lemmas_text" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def tei_to_csv_entry(tei_file, idres=0):\n", " tei = TEIFile(tei_file, idres)\n", " print(f\"Handled {tei_file}\")\n", " base_name = tei_file\n", " return tei.orderedlemma, tei.IRIverso, tei.IRIff, tei.IRIffRDF, tei.categ_lemma, tei.lemma_lemma #, tei.abstract" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Provo a vedere se il parser funziona\n", "Dovrebbe arrivare sino al termine 'oscuro', controllare!" 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)\n", "bbs=tei.ff_ea\n", "for re in bbs:\n", " print (re, end=\"\\n\"*2)\n", " if (re[0].startswith('oscura')):\n", " print('...')\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "g1=tei.IRIffRDF\n", " \n", "print(len(g1)) # prints 2\n", "\n", "import pprint\n", "for stmt in g1:\n", " pprint.pprint(stmt)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Elaboro il file *inferno.xml*\n", "Eseguo il parsing del testo presente nel file e creo una tabella con le seguenti colonne: *forma flessa, categoria, lemma, canto, verso, pposizione forma flessa nel verso*" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml', 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data = [mytesto[0]]\n", "#data[0]\n", "dfObj = pd.DataFrame(data[0]) \n", "testo_tabella=pd.DataFrame(data[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n", "testo_tabella.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "testo_tabella.tail(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creo una tabella con gli IRI dei versi per la cantica *Inferno*\n", "\n", "La abella contiene il numero del verso, il verso e l'IRI del verso. 
\n", "Per l'IRI del verso mi son basato su quanto riportato nel file *Commedia.rdf*, un esempio è il seguente: \n", "\n", "> \n", "> a efrbroo:F2_Expression , rdfs:Resource ; \n", "> \n", "> \"Per li tre gradi sù di buona voglia\"^^xsd:string ; \n", "> \n", "> \"106\"^^xsd:int ; \n", "> \n", "> \"106\"^^xsd:int . \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_IRI_versi_inf = [mytesto[1]]\n", "#data_IRI_versi\n", "df_IRI_versi_inf=pd.DataFrame(data_IRI_versi_inf[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n", "df_IRI_versi_inf.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_IRI_versi_inf.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creo una tabella con gli IRI delle FF\n", "\n", "Algoritmo definito nella sezione 4 del documento" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_IRI_ff_inf = [mytesto[2]]\n", "#data_IRI_versi\n", "df_IRI_ff_inf=pd.DataFrame(data_IRI_ff_inf[0], columns = ['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset' , 'IRIFF']) \n", "df_IRI_ff_inf.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_IRI_ff_inf.tail().style.set_properties(subset=['IRIFF'], **{'width': '400px'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Forse non tutti sanno che... 
\n", "\n", "\n", "*Nota: i risultati delle prossime elaborazioni considerano diverse tra loro due parole parole anche se differiscono per la presenza di maiuscole/minuscole*" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_inf_per_test=df_IRI_ff_inf[['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset']]\n", "df_num_ff=df_inf_per_test[df_inf_per_test['FormaFlessa'].str.len()>3]['FormaFlessa'].value_counts()\n", "print(\"Le 10 parole (più lunghe di 3 caratteri) usate con maggiore frequenza nella prima Cantica sono:\", end=\"\\n\"*2)\n", "print('{:<10}{}'.format('Parola', 'Frequenza'))\n", "df_num_ff.head(10)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test_inf_versi=df_inf_per_test.groupby('NumeroVerso')['FormaFlessa'].apply(list).reset_index(name='parole')\n", "#test_inf_versi.head()\n", "parole_counter = Counter(itertools.chain(*test_inf_versi['parole']))\n", "print('\\nCi sono {} parole diverse nella prima Cantica.\\n'.format(len(parole_counter)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"\\nLe 10 parole più frequenti nella prima Cantica, indipendentemente dalla lunghezza in caratteri, sono: \\n\")\n", "print('{:<30}Frequenza\\n'.format(\"Parola\"))\n", "for k, v in parole_counter.most_common(10):\n", " print(f'{k:<30}{v}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Nel risultato della cella qui sotto si vede che alcune parole hanno il segno di punteggiatura, nella creazione degli IRI dovremmo toglierlo?" 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "least_common_parole = parole_counter.most_common()[-30:]\n", "print(\"\\nAlcune parole che compaiono una sola volta nella prima Cantica: \\n\")\n", "print('{:<30}Frequenza\\n'.format(\"Parola\"))\n", "for lk, lv in least_common_parole:\n", " print(f'{lk:<30}{lv}')\n", " \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Frequenza delle parole palindrome\n", "def is_palindrome(s):\n", " return s==s[::-1]\n", "\n", "for k, v in parole_counter.most_common():\n", " if(len(k)>1 and is_palindrome(k)):\n", " print(f'{k:<30}{v}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#test_versi_1=test_inf_versi['parole']\n", "#for tve in test_versi_1:\n", "# if(is_palindrome((\"\".join(tve)))):\n", "# print (\"\".join(tve))\n", " #print ((\" \".join(tve)[::-1]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cooccurrences = []\n", "\n", "for parole in test_inf_versi['parole']:\n", " parole_pairs = itertools.combinations(parole, 2)\n", " for pair in parole_pairs:\n", " if(len(pair[0])>3 and len(pair[1])>3):\n", " cooccurrences.append(tuple((pair)))\n", " # cooccurrences.append(tuple(sorted(pair)))\n", "\n", "# Conto la frequenza di ciascuna cooccorrenza\n", "parole_co_counter = Counter(cooccurrences)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"La frequenza delle co-occorrenze di due parole (non necessariamente consecutive e formate da almeno 4 caratteri) \\nin uno stesso verso della prima Cantica\", '\\n')\n", "print('{:<50}{}'.format('Co-ooccorrenza', 'Frequenza\\n'))\n", "for k, v in parole_co_counter.most_common(20):\n", " parole = '['+k[0] + ' , ' + k[1]+']'\n", " print(f'{parole:<50}{v}')\n", "print('\\n')\n", "#print('\\nMedia:')\n", 
"#print(np.median(list(parole_co_counter.values())))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Cominciamo a lavorare con RDF" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "\n", "#g.parse(\"/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DaMa/Commedia.rdf\", format=\"nt\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Elaborazione del file *purgatorio.xml*\n", "Eseguo il parsing del testo presente nel file e creo una tabella simile alla precedente" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#TEST IGNORARE\n", "#tei_purgatorio = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)\n", "#bbs_pu=tei_purgatorio.IRIverso\n", "#for repu in bbs_pu:\n", "# print (repu, end=\"\\n\"*2)\n", "# if (repu[0].startswith('che')):\n", "# print('...')\n", "# break" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "parsed_purgatorio=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/purgatorio.xml', 2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_purgatorio = [parsed_purgatorio[0]]\n", "#dfObj_purgatorio = pd.DataFrame(data_purgatorio[0]) \n", "testo_purgatorio_tabella=pd.DataFrame(data_purgatorio[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n", "testo_purgatorio_tabella.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "testo_purgatorio_tabella.tail()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creazione di una tabella con gli IRI dei versi per la cantica *Purgatorio*\n", "\n", "La tabella contiene il numero del verso, il verso e l'IRI del verso. 
" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_IRI_versi_pur = [parsed_purgatorio[1]]\n", "#data_IRI_versi\n", "df_IRI_versi_pur=pd.DataFrame(data_IRI_versi_pur[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n", "df_IRI_versi_pur.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_IRI_versi_pur.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_IRI_ff_pur = [parsed_purgatorio[2]]\n", "#data_IRI_versi\n", "df_IRI_ff_pur=pd.DataFrame(data_IRI_ff_pur[0], columns = ['FormaFlessa', 'Canto', 'NumeroVerso', 'Offset' , 'IRIFF']) \n", "df_IRI_ff_pur.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_IRI_ff_pur.tail().style.set_properties(subset=['IRIFF'], **{'width': '400px'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Elaborazione del file paradiso.xml\n", "Eseguo il parsing del testo presente nel file e creo una tabella simile alle precedenti" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "parsed_paradiso=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/paradiso.xml', 3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_paradiso = [parsed_paradiso[0]]\n", "testo_paradiso_tabella=pd.DataFrame(data_paradiso[0], columns = ['FormaFlessa' , 'Categoria', 'Lemma', 'Canto', 'Verso', 'PosizioneFFNelVerso']) \n", "testo_paradiso_tabella.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "testo_paradiso_tabella.head(21)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Creazione di una tabella con gli IRI dei versi per la cantica Paradiso\n", "La 
tabella contiene il numero del verso, il verso e l'IRI del verso." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data_IRI_versi_par = [parsed_paradiso[1]]\n", "#data_IRI_versi\n", "df_IRI_versi_par=pd.DataFrame(data_IRI_versi_par[0], columns = ['NumeroVerso', 'Verso' , 'IRIVerso']) \n", "df_IRI_versi_par.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df_IRI_versi_par.head().style.set_properties(subset=['IRIVerso'], **{'width': '400px'})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }