From 8101fc1287cf6c41bf4e740ff669f74d9cfb4aad Mon Sep 17 00:00:00 2001 From: Cesare Concordia Date: Tue, 2 Feb 2021 17:14:11 +0100 Subject: [PATCH] Upload files to '' --- Progetto_Lett.ipynb | 539 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 539 insertions(+) create mode 100644 Progetto_Lett.ipynb diff --git a/Progetto_Lett.ipynb b/Progetto_Lett.ipynb new file mode 100644 index 0000000..d95eb59 --- /dev/null +++ b/Progetto_Lett.ipynb @@ -0,0 +1,539 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import sys\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "# importing useful Python utility libraries we'll need\n", + "from collections import Counter, defaultdict\n", + "import itertools" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import xml.etree.ElementTree as ET" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#root = tree.getroot()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def read_tei(tei_file):\n", + " with open(tei_file, 'r') as tei:\n", + " soup = BeautifulSoup(tei, 'lxml')\n", + " return soup\n", + " raise RuntimeError('Cannot generate a soup from the input')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def elem_to_text(elem, default=''):\n", + " if elem:\n", + " return 
elem.getText(separator=' ', strip=True)\n", + " else:\n", + " return default" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "\n", + "@dataclass\n", + "class Person:\n", + " firstname: str\n", + " middlename: str\n", + " surname: str" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "class TEIFile(object):\n", + " def __init__(self, filename):\n", + " self.filename = filename\n", + " self.soup = read_tei(filename)\n", + " self._text = None\n", + " # self._lemmas = None\n", + " # self._lemma_lemmas = None\n", + " # self._categ_lemmas = None\n", + " self._title = ''\n", + " self._abstract = ''\n", + "\n", + " \n", + " @property\n", + " def title(self):\n", + " if not self._title:\n", + " if not self.soup.title:\n", + " self._title = \"na\"\n", + " else:\n", + " self._title = self.soup.title.getText().replace('\\n','').strip()\n", + " return self._title\n", + "\n", + " \n", + " @property\n", + " def authors(self):\n", + " #authors_in_header = self.soup.analytic.find_all('author')\n", + " authors_in_header = self.soup.find_all('author')\n", + "\n", + " result = []\n", + " for author in authors_in_header:\n", + " persname = author.persname\n", + " if not persname:\n", + " continue\n", + " firstname = elem_to_text(persname.find(\"forename\"))#, type=\"first\"))\n", + " middlename = elem_to_text(persname.find(\"forename\", type=\"middle\"))\n", + " surname = elem_to_text(persname.surname)\n", + " person = Person(firstname, middlename, surname)\n", + " result.append(person)\n", + " return result\n", + " \n", + " @property\n", + " def bibliography(self):\n", + " bibliography = self.soup.find_all('bibl')\n", + " result = []\n", + " for bibl in bibliography:\n", + " if not bibl:\n", + " continue\n", + " #if (elem_to_text(bibl).startswith(\"Enter your references here\")):\n", + " # continue\n", + " 
my_bibl_tmp=elem_to_text(bibl).replace('\\n','').strip()\n", + " my_bibl_tmp=my_bibl_tmp.replace(' .', '.')\n", + " result.append(\" \".join(my_bibl_tmp.split()))\n", + " return result\n", + "\n", + "\n", + " @property\n", + " def text(self):\n", + " if not self._text:\n", + " divs_text = []\n", + " for div in self.soup.body.find_all(\"div1\"):\n", + " # div is neither an appendix nor references, just plain text.\n", + " if not div.get(\"type\"):\n", + " div_text = div.get_text(separator=' ', strip=True)\n", + " divs_text.append(div_text)\n", + "\n", + " plain_text = \" \".join(divs_text)\n", + " self._text = plain_text\n", + " return self._text\n", + " \n", + " @property\n", + " def orderedlemma(self):\n", + " ordr_lms = []\n", + " i=0\n", + " for div in self.soup.body.find_all(\"div1\"):\n", + " for verso in div.find_all('l'):\n", + " i=i+1;\n", + " j=0;\n", + " for lm in verso.find_all(\"lm\"):\n", + " j=j+1;\n", + " lm_text=elem_to_text(lm).strip();\n", + " ctg=lm.get('catg');\n", + " if (lm.get('lemma')!=None):\n", + " lemma=lm.get('lemma');\n", + " else:\n", + " lemma=\"non_spec\";\n", + " for parent in lm.parents:\n", + " if (parent.name=='div1'):\n", + " canto = parent.contents[0]\n", + " break;\n", + " ordr_lms.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" \".join(lemma.split()), canto.replace('\\n','').strip(), i, j)); \n", + " \n", + " return ordr_lms\n", + " \n", + " @property\n", + " def lemma(self):\n", + " lms_text = []\n", + " lms_tupl=()\n", + " for lm in self.soup.body.find_all(\"lm\"):\n", + " lm_text=elem_to_text(lm).strip()\n", + " ctg=lm.get('catg');\n", + " if (lm.get('lemma')!=None):\n", + " lemma=lm.get('lemma');\n", + " else:\n", + " lemma=\"non_spec\";\n", + " #lm_text=lm_text+\", \"+ctg+\", \"+lemma;\n", + " for parent in lm.parents:\n", + " if (parent.name=='div1'):\n", + " canto = parent.contents[0]\n", + " break;\n", + " lms_text.append((\" \".join(lm_text.split()), \" \".join(ctg.split()), \" 
\".join(lemma.split()), canto.replace('\\n','').strip())); \n", + " return lms_text\n", + " \n", + " @property\n", + " def categ_lemma(self):\n", + " ctgs_text = []\n", + " for lm in self.soup.body.find_all(\"lm\"):\n", + " ctg_text=lm.get('catg').strip();\n", + " ctgs_text.append(\" \".join(ctg_text.split()))\n", + " return ctgs_text\n", + " \n", + " @property\n", + " def lemma_lemma(self):\n", + " lemmas_text = []\n", + " for lm in self.soup.body.find_all(\"lm\"):\n", + " if (lm.get('lemma')):\n", + " lemma_text=lm.get('lemma').strip();\n", + " else:\n", + " lemma_text='non_spec';\n", + " lemmas_text.append(\" \".join(lemma_text.split()))\n", + " return lemmas_text" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "def tei_to_csv_entry(tei_file):\n", + " tei = TEIFile(tei_file)\n", + " print(f\"Handled {tei_file}\")\n", + " base_name = tei_file\n", + " return tei.orderedlemma, tei.categ_lemma, tei.lemma_lemma #, tei.abstract" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Provo a vedere se il parser funziona\n", + "Dovrebbe arrivare sino al termine 'oscuro'" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('Nel', 'rdms', 'il', 'Canto 1', 1, 1)\n", + "\n", + "('mezzo', 'eilaksl', 'in mezzo di', 'Canto 1', 1, 2)\n", + "\n", + "('del', 'rdms', 'il', 'Canto 1', 1, 3)\n", + "\n", + "('cammin', 'sm2ms', 'cammino', 'Canto 1', 1, 4)\n", + "\n", + "('di', 'epskg', 'di', 'Canto 1', 1, 5)\n", + "\n", + "('nostra', 'as1fs', 'nostro', 'Canto 1', 1, 6)\n", + "\n", + "('vita', 'sf1fs', 'vita', 'Canto 1', 1, 7)\n", + "\n", + "('mi', 'pf1sypr', 'mi', 'Canto 1', 2, 1)\n", + "\n", + "('ritrovai', 'vta+1irs1', 'ritrovare', 'Canto 1', 2, 2)\n", + "\n", + "('per', 'epskpl', 'per', 'Canto 1', 2, 3)\n", + "\n", + "('una', 'rifs', 'una', 'Canto 1', 2, 4)\n", + "\n", + "('selva', 
'sf1fs', 'selva', 'Canto 1', 2, 5)\n", + "\n", + "('oscura', 'a1fs', 'oscuro', 'Canto 1', 2, 6)\n", + "\n", + "...\n" + ] + } + ], + "source": [ + "tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')\n", + "bbs=tei.orderedlemma\n", + "for re in bbs:\n", + " print (re, end=\"\\n\"*2)\n", + " if (re[0].startswith('oscura')):\n", + " print('...')\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Carico il testo e creo una tabella\n", + "Faccio il parsing del testo e creo una tabella con 6 colonne: *lemma, categoria, lemma italiano, canto, verso, posizione del lemma nel verso*." + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml\n" + ] + } + ], + "source": [ + "mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Lemma 34280\n", + "Categoria 34280\n", + "LemmaItaliano 34280\n", + "Canto 34280\n", + "Verso 34280\n", + "PosizioneLemmaNelVerso 34280\n", + "dtype: int64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = [mytesto[0]]\n", + "#data[0]\n", + "dfObj = pd.DataFrame(data[0]) \n", + "testo_tabella=pd.DataFrame(data[0], columns = ['Lemma' , 'Categoria', 'LemmaItaliano', 'Canto', 'Verso', 'PosizioneLemmaNelVerso']) \n", + "testo_tabella.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
LemmaCategoriaLemmaItalianoCantoVersoPosizioneLemmaNelVerso
0NelrdmsilCanto 111
1mezzoeilakslin mezzo diCanto 112
2delrdmsilCanto 113
3camminsm2mscamminoCanto 114
4diepskgdiCanto 115
5nostraas1fsnostroCanto 116
6vitasf1fsvitaCanto 117
7mipf1syprmiCanto 121
8ritrovaivta+1irs1ritrovareCanto 122
9perepskplperCanto 123
\n", + "
" + ], + "text/plain": [ + " Lemma Categoria LemmaItaliano Canto Verso PosizioneLemmaNelVerso\n", + "0 Nel rdms il Canto 1 1 1\n", + "1 mezzo eilaksl in mezzo di Canto 1 1 2\n", + "2 del rdms il Canto 1 1 3\n", + "3 cammin sm2ms cammino Canto 1 1 4\n", + "4 di epskg di Canto 1 1 5\n", + "5 nostra as1fs nostro Canto 1 1 6\n", + "6 vita sf1fs vita Canto 1 1 7\n", + "7 mi pf1sypr mi Canto 1 2 1\n", + "8 ritrovai vta+1irs1 ritrovare Canto 1 2 2\n", + "9 per epskpl per Canto 1 2 3" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "testo_tabella.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}