Upload files to ''

This commit is contained in:
Cesare Concordia 2021-02-02 17:14:11 +01:00
parent d0bc89cf67
commit 8101fc1287
1 changed files with 539 additions and 0 deletions

539
Progetto_Lett.ipynb Normal file
View File

@ -0,0 +1,539 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"# importing useful Python utility libraries we'll need\n",
"from collections import Counter, defaultdict\n",
"import itertools"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import xml.etree.ElementTree as ET"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#root = tree.getroot()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def read_tei(tei_file):\n",
"    \"\"\"Parse a TEI/XML file into a BeautifulSoup tree.\n",
"\n",
"    The file is opened in binary mode so that the parser detects the\n",
"    encoding from the document itself instead of relying on the\n",
"    platform's default text encoding.\n",
"\n",
"    Args:\n",
"        tei_file: path of the file to read.\n",
"\n",
"    Returns:\n",
"        The BeautifulSoup object built with the 'lxml' parser.\n",
"\n",
"    Raises:\n",
"        OSError: if the file cannot be opened.\n",
"    \"\"\"\n",
"    with open(tei_file, 'rb') as tei:\n",
"        return BeautifulSoup(tei, 'lxml')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def elem_to_text(elem, default=''):\n",
"    \"\"\"Return the text of ``elem``, or ``default`` when ``elem`` is falsy.\n",
"\n",
"    The text is obtained with ``elem.getText(separator=' ', strip=True)``.\n",
"    \"\"\"\n",
"    if not elem:\n",
"        return default\n",
"    return elem.getText(separator=' ', strip=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"\n",
"@dataclass\n",
"class Person:\n",
" \"\"\"Name parts of one author, each stored as a string.\"\"\"\n",
" # TEIFile.authors fills this with the text of the first <forename> in the author's <persname>\n",
" firstname: str\n",
" # TEIFile.authors fills this with the text of the <forename type=\"middle\"> element, '' if absent\n",
" middlename: str\n",
" # TEIFile.authors fills this with the text of the <surname> element, '' if absent\n",
" surname: str"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"class TEIFile(object):\n",
"    \"\"\"Accessors over a TEI/XML document parsed into a BeautifulSoup tree.\n",
"\n",
"    Every property reads from ``self.soup``. ``title`` and ``text`` cache\n",
"    their result; the other properties rebuild their list on each access.\n",
"    \"\"\"\n",
"\n",
"    # Placeholder reported when an <lm> element lacks an attribute or has\n",
"    # no enclosing <div1>.\n",
"    _MISSING = 'non_spec'\n",
"\n",
"    def __init__(self, filename):\n",
"        \"\"\"Parse ``filename`` with ``read_tei`` and initialise the caches.\"\"\"\n",
"        self.filename = filename\n",
"        self.soup = read_tei(filename)\n",
"        self._text = None\n",
"        self._title = ''\n",
"        self._abstract = ''  # not read by any property of this class\n",
"\n",
"    @staticmethod\n",
"    def _squash(value):\n",
"        \"\"\"Collapse each run of whitespace in ``value`` into one space.\"\"\"\n",
"        return ' '.join(value.split())\n",
"\n",
"    @classmethod\n",
"    def _attr(cls, lm, name, blank_is_missing=False):\n",
"        \"\"\"Return attribute ``name`` of ``lm`` with whitespace normalised.\n",
"\n",
"        A missing attribute yields ``_MISSING`` instead of raising; with\n",
"        ``blank_is_missing`` an empty value does too.\n",
"        \"\"\"\n",
"        value = lm.get(name)\n",
"        if value is None or (blank_is_missing and not value):\n",
"            return cls._MISSING\n",
"        return cls._squash(value)\n",
"\n",
"    @classmethod\n",
"    def _canto_label(cls, lm):\n",
"        \"\"\"Return the heading of the nearest <div1> ancestor of ``lm``.\n",
"\n",
"        The heading is read from the first child of that <div1>\n",
"        (presumably a text node such as 'Canto 1' -- TODO confirm against\n",
"        the input files). ``_MISSING`` is returned when ``lm`` has no\n",
"        <div1> ancestor.\n",
"        \"\"\"\n",
"        for parent in lm.parents:\n",
"            if parent.name == 'div1':\n",
"                return parent.contents[0].replace('\\n', '').strip()\n",
"        return cls._MISSING\n",
"\n",
"    def _lm_record(self, lm):\n",
"        \"\"\"Return ``(form, category, lemma, canto)`` for one <lm> element.\"\"\"\n",
"        return (\n",
"            self._squash(elem_to_text(lm)),\n",
"            self._attr(lm, 'catg'),\n",
"            self._attr(lm, 'lemma'),\n",
"            self._canto_label(lm),\n",
"        )\n",
"\n",
"    @property\n",
"    def title(self):\n",
"        \"\"\"Text of ``soup.title`` without newlines, or 'na' if absent (cached).\"\"\"\n",
"        if not self._title:\n",
"            title_tag = self.soup.title\n",
"            if not title_tag:\n",
"                self._title = 'na'\n",
"            else:\n",
"                self._title = title_tag.getText().replace('\\n', '').strip()\n",
"        return self._title\n",
"\n",
"    @property\n",
"    def authors(self):\n",
"        \"\"\"List of ``Person``, one per <author> that contains a <persname>.\"\"\"\n",
"        people = []\n",
"        for author in self.soup.find_all('author'):\n",
"            persname = author.persname\n",
"            if not persname:\n",
"                continue\n",
"            people.append(Person(\n",
"                elem_to_text(persname.find('forename')),\n",
"                elem_to_text(persname.find('forename', type='middle')),\n",
"                elem_to_text(persname.surname),\n",
"            ))\n",
"        return people\n",
"\n",
"    @property\n",
"    def bibliography(self):\n",
"        \"\"\"Whitespace-normalised text of every <bibl> element.\"\"\"\n",
"        entries = []\n",
"        for bibl in self.soup.find_all('bibl'):\n",
"            if not bibl:\n",
"                continue\n",
"            entry = elem_to_text(bibl).replace('\\n', '').strip()\n",
"            # drop the space that text extraction leaves before a full stop\n",
"            entry = entry.replace(' .', '.')\n",
"            entries.append(self._squash(entry))\n",
"        return entries\n",
"\n",
"    @property\n",
"    def text(self):\n",
"        \"\"\"Space-joined text of all <div1> elements lacking a ``type`` attribute (cached).\"\"\"\n",
"        if not self._text:\n",
"            self._text = ' '.join(\n",
"                div.get_text(separator=' ', strip=True)\n",
"                for div in self.soup.body.find_all('div1')\n",
"                if not div.get('type')\n",
"            )\n",
"        return self._text\n",
"\n",
"    @property\n",
"    def orderedlemma(self):\n",
"        \"\"\"List of ``(form, category, lemma, canto, verse, position)`` tuples.\n",
"\n",
"        ``verse`` counts the <l> elements met so far and runs continuously\n",
"        over all <div1> elements (it is not restarted at each <div1>);\n",
"        ``position`` is the 1-based index of the <lm> inside its <l>.\n",
"        \"\"\"\n",
"        records = []\n",
"        verse_no = 0\n",
"        for div in self.soup.body.find_all('div1'):\n",
"            for verse in div.find_all('l'):\n",
"                verse_no += 1\n",
"                for position, lm in enumerate(verse.find_all('lm'), start=1):\n",
"                    records.append(self._lm_record(lm) + (verse_no, position))\n",
"        return records\n",
"\n",
"    @property\n",
"    def lemma(self):\n",
"        \"\"\"List of ``(form, category, lemma, canto)`` tuples, one per <lm> in the body.\"\"\"\n",
"        return [self._lm_record(lm) for lm in self.soup.body.find_all('lm')]\n",
"\n",
"    @property\n",
"    def categ_lemma(self):\n",
"        \"\"\"List of the ``catg`` attribute of every <lm> in the body.\"\"\"\n",
"        return [self._attr(lm, 'catg') for lm in self.soup.body.find_all('lm')]\n",
"\n",
"    @property\n",
"    def lemma_lemma(self):\n",
"        \"\"\"List of the ``lemma`` attribute of every <lm> in the body.\n",
"\n",
"        Unlike the record-building properties, an empty ``lemma`` attribute\n",
"        is also reported as ``_MISSING`` here.\n",
"        \"\"\"\n",
"        return [\n",
"            self._attr(lm, 'lemma', blank_is_missing=True)\n",
"            for lm in self.soup.body.find_all('lm')\n",
"        ]"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"def tei_to_csv_entry(tei_file):\n",
"    \"\"\"Parse ``tei_file`` and return its lemma data.\n",
"\n",
"    Prints a 'Handled <path>' line once the file has been parsed.\n",
"\n",
"    Args:\n",
"        tei_file: path of the TEI/XML file, passed to ``TEIFile``.\n",
"\n",
"    Returns:\n",
"        A 3-tuple ``(orderedlemma, categ_lemma, lemma_lemma)`` taken from\n",
"        the ``TEIFile`` properties of the same names.\n",
"    \"\"\"\n",
"    tei = TEIFile(tei_file)\n",
"    print(f\"Handled {tei_file}\")\n",
"    return tei.orderedlemma, tei.categ_lemma, tei.lemma_lemma"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Provo a vedere se il parser funziona\n",
"Dovrebbe stampare i record fino alla forma 'oscura' (lemma 'oscuro')."
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('Nel', 'rdms', 'il', 'Canto 1', 1, 1)\n",
"\n",
"('mezzo', 'eilaksl', 'in mezzo di', 'Canto 1', 1, 2)\n",
"\n",
"('del', 'rdms', 'il', 'Canto 1', 1, 3)\n",
"\n",
"('cammin', 'sm2ms', 'cammino', 'Canto 1', 1, 4)\n",
"\n",
"('di', 'epskg', 'di', 'Canto 1', 1, 5)\n",
"\n",
"('nostra', 'as1fs', 'nostro', 'Canto 1', 1, 6)\n",
"\n",
"('vita', 'sf1fs', 'vita', 'Canto 1', 1, 7)\n",
"\n",
"('mi', 'pf1sypr', 'mi', 'Canto 1', 2, 1)\n",
"\n",
"('ritrovai', 'vta+1irs1', 'ritrovare', 'Canto 1', 2, 2)\n",
"\n",
"('per', 'epskpl', 'per', 'Canto 1', 2, 3)\n",
"\n",
"('una', 'rifs', 'una', 'Canto 1', 2, 4)\n",
"\n",
"('selva', 'sf1fs', 'selva', 'Canto 1', 2, 5)\n",
"\n",
"('oscura', 'a1fs', 'oscuro', 'Canto 1', 2, 6)\n",
"\n",
"...\n"
]
}
],
"source": [
"tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')\n",
"bbs = tei.orderedlemma\n",
"# Print records up to and including the first word form that starts with 'oscura'.\n",
"for record in bbs:\n",
"    print(record, end=\"\\n\" * 2)\n",
"    if record[0].startswith('oscura'):\n",
"        print('...')\n",
"        break"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Carico il testo e creo una tabella\n",
"Faccio il parsing del testo e creo una tabella con 6 colonne: *Lemma*, *Categoria*, *LemmaItaliano*, *Canto*, *Verso*, *PosizioneLemmaNelVerso*."
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml\n"
]
}
],
"source": [
"# mytesto is the 3-tuple (orderedlemma, categ_lemma, lemma_lemma) returned by tei_to_csv_entry.\n",
"# NOTE(review): the absolute path is specific to the author's machine; adjust it before re-running.\n",
"mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Lemma 34280\n",
"Categoria 34280\n",
"LemmaItaliano 34280\n",
"Canto 34280\n",
"Verso 34280\n",
"PosizioneLemmaNelVerso 34280\n",
"dtype: int64"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mytesto[0] holds the orderedlemma records: one 6-field tuple per <lm> element.\n",
"testo_tabella = pd.DataFrame(\n",
"    mytesto[0],\n",
"    columns=['Lemma', 'Categoria', 'LemmaItaliano', 'Canto', 'Verso', 'PosizioneLemmaNelVerso'],\n",
")\n",
"# Non-null count per column.\n",
"testo_tabella.count()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Lemma</th>\n",
" <th>Categoria</th>\n",
" <th>LemmaItaliano</th>\n",
" <th>Canto</th>\n",
" <th>Verso</th>\n",
" <th>PosizioneLemmaNelVerso</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Nel</td>\n",
" <td>rdms</td>\n",
" <td>il</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>mezzo</td>\n",
" <td>eilaksl</td>\n",
" <td>in mezzo di</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>del</td>\n",
" <td>rdms</td>\n",
" <td>il</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>cammin</td>\n",
" <td>sm2ms</td>\n",
" <td>cammino</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>di</td>\n",
" <td>epskg</td>\n",
" <td>di</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>nostra</td>\n",
" <td>as1fs</td>\n",
" <td>nostro</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>vita</td>\n",
" <td>sf1fs</td>\n",
" <td>vita</td>\n",
" <td>Canto 1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>mi</td>\n",
" <td>pf1sypr</td>\n",
" <td>mi</td>\n",
" <td>Canto 1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>ritrovai</td>\n",
" <td>vta+1irs1</td>\n",
" <td>ritrovare</td>\n",
" <td>Canto 1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>per</td>\n",
" <td>epskpl</td>\n",
" <td>per</td>\n",
" <td>Canto 1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Lemma Categoria LemmaItaliano Canto Verso PosizioneLemmaNelVerso\n",
"0 Nel rdms il Canto 1 1 1\n",
"1 mezzo eilaksl in mezzo di Canto 1 1 2\n",
"2 del rdms il Canto 1 1 3\n",
"3 cammin sm2ms cammino Canto 1 1 4\n",
"4 di epskg di Canto 1 1 5\n",
"5 nostra as1fs nostro Canto 1 1 6\n",
"6 vita sf1fs vita Canto 1 1 7\n",
"7 mi pf1sypr mi Canto 1 2 1\n",
"8 ritrovai vta+1irs1 ritrovare Canto 1 2 2\n",
"9 per epskpl per Canto 1 2 3"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the first ten rows of the table.\n",
"testo_tabella.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}