from dataclasses import dataclass

# Placeholder stored when an <lm> element has no (or a blank) 'catg'/'lemma'
# attribute.
NON_SPEC = "non_spec"

# TEI source analysed by the driver section at the bottom of this block.
TEI_PATH = '/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml'


def read_tei(tei_file):
    """Parse the file at ``tei_file`` into a BeautifulSoup tree ('lxml' parser).

    Args:
        tei_file: path of the TEI/XML file to read.

    Returns:
        The ``BeautifulSoup`` object built from the file content.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Local import: keeps the pure helpers of this block importable even where
    # bs4 is not installed.
    from bs4 import BeautifulSoup

    # Binary mode hands the raw bytes to Beautiful Soup so that it detects the
    # document encoding itself instead of relying on the platform default.
    with open(tei_file, 'rb') as tei:
        return BeautifulSoup(tei, 'lxml')


def elem_to_text(elem, default=''):
    """Return the text of ``elem`` with its pieces stripped and space-joined.

    Args:
        elem: an object exposing ``getText(separator=..., strip=...)``, or a
            falsy value such as ``None``.
        default: value returned when ``elem`` is falsy.
    """
    if not elem:
        return default
    return elem.getText(separator=' ', strip=True)


@dataclass
class Person:
    """Name parts of one author as read from a ``persname`` element."""
    firstname: str
    middlename: str
    surname: str


def _squash(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    return " ".join(text.split())


def _attr_or_placeholder(lm, name):
    """Return attribute ``name`` of ``lm`` whitespace-normalised, or NON_SPEC
    when the attribute is missing or blank."""
    value = lm.get(name)
    if not value:
        return NON_SPEC
    return _squash(value) or NON_SPEC


def _canto_label(lm):
    """Return the leading text of the nearest enclosing ``div1`` of ``lm``.

    The label is the first child of that ``div1`` with newlines removed and
    the ends stripped. An empty string is returned when ``lm`` has no ``div1``
    ancestor or when the first child is not a text node.
    """
    div = lm.find_parent('div1')
    if div is None or not div.contents:
        return ''
    first = div.contents[0]
    if not isinstance(first, str):
        return ''
    return first.replace('\n', '').strip()


def _lemma_record(lm):
    """Build the ``(form, category, lemma, canto)`` tuple for one ``<lm>``."""
    return (
        _squash(elem_to_text(lm)),
        _attr_or_placeholder(lm, 'catg'),
        _attr_or_placeholder(lm, 'lemma'),
        _canto_label(lm),
    )


class TEIFile(object):
    """Read-only view over a TEI document parsed with BeautifulSoup.

    Attributes:
        filename: path given to the constructor.
        soup: parsed tree returned by ``read_tei``.
    """

    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None      # cache for the ``text`` property
        self._title = ''       # cache for the ``title`` property
        self._abstract = ''

    @property
    def title(self):
        """Document title with newlines removed, or ``"na"`` if there is none."""
        if not self._title:
            title_tag = self.soup.title
            if not title_tag:
                self._title = "na"
            else:
                self._title = title_tag.getText().replace('\n', '').strip()
        return self._title

    @property
    def authors(self):
        """List of ``Person`` built from every ``author`` that has a ``persname``."""
        result = []
        for author in self.soup.find_all('author'):
            persname = author.persname
            if not persname:
                continue
            result.append(Person(
                elem_to_text(persname.find("forename")),
                elem_to_text(persname.find("forename", type="middle")),
                elem_to_text(persname.surname),
            ))
        return result

    @property
    def bibliography(self):
        """Whitespace-normalised text of every ``bibl`` element, in order."""
        result = []
        for bibl in self.soup.find_all('bibl'):
            if not bibl:
                continue
            entry = elem_to_text(bibl).replace('\n', '').strip()
            entry = entry.replace(' .', '.')
            result.append(_squash(entry))
        return result

    @property
    def text(self):
        """Space-joined text of every ``div1`` in the body without a ``type``
        attribute (computed once, then cached)."""
        if self._text is None:
            self._text = " ".join(
                div.get_text(separator=' ', strip=True)
                for div in self.soup.body.find_all("div1")
                if not div.get("type")
            )
        return self._text

    @property
    def orderedlemma(self):
        """Per-token records with their position in the poem.

        Returns:
            A list of ``(form, category, lemma, canto, verse, position)``
            tuples. ``verse`` counts ``<l>`` elements across the whole body
            starting at 1 (it is not reset per ``div1``); ``position`` is the
            1-based index of the ``<lm>`` inside its verse. Missing or blank
            ``catg``/``lemma`` attributes are reported as ``"non_spec"``.
        """
        records = []
        verse_no = 0
        for div in self.soup.body.find_all("div1"):
            for verso in div.find_all('l'):
                verse_no += 1
                for position, lm in enumerate(verso.find_all("lm"), start=1):
                    records.append(_lemma_record(lm) + (verse_no, position))
        return records

    @property
    def lemma(self):
        """``(form, category, lemma, canto)`` for every ``<lm>`` in the body.

        ``canto`` is ``''`` for an ``<lm>`` that is not inside a ``div1``.
        """
        return [_lemma_record(lm) for lm in self.soup.body.find_all("lm")]

    @property
    def categ_lemma(self):
        """``catg`` attribute of every ``<lm>`` in the body (``"non_spec"``
        when missing or blank)."""
        return [_attr_or_placeholder(lm, 'catg')
                for lm in self.soup.body.find_all("lm")]

    @property
    def lemma_lemma(self):
        """``lemma`` attribute of every ``<lm>`` in the body (``"non_spec"``
        when missing or blank)."""
        return [_attr_or_placeholder(lm, 'lemma')
                for lm in self.soup.body.find_all("lm")]


def tei_to_csv_entry(tei_file):
    """Parse ``tei_file`` and return its lemma data.

    Prints ``Handled <tei_file>`` once the file has been parsed.

    Returns:
        The tuple ``(orderedlemma, categ_lemma, lemma_lemma)`` of the
        ``TEIFile`` built from ``tei_file``.
    """
    tei = TEIFile(tei_file)
    print(f"Handled {tei_file}")
    return tei.orderedlemma, tei.categ_lemma, tei.lemma_lemma


# The driver below needs the TEI file on disk. The guard is true inside a
# notebook kernel, so the cells still execute there, while importing these
# definitions from elsewhere does not touch the file system.
if __name__ == "__main__":
    # Check that the parser works: print the records one by one and stop at
    # the first form that starts with 'oscura'.
    tei = TEIFile(TEI_PATH)
    bbs = tei.orderedlemma
    for record in bbs:
        print(record, end="\n" * 2)
        if record[0].startswith('oscura'):
            print('...')
            break

    # Load the text and build a table: parse the text to obtain the records
    # for a table with 6 columns (form, category, Italian lemma, canto, verse,
    # position of the lemma in the verse).
    mytesto = tei_to_csv_entry(TEI_PATH)
# mytesto is the triple returned by tei_to_csv_entry; its first element holds
# the ordered per-token records (one 6-tuple per token).
ordered_records = mytesto[0]
data = [ordered_records]

# Same records without column labels.
dfObj = pd.DataFrame(ordered_records)

# Labelled table: one row per token, columns in the order of the tuple fields.
column_names = [
    'Lemma',
    'Categoria',
    'LemmaItaliano',
    'Canto',
    'Verso',
    'PosizioneLemmaNelVerso',
]
testo_tabella = pd.DataFrame(ordered_records, columns=column_names)

# Number of non-null values per column (displayed as the cell output).
testo_tabella.count()
\n", " | Lemma | \n", "Categoria | \n", "LemmaItaliano | \n", "Canto | \n", "Verso | \n", "PosizioneLemmaNelVerso | \n", "
---|---|---|---|---|---|---|
0 | \n", "Nel | \n", "rdms | \n", "il | \n", "Canto 1 | \n", "1 | \n", "1 | \n", "
1 | \n", "mezzo | \n", "eilaksl | \n", "in mezzo di | \n", "Canto 1 | \n", "1 | \n", "2 | \n", "
2 | \n", "del | \n", "rdms | \n", "il | \n", "Canto 1 | \n", "1 | \n", "3 | \n", "
3 | \n", "cammin | \n", "sm2ms | \n", "cammino | \n", "Canto 1 | \n", "1 | \n", "4 | \n", "
4 | \n", "di | \n", "epskg | \n", "di | \n", "Canto 1 | \n", "1 | \n", "5 | \n", "
5 | \n", "nostra | \n", "as1fs | \n", "nostro | \n", "Canto 1 | \n", "1 | \n", "6 | \n", "
6 | \n", "vita | \n", "sf1fs | \n", "vita | \n", "Canto 1 | \n", "1 | \n", "7 | \n", "
7 | \n", "mi | \n", "pf1sypr | \n", "mi | \n", "Canto 1 | \n", "2 | \n", "1 | \n", "
8 | \n", "ritrovai | \n", "vta+1irs1 | \n", "ritrovare | \n", "Canto 1 | \n", "2 | \n", "2 | \n", "
9 | \n", "per | \n", "epskpl | \n", "per | \n", "Canto 1 | \n", "2 | \n", "3 | \n", "