{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import sys\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "# importing useful Python utility libraries we'll need\n",
    "from collections import Counter, defaultdict\n",
    "import itertools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xml.etree.ElementTree as ET"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tree = ET.parse('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#root = tree.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_tei(tei_file):\n",
    "    \"\"\"Parse a TEI/XML file into a BeautifulSoup tree.\n",
    "\n",
    "    Args:\n",
    "        tei_file: Path of the file to read (opened in text mode).\n",
    "\n",
    "    Returns:\n",
    "        The BeautifulSoup object built with the 'lxml' parser.\n",
    "    \"\"\"\n",
    "    # The file only has to stay open while the soup is being built.\n",
    "    with open(tei_file, 'r') as tei:\n",
    "        soup = BeautifulSoup(tei, 'lxml')\n",
    "    return soup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def elem_to_text(elem, default=''):\n",
    "    \"\"\"Return the text of ``elem`` joined with single spaces, or ``default``.\n",
    "\n",
    "    ``default`` is returned whenever ``elem`` is falsy (for instance ``None``).\n",
    "    \"\"\"\n",
    "    if not elem:\n",
    "        return default\n",
    "    return elem.getText(separator=' ', strip=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dataclasses import dataclass\n",
    "\n",
    "@dataclass\n",
    "class Person:\n",
    "    \"\"\"Name parts of one author found in a TEI document.\"\"\"\n",
    "\n",
    "    # Text of the author's first <forename> element ('' when absent).\n",
    "    firstname: str\n",
    "    # Text of the <forename type=\"middle\"> element ('' when absent).\n",
    "    middlename: str\n",
    "    # Text of the <surname> element ('' when absent).\n",
    "    surname: str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "class TEIFile(object):\n",
    "    \"\"\"Accessors over a TEI/XML document parsed with BeautifulSoup.\n",
    "\n",
    "    The file is parsed once, in the constructor, through ``read_tei``; every\n",
    "    property then reads from ``self.soup``.\n",
    "    \"\"\"\n",
    "\n",
    "    # Placeholder stored when an <lm> attribute or the enclosing <div1> is missing.\n",
    "    _MISSING = \"non_spec\"\n",
    "\n",
    "    def __init__(self, filename):\n",
    "        \"\"\"Parse ``filename`` and initialise the cached fields.\"\"\"\n",
    "        self.filename = filename\n",
    "        self.soup = read_tei(filename)\n",
    "        self._text = None\n",
    "        self._title = ''\n",
    "        self._abstract = ''\n",
    "\n",
    "    @staticmethod\n",
    "    def _squash(value):\n",
    "        \"\"\"Strip ``value`` and collapse each whitespace run to one space.\"\"\"\n",
    "        return \" \".join(value.split())\n",
    "\n",
    "    @classmethod\n",
    "    def _attr(cls, lm, name):\n",
    "        \"\"\"Return attribute ``name`` of ``lm``, or the placeholder when absent.\"\"\"\n",
    "        value = lm.get(name)\n",
    "        return cls._MISSING if value is None else value\n",
    "\n",
    "    @classmethod\n",
    "    def _canto_of(cls, lm):\n",
    "        \"\"\"Return the cleaned first child of the nearest <div1> around ``lm``.\"\"\"\n",
    "        div1 = next((p for p in lm.parents if p.name == 'div1'), None)\n",
    "        if div1 is None:\n",
    "            # No enclosing <div1>: report the placeholder instead of failing\n",
    "            # or reusing the value computed for a previous element.\n",
    "            return cls._MISSING\n",
    "        first = div1.contents[0]\n",
    "        # Text nodes are str subclasses; any other node contributes its text.\n",
    "        heading = first if isinstance(first, str) else first.get_text()\n",
    "        return heading.replace('\\n', '').strip()\n",
    "\n",
    "    @classmethod\n",
    "    def _lm_record(cls, lm):\n",
    "        \"\"\"Return ``(form, category, lemma, canto)`` for one <lm> element.\"\"\"\n",
    "        return (\n",
    "            cls._squash(elem_to_text(lm)),\n",
    "            cls._squash(cls._attr(lm, 'catg')),\n",
    "            cls._squash(cls._attr(lm, 'lemma')),\n",
    "            cls._canto_of(lm),\n",
    "        )\n",
    "\n",
    "    @property\n",
    "    def title(self):\n",
    "        \"\"\"Text of <title> without newlines, or ``\"na\"`` when there is none.\n",
    "\n",
    "        The value is cached after the first access.\n",
    "        \"\"\"\n",
    "        if not self._title:\n",
    "            if not self.soup.title:\n",
    "                self._title = \"na\"\n",
    "            else:\n",
    "                self._title = self.soup.title.getText().replace('\\n', '').strip()\n",
    "        return self._title\n",
    "\n",
    "    @property\n",
    "    def authors(self):\n",
    "        \"\"\"List of ``Person`` objects, one per <author> that has a <persname>.\"\"\"\n",
    "        result = []\n",
    "        for author in self.soup.find_all('author'):\n",
    "            persname = author.persname\n",
    "            if not persname:\n",
    "                continue\n",
    "            firstname = elem_to_text(persname.find(\"forename\"))\n",
    "            middlename = elem_to_text(persname.find(\"forename\", type=\"middle\"))\n",
    "            surname = elem_to_text(persname.surname)\n",
    "            result.append(Person(firstname, middlename, surname))\n",
    "        return result\n",
    "\n",
    "    @property\n",
    "    def bibliography(self):\n",
    "        \"\"\"List with the whitespace-normalised text of every <bibl> element.\"\"\"\n",
    "        result = []\n",
    "        for bibl in self.soup.find_all('bibl'):\n",
    "            if not bibl:\n",
    "                continue\n",
    "            entry = elem_to_text(bibl).replace('\\n', '').strip()\n",
    "            # Remove the space that text extraction leaves before a full stop.\n",
    "            entry = entry.replace(' .', '.')\n",
    "            result.append(self._squash(entry))\n",
    "        return result\n",
    "\n",
    "    @property\n",
    "    def text(self):\n",
    "        \"\"\"Space-joined text of every <div1> in the body that has no ``type``.\n",
    "\n",
    "        The value is cached after the first access.\n",
    "        \"\"\"\n",
    "        if self._text is None:\n",
    "            self._text = \" \".join(\n",
    "                div.get_text(separator=' ', strip=True)\n",
    "                for div in self.soup.body.find_all(\"div1\")\n",
    "                if not div.get(\"type\")\n",
    "            )\n",
    "        return self._text\n",
    "\n",
    "    @property\n",
    "    def orderedlemma(self):\n",
    "        \"\"\"List of ``(form, category, lemma, canto, verse, position)`` tuples.\n",
    "\n",
    "        ``verse`` counts the <l> elements continuously over all <div1>\n",
    "        elements (it is not reset per <div1>); ``position`` restarts at 1\n",
    "        inside every <l>.\n",
    "        \"\"\"\n",
    "        records = []\n",
    "        verse_no = 0\n",
    "        for div in self.soup.body.find_all(\"div1\"):\n",
    "            for verso in div.find_all('l'):\n",
    "                verse_no += 1\n",
    "                for position, lm in enumerate(verso.find_all(\"lm\"), start=1):\n",
    "                    records.append(self._lm_record(lm) + (verse_no, position))\n",
    "        return records\n",
    "\n",
    "    @property\n",
    "    def lemma(self):\n",
    "        \"\"\"List of ``(form, category, lemma, canto)`` tuples, one per <lm>.\"\"\"\n",
    "        return [self._lm_record(lm) for lm in self.soup.body.find_all(\"lm\")]\n",
    "\n",
    "    @property\n",
    "    def categ_lemma(self):\n",
    "        \"\"\"List with the normalised ``catg`` attribute of every <lm>.\n",
    "\n",
    "        An <lm> without ``catg`` contributes the placeholder.\n",
    "        \"\"\"\n",
    "        return [\n",
    "            self._squash(self._attr(lm, 'catg'))\n",
    "            for lm in self.soup.body.find_all(\"lm\")\n",
    "        ]\n",
    "\n",
    "    @property\n",
    "    def lemma_lemma(self):\n",
    "        \"\"\"List with the normalised ``lemma`` attribute of every <lm>.\n",
    "\n",
    "        An <lm> whose ``lemma`` is missing or empty contributes the placeholder.\n",
    "        \"\"\"\n",
    "        return [\n",
    "            self._squash(lm.get('lemma') or self._MISSING)\n",
    "            for lm in self.soup.body.find_all(\"lm\")\n",
    "        ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tei_to_csv_entry(tei_file):\n",
    "    \"\"\"Parse ``tei_file`` and return its lemma data.\n",
    "\n",
    "    Args:\n",
    "        tei_file: Path of the TEI/XML file to parse.\n",
    "\n",
    "    Returns:\n",
    "        The tuple ``(orderedlemma, categ_lemma, lemma_lemma)`` read from the\n",
    "        ``TEIFile`` properties of the same names.\n",
    "    \"\"\"\n",
    "    tei = TEIFile(tei_file)\n",
    "    print(f\"Handled {tei_file}\")\n",
    "    return tei.orderedlemma, tei.categ_lemma, tei.lemma_lemma"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Provo a vedere se il parser funziona\n",
    "Dovrebbe arrivare sino alla parola 'oscura' (lemma 'oscuro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('Nel', 'rdms', 'il', 'Canto 1', 1, 1)\n",
      "\n",
      "('mezzo', 'eilaksl', 'in mezzo di', 'Canto 1', 1, 2)\n",
      "\n",
      "('del', 'rdms', 'il', 'Canto 1', 1, 3)\n",
      "\n",
      "('cammin', 'sm2ms', 'cammino', 'Canto 1', 1, 4)\n",
      "\n",
      "('di', 'epskg', 'di', 'Canto 1', 1, 5)\n",
      "\n",
      "('nostra', 'as1fs', 'nostro', 'Canto 1', 1, 6)\n",
      "\n",
      "('vita', 'sf1fs', 'vita', 'Canto 1', 1, 7)\n",
      "\n",
      "('mi', 'pf1sypr', 'mi', 'Canto 1', 2, 1)\n",
      "\n",
      "('ritrovai', 'vta+1irs1', 'ritrovare', 'Canto 1', 2, 2)\n",
      "\n",
      "('per', 'epskpl', 'per', 'Canto 1', 2, 3)\n",
      "\n",
      "('una', 'rifs', 'una', 'Canto 1', 2, 4)\n",
      "\n",
      "('selva', 'sf1fs', 'selva', 'Canto 1', 2, 5)\n",
      "\n",
      "('oscura', 'a1fs', 'oscuro', 'Canto 1', 2, 6)\n",
      "\n",
      "...\n"
     ]
    }
   ],
   "source": [
    "tei = TEIFile('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')\n",
    "bbs = tei.orderedlemma\n",
    "# Print the records one by one and stop after the first form starting with 'oscura'.\n",
    "for record in bbs:\n",
    "    print(record, end=\"\\n\\n\")\n",
    "    if record[0].startswith('oscura'):\n",
    "        print('...')\n",
    "        break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Carico il testo e creo una tabella\n",
    "Faccio il parsing del testo e creo una tabella con 6 colonne: *Lemma, Categoria, LemmaItaliano, Canto, Verso, PosizioneLemmaNelVerso*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Handled /Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml\n"
     ]
    }
   ],
   "source": [
    "# tei_to_csv_entry returns the tuple (orderedlemma, categ_lemma, lemma_lemma).\n",
    "mytesto=tei_to_csv_entry('/Users/cesare/Projects/hdn/triple/DanteTriple/xml/DanteSearch/grammaticale/inferno_forparsing.xml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Lemma                     34280\n",
       "Categoria                 34280\n",
       "LemmaItaliano             34280\n",
       "Canto                     34280\n",
       "Verso                     34280\n",
       "PosizioneLemmaNelVerso    34280\n",
       "dtype: int64"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# mytesto[0] holds the ordered lemma records; each record is a 6-item tuple\n",
    "# whose items map, in order, onto the columns below.\n",
    "testo_tabella = pd.DataFrame(\n",
    "    mytesto[0],\n",
    "    columns=['Lemma', 'Categoria', 'LemmaItaliano', 'Canto', 'Verso', 'PosizioneLemmaNelVerso'],\n",
    ")\n",
    "testo_tabella.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Lemma</th>\n",
       "      <th>Categoria</th>\n",
       "      <th>LemmaItaliano</th>\n",
       "      <th>Canto</th>\n",
       "      <th>Verso</th>\n",
       "      <th>PosizioneLemmaNelVerso</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Nel</td>\n",
       "      <td>rdms</td>\n",
       "      <td>il</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>mezzo</td>\n",
       "      <td>eilaksl</td>\n",
       "      <td>in mezzo di</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>del</td>\n",
       "      <td>rdms</td>\n",
       "      <td>il</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>cammin</td>\n",
       "      <td>sm2ms</td>\n",
       "      <td>cammino</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>di</td>\n",
       "      <td>epskg</td>\n",
       "      <td>di</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>nostra</td>\n",
       "      <td>as1fs</td>\n",
       "      <td>nostro</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>vita</td>\n",
       "      <td>sf1fs</td>\n",
       "      <td>vita</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>mi</td>\n",
       "      <td>pf1sypr</td>\n",
       "      <td>mi</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>ritrovai</td>\n",
       "      <td>vta+1irs1</td>\n",
       "      <td>ritrovare</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>per</td>\n",
       "      <td>epskpl</td>\n",
       "      <td>per</td>\n",
       "      <td>Canto 1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Lemma  Categoria LemmaItaliano    Canto  Verso  PosizioneLemmaNelVerso\n",
       "0       Nel       rdms            il  Canto 1      1                       1\n",
       "1     mezzo    eilaksl   in mezzo di  Canto 1      1                       2\n",
       "2       del       rdms            il  Canto 1      1                       3\n",
       "3    cammin      sm2ms       cammino  Canto 1      1                       4\n",
       "4        di      epskg            di  Canto 1      1                       5\n",
       "5    nostra      as1fs        nostro  Canto 1      1                       6\n",
       "6      vita      sf1fs          vita  Canto 1      1                       7\n",
       "7        mi    pf1sypr            mi  Canto 1      2                       1\n",
       "8  ritrovai  vta+1irs1     ritrovare  Canto 1      2                       2\n",
       "9       per     epskpl           per  Canto 1      2                       3"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first ten rows of the table.\n",
    "testo_tabella.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}