Eliminare 'CItationDHres.ipynb'

2021-12-14 08:22:41 +01:00 · 2021-12-14 08:22:41 +01:00 · a21eef6d94
parent 1fefe236ac
commit a21eef6d94
1 changed files with 0 additions and 809 deletions
--- a/CItationDHres.ipynb
+++ b/CItationDHres.ipynb
@ -1,809 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import ast\n",
-    "import sys\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "from bokeh.io import output_notebook, show\n",
-    "from bokeh.plotting import figure\n",
-    "from bs4 import BeautifulSoup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def read_tei(tei_file):\n",
-    "    with open(tei_file, 'r') as tei:\n",
-    "        soup = BeautifulSoup(tei, 'lxml')\n",
-    "        return soup\n",
-    "    raise RuntimeError('Cannot generate a soup from the input')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def elem_to_text(elem, default=''):\n",
-    "    if elem:\n",
-    "        return elem.getText(separator=' ', strip=True)\n",
-    "    else:\n",
-    "        return default"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dataclasses import dataclass\n",
-    "\n",
-    "@dataclass\n",
-    "class Person:\n",
-    "    firstname: str\n",
-    "    middlename: str\n",
-    "    surname: str"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class TEIFile(object):\n",
-    "    def __init__(self, filename):\n",
-    "        self.filename = filename\n",
-    "        self.soup = read_tei(filename)\n",
-    "        self._text = None\n",
-    "        self._title = ''\n",
-    "        self._abstract = ''\n",
-    "\n",
-    "    @property\n",
-    "    def doi(self):\n",
-    "        idno_elem = self.soup.find('idno', type='DOI')\n",
-    "        if not idno_elem:\n",
-    "            return ''\n",
-    "        else:\n",
-    "            return idno_elem.getText()\n",
-    "\n",
-    "    @property\n",
-    "    def title(self):\n",
-    "        if not self._title:\n",
-    "            if  not self.soup.title:\n",
-    "                self._title = \"na\"\n",
-    "            else:\n",
-    "                self._title = self.soup.title.getText()\n",
-    "        return self._title\n",
-    "\n",
-    "    @property\n",
-    "    def abstract(self):\n",
-    "        if not self._abstract:\n",
-    "            abstract = self.soup.abstract.getText(separator=' ', strip=True)\n",
-    "            self._abstract = abstract\n",
-    "        return self._abstract\n",
-    "\n",
-    "    @property\n",
-    "    def authors(self):\n",
-    "        #authors_in_header = self.soup.analytic.find_all('author')\n",
-    "        authors_in_header = self.soup.find_all('author')\n",
-    "\n",
-    "        result = []\n",
-    "        for author in authors_in_header:\n",
-    "            persname = author.persname\n",
-    "            if not persname:\n",
-    "                continue\n",
-    "            firstname = elem_to_text(persname.find(\"forename\"))#, type=\"first\"))\n",
-    "            middlename = elem_to_text(persname.find(\"forename\", type=\"middle\"))\n",
-    "            surname = elem_to_text(persname.surname)\n",
-    "            person = Person(firstname, middlename, surname)\n",
-    "            result.append(person)\n",
-    "        return result\n",
-    "    \n",
-    "    @property\n",
-    "    def bibliography(self):\n",
-    "        bibliography = self.soup.find_all('bibl')\n",
-    "        result = []\n",
-    "        for bibl in bibliography:\n",
-    "            if not bibl:\n",
-    "                continue\n",
-    "            \n",
-    "            result.append(elem_to_text(bibl))\n",
-    "        return result\n",
-    "    \n",
-    "    @property\n",
-    "    def text(self):\n",
-    "        if not self._text:\n",
-    "            divs_text = []\n",
-    "            for div in self.soup.body.find_all(\"div\"):\n",
-    "                # div is neither an appendix nor references, just plain text.\n",
-    "                if not div.get(\"type\"):\n",
-    "                    div_text = div.get_text(separator=' ', strip=True)\n",
-    "                    divs_text.append(div_text)\n",
-    "\n",
-    "            plain_text = \" \".join(divs_text)\n",
-    "            self._text = plain_text\n",
-    "        return self._text"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import multiprocessing"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from os.path import basename, splitext\n",
-    "\n",
-    "def basename_without_ext(path):\n",
-    "    base_name = basename(path)\n",
-    "    stem, ext = splitext(base_name)\n",
-    "    if stem.endswith('.tei'):\n",
-    "        # Return base name without tei file\n",
-    "        return stem[0:-4]\n",
-    "    else:\n",
-    "        return stem"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def tei_to_csv_entry(tei_file):\n",
-    "    tei = TEIFile(tei_file)\n",
-    "    print(f\"Handled {tei_file}\")\n",
-    "    base_name = basename_without_ext(tei_file)\n",
-    "    return base_name, tei.authors, tei.title, tei.bibliography#, tei.abstract"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import glob\n",
-    "from pathlib import Path\n",
-    "papers15 = sorted(Path(\"/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2015/\").glob('*.xml'))\n",
-    "papers16 = sorted(Path(\"/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2016/\").glob('*.xml'))\n",
-    "papers17 = sorted(Path(\"/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2017/\").glob('*.xml'))\n",
-    "papers18 = sorted(Path(\"/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2018/\").glob('*.xml'))\n",
-    "papers19 = sorted(Path(\"/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2019/\").glob('*.xml'))\n",
-    "papers20 = sorted(Path(\"/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2020/\").glob('*.xml'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from multiprocessing.pool import Pool\n",
-    "pool = Pool()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Import the DH conference papers (2015-2020)\n",
-    "\n",
-    "The papers are downloaded from https://github.com/lehkost/ToolXtractor/"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "csv_entries15 = pool.map(tei_to_csv_entry, papers15)\n",
-    "csv_entries16 = pool.map(tei_to_csv_entry, papers16)\n",
-    "csv_entries17 = pool.map(tei_to_csv_entry, papers17)\n",
-    "csv_entries18 = pool.map(tei_to_csv_entry, papers18)\n",
-    "csv_entries19 = pool.map(tei_to_csv_entry, papers19)\n",
-    "csv_entries20 = pool.map(tei_to_csv_entry, papers20)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 84,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "ID              2359\n",
-       "Authors         2359\n",
-       "Title           2359\n",
-       "Bibliography    2359\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 84,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "result_csv0 = pd.DataFrame(csv_entries15, columns=['ID', 'Authors', 'Title', 'Bibliography'])\n",
-    "result_csv1 = result_csv0.append(pd.DataFrame(csv_entries16, columns=['ID', 'Authors', 'Title', 'Bibliography']))\n",
-    "result_csv2 = result_csv1.append(pd.DataFrame(csv_entries17, columns=['ID', 'Authors', 'Title', 'Bibliography']))\n",
-    "result_csv3 = result_csv2.append(pd.DataFrame(csv_entries18, columns=['ID', 'Authors', 'Title', 'Bibliography']))\n",
-    "result_csv4 = result_csv3.append(pd.DataFrame(csv_entries19, columns=['ID', 'Authors', 'Title', 'Bibliography']))\n",
-    "result_csv = result_csv4.append(pd.DataFrame(csv_entries20, columns=['ID', 'Authors', 'Title', 'Bibliography']))\n",
-    "result_csv.count()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Select the papers having the TEI \\<bibl\\>   elements.\n",
-    "\n",
-    "The \\<bibl\\> element (bibliographic citation) contains a loosely-structured bibliographic citation of which the sub-components may or may not be explicitly tagged. There are 1195 papers having this element, and in total they contain 11746 citations."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 85,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "ID              1195\n",
-       "Authors         1195\n",
-       "Title           1195\n",
-       "Bibliography    1195\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 85,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "test_csv=result_csv[result_csv['Bibliography'].str.len()>0]\n",
-    "test_csv.count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 86,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "ID              11746\n",
-       "Title           11746\n",
-       "Bibliography    11746\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 86,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# all citations\n",
-    "my_df=test_csv[['ID','Title','Bibliography']]\n",
-    "my_exp_df=my_df.explode('Bibliography')\n",
-    "my_exp_df.count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 87,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "134"
-      ]
-     },
-     "execution_count": 87,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#Curiosity: there are at least 134 references cited more than once\n",
-    "df_p_d=my_exp_df[my_exp_df.duplicated(['Bibliography'], keep=\"last\")].sort_values('Bibliography')\n",
-    "df_p_d['Bibliography'].drop_duplicates().count()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Citations with DOI\n",
-    "There are 821 (of 11746) citations with a DOI"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 88,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Reference    821\n",
-       "DOI          821\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 88,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "import re\n",
-    "regex = re.compile(r'\\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![\"&\\'<>])\\S)+)\\b', re.IGNORECASE)\n",
-    "df_refs=my_exp_df.Bibliography.values\n",
-    "df_refs_with_doi = pd.DataFrame(columns=[\"Reference\", \"DOI\"])\n",
-    "references=[]\n",
-    "DOIs=[]\n",
-    "for reference in df_refs:\n",
-    "    mydoi=re.search(regex, reference)\n",
-    "    if mydoi:\n",
-    "        references.append(reference);\n",
-    "        DOIs.append(mydoi[1]);\n",
-    "df_refs_with_doi['Reference']=references;\n",
-    "df_refs_with_doi['DOI']=DOIs;\n",
-    "df_refs_with_doi.count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 89,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Reference</th>\n",
-       "      <th>DOI</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Byrne, G., and Goddard, L. (2010). The Stronge...</td>\n",
-       "      <td>10.1045/november2010-byrne</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Lampert, C. K., and Southwick, S. B. (2013). L...</td>\n",
-       "      <td>10.1080/19386389.2013.826095</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Singer, R. (2009). Linked Library Data Now! Jo...</td>\n",
-       "      <td>10.1080/19411260903035809</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Thomas, L. and Solomon, D. (2014). Active User...</td>\n",
-       "      <td>10.1353/cea.2014.0014</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Farquhar, A. and Baker, J. (2014). Interoperab...</td>\n",
-       "      <td>10.6084/m9.figshare.1092550%20</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                           Reference  \\\n",
-       "0  Byrne, G., and Goddard, L. (2010). The Stronge...   \n",
-       "1  Lampert, C. K., and Southwick, S. B. (2013). L...   \n",
-       "2  Singer, R. (2009). Linked Library Data Now! Jo...   \n",
-       "3  Thomas, L. and Solomon, D. (2014). Active User...   \n",
-       "4  Farquhar, A. and Baker, J. (2014). Interoperab...   \n",
-       "\n",
-       "                              DOI  \n",
-       "0      10.1045/november2010-byrne  \n",
-       "1    10.1080/19386389.2013.826095  \n",
-       "2       10.1080/19411260903035809  \n",
-       "3           10.1353/cea.2014.0014  \n",
-       "4  10.6084/m9.figshare.1092550%20  "
-      ]
-     },
-     "execution_count": 89,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#Example: five citations that have DOIs\n",
-    "df_refs_with_doi.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Retrieve citation DOIs using CrossRef API\n",
-    "The Crossref API allows querying its database with input strings that contain bibliographic references. The reference string does not necessarily need to be a well-formed reference. The input string is parsed by Crossref using machine learning techniques, and the system tries to match the reference string with the metadata stored in the database. \n",
-    "\n",
-    "An important feature of the Crossref API is the confidence score that it returns alongside the document’s metadata. For each request, the Crossref score indicates how confident the service is about the entities retrieved: if the score is high, the retrieved metadata are probably the correct ones; if the score is low, the retrieved metadata might be the wrong ones."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "As a first test, we use the Crossref API to check the citations that already have DOIs; we chose '115' as the minimum score value (only matches scoring above it are kept)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import urllib.request, json\n",
-    "references=df_refs_with_doi['Reference'];\n",
-    "df_citations16 = pd.DataFrame(columns=[\"Orig\", \"Crossref\", \"DOI\"])\n",
-    "originalCitations=[]\n",
-    "crossrefCitations=[]\n",
-    "DOIs=[]\n",
-    "score=[]\n",
-    "i=0;\n",
-    "j =0;\n",
-    "for cite in references:\n",
-    "    cit=urllib.parse.quote_plus(cite)\n",
-    "    try:\n",
-    "        with urllib.request.urlopen(\"https://api.crossref.org/works?query.bibliographic=\"+cit+\"&sort=score&mailto=cesare.concordia@gmail.com#\") as url:\n",
-    "            data16 = json.loads(url.read().decode())\n",
-    "            j=j+1\n",
-    "            if (j%25 == 0):\n",
-    "                print(f\"{j}, ({i})\")\n",
-    "            if (len(data16[\"message\"][\"items\"])>0) and (data16[\"message\"][\"items\"][0]['score'] >115):\n",
-    "                originalCitations.append(cite)\n",
-    "                crossrefCitations.append( data16[\"message\"][\"items\"][0])\n",
-    "                DOIs.append(data16[\"message\"][\"items\"][0]['DOI'])\n",
-    "                score.append(data16[\"message\"][\"items\"][0]['score'])\n",
-    "                i=i+1\n",
-    "                #print(f\"{i} found, out of {j}\")\n",
-    "            if (j>1000):\n",
-    "                break\n",
-    "    except urllib.error.URLError:\n",
-    "        print(cit)\n",
-    "    except urllib.error.HTTPError:\n",
-    "        print(cit)\n",
-    "        \n",
-    "df_citations16[\"Orig\"] = originalCitations\n",
-    "df_citations16[\"Crossref\"] = crossrefCitations\n",
-    "df_citations16[\"DOI\"] = DOIs\n",
-    "df_citations16[\"Score\"] = score\n",
-    "df_citations16.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 90,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Orig        327\n",
-       "Crossref    327\n",
-       "DOI_CR      327\n",
-       "Score       327\n",
-       "DOI         327\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 90,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_cit_datasets=df_citations16.join(df_refs_with_doi.set_index('Reference'), on='Orig', lsuffix='_CR')\n",
-    "df_cit_datasets.count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 91,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Orig        278\n",
-       "Crossref    278\n",
-       "DOI         278\n",
-       "Score       278\n",
-       "dtype: int64"
-      ]
-     },
-     "execution_count": 91,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "#remove duplicates\n",
-    "test=df_citations16\n",
-    "df_temp_dois=test.drop_duplicates(['DOI'])\n",
-    "df_temp_dois.count()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 92,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "278"
-      ]
-     },
-     "execution_count": 92,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_dois=df_temp_dois[df_temp_dois['DOI'] != '']\n",
-    "df_dois_values=df_dois.DOI.values\n",
-    "df_dois_values.size"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_cn_citations = pd.DataFrame (columns = ['doi','cn_citation'])\n",
-    "import requests\n",
-    "#headers_dict = {\"Accept\": \"application/x-bibtex\"}\n",
-    "headers_dict = {\"Accept\": \"text/x-bibliography\", \"locale\":\"en-EN\"}\n",
-    "for var in df_dois_values:\n",
-    "    if ( var != \"\" and var!=None):\n",
-    "        print(var)\n",
-    "        try:\n",
-    "            r =requests.get(\"http://doi.org/\"+var, headers=headers_dict, timeout=20)\n",
-    "           # print(\"result: \"+r.content.decode(\"utf-8\"))\n",
-    "            df_cn_citations = df_cn_citations.append({'doi': var, 'cn_citation': r.content.decode(\"utf-8\")}, ignore_index=True)\n",
-    "        except requests.exceptions.ConnectionError:\n",
-    "          #  print(var)\n",
-    "            df_cn_citations = df_cn_citations.append({'doi': var, 'cn_citation': int(503)}, ignore_index=True)\n",
-    "        except requests.exceptions.ConnectTimeout:\n",
-    "          #  print(var)\n",
-    "            df_cn_citations = df_cn_citations.append({'doi': var, 'cn_citation': int(408)}, ignore_index=True)\n",
-    "        except requests.exceptions.ReadTimeout:\n",
-    "            df_cn_citations = df_cn_citations.append({'doi': var, 'cn_citation': int(408)}, ignore_index=True)\n",
-    "    else:\n",
-    "       # print(var ,0)\n",
-    "        df_cn_citations = df_cn_citations.append({'url': doi, 'cn_citation': int(400)}, ignore_index=True)\n",
-    "df_cn_citations.head(10)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 93,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>Orig</th>\n",
-       "      <th>DOI</th>\n",
-       "      <th>cn_citation</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Lampert, C. K., and Southwick, S. B. (2013). L...</td>\n",
-       "      <td>10.1080/19386389.2013.826095</td>\n",
-       "      <td>Lampert, C. K., &amp; Southwick, S. B. (2013). Lea...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>Thomas, L. and Solomon, D. (2014). Active User...</td>\n",
-       "      <td>10.1353/cea.2014.0014</td>\n",
-       "      <td>Thomas, L., &amp; Solomon, D. (2014). Active Users...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>Omid, M. (2011). Design of an Expert System fo...</td>\n",
-       "      <td>10.1016/j.eswa.2010.09.103</td>\n",
-       "      <td>Omid, M. (2011). Design of an expert system fo...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Trelea , I. C. (2003). The Particle Swarm Opti...</td>\n",
-       "      <td>10.1016/s0020-0190(02)00447-7</td>\n",
-       "      <td>Trelea, I. C. (2003). The particle swarm optim...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Kenderdine, S. (2013). ‘Pure Land’: Inhabiting...</td>\n",
-       "      <td>10.1111/cura.12020</td>\n",
-       "      <td>Kenderdine, S. (2013). “Pure Land”: Inhabiting...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>Haentjens Dekker , R., van Hulle , D. , Middel...</td>\n",
-       "      <td>10.1093/llc/fqu007</td>\n",
-       "      <td>Haentjens Dekker, R., van Hulle, D., Middell, ...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                Orig  \\\n",
-       "0  Lampert, C. K., and Southwick, S. B. (2013). L...   \n",
-       "1  Thomas, L. and Solomon, D. (2014). Active User...   \n",
-       "2  Omid, M. (2011). Design of an Expert System fo...   \n",
-       "3  Trelea , I. C. (2003). The Particle Swarm Opti...   \n",
-       "4  Kenderdine, S. (2013). ‘Pure Land’: Inhabiting...   \n",
-       "5  Haentjens Dekker , R., van Hulle , D. , Middel...   \n",
-       "\n",
-       "                             DOI  \\\n",
-       "0   10.1080/19386389.2013.826095   \n",
-       "1          10.1353/cea.2014.0014   \n",
-       "2     10.1016/j.eswa.2010.09.103   \n",
-       "3  10.1016/s0020-0190(02)00447-7   \n",
-       "4             10.1111/cura.12020   \n",
-       "5             10.1093/llc/fqu007   \n",
-       "\n",
-       "                                         cn_citation  \n",
-       "0  Lampert, C. K., & Southwick, S. B. (2013). Lea...  \n",
-       "1  Thomas, L., & Solomon, D. (2014). Active Users...  \n",
-       "2  Omid, M. (2011). Design of an expert system fo...  \n",
-       "3  Trelea, I. C. (2003). The particle swarm optim...  \n",
-       "4  Kenderdine, S. (2013). “Pure Land”: Inhabiting...  \n",
-       "5  Haentjens Dekker, R., van Hulle, D., Middell, ...  "
-      ]
-     },
-     "execution_count": 93,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_cit_datasets=df_citations16.join(df_cn_citations.set_index('doi'), on='DOI')\n",
-    "df_cit_datasets[['Orig', 'DOI', 'cn_citation']].head(6)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 94,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Index(['Orig', 'Score', 'DOI', 'cn_citation'], dtype='object')"
-      ]
-     },
-     "execution_count": 94,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df_cit_table=df_cit_datasets[['Orig', 'Score', 'DOI', 'cn_citation']]\n",
-    "df_cit_table.columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import plotly.graph_objects as go\n",
-    "fig = go.Figure(data=[go.Table(header=dict(values=list(df_cit_table.columns), line=dict(color='black')), \n",
-    "                               cells=dict(values=[df_cit_table.Orig, df_cit_table.Score, df_cit_table.DOI, df_cit_table['cn_citation'] ]))])\n",
-    "fig.show(\"notebook\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}