sshoc-skosmapping/CItationDHres.ipynb

25 KiB
Raw Blame History

In [ ]:
import ast
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bs4 import BeautifulSoup
In [ ]:
def read_tei(tei_file):
    """Parse a TEI XML file and return its BeautifulSoup tree.

    Fix: in the original, the ``raise`` was placed after the ``return``
    inside the ``with`` block and was therefore unreachable. The error is
    now raised when the file cannot be opened or read.
    """
    try:
        with open(tei_file, 'r') as tei:
            return BeautifulSoup(tei, 'lxml')
    except OSError as exc:
        raise RuntimeError('Cannot generate a soup from the input') from exc
In [ ]:
def elem_to_text(elem, default=''):
    """Return the whitespace-normalized text of a soup element.

    Falls back to ``default`` when ``elem`` is falsy (e.g. ``None`` when a
    ``find`` call did not match).
    """
    if not elem:
        return default
    return elem.getText(separator=' ', strip=True)
In [ ]:
from dataclasses import dataclass

@dataclass
class Person:
    """One author name as parsed from a TEI <persName> element."""
    firstname: str   # text of the first <forename> ('' when missing)
    middlename: str  # text of <forename type="middle"> ('' when missing)
    surname: str     # text of <surname> ('' when missing)
In [ ]:
class TEIFile(object):
    """Convenience accessors over a single parsed TEI XML file.

    Expensive values (title, abstract, body text) are computed lazily and
    cached on the instance.
    """

    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)  # parsed document tree
        self._text = None               # cache for `text`
        self._title = ''                # cache for `title`
        self._abstract = ''             # cache for `abstract`

    @property
    def doi(self):
        """DOI from the header's <idno type="DOI">, or '' when absent."""
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        return idno_elem.getText()

    @property
    def title(self):
        """Document title; 'na' when the TEI carries no <title> element."""
        if not self._title:
            if not self.soup.title:
                self._title = "na"
            else:
                self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        """Whitespace-normalized abstract text, '' when absent.

        Robustness fix: the original dereferenced ``self.soup.abstract``
        unconditionally and raised AttributeError for papers without an
        <abstract> element; ``elem_to_text`` applies the same
        ``getText(separator=' ', strip=True)`` call when it is present.
        """
        if not self._abstract:
            self._abstract = elem_to_text(self.soup.abstract)
        return self._abstract

    @property
    def authors(self):
        """List of Person(firstname, middlename, surname), one per
        <author> element that carries a <persName>."""
        #authors_in_header = self.soup.analytic.find_all('author')
        authors_in_header = self.soup.find_all('author')

        result = []
        for author in authors_in_header:
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename"))#, type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def bibliography(self):
        """Plain-text rendering of every <bibl> (loose citation) element."""
        result = []
        for bibl in self.soup.find_all('bibl'):
            if not bibl:
                continue
            result.append(elem_to_text(bibl))
        return result

    @property
    def text(self):
        """Concatenated plain text of the body's untyped <div> sections.

        Robustness fix: returns '' instead of raising AttributeError when
        the document has no <body> element.
        """
        if not self._text:
            divs_text = []
            body = self.soup.body
            if body is not None:
                for div in body.find_all("div"):
                    # div is neither an appendix nor references, just plain text.
                    if not div.get("type"):
                        divs_text.append(div.get_text(separator=' ', strip=True))
            self._text = " ".join(divs_text)
        return self._text
In [ ]:
import multiprocessing
In [ ]:
from os.path import basename, splitext

def basename_without_ext(path):
    """Return the file name of *path* without its extension; a trailing
    '.tei' in the stem (e.g. ``paper.tei.xml`` -> ``paper``) is stripped
    as well."""
    stem, _ = splitext(basename(path))
    if stem.endswith('.tei'):
        return stem[:-len('.tei')]
    return stem
In [ ]:
def tei_to_csv_entry(tei_file):
    """Parse one TEI file into an (id, authors, title, bibliography) row."""
    parsed = TEIFile(tei_file)
    print(f"Handled {tei_file}")
    row_id = basename_without_ext(tei_file)
    return row_id, parsed.authors, parsed.title, parsed.bibliography#, parsed.abstract
In [ ]:
import glob
from pathlib import Path

# Root of the ToolXtractor XML dump, one sub-directory per conference year.
# NOTE(review): hardcoded absolute local path — consider making it
# configurable (e.g. an environment variable or a config cell constant).
_xml_root = Path("/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml")

def _papers_for(year):
    # Sorted list of the XML files for one conference year.
    return sorted((_xml_root / str(year)).glob('*.xml'))

papers15 = _papers_for(2015)
papers16 = _papers_for(2016)
papers17 = _papers_for(2017)
papers18 = _papers_for(2018)
papers19 = _papers_for(2019)
papers20 = _papers_for(2020)
In [ ]:
from multiprocessing.pool import Pool
# Worker pool used to parse the TEI files in parallel below.
# NOTE(review): the pool is never closed; consider `with Pool() as pool:`
# or pool.close()/pool.join() once the map calls are done.
pool = Pool()

Import the DH conference papers (2015–2020)

The papers are downloaded from https://github.com/lehkost/ToolXtractor/

In [ ]:
# Parse every TEI file in parallel; each entry is the
# (id, authors, title, bibliography) tuple returned by tei_to_csv_entry.
csv_entries15 = pool.map(tei_to_csv_entry, papers15)
csv_entries16 = pool.map(tei_to_csv_entry, papers16)
csv_entries17 = pool.map(tei_to_csv_entry, papers17)
csv_entries18 = pool.map(tei_to_csv_entry, papers18)
csv_entries19 = pool.map(tei_to_csv_entry, papers19)
csv_entries20 = pool.map(tei_to_csv_entry, papers20)
In [84]:
# Build one frame per year and concatenate once.
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat (default ignore_index=False, same as append) replaces the chain.
_columns = ['ID', 'Authors', 'Title', 'Bibliography']
result_csv = pd.concat(
    [pd.DataFrame(entries, columns=_columns)
     for entries in (csv_entries15, csv_entries16, csv_entries17,
                     csv_entries18, csv_entries19, csv_entries20)]
)
result_csv.count()
Out[84]:
ID              2359
Authors         2359
Title           2359
Bibliography    2359
dtype: int64

Select the papers having the TEI \<bibl> elements.

The \<bibl> element (bibliographic citation) contains a loosely-structured bibliographic citation of which the sub-components may or may not be explicitly tagged. There are 1195 papers having this element, and in total there are 11746 citations.

In [85]:
# Keep only the papers whose TEI contains at least one <bibl> element
# (Bibliography holds a list per paper; .str.len() gives its length).
test_csv=result_csv[result_csv['Bibliography'].str.len()>0]
test_csv.count()
Out[85]:
ID              1195
Authors         1195
Title           1195
Bibliography    1195
dtype: int64
In [86]:
# all citations
# One row per citation: explode each paper's Bibliography list.
my_df=test_csv[['ID','Title','Bibliography']]
my_exp_df=my_df.explode('Bibliography')
my_exp_df.count()
Out[86]:
ID              11746
Title           11746
Bibliography    11746
dtype: int64
In [87]:
#Curiosity: there are at least 134 references cited more than once
# duplicated(keep="last") marks all but the last occurrence of each string;
# dropping duplicates again counts the distinct repeated references.
df_p_d=my_exp_df[my_exp_df.duplicated(['Bibliography'], keep="last")].sort_values('Bibliography')
df_p_d['Bibliography'].drop_duplicates().count()
Out[87]:
134

Citations with DOI

There are 821 (of 11746) citations with a DOI

In [88]:
import re

# DOI pattern (prefix 10.NNNN…/suffix); IGNORECASE tolerates 'DOI'/'doi'.
regex = re.compile(r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b', re.IGNORECASE)

# Collect every reference string that contains a DOI, paired with the DOI.
# Fix: build the records once instead of growing an empty DataFrame by
# column assignment, and call regex.search directly instead of re.search
# with an already-compiled pattern.
records = []
for reference in my_exp_df.Bibliography.values:
    match = regex.search(reference)
    if match:
        records.append((reference, match[1]))

df_refs_with_doi = pd.DataFrame(records, columns=["Reference", "DOI"])
df_refs_with_doi.count()
Out[88]:
Reference    821
DOI          821
dtype: int64
In [89]:
#Example: five citations that have DOIs
df_refs_with_doi.head()
Out[89]:
Reference DOI
0 Byrne, G., and Goddard, L. (2010). The Stronge... 10.1045/november2010-byrne
1 Lampert, C. K., and Southwick, S. B. (2013). L... 10.1080/19386389.2013.826095
2 Singer, R. (2009). Linked Library Data Now! Jo... 10.1080/19411260903035809
3 Thomas, L. and Solomon, D. (2014). Active User... 10.1353/cea.2014.0014
4 Farquhar, A. and Baker, J. (2014). Interoperab... 10.6084/m9.figshare.1092550%20

Retrieve citation DOIs using CrossRef API

The Crossref API can be queried with input strings that contain bibliographic references. The reference string does not necessarily have to be a well-formed reference. The input string is parsed by Crossref using machine-learning techniques, and the system tries to match the reference string against the metadata stored in the database.

An important feature of the Crossref API is the confidence score it returns alongside the document metadata. For each request, the Crossref score indicates how confident the system is about the entities retrieved: if the score is high, the retrieved metadata are probably correct; if the score is low, they might be wrong.

As a first test we use the Crossref API to check citations having DOIs; we chose '115' as the minimum score value.

In [ ]:
import urllib.request, urllib.parse, urllib.error, json
# Fix: urllib.parse and urllib.error are now imported explicitly instead of
# relying on urllib.request importing them transitively.

# Query Crossref's bibliographic search for each reference string and keep
# only high-confidence matches (score > 115).
references = df_refs_with_doi['Reference']
df_citations16 = pd.DataFrame(columns=["Orig", "Crossref", "DOI"])
originalCitations = []
crossrefCitations = []
DOIs = []
score = []
i = 0   # matches accepted so far
j = 0   # requests issued so far
for cite in references:
    cit = urllib.parse.quote_plus(cite)
    try:
        with urllib.request.urlopen("https://api.crossref.org/works?query.bibliographic="+cit+"&sort=score&mailto=cesare.concordia@gmail.com#") as url:
            data16 = json.loads(url.read().decode())
            j = j + 1
            if (j % 25 == 0):
                # progress: requests issued, (matches found)
                print(f"{j}, ({i})")
            if (len(data16["message"]["items"]) > 0) and (data16["message"]["items"][0]['score'] > 115):
                best = data16["message"]["items"][0]
                originalCitations.append(cite)
                crossrefCitations.append(best)
                DOIs.append(best['DOI'])
                score.append(best['score'])
                i = i + 1
                #print(f"{i} found, out of {j}")
            if (j > 1000):
                break
    # Fix: HTTPError is a subclass of URLError, so it must be caught first —
    # in the original order the HTTPError handler was unreachable.
    except urllib.error.HTTPError:
        print(cit)
    except urllib.error.URLError:
        print(cit)

df_citations16["Orig"] = originalCitations
df_citations16["Crossref"] = crossrefCitations
df_citations16["DOI"] = DOIs
df_citations16["Score"] = score
df_citations16.head()
In [90]:
# Attach the regex-extracted DOI to each Crossref match by joining on the
# original reference string; the overlapping DOI column coming from
# df_citations16 gets the '_CR' suffix.
df_cit_datasets=df_citations16.join(df_refs_with_doi.set_index('Reference'), on='Orig', lsuffix='_CR')
df_cit_datasets.count()
Out[90]:
Orig        327
Crossref    327
DOI_CR      327
Score       327
DOI         327
dtype: int64
In [91]:
#remove duplicates
test=df_citations16
# Keep only the first occurrence of each DOI.
df_temp_dois=test.drop_duplicates(['DOI'])
df_temp_dois.count()
Out[91]:
Orig        278
Crossref    278
DOI         278
Score       278
dtype: int64
In [92]:
# Drop rows with an empty DOI and keep the DOI strings as an array.
df_dois=df_temp_dois[df_temp_dois['DOI'] != '']
df_dois_values=df_dois.DOI.values
df_dois_values.size
Out[92]:
278
In [ ]:
import requests
# DOI content negotiation: ask doi.org to format each DOI as a plain-text
# bibliography entry. On failure an HTTP-style status code is stored in
# place of the citation text.
#headers_dict = {"Accept": "application/x-bibtex"}
headers_dict = {"Accept": "text/x-bibliography", "locale":"en-EN"}

# Fix: build a records list and create the DataFrame once — DataFrame.append
# (removed in pandas 2.0) grew the frame quadratically inside the loop.
records = []
for var in df_dois_values:
    if var != "" and var is not None:
        print(var)
        try:
            r = requests.get("http://doi.org/"+var, headers=headers_dict, timeout=20)
            records.append({'doi': var, 'cn_citation': r.content.decode("utf-8")})
        # Fix: ConnectTimeout subclasses ConnectionError, so it must be
        # caught first — its handler was unreachable in the original order.
        except requests.exceptions.ConnectTimeout:
            records.append({'doi': var, 'cn_citation': 408})
        except requests.exceptions.ConnectionError:
            records.append({'doi': var, 'cn_citation': 503})
        except requests.exceptions.ReadTimeout:
            records.append({'doi': var, 'cn_citation': 408})
    else:
        # Fix: the original referenced an undefined name `doi` and used the
        # key 'url' here, which raised NameError for empty/None DOIs.
        records.append({'doi': var, 'cn_citation': 400})

df_cn_citations = pd.DataFrame(records, columns=['doi', 'cn_citation'])
df_cn_citations.head(10)
In [93]:
# Attach the doi.org-formatted citation text to each Crossref match.
df_cit_datasets=df_citations16.join(df_cn_citations.set_index('doi'), on='DOI')
df_cit_datasets[['Orig', 'DOI', 'cn_citation']].head(6)
Out[93]:
Orig DOI cn_citation
0 Lampert, C. K., and Southwick, S. B. (2013). L... 10.1080/19386389.2013.826095 Lampert, C. K., & Southwick, S. B. (2013). Lea...
1 Thomas, L. and Solomon, D. (2014). Active User... 10.1353/cea.2014.0014 Thomas, L., & Solomon, D. (2014). Active Users...
2 Omid, M. (2011). Design of an Expert System fo... 10.1016/j.eswa.2010.09.103 Omid, M. (2011). Design of an expert system fo...
3 Trelea , I. C. (2003). The Particle Swarm Opti... 10.1016/s0020-0190(02)00447-7 Trelea, I. C. (2003). The particle swarm optim...
4 Kenderdine, S. (2013). Pure Land: Inhabiting... 10.1111/cura.12020 Kenderdine, S. (2013). “Pure Land”: Inhabiting...
5 Haentjens Dekker , R., van Hulle , D. , Middel... 10.1093/llc/fqu007 Haentjens Dekker, R., van Hulle, D., Middell, ...
In [94]:
# Columns for the final comparison table: original citation, Crossref
# confidence score, DOI, and the content-negotiated citation text.
df_cit_table = df_cit_datasets.loc[:, ['Orig', 'Score', 'DOI', 'cn_citation']]
df_cit_table.columns
Out[94]:
Index(['Orig', 'Score', 'DOI', 'cn_citation'], dtype='object')
In [ ]:
import plotly.graph_objects as go
# Render the comparison table as an interactive Plotly table
# (original citation, Crossref score, DOI, content-negotiated citation).
fig = go.Figure(data=[go.Table(header=dict(values=list(df_cit_table.columns), line=dict(color='black')), 
                               cells=dict(values=[df_cit_table.Orig, df_cit_table.Score, df_cit_table.DOI, df_cit_table['cn_citation'] ]))])
fig.show("notebook")