import ast
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bs4 import BeautifulSoup
def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
    if soup:
        return soup
    raise RuntimeError('Cannot generate a soup from the input')
def elem_to_text(elem, default=''):
    if elem:
        return elem.getText(separator=' ', strip=True)
    else:
        return default
from dataclasses import dataclass

@dataclass
class Person:
    firstname: str
    middlename: str
    surname: str
class TEIFile(object):
    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._title = ''
        self._abstract = ''

    @property
    def doi(self):
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return ''
        else:
            return idno_elem.getText()

    @property
    def title(self):
        if not self._title:
            if not self.soup.title:
                self._title = "na"
            else:
                self._title = self.soup.title.getText()
        return self._title

    @property
    def abstract(self):
        if not self._abstract:
            # elem_to_text guards against papers that have no <abstract> element.
            self._abstract = elem_to_text(self.soup.abstract)
        return self._abstract

    @property
    def authors(self):
        authors_in_header = self.soup.find_all('author')
        result = []
        for author in authors_in_header:
            persname = author.persname
            if not persname:
                continue
            # Take the first <forename> regardless of its type attribute.
            firstname = elem_to_text(persname.find("forename"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(firstname, middlename, surname)
            result.append(person)
        return result

    @property
    def bibliography(self):
        bibliography = self.soup.find_all('bibl')
        result = []
        for bibl in bibliography:
            if not bibl:
                continue
            result.append(elem_to_text(bibl))
        return result

    @property
    def text(self):
        if not self._text:
            divs_text = []
            for div in self.soup.body.find_all("div"):
                # A div with no type attribute is neither an appendix nor references, just plain text.
                if not div.get("type"):
                    div_text = div.get_text(separator=' ', strip=True)
                    divs_text.append(div_text)
            plain_text = " ".join(divs_text)
            self._text = plain_text
        return self._text
import multiprocessing
from os.path import basename, splitext

def basename_without_ext(path):
    base_name = basename(path)
    stem, ext = splitext(base_name)
    if stem.endswith('.tei'):
        # Strip the residual '.tei' suffix left after removing '.xml'.
        return stem[0:-4]
    else:
        return stem
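For example (hypothetical paths), the helper strips both the '.xml' extension and a residual '.tei' suffix:

basename_without_ext('/data/xml/2015/paper.tei.xml')  # -> 'paper'
basename_without_ext('/data/xml/2015/paper.xml')      # -> 'paper'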
def tei_to_csv_entry(tei_file):
    tei = TEIFile(tei_file)
    print(f"Handled {tei_file}")
    base_name = basename_without_ext(tei_file)
    return base_name, tei.authors, tei.title, tei.bibliography
import glob
from pathlib import Path
papers15 = sorted(Path("/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2015/").glob('*.xml'))
papers16 = sorted(Path("/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2016/").glob('*.xml'))
papers17 = sorted(Path("/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2017/").glob('*.xml'))
papers18 = sorted(Path("/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2018/").glob('*.xml'))
papers19 = sorted(Path("/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2019/").glob('*.xml'))
papers20 = sorted(Path("/Users/cesare/git/SSHOCCitationService/dataset/ToolXtractor/data/xml/2020/").glob('*.xml'))
from multiprocessing.pool import Pool
pool = Pool()
Import the DH conference papers (2015-2020)
The papers are downloaded from https://github.com/lehkost/ToolXtractor/
csv_entries15 = pool.map(tei_to_csv_entry, papers15)
csv_entries16 = pool.map(tei_to_csv_entry, papers16)
csv_entries17 = pool.map(tei_to_csv_entry, papers17)
csv_entries18 = pool.map(tei_to_csv_entry, papers18)
csv_entries19 = pool.map(tei_to_csv_entry, papers19)
csv_entries20 = pool.map(tei_to_csv_entry, papers20)
frames = [pd.DataFrame(entries, columns=['ID', 'Authors', 'Title', 'Bibliography'])
          for entries in (csv_entries15, csv_entries16, csv_entries17,
                          csv_entries18, csv_entries19, csv_entries20)]
# DataFrame.append was removed in pandas 2.0; concatenate the yearly frames instead.
result_csv = pd.concat(frames, ignore_index=True)
result_csv.count()
Select the papers having the TEI \<bibl> elements
The \<bibl> element (bibliographic citation) contains a loosely structured bibliographic citation whose sub-components may or may not be explicitly tagged. There are 1195 papers having this element, and in total there are 11746 citations.
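As a quick illustration, here is a minimal sketch with a hand-written (made-up) \<bibl> entry, showing how the elem_to_text helper defined above flattens such an element; real entries come from the TEI files:

sample = """<bibl>
    <author>Doe, J.</author>
    <title level="a">A Sample Paper</title>
    <date>2018</date>
</bibl>"""
bibl = BeautifulSoup(sample, 'lxml').find('bibl')
print(elem_to_text(bibl))  # -> 'Doe, J. A Sample Paper 2018'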
test_csv = result_csv[result_csv['Bibliography'].str.len() > 0]
test_csv.count()
# All citations
my_df = test_csv[['ID', 'Title', 'Bibliography']]
my_exp_df = my_df.explode('Bibliography')
my_exp_df.count()
# Curiosity: there are at least 134 references cited more than once
df_p_d = my_exp_df[my_exp_df.duplicated(['Bibliography'], keep="last")].sort_values('Bibliography')
df_p_d['Bibliography'].drop_duplicates().count()
Citations with DOI
There are 821 (of 11746) citations with a DOI.
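The DOIs are extracted with a regular expression that captures the '10.' DOI prefix followed by a registrant code and a suffix, a pattern similar to the ones Crossref suggests for matching DOIs in free text.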
import re
regex = re.compile(r'\b(10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+)\b', re.IGNORECASE)
df_refs = my_exp_df.Bibliography.values
references = []
DOIs = []
for reference in df_refs:
    mydoi = regex.search(reference)
    if mydoi:
        references.append(reference)
        DOIs.append(mydoi[1])
df_refs_with_doi = pd.DataFrame({"Reference": references, "DOI": DOIs})
df_refs_with_doi.count()
# Example: five citations that have DOIs
df_refs_with_doi.head()
Retrieve citation DOIs using the Crossref API
The Crossref API can be queried with free-form strings containing bibliographic references. The reference string does not need to be a well-formed reference: Crossref parses the input using machine-learning techniques and tries to match the string against the metadata stored in its database.
An important feature of the Crossref API is the relevance score it returns alongside each document's metadata. The score indicates how confident Crossref is about the match: if the score is high, the retrieved metadata are probably correct; if it is low, they may be wrong.
As a first test we use the Crossref API to check the citations that already have DOIs, accepting a match only when its score is above 115.
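A minimal sketch of a single lookup, assuming network access (the query string is just an example; the loop below only relies on message.items[0].score and message.items[0].DOI from the response):

import json
import urllib.parse, urllib.request

query = urllib.parse.quote_plus("Moretti, Franco. Distant Reading. Verso, 2013.")
with urllib.request.urlopen("https://api.crossref.org/works?query.bibliographic=" + query + "&rows=1&sort=score") as url:
    data = json.loads(url.read().decode())
items = data["message"]["items"]
if items:
    print(items[0]["DOI"], items[0]["score"])  # DOI and relevance score of the best match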
import urllib.request, urllib.parse, urllib.error, json
references = df_refs_with_doi['Reference']
df_citations16 = pd.DataFrame(columns=["Orig", "Crossref", "DOI"])
originalCitations = []
crossrefCitations = []
DOIs = []
score = []
i = 0
j = 0
for cite in references:
    cit = urllib.parse.quote_plus(cite)
    try:
        with urllib.request.urlopen("https://api.crossref.org/works?query.bibliographic=" + cit + "&sort=score&mailto=cesare.concordia@gmail.com") as url:
            data16 = json.loads(url.read().decode())
            j = j + 1
            if j % 25 == 0:
                print(f"{j}, ({i})")
            if len(data16["message"]["items"]) > 0 and data16["message"]["items"][0]['score'] > 115:
                originalCitations.append(cite)
                crossrefCitations.append(data16["message"]["items"][0])
                DOIs.append(data16["message"]["items"][0]['DOI'])
                score.append(data16["message"]["items"][0]['score'])
                i = i + 1
            if j > 1000:
                break
    # HTTPError subclasses URLError, so it must be caught first.
    except urllib.error.HTTPError:
        print(cit)
    except urllib.error.URLError:
        print(cit)
df_citations16["Orig"] = originalCitations
df_citations16["Crossref"] = crossrefCitations
df_citations16["DOI"] = DOIs
df_citations16["Score"] = score
df_citations16.head()
df_cit_datasets=df_citations16.join(df_refs_with_doi.set_index('Reference'), on='Orig', lsuffix='_CR')
df_cit_datasets.count()
# Remove duplicate DOIs
test = df_citations16
df_temp_dois = test.drop_duplicates(['DOI'])
df_temp_dois.count()
df_dois = df_temp_dois[df_temp_dois['DOI'] != '']
df_dois_values = df_dois.DOI.values
df_dois_values.size
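Retrieve formatted citations via DOI content negotiation
For each unique DOI, an HTTP GET on https://doi.org/<DOI> with an Accept: text/x-bibliography header asks the DOI resolver to return the citation as a formatted bibliography entry instead of redirecting to the landing page.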
import requests
# Alternative output format: headers_dict = {"Accept": "application/x-bibtex"}
headers_dict = {"Accept": "text/x-bibliography", "locale": "en-EN"}
rows = []
for var in df_dois_values:
    if var != "" and var is not None:
        print(var)
        try:
            r = requests.get("https://doi.org/" + var, headers=headers_dict, timeout=20)
            rows.append({'doi': var, 'cn_citation': r.content.decode("utf-8")})
        # ConnectTimeout subclasses ConnectionError, so it must be caught first.
        except requests.exceptions.ConnectTimeout:
            rows.append({'doi': var, 'cn_citation': int(408)})
        except requests.exceptions.ReadTimeout:
            rows.append({'doi': var, 'cn_citation': int(408)})
        except requests.exceptions.ConnectionError:
            rows.append({'doi': var, 'cn_citation': int(503)})
    else:
        # Empty or missing DOI: record a 400-style marker.
        rows.append({'doi': var, 'cn_citation': int(400)})
# DataFrame.append was removed in pandas 2.0; build the frame from the row list.
df_cn_citations = pd.DataFrame(rows, columns=['doi', 'cn_citation'])
df_cn_citations.head(10)
df_cit_datasets=df_citations16.join(df_cn_citations.set_index('doi'), on='DOI')
df_cit_datasets[['Orig', 'DOI', 'cn_citation']].head(6)
df_cit_table=df_cit_datasets[['Orig', 'Score', 'DOI', 'cn_citation']]
df_cit_table.columns
import plotly.graph_objects as go
fig = go.Figure(data=[go.Table(
    header=dict(values=list(df_cit_table.columns), line=dict(color='black')),
    cells=dict(values=[df_cit_table.Orig, df_cit_table.Score, df_cit_table.DOI, df_cit_table['cn_citation']]))])
fig.show("notebook")