295 KiB
Review of data ingested from TAPoR (draft)¶
This document checks the TAPoR dataset using the Python library Pandas.
Reference to ticket: https://gitlab.gwdg.de/sshoc/data-ingestion/-/issues/7
Preamble¶
import ast
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from im_tutorials.data import *
from im_tutorials.utilities import flatten_lists
from im_tutorials.features.text_preprocessing import *
from im_tutorials.features.document_vectors import document_vector
from im_tutorials.features.dim_reduction import WrapTSNE, GaussianMixtureEval
# for db
import sqlalchemy as db
from sqlalchemy import *
# SQLAlchemy handle for the TAPoR SQL dump.
# NOTE(review): "connection_string" is a placeholder — replace it with the
# real database URL (e.g. "mysql+pymysql://user:pass@host/TaPOR") before running.
engine = create_engine(
"connection_string")
connection = engine.connect()
metadata = db.MetaData()
Import data¶
Query the DB to get TAPoR data¶
The TAPoR dataset used in this document is the sql dump published by Education and Research Archive (ERA) University of Alberta:
https://era.library.ualberta.ca/items/f2da0666-f523-44d4-a83c-fa06351a1e94
(creation date: 2020-01-01). The table tool contains 1504 records, each one describing a tool. Records have been filtered according to the value of the field tool.is_approved; there are 1363 approved records. In this document this dataset will be called the TAPoR dataset.
Note that the TAPoR dataset reviewed here is not the same as the one that has been used for the MP ingestion; this document will be updated when we have it.
# Load every approved TAPoR tool record (is_approved=1), oldest update first.
df_db_tools=pd.read_sql_query('SELECT * FROM TaPOR.tools where is_approved=1 order by last_updated', connection)
# Show the index to see how many rows came back (1363 approved per the text above).
df_db_tools.index
An example of TAPoR item¶
Let's take a look at a random TAPoR dataset entry. (The database schema of the TAPoR dataset is described here: https://era.library.ualberta.ca/items/f2da0666-f523-44d4-a83c-fa06351a1e94/download/8057eae2-3fae-4afa-bc8e-6dcc2a257b6f.)
#df_db_tools.dtypes
# Display one full record (positional row 500) as an example TAPoR entry.
df_db_tools.iloc[500]
The following table shows 5 records of the TAPoR dataset.
df_db_tools.sort_values('name').head(5)
Check for duplicates in TAPoR dataset¶
Considering the values for 'name' and 'url', it appears that in the TAPoR dataset there are 4 duplicated descriptions
# Candidate duplicates: rows whose ('name', 'url') pair already appeared
# in an earlier row of the approved-tools frame.
dup_mask = df_db_tools.duplicated(['name', 'url'])
duplicateRowsDF0 = df_db_tools[dup_mask].sort_values('name')
duplicateRowsDF0.head(15)
Get the ingested TAPoR data in the Market Place (using the API)¶
The SSHOC Market Place API entry:
https://sshoc-marketplace-api.acdh-dev.oeaw.ac.at/api/tools
has been used to extract the TAPoR descriptions imported in the SSHOC Market Place. In the rest of the document this dataset will be called: MP dataset
# Download every page of the SSHOC Marketplace /api/tools endpoint
# (20 items per page, pages 1-68) and stack the pages into one frame.
# The pages are collected in a list and concatenated once: the original
# per-iteration DataFrame.append was O(n^2) and was removed in pandas 2.0.
pages = []
for page in range(1, 69):
    url = "https://sshoc-marketplace-api.acdh-dev.oeaw.ac.at/api/tools?page="+str(page)+"&perpage=20"
    pages.append(pd.read_json(url, orient='columns'))
df_tool_all = pd.concat(pages, ignore_index=True)
df_tool_all.index
There are 1353 tool descriptions in MP dataset. The following table shows 10 records of the MP dataset.
Let's take a look at row 500 of the MP dataset
# Each row of df_tool_all holds one nested JSON tool record under the
# 'tools' column; flatten those objects into ordinary columns.
df_tool_flat = pd.json_normalize(df_tool_all['tools'])
# Example record: positional row 500 of the flattened MP dataset.
df_tool_flat.iloc[500]
#df_tool_flat.sort_values('label').head(10)
In the MP dataset there are 1353 tool descriptions.
df_tool_flat.index
Considering the values for 'label' and 'accessibleAt', it appears that in the MP dataset there are 9 duplicated descriptions¶
# Candidate duplicates in the MP dataset: same ('label', 'accessibleAt') pair.
test_p_d=df_tool_flat[df_tool_flat.duplicated(['label', 'accessibleAt'])].sort_values('label')
test_p_d
#df_tool_flat.dtypes
# Normalise 'accessibleAt' so it can be merged against the TAPoR 'url':
# missing values and whitespace-only values both become "".
# FIX: the original whitespace replacement discarded its return value
# (Series.replace is not in-place by default), so it was a no-op; both
# replacements are now written back as plain assignments.
df_tool_flat['accessibleAt'] = df_tool_flat['accessibleAt'].replace(np.nan, "")
df_tool_flat['accessibleAt'] = df_tool_flat['accessibleAt'].replace(r'^\s*$', "", regex=True)
#df_tool_flat['accessibleAt'].isnull()
# One row per (tool, property): explode the nested 'properties' lists,
# keeping the tool 'label' so rows can be tied back to their tool.
df_prop_data = pd.json_normalize(df_tool_all['tools'], record_path='properties', meta=['label'])
# Same flattening for the nested 'contributors' lists.
df_contr_data = pd.json_normalize(df_tool_all['tools'], record_path='contributors', meta=['label'])
# Attach the contributor columns to the flat tool frame via the shared 'label'.
contributors_by_label = df_contr_data.set_index('label')
df_mpdatasets = df_tool_flat.join(contributors_by_label, on='label')
Comparing TAPoR dataset and MP datasets to find import issues¶
# Comparison frame for the TAPoR side: just ('name', 'url'), de-duplicated.
df_tapor_worksub=df_db_tools.sort_values('name')[['name', 'url']].drop_duplicates()
# Normalise 'url' the same way the MP 'accessibleAt' column is normalised:
# missing and whitespace-only values both become "".
# FIX: the original second replacement discarded its return value (a no-op)
# and mapped whitespace back to NaN, contradicting the line above; it is now
# assigned and maps whitespace-only values to "" so the two frames merge
# consistently.
df_tapor_worksub['url'] = df_tapor_worksub['url'].replace(np.nan, "")
df_tapor_worksub['url'] = df_tapor_worksub['url'].replace(r'^\s*$', "", regex=True)
# Comparison frame for the MP side: TAPoR-sourced records only, renamed to
# the same column names so the frames can be merged directly.
df_mp_taporsub= df_tool_flat[df_tool_flat['source.label'] == 'TAPoR']
df_mp_worksub=df_mp_taporsub.sort_values('label')[['label','accessibleAt']].drop_duplicates()
df_mp_worksub=df_mp_worksub.rename(columns={"label": "name", 'accessibleAt':'url'})
# define a function that compares dataframes
def dataframe_difference(df1, df2, which):
"""Find rows which are different between two DataFrames."""
comparison_df = df1.merge(df2,
indicator=True,
how='outer')
if which is None:
diff_df = comparison_df[comparison_df['_merge'] != 'both']
else:
diff_df = comparison_df[comparison_df['_merge'] == which]
diff_df.to_csv('data/diff.csv')
return diff_df
Considering values for 'name' and 'url', there are 1260 tool descriptions in MP dataset that are identical to descriptions in TAPoR dataset¶
# Rows present (by 'name' and 'url') in BOTH the MP and TAPoR frames
# (1260 matches per the heading text).
df_both=dataframe_difference(df_mp_worksub, df_tapor_worksub, 'both')
df_both.index
df_both.head()
Considering values for 'name' and 'url', there are 83 tool descriptions in MP dataset but not in TAPoR dataset¶
# left_only: tool descriptions present in the MP dataset (the left frame)
# but NOT in the TAPoR dataset.
# NOTE(review): the original comment here said the opposite ("in TAPoR but
# not in MP"); with df_mp_worksub as the left operand, left_only is MP-only.
df_lo=dataframe_difference(df_mp_worksub.sort_values('name'), df_tapor_worksub.sort_values('name'), 'left_only')
# see 20 records in MP dataset but not in TAPoR
df_lo.head(20)
Comparing values for 'name' and 'url', there are 99 tool descriptions in TAPoR dataset but not in MP dataset¶
# right_only: tool descriptions present in the TAPoR dataset (the right
# frame) but NOT in the MP dataset.
# NOTE(review): the original comment here said the opposite ("in MP but not
# in TAPoR"); with df_tapor_worksub as the right operand, right_only is TAPoR-only.
df_ro=dataframe_difference(df_mp_worksub.sort_values('name'), df_tapor_worksub.sort_values('name'), 'right_only')
df_ro.head(20)
Distribution of items in TAPoR dataset by 'last_updated' value¶
Check the content of the field 'last_updated' for TAPoR dataset descriptions. This value seems to be the date when a description of a tool was last updated.
# Parse 'last_updated' into a real datetime and extract the year for plotting.
df_db_tools['correctdata']=pd.to_datetime(df_db_tools['last_updated'])
df_db_tools['justdata'] = df_db_tools['correctdata'].dt.year
df_reg_tm_sorted=df_db_tools.sort_values('last_updated')
df_reg_tools_sub=df_reg_tm_sorted[['name', 'url', 'last_updated']]
df_reg_tools_sub.head()
f, ax1 = plt.subplots(nrows=1, figsize=(15,6))
# Tools per year in chronological order; sort_index() replaces the original
# value_counts().reindex(sorted(index)) round-trip (same result, idiomatic).
df_reg_tm_sorted.justdata.value_counts().sort_index().plot(ax=ax1)
ax1.set_title('Number of tools by year their description has been updated', fontsize=15)
Check URL in TAPoR dataset¶
In TAPoR dataset there are descriptions where the URL of a Tool is not provided
# Descriptions that provide no URL at all.
df_reg_tools_sub_emurl=df_reg_tools_sub[df_reg_tools_sub['url'] == '']
#print("number of record with missed URL in TAPoR dataset:")
df_reg_tools_sub_emurl.count()
# Descriptions that do provide a URL — these are probed below.
df_reg_tools_sub_whurl=df_reg_tools_sub[df_reg_tools_sub['url'] != '']
df_reg_tools_sub_whurl.index
# The raw URL values to probe.
df_urls=df_reg_tools_sub_whurl.url.values
# Result table for the URL probe.
# FIX: the original seeded this frame with a dummy ('test', 1) row that then
# had to be filtered out downstream (status != 1); start empty instead.
df_http_status = pd.DataFrame(columns=['url', 'status'])
import requests
import re
# URL validator: scheme (http/https/ftp/ftps), then a domain name,
# "localhost" or a dotted IPv4 address, an optional port, and a path.
# NOTE(review): this appears to be the widely circulated Django URLValidator
# pattern — confirm provenance before tightening it.
regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
r'localhost|' #localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
def _probe_url(url):
    """Return an HTTP status for *url*, with synthetic codes for failures.

    400 = missing/malformed URL, 408 = timeout, 503 = connection error,
    500 = any other requests failure.
    """
    if not url or not re.match(regex, url):
        return 400
    try:
        return int(requests.get(url, timeout=8).status_code)
    except requests.exceptions.Timeout:
        # FIX: Timeout must be caught BEFORE ConnectionError — ConnectTimeout
        # subclasses both, so the original order (ConnectionError first)
        # recorded connect timeouts as 503 and its 408 branch was unreachable.
        return 408
    except requests.exceptions.ConnectionError:
        return 503
    except requests.exceptions.RequestException:
        return 500
    except TypeError:
        return 400

# Probe every tool URL once. Rows are collected in a plain list and
# concatenated in one go: the per-iteration DataFrame.append used before
# was O(n^2) and was removed in pandas 2.0.
status_rows = [{'url': var, 'status': _probe_url(var)} for var in df_urls]
df_http_status = pd.concat([df_http_status, pd.DataFrame(status_rows)],
                           ignore_index=True)
df_http_status.head()
The HTTP result status values for URL in TAPoR dataset descriptions¶
The table below shows the HTTP Status code (https://en.wikipedia.org/wiki/List_of_HTTP_status_codes) obtained when 'clicking' on URL of tool descriptions of TAPoR dataset.
There is a significant number of URLs that seem incorrect (status 404, 503, 500, 508, ...).
# Drop the dummy seed row (status 1 is not a real HTTP code) and count how
# often each status code occurred across all probed URLs.
df_http_status_sub=df_http_status[df_http_status['status'] != 1]
df_db_st = df_http_status_sub['status'].value_counts()
df_db_st.head(10)
TAPoR dataset 'creators'¶
There are 164 descriptions in TAPoR dataset that don't have values in creators_name field, and there are 924 different creators.
# Descriptions with an empty creators_name field.
df_db_tools_na=df_db_tools[df_db_tools['creators_name'] == ''].sort_values('last_updated')
df_db_tools_na.index
# Number of distinct creators; -1 excludes the '' placeholder value.
len(df_db_tools['creators_name'].unique())-1
# Make the missing value explicit before counting per-creator tools.
df_db_tools.loc[df_db_tools['creators_name']=='','creators_name']='n/a'
df_db_tech_NoCoT = df_db_tools['creators_name'].value_counts()
fig, ax = plt.subplots()
df_db_tech_NoCoT.head(20).plot.barh(figsize=(10,7), ax=ax)
# FIX: head(20) is plotted, but the original title said "Top 10".
ax.set_title('Number of tools by creators names (Top 20)')
ax.set_xlabel('N. of tools')
ax.set_ylabel('Creators');
Number of tool descriptions in TAPoR dataset that don't have the related creator email
# Descriptions with an empty creators_email field; report how many there are.
df_db_tools_naem=df_db_tools[df_db_tools['creators_email'] == ''].sort_values('last_updated')
#df_db_tools_naem.index
len(df_db_tools_naem)
Number of tool description in TAPoR dataset that don't have the related creator URL
# Descriptions with an empty creators_url field; report how many there are.
df_db_tools_nau=df_db_tools[df_db_tools['creators_url'] == ''].sort_values('last_updated')
len(df_db_tools_nau)
------¶
# Join approved tools with their attribute types/values and tags.
# NOTE(review): implicit-join syntax with inner-join semantics — tools that
# have no attributes or no tags are dropped here; the no-attribute tools are
# inspected separately below. One output row per (tool, attribute, tag).
df_db_tech=pd.read_sql_query('select t.id, t.name, t.detail, t.creators_name, t.last_updated, at.name as "attributetype", av.name as"attribute", tags.text as "tag" from TaPOR.tools as t, TaPOR.attribute_values as av, TaPOR.tool_attributes as ta, TaPOR.attribute_types as at, TaPOR.tags as tags, TaPOR.tool_tags as tota where t.is_approved=1 and t.id=ta.tool_id and t.id=tota.tool_id and tags.id=tota.tag_id and ta.attribute_value_id=av.id and ta.attribute_type_id=at.id', connection)
#df_db_tech=pd.read_sql_table('tools', connection)
#df_db_tech.head(10)
df_db_tech.index
df_db_tech.columns
# One row per tool again — the join above repeats tools across attributes/tags.
df_items=df_db_tech[['id', 'name', 'detail', 'creators_name', 'last_updated']].drop_duplicates()
#df_items.head(10)
Attributes in TAPoR dataset items¶
The following dataframe shows the list of attribute types defined in the TAPoR dataset to characterize tools
# The distinct attribute types defined in the TAPoR schema.
df_db_tools_toa=pd.read_sql_query('SELECT distinct name FROM TaPOR.attribute_types', connection)
df_db_tools_toa.head(20)
Tools with no attribute in TAPoR dataset¶
The following dataframe shows the main fields of tool descriptions in TAPoR dataset that do not have attribute values
# Approved tools that have no row at all in tool_attributes (these were
# dropped by the inner join above).
df_db_tools_noatt=pd.read_sql_query('select distinct tools.id, tools.name, tools.creators_name, tools.url from TaPOR.tools where tools.is_approved=1 and tools.id not in (select distinct TaPOR.tool_attributes.tool_id from TaPOR.tool_attributes)', connection)
df_db_tools_noatt.head(19)
Type of Licenses in TAPoR dataset items¶
# Working subset: one row per (tool, attribute type, attribute value).
df_db_sub=df_db_tech[['id', 'name', 'detail', 'creators_name', 'last_updated', 'attributetype', 'attribute']]
# Rows describing the 'Type of license' attribute.
df_to=df_db_sub[df_db_sub['attributetype'] == 'Type of license'].drop_duplicates()
df_to.index
df_db_lic = df_to['attribute'].value_counts()
df_db_lic.head(10)
fig, ax = plt.subplots()
# FIX: the original passed x='licences', y='tools' — meaningless for a
# Series plot (copy-paste cruft); removed.
df_db_lic.plot(kind='bar', figsize=(15,6))
plt.grid(alpha=0.6)
ax.yaxis.set_label_text("")
ax.set_title("Number of Tools by License", fontsize=15)
ax.set_xlabel('License', fontsize=14)
ax.set_ylabel('N of Tools', fontsize=14);
plt.show()
#df_db_tech.loc[df_db_tech['country']=='', 'country']='N/A'
Type of analysis in TAPoR dataset items¶
A tool description can have more than one value for Type of analysis (i.e. a tool can perform one or more type of analysis)
# Rows describing 'Type of analysis'; a tool may carry several values.
df_to_ta=df_db_sub[df_db_sub['attributetype'] == 'Type of analysis'].drop_duplicates()
df_to_ta.tail(10)
df_db_a = df_to_ta['attribute'].value_counts()
df_db_a.head(25)
fig, ax = plt.subplots()
# FIX: the original passed x='analysys', y='tools' — meaningless for a
# Series plot (copy-paste cruft); removed.
df_db_a.plot(kind='bar', figsize=(15,6))
plt.grid(alpha=0.6)
ax.yaxis.set_label_text("")
ax.set_title("Number of Tools by Type of Analysis", fontsize=15)
ax.set_xlabel('Type of Analysis', fontsize=14)
ax.set_ylabel('N of Tools', fontsize=14);
plt.show()
Tool families in TAPoR dataset items¶
# Rows describing 'Tool Family'; count tools per family.
df_to_tf=df_db_sub[df_db_sub['attributetype'] == 'Tool Family'].drop_duplicates()
df_to_tf = df_to_tf['attribute'].value_counts()
df_to_tf.head(20)
fig, ax = plt.subplots()
# FIX: the original passed x='analysys', y='tools' — meaningless for a
# Series plot (copy-paste cruft); removed.
df_to_tf.plot(kind='bar', figsize=(15,6))
plt.grid(alpha=0.6)
ax.yaxis.set_label_text("")
ax.set_title("Number of Tools by Tool Families", fontsize=15)
ax.set_xlabel('Tool Family', fontsize=14)
ax.set_ylabel('N of Tools', fontsize=14);
plt.show()
Web Usable in TAPoR items¶
# Rows describing 'Web Usable'; count tools per value (e.g. yes/no).
df_to_bp=df_db_sub[df_db_sub['attributetype'] == 'Web Usable'].drop_duplicates()
df_to_bp.head()
df_to_bp = df_to_bp['attribute'].value_counts()
df_to_bp.head(10)
fig, ax = plt.subplots()
# FIX: the original passed x='webusable', y='tools' — meaningless for a
# Series plot (copy-paste cruft); removed.
df_to_bp.plot(kind='bar', figsize=(15,6))
plt.grid(alpha=0.6)
ax.yaxis.set_label_text("")
ax.set_title("Number of Tools by Web usability", fontsize=15)
ax.set_xlabel('Web usable', fontsize=14)
ax.set_ylabel('N of Tools', fontsize=14);
plt.show()