Draft: check properties for Tools and Services in the MarketPlace Dataset¶
This notebook checks values in the MarketPlace dataset for Tools and Services.
External libraries and a function to download descriptions from the MarketPlace dataset using the API¶
The following two cells import the external libraries used in this notebook and define a helper function; in the final release of this notebook the function will (possibly) be optimized and provided as an external library.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
def getMPDescriptions(url, pages):
    # Download every page of the paginated API and concatenate the results.
    # DataFrame.append is deprecated in recent pandas, so the pages are
    # collected in a list and concatenated once at the end.
    frames = []
    for page in range(1, pages + 1):
        turl = url + str(page) + "&perpage=20"
        frames.append(pd.read_json(turl, orient='columns'))
    return pd.concat(frames, ignore_index=True)
Get the descriptions of Tools and Services¶
The MarketPlace API is used to download the descriptions of Tools and Services.
df_tool_all = getMPDescriptions("https://sshoc-marketplace-api.acdh-dev.oeaw.ac.at/api/tools-services?page=", 81)
df_tool_all.index
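The number of pages (81) is hardcoded above. A more robust variant, sketched below, could derive the page count from the first API response; this assumes the payload exposes a pages field alongside tools, which is an assumption not verified here.
# Hypothetical sketch: derive the page count from the first response instead of hardcoding it
first_page = pd.read_json("https://sshoc-marketplace-api.acdh-dev.oeaw.ac.at/api/tools-services?page=1&perpage=20", orient='columns')
n_pages = int(first_page['pages'].iloc[0])  # assumption: the API returns a 'pages' field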
A quick look at the data¶
df_tool_flat = pd.json_normalize(df_tool_all['tools'])
#df_tool_work=df_tool_flat[['id', 'category', 'label', 'licenses', 'contributors', 'accessibleAt', 'sourceItemId']]
df_tool_flat.count()
df_tool_flat_opt = pd.json_normalize(df_tool_all['tools'])
df_tool_flat_opt = df_tool_flat_opt.replace('No description provided.', np.nan)
# Treat empty lists as missing values so that isnull() counts them
for col in ['licenses', 'externalIds', 'contributors', 'accessibleAt',
            'relatedItems', 'olderVersions', 'newerVersions', 'properties']:
    df_tool_flat_opt[col] = df_tool_flat_opt[col].apply(lambda y: np.nan if len(y) == 0 else y)
print('{:<35}Number of missing values'.format("Property"), end='\n')
df_tool_flat_opt.isnull().sum()
Checking description values¶
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(df_tool_flat_opt['description'].str.len(), bins=100)
ax.set_title('Description Length')
ax.set_xlabel('Characters in description')
ax.set_ylabel('Frequency');
print (f"\n There are {df_tool_flat_opt['description'].isna().sum()} Tools and Services with empty descriptions\n")
# Save the Tools and Services with empty descriptions to a CSV file
df_tool_flat_opt_e=df_tool_flat_opt[df_tool_flat_opt['description'].isna()]
df_tool_flat_opt_e[['id', 'label', 'description']].sort_values('label').to_csv(path_or_buf='ts_emptydescription.csv')
Count all Tools and Services whose description is shorter than an old-school tweet (140 characters), arguably the minimum needed to express any meaningful information these days... :)
df_tool_flat_d = df_tool_flat_opt[(df_tool_flat_opt['description'].notnull()) & (df_tool_flat_opt['description'].str.len()<140)]
print (f'\n There are {df_tool_flat_d["description"].count()} Tools and Services where the description has less than 140 characters\n')
The following table shows some Tools and Services with short descriptions. The full list is currently saved to a file; this could change if we decide that this is a significant curation feature.
df_tool_flat_d[['id', 'label', 'description']].sort_values('label').head().style.set_properties(subset=['description'], **{'width': '600px'})
df_tool_flat_d[['id', 'label', 'description']].sort_values('label').to_csv(path_or_buf='ts_shortdescription.csv')
Checking values on Contributors¶
print (f"\n There are {df_tool_flat_opt['contributors'].isna().sum()} Tools and Services with no contributors\n")
df_prop_data_co = pd.json_normalize(data=df_tool_all['tools'], record_path='contributors', meta_prefix='tool_', meta=['label'])
df_prop_data_co.sort_values('tool_label').info()
df_prop_data_co.sort_values('tool_label').head()
df_prop_data_contrib = pd.json_normalize(data=df_tool_all['tools'], record_path='contributors', meta_prefix='tool_', meta=['label'])
df_prop_data_contrib['actor.externalIds'] = df_prop_data_contrib['actor.externalIds'].apply(lambda y: np.nan if len(y)==0 else y)
df_prop_data_contrib['actor.affiliations'] = df_prop_data_contrib['actor.affiliations'].apply(lambda y: np.nan if len(y)==0 else y)
print('{:<15}Number of missing values'.format("Property"), end='\n')
df_prop_data_contrib.isnull().sum()
Check the validity of URLs in the actor.website property using the HTTP Result Status¶
The code below executes an HTTP request for every URL, waits for the response status code, and records it.
Depending on connections and server response times, it may take several minutes to process all URLs.
In the final release of this notebook this code will (possibly) be optimized and provided as an external library.
df_tool_work_urls = df_prop_data_contrib[df_prop_data_contrib['actor.website'].str.len() > 0]
df_urls = df_tool_work_urls['actor.website'].values
http_status_rows = []  # collect results here and build the DataFrame once at the end
import requests
import re
regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
r'localhost|' #localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
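# Illustrative sanity check of the validation regex (example values, not taken
# from the dataset): it accepts well-formed http(s)/ftp URLs and rejects the rest.
assert re.match(regex, "https://example.org/path")
assert re.match(regex, "http://127.0.0.1:8080")
assert not re.match(regex, "not a url")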
for var in df_urls:
    if var and re.match(regex, var):
        try:
            r = requests.get(var, timeout=8)
            http_status_rows.append({'url': var, 'status': int(r.status_code)})
        except requests.exceptions.Timeout:
            # covers ConnectTimeout and ReadTimeout; must be caught before
            # ConnectionError, from which ConnectTimeout also inherits
            http_status_rows.append({'url': var, 'status': 408})
        except requests.exceptions.ConnectionError:
            http_status_rows.append({'url': var, 'status': 503})
        except requests.exceptions.RequestException:
            http_status_rows.append({'url': var, 'status': 500})
        except TypeError:
            http_status_rows.append({'url': var, 'status': 400})
    else:
        # empty or malformed URL
        http_status_rows.append({'url': var, 'status': 400})
df_tool_work_aa_http_status = pd.DataFrame(http_status_rows, columns=['url', 'status'])
df_tool_work_aa_http_status.head()
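One possible optimization for the final release, sketched here only as an illustration, is to issue the requests concurrently. The check_url helper and the worker count are assumptions, not part of the current notebook.
# Hypothetical concurrent variant mirroring the sequential logic above
from concurrent.futures import ThreadPoolExecutor

def check_url(url):
    if not (url and re.match(regex, url)):
        return {'url': url, 'status': 400}
    try:
        return {'url': url, 'status': int(requests.get(url, timeout=8).status_code)}
    except requests.exceptions.Timeout:
        return {'url': url, 'status': 408}
    except requests.exceptions.ConnectionError:
        return {'url': url, 'status': 503}
    except requests.exceptions.RequestException:
        return {'url': url, 'status': 500}
    except TypeError:
        return {'url': url, 'status': 400}

with ThreadPoolExecutor(max_workers=8) as pool:  # worker count is an arbitrary choice
    df_status_parallel = pd.DataFrame(list(pool.map(check_url, df_urls)))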
df_http_status_sub=df_tool_work_aa_http_status[df_tool_work_aa_http_status['status'] != 1]
df_db_st = df_http_status_sub['status'].value_counts()
print('{:<8}Frequency'.format("Status"))
df_db_st.head(10)
The first column in the table above shows the HTTP status codes obtained when trying to connect to the actor.website URLs; the second column shows the number of URLs returning each status.
Notice that while 404 means the resource was not found, other status codes may indicate temporary problems.
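As a possible curation aid, the statuses could be split into likely-permanent and possibly-temporary failures. The grouping below is a judgment call for illustration, not a standard classification.
# Hypothetical split: 408/5xx are often transient, other 4xx likely permanent
maybe_temporary = df_tool_work_aa_http_status[df_tool_work_aa_http_status['status'].isin([408, 500, 502, 503, 504])]
likely_broken = df_tool_work_aa_http_status[df_tool_work_aa_http_status['status'].between(400, 499) & (df_tool_work_aa_http_status['status'] != 408)]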
The chart below summarizes the results above.
fig, ax = plt.subplots(figsize=(15, 6))
df_db_st.plot(kind='bar', ax=ax)
plt.grid(alpha=0.6)
ax.set_title("Number of Result Codes in actor.website", fontsize=15)
ax.set_xlabel('Result Code', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.show()
The list of possibly wrong URLs is saved in a comma-separated values (CSV) file with the following columns: actor.id, tool_label, actor.website, status. The final release of this notebook will save this data in the curation dataset.
df_http_status_err=df_http_status_sub[df_http_status_sub['status'] != 200]
df_list_of_tools_wrongaa=pd.merge(left=df_prop_data_contrib, right=df_http_status_err, left_on='actor.website', right_on='url')
df_list_of_tools_wrongaa.head()
df_list_of_tools_wrongaa[['actor.id', 'tool_label', 'actor.website', 'status']].sort_values('tool_label').to_csv(path_or_buf='ts_wrongcontributorsurls.csv')
Checking values on Properties¶
print (f"\n There are {df_tool_flat_opt['properties'].isna().sum()} Tools and Services with no properties\n")
#TODO: Print/Save the Tools and Services with empty properties
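A minimal sketch for the TODO above, mirroring the empty-description export earlier in the notebook; the output filename is a placeholder.
# Hypothetical sketch for the TODO above; the filename is an assumption
df_tool_flat_opt_p = df_tool_flat_opt[df_tool_flat_opt['properties'].isna()]
df_tool_flat_opt_p[['id', 'label']].sort_values('label').to_csv(path_or_buf='ts_emptyproperties.csv')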
df_prop_data_ts = pd.json_normalize(data=df_tool_all['tools'], record_path='properties', meta_prefix='ts_', meta=['label'])
#df_prop_data_ts.sort_values('ts_label').head(7)
df_prop_data_ts['type.allowedVocabularies'] = df_prop_data_ts['type.allowedVocabularies'].apply(lambda y: np.nan if len(y)==0 else y)
print('{:<25}Number of missing values'.format("Property"), end='\n')
df_prop_data_ts.isnull().sum()
Values in type.code¶
a_df=df_prop_data_ts.drop_duplicates(['type.code','ts_label'])
df_temp_tc_label = a_df['type.code'].value_counts()
print('{:<28}Frequency'.format("Type Code"), end='\n')
df_temp_tc_label.head(39)
df_prop_data_ts[(df_prop_data_ts['type.code']=='terms-of-use')].sort_values('ts_label').tail(7)
df_typecode_ts=df_prop_data_ts[['ts_label', 'type.code']]
df_typecode_ts_flat=df_typecode_ts.groupby('ts_label')['type.code'].apply(set).reset_index(name='typecodes')
df_typecode_ts_flat.head()
from collections import Counter
import itertools

cooccurrences = []
for props in df_typecode_ts_flat['typecodes']:
    for pair in itertools.combinations(props, 2):
        cooccurrences.append(tuple(sorted(pair)))

# Count the frequency of each cooccurring pair.
properties_co_counter = Counter(cooccurrences)
print("Top TypeCodes Cooccurrences by Frequency", '\n')
print('{:<50}{}'.format('Cooccurrence', 'Frequency'))
for k, v in properties_co_counter.most_common(10):
    topics = '[' + k[0] + ', ' + k[1] + ']'
    print(f'{topics:<50}{v}')
property_cooccurrences = list(
    itertools.chain(*[[tuple(sorted(c)) for c in itertools.combinations(d, 2)]
                      for d in df_typecode_ts_flat['typecodes']])
)
# Count the frequency of each cooccurring pair.
property_edge_counter = Counter(property_cooccurrences)
property_cooccurrence_df = pd.DataFrame({
    'prop0': [dcc[0] for dcc in property_edge_counter.keys()],
    'prop1': [dcc[1] for dcc in property_edge_counter.keys()],
    'count': list(property_edge_counter.values()),
}).pivot_table(index='prop0', columns='prop1')['count']
import seaborn as sns
fig, ax = plt.subplots(figsize=(22, 18))
sns.heatmap(property_cooccurrence_df, annot=True, linewidths=0.2, fmt='.0f', ax=ax, cbar=None, cmap='Blues', linecolor='gray')
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
ax.invert_yaxis()
ax.set_xlabel(None)
ax.set_ylabel(None)
#sns.heatmap(df, linewidths=2, linecolor='yellow')
title = 'Cooccurrences of TypeCode in Tools and Services\n'
plt.title(title, loc='left', fontsize=20)
plt.show()
Values in concept.vocabulary.code¶
acvc_df=df_prop_data_ts.drop_duplicates(['concept.vocabulary.code','ts_label'])
df_temp_cvc_label = acvc_df['concept.vocabulary.code'].value_counts()
print('{:<18}Frequency'.format("Vocabulary Code"), end='\n')
df_temp_cvc_label.head(39)
df_temp_cvc_concept = acvc_df['concept.label'].value_counts()
df_temp_cvc_concept.head(39)