From 03663537a74f66c115b17b0067463c19b9b3aafb Mon Sep 17 00:00:00 2001 From: Cesare Concordia Date: Tue, 14 Dec 2021 08:23:34 +0100 Subject: [PATCH] Eliminare 'Values_Check_Tools_and_Services.ipynb' --- Values_Check_Tools_and_Services.ipynb | 1557 ------------------------- 1 file changed, 1557 deletions(-) delete mode 100644 Values_Check_Tools_and_Services.ipynb diff --git a/Values_Check_Tools_and_Services.ipynb b/Values_Check_Tools_and_Services.ipynb deleted file mode 100644 index 815a5d0..0000000 --- a/Values_Check_Tools_and_Services.ipynb +++ /dev/null @@ -1,1557 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Draft: check properties for Tools and Services in the MarketPlace Dataset \n", - "\n", - "This notebook checks values in the MarketPlace datsaset for Tools and Services. \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### External libraries and function to download descriptions from the MarketPlace dataset using the API\n", - "The following two cells are used to import the external libraries used in this Notebook and to define a function; in the final release of this Notebook this function will be (possibly) optimized and provided as an external library." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def getMPDescriptions (url, pages):\n", - " mdx = pd.Series(range(1, pages+1))\n", - " df_desc = pd.DataFrame()\n", - " for var in mdx:\n", - " turl = url+str(var)+\"&perpage=20\"\n", - " df_desc_par=pd.read_json(turl, orient='columns')\n", - " df_desc=df_desc.append(df_desc_par, ignore_index=True)\n", - " \n", - " return (df_desc)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Get the the descriptions of Tools and Services\n", - "The MarketPlace API are used to download the descriptions of Tools and Services" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "RangeIndex(start=0, stop=1606, step=1)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_tool_all = pd.DataFrame()\n", - "df_tool_all =getMPDescriptions (\"https://sshoc-marketplace-api.acdh-dev.oeaw.ac.at/api/tools-services?page=\", 81)\n", - "df_tool_all.index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### A quick look at data" - ] - }, - { - "cell_type": "code", - "execution_count": 163, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id 1606\n", - "category 1606\n", - "label 1606\n", - "version 0\n", - "persistentId 1606\n", - "description 1606\n", - "licenses 1606\n", - "contributors 1606\n", - "properties 1606\n", - "externalIds 1606\n", - "accessibleAt 1606\n", - "sourceItemId 1606\n", - "relatedItems 1606\n", - "lastInfoUpdate 1606\n", - "status 1606\n", - "olderVersions 1606\n", - "newerVersions 1606\n", - "source.id 1606\n", - "source.label 1606\n", - "source.url 
1606\n", - "source.urlTemplate 1606\n", - "informationContributor.id 1606\n", - "informationContributor.username 1606\n", - "informationContributor.displayName 1606\n", - "informationContributor.enabled 1606\n", - "informationContributor.registrationDate 1606\n", - "informationContributor.role 1606\n", - "informationContributor.email 1606\n", - "dtype: int64" - ] - }, - "execution_count": 163, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_tool_flat = pd.json_normalize(df_tool_all['tools'])\n", - "#df_tool_work=df_tool_flat[['id', 'category', 'label', 'licenses', 'contributors', 'accessibleAt', 'sourceItemId']]\n", - "df_tool_flat.count()" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [], - "source": [ - "df_tool_flat_opt=pd.json_normalize(df_tool_all['tools'])\n", - "df_tool_flat_opt = df_tool_flat_opt.replace('No description provided.', np.nan)\n", - "df_tool_flat_opt.licenses = df_tool_flat_opt.licenses.apply(lambda y: np.nan if len(y)==0 else y)\n", - "df_tool_flat_opt.externalIds = df_tool_flat_opt.externalIds.apply(lambda y: np.nan if len(y)==0 else y)\n", - "df_tool_flat_opt.contributors = df_tool_flat_opt.contributors.apply(lambda y: np.nan if len(y)==0 else y)\n", - "df_tool_flat_opt.accessibleAt = df_tool_flat_opt.accessibleAt.apply(lambda y: np.nan if len(y)==0 else y)\n", - "df_tool_flat_opt.relatedItems = df_tool_flat_opt.relatedItems.apply(lambda y: np.nan if len(y)==0 else y)\n", - "df_tool_flat_opt.olderVersions = df_tool_flat_opt.olderVersions.apply(lambda y: np.nan if len(y)==0 else y)\n", - "df_tool_flat_opt.newerVersions = df_tool_flat_opt.newerVersions.apply(lambda y: np.nan if len(y)==0 else y)\n", - "df_tool_flat_opt.properties = df_tool_flat_opt.properties.apply(lambda y: np.nan if len(y)==0 else y)" - ] - }, - { - "cell_type": "code", - "execution_count": 194, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - 
"Property Number of missing values\n" - ] - }, - { - "data": { - "text/plain": [ - "id 0\n", - "category 0\n", - "label 0\n", - "version 1606\n", - "persistentId 0\n", - "description 174\n", - "licenses 1605\n", - "contributors 366\n", - "properties 199\n", - "externalIds 1606\n", - "accessibleAt 507\n", - "sourceItemId 0\n", - "relatedItems 1478\n", - "lastInfoUpdate 0\n", - "status 0\n", - "olderVersions 2\n", - "newerVersions 1606\n", - "source.id 0\n", - "source.label 0\n", - "source.url 0\n", - "source.urlTemplate 0\n", - "informationContributor.id 0\n", - "informationContributor.username 0\n", - "informationContributor.displayName 0\n", - "informationContributor.enabled 0\n", - "informationContributor.registrationDate 0\n", - "informationContributor.role 0\n", - "informationContributor.email 0\n", - "dtype: int64" - ] - }, - "execution_count": 194, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print('{:<35}Number of missing values'.format(\"Property\"), end='\\n')\n", - "df_tool_flat_opt.isnull().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Checking description values" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots(figsize=(12, 6))\n", - "ax.hist(df_tool_flat_opt['description'].str.len(), bins=100)\n", - "ax.set_title('Description Length')\n", - "ax.set_xlabel('Characters in description')\n", - "ax.set_ylabel('Frequency');" - ] - }, - { - "cell_type": "code", - "execution_count": 195, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " There are 174 Tools and Services with empty descriptions\n", - "\n" - ] - } - ], - "source": [ - "print (f\"\\n There are {df_tool_flat_opt['description'].isna().sum()} Tools and Services with empty descriptions\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 185, - "metadata": {}, - "outputs": [], - "source": [ - "#Print the Tools and Services with empty descriptions in a CSV file \n", - "df_tool_flat_opt_e=df_tool_flat_opt[df_tool_flat_opt['description'].isna()]\n", - "df_tool_flat_opt_e[['id', 'label', 'description']].sort_values('label').to_csv(path_or_buf='ts_emptydescription.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Count all Tools and Services where the description is shorter than an old school tweet (clearly the minimum amount of characters required to express any meaningful information in these years...:)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 190, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " There are 217 Tools and Services where the description has less than 140 characters\n", - "\n" - ] - } - ], - "source": [ - "df_tool_flat_d = df_tool_flat_opt[(df_tool_flat_opt['description'].notnull()) & (df_tool_flat_opt['description'].str.len()<140)]\n", - "print (f'\\n There are {df_tool_flat_d[\"description\"].count()} Tools and Services where the description has less than 140 characters\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following table shows some Tools and Services with short description. The list is currently saved on a file, this could be changed if we decided that this is a significant curation feature." - ] - }, - { - "cell_type": "code", - "execution_count": 191, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
id label description
7629934ARRASARRAS is a historically important tool for analyzing and concording text. It notably provided inspiration for the TACT system.\n", - "
1530370Acronym Finder - Beta (TAPoRware)This tool locates acronyms and matches them with the corresponding full name from a user-specified input text.\n", - "
2730353AelfredAelfred is a bare-bones Java XML parser. It has not been updated since 2002, and is dependent on JDK 1.4, which is very outdated.\n", - "
4929219AnnotateItAnnotateIt lets users annotate anything anywhere on the web.\n", - "
5629224AnnotumAnnotum is an open-source, open-process, open-access scholarly authoring and publishing platform based on WordPress.\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 191, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_tool_flat_d[['id', 'label', 'description']].sort_values('label').head().style.set_properties(subset=['description'], **{'width': '600px'})" - ] - }, - { - "cell_type": "code", - "execution_count": 155, - "metadata": {}, - "outputs": [], - "source": [ - "df_tool_flat_d[['id', 'label', 'description']].sort_values('label').to_csv(path_or_buf='ts_shortdescription.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Checking values on Contributors" - ] - }, - { - "cell_type": "code", - "execution_count": 198, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " There are 366 Tools and Services with no contributors\n", - "\n" - ] - } - ], - "source": [ - "print (f\"\\n There are {df_tool_flat_opt['contributors'].isna().sum()} Tools and Services with no contributors\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 203, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Int64Index: 1372 entries, 0 to 1355\n", - "Data columns (total 9 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 actor.id 1372 non-null int64 \n", - " 1 actor.name 1372 non-null object\n", - " 2 actor.externalIds 1372 non-null object\n", - " 3 actor.website 566 non-null object\n", - " 4 actor.email 293 non-null object\n", - " 5 actor.affiliations 1372 non-null object\n", - " 6 role.code 1372 non-null object\n", - " 7 role.label 1372 non-null object\n", - " 8 tool_label 1372 non-null object\n", - "dtypes: int64(1), object(8)\n", - "memory usage: 107.2+ KB\n" - ] - } - ], - "source": [ - "df_prop_data_co = pd.json_normalize(data=df_tool_all['tools'], record_path='contributors', meta_prefix='tool_', meta=['label'])\n", - 
"df_prop_data_co.sort_values('tool_label').info()" - ] - }, - { - "cell_type": "code", - "execution_count": 204, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
actor.idactor.nameactor.externalIdsactor.websiteactor.emailactor.affiliationsrole.coderole.labeltool_label
0483Ian Pearce, Devin Gaffney[]NoneNone[]contributorContributor140kit
1213Dassault Systemes[]NoneNone[]contributorContributor3DVIA Virtools
24514D[]http://www.4d.com/None[]contributorContributor4th Dimension
361780legs[]NoneNone[]contributorContributor80legs
4256Nathan Smith[]http://sonspring.com/None[]contributorContributor960 Grid System
\n", - "
" - ], - "text/plain": [ - " actor.id actor.name actor.externalIds \\\n", - "0 483 Ian Pearce, Devin Gaffney [] \n", - "1 213 Dassault Systemes [] \n", - "2 451 4D [] \n", - "3 617 80legs [] \n", - "4 256 Nathan Smith [] \n", - "\n", - " actor.website actor.email actor.affiliations role.code \\\n", - "0 None None [] contributor \n", - "1 None None [] contributor \n", - "2 http://www.4d.com/ None [] contributor \n", - "3 None None [] contributor \n", - "4 http://sonspring.com/ None [] contributor \n", - "\n", - " role.label tool_label \n", - "0 Contributor 140kit \n", - "1 Contributor 3DVIA Virtools \n", - "2 Contributor 4th Dimension \n", - "3 Contributor 80legs \n", - "4 Contributor 960 Grid System " - ] - }, - "execution_count": 204, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_prop_data_co.sort_values('tool_label').head()" - ] - }, - { - "cell_type": "code", - "execution_count": 207, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Property Number of missing values\n" - ] - }, - { - "data": { - "text/plain": [ - "actor.id 0\n", - "actor.name 0\n", - "actor.externalIds 1372\n", - "actor.website 806\n", - "actor.email 1079\n", - "actor.affiliations 1372\n", - "role.code 0\n", - "role.label 0\n", - "tool_label 0\n", - "dtype: int64" - ] - }, - "execution_count": 207, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_prop_data_contrib = pd.json_normalize(data=df_tool_all['tools'], record_path='contributors', meta_prefix='tool_', meta=['label'])\n", - "df_prop_data_contrib['actor.externalIds'] = df_prop_data_contrib['actor.externalIds'].apply(lambda y: np.nan if len(y)==0 else y)\n", - "df_prop_data_contrib['actor.affiliations'] = df_prop_data_contrib['actor.affiliations'].apply(lambda y: np.nan if len(y)==0 else y)\n", - "print('{:<15}Number of missing values'.format(\"Property\"), end='\\n')\n", - "df_prop_data_contrib.isnull().sum()" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Check the validity of URLs in the *actor.website* property using the HTTP Result Status\n", - "The code below explicitly execute an http call for every URL, waits for the [Result Status Code](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes) of the call and then registers the code. \n", - "Depending on connections and server answer times it may take several minutes to process all URLs. \n", - "In the final release of this Notebook this code will be (possibly) optimized and provided as an external library." - ] - }, - { - "cell_type": "code", - "execution_count": 213, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
urlstatus
0http://www.4d.com/200
1http://sonspring.com/200
2https://www.abbyy.com/200
3http://geoffreyrockwell.com/200
4https://www.adobe.com/200
\n", - "
" - ], - "text/plain": [ - " url status\n", - "0 http://www.4d.com/ 200\n", - "1 http://sonspring.com/ 200\n", - "2 https://www.abbyy.com/ 200\n", - "3 http://geoffreyrockwell.com/ 200\n", - "4 https://www.adobe.com/ 200" - ] - }, - "execution_count": 213, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_tool_work_urls=df_prop_data_contrib[df_prop_data_contrib['actor.website'].str.len()>0]\n", - "df_urls=df_tool_work_urls['actor.website'].values\n", - "df_tool_work_aa_http_status = pd.DataFrame (columns = ['url','status'])\n", - "import requests\n", - "import re\n", - "regex = re.compile(\n", - " r'^(?:http|ftp)s?://' # http:// or https://\n", - " r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\\.)+(?:[A-Z]{2,6}\\.?|[A-Z0-9-]{2,}\\.?)|' #domain...\n", - " r'localhost|' #localhost...\n", - " r'\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})' # ...or ip\n", - " r'(?::\\d+)?' # optional port\n", - " r'(?:/?|[/?]\\S+)$', re.IGNORECASE)\n", - "for var in df_urls:\n", - " if ( var != \"\" and var!=None and re.match(regex, var)):\n", - " try:\n", - " r =requests.get(var,timeout=8)\n", - " df_tool_work_aa_http_status = df_tool_work_aa_http_status.append({'url': var, 'status': int(r.status_code)}, ignore_index=True)\n", - " except requests.exceptions.ConnectionError:\n", - " # print(var)\n", - " df_tool_work_aa_http_status = df_tool_work_aa_http_status.append({'url': var, 'status': int(503)}, ignore_index=True)\n", - " except requests.exceptions.ConnectTimeout:\n", - " # print(var)\n", - " df_tool_work_aa_http_status = df_tool_work_aa_http_status.append({'url': var, 'status': int(408)}, ignore_index=True)\n", - " except requests.exceptions.ReadTimeout:\n", - " # print(var)\n", - " df_tool_work_aa_http_status = df_tool_work_aa_http_status.append({'url': var, 'status': int(408)}, ignore_index=True)\n", - " except requests.exceptions.RequestException:\n", - " # print(var)\n", - " df_tool_work_aa_http_status = df_tool_work_aa_http_status.append({'url': 
var, 'status': int(500)}, ignore_index=True)\n", - " except TypeError:\n", - " # print(var)\n", - " df_tool_work_aa_http_status = df_tool_work_aa_http_status.append({'url': var, 'status': int(400)}, ignore_index=True)\n", - " else:\n", - " # print(var ,0)\n", - " df_tool_work_aa_http_status = df_tool_work_aa_http_status.append({'url': var, 'status': int(400)}, ignore_index=True)\n", - "df_tool_work_aa_http_status.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 214, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Status Frequency\n" - ] - }, - { - "data": { - "text/plain": [ - "200 462\n", - "503 54\n", - "404 28\n", - "403 8\n", - "406 5\n", - "500 3\n", - "408 3\n", - "502 2\n", - "400 1\n", - "Name: status, dtype: int64" - ] - }, - "execution_count": 214, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_http_status_sub=df_tool_work_aa_http_status[df_tool_work_aa_http_status['status'] != 1]\n", - "df_db_st = df_http_status_sub['status'].value_counts()\n", - "print('{:<8}Frequency'.format(\"Status\"))\n", - "df_db_st.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The first column in the table above shows the HTTP Status codes obtained when trying to connect on *accessibleAt* URLs, the second column the total number of URLs returning the status.\n", - "Notice that while 404 means that the resource is not found, other status codes may indicate temporary problems. \n", - "The image below summarizes of the above result." - ] - }, - { - "cell_type": "code", - "execution_count": 215, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "fig, ax = plt.subplots()\n", - "df_db_st.plot(kind='bar', figsize=(15,6), x='Status', y='Frequency',)\n", - "plt.grid(alpha=0.6)\n", - "ax.yaxis.set_label_text(\"\")\n", - "ax.set_title(\"Number of Result Codes in actor.website\", fontsize=15)\n", - "ax.set_xlabel('Result Code', fontsize=14)\n", - "ax.set_ylabel('Frequency', fontsize=14);\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The list of possibly wrong URLs is saved in a Comma Separated Values (CSV) file having the following columns: *id, label, url, status*. The final release of this notebook will save this data in the curation dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 222, - "metadata": {}, - "outputs": [], - "source": [ - "df_http_status_err=df_http_status_sub[df_http_status_sub['status'] != 200]\n", - "df_list_of_tools_wrongaa=pd.merge(left=df_prop_data_contrib, right=df_http_status_err, left_on='actor.website', right_on='url')\n", - "df_list_of_tools_wrongaa.head()\n", - "df_list_of_tools_wrongaa[['actor.id', 'tool_label', 'actor.website', 'status']].sort_values('tool_label').to_csv(path_or_buf='ts_wrongcontributorsurls.csv')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Checking values on Properties" - ] - }, - { - "cell_type": "code", - "execution_count": 225, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " There are 199 Tools and Services with no properties\n", - "\n" - ] - } - ], - "source": [ - "print (f\"\\n There are {df_tool_flat_opt['properties'].isna().sum()} Tools and Services with no properties\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 226, - "metadata": {}, - "outputs": [], - "source": [ - "#TODO: Print/Save the Tools and Services with empty properties " - ] - }, - { - "cell_type": 
"code", - "execution_count": 239, - "metadata": {}, - "outputs": [], - "source": [ - "df_prop_data_ts = pd.json_normalize(data=df_tool_all['tools'], record_path='properties', meta_prefix='ts_', meta=['label'])\n", - "#df_prop_data_ts.sort_values('ts_label').head(7)" - ] - }, - { - "cell_type": "code", - "execution_count": 255, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Property Number of missing values\n" - ] - }, - { - "data": { - "text/plain": [ - "id 0\n", - "value 3263\n", - "type.code 0\n", - "type.label 0\n", - "type.type 0\n", - "type.ord 0\n", - "type.allowedVocabularies 3128\n", - "concept.code 3128\n", - "concept.vocabulary.code 3128\n", - "concept.vocabulary.label 3128\n", - "concept.vocabulary.accessibleAt 6391\n", - "concept.label 3128\n", - "concept.notation 3128\n", - "concept.definition 3128\n", - "concept.uri 3128\n", - "concept 6391\n", - "ts_label 0\n", - "dtype: int64" - ] - }, - "execution_count": 255, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_prop_data_ts = pd.json_normalize(data=df_tool_all['tools'], record_path='properties', meta_prefix='ts_', meta=['label'])\n", - "df_prop_data_ts['type.allowedVocabularies'] = df_prop_data_ts['type.allowedVocabularies'].apply(lambda y: np.nan if len(y)==0 else y)\n", - "print('{:<25}Number of missing values'.format(\"Property\"), end='\\n')\n", - "df_prop_data_ts.isnull().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Values in *type.code*" - ] - }, - { - "cell_type": "code", - "execution_count": 266, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Type Code Frequency\n" - ] - }, - { - "data": { - "text/plain": [ - "media 1092\n", - "activity 1080\n", - "terms-of-use 916\n", - "tool-family 156\n", - "keyword 142\n", - "language 132\n", - "thumbnail 68\n", - "version 56\n", - "authentication 51\n", - 
"geographical-availabilities 15\n", - "life-cycle-status 15\n", - "technical-readiness-level 15\n", - "usermanual-url 11\n", - "source-last-update 11\n", - "service-level-url 9\n", - "see-also 9\n", - "helpdesk-url 6\n", - "termsofuse-url 5\n", - "privacypolicy-url 4\n", - "methodica-link 4\n", - "accesspolicy-url 3\n", - "pages 2\n", - "volume 1\n", - "repository-url 1\n", - "wikidata-id 1\n", - "doi 1\n", - "license 1\n", - "media-caption 1\n", - "Name: type.code, dtype: int64" - ] - }, - "execution_count": 266, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "a_df=df_prop_data_ts.drop_duplicates(['type.code','ts_label'])\n", - "df_temp_tc_label = a_df['type.code'].value_counts()\n", - "print('{:<28}Frequency'.format(\"Type Code\"), end='\\n')\n", - "df_temp_tc_label.head(39)" - ] - }, - { - "cell_type": "code", - "execution_count": 291, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvaluetype.codetype.labeltype.typetype.ordtype.allowedVocabulariesconcept.codeconcept.vocabulary.codeconcept.vocabulary.labelconcept.vocabulary.accessibleAtconcept.labelconcept.notationconcept.definitionconcept.uriconceptts_label
6149181262Open Sourceterms-of-useTerms Of Usestring42NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNword2vec
6148181261Freeterms-of-useTerms Of Usestring42NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNword2vec
6226181814Freeterms-of-useTerms Of Usestring42NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNwordsimilarity (Word 2 Word)
6227181815Open Sourceterms-of-useTerms Of Usestring42NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNwordsimilarity (Word 2 Word)
6271179508Open Sourceterms-of-useTerms Of Usestring42NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNxMod
6268179505Freeterms-of-useTerms Of Usestring42NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNxMod
6303178536Closed Sourceterms-of-useTerms Of Usestring42NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNyED Files
\n", - "
" - ], - "text/plain": [ - " id value type.code type.label type.type type.ord \\\n", - "6149 181262 Open Source terms-of-use Terms Of Use string 42 \n", - "6148 181261 Free terms-of-use Terms Of Use string 42 \n", - "6226 181814 Free terms-of-use Terms Of Use string 42 \n", - "6227 181815 Open Source terms-of-use Terms Of Use string 42 \n", - "6271 179508 Open Source terms-of-use Terms Of Use string 42 \n", - "6268 179505 Free terms-of-use Terms Of Use string 42 \n", - "6303 178536 Closed Source terms-of-use Terms Of Use string 42 \n", - "\n", - " type.allowedVocabularies concept.code concept.vocabulary.code \\\n", - "6149 NaN NaN NaN \n", - "6148 NaN NaN NaN \n", - "6226 NaN NaN NaN \n", - "6227 NaN NaN NaN \n", - "6271 NaN NaN NaN \n", - "6268 NaN NaN NaN \n", - "6303 NaN NaN NaN \n", - "\n", - " concept.vocabulary.label concept.vocabulary.accessibleAt concept.label \\\n", - "6149 NaN NaN NaN \n", - "6148 NaN NaN NaN \n", - "6226 NaN NaN NaN \n", - "6227 NaN NaN NaN \n", - "6271 NaN NaN NaN \n", - "6268 NaN NaN NaN \n", - "6303 NaN NaN NaN \n", - "\n", - " concept.notation concept.definition concept.uri concept \\\n", - "6149 NaN NaN NaN NaN \n", - "6148 NaN NaN NaN NaN \n", - "6226 NaN NaN NaN NaN \n", - "6227 NaN NaN NaN NaN \n", - "6271 NaN NaN NaN NaN \n", - "6268 NaN NaN NaN NaN \n", - "6303 NaN NaN NaN NaN \n", - "\n", - " ts_label \n", - "6149 word2vec \n", - "6148 word2vec \n", - "6226 wordsimilarity (Word 2 Word) \n", - "6227 wordsimilarity (Word 2 Word) \n", - "6271 xMod \n", - "6268 xMod \n", - "6303 yED Files " - ] - }, - "execution_count": 291, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_prop_data_ts[(df_prop_data_ts['type.code']=='terms-of-use')].sort_values('ts_label').tail(7)" - ] - }, - { - "cell_type": "code", - "execution_count": 267, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ts_labeltypecodes
0140kit{activity, media}
13DF Zephyr - photogrammetry software - 3d mode...{language, keyword}
23DHOP{language, keyword}
33DHOP: 3D Heritage Online Presenter{keyword}
43DReshaper \\| 3DReshaper{language}
\n", - "
" - ], - "text/plain": [ - " ts_label typecodes\n", - "0 140kit {activity, media}\n", - "1 3DF Zephyr - photogrammetry software - 3d mode... {language, keyword}\n", - "2 3DHOP {language, keyword}\n", - "3 3DHOP: 3D Heritage Online Presenter {keyword}\n", - "4 3DReshaper \\| 3DReshaper {language}" - ] - }, - "execution_count": 267, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_typecode_ts=df_prop_data_ts[['ts_label', 'type.code']]\n", - "df_typecode_ts_flat=df_typecode_ts.groupby('ts_label')['type.code'].apply(set).reset_index(name='typecodes')\n", - "df_typecode_ts_flat.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 277, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import Counter, defaultdict\n", - "import itertools\n", - "cooccurrences = []\n", - "\n", - "for props in df_typecode_ts_flat['typecodes']:\n", - " prop_pairs = itertools.combinations(props, 2)\n", - " for pair in prop_pairs:\n", - " cooccurrences.append(tuple(sorted(pair)))\n", - "\n", - "# Count the frequency of each cooccurring pair.\n", - "properties_co_counter = Counter(cooccurrences)" - ] - }, - { - "cell_type": "code", - "execution_count": 281, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Top TypeCodes Cooccurrences by Frequency \n", - "\n", - "Cooccurrence Frequency\n", - "[activity, media] 1013\n", - "[media, terms-of-use] 861\n", - "[activity, terms-of-use] 828\n", - "[media, tool-family] 156\n", - "[terms-of-use, tool-family] 151\n", - "[activity, tool-family] 143\n", - "[keyword, language] 90\n", - "[thumbnail, version] 56\n", - "[authentication, thumbnail] 51\n", - "[authentication, version] 51\n" - ] - } - ], - "source": [ - "print(\"Top TypeCodes Cooccurrences by Frequency\", '\\n')\n", - "print('{:<50}{}'.format('Cooccurrence', 'Frequency'))\n", - "for k, v in properties_co_counter.most_common(10):\n", - " topics = '['+k[0] + ', ' + k[1]+']'\n", - " 
print(f'{topics:<50}{v}')" - ] - }, - { - "cell_type": "code", - "execution_count": 282, - "metadata": {}, - "outputs": [], - "source": [ - "property_cooccurrences = list(\n", - " itertools.chain(*[[tuple(sorted(c)) for c in itertools.combinations(d, 2)] \n", - " for d in df_typecode_ts_flat['typecodes']])\n", - ")\n", - "# Count the frequency of each cooccurring pair.\n", - "property_edge_counter = Counter(property_cooccurrences)" - ] - }, - { - "cell_type": "code", - "execution_count": 283, - "metadata": {}, - "outputs": [], - "source": [ - "property_cooccurrence_df = pd.DataFrame({\n", - " 'prop0': [dcc[0] for dcc in property_edge_counter.keys()],\n", - " 'prop1': [dcc[1] for dcc in property_edge_counter.keys()],\n", - " 'count': list(property_edge_counter.values()),\n", - "}).pivot_table(index='prop0', columns='prop1')['count']" - ] - }, - { - "cell_type": "code", - "execution_count": 285, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
# %%
# Heatmap of the type-code co-occurrence matrix (property_cooccurrence_df).
import seaborn as sns

fig, ax = plt.subplots(figsize=(22, 18))
sns.heatmap(
    property_cooccurrence_df,
    annot=True,        # write the count inside every cell
    fmt='.0f',         # counts are stored as floats (NaN padding); show them as integers
    linewidths=0.2,
    linecolor='gray',
    cmap='Blues',
    cbar=False,        # the annotations already carry the values
    ax=ax,
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha='right')
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
ax.invert_yaxis()
# The tick labels are self-explanatory; drop the 'prop0' / 'prop1' axis names.
ax.set_xlabel('')
ax.set_ylabel('')
title = 'Cooccurences of TypeCode in Tools and Services\n'
ax.set_title(title, loc='left', fontsize=20)
plt.show()

# %% [markdown]
# ### Values in *concept.vocabulary.code*

# %%
# Keep one row per (vocabulary, tool/service) so that each vocabulary is
# counted once per tool/service, however many of its concepts the tool uses.
acvc_df = df_prop_data_ts.drop_duplicates(['concept.vocabulary.code', 'ts_label'])
df_temp_cvc_label = acvc_df['concept.vocabulary.code'].value_counts()
# Header for the Series displayed below: the values are vocabulary codes.
print('{:<18}Frequency'.format("Vocabulary Code"))
df_temp_cvc_label.head(39)
# %%
# Frequency of each concept label = number of distinct tools/services that
# carry the label.
# The rows are de-duplicated on (concept label, tool) rather than on
# (vocabulary, tool): the latter keeps only the first concept of every
# vocabulary per tool and therefore undercounts every label of a tool that
# has several concepts from the same vocabulary (e.g. two TaDiRAH activities).
# Rows without a concept label are ignored by value_counts().
concept_per_tool_df = df_prop_data_ts.drop_duplicates(['concept.label', 'ts_label'])
df_temp_cvc_concept = concept_per_tool_df['concept.label'].value_counts()
df_temp_cvc_concept.head(39)