# YARRRML mapping rules: SSHOC Multilingual Metadata (JSON document) -> SKOS.
#
# The JSON document is expected to carry the scheme-level fields at the root
# (title, description, version) and one record per concept under `concepts`.

prefixes:
  dc: http://purl.org/dc/elements/1.1/
  # Namespace IRIs must end with '/' or '#': the prefix and the local name are
  # simply concatenated when a term such as dct:source is expanded.
  dct: http://purl.org/dc/terms/
  iso639-3: http://id.loc.gov/vocabulary/iso639-3/
  owl: http://www.w3.org/2002/07/owl#
  skos: http://www.w3.org/2004/02/skos/core#
  # NOTE(review): placeholder namespace - replace XXX with the final path.
  sshoccmd: http://sshoc.eu/XXX/
  xsd: http://www.w3.org/2001/XMLSchema#

mappings:
  conceptscheme:
    sources:
      - ['data.json~jsonpath', '$']
    # NOTE(review): this mints the scheme IRI inside the SKOS namespace;
    # confirm whether a project-owned IRI should be used instead.
    s: skos:_
    po:
      - [a, skos:ConceptScheme]
      - [dc:title, $(title), en~lang]
      - [dc:description, $(description), en~lang]
      # NOTE(review): "version" is not one of the DC 1.1 elements - confirm
      # the intended property.
      - [dc:version, $(version), xsd:integer]

  concept:
    sources:
      - ['data.json~jsonpath', '$.concepts[*]']
    s: sshoccmd:$(ConceptId)
    po:
      - [a, skos:Concept]
      - [skos:exactMatch, $(URI)]
      - [dct:source, $(source)]
      - [skos:prefLabel, $(Englishterm), en~lang]
      - [skos:definition, $(Englishdefinition), en~lang]
      - [skos:prefLabel, $(Dutchterm), nl~lang]
      - [skos:definition, $(Dutchdefinition), nl~lang]
      # Greek uses the language tag "el" ("gr" is not a language code).
      - [skos:prefLabel, $(Greekterm), el~lang]
      - [skos:definition, $(Greekdefinition), el~lang]
      - [skos:prefLabel, $(Italianterm), it~lang]
      - [skos:definition, $(Italiandefinition), it~lang]
      - [skos:prefLabel, $(Frenchterm), fr~lang]
      - [skos:definition, $(Frenchdefinition), fr~lang]
"mechanical-johns", + "metadata": {}, + "source": [ + "## Mapping *SSHOC Multilingual Metadata* to SKOS resources \n", + "\n", + "This Notebook implements a simple parser used to transform the SSHOC Multilingual Metadata, created in Task 3.1 of the SSHOC project and published as spreadsheets, into SKOS resources. The parser reads the spreadsheet and transforms its content into SKOS data following a set of mapping rules defined using [YARRRML](https://rml.io/yarrrml/). The result is stored in Turtle files and uploaded to a Fuseki server.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "aware-comparison", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import rdflib\n", + "import itertools\n", + "import yaml\n", + "import datetime\n", + "import json\n", + "from jsonpath_ng import jsonpath, parse\n", + "from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \\\n", + " RDF, RDFS, SKOS, \\\n", + " XMLNS, XSD, XMLNS\n", + "from rdflib import Namespace\n", + "from rdflib import URIRef, BNode, Literal" + ] + }, + { + "cell_type": "markdown", + "id": "failing-shift", + "metadata": {}, + "source": [ + "The file *config.yaml* contains the external information used in the parsing, including the path of the spreadsheets. Set the correct values before running the Notebook.\n", + "The file *rules.yaml* contains the YARRRML mapping rules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cutting-triangle", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " with open(\"config.yaml\", 'r') as stream:\n", + " try:\n", + " conf=yaml.safe_load(stream)\n", + " except yaml.YAMLError as exc:\n", + " print(exc)\n", + " with open(\"rules.yaml\", 'r') as stream:\n", + " try:\n", + " rules=yaml.safe_load(stream)\n", + " except yaml.YAMLError as exc:\n", + " print(exc)\n", + "except FileNotFoundError:\n", + " print('Warning config.yaml file not present! 
def _expand_curie(namespaces, curie):
    """Expand a ``prefix:local`` name into a URIRef using the declared prefixes."""
    prefix, _, local = curie.partition(':')
    return namespaces[prefix][local]


def _reference_path(template):
    """Strip the YARRRML ``$(...)`` wrapper and return the bare JSONPath string."""
    return template.replace('$(', '').replace(')', '')


def jsonmapper(json_source, rules):
    """Build an RDF graph from a JSON document using YARRRML-style rules.

    Only the subset of YARRRML used by this notebook is understood:

    * ``prefixes``: mapping of prefix -> namespace IRI; every prefix is bound
      on the resulting graph.
    * ``mappings.<name>.sources``: only the first source is used, and only its
      JSONPath iterator (second element); the data itself is ``json_source``.
    * ``mappings.<name>.s``: ``prefix:local`` where ``local`` is either a
      constant or a ``$(path)`` reference resolved against each record.
    * ``mappings.<name>.po``: ``[a, prefix:Class]`` adds an ``rdf:type``;
      ``[prefix:prop, $(path)]`` adds one literal per matched value.  An
      optional third element is either ``xx~lang`` (language tag) or a
      ``prefix:datatype`` name.  A ``$(path)~iri`` object yields an IRI
      instead of a literal.

    Args:
        json_source: The already-parsed JSON document (dicts/lists).
        rules: The parsed rules file (a dict with ``prefixes`` and ``mappings``).

    Returns:
        An ``rdflib.Graph`` holding the generated triples.

    Raises:
        KeyError: If a rule refers to a prefix missing from ``prefixes``.
    """
    graph = rdflib.Graph()
    namespaces = {}
    for prefix, uri in rules['prefixes'].items():
        namespaces[prefix] = Namespace(URIRef(uri))
        graph.bind(prefix, namespaces[prefix])

    for mapping in rules['mappings'].values():
        iterator = parse(mapping['sources'][0][1])
        records = [match.value for match in iterator.find(json_source)]

        # Everything below depends only on the rule, not on the record, so it
        # is compiled once here instead of once per record.
        subject_prefix, _, subject_local = mapping['s'].partition(':')
        subject_ns = namespaces[subject_prefix]
        subject_expr = None
        if '$' in subject_local:
            subject_expr = parse(_reference_path(subject_local))

        type_uris = []
        compiled = []
        for rule in mapping['po']:
            if rule[0] == 'a':
                type_uris.append(_expand_curie(namespaces, rule[1]))
                continue
            template = rule[1]
            as_iri = template.endswith('~iri')
            if as_iri:
                template = template[:-len('~iri')]
            lang = None
            datatype = None
            if len(rule) > 2:
                if rule[2].endswith('~lang'):
                    lang = rule[2][:-len('~lang')]
                else:
                    # Anything that is not a language tag is a datatype name.
                    datatype = _expand_curie(namespaces, rule[2])
            compiled.append((
                _expand_curie(namespaces, rule[0]),
                parse(_reference_path(template)),
                as_iri,
                lang,
                datatype,
            ))

        for record in records:
            if subject_expr is None:
                subject_id = subject_local
            else:
                ids = [match.value for match in subject_expr.find(record)]
                if not ids or ids[0] is None:
                    # Without an identifier no subject IRI can be minted.
                    continue
                subject_id = str(ids[0])
            subject = subject_ns[subject_id]

            for type_uri in type_uris:
                graph.add((subject, RDF.type, type_uri))

            for predicate, expr, as_iri, lang, datatype in compiled:
                for match in expr.find(record):
                    value = match.value
                    if value is None:
                        # JSON null (e.g. an empty spreadsheet cell): emit
                        # nothing rather than a literal built from None.
                        continue
                    if as_iri:
                        obj = URIRef(str(value))
                    elif lang:
                        obj = Literal(value, lang=lang)
                    elif datatype is not None:
                        obj = Literal(value, datatype=datatype)
                    else:
                        obj = Literal(value)
                    graph.add((subject, predicate, obj))
    return graph
# Remaining scheme-level fields, taken from the 'Texts' section of the
# loaded configuration.
json_metadata['createdate'] = conf['Texts']['METADATACREATEDATE']
json_metadata['version'] = conf['Texts']['METADATAVERSION']

# Apply the mapping rules to the assembled JSON document.
skosgraph = jsonmapper(json_metadata, rules)

# Write the graph in two syntaxes.  The .ttl file uses the Turtle serializer
# (not N3, which is a superset of Turtle) so the file content matches its
# extension.
skosgraph.serialize(destination='data/skosccr_y.ttl', format="turtle")
skosgraph.serialize(destination='data/skosccr_y.rdf', format="pretty-xml")