Uso di YARRRML per definire regole di mapping
This commit is contained in:
parent
1054e4f877
commit
10675a2d56
|
@ -0,0 +1,40 @@
|
|||
# YARRRML rules that map the SSHOC Multilingual Metadata (exported as JSON) to SKOS.
prefixes:
  dc: http://purl.org/dc/elements/1.1/
  # The trailing slash is required: without it dct:source expands to ".../termssource".
  dct: http://purl.org/dc/terms/
  # NOTE(review): the prefix name looks like a typo for "iso639-3"; it is not
  # referenced by the rules below, so the key is kept unchanged.
  iso369-3: http://id.loc.gov/vocabulary/iso639-3/
  owl: http://www.w3.org/2002/07/owl#
  skos: http://www.w3.org/2004/02/skos/core#
  sshoccmd: http://sshoc.eu/XXX/
  xsd: http://www.w3.org/2001/XMLSchema#

mappings:
  # One concept scheme, built from the root object of the JSON document.
  conceptscheme:
    sources:
      - ['data.json~jsonpath', '$']
    # NOTE(review): this places the scheme URI inside the SKOS namespace itself;
    # confirm the intended subject IRI.
    s: skos:_
    po:
      - [a, skos:ConceptScheme]
      - [dc:title, $(title), en~lang]
      - [dc:description, $(description), en~lang]
      # NOTE(review): "version" is not one of the DC Elements 1.1 terms;
      # owl:versionInfo may be what is intended - confirm before changing.
      - [dc:version, $(version), xsd:integer]

  # One skos:Concept per entry of the "concepts" array.
  concept:
    sources:
      - ['data.json~jsonpath', '$.concepts[*]']
    s: sshoccmd:$(ConceptId)
    po:
      - [a, skos:Concept]
      - [skos:exactMatch, $(URI)]
      - [dct:source, $(source)]
      - [skos:prefLabel, $(Englishterm), en~lang]
      - [skos:definition, $(Englishdefinition), en~lang]
      - [skos:prefLabel, $(Dutchterm), nl~lang]
      - [skos:definition, $(Dutchdefinition), nl~lang]
      # "el" is the ISO 639-1 / BCP 47 tag for Greek ("gr" is not a language tag).
      - [skos:prefLabel, $(Greekterm), el~lang]
      - [skos:definition, $(Greekdefinition), el~lang]
      - [skos:prefLabel, $(Italianterm), it~lang]
      - [skos:definition, $(Italiandefinition), it~lang]
      - [skos:prefLabel, $(Frenchterm), fr~lang]
      - [skos:definition, $(Frenchdefinition), fr~lang]
|
||||
|
||||
|
|
@ -0,0 +1,254 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "mechanical-johns",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Mapping *SSHOC Multilingual Metadata* to SKOS resources \n",
|
||||
"\n",
|
||||
"This Notebook implements a simple parser used to transform the SSHOC Multilingual Metadata, created in the Task 3.1 of the SSHOC project and published as spreadsheets, into SKOS resources. The parser reads the spreadsheet and transforms the content in SKOS data following a set of mapping rules defined using [YRRRML](https://rml.io/yarrrml/) , the result is stored in Turtle files, and downloaded in a Fuseki server.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "aware-comparison",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import rdflib\n",
|
||||
"import itertools\n",
|
||||
"import yaml\n",
|
||||
"import datetime\n",
|
||||
"import json\n",
|
||||
"from jsonpath_ng import jsonpath, parse\n",
|
||||
"from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \\\n",
|
||||
" RDF, RDFS, SKOS, \\\n",
|
||||
" XMLNS, XSD, XMLNS\n",
|
||||
"from rdflib import Namespace\n",
|
||||
"from rdflib import URIRef, BNode, Literal"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "failing-shift",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The file *config.yaml* contains the external information used in the parsing, including the path of the spreadsheets. Set the correct values before running the Notebook.\n",
|
||||
"The file *mappings.yaml* contains the YARRRML mapping rules"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "cutting-triangle",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" with open(\"config.yaml\", 'r') as stream:\n",
|
||||
" try:\n",
|
||||
" conf=yaml.safe_load(stream)\n",
|
||||
" except yaml.YAMLError as exc:\n",
|
||||
" print(exc)\n",
|
||||
" with open(\"rules.yaml\", 'r') as stream:\n",
|
||||
" try:\n",
|
||||
" rules=yaml.safe_load(stream)\n",
|
||||
" except yaml.YAMLError as exc:\n",
|
||||
" print(exc)\n",
|
||||
"except FileNotFoundError:\n",
|
||||
" print('Warning config.yaml file not present! Please store it in the same directory as the notebook')\n",
|
||||
"#print (conf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "dd570749",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The following functions implement a basic parser that processes the YARRRML rules and creates a RDF graph"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "403b935b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def jsonmapper (json_source, rules):\n",
|
||||
" ccr_y = rdflib.Graph()\n",
|
||||
" prefixes=rules['prefixes']\n",
|
||||
" nss={}\n",
|
||||
" for key in prefixes:\n",
|
||||
" myns=Namespace(URIRef(prefixes[key]))\n",
|
||||
" nss[key]=myns\n",
|
||||
" ccr_y.bind(key, nss[key])\n",
|
||||
" mappings=rules['mappings']\n",
|
||||
" for mapi in mappings:\n",
|
||||
" jsonpath_expression = parse(mappings[mapi]['sources'][0][1])\n",
|
||||
" source_list = [match.value for match in jsonpath_expression.find(json_source)] \n",
|
||||
" for source in source_list:\n",
|
||||
" labelst=mappings[mapi]['s'].split(':')\n",
|
||||
" if ('$' in labelst[1]):\n",
|
||||
" labelpath=labelst[1].replace('$(','').replace(')','')\n",
|
||||
" labelexpression=parse(labelpath)\n",
|
||||
" lb_ids_list = [match.value for match in labelexpression.find(source)]\n",
|
||||
" labelid=lb_ids_list[0]\n",
|
||||
" else:\n",
|
||||
" labelid=labelst[1]\n",
|
||||
" labns=nss[labelst[0]]\n",
|
||||
" urilabel=labns[labelid]\n",
|
||||
" propsobs=mappings[mapi]['po']\n",
|
||||
" for popob in propsobs:\n",
|
||||
" if (popob[0]=='a'):\n",
|
||||
" myob=popob[1].split(':')\n",
|
||||
" tpns=nss[myob[0]]\n",
|
||||
" ccr_y.add((urilabel, RDF.type, tpns[myob[1]]))\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" myspath=(f\"{popob[1].replace('$(','').replace(')','')}\")\n",
|
||||
" po_expression = parse(myspath)\n",
|
||||
" po_ids_list = [match.value for match in po_expression.find(source)]\n",
|
||||
" lang=''\n",
|
||||
" if (len(popob) >2 and ('lang' in popob[2])):\n",
|
||||
" lang=popob[2].replace('~lang','')\n",
|
||||
" for poval in po_ids_list:\n",
|
||||
" ob=Literal((poval))\n",
|
||||
" if lang!='':\n",
|
||||
" ob= Literal(ob, lang=lang)\n",
|
||||
" prns=nss[popob[0].split(':')[0]]\n",
|
||||
" ccr_y.add((urilabel, prns[popob[0].split(':')[1]], ob))\n",
|
||||
" return ccr_y\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ruled-america",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Download **SSHOC Multilingual Metadata** spreadsheet"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "olive-archive",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mdurl=conf['Source']['METADATASOURCE']\n",
|
||||
"df_metadata=pd.read_csv(mdurl)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "square-michael",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_metadata.rename(columns = {'English': 'Englishterm', 'Unnamed: 1':'Englishdefinition', 'Unnamed: 2':'source',\n",
|
||||
" 'Unnamed: 3':'URI', 'Dutch':'Dutchterm', 'Unnamed: 5':'Dutchdefinition', \n",
|
||||
" 'French':'Frenchterm', 'Unnamed: 7':'Frenchdefinition',\n",
|
||||
" 'Greek':'Greekterm', 'Unnamed: 9':'Greekdefinition',\n",
|
||||
" 'Italian':'Italianterm', 'Unnamed: 11':'Italiandefinition'}, inplace = True)\n",
|
||||
"df_metadata=df_metadata.drop(0)\n",
|
||||
"df_metadata['ConceptId']=df_metadata['URI'].apply(lambda y: y.replace('http://hdl.handle.net/11459/',''))\n",
|
||||
"df_metadata['source']=df_metadata['source'].apply(lambda y: y.replace('(source: ','').replace(')',''))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "ff9828c8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_metadata.to_json('data/file.json', orient='records')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "ebdd5eb0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"myjson=df_metadata.to_json(orient='records')\n",
|
||||
"concepts_metadata=json.loads(myjson)\n",
|
||||
"json_metadata={'concepts': concepts_metadata}\n",
|
||||
"json_metadata['title']=conf['Texts']['METADATATITLE']\n",
|
||||
"json_metadata['description']=conf['Texts']['METADATADESCRIPTION']\n",
|
||||
"json_metadata['id']=conf['Texts']['METADATAID']\n",
|
||||
"json_metadata['createdate']=conf['Texts']['METADATACREATEDATE']\n",
|
||||
"json_metadata['version']=conf['Texts']['METADATAVERSION']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "abd396d6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"skosgraph=jsonmapper(json_metadata, rules)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "a0a6b04a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<Graph identifier=Nd89e673445d645f69476105bb18cad6a (<class 'rdflib.graph.Graph'>)>"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"skosgraph.serialize(destination='data/skosccr_y.ttl', format=\"n3\")\n",
|
||||
"skosgraph.serialize(destination='data/skosccr_y.rdf', format=\"pretty-xml\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "elementary-graphics",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
Reference in New Issue