sshoc-skosmapping/sshoc_31_skos.ipynb

555 lines
21 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "mechanical-johns",
"metadata": {},
"source": [
"## Mapping *Multilingual Data Stewardship terminology* and *SSHOC Multilingual Metadata* to SKOS resources\n",
"\n",
"This Notebook implements a simple parser that transforms the SSHOC Multilingual Data Stewardship Terminology and the SSHOC Multilingual Metadata, created in Task 3.1 of the SSHOC project and published as spreadsheets, into SKOS resources. The parser reads the spreadsheets and transforms their content into SKOS data following a set of mapping rules; each result is stored as a Turtle file and an RDF/XML file.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "aware-comparison",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import rdflib\n",
"import itertools\n",
"import yaml\n",
"import datetime"
]
},
{
"cell_type": "markdown",
"id": "failing-shift",
"metadata": {},
"source": [
"The file *config.yaml* contains the external information used in the parsing, including the location of the spreadsheets. Set the correct values before running the Notebook."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cutting-triangle",
"metadata": {},
"outputs": [],
"source": [
"# Load the external settings (namespaces, spreadsheet locations, texts) read by the later cells.\n",
"try:\n",
"    with open(\"config.yaml\", 'r') as stream:\n",
"        conf = yaml.safe_load(stream)\n",
"except FileNotFoundError as exc:\n",
"    # The later cells all read `conf`; fail here with a clear message rather than with a NameError.\n",
"    raise FileNotFoundError(\n",
"        'Warning config.yaml file not present! Please store it in the same directory as the notebook'\n",
"    ) from exc\n",
"except yaml.YAMLError as exc:\n",
"    raise ValueError(f'config.yaml is not valid YAML: {exc}') from exc\n",
"if not isinstance(conf, dict):\n",
"    # An empty file makes safe_load return None, which would break every conf[...] lookup.\n",
"    raise ValueError('config.yaml is empty or does not contain a mapping')"
]
},
{
"cell_type": "markdown",
"id": "cardiac-angel",
"metadata": {},
"source": [
"The following cell defines the *Namespaces* used in the parsing."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "neural-career",
"metadata": {},
"outputs": [],
"source": [
"from rdflib.namespace import DC, DCAT, DCTERMS, OWL, \\\n",
" RDF, RDFS, SKOS, \\\n",
" XMLNS, XSD, XMLNS\n",
"from rdflib import Namespace\n",
"from rdflib import URIRef, BNode, Literal\n",
"\n",
"# Namespaces of the two generated vocabularies are configurable; the others are fixed.\n",
"sshocterm = Namespace(conf['Namespaces']['SSHOCTERM'])\n",
"sshoccmd = Namespace(conf['Namespaces']['SSHOCCMD'])\n",
"dc11 = Namespace(\"http://purl.org/dc/elements/1.1/\")\n",
"dct = Namespace(\"http://purl.org/dc/terms/\")\n",
"# The trailing slash is required: Namespace appends the local name directly, so without it\n",
"# iso369.eng would expand to '.../iso639-3eng' instead of '.../iso639-3/eng'.\n",
"iso369 = Namespace(\"http://id.loc.gov/vocabulary/iso639-3/\")"
]
},
{
"cell_type": "markdown",
"id": "virtual-conducting",
"metadata": {},
"source": [
"Download **Data Stewardship terminology** spreadsheet"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "systematic-bachelor",
"metadata": {},
"outputs": [],
"source": [
"# Location of the terminology spreadsheet (CSV), taken from the 'Source' section of config.yaml.\n",
"url = conf['Source']['VOCABULARYSOURCE']\n",
"df_data = pd.read_csv(filepath_or_buffer=url)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "sealed-complexity",
"metadata": {},
"outputs": [],
"source": [
"# Column names as read from the CSV -> readable names used by the mapping cell.\n",
"vocabulary_columns = {\n",
"    'Unnamed: 0': 'Concept ID',\n",
"    'Unnamed: 1': 'Subject',\n",
"    'Unnamed: 2': 'Term',\n",
"    'Unnamed: 3': 'Source of definition',\n",
"    'Translations': 'Dutch',\n",
"    'Unnamed: 5': 'French',\n",
"    'Unnamed: 6': 'German',\n",
"    'Unnamed: 7': 'Greek',\n",
"    'Unnamed: 8': 'Italian',\n",
"    'Unnamed: 9': 'Slovenian',\n",
"    'Linking': 'Loterre Open Science Thesaurus',\n",
"    'Unnamed: 11': 'Terms4FAIRSkills',\n",
"    'Unnamed: 12': 'CCR metadata',\n",
"    'Unnamed: 13': 'Linked Open Vocabularies',\n",
"    'Unnamed: 14': 'LOV 2',\n",
"    'Unnamed: 15': 'ISO',\n",
"    'Unnamed: 16': 'Broader Concept',\n",
"}\n",
"# Rename the columns, then drop the row labelled 0 (presumably a second header line - confirm).\n",
"df_data = df_data.rename(columns=vocabulary_columns).drop(0)"
]
},
{
"cell_type": "markdown",
"id": "impossible-romantic",
"metadata": {},
"source": [
"Create a graph for the SKOS data and bind the namespaces to it"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "southeast-cholesterol",
"metadata": {},
"outputs": [],
"source": [
"# Graph that collects the SKOS data of the Data Stewardship terminology.\n",
"c1rdf = rdflib.Graph()\n",
"# Prefix -> namespace bindings used when the graph is serialized (bound in this order).\n",
"vocabulary_prefixes = [\n",
"    ('sshocterm', sshocterm),\n",
"    ('sshoccmd', sshoccmd),\n",
"    ('dc11', dc11),\n",
"    ('dct', dct),\n",
"    ('iso639-3', iso369),  # prefix was misspelled 'iso369-3'\n",
"    ('skos', SKOS),\n",
"    ('dc', DC),\n",
"    ('rdf', RDF),\n",
"    ('owl', OWL),\n",
"    ('xsd', XSD),\n",
"]\n",
"for prefix, namespace in vocabulary_prefixes:\n",
"    c1rdf.bind(prefix, namespace)\n"
]
},
{
"cell_type": "markdown",
"id": "duplicate-oregon",
"metadata": {},
"source": [
"Insert in the graph the *SKOS.ConceptScheme*"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "hydraulic-raising",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Graph identifier=Nfbc9b96ef4c047d9a72e68d22f8fc441 (<class 'rdflib.graph.Graph'>)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Descriptive metadata of the concept scheme, read from the 'Texts' section of config.yaml.\n",
"vocabulary_texts = conf['Texts']\n",
"now = datetime.datetime.today()\n",
"today_date = now.date()\n",
"title = Literal(vocabulary_texts['VOCABULARYTITLE'], lang=vocabulary_texts['LANG'])\n",
"description = Literal(vocabulary_texts['VOCABULARYDESCRIPTION'], lang=vocabulary_texts['LANG'])\n",
"identifier = Literal(vocabulary_texts['VOCABULARYID'], lang=vocabulary_texts['LANG'])\n",
"createddate = Literal(vocabulary_texts['VOCABULARYCREATEDATE'], datatype=XSD.date)\n",
"# The modification date is the day on which the notebook is run.\n",
"moddate = Literal(today_date, datatype=XSD.date)\n",
"version = Literal(vocabulary_texts['VOCABULARYVERSION'], datatype=XSD.string)\n",
"\n",
"# The scheme itself is identified by the bare namespace URI.\n",
"vocabulary_scheme = sshocterm['']\n",
"c1rdf.add((vocabulary_scheme, RDF.type, SKOS.ConceptScheme))\n",
"for predicate, value in [\n",
"    (DC.title, title),\n",
"    (DC.identifier, identifier),\n",
"    (DC.description, description),\n",
"    (dct.created, createddate),\n",
"    (dct.modified, moddate),\n",
"    (OWL.versionInfo, version),\n",
"]:\n",
"    c1rdf.add((vocabulary_scheme, predicate, value))\n",
"\n",
"# Languages covered by the terminology.\n",
"# NOTE(review): 'ger' and 'dut' look like ISO 639-2/B codes while the namespace is ISO 639-3\n",
"# (which would use 'deu' and 'nld') - confirm which codes are intended.\n",
"# Ending the cell with a loop also avoids printing the Graph repr returned by the last add().\n",
"for code in ('eng', 'ger', 'fra', 'ell', 'ita', 'dut', 'slv'):\n",
"    c1rdf.add((vocabulary_scheme, dct.language, iso369[code]))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "level-score",
"metadata": {},
"outputs": [],
"source": [
"#c1rdf.serialize(destination='data/skostest.rdf', format=\"n3\");#format=\"pretty-xml\")\n",
"#comrdf.serialize(destination='data/parsed_rdf/prima_cantica_forme_com.rdf', format=\"n3\");"
]
},
{
"cell_type": "markdown",
"id": "detailed-official",
"metadata": {},
"source": [
"The following cell implements the mapping rules for creating SKOS resources."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "failing-relative",
"metadata": {},
"outputs": [],
"source": [
"# Spreadsheet column -> language tag of the literals built from that column.\n",
"TERM_LANGS = {'Term': 'en', 'French': 'fr', 'Dutch': 'nl', 'German': 'de',\n",
"              'Italian': 'it', 'Slovenian': 'sl', 'Greek': 'el'}\n",
"# Value of the 'Subject' column (lower-cased) -> SKOS property filled from that row.\n",
"SUBJECT_PREDICATES = {'preflabel': SKOS.prefLabel, 'altlabel': SKOS.altLabel,\n",
"                      'definition': SKOS.definition}\n",
"# Columns whose value is the URI of an equivalent external concept (skos:exactMatch).\n",
"EXACT_MATCH_COLUMNS = ('Loterre Open Science Thesaurus', 'Linked Open Vocabularies', 'LOV 2')\n",
"# Columns whose value is kept as a plain-text skos:note.\n",
"NOTE_COLUMNS = ('Terms4FAIRSkills', 'ISO')\n",
"\n",
"\n",
"def _term_cell(row, column):\n",
"    \"\"\"Return the stripped text of row[column], or None when the cell is empty.\"\"\"\n",
"    value = row[column]\n",
"    if pd.isna(value):\n",
"        return None\n",
"    return str(value).strip() or None\n",
"\n",
"\n",
"def _add_term_literals(concept, predicate, row):\n",
"    \"\"\"Add to c1rdf one language-tagged literal per non-empty language column of row.\"\"\"\n",
"    for column, lang in TERM_LANGS.items():\n",
"        text = _term_cell(row, column)\n",
"        if text is not None:\n",
"            c1rdf.add((concept, predicate, Literal(text, lang=lang)))\n",
"\n",
"\n",
"# NOTE(review): the source/link/note/broader columns are read on every row of a concept,\n",
"# not only on definition rows - confirm that this matches the layout of the sheet.\n",
"label = None\n",
"for index, row in df_data.iterrows():\n",
"    subject = (_term_cell(row, 'Subject') or '').lower()\n",
"\n",
"    if subject == 'preflabel':\n",
"        # A prefLabel row opens a new concept; the rows that follow attach to it via `label`.\n",
"        concept_id = _term_cell(row, 'Concept ID')\n",
"        if concept_id is None:\n",
"            raise ValueError(f'Row {index}: prefLabel row without a Concept ID')\n",
"        label = URIRef(concept_id)\n",
"        c1rdf.add((sshocterm[''], SKOS.hasTopConcept, sshocterm[label]))\n",
"        c1rdf.add((sshocterm[label], RDF.type, SKOS.Concept))\n",
"        c1rdf.add((sshocterm[label], SKOS.inScheme, sshocterm['']))\n",
"        c1rdf.add((sshocterm[label], SKOS.topConceptOf, sshocterm['']))\n",
"\n",
"    if label is None:\n",
"        # No concept has been opened yet, so there is nothing to attach this row to.\n",
"        continue\n",
"    concept = sshocterm[label]\n",
"\n",
"    # Labels and definitions: missing translations are skipped instead of raising on NaN.\n",
"    predicate = SUBJECT_PREDICATES.get(subject)\n",
"    if predicate is not None:\n",
"        _add_term_literals(concept, predicate, row)\n",
"\n",
"    source_text = _term_cell(row, 'Source of definition')\n",
"    if source_text is not None:\n",
"        source = Literal(source_text, datatype=XSD.string)\n",
"        c1rdf.add((concept, dct.source, source))\n",
"\n",
"    for column in EXACT_MATCH_COLUMNS:\n",
"        target = _term_cell(row, column)\n",
"        if target is not None:\n",
"            c1rdf.add((concept, SKOS.exactMatch, URIRef(target)))\n",
"\n",
"    for column in NOTE_COLUMNS:\n",
"        note = _term_cell(row, column)\n",
"        if note is not None:\n",
"            c1rdf.add((concept, SKOS.note, Literal(note)))\n",
"\n",
"    broader = _term_cell(row, 'Broader Concept')\n",
"    if broader is not None:\n",
"        c1rdf.add((concept, SKOS.broadMatch, URIRef(broader)))\n",
"\n",
"# Number of triples in the terminology graph.\n",
"print(len(c1rdf))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "earlier-slovak",
"metadata": {},
"outputs": [],
"source": [
"#for s, p, o in c1rdf.triples((None, None, None)):\n",
"# print(\"{} {}\".format(s, o.n3))"
]
},
{
"cell_type": "markdown",
"id": "arabic-buyer",
"metadata": {},
"source": [
"Create a *Turtle* file and an *rdf* file in the **/data** directory with the SKOS resource for the **SSHOC Multilingual Data Stewardship Terminology** "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "treated-spotlight",
"metadata": {},
"outputs": [],
"source": [
"# Write the terminology graph twice: once with the 'n3' serializer, once as RDF/XML.\n",
"for destination, rdf_format in (('data/mdstskos.ttl', 'n3'), ('data/mdstskos.rdf', 'pretty-xml')):\n",
"    c1rdf.serialize(destination=destination, format=rdf_format)"
]
},
{
"cell_type": "markdown",
"id": "ruled-america",
"metadata": {},
"source": [
"Download **SSHOC Multilingual Metadata** spreadsheet"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "olive-archive",
"metadata": {},
"outputs": [],
"source": [
"# Location of the metadata spreadsheet (CSV), taken from the 'Source' section of config.yaml.\n",
"mdurl = conf['Source']['METADATASOURCE']\n",
"df_metadata = pd.read_csv(filepath_or_buffer=mdurl)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "square-michael",
"metadata": {},
"outputs": [],
"source": [
"# Column names as read from the CSV -> '<Language>term' / '<Language>definition' names\n",
"# used by the mapping cell.\n",
"metadata_columns = {\n",
"    'English': 'Englishterm',\n",
"    'Unnamed: 1': 'Englishdefinition',\n",
"    'Unnamed: 2': 'source',\n",
"    'Unnamed: 3': 'URI',\n",
"    'Dutch': 'Dutchterm',\n",
"    'Unnamed: 5': 'Dutchdefinition',\n",
"    'French': 'Frenchterm',\n",
"    'Unnamed: 7': 'Frenchdefinition',\n",
"    'Greek': 'Greekterm',\n",
"    'Unnamed: 9': 'Greekdefinition',\n",
"    'Italian': 'Italianterm',\n",
"    'Unnamed: 11': 'Italiandefinition',\n",
"}\n",
"# Rename the columns, then drop the row labelled 0 (presumably a second header line - confirm).\n",
"df_metadata = df_metadata.rename(columns=metadata_columns).drop(0)\n"
]
},
{
"cell_type": "markdown",
"id": "explicit-routine",
"metadata": {},
"source": [
"Create a graph for the SKOS data and bind the namespaces to it"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "patient-winner",
"metadata": {},
"outputs": [],
"source": [
"# Graph that collects the SKOS data of the SSHOC Multilingual Metadata.\n",
"ccr = rdflib.Graph()\n",
"# Prefix -> namespace bindings used when the graph is serialized (bound in this order).\n",
"metadata_prefixes = [\n",
"    ('sshoccmd', sshoccmd),\n",
"    ('sshocterm', sshocterm),\n",
"    ('dc11', dc11),\n",
"    ('dct', dct),\n",
"    ('iso639-3', iso369),  # prefix was misspelled 'iso369-3'\n",
"    ('skos', SKOS),\n",
"    ('dc', DC),\n",
"    ('rdf', RDF),\n",
"    ('owl', OWL),\n",
"    ('xsd', XSD),\n",
"]\n",
"for prefix, namespace in metadata_prefixes:\n",
"    ccr.bind(prefix, namespace)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "least-waterproof",
"metadata": {},
"outputs": [],
"source": [
"# Descriptive metadata of the concept scheme, read from the 'Texts' section of config.yaml.\n",
"metadata_texts = conf['Texts']\n",
"now = datetime.datetime.today()\n",
"today_date = now.date()\n",
"title = Literal(metadata_texts['METADATATITLE'], lang=metadata_texts['LANG'])\n",
"description = Literal(metadata_texts['METADATADESCRIPTION'], lang=metadata_texts['LANG'])\n",
"identifier = Literal(metadata_texts['METADATAID'], lang=metadata_texts['LANG'])\n",
"createddate = Literal(metadata_texts['METADATACREATEDATE'], datatype=XSD.date)\n",
"# The modification date is the day on which the notebook is run.\n",
"moddate = Literal(today_date, datatype=XSD.date)\n",
"version = Literal(metadata_texts['METADATAVERSION'], datatype=XSD.string)\n",
"\n",
"# The scheme itself is identified by the bare namespace URI.\n",
"metadata_scheme = sshoccmd['']\n",
"ccr.add((metadata_scheme, RDF.type, SKOS.ConceptScheme))\n",
"for predicate, value in [\n",
"    (DC.title, title),\n",
"    (DC.description, description),\n",
"    (DC.identifier, identifier),\n",
"    (dct.created, createddate),\n",
"    (dct.modified, moddate),\n",
"    (OWL.versionInfo, version),\n",
"]:\n",
"    ccr.add((metadata_scheme, predicate, value))\n",
"\n",
"# Languages declared for the scheme.\n",
"# NOTE(review): German ('ger') and Slovenian ('slv') are declared although the mapping cell\n",
"# creates no German or Slovenian literals - confirm whether they should be listed.\n",
"# NOTE(review): 'ger' and 'dut' look like ISO 639-2/B codes while the namespace is ISO 639-3\n",
"# (which would use 'deu' and 'nld') - confirm which codes are intended.\n",
"# Ending the cell with a loop also avoids printing the Graph repr returned by the last add().\n",
"for code in ('eng', 'ger', 'fra', 'ell', 'ita', 'dut', 'slv'):\n",
"    ccr.add((metadata_scheme, dct.language, iso369[code]))"
]
},
{
"cell_type": "markdown",
"id": "passing-onion",
"metadata": {},
"source": [
"The following cell implements the mapping rules for creating a SKOS resource."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "confirmed-montana",
"metadata": {},
"outputs": [],
"source": [
"# Column-name prefix -> language tag; each language has a '<prefix>term' and a\n",
"# '<prefix>definition' column. German and Slovenian are not mapped in this notebook.\n",
"METADATA_LANGS = {'English': 'en', 'French': 'fr', 'Dutch': 'nl', 'Italian': 'it', 'Greek': 'el'}\n",
"\n",
"\n",
"def _metadata_cell(row, column):\n",
"    \"\"\"Return the stripped text of row[column], or None when the cell is empty.\"\"\"\n",
"    value = row[column]\n",
"    if pd.isna(value):\n",
"        return None\n",
"    return str(value).strip() or None\n",
"\n",
"\n",
"for index, row in df_metadata.iterrows():\n",
"    uri = _metadata_cell(row, 'URI')\n",
"    if uri is None:\n",
"        # Without the external URI no local identifier can be derived for the concept.\n",
"        print(f'Row {index} has no URI and was skipped')\n",
"        continue\n",
"    urilabel = URIRef(uri)\n",
"    # Local name: 'sshoc_' followed by whatever comes after the last '/' of the external URI.\n",
"    label = URIRef('sshoc_' + uri[uri.rfind('/') + 1:])\n",
"    concept = sshoccmd[label]\n",
"\n",
"    ccr.add((sshoccmd[''], SKOS.hasTopConcept, concept))\n",
"    ccr.add((concept, RDF.type, SKOS.Concept))\n",
"    # Stated explicitly so that these concepts are described like the terminology ones.\n",
"    ccr.add((concept, SKOS.inScheme, sshoccmd['']))\n",
"    ccr.add((concept, SKOS.topConceptOf, sshoccmd['']))\n",
"    ccr.add((concept, SKOS.exactMatch, urilabel))\n",
"\n",
"    # Terms and definitions: missing translations are skipped instead of raising on NaN.\n",
"    for language, lang in METADATA_LANGS.items():\n",
"        term = _metadata_cell(row, language + 'term')\n",
"        if term is not None:\n",
"            ccr.add((concept, SKOS.prefLabel, Literal(term, lang=lang)))\n",
"        definition = _metadata_cell(row, language + 'definition')\n",
"        if definition is not None:\n",
"            ccr.add((concept, SKOS.definition, Literal(definition, lang=lang)))\n",
"\n",
"    strsource = _metadata_cell(row, 'source')\n",
"    if strsource is not None:\n",
"        # Keep only the reference itself: drop the '(source: ' opener and every ')'.\n",
"        strsource = strsource.replace('(source: ', '').replace(')', '').strip()\n",
"        if strsource:\n",
"            ccr.add((concept, dct.source, Literal(strsource)))\n",
"\n",
"# Number of triples in the metadata graph.\n",
"print(len(ccr))"
]
},
{
"cell_type": "markdown",
"id": "connected-honey",
"metadata": {},
"source": [
"Create a *Turtle* file and an *rdf* file in the **/data** directory with the SKOS resource for **SSHOC Multilingual Metadata** "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "greater-thunder",
"metadata": {},
"outputs": [],
"source": [
"# Write the metadata graph twice: once as RDF/XML, once with the 'n3' serializer.\n",
"for destination, rdf_format in (('data/skosccr.rdf', 'pretty-xml'), ('data/skosccr.ttl', 'n3')):\n",
"    ccr.serialize(destination=destination, format=rdf_format)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "elementary-graphics",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}