/*******************************************************************************
 * Copyright (c) 2020 VRE4EIC Consortium
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package eu.sshoc.citation.service.wfconfigurator.util;

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Stack;
import java.util.Vector;

import javax.json.Json;
import javax.json.stream.JsonParser;

import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class HeuristicParsers {
|
|
|
|
StringWriter sw = new StringWriter();
|
|
PrintWriter pw = new PrintWriter(sw);
|
|
CSVHelper repo=new CSVHelper();
|
|
|
|
public JSONObject getAPIMetadata(String pid) {
|
|
JSONObject metajsondata=new JSONObject();
|
|
JSONObject jsondata=new JSONObject();
|
|
try {
|
|
//Check if it is a handle
|
|
String dnsPid="";
|
|
|
|
|
|
if (pid.contains(".handle.net/")) {
|
|
//get the actual repository URL
|
|
String haid=pid.substring(22);
|
|
haid="https://hdl.handle.net/api/handles/"+haid;
|
|
haid=haid.replace("handles//", "handles/");
|
|
System.out.println("haid "+haid);
|
|
Document doc = SSLHelper.getConnection(haid).ignoreContentType(true)
|
|
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").timeout(20 * 1000).get();
|
|
//System.out.println("doc "+doc.text());
|
|
JSONObject jsonrepometa=new JSONObject(doc.text());
|
|
JSONArray vals = new JSONArray();
|
|
vals=jsonrepometa.getJSONArray("values");
|
|
for (int i=0; i<vals.length(); i++) {
|
|
JSONObject tmp = vals.getJSONObject(i);
|
|
//System.out.println("typ"+tmp.getString("type")+"tip");
|
|
if (tmp.getString("type").contains("URL")) {
|
|
JSONObject urlob=(JSONObject) tmp.get("data");
|
|
dnsPid=urlob.getString("value");
|
|
}
|
|
}
|
|
|
|
}
|
|
System.out.println("pid "+dnsPid);
|
|
String apiURL=repo.getRepoApi(dnsPid);
|
|
|
|
if (apiURL!="none") {
|
|
apiURL=apiURL.replace("viewerPid", pid.substring(23));
|
|
System.out.println("apiURL "+apiURL);
|
|
Document apiDoc = SSLHelper.getConnection(apiURL).ignoreContentType(true)
|
|
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").timeout(20 * 1000).get();
|
|
//System.out.println("doc "+apiDoc.text());
|
|
Elements metadata=apiDoc.getElementsByTag("oai_dc:dc");
|
|
for (Element meta: metadata) {
|
|
Elements metachilderen=meta.children();
|
|
for (Element child : metachilderen) {
|
|
//System.out.println("tag "+child.tagName());
|
|
//System.out.println("val "+ child.text());
|
|
if (metajsondata.length()>0 && metajsondata.has(child.tagName())) {
|
|
String tvalue=metajsondata.getString(child.tagName());
|
|
metajsondata.put(child.tagName(), tvalue+", "+child.text());
|
|
}
|
|
else
|
|
metajsondata.put(child.tagName(), child.text());
|
|
|
|
}
|
|
}
|
|
|
|
}
|
|
} catch (Exception e) {
|
|
// TODO Auto-generated catch block
|
|
e.printStackTrace();
|
|
}
|
|
if (metajsondata!=null && metajsondata.length()>0) {
|
|
try {
|
|
jsondata.put("citation string", "na");
|
|
jsondata.put("metadata source", "Repository API");
|
|
jsondata.put("properties", metajsondata);
|
|
} catch (JSONException e) {
|
|
// TODO Auto-generated catch block
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
return jsondata;
|
|
}
|
|
public JSONObject getZenodoMetadataJSONLD(String pid) {
|
|
JSONObject jsondata=new JSONObject();
|
|
try {
|
|
//Document doc = Jsoup.connect(pid).get();
|
|
|
|
Document doc = SSLHelper.getConnection(pid).ignoreContentType(true)
|
|
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").timeout(20 * 1000).get();
|
|
Elements cmdheader=doc.getElementsByTag("cmd:Header");
|
|
if (cmdheader!=null && cmdheader.size()>0) {
|
|
JSONObject talarproperties=new JSONObject();
|
|
talarproperties=getTalarData(doc);
|
|
if (talarproperties!=null)
|
|
jsondata.put("properties", talarproperties);
|
|
return jsondata;
|
|
}
|
|
|
|
//application/ld+json
|
|
Elements scripts = doc.getElementsByTag("script");
|
|
for (Element script: scripts) {//get metadata from <script>
|
|
String type = script.attr("type");
|
|
if (type!=null && !type.trim().isEmpty() &&
|
|
(type.trim().equalsIgnoreCase("application/ld+json") || type.trim().equalsIgnoreCase("application/json"))) {
|
|
|
|
String jsonStr=script.toString().trim();
|
|
int headerLimit=jsonStr.indexOf(">");
|
|
String tmpStr=jsonStr.trim().substring(headerLimit+1);
|
|
// String jsonStr=script.outerHtml();
|
|
// String tmpStr=jsonStr.replace("<script type=\"application/ld+json\">", "");
|
|
String jsonStrraw1=tmpStr.trim().replace("</script>", "");
|
|
String jsonStrraw = jsonStrraw1.replace("\r\n", "");
|
|
jsonStr = jsonStrraw.replace('\r', ' ');
|
|
//System.out.println(jsonStr);
|
|
//getGraph(jsonStr);
|
|
JSONObject jsonproperties=new JSONObject();
|
|
|
|
jsonproperties=new JSONObject(jsonStr.trim());
|
|
if (jsonproperties.has("mainEntity")) {
|
|
jsonproperties=jsonproperties.getJSONObject("mainEntity");
|
|
}
|
|
if (jsondata.length()>0 && jsondata.has("properties")) {
|
|
jsondata.put("additional_properties", jsonproperties);
|
|
}
|
|
else {
|
|
jsondata.put("properties", jsonproperties);
|
|
}
|
|
}
|
|
}
|
|
//get the citation string
|
|
|
|
Element citationid = doc.getElementById("invenio-csl");
|
|
//Elements citationclass = doc.getElementsByAttributeValue("class", "citation-select");
|
|
Element citationclass = doc.select("span.citation-select").first();
|
|
String citationStr="";
|
|
if (citationid!=null) {
|
|
Elements cits= citationid.getElementsByTag("invenio-csl");
|
|
|
|
Element cit=cits.first();
|
|
|
|
citationStr=cit.attr("ng-init");
|
|
}else {
|
|
if (citationclass!=null) {
|
|
citationStr= citationclass.text();
|
|
}
|
|
}
|
|
if (!citationStr.trim().isEmpty()) {
|
|
jsondata.put("citation string", citationStr.trim());
|
|
}
|
|
else
|
|
if (pid.contains("doi.org/10") && jsondata.length()>0){
|
|
URL myURL = new URL(pid);
|
|
HttpURLConnection csURLConnection = (HttpURLConnection)myURL.openConnection();
|
|
csURLConnection.setRequestProperty("Accept", "text/x-bibliography");
|
|
csURLConnection.setConnectTimeout(18000); //set timeout to 18 seconds
|
|
InputStream mis = csURLConnection.getInputStream();
|
|
BufferedReader rd = new BufferedReader(new InputStreamReader(mis, Charset.forName("UTF-8")));
|
|
StringBuilder sb = new StringBuilder();
|
|
int cp;
|
|
while ((cp = rd.read()) != -1) {
|
|
sb.append((char) cp);
|
|
}
|
|
String jsonText = sb.toString();
|
|
if (!jsonText.trim().isEmpty()) {
|
|
jsondata.put("citation string", jsonText.trim());
|
|
}
|
|
}
|
|
|
|
}
|
|
catch (Exception e) {
|
|
|
|
//e.printStackTrace();
|
|
e.printStackTrace(pw);
|
|
String sStackTrace = sw.toString(); // stack trace as a string
|
|
pw.flush();
|
|
if (sStackTrace.length()>1500)
|
|
System.out.println(sStackTrace.substring(0, 1499));
|
|
System.out.println ("("+pid+") not available");
|
|
}
|
|
|
|
return jsondata;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public JSONObject getLinkMetadata(String pid) {
|
|
|
|
JSONObject jsondata=new JSONObject();
|
|
|
|
JSONObject jsonproperties=new JSONObject();
|
|
try {
|
|
//Document doc = Jsoup.connect(pid).timeout(15 * 1000).get();
|
|
Document doc = SSLHelper.getConnection(pid).timeout(15 * 1000).get();
|
|
Elements links = doc.getElementsByTag("link");
|
|
for (Element link: links) {//get metadata from <link>
|
|
String rel = link.attr("rel");
|
|
String href= link.attr("href");
|
|
if (!rel.trim().isEmpty() &&
|
|
rel.trim().equalsIgnoreCase("metadata")){
|
|
|
|
jsonproperties.put(rel, href);
|
|
}
|
|
|
|
}
|
|
if (jsonproperties.length()>0)
|
|
jsondata.put("properties", jsonproperties);
|
|
System.out.println(jsondata);
|
|
|
|
}
|
|
catch (Exception e) {
|
|
|
|
e.printStackTrace(pw);
|
|
String sStackTrace = sw.toString(); // stack trace as a string
|
|
if (sStackTrace.length()>200)
|
|
System.out.println(sStackTrace.substring(0, 199));
|
|
pw.flush();
|
|
System.out.println ("&&&&&&&&&&&&&&&&&&&&&&&&& ("+pid+") link metadata not available");
|
|
}
|
|
|
|
return jsondata;
|
|
|
|
}
|
|
public JSONObject getMetaMetadata(String pid){
|
|
JSONObject jsondata=new JSONObject();
|
|
// JSONArray authors = new JSONArray();
|
|
JSONObject jsonproperties=new JSONObject();
|
|
|
|
HashMap<String, String> authors_affiliation = new HashMap<String, String>();
|
|
try {
|
|
//Document doc = Jsoup.connect(pid).timeout(10 * 1000).get();
|
|
Document doc = SSLHelper.getConnection(pid).timeout(15 * 1000).get();
|
|
//String title = doc.title();
|
|
Elements cmdheader=doc.getElementsByTag("cmd:Header");
|
|
if (cmdheader!=null && cmdheader.size()>0) {
|
|
JSONObject talarproperties=new JSONObject();
|
|
talarproperties=getTalarData(doc);
|
|
if (talarproperties!=null)
|
|
jsondata.put("properties", talarproperties);
|
|
return jsondata;
|
|
}
|
|
|
|
|
|
Elements metas = doc.getElementsByTag("meta");
|
|
for (Element meta: metas) {//get metadata from <meta>
|
|
String name = meta.attr("name");
|
|
String property = meta.attr("property");
|
|
String content = meta.attr("content");
|
|
if (!name.trim().isEmpty() &&
|
|
!content.trim().isEmpty() &&
|
|
(name.toLowerCase().trim().startsWith("dc.") ||
|
|
name.trim().startsWith("citation_")) ||
|
|
name.toLowerCase().trim().startsWith("eprints.")){
|
|
if (jsonproperties.has(name)) {
|
|
String names= jsonproperties.get(name).toString();
|
|
content=names+", "+content;
|
|
}
|
|
|
|
if (name.equalsIgnoreCase("eprints.citation")) {
|
|
jsondata.put("citation string", content);
|
|
}
|
|
else
|
|
jsonproperties.put(name, content);
|
|
}
|
|
if (name.trim().contains("citation_author")) {//to be used to double check authors
|
|
authors_affiliation.put(content, "na");
|
|
}
|
|
if (!property.trim().isEmpty() && !content.trim().isEmpty())
|
|
jsonproperties.put(property.trim(), content.trim());
|
|
|
|
}
|
|
//get metadata about authors and affiliation from <div vocab="http://schema.org/"> section
|
|
|
|
Element record = doc.select("div[vocab]").first();
|
|
//search for vocab
|
|
if (record!=null) {
|
|
Elements items = record.select("span[property]"); // span with property attribute
|
|
|
|
for (Element item: items) {
|
|
String name = item.attr("property");
|
|
String val=item.attr("value");
|
|
if (!name.trim().isEmpty() && !val.trim().isEmpty()) {
|
|
jsonproperties.put(name, val);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
//<div class="citation-popup" data-style-name="harvard" title="Harvard Citation" style="display:none;">
|
|
Element citationstring= doc.select("div[data-style-name]").first();
|
|
if (citationstring!=null) {
|
|
jsondata.put("citation string", citationstring.text().trim());
|
|
}
|
|
|
|
//check if metadata is in the html elements europeana style
|
|
|
|
Elements euRecord = doc.select("div[data-field-name]");
|
|
for (Element divmeta: euRecord) {
|
|
String mdname = divmeta.attr("data-field-name");
|
|
if (mdname != "") {
|
|
jsonproperties.put(mdname, divmeta.text().trim());
|
|
}
|
|
|
|
|
|
}
|
|
|
|
if (jsonproperties!=null && jsonproperties.length()>0)
|
|
jsondata.put("properties", jsonproperties);
|
|
|
|
if (jsondata==null || jsondata.length()==0) {
|
|
jsondata=getZenodoData(doc);
|
|
}
|
|
}
|
|
catch (Exception e) {
|
|
|
|
e.printStackTrace(pw);
|
|
String sStackTrace = sw.toString(); // stack trace as a string
|
|
if (sStackTrace.length()>200)
|
|
System.out.println(sStackTrace.substring(0, 199));
|
|
pw.flush();
|
|
System.out.println ("^^^^^^^^^^^^^^^^^^^^^ ("+pid+") meta metadata not available");
|
|
}
|
|
|
|
return jsondata;
|
|
}
|
|
public JSONObject getZenodoMetadata(String pid){
|
|
JSONObject jsondata=new JSONObject();
|
|
JSONArray authors = new JSONArray();
|
|
|
|
HashMap<String, String> authors_affiliation = new HashMap<String, String>();
|
|
try {
|
|
Document doc = Jsoup.connect(pid).get();
|
|
//String title = doc.title();
|
|
Elements metas = doc.getElementsByTag("meta");
|
|
for (Element meta: metas) {//get metadata from <meta>
|
|
String name = meta.attr("name");
|
|
String property = meta.attr("property");
|
|
String content = meta.attr("content");
|
|
if (!name.trim().isEmpty() &&
|
|
!content.trim().isEmpty() &&
|
|
!name.trim().contains("-site-verification") &&
|
|
!name.trim().contains("citation_author")) {
|
|
|
|
jsondata.put(name, content);
|
|
}
|
|
if (name.trim().contains("citation_author")) {//to be used to double check authors
|
|
authors_affiliation.put(content, "na");
|
|
}
|
|
if (!property.trim().isEmpty() && !content.trim().isEmpty())
|
|
jsondata.put(property.trim(), content.trim());
|
|
|
|
}
|
|
|
|
//get metadata about authors and affiliation from <div class="container record-detail"> section
|
|
|
|
Element recordDetail = doc.select("div.container.record-detail").first();
|
|
Elements affiliations = recordDetail.select("span[title]"); // span with title attribute
|
|
for (Element affiliation: affiliations) {
|
|
String organisation = affiliation.attr("title");
|
|
String auth=affiliation.text();
|
|
if (!organisation.trim().isEmpty() && !auth.trim().isEmpty()) {
|
|
JSONObject org = new JSONObject();
|
|
org.put("organisation", organisation);
|
|
org.put("author_name", auth);
|
|
authors.put(org);
|
|
}
|
|
}
|
|
|
|
jsondata.put("authors", authors);
|
|
|
|
|
|
|
|
Element head = doc.select("head").first();
|
|
Elements links= head.select("link[type]");
|
|
for (Element link:links) {
|
|
String rel=link.attr("rel");
|
|
String type= link.attr("type");
|
|
String href= link.attr("href");
|
|
if (rel.trim().equalsIgnoreCase("alternate")) {
|
|
JSONObject alt = new JSONObject();
|
|
alt.put("type", type);
|
|
alt.put("href", href);
|
|
jsondata.put("alternate", alt);
|
|
}
|
|
|
|
}
|
|
|
|
//get the citation string
|
|
|
|
Element citationid = doc.getElementById("invenio-csl");
|
|
|
|
Elements cits= citationid.getElementsByTag("invenio-csl");
|
|
|
|
Element cit=cits.first();
|
|
|
|
String citationStr=cit.attr("ng-init");
|
|
|
|
if (!citationStr.trim().isEmpty()) {
|
|
jsondata.put("citation string", citationStr.trim());
|
|
}
|
|
|
|
|
|
} catch (Exception e) {
|
|
|
|
e.printStackTrace(pw);
|
|
String sStackTrace = sw.toString(); // stack trace as a string
|
|
if (sStackTrace.length()>200)
|
|
System.out.println(sStackTrace.substring(0, 199));
|
|
pw.flush();
|
|
System.out.println ("@@@@@@@@@@@@@@@("+pid+") zenodo metadata not available");
|
|
}
|
|
|
|
return jsondata;
|
|
|
|
}
|
|
private JSONObject getTalarData (Document doc) throws JSONException {
|
|
JSONObject taljsondata=new JSONObject();
|
|
//header
|
|
Elements cmdheader=doc.getElementsByTag("cmd:Header");
|
|
if (cmdheader!=null && cmdheader.size()>0) {
|
|
for (Element header: cmdheader) {
|
|
Element mdcreator=header.getElementsByTag("cmd:MdCreator").first();
|
|
Element creationdate=header.getElementsByTag("cmd:MdCreationDate").first();
|
|
Element MdSelfLink=header.getElementsByTag("cmd:MdSelfLink").first();
|
|
taljsondata.put("cmd:MdCreator", mdcreator.text());
|
|
taljsondata.put("cmd:MdCreationDate", creationdate.text());
|
|
taljsondata.put("cmd:MdSelfLink", MdSelfLink.text());
|
|
taljsondata.put("cmd:MdProfile", header.getElementsByTag("cmd:MdProfile").first().text());
|
|
taljsondata.put("cmd:MdCollectionDisplayName", header.getElementsByTag("cmd:MdCollectionDisplayName").first().text());
|
|
|
|
}
|
|
}
|
|
//cmd:ResourceProxy
|
|
Elements resproxys=doc.getElementsByTag("cmd:ResourceProxy");
|
|
JSONArray proxyarray = new JSONArray();
|
|
for (Element proxy: resproxys) {
|
|
|
|
JSONObject jsonproxy=new JSONObject();
|
|
Element restype=proxy.getElementsByTag("cmd:ResourceType").first();
|
|
Element resref=proxy.getElementsByTag("cmd:Resourceref").first();
|
|
if (restype!=null) {
|
|
String mimetype=restype.attr("mimetype");
|
|
String rtype=restype.text();
|
|
jsonproxy.put("resource", rtype);
|
|
jsonproxy.put("type", mimetype);
|
|
}
|
|
if (resref!=null) {
|
|
String resid=resref.text();
|
|
jsonproxy.put("reference", resid);
|
|
}
|
|
|
|
//search for cmdp:TypeSpecificSizeInfo and cmdp:ResourceProxyInfo
|
|
|
|
String proxyid = proxy.attr("id");
|
|
Elements pinfo=doc.getElementsByAttributeValue("cmd:ref", proxyid);
|
|
JSONObject jsonproxyinfo=new JSONObject();
|
|
|
|
for (Element info:pinfo) {
|
|
String tname=info.tagName();
|
|
|
|
Elements infochildren=info.children();
|
|
JSONArray proxinfoyarray = new JSONArray();
|
|
for (Element infochild:infochildren) {
|
|
//System.out.println("tttname "+infochild.tagName());
|
|
JSONObject jsonproxyinfochildren=new JSONObject();
|
|
jsonproxyinfochildren.put(infochild.tagName(), infochild.text());
|
|
proxinfoyarray.put(jsonproxyinfochildren);
|
|
|
|
}
|
|
jsonproxyinfo.put(tname, proxinfoyarray);
|
|
|
|
|
|
}
|
|
if (jsonproxyinfo!=null) {
|
|
jsonproxy.put("info", jsonproxyinfo);
|
|
}
|
|
if (jsonproxy !=null)
|
|
proxyarray.put(jsonproxy);
|
|
|
|
}
|
|
if (proxyarray!=null) {
|
|
taljsondata.put("cmd:ResourceProxyList", proxyarray);
|
|
}
|
|
|
|
//cmdp:GeneralInfo
|
|
Elements generalInfo=doc.getElementsByTag("cmdp:GeneralInfo");
|
|
proxyarray=new JSONArray();
|
|
for (Element ginfo: generalInfo) {
|
|
JSONObject jsoninfo=new JSONObject();
|
|
Element location=ginfo.getElementsByTag("cmdp:Location").first();
|
|
if (location!=null) {
|
|
Element address= location.getElementsByTag("cmdp:Address").first();
|
|
Element country= location.getElementsByTag("cmdp:Country").first();
|
|
jsoninfo.put("cmdp:Address", address.text());
|
|
if (country!=null) {
|
|
JSONObject jsoncountry=new JSONObject();
|
|
JSONObject jsoncountryname=new JSONObject();
|
|
Element countryname= country.getElementsByTag("cmdp:CountryName").first();
|
|
String xmllang = countryname.attr("xml:lang");
|
|
jsoncountryname.put("xml:lang", xmllang);
|
|
jsoncountryname.put("cmdp:CountryName", countryname.text());
|
|
String countrycoding=country.getElementsByTag("cmdp:CountryCoding").first().text();
|
|
jsoncountry.put("cmdp:CountryCoding", countrycoding);
|
|
jsoncountry.put("cmdp:Countryname", jsoncountryname);
|
|
jsoninfo.put("cmdp:Country", jsoncountry);
|
|
}
|
|
|
|
|
|
}
|
|
taljsondata.put("cmdp:location", jsoninfo);
|
|
//Tags
|
|
Element tags=ginfo.getElementsByTag("cmdp:Tags").first();
|
|
proxyarray=new JSONArray();
|
|
if (tags!=null) {
|
|
Elements taglist = tags.getElementsByTag("cmdp:tag");
|
|
for (Element tag:taglist){
|
|
String taglang="";
|
|
JSONObject jsontag=new JSONObject();
|
|
taglang = tag.attr("xml:lang");
|
|
jsontag.put("xml:lang", taglang);
|
|
jsontag.put("cmdp:tag", tag.text());
|
|
proxyarray.put(jsontag);
|
|
}
|
|
taljsondata.put("cmdp:Tags", proxyarray);
|
|
}
|
|
|
|
}
|
|
//cmdp:Creators
|
|
|
|
Element creators=doc.getElementsByTag("cmdp:Creators").first();
|
|
proxyarray=new JSONArray();
|
|
if (creators!=null) {
|
|
//System.out.println(creators.getElementsByTag("cmdp:Person").first().text());
|
|
Elements creatorlist = creators.getElementsByTag("cmdp:Person");//cmdp:Person
|
|
for (Element person:creatorlist){
|
|
JSONObject jsonperson=new JSONObject();
|
|
jsonperson.put("cmdp:firstName", person.getElementsByTag("cmdp:firstName").first().text());
|
|
jsonperson.put("cmdp:lastName", person.getElementsByTag("cmdp:lastName").first().text());
|
|
jsonperson.put("cmdp:role", person.getElementsByTag("cmdp:role").first().text());
|
|
proxyarray.put(jsonperson);
|
|
}
|
|
taljsondata.put("cmdp:Creators", proxyarray);
|
|
}
|
|
//cmdp:Descriptions
|
|
Element descriptions=doc.getElementsByTag("cmdp:Descriptions").first();
|
|
proxyarray=new JSONArray();
|
|
if (descriptions!=null) {
|
|
Elements desclist = descriptions.getElementsByTag("cmdp:Description");
|
|
for (Element description:desclist){
|
|
JSONObject jsondesc=new JSONObject();
|
|
jsondesc.put("cmdp:Description", description.getElementsByTag("cmdp:Description").first().text());
|
|
jsondesc.put("xml:lang", description.getElementsByTag("cmdp:Description").first().attr("xml:lang"));
|
|
proxyarray.put(jsondesc);
|
|
}
|
|
taljsondata.put("cmdp:Descriptions", proxyarray);
|
|
}
|
|
//cmdp:ResourceName
|
|
Element resourcename=doc.getElementsByTag("cmdp:ResourceName").first();
|
|
if (resourcename!=null) {
|
|
taljsondata.put("cmdp:ResourceName", resourcename.text());
|
|
}
|
|
//cmdp:ResourceTitle
|
|
Element resourcetitle=doc.getElementsByTag("cmdp:ResourceTitle").first();
|
|
if (resourcetitle!=null) {
|
|
JSONObject jsonrestitle=new JSONObject();
|
|
jsonrestitle.put("cmdp:ResourceName", resourcetitle.text());
|
|
jsonrestitle.put("xml:lang", resourcetitle.attr("xml:lang"));
|
|
taljsondata.put("cmdp:ResourceTitle", jsonrestitle);
|
|
}
|
|
//cmdp:LegalOwner
|
|
|
|
Elements legalowners=doc.getElementsByTag("cmdp:LegalOwner");
|
|
proxyarray=new JSONArray();
|
|
for (Element legalown: legalowners) {
|
|
if (legalown!=null) {
|
|
JSONObject jsonlo=new JSONObject();
|
|
jsonlo.put("cmdp:LegalOwner", legalown.text());
|
|
jsonlo.put("xml:lang", legalown.attr("xml:lang"));
|
|
proxyarray.put(jsonlo);
|
|
}
|
|
|
|
}
|
|
taljsondata.put("cmdp:LegalOwner", proxyarray);
|
|
|
|
//cmdp:TimeCoverage
|
|
Elements timecoves=doc.getElementsByTag("cmdp:TimeCoverage");
|
|
proxyarray=new JSONArray();
|
|
for (Element timecov: timecoves) {
|
|
if (timecov!=null) {
|
|
JSONObject jsonlo=new JSONObject();
|
|
jsonlo.put("cmdp:TimeCoverage", timecov.text());
|
|
jsonlo.put("xml:lang", timecov.attr("xml:lang"));
|
|
proxyarray.put(jsonlo);
|
|
}
|
|
|
|
}
|
|
taljsondata.put("cmdp:TimeCoverage", proxyarray);
|
|
|
|
return taljsondata;
|
|
}
|
|
|
|
private JSONObject getZenodoData(Document doc) throws JSONException{
|
|
JSONObject jsondata=new JSONObject();
|
|
Elements scripts = doc.getElementsByTag("script");
|
|
for (Element script: scripts) {//get metadata from <script>
|
|
String type = script.attr("type");
|
|
if (type!=null && !type.trim().isEmpty() &&
|
|
(type.trim().equalsIgnoreCase("application/ld+json") || type.trim().equalsIgnoreCase("application/json"))) {
|
|
|
|
String jsonStr=script.toString().trim();
|
|
int headerLimit=jsonStr.indexOf(">");
|
|
String tmpStr=jsonStr.trim().substring(headerLimit+1);
|
|
// String jsonStr=script.outerHtml();
|
|
// String tmpStr=jsonStr.replace("<script type=\"application/ld+json\">", "");
|
|
String jsonStrraw1=tmpStr.trim().replace("</script>", "");
|
|
String jsonStrraw = jsonStrraw1.replace("\r\n", "");
|
|
jsonStr = jsonStrraw.replace('\r', ' ');
|
|
//System.out.println(jsonStr);
|
|
//getGraph(jsonStr);
|
|
JSONObject jsonproperties=new JSONObject();
|
|
jsonproperties=new JSONObject(jsonStr.trim());
|
|
if (jsondata.length()>0 && jsondata.has("properties")) {
|
|
jsondata.put("additional_properties", jsonproperties);
|
|
}
|
|
else {
|
|
jsondata.put("properties", jsonproperties);
|
|
}
|
|
}
|
|
}
|
|
//get the citation string
|
|
|
|
Element citationid = doc.getElementById("invenio-csl");
|
|
//Elements citationclass = doc.getElementsByAttributeValue("class", "citation-select");
|
|
Element citationclass = doc.select("span.citation-select").first();
|
|
String citationStr="";
|
|
if (citationid!=null) {
|
|
Elements cits= citationid.getElementsByTag("invenio-csl");
|
|
|
|
Element cit=cits.first();
|
|
|
|
citationStr=cit.attr("ng-init");
|
|
}else {
|
|
if (citationclass!=null) {
|
|
citationStr= citationclass.text();
|
|
}
|
|
}
|
|
if (!citationStr.trim().isEmpty()) {
|
|
jsondata.put("citation string", citationStr.trim());
|
|
}
|
|
return jsondata;
|
|
|
|
}
|
|
|
|
private JSONObject getGraph(String jsondata) {
|
|
JsonParser parser = Json.createParser(new StringReader(jsondata));
|
|
JSONArray jo = new JSONArray();
|
|
JSONArray links = new JSONArray();
|
|
JSONObject graph= new JSONObject();
|
|
String id="";
|
|
Stack<String> source = new Stack<>();
|
|
String target="";
|
|
boolean isarray=false;
|
|
int group=0;
|
|
int count=0;
|
|
try {
|
|
source.push("root");
|
|
while (parser.hasNext()) {
|
|
JsonParser.Event event = parser.next();
|
|
JSONObject joitem = new JSONObject();
|
|
JSONObject link = new JSONObject();
|
|
//target="";
|
|
switch(event) {
|
|
case START_ARRAY:
|
|
System.out.println("sa " + event.toString() +" - "+id+" "+group);
|
|
|
|
joitem.put("id", id+"_"+count);
|
|
joitem.put("name", id);
|
|
joitem.put("group", group);
|
|
joitem.put("value", id);
|
|
|
|
link.put("source", source.peek());
|
|
link.put("target", id+"_"+count);
|
|
link.put("value", 15);
|
|
links.put(link);
|
|
target="";
|
|
|
|
source.push(id+"_"+count);
|
|
group=group+1;
|
|
//id="";
|
|
isarray=true;
|
|
break;
|
|
case END_ARRAY:
|
|
group=group-1;
|
|
//source.push(id+"_"+count);
|
|
//isarray=false;
|
|
break;
|
|
case START_OBJECT:
|
|
System.out.println("so " + event.toString() +" - "+id+" "+group);
|
|
if (!isarray && !id.trim().isEmpty()) {
|
|
joitem.put("id", id+"_"+count);
|
|
joitem.put("name", id);
|
|
joitem.put("group", group);
|
|
joitem.put("value", id);
|
|
|
|
link.put("source", source.peek());
|
|
link.put("target", id+"_"+count);
|
|
link.put("value", 10);
|
|
links.put(link);
|
|
target="";
|
|
source.push(id+"_"+count);
|
|
group=group+1;
|
|
}
|
|
else {
|
|
if (!isarray && id.trim().isEmpty())
|
|
group=group+1;
|
|
else {
|
|
isarray=false;
|
|
source.push(source.peek());
|
|
}
|
|
}
|
|
|
|
|
|
break;
|
|
case END_OBJECT:
|
|
System.out.println("eo " + event.toString() +" - "+id+" "+group);
|
|
group=group-1;
|
|
|
|
if (!source.empty())
|
|
source.pop();
|
|
target="";
|
|
id="";
|
|
break;
|
|
case VALUE_FALSE:
|
|
case VALUE_NULL:
|
|
case VALUE_TRUE:
|
|
System.out.println("true " + event.toString());
|
|
break;
|
|
case KEY_NAME:
|
|
System.out.print("name " +event.toString() + " " +
|
|
parser.getString() + " - ");
|
|
|
|
id=parser.getString();
|
|
break;
|
|
case VALUE_STRING:
|
|
System.out.println("string " + event.toString() + " " +
|
|
parser.getString() +" "+group);
|
|
joitem.put("id", id+"_"+count);
|
|
joitem.put("name", id);
|
|
joitem.put("group", group);
|
|
joitem.put("value", parser.getString());
|
|
//id="";
|
|
target=id+"_"+count;
|
|
break;
|
|
case VALUE_NUMBER:
|
|
System.out.println("number " + event.toString() + " " +
|
|
parser.getString());
|
|
joitem.put("id", id+"_"+count);
|
|
joitem.put("name", id);
|
|
joitem.put("group", group);
|
|
joitem.put("value", parser.getString());
|
|
target=id+"_"+count;
|
|
break;
|
|
}
|
|
if(joitem.length()>0) {
|
|
jo.put(joitem);
|
|
count++;
|
|
if (!target.trim().isEmpty()) {
|
|
link.put("source", source.peek());
|
|
link.put("target", target);
|
|
link.put("value", 20);
|
|
links.put(link);
|
|
}
|
|
}
|
|
}
|
|
JSONObject temp= new JSONObject();
|
|
temp.put("id", "root");
|
|
temp.put("name", "root");
|
|
temp.put("group", 0);
|
|
temp.put("value", "root");
|
|
jo.put(temp);
|
|
|
|
graph.put("nodes", jo);
|
|
graph.put("links", links);
|
|
|
|
}
|
|
catch(Exception e) {
|
|
e.printStackTrace();
|
|
}
|
|
System.out.println(graph.toString());
|
|
// System.out.println(links.toString());
|
|
try (FileWriter file = new FileWriter("mygraphtest.json")) {
|
|
|
|
file.write(graph.toString());
|
|
file.flush();
|
|
|
|
} catch (IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
|
|
return null;
|
|
}
|
|
|
|
}
|