From e729dfc702948cf8f6e92ee9c0811080cfb2000b Mon Sep 17 00:00:00 2001 From: Cesare Date: Thu, 11 Nov 2021 16:23:41 +0100 Subject: [PATCH] code improved --- .../impl/CitationHarvesterImpl.java | 67 ++-- .../wfconfigurator/util/HeuristicParsers.java | 364 ++++++++++++++++-- 2 files changed, 365 insertions(+), 66 deletions(-) diff --git a/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java b/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java index eb422a8..6db987f 100644 --- a/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java +++ b/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java @@ -216,8 +216,11 @@ public class CitationHarvesterImpl { pid=pid.replace("http://", "https://"); } + if (!pid.startsWith("https")) + pid="https://"+pid.trim(); + //special cases - if (pid.contains("ien.bg.ac.rs")) + if (pid.contains("ien.bg.ac.rs") || pid.contains("eprints.rclis.org") || pid.contains("opengrey.eu")) pid=pid.replace("https://", "http://"); //first landing pages @@ -228,38 +231,47 @@ public class CitationHarvesterImpl { } - if (jsondata==null || jsondata.length()==0) { + /*if (jsondata==null || jsondata.length()==0) { System.out.println("getMetaMetadata ("+pid+")"); jsondata=heup.getMetaMetadata(pid); - if (jsondata!=null && jsondata.has("properties")){ - try { - JSONObject testp=(JSONObject) jsondata.get("properties"); - String citdoi=""; - if (testp.has("citation_doi") && !(pid.contains("doi.org/"))) {//check if there is metadata in DOI RA - - citdoi = testp.getString("citation_doi"); - if (citdoi.trim()!="" && !citdoi.contains("doi.org/")) { - citdoi="https://doi.org/"+citdoi.trim(); - } - if (citdoi.trim()!="") - pid=citdoi; - - } - - } catch (JSONException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - } + }*/ - if (jsondata==null || jsondata.length()==0) { + /*if (jsondata==null || jsondata.length()==0) { System.out.println("getLinkMetadata ("+pid+")"); jsondata= heup.getLinkMetadata(pid); - } + }*/ + if (jsondata!=null && jsondata.has("properties")){ + try { + JSONObject testp=(JSONObject) jsondata.get("properties"); + String citdoi=""; + if (testp.has("citation_doi") && !(pid.contains("doi.org/"))) {//check if there is metadata in DOI RA + + citdoi = testp.getString("citation_doi"); + if (citdoi.trim()!="" && !citdoi.contains("doi.org/")) { + citdoi="https://doi.org/"+citdoi.trim(); + } + + }else { + if ((testp.has("og:url")) && (testp.getString("og:url").trim().startsWith("https://dl.acm.org/doi/abs/"))) { + citdoi=testp.getString("og:url").trim().replace("https://dl.acm.org/doi/abs/", "https://doi.org/"); + } + + } + if (citdoi.trim()!="") + pid=citdoi; + + } catch (JSONException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + //https://link.springer.com/book/ + if (pid.startsWith("https://link.springer.com/book/")) + pid=pid.trim().replace("https://link.springer.com/book/", "https://doi.org/"); //then DOI service providers... try { //System.out.println("jsondata.length() "+((JSONObject) jsondata.get("properties")).length()); @@ -276,8 +288,9 @@ public class CitationHarvesterImpl { e1.printStackTrace(); } - if (pid.contains("doi.org") && (jsondata==null || jsondata.length()==0 || jsondata.isNull("citation string"))) { - System.out.println("getCitationCSL ("+pid+")"); + + if (pid.contains("doi.org") && (jsondata==null || jsondata.length()==0 || jsondata.isNull("citation string")) && !token.contains("testAPI")) { + System.out.println("getCitationCSL ("+pid+") "+ token); JSONObject jsoncsldata=null; jsoncsldata=getCitationCSL(pid, token); if (jsondata==null) diff --git a/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java b/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java index f115195..82a6269 100644 --- a/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java +++ b/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java @@ -54,8 +54,8 @@ public class HeuristicParsers { try { //Check if it is a handle String dnsPid=""; - - + + if (pid.contains(".handle.net/")) { //get the actual repository URL String haid=pid.substring(22); @@ -74,18 +74,18 @@ public class HeuristicParsers { if (tmp.getString("type").contains("URL")) { JSONObject urlob=(JSONObject) tmp.get("data"); dnsPid=urlob.getString("value"); - + } } } - + if (dnsPid.trim()=="") return (jsondata); - + System.out.println("pid "+dnsPid); String apiURL=repo.getRepoApi(dnsPid); - - if (apiURL!="none") { + + if (!apiURL.trim().equalsIgnoreCase("none")) { apiURL=apiURL.replace("viewerPid", pid.substring(23)); System.out.println("apiURL "+apiURL); Document apiDoc = SSLHelper.getConnection(apiURL).ignoreContentType(true) @@ -103,12 +103,12 @@ public class HeuristicParsers { } else metajsondata.put(child.tagName(), child.text()); - + } } - + } - + if (metajsondata!=null && metajsondata.length()>0) { String rname=repo.getRepoName(dnsPid); String rws=repo.getRepoURL(dnsPid); @@ -116,7 +116,7 @@ public class HeuristicParsers { jsonrepodata.put("name", rname); if (rws!="none") jsonrepodata.put("URL", rws); - + } } catch (Exception e) { // TODO Auto-generated catch block @@ -125,7 +125,7 @@ public class HeuristicParsers { if (metajsondata!=null && metajsondata.length()>0) { try { jsondata.put("citation string", "na"); - + jsondata.put("metadata source", jsonrepodata); jsondata.put("properties", metajsondata); } catch (JSONException e) { @@ -141,7 +141,7 @@ public class HeuristicParsers { //Document doc = Jsoup.connect(pid).get(); Document doc = SSLHelper.getConnection(pid).ignoreContentType(true) - .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").referrer("http://www.google.com").timeout(20 * 1000).get(); + .userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").timeout(30 * 1000).get(); Elements cmdheader=doc.getElementsByTag("cmd:Header"); if (cmdheader!=null && cmdheader.size()>0) { JSONObject talarproperties=new JSONObject(); @@ -168,10 +168,12 @@ public class HeuristicParsers { String jsonStrraw1=tmpStr.trim().replace("", ""); String jsonStrraw = jsonStrraw1.replace("\r\n", ""); jsonStr = jsonStrraw.replace('\r', ' '); + jsonStr = jsonStr.replace("/*]]>*/", ""); + jsonStr = jsonStr.replace("/*0) { + jsondata.put("properties", jsonproperties); + } + + //get the citation string Element citationid = doc.getElementById("invenio-csl"); @@ -261,6 +265,26 @@ public class HeuristicParsers { if (!jsonText.trim().isEmpty()) { jsondata.put("citation string", jsonText.trim()); } + } + if (citationStr.trim().isEmpty() && jsonproperties!=null && jsonproperties.has("citation")){ + + jsondata.put("citation string", jsonproperties.getString("citation")); + + } + + if (jsondata==null || jsondata.length()==0) { + System.out.println("private getMetaMetadata ("+pid+")"); + jsondata=getMetaMetadata(doc); + } + + if (jsondata==null || jsondata.length()==0) { + System.out.println("private getLinkMetadata ("+pid+")"); + jsondata=getLinkMetadata(doc); + } + + if (jsondata==null || jsondata.length()==0) { + System.out.println("private getAttributeMetadata ("+pid+")"); + jsondata=getAttributeMetadata(doc); } } @@ -275,12 +299,101 @@ public class HeuristicParsers { System.out.println ("("+pid+") not available"); } + return jsondata; } - - - + + + private JSONObject getLinkMetadata(Document doc) { + + + JSONObject jsondata=new JSONObject(); + + JSONObject jsonproperties=new JSONObject(); + try { + + Elements links = doc.getElementsByTag("link"); + for (Element link: links) {//get metadata from + String rel = link.attr("rel"); + String href= link.attr("href"); + if (!rel.trim().isEmpty() && + rel.trim().equalsIgnoreCase("metadata")){ + + jsonproperties.put(rel, href); + } + + } + if (jsonproperties.length()>0) + jsondata.put("properties", jsonproperties); + else { + Elements xdts=doc.getElementsByAttributeValueContaining("xmlns:dcterms", "http://purl.org/dc/terms/"); + for (Element xdt : xdts) { + Elements dd=xdt.getElementsByTag("dd"); + Elements dt=xdt.getElementsByTag("dt"); + for (int i=0; i0) + jsondata.put("properties", jsonproperties); + } + if (jsonproperties.length()==0) { + Elements ulxmlns=doc.getElementsByTag("ul"); + for (Element ulxmln: ulxmlns) { + if (ulxmln.hasAttr("xmlns")) { + Elements ctncs=ulxmln.getElementsByAttributeValue("class", "Component_tree_node_content"); + for (Element ctnc: ctncs) { + Element mye= ctnc.parent(); + String ppr=mye.getElementsByTag("code").text().trim(); + String pco=mye.getElementsByTag("sample").text().trim(); + if (!ppr.isEmpty()){ + if (jsonproperties.has(ppr)) { + String vals= jsonproperties.get(ppr).toString(); + pco=vals+", "+pco; + } + jsonproperties.put(ppr, pco); + } + + } + } + + } + if (jsonproperties.length()>0) + jsondata.put("properties", jsonproperties); + } + System.out.println(jsondata); + + } + catch (Exception e) { + + e.printStackTrace(pw); + String sStackTrace = sw.toString(); // stack trace as a string + if (sStackTrace.length()>200) + System.out.println(sStackTrace.substring(0, 199)); + pw.flush(); + System.out.println ("&&&&&&&&&&&&&&&&&&&&&&&&& private link metadata not available"); + } + + return jsondata; + + + + } public JSONObject getLinkMetadata(String pid) { @@ -318,6 +431,179 @@ public class HeuristicParsers { return jsondata; + } + // + private JSONObject getAttributeMetadata(Document doc) { + + + JSONObject jsondata=new JSONObject(); + JSONObject jsonproperties=new JSONObject(); + + HashMap authors_affiliation = new HashMap(); + try { + + Elements dcel=doc.getElementsByAttributeValueStarting("property", "dc"); + + for (Element meta: dcel) {//get metadata from + String content = meta.text(); + String property = meta.attr("property"); + + if (!content.trim().isEmpty()){ + if (jsonproperties.has(property)) { + String names= jsonproperties.get(property).toString(); + content=names+", "+content; + } + + if (property.equalsIgnoreCase("eprints.citation")) { + jsondata.put("citation string", content); + } + else + jsonproperties.put(property, content); + + } + if (property.trim().contains("citation_author")) {//to be used to double check authors + authors_affiliation.put(content, "na"); + } + + if (!property.trim().isEmpty() && !content.trim().isEmpty()) + jsonproperties.put(property.trim(), content.trim()); + + } + //get metadata about authors and affiliation from
section + + + + //