diff --git a/pom.xml b/pom.xml index c2913ce..018e3a7 100644 --- a/pom.xml +++ b/pom.xml @@ -21,11 +21,6 @@ - - com.rabbitmq - rabbitmq-client - 1.3.0 - junit junit @@ -174,7 +169,12 @@ curator-recipes 4.0.1 - + + + org.jsoup + jsoup + 1.11.3 + diff --git a/src/main/java/eu/sshoc/citation/service/services/CitationHarvester.java b/src/main/java/eu/sshoc/citation/service/services/CitationHarvester.java index 3628b16..b468041 100644 --- a/src/main/java/eu/sshoc/citation/service/services/CitationHarvester.java +++ b/src/main/java/eu/sshoc/citation/service/services/CitationHarvester.java @@ -75,7 +75,7 @@ public class CitationHarvester { return wfc.getCitationMetadata(pid, token).toString(); } - @ApiOperation(value = "Returns formatted citation using via content negotiated request", + @ApiOperation(value = "Returns formatted citation using content negotiated request", notes = "A client with a valid identifier can invoke this web service to obtain a formatted citation, the text/bibliography content type is used ", response = String.class) @RequestMapping(value="/citharvester/getformcit", method=RequestMethod.GET, produces = MediaType.APPLICATION_JSON_VALUE) @@ -84,4 +84,14 @@ public class CitationHarvester { return wfc.getCitationCSL(pid, token).toString(); } + + /** Serves GET /citharvester/getmetadatahtml: hands the pid and token request parameters to wfc.getCitationMetadataFromHTML and returns the toString() of its result as the response body. */ + @ApiOperation(value = "Returns a metadata of a citation parsing the HTML landing page", + notes = "A client with a valid identifier can invoke this web service to obtain metadata of a citation by parsing its HTML landing page", + response = String.class) + @RequestMapping(value="/citharvester/getmetadatahtml", method=RequestMethod.GET, produces = MediaType.APPLICATION_JSON_VALUE) + public String getCitationMetadataFromHTML(@RequestParam(value="pid") String pid, @RequestParam(value="token") String token) { + + return wfc.getCitationMetadataFromHTML(pid, token).toString(); + } + + } diff --git a/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java 
b/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java index c7f3083..9f12fb8 100644 --- a/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java +++ b/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java @@ -17,6 +17,12 @@ import javax.net.ssl.HttpsURLConnection; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import eu.sshoc.citation.service.wfconfigurator.util.HeuristicParsers; @@ -129,5 +135,9 @@ public class CitationHarvesterImpl { return jsondata; } + /** Creates a new HeuristicParsers and returns the result of its getZenodoMetadata(pid); the token argument is accepted but not used by this method. */ + public JSONObject getCitationMetadataFromHTML(String pid, String token){ + HeuristicParsers heup= new HeuristicParsers(); + return heup.getZenodoMetadata(pid); + } } diff --git a/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java b/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java new file mode 100644 index 0000000..6b9e41c --- /dev/null +++ b/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java @@ -0,0 +1,99 @@ +/******************************************************************************* + * Copyright (c) 2020 VRE4EIC Consortium + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/
+package eu.sshoc.citation.service.wfconfigurator.util;
+
+import java.util.HashMap;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import org.json.JSONArray;
+import org.json.JSONException;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * Heuristic scrapers that derive citation metadata from the HTML of a landing page.
+ *
+ * <p>The CSS selectors used below ({@code div.container.record-detail}, {@code span[title]})
+ * are presumably tailored to the markup of Zenodo record pages, as the method name suggests;
+ * pages with a different layout simply yield fewer keys.
+ */
+public class HeuristicParsers {
+
+    private static final Logger LOG = Logger.getLogger(HeuristicParsers.class.getName());
+
+    /**
+     * Downloads the page at {@code pid} and collects metadata from it on a best-effort basis.
+     *
+     * <p>The returned object may contain:
+     * <ul>
+     *   <li>one entry per {@code <meta name=... content=...>} tag, except names containing
+     *       {@code -site-verification} or {@code citation_author};</li>
+     *   <li>one entry per {@code <meta property=... content=...>} tag;</li>
+     *   <li>{@code "authors"}: an array of {@code {organisation, author_name}} objects (empty
+     *       when the record-detail container or matching spans are missing);</li>
+     *   <li>{@code "alternate"}: {@code {type, href}} of the last
+     *       {@code <link rel="alternate" type=...>} found in the document head.</li>
+     * </ul>
+     *
+     * <p>NOTE(review): {@code pid} is fetched exactly as given. If it originates from an
+     * untrusted request parameter, the caller should restrict the hosts that may be contacted.
+     *
+     * @param pid absolute http(s) URL of the landing page to parse
+     * @return the collected metadata; never {@code null}. When {@code pid} is null/blank or the
+     *         page cannot be fetched or parsed, whatever was gathered so far (possibly an empty
+     *         object) is returned and the failure is logged rather than thrown.
+     */
+    public JSONObject getZenodoMetadata(String pid) {
+        JSONObject jsondata = new JSONObject();
+        if (pid == null || pid.trim().isEmpty()) {
+            return jsondata;
+        }
+        try {
+            Document doc = Jsoup.connect(pid).get();
+            collectMetaTags(doc, jsondata);
+            jsondata.put("authors", collectAuthors(doc));
+            collectAlternateLink(doc, jsondata);
+        } catch (Exception e) {
+            // Deliberately broad: callers rely on this method never throwing, so any fetch,
+            // parse or JSON failure degrades to a partial result instead of an error.
+            LOG.log(Level.WARNING, "Could not harvest metadata from landing page " + pid, e);
+        }
+        return jsondata;
+    }
+
+    /** Copies name/content and property/content pairs of every meta tag into {@code target}. */
+    private static void collectMetaTags(Document doc, JSONObject target) throws JSONException {
+        Elements metas = doc.getElementsByTag("meta");
+        for (Element meta : metas) {
+            String name = meta.attr("name");
+            String property = meta.attr("property");
+            String content = meta.attr("content");
+            boolean hasContent = !content.trim().isEmpty();
+            String trimmedName = name.trim();
+
+            // citation_author* tags are skipped here; author data is taken from the page body.
+            if (hasContent
+                    && !trimmedName.isEmpty()
+                    && !trimmedName.contains("-site-verification")
+                    && !trimmedName.contains("citation_author")) {
+                target.put(name, content);
+            }
+            if (hasContent && !property.trim().isEmpty()) {
+                target.put(property.trim(), content.trim());
+            }
+        }
+    }
+
+    /**
+     * Builds the author list from the spans carrying a title attribute inside the first
+     * record-detail container: the title is used as organisation, the text as author name.
+     */
+    private static JSONArray collectAuthors(Document doc) throws JSONException {
+        JSONArray authors = new JSONArray();
+        Element recordDetail = doc.select("div.container.record-detail").first();
+        if (recordDetail == null) {
+            // first() yields null when nothing matched; report "no authors" instead of failing.
+            return authors;
+        }
+        for (Element span : recordDetail.select("span[title]")) {
+            String organisation = span.attr("title");
+            String authorName = span.text();
+            if (!organisation.trim().isEmpty() && !authorName.trim().isEmpty()) {
+                JSONObject entry = new JSONObject();
+                entry.put("organisation", organisation);
+                entry.put("author_name", authorName);
+                authors.put(entry);
+            }
+        }
+        return authors;
+    }
+
+    /**
+     * Stores type and href of alternate links from the document head under "alternate".
+     * Each match overwrites the previous one, so only the last alternate link is kept.
+     */
+    private static void collectAlternateLink(Document doc, JSONObject target) throws JSONException {
+        Element head = doc.select("head").first();
+        if (head == null) {
+            return;
+        }
+        for (Element link : head.select("link[type]")) {
+            if ("alternate".equalsIgnoreCase(link.attr("rel").trim())) {
+                JSONObject alt = new JSONObject();
+                alt.put("type", link.attr("type"));
+                alt.put("href", link.attr("href"));
+                target.put("alternate", alt);
+            }
+        }
+    }
+
+}