new class added, landing pages metadata extractor being implemented

This commit is contained in:
Cesare Concordia 2020-10-27 11:15:48 +01:00
parent 32f3301dd1
commit 77c5fc1515
4 changed files with 126 additions and 7 deletions

12
pom.xml
View File

@ -21,11 +21,6 @@
<dependency>
<groupId>com.rabbitmq</groupId>
<artifactId>rabbitmq-client</artifactId>
<version>1.3.0</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
@ -174,7 +169,12 @@
<artifactId>curator-recipes</artifactId>
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
</dependencies>
<repositories>
<repository>

View File

@ -75,7 +75,7 @@ public class CitationHarvester {
return wfc.getCitationMetadata(pid, token).toString();
}
@ApiOperation(value = "Returns formatted citation using via content negotiated request",
@ApiOperation(value = "Returns formatted citation using content negotiated request",
notes = "A client with a valid identifier can invoke this web service to obtain a formatted citation, the text/bibliography content type is used ",
response = String.class)
@RequestMapping(value="/citharvester/getformcit", method=RequestMethod.GET, produces = MediaType.APPLICATION_JSON_VALUE)
@ -84,4 +84,14 @@ public class CitationHarvester {
return wfc.getCitationCSL(pid, token).toString();
}
/**
 * GET endpoint that returns citation metadata obtained by parsing an HTML landing page.
 * The work is delegated to {@code wfc.getCitationMetadataFromHTML}; this method only
 * converts the delegate's result to its string form.
 *
 * @param pid   value of the "pid" request parameter, forwarded unchanged to the delegate
 * @param token value of the "token" request parameter, forwarded unchanged to the delegate
 * @return the delegate's result converted with {@code toString()}
 */
@ApiOperation(
        value = "Returns a metadata of a citation parsing the HTML landing page",
        notes = "A client with a valid identifier can invoke this web service to obtain metadata of a citation by parsing its HTML landing page",
        response = String.class)
@RequestMapping(
        value = "/citharvester/getmetadatahtml",
        method = RequestMethod.GET,
        produces = MediaType.APPLICATION_JSON_VALUE)
public String getCitationMetadataFromHTML(
        @RequestParam(value = "pid") String pid,
        @RequestParam(value = "token") String token) {
    final Object metadata = wfc.getCitationMetadataFromHTML(pid, token);
    return metadata.toString();
}
}

View File

@ -17,6 +17,12 @@ import javax.net.ssl.HttpsURLConnection;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import eu.sshoc.citation.service.wfconfigurator.util.HeuristicParsers;
@ -129,5 +135,9 @@ public class CitationHarvesterImpl {
return jsondata;
}
/**
 * Extracts citation metadata from the HTML landing page identified by {@code pid}.
 * The extraction is handed to a freshly created {@link HeuristicParsers} instance.
 *
 * @param pid   passed unchanged to {@code HeuristicParsers.getZenodoMetadata}
 * @param token accepted for signature compatibility; not used by this method
 * @return the JSON object produced by {@code HeuristicParsers.getZenodoMetadata}
 */
public JSONObject getCitationMetadataFromHTML(String pid, String token) {
    return new HeuristicParsers().getZenodoMetadata(pid);
}
}

View File

@ -0,0 +1,99 @@
/*******************************************************************************
* Copyright (c) 2020 VRE4EIC Consortium
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package eu.sshoc.citation.service.wfconfigurator.util;
import java.util.HashMap;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Heuristic extractors that derive citation metadata from the markup of an HTML
 * landing page rather than from a structured API.
 */
public class HeuristicParsers {

    /**
     * Fetches the page at {@code pid} and collects metadata from three places:
     * <ol>
     *   <li>{@code <meta>} tags: every non-empty {@code name}/{@code content} pair
     *       (except site-verification and {@code citation_author*} entries) and every
     *       non-empty {@code property}/{@code content} pair;</li>
     *   <li>{@code span[title]} elements inside {@code div.container.record-detail},
     *       reported under the key {@code "authors"} as an array of
     *       {@code {"organisation", "author_name"}} objects;</li>
     *   <li>{@code <link type=... rel="alternate">} elements in {@code <head>}, reported
     *       under the key {@code "alternate"} as {@code {"type", "href"}}. If several
     *       such links exist, the last one wins.</li>
     * </ol>
     *
     * <p>Extraction is best effort: any exception (network failure, malformed URL, ...)
     * is printed and whatever was collected up to that point is returned. A page that
     * lacks the record-detail container yields an empty {@code "authors"} array and
     * does not prevent the alternate-link lookup from running.
     *
     * @param pid URL of the landing page to fetch; handed to {@code Jsoup.connect} as-is
     * @return a JSON object with the collected metadata; never {@code null}, possibly empty
     */
    public JSONObject getZenodoMetadata(String pid) {
        JSONObject jsondata = new JSONObject();
        try {
            // NOTE(review): pid is fetched without any scheme/host validation. If callers
            // pass user-supplied values, consider restricting the allowed targets.
            Document doc = Jsoup.connect(pid).get();
            collectMetaTags(doc, jsondata);
            jsondata.put("authors", collectAuthors(doc));
            collectAlternateLink(doc, jsondata);
        } catch (Exception e) {
            // Best effort: report the problem and return what has been gathered so far.
            e.printStackTrace();
        }
        return jsondata;
    }

    /** Copies name/content and property/content pairs of all meta tags into target. */
    private static void collectMetaTags(Document doc, JSONObject target) {
        for (Element meta : doc.getElementsByTag("meta")) {
            String name = meta.attr("name");
            String property = meta.attr("property");
            String content = meta.attr("content");

            String trimmedName = name.trim();
            String trimmedProperty = property.trim();
            String trimmedContent = content.trim();
            if (trimmedContent.isEmpty()) {
                // Both kinds of entry require non-empty content.
                continue;
            }

            // citation_author* entries are deliberately left out here: author data is
            // taken from the record-detail section instead.
            if (!trimmedName.isEmpty()
                    && !trimmedName.contains("-site-verification")
                    && !trimmedName.contains("citation_author")) {
                // Key and value are stored untrimmed, as callers have seen them so far.
                target.put(name, content);
            }
            if (!trimmedProperty.isEmpty()) {
                target.put(trimmedProperty, trimmedContent);
            }
        }
    }

    /**
     * Builds the authors array from span[title] elements of the record-detail container.
     * Returns an empty array when the container is not present on the page.
     */
    private static JSONArray collectAuthors(Document doc) {
        JSONArray authors = new JSONArray();
        // first() yields null when nothing matches; guard instead of relying on the
        // caller's catch-all, which would also abort the remaining extraction steps.
        Element recordDetail = doc.select("div.container.record-detail").first();
        if (recordDetail == null) {
            return authors;
        }
        for (Element affiliation : recordDetail.select("span[title]")) {
            String organisation = affiliation.attr("title");
            String auth = affiliation.text();
            if (!organisation.trim().isEmpty() && !auth.trim().isEmpty()) {
                JSONObject org = new JSONObject();
                org.put("organisation", organisation);
                org.put("author_name", auth);
                authors.put(org);
            }
        }
        return authors;
    }

    /** Stores type and href of rel="alternate" typed links from head under "alternate". */
    private static void collectAlternateLink(Document doc, JSONObject target) {
        Element head = doc.select("head").first();
        if (head == null) {
            return;
        }
        for (Element link : head.select("link[type]")) {
            if (link.attr("rel").trim().equalsIgnoreCase("alternate")) {
                JSONObject alt = new JSONObject();
                alt.put("type", link.attr("type"));
                alt.put("href", link.attr("href"));
                // Same key on every match: a later alternate link replaces an earlier one.
                target.put("alternate", alt);
            }
        }
    }
}