New class added; landing-page metadata extractor is being implemented
This commit is contained in:
parent
32f3301dd1
commit
77c5fc1515
12
pom.xml
12
pom.xml
|
@ -21,11 +21,6 @@
|
|||
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>com.rabbitmq</groupId>
|
||||
<artifactId>rabbitmq-client</artifactId>
|
||||
<version>1.3.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
|
@ -174,7 +169,12 @@
|
|||
<artifactId>curator-recipes</artifactId>
|
||||
<version>4.0.1</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.jsoup</groupId>
|
||||
<artifactId>jsoup</artifactId>
|
||||
<version>1.11.3</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<repositories>
|
||||
<repository>
|
||||
|
|
|
@ -75,7 +75,7 @@ public class CitationHarvester {
|
|||
return wfc.getCitationMetadata(pid, token).toString();
|
||||
}
|
||||
|
||||
@ApiOperation(value = "Returns formatted citation using via content negotiated request",
|
||||
@ApiOperation(value = "Returns formatted citation using content negotiated request",
|
||||
notes = "A client with a valid identifier can invoke this web service to obtain a formatted citation, the text/bibliography content type is used ",
|
||||
response = String.class)
|
||||
@RequestMapping(value="/citharvester/getformcit", method=RequestMethod.GET, produces = MediaType.APPLICATION_JSON_VALUE)
|
||||
|
@ -84,4 +84,14 @@ public class CitationHarvester {
|
|||
return wfc.getCitationCSL(pid, token).toString();
|
||||
}
|
||||
|
||||
|
||||
@ApiOperation(value = "Returns a metadata of a citation parsing the HTML landing page",
|
||||
notes = "A client with a valid identifier can invoke this web service to obtain metadata of a citation by parsing its HTML landing page",
|
||||
response = String.class)
|
||||
@RequestMapping(value="/citharvester/getmetadatahtml", method=RequestMethod.GET, produces = MediaType.APPLICATION_JSON_VALUE)
|
||||
public String getCitationMetadataFromHTML(@RequestParam(value="pid") String pid, @RequestParam(value="token") String token) {
|
||||
|
||||
return wfc.getCitationMetadataFromHTML(pid, token).toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,6 +17,12 @@ import javax.net.ssl.HttpsURLConnection;
|
|||
import org.json.JSONArray;
|
||||
import org.json.JSONException;
|
||||
import org.json.JSONObject;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import eu.sshoc.citation.service.wfconfigurator.util.HeuristicParsers;
|
||||
|
||||
|
||||
|
||||
|
@ -129,5 +135,9 @@ public class CitationHarvesterImpl {
|
|||
|
||||
return jsondata;
|
||||
}
|
||||
public JSONObject getCitationMetadataFromHTML(String pid, String token){
|
||||
HeuristicParsers heup= new HeuristicParsers();
|
||||
return heup.getZenodoMetadata(pid);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,99 @@
|
|||
/*******************************************************************************
|
||||
* Copyright (c) 2020 VRE4EIC Consortium
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*******************************************************************************/
|
||||
package eu.sshoc.citation.service.wfconfigurator.util;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
public class HeuristicParsers {
|
||||
|
||||
public JSONObject getZenodoMetadata(String pid){
|
||||
JSONObject jsondata=new JSONObject();
|
||||
JSONArray authors = new JSONArray();
|
||||
|
||||
HashMap<String, String> authors_affiliation = new HashMap<String, String>();
|
||||
try {
|
||||
Document doc = Jsoup.connect(pid).get();
|
||||
//String title = doc.title();
|
||||
Elements metas = doc.getElementsByTag("meta");
|
||||
for (Element meta: metas) {//get metadata from <meta>
|
||||
String name = meta.attr("name");
|
||||
String property = meta.attr("property");
|
||||
String content = meta.attr("content");
|
||||
if (!name.trim().isEmpty() &&
|
||||
!content.trim().isEmpty() &&
|
||||
!name.trim().contains("-site-verification") &&
|
||||
!name.trim().contains("citation_author")) {
|
||||
|
||||
jsondata.put(name, content);
|
||||
}
|
||||
if (name.trim().contains("citation_author")) {//to be used to double check authors
|
||||
authors_affiliation.put(content, "na");
|
||||
}
|
||||
if (!property.trim().isEmpty() && !content.trim().isEmpty())
|
||||
jsondata.put(property.trim(), content.trim());
|
||||
|
||||
}
|
||||
|
||||
//get metadata about authors and affiliation from <div class="container record-detail"> section
|
||||
|
||||
Element recordDetail = doc.select("div.container.record-detail").first();
|
||||
Elements affiliations = recordDetail.select("span[title]"); // span with title attribute
|
||||
for (Element affiliation: affiliations) {
|
||||
String organisation = affiliation.attr("title");
|
||||
String auth=affiliation.text();
|
||||
if (!organisation.trim().isEmpty() && !auth.trim().isEmpty()) {
|
||||
JSONObject org = new JSONObject();
|
||||
org.put("organisation", organisation);
|
||||
org.put("author_name", auth);
|
||||
authors.put(org);
|
||||
}
|
||||
}
|
||||
|
||||
jsondata.put("authors", authors);
|
||||
//search for alternate link
|
||||
|
||||
Element head = doc.select("head").first();
|
||||
Elements links= head.select("link[type]");
|
||||
for (Element link:links) {
|
||||
String rel=link.attr("rel");
|
||||
String type= link.attr("type");
|
||||
String href= link.attr("href");
|
||||
if (rel.trim().equalsIgnoreCase("alternate")) {
|
||||
JSONObject alt = new JSONObject();
|
||||
alt.put("type", type);
|
||||
alt.put("href", href);
|
||||
jsondata.put("alternate", alt);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return jsondata;
|
||||
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue