new class added, landing pages metadata extractor being implemented

2020-10-27 11:15:48 +01:00 · 2020-10-27 11:15:48 +01:00 · 77c5fc1515
parent 32f3301dd1
commit 77c5fc1515
4 changed files with 126 additions and 7 deletions
--- a/pom.xml
+++ b/pom.xml
@ -21,11 +21,6 @@
 	
 	
 
-		<dependency>
-			<groupId>com.rabbitmq</groupId>
-			<artifactId>rabbitmq-client</artifactId>
-			<version>1.3.0</version>
-		</dependency>
 		<dependency>
 			<groupId>junit</groupId>
 			<artifactId>junit</artifactId>
@ -174,7 +169,12 @@
            <artifactId>curator-recipes</artifactId>
            <version>4.0.1</version>
        </dependency>
-		 
+
+        <!-- jsoup: HTML parser used by HeuristicParsers to fetch and read landing pages -->
+        <dependency>
+        	<groupId>org.jsoup</groupId>
+        	<artifactId>jsoup</artifactId>
+        	<version>1.11.3</version>
+        </dependency>
 	</dependencies>
 	<repositories>
  <repository>
--- a/src/main/java/eu/sshoc/citation/service/services/CitationHarvester.java
+++ b/src/main/java/eu/sshoc/citation/service/services/CitationHarvester.java
@ -75,7 +75,7 @@ public class CitationHarvester {
 		 return wfc.getCitationMetadata(pid, token).toString();
 	 }
 	
-	 @ApiOperation(value = "Returns formatted citation using via content negotiated request", 
+	 @ApiOperation(value = "Returns formatted citation using content negotiated request", 
 		        notes = "A client with a valid identifier can invoke this web service to obtain a formatted citation, the text/bibliography content type is used ", 
 		        response = String.class)
 	    @RequestMapping(value="/citharvester/getformcit", method=RequestMethod.GET, produces = MediaType.APPLICATION_JSON_VALUE)
@ -84,4 +84,14 @@ public class CitationHarvester {
 		 return wfc.getCitationCSL(pid, token).toString();
 	 }

+	 
+	 /**
+	  * GET endpoint {@code /citharvester/getmetadatahtml}.
+	  * Forwards both request parameters to {@code wfc.getCitationMetadataFromHTML}
+	  * and returns the {@code toString()} of the result as the response body
+	  * (declared as {@code application/json}).
+	  *
+	  * @param pid   mandatory {@code pid} request parameter, forwarded unchanged
+	  * @param token mandatory {@code token} request parameter, forwarded unchanged
+	  * @return string form of the object returned by {@code wfc}
+	  */
+	 @ApiOperation(value = "Returns a metadata of a citation parsing the HTML landing page", 
+		        notes = "A client with a valid identifier can invoke this web service to obtain metadata of a citation by parsing its HTML landing page", 
+		        response = String.class)
+	    @RequestMapping(value="/citharvester/getmetadatahtml", method=RequestMethod.GET, produces = MediaType.APPLICATION_JSON_VALUE)
+	 public String getCitationMetadataFromHTML(@RequestParam(value="pid") String pid,  @RequestParam(value="token") String token) {
+		
+		 // No validation here: a null result from wfc would surface as a NullPointerException.
+		 return wfc.getCitationMetadataFromHTML(pid, token).toString();
+	 }
+	 
 }
--- a/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java
+++ b/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java
@ -17,6 +17,12 @@ import javax.net.ssl.HttpsURLConnection;
 import org.json.JSONArray;
 import org.json.JSONException;
 import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import eu.sshoc.citation.service.wfconfigurator.util.HeuristicParsers;



@ -129,5 +135,9 @@ public class CitationHarvesterImpl {
 	   
 		return jsondata;
 	}
+	/**
+	 * Builds citation metadata by parsing the HTML landing page referenced by {@code pid}.
+	 * The work is delegated to a fresh {@link HeuristicParsers} instance; only the
+	 * Zenodo-oriented parser is used.
+	 *
+	 * @param pid   handed unchanged to {@link HeuristicParsers#getZenodoMetadata(String)}
+	 * @param token not used by this method
+	 * @return the JSON object produced by the parser
+	 */
+	public JSONObject getCitationMetadataFromHTML(String pid, String token){
+		return new HeuristicParsers().getZenodoMetadata(pid);
+	}

 }
--- a/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java
+++ b/src/main/java/eu/sshoc/citation/service/wfconfigurator/util/HeuristicParsers.java
@ -0,0 +1,99 @@
+/*******************************************************************************
+ * Copyright (c) 2020 VRE4EIC Consortium
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+package eu.sshoc.citation.service.wfconfigurator.util;
+
+import java.util.HashMap;
+
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * Heuristic, page-layout dependent extractors that turn an HTML landing page
+ * into a JSON metadata object.
+ */
+public class HeuristicParsers {
+
+	private static final java.util.logging.Logger LOGGER =
+			java.util.logging.Logger.getLogger(HeuristicParsers.class.getName());
+
+	/**
+	 * Downloads the page at {@code pid} and collects metadata from it.
+	 * <ul>
+	 * <li>Every {@code <meta>} with a non-blank {@code name} and {@code content} is stored as
+	 *     {@code name -> content}, except names containing {@code -site-verification} or
+	 *     {@code citation_author}.</li>
+	 * <li>Every {@code <meta>} with a non-blank {@code property} and {@code content} is stored as
+	 *     {@code property -> content} (both trimmed).</li>
+	 * <li>{@code "authors"}: one object per {@code span[title]} found inside the first
+	 *     {@code div.container.record-detail}; the {@code title} attribute becomes
+	 *     {@code organisation}, the element text becomes {@code author_name}. The array is empty
+	 *     when the page has no such section.</li>
+	 * <li>{@code "alternate"}: {@code type}/{@code href} of a {@code <link type=...>} in
+	 *     {@code <head>} whose {@code rel} is {@code alternate}; if several match, the last one
+	 *     wins because the key is overwritten.</li>
+	 * </ul>
+	 * Extraction is best effort: on any failure the error is logged and whatever was
+	 * collected so far (possibly an empty object) is returned.
+	 *
+	 * @param pid URL of the landing page to fetch
+	 * @return the collected metadata, never {@code null}
+	 */
+	public JSONObject getZenodoMetadata(String pid){
+		JSONObject jsondata = new JSONObject();
+		// A missing URL can never be fetched; return the empty result without raising an exception.
+		if (pid == null || pid.trim().isEmpty()) {
+			return jsondata;
+		}
+		try {
+			// NOTE(review): pid comes from the caller and is fetched server-side as is.
+			// Consider restricting scheme/host upstream to avoid requests to internal addresses.
+			Document doc = Jsoup.connect(pid).get();
+
+			// Metadata exposed through <meta> tags.
+			for (Element meta : doc.getElementsByTag("meta")) {
+				String name = meta.attr("name");
+				String property = meta.attr("property");
+				String content = meta.attr("content");
+				String trimmedName = name.trim();
+				boolean hasContent = !content.trim().isEmpty();
+
+				// citation_author entries are skipped here: authors are read from the record-detail section below.
+				if (!trimmedName.isEmpty()
+						&& hasContent
+						&& !trimmedName.contains("-site-verification")
+						&& !trimmedName.contains("citation_author")) {
+					jsondata.put(name, content);
+				}
+				if (!property.trim().isEmpty() && hasContent) {
+					jsondata.put(property.trim(), content.trim());
+				}
+			}
+
+			// Authors and affiliations from <div class="container record-detail">.
+			JSONArray authors = new JSONArray();
+			Element recordDetail = doc.select("div.container.record-detail").first();
+			// first() yields null when nothing matches; skip the section instead of failing the whole extraction.
+			if (recordDetail != null) {
+				for (Element affiliation : recordDetail.select("span[title]")) {
+					String organisation = affiliation.attr("title");
+					String auth = affiliation.text();
+					if (!organisation.trim().isEmpty() && !auth.trim().isEmpty()) {
+						JSONObject org = new JSONObject();
+						org.put("organisation", organisation);
+						org.put("author_name", auth);
+						authors.put(org);
+					}
+				}
+			}
+			jsondata.put("authors", authors);
+
+			// Alternate representation advertised in <head>.
+			Element head = doc.select("head").first();
+			if (head != null) {
+				for (Element link : head.select("link[type]")) {
+					if (link.attr("rel").trim().equalsIgnoreCase("alternate")) {
+						JSONObject alt = new JSONObject();
+						alt.put("type", link.attr("type"));
+						alt.put("href", link.attr("href"));
+						jsondata.put("alternate", alt);
+					}
+				}
+			}
+		} catch (Exception e) {
+			// Best-effort boundary: keep whatever was gathered, but record why extraction stopped.
+			LOGGER.log(java.util.logging.Level.WARNING,
+					"Could not extract metadata from landing page " + pid, e);
+		}
+
+		return jsondata;
+	}
+
+}