// sshoc-citationservice/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java
//
// 289 lines
// 9.3 KiB
// Java

package eu.sshoc.citation.service.wfconfigurator.impl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.Iterator;
import javax.net.ssl.HttpsURLConnection;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import eu.sshoc.citation.service.wfconfigurator.util.Consts;
import eu.sshoc.citation.service.wfconfigurator.util.HeuristicParsers;
/**
 * Harvests citation metadata and formatted citation strings for a persistent identifier (PID).
 *
 * <p>Identifiers are either absolute {@code http(s)} URLs, which are requested as given, or bare
 * identifiers, which are resolved through {@code https://doi.org/}. When the caller supplies no
 * usable identifier ({@code null}, blank or the literal {@code "test"}) a built-in sample DOI is
 * used instead.
 *
 * <p>All network and parsing failures are reported on standard output and turned into
 * {@code null} / empty results; the public methods do not propagate them.
 */
public class CitationHarvesterImpl {

    /** Sample DOI used when no usable identifier is supplied. */
    private static final String DEFAULT_DOI = "10.1007/s11082-018-1327-1"; //10.1126/science.169.3946.635

    /** Prefix used to turn a bare identifier into a resolvable URL. */
    private static final String DOI_RESOLVER = "https://doi.org/";

    /** JSON key under which the formatted citation text is stored. */
    private static final String CITATION_KEY = "citation string";

    /** Connect and read timeout applied to outgoing HTTP requests (18 seconds). */
    private static final int TIMEOUT_MILLIS = 18000;

    /** Maximum number of stack-trace characters echoed to standard output. */
    private static final int MAX_TRACE_CHARS = 800;

    private static final Charset UTF8 = Charset.forName("UTF-8");

    HeuristicParsers heup = new HeuristicParsers();

    // Retained only so that existing package-level references keep compiling; the request
    // methods now build a fresh buffer per exception instead of appending to this shared one.
    StringWriter sw = new StringWriter();
    PrintWriter pw = new PrintWriter(sw);

    // Default identifier and its resolver URL. The request methods work on local copies and
    // no longer overwrite these fields, so one call cannot leak its identifier into the next.
    String sid = DEFAULT_DOI;
    URL myURL;

    /** When true, {@link #getCitationMetadata} also fetches the formatted citation string. */
    Boolean getCNCit = true;

    /** Creates a harvester whose {@code myURL} field points at the default DOI. */
    public CitationHarvesterImpl() {
        try {
            // Assign the field; a local declaration here would leave the field null.
            myURL = new URL(DOI_RESOLVER + sid);
        } catch (MalformedURLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the remaining content of a reader into a string.
     *
     * @param rd reader to drain; it is not closed by this method
     * @return everything read until end of stream
     * @throws IOException if reading fails
     */
    private static String readAll(Reader rd) throws IOException {
        StringBuilder sb = new StringBuilder();
        char[] buffer = new char[4096];
        int count;
        while ((count = rd.read(buffer)) != -1) {
            sb.append(buffer, 0, count);
        }
        return sb.toString();
    }

    /**
     * Returns the identifier to use for a request: the trimmed {@code pid}, or the default DOI
     * when {@code pid} is {@code null}, blank or equal to {@code "test"} (ignoring case).
     */
    private static String effectiveId(String pid) {
        if (pid == null) {
            return DEFAULT_DOI;
        }
        String trimmed = pid.trim();
        if (trimmed.isEmpty() || trimmed.equalsIgnoreCase("test")) {
            return DEFAULT_DOI;
        }
        return trimmed;
    }

    /** Builds the request URL: absolute http(s) URLs are used as given, anything else is resolved via doi.org. */
    private static URL resolveUrl(String id) throws MalformedURLException {
        boolean absolute = id.startsWith("http://") || id.startsWith("https://");
        return new URL(absolute ? id : DOI_RESOLVER + id);
    }

    /** Replaces a leading {@code http://} scheme with {@code https://}; other values are returned unchanged. */
    private static String upgradeToHttps(String url) {
        return url.startsWith("http://") ? "https://" + url.substring("http://".length()) : url;
    }

    /** Replaces a leading {@code https://} scheme with {@code http://}; other values are returned unchanged. */
    private static String downgradeToHttp(String url) {
        return url.startsWith("https://") ? "http://" + url.substring("https://".length()) : url;
    }

    /**
     * Performs an HTTP GET and returns the response body decoded as UTF-8.
     *
     * @param url    address to request
     * @param accept value of the {@code Accept} header, or {@code null} to send none
     * @return the response body
     * @throws IOException if the connection or the read fails
     */
    private static String fetchText(URL url, String accept) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        if (accept != null) {
            connection.setRequestProperty("Accept", accept);
        }
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        // Without a read timeout a stalled server would block the caller indefinitely.
        connection.setReadTimeout(TIMEOUT_MILLIS);
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(connection.getInputStream(), UTF8))) {
            return readAll(reader);
        }
    }

    /** Renders the stack trace of {@code t} as text, cut to at most {@code maxChars} characters. */
    private static String truncatedStackTrace(Throwable t, int maxChars) {
        StringWriter buffer = new StringWriter();
        PrintWriter writer = new PrintWriter(buffer);
        t.printStackTrace(writer);
        writer.flush();
        String trace = buffer.toString();
        return trace.length() > maxChars ? trace.substring(0, maxChars) : trace;
    }

    /** Returns true when {@code data} is {@code null} or has no entries. */
    private static boolean isMissing(JSONObject data) {
        return data == null || data.length() == 0;
    }

    /** Builds the placeholder result {@code {"citation string": "na"}}. */
    private static JSONObject notAvailable() {
        JSONObject placeholder = new JSONObject();
        try {
            placeholder.put(CITATION_KEY, "na");
        } catch (JSONException e) {
            e.printStackTrace();
        }
        return placeholder;
    }

    /**
     * Downloads the document at {@code url} and parses it as a JSON object.
     *
     * @param url address to read
     * @return the parsed JSON object
     * @throws IOException   if the URL cannot be opened or read
     * @throws JSONException if the content is not a valid JSON object
     */
    public static JSONObject readJsonFromUrl(String url) throws IOException, JSONException {
        try (InputStream is = new URL(url).openStream();
                BufferedReader rd = new BufferedReader(new InputStreamReader(is, UTF8))) {
            return new JSONObject(readAll(rd));
        }
    }

    /**
     * Requests {@code citurl} with the {@code text/x-bibliography} media type
     * ({@code style=harvard3}, {@code locale=fr-FR}) and returns the response body.
     *
     * @param citurl address to request
     * @return the response body
     * @throws IOException if the request fails
     */
    private static String getStandardCitation(String citurl) throws IOException {
        return fetchText(new URL(citurl), "text/x-bibliography; style=harvard3; locale=fr-FR");
    }

    /**
     * Not implemented.
     *
     * @param id    unused
     * @param token unused
     * @return always {@code null}
     */
    public String getCitationList(String id, String token) {
        return null;
    }

    /**
     * Fetches the record for {@code pid} from {@code https://api.test.datacite.org/dois/}.
     *
     * @param id    unused
     * @param pid   identifier appended to the API URL; {@code null}, blank or {@code "test"}
     *              selects the default DOI
     * @param token unused
     * @return the parsed response, or {@code null} if the request or the parsing failed
     */
    public JSONObject getCitation(String id, String pid, String token) {
        JSONObject jsondata = null;
        try {
            jsondata = readJsonFromUrl("https://api.test.datacite.org/dois/" + effectiveId(pid));
            System.out.println(jsondata.toString());
        } catch (IOException | JSONException e) {
            e.printStackTrace();
        }
        return jsondata;
    }

    /**
     * Retrieves CSL JSON metadata for {@code pid} through HTTP content negotiation.
     *
     * <p>The response is stored under {@code "properties"} after removing the fields listed in
     * {@code Consts.unusedFields}. When {@code getCNCit} is true, the text returned by
     * {@link #getCitationCSL} is added under {@code "citation string"} if it is available.
     *
     * @param pid   absolute URL or bare identifier; {@code null}, blank or {@code "test"} selects
     *              the default DOI
     * @param token passed through to {@link #getCitationCSL}
     * @return a JSON object, never {@code null}; it is empty when the request or parsing failed
     */
    public JSONObject getCitationMetadata(String pid, String token) {
        JSONObject jsondata = new JSONObject();
        String id = effectiveId(pid);
        try {
            // Only CSL JSON is requested: the body is parsed as JSON, so any other
            // representation would be unusable here.
            String body = fetchText(resolveUrl(id), "application/vnd.citationstyles.csl+json");
            JSONObject jsonproperties = new JSONObject(body);
            //clean object
            for (String field : Consts.unusedFields) {
                jsonproperties.remove(field);
            }
            jsondata.put("properties", jsonproperties);
            if (Boolean.TRUE.equals(getCNCit)) {
                JSONObject csl = getCitationCSL(pid, token);
                // getCitationCSL yields null on failure and omits the key for an empty body.
                if (csl != null && csl.has(CITATION_KEY)) {
                    jsondata.put(CITATION_KEY, csl.get(CITATION_KEY));
                }
            }
            System.out.println(jsondata.toString());
        } catch (Exception e) {
            System.out.println(truncatedStackTrace(e, MAX_TRACE_CHARS));
            System.out.println ("***************** ("+id+") getCitationMetadata, content negotiation not available, maybe later?");
        }
        return jsondata;
    }

    /**
     * Retrieves a formatted citation ({@code text/x-bibliography}) for {@code pid} through HTTP
     * content negotiation. A leading {@code http://} scheme is upgraded to {@code https://}.
     *
     * @param pid   absolute URL or bare identifier; {@code null}, blank or {@code "test"} selects
     *              the default DOI
     * @param token unused
     * @return an object holding the text under {@code "citation string"} (the key is absent when
     *         the response body is empty), or {@code null} if the request failed
     */
    public JSONObject getCitationCSL(String pid, String token) {
        JSONObject jsondata = null;
        try {
            String id = upgradeToHttps(effectiveId(pid));
            String citation = fetchText(resolveUrl(id), "text/x-bibliography");
            System.out.println("citation string "+ citation);
            jsondata = new JSONObject();
            if (!citation.isEmpty()) {
                jsondata.put(CITATION_KEY, citation);
                // NOTE(review): the Crossref matches are fetched but not used yet (the original
                // carried an "if score > 110 ok" note); confirm whether this lookup is still wanted.
                findCrossrefMatches(citation);
            }
        } catch (Exception e) {
            System.out.println(e);
            System.out.println ("####################### ("+pid+") getCitationCSL, citation by content negotiation not available");
        }
        return jsondata;
    }

    /**
     * Queries the Crossref works API with the citation text as bibliographic query.
     *
     * @param citation citation text to search for
     * @return the {@code message.items} array of the response, or {@code null} if the lookup failed
     */
    private static JSONArray findCrossrefMatches(String citation) {
        try {
            String crurl = "https://api.crossref.org/works?query.bibliographic="
                    + URLEncoder.encode(citation, "UTF-8")
                    + "&sort=score&mailto=cesare.concordia@gmail.com#";
            JSONObject jsmeta = new JSONObject(fetchText(new URL(crurl), null));
            return jsmeta.getJSONObject("message").getJSONArray("items");
        } catch (Exception e) {
            // Best effort: a failed lookup must not discard the citation already obtained.
            System.out.println(e);
            return null;
        }
    }

    /**
     * Collects citation metadata for {@code pid}, trying the heuristic parsers first and the
     * content-negotiation services afterwards.
     *
     * <p>Order of attempts: {@code getAPIMetadata}, {@code getZenodoMetadataJSONLD},
     * {@code getMetaMetadata}, {@code getLinkMetadata}; then, only for identifiers containing
     * {@code doi.org}, {@link #getCitationMetadata} (when fewer than three properties were
     * found) and {@link #getCitationCSL} (when no citation string is present).
     *
     * @param pid   identifier or landing-page URL; {@code null} yields the placeholder result
     * @param token passed through to the content-negotiation methods
     * @return the collected metadata, or {@code {"citation string": "na"}} when nothing was found
     */
    public JSONObject getCitationMetadataFromHTML(String pid, String token) {
        if (pid == null) {
            return notAvailable();
        }
        JSONObject jsondata = heup.getAPIMetadata(pid);
        pid = upgradeToHttps(pid);
        //special cases
        if (pid.contains("ien.bg.ac.rs")) {
            pid = downgradeToHttp(pid);
        }
        //first landing pages
        if (isMissing(jsondata)) {
            System.out.println("getZenodoMetadataJSONLD ("+pid+")");
            jsondata = heup.getZenodoMetadataJSONLD(pid);
        }
        if (isMissing(jsondata)) {
            System.out.println("getMetaMetadata ("+pid+")");
            jsondata = heup.getMetaMetadata(pid);
        }
        if (isMissing(jsondata)) {
            System.out.println("getLinkMetadata ("+pid+")");
            jsondata = heup.getLinkMetadata(pid);
        }
        //then DOI service providers...
        boolean isDoi = pid.contains("doi.org");
        JSONObject properties = jsondata == null ? null : jsondata.optJSONObject("properties");
        if (isDoi && (properties == null || properties.length() < 3)) {
            getCNCit = true;
            System.out.println("getCitationMetadata ("+pid+")");
            jsondata = getCitationMetadata(pid, token);
        }
        if (isDoi && (isMissing(jsondata) || jsondata.isNull(CITATION_KEY))) {
            System.out.println("getCitationCSL ("+pid+")");
            JSONObject jsoncsldata = getCitationCSL(pid, token);
            if (jsondata == null) {
                jsondata = jsoncsldata;
            } else if (jsoncsldata != null) {
                // Copy every entry of the CSL result into the metadata gathered so far.
                Iterator<String> ite = jsoncsldata.keys();
                while (ite.hasNext()) {
                    String key = ite.next();
                    try {
                        jsondata.put(key, jsoncsldata.get(key));
                    } catch (JSONException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        if (isMissing(jsondata)) {
            jsondata = notAvailable();
        }
        return jsondata;
    }
}