sshoc-citationservice/src/main/java/eu/sshoc/citation/service/wfconfigurator/impl/CitationHarvesterImpl.java

360 lines
12 KiB
Java

package eu.sshoc.citation.service.wfconfigurator.impl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HttpsURLConnection;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import eu.sshoc.citation.service.wfconfigurator.util.Consts;
import eu.sshoc.citation.service.wfconfigurator.util.HeuristicParsers;
/**
 * Retrieves citation metadata and formatted citation strings for persistent
 * identifiers, using the DataCite test API, HTTP content negotiation on the
 * identifier URL, the Crossref works API and the project's
 * {@link HeuristicParsers}.
 *
 * <p>The instance keeps mutable request state ({@code sid}, {@code myURL},
 * {@code getCNCit}) that several methods read and overwrite, so a single
 * instance should not be shared between concurrent callers.
 */
public class CitationHarvesterImpl {
// Project parser helper; getCitationMetadataFromHTML uses it for handle.net
// identifiers (getAPIMetadata) and for page metadata (getZenodoMetadataJSONLD).
HeuristicParsers heup= new HeuristicParsers();
// Scratch buffer used by getCitationMetadata to render exception stack traces as text.
StringWriter sw = new StringWriter();
// Writer over sw; exceptions are printed into it with printStackTrace(pw).
PrintWriter pw = new PrintWriter(sw);
// Identifier currently being processed. Starts as a sample DOI and is overwritten
// with the caller's trimmed pid by getCitationMetadata and getCitationCSL.
String sid="10.1007/s11082-018-1327-1"; //10.1126/science.169.3946.635
// URL used for the content-negotiation requests in getCitationMetadata and getCitationCSL.
URL myURL;
// When true, getCitationMetadata also adds the "citation string" obtained from
// getCitationCSL; getCitationMetadataFromHTML switches it off temporarily.
Boolean getCNCit=true;
/**
 * Creates a harvester whose request URL initially points at the
 * {@code https://doi.org/} entry of the default identifier {@code sid}.
 *
 * @throws IllegalStateException if the default identifier does not form a
 *         valid URL (cannot happen with the built-in default)
 */
public CitationHarvesterImpl(){
    try {
        // Assign the field itself: declaring a local named myURL here would
        // shadow the field and leave it null for every later request.
        myURL = new URL("https://doi.org/"+sid);
    } catch (MalformedURLException e) {
        throw new IllegalStateException("Invalid default DOI URL for " + sid, e);
    }
}
/**
 * Drains the given reader, one character at a time, into a string.
 *
 * @param rd reader to consume until end of stream; it is not closed here
 * @return every character read, in order (empty string for an empty stream)
 * @throws IOException if reading from {@code rd} fails
 */
private static String readAll(Reader rd) throws IOException {
    final StringBuilder text = new StringBuilder();
    for (int ch = rd.read(); ch != -1; ch = rd.read()) {
        text.append((char) ch);
    }
    return text.toString();
}
/**
 * Downloads the resource at {@code url}, decodes it as UTF-8 and parses it
 * as a JSON object.
 *
 * @param url absolute URL of the JSON document
 * @return the parsed JSON object
 * @throws IOException   if the URL is malformed or the download fails
 * @throws JSONException if the response body is not a valid JSON object
 */
public static JSONObject readJsonFromUrl(String url) throws IOException, JSONException {
    final InputStream stream = new URL(url).openStream();
    try {
        final Reader reader = new BufferedReader(
                new InputStreamReader(stream, Charset.forName("UTF-8")));
        return new JSONObject(readAll(reader));
    } finally {
        // Closing the underlying stream releases the connection in every case.
        stream.close();
    }
}
/**
 * Requests a formatted bibliography entry for {@code citurl} through HTTP
 * content negotiation ({@code text/x-bibliography}, style {@code harvard3},
 * locale {@code fr-FR}).
 *
 * @param citurl absolute URL of the resource to cite
 * @return the response body decoded as UTF-8
 * @throws IOException if the URL is malformed or the request fails
 */
private static String getStandardCitation(String citurl) throws IOException{
    HttpURLConnection connection = (HttpURLConnection) new URL(citurl).openConnection();
    connection.setRequestProperty("Accept", "text/x-bibliography; style=harvard3; locale=fr-FR");
    // try-with-resources closes the reader (and with it the response stream),
    // which the previous version never did.
    try (BufferedReader rd = new BufferedReader(
            new InputStreamReader(connection.getInputStream(), Charset.forName("UTF-8")))) {
        return readAll(rd);
    }
}
/**
 * Placeholder for retrieving a citation list; not implemented.
 *
 * @param id    ignored
 * @param token ignored
 * @return always {@code null}
 */
public String getCitationList(String id, String token){
return null;
}
/**
 * Fetches the DataCite (test API) record of a DOI.
 *
 * @param id    not used by this method
 * @param pid   DOI to look up; when {@code null}, blank or equal to
 *              {@code "test"} (case-insensitive) a built-in sample DOI is used
 * @param token not used by this method
 * @return the JSON document returned by the API, or {@code null} when the
 *         request or the JSON parsing fails (the error is printed)
 */
public JSONObject getCitation(String id, String pid, String token){
    JSONObject jsondata=null;
    try {
        String doi="10.1007/s11082-018-1327-1";
        if (pid != null) {
            String candidate = pid.trim();
            // Compare string content: a reference comparison against ""
            // does not detect blank input and would query ".../dois/".
            if (!candidate.isEmpty() && !candidate.equalsIgnoreCase("test")) {
                doi = candidate;
            }
        }
        jsondata = readJsonFromUrl("https://api.test.datacite.org/dois/"+doi);
        System.out.println(jsondata.toString());
    } catch (IOException | JSONException e) {
        e.printStackTrace();
    }
    return jsondata;
}
/**
 * Retrieves CSL-JSON metadata for an identifier through HTTP content
 * negotiation.
 *
 * <p>The trimmed {@code pid} replaces the instance field {@code sid} unless it
 * is {@code null}, blank or {@code "test"}. An absolute http(s) identifier is
 * requested as-is; anything else is treated as a bare DOI and requested at
 * {@code https://doi.org/}. Fields listed in {@code Consts.unusedFields} are
 * stripped from the response, which is stored under {@code "properties"}.
 * When {@code getCNCit} is true, the {@code "citation string"} produced by
 * {@link #getCitationCSL(String, String)} is added as well.
 *
 * @param pid   identifier or URL to resolve
 * @param token passed through to {@code getCitationCSL}
 * @return a JSON object with whatever could be collected; empty when the
 *         request failed (the failure is printed, never thrown)
 */
public JSONObject getCitationMetadata(String pid, String token){
    JSONObject jsondata=new JSONObject();
    try {
        if (pid != null) {
            String candidate = pid.trim();
            // Content comparison; "!=" on strings compares references only.
            if (!candidate.isEmpty() && !candidate.equalsIgnoreCase("test")) {
                sid = candidate;
            }
        }
        // Always derive the request URL from the current identifier so that a
        // bare DOI never reuses a stale (or unset) URL from an earlier call.
        if (sid.startsWith("http://") || sid.startsWith("https://")) {
            myURL = new URL(sid);
        } else {
            myURL = new URL("https://doi.org/" + sid);
        }
        HttpURLConnection myURLConnection = (HttpURLConnection) myURL.openConnection();
        // Same negotiation as:
        // curl -LH "Accept: application/vnd.citationstyles.csl+json, application/rdf+xml" <url>
        myURLConnection.setRequestProperty("Accept", "application/vnd.citationstyles.csl+json, application/rdf+xml");
        myURLConnection.setConnectTimeout(18000);
        String jsonText;
        try (BufferedReader rd = new BufferedReader(
                new InputStreamReader(myURLConnection.getInputStream(), Charset.forName("UTF-8")))) {
            jsonText = readAll(rd);
        }
        JSONObject jsonproperties = new JSONObject(jsonText);
        // Drop the fields the service does not expose.
        for (String field : Consts.unusedFields) {
            if (jsonproperties.has(field)) {
                jsonproperties.remove(field);
            }
        }
        jsondata.put("properties", jsonproperties);
        if (getCNCit) {
            // getCitationCSL returns null on failure and may omit the key;
            // neither case should discard the metadata fetched above.
            JSONObject csl = getCitationCSL(pid, token);
            if (csl != null && csl.has("citation string")) {
                jsondata.put("citation string", csl.get("citation string"));
            }
        }
        System.out.println(jsondata.toString());
    } catch (Exception e) {
        // Render the trace into the shared buffer, print at most 800
        // characters of it, then reset the buffer so the next failure does
        // not show the trace of an earlier one.
        e.printStackTrace(pw);
        pw.flush();
        String sStackTrace = sw.toString();
        sw.getBuffer().setLength(0);
        if (sStackTrace.length() > 800) {
            sStackTrace = sStackTrace.substring(0, 800);
        }
        System.out.println(sStackTrace);
        System.out.println ("***************** ("+sid+") getCitationMetadata, content negotiation not available, maybe later?");
    }
    return jsondata;
}
/**
 * Retrieves a formatted citation string for an identifier through HTTP
 * content negotiation ({@code Accept: text/x-bibliography}).
 *
 * <p>The trimmed {@code pid} replaces the instance field {@code sid} unless it
 * is {@code null}, blank or {@code "test"}. {@code http://} identifiers are
 * upgraded to {@code https://}; identifiers without a scheme are treated as
 * bare DOIs and requested at {@code https://doi.org/}. When a citation string
 * is obtained, a Crossref bibliographic query is also issued; its result is
 * currently not used and its failure does not affect the return value.
 *
 * @param pid   identifier or URL to resolve
 * @param token not used by this method
 * @return an object holding {@code "citation string"}; an empty object when
 *         the response body was empty; {@code null} when the request failed
 */
public JSONObject getCitationCSL(String pid, String token){
    JSONObject jsondata=null;
    try {
        if (pid != null) {
            String candidate = pid.trim();
            // Content comparison; "!=" on strings compares references only.
            if (!candidate.isEmpty() && !candidate.equalsIgnoreCase("test")) {
                sid = candidate;
            }
        }
        if (sid.startsWith("http://")) {
            sid = sid.replace("http://", "https://");
        }
        // Always derive the request URL from the current identifier so that a
        // bare DOI never reuses a stale (or unset) URL from an earlier call.
        if (sid.startsWith("https://")) {
            myURL = new URL(sid);
        } else {
            myURL = new URL("https://doi.org/" + sid);
        }
        HttpURLConnection myURLConnection = (HttpURLConnection) myURL.openConnection();
        myURLConnection.setRequestProperty("Accept", "text/x-bibliography");
        myURLConnection.setConnectTimeout(18000); //set timeout to 18 seconds
        String jsonText;
        try (BufferedReader rd = new BufferedReader(
                new InputStreamReader(myURLConnection.getInputStream(), Charset.forName("UTF-8")))) {
            jsonText = readAll(rd);
        }
        System.out.println("citation string "+ jsonText);
        jsondata = new JSONObject();
        if (!jsonText.isEmpty()) {
            jsondata.put("citation string", jsonText);
            // The Crossref lookup is isolated: its failure must not be
            // reported as a content-negotiation failure, because the
            // citation string has already been obtained at this point.
            try {
                // Encode explicitly as UTF-8; the one-argument overload is
                // deprecated and depends on the platform default charset.
                String crurl="https://api.crossref.org/works?query.bibliographic="+ URLEncoder.encode(jsonText, "UTF-8")+"&sort=score&mailto=cesare.concordia@gmail.com#";
                HttpURLConnection crURLConnection = (HttpURLConnection) new URL(crurl).openConnection();
                crURLConnection.setConnectTimeout(18000); //set timeout to 18 seconds
                String crjsonText;
                try (BufferedReader crrd = new BufferedReader(
                        new InputStreamReader(crURLConnection.getInputStream(), Charset.forName("UTF-8")))) {
                    crjsonText = readAll(crrd);
                }
                JSONObject jsmeta=new JSONObject(crjsonText);
                JSONObject jsmessage = (JSONObject) jsmeta.get("message");
                JSONArray jsitems = (JSONArray) jsmessage.get("items");
                // NOTE(review): jsitems is not used yet; the original note says
                // a match is acceptable "if score > 110". Confirm whether this
                // lookup should feed the result or be removed.
            } catch (Exception crossrefError) {
                System.out.println(crossrefError);
                System.out.println ("####################### ("+pid+") getCitationCSL, Crossref lookup failed");
            }
        }
    } catch (Exception e) {
        System.out.println(e);
        System.out.println ("####################### ("+pid+") getCitationCSL, citation by content negotiation not available");
    }
    return jsondata;
}
/**
 * Collects citation metadata for an identifier by trying, in order:
 * <ol>
 *   <li>{@code heup.getAPIMetadata} for {@code .handle.net/} identifiers;</li>
 *   <li>{@code heup.getZenodoMetadataJSONLD} on the (https-normalised) page;</li>
 *   <li>{@link #getCitationMetadata(String, String)} when the identifier
 *       points at {@code doi.org} and fewer than three properties were
 *       found;</li>
 *   <li>{@link #getCitationCSL(String, String)} when no citation string is
 *       available yet and the token does not contain {@code "testAPI"}.</li>
 * </ol>
 * A DOI found in the page metadata ({@code citation_doi}, or an ACM
 * {@code og:url}) replaces the identifier before steps 3 and 4. For
 * {@code doi.org} identifiers, the registration agency metadata is also
 * attached under {@code "ra_properties"}.
 *
 * @param pid   text containing the identifier; the first http(s) URL found in
 *              it is used, otherwise the trimmed text itself
 * @param token caller token; may be {@code null}
 * @return the collected metadata, or an object whose {@code "citation string"}
 *         is {@code "na"} when nothing could be collected
 */
public JSONObject getCitationMetadataFromHTML(String pid, String token){
    JSONObject jsondata=null;
    // Use the first embedded URL; fall back to the raw text instead of
    // indexing into an empty array when the input carries no http(s) URL.
    String[] urls = (pid == null) ? new String[0] : extractURL(pid);
    if (urls.length > 0) {
        pid = urls[0];
    } else {
        pid = (pid == null) ? "" : pid.trim();
    }
    if (pid.isEmpty()) {
        return citationNotAvailable();
    }
    System.out.println("################## mypid "+pid);
    if (pid.contains(".handle.net/")) {
        jsondata=heup.getAPIMetadata(pid);
    }
    if (pid.startsWith("http://")) {
        pid=pid.replace("http://", "https://");
    }
    if (!pid.startsWith("https")) {
        pid="https://"+pid.trim();
    }
    //special cases: these hosts are requested over plain http
    if (pid.contains("ien.bg.ac.rs") || pid.contains("eprints.rclis.org") || pid.contains("opengrey.eu")) {
        pid=pid.replace("https://", "http://");
    }
    //first landing pages
    if (jsondata==null || jsondata.length()==0) {
        System.out.println("getZenodoMetadataJSONLD ("+pid+")");
        jsondata=heup.getZenodoMetadataJSONLD(pid);
    }
    if (jsondata!=null && jsondata.has("properties")){
        try {
            JSONObject testp=(JSONObject) jsondata.get("properties");
            String citdoi="";
            if (testp.has("citation_doi") && !(pid.contains("doi.org/"))) {//check if there is metadata in DOI RA
                citdoi = testp.getString("citation_doi").trim();
                if (!citdoi.isEmpty()) {
                    if (!citdoi.contains("doi.org/")) {
                        citdoi="https://doi.org/"+citdoi;
                    } else if (!citdoi.startsWith("http://") && !citdoi.startsWith("https://")) {
                        // "doi.org/10.x/y" without a scheme is not a usable URL
                        citdoi="https://"+citdoi;
                    }
                }
            } else if (testp.has("og:url") && testp.getString("og:url").trim().startsWith("https://dl.acm.org/doi/abs/")) {
                citdoi=testp.getString("og:url").trim().replace("https://dl.acm.org/doi/abs/", "https://doi.org/");
            }
            // Content check; "!=" on strings compares references only.
            if (!citdoi.trim().isEmpty()) {
                pid=citdoi;
            }
        } catch (JSONException e) {
            e.printStackTrace();
        }
    }
    //https://link.springer.com/book/
    if (pid.startsWith("https://link.springer.com/book/")) {
        pid=pid.trim().replace("https://link.springer.com/book/", "https://doi.org/");
    }
    //then DOI service providers...
    try {
        if ((jsondata==null || !(jsondata.has("properties")) || ((JSONObject) jsondata.get("properties")).length() < 3) && pid.contains("doi.org") ) {
            getCNCit=true;
            System.out.println("getCitationMetadata ("+pid+")");
            jsondata=getCitationMetadata(pid, token);
            getCNCit=true;
        }
    } catch (JSONException e1) {
        e1.printStackTrace();
    }
    // A null token cannot contain "testAPI", so it is treated like a normal token.
    boolean testToken = token != null && token.contains("testAPI");
    if (pid.contains("doi.org") && (jsondata==null || jsondata.length()==0 || jsondata.isNull("citation string")) && !testToken) {
        System.out.println("getCitationCSL ("+pid+") "+ token);
        JSONObject jsoncsldata=getCitationCSL(pid, token);
        if (jsondata==null) {
            jsondata=jsoncsldata;
        } else if (jsoncsldata!=null && jsoncsldata.length()>0) {
            // Copy every key of the CSL result over the collected metadata.
            Iterator<String> ite= jsoncsldata.keys();
            while (ite.hasNext()) {
                String key=ite.next();
                try {
                    jsondata.put(key, jsoncsldata.get(key));
                } catch (JSONException e) {
                    e.printStackTrace();
                }
            }
        }
    }
    if(jsondata!=null && jsondata.has("properties") && pid.contains("doi.org")){
        // Fetch the registration agency metadata without the citation string.
        getCNCit=false;
        try {
            JSONObject mydoijo=getCitationMetadata(pid, token);
            if (mydoijo!=null && mydoijo.has("properties")) {
                jsondata.put("ra_properties", mydoijo.get("properties"));
            }
        } catch (JSONException e) {
            e.printStackTrace();
        } finally {
            // Restore the flag even if something unexpected is thrown.
            getCNCit=true;
        }
    }
    if (jsondata==null || jsondata.length()==0) {
        jsondata=citationNotAvailable();
    }
    return jsondata;
}

/** Builds the fallback result: an object whose "citation string" is "na". */
private JSONObject citationNotAvailable() {
    JSONObject fallback=new JSONObject();
    try {
        fallback.put("citation string", "na");
    } catch (JSONException e) {
        e.printStackTrace();
    }
    return fallback;
}
/**
 * Finds every http:// or https:// URL (scheme matched case-insensitively)
 * in the given text.
 *
 * @param text text to scan; must not be {@code null}
 * @return the matches in order of appearance; empty when there are none
 */
private String[] extractURL(String text) {
    final Matcher urlMatcher = Pattern
            .compile("(http://|https://){1}[\\w\\.\\-/:\\#\\?\\=\\&\\;\\%\\~\\+]+", Pattern.CASE_INSENSITIVE)
            .matcher(text);
    final List<String> found = new ArrayList<String>();
    for (boolean more = urlMatcher.find(); more; more = urlMatcher.find()) {
        found.add(urlMatcher.group());
    }
    final String[] result = new String[found.size()];
    return found.toArray(result);
}
}