360 lines
12 KiB
Java
360 lines
12 KiB
Java
package eu.sshoc.citation.service.wfconfigurator.impl;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.IOException;
|
|
import java.io.InputStream;
|
|
import java.io.InputStreamReader;
|
|
import java.io.PrintWriter;
|
|
import java.io.Reader;
|
|
import java.io.StringWriter;
|
|
import java.net.HttpURLConnection;
|
|
import java.net.MalformedURLException;
|
|
import java.net.ProtocolException;
|
|
import java.net.URL;
|
|
import java.net.URLEncoder;
|
|
import java.nio.charset.Charset;
|
|
import java.util.ArrayList;
|
|
import java.util.Iterator;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
import javax.net.ssl.HttpsURLConnection;
|
|
|
|
import org.json.JSONArray;
|
|
import org.json.JSONException;
|
|
import org.json.JSONObject;
|
|
import org.jsoup.Jsoup;
|
|
import org.jsoup.nodes.Document;
|
|
import org.jsoup.nodes.Element;
|
|
import org.jsoup.select.Elements;
|
|
|
|
import eu.sshoc.citation.service.wfconfigurator.util.Consts;
|
|
import eu.sshoc.citation.service.wfconfigurator.util.HeuristicParsers;
|
|
|
|
|
|
|
|
public class CitationHarvesterImpl {
|
|
|
|
HeuristicParsers heup= new HeuristicParsers();
|
|
StringWriter sw = new StringWriter();
|
|
PrintWriter pw = new PrintWriter(sw);
|
|
String sid="10.1007/s11082-018-1327-1"; //10.1126/science.169.3946.635
|
|
URL myURL;
|
|
Boolean getCNCit=true;
|
|
|
|
public CitationHarvesterImpl(){
|
|
|
|
try {
|
|
URL myURL = new URL("https://doi.org/"+sid);
|
|
} catch (MalformedURLException e) {
|
|
// TODO Auto-generated catch block
|
|
e.printStackTrace();
|
|
}
|
|
|
|
}
|
|
|
|
private static String readAll(Reader rd) throws IOException {
|
|
StringBuilder sb = new StringBuilder();
|
|
int cp;
|
|
while ((cp = rd.read()) != -1) {
|
|
sb.append((char) cp);
|
|
}
|
|
return sb.toString();
|
|
}
|
|
|
|
public static JSONObject readJsonFromUrl(String url) throws IOException, JSONException {
|
|
InputStream is = new URL(url).openStream();
|
|
try {
|
|
BufferedReader rd = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
|
|
String jsonText = readAll(rd);
|
|
JSONObject json = new JSONObject(jsonText);
|
|
return json;
|
|
} finally {
|
|
is.close();
|
|
}
|
|
}
|
|
private static String getStandardCitation(String citurl) throws IOException{
|
|
URL myURL = new URL(citurl);
|
|
HttpURLConnection myURLConnection = (HttpURLConnection)myURL.openConnection();
|
|
myURLConnection.setRequestProperty("Accept", "text/x-bibliography; style=harvard3; locale=fr-FR");
|
|
InputStream mis = myURLConnection.getInputStream();
|
|
BufferedReader rd = new BufferedReader(new InputStreamReader(mis, Charset.forName("UTF-8")));
|
|
return (readAll(rd));
|
|
|
|
}
|
|
public String getCitationList(String id, String token){
|
|
return null;
|
|
}
|
|
|
|
public JSONObject getCitation(String id, String pid, String token){
|
|
JSONObject jsondata=null;
|
|
try {
|
|
String sid="10.1007/s11082-018-1327-1";
|
|
if (pid!=null && pid.trim()!="" && !pid.trim().equalsIgnoreCase("test"))
|
|
sid=pid.trim();
|
|
|
|
jsondata = readJsonFromUrl("https://api.test.datacite.org/dois/"+sid);
|
|
System.out.println(jsondata.toString());
|
|
// System.out.println(json.get("id"));
|
|
} catch (IOException | JSONException e) {
|
|
|
|
e.printStackTrace();
|
|
}
|
|
|
|
return jsondata;
|
|
}
|
|
public JSONObject getCitationMetadata(String pid, String token){
|
|
JSONObject jsondata=new JSONObject();
|
|
JSONObject jsonproperties=null;
|
|
//Consts myC= new Consts();
|
|
//String sid="10.1007/s11082-018-1327-1";
|
|
try {
|
|
if (pid!=null && pid.trim()!="" && !pid.trim().equalsIgnoreCase("test"))
|
|
sid=pid.trim();
|
|
//URL myURL = new URL("https://doi.org/"+sid);
|
|
if (sid.startsWith("http://") || sid.startsWith("https://"))
|
|
myURL = new URL(sid);
|
|
|
|
HttpURLConnection myURLConnection = (HttpURLConnection)myURL.openConnection();
|
|
//myURLConnection.setRequestProperty("Accept", "application/rdf+xml;q=0.5, application/vnd.citationstyles.csl+json;q=1.0");
|
|
//curl -LH "Accept: application/vnd.citationstyles.csl+json, application/rdf+xml" https://doi.org/10.1080/01930826.2016.1186969
|
|
myURLConnection.setRequestProperty("Accept", "application/vnd.citationstyles.csl+json, application/rdf+xml");
|
|
myURLConnection.setConnectTimeout(18000);
|
|
InputStream mis = myURLConnection.getInputStream();
|
|
BufferedReader rd = new BufferedReader(new InputStreamReader(mis, Charset.forName("UTF-8")));
|
|
String jsonText = readAll(rd);
|
|
jsonproperties = new JSONObject(jsonText);
|
|
//clean object
|
|
for (String field : Consts.unusedFields) {
|
|
if (jsonproperties.has(field))
|
|
jsonproperties.remove(field);
|
|
}
|
|
jsondata.put("properties", jsonproperties);
|
|
if (getCNCit)
|
|
jsondata.put("citation string", getCitationCSL(pid, token).get("citation string"));
|
|
System.out.println(jsondata.toString());
|
|
//JSONObject jsoncit = new JSONObject();
|
|
|
|
} catch (Exception e) {
|
|
|
|
//e.printStackTrace();
|
|
|
|
e.printStackTrace(pw);
|
|
String sStackTrace = sw.toString(); // stack trace as a string
|
|
if (sStackTrace.length()>800)
|
|
System.out.println(sStackTrace.substring(0, 799));
|
|
pw.flush();
|
|
System.out.println ("***************** ("+sid+") getCitationMetadata, content negotiation not available, maybe later?");
|
|
//return jsondata;
|
|
}
|
|
|
|
return jsondata;
|
|
}
|
|
|
|
public JSONObject getCitationCSL(String pid, String token){
|
|
String jsonText="";
|
|
JSONObject jsondata=null;
|
|
|
|
|
|
try {
|
|
|
|
if (pid!=null && pid.trim()!="" && !pid.trim().equalsIgnoreCase("test"))
|
|
sid=pid.trim();
|
|
|
|
if (sid.startsWith("http://") || sid.startsWith("https://")) {
|
|
if (sid.startsWith("http://")) {
|
|
sid=sid.replace("http://", "https://");
|
|
}
|
|
myURL = new URL(sid);
|
|
}
|
|
//URL myURL = new URL("https://doi.org/10.1126/science.169.3946.635");
|
|
HttpURLConnection myURLConnection = (HttpURLConnection)myURL.openConnection();
|
|
myURLConnection.setRequestProperty("Accept", "text/x-bibliography");
|
|
myURLConnection.setConnectTimeout(18000); //set timeout to 18 seconds
|
|
InputStream mis = myURLConnection.getInputStream();
|
|
BufferedReader rd = new BufferedReader(new InputStreamReader(mis, Charset.forName("UTF-8")));
|
|
jsonText = readAll(rd);
|
|
|
|
System.out.println("citation string "+ jsonText);
|
|
jsondata = new JSONObject();
|
|
if (!jsonText.isEmpty()) {
|
|
jsondata.put("citation string", jsonText);
|
|
//urllib.request.urlopen("https://api.crossref.org/works?query.bibliographic="+cit+"&sort=score&mailto=cesare.concordia@gmail.com#", timeout=18000)
|
|
String crurl="https://api.crossref.org/works?query.bibliographic="+ URLEncoder.encode(jsonText)+"&sort=score&mailto=cesare.concordia@gmail.com#";
|
|
URL crActURL= new URL(crurl);
|
|
HttpURLConnection crURLConnection = (HttpURLConnection)crActURL.openConnection();
|
|
crURLConnection.setConnectTimeout(18000); //set timeout to 18 seconds
|
|
InputStream cris = crURLConnection.getInputStream();
|
|
BufferedReader crrd = new BufferedReader(new InputStreamReader(cris, Charset.forName("UTF-8")));
|
|
String crjsonText = readAll(crrd);
|
|
//System.out.println(crjsonText);
|
|
|
|
JSONObject jsmeta=new JSONObject(crjsonText);
|
|
JSONObject jsmessage = (JSONObject) jsmeta.get("message");
|
|
JSONArray jsitems = (JSONArray) jsmessage.get("items");
|
|
//if score > 110 ok
|
|
//System.out.println(jsitems.toString());
|
|
}
|
|
} catch (Exception e) {
|
|
|
|
System.out.println(e);
|
|
/*
|
|
e.printStackTrace(pw);
|
|
String sStackTrace = sw.toString(); // stack trace as a string
|
|
if (sStackTrace.length()>200)
|
|
System.out.println(sStackTrace.substring(0, 2000));
|
|
*/
|
|
System.out.println ("####################### ("+pid+") getCitationCSL, citation by content negotiation not available");
|
|
}
|
|
|
|
return jsondata;
|
|
}
|
|
public JSONObject getCitationMetadataFromHTML(String pid, String token){
|
|
|
|
JSONObject jsondata=null;
|
|
//System.out.println("################## mypid completo "+pid);
|
|
pid=extractURL(pid)[0];
|
|
System.out.println("################## mypid "+pid);
|
|
|
|
if (pid.contains(".handle.net/")) {
|
|
jsondata=heup.getAPIMetadata(pid);
|
|
}
|
|
if (pid.startsWith("http://")) {
|
|
pid=pid.replace("http://", "https://");
|
|
}
|
|
|
|
if (!pid.startsWith("https"))
|
|
pid="https://"+pid.trim();
|
|
|
|
//special cases
|
|
if (pid.contains("ien.bg.ac.rs") || pid.contains("eprints.rclis.org") || pid.contains("opengrey.eu"))
|
|
pid=pid.replace("https://", "http://");
|
|
|
|
//first landing pages
|
|
|
|
if (jsondata==null || jsondata.length()==0) {
|
|
System.out.println("getZenodoMetadataJSONLD ("+pid+")");
|
|
jsondata=heup.getZenodoMetadataJSONLD(pid);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (jsondata!=null && jsondata.has("properties")){
|
|
try {
|
|
JSONObject testp=(JSONObject) jsondata.get("properties");
|
|
String citdoi="";
|
|
if (testp.has("citation_doi") && !(pid.contains("doi.org/"))) {//check if there is metadata in DOI RA
|
|
|
|
citdoi = testp.getString("citation_doi");
|
|
if (citdoi.trim()!="" && !citdoi.contains("doi.org/")) {
|
|
citdoi="https://doi.org/"+citdoi.trim();
|
|
}
|
|
|
|
}else {
|
|
if ((testp.has("og:url")) && (testp.getString("og:url").trim().startsWith("https://dl.acm.org/doi/abs/"))) {
|
|
citdoi=testp.getString("og:url").trim().replace("https://dl.acm.org/doi/abs/", "https://doi.org/");
|
|
}
|
|
|
|
}
|
|
if (citdoi.trim()!="")
|
|
pid=citdoi;
|
|
|
|
} catch (JSONException e) {
|
|
// TODO Auto-generated catch block
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
//https://link.springer.com/book/
|
|
if (pid.startsWith("https://link.springer.com/book/"))
|
|
pid=pid.trim().replace("https://link.springer.com/book/", "https://doi.org/");
|
|
//then DOI service providers...
|
|
try {
|
|
//System.out.println("jsondata.length() "+((JSONObject) jsondata.get("properties")).length());
|
|
//int jopl=((JSONObject) jsondata.get("properties")).length();
|
|
|
|
if ((jsondata==null || !(jsondata.has("properties")) || ((JSONObject) jsondata.get("properties")).length() < 3) && pid.contains("doi.org") ) {
|
|
getCNCit=true;
|
|
System.out.println("getCitationMetadata ("+pid+")");
|
|
jsondata=getCitationMetadata(pid, token);
|
|
getCNCit=true;
|
|
}
|
|
} catch (JSONException e1) {
|
|
// TODO Auto-generated catch block
|
|
e1.printStackTrace();
|
|
}
|
|
|
|
|
|
if (pid.contains("doi.org") && (jsondata==null || jsondata.length()==0 || jsondata.isNull("citation string")) && !token.contains("testAPI")) {
|
|
System.out.println("getCitationCSL ("+pid+") "+ token);
|
|
JSONObject jsoncsldata=null;
|
|
jsoncsldata=getCitationCSL(pid, token);
|
|
if (jsondata==null)
|
|
jsondata=jsoncsldata;
|
|
else {
|
|
if(jsoncsldata!=null && jsoncsldata.length()>0) {
|
|
Iterator<String> ite= jsoncsldata.keys();
|
|
while (ite.hasNext()) {
|
|
String key=ite.next();
|
|
try {
|
|
jsondata.put(key, jsoncsldata.get(key));
|
|
} catch (JSONException e) {
|
|
// TODO Auto-generated catch block
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
}
|
|
|
|
if(jsondata!=null && jsondata.has("properties") && pid.contains("doi.org")){
|
|
|
|
getCNCit=false;
|
|
JSONObject mydoijo=new JSONObject();
|
|
mydoijo=getCitationMetadata(pid, token);
|
|
if (mydoijo!=null) {
|
|
try {
|
|
if (mydoijo.has("properties"))
|
|
jsondata.put("ra_properties", mydoijo.get("properties"));
|
|
|
|
} catch (JSONException e) {
|
|
// TODO Auto-generated catch block
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
getCNCit=true;
|
|
|
|
}
|
|
|
|
|
|
if (jsondata==null || jsondata.length()==0) {
|
|
try {
|
|
jsondata=new JSONObject();
|
|
jsondata.put("citation string", "na");
|
|
} catch (JSONException e) {
|
|
// TODO Auto-generated catch block
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
return jsondata;
|
|
}
|
|
private String[] extractURL(String text) {
|
|
List<String> list = new ArrayList<String>();
|
|
Pattern pattern = Pattern
|
|
.compile(
|
|
"(http://|https://){1}[\\w\\.\\-/:\\#\\?\\=\\&\\;\\%\\~\\+]+",
|
|
Pattern.CASE_INSENSITIVE);
|
|
Matcher matcher = pattern.matcher(text);
|
|
while (matcher.find()) {
|
|
list.add(matcher.group());
|
|
}
|
|
return list.toArray(new String[list.size()]);
|
|
}
|
|
|
|
}
|