review and changes in heuristic parser

This commit is contained in:
Cesare Concordia 2021-11-01 18:42:17 +01:00
parent 7a47672c14
commit 992464e0a8
3 changed files with 187 additions and 20 deletions

View File

@ -114,7 +114,8 @@ public class CitationHarvesterImpl {
HttpURLConnection myURLConnection = (HttpURLConnection)myURL.openConnection();
//myURLConnection.setRequestProperty("Accept", "application/rdf+xml;q=0.5, application/vnd.citationstyles.csl+json;q=1.0");
myURLConnection.setRequestProperty("Accept", "application/vnd.citationstyles.csl+json, application/rdf+xml\\");
//curl -LH "Accept: application/vnd.citationstyles.csl+json, application/rdf+xml" https://doi.org/10.1080/01930826.2016.1186969
myURLConnection.setRequestProperty("Accept", "application/vnd.citationstyles.csl+json, application/rdf+xml");
myURLConnection.setConnectTimeout(18000);
InputStream mis = myURLConnection.getInputStream();
BufferedReader rd = new BufferedReader(new InputStreamReader(mis, Charset.forName("UTF-8")));
@ -208,7 +209,9 @@ public class CitationHarvesterImpl {
JSONObject jsondata=null;
jsondata=heup.getAPIMetadata(pid);
if (pid.contains(".handle.net/")) {
jsondata=heup.getAPIMetadata(pid);
}
if (pid.startsWith("http://")) {
pid=pid.replace("http://", "https://");
}
@ -222,12 +225,35 @@ public class CitationHarvesterImpl {
if (jsondata==null || jsondata.length()==0) {
System.out.println("getZenodoMetadataJSONLD ("+pid+")");
jsondata=heup.getZenodoMetadataJSONLD(pid);
}
if (jsondata==null || jsondata.length()==0) {
System.out.println("getMetaMetadata ("+pid+")");
jsondata=heup.getMetaMetadata(pid);
}
if (jsondata!=null && jsondata.has("properties")){
try {
JSONObject testp=(JSONObject) jsondata.get("properties");
String citdoi="";
if (testp.has("citation_doi") && !(pid.contains("doi.org/"))) {//check if there is metadata in DOI RA
citdoi = testp.getString("citation_doi");
if (citdoi.trim()!="" && !citdoi.contains("doi.org/")) {
citdoi="https://doi.org/"+citdoi.trim();
}
if (citdoi.trim()!="")
pid=citdoi;
}
} catch (JSONException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
if (jsondata==null || jsondata.length()==0) {
System.out.println("getLinkMetadata ("+pid+")");
@ -249,6 +275,7 @@ public class CitationHarvesterImpl {
// TODO Auto-generated catch block
e1.printStackTrace();
}
if (pid.contains("doi.org") && (jsondata==null || jsondata.length()==0 || jsondata.isNull("citation string"))) {
System.out.println("getCitationCSL ("+pid+")");
JSONObject jsoncsldata=null;
@ -256,14 +283,16 @@ public class CitationHarvesterImpl {
if (jsondata==null)
jsondata=jsoncsldata;
else {
Iterator<String> ite= jsoncsldata.keys();
while (ite.hasNext()) {
String key=ite.next();
try {
jsondata.put(key, jsoncsldata.get(key));
} catch (JSONException e) {
// TODO Auto-generated catch block
e.printStackTrace();
if(jsoncsldata!=null && jsoncsldata.length()>0) {
Iterator<String> ite= jsoncsldata.keys();
while (ite.hasNext()) {
String key=ite.next();
try {
jsondata.put(key, jsoncsldata.get(key));
} catch (JSONException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}
@ -271,6 +300,25 @@ public class CitationHarvesterImpl {
}
if(jsondata!=null && jsondata.has("properties") && pid.contains("doi.org")){
getCNCit=false;
JSONObject mydoijo=new JSONObject();
mydoijo=getCitationMetadata(pid, token);
if (mydoijo!=null) {
try {
if (mydoijo.has("properties"))
jsondata.put("ra_properties", mydoijo.get("properties"));
} catch (JSONException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
getCNCit=true;
}
if (jsondata==null || jsondata.length()==0) {
try {

View File

@ -26,6 +26,11 @@ import java.net.URL;
public class CSVHelper {
static CSVHelper app = new CSVHelper();
static BufferedReader br;
static CSVParser parser;
public CSVHelper() {
}
public static void main(String[] args) {
try {
@ -62,7 +67,7 @@ public class CSVHelper {
return new File(resource.toURI());
}
}
public CSVParser getRepoProfiles() throws URISyntaxException{
File repo=app.getFileFromResource("repoprofiles.csv");
@ -79,7 +84,7 @@ public class CSVHelper {
}
return null;
}
public String getRepoApi(String repositoryurl) throws URISyntaxException{
File repo=app.getFileFromResource("repoprofiles.csv");
@ -87,7 +92,7 @@ public class CSVHelper {
BufferedReader br = new BufferedReader(new FileReader(repo));
CSVParser parser = CSVFormat.DEFAULT.withDelimiter(';').withHeader().parse(br);
) {
int slashpos= repositoryurl.indexOf('/', 8);
String repoid=repositoryurl.substring(0, slashpos);
System.out.println("repoid "+repoid);
@ -101,7 +106,7 @@ public class CSVHelper {
System.out.println(record.get("Repository name") +" - "+record.get("api")+" - "+record.get("viewer"));
return record.get("viewer");
}
}
return "none";
@ -111,5 +116,59 @@ public class CSVHelper {
}
return null;
}
/**
 * Looks up the "Repository name" column of the repository profile that matches the given URL.
 *
 * <p>The lookup key is the part of {@code repositoryurl} before the first {@code '/'} found at
 * or after index 8 (the search offset skips the {@code //} of an {@code http://} or
 * {@code https://} prefix). When the URL has no such slash (e.g. {@code https://zenodo.org})
 * the whole URL is used as the key instead of failing. A record of the semicolon-delimited
 * {@code repoprofiles.csv} resource matches when its "Website", "api" or "dns" column
 * contains the key.
 *
 * @param repositoryurl URL of a resource hosted by the repository; may be {@code null}
 * @return the matching record's "Repository name", or the literal {@code "none"} when the
 *         input is {@code null} or too short to carry a host, no record matches, or the
 *         profile file cannot be read
 * @throws URISyntaxException propagated from {@code getFileFromResource}
 */
public String getRepoName(String repositoryurl) throws URISyntaxException{
    File repo=app.getFileFromResource("repoprofiles.csv");
    if (repositoryurl==null) {
        return "none";
    }
    int slashpos=repositoryurl.indexOf('/', 8);
    String repoid;
    if (slashpos>=0) {
        repoid=repositoryurl.substring(0, slashpos);
    }
    else if (repositoryurl.length()>8) {
        // No path component: the URL is already just the repository origin.
        repoid=repositoryurl;
    }
    else {
        // Nothing after the scheme; an empty/tiny key would match every record.
        return "none";
    }
    // Local names deliberately differ from the static fields br/parser to avoid shadowing them.
    try(
        BufferedReader profileReader = new BufferedReader(new FileReader(repo));
        CSVParser profiles = CSVFormat.DEFAULT.withDelimiter(';').withHeader().parse(profileReader)
    ) {
        for(CSVRecord record : profiles) {
            String apiurl=record.get("api");
            boolean matches=record.get("Website").trim().contains(repoid)
                    || apiurl.contains(repoid)
                    || record.get("dns").trim().contains(repoid);
            if(matches) {
                System.out.println(record.get("Repository name") +" - "+record.get("api")+" - "+record.get("viewer"));
                return record.get("Repository name");
            }
        }
    }
    catch (Exception e) {
        // Best-effort lookup: report the problem and fall through to the "none" sentinel.
        System.out.println(e);
    }
    // Keep this a string literal: callers in this project compare the result against "none".
    return "none";
}
/**
 * Looks up the "Website" column of the repository profile that matches the given URL.
 *
 * <p>The lookup key is the part of {@code repositoryurl} before the first {@code '/'} found at
 * or after index 8 (the search offset skips the {@code //} of an {@code http://} or
 * {@code https://} prefix). When the URL has no such slash (e.g. {@code https://zenodo.org})
 * the whole URL is used as the key instead of failing. A record of the semicolon-delimited
 * {@code repoprofiles.csv} resource matches when its "Website", "api" or "dns" column
 * contains the key.
 *
 * @param repositoryurl URL of a resource hosted by the repository; may be {@code null}
 * @return the matching record's "Website" value, or the literal {@code "none"} when the
 *         input is {@code null} or too short to carry a host, no record matches, or the
 *         profile file cannot be read
 * @throws URISyntaxException propagated from {@code getFileFromResource}
 */
public String getRepoURL(String repositoryurl) throws URISyntaxException{
    File repo=app.getFileFromResource("repoprofiles.csv");
    if (repositoryurl==null) {
        return "none";
    }
    int slashpos=repositoryurl.indexOf('/', 8);
    String repoid;
    if (slashpos>=0) {
        repoid=repositoryurl.substring(0, slashpos);
    }
    else if (repositoryurl.length()>8) {
        // No path component: the URL is already just the repository origin.
        repoid=repositoryurl;
    }
    else {
        // Nothing after the scheme; an empty/tiny key would match every record.
        return "none";
    }
    // Local names deliberately differ from the static fields br/parser to avoid shadowing them.
    try(
        BufferedReader profileReader = new BufferedReader(new FileReader(repo));
        CSVParser profiles = CSVFormat.DEFAULT.withDelimiter(';').withHeader().parse(profileReader)
    ) {
        for(CSVRecord record : profiles) {
            String apiurl=record.get("api");
            boolean matches=record.get("Website").trim().contains(repoid)
                    || apiurl.contains(repoid)
                    || record.get("dns").trim().contains(repoid);
            if(matches) {
                System.out.println(record.get("Repository name") +" - "+record.get("api")+" - "+record.get("viewer"));
                return record.get("Website");
            }
        }
    }
    catch (Exception e) {
        // Best-effort lookup: report the problem and fall through to the "none" sentinel.
        System.out.println(e);
    }
    // Keep this a string literal: callers in this project compare the result against "none".
    return "none";
}
}

View File

@ -50,6 +50,7 @@ public class HeuristicParsers {
public JSONObject getAPIMetadata(String pid) {
JSONObject metajsondata=new JSONObject();
JSONObject jsondata=new JSONObject();
JSONObject jsonrepodata=new JSONObject();
try {
//Check if it is a handle
String dnsPid="";
@ -73,10 +74,14 @@ public class HeuristicParsers {
if (tmp.getString("type").contains("URL")) {
JSONObject urlob=(JSONObject) tmp.get("data");
dnsPid=urlob.getString("value");
}
}
}
}
if (dnsPid.trim()=="")
return (jsondata);
System.out.println("pid "+dnsPid);
String apiURL=repo.getRepoApi(dnsPid);
@ -103,6 +108,16 @@ public class HeuristicParsers {
}
}
if (metajsondata!=null && metajsondata.length()>0) {
String rname=repo.getRepoName(dnsPid);
String rws=repo.getRepoURL(dnsPid);
if (rname!="none")
jsonrepodata.put("name", rname);
if (rws!="none")
jsonrepodata.put("URL", rws);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
@ -110,7 +125,8 @@ public class HeuristicParsers {
if (metajsondata!=null && metajsondata.length()>0) {
try {
jsondata.put("citation string", "na");
jsondata.put("metadata source", "Repository API");
jsondata.put("metadata source", jsonrepodata);
jsondata.put("properties", metajsondata);
} catch (JSONException e) {
// TODO Auto-generated catch block
@ -125,7 +141,7 @@ public class HeuristicParsers {
//Document doc = Jsoup.connect(pid).get();
Document doc = SSLHelper.getConnection(pid).ignoreContentType(true)
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").timeout(20 * 1000).get();
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").referrer("http://www.google.com").timeout(20 * 1000).get();
Elements cmdheader=doc.getElementsByTag("cmd:Header");
if (cmdheader!=null && cmdheader.size()>0) {
JSONObject talarproperties=new JSONObject();
@ -135,10 +151,12 @@ public class HeuristicParsers {
return jsondata;
}
JSONObject jsonproperties=new JSONObject();
//application/ld+json
Elements scripts = doc.getElementsByTag("script");
for (Element script: scripts) {//get metadata from <script>
String type = script.attr("type");
String dap=script.attr("data-analytics-provider");
if (type!=null && !type.trim().isEmpty() &&
(type.trim().equalsIgnoreCase("application/ld+json") || type.trim().equalsIgnoreCase("application/json"))) {
@ -152,7 +170,7 @@ public class HeuristicParsers {
jsonStr = jsonStrraw.replace('\r', ' ');
//System.out.println(jsonStr);
//getGraph(jsonStr);
JSONObject jsonproperties=new JSONObject();
//JSONObject jsonproperties=new JSONObject();
jsonproperties=new JSONObject(jsonStr.trim());
if (jsonproperties.has("mainEntity")) {
@ -165,7 +183,47 @@ public class HeuristicParsers {
jsondata.put("properties", jsonproperties);
}
}
if (dap!=null && !dap.trim().isEmpty() &&
(dap.trim().equalsIgnoreCase("ga") )) {
String jsondapStr=script.toString().trim();
//gaData.content = {
int headerLimit=jsondapStr.indexOf("gaData.content = {");
String tmpStr=jsondapStr.trim().substring(headerLimit+1);
int obend=tmpStr.indexOf("}");
String jsstr=tmpStr.substring(0, obend);
System.out.println("--------------------"+jsstr);
}
}
Elements metas = doc.getElementsByTag("meta");
for (Element meta: metas) {//get metadata from <meta>
String name = meta.attr("name");
String property = meta.attr("property");
String content = meta.attr("content");
if (!name.trim().isEmpty() &&
!content.trim().isEmpty() &&
(name.toLowerCase().trim().startsWith("dc.") ||
name.trim().startsWith("citation_")) ||
name.toLowerCase().trim().startsWith("eprints.")){
if (jsonproperties.has(name)) {
String names= jsonproperties.get(name).toString();
content=names+", "+content;
}
if (name.equalsIgnoreCase("eprints.citation")) {
jsondata.put("citation string", content);
}
else
jsonproperties.put(name, content);
}
if (!property.trim().isEmpty() && !content.trim().isEmpty())
jsonproperties.put(property.trim(), content.trim());
}
//get the citation string
Element citationid = doc.getElementById("invenio-csl");
@ -301,10 +359,12 @@ public class HeuristicParsers {
}
else
jsonproperties.put(name, content);
}
if (name.trim().contains("citation_author")) {//to be used to double check authors
authors_affiliation.put(content, "na");
}
if (!property.trim().isEmpty() && !content.trim().isEmpty())
jsonproperties.put(property.trim(), content.trim());