code improved

This commit is contained in:
Cesare Concordia 2021-11-11 16:23:41 +01:00
parent 992464e0a8
commit e729dfc702
2 changed files with 365 additions and 66 deletions

View File

@ -216,8 +216,11 @@ public class CitationHarvesterImpl {
pid=pid.replace("http://", "https://");
}
if (!pid.startsWith("https"))
pid="https://"+pid.trim();
//special cases
if (pid.contains("ien.bg.ac.rs"))
if (pid.contains("ien.bg.ac.rs") || pid.contains("eprints.rclis.org") || pid.contains("opengrey.eu"))
pid=pid.replace("https://", "http://");
//first landing pages
@ -228,38 +231,47 @@ public class CitationHarvesterImpl {
}
if (jsondata==null || jsondata.length()==0) {
/*if (jsondata==null || jsondata.length()==0) {
System.out.println("getMetaMetadata ("+pid+")");
jsondata=heup.getMetaMetadata(pid);
if (jsondata!=null && jsondata.has("properties")){
try {
JSONObject testp=(JSONObject) jsondata.get("properties");
String citdoi="";
if (testp.has("citation_doi") && !(pid.contains("doi.org/"))) {//check if there is metadata in DOI RA
citdoi = testp.getString("citation_doi");
if (citdoi.trim()!="" && !citdoi.contains("doi.org/")) {
citdoi="https://doi.org/"+citdoi.trim();
}
if (citdoi.trim()!="")
pid=citdoi;
}
} catch (JSONException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
}*/
if (jsondata==null || jsondata.length()==0) {
/*if (jsondata==null || jsondata.length()==0) {
System.out.println("getLinkMetadata ("+pid+")");
jsondata= heup.getLinkMetadata(pid);
}
}*/
if (jsondata!=null && jsondata.has("properties")){
try {
JSONObject testp=(JSONObject) jsondata.get("properties");
String citdoi="";
if (testp.has("citation_doi") && !(pid.contains("doi.org/"))) {//check if there is metadata in DOI RA
citdoi = testp.getString("citation_doi");
if (citdoi.trim()!="" && !citdoi.contains("doi.org/")) {
citdoi="https://doi.org/"+citdoi.trim();
}
}else {
if ((testp.has("og:url")) && (testp.getString("og:url").trim().startsWith("https://dl.acm.org/doi/abs/"))) {
citdoi=testp.getString("og:url").trim().replace("https://dl.acm.org/doi/abs/", "https://doi.org/");
}
}
if (citdoi.trim()!="")
pid=citdoi;
} catch (JSONException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
//https://link.springer.com/book/
if (pid.startsWith("https://link.springer.com/book/"))
pid=pid.trim().replace("https://link.springer.com/book/", "https://doi.org/");
//then DOI service providers...
try {
//System.out.println("jsondata.length() "+((JSONObject) jsondata.get("properties")).length());
@ -276,8 +288,9 @@ public class CitationHarvesterImpl {
e1.printStackTrace();
}
if (pid.contains("doi.org") && (jsondata==null || jsondata.length()==0 || jsondata.isNull("citation string"))) {
System.out.println("getCitationCSL ("+pid+")");
if (pid.contains("doi.org") && (jsondata==null || jsondata.length()==0 || jsondata.isNull("citation string")) && !token.contains("testAPI")) {
System.out.println("getCitationCSL ("+pid+") "+ token);
JSONObject jsoncsldata=null;
jsoncsldata=getCitationCSL(pid, token);
if (jsondata==null)

View File

@ -54,8 +54,8 @@ public class HeuristicParsers {
try {
//Check if it is a handle
String dnsPid="";
if (pid.contains(".handle.net/")) {
//get the actual repository URL
String haid=pid.substring(22);
@ -74,18 +74,18 @@ public class HeuristicParsers {
if (tmp.getString("type").contains("URL")) {
JSONObject urlob=(JSONObject) tmp.get("data");
dnsPid=urlob.getString("value");
}
}
}
if (dnsPid.trim()=="")
return (jsondata);
System.out.println("pid "+dnsPid);
String apiURL=repo.getRepoApi(dnsPid);
if (apiURL!="none") {
if (!apiURL.trim().equalsIgnoreCase("none")) {
apiURL=apiURL.replace("viewerPid", pid.substring(23));
System.out.println("apiURL "+apiURL);
Document apiDoc = SSLHelper.getConnection(apiURL).ignoreContentType(true)
@ -103,12 +103,12 @@ public class HeuristicParsers {
}
else
metajsondata.put(child.tagName(), child.text());
}
}
}
if (metajsondata!=null && metajsondata.length()>0) {
String rname=repo.getRepoName(dnsPid);
String rws=repo.getRepoURL(dnsPid);
@ -116,7 +116,7 @@ public class HeuristicParsers {
jsonrepodata.put("name", rname);
if (rws!="none")
jsonrepodata.put("URL", rws);
}
} catch (Exception e) {
// TODO Auto-generated catch block
@ -125,7 +125,7 @@ public class HeuristicParsers {
if (metajsondata!=null && metajsondata.length()>0) {
try {
jsondata.put("citation string", "na");
jsondata.put("metadata source", jsonrepodata);
jsondata.put("properties", metajsondata);
} catch (JSONException e) {
@ -141,7 +141,7 @@ public class HeuristicParsers {
//Document doc = Jsoup.connect(pid).get();
Document doc = SSLHelper.getConnection(pid).ignoreContentType(true)
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").referrer("http://www.google.com").timeout(20 * 1000).get();
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").timeout(30 * 1000).get();
Elements cmdheader=doc.getElementsByTag("cmd:Header");
if (cmdheader!=null && cmdheader.size()>0) {
JSONObject talarproperties=new JSONObject();
@ -168,10 +168,12 @@ public class HeuristicParsers {
String jsonStrraw1=tmpStr.trim().replace("</script>", "");
String jsonStrraw = jsonStrraw1.replace("\r\n", "");
jsonStr = jsonStrraw.replace('\r', ' ');
jsonStr = jsonStr.replace("/*]]>*/", "");
jsonStr = jsonStr.replace("/*<![CDATA[*/", "");
//System.out.println(jsonStr);
//getGraph(jsonStr);
//JSONObject jsonproperties=new JSONObject();
jsonproperties=new JSONObject(jsonStr.trim());
if (jsonproperties.has("mainEntity")) {
jsonproperties=jsonproperties.getJSONObject("mainEntity");
@ -192,7 +194,7 @@ public class HeuristicParsers {
int obend=tmpStr.indexOf("}");
String jsstr=tmpStr.substring(0, obend);
System.out.println("--------------------"+jsstr);
}
}
Elements metas = doc.getElementsByTag("meta");
@ -200,30 +202,32 @@ public class HeuristicParsers {
String name = meta.attr("name");
String property = meta.attr("property");
String content = meta.attr("content");
if (!name.trim().isEmpty() &&
!content.trim().isEmpty() &&
(name.toLowerCase().trim().startsWith("dc.") ||
name.trim().startsWith("citation_")) ||
name.toLowerCase().trim().startsWith("eprints.")){
name.toLowerCase().trim().startsWith("eprints.")){
if (name.equalsIgnoreCase("eprints.citation")) {
jsondata.put("citation string", content);
}
if (jsonproperties.has(name)) {
String names= jsonproperties.get(name).toString();
content=names+", "+content;
}
if (name.equalsIgnoreCase("eprints.citation")) {
jsondata.put("citation string", content);
}
else
jsonproperties.put(name, content);
}
if (!property.trim().isEmpty() && !content.trim().isEmpty())
jsonproperties.put(property.trim(), content.trim());
}
if (jsonproperties!=null && jsonproperties.length()>0) {
jsondata.put("properties", jsonproperties);
}
//get the citation string
Element citationid = doc.getElementById("invenio-csl");
@ -261,6 +265,26 @@ public class HeuristicParsers {
if (!jsonText.trim().isEmpty()) {
jsondata.put("citation string", jsonText.trim());
}
}
if (citationStr.trim().isEmpty() && jsonproperties!=null && jsonproperties.has("citation")){
jsondata.put("citation string", jsonproperties.getString("citation"));
}
if (jsondata==null || jsondata.length()==0) {
System.out.println("private getMetaMetadata ("+pid+")");
jsondata=getMetaMetadata(doc);
}
if (jsondata==null || jsondata.length()==0) {
System.out.println("private getLinkMetadata ("+pid+")");
jsondata=getLinkMetadata(doc);
}
if (jsondata==null || jsondata.length()==0) {
System.out.println("private getAttributeMetadata ("+pid+")");
jsondata=getAttributeMetadata(doc);
}
}
@ -275,12 +299,101 @@ public class HeuristicParsers {
System.out.println ("("+pid+") not available");
}
return jsondata;
}
/**
 * Extracts metadata from an already-parsed document using three strategies,
 * each tried only when the previous ones produced no properties:
 * <ol>
 *   <li>{@code <link>} elements whose {@code rel} is "metadata" (stored as rel -> href);</li>
 *   <li>{@code <dt>}/{@code <dd>} pairs found under elements whose {@code xmlns:dcterms}
 *       attribute contains the Dublin Core terms namespace;</li>
 *   <li>{@code <ul xmlns=...>} trees whose nodes carry the class
 *       "Component_tree_node_content", reading the key from {@code <code>} and the
 *       value from {@code <sample>} inside the node's parent.</li>
 * </ol>
 * Repeated keys are merged into one comma-separated value.
 *
 * @param doc the parsed document to inspect
 * @return a JSON object that may hold "properties" and "citation string"; empty when
 *         nothing was found. If an exception occurs, whatever was collected so far is returned.
 */
private JSONObject getLinkMetadata(Document doc) {
    JSONObject jsondata = new JSONObject();
    JSONObject jsonproperties = new JSONObject();
    try {
        // Strategy 1: <link rel="metadata" href="...">
        for (Element link : doc.getElementsByTag("link")) {
            String rel = link.attr("rel");
            if (rel.trim().equalsIgnoreCase("metadata")) {
                jsonproperties.put(rel, link.attr("href"));
            }
        }

        if (jsonproperties.length() > 0) {
            jsondata.put("properties", jsonproperties);
        } else {
            // Strategy 2: definition lists inside a dcterms-namespaced element.
            Elements xdts = doc.getElementsByAttributeValueContaining("xmlns:dcterms", "http://purl.org/dc/terms/");
            for (Element xdt : xdts) {
                Elements dd = xdt.getElementsByTag("dd");
                Elements dt = xdt.getElementsByTag("dt");
                // The two lists are paired by index; stop at the shorter one so that an
                // unbalanced list cannot raise IndexOutOfBoundsException and discard the
                // pairs already collected.
                int pairs = Math.min(dd.size(), dt.size());
                for (int i = 0; i < pairs; i++) {
                    System.out.println(dt.get(i).text()+" - "+dd.get(i).text());
                    String term = dt.get(i).text().trim();
                    String content = dd.get(i).text().trim();
                    if (term.isEmpty()) {
                        continue;
                    }
                    if (jsonproperties.has(term)) {
                        content = jsonproperties.get(term).toString() + ", " + content;
                    }
                    if (term.equalsIgnoreCase("eprints.citation")) {
                        jsondata.put("citation string", content);
                    } else {
                        jsonproperties.put(term, content);
                    }
                }
            }
            if (jsonproperties.length() > 0) {
                jsondata.put("properties", jsonproperties);
            }
        }

        if (jsonproperties.length() == 0) {
            // Strategy 3: component tree rendered as <ul xmlns="...">.
            for (Element ulxmln : doc.getElementsByTag("ul")) {
                if (!ulxmln.hasAttr("xmlns")) {
                    continue;
                }
                Elements ctncs = ulxmln.getElementsByAttributeValue("class", "Component_tree_node_content");
                for (Element ctnc : ctncs) {
                    Element holder = ctnc.parent();
                    String key = holder.getElementsByTag("code").text().trim();
                    // NOTE(review): the tag name "sample" is kept as-is; confirm it is not
                    // meant to be the standard HTML <samp> element.
                    String value = holder.getElementsByTag("sample").text().trim();
                    if (key.isEmpty()) {
                        continue;
                    }
                    if (jsonproperties.has(key)) {
                        value = jsonproperties.get(key).toString() + ", " + value;
                    }
                    jsonproperties.put(key, value);
                }
            }
            if (jsonproperties.length() > 0) {
                jsondata.put("properties", jsonproperties);
            }
        }
        System.out.println(jsondata);
    }
    catch (Exception e) {
        // pw and sw are declared outside this method (presumably a PrintWriter wrapping a
        // StringWriter - verify). Only a truncated trace is echoed, and only when the
        // captured text is longer than 200 characters.
        e.printStackTrace(pw);
        String sStackTrace = sw.toString(); // stack trace as a string
        if (sStackTrace.length()>200)
            System.out.println(sStackTrace.substring(0, 199));
        pw.flush();
        System.out.println ("&&&&&&&&&&&&&&&&&&&&&&&&& private link metadata not available");
    }
    return jsondata;
}
public JSONObject getLinkMetadata(String pid) {
@ -318,6 +431,179 @@ public class HeuristicParsers {
return jsondata;
}
//
/**
 * Collects metadata from elements that carry a {@code property} attribute whose value
 * starts with "dc", using the element text as the value. Values that share a property
 * are merged into one comma-separated string. The text of the first
 * {@code div[data-style-name]} element, when present, becomes the "citation string".
 *
 * @param doc the parsed document to inspect
 * @return a JSON object that may hold "properties" and "citation string"; empty when
 *         nothing was found. If an exception occurs, whatever was collected so far is returned.
 */
private JSONObject getAttributeMetadata(Document doc) {
    JSONObject jsondata = new JSONObject();
    JSONObject jsonproperties = new JSONObject();
    try {
        // get metadata from <element property="dc...">
        for (Element element : doc.getElementsByAttributeValueStarting("property", "dc")) {
            // Normalise key and value once so a property written with stray whitespace
            // is not stored under two different keys.
            String property = element.attr("property").trim();
            String content = element.text().trim();
            if (property.isEmpty() || content.isEmpty()) {
                continue;
            }
            if (jsonproperties.has(property)) {
                content = jsonproperties.get(property).toString() + ", " + content;
            }
            jsonproperties.put(property, content);
        }

        // <div class="citation-popup" data-style-name="harvard" title="Harvard Citation" style="display:none;">
        Element citationstring = doc.select("div[data-style-name]").first();
        if (citationstring != null) {
            jsondata.put("citation string", citationstring.text().trim());
        }

        if (jsonproperties.length() > 0) {
            jsondata.put("properties", jsonproperties);
        }
    }
    catch (Exception e) {
        // pw and sw are declared outside this method (presumably a PrintWriter wrapping a
        // StringWriter - verify). Only a truncated trace is echoed, and only when the
        // captured text is longer than 200 characters.
        e.printStackTrace(pw);
        String sStackTrace = sw.toString(); // stack trace as a string
        if (sStackTrace.length()>200)
            System.out.println(sStackTrace.substring(0, 199));
        pw.flush();
        System.out.println ("^^^^^^^^^^^^^^^^^^^^^ private attribute metadata not available");
    }
    return jsondata;
}
//
/**
 * Collects metadata from an already-parsed document.
 * <p>
 * When the document contains a {@code cmd:Header} element, the whole document is handed
 * to {@code getTalarData} and its result (if non-null) is returned as "properties"
 * without looking any further. Otherwise the following sources are read in order:
 * <ul>
 *   <li>{@code <meta>} tags whose name starts with "dc.", "citation_" or "eprints."
 *       (repeated names are merged, comma-separated; "eprints.citation" becomes the
 *       "citation string"), plus any {@code <meta property=... content=...>} pair;</li>
 *   <li>{@code span[property]} elements inside the first {@code div[vocab]}, using their
 *       {@code value} attribute;</li>
 *   <li>the first {@code div[data-style-name]}, whose text becomes the "citation string";</li>
 *   <li>{@code div[data-field-name]} elements, keyed by that attribute.</li>
 * </ul>
 *
 * @param doc the parsed document to inspect
 * @return a JSON object that may hold "properties" and "citation string"; empty when
 *         nothing was found. If an exception occurs, whatever was collected so far is returned.
 */
private JSONObject getMetaMetadata(Document doc) {
    JSONObject jsondata = new JSONObject();
    JSONObject jsonproperties = new JSONObject();
    try {
        Elements cmdheader = doc.getElementsByTag("cmd:Header");
        if (cmdheader != null && cmdheader.size() > 0) {
            JSONObject talarproperties = getTalarData(doc);
            if (talarproperties != null) {
                jsondata.put("properties", talarproperties);
            }
            return jsondata;
        }

        // get metadata from <meta>
        for (Element meta : doc.getElementsByTag("meta")) {
            String name = meta.attr("name");
            String property = meta.attr("property");
            String content = meta.attr("content");
            boolean hasContent = !content.trim().isEmpty();

            String lowerName = name.toLowerCase().trim();
            // All three prefixes are grouped: previously "eprints." sat outside the &&
            // chain, so eprints tags with empty content were stored as well.
            boolean harvestedName = lowerName.startsWith("dc.")
                    || name.trim().startsWith("citation_")
                    || lowerName.startsWith("eprints.");

            if (harvestedName && hasContent) {
                // Merge into a separate variable so the accumulated value of the name key
                // does not leak into the property key handled below.
                String merged = content;
                if (jsonproperties.has(name)) {
                    merged = jsonproperties.get(name).toString() + ", " + content;
                }
                if (name.equalsIgnoreCase("eprints.citation")) {
                    jsondata.put("citation string", merged);
                } else {
                    jsonproperties.put(name, merged);
                }
            }

            if (!property.trim().isEmpty() && hasContent) {
                jsonproperties.put(property.trim(), content.trim());
            }
        }

        // get metadata about authors and affiliation from <div vocab="http://schema.org/"> section
        Element record = doc.select("div[vocab]").first();
        if (record != null) {
            // span with property attribute
            for (Element item : record.select("span[property]")) {
                String name = item.attr("property");
                String val = item.attr("value");
                if (!name.trim().isEmpty() && !val.trim().isEmpty()) {
                    jsonproperties.put(name, val);
                }
            }
        }

        // <div class="citation-popup" data-style-name="harvard" title="Harvard Citation" style="display:none;">
        Element citationstring = doc.select("div[data-style-name]").first();
        if (citationstring != null) {
            jsondata.put("citation string", citationstring.text().trim());
        }

        // check if metadata is in the html elements europeana style
        for (Element divmeta : doc.select("div[data-field-name]")) {
            String mdname = divmeta.attr("data-field-name");
            // Compare by content; the previous reference comparison (!= "") did not
            // reliably detect an empty attribute value.
            if (!mdname.isEmpty()) {
                jsonproperties.put(mdname, divmeta.text().trim());
            }
        }

        if (jsonproperties.length() > 0) {
            jsondata.put("properties", jsonproperties);
        }
    }
    catch (Exception e) {
        // pw and sw are declared outside this method (presumably a PrintWriter wrapping a
        // StringWriter - verify). Only a truncated trace is echoed, and only when the
        // captured text is longer than 200 characters.
        e.printStackTrace(pw);
        String sStackTrace = sw.toString(); // stack trace as a string
        if (sStackTrace.length()>200)
            System.out.println(sStackTrace.substring(0, 199));
        pw.flush();
        System.out.println ("^^^^^^^^^^^^^^^^^^^^^ private meta metadata not available");
    }
    return jsondata;
}
public JSONObject getMetaMetadata(String pid){
JSONObject jsondata=new JSONObject();
@ -348,23 +634,23 @@ public class HeuristicParsers {
!content.trim().isEmpty() &&
(name.toLowerCase().trim().startsWith("dc.") ||
name.trim().startsWith("citation_")) ||
name.toLowerCase().trim().startsWith("eprints.")){
name.toLowerCase().trim().startsWith("eprints.")){
if (jsonproperties.has(name)) {
String names= jsonproperties.get(name).toString();
content=names+", "+content;
}
if (name.equalsIgnoreCase("eprints.citation")) {
jsondata.put("citation string", content);
}
else
jsonproperties.put(name, content);
}
if (name.trim().contains("citation_author")) {//to be used to double check authors
authors_affiliation.put(content, "na");
}
if (!property.trim().isEmpty() && !content.trim().isEmpty())
jsonproperties.put(property.trim(), content.trim());
@ -391,7 +677,7 @@ public class HeuristicParsers {
if (citationstring!=null) {
jsondata.put("citation string", citationstring.text().trim());
}
//check if metadata is in the html elements europeana style
Elements euRecord = doc.select("div[data-field-name]");
@ -400,8 +686,8 @@ public class HeuristicParsers {
if (mdname != "") {
jsonproperties.put(mdname, divmeta.text().trim());
}
}
if (jsonproperties!=null && jsonproperties.length()>0)
@ -581,7 +867,7 @@ public class HeuristicParsers {
if (proxyarray!=null) {
taljsondata.put("cmd:ResourceProxyList", proxyarray);
}
//cmdp:GeneralInfo
Elements generalInfo=doc.getElementsByTag("cmdp:GeneralInfo");
proxyarray=new JSONArray();
@ -678,10 +964,10 @@ public class HeuristicParsers {
jsonlo.put("xml:lang", legalown.attr("xml:lang"));
proxyarray.put(jsonlo);
}
}
taljsondata.put("cmdp:LegalOwner", proxyarray);
//cmdp:TimeCoverage
Elements timecoves=doc.getElementsByTag("cmdp:TimeCoverage");
proxyarray=new JSONArray();
@ -692,10 +978,10 @@ public class HeuristicParsers {
jsonlo.put("xml:lang", timecov.attr("xml:lang"));
proxyarray.put(jsonlo);
}
}
taljsondata.put("cmdp:TimeCoverage", proxyarray);
return taljsondata;
}