review and changes in heuristic parser
This commit is contained in:
parent
7a47672c14
commit
992464e0a8
|
@ -114,7 +114,8 @@ public class CitationHarvesterImpl {
|
|||
|
||||
HttpURLConnection myURLConnection = (HttpURLConnection)myURL.openConnection();
|
||||
//myURLConnection.setRequestProperty("Accept", "application/rdf+xml;q=0.5, application/vnd.citationstyles.csl+json;q=1.0");
|
||||
myURLConnection.setRequestProperty("Accept", "application/vnd.citationstyles.csl+json, application/rdf+xml\\");
|
||||
//curl -LH "Accept: application/vnd.citationstyles.csl+json, application/rdf+xml" https://doi.org/10.1080/01930826.2016.1186969
|
||||
myURLConnection.setRequestProperty("Accept", "application/vnd.citationstyles.csl+json, application/rdf+xml");
|
||||
myURLConnection.setConnectTimeout(18000);
|
||||
InputStream mis = myURLConnection.getInputStream();
|
||||
BufferedReader rd = new BufferedReader(new InputStreamReader(mis, Charset.forName("UTF-8")));
|
||||
|
@ -208,7 +209,9 @@ public class CitationHarvesterImpl {
|
|||
|
||||
JSONObject jsondata=null;
|
||||
|
||||
jsondata=heup.getAPIMetadata(pid);
|
||||
if (pid.contains(".handle.net/")) {
|
||||
jsondata=heup.getAPIMetadata(pid);
|
||||
}
|
||||
if (pid.startsWith("http://")) {
|
||||
pid=pid.replace("http://", "https://");
|
||||
}
|
||||
|
@ -222,12 +225,35 @@ public class CitationHarvesterImpl {
|
|||
if (jsondata==null || jsondata.length()==0) {
|
||||
System.out.println("getZenodoMetadataJSONLD ("+pid+")");
|
||||
jsondata=heup.getZenodoMetadataJSONLD(pid);
|
||||
|
||||
}
|
||||
|
||||
if (jsondata==null || jsondata.length()==0) {
|
||||
System.out.println("getMetaMetadata ("+pid+")");
|
||||
jsondata=heup.getMetaMetadata(pid);
|
||||
}
|
||||
if (jsondata!=null && jsondata.has("properties")){
|
||||
try {
|
||||
JSONObject testp=(JSONObject) jsondata.get("properties");
|
||||
String citdoi="";
|
||||
if (testp.has("citation_doi") && !(pid.contains("doi.org/"))) {//check if there is metadata in DOI RA
|
||||
|
||||
citdoi = testp.getString("citation_doi");
|
||||
if (citdoi.trim()!="" && !citdoi.contains("doi.org/")) {
|
||||
citdoi="https://doi.org/"+citdoi.trim();
|
||||
}
|
||||
if (citdoi.trim()!="")
|
||||
pid=citdoi;
|
||||
|
||||
}
|
||||
|
||||
} catch (JSONException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (jsondata==null || jsondata.length()==0) {
|
||||
System.out.println("getLinkMetadata ("+pid+")");
|
||||
|
@ -249,6 +275,7 @@ public class CitationHarvesterImpl {
|
|||
// TODO Auto-generated catch block
|
||||
e1.printStackTrace();
|
||||
}
|
||||
|
||||
if (pid.contains("doi.org") && (jsondata==null || jsondata.length()==0 || jsondata.isNull("citation string"))) {
|
||||
System.out.println("getCitationCSL ("+pid+")");
|
||||
JSONObject jsoncsldata=null;
|
||||
|
@ -256,14 +283,16 @@ public class CitationHarvesterImpl {
|
|||
if (jsondata==null)
|
||||
jsondata=jsoncsldata;
|
||||
else {
|
||||
Iterator<String> ite= jsoncsldata.keys();
|
||||
while (ite.hasNext()) {
|
||||
String key=ite.next();
|
||||
try {
|
||||
jsondata.put(key, jsoncsldata.get(key));
|
||||
} catch (JSONException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
if(jsoncsldata!=null && jsoncsldata.length()>0) {
|
||||
Iterator<String> ite= jsoncsldata.keys();
|
||||
while (ite.hasNext()) {
|
||||
String key=ite.next();
|
||||
try {
|
||||
jsondata.put(key, jsoncsldata.get(key));
|
||||
} catch (JSONException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -271,6 +300,25 @@ public class CitationHarvesterImpl {
|
|||
|
||||
}
|
||||
|
||||
if(jsondata!=null && jsondata.has("properties") && pid.contains("doi.org")){
|
||||
|
||||
getCNCit=false;
|
||||
JSONObject mydoijo=new JSONObject();
|
||||
mydoijo=getCitationMetadata(pid, token);
|
||||
if (mydoijo!=null) {
|
||||
try {
|
||||
if (mydoijo.has("properties"))
|
||||
jsondata.put("ra_properties", mydoijo.get("properties"));
|
||||
|
||||
} catch (JSONException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
getCNCit=true;
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (jsondata==null || jsondata.length()==0) {
|
||||
try {
|
||||
|
|
|
@ -26,6 +26,11 @@ import java.net.URL;
|
|||
public class CSVHelper {
|
||||
|
||||
static CSVHelper app = new CSVHelper();
|
||||
static BufferedReader br;
|
||||
static CSVParser parser;
|
||||
/**
 * Creates a CSV helper. The constructor body is empty, so no state is
 * initialized here.
 */
public CSVHelper() {
|
||||
|
||||
}
|
||||
public static void main(String[] args) {
|
||||
|
||||
try {
|
||||
|
@ -62,7 +67,7 @@ public class CSVHelper {
|
|||
return new File(resource.toURI());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public CSVParser getRepoProfiles() throws URISyntaxException{
|
||||
|
||||
File repo=app.getFileFromResource("repoprofiles.csv");
|
||||
|
@ -79,7 +84,7 @@ public class CSVHelper {
|
|||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
public String getRepoApi(String repositoryurl) throws URISyntaxException{
|
||||
|
||||
File repo=app.getFileFromResource("repoprofiles.csv");
|
||||
|
@ -87,7 +92,7 @@ public class CSVHelper {
|
|||
BufferedReader br = new BufferedReader(new FileReader(repo));
|
||||
CSVParser parser = CSVFormat.DEFAULT.withDelimiter(';').withHeader().parse(br);
|
||||
) {
|
||||
|
||||
|
||||
int slashpos= repositoryurl.indexOf('/', 8);
|
||||
String repoid=repositoryurl.substring(0, slashpos);
|
||||
System.out.println("repoid "+repoid);
|
||||
|
@ -101,7 +106,7 @@ public class CSVHelper {
|
|||
System.out.println(record.get("Repository name") +" - "+record.get("api")+" - "+record.get("viewer"));
|
||||
return record.get("viewer");
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
return "none";
|
||||
|
||||
|
@ -111,5 +116,59 @@ public class CSVHelper {
|
|||
}
|
||||
return null;
|
||||
}
|
||||
public String getRepoName(String repositoryurl) throws URISyntaxException{
|
||||
|
||||
File repo=app.getFileFromResource("repoprofiles.csv");
|
||||
try(
|
||||
BufferedReader br = new BufferedReader(new FileReader(repo));
|
||||
CSVParser parser = CSVFormat.DEFAULT.withDelimiter(';').withHeader().parse(br);
|
||||
) {
|
||||
|
||||
int slashpos= repositoryurl.indexOf('/', 8);
|
||||
String repoid=repositoryurl.substring(0, slashpos);
|
||||
//System.out.println("repoid "+repoid);
|
||||
for(CSVRecord record : parser) {
|
||||
String apiurl=record.get("api");
|
||||
if(record.get("Website").trim().contains(repoid) || apiurl.contains(repoid) || record.get("dns").trim().contains(repoid)) {
|
||||
System.out.println(record.get("Repository name") +" - "+record.get("api")+" - "+record.get("viewer"));
|
||||
return record.get("Repository name");
|
||||
}
|
||||
|
||||
}
|
||||
return "none";
|
||||
|
||||
}
|
||||
catch (Exception e) {
|
||||
System.out.println(e);
|
||||
}
|
||||
return "none";
|
||||
}
|
||||
public String getRepoURL(String repositoryurl) throws URISyntaxException{
|
||||
|
||||
File repo=app.getFileFromResource("repoprofiles.csv");
|
||||
try(
|
||||
BufferedReader br = new BufferedReader(new FileReader(repo));
|
||||
CSVParser parser = CSVFormat.DEFAULT.withDelimiter(';').withHeader().parse(br);
|
||||
) {
|
||||
|
||||
int slashpos= repositoryurl.indexOf('/', 8);
|
||||
String repoid=repositoryurl.substring(0, slashpos);
|
||||
//System.out.println("repoid "+repoid);
|
||||
for(CSVRecord record : parser) {
|
||||
String apiurl=record.get("api");
|
||||
if(record.get("Website").trim().contains(repoid) || apiurl.contains(repoid) || record.get("dns").trim().contains(repoid)) {
|
||||
System.out.println(record.get("Repository name") +" - "+record.get("api")+" - "+record.get("viewer"));
|
||||
return record.get("Website");
|
||||
}
|
||||
|
||||
}
|
||||
return "none";
|
||||
|
||||
}
|
||||
catch (Exception e) {
|
||||
System.out.println(e);
|
||||
}
|
||||
return "none";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -50,6 +50,7 @@ public class HeuristicParsers {
|
|||
public JSONObject getAPIMetadata(String pid) {
|
||||
JSONObject metajsondata=new JSONObject();
|
||||
JSONObject jsondata=new JSONObject();
|
||||
JSONObject jsonrepodata=new JSONObject();
|
||||
try {
|
||||
//Check if it is a handle
|
||||
String dnsPid="";
|
||||
|
@ -73,10 +74,14 @@ public class HeuristicParsers {
|
|||
if (tmp.getString("type").contains("URL")) {
|
||||
JSONObject urlob=(JSONObject) tmp.get("data");
|
||||
dnsPid=urlob.getString("value");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (dnsPid.trim()=="")
|
||||
return (jsondata);
|
||||
|
||||
System.out.println("pid "+dnsPid);
|
||||
String apiURL=repo.getRepoApi(dnsPid);
|
||||
|
||||
|
@ -103,6 +108,16 @@ public class HeuristicParsers {
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
if (metajsondata!=null && metajsondata.length()>0) {
|
||||
String rname=repo.getRepoName(dnsPid);
|
||||
String rws=repo.getRepoURL(dnsPid);
|
||||
if (rname!="none")
|
||||
jsonrepodata.put("name", rname);
|
||||
if (rws!="none")
|
||||
jsonrepodata.put("URL", rws);
|
||||
|
||||
}
|
||||
} catch (Exception e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
|
@ -110,7 +125,8 @@ public class HeuristicParsers {
|
|||
if (metajsondata!=null && metajsondata.length()>0) {
|
||||
try {
|
||||
jsondata.put("citation string", "na");
|
||||
jsondata.put("metadata source", "Repository API");
|
||||
|
||||
jsondata.put("metadata source", jsonrepodata);
|
||||
jsondata.put("properties", metajsondata);
|
||||
} catch (JSONException e) {
|
||||
// TODO Auto-generated catch block
|
||||
|
@ -125,7 +141,7 @@ public class HeuristicParsers {
|
|||
//Document doc = Jsoup.connect(pid).get();
|
||||
|
||||
Document doc = SSLHelper.getConnection(pid).ignoreContentType(true)
|
||||
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").timeout(20 * 1000).get();
|
||||
.userAgent("Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0").referrer("http://www.google.com").timeout(20 * 1000).get();
|
||||
Elements cmdheader=doc.getElementsByTag("cmd:Header");
|
||||
if (cmdheader!=null && cmdheader.size()>0) {
|
||||
JSONObject talarproperties=new JSONObject();
|
||||
|
@ -135,10 +151,12 @@ public class HeuristicParsers {
|
|||
return jsondata;
|
||||
}
|
||||
|
||||
JSONObject jsonproperties=new JSONObject();
|
||||
//application/ld+json
|
||||
Elements scripts = doc.getElementsByTag("script");
|
||||
for (Element script: scripts) {//get metadata from <script>
|
||||
String type = script.attr("type");
|
||||
String dap=script.attr("data-analytics-provider");
|
||||
if (type!=null && !type.trim().isEmpty() &&
|
||||
(type.trim().equalsIgnoreCase("application/ld+json") || type.trim().equalsIgnoreCase("application/json"))) {
|
||||
|
||||
|
@ -152,7 +170,7 @@ public class HeuristicParsers {
|
|||
jsonStr = jsonStrraw.replace('\r', ' ');
|
||||
//System.out.println(jsonStr);
|
||||
//getGraph(jsonStr);
|
||||
JSONObject jsonproperties=new JSONObject();
|
||||
//JSONObject jsonproperties=new JSONObject();
|
||||
|
||||
jsonproperties=new JSONObject(jsonStr.trim());
|
||||
if (jsonproperties.has("mainEntity")) {
|
||||
|
@ -165,7 +183,47 @@ public class HeuristicParsers {
|
|||
jsondata.put("properties", jsonproperties);
|
||||
}
|
||||
}
|
||||
if (dap!=null && !dap.trim().isEmpty() &&
|
||||
(dap.trim().equalsIgnoreCase("ga") )) {
|
||||
String jsondapStr=script.toString().trim();
|
||||
//gaData.content = {
|
||||
int headerLimit=jsondapStr.indexOf("gaData.content = {");
|
||||
String tmpStr=jsondapStr.trim().substring(headerLimit+1);
|
||||
int obend=tmpStr.indexOf("}");
|
||||
String jsstr=tmpStr.substring(0, obend);
|
||||
System.out.println("--------------------"+jsstr);
|
||||
|
||||
}
|
||||
}
|
||||
Elements metas = doc.getElementsByTag("meta");
|
||||
for (Element meta: metas) {//get metadata from <meta>
|
||||
String name = meta.attr("name");
|
||||
String property = meta.attr("property");
|
||||
String content = meta.attr("content");
|
||||
|
||||
if (!name.trim().isEmpty() &&
|
||||
!content.trim().isEmpty() &&
|
||||
(name.toLowerCase().trim().startsWith("dc.") ||
|
||||
name.trim().startsWith("citation_")) ||
|
||||
name.toLowerCase().trim().startsWith("eprints.")){
|
||||
if (jsonproperties.has(name)) {
|
||||
String names= jsonproperties.get(name).toString();
|
||||
content=names+", "+content;
|
||||
}
|
||||
|
||||
if (name.equalsIgnoreCase("eprints.citation")) {
|
||||
jsondata.put("citation string", content);
|
||||
}
|
||||
else
|
||||
jsonproperties.put(name, content);
|
||||
}
|
||||
|
||||
if (!property.trim().isEmpty() && !content.trim().isEmpty())
|
||||
jsonproperties.put(property.trim(), content.trim());
|
||||
|
||||
}
|
||||
|
||||
|
||||
//get the citation string
|
||||
|
||||
Element citationid = doc.getElementById("invenio-csl");
|
||||
|
@ -301,10 +359,12 @@ public class HeuristicParsers {
|
|||
}
|
||||
else
|
||||
jsonproperties.put(name, content);
|
||||
|
||||
}
|
||||
if (name.trim().contains("citation_author")) {//to be used to double check authors
|
||||
authors_affiliation.put(content, "na");
|
||||
}
|
||||
|
||||
if (!property.trim().isEmpty() && !content.trim().isEmpty())
|
||||
jsonproperties.put(property.trim(), content.trim());
|
||||
|
||||
|
|
Loading…
Reference in New Issue