extracting pid from citation, in progress

This commit is contained in:
Cesare Concordia 2022-06-06 18:13:50 +02:00
parent 6cf3981c6a
commit 108693a706
1 changed files with 20 additions and 10 deletions

View File

@ -13,7 +13,11 @@ import java.net.ProtocolException;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HttpsURLConnection;
@ -152,6 +156,7 @@ public class CitationHarvesterImpl {
String jsonText="";
JSONObject jsondata=null;
try {
if (pid!=null && pid.trim()!="" && !pid.trim().equalsIgnoreCase("test"))
@ -208,6 +213,9 @@ public class CitationHarvesterImpl {
public JSONObject getCitationMetadataFromHTML(String pid, String token){
JSONObject jsondata=null;
//System.out.println("################## mypid completo "+pid);
pid=extractURL(pid)[0];
System.out.println("################## mypid "+pid);
if (pid.contains(".handle.net/")) {
jsondata=heup.getAPIMetadata(pid);
@ -231,18 +239,8 @@ public class CitationHarvesterImpl {
}
/*if (jsondata==null || jsondata.length()==0) {
System.out.println("getMetaMetadata ("+pid+")");
jsondata=heup.getMetaMetadata(pid);
}*/
/*if (jsondata==null || jsondata.length()==0) {
System.out.println("getLinkMetadata ("+pid+")");
jsondata= heup.getLinkMetadata(pid);
}*/
if (jsondata!=null && jsondata.has("properties")){
try {
JSONObject testp=(JSONObject) jsondata.get("properties");
@ -345,5 +343,17 @@ public class CitationHarvesterImpl {
return jsondata;
}
/**
 * Matches absolute {@code http://} or {@code https://} URLs (scheme matched
 * case-insensitively) followed by one or more URL-safe characters.
 * Compiled once because {@link Pattern} instances are immutable and reusable.
 */
private static final Pattern URL_PATTERN = Pattern.compile(
        "(http://|https://){1}[\\w\\.\\-/:\\#\\?\\=\\&\\;\\%\\~\\+]+",
        Pattern.CASE_INSENSITIVE);

/**
 * Extracts every http/https URL contained in the given text, in order of appearance.
 *
 * @param text free text to scan (for example a citation string); may be {@code null}
 * @return the matched URLs, never {@code null}; an empty array when {@code text}
 *         is {@code null} or contains no URL
 */
private String[] extractURL(String text) {
    // A null input has no URLs; return an empty result instead of failing in matcher().
    if (text == null) {
        return new String[0];
    }
    List<String> urls = new ArrayList<String>();
    Matcher matcher = URL_PATTERN.matcher(text);
    while (matcher.find()) {
        urls.add(matcher.group());
    }
    return urls.toArray(new String[0]);
}
}