diff --git a/.github/workflows/snapshot_deploy.yml b/.github/workflows/snapshot_deploy.yml
index f75c29486..e61a1fd7d 100644
--- a/.github/workflows/snapshot_deploy.yml
+++ b/.github/workflows/snapshot_deploy.yml
@@ -77,4 +77,5 @@ jobs:
run: |
echo "Deploying to https://maven.pkg.github.com/${REPO} with revision ${REVISION}"
mvn deploy -DskipTests \
- -Dgithub.repo.url="https://maven.pkg.github.com/${REPO}"
+ -Dgithub.repo.url="https://maven.pkg.github.com/${REPO}" \
+ -DskipNexusStagingDeployMojo=true
diff --git a/core/src/main/java/org/dbpedia/extraction/nif/Link.java b/core/src/main/java/org/dbpedia/extraction/nif/Link.java
index d6b6ffb80..ddfadf582 100644
--- a/core/src/main/java/org/dbpedia/extraction/nif/Link.java
+++ b/core/src/main/java/org/dbpedia/extraction/nif/Link.java
@@ -11,11 +11,29 @@ public class Link implements Comparable {
private boolean topicLink = false;
private boolean topicPartLink = false;
private boolean surfaceFormLink = false;
-
+ private boolean citation = false;
+ private String citationId = "";
+
public Link() {
-
+
+ }
+
+ public boolean isCitation() {
+ return citation;
+ }
+
+ public void setCitation(boolean citation) {
+ this.citation = citation;
}
-
+
+ public String getCitationId() {
+ return citationId;
+ }
+
+ public void setCitationId(String citationId) {
+ this.citationId = citationId;
+ }
+
public boolean isSurfaceFormLink() {
return surfaceFormLink;
}
@@ -91,12 +109,12 @@ public void setExternal(boolean external) {
@Override
public int compareTo(Link link) {
// TODO Auto-generated method stub
- if(this.wordStart==link.getWordStart())
+ if (this.wordStart == link.getWordStart())
return 0;
- else if(this.wordStart paragraphs = null;
private Paragraph paragraph = null;
- private Link tempLink;
+ private Link tempLink;
private boolean inSup = false;
private boolean invisible = false;
- private NifExtractorContext context;
+ private NifExtractorContext context;
private ArrayList errors = new ArrayList<>();
-
+
public LinkExtractor(NifExtractorContext context) {
- paragraphs = new ArrayList();
+ paragraphs = new ArrayList();
this.context = context;
}
-
+
/**
* Gets called when entering an element
- * -handle text cleanup and remove Wikipedia specific stuff like reference numbers
- * -if we encounter a link, we make a new nif:Word
- * -we get the text out of a whitelist of elements.
- * If we encounter a non-whitelisted element, we set this.skipLevel to the current depth
- * of the dom tree and skip everything until we are back to that depth
- * -this thing badly needs refactoring
+ * - handle text cleanup and remove Wikipedia specific stuff like reference
+ * numbers
+ * - if we encounter a link, we make a new nif:Word
+ * - we get the text out of a whitelist of elements.
+ * If we encounter a non-whitelisted element, we set this.skipLevel to the
+ * current depth of the dom tree and skip everything until we are back to
+ * that depth
+ * - this thing badly needs refactoring
*/
-
+
public void head(Node node, int depth) {
- if(skipLevel>=0){
+ if (skipLevel >= 0) {
return;
}
- if(paragraph == null) {
+ if (paragraph == null) {
paragraph = new Paragraph(0, "", "p");
}
- //ignore all content inside invisible tags
- if(invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) {
+ // ignore all content inside invisible tags
+ if (invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) {
invisible = true;
return;
}
- if(node.nodeName().equals("#text")) {
- String tempText = node.toString();
-
- //replace no-break spaces because unescape doesn't deal with them
- tempText = StringEscapeUtils.unescapeHtml4(tempText);
- tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars());
- tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "");
-
- //this text node is the content of an element: make a new nif:Word
- if(inLink) {
- if(!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) //not!
- {
- tempLink.setLinkText(tempText);
- tempLink.setWordStart(paragraph.getLength() + (Paragraph.FollowedByWhiteSpace(paragraph.getText()) ? 1 : 0));
- paragraph.addText(tempText);
- tempLink.setWordEnd(paragraph.getLength());
- }
- else{ // -> filter out hidden links to the underlying template
- errors.add("found Template in resource: " + this.context.resource + ": " + tempText);
- return;
- }
- }
- else
- paragraph.addText(tempText);
+ if (node.nodeName().equals("#text")) {
+ String tempText = node.toString();
+
+ // replace no-break spaces because unescape doesn't deal with them
+ tempText = StringEscapeUtils.unescapeHtml4(tempText);
+ tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars());
+ tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "");
+
+ // this text node is the content of an element: make a new nif:Word
+ if (inLink) {
+ if (!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) // not!
+ {
+ tempLink.setLinkText(tempText);
+ tempLink.setWordStart(
+ paragraph.getLength() + (Paragraph.FollowedByWhiteSpace(paragraph.getText()) ? 1 : 0));
+ paragraph.addText(tempText);
+ tempLink.setWordEnd(paragraph.getLength());
+ } else { // -> filter out hidden links to the underlying template
+ errors.add("found Template in resource: " + this.context.resource + ": " + tempText);
+ return;
+ }
+ } else
+ paragraph.addText(tempText);
}
- else if(node.nodeName().equals("a")) {
+ else if (node.nodeName().equals("a")) {
- String link = node.attr("href");
- //TODO central string management
+ String link = node.attr("href");
+ // TODO central string management
/**
- * remove internal links linking to mediawiki meta pages. Also removes links that contain ":".
+ * remove internal links linking to mediawiki meta pages. Also removes links
+ * that contain ":".
* Wikipedia api standard link looks like (allowed):
* philosopher
- * see Schopenhauer: https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400
+ * see Schopenhauer:
+ * https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400
*/
- String linkPrefix = "/wiki/";
+ String linkPrefix = "/wiki/";
// SPECIAL CASE FOR RESTAPI PARSING https://en.wikipedia.org/api/rest_v1/
- if(node.hasAttr("rel")) {
+ if (node.hasAttr("rel")) {
String relType = node.attr("rel");
- if(relType.equals("mw:WikiLink")){
+ if (relType.equals("mw:WikiLink")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);
@@ -109,13 +112,20 @@ else if(node.nodeName().equals("a")) {
String uri = cleanLink(node.attr("href"), false);
setUri(uri);
- //simple example of Help:IPA
- // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]
+ // simple example of Help:IPA
+ // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]
} else if (link.contains(linkPrefix) && link.contains(":")) {
/**
* TODO buggy
* Cleans up child nodes: difficult example
- * /ˈʃoʊpənhaʊ.ər/
+ * /ˈʃoʊpənhaʊ.ər/
*/
if (!node.childNodes().isEmpty()) {
if (node.childNode(0).nodeName().equals("#text") &&
@@ -128,93 +138,98 @@ else if(node.nodeName().equals("a")) {
} else {
skipLevel = depth;
}
- //TODO add example
+ // TODO add example
} else if (node.attr("class").equals("external text")) {
- //don't skip external links
+ // don't skip external links
tempLink = new Link();
String uri = cleanLink(node.attr("href"), true);
setUri(uri);
+ } else if (link.startsWith("#cite_note-")) {
+ tempLink = new Link();
+ tempLink.setCitation(true);
+ tempLink.setCitationId(link.substring(1));
+ inLink = true;
} else {
skipLevel = depth;
}
}
- } else if(node.nodeName().equals("p")) {
- if(paragraph != null) {
- addParagraph("p");
- }
- else
- paragraph = new Paragraph(0, "", "p");
- } else if(node.nodeName().equals("sup")) {
+ } else if (node.nodeName().equals("p")) {
+ if (paragraph != null) {
+ addParagraph("p");
+ } else
+ paragraph = new Paragraph(0, "", "p");
+ } else if (node.nodeName().equals("sup")) {
inSup = true;
- } else if(node.nodeName().matches("h\\d")) {
- addParagraph(node.nodeName());
- } else if(node.nodeName().equals("table")) {
- addParagraph("table");
- paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "table", node.attr("class"), node.attr("id"));
- addParagraph("p");
- skipLevel = depth;
- } else if(node.nodeName().equals("span")) {
- //denote notes
-
- if(node.attr("class").contains("notebegin"))
- addParagraph("note");
-
- } else if(node.nodeName().equals("math")) {
- addParagraph("math");
- paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "math", "tex", null);
- addParagraph("p");
- skipLevel = depth;
- }
+ } else if (node.nodeName().matches("h\\d")) {
+ addParagraph(node.nodeName());
+ } else if (node.nodeName().equals("table")) {
+ addParagraph("table");
+ paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "table", node.attr("class"),
+ node.attr("id"));
+ addParagraph("p");
+ skipLevel = depth;
+ } else if (node.nodeName().equals("span")) {
+ // denote notes
+
+ if (node.attr("class").contains("notebegin"))
+ addParagraph("note");
+
+ } else if (node.nodeName().equals("math")) {
+ addParagraph("math");
+ paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "math", "tex", null);
+ addParagraph("p");
+ skipLevel = depth;
+ }
}
private void setUri(String uri) {
- if(uri!=null) {
- tempLink.setUri(uri);
- tempLink.setExternal(true);
- inLink = true;
- } else {
- tempLink = new Link();
- }
+ if (uri != null) { // a target was resolved: complete the pending link
+ tempLink.setUri(uri);
+ tempLink.setExternal(true); // NOTE(review): also set when the caller used cleanLink(..., false) for a wiki link - confirm intended
+ inLink = true; // while set, head() stores the next text node as this link's text
+ } else {
+ tempLink = new Link(); // no target: swap in a blank link; inLink is left as it was
+ }
}
-
+
private String cleanLink(String uri, boolean external) {
- if(!external) {
+ if (!external) { // wiki-internal href: turn it into a DBpedia resource IRI
String linkPrefix = "/wiki/";
- String linkPrefix2= "./";
- if(uri.contains(linkPrefix)){
- uri=uri.substring(uri.indexOf("?title=")+7);
+ String linkPrefix2 = "./";
+ if (uri.contains(linkPrefix)) {
+ uri = uri.substring(uri.indexOf("?title=") + 7); // keeps what follows "?title="; if absent indexOf is -1, so the first 6 chars (the length of "/wiki/") are dropped
} else if (uri.contains(linkPrefix2)) {
- uri=uri.substring(uri.indexOf("?title=")+3);
+ uri = uri.substring(uri.indexOf("?title=") + 3); // if "?title=" is absent this is substring(2), the length of "./"; NOTE(review): if present the result starts at "tle=" - confirm that cannot occur
}
- //TODO central string management
- if(!this.context.language.equals("en")) {
- uri="http://"+this.context.language+".dbpedia.org/resource/"+uri;
+ // TODO central string management
+ if (!this.context.language.equals("en")) {
+ uri = "http://" + this.context.language + ".dbpedia.org/resource/" + uri; // language-specific host for every language other than "en"
} else {
- uri="http://dbpedia.org/resource/"+uri;
+ uri = "http://dbpedia.org/resource/" + uri;
}
uri = uri.replace("&action=edit&redlink=1", "");
-
+
} else {
- //there are links that contain illegal hostnames
+ // there are links that contain illegal hostnames
try {
- if(uri.startsWith("//"))
- uri = "http:"+uri;
- uri = URLEncoder.encode(uri,"UTF-8");
+ if (uri.startsWith("//")) // protocol-relative href: give it an explicit scheme
+ uri = "http:" + uri;
+ uri = URLEncoder.encode(uri, "UTF-8"); // encode the whole string; the next line turns %3A, %2F and %2E back into ':', '/' and '.'
uri = uri.replace("%3A", ":").replace("%2F", "/").replace("%2E", ".");
-
- } catch(UnsupportedEncodingException e) {
- //this doesn't happen
+
+ } catch (UnsupportedEncodingException e) {
+ // not expected: "UTF-8" is a charset every Java platform is required to support
e.printStackTrace();
}
}
return UriUtils.uriToDbpediaIri(uri).toString();
}
-
+
public void tail(Node node, int depth) {
- if(skipLevel>0) {
- if(skipLevel==depth) {
+ if (skipLevel > 0) {
+ if (skipLevel == depth) {
skipLevel = -1;
return;
} else {
@@ -222,74 +237,68 @@ public void tail(Node node, int depth) {
}
}
- if(node.nodeName().equals("a") && inLink) {
+ if (node.nodeName().equals("a") && inLink) {
inLink = false;
paragraph.addLink(tempLink);
tempLink = new Link();
- }
- else if(invisible && node.attr("style").matches(".*display\\s*:\\s*none.*")) {
- invisible = false;
- }
- else if(node.nodeName().equals("p") && paragraph != null) {
- addParagraph("p");
- }
- else if(node.nodeName().equals("sup") && inSup) {
+ } else if (invisible && node.attr("style").matches(".*display\\s*:\\s*none.*")) {
+ invisible = false;
+ } else if (node.nodeName().equals("p") && paragraph != null) {
+ addParagraph("p");
+ } else if (node.nodeName().equals("sup") && inSup) {
inSup = false;
+ } else if (node.nodeName().matches("h\\d")) {
+ addParagraph("p");
+ } else if (node.nodeName().equals("span")) {
+ if (node.attr("class").contains("noteend"))
+ addParagraph("p");
}
- else if(node.nodeName().matches("h\\d")) {
- addParagraph("p");
- }
- else if(node.nodeName().equals("span")) {
- if(node.attr("class").contains("noteend"))
- addParagraph("p");
- }
}
-
+
public List getParagraphs() {
- if(paragraph != null && paragraph.getLength() > 0)
- {
- paragraphs.add(paragraph);
- paragraph = null;
- }
- return paragraphs;
+ if (paragraph != null && paragraph.getLength() > 0) { // flush the paragraph still being built, if it has any length
+ paragraphs.add(paragraph);
+ paragraph = null; // once cleared, a repeated call skips this branch
+ }
+ return paragraphs; // the internal list itself, not a copy
}
- private void addParagraph(String newTag){
- if(paragraph.getLength() != 0 || paragraph.getHtmlStrings().size() > 0)
- paragraphs.add(paragraph);
+ private void addParagraph(String newTag) { // stores the current paragraph (if non-empty) and starts a fresh one for newTag
+ if (paragraph.getLength() != 0 || paragraph.getHtmlStrings().size() > 0) // kept only when it has length or stored HTML strings
+ paragraphs.add(paragraph);
- paragraph = new Paragraph(0, "", (newTag == null ? "p" : newTag));
- }
+ paragraph = new Paragraph(0, "", (newTag == null ? "p" : newTag)); // a null tag falls back to "p"
+ }
- public int getTableCount(){
- int count =0;
- for(Paragraph p : this.getParagraphs()){
- count += paragraph.getHtmlStrings().size();
+ public int getTableCount() { // sum of getHtmlStrings().size() over all collected paragraphs
+ int count = 0;
+ for (Paragraph p : this.getParagraphs()) { // getParagraphs() also flushes the paragraph in progress
+ count += p.getHtmlStrings().size(); // was `paragraph`: the field, which is null once getParagraphs() has flushed it
}
return count;
}
- public ArrayList getErrors(){
+ public ArrayList getErrors() { // messages added via errors.add(...) while visiting nodes; returns the live list, not a copy
return errors;
}
- private String[] replaceChars() {
- String[] rep = new String[256];
- rep['\n'] = "";
- rep['\u00A0'] = " ";
- return rep;
- }
-
- public static class NifExtractorContext {
- private String language;
- private String resource;
- private String wikipediaTemplateString;
-
- public NifExtractorContext(String language, String resource, String templateString){
- this.language = language;
- this.resource = resource;
- this.wikipediaTemplateString = templateString;
- }
- }
+ private String[] replaceChars() { // replacement table handed to StringUtils.escape in head(), indexed by char code
+ String[] rep = new String[256];
+ rep['\n'] = ""; // entry for line feed: empty string
+ rep['\u00A0'] = " "; // entry for no-break space: a plain space
+ return rep;
+ }
+
+ public static class NifExtractorContext { // per-extraction settings passed to the LinkExtractor constructor
+ private String language; // compared with "en" in cleanLink to choose the resource host
+ private String resource; // in the visible code, only used in the "found Template in resource" error message
+ private String wikipediaTemplateString; // link text starting with this plus ":" is rejected in head()
+
+ public NifExtractorContext(String language, String resource, String templateString) {
+ this.language = language;
+ this.resource = resource;
+ this.wikipediaTemplateString = templateString;
+ }
+ }
}
diff --git a/core/src/main/resources/addonlangs.json b/core/src/main/resources/addonlangs.json
index 0538dd78a..8b45753b8 100644
--- a/core/src/main/resources/addonlangs.json
+++ b/core/src/main/resources/addonlangs.json
@@ -79,5 +79,11 @@
"name": "Cantonese",
"isoCode": "yue",
"iso639_3": "yue"
+ },
+ "en": {
+ "wikiCode": "en",
+ "name": "English",
+ "isoCode": "en",
+ "iso639_3": "eng"
}
}
\ No newline at end of file
diff --git a/core/src/main/resources/nifextractionconfig.json b/core/src/main/resources/nifextractionconfig.json
index c4548f431..d66d3ceb5 100644
--- a/core/src/main/resources/nifextractionconfig.json
+++ b/core/src/main/resources/nifextractionconfig.json
@@ -14,7 +14,6 @@
"nif-remove-elements":[
".noprint",
".haudio",
- "sup.reference",
"span.mw-editsection",
".error",
"#coordinates",
diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala
index e9beac491..326924411 100644
--- a/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala
@@ -22,7 +22,7 @@ extends PageNodeExtractor
{
private val language = context.language
- private val replaceString = DisambiguationExtractorConfig.disambiguationTitlePartMap(language.wikiCode)
+ private val replaceString = DisambiguationExtractorConfig.disambiguationTitlePartMap.getOrElse(language.wikiCode, " (disambiguation)")
val wikiPageDisambiguatesProperty = context.ontology.properties("wikiPageDisambiguates")
diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
index 130d2c7e9..d2d8f47c5 100755
--- a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
@@ -39,6 +39,7 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
protected val templateString = "Template"
private val sectionMap = new mutable.HashMap[PageSection, ExtractedSection]()
+ private val citationMap = new mutable.HashMap[String, String]()
/**
* Extract the relevant html page divided in sections and paragraphs
@@ -285,9 +286,17 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
words += nifLinks(word, RdfNamespace.NIF.append("beginIndex"), (offset + link.getWordStart).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger"))
words += nifLinks(word, RdfNamespace.NIF.append("endIndex"), (offset + link.getWordEnd).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger"))
words += nifLinks(word, RdfNamespace.NIF.append("superString"), paragraphUri, sourceUrl, null)
- UriUtils.createURI(link.getUri) match{
- case Success(s) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", s.toString, sourceUrl, null) //TODO IRI's might throw exception in org.dbpedia.extraction.destinations.formatters please check this
- case Failure(f) =>
+ if (link.isCitation) {
+ words += nifLinks(word, RdfNamespace.RDF.append("type"), "http://dbpedia.org/ontology/Citation", sourceUrl, null)
+ citationMap.get(link.getCitationId) match {
+ case Some(url) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", url, sourceUrl, null)
+ case None =>
+ }
+ } else {
+ UriUtils.createURI(link.getUri) match{
+ case Success(s) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", s.toString, sourceUrl, null) //TODO IRI's might throw exception in org.dbpedia.extraction.destinations.formatters please check this
+ case Failure(f) =>
+ }
}
if(writeLinkAnchors)
words += nifLinks(word, RdfNamespace.NIF.append("anchorOf"), link.getLinkText, sourceUrl, RdfNamespace.XSD.append("string"))
@@ -346,6 +355,15 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
protected def getJsoupDoc(html: String): Document = {
val doc = Jsoup.parse(cleanHtml(html))
+ //extract citations
+ for(note <- doc.select("li[id^=cite_note-]").asScala){
+ val id = note.id()
+ val extLink = note.select("a.external.text").first()
+ if (extLink != null) {
+ citationMap.put(id, extLink.attr("href"))
+ }
+ }
+
//delete queries
for(query <- cssSelectorConfigMap.removeElements)
for(item <- doc.select(query).asScala)
diff --git a/core/src/main/scala/org/dbpedia/extraction/util/Language.scala b/core/src/main/scala/org/dbpedia/extraction/util/Language.scala
index 7f27e5774..8583602ff 100644
--- a/core/src/main/scala/org/dbpedia/extraction/util/Language.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/util/Language.scala
@@ -123,13 +123,35 @@ object Language extends (String => Language)
request.setHeader("User-Agent", customUserAgentText)
}
- val response = client.execute(request)
- val stream = response.getEntity.getContent
- val wikiLanguageCodes =
- try Source.fromInputStream(stream).getLines().toList
- finally{
- stream.close()
- client.close()
+ val wikiLanguageCodes =
+ try {
+ // Set a default User-Agent if none is provided
+ if (!customUserAgentEnabled) {
+ request.setHeader("User-Agent", "DBpedia-Extraction-Framework/1.0 (https://github.com/dbpedia/extraction-framework; dbpedia@infai.org)")
+ }
+
+ val response = client.execute(request)
+ val status = response.getStatusLine.getStatusCode
+ if (status >= 200 && status < 300) {
+ val stream = response.getEntity.getContent
+ try {
+ Source.fromInputStream(stream).getLines()
+ .map(_.trim)
+ .filter(line => line.nonEmpty && !line.contains(" ")) // Basic filter for language codes
+ .toList
+ } finally {
+ stream.close()
+ }
+ } else {
+ logger.log(Level.WARNING, "Language list fetch failed with status " + status + " from " + wikipediaLanguageUrl)
+ List.empty
+ }
+ } catch {
+ case e: Exception =>
+ logger.log(Level.WARNING, "Could not fetch language list from " + wikipediaLanguageUrl + ": " + e.getMessage, e) // pass e so the stack trace is logged, not only the message
+ List.empty // Fallback to empty list, addonlangs.json will still be used
+ } finally {
+ client.close()
}
val specialLangs: JsonConfig = new JsonConfig(this.getClass.getClassLoader.getResource("addonlangs.json"))