diff --git a/.github/workflows/snapshot_deploy.yml b/.github/workflows/snapshot_deploy.yml index f75c29486..e61a1fd7d 100644 --- a/.github/workflows/snapshot_deploy.yml +++ b/.github/workflows/snapshot_deploy.yml @@ -77,4 +77,5 @@ jobs: run: | echo "Deploying to https://maven.pkg.github.com/${REPO} with revision ${REVISION}" mvn deploy -DskipTests \ - -Dgithub.repo.url="https://maven.pkg.github.com/${REPO}" + -Dgithub.repo.url="https://maven.pkg.github.com/${REPO}" \ + -DskipNexusStagingDeployMojo=true diff --git a/core/src/main/java/org/dbpedia/extraction/nif/Link.java b/core/src/main/java/org/dbpedia/extraction/nif/Link.java index d6b6ffb80..ddfadf582 100644 --- a/core/src/main/java/org/dbpedia/extraction/nif/Link.java +++ b/core/src/main/java/org/dbpedia/extraction/nif/Link.java @@ -11,11 +11,29 @@ public class Link implements Comparable { private boolean topicLink = false; private boolean topicPartLink = false; private boolean surfaceFormLink = false; - + private boolean citation = false; + private String citationId = ""; + public Link() { - + + } + + public boolean isCitation() { + return citation; + } + + public void setCitation(boolean citation) { + this.citation = citation; } - + + public String getCitationId() { + return citationId; + } + + public void setCitationId(String citationId) { + this.citationId = citationId; + } + public boolean isSurfaceFormLink() { return surfaceFormLink; } @@ -91,12 +109,12 @@ public void setExternal(boolean external) { @Override public int compareTo(Link link) { // TODO Auto-generated method stub - if(this.wordStart==link.getWordStart()) + if (this.wordStart == link.getWordStart()) return 0; - else if(this.wordStart paragraphs = null; private Paragraph paragraph = null; - private Link tempLink; + private Link tempLink; private boolean inSup = false; private boolean invisible = false; - private NifExtractorContext context; + private NifExtractorContext context; private ArrayList errors = new ArrayList<>(); - + public LinkExtractor(NifExtractorContext context) { - paragraphs = new ArrayList(); + paragraphs = new ArrayList(); this.context = context; } - + /** * Gets called when entering an element - * -handle text cleanup and remove Wikipedia specific stuff like reference numbers - * -if we encounter a link, we make a new nif:Word - * -we get the text out of a whitelist of elements. - * If we encounter a non-whitelisted element, we set this.skipLevel to the current depth - * of the dom tree and skip everything until we are back to that depth - * -this thing badly needs refactoring + * - handle text cleanup and remove Wikipedia specific stuff like reference + * numbers + * - if we encounter a link, we make a new nif:Word + * - we get the text out of a whitelist of elements. + * If we encounter a non-whitelisted element, we set this.skipLevel to the + * current depth of the dom tree and skip everything until we are back to + * that depth + * - this thing badly needs refactoring */ - + public void head(Node node, int depth) { - if(skipLevel>=0){ + if (skipLevel >= 0) { return; } - if(paragraph == null) { + if (paragraph == null) { paragraph = new Paragraph(0, "", "p"); } - //ignore all content inside invisible tags - if(invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) { + // ignore all content inside invisible tags + if (invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) { invisible = true; return; } - if(node.nodeName().equals("#text")) { - String tempText = node.toString(); - - //replace no-break spaces because unescape doesn't deal with them - tempText = StringEscapeUtils.unescapeHtml4(tempText); - tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars()); - tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", ""); - - //this text node is the content of an element: make a new nif:Word - if(inLink) { - if(!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) //not! - { - tempLink.setLinkText(tempText); - tempLink.setWordStart(paragraph.getLength() + (Paragraph.FollowedByWhiteSpace(paragraph.getText()) ? 1 : 0)); - paragraph.addText(tempText); - tempLink.setWordEnd(paragraph.getLength()); - } - else{ // -> filter out hidden links to the underlying template - errors.add("found Template in resource: " + this.context.resource + ": " + tempText); - return; - } - } - else - paragraph.addText(tempText); + if (node.nodeName().equals("#text")) { + String tempText = node.toString(); + + // replace no-break spaces because unescape doesn't deal with them + tempText = StringEscapeUtils.unescapeHtml4(tempText); + tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars()); + tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", ""); + + // this text node is the content of an element: make a new nif:Word + if (inLink) { + if (!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) // not! + { + tempLink.setLinkText(tempText); + tempLink.setWordStart( + paragraph.getLength() + (Paragraph.FollowedByWhiteSpace(paragraph.getText()) ? 1 : 0)); + paragraph.addText(tempText); + tempLink.setWordEnd(paragraph.getLength()); + } else { // -> filter out hidden links to the underlying template + errors.add("found Template in resource: " + this.context.resource + ": " + tempText); + return; + } + } else + paragraph.addText(tempText); } - else if(node.nodeName().equals("a")) { + else if (node.nodeName().equals("a")) { - String link = node.attr("href"); - //TODO central string management + String link = node.attr("href"); + // TODO central string management /** - * remove internal links linking to mediawiki meta pages. Also removes links that contain ":". + * remove internal links linking to mediawiki meta pages. Also removes links + * that contain ":". * Wikipedia api standard link looks like (allowed): * philosopher - * see Schopenhauer: https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400 + * see Schopenhauer: + * https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400 */ - String linkPrefix = "/wiki/"; + String linkPrefix = "/wiki/"; // SPECIAL CASE FOR RESTAPI PARSING https://en.wikipedia.org/api/rest_v1/ - if(node.hasAttr("rel")) { + if (node.hasAttr("rel")) { String relType = node.attr("rel"); - if(relType.equals("mw:WikiLink")){ + if (relType.equals("mw:WikiLink")) { tempLink = new Link(); String uri = cleanLink(node.attr("href"), false); setUri(uri); @@ -109,13 +112,20 @@ else if(node.nodeName().equals("a")) { String uri = cleanLink(node.attr("href"), false); setUri(uri); - //simple example of Help:IPA - // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ] + // simple example of Help:IPA + // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ] } else if (link.contains(linkPrefix) && link.contains(":")) { /** * TODO buggy * Cleans up child nodes: difficult example - * /ˈʃpənh.ər/ + * /ˈʃpənh.ər/ */ if (!node.childNodes().isEmpty()) { if (node.childNode(0).nodeName().equals("#text") && @@ -128,93 +138,98 @@ else if(node.nodeName().equals("a")) { } else { skipLevel = depth; } - //TODO add example + // TODO add example } else if (node.attr("class").equals("external text")) { - //don't skip external links + // don't skip external links tempLink = new Link(); String uri = cleanLink(node.attr("href"), true); setUri(uri); + } else if (link.startsWith("#cite_note-")) { + tempLink = new Link(); + tempLink.setCitation(true); + tempLink.setCitationId(link.substring(1)); + inLink = true; } else { skipLevel = depth; } } - } else if(node.nodeName().equals("p")) { - if(paragraph != null) { - addParagraph("p"); - } - else - paragraph = new Paragraph(0, "", "p"); - } else if(node.nodeName().equals("sup")) { + } else if (node.nodeName().equals("p")) { + if (paragraph != null) { + addParagraph("p"); + } else + paragraph = new Paragraph(0, "", "p"); + } else if (node.nodeName().equals("sup")) { inSup = true; - } else if(node.nodeName().matches("h\\d")) { - addParagraph(node.nodeName()); - } else if(node.nodeName().equals("table")) { - addParagraph("table"); - paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "table", node.attr("class"), node.attr("id")); - addParagraph("p"); - skipLevel = depth; - } else if(node.nodeName().equals("span")) { - //denote notes - - if(node.attr("class").contains("notebegin")) - addParagraph("note"); - - } else if(node.nodeName().equals("math")) { - addParagraph("math"); - paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "math", "tex", null); - addParagraph("p"); - skipLevel = depth; - } + } else if (node.nodeName().matches("h\\d")) { + addParagraph(node.nodeName()); + } else if (node.nodeName().equals("table")) { + addParagraph("table"); + paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "table", node.attr("class"), + node.attr("id")); + addParagraph("p"); + skipLevel = depth; + } else if (node.nodeName().equals("span")) { + // denote notes + + if (node.attr("class").contains("notebegin")) + addParagraph("note"); + + } else if (node.nodeName().equals("math")) { + addParagraph("math"); + paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "math", "tex", null); + addParagraph("p"); + skipLevel = depth; + } } private void setUri(String uri) { - if(uri!=null) { - tempLink.setUri(uri); - tempLink.setExternal(true); - inLink = true; - } else { - tempLink = new Link(); - } + if (uri != null) { + tempLink.setUri(uri); + tempLink.setExternal(true); + inLink = true; + } else { + tempLink = new Link(); + } } - + private String cleanLink(String uri, boolean external) { - if(!external) { + if (!external) { String linkPrefix = "/wiki/"; - String linkPrefix2= "./"; - if(uri.contains(linkPrefix)){ - uri=uri.substring(uri.indexOf("?title=")+7); + String linkPrefix2 = "./"; + if (uri.contains(linkPrefix)) { + uri = uri.substring(uri.indexOf("?title=") + 7); } else if (uri.contains(linkPrefix2)) { - uri=uri.substring(uri.indexOf("?title=")+3); + uri = uri.substring(uri.indexOf("?title=") + 3); } - //TODO central string management - if(!this.context.language.equals("en")) { - uri="http://"+this.context.language+".dbpedia.org/resource/"+uri; + // TODO central string management + if (!this.context.language.equals("en")) { + uri = "http://" + this.context.language + ".dbpedia.org/resource/" + uri; } else { - uri="http://dbpedia.org/resource/"+uri; + uri = "http://dbpedia.org/resource/" + uri; } uri = uri.replace("&action=edit&redlink=1", ""); - + } else { - //there are links that contain illegal hostnames + // there are links that contain illegal hostnames try { - if(uri.startsWith("//")) - uri = "http:"+uri; - uri = URLEncoder.encode(uri,"UTF-8"); + if (uri.startsWith("//")) + uri = "http:" + uri; + uri = URLEncoder.encode(uri, "UTF-8"); uri = uri.replace("%3A", ":").replace("%2F", "/").replace("%2E", "."); - - } catch(UnsupportedEncodingException e) { - //this doesn't happen + + } catch (UnsupportedEncodingException e) { + // this doesn't happen e.printStackTrace(); } } return UriUtils.uriToDbpediaIri(uri).toString(); } - + public void tail(Node node, int depth) { - if(skipLevel>0) { - if(skipLevel==depth) { + if (skipLevel > 0) { + if (skipLevel == depth) { skipLevel = -1; return; } else { @@ -222,74 +237,68 @@ public void tail(Node node, int depth) { } } - if(node.nodeName().equals("a") && inLink) { + if (node.nodeName().equals("a") && inLink) { inLink = false; paragraph.addLink(tempLink); tempLink = new Link(); - } - else if(invisible && node.attr("style").matches(".*display\\s*:\\s*none.*")) { - invisible = false; - } - else if(node.nodeName().equals("p") && paragraph != null) { - addParagraph("p"); - } - else if(node.nodeName().equals("sup") && inSup) { + } else if (invisible && node.attr("style").matches(".*display\\s*:\\s*none.*")) { + invisible = false; + } else if (node.nodeName().equals("p") && paragraph != null) { + addParagraph("p"); + } else if (node.nodeName().equals("sup") && inSup) { inSup = false; + } else if (node.nodeName().matches("h\\d")) { + addParagraph("p"); + } else if (node.nodeName().equals("span")) { + if (node.attr("class").contains("noteend")) + addParagraph("p"); } - else if(node.nodeName().matches("h\\d")) { - addParagraph("p"); - } - else if(node.nodeName().equals("span")) { - if(node.attr("class").contains("noteend")) - addParagraph("p"); - } } - + public List getParagraphs() { - if(paragraph != null && paragraph.getLength() > 0) - { - paragraphs.add(paragraph); - paragraph = null; - } - return paragraphs; + if (paragraph != null && paragraph.getLength() > 0) { + paragraphs.add(paragraph); + paragraph = null; + } + return paragraphs; } - private void addParagraph(String newTag){ - if(paragraph.getLength() != 0 || paragraph.getHtmlStrings().size() > 0) - paragraphs.add(paragraph); + private void addParagraph(String newTag) { + if (paragraph.getLength() != 0 || paragraph.getHtmlStrings().size() > 0) + paragraphs.add(paragraph); - paragraph = new Paragraph(0, "", (newTag == null ? "p" : newTag)); - } + paragraph = new Paragraph(0, "", (newTag == null ? "p" : newTag)); + } - public int getTableCount(){ - int count =0; - for(Paragraph p : this.getParagraphs()){ + public int getTableCount() { + int count = 0; + for (Paragraph p : this.getParagraphs()) { count += paragraph.getHtmlStrings().size(); } return count; } - public ArrayList getErrors(){ + public ArrayList getErrors() { return errors; } - private String[] replaceChars() { - String[] rep = new String[256]; - rep['\n'] = ""; - rep['\u00A0'] = " "; - return rep; - } - - public static class NifExtractorContext { - private String language; - private String resource; - private String wikipediaTemplateString; - - public NifExtractorContext(String language, String resource, String templateString){ - this.language = language; - this.resource = resource; - this.wikipediaTemplateString = templateString; - } - } + private String[] replaceChars() { + String[] rep = new String[256]; + rep['\n'] = ""; + rep['\u00A0'] = " "; + return rep; + } + + public static class NifExtractorContext { + private String language; + private String resource; + private String wikipediaTemplateString; + + public NifExtractorContext(String language, String resource, String templateString) { + this.language = language; + this.resource = resource; + this.wikipediaTemplateString = templateString; + } + } } diff --git a/core/src/main/resources/addonlangs.json b/core/src/main/resources/addonlangs.json index 0538dd78a..8b45753b8 100644 --- a/core/src/main/resources/addonlangs.json +++ b/core/src/main/resources/addonlangs.json @@ -79,5 +79,11 @@ "name": "Cantonese", "isoCode": "yue", "iso639_3": "yue" + }, + "en": { + "wikiCode": "en", + "name": "English", + "isoCode": "en", + "iso639_3": "eng" } } \ No newline at end of file diff --git a/core/src/main/resources/nifextractionconfig.json b/core/src/main/resources/nifextractionconfig.json index c4548f431..d66d3ceb5 100644 --- a/core/src/main/resources/nifextractionconfig.json +++ b/core/src/main/resources/nifextractionconfig.json @@ -14,7 +14,6 @@ "nif-remove-elements":[ ".noprint", ".haudio", - "sup.reference", "span.mw-editsection", ".error", "#coordinates", diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala index e9beac491..326924411 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala @@ -22,7 +22,7 @@ extends PageNodeExtractor { private val language = context.language - private val replaceString = DisambiguationExtractorConfig.disambiguationTitlePartMap(language.wikiCode) + private val replaceString = DisambiguationExtractorConfig.disambiguationTitlePartMap.getOrElse(language.wikiCode, " (disambiguation)") val wikiPageDisambiguatesProperty = context.ontology.properties("wikiPageDisambiguates") diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala index 130d2c7e9..d2d8f47c5 100755 --- a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala @@ -39,6 +39,7 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara protected val templateString = "Template" private val sectionMap = new mutable.HashMap[PageSection, ExtractedSection]() + private val citationMap = new mutable.HashMap[String, String]() /** * Extract the relevant html page divided in sections and paragraphs @@ -285,9 +286,17 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara words += nifLinks(word, RdfNamespace.NIF.append("beginIndex"), (offset + link.getWordStart).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) words += nifLinks(word, RdfNamespace.NIF.append("endIndex"), (offset + link.getWordEnd).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) words += nifLinks(word, RdfNamespace.NIF.append("superString"), paragraphUri, sourceUrl, null) - UriUtils.createURI(link.getUri) match{ - case Success(s) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", s.toString, sourceUrl, null) //TODO IRI's might throw exception in org.dbpedia.extraction.destinations.formatters please check this - case Failure(f) => + if (link.isCitation) { + words += nifLinks(word, RdfNamespace.RDF.append("type"), "http://dbpedia.org/ontology/Citation", sourceUrl, null) + citationMap.get(link.getCitationId) match { + case Some(url) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", url, sourceUrl, null) + case None => + } + } else { + UriUtils.createURI(link.getUri) match{ + case Success(s) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", s.toString, sourceUrl, null) //TODO IRI's might throw exception in org.dbpedia.extraction.destinations.formatters please check this + case Failure(f) => + } } if(writeLinkAnchors) words += nifLinks(word, RdfNamespace.NIF.append("anchorOf"), link.getLinkText, sourceUrl, RdfNamespace.XSD.append("string")) @@ -346,6 +355,15 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara protected def getJsoupDoc(html: String): Document = { val doc = Jsoup.parse(cleanHtml(html)) + //extract citations + for(note <- doc.select("li[id^=cite_note-]").asScala){ + val id = note.id() + val extLink = note.select("a.external.text").first() + if (extLink != null) { + citationMap.put(id, extLink.attr("href")) + } + } + //delete queries for(query <- cssSelectorConfigMap.removeElements) for(item <- doc.select(query).asScala) diff --git a/core/src/main/scala/org/dbpedia/extraction/util/Language.scala b/core/src/main/scala/org/dbpedia/extraction/util/Language.scala index 7f27e5774..8583602ff 100644 --- a/core/src/main/scala/org/dbpedia/extraction/util/Language.scala +++ b/core/src/main/scala/org/dbpedia/extraction/util/Language.scala @@ -123,13 +123,35 @@ object Language extends (String => Language) request.setHeader("User-Agent", customUserAgentText) } - val response = client.execute(request) - val stream = response.getEntity.getContent - val wikiLanguageCodes = - try Source.fromInputStream(stream).getLines().toList - finally{ - stream.close() - client.close() + val wikiLanguageCodes = + try { + // Set a default User-Agent if none is provided + if (!customUserAgentEnabled) { + request.setHeader("User-Agent", "DBpedia-Extraction-Framework/1.0 (https://github.com/dbpedia/extraction-framework; dbpedia@infai.org)") + } + + val response = client.execute(request) + val status = response.getStatusLine.getStatusCode + if (status >= 200 && status < 300) { + val stream = response.getEntity.getContent + try { + Source.fromInputStream(stream).getLines() + .map(_.trim) + .filter(line => line.nonEmpty && !line.contains(" ")) // Basic filter for language codes + .toList + } finally { + stream.close() + } + } else { + logger.log(Level.WARNING, "Language list fetch failed with status " + status + " from " + wikipediaLanguageUrl) + List.empty + } + } catch { + case e: Exception => + logger.log(Level.WARNING, "Could not fetch language list from " + wikipediaLanguageUrl + ": " + e.getMessage) + List.empty // Fallback to empty list, addonlangs.json will still be used + } finally { + client.close() } val specialLangs: JsonConfig = new JsonConfig(this.getClass.getClassLoader.getResource("addonlangs.json"))