apache · krickert · Jun 16, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 19, 2026
diff --git a/LICENSE b/LICENSE
@@ -370,3 +370,51 @@ The following license applies to the SLF4J API:
     LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
     OF CONTRACT, TORT OR OTHERWISE,  ARISING FROM, OUT OF OR IN CONNECTION
     WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+The following license applies to the bundled Unicode data files in
+opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29
+(WordBreakProperty.txt, ExtendedPictographic.txt),
+opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer
+(confusables.txt), and
+opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29
+(WordBreakTest.txt):
+
+    UNICODE LICENSE V3
+
+    COPYRIGHT AND PERMISSION NOTICE
+
+    Copyright (c) 1991-2025 Unicode, Inc.
+
+    NOTICE TO USER: Carefully read the following legal agreement. BY
+    DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+    SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+    TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+    DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+
+    Permission is hereby granted, free of charge, to any person obtaining a
+    copy of data files and any associated documentation (the "Data Files") or
+    software and any associated documentation (the "Software") to deal in the
+    Data Files or Software without restriction, including without limitation
+    the rights to use, copy, modify, merge, publish, distribute, and/or sell
+    copies of the Data Files or Software, and to permit persons to whom the
+    Data Files or Software are furnished to do so, provided that either (a)
+    this copyright and permission notice appear with all copies of the Data
+    Files or Software, or (b) this copyright and permission notice appear in
+    associated Documentation.
+
+    THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+    KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+    THIRD PARTY RIGHTS.
+
+    IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+    BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+    OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+    WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+    ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+    FILES OR SOFTWARE.
+
+    Except as contained in this notice, the name of a copyright holder shall
+    not be used in advertising or otherwise to promote the sale, use or other
+    dealings in these Data Files or Software without prior written
+    authorization of the copyright holder.
diff --git a/NOTICE b/NOTICE
@@ -92,6 +92,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 
+============================================================================
+
+This product bundles data files from the Unicode Character Database (UCD)
+and the Unicode Security Mechanisms, version 17.0.0, published by Unicode,
+Inc. (https://www.unicode.org/Public/).
+
+  * opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/WordBreakProperty.txt
+    is the upstream WordBreakProperty-17.0.0.txt, unmodified except for the
+    file name.
+  * opennlp-core/opennlp-runtime/src/test/resources/opennlp/tools/tokenize/uax29/WordBreakTest.txt
+    is the upstream WordBreakTest-17.0.0.txt, unmodified except for the file
+    name.
+  * opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer/confusables.txt
+    is the upstream confusables.txt from the Unicode Security Mechanisms
+    (UTS #39), unmodified.
+  * opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/tokenize/uax29/ExtendedPictographic.txt
+    is derived from the upstream emoji-data.txt (Emoji Data for UTS #51,
+    version 17.0): it keeps only the lines that assign the
+    Extended_Pictographic property and is renamed accordingly. It is a
+    filtered subset; the upstream file additionally carries the Emoji,
+    Emoji_Presentation, Emoji_Modifier, Emoji_Modifier_Base, and
+    Emoji_Component properties, which are not retained.
+
+The original Unicode copyright and license header is preserved verbatim at the
+top of each bundled file. These files are distributed under the Unicode License
+V3, the full text of which is reproduced in the LICENSE file accompanying this
+distribution.
+
+Copyright (c) 1991-2025 Unicode, Inc. All rights reserved.
+
 ============================================================================
 List of third-party dependencies grouped by their license type.
 

diff --git a/opennlp-api/pom.xml b/opennlp-api/pom.xml
@@ -49,6 +49,12 @@
       <artifactId>junit-jupiter-engine</artifactId>
       <scope>test</scope>
     </dependency>
+
+    <dependency>
+      <groupId>org.junit.jupiter</groupId>
+      <artifactId>junit-jupiter-params</artifactId>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
 </project>
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/NormalizedText.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+/**
+ * The result of a normalization that keeps the original text alongside the normalized form.
+ *
+ * <p>The original is the source of truth (display, offsets, language-specific analysis); the
+ * normalized form is a derived view tuned for matching and search. The {@link OffsetMap} ties the
+ * two together so a position in the normalized text can be reported against the original.</p>
+ *
+ * <p>The record stores its components exactly as given: it neither validates nor copies them.
+ * Both offset methods delegate directly to {@link #offsets()}, so the valid offset ranges are
+ * the ones the supplied {@link OffsetMap} was built with, and a {@code null} map makes both
+ * methods fail with a {@link NullPointerException}.</p>
+ *
+ * @param original The untouched source text.
+ * @param normalized The normalized text.
+ * @param offsets The mapping between normalized and original character offsets.
+ */
+public record NormalizedText(CharSequence original, String normalized, OffsetMap offsets) {
+
+  /**
+   * Maps a normalized character offset back to the original text.
+   *
+   * <p>Delegates to {@link OffsetMap#toOriginalOffset(int)}.</p>
+   *
+   * @param normalizedOffset An offset in {@code [0, normalized().length()]}.
+   * @return The corresponding original character offset.
+   * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is outside the range
+   *     accepted by the underlying {@link OffsetMap}.
+   */
+  public int toOriginalOffset(int normalizedOffset) {
+    return offsets.toOriginalOffset(normalizedOffset);
+  }
+
+  /**
+   * Maps an original character offset forward to the normalized text.
+   *
+   * <p>Delegates to {@link OffsetMap#toNormalizedOffset(int)}.</p>
+   *
+   * @param originalOffset An offset in {@code [0, original().length()]}.
+   * @return The corresponding normalized character offset.
+   * @throws IndexOutOfBoundsException Thrown if {@code originalOffset} is outside the range
+   *     accepted by the underlying {@link OffsetMap}.
+   */
+  public int toNormalizedOffset(int originalOffset) {
+    return offsets.toNormalizedOffset(originalOffset);
+  }
+}
diff --git a/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java b/opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.Arrays;
+
+/**
+ * A mapping between character offsets in a normalized string and the original text it came from.
+ *
+ * <p>Normalization that collapses runs or substitutes supplementary characters changes string
+ * length, so an offset into the normalized form no longer lines up with the original. This map
+ * records, for every normalized character, the original character offset it was produced from,
+ * which lets a match found in the normalized form be reported in original coordinates.</p>
+ *
+ * <p>{@link #toOriginalOffset(int)} is a direct array read (O(1)) and
+ * {@link #toNormalizedOffset(int)} is a binary search (O(log n)). The binary search assumes the
+ * recorded offsets are non-decreasing; {@link Builder#map(int)} does not verify this, so the
+ * code that builds the map is responsible for recording offsets in order. The map is built in
+ * the same single cursor pass that produces the normalized text, via {@link Builder}.</p>
+ *
+ * <p>Instances can only be created through {@link Builder#build(int)}, which hands the map its
+ * own copy of the recorded offsets; the map exposes no way to modify them afterwards.</p>
+ */
+public final class OffsetMap {
+
+  // normalizedToOriginal[k] is the original char offset that produced normalized char k.
+  // It has one extra trailing slot mapping the end of the normalized text to the end of the
+  // original text, so offsets in [0, normalizedLength] are all valid.
+  private final int[] normalizedToOriginal;
+  // Length of the original text; also the value stored in the trailing sentinel slot.
+  private final int originalLength;
+
+  // Private: instances are created by Builder#build(int), which supplies an array whose last
+  // slot already holds originalLength.
+  private OffsetMap(int[] normalizedToOriginal, int originalLength) {
+    this.normalizedToOriginal = normalizedToOriginal;
+    this.originalLength = originalLength;
+  }
+
+  /**
+   * Maps a normalized character offset back to the original text.
+   *
+   * <p>The end offset {@code normalizedLength} is valid and maps to the original length.</p>
+   *
+   * @param normalizedOffset An offset in {@code [0, normalizedLength]}.
+   * @return The corresponding original character offset.
+   * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is out of range.
+   */
+  public int toOriginalOffset(int normalizedOffset) {
+    // The array has normalizedLength + 1 slots (the sentinel), hence the >= length comparison.
+    if (normalizedOffset < 0 || normalizedOffset >= normalizedToOriginal.length) {
+      throw new IndexOutOfBoundsException("normalized offset " + normalizedOffset
+          + " is outside [0, " + normalizedLength() + "]");
+    }
+    return normalizedToOriginal[normalizedOffset];
+  }
+
+  /**
+   * Maps an original character offset forward to the normalized text.
+   *
+   * <p>Returns the first normalized offset whose recorded source offset is at or after
+   * {@code originalOffset}. An original offset for which no normalized character was recorded
+   * (for example one inside a collapsed run) therefore maps to the next normalized character
+   * whose source lies at or after it, or to {@link #normalizedLength()} if there is none.</p>
+   *
+   * <p>NOTE(review): whether every character of a collapsed run lands on the single collapsed
+   * normalized character depends on which offset of the run the builder's caller recorded
+   * (only offsets at or before the recorded one map to it) - confirm against the callers.</p>
+   *
+   * @param originalOffset An offset in {@code [0, originalLength]}.
+   * @return The corresponding normalized character offset.
+   * @throws IndexOutOfBoundsException Thrown if {@code originalOffset} is out of range.
+   */
+  public int toNormalizedOffset(int originalOffset) {
+    if (originalOffset < 0 || originalOffset > originalLength) {
+      throw new IndexOutOfBoundsException("original offset " + originalOffset
+          + " is outside [0, " + originalLength + "]");
+    }
+    // Lower-bound binary search: find the smallest index whose recorded source offset is
+    // >= originalOffset. The trailing sentinel slot holds originalLength, which is
+    // >= originalOffset after the range check above, so a match always exists and the
+    // initial value of answer is only a safe default.
+    int low = 0;
+    int high = normalizedToOriginal.length - 1;
+    int answer = normalizedToOriginal.length - 1;
+    while (low <= high) {
+      // Unsigned shift keeps the midpoint correct even if low + high overflows int.
+      final int mid = (low + high) >>> 1;
+      if (normalizedToOriginal[mid] >= originalOffset) {
+        // Candidate found; keep looking to the left for an earlier one.
+        answer = mid;
+        high = mid - 1;
+      } else {
+        low = mid + 1;
+      }
+    }
+    return answer;
+  }
+
+  /** {@return the length of the normalized text this map was built for} */
+  public int normalizedLength() {
+    // One slot of the array is the trailing sentinel, not a normalized character.
+    return normalizedToOriginal.length - 1;
+  }
+
+  /** {@return the length of the original text this map was built for} */
+  public int originalLength() {
+    return originalLength;
+  }
+
+  /**
+   * Builds an {@link OffsetMap} incrementally during a normalization pass. Call {@link #map(int)}
+   * once for each character appended to the normalized output, then {@link #build(int)} once.
+   *
+   * <p>The builder performs no validation of the recorded offsets; the lookups of the resulting
+   * map assume they were recorded in non-decreasing order.</p>
+   */
+  public static final class Builder {
+
+    // Recorded original offsets; only the first 'length' entries are meaningful.
+    private int[] buffer = new int[16];
+    // Number of offsets recorded so far, i.e. the normalized length seen so far.
+    private int length;
+
+    /**
+     * Records the original character offset that produced the next normalized character.
+     *
+     * @param originalOffset The source offset in the original text.
+     */
+    public void map(int originalOffset) {
+      // Grow by doubling once the backing array is full.
+      if (length == buffer.length) {
+        buffer = Arrays.copyOf(buffer, buffer.length * 2);
+      }
+      buffer[length++] = originalOffset;
+    }
+
+    /**
+     * Finalizes the map.
+     *
+     * <p>The recorded offsets are copied into a new array, so the returned map does not share
+     * storage with this builder.</p>
+     *
+     * @param originalLength The length of the original text (used as the trailing sentinel).
+     * @return The immutable {@link OffsetMap}.
+     */
+    public OffsetMap build(int originalLength) {
+      // Copy the recorded entries plus one extra slot, then store the sentinel in that slot.
+      final int[] mapping = Arrays.copyOf(buffer, length + 1);
+      mapping[length] = originalLength;
+      return new OffsetMap(mapping, originalLength);
+    }
+  }
+}
diff --git a/opennlp-core/opennlp-ml/opennlp-dl/README.md b/opennlp-core/opennlp-ml/opennlp-dl/README.md
@@ -22,6 +22,31 @@ Named entity models are commonly cased, so lower casing is disabled by default.
 Set `InferenceOptions#setLowerCase(true)` only for models trained with uncased
 input.
 
+### Unicode text handling
+
+Long input is split into overlapping chunks on the full Unicode `White_Space`
+set (not Java's `\s`), so no-break space, ideographic space, and the other UCD
+whitespace characters are recognized as delimiters. `NameFinderDL` locates
+reconstructed entity text in the original input with a cursor-based matcher that
+treats span spaces as flexible Unicode whitespace and compares other code points
+case-insensitively, so `Span#getCoveredText(...)` works on text from PDFs, the
+web, and multilingual sources.
+
+Optional input folding is off by default and controlled through
+`InferenceOptions`:
+
+```java
+InferenceOptions options = new InferenceOptions();
+options.setNormalizeWhitespace(true);  // each Unicode whitespace -> ASCII space (offset-preserving)
+options.setNormalizeDashes(true);      // Unicode dashes -> hyphen-minus (offset-preserving for BMP)
+NameFinderDL finder = new NameFinderDL(model, vocab, ids2Labels, options, sentenceDetector);
+```
+
+The same options apply to `DocumentCategorizerDL`. The underlying
+`CharClass` / `CodePointSet` engine and the broader normalization pipeline live
+in `opennlp.tools.util.normalizer` and are documented in the OpenNLP manual
+chapter *Text Normalization*.
+
 Export a Hugging Face NER model to ONNX, e.g.:
 
 ```bash
@@ -30,6 +55,9 @@ python -m transformers.onnx --model=dslim/bert-base-NER --feature token-classifi
 
 ## DocumentCategorizerDL
 
+Uses the same Unicode whitespace chunking and optional `InferenceOptions`
+normalization as `NameFinderDL` (see above).
+
 Export a Huggingface classification (e.g. sentiment) model to ONNX, e.g.:
 
 ```bash

diff --git a/opennlp-core/opennlp-ml/opennlp-dl/pom.xml b/opennlp-core/opennlp-ml/opennlp-dl/pom.xml
@@ -37,6 +37,11 @@
       <groupId>org.apache.opennlp</groupId>
       <artifactId>opennlp-api</artifactId>
     </dependency>
+    <!-- Provides the Unicode normalization engine (CharClass) used for input chunking. -->
+    <dependency>
+      <groupId>org.apache.opennlp</groupId>
+      <artifactId>opennlp-runtime</artifactId>
+    </dependency>
 
     <!-- External dependencies -->
     <dependency>
@@ -45,13 +50,6 @@
       <version>${onnxruntime.version}</version>
     </dependency>
 
-    <!-- TEST scope -->
-    <dependency>
-      <groupId>org.apache.opennlp</groupId>
-      <artifactId>opennlp-runtime</artifactId>
-      <scope>test</scope>
-    </dependency>
-
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>