apache · krickert · Jun 21, 2026 · Jun 21, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/opennlp-api/src/main/java/opennlp/tools/namefind/OffsetMappingNameFinder.java b/opennlp-api/src/main/java/opennlp/tools/namefind/OffsetMappingNameFinder.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.namefind;
+
+import opennlp.tools.util.Span;
+
+/**
+ * A {@link TokenNameFinder} that can additionally report detected spans in the character coordinates
+ * of the original input, mapping back through any text normalization applied before detection.
+ *
+ * <p>An implementation that normalizes input before detection (for example an ONNX model that folds
+ * Unicode whitespace or dashes) returns spans from {@link #find(String[])} in the coordinates of the
+ * normalized text, which no longer line up with the caller's input when a fold changes the length.
+ * {@link #findInOriginal(String[])} maps those spans back to original-input coordinates. This is a
+ * separate capability interface rather than a method on {@link TokenNameFinder} because the classic
+ * contract reports token-index spans, for which an original-character mapping is not meaningful; an
+ * interface-typed caller tests for the capability ({@code finder instanceof OffsetMappingNameFinder})
+ * instead of depending on a concrete implementation.</p>
+ */
+public interface OffsetMappingNameFinder extends TokenNameFinder {
+
+  /**
+   * Finds names and returns their {@link Span spans} in the character coordinates of the original
+   * input, regardless of any normalization applied before detection.
+   *
+   * @param tokens The tokens to search, as they would be passed to {@link #find(String[])}.
+   * @return The detected spans, in original-input character coordinates.
+   */
+  Span[] findInOriginal(String[] tokens);
+}
diff --git a/opennlp-core/opennlp-ml/opennlp-dl/README.md b/opennlp-core/opennlp-ml/opennlp-dl/README.md
@@ -22,6 +22,37 @@ Named entity models are commonly cased, so lower casing is disabled by default.
 Set `InferenceOptions#setLowerCase(true)` only for models trained with uncased
 input.
 
+### Unicode text handling
+
+Long input is tokenized on the full Unicode `White_Space` set (not Java's `\s`)
+and grouped into overlapping chunks, so no-break space, ideographic space, and
+the other UCD whitespace characters are recognized as delimiters. `NameFinderDL`
+locates reconstructed entity text in the original input with a cursor-based
+matcher that treats spaces in that text as flexible Unicode whitespace and
+compares other code points case-insensitively, so `Span#getCoveredText(...)`
+works on text from PDFs, the web, and multilingual sources.
+
+Optional input folding is off by default and controlled through
+`InferenceOptions`:
+
+```java
+InferenceOptions options = new InferenceOptions();
+options.setNormalizeWhitespace(true);  // each Unicode whitespace -> ASCII space (offset-preserving)
+options.setNormalizeDashes(true);      // Unicode dashes -> hyphen-minus (offset note below)
+NameFinderDL finder = new NameFinderDL(model, vocab, ids2Labels, options, sentenceDetector);
+```
+
+Whitespace folding is length-preserving, so it never moves offsets. Dash folding can shrink a
+non-BMP dash by one UTF-16 unit, but `NameFinderDL.findInOriginal` maps decoded spans back through
+the normalization `Alignment`, so reported spans stay correct in the original input even for
+non-BMP dashes. (`NameFinderDL.find` returns normalized-text offsets, which differ from the
+original only in that non-BMP-dash case.)
+
+The same options apply to `DocumentCategorizerDL`. The underlying
+`CharClass` / `CodePointSet` engine and the broader normalization pipeline live
+in `opennlp.tools.util.normalizer` and are documented in the OpenNLP manual
+chapter *Text Normalization*.
+
 Export a Hugging Face NER model to ONNX, e.g.:
 
 ```bash
@@ -30,12 +61,31 @@ python -m transformers.onnx --model=dslim/bert-base-NER --feature token-classifi
 
 ## DocumentCategorizerDL
 
+Uses the same Unicode whitespace chunking and optional `InferenceOptions`
+normalization as `NameFinderDL` (see above).
+
 Export a Huggingface classification (e.g. sentiment) model to ONNX, e.g.:
 
 ```bash
 python -m transformers.onnx --model=nlptown/bert-base-multilingual-uncased-sentiment --feature sequence-classification exported
 ```
 
+## Behavior changes in this release
+
+Integrators upgrading from an earlier `opennlp-dl` should note these intentional changes (OPENNLP-1850):
+
+- `NameFinderDL.find(...)` reports spans in the coordinates of the joined input it ran inference on,
+  which differ from the original text only when length-changing dash folding is enabled. Use the new
+  `NameFinderDL.findInOriginal(...)` (from `OffsetMappingNameFinder`) for original-text coordinates.
+- Spans that overlap at chunk boundaries are now merged longest-wins; `find(...)` previously returned
+  every decoded span, overlaps included.
+- Chunking splits on the Unicode `White_Space` set rather than `String#split("\\s+")`, and
+  whitespace-only input now yields no spans without running the model.
+- `DocumentCategorizerDL.categorize(...)` now throws `IllegalArgumentException` for `null` or empty
+  input, and for a document with no non-whitespace token, rather than running the model on empty input.
+- The example label constants `NameFinderDL.I_PER` and `NameFinderDL.B_PER` were removed; supply your
+  own label strings (any `B-<TYPE>`/`I-<TYPE>` pair works, as described above).
+
 ## SentenceVectors
 
 Convert a sentence vectors model to ONNX, e.g.:

diff --git a/opennlp-core/opennlp-ml/opennlp-dl/pom.xml b/opennlp-core/opennlp-ml/opennlp-dl/pom.xml
@@ -37,6 +37,14 @@
       <groupId>org.apache.opennlp</groupId>
       <artifactId>opennlp-api</artifactId>
     </dependency>
+    <!-- Test-only: some DL tests construct concrete runtime components (e.g. a sentence detector).
+         The Unicode normalization engine (CharClass) now lives in opennlp-api, so the main code
+         compiles against opennlp-api plus onnxruntime only. -->
+    <dependency>
+      <groupId>org.apache.opennlp</groupId>
+      <artifactId>opennlp-runtime</artifactId>
+      <scope>test</scope>
+    </dependency>
 
     <!-- External dependencies -->
     <dependency>
@@ -45,13 +53,6 @@
       <version>${onnxruntime.version}</version>
     </dependency>
 
-    <!-- TEST scope -->
-    <dependency>
-      <groupId>org.apache.opennlp</groupId>
-      <artifactId>opennlp-runtime</artifactId>
-      <scope>test</scope>
-    </dependency>
-
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-api</artifactId>

diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/AbstractDL.java
@@ -40,6 +40,10 @@
 import opennlp.tools.tokenize.BertTokenizer;
 import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.tokenize.WordpieceTokenizer;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.normalizer.AlignedText;
+import opennlp.tools.util.normalizer.Alignment;
+import opennlp.tools.util.normalizer.CharClass;
 
 /**
  * Base class for OpenNLP deep-learning classes using ONNX Runtime.
@@ -60,6 +64,17 @@ public abstract class AbstractDL implements AutoCloseable {
   protected record ChunkRange(int start, int end) {
   }
 
+  /**
+   * A rejoined chunk with its half-open character span in the source text, so decoded entities can
+   * be located in the region it covers; {@code text.length()} need not equal {@code end - start}.
+   *
+   * @param text  The chunk text, the chunk's whitespace tokens rejoined with single ASCII spaces.
+   * @param start The inclusive character offset of the chunk in the source text.
+   * @param end   The exclusive character offset of the chunk in the source text.
+   */
+  protected record TextChunk(String text, int start, int end) {
+  }
+
   private static final Pattern JSON_ENTRY_PATTERN =
       Pattern.compile("\"((?:[^\"\\\\]|\\\\.)*)\"\\s*:\\s*(\\d+)");
 
@@ -316,15 +331,146 @@ protected static void validateSplitOptions(final InferenceOptions options) {
    */
   protected static void validateSplitOptions(final int documentSplitSize, final int splitOverlapSize) {
     if (documentSplitSize <= 0) {
-      throw new IllegalArgumentException("documentSplitSize must be greater than zero.");
+      throw new IllegalArgumentException("The documentSplitSize must be greater than zero.");
     }
     if (splitOverlapSize < 0) {
-      throw new IllegalArgumentException("splitOverlapSize must not be negative.");
+      throw new IllegalArgumentException("The splitOverlapSize must not be negative.");
     }
     if (splitOverlapSize >= documentSplitSize) {
       throw new IllegalArgumentException(
-          "splitOverlapSize must be smaller than documentSplitSize.");
+          "The splitOverlapSize must be smaller than documentSplitSize.");
+    }
+  }
+
+  /**
+   * Unicode-aware whitespace, used here both to split input into tokens and for the optional
+   * whitespace fold. It covers the full Unicode {@code White_Space} set rather than the six ASCII
+   * characters Java's {@code \s} recognizes; subclasses can reuse it to match source whitespace.
+   */
+  protected static final CharClass WHITESPACE = CharClass.whitespace();
+
+  /** Unicode dashes (excluding the mathematical minus signs), used for optional input folding. */
+  protected static final CharClass DASHES = CharClass.dashes();
+
+  /**
+   * Optionally folds Unicode whitespace and/or dashes in the input to their ASCII forms before
+   * inference, returning just the folded text. This is suitable for callers that do not map model
+   * output back to character offsets, such as whole-document classification. When the result must
+   * be mapped back to the original text (for example to report entity spans), use
+   * {@link #normalizeInputAligned(String, boolean, boolean)} instead, which also returns an
+   * {@link Alignment} that stays correct even when a fold changes the string length.
+   *
+   * @param text The input text.
+   * @param normalizeWhitespace Whether to fold whitespace to ASCII spaces.
+   * @param normalizeDashes Whether to fold dashes to the ASCII hyphen.
+   * @return The optionally normalized text.
+   */
+  protected static String normalizeInput(final String text, final boolean normalizeWhitespace,
+                                         final boolean normalizeDashes) {
+    String result = text;
+    if (normalizeWhitespace) {
+      result = WHITESPACE.normalize(result).toString();
+    }
+    if (normalizeDashes) {
+      result = DASHES.normalize(result).toString();
+    }
+    return result;
+  }
+
+  /**
+   * Like {@link #normalizeInput(String, boolean, boolean)} but also produces an {@link Alignment}
+   * from the folded text back to {@code text}, so model output positions map to original character
+   * offsets even when a fold changes the string length (a supplementary dash shrinking, or, for
+   * folds that may be added later, an expansion such as an ellipsis to three dots).
+   *
+   * @param text The input text.
+   * @param normalizeWhitespace Whether to fold whitespace to ASCII spaces.
+   * @param normalizeDashes Whether to fold dashes to the ASCII hyphen.
+   * @return The optionally normalized text paired with its alignment back to {@code text}.
+   */
+  protected static AlignedText normalizeInputAligned(final String text,
+      final boolean normalizeWhitespace, final boolean normalizeDashes) {
+    // Compose each enabled fold's alignment with the running alignment so the returned mapping is
+    // correct no matter whether a stage changes length. Whitespace folding here is a one-for-one
+    // replacement and so is length-preserving today; only dash folding moves offsets (a
+    // supplementary-plane dash shrinks from two chars to one). Composing through andThen rather
+    // than relying on the whitespace stage staying length-preserving keeps findInOriginal() correct
+    // if that ever changes.
+    AlignedText result = identityAligned(text, text);
+    if (normalizeWhitespace) {
+      result = compose(result, WHITESPACE.normalizeAligned(result.normalized()));
+    }
+    if (normalizeDashes) {
+      result = compose(result, DASHES.normalizeAligned(result.normalized()));
+    }
+    return result;
+  }
+
+  // Threads a fold stage onto the running alignment: accumulated maps original -> current and next
+  // maps current -> next.normalized(), so the composition maps original -> next.normalized().
+  private static AlignedText compose(final AlignedText accumulated, final AlignedText next) {
+    return new AlignedText(accumulated.original(), next.normalized(),
+        accumulated.alignment().andThen(next.alignment()));
+  }
+
+  // Builds an AlignedText whose alignment is the identity, declared as one equal run over the full
+  // length of normalized. It is the seed that the enabled fold stages are composed onto.
+  private static AlignedText identityAligned(final String original, final String normalized) {
+    final int length = normalized.length();
+    final Alignment identity = new Alignment.Builder().equal(length).build(length);
+    return new AlignedText(original, normalized, identity);
+  }
+
+  /**
+   * Tokenizes {@code text} on Unicode whitespace and packs the tokens into overlapping chunks,
+   * each joined with single ASCII spaces, ready for WordPiece tokenization. Splitting uses the
+   * Unicode {@code White_Space} set, so a no-break space or the ideographic space is a delimiter,
+   * and leading, trailing, or repeated whitespace produces no empty tokens.
+   *
+   * @param text The input text.
+   * @param documentSplitSize The maximum number of whitespace tokens per chunk.
+   * @param splitOverlapSize The number of tokens shared between consecutive chunks.
+   * @return The chunk strings, in order.
+   */
+  protected static List<String> whitespaceChunks(final String text, final int documentSplitSize,
+                                                 final int splitOverlapSize) {
+    final List<TextChunk> spans = whitespaceChunkSpans(text, documentSplitSize, splitOverlapSize);
+    final int count = spans.size();
+    final List<String> texts = new ArrayList<>(count);
+    for (int i = 0; i < count; i++) {
+      texts.add(spans.get(i).text());
+    }
+    return texts;
+  }
+
+  /**
+   * Chunks {@code text} like {@link #whitespaceChunks(String, int, int)} but keeps each chunk's
+   * character span in {@code text}, so a chunk can be decoded bounded to the region it covers and
+   * overlapping chunks yield overlapping candidate spans instead of losing a boundary entity.
+   *
+   * @param text The input text.
+   * @param documentSplitSize The maximum number of whitespace tokens per chunk.
+   * @param splitOverlapSize The number of tokens shared between consecutive chunks.
+   * @return The chunks, in order, each with its character span in {@code text}.
+   */
+  protected static List<TextChunk> whitespaceChunkSpans(final String text,
+      final int documentSplitSize, final int splitOverlapSize) {
+    final List<Span> tokens = WHITESPACE.splitSpans(text);
+    final List<ChunkRange> ranges = chunkRanges(tokens.size(), documentSplitSize, splitOverlapSize);
+    final List<TextChunk> result = new ArrayList<>();
+    for (final ChunkRange range : ranges) {
+      // Rejoin this range's tokens, separated by single ASCII spaces.
+      final StringBuilder joined = new StringBuilder();
+      String separator = "";
+      for (int index = range.start(); index < range.end(); index++) {
+        final Span token = tokens.get(index);
+        joined.append(separator).append(text, token.getStart(), token.getEnd());
+        separator = " ";
+      }
+      result.add(new TextChunk(joined.toString(),
+          tokens.get(range.start()).getStart(), tokens.get(range.end() - 1).getEnd()));
+    }
+    return result;
+  }
 
   /**
@@ -340,7 +486,7 @@ protected static void validateSplitOptions(final int documentSplitSize, final in
   protected static List<ChunkRange> chunkRanges(final int tokenCount, final int documentSplitSize,
                                                 final int splitOverlapSize) {
     if (tokenCount < 0) {
-      throw new IllegalArgumentException("tokenCount must not be negative.");
+      throw new IllegalArgumentException("The tokenCount must not be negative.");
     }
     validateSplitOptions(documentSplitSize, splitOverlapSize);
 

diff --git a/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java b/opennlp-core/opennlp-ml/opennlp-dl/src/main/java/opennlp/dl/InferenceOptions.java
@@ -26,6 +26,8 @@ public class InferenceOptions {
   private int documentSplitSize = 250;
   private int splitOverlapSize = 50;
   private Boolean lowerCase;
+  private boolean normalizeWhitespace;
+  private boolean normalizeDashes;
 
   public boolean isIncludeAttentionMask() {
     return includeAttentionMask;
@@ -75,6 +77,46 @@ public void setSplitOverlapSize(int splitOverlapSize) {
     this.splitOverlapSize = splitOverlapSize;
   }
 
+  /** {@return whether input whitespace is folded to ASCII spaces before inference; off by default} */
+  public boolean isNormalizeWhitespace() {
+    return normalizeWhitespace;
+  }
+
+  /**
+   * Replaces every Unicode whitespace character in the input with an ASCII space before inference.
+   * This is offset preserving (each whitespace code point maps to one space), so any spans a model
+   * produces still align with the input. Off by default.
+   *
+   * <p>This is a one-for-one replacement, not the collapse-and-trim whitespace fold of the runtime
+   * {@code TextNormalizer.whitespace()} rung: runs of whitespace are not merged and leading or
+   * trailing whitespace is not removed, so offsets are preserved.</p>
+   *
+   * @param normalizeWhitespace Whether to normalize whitespace ({@code false} by default).
+   */
+  public void setNormalizeWhitespace(boolean normalizeWhitespace) {
+    this.normalizeWhitespace = normalizeWhitespace;
+  }
+
+  /** {@return whether input dashes are folded to the ASCII hyphen before inference; off by default} */
+  public boolean isNormalizeDashes() {
+    return normalizeDashes;
+  }
+
+  /**
+   * Replaces Unicode dashes in the input with the ASCII hyphen-minus before inference. This is
+   * offset preserving for the dash characters in the Basic Multilingual Plane (the common case).
+   * The mathematical minus signs are not affected. Off by default.
+   *
+   * <p>A supplementary-plane dash shrinks from two chars to one, which shifts later offsets, so in
+   * that case {@code NameFinderDL.find(...)} reports offsets into the normalized text. Use
+   * {@code NameFinderDL.findInOriginal(...)} for offsets mapped back to the original input.</p>
+   *
+   * @param normalizeDashes Whether to normalize dashes ({@code false} by default).
+   */
+  public void setNormalizeDashes(boolean normalizeDashes) {
+    this.normalizeDashes = normalizeDashes;
+  }
+
   /**
    * Returns whether tokenization should lower case the input text and strip
    * accents, as required by uncased models.