From ac99be6619b035e31a7c5391874463bba4e54dbb Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 13:41:09 +0900
Subject: [PATCH 1/8] add a method for lazy split and tokenization

---
 .../java/com/worksap/nlp/sudachi/IOTools.java |  16 +-
 .../nlp/sudachi/JapaneseTokenizer.java        |   8 +-
 .../SentenceSplittingLazyAnalysis.java        | 157 ++++++++++++++++++
 .../com/worksap/nlp/sudachi/Tokenizer.java    |  41 ++++-
 4 files changed, 207 insertions(+), 15 deletions(-)
 create mode 100644 src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java

diff --git a/src/main/java/com/worksap/nlp/sudachi/IOTools.java b/src/main/java/com/worksap/nlp/sudachi/IOTools.java
index edb309ef..964988ce 100644
--- a/src/main/java/com/worksap/nlp/sudachi/IOTools.java
+++ b/src/main/java/com/worksap/nlp/sudachi/IOTools.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2023-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 package com.worksap.nlp.sudachi;
 
 import java.io.IOException;
-import java.io.Reader;
 import java.nio.CharBuffer;
 
 public class IOTools {
@@ -26,22 +25,21 @@ private IOTools() {
     }
 
     /**
-     * Read as much as possible from reader to the result buffer. Some readers
-     * perform filtering on input by reducing the number of read characters in each
-     * batch.
+     * Read as much as possible from the readable to the result buffer. Use this to
+     * make sure that the buffer is filled or that no text is left unread.
      *
-     * @param reader
-     *            input reader
+     * @param readable
+     *            input readable
      * @param result
      *            buffer to read into
      * @return number of read characters
      * @throws IOException
      *             when read operation fails
      */
-    public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOException {
+    public static int readAsMuchAsCan(Readable readable, CharBuffer result) throws IOException {
         int totalRead = 0;
         while (result.hasRemaining()) {
-            int read = reader.read(result);
+            int read = readable.read(result);
             if (read < 0) {
                 if (totalRead == 0) {
                     return -1;
                 }
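A note on the contract above: readAsMuchAsCan keeps calling Readable.read until the
buffer is full, and returns -1 only when nothing at all could be read before end of
input. A minimal driving loop, as a sketch only (not part of the patch; consume() is
a placeholder sink):

    CharBuffer buf = CharBuffer.allocate(4096);
    int n;
    while ((n = IOTools.readAsMuchAsCan(readable, buf)) >= 0) {
        buf.flip();   // switch the buffer to draining mode
        consume(buf); // placeholder: process the characters read in this round
        buf.clear();  // make the whole buffer writable again
    }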
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 8547a8ea..7a131f36 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 import java.nio.CharBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Iterator;
 import java.util.List;
 
 import javax.json.Json;
@@ -119,6 +120,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) t
         return sentences;
     }
 
+    @Override
+    public Iterator<List<Morpheme>> tokenizedSentenceIterator(SplitMode mode, Readable readable) {
+        return new SentenceSplittingLazyAnalysis(mode, this, readable);
+    }
+
     @Override
     public void setDumpOutput(PrintStream output) {
         dumpOutput = output;

diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
new file mode 100644
index 00000000..93db0652
--- /dev/null
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2023 Works Applications Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.worksap.nlp.sudachi;
+
+import java.io.IOException;
+import java.nio.CharBuffer;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+import com.worksap.nlp.sudachi.dictionary.LexiconSet;
+import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;
+
+/**
+ * Provides lazy sentence split and analysis.
+ */
+/* internal */ class SentenceSplittingLazyAnalysis
+        implements SentenceDetector.NonBreakCheker, Iterator<List<Morpheme>> {
+    private final SentenceDetector detector = new SentenceDetector();
+
+    private final Tokenizer.SplitMode mode;
+    private final JapaneseTokenizer tokenizer;
+    private final Readable readable;
+
+    SentenceSplittingLazyAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer, Readable readable) {
+        this.mode = mode;
+        this.tokenizer = tokenizer;
+        this.readable = readable;
+
+        this.buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
+        this.buffer.flip();
+        this.input = tokenizer.buildInputText("");
+    }
+
+    // input buffer
+    private final CharBuffer buffer;
+    // preprocessed InputText of the buffer.
+    // used to normalize text for the sentence detection.
+    private UTF8InputText input;
+    // beginning-of-sentence index of the next sentence in the input
+    private int bos = 0;
+    // normalized text left. corresponds to `input.getSubstring(bos,
+    // input.getText().length())`
+    private String normalized = "";
+
+    /** Return bos position in the buffer. */
+    private int bosPosition() {
+        return input.textIndexToOriginalTextIndex(bos);
+    }
+
+    /**
+     * Reset the buffer discarding processed text, then read from the input.
+     *
+     * @return the number of chars added to the buffer. -1 if the input readable is
+     *         at its end.
+     */
+    private int reloadBuffer() throws IOException {
+        buffer.position(bosPosition());
+        buffer.compact();
+        int nread = IOTools.readAsMuchAsCan(readable, buffer);
+        buffer.flip();
+
+        // align with new buffer state
+        input = tokenizer.buildInputText(buffer);
+        bos = 0;
+        normalized = input.getText();
+
+        return nread;
+    }
+
+    @Override
+    public boolean hasNext() {
+        if (!normalized.isEmpty()) {
+            return true;
+        }
+
+        int nread;
+        try {
+            nread = reloadBuffer();
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+
+        return !(nread < 0 && !buffer.hasRemaining());
+    }
+
+    @Override
+    public MorphemeList next() {
+        int length = detector.getEos(normalized, this);
+        if (length > 0) { // sentence found
+            int eos = bos + length;
+            if (eos < normalized.length()) {
+                eos = input.getNextInOriginal(eos - 1);
+                length = eos - bos;
+            }
+            UTF8InputText sentence = input.slice(bos, eos);
+            bos = eos;
+            normalized = normalized.substring(length);
+            return tokenizer.tokenizeSentence(mode, sentence);
+        }
+
+        // buffer is just after reload but no (safe) eos found. need to clean it up.
+        // tokenize all text in the buffer.
+        if (bos == 0 && length < 0) {
+            bos = normalized.length();
+            normalized = "";
+            return tokenizer.tokenizeSentence(mode, input);
+        }
+
+        int nread;
+        try {
+            nread = reloadBuffer();
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+
+        if (nread < 0 && !buffer.hasRemaining()) {
+            throw new NoSuchElementException("no texts left to analyze");
+        }
+
+        // recursive call with reloaded buffer.
+        return next();
+    }
+
+    @Override
+    public boolean hasNonBreakWord(int length) {
+        UTF8InputText inp = input;
+        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
+        byte[] bytes = inp.getByteText();
+        LexiconSet lexicon = tokenizer.lexicon;
+        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
+            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
+            while (iterator.hasNext()) {
+                int[] r = iterator.next();
+                int l = r[1];
+                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+}
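The iterator above is single-pass and performs I/O only inside hasNext() and next().
For callers that prefer for-each loops, a one-line adapter is enough; a sketch under
that assumption (asIterable is our name, not part of the patch, and the result is
single-use because the underlying Readable is consumed):

    static Iterable<List<Morpheme>> asIterable(Tokenizer tokenizer, Readable input) {
        // Each call to iterator() restarts from the same, already-consumed
        // Readable, so iterate the result once only.
        return () -> tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, input);
    }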
diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 459be935..f9853d63 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 import java.io.IOException;
 import java.io.PrintStream;
 import java.io.Reader;
+import java.util.Iterator;
+import java.util.List;
 
 /**
  * A tokenizer of morphological analysis.
@@ -52,7 +54,7 @@ default MorphemeList tokenize(final String text) {
 
     /**
      * Tokenize sentences. This method divide an input text into sentences and
-     * tokenizes them.
+     * tokenizes them. When the text is long, it uses a lot of memory.
      *
      * @param mode
      *            a mode of splitting
     * @return a result of tokenizing
     */
    Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text);

    /**
     * Tokenize sentences. Divide an input text into sentences and tokenize them
-     * with {@link SplitMode}.C.
+     * with {@link SplitMode}.C. When the text is long, it uses a lot of memory.
      *
      * @param text
      *            input text
      * @return a result of tokenizing
      */
     default Iterable<MorphemeList> tokenizeSentences(String text) {

     /**
      * Read an input text from {@code input}, divide it into sentences and tokenize
-     * them.
+     * them. It reads all text in the input and uses a lot of memory when the text
+     * is long.
      *
      * @param mode
      *            a mode of splitting
      * @param input
      *            a reader of input text
      * @return a result of tokenizing
      * @throws IOException
      *             if reading a stream is failed
+     * @deprecated use {@link #tokenizeSentences(SplitMode, Readable)} instead.
      */
     Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
 
     /**
      * Reads an input text from {@code input}, divide it into sentences and
-     * tokenizes them with {@link SplitMode}.C.
+     * tokenizes them with {@link SplitMode}.C. It reads all text in the input and
+     * uses a lot of memory when the text is long.
      *
      * @param input
      *            a reader of input text
      * @return a result of tokenizing
      * @throws IOException
      *             if reading a stream is failed
      * @see #tokenizeSentences(SplitMode,Reader)
+     * @deprecated use {@link #tokenizeSentences(Readable)} instead.
      */
     default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
         return tokenizeSentences(SplitMode.C, input);
     }
 
+    /**
+     * Read an input text from {@code input}, divide it into sentences and tokenize
+     * them. It reads the input lazily.
+     *
+     * @param mode
+     *            a mode of splitting
+     * @param input
+     *            a readable input text
+     * @return a result of tokenizing
+     */
+    Iterator<List<Morpheme>> tokenizedSentenceIterator(SplitMode mode, Readable input);
+
+    /**
+     * Read an input text from {@code input}, divide it into sentences and tokenize
+     * them with {@link SplitMode}.C. It reads the input lazily.
+     *
+     * @param input
+     *            a readable input text
+     * @return a result of tokenizing
+     * @see #tokenizedSentenceIterator(SplitMode,Readable)
+     */
+    default Iterator<List<Morpheme>> tokenizedSentenceIterator(Readable input) {
+        return tokenizedSentenceIterator(SplitMode.C, input);
+    }
+
     /**
      * Prints lattice structure of the analysis into the passed {@link PrintStream}.
      *

From 2ceca78d60f0f48fa1a835841204fb9d0d5c705d Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 14:33:16 +0900
Subject: [PATCH 2/8] fix javadoc link

---
 src/main/java/com/worksap/nlp/sudachi/Tokenizer.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index f9853d63..51aacb01 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -89,7 +89,8 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @return a result of tokenizing
      * @throws IOException
      *             if reading a stream is failed
-     * @deprecated use {@link #tokenizeSentences(SplitMode, Readable)} instead.
+     * @deprecated use {@link #tokenizedSentenceIterator(SplitMode, Readable)}
+     *             instead.
      */
     Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
 
@@ -104,7 +105,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @throws IOException
      *             if reading a stream is failed
      * @see #tokenizeSentences(SplitMode,Reader)
-     * @deprecated use {@link #tokenizeSentences(Readable)} instead.
+     * @deprecated use {@link #tokenizedSentenceIterator(Readable)} instead.
      */
     default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
         return tokenizeSentences(SplitMode.C, input);
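For context between these patches, a minimal end-to-end usage sketch of the new lazy
API. This is illustrative only: "sudachi.json" and "input.txt" are placeholders, and
Config.fromFile plus DictionaryFactory.create(Config) are assumed from the current
Sudachi API (the method itself is renamed to lazyTokenizeSentences in PATCH 8/8):

    import java.io.Reader;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.Iterator;
    import java.util.List;

    import com.worksap.nlp.sudachi.Config;
    import com.worksap.nlp.sudachi.Dictionary;
    import com.worksap.nlp.sudachi.DictionaryFactory;
    import com.worksap.nlp.sudachi.Morpheme;
    import com.worksap.nlp.sudachi.Tokenizer;

    public class LazySplitExample {
        public static void main(String[] args) throws Exception {
            try (Dictionary dict = new DictionaryFactory().create(Config.fromFile(Paths.get("sudachi.json")));
                    Reader reader = Files.newBufferedReader(Paths.get("input.txt"), StandardCharsets.UTF_8)) {
                Tokenizer tokenizer = dict.create();
                Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader);
                while (it.hasNext()) { // reads more input only on demand
                    for (Morpheme m : it.next()) {
                        System.out.println(m.surface() + "\t" + m.normalizedForm());
                    }
                }
            }
        }
    }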
From a606cb7c1f127d5515779593ea9dab3f04c3c094 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 15:58:12 +0900
Subject: [PATCH 3/8] add tests for a lazy analysis

---
 .../sudachi/JapaneseTokenizerStreamingTest.kt | 24 +++++++++
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 54 +++++++++++++++++++
 2 files changed, 78 insertions(+)

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index e52c459f..3e8a8b1d 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -64,4 +64,28 @@ class JapaneseTokenizerStreamingTest {
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
+
+  @Test
+  fun streamingReadable() {
+    val reader = StringReader("あ".repeat(5000))
+    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(5000, totalLength)
+  }
+
+  @Test
+  fun streamingBlockingReadable() {
+    val reader = BadReader("あ".repeat(5000))
+    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(5000, totalLength)
+  }
+
+  @Test
+  fun streamLongTextShouldNotCauseOOM() {
+    val reader = StringReader("あ".repeat(10 * 1024 * 1024))
+    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(10 * 1024 * 1024, totalLength)
+  }
 }

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 09c1afd1..9762f559 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -239,6 +239,60 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
         assertThat(it.hasNext(), is(false));
     }
 
+    @Test
+    public void tokenizerWithReadable() throws IOException {
+        StringReader reader = new StringReader("京都。東京.東京都。京都");
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(2));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(2));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(2));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(1));
+        assertThat(it.hasNext(), is(false));
+    }
+
+    @Test
+    public void tokenizerWithLongReadable() throws IOException {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
+            sb.append("京都。");
+        }
+        sb.append("京都");
+        StringReader reader = new StringReader(sb.toString());
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
+            assertThat(it.hasNext(), is(true));
+            assertThat(it.next().size(), is(2));
+        }
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(1));
+        assertThat(it.hasNext(), is(false));
+    }
+
+    @Test
+    public void tokenizerWithReadableAndNormalization() throws IOException {
+        StringBuilder sb = new StringBuilder();
+        sb.append("東京都…。");
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
+            sb.append("京都。");
+        }
+        StringReader reader = new StringReader(sb.toString());
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(5));
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
+            assertThat(it.hasNext(), is(true));
+            List<Morpheme> ms = it.next();
+            assertThat(ms.size(), is(2));
+            assertThat(ms.get(0).surface(), is("京都"));
+            assertThat(ms.get(1).surface(), is("。"));
+        }
+        assertThat(it.hasNext(), is(false));
+    }
+
     @Test
     public void zeroLengthMorpheme() {
         List<Morpheme> s = tokenizer.tokenize("…");

From 0b8f6605e3ed23c7226fbed774798583d6fcef40 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 16:37:34 +0900
Subject: [PATCH 4/8] Use proper exception class

---
 .../worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
index 93db0652..9cc28653 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -17,6 +17,7 @@
 package com.worksap.nlp.sudachi;
 
 import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.nio.CharBuffer;
 import java.util.Iterator;
 import java.util.List;
@@ -92,7 +93,7 @@ public boolean hasNext() {
         try {
             nread = reloadBuffer();
         } catch (IOException e) {
-            throw new RuntimeException(e.getMessage(), e);
+            throw new UncheckedIOException(e.getMessage(), e);
         }
 
         return !(nread < 0 && !buffer.hasRemaining());
@@ -125,7 +126,7 @@ public MorphemeList next() {
         int nread;
         try {
             nread = reloadBuffer();
         } catch (IOException e) {
-            throw new RuntimeException(e.getMessage(), e);
+            throw new UncheckedIOException(e.getMessage(), e);
         }
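A note on the exception change above: because Iterator methods cannot declare checked
exceptions, read failures now surface as java.io.UncheckedIOException instead of a
bare RuntimeException. A sketch of a call site that wants the original cause back
(process() is a placeholder):

    try {
        while (it.hasNext()) {
            process(it.next());
        }
    } catch (UncheckedIOException e) {
        IOException cause = e.getCause(); // getCause() is typed to return IOException
        // handle or rethrow the underlying read failure
    }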
From dfb9e1b8e6090fa0bf75442f8db0772a30df4d37 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 17:32:09 +0900
Subject: [PATCH 5/8] Add tests for error cases and fix code smells

---
 .../SentenceSplittingLazyAnalysis.java        |  2 +-
 .../com/worksap/nlp/sudachi/Tokenizer.java    |  2 +
 .../sudachi/JapaneseTokenizerStreamingTest.kt | 53 ++++++++++++++++++-
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 25 +++++++--
 4 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
index 9cc28653..3e877d84 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 51aacb01..61710b91 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -92,6 +92,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @deprecated use {@link #tokenizedSentenceIterator(SplitMode, Readable)}
      *             instead.
      */
+    @Deprecated
     Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
 
     /**
@@ -107,6 +108,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @see #tokenizeSentences(SplitMode,Reader)
      * @deprecated use {@link #tokenizedSentenceIterator(Readable)} instead.
      */
+    @Deprecated
     default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
         return tokenizeSentences(SplitMode.C, input);
     }

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index 3e8a8b1d..ec4b33d0 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2023-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ import java.io.StringReader
 import kotlin.math.min
 import kotlin.test.Test
 import kotlin.test.assertEquals
+import kotlin.test.assertFailsWith
 
 class JapaneseTokenizerStreamingTest {
   private val tokenizer = TestDictionary.user0().create()
@@ -73,6 +74,19 @@ class JapaneseTokenizerStreamingTest {
     assertEquals(5000, totalLength)
   }
 
+  @Test
+  fun callingNextWithoutTextFails() {
+    val reader = StringReader("東京")
+    val it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+
+    val morphemes = it.next()
+    assertEquals("東京", morphemes.get(0).surface())
+
+    assertFailsWith<NoSuchElementException>(
+        block = { it.next() },
+    )
+  }
+
   @Test
   fun streamingBlockingReadable() {
     val reader = BadReader("あ".repeat(5000))
     val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
-  fun streamLongTextShouldNotCauseOOM() {
+  fun streamingLongTextShouldNotCauseOOM() {
     val reader = StringReader("あ".repeat(10 * 1024 * 1024))
     val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(10 * 1024 * 1024, totalLength)
   }
+
+  class FailReader(private val data: String) : Reader() {
+
+    private var position: Int = 0
+    override fun read(cbuf: CharArray, off: Int, len: Int): Int {
+      // throws IOException after returning all the data
+      check(off >= 0)
+      check(off < cbuf.size)
+      check(len > 0)
+
+      val dataLen = data.length
+      val remaining = dataLen - position
+      if (remaining == 0) {
+        throw java.io.IOException("All data used.")
+      }
+
+      val toRead = min(remaining, len)
+      data.toCharArray(cbuf, off, position, position + toRead)
+      position += toRead
+      return toRead
+    }
+
+    override fun close() {}
+  }
+
+  @Test
+  fun failsWhenReaderFails() {
+    val reader = FailReader("あ".repeat(500))
+    // should not fail on the method call
+    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+
+    assertFailsWith<UncheckedIOException>(
+        block = { result.next() },
+    )
+  }
 }
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 9762f559..9f23f149 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Works Applications Co., Ltd.
+ * Copyright (c) 2017-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ public class JapaneseTokenizerTest {
     JapaneseTokenizer tokenizer;
 
     @Before
-    public void setUp() throws IOException {
+    public void setUp() {
         dict = TestDictionary.INSTANCE.user1();
         tokenizer = (JapaneseTokenizer) dict.create();
     }
@@ -185,6 +185,11 @@ public void tokenizeSentencesWithSurrogatePair() {
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#tokenizeSentences(Reader)}.
+     */
+    @Deprecated
     @Test
     public void tokenizerWithReader() throws IOException {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
         Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader).iterator();
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#tokenizeSentences(Reader)}.
+     */
+    @Deprecated
     @Test
     public void tokenizerWithLongReader() throws IOException {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             sb.append("京都。");
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
         Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader).iterator();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
         }
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#tokenizeSentences(Reader)}.
+     */
+    @Deprecated
     @Test
     public void tokenizerWithReaderAndNormalization() throws IOException {
         StringBuilder sb = new StringBuilder();
 
     @Test
-    public void tokenizerWithReadable() throws IOException {
+    public void tokenizerWithReadable() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
         Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
 
     @Test
-    public void tokenizerWithLongReadable() throws IOException {
+    public void tokenizerWithLongReadable() {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             sb.append("京都。");
         }
 
     @Test
-    public void tokenizerWithReadableAndNormalization() throws IOException {
+    public void tokenizerWithReadableAndNormalization() {
         StringBuilder sb = new StringBuilder();
         sb.append("東京都…。");
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {

From 403e7bcf8d0e3f974888557b3b200270d7fb5372 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 18:28:09 +0900
Subject: [PATCH 6/8] add some more tests

---
 .../sudachi/JapaneseTokenizerStreamingTest.kt | 13 ++++++---
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 29 +++++++++++++++++--
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index ec4b33d0..d5d7e9de 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -129,12 +129,17 @@ class JapaneseTokenizerStreamingTest {
 
   @Test
   fun failsWhenReaderFails() {
-    val reader = FailReader("あ".repeat(500))
-    // should not fail on the method call
-    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    var reader = FailReader("あ".repeat(500))
+    // should not fail on the instantiation
+    var it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    assertFailsWith<UncheckedIOException>(
+        block = { it.hasNext() },
+    )
 
+    reader = FailReader("あ".repeat(500))
+    it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
     assertFailsWith<UncheckedIOException>(
-        block = { result.next() },
+        block = { it.next() },
     )
   }
 }

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 9f23f149..0ab3480d 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -255,7 +255,7 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
     }
 
     @Test
-    public void tokenizerWithReadable() {
+    public void tokenizedSentenceIterator() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
         Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
+
+        reader = new StringReader("な。なに。");
+        it = tokenizer.tokenizedSentenceIterator(reader);
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(3));
+        assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizerWithLongReadable() {
+    public void tokenizedSentenceIteratorWithLongText() {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             sb.append("京都。");
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
         }
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizerWithReadableAndNormalization() {
+    public void tokenizedSentenceIteratorWithNormalization() {
         StringBuilder sb = new StringBuilder();
         sb.append("東京都…。");
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             sb.append("京都。");
         }
         StringReader reader = new StringReader(sb.toString());
         Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(5));
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             assertThat(it.hasNext(), is(true));
             List<Morpheme> ms = it.next();
             assertThat(ms.size(), is(2));
             assertThat(ms.get(0).surface(), is("京都"));
             assertThat(ms.get(1).surface(), is("。"));
         }
         assertThat(it.hasNext(), is(false));
     }
 
+    @Test
+    public void tokenizedSentenceIteratorWithSurrogatePair() {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) {
+            sb.append("。");
+        }
+        sb.append("😀");
+        StringReader reader = new StringReader(sb.toString());
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(1));
+        assertThat(it.hasNext(), is(false));
+    }
+
     @Test
     public void zeroLengthMorpheme() {
         List<Morpheme> s = tokenizer.tokenize("…");
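The surrogate-pair test above is what motivates the next patch: a fixed-size
CharBuffer can fill up exactly between the two UTF-16 code units of a supplementary
character. A small standalone illustration of the hazard (not part of the patch):

    String emoji = "😀"; // U+1F600 GRINNING FACE, two UTF-16 code units
    char high = emoji.charAt(0); // '\uD83D'
    char low = emoji.charAt(1);  // '\uDE00'
    System.out.println(Character.isHighSurrogate(high));      // true
    System.out.println(Character.isSurrogatePair(high, low)); // true
    // If a read boundary falls between the two units, the lone high surrogate is
    // malformed input for the UTF-8 conversion performed by buildInputText.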
From 323e98a9c6242c8219257a22227d82fb46fb6452 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Tue, 25 Jun 2024 11:15:32 +0900
Subject: [PATCH 7/8] introduce surrogate-aware readable wrapper

---
 .../java/com/worksap/nlp/sudachi/IOTools.java | 45 +++++++++++++++++++
 .../nlp/sudachi/JapaneseTokenizer.java        |  3 +-
 .../SentenceSplittingLazyAnalysis.java        |  4 +-
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/IOTools.java b/src/main/java/com/worksap/nlp/sudachi/IOTools.java
index 964988ce..712327ec 100644
--- a/src/main/java/com/worksap/nlp/sudachi/IOTools.java
+++ b/src/main/java/com/worksap/nlp/sudachi/IOTools.java
@@ -51,4 +51,49 @@ public static int readAsMuchAsCan(Readable readable, CharBuffer result) throws I
         }
         return totalRead;
     }
+
+    /**
+     * Wrapper class for Readable that uses {@link #readAsMuchAsCan} to read and
+     * guarantees that the last character read is not a high surrogate unless it is
+     * the last one in the readable.
+     */
+    public static class SurrogateAwareReadable implements Readable {
+        private Readable readable;
+        char lastTrailingHighSurrogate;
+
+        SurrogateAwareReadable(Readable input) {
+            this.readable = input;
+        }
+
+        @Override
+        public int read(CharBuffer cb) throws IOException {
+            boolean trailingKept = false;
+            if (lastTrailingHighSurrogate != 0) {
+                cb.append(lastTrailingHighSurrogate);
+                lastTrailingHighSurrogate = 0;
+                trailingKept = true;
+            }
+
+            int nread = IOTools.readAsMuchAsCan(readable, cb);
+            if (nread < 0) {
+                if (!trailingKept) {
+                    return -1;
+                }
+                // the last char in the readable is a high surrogate and there is nothing we can
+                // do.
+                return 1;
+            }
+            if (trailingKept) {
+                nread += 1;
+            }
+
+            char lastChar = cb.get(cb.position() - 1);
+            if (Character.isHighSurrogate(lastChar)) {
+                lastTrailingHighSurrogate = lastChar;
+                cb.position(cb.position() - 1);
+                nread -= 1;
+            }
+            return nread;
+        }
+    }
 }

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 7a131f36..9a06fe66 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -99,10 +99,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
 
     @Override
     public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
+        IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
         CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
         SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
 
-        while (IOTools.readAsMuchAsCan(reader, buffer) > 0) {
+        while (wrappedReader.read(buffer) > 0) {
             buffer.flip();
             int length = analysis.tokenizeBuffer(buffer);
             if (length < 0) {

diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
index 3e877d84..44ebfdce 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -40,7 +40,7 @@
     SentenceSplittingLazyAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer, Readable readable) {
         this.mode = mode;
         this.tokenizer = tokenizer;
-        this.readable = readable;
+        this.readable = new IOTools.SurrogateAwareReadable(readable);
 
         this.buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
         this.buffer.flip();
         this.input = tokenizer.buildInputText("");
@@ -72,7 +72,7 @@ private int bosPosition() {
     private int reloadBuffer() throws IOException {
         buffer.position(bosPosition());
         buffer.compact();
-        int nread = IOTools.readAsMuchAsCan(readable, buffer);
+        int nread = readable.read(buffer);
         buffer.flip();
 
         // align with new buffer state
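To see what the wrapper guarantees, consider reading "あ😀" through a two-char
buffer. The trace below is a sketch of the intended behavior (the constructor is
package-private, so this only runs from within the package):

    Readable raw = new StringReader("あ😀");
    IOTools.SurrogateAwareReadable wrapped = new IOTools.SurrogateAwareReadable(raw);
    CharBuffer buf = CharBuffer.allocate(2);
    int first = wrapped.read(buf);  // 1: 'あ' is delivered, '\uD83D' is held back
    buf.clear();
    int second = wrapped.read(buf); // 2: the held surrogate plus '\uDE00'
    buf.clear();
    int last = wrapped.read(buf);   // -1: end of input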
From d8e4d165ccfa38b5fae948326e17821548ed4921 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Tue, 25 Jun 2024 16:54:26 +0900
Subject: [PATCH 8/8] rename method

---
 .../worksap/nlp/sudachi/JapaneseTokenizer.java |  2 +-
 .../com/worksap/nlp/sudachi/Tokenizer.java     | 13 ++++++-------
 .../sudachi/JapaneseTokenizerStreamingTest.kt  | 14 ++++++++------
 .../nlp/sudachi/JapaneseTokenizerTest.java     | 18 +++++++++---------
 4 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 9a06fe66..201d59b5 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -122,7 +122,7 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) t
     }
 
     @Override
-    public Iterator<List<Morpheme>> tokenizedSentenceIterator(SplitMode mode, Readable readable) {
+    public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
         return new SentenceSplittingLazyAnalysis(mode, this, readable);
     }
 
diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 61710b91..89f6adef 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -89,8 +89,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @return a result of tokenizing
      * @throws IOException
      *             if reading a stream is failed
-     * @deprecated use {@link #tokenizedSentenceIterator(SplitMode, Readable)}
-     *             instead.
+     * @deprecated use {@link #lazyTokenizeSentences(SplitMode, Readable)} instead.
      */
     @Deprecated
     Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
 
@@ -106,7 +105,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @throws IOException
      *             if reading a stream is failed
      * @see #tokenizeSentences(SplitMode,Reader)
-     * @deprecated use {@link #tokenizedSentenceIterator(Readable)} instead.
+     * @deprecated use {@link #lazyTokenizeSentences(Readable)} instead.
     */
     @Deprecated
     default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
         return tokenizeSentences(SplitMode.C, input);
@@ -123,7 +122,7 @@ default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOExceptio
      *            a readable input text
      * @return a result of tokenizing
      */
-    Iterator<List<Morpheme>> tokenizedSentenceIterator(SplitMode mode, Readable input);
+    Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input);
 
     /**
      * Read an input text from {@code input}, divide it into sentences and tokenize
@@ -132,10 +131,10 @@ default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOExceptio
      * @param input
      *            a readable input text
      * @return a result of tokenizing
-     * @see #tokenizedSentenceIterator(SplitMode,Readable)
+     * @see #lazyTokenizeSentences(SplitMode,Readable)
      */
-    default Iterator<List<Morpheme>> tokenizedSentenceIterator(Readable input) {
-        return tokenizedSentenceIterator(SplitMode.C, input);
+    default Iterator<List<Morpheme>> lazyTokenizeSentences(Readable input) {
+        return lazyTokenizeSentences(SplitMode.C, input);
     }
 
     /**

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index d5d7e9de..b5f3d54a 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -52,6 +52,7 @@ class JapaneseTokenizerStreamingTest {
 
   @Test
   fun streamingTest() {
+    // Testing deprecated method `tokenizeSentences(Reader)`
     val reader = StringReader("あ".repeat(5000))
     val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
   fun streamingTestWithBadReader() {
+    // Testing deprecated method `tokenizeSentences(Reader)`
     val reader = BadReader("あ".repeat(5000))
     val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
   fun streamingReadable() {
     val reader = StringReader("あ".repeat(5000))
-    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
   fun callingNextWithoutTextFails() {
     val reader = StringReader("東京")
-    val it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    val it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
 
     val morphemes = it.next()
     assertEquals("東京", morphemes.get(0).surface())
 
     assertFailsWith<NoSuchElementException>(
         block = { it.next() },
     )
   }
 
   @Test
   fun streamingBlockingReadable() {
     val reader = BadReader("あ".repeat(5000))
-    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
   fun streamingLongTextShouldNotCauseOOM() {
     val reader = StringReader("あ".repeat(10 * 1024 * 1024))
-    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(10 * 1024 * 1024, totalLength)
   }
 
   @Test
   fun failsWhenReaderFails() {
     var reader = FailReader("あ".repeat(500))
     // should not fail on the instantiation
-    var it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    var it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
     assertFailsWith<UncheckedIOException>(
         block = { it.hasNext() },
     )
 
     reader = FailReader("あ".repeat(500))
-    it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
     assertFailsWith<UncheckedIOException>(
         block = { it.next() },
     )
   }
 }

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 0ab3480d..a3242735 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -255,7 +255,7 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
     }
 
     @Test
-    public void tokenizedSentenceIterator() {
+    public void lazyTokenizeSentences() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
 
         reader = new StringReader("な。なに。");
-        it = tokenizer.tokenizedSentenceIterator(reader);
+        it = tokenizer.lazyTokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(3));
         assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizedSentenceIteratorWithLongText() {
+    public void lazyTokenizeSentencesWithLongText() {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             sb.append("京都。");
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
         }
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizedSentenceIteratorWithNormalization() {
+    public void lazyTokenizeSentencesWithNormalization() {
         StringBuilder sb = new StringBuilder();
         sb.append("東京都…。");
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             sb.append("京都。");
         }
         StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(5));
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             assertThat(it.hasNext(), is(true));
             List<Morpheme> ms = it.next();
             assertThat(ms.size(), is(2));
             assertThat(ms.get(0).surface(), is("京都"));
             assertThat(ms.get(1).surface(), is("。"));
         }
         assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizedSentenceIteratorWithSurrogatePair() {
+    public void lazyTokenizeSentencesWithSurrogatePair() {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) {
             sb.append("。");
         }
         sb.append("😀");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
 
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
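After this rename, the series leaves the public surface as lazyTokenizeSentences. For
reference, a closing usage sketch against the renamed API (the tokenizer instance and
dictionary setup are assumed to exist; the expected sizes come from the tests above):

    Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C,
            new StringReader("京都。東京.東京都。京都"));
    while (it.hasNext()) {
        List<Morpheme> sentence = it.next();
        System.out.println(sentence.size()); // 2, 2, 2, 1, as asserted in the tests
    }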