From ac99be6619b035e31a7c5391874463bba4e54dbb Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 13:41:09 +0900
Subject: [PATCH 1/8] add a method for lazy split and tokenization

---
 .../java/com/worksap/nlp/sudachi/IOTools.java |  16 +-
 .../nlp/sudachi/JapaneseTokenizer.java        |   8 +-
 .../SentenceSplittingLazyAnalysis.java        | 157 ++++++++++++++++++
 .../com/worksap/nlp/sudachi/Tokenizer.java    |  41 ++++-
 4 files changed, 207 insertions(+), 15 deletions(-)
 create mode 100644 src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java

diff --git a/src/main/java/com/worksap/nlp/sudachi/IOTools.java b/src/main/java/com/worksap/nlp/sudachi/IOTools.java
index edb309ef..964988ce 100644
--- a/src/main/java/com/worksap/nlp/sudachi/IOTools.java
+++ b/src/main/java/com/worksap/nlp/sudachi/IOTools.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2023-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,6 @@
 package com.worksap.nlp.sudachi;
 
 import java.io.IOException;
-import java.io.Reader;
 import java.nio.CharBuffer;
 
 public class IOTools {
@@ -26,22 +25,21 @@ private IOTools() {
     }
 
     /**
-     * Read as much as possible from reader to the result buffer. Some readers
-     * perform filtering on input by reducing the number of read characters in each
-     * batch.
+     * Read as much as possible from the readable to the result buffer. Use this to
+     * make sure that the buffer is filled or that no text is left unread.
      *
-     * @param reader
-     *            input reader
+     * @param readable
+     *            input readable
      * @param result
      *            buffer to read into
      * @return number of read characters
      * @throws IOException
      *             when read operation fails
      */
-    public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOException {
+    public static int readAsMuchAsCan(Readable readable, CharBuffer result) throws IOException {
         int totalRead = 0;
         while (result.hasRemaining()) {
-            int read = reader.read(result);
+            int read = readable.read(result);
             if (read < 0) {
                 if (totalRead == 0) {
                     return -1;
                 }
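A note on the contract above: readAsMuchAsCan keeps calling Readable.read until the
buffer is full, and returns -1 only when nothing at all could be read before end of
input. A minimal driving loop, as a sketch only (not part of the patch; consume() is
a placeholder sink):

    CharBuffer buf = CharBuffer.allocate(4096);
    int n;
    while ((n = IOTools.readAsMuchAsCan(readable, buf)) >= 0) {
        buf.flip();   // switch the buffer to draining mode
        consume(buf); // placeholder: process the characters read in this round
        buf.clear();  // make the whole buffer writable again
    }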
diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 8547a8ea..7a131f36 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 import java.nio.CharBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Iterator;
 import java.util.List;
 
 import javax.json.Json;
@@ -119,6 +120,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) t
         return sentences;
     }
 
+    @Override
+    public Iterator<List<Morpheme>> tokenizedSentenceIterator(SplitMode mode, Readable readable) {
+        return new SentenceSplittingLazyAnalysis(mode, this, readable);
+    }
+
     @Override
     public void setDumpOutput(PrintStream output) {
         dumpOutput = output;

diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
new file mode 100644
index 00000000..93db0652
--- /dev/null
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2023 Works Applications Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.worksap.nlp.sudachi;
+
+import java.io.IOException;
+import java.nio.CharBuffer;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+import com.worksap.nlp.sudachi.dictionary.LexiconSet;
+import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;
+
+/**
+ * Provides lazy sentence split and analysis.
+ */
+/* internal */ class SentenceSplittingLazyAnalysis
+        implements SentenceDetector.NonBreakCheker, Iterator<List<Morpheme>> {
+    private final SentenceDetector detector = new SentenceDetector();
+
+    private final Tokenizer.SplitMode mode;
+    private final JapaneseTokenizer tokenizer;
+    private final Readable readable;
+
+    SentenceSplittingLazyAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer, Readable readable) {
+        this.mode = mode;
+        this.tokenizer = tokenizer;
+        this.readable = readable;
+
+        this.buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
+        this.buffer.flip();
+        this.input = tokenizer.buildInputText("");
+    }
+
+    // input buffer
+    private final CharBuffer buffer;
+    // preprocessed InputText of the buffer.
+    // used to normalize text for the sentence detection.
+    private UTF8InputText input;
+    // beginning-of-sentence index of the next sentence in the input
+    private int bos = 0;
+    // normalized text left. corresponds to `input.getSubstring(bos,
+    // input.getText().length())`
+    private String normalized = "";
+
+    /** Return bos position in the buffer. */
+    private int bosPosition() {
+        return input.textIndexToOriginalTextIndex(bos);
+    }
+
+    /**
+     * Reset the buffer discarding processed text, then read from the input.
+     *
+     * @return the number of chars added to the buffer. -1 if the input readable is
+     *         at its end.
+     */
+    private int reloadBuffer() throws IOException {
+        buffer.position(bosPosition());
+        buffer.compact();
+        int nread = IOTools.readAsMuchAsCan(readable, buffer);
+        buffer.flip();
+
+        // align with new buffer state
+        input = tokenizer.buildInputText(buffer);
+        bos = 0;
+        normalized = input.getText();
+
+        return nread;
+    }
+
+    @Override
+    public boolean hasNext() {
+        if (!normalized.isEmpty()) {
+            return true;
+        }
+
+        int nread;
+        try {
+            nread = reloadBuffer();
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+
+        return !(nread < 0 && !buffer.hasRemaining());
+    }
+
+    @Override
+    public MorphemeList next() {
+        int length = detector.getEos(normalized, this);
+        if (length > 0) { // sentence found
+            int eos = bos + length;
+            if (eos < normalized.length()) {
+                eos = input.getNextInOriginal(eos - 1);
+                length = eos - bos;
+            }
+            UTF8InputText sentence = input.slice(bos, eos);
+            bos = eos;
+            normalized = normalized.substring(length);
+            return tokenizer.tokenizeSentence(mode, sentence);
+        }
+
+        // buffer is just after reload but no (safe) eos found. need to clean it up.
+        // tokenize all text in the buffer.
+        if (bos == 0 && length < 0) {
+            bos = normalized.length();
+            normalized = "";
+            return tokenizer.tokenizeSentence(mode, input);
+        }
+
+        int nread;
+        try {
+            nread = reloadBuffer();
+        } catch (IOException e) {
+            throw new RuntimeException(e.getMessage(), e);
+        }
+
+        if (nread < 0 && !buffer.hasRemaining()) {
+            throw new NoSuchElementException("no texts left to analyze");
+        }
+
+        // recursive call with reloaded buffer.
+        return next();
+    }
+
+    @Override
+    public boolean hasNonBreakWord(int length) {
+        UTF8InputText inp = input;
+        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
+        byte[] bytes = inp.getByteText();
+        LexiconSet lexicon = tokenizer.lexicon;
+        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
+            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
+            while (iterator.hasNext()) {
+                int[] r = iterator.next();
+                int l = r[1];
+                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+}
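The iterator above is single-pass and performs I/O only inside hasNext() and next().
For callers that prefer for-each loops, a one-line adapter is enough; a sketch under
that assumption (asIterable is our name, not part of the patch, and the result is
single-use because the underlying Readable is consumed):

    static Iterable<List<Morpheme>> asIterable(Tokenizer tokenizer, Readable input) {
        // Each call to iterator() restarts from the same, already-consumed
        // Readable, so iterate the result once only.
        return () -> tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, input);
    }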
diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 459be935..f9853d63 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 import java.io.IOException;
 import java.io.PrintStream;
 import java.io.Reader;
+import java.util.Iterator;
+import java.util.List;
 
 /**
  * A tokenizer of morphological analysis.
@@ -52,7 +54,7 @@ default MorphemeList tokenize(final String text) {
 
     /**
      * Tokenize sentences. This method divide an input text into sentences and
-     * tokenizes them.
+     * tokenizes them. When the text is long, it uses a lot of memory.
      *
      * @param mode
      *            a mode of splitting
     * @return a result of tokenizing
     */
    Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text);

    /**
     * Tokenize sentences. Divide an input text into sentences and tokenize them
-     * with {@link SplitMode}.C.
+     * with {@link SplitMode}.C. When the text is long, it uses a lot of memory.
      *
      * @param text
      *            input text
      * @return a result of tokenizing
      */
     default Iterable<MorphemeList> tokenizeSentences(String text) {

     /**
      * Read an input text from {@code input}, divide it into sentences and tokenize
-     * them.
+     * them. It reads all text in the input and uses a lot of memory when the text
+     * is long.
      *
      * @param mode
      *            a mode of splitting
      * @param input
      *            a reader of input text
      * @return a result of tokenizing
      * @throws IOException
      *             if reading a stream is failed
+     * @deprecated use {@link #tokenizeSentences(SplitMode, Readable)} instead.
      */
     Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
 
     /**
      * Reads an input text from {@code input}, divide it into sentences and
-     * tokenizes them with {@link SplitMode}.C.
+     * tokenizes them with {@link SplitMode}.C. It reads all text in the input and
+     * uses a lot of memory when the text is long.
      *
      * @param input
      *            a reader of input text
      * @return a result of tokenizing
      * @throws IOException
      *             if reading a stream is failed
      * @see #tokenizeSentences(SplitMode,Reader)
+     * @deprecated use {@link #tokenizeSentences(Readable)} instead.
      */
     default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
         return tokenizeSentences(SplitMode.C, input);
     }
 
+    /**
+     * Read an input text from {@code input}, divide it into sentences and tokenize
+     * them. It reads the input lazily.
+     *
+     * @param mode
+     *            a mode of splitting
+     * @param input
+     *            a readable input text
+     * @return a result of tokenizing
+     */
+    Iterator<List<Morpheme>> tokenizedSentenceIterator(SplitMode mode, Readable input);
+
+    /**
+     * Read an input text from {@code input}, divide it into sentences and tokenize
+     * them with {@link SplitMode}.C. It reads the input lazily.
+     *
+     * @param input
+     *            a readable input text
+     * @return a result of tokenizing
+     * @see #tokenizedSentenceIterator(SplitMode,Readable)
+     */
+    default Iterator<List<Morpheme>> tokenizedSentenceIterator(Readable input) {
+        return tokenizedSentenceIterator(SplitMode.C, input);
+    }
+
     /**
      * Prints lattice structure of the analysis into the passed {@link PrintStream}.
      *

From 2ceca78d60f0f48fa1a835841204fb9d0d5c705d Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 14:33:16 +0900
Subject: [PATCH 2/8] fix javadoc link

---
 src/main/java/com/worksap/nlp/sudachi/Tokenizer.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index f9853d63..51aacb01 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -89,7 +89,8 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @return a result of tokenizing
      * @throws IOException
      *             if reading a stream is failed
-     * @deprecated use {@link #tokenizeSentences(SplitMode, Readable)} instead.
+     * @deprecated use {@link #tokenizedSentenceIterator(SplitMode, Readable)}
+     *             instead.
      */
     Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
 
@@ -104,7 +105,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @throws IOException
      *             if reading a stream is failed
      * @see #tokenizeSentences(SplitMode,Reader)
-     * @deprecated use {@link #tokenizeSentences(Readable)} instead.
+     * @deprecated use {@link #tokenizedSentenceIterator(Readable)} instead.
      */
     default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
         return tokenizeSentences(SplitMode.C, input);
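For context between these patches, a minimal end-to-end usage sketch of the new lazy
API. This is illustrative only: "sudachi.json" and "input.txt" are placeholders, and
Config.fromFile plus DictionaryFactory.create(Config) are assumed from the current
Sudachi API (the method itself is renamed to lazyTokenizeSentences in PATCH 8/8):

    import java.io.Reader;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import java.util.Iterator;
    import java.util.List;

    import com.worksap.nlp.sudachi.Config;
    import com.worksap.nlp.sudachi.Dictionary;
    import com.worksap.nlp.sudachi.DictionaryFactory;
    import com.worksap.nlp.sudachi.Morpheme;
    import com.worksap.nlp.sudachi.Tokenizer;

    public class LazySplitExample {
        public static void main(String[] args) throws Exception {
            try (Dictionary dict = new DictionaryFactory().create(Config.fromFile(Paths.get("sudachi.json")));
                    Reader reader = Files.newBufferedReader(Paths.get("input.txt"), StandardCharsets.UTF_8)) {
                Tokenizer tokenizer = dict.create();
                Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader);
                while (it.hasNext()) { // reads more input only on demand
                    for (Morpheme m : it.next()) {
                        System.out.println(m.surface() + "\t" + m.normalizedForm());
                    }
                }
            }
        }
    }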
From a606cb7c1f127d5515779593ea9dab3f04c3c094 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 15:58:12 +0900
Subject: [PATCH 3/8] add tests for a lazy analysis

---
 .../sudachi/JapaneseTokenizerStreamingTest.kt | 24 +++++++++
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 54 +++++++++++++++++++
 2 files changed, 78 insertions(+)

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index e52c459f..3e8a8b1d 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -64,4 +64,28 @@ class JapaneseTokenizerStreamingTest {
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
+
+  @Test
+  fun streamingReadable() {
+    val reader = StringReader("あ".repeat(5000))
+    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(5000, totalLength)
+  }
+
+  @Test
+  fun streamingBlockingReadable() {
+    val reader = BadReader("あ".repeat(5000))
+    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(5000, totalLength)
+  }
+
+  @Test
+  fun streamLongTextShouldNotCauseOOM() {
+    val reader = StringReader("あ".repeat(10 * 1024 * 1024))
+    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
+    assertEquals(10 * 1024 * 1024, totalLength)
+  }
 }

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 09c1afd1..9762f559 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -239,6 +239,60 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
         assertThat(it.hasNext(), is(false));
     }
 
+    @Test
+    public void tokenizerWithReadable() throws IOException {
+        StringReader reader = new StringReader("京都。東京.東京都。京都");
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(2));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(2));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(2));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(1));
+        assertThat(it.hasNext(), is(false));
+    }
+
+    @Test
+    public void tokenizerWithLongReadable() throws IOException {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
+            sb.append("京都。");
+        }
+        sb.append("京都");
+        StringReader reader = new StringReader(sb.toString());
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
+            assertThat(it.hasNext(), is(true));
+            assertThat(it.next().size(), is(2));
+        }
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(1));
+        assertThat(it.hasNext(), is(false));
+    }
+
+    @Test
+    public void tokenizerWithReadableAndNormalization() throws IOException {
+        StringBuilder sb = new StringBuilder();
+        sb.append("東京都…。");
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
+            sb.append("京都。");
+        }
+        StringReader reader = new StringReader(sb.toString());
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(5));
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
+            assertThat(it.hasNext(), is(true));
+            List<Morpheme> ms = it.next();
+            assertThat(ms.size(), is(2));
+            assertThat(ms.get(0).surface(), is("京都"));
+            assertThat(ms.get(1).surface(), is("。"));
+        }
+        assertThat(it.hasNext(), is(false));
+    }
+
     @Test
     public void zeroLengthMorpheme() {
         List<Morpheme> s = tokenizer.tokenize("…");

From 0b8f6605e3ed23c7226fbed774798583d6fcef40 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 16:37:34 +0900
Subject: [PATCH 4/8] Use proper exception class

---
 .../worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
index 93db0652..9cc28653 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -17,6 +17,7 @@
 package com.worksap.nlp.sudachi;
 
 import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.nio.CharBuffer;
 import java.util.Iterator;
 import java.util.List;
@@ -92,7 +93,7 @@ public boolean hasNext() {
         try {
             nread = reloadBuffer();
         } catch (IOException e) {
-            throw new RuntimeException(e.getMessage(), e);
+            throw new UncheckedIOException(e.getMessage(), e);
         }
 
         return !(nread < 0 && !buffer.hasRemaining());
@@ -125,7 +126,7 @@ public MorphemeList next() {
         int nread;
         try {
             nread = reloadBuffer();
         } catch (IOException e) {
-            throw new RuntimeException(e.getMessage(), e);
+            throw new UncheckedIOException(e.getMessage(), e);
         }
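A note on the exception change above: because Iterator methods cannot declare checked
exceptions, read failures now surface as java.io.UncheckedIOException instead of a
bare RuntimeException. A sketch of a call site that wants the original cause back
(process() is a placeholder):

    try {
        while (it.hasNext()) {
            process(it.next());
        }
    } catch (UncheckedIOException e) {
        IOException cause = e.getCause(); // getCause() is typed to return IOException
        // handle or rethrow the underlying read failure
    }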
From dfb9e1b8e6090fa0bf75442f8db0772a30df4d37 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 17:32:09 +0900
Subject: [PATCH 5/8] Add tests for error cases and fix code smells

---
 .../SentenceSplittingLazyAnalysis.java        |  2 +-
 .../com/worksap/nlp/sudachi/Tokenizer.java    |  2 +
 .../sudachi/JapaneseTokenizerStreamingTest.kt | 53 ++++++++++++++++++-
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 25 +++++++--
 4 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
index 9cc28653..3e877d84 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 51aacb01..61710b91 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -92,6 +92,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @deprecated use {@link #tokenizedSentenceIterator(SplitMode, Readable)}
      *             instead.
      */
+    @Deprecated
     Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
 
     /**
@@ -107,6 +108,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @see #tokenizeSentences(SplitMode,Reader)
      * @deprecated use {@link #tokenizedSentenceIterator(Readable)} instead.
      */
+    @Deprecated
     default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
         return tokenizeSentences(SplitMode.C, input);
     }

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index 3e8a8b1d..ec4b33d0 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023 Works Applications Co., Ltd.
+ * Copyright (c) 2023-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@ import java.io.StringReader
 import kotlin.math.min
 import kotlin.test.Test
 import kotlin.test.assertEquals
+import kotlin.test.assertFailsWith
 
 class JapaneseTokenizerStreamingTest {
   private val tokenizer = TestDictionary.user0().create()
@@ -73,6 +74,19 @@ class JapaneseTokenizerStreamingTest {
     assertEquals(5000, totalLength)
   }
 
+  @Test
+  fun callingNextWithoutTextFails() {
+    val reader = StringReader("東京")
+    val it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+
+    val morphemes = it.next()
+    assertEquals("東京", morphemes.get(0).surface())
+
+    assertFailsWith<NoSuchElementException>(
+        block = { it.next() },
+    )
+  }
+
   @Test
   fun streamingBlockingReadable() {
     val reader = BadReader("あ".repeat(5000))
     val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
-  fun streamLongTextShouldNotCauseOOM() {
+  fun streamingLongTextShouldNotCauseOOM() {
     val reader = StringReader("あ".repeat(10 * 1024 * 1024))
     val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(10 * 1024 * 1024, totalLength)
   }
+
+  class FailReader(private val data: String) : Reader() {
+
+    private var position: Int = 0
+    override fun read(cbuf: CharArray, off: Int, len: Int): Int {
+      // throws IOException after returning all the data
+      check(off >= 0)
+      check(off < cbuf.size)
+      check(len > 0)
+
+      val dataLen = data.length
+      val remaining = dataLen - position
+      if (remaining == 0) {
+        throw java.io.IOException("All data used.")
+      }
+
+      val toRead = min(remaining, len)
+      data.toCharArray(cbuf, off, position, position + toRead)
+      position += toRead
+      return toRead
+    }
+
+    override fun close() {}
+  }
+
+  @Test
+  fun failsWhenReaderFails() {
+    val reader = FailReader("あ".repeat(500))
+    // should not fail on the method call
+    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+
+    assertFailsWith<UncheckedIOException>(
+        block = { result.next() },
+    )
+  }
 }
diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 9762f559..9f23f149 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Works Applications Co., Ltd.
+ * Copyright (c) 2017-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ public class JapaneseTokenizerTest {
     JapaneseTokenizer tokenizer;
 
     @Before
-    public void setUp() throws IOException {
+    public void setUp() {
         dict = TestDictionary.INSTANCE.user1();
         tokenizer = (JapaneseTokenizer) dict.create();
     }
@@ -185,6 +185,11 @@ public void tokenizeSentencesWithSurrogatePair() {
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#tokenizeSentences(Reader)}.
+     */
+    @Deprecated
     @Test
     public void tokenizerWithReader() throws IOException {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
         Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader).iterator();
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#tokenizeSentences(Reader)}.
+     */
+    @Deprecated
     @Test
     public void tokenizerWithLongReader() throws IOException {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             sb.append("京都。");
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
         Iterator<List<Morpheme>> it = tokenizer.tokenizeSentences(reader).iterator();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
         }
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
 
+    /**
+     * @deprecated testing deprecated method
+     *             {@link Tokenizer#tokenizeSentences(Reader)}.
+     */
+    @Deprecated
     @Test
     public void tokenizerWithReaderAndNormalization() throws IOException {
         StringBuilder sb = new StringBuilder();
 
     @Test
-    public void tokenizerWithReadable() throws IOException {
+    public void tokenizerWithReadable() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
         Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
 
     @Test
-    public void tokenizerWithLongReadable() throws IOException {
+    public void tokenizerWithLongReadable() {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             sb.append("京都。");
         }
 
     @Test
-    public void tokenizerWithReadableAndNormalization() throws IOException {
+    public void tokenizerWithReadableAndNormalization() {
         StringBuilder sb = new StringBuilder();
         sb.append("東京都…。");
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {

From 403e7bcf8d0e3f974888557b3b200270d7fb5372 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Mon, 24 Jun 2024 18:28:09 +0900
Subject: [PATCH 6/8] add some more tests

---
 .../sudachi/JapaneseTokenizerStreamingTest.kt | 13 ++++++---
 .../nlp/sudachi/JapaneseTokenizerTest.java    | 29 +++++++++++++++++--
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index ec4b33d0..d5d7e9de 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -129,12 +129,17 @@ class JapaneseTokenizerStreamingTest {
 
   @Test
   fun failsWhenReaderFails() {
-    val reader = FailReader("あ".repeat(500))
-    // should not fail on the method call
-    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    var reader = FailReader("あ".repeat(500))
+    // should not fail on the instantiation
+    var it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    assertFailsWith<UncheckedIOException>(
+        block = { it.hasNext() },
+    )
 
+    reader = FailReader("あ".repeat(500))
+    it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
     assertFailsWith<UncheckedIOException>(
-        block = { result.next() },
+        block = { it.next() },
     )
   }
 }

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 9f23f149..0ab3480d 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -255,7 +255,7 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
     }
 
     @Test
-    public void tokenizerWithReadable() {
+    public void tokenizedSentenceIterator() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
         Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
+
+        reader = new StringReader("な。なに。");
+        it = tokenizer.tokenizedSentenceIterator(reader);
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(3));
+        assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizerWithLongReadable() {
+    public void tokenizedSentenceIteratorWithLongText() {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             sb.append("京都。");
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
         }
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizerWithReadableAndNormalization() {
+    public void tokenizedSentenceIteratorWithNormalization() {
         StringBuilder sb = new StringBuilder();
         sb.append("東京都…。");
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             sb.append("京都。");
         }
         StringReader reader = new StringReader(sb.toString());
         Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(5));
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             assertThat(it.hasNext(), is(true));
             List<Morpheme> ms = it.next();
             assertThat(ms.size(), is(2));
             assertThat(ms.get(0).surface(), is("京都"));
             assertThat(ms.get(1).surface(), is("。"));
         }
         assertThat(it.hasNext(), is(false));
     }
 
+    @Test
+    public void tokenizedSentenceIteratorWithSurrogatePair() {
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) {
+            sb.append("。");
+        }
+        sb.append("😀");
+        StringReader reader = new StringReader(sb.toString());
+        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
+        assertThat(it.hasNext(), is(true));
+        assertThat(it.next().size(), is(1));
+        assertThat(it.hasNext(), is(false));
+    }
+
     @Test
     public void zeroLengthMorpheme() {
         List<Morpheme> s = tokenizer.tokenize("…");
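The surrogate-pair test above is what motivates the next patch: a fixed-size
CharBuffer can fill up exactly between the two UTF-16 code units of a supplementary
character. A small standalone illustration of the hazard (not part of the patch):

    String emoji = "😀"; // U+1F600 GRINNING FACE, two UTF-16 code units
    char high = emoji.charAt(0); // '\uD83D'
    char low = emoji.charAt(1);  // '\uDE00'
    System.out.println(Character.isHighSurrogate(high));      // true
    System.out.println(Character.isSurrogatePair(high, low)); // true
    // If a read boundary falls between the two units, the lone high surrogate is
    // malformed input for the UTF-8 conversion performed by buildInputText.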
From 323e98a9c6242c8219257a22227d82fb46fb6452 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Tue, 25 Jun 2024 11:15:32 +0900
Subject: [PATCH 7/8] introduce surrogate-aware readable wrapper

---
 .../java/com/worksap/nlp/sudachi/IOTools.java | 45 +++++++++++++++++++
 .../nlp/sudachi/JapaneseTokenizer.java        |  3 +-
 .../SentenceSplittingLazyAnalysis.java        |  4 +-
 3 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/IOTools.java b/src/main/java/com/worksap/nlp/sudachi/IOTools.java
index 964988ce..712327ec 100644
--- a/src/main/java/com/worksap/nlp/sudachi/IOTools.java
+++ b/src/main/java/com/worksap/nlp/sudachi/IOTools.java
@@ -51,4 +51,49 @@ public static int readAsMuchAsCan(Readable readable, CharBuffer result) throws I
         }
         return totalRead;
     }
+
+    /**
+     * Wrapper class for Readable that uses {@link #readAsMuchAsCan} to read and
+     * guarantees that the last character read is not a high surrogate unless it is
+     * the last one in the readable.
+     */
+    public static class SurrogateAwareReadable implements Readable {
+        private Readable readable;
+        char lastTrailingHighSurrogate;
+
+        SurrogateAwareReadable(Readable input) {
+            this.readable = input;
+        }
+
+        @Override
+        public int read(CharBuffer cb) throws IOException {
+            boolean trailingKept = false;
+            if (lastTrailingHighSurrogate != 0) {
+                cb.append(lastTrailingHighSurrogate);
+                lastTrailingHighSurrogate = 0;
+                trailingKept = true;
+            }
+
+            int nread = IOTools.readAsMuchAsCan(readable, cb);
+            if (nread < 0) {
+                if (!trailingKept) {
+                    return -1;
+                }
+                // the last char in the readable is a high surrogate and there is nothing we can
+                // do.
+                return 1;
+            }
+            if (trailingKept) {
+                nread += 1;
+            }
+
+            char lastChar = cb.get(cb.position() - 1);
+            if (Character.isHighSurrogate(lastChar)) {
+                lastTrailingHighSurrogate = lastChar;
+                cb.position(cb.position() - 1);
+                nread -= 1;
+            }
+            return nread;
+        }
+    }
 }

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 7a131f36..9a06fe66 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -99,10 +99,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {
 
     @Override
     public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
+        IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
         CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
         SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);
 
-        while (IOTools.readAsMuchAsCan(reader, buffer) > 0) {
+        while (wrappedReader.read(buffer) > 0) {
             buffer.flip();
             int length = analysis.tokenizeBuffer(buffer);
             if (length < 0) {

diff --git a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
index 3e877d84..44ebfdce 100644
--- a/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
+++ b/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingLazyAnalysis.java
@@ -40,7 +40,7 @@
     SentenceSplittingLazyAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer, Readable readable) {
         this.mode = mode;
         this.tokenizer = tokenizer;
-        this.readable = readable;
+        this.readable = new IOTools.SurrogateAwareReadable(readable);
 
         this.buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
         this.buffer.flip();
         this.input = tokenizer.buildInputText("");
@@ -72,7 +72,7 @@ private int bosPosition() {
     private int reloadBuffer() throws IOException {
         buffer.position(bosPosition());
         buffer.compact();
-        int nread = IOTools.readAsMuchAsCan(readable, buffer);
+        int nread = readable.read(buffer);
         buffer.flip();
 
         // align with new buffer state
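To see what the wrapper guarantees, consider reading "あ😀" through a two-char
buffer. The trace below is a sketch of the intended behavior (the constructor is
package-private, so this only runs from within the package):

    Readable raw = new StringReader("あ😀");
    IOTools.SurrogateAwareReadable wrapped = new IOTools.SurrogateAwareReadable(raw);
    CharBuffer buf = CharBuffer.allocate(2);
    int first = wrapped.read(buf);  // 1: 'あ' is delivered, '\uD83D' is held back
    buf.clear();
    int second = wrapped.read(buf); // 2: the held surrogate plus '\uDE00'
    buf.clear();
    int last = wrapped.read(buf);   // -1: end of input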
From d8e4d165ccfa38b5fae948326e17821548ed4921 Mon Sep 17 00:00:00 2001
From: mh-northlander
Date: Tue, 25 Jun 2024 16:54:26 +0900
Subject: [PATCH 8/8] rename method

---
 .../worksap/nlp/sudachi/JapaneseTokenizer.java |  2 +-
 .../com/worksap/nlp/sudachi/Tokenizer.java     | 13 ++++++-------
 .../sudachi/JapaneseTokenizerStreamingTest.kt  | 14 ++++++++------
 .../nlp/sudachi/JapaneseTokenizerTest.java     | 18 +++++++++---------
 4 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
index 9a06fe66..201d59b5 100644
--- a/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
@@ -122,7 +122,7 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) t
     }
 
     @Override
-    public Iterator<List<Morpheme>> tokenizedSentenceIterator(SplitMode mode, Readable readable) {
+    public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
         return new SentenceSplittingLazyAnalysis(mode, this, readable);
     }
 
diff --git a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
index 61710b91..89f6adef 100644
--- a/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
+++ b/src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
@@ -89,8 +89,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @return a result of tokenizing
      * @throws IOException
      *             if reading a stream is failed
-     * @deprecated use {@link #tokenizedSentenceIterator(SplitMode, Readable)}
-     *             instead.
+     * @deprecated use {@link #lazyTokenizeSentences(SplitMode, Readable)} instead.
      */
     @Deprecated
     Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;
 
@@ -106,7 +105,7 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
      * @throws IOException
      *             if reading a stream is failed
      * @see #tokenizeSentences(SplitMode,Reader)
-     * @deprecated use {@link #tokenizedSentenceIterator(Readable)} instead.
+     * @deprecated use {@link #lazyTokenizeSentences(Readable)} instead.
     */
     @Deprecated
     default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
         return tokenizeSentences(SplitMode.C, input);
@@ -123,7 +122,7 @@ default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOExceptio
      *            a readable input text
      * @return a result of tokenizing
      */
-    Iterator<List<Morpheme>> tokenizedSentenceIterator(SplitMode mode, Readable input);
+    Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input);
 
     /**
      * Read an input text from {@code input}, divide it into sentences and tokenize
@@ -132,10 +131,10 @@ default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOExceptio
      * @param input
      *            a readable input text
      * @return a result of tokenizing
-     * @see #tokenizedSentenceIterator(SplitMode,Readable)
+     * @see #lazyTokenizeSentences(SplitMode,Readable)
      */
-    default Iterator<List<Morpheme>> tokenizedSentenceIterator(Readable input) {
-        return tokenizedSentenceIterator(SplitMode.C, input);
+    default Iterator<List<Morpheme>> lazyTokenizeSentences(Readable input) {
+        return lazyTokenizeSentences(SplitMode.C, input);
     }
 
     /**

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
index d5d7e9de..b5f3d54a 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerStreamingTest.kt
@@ -52,6 +52,7 @@ class JapaneseTokenizerStreamingTest {
 
   @Test
   fun streamingTest() {
+    // Testing deprecated method `tokenizeSentences(Reader)`
     val reader = StringReader("あ".repeat(5000))
     val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
   fun streamingTestWithBadReader() {
+    // Testing deprecated method `tokenizeSentences(Reader)`
     val reader = BadReader("あ".repeat(5000))
     val result = tokenizer.tokenizeSentences(Tokenizer.SplitMode.C, reader)
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
   fun streamingReadable() {
     val reader = StringReader("あ".repeat(5000))
-    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
   fun callingNextWithoutTextFails() {
     val reader = StringReader("東京")
-    val it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    val it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
 
     val morphemes = it.next()
     assertEquals("東京", morphemes.get(0).surface())
 
     assertFailsWith<NoSuchElementException>(
         block = { it.next() },
     )
   }
 
   @Test
   fun streamingBlockingReadable() {
     val reader = BadReader("あ".repeat(5000))
-    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(5000, totalLength)
   }
 
   @Test
   fun streamingLongTextShouldNotCauseOOM() {
     val reader = StringReader("あ".repeat(10 * 1024 * 1024))
-    val result = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader).asSequence()
+    val result = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader).asSequence()
     val totalLength = result.sumOf { sent -> sent.sumOf { mrph -> mrph.end() - mrph.begin() } }
     assertEquals(10 * 1024 * 1024, totalLength)
   }
 
   @Test
   fun failsWhenReaderFails() {
     var reader = FailReader("あ".repeat(500))
     // should not fail on the instantiation
-    var it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    var it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
     assertFailsWith<UncheckedIOException>(
         block = { it.hasNext() },
     )
 
     reader = FailReader("あ".repeat(500))
-    it = tokenizer.tokenizedSentenceIterator(Tokenizer.SplitMode.C, reader)
+    it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C, reader)
     assertFailsWith<UncheckedIOException>(
         block = { it.next() },
     )
   }
 }

diff --git a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
index 0ab3480d..a3242735 100644
--- a/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
+++ b/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerTest.java
@@ -255,7 +255,7 @@ public void tokenizerWithReaderAndNormalization() throws IOException {
     }
 
     @Test
-    public void tokenizedSentenceIterator() {
+    public void lazyTokenizeSentences() {
         StringReader reader = new StringReader("京都。東京.東京都。京都");
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(2));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
 
         reader = new StringReader("な。なに。");
-        it = tokenizer.tokenizedSentenceIterator(reader);
+        it = tokenizer.lazyTokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(3));
         assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizedSentenceIteratorWithLongText() {
+    public void lazyTokenizeSentencesWithLongText() {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             sb.append("京都。");
         }
         sb.append("京都");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT * 2 / 3; i++) {
             assertThat(it.hasNext(), is(true));
             assertThat(it.next().size(), is(2));
         }
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizedSentenceIteratorWithNormalization() {
+    public void lazyTokenizeSentencesWithNormalization() {
         StringBuilder sb = new StringBuilder();
         sb.append("東京都…。");
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             sb.append("京都。");
         }
         StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(5));
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT / 3; i++) {
             assertThat(it.hasNext(), is(true));
             List<Morpheme> ms = it.next();
             assertThat(ms.size(), is(2));
             assertThat(ms.get(0).surface(), is("京都"));
             assertThat(ms.get(1).surface(), is("。"));
         }
         assertThat(it.hasNext(), is(false));
     }
 
     @Test
-    public void tokenizedSentenceIteratorWithSurrogatePair() {
+    public void lazyTokenizeSentencesWithSurrogatePair() {
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < SentenceDetector.DEFAULT_LIMIT - 1; i++) {
             sb.append("。");
         }
         sb.append("😀");
         StringReader reader = new StringReader(sb.toString());
-        Iterator<List<Morpheme>> it = tokenizer.tokenizedSentenceIterator(reader);
+        Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(reader);
 
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(SentenceDetector.DEFAULT_LIMIT - 1));
         assertThat(it.hasNext(), is(true));
         assertThat(it.next().size(), is(1));
         assertThat(it.hasNext(), is(false));
     }
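After this rename, the series leaves the public surface as lazyTokenizeSentences. For
reference, a closing usage sketch against the renamed API (the tokenizer instance and
dictionary setup are assumed to exist; the expected sizes come from the tests above):

    Iterator<List<Morpheme>> it = tokenizer.lazyTokenizeSentences(Tokenizer.SplitMode.C,
            new StringReader("京都。東京.東京都。京都"));
    while (it.hasNext()) {
        List<Morpheme> sentence = it.next();
        System.out.println(sentence.size()); // 2, 2, 2, 1, as asserted in the tests
    }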