Lazy sentence split and tokenization by mh-northlander · Pull Request #231 · WorksApplications/Sudachi · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Lazy sentence split and tokenization #231

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 52 additions & 9 deletions src/main/java/com/worksap/nlp/sudachi/IOTools.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023 Works Applications Co., Ltd.
* Copyright (c) 2023-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,7 +17,6 @@
package com.worksap.nlp.sudachi;

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;

public class IOTools {
Expand All @@ -26,22 +25,21 @@ private IOTools() {
}

/**
* Read as much as possible from reader to the result buffer. Some readers
* perform filtering on input by reducing the number of read characters in each
* batch.
* Read as much as possible from the readable to the result buffer. Use this to
* make sure that the buffer is fulfilled or no text left unread.
*
* @param reader
* input reader
* @param readable
* input readable
* @param result
* buffer to read into
* @return number of read characters
* @throws IOException
* when read operation fails
*/
public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOException {
public static int readAsMuchAsCan(Readable readable, CharBuffer result) throws IOException {
int totalRead = 0;
while (result.hasRemaining()) {
int read = reader.read(result);
int read = readable.read(result);
if (read < 0) {
if (totalRead == 0) {
return -1;
Expand All @@ -53,4 +51,49 @@ public static int readAsMuchAsCan(Reader reader, CharBuffer result) throws IOExc
}
return totalRead;
}

/**
* Wrapper class for Readable, that uses {@link #readAsMuchAsCan} to read and
* guarantees that the last character read is not a high surrogate unless it is
* the last one in the readable.
*/< 10000 /span>
public static class SurrogateAwareReadable implements Readable {
private Readable readable;
char lastTrailingHighSurrogate;

SurrogateAwareReadable(Readable input) {
this.readable = input;
}

@Override
public int read(CharBuffer cb) throws IOException {
boolean trailingKept = false;
if (lastTrailingHighSurrogate != 0) {
cb.append(lastTrailingHighSurrogate);
lastTrailingHighSurrogate = 0;
trailingKept = true;
}

int nread = IOTools.readAsMuchAsCan(readable, cb);
if (nread < 0) {
if (!trailingKept) {
return -1;
}
// the last char in the readable is a high surrogate and there is nothing we can
// do.
return 1;
}
if (trailingKept) {
nread += 1;
}

char lastChar = cb.get(cb.position() - 1);
if (Character.isHighSurrogate(lastChar)) {
lastTrailingHighSurrogate = lastChar;
cb.position(cb.position() - 1);
nread -= 1;
}
return nread;
}
}
}
11 changes: 9 additions & 2 deletions src/main/java/com/worksap/nlp/sudachi/JapaneseTokenizer.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -23,6 +23,7 @@
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import javax.json.Json;
Expand Down Expand Up @@ -98,10 +99,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, String text) {

@Override
public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) throws IOException {
IOTools.SurrogateAwareReadable wrappedReader = new IOTools.SurrogateAwareReadable(reader);
CharBuffer buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
SentenceSplittingAnalysis analysis = new SentenceSplittingAnalysis(mode, this);

while (IOTools.readAsMuchAsCan(reader, buffer) > 0) {
while (wrappedReader.read(buffer) > 0) {
buffer.flip();
int length = analysis.tokenizeBuffer(buffer);
if (length < 0) {
Expand All @@ -119,6 +121,11 @@ public Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader reader) t
return sentences;
}

@Override
public Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable readable) {
    // Sentences are detected and tokenized on demand as the returned iterator
    // is consumed, so the whole input never has to be held in memory at once.
    return new SentenceSplittingLazyAnalysis(mode, this, readable);
}

@Override
public void setDumpOutput(PrintStream output) {
dumpOutput = output;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*
* Copyright (c) 2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.worksap.nlp.sudachi;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.CharBuffer;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

import com.worksap.nlp.sudachi.dictionary.LexiconSet;
import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;

/**
 * Provides lazy sentence split and analysis.
 *
 * <p>
 * Text is pulled from the underlying {@link Readable} one buffer at a time
 * ({@link SentenceDetector#DEFAULT_LIMIT} chars), so memory use stays bounded
 * regardless of the input length. Each {@link #next()} call returns the
 * morphemes of one detected sentence. Not thread-safe.
 */
/* internal */ class SentenceSplittingLazyAnalysis
        implements SentenceDetector.NonBreakCheker, Iterator<List<Morpheme>> {
    // detects end-of-sentence positions in the normalized text
    private final SentenceDetector detector = new SentenceDetector();

    private final Tokenizer.SplitMode mode;
    private final JapaneseTokenizer tokenizer;
    // wrapped so a read never ends between the halves of a surrogate pair
    private final Readable readable;

    SentenceSplittingLazyAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer, Readable readable) {
        this.mode = mode;
        this.tokenizer = tokenizer;
        this.readable = new IOTools.SurrogateAwareReadable(readable);

        this.buffer = CharBuffer.allocate(SentenceDetector.DEFAULT_LIMIT);
        // start in the "fully drained" state; the first hasNext()/next()
        // triggers the initial reloadBuffer() call
        this.buffer.flip();
        this.input = tokenizer.buildInputText("");
    }

    // input buffer
    private final CharBuffer buffer;
    // preprocessed InputText of the buffer.
    // used to normalize text for the sentence detection.
    private UTF8InputText input;
    // beginning-of-sentence index of next sentence in the input
    private int bos = 0;
    // normalized text left. corresponds to `input.getSubstring(bos,
    // input.getText().length())`
    private String normalized = "";

    /** Return bos position in the buffer. */
    private int bosPosition() {
        return input.textIndexToOriginalTextIndex(bos);
    }

    /**
     * Reset the buffer discarding processed text, then read from the input.
     *
     * @return the number of chars added to the buffer. -1 if input readable is at
     *         its end.
     */
    private int reloadBuffer() throws IOException {
        // drop everything before the next sentence start, keep the tail
        buffer.position(bosPosition());
        buffer.compact();
        int nread = readable.read(buffer);
        buffer.flip();

        // align with new buffer state
        input = tokenizer.buildInputText(buffer);
        bos = 0;
        normalized = input.getText();

        return nread;
    }

    /**
     * Returns true while any text remains to analyze.
     *
     * <p>
     * May trigger a buffer reload; an {@link IOException} from the underlying
     * readable is rethrown as {@link UncheckedIOException} (Iterator.hasNext
     * cannot declare checked exceptions).
     */
    @Override
    public boolean hasNext() {
        if (!normalized.isEmpty()) {
            return true;
        }

        int nread;
        try {
            nread = reloadBuffer();
        } catch (IOException e) {
            throw new UncheckedIOException(e.getMessage(), e);
        }

        // text remains unless the source hit EOF and the buffer is empty
        return !(nread < 0 && !buffer.hasRemaining());
    }

    /**
     * Tokenizes and returns the next sentence.
     *
     * @throws NoSuchElementException
     *             if no text is left to analyze
     * @throws UncheckedIOException
     *             if reading the underlying readable fails
     */
    @Override
    public MorphemeList next() {
        // getEos > 0: safe end-of-sentence found; < 0: only an unsafe candidate
        int length = detector.getEos(normalized, this);
        if (length > 0) { // sentence found
            int eos = bos + length;
            // NOTE(review): eos is an index from the start of input's text while
            // normalized.length() is the length of the remaining substring from
            // bos — confirm this comparison is intended when bos > 0.
            if (eos < normalized.length()) {
                // extend the boundary to cover chars normalization may have merged
                eos = input.getNextInOriginal(eos - 1);
                length = eos - bos;
            }
            UTF8InputText sentence = input.slice(bos, eos);
            bos = eos;
            normalized = normalized.substring(length);
            return tokenizer.tokenizeSentence(mode, sentence);
        }

        // buffer is just after reload but no (safe) eos found. need to clean it up.
        // tokenize all text in the buffer.
        if (bos == 0 && length < 0) {
            bos = normalized.length();
            normalized = "";
            return tokenizer.tokenizeSentence(mode, input);
        }

        // no sentence boundary in what is left: pull more text and retry
        int nread;
        try {
            nread = reloadBuffer();
        } catch (IOException e) {
            throw new UncheckedIOException(e.getMessage(), e);
        }

        if (nread < 0 && !buffer.hasRemaining()) {
            throw new NoSuchElementException("no texts left to analyze");
        }

        // recursive call with reloaded buffer.
        return next();
    }

    /**
     * {@link SentenceDetector.NonBreakCheker} callback: returns true when a
     * lexicon word starting shortly before the candidate end-of-sentence
     * extends across it, i.e. the candidate would split a word.
     */
    @Override
    public boolean hasNonBreakWord(int length) {
        UTF8InputText inp = input;
        int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
        byte[] bytes = inp.getByteText();
        LexiconSet lexicon = tokenizer.lexicon;
        // only words beginning within 64 bytes before the candidate EOS are
        // considered — NOTE(review): presumably an upper bound on word length
        // in bytes; confirm against the dictionary format.
        for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
            Iterator<int[]> iterator = lexicon.lookup(bytes, i);
            while (iterator.hasNext()) {
                int[] r = iterator.next();
                int l = r[1]; // end offset (in bytes) of the looked-up word
                if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
                    return true;
                }
            }
        }
        return false;
    }
}
43 changes: 38 additions & 5 deletions src/main/java/com/worksap/nlp/sudachi/Tokenizer.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,8 @@
import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.util.Iterator;
import java.util.List;

/**
* A tokenizer of morphological analysis.
Expand Down Expand Up @@ -52,7 +54,7 @@ default MorphemeList tokenize(final String text) {

/**
* Tokenize sentences. This method divide an input text into sentences and
* tokenizes them.
* tokenizes them. When the text is long, it uses a lot of memory.
*
* @param mode
* a mode of splitting
Expand All @@ -64,7 +66,7 @@ default MorphemeList tokenize(final String text) {

/**
* Tokenize sentences. Divide an input text into sentences and tokenize them
* with {@link SplitMode}.C.
* with {@link SplitMode}.C. When the text is long, it uses a lot of memory.
*
* @param text
* input text
Expand All @@ -77,7 +79,8 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {

/**
* Read an input text from {@code input}, divide it into sentences and tokenize
* them.
* them. It reads all text in the input and uses a lot of memory when the text
* is long.
*
* @param mode
* a mode of splitting
Expand All @@ -86,24 +89,54 @@ default Iterable<MorphemeList> tokenizeSentences(String text) {
* @return a result of tokenizing
* @throws IOException
* if reading a stream is failed
* @deprecated use {@link #lazyTokenizeSentences(SplitMode, Readable)} instead.
*/
@Deprecated
Iterable<MorphemeList> tokenizeSentences(SplitMode mode, Reader input) throws IOException;

/**
* Reads an input text from {@code input}, divide it into sentences and
* tokenizes them with {@link SplitMode}.C.
* tokenizes them with {@link SplitMode}.C. It reads all text in the input and
* uses a lot of memory when the text is long.
*
* @param input
* a reader of input text
* @return a result of tokenizing
* @throws IOException
* if reading a stream is failed
* @see #tokenizeSentences(SplitMode,Reader)
* @deprecated use {@link #lazyTokenizeSentences(Readable)} instead.
*/
@Deprecated
default Iterable<MorphemeList> tokenizeSentences(Reader input) throws IOException {
return tokenizeSentences(SplitMode.C, input);
}

/**
* Read an input text from {@code input}, divide it into sentences and tokenize
* them. It reads the input lazily.
*
* @param mode
* a mode of splitting
* @param input
* a readable input text
* @return a result of tokenizing
*/
Iterator<List<Morpheme>> lazyTokenizeSentences(SplitMode mode, Readable input);

/**
 * Read an input text from {@code input}, divide it into sentences and tokenize
 * them with {@link SplitMode}.C. The input is read lazily, sentence by
 * sentence, as the returned iterator is consumed.
 *
 * @param input
 *            a readable input text
 * @return a result of tokenizing
 * @see #lazyTokenizeSentences(SplitMode,Readable)
 */
default Iterator<List<Morpheme>> lazyTokenizeSentences(Readable input) {
    return lazyTokenizeSentences(SplitMode.C, input);
}

/**
* Prints lattice structure of the analysis into the passed {@link PrintStream}.
*
Expand Down
Loading
Loading
0