Update the Prepare KNBC script to break chunks by specified sequences by tushuhei · Pull Request #121 · google/budoux · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Update the Prepare KNBC script to break chunks by specified sequences #121

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 28 additions & 19 deletions scripts/prepare_knbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Loads the KNBC corpus to generate training data."""
"""Prepares a dataset from the KNBC corpus.

Before running this script, you need to download the KNBC corpus by running:

$ curl -o knbc.tar.bz2 https://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2
$ tar -xf knbc.tar.bz2

Now you should have a directory named `KNBC_v1.0_090925_utf8`.
Run the following to generate a dataset named `source_knbc.txt`.

$ python scripts/prepare_knbc.py KNBC_v1.0_090925_utf8 -o source_knbc.txt
"""

import argparse
import os
Expand Down Expand Up @@ -55,25 +66,21 @@ def handle_data(self, data: str) -> None:
self.current_word = data


def break_before_sequence(chunks: typing.List[str],
                          sequence: str) -> typing.List[str]:
    """Breaks chunks before a specified character sequence appears.

    The chunks are joined with `utils.SEP`, an extra separator is inserted in
    front of every occurrence of `sequence`, and the joined text is split on
    the separator again. Empty chunks produced along the way (e.g. when
    `sequence` already sits at the head of a chunk) are dropped.

    Args:
      chunks (List[str]): Chunks to break.
      sequence (str): A character sequence to break chunks before. An empty
        sequence leaves the chunk boundaries untouched.

    Returns:
      Processed chunks.
    """
    joined = utils.SEP.join(chunks)
    if sequence:
        # `str.replace` with an empty pattern matches between every character,
        # which would shatter the text into single characters, so skip it.
        joined = joined.replace(sequence, utils.SEP + sequence)
    return [chunk for chunk in joined.split(utils.SEP) if chunk]


def postprocess(chunks: typing.List[str]) -> typing.List[str]:
    """Applies the chunk adjustments used for this dataset.

    Breaks are inserted before every open parenthesis `(` and before every
    occurrence of `もら`.

    Args:
      chunks (List[str]): Chunks to process.

    Returns:
      Processed chunks.
    """
    chunks = break_before_sequence(chunks, '(')
    chunks = break_before_sequence(chunks, 'もら')
    return chunks


def parse_args(
        argv: typing.Optional[typing.List[str]] = None) -> argparse.Namespace:
    """Parses the command-line arguments of this script.

    Args:
      argv (Optional[List[str]]): Arguments to parse. When omitted, argparse
        falls back to the process command line. (default: None)

    Returns:
      Parsed arguments exposing the `source_dir` and `outfile` attributes.
    """
    default_out_path = 'source.txt'
    # RawTextHelpFormatter keeps the line breaks of the description as written
    # instead of re-wrapping them in the --help output.
    parser = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('source_dir', help='Path to the KNBC corpus directory.')
    parser.add_argument(
        '-o',
        '--outfile',
        help=f'File path to the output dataset. (default: {default_out_path})',
        default=default_out_path)
    return parser.parse_args(argv)


Expand Down
42 changes: 42 additions & 0 deletions scripts/tests/test_prepare_knbc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the prepare KNBC script."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from scripts import prepare_knbc # type: ignore # noqa (module hack)


class TestBreakBeforeSequence(unittest.TestCase):
    """Tests for `prepare_knbc.break_before_sequence`."""

    def test_standard(self) -> None:
        """A hit in the middle of a chunk splits that chunk in two."""
        chunks = ['abcdef', 'ghi']
        result = prepare_knbc.break_before_sequence(chunks, 'de')
        self.assertListEqual(result, ['abc', 'def', 'ghi'])

    def test_sequence_on_top(self) -> None:
        """A hit at the head of a chunk must not produce an empty chunk."""
        chunks = ['abcdef', 'ghi']
        result = prepare_knbc.break_before_sequence(chunks, 'gh')
        self.assertListEqual(result, ['abcdef', 'ghi'])

    def test_multiple_hit(self) -> None:
        """Every occurrence inside a chunk yields its own break."""
        chunks = ['abcabc', 'def']
        result = prepare_knbc.break_before_sequence(chunks, 'bc')
        self.assertListEqual(result, ['a', 'bca', 'bc', 'def'])

    def test_no_hit(self) -> None:
        """Chunks are returned unchanged when the sequence never occurs."""
        chunks = ['abc', 'def']
        result = prepare_knbc.break_before_sequence(chunks, 'xy')
        self.assertListEqual(result, ['abc', 'def'])
0