Improve KNBC HTML Parser by tushuhei · Pull Request #137 · google/budoux · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Improve KNBC HTML Parser #137

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 43 additions & 17 deletions scripts/prepare_knbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,36 +33,62 @@


class KNBCHTMLParser(HTMLParser):
    """Parses the HTML files in the KNBC corpus to collect chunks.

    The parser walks the table rows of a document. A row whose cell carries
    a split marker ``id`` starts a new chunk; a row with exactly five cells
    contributes the text of its first cell to the current chunk.

    Attributes:
        chunks: The collected chunks.
        row: The current row index (1-based once the first ``<tr>`` is seen).
        col: The current column index within the row (1-based).
        current_word: The current word to process.
        on_split_row: Whether the scan is on the splitting row.
        split_tab: Whether to split by tags in addition to Bunsetsu.
    """

    BUNSETSU_SPLIT_ID = 'bnst-kugiri'
    TAG_SPLIT_ID = 'tag-kugiri'

    def __init__(self, split_tab: bool = False) -> None:
        """Initializes the HTML parser for the KNBC corpus.

        Args:
            split_tab: Split by tags in addition to Bunsetsu. (default: False)
        """
        super().__init__()
        self.chunks = ['']
        self.row = 0
        self.col = 0
        self.current_word = ''
        self.on_split_row = False
        self.split_tab = split_tab

    def handle_starttag(
        self, tag: str,
        attributes: typing.List[typing.Tuple[str, typing.Optional[str]]]
    ) -> None:
        """Tracks the row/column position and detects splitting rows.

        Args:
            tag: The name of the opening tag.
            attributes: The ``(name, value)`` pairs of the tag's attributes.
        """
        if tag == 'tr':
            # A new row begins: advance the row counter and reset all
            # per-row state.
            self.row += 1
            self.col = 0
            self.current_word = ''
            self.on_split_row = False

        if tag == 'td':
            self.col += 1
            for name, value in attributes:
                if name != 'id':
                    continue
                # Bunsetsu markers always split; tag markers split only
                # when `split_tab` is enabled.
                if value == self.BUNSETSU_SPLIT_ID or (
                        self.split_tab and value == self.TAG_SPLIT_ID):
                    self.on_split_row = True

    def handle_endtag(self, tag: str) -> None:
        """Finalizes a row: starts a new chunk or extends the current one.

        Args:
            tag: The name of the closing tag.
        """
        if tag != 'tr':  # Skip all tags but TR.
            return None
        if self.row < 3:  # Skip the first two rows.
            return None
        if self.on_split_row:
            self.chunks.append('')
            return None
        if self.col == 5:
            # Only rows with exactly five cells contribute their word.
            self.chunks[-1] += self.current_word
        return None

    def handle_data(self, data: str) -> None:
        """Records the text seen while the scan is in the first column.

        Args:
            data: The text content reported by the parser.
        """
        if self.col == 1:
            self.current_word = data


Expand Down
30 changes: 30 additions & 0 deletions scripts/tests/test_prepare_knbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,33 @@ def test_multiple_hit(self) -> None:
chunks = ['abcabc', 'def']
result = prepare_knbc.break_before_sequence(chunks, 'bc')
self.assertListEqual(result, ['a', 'bca', 'bc', 'def'])


class TestKNBCHTMLParser(unittest.TestCase):
    """Tests for `prepare_knbc.KNBCHTMLParser` against a small sample table."""

    # Sample table: one header row, then Bunsetsu ('bnst-kugiri') and tag
    # ('tag-kugiri') split rows interleaved with five-cell word rows.
    example_html = '''
<html>
<body>
<table>
<tr><th>HA</th><th>HB</th><th>HC</th><th>HD</th><th>HE</th></tr>
<tr><td colspan="5" id="bnst-kugiri"><a>文節区切り</a></td></tr>
<tr><td>abc</td><td></td><td></td><td></td><td></td></tr>
<tr><td>de</td><td></td><td></td><td></td><td></td></tr>
<tr><td colspan="5" id="tag-kugiri"><a>タグ区切り</a></td></tr>
<tr><td>fgh</td><td></td><td></td><td></td><td> </td></tr>
<tr><td>ijkl</td><td></td><td></td><td></td><td> </td></tr>
<tr><td colspan="5" id="bnst-kugiri"><a>文節区切り</a></td></tr>
<tr><td>mn</td><td></td><td></td><td></td><td> </td></tr>
</table>
</body>
</html>
'''

    def test_parse(self) -> None:
        """Expects only Bunsetsu rows to split when the parser gets False."""
        parser = prepare_knbc.KNBCHTMLParser(False)
        parser.feed(self.example_html)
        self.assertListEqual(parser.chunks, ['abcdefghijkl', 'mn'])

    def test_parse_split_tags(self) -> None:
        """Expects tag rows to split as well when the parser gets True."""
        parser = prepare_knbc.KNBCHTMLParser(True)
        parser.feed(self.example_html)
        self.assertListEqual(parser.chunks, ['abcde', 'fghijkl', 'mn'])
0