From 3a0d5934c0c915b5dd7044f89d45a853d7fa2759 Mon Sep 17 00:00:00 2001
From: Junseong Kim
Date: Mon, 5 Apr 2021 14:54:04 +0900
Subject: [PATCH 01/12] refactor: project clean up

---
 .circleci/config.yml | 98 ----------
 .gitignore | 171 ------------------
 LICENSE | 201 ---------------------
 Makefile | 23 ++-
 README.md | 126 +++----------
 bert/__init__.py | 2 +
 bert_pytorch/__init__.py | 1 -
 bert_pytorch/__main__.py | 71 --------
 bert_pytorch/dataset/__init__.py | 2 -
 bert_pytorch/dataset/dataset.py | 125 -------------
 bert_pytorch/dataset/vocab.py | 185 -------------------
 bert_pytorch/model/__init__.py | 2 -
 bert_pytorch/model/attention/__init__.py | 2 -
 bert_pytorch/model/attention/multi_head.py | 37 ----
 bert_pytorch/model/attention/single.py | 25 ---
 bert_pytorch/model/bert.py | 48 -----
 bert_pytorch/model/embedding/__init__.py | 1 -
 bert_pytorch/model/embedding/bert.py | 32 ----
 bert_pytorch/model/embedding/position.py | 25 ---
 bert_pytorch/model/embedding/segment.py | 6 -
 bert_pytorch/model/embedding/token.py | 6 -
 bert_pytorch/model/language_model.py | 61 -------
 bert_pytorch/model/transformer.py | 31 ----
 bert_pytorch/model/utils/__init__.py | 4 -
 bert_pytorch/model/utils/feed_forward.py | 16 --
 bert_pytorch/model/utils/gelu.py | 12 --
 bert_pytorch/model/utils/layer_norm.py | 17 --
 bert_pytorch/model/utils/sublayer.py | 18 --
 bert_pytorch/trainer/__init__.py | 1 -
 bert_pytorch/trainer/optim_schedule.py | 35 ----
 bert_pytorch/trainer/pretrain.py | 151 ----------------
 pyproject.toml | 4 +
 requirements-dev.txt | 10 +
 requirements.txt | 3 -
 scripts/run_sample.py | 5 +
 setup.cfg | 11 ++
 setup.py | 60 +-----
 test.py | 6 -
 tests/test_sample.py | 2 +
 39 files changed, 91 insertions(+), 1545 deletions(-)
 delete mode 100644 .circleci/config.yml
 delete mode 100644 .gitignore
 delete mode 100644 LICENSE
 create mode 100644 bert/__init__.py
 delete mode 100644 bert_pytorch/__init__.py
 delete mode 100644 bert_pytorch/__main__.py
 delete mode 100644 bert_pytorch/dataset/__init__.py
 delete mode 100644 bert_pytorch/dataset/dataset.py
 delete mode 100644 bert_pytorch/dataset/vocab.py
 delete mode 100644 bert_pytorch/model/__init__.py
 delete mode 100644 bert_pytorch/model/attention/__init__.py
 delete mode 100644 bert_pytorch/model/attention/multi_head.py
 delete mode 100644 bert_pytorch/model/attention/single.py
 delete mode 100644 bert_pytorch/model/bert.py
 delete mode 100644 bert_pytorch/model/embedding/__init__.py
 delete mode 100644 bert_pytorch/model/embedding/bert.py
 delete mode 100644 bert_pytorch/model/embedding/position.py
 delete mode 100644 bert_pytorch/model/embedding/segment.py
 delete mode 100644 bert_pytorch/model/embedding/token.py
 delete mode 100644 bert_pytorch/model/language_model.py
 delete mode 100644 bert_pytorch/model/transformer.py
 delete mode 100644 bert_pytorch/model/utils/__init__.py
 delete mode 100644 bert_pytorch/model/utils/feed_forward.py
 delete mode 100644 bert_pytorch/model/utils/gelu.py
 delete mode 100644 bert_pytorch/model/utils/layer_norm.py
 delete mode 100644 bert_pytorch/model/utils/sublayer.py
 delete mode 100644 bert_pytorch/trainer/__init__.py
 delete mode 100644 bert_pytorch/trainer/optim_schedule.py
 delete mode 100644 bert_pytorch/trainer/pretrain.py
 create mode 100644 pyproject.toml
 create mode 100644 requirements-dev.txt
 create mode 100644 scripts/run_sample.py
 create mode 100644 setup.cfg
 delete mode 100644 test.py
 create mode 100644 tests/test_sample.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index a70fba8..0000000
---
a/.circleci/config.yml +++ /dev/null @@ -1,98 +0,0 @@ -version: 2 -jobs: - build: - docker: - - image: circleci/python:3.6.1 - - working_directory: ~/repo - - steps: - - checkout - - - restore_cache: - keys: - - v1-dependencies-{{ checksum "requirements.txt" }} - - v1-dependencies- - - - run: - name: install dependencies - command: | - python3 -m venv venv - . venv/bin/activate - pip install -r requirements.txt - - - save_cache: - paths: - - ./venv - key: v1-dependencies-{{ checksum "requirements.txt" }} - - - run: - name: run tests - command: | - . venv/bin/activate - python -m unittest test.py - - - store_artifacts: - path: test-reports - destination: test-reports - - deploy: - docker: - - image: circleci/python:3.6.1 - - working_directory: ~/repo - - steps: - - checkout - - - restore_cache: - key: v1-dependency-cache-{{ checksum "setup.py" }}-{{ checksum "Makefile" }} - - - run: - name: verify git tag vs. version - command: | - python3 -m venv venv - . venv/bin/activate - python setup.py verify - pip install twine - - - save_cache: - key: v1-dependency-cache-{{ checksum "setup.py" }}-{{ checksum "Makefile" }} - paths: - - "venv" - - # Deploying to PyPI - # for pip install kor2vec - - run: - name: init .pypirc - command: | - echo -e "[pypi]" >> ~/.pypirc - echo -e "username = codertimo" >> ~/.pypirc - echo -e "password = $PYPI_PASSWORD" >> ~/.pypirc - - - run: - name: create packages - command: | - make package - - - run: - name: upload to pypi - command: | - . venv/bin/activate - twine upload dist/* -workflows: - version: 2 - build_and_deploy: - jobs: - - build: - filters: - tags: - only: /.*/ - - deploy: - requires: - - build - filters: - tags: - only: /.*/ - branches: - ignore: /.*/ diff --git a/.gitignore b/.gitignore deleted file mode 100644 index d892627..0000000 --- a/.gitignore +++ /dev/null @@ -1,171 +0,0 @@ -data/ -output/ - -# Created by .ignore support plugin (hsz.mobi) -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# pyenv -.python-version - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -### JetBrains template -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm -# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 - -# User-specific stuff -.idea/**/workspace.xml -.idea/**/tasks.xml -.idea/**/usage.statistics.xml -.idea/**/dictionaries -.idea/**/shelf - -# Sensitive or high-churn files -.idea/**/dataSources/ -.idea/**/dataSources.ids -.idea/**/dataSources.local.xml -.idea/**/sqlDataSources.xml -.idea/**/dynamic.xml -.idea/**/uiDesigner.xml -.idea/**/dbnavigator.xml - -# Gradle -.idea/**/gradle.xml -.idea/**/libraries - -# Gradle and Maven with auto-import -# When using Gradle or Maven with auto-import, you should exclude module files, -# since they will be recreated, and may cause churn. Uncomment if using -# auto-import. -# .idea/modules.xml -# .idea/*.iml -# .idea/modules - -# CMake -cmake-build-*/ - -# Mongo Explorer plugin -.idea/**/mongoSettings.xml - -# File-based project format -*.iws - -# IntelliJ -out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Cursive Clojure plugin -.idea/replstate.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties -fabric.properties - -# Editor-based Rest Client -.idea/httpRequests - diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 240374c..0000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. 
- - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2018 Junseong Kim, Scatter Lab, BERT contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
diff --git a/Makefile b/Makefile index 09cdbf3..f932fe2 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,20 @@ -package: - python setup.py sdist - python setup.py bdist_wheel +.PHONY: style quality test test-cov + +check_dirs := bert/ scripts/ tests/ +test_dirs := bert/ + +style: + black $(check_dirs) + isort $(check_dirs) + flake8 $(check_dirs) + +quality: + black --check $(check_dirs) + isort --check-only $(check_dirs) + flake8 $(check_dirs) + +test: + pytest + +test-cov: + pytest --cov-branch --cov $(test_dirs) diff --git a/README.md b/README.md index ced3be7..e512379 100644 --- a/README.md +++ b/README.md @@ -1,120 +1,50 @@ -# BERT-pytorch +# REPO-README -[![LICENSE](https://img.shields.io/github/license/codertimo/BERT-pytorch.svg)](https://github.com/codertimo/BERT-pytorch/blob/master/LICENSE) -![GitHub issues](https://img.shields.io/github/issues/codertimo/BERT-pytorch.svg) -[![GitHub stars](https://img.shields.io/github/stars/codertimo/BERT-pytorch.svg)](https://github.com/codertimo/BERT-pytorch/stargazers) -[![CircleCI](https://circleci.com/gh/codertimo/BERT-pytorch.svg?style=shield)](https://circleci.com/gh/codertimo/BERT-pytorch) -[![PyPI](https://img.shields.io/pypi/v/bert-pytorch.svg)](https://pypi.org/project/bert_pytorch/) -[![PyPI - Status](https://img.shields.io/pypi/status/bert-pytorch.svg)](https://pypi.org/project/bert_pytorch/) -[![Documentation Status](https://readthedocs.org/projects/bert-pytorch/badge/?version=latest)](https://bert-pytorch.readthedocs.io/en/latest/?badge=latest) +Template for my python projects -Pytorch implementation of Google AI's 2018 BERT, with simple annotation +## Template Replace Check-List -> BERT 2018 BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding -> Paper URL : https://arxiv.org/abs/1810.04805 +- [ ] Make your own package name 👋 +- [ ] Replace `package/` to new package name 🎉 +- [ ] Replace command in `.github/workflows/main.yml` with new package name 🔨 +- [ ] Replace command in `Makefile` with new package name +- [ ] Replace name, description, author etc in `setup.py` with new package setting 🏄‍♂️ +- [ ] Replace author, version in `package/__init__.py` to new package name +- [ ] Setting codecov (https://docs.codecov.io/docs/quick-start) to your repo +- [ ] Make REAL runnable code 👨‍💻 +- [ ] Make REAL test code 👩🏻‍💻 +- [ ] Remove this README and make your own story! 👍 +## Run Scripts -## Introduction +All runnable python scripts should be located in `scripts/` folder -Google AI's BERT paper shows the amazing result on various NLP task (new 17 NLP tasks SOTA), -including outperform the human F1 score on SQuAD v1.1 QA task. -This paper proved that Transformer(self-attention) based encoder can be powerfully used as -alternative of previous language model with proper language model training method. -And more importantly, they showed us that this pre-trained language model can be transfer -into any NLP task without making task specific model architecture. +And you can run the scripts through under command -This amazing result would be record in NLP history, -and I expect many further papers about BERT will be published very soon. - -This repo is implementation of BERT. Code is very simple and easy to understand fastly. -Some of these codes are based on [The Annotated Transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html) - -Currently this project is working on progress. And the code is not verified yet. 
-
-## Installation
-```
-pip install bert-pytorch
+```shell
+python -m scripts.run_sample
 ```
-## Quickstart
+## Run Linting
-**NOTICE : Your corpus should be prepared with two sentences in one line with tab(\t) separator**
+This project uses three linters: `black`, `isort`, `flake8`
-### 0. Prepare your corpus
-```
-Welcome to the \t the jungle\n
-I can stay \t here all night\n
 ```
+# use the linters to fix code format
+make style
-or tokenized corpus (tokenization is not in package)
-```
-Wel_ _come _to _the \t _the _jungle\n
-_I _can _stay \t _here _all _night\n
+# check for lint errors
+make quality
 ```
+## Run Test
-### 1. Building vocab based on your corpus
-```shell
-bert-vocab -c data/corpus.small -o data/vocab.small
-```
+All runnable test code should be located in the `tests/` folder
-### 2. Train your own BERT model
 ```shell
-bert -c data/corpus.small -v data/vocab.small -o output/bert.model
-```
-
-## Language Model Pre-training
-
-In the paper, authors shows the new language model training methods,
-which are "masked language model" and "predict next sentence".
-
-
-### Masked Language Model
-
-> Original Paper : 3.3.1 Task #1: Masked LM
-
-```
-Input Sequence : The man went to [MASK] store with [MASK] dog
-Target Sequence : the his
+pytest
 ```
-#### Rules:
-Randomly 15% of input token will be changed into something, based on under sub-rules
-
-1. Randomly 80% of tokens, gonna be a `[MASK]` token
-2. Randomly 10% of tokens, gonna be a `[RANDOM]` token(another word)
-3. Randomly 10% of tokens, will be remain as same. But need to be predicted.
-
-### Predict Next Sentence
-
-> Original Paper : 3.3.2 Task #2: Next Sentence Prediction
-
-```
-Input : [CLS] the man went to the store [SEP] he bought a gallon of milk [SEP]
-Label : Is Next
-
-Input = [CLS] the man heading to the store [SEP] penguin [MASK] are flight ##less birds [SEP]
-Label = NotNext
-```
-
-"Is this sentence can be continuously connected?"
-
- understanding the relationship, between two text sentences, which is
-not directly captured by language modeling
-
-#### Rules:
-
-1. Randomly 50% of next sentence, gonna be continuous sentence.
-2. Randomly 50% of next sentence, gonna be unrelated sentence.
- - ## Author -Junseong Kim, Scatter Lab (codertimo@gmail.com / junseong.kim@scatterlab.co.kr) - -## License - -This project following Apache 2.0 License as written in LICENSE file - -Copyright 2018 Junseong Kim, Scatter Lab, respective BERT contributors -Copyright (c) 2018 Alexander Rush : [The Annotated Trasnformer](https://github.com/harvardnlp/annotated-transformer) +by @codertimo diff --git a/bert/__init__.py b/bert/__init__.py new file mode 100644 index 0000000..86fe2b4 --- /dev/null +++ b/bert/__init__.py @@ -0,0 +1,2 @@ +__version__ = "0.0.1" +__author__ = "Junseong Kim" diff --git a/bert_pytorch/__init__.py b/bert_pytorch/__init__.py deleted file mode 100644 index 478ced5..0000000 --- a/bert_pytorch/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .model import BERT diff --git a/bert_pytorch/__main__.py b/bert_pytorch/__main__.py deleted file mode 100644 index d4193f2..0000000 --- a/bert_pytorch/__main__.py +++ /dev/null @@ -1,71 +0,0 @@ -import argparse - -from torch.utils.data import DataLoader - -from .model import BERT -from .trainer import BERTTrainer -from .dataset import BERTDataset, WordVocab - - -def train(): - parser = argparse.ArgumentParser() - - parser.add_argument("-c", "--train_dataset", required=True, type=str, help="train dataset for train bert") - parser.add_argument("-t", "--test_dataset", type=str, default=None, help="test set for evaluate train set") - parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with bert-vocab") - parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model") - - parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model") - parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers") - parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads") - parser.add_argument("-s", "--seq_len", type=int, default=20, help="maximum sequence len") - - parser.add_argument("-b", "--batch_size", type=int, default=64, help="number of batch_size") - parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs") - parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size") - - parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false") - parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n") - parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus") - parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids") - parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false") - - parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam") - parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam") - parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value") - parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam first beta value") - - args = parser.parse_args() - - print("Loading Vocab", args.vocab_path) - vocab = WordVocab.load_vocab(args.vocab_path) - print("Vocab Size: ", len(vocab)) - - print("Loading Train Dataset", args.train_dataset) - train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len, - corpus_lines=args.corpus_lines, on_memory=args.on_memory) - - print("Loading Test 
Dataset", args.test_dataset) - test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \ - if args.test_dataset is not None else None - - print("Creating Dataloader") - train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers) - test_data_loader = DataLoader(test_dataset, batch_size=args.batch_size, num_workers=args.num_workers) \ - if test_dataset is not None else None - - print("Building BERT model") - bert = BERT(len(vocab), hidden=args.hidden, n_layers=args.layers, attn_heads=args.attn_heads) - - print("Creating BERT Trainer") - trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader, - lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay, - with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq) - - print("Training Start") - for epoch in range(args.epochs): - trainer.train(epoch) - trainer.save(epoch, args.output_path) - - if test_data_loader is not None: - trainer.test(epoch) diff --git a/bert_pytorch/dataset/__init__.py b/bert_pytorch/dataset/__init__.py deleted file mode 100644 index 90e9036..0000000 --- a/bert_pytorch/dataset/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .dataset import BERTDataset -from .vocab import WordVocab diff --git a/bert_pytorch/dataset/dataset.py b/bert_pytorch/dataset/dataset.py deleted file mode 100644 index 7d787f3..0000000 --- a/bert_pytorch/dataset/dataset.py +++ /dev/null @@ -1,125 +0,0 @@ -from torch.utils.data import Dataset -import tqdm -import torch -import random - - -class BERTDataset(Dataset): - def __init__(self, corpus_path, vocab, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True): - self.vocab = vocab - self.seq_len = seq_len - - self.on_memory = on_memory - self.corpus_lines = corpus_lines - self.corpus_path = corpus_path - self.encoding = encoding - - with open(corpus_path, "r", encoding=encoding) as f: - if self.corpus_lines is None and not on_memory: - for _ in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines): - self.corpus_lines += 1 - - if on_memory: - self.lines = [line[:-1].split("\t") - for line in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines)] - self.corpus_lines = len(self.lines) - - if not on_memory: - self.file = open(corpus_path, "r", encoding=encoding) - self.random_file = open(corpus_path, "r", encoding=encoding) - - for _ in range(random.randint(self.corpus_lines if self.corpus_lines < 1000 else 1000)): - self.random_file.__next__() - - def __len__(self): - return self.corpus_lines - - def __getitem__(self, item): - t1, t2, is_next_label = self.random_sent(item) - t1_random, t1_label = self.random_word(t1) - t2_random, t2_label = self.random_word(t2) - - # [CLS] tag = SOS tag, [SEP] tag = EOS tag - t1 = [self.vocab.sos_index] + t1_random + [self.vocab.eos_index] - t2 = t2_random + [self.vocab.eos_index] - - t1_label = [self.vocab.pad_index] + t1_label + [self.vocab.pad_index] - t2_label = t2_label + [self.vocab.pad_index] - - segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len] - bert_input = (t1 + t2)[:self.seq_len] - bert_label = (t1_label + t2_label)[:self.seq_len] - - padding = [self.vocab.pad_index for _ in range(self.seq_len - len(bert_input))] - bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding) - - output = {"bert_input": bert_input, - "bert_label": bert_label, - "segment_label": segment_label, - "is_next": 
is_next_label} - - return {key: torch.tensor(value) for key, value in output.items()} - - def random_word(self, sentence): - tokens = sentence.split() - output_label = [] - - for i, token in enumerate(tokens): - prob = random.random() - if prob < 0.15: - prob /= 0.15 - - # 80% randomly change token to mask token - if prob < 0.8: - tokens[i] = self.vocab.mask_index - - # 10% randomly change token to random token - elif prob < 0.9: - tokens[i] = random.randrange(len(self.vocab)) - - # 10% randomly change token to current token - else: - tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index) - - output_label.append(self.vocab.stoi.get(token, self.vocab.unk_index)) - - else: - tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index) - output_label.append(0) - - return tokens, output_label - - def random_sent(self, index): - t1, t2 = self.get_corpus_line(index) - - # output_text, label(isNotNext:0, isNext:1) - if random.random() > 0.5: - return t1, t2, 1 - else: - return t1, self.get_random_line(), 0 - - def get_corpus_line(self, item): - if self.on_memory: - return self.lines[item][0], self.lines[item][1] - else: - line = self.file.__next__() - if line is None: - self.file.close() - self.file = open(self.corpus_path, "r", encoding=self.encoding) - line = self.file.__next__() - - t1, t2 = line[:-1].split("\t") - return t1, t2 - - def get_random_line(self): - if self.on_memory: - return self.lines[random.randrange(len(self.lines))][1] - - line = self.file.__next__() - if line is None: - self.file.close() - self.file = open(self.corpus_path, "r", encoding=self.encoding) - for _ in range(random.randint(self.corpus_lines if self.corpus_lines < 1000 else 1000)): - self.random_file.__next__() - line = self.random_file.__next__() - return line[:-1].split("\t")[1] diff --git a/bert_pytorch/dataset/vocab.py b/bert_pytorch/dataset/vocab.py deleted file mode 100644 index f7346a7..0000000 --- a/bert_pytorch/dataset/vocab.py +++ /dev/null @@ -1,185 +0,0 @@ -import pickle -import tqdm -from collections import Counter - - -class TorchVocab(object): - """Defines a vocabulary object that will be used to numericalize a field. - Attributes: - freqs: A collections.Counter object holding the frequencies of tokens - in the data used to build the Vocab. - stoi: A collections.defaultdict instance mapping token strings to - numerical identifiers. - itos: A list of token strings indexed by their numerical identifiers. - """ - - def __init__(self, counter, max_size=None, min_freq=1, specials=['', ''], - vectors=None, unk_init=None, vectors_cache=None): - """Create a Vocab object from a collections.Counter. - Arguments: - counter: collections.Counter object holding the frequencies of - each value found in the data. - max_size: The maximum size of the vocabulary, or None for no - maximum. Default: None. - min_freq: The minimum frequency needed to include a token in the - vocabulary. Values less than 1 will be set to 1. Default: 1. - specials: The list of special tokens (e.g., padding or eos) that - will be prepended to the vocabulary in addition to an - token. Default: [''] - vectors: One of either the available pretrained vectors - or custom pretrained vectors (see Vocab.load_vectors); - or a list of aforementioned vectors - unk_init (callback): by default, initialize out-of-vocabulary word vectors - to zero vectors; can be any function that takes in a Tensor and - returns a Tensor of the same size. Default: torch.Tensor.zero_ - vectors_cache: directory for cached vectors. 
Default: '.vector_cache' - """ - self.freqs = counter - counter = counter.copy() - min_freq = max(min_freq, 1) - - self.itos = list(specials) - # frequencies of special tokens are not counted when building vocabulary - # in frequency order - for tok in specials: - del counter[tok] - - max_size = None if max_size is None else max_size + len(self.itos) - - # sort by frequency, then alphabetically - words_and_frequencies = sorted(counter.items(), key=lambda tup: tup[0]) - words_and_frequencies.sort(key=lambda tup: tup[1], reverse=True) - - for word, freq in words_and_frequencies: - if freq < min_freq or len(self.itos) == max_size: - break - self.itos.append(word) - - # stoi is simply a reverse dict for itos - self.stoi = {tok: i for i, tok in enumerate(self.itos)} - - self.vectors = None - if vectors is not None: - self.load_vectors(vectors, unk_init=unk_init, cache=vectors_cache) - else: - assert unk_init is None and vectors_cache is None - - def __eq__(self, other): - if self.freqs != other.freqs: - return False - if self.stoi != other.stoi: - return False - if self.itos != other.itos: - return False - if self.vectors != other.vectors: - return False - return True - - def __len__(self): - return len(self.itos) - - def vocab_rerank(self): - self.stoi = {word: i for i, word in enumerate(self.itos)} - - def extend(self, v, sort=False): - words = sorted(v.itos) if sort else v.itos - for w in words: - if w not in self.stoi: - self.itos.append(w) - self.stoi[w] = len(self.itos) - 1 - - -class Vocab(TorchVocab): - def __init__(self, counter, max_size=None, min_freq=1): - self.pad_index = 0 - self.unk_index = 1 - self.eos_index = 2 - self.sos_index = 3 - self.mask_index = 4 - super().__init__(counter, specials=["", "", "", "", ""], - max_size=max_size, min_freq=min_freq) - - def to_seq(self, sentece, seq_len, with_eos=False, with_sos=False) -> list: - pass - - def from_seq(self, seq, join=False, with_pad=False): - pass - - @staticmethod - def load_vocab(vocab_path: str) -> 'Vocab': - with open(vocab_path, "rb") as f: - return pickle.load(f) - - def save_vocab(self, vocab_path): - with open(vocab_path, "wb") as f: - pickle.dump(self, f) - - -# Building Vocab with text files -class WordVocab(Vocab): - def __init__(self, texts, max_size=None, min_freq=1): - print("Building Vocab") - counter = Counter() - for line in tqdm.tqdm(texts): - if isinstance(line, list): - words = line - else: - words = line.replace("\n", "").replace("\t", "").split() - - for word in words: - counter[word] += 1 - super().__init__(counter, max_size=max_size, min_freq=min_freq) - - def to_seq(self, sentence, seq_len=None, with_eos=False, with_sos=False, with_len=False): - if isinstance(sentence, str): - sentence = sentence.split() - - seq = [self.stoi.get(word, self.unk_index) for word in sentence] - - if with_eos: - seq += [self.eos_index] # this would be index 1 - if with_sos: - seq = [self.sos_index] + seq - - origin_seq_len = len(seq) - - if seq_len is None: - pass - elif len(seq) <= seq_len: - seq += [self.pad_index for _ in range(seq_len - len(seq))] - else: - seq = seq[:seq_len] - - return (seq, origin_seq_len) if with_len else seq - - def from_seq(self, seq, join=False, with_pad=False): - words = [self.itos[idx] - if idx < len(self.itos) - else "<%d>" % idx - for idx in seq - if not with_pad or idx != self.pad_index] - - return " ".join(words) if join else words - - @staticmethod - def load_vocab(vocab_path: str) -> 'WordVocab': - with open(vocab_path, "rb") as f: - return pickle.load(f) - - -def build(): - import 
argparse - - parser = argparse.ArgumentParser() - parser.add_argument("-c", "--corpus_path", required=True, type=str) - parser.add_argument("-o", "--output_path", required=True, type=str) - parser.add_argument("-s", "--vocab_size", type=int, default=None) - parser.add_argument("-e", "--encoding", type=str, default="utf-8") - parser.add_argument("-m", "--min_freq", type=int, default=1) - args = parser.parse_args() - - with open(args.corpus_path, "r", encoding=args.encoding) as f: - vocab = WordVocab(f, max_size=args.vocab_size, min_freq=args.min_freq) - - print("VOCAB SIZE:", len(vocab)) - vocab.save_vocab(args.output_path) diff --git a/bert_pytorch/model/__init__.py b/bert_pytorch/model/__init__.py deleted file mode 100644 index aa318cb..0000000 --- a/bert_pytorch/model/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .bert import BERT -from .language_model import BERTLM diff --git a/bert_pytorch/model/attention/__init__.py b/bert_pytorch/model/attention/__init__.py deleted file mode 100644 index 6a39ec1..0000000 --- a/bert_pytorch/model/attention/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .multi_head import MultiHeadedAttention -from .single import Attention diff --git a/bert_pytorch/model/attention/multi_head.py b/bert_pytorch/model/attention/multi_head.py deleted file mode 100644 index c8a47f9..0000000 --- a/bert_pytorch/model/attention/multi_head.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch.nn as nn -from .single import Attention - - -class MultiHeadedAttention(nn.Module): - """ - Take in model size and number of heads. - """ - - def __init__(self, h, d_model, dropout=0.1): - super().__init__() - assert d_model % h == 0 - - # We assume d_v always equals d_k - self.d_k = d_model // h - self.h = h - - self.linear_layers = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(3)]) - self.output_linear = nn.Linear(d_model, d_model) - self.attention = Attention() - - self.dropout = nn.Dropout(p=dropout) - - def forward(self, query, key, value, mask=None): - batch_size = query.size(0) - - # 1) Do all the linear projections in batch from d_model => h x d_k - query, key, value = [l(x).view(batch_size, -1, self.h, self.d_k).transpose(1, 2) - for l, x in zip(self.linear_layers, (query, key, value))] - - # 2) Apply attention on all the projected vectors in batch. - x, attn = self.attention(query, key, value, mask=mask, dropout=self.dropout) - - # 3) "Concat" using a view and apply a final linear. 
- x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k) - - return self.output_linear(x) diff --git a/bert_pytorch/model/attention/single.py b/bert_pytorch/model/attention/single.py deleted file mode 100644 index 701d2c2..0000000 --- a/bert_pytorch/model/attention/single.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch.nn as nn -import torch.nn.functional as F -import torch - -import math - - -class Attention(nn.Module): - """ - Compute 'Scaled Dot Product Attention - """ - - def forward(self, query, key, value, mask=None, dropout=None): - scores = torch.matmul(query, key.transpose(-2, -1)) \ - / math.sqrt(query.size(-1)) - - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e9) - - p_attn = F.softmax(scores, dim=-1) - - if dropout is not None: - p_attn = dropout(p_attn) - - return torch.matmul(p_attn, value), p_attn diff --git a/bert_pytorch/model/bert.py b/bert_pytorch/model/bert.py deleted file mode 100644 index c4cec4a..0000000 --- a/bert_pytorch/model/bert.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch.nn as nn - -from .transformer import TransformerBlock -from .embedding import BERTEmbedding - - -class BERT(nn.Module): - """ - BERT model : Bidirectional Encoder Representations from Transformers. - """ - - def __init__(self, vocab_size, hidden=768, n_layers=12, attn_heads=12, dropout=0.1): - """ - :param vocab_size: vocab_size of total words - :param hidden: BERT model hidden size - :param n_layers: numbers of Transformer blocks(layers) - :param attn_heads: number of attention heads - :param dropout: dropout rate - """ - - super().__init__() - self.hidden = hidden - self.n_layers = n_layers - self.attn_heads = attn_heads - - # paper noted they used 4*hidden_size for ff_network_hidden_size - self.feed_forward_hidden = hidden * 4 - - # embedding for BERT, sum of positional, segment, token embeddings - self.embedding = BERTEmbedding(vocab_size=vocab_size, embed_size=hidden) - - # multi-layers transformer blocks, deep network - self.transformer_blocks = nn.ModuleList( - [TransformerBlock(hidden, attn_heads, hidden * 4, dropout) for _ in range(n_layers)]) - - def forward(self, x, segment_info): - # attention masking for padded token - # torch.ByteTensor([batch_size, 1, seq_len, seq_len) - mask = (x > 0).unsqueeze(1).repeat(1, x.size(1), 1).unsqueeze(1) - - # embedding the indexed sequence to sequence of vectors - x = self.embedding(x, segment_info) - - # running over multiple transformer blocks - for transformer in self.transformer_blocks: - x = transformer.forward(x, mask) - - return x diff --git a/bert_pytorch/model/embedding/__init__.py b/bert_pytorch/model/embedding/__init__.py deleted file mode 100644 index 0eb5843..0000000 --- a/bert_pytorch/model/embedding/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bert import BERTEmbedding diff --git a/bert_pytorch/model/embedding/bert.py b/bert_pytorch/model/embedding/bert.py deleted file mode 100644 index bcd5115..0000000 --- a/bert_pytorch/model/embedding/bert.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch.nn as nn -from .token import TokenEmbedding -from .position import PositionalEmbedding -from .segment import SegmentEmbedding - - -class BERTEmbedding(nn.Module): - """ - BERT Embedding which is consisted with under features - 1. TokenEmbedding : normal embedding matrix - 2. PositionalEmbedding : adding positional information using sin, cos - 2. 
SegmentEmbedding : adding sentence segment info, (sent_A:1, sent_B:2) - - sum of all these features are output of BERTEmbedding - """ - - def __init__(self, vocab_size, embed_size, dropout=0.1): - """ - :param vocab_size: total vocab size - :param embed_size: embedding size of token embedding - :param dropout: dropout rate - """ - super().__init__() - self.token = TokenEmbedding(vocab_size=vocab_size, embed_size=embed_size) - self.position = PositionalEmbedding(d_model=self.token.embedding_dim) - self.segment = SegmentEmbedding(embed_size=self.token.embedding_dim) - self.dropout = nn.Dropout(p=dropout) - self.embed_size = embed_size - - def forward(self, sequence, segment_label): - x = self.token(sequence) + self.position(sequence) + self.segment(segment_label) - return self.dropout(x) diff --git a/bert_pytorch/model/embedding/position.py b/bert_pytorch/model/embedding/position.py deleted file mode 100644 index d55c224..0000000 --- a/bert_pytorch/model/embedding/position.py +++ /dev/null @@ -1,25 +0,0 @@ -import torch.nn as nn -import torch -import math - - -class PositionalEmbedding(nn.Module): - - def __init__(self, d_model, max_len=512): - super().__init__() - - # Compute the positional encodings once in log space. - pe = torch.zeros(max_len, d_model).float() - pe.require_grad = False - - position = torch.arange(0, max_len).float().unsqueeze(1) - div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() - - pe[:, 0::2] = torch.sin(position * div_term) - pe[:, 1::2] = torch.cos(position * div_term) - - pe = pe.unsqueeze(0) - self.register_buffer('pe', pe) - - def forward(self, x): - return self.pe[:, :x.size(1)] diff --git a/bert_pytorch/model/embedding/segment.py b/bert_pytorch/model/embedding/segment.py deleted file mode 100644 index cdf84d5..0000000 --- a/bert_pytorch/model/embedding/segment.py +++ /dev/null @@ -1,6 +0,0 @@ -import torch.nn as nn - - -class SegmentEmbedding(nn.Embedding): - def __init__(self, embed_size=512): - super().__init__(3, embed_size, padding_idx=0) diff --git a/bert_pytorch/model/embedding/token.py b/bert_pytorch/model/embedding/token.py deleted file mode 100644 index 79b5187..0000000 --- a/bert_pytorch/model/embedding/token.py +++ /dev/null @@ -1,6 +0,0 @@ -import torch.nn as nn - - -class TokenEmbedding(nn.Embedding): - def __init__(self, vocab_size, embed_size=512): - super().__init__(vocab_size, embed_size, padding_idx=0) diff --git a/bert_pytorch/model/language_model.py b/bert_pytorch/model/language_model.py deleted file mode 100644 index 608f42a..0000000 --- a/bert_pytorch/model/language_model.py +++ /dev/null @@ -1,61 +0,0 @@ -import torch.nn as nn - -from .bert import BERT - - -class BERTLM(nn.Module): - """ - BERT Language Model - Next Sentence Prediction Model + Masked Language Model - """ - - def __init__(self, bert: BERT, vocab_size): - """ - :param bert: BERT model which should be trained - :param vocab_size: total vocab size for masked_lm - """ - - super().__init__() - self.bert = bert - self.next_sentence = NextSentencePrediction(self.bert.hidden) - self.mask_lm = MaskedLanguageModel(self.bert.hidden, vocab_size) - - def forward(self, x, segment_label): - x = self.bert(x, segment_label) - return self.next_sentence(x), self.mask_lm(x) - - -class NextSentencePrediction(nn.Module): - """ - 2-class classification model : is_next, is_not_next - """ - - def __init__(self, hidden): - """ - :param hidden: BERT model output size - """ - super().__init__() - self.linear = nn.Linear(hidden, 2) - self.softmax = 
nn.LogSoftmax(dim=-1) - - def forward(self, x): - return self.softmax(self.linear(x[:, 0])) - - -class MaskedLanguageModel(nn.Module): - """ - predicting origin token from masked input sequence - n-class classification problem, n-class = vocab_size - """ - - def __init__(self, hidden, vocab_size): - """ - :param hidden: output size of BERT model - :param vocab_size: total vocab size - """ - super().__init__() - self.linear = nn.Linear(hidden, vocab_size) - self.softmax = nn.LogSoftmax(dim=-1) - - def forward(self, x): - return self.softmax(self.linear(x)) diff --git a/bert_pytorch/model/transformer.py b/bert_pytorch/model/transformer.py deleted file mode 100644 index 288de26..0000000 --- a/bert_pytorch/model/transformer.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch.nn as nn - -from .attention import MultiHeadedAttention -from .utils import SublayerConnection, PositionwiseFeedForward - - -class TransformerBlock(nn.Module): - """ - Bidirectional Encoder = Transformer (self-attention) - Transformer = MultiHead_Attention + Feed_Forward with sublayer connection - """ - - def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout): - """ - :param hidden: hidden size of transformer - :param attn_heads: head sizes of multi-head attention - :param feed_forward_hidden: feed_forward_hidden, usually 4*hidden_size - :param dropout: dropout rate - """ - - super().__init__() - self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden) - self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout) - self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout) - self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout) - self.dropout = nn.Dropout(p=dropout) - - def forward(self, x, mask): - x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask)) - x = self.output_sublayer(x, self.feed_forward) - return self.dropout(x) diff --git a/bert_pytorch/model/utils/__init__.py b/bert_pytorch/model/utils/__init__.py deleted file mode 100644 index e7bddc6..0000000 --- a/bert_pytorch/model/utils/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .feed_forward import PositionwiseFeedForward -from .layer_norm import LayerNorm -from .sublayer import SublayerConnection -from .gelu import GELU diff --git a/bert_pytorch/model/utils/feed_forward.py b/bert_pytorch/model/utils/feed_forward.py deleted file mode 100644 index a225c5e..0000000 --- a/bert_pytorch/model/utils/feed_forward.py +++ /dev/null @@ -1,16 +0,0 @@ -import torch.nn as nn -from .gelu import GELU - - -class PositionwiseFeedForward(nn.Module): - "Implements FFN equation." 
- - def __init__(self, d_model, d_ff, dropout=0.1): - super(PositionwiseFeedForward, self).__init__() - self.w_1 = nn.Linear(d_model, d_ff) - self.w_2 = nn.Linear(d_ff, d_model) - self.dropout = nn.Dropout(dropout) - self.activation = GELU() - - def forward(self, x): - return self.w_2(self.dropout(self.activation(self.w_1(x)))) diff --git a/bert_pytorch/model/utils/gelu.py b/bert_pytorch/model/utils/gelu.py deleted file mode 100644 index a30ea33..0000000 --- a/bert_pytorch/model/utils/gelu.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch.nn as nn -import torch -import math - - -class GELU(nn.Module): - """ - Paper Section 3.4, last paragraph notice that BERT used the GELU instead of RELU - """ - - def forward(self, x): - return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) diff --git a/bert_pytorch/model/utils/layer_norm.py b/bert_pytorch/model/utils/layer_norm.py deleted file mode 100644 index 6f67136..0000000 --- a/bert_pytorch/model/utils/layer_norm.py +++ /dev/null @@ -1,17 +0,0 @@ -import torch.nn as nn -import torch - - -class LayerNorm(nn.Module): - "Construct a layernorm module (See citation for details)." - - def __init__(self, features, eps=1e-6): - super(LayerNorm, self).__init__() - self.a_2 = nn.Parameter(torch.ones(features)) - self.b_2 = nn.Parameter(torch.zeros(features)) - self.eps = eps - - def forward(self, x): - mean = x.mean(-1, keepdim=True) - std = x.std(-1, keepdim=True) - return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 diff --git a/bert_pytorch/model/utils/sublayer.py b/bert_pytorch/model/utils/sublayer.py deleted file mode 100644 index 6e36793..0000000 --- a/bert_pytorch/model/utils/sublayer.py +++ /dev/null @@ -1,18 +0,0 @@ -import torch.nn as nn -from .layer_norm import LayerNorm - - -class SublayerConnection(nn.Module): - """ - A residual connection followed by a layer norm. - Note for code simplicity the norm is first as opposed to last. - """ - - def __init__(self, size, dropout): - super(SublayerConnection, self).__init__() - self.norm = LayerNorm(size) - self.dropout = nn.Dropout(dropout) - - def forward(self, x, sublayer): - "Apply residual connection to any sublayer with the same size." 
- return x + self.dropout(sublayer(self.norm(x))) diff --git a/bert_pytorch/trainer/__init__.py b/bert_pytorch/trainer/__init__.py deleted file mode 100644 index 6a0eb37..0000000 --- a/bert_pytorch/trainer/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .pretrain import BERTTrainer diff --git a/bert_pytorch/trainer/optim_schedule.py b/bert_pytorch/trainer/optim_schedule.py deleted file mode 100644 index 5ccd222..0000000 --- a/bert_pytorch/trainer/optim_schedule.py +++ /dev/null @@ -1,35 +0,0 @@ -'''A wrapper class for optimizer ''' -import numpy as np - - -class ScheduledOptim(): - '''A simple wrapper class for learning rate scheduling''' - - def __init__(self, optimizer, d_model, n_warmup_steps): - self._optimizer = optimizer - self.n_warmup_steps = n_warmup_steps - self.n_current_steps = 0 - self.init_lr = np.power(d_model, -0.5) - - def step_and_update_lr(self): - "Step with the inner optimizer" - self._update_learning_rate() - self._optimizer.step() - - def zero_grad(self): - "Zero out the gradients by the inner optimizer" - self._optimizer.zero_grad() - - def _get_lr_scale(self): - return np.min([ - np.power(self.n_current_steps, -0.5), - np.power(self.n_warmup_steps, -1.5) * self.n_current_steps]) - - def _update_learning_rate(self): - ''' Learning rate scheduling per step ''' - - self.n_current_steps += 1 - lr = self.init_lr * self._get_lr_scale() - - for param_group in self._optimizer.param_groups: - param_group['lr'] = lr diff --git a/bert_pytorch/trainer/pretrain.py b/bert_pytorch/trainer/pretrain.py deleted file mode 100644 index 0b882dd..0000000 --- a/bert_pytorch/trainer/pretrain.py +++ /dev/null @@ -1,151 +0,0 @@ -import torch -import torch.nn as nn -from torch.optim import Adam -from torch.utils.data import DataLoader - -from ..model import BERTLM, BERT -from .optim_schedule import ScheduledOptim - -import tqdm - - -class BERTTrainer: - """ - BERTTrainer make the pretrained BERT model with two LM training method. - - 1. Masked Language Model : 3.3.1 Task #1: Masked LM - 2. Next Sentence prediction : 3.3.2 Task #2: Next Sentence Prediction - - please check the details on README.md with simple example. 
- - """ - - def __init__(self, bert: BERT, vocab_size: int, - train_dataloader: DataLoader, test_dataloader: DataLoader = None, - lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000, - with_cuda: bool = True, cuda_devices=None, log_freq: int = 10): - """ - :param bert: BERT model which you want to train - :param vocab_size: total word vocab size - :param train_dataloader: train dataset data loader - :param test_dataloader: test dataset data loader [can be None] - :param lr: learning rate of optimizer - :param betas: Adam optimizer betas - :param weight_decay: Adam optimizer weight decay param - :param with_cuda: traning with cuda - :param log_freq: logging frequency of the batch iteration - """ - - # Setup cuda device for BERT training, argument -c, --cuda should be true - cuda_condition = torch.cuda.is_available() and with_cuda - self.device = torch.device("cuda:0" if cuda_condition else "cpu") - - # This BERT model will be saved every epoch - self.bert = bert - # Initialize the BERT Language Model, with BERT model - self.model = BERTLM(bert, vocab_size).to(self.device) - - # Distributed GPU training if CUDA can detect more than 1 GPU - if with_cuda and torch.cuda.device_count() > 1: - print("Using %d GPUS for BERT" % torch.cuda.device_count()) - self.model = nn.DataParallel(self.model, device_ids=cuda_devices) - - # Setting the train and test data loader - self.train_data = train_dataloader - self.test_data = test_dataloader - - # Setting the Adam optimizer with hyper-param - self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay) - self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps) - - # Using Negative Log Likelihood Loss function for predicting the masked_token - self.criterion = nn.NLLLoss(ignore_index=0) - - self.log_freq = log_freq - - print("Total Parameters:", sum([p.nelement() for p in self.model.parameters()])) - - def train(self, epoch): - self.iteration(epoch, self.train_data) - - def test(self, epoch): - self.iteration(epoch, self.test_data, train=False) - - def iteration(self, epoch, data_loader, train=True): - """ - loop over the data_loader for training or testing - if on train status, backward operation is activated - and also auto save the model every peoch - - :param epoch: current epoch index - :param data_loader: torch.utils.data.DataLoader for iteration - :param train: boolean value of is train or test - :return: None - """ - str_code = "train" if train else "test" - - # Setting the tqdm progress bar - data_iter = tqdm.tqdm(enumerate(data_loader), - desc="EP_%s:%d" % (str_code, epoch), - total=len(data_loader), - bar_format="{l_bar}{r_bar}") - - avg_loss = 0.0 - total_correct = 0 - total_element = 0 - - for i, data in data_iter: - # 0. batch_data will be sent into the device(GPU or cpu) - data = {key: value.to(self.device) for key, value in data.items()} - - # 1. forward the next_sentence_prediction and masked_lm model - next_sent_output, mask_lm_output = self.model.forward(data["bert_input"], data["segment_label"]) - - # 2-1. NLL(negative log likelihood) loss of is_next classification result - next_loss = self.criterion(next_sent_output, data["is_next"]) - - # 2-2. NLLLoss of predicting masked token word - mask_loss = self.criterion(mask_lm_output.transpose(1, 2), data["bert_label"]) - - # 2-3. Adding next_loss and mask_loss : 3.4 Pre-training Procedure - loss = next_loss + mask_loss - - # 3. 
backward and optimization only in train - if train: - self.optim_schedule.zero_grad() - loss.backward() - self.optim_schedule.step_and_update_lr() - - # next sentence prediction accuracy - correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item() - avg_loss += loss.item() - total_correct += correct - total_element += data["is_next"].nelement() - - post_fix = { - "epoch": epoch, - "iter": i, - "avg_loss": avg_loss / (i + 1), - "avg_acc": total_correct / total_element * 100, - "loss": loss.item() - } - - if i % self.log_freq == 0: - data_iter.write(str(post_fix)) - - print("EP%d_%s, avg_loss=" % (epoch, str_code), avg_loss / len(data_iter), "total_acc=", - total_correct * 100.0 / total_element) - - def save(self, epoch, file_path="output/bert_trained.model"): - """ - Saving the current BERT model on file_path - - :param epoch: current epoch number - :param file_path: model output path which gonna be file_path+"ep%d" % epoch - :return: final_output_path - """ - output_path = file_path + ".ep%d" % epoch - torch.save(self.bert.cpu(), output_path) - self.bert.to(self.device) - print("EP:%d Model Saved on:" % epoch, output_path) - return output_path diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..489f5af --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.black] +line-length = 120 +target-version = ['py37'] +include = '\.py$' diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..b3c62da --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,10 @@ +# for clean code :) +isort +black +flake8 + +# for safe code :) +pytest +pytest-cov +codecov +coverage diff --git a/requirements.txt b/requirements.txt index 3689708..e69de29 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +0,0 @@ -tqdm -numpy -torch>=0.4.0 \ No newline at end of file diff --git a/scripts/run_sample.py b/scripts/run_sample.py new file mode 100644 index 0000000..b827738 --- /dev/null +++ b/scripts/run_sample.py @@ -0,0 +1,5 @@ +# Runing this script with under command +# python -m scripts.run_sample + +if __name__ == "__main__": + print("Replace this template script") diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3fbd773 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,11 @@ +[flake8] +ignore = E203, W503, E501 +max-line-length = 120 + +[tool:isort] +multi_line_output = 3 +line_length = 120 +include_trailing_comma = True + +[tool:pytest] +addopts = -ra -v -l diff --git a/setup.py b/setup.py index 4e721cf..5aaa787 100644 --- a/setup.py +++ b/setup.py @@ -1,54 +1,12 @@ -from setuptools import setup, find_packages -from setuptools.command.install import install -import os -import sys - -__version__ = "0.0.1a4" - -with open("requirements.txt") as f: - require_packages = [line[:-1] if line[-1] == "\n" else line for line in f] - -with open("README.md", "r", encoding="utf-8") as f: - long_description = f.read() - - -class VerifyVersionCommand(install): - """Custom command to verify that the git tag matches our version""" - description = 'verify that the git tag matches our version' - - def run(self): - tag = os.getenv('CIRCLE_TAG') - - if tag != __version__: - info = "Git tag: {0} does not match the version of this app: {1}".format( - tag, __version__ - ) - sys.exit(info) - +from setuptools import find_packages, setup setup( - name="bert_pytorch", - version=__version__, - author='Junseong Kim', - author_email='codertimo@gmail.com', - packages=find_packages(), - install_requires=require_packages, - 
url="https://github.com/codertimo/BERT-pytorch", - description="Google AI 2018 BERT pytorch implementation", - long_description=long_description, - long_description_content_type="text/markdown", - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - ], - entry_points={ - 'console_scripts': [ - 'bert = bert_pytorch.__main__:train', - 'bert-vocab = bert_pytorch.dataset.vocab:build', - ] - }, - cmdclass={ - 'verify': VerifyVersionCommand, - } + name="bert", + version="0.0.1", + description="package description", + install_requires=[], + url="https://github.com/codertimo/python-template", + author="codertimo", + author_email="codertimo@gmail.com", + packages=find_packages(exclude=["tests", "scripts"]), ) diff --git a/test.py b/test.py deleted file mode 100644 index 5b6dee3..0000000 --- a/test.py +++ /dev/null @@ -1,6 +0,0 @@ -import unittest -from bert_pytorch import BERT - - -class BERTVocabTestCase(unittest.TestCase): - pass diff --git a/tests/test_sample.py b/tests/test_sample.py new file mode 100644 index 0000000..d2b4018 --- /dev/null +++ b/tests/test_sample.py @@ -0,0 +1,2 @@ +def test_add(): + assert 1 + 2 == 3 From 2da522d0e41f266d9a1efaffbdeb08334421fb14 Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Mon, 5 Apr 2021 18:06:04 +0900 Subject: [PATCH 02/12] feat: project setting (requirements, gitignore) --- .gitignore | 110 ++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 2 + tests/__init__.py | 0 3 files changed, 112 insertions(+) create mode 100644 .gitignore create mode 100644 tests/__init__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2ca9f17 --- /dev/null +++ b/.gitignore @@ -0,0 +1,110 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# editor specific +.vscode/settings.json +.vscode/launch.json +.vscode/settings.json +.vscode/tasks.json diff --git a/requirements.txt b/requirements.txt index e69de29..0530f9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,2 @@ +torch +tqdm diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 From b5c901b1a2cf3df8bd1a29c635b40c6cf98aceff Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Mon, 5 Apr 2021 18:06:33 +0900 Subject: [PATCH 03/12] feat: bert model implementation --- bert/config.py | 18 +++++++ bert/model.py | 144 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 bert/config.py create mode 100644 bert/model.py diff --git a/bert/config.py b/bert/config.py new file mode 100644 index 0000000..1ffbd00 --- /dev/null +++ b/bert/config.py @@ -0,0 +1,18 @@ +from typing import NamedTuple + + +class BertConfig(NamedTuple): + vocab_size: int + type_vocab_size: int + max_position_embeddings: int + + hidden_size: int + hidden_act: str + initializer_range: float + intermediate_size: int + num_attention_heads: int + num_hidden_layers: int + + layer_norm_eps: float + hidden_dropout_prob: float + attention_probs_dropout_prob: float diff --git a/bert/model.py b/bert/model.py new file mode 100644 index 0000000..5e23aba --- /dev/null +++ b/bert/model.py @@ -0,0 +1,144 @@ +import math +from typing import Tuple + +import torch +from torch import Tensor, nn +from torch.nn import functional as fnn + +from .config import BertConfig + + +class BertEmbedding(nn.Module): + def __init__(self, config: BertConfig): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + self.layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids: Tensor, token_type_ids: Tensor, turn_type_ids: Tensor) -> Tensor: + word_embeds = self.word_embeddings(input_ids) + token_type_embeds = self.token_type_embeddings(token_type_ids) + position_embed = self.position_embeddings(turn_type_ids) + + embed_output = word_embeds + token_type_embeds + position_embed + embed_output = self.layer_norm(embed_output) + embed_output = self.dropout(embed_output) + return embed_output + + +class BertMultiHeadAttention(nn.Module): + def __init__(self, config: BertConfig): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.head_hidden_size = 
config.hidden_size // config.num_attention_heads + self.hidden_size = config.hidden_size + + self.query = nn.Linear(config.hidden_size, config.hidden_size) + self.key = nn.Linear(config.hidden_size, config.hidden_size) + self.value = nn.Linear(config.hidden_size, config.hidden_size) + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: Tensor, attention_mask: Tensor) -> Tensor: + # query, key, value linear projection + query_output = self.query(hidden_states) + key_output = self.key(hidden_states) + value_output = self.value(hidden_states) + + seq_len = hidden_states.size(1) + + # split hidden_state into num_heads pieces (hidden_size = num_attention_heads * head_hidden_size) + # ops #1: (batch, seq_len, hidden_size) -> (batch, seq_len, num_attention_heads, head_hidden_size) + # ops #2: (batch, seq_len, num_attention_heads, head_hidden_size) -> (batch, num_attention_heads, seq_len, head_hidden_size) + # output: (batch, num_attention_heads, seq_len, head_hidden_size) + query_output = query_output.view(-1, seq_len, self.num_attention_heads, self.head_hidden_size) + query_output = query_output.transpose(1, 2) + key_output = key_output.view(-1, seq_len, self.num_attention_heads, self.head_hidden_size) + key_output = key_output.transpose(1, 2) + value_output = value_output.view(-1, seq_len, self.num_attention_heads, self.head_hidden_size) + value_output = value_output.transpose(1, 2) + + # attention_ops: (batch, num_attention_heads, seq_len, head_hidden_size) x (batch, num_attention_heads, head_hidden_size, seq_len) + # output: (batch, num_attention_heads, seq_len, seq_len) + attention_scores = torch.matmul(query_output, key_output.transpose(2, 3)) + attention_scores = attention_scores / math.sqrt(self.head_hidden_size) + + # TODO: attention mask + # TODO: head mask + + # normalize attention scores to probs + attention_probs = fnn.softmax(attention_scores, dim=-1) + attention_probs = self.dropout(attention_probs) + + # context_ops: (batch, num_attention_heads, seq_len, seq_len) x (batch, num_attention_heads, seq_len, head_hidden_size) + # output: (batch, num_attention_heads, seq_len, hidden_size) + context_encoded_output = torch.matmul(attention_probs, value_output) + + # merge multi-head output to single head output + # ops1: (batch, num_attention_heads, seq_len, head_hidden_size) -> (batch, seq_len, num_attention_heads, head_hidden_size) + # ops2: (batch, seq_len, num_attention_heads, head_hidden_size) -> (batch, seq_len, hidden_size) + # output: (batch, seq_len, num_attention_heads, head_hidden_size) + context_encoded_output = context_encoded_output.transpose(1, 2).contiguous() + context_encoded_output = context_encoded_output.view(-1, seq_len, self.hidden_size) + + # output linear projection + layer norm + dropout + context_encoded_output = self.dense(context_encoded_output) + context_encoded_output = self.layer_norm(context_encoded_output) + context_encoded_output = self.dropout(context_encoded_output) + + return context_encoded_output + + +class BertLayer(nn.Module): + def __init__(self, config: BertConfig): + super().__init__() + self.attention = BertMultiHeadAttention(config) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_activation_fn = nn.GELU() + + self.output_dense = 
nn.Linear(config.intermediate_size, config.hidden_size) + self.output_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: Tensor, attention_mask: Tensor) -> Tensor: + context_encoded_output = self.attention(hidden_states, attention_mask) + + intermediate_output = self.intermediate_dense(context_encoded_output) + intermediate_output = self.intermediate_activation_fn(intermediate_output) + + layer_output = self.output_dense(intermediate_output) + layer_output = self.output_layer_norm(layer_output) + layer_output = self.output_dropout(layer_output) + return layer_output + + +class BertModel(nn.Module): + def __init__(self, config: BertConfig): + super().__init__() + self.config = config + + self.embedding = BertEmbedding(config) + self.layers = nn.ModuleList([BertLayer(config) for layer in range(config.num_hidden_layers)]) + + self.pooler_dense = nn.Linear(config.hidden_size, config.hidden_size) + self.pooler_activation_fn = nn.Tanh() + + def forward( + self, input_ids: Tensor, attention_mask: Tensor, token_type_ids: Tensor, position_ids: Tensor + ) -> Tuple[Tensor, Tensor]: + hidden_states = self.embedding(input_ids, token_type_ids, position_ids) + + for layer in self.layers: + hidden_states = layer(hidden_states, attention_mask) + + pooled_output = self.pooler_dense(hidden_states[:, 0]) + pooled_output = self.pooler_activation_fn(pooled_output) + + return pooled_output, hidden_states From 12ff135cbcbc48f5715a9345b8557219233f5c27 Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Mon, 5 Apr 2021 18:07:08 +0900 Subject: [PATCH 04/12] feat: add model.py test code --- tests/test_model.py | 79 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 tests/test_model.py diff --git a/tests/test_model.py b/tests/test_model.py new file mode 100644 index 0000000..97c84a4 --- /dev/null +++ b/tests/test_model.py @@ -0,0 +1,79 @@ +from typing import Tuple + +import pytest +import torch + +from bert.config import BertConfig +from bert.model import BertEmbedding, BertLayer, BertModel, BertMultiHeadAttention + +BATCH_SIZE, SEQ_LENGTH = 32, 64 +BertFeature = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + + +@pytest.fixture +def bert_config() -> BertConfig: + config_dict = { + "hidden_size": 768, + "hidden_act": "gelu", + "initializer_range": 0.02, + "vocab_size": 30522, + "hidden_dropout_prob": 0.1, + "num_attention_heads": 12, + "type_vocab_size": 2, + "max_position_embeddings": 512, + "num_hidden_layers": 12, + "intermediate_size": 3072, + "attention_probs_dropout_prob": 0.1, + "layer_norm_eps": 1e-12, + } + return BertConfig(**config_dict) + + +@pytest.fixture +def bert_inputs(bert_config: BertConfig) -> BertFeature: + input_ids = torch.randint(0, bert_config.vocab_size, (BATCH_SIZE, SEQ_LENGTH)) + attention_mask = input_ids.gt(0).float() + token_type_ids = torch.randint(0, bert_config.type_vocab_size, (BATCH_SIZE, SEQ_LENGTH)) + position_ids = torch.arange(SEQ_LENGTH, dtype=torch.long) + return input_ids, attention_mask, token_type_ids, position_ids + + +def test_bert_embedding(bert_config: BertConfig, bert_inputs: BertFeature): + input_ids, _, token_type_ids, position_ids = bert_inputs + + embedding = BertEmbedding(bert_config) + embed_output = embedding(input_ids, token_type_ids, position_ids) + + assert tuple(embed_output.size()) == (BATCH_SIZE, SEQ_LENGTH, bert_config.hidden_size) + + +def 
test_bert_multihead_attention(bert_config: BertConfig, bert_inputs: BertFeature): + input_ids, attention_mask, token_type_ids, position_ids = bert_inputs + + embedding = BertEmbedding(bert_config) + hidden_states = embedding(input_ids, token_type_ids, position_ids) + + multi_head_attention = BertMultiHeadAttention(bert_config) + multi_head_attention_outputs = multi_head_attention.forward(hidden_states, attention_mask) + + assert tuple(multi_head_attention_outputs.size()) == (BATCH_SIZE, SEQ_LENGTH, bert_config.hidden_size) + + +def test_bert_layer(bert_config: BertConfig, bert_inputs: BertFeature): + input_ids, attention_mask, token_type_ids, position_ids = bert_inputs + + embedding = BertEmbedding(bert_config) + hidden_states = embedding(input_ids, token_type_ids, position_ids) + + bert_layer = BertLayer(bert_config) + layer_output = bert_layer.forward(hidden_states, attention_mask) + + assert tuple(layer_output.size()) == (BATCH_SIZE, SEQ_LENGTH, bert_config.hidden_size) + + +def test_bert_model(bert_config: BertConfig, bert_inputs: BertFeature): + bert = BertModel(bert_config) + pooled_output, seq_output = bert(*bert_inputs) + + assert tuple(pooled_output.size()) == (BATCH_SIZE, bert_config.hidden_size) + assert tuple(seq_output.size()) == (BATCH_SIZE, SEQ_LENGTH, bert_config.hidden_size) From b3821befbececda389793a2f278c9093062ce694 Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Mon, 5 Apr 2021 18:07:18 +0900 Subject: [PATCH 05/12] feat: add pretraining head --- bert/heads.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 bert/heads.py diff --git a/bert/heads.py b/bert/heads.py new file mode 100644 index 0000000..b4859b0 --- /dev/null +++ b/bert/heads.py @@ -0,0 +1,23 @@ +from typing import Tuple + +import torch +from torch import nn + +from .config import BertConfig +from .model import BertModel + + +class BertPretrainingHeads(nn.Module): + def __init__(self, config: BertConfig, bert: BertModel): + super().__init__() + self.bert = bert + self.language_model_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.language_model_head.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + self.next_sentence_prediction_head = nn.Linear(config.hidden_size, 2) + + def forward(self, *bert_input_args, **bert_input_kwargs) -> Tuple[torch.Tensor, torch.Tensor]: + pooled_output, seq_output = self.bert(*bert_input_args, **bert_input_kwargs) + lm_output = self.language_model_head(seq_output) + nsp_output = self.next_sentence_prediction_head(pooled_output) + return lm_output, nsp_output From cb290b7397203f18cda86d9d04aab60c4641d8ad Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Tue, 6 Apr 2021 01:15:40 +0900 Subject: [PATCH 06/12] feat: add build train dataset script --- bert/utils.py | 13 +++ scripts/run_build_train_dataset.py | 180 +++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 bert/utils.py create mode 100644 scripts/run_build_train_dataset.py diff --git a/bert/utils.py b/bert/utils.py new file mode 100644 index 0000000..c604342 --- /dev/null +++ b/bert/utils.py @@ -0,0 +1,13 @@ +import logging +import sys + + +def get_logger(name: str = "BERT-PT"): + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s")) + logger.addHandler(stream_handler) + + return logger diff --git a/scripts/run_build_train_dataset.py b/scripts/run_build_train_dataset.py new file mode 
100644 index 0000000..706690e --- /dev/null +++ b/scripts/run_build_train_dataset.py @@ -0,0 +1,180 @@ +import random +from argparse import ArgumentParser, Namespace +from dataclasses import dataclass +from functools import partial +from glob import glob +from multiprocessing import Pool, cpu_count +from typing import List, Optional, Tuple + +from tqdm import tqdm + +from bert.pretrain.utils import get_logger + +logger = get_logger("BERT-data") + +# fmt: off +parser = ArgumentParser() +parser.add_argument("--corpus-paths", type=str, required=True, help="corpus paths (glob style)") +parser.add_argument("--num-workers", default=-1, type=int, help="num worker to multi-process (default: number of CPU cores)") +parser.add_argument("--max-seq-length", default=128, type=int, help="maximum sequence length of input features") +parser.add_argument("--short-seq-prob", default=0.2, type=float, help="probability to make shorter sequence") +parser.add_argument("--masked-lm-prob", default=0.15, type=float, help="masking prob") +parser.add_argument("--max-prediction-per-seq", default=20, type=int, help="maximum masked tokens per input") +# fmt: on + +# input_ids, attention_mask, token_type_ids, position_ids, mlm_targets, nsp_label +BertPretrainFeature = Tuple[List[int], List[float], List[int], List[int], List[int], int] + + +@dataclass +class Document: + texts: List[str] + tokenized_texts: Optional[List[List[int]]] = None + + +def make_bert_pretrain_feature( + positive_and_negative_documents: Tuple[Document, Document], + max_seq_length: int, + short_seq_prob: float, + masked_lm_prob: int, + max_prediction_per_seq: int, + cls_token_id: int, + sep_token_id: int, + mask_token_id: int, +) -> BertPretrainFeature: + positive_document, negative_document = positive_and_negative_documents + + # reserve special token space + target_seq_length = max_seq_length - 3 + if random.random() < short_seq_prob: + target_seq_length = random.randint(2, max_seq_length) + + segment_a, segment_b = [], [] + segment_a_target_length = random.randint(1, target_seq_length - 1) + + # postiive next sentence prediction sample + is_nsp_positive = int(random.random() >= 0.5) + if is_nsp_positive: + for tokenized_text in positive_document.tokenized_texts: + if len(segment_a) + len(tokenized_text) <= segment_a_target_length: + segment_a.extend(tokenized_text) + elif len(segment_a) + len(segment_b) + len(tokenized_text) <= target_seq_length: + segment_b.extend(tokenized_text) + else: + break + # negative next sentence prediction sample + else: + for tokenized_text in positive_document.tokenized_texts: + if len(segment_a) + len(tokenized_text) > segment_a_target_length: + break + segment_a.extend(tokenized_text) + + for tokenized_text in negative_document.tokenized_texts: + if len(segment_a) + len(segment_b) + len(tokenized_text) > target_seq_length: + break + segment_b.extend(tokenized_text) + + # adding special tokens + segment_a.insert(0, cls_token_id) + segment_a.append(sep_token_id) + segment_b.append(sep_token_id) + input_ids = segment_a + segment_b + + # random masking + masked_tokens_counts = 0 + masked_lm_targets = [], [] + for i, token_id in enumerate(input_ids): + not_special_token = token_id != cls_token_id and token_id != sep_token_id + if not_special_token and masked_tokens_counts < max_prediction_per_seq and random.random() > masked_lm_prob: + input_ids[i] = mask_token_id + masked_lm_targets.append(token_id) + masked_tokens_counts += 1 + else: + masked_lm_targets.append(-100) + + # padding and making attention mask and + 
padding_size = max_seq_length - len(input_ids) + attention_mask = [1.0] * len(input_ids) + [0.0] * padding_size + input_ids.extend([0] * len(padding_size)) + token_type_ids = [0] * len(segment_a) + [1] * len(segment_b) + position_ids = [i for i in range(max_seq_length)] + masked_lm_targets.extend([-100] * padding_size) + + assert len(input_ids) == max_seq_length + assert len(attention_mask) == max_seq_length + assert len(token_type_ids) == max_seq_length + assert len(position_ids) == max_seq_length + assert len(masked_lm_targets) == max_seq_length + + return input_ids, attention_mask, token_type_ids, position_ids, masked_lm_targets, is_nsp_positive + + +def load_corpus(corpus_path: str) -> List[Document]: + documents, texts = [], [] + with open(load_corpus) as f: + for line in f: + line = line.strip() + if line: + texts.append(line) + elif len(texts) > 0: + documents.append(Document(texts)) + texts = [] + return documents + + +def tokenize_document(document: Document, tokenizer): + document.tokenized_texts = [tokenizer.tokenize(text) for text in document.texts] + + +def main(args: Namespace): + documents = [] + for corpus_path in glob(args.corpus_paths): + logger.info(f"[+] Parsing corpus: {corpus_path}") + documents.extend(load_corpus(corpus_path)) + + # TODO: Add tokenizer + tokenizer = None + + logger.info(f"[+] Tokenizing {len(documents)} documents start") + num_worker = args.num_worker if args.num_worker else cpu_count() + with Pool(num_worker) as pool: + tokenize_fn = partial(tokenize_document, tokenizer=tokenizer) + list(tqdm(pool.imap_unordered(tokenize_fn, documents), total=len(documents), desc="tokenizing")) + + def random_document_generator(): + for document_index in range(len((documents))): + random_index = random.randint(0, len(documents)) + while random_index == document_index: + random_index = random.randint(0, len(documents)) + yield documents[random_index] + + logger.info("[+] Generating random documents for negative NSP") + random_documents = [random_document for random_document in random_document_generator(documents)] + + logger.info("[+] Making BERT pre-training features") + with Pool(num_worker) as pool: + make_bert_input_fn = partial( + make_bert_pretrain_feature, + max_seq_length=args.max_seq_length, + short_seq_prob=args.short_seq_prob, + masked_lm_probs=args.masked_lm_probs, + max_prediction_per_seq=args.max_prediction_per_seq, + cls_token_id=0, + sep_token_id=0, + mask_token_id=0, + ) + bert_pretrain_features: List[BertPretrainFeature] = [ + feature + for feature in tqdm( + pool.imap_unordered(make_bert_input_fn, zip(documents, random_documents)), + total=len(documents), + desc="making features", + ) + ] + + logger.info(f"[+] Writing {len(bert_pretrain_features)} features to ...") + # TODO: writing outputs + + +if __name__ == "__main__": + main() From 45a56c93ecb342ee1ea672bd36c172fe01039b14 Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Tue, 6 Apr 2021 15:48:06 +0900 Subject: [PATCH 07/12] feat: feature maker refactoring --- bert/pretrain/__init__.py | 0 bert/pretrain/feature.py | 106 +++++++++++++++ requirements.txt | 1 + scripts/run_build_train_dataset.py | 202 +++++++++-------------------- 4 files changed, 171 insertions(+), 138 deletions(-) create mode 100644 bert/pretrain/__init__.py create mode 100644 bert/pretrain/feature.py diff --git a/bert/pretrain/__init__.py b/bert/pretrain/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/bert/pretrain/feature.py b/bert/pretrain/feature.py new file mode 100644 index 0000000..14446e5 --- 
/dev/null +++ b/bert/pretrain/feature.py @@ -0,0 +1,106 @@ +import random +from dataclasses import dataclass +from typing import List, Optional, Tuple + +# input_ids, attention_mask, token_type_ids, position_ids, mlm_targets, nsp_label +BertPretrainFeature = Tuple[List[int], List[float], List[int], List[int], List[int], int] + + +@dataclass +class Document: + texts: List[str] + tokenized_texts: Optional[List[List[int]]] = None + + +def make_bert_pretrain_feature( + positive_and_negative_documents: Tuple[Document, Document], + max_seq_length: int, + short_seq_prob: float, + masked_lm_prob: int, + max_prediction_per_seq: int, + cls_token_id: int, + sep_token_id: int, + mask_token_id: int, +) -> BertPretrainFeature: + positive_document, negative_document = positive_and_negative_documents + + # reserve special token space + target_seq_length = max_seq_length - 3 + if random.random() < short_seq_prob: + target_seq_length = random.randint(2, max_seq_length) + + segment_a, segment_b = [], [] + segment_a_target_length = random.randint(1, target_seq_length - 1) + + # postiive next sentence prediction sample + is_nsp_positive = int(random.random() >= 0.5) + if is_nsp_positive: + for tokenized_text in positive_document.tokenized_texts: + if len(segment_a) + len(tokenized_text) <= segment_a_target_length: + segment_a.extend(tokenized_text) + elif len(segment_a) + len(segment_b) + len(tokenized_text) <= target_seq_length: + segment_b.extend(tokenized_text) + else: + break + # negative next sentence prediction sample + else: + for tokenized_text in positive_document.tokenized_texts: + if len(segment_a) + len(tokenized_text) > segment_a_target_length: + break + segment_a.extend(tokenized_text) + + for tokenized_text in negative_document.tokenized_texts: + if len(segment_a) + len(segment_b) + len(tokenized_text) > target_seq_length: + break + segment_b.extend(tokenized_text) + + # adding special tokens + segment_a.insert(0, cls_token_id) + segment_a.append(sep_token_id) + segment_b.append(sep_token_id) + input_ids = segment_a + segment_b + + # random masking + masked_tokens_counts = 0 + masked_lm_targets = [], [] + for i, token_id in enumerate(input_ids): + not_special_token = token_id != cls_token_id and token_id != sep_token_id + if not_special_token and masked_tokens_counts < max_prediction_per_seq and random.random() > masked_lm_prob: + input_ids[i] = mask_token_id + masked_lm_targets.append(token_id) + masked_tokens_counts += 1 + else: + masked_lm_targets.append(-100) + + # padding and making attention mask and + padding_size = max_seq_length - len(input_ids) + attention_mask = [1.0] * len(input_ids) + [0.0] * padding_size + input_ids.extend([0] * len(padding_size)) + token_type_ids = [0] * len(segment_a) + [1] * len(segment_b) + position_ids = [i for i in range(max_seq_length)] + masked_lm_targets.extend([-100] * padding_size) + + assert len(input_ids) == max_seq_length + assert len(attention_mask) == max_seq_length + assert len(token_type_ids) == max_seq_length + assert len(position_ids) == max_seq_length + assert len(masked_lm_targets) == max_seq_length + + return input_ids, attention_mask, token_type_ids, position_ids, masked_lm_targets, is_nsp_positive + + +def tokenize_document(document: Document, tokenizer): + document.tokenized_texts = [tokenizer.tokenize(text) for text in document.texts] + + +def load_corpus(corpus_path: str) -> List[Document]: + documents, texts = [], [] + with open(load_corpus) as f: + for line in f: + line = line.strip() + if line: + texts.append(line) + elif len(texts) > 
0: + documents.append(Document(texts)) + texts = [] + return documents diff --git a/requirements.txt b/requirements.txt index 0530f9f..6b89e84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ torch tqdm +tokenizers diff --git a/scripts/run_build_train_dataset.py b/scripts/run_build_train_dataset.py index 706690e..02cba3f 100644 --- a/scripts/run_build_train_dataset.py +++ b/scripts/run_build_train_dataset.py @@ -1,139 +1,61 @@ +import os +import pickle import random from argparse import ArgumentParser, Namespace -from dataclasses import dataclass from functools import partial from glob import glob from multiprocessing import Pool, cpu_count -from typing import List, Optional, Tuple +from tokenizers import BertWordPieceTokenizer, Tokenizer +from tokenizers.trainers import WordPieceTrainer from tqdm import tqdm -from bert.pretrain.utils import get_logger - -logger = get_logger("BERT-data") +from bert.pretrain.feature import load_corpus, make_bert_pretrain_feature, tokenize_document +from bert.utils import get_logger # fmt: off parser = ArgumentParser() parser.add_argument("--corpus-paths", type=str, required=True, help="corpus paths (glob style)") -parser.add_argument("--num-workers", default=-1, type=int, help="num worker to multi-process (default: number of CPU cores)") +parser.add_argument("--output-dir", type=str, required=True, help="output directory") parser.add_argument("--max-seq-length", default=128, type=int, help="maximum sequence length of input features") parser.add_argument("--short-seq-prob", default=0.2, type=float, help="probability to make shorter sequence") parser.add_argument("--masked-lm-prob", default=0.15, type=float, help="masking prob") parser.add_argument("--max-prediction-per-seq", default=20, type=int, help="maximum masked tokens per input") +parser.add_argument("--num-duplicates", default=10, type=int, help="number of dumplication for each document") +parser.add_argument("--num-workers", default=-1, type=int, help="num worker to multi-process (default: number of CㅈPU cores)") +parser.add_argument("--num-features-per-file", default=100000, type=int, help="number of features to save on single file") +parser.add_argument("--vocab-size", default=50004, type=int, help="vocab size of tokenizer") +parser.add_argument("--pretrained-tokenizer-json-path", type=str, help="(optional) pretrained tokenizer json path, don't give anything to pretrained-tokenizer-vocab-{path, is-lowercase}") +parser.add_argument("--pretrained-tokenizer-vocab-path", type=str, help="(optional) pretrained tokenizer vocab path") +parser.add_argument("--pretrained-tokenizer-is-uncased", action="store_true", help="(optional, default:False) if pretrained-tokenizer-vocab is not None") # fmt: on -# input_ids, attention_mask, token_type_ids, position_ids, mlm_targets, nsp_label -BertPretrainFeature = Tuple[List[int], List[float], List[int], List[int], List[int], int] - - -@dataclass -class Document: - texts: List[str] - tokenized_texts: Optional[List[List[int]]] = None - - -def make_bert_pretrain_feature( - positive_and_negative_documents: Tuple[Document, Document], - max_seq_length: int, - short_seq_prob: float, - masked_lm_prob: int, - max_prediction_per_seq: int, - cls_token_id: int, - sep_token_id: int, - mask_token_id: int, -) -> BertPretrainFeature: - positive_document, negative_document = positive_and_negative_documents - - # reserve special token space - target_seq_length = max_seq_length - 3 - if random.random() < short_seq_prob: - target_seq_length = random.randint(2, 
max_seq_length) - - segment_a, segment_b = [], [] - segment_a_target_length = random.randint(1, target_seq_length - 1) - - # postiive next sentence prediction sample - is_nsp_positive = int(random.random() >= 0.5) - if is_nsp_positive: - for tokenized_text in positive_document.tokenized_texts: - if len(segment_a) + len(tokenized_text) <= segment_a_target_length: - segment_a.extend(tokenized_text) - elif len(segment_a) + len(segment_b) + len(tokenized_text) <= target_seq_length: - segment_b.extend(tokenized_text) - else: - break - # negative next sentence prediction sample - else: - for tokenized_text in positive_document.tokenized_texts: - if len(segment_a) + len(tokenized_text) > segment_a_target_length: - break - segment_a.extend(tokenized_text) - - for tokenized_text in negative_document.tokenized_texts: - if len(segment_a) + len(segment_b) + len(tokenized_text) > target_seq_length: - break - segment_b.extend(tokenized_text) - - # adding special tokens - segment_a.insert(0, cls_token_id) - segment_a.append(sep_token_id) - segment_b.append(sep_token_id) - input_ids = segment_a + segment_b - - # random masking - masked_tokens_counts = 0 - masked_lm_targets = [], [] - for i, token_id in enumerate(input_ids): - not_special_token = token_id != cls_token_id and token_id != sep_token_id - if not_special_token and masked_tokens_counts < max_prediction_per_seq and random.random() > masked_lm_prob: - input_ids[i] = mask_token_id - masked_lm_targets.append(token_id) - masked_tokens_counts += 1 - else: - masked_lm_targets.append(-100) - - # padding and making attention mask and - padding_size = max_seq_length - len(input_ids) - attention_mask = [1.0] * len(input_ids) + [0.0] * padding_size - input_ids.extend([0] * len(padding_size)) - token_type_ids = [0] * len(segment_a) + [1] * len(segment_b) - position_ids = [i for i in range(max_seq_length)] - masked_lm_targets.extend([-100] * padding_size) - - assert len(input_ids) == max_seq_length - assert len(attention_mask) == max_seq_length - assert len(token_type_ids) == max_seq_length - assert len(position_ids) == max_seq_length - assert len(masked_lm_targets) == max_seq_length - - return input_ids, attention_mask, token_type_ids, position_ids, masked_lm_targets, is_nsp_positive - - -def load_corpus(corpus_path: str) -> List[Document]: - documents, texts = [], [] - with open(load_corpus) as f: - for line in f: - line = line.strip() - if line: - texts.append(line) - elif len(texts) > 0: - documents.append(Document(texts)) - texts = [] - return documents - - -def tokenize_document(document: Document, tokenizer): - document.tokenized_texts = [tokenizer.tokenize(text) for text in document.texts] - - -def main(args: Namespace): +logger = get_logger("BERT-data") + + +def main(args: Namespace) -> int: documents = [] for corpus_path in glob(args.corpus_paths): logger.info(f"[+] Parsing corpus: {corpus_path}") documents.extend(load_corpus(corpus_path)) - # TODO: Add tokenizer - tokenizer = None + if args.pretrained_tokenizer_json_path: + logger.info(f"[+] Loading WordPieceTokenizer from {args.pretrained_tokenizer_json_path}") + tokenizer = Tokenizer.from_file(args.pretrained_tokenizer_json_path) + elif args.pretrained_tokenizer_vocab_path: + logger.info(f"[+] Loading WordPieceTokenizer from {args.pretraeind_vocab_path}") + tokenizer = BertWordPieceTokenizer( + args.pretrained_tokenizer_vocab_path, lowercase=not args.pretrained_tokenizer_is_cased + ) + else: + logger.info(f"[+] Training WordPieceTokenizer with {args.corpus_paths}") + tokenizer = 
BertWordPieceTokenizer() + trainer = WordPieceTrainer(vocab_size=args.vocab_size, min_frequency=1) + trainer.train(glob(args.corpus_paths)) + + trained_tokenizer_path = os.path.join(args.output_dir, "tokenizer.json") + logger.info(f"[+] Saving trained WordPieceTokeizer to {trained_tokenizer_path}") + tokenizer.save(trained_tokenizer_path, pretty=True) logger.info(f"[+] Tokenizing {len(documents)} documents start") num_worker = args.num_worker if args.num_worker else cpu_count() @@ -141,6 +63,8 @@ def main(args: Namespace): tokenize_fn = partial(tokenize_document, tokenizer=tokenizer) list(tqdm(pool.imap_unordered(tokenize_fn, documents), total=len(documents), desc="tokenizing")) + logger.info("[+] Generating random documents for negative NSP") + def random_document_generator(): for document_index in range(len((documents))): random_index = random.randint(0, len(documents)) @@ -148,33 +72,35 @@ def random_document_generator(): random_index = random.randint(0, len(documents)) yield documents[random_index] - logger.info("[+] Generating random documents for negative NSP") - random_documents = [random_document for random_document in random_document_generator(documents)] - - logger.info("[+] Making BERT pre-training features") - with Pool(num_worker) as pool: - make_bert_input_fn = partial( - make_bert_pretrain_feature, - max_seq_length=args.max_seq_length, - short_seq_prob=args.short_seq_prob, - masked_lm_probs=args.masked_lm_probs, - max_prediction_per_seq=args.max_prediction_per_seq, - cls_token_id=0, - sep_token_id=0, - mask_token_id=0, - ) - bert_pretrain_features: List[BertPretrainFeature] = [ - feature - for feature in tqdm( - pool.imap_unordered(make_bert_input_fn, zip(documents, random_documents)), - total=len(documents), - desc="making features", + random_documents = [random_document for random_document in random_document_generator()] + + logger.info("[+] Making BERT pre-training input features") + for i in range(0, len(documents), args.num_features_per_file): + output_path = os.path.join(args.output_dir, "bert_pretraining_features.{i:08}.records") + num_features = min(args.num_features_per_file, len(documents) - i) + logger.info(f"[+] Writing {num_features} features {output_path}") + + documents_chunk = documents[i : i + num_features] + random_documents_chunk = random_documents[i : i + num_features] + + with Pool(num_worker) as pool, open(output_path, "wb") as f_out: + make_bert_input_fn = partial( + make_bert_pretrain_feature, + max_seq_length=args.max_seq_length, + short_seq_prob=args.short_seq_prob, + masked_lm_probs=args.masked_lm_probs, + max_prediction_per_seq=args.max_prediction_per_seq, + cls_token_id=tokenizer.token_to_id("[CLS]"), + sep_token_id=tokenizer.token_to_id("[SEP]"), + mask_token_id=tokenizer.token_to_id("[MASK]"), ) - ] + featuring_iterator = pool.imap_unordered(make_bert_input_fn, zip(documents_chunk, random_documents_chunk)) + for feature in tqdm(featuring_iterator, total=len(documents_chunk), desc="making features"): + pickle.dump(feature, f_out) - logger.info(f"[+] Writing {len(bert_pretrain_features)} features to ...") - # TODO: writing outputs + logger.info("[+] Done!") + return 0 if __name__ == "__main__": - main() + exit(main()) From 3b1f282c60e96a71de1acc31b1b5b77c4e21e6d8 Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Fri, 9 Apr 2021 23:24:10 +0900 Subject: [PATCH 08/12] refactor: optimized feature making process --- bert/pretrain/feature.py | 211 +++++++++++++++++------------ bert/pretrain/utils.py | 37 +++++ requirements.txt | 1 + 
scripts/run_build_train_dataset.py | 102 ++++++++------ 4 files changed, 226 insertions(+), 125 deletions(-) create mode 100644 bert/pretrain/utils.py diff --git a/bert/pretrain/feature.py b/bert/pretrain/feature.py index 14446e5..7a8598c 100644 --- a/bert/pretrain/feature.py +++ b/bert/pretrain/feature.py @@ -1,9 +1,9 @@ import random from dataclasses import dataclass -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple, Union # input_ids, attention_mask, token_type_ids, position_ids, mlm_targets, nsp_label -BertPretrainFeature = Tuple[List[int], List[float], List[int], List[int], List[int], int] +BertPretrainFeatures = Dict[str, Union[List[int], List[float], int]] @dataclass @@ -12,8 +12,64 @@ class Document: tokenized_texts: Optional[List[List[int]]] = None -def make_bert_pretrain_feature( - positive_and_negative_documents: Tuple[Document, Document], +def positive_and_negative_documents_generator(documents: List[Document], num_negative_documents: int = 5): + for document in documents: + yield document, random.choices(documents, k=num_negative_documents) + + +def generate_segment_ab( + positive_document: Document, + negative_documents: List[Document], + max_seq_length: int, + short_seq_prob: float, +): + positive_texts_tokens = positive_document.tokenized_texts + if len(positive_texts_tokens) == 0: + print(positive_document.texts) + return + + positive_text_pointer = 0 + while positive_text_pointer < len(positive_texts_tokens): + segment_a, segment_b = [], [] + + target_seq_length = max_seq_length if random.random() > short_seq_prob else random.randint(5, max_seq_length) + target_seq_length -= 3 + + segment_a_target_length = random.randint(1, target_seq_length) + for i in range(positive_text_pointer, len(positive_texts_tokens)): + if len(positive_texts_tokens[i]) + len(segment_a) > segment_a_target_length: + break + segment_a.extend(positive_texts_tokens[i]) + positive_text_pointer += 1 + + segment_b_target_length = target_seq_length - len(segment_a) + + is_nsp_positive = int(random.random() >= 0.5) + if is_nsp_positive: + for i in range(positive_text_pointer, len(positive_texts_tokens)): + positive_text_pointer += 1 + if len(positive_texts_tokens[i]) + len(segment_b) > segment_b_target_length: + left_space = segment_b_target_length - len(segment_b) + segment_b.extend(positive_texts_tokens[i][:left_space]) + break + segment_b.extend(positive_texts_tokens[i]) + + else: + negative_document = random.choice(negative_documents) + negative_texts_tokens = negative_document.tokenized_texts + + negative_text_start = random.randint(0, len(negative_texts_tokens) - 1) + for i in range(negative_text_start, len(negative_texts_tokens)): + if len(negative_texts_tokens[i]) + len(segment_b) > segment_b_target_length: + break + segment_b.extend(negative_texts_tokens[i]) + + if segment_a and segment_b: + yield segment_a, segment_b, is_nsp_positive + + +def make_features_from_document( + positive_and_negative_documents: Tuple[Document, List[Document]], max_seq_length: int, short_seq_prob: float, masked_lm_prob: int, @@ -21,86 +77,69 @@ def make_bert_pretrain_feature( cls_token_id: int, sep_token_id: int, mask_token_id: int, -) -> BertPretrainFeature: - positive_document, negative_document = positive_and_negative_documents - - # reserve special token space - target_seq_length = max_seq_length - 3 - if random.random() < short_seq_prob: - target_seq_length = random.randint(2, max_seq_length) - - segment_a, segment_b = [], [] - segment_a_target_length = random.randint(1, 
target_seq_length - 1) - - # postiive next sentence prediction sample - is_nsp_positive = int(random.random() >= 0.5) - if is_nsp_positive: - for tokenized_text in positive_document.tokenized_texts: - if len(segment_a) + len(tokenized_text) <= segment_a_target_length: - segment_a.extend(tokenized_text) - elif len(segment_a) + len(segment_b) + len(tokenized_text) <= target_seq_length: - segment_b.extend(tokenized_text) + vocab_size: int, +) -> List[BertPretrainFeatures]: + features = [] + document, negative_documents = positive_and_negative_documents + for segment_a, segment_b, is_nsp_positive in generate_segment_ab( + document, negative_documents, max_seq_length, short_seq_prob + ): + # adding special tokens + segment_a.insert(0, cls_token_id) + segment_a.append(sep_token_id) + segment_b.append(sep_token_id) + + input_ids = segment_a + segment_b + + # random masking + masked_tokens_counts = 0 + masked_lm_targets = [] + for i, token_id in enumerate(input_ids): + not_special_token = token_id != cls_token_id and token_id != sep_token_id + is_not_full = masked_tokens_counts < max_prediction_per_seq + + # masked token + if random.random() < masked_lm_prob and not_special_token and is_not_full: + # 80% will be mask token + if random.random() < 0.8: + input_ids[i] = mask_token_id + + # 10% will be original token & 10% will be random token + elif random.random() > 0.5: + input_ids[i] = random.randint(3, vocab_size - 1) + + masked_lm_targets.append(token_id) + masked_tokens_counts += 1 + + # non-masked token else: - break - # negative next sentence prediction sample - else: - for tokenized_text in positive_document.tokenized_texts: - if len(segment_a) + len(tokenized_text) > segment_a_target_length: - break - segment_a.extend(tokenized_text) - - for tokenized_text in negative_document.tokenized_texts: - if len(segment_a) + len(segment_b) + len(tokenized_text) > target_seq_length: - break - segment_b.extend(tokenized_text) - - # adding special tokens - segment_a.insert(0, cls_token_id) - segment_a.append(sep_token_id) - segment_b.append(sep_token_id) - input_ids = segment_a + segment_b - - # random masking - masked_tokens_counts = 0 - masked_lm_targets = [], [] - for i, token_id in enumerate(input_ids): - not_special_token = token_id != cls_token_id and token_id != sep_token_id - if not_special_token and masked_tokens_counts < max_prediction_per_seq and random.random() > masked_lm_prob: - input_ids[i] = mask_token_id - masked_lm_targets.append(token_id) - masked_tokens_counts += 1 - else: - masked_lm_targets.append(-100) - - # padding and making attention mask and - padding_size = max_seq_length - len(input_ids) - attention_mask = [1.0] * len(input_ids) + [0.0] * padding_size - input_ids.extend([0] * len(padding_size)) - token_type_ids = [0] * len(segment_a) + [1] * len(segment_b) - position_ids = [i for i in range(max_seq_length)] - masked_lm_targets.extend([-100] * padding_size) - - assert len(input_ids) == max_seq_length - assert len(attention_mask) == max_seq_length - assert len(token_type_ids) == max_seq_length - assert len(position_ids) == max_seq_length - assert len(masked_lm_targets) == max_seq_length - - return input_ids, attention_mask, token_type_ids, position_ids, masked_lm_targets, is_nsp_positive - - -def tokenize_document(document: Document, tokenizer): - document.tokenized_texts = [tokenizer.tokenize(text) for text in document.texts] - - -def load_corpus(corpus_path: str) -> List[Document]: - documents, texts = [], [] - with open(load_corpus) as f: - for line in f: - line = 
line.strip() - if line: - texts.append(line) - elif len(texts) > 0: - documents.append(Document(texts)) - texts = [] - return documents + masked_lm_targets.append(-100) + + # padding and making attention mask and + attention_mask = [1.0] * len(input_ids) + token_type_ids = [0] * len(segment_a) + [1] * len(segment_b) + position_ids = [i for i in range(max_seq_length)] + + padding_size = max_seq_length - len(input_ids) + if padding_size > 0: + input_ids.extend([0] * padding_size) + attention_mask.extend([0.0] * padding_size) + token_type_ids.extend([0] * padding_size) + masked_lm_targets.extend([-100] * padding_size) + + assert len(input_ids) == max_seq_length + assert len(attention_mask) == max_seq_length + assert len(token_type_ids) == max_seq_length + assert len(position_ids) == max_seq_length + assert len(masked_lm_targets) == max_seq_length + + feature = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + "position_ids": position_ids, + "mlm_labels": masked_lm_targets, + "nsp_label": is_nsp_positive, + } + features.append(feature) + return features diff --git a/bert/pretrain/utils.py b/bert/pretrain/utils.py new file mode 100644 index 0000000..7761420 --- /dev/null +++ b/bert/pretrain/utils.py @@ -0,0 +1,37 @@ +import pickle +from typing import List + +from .feature import BertPretrainFeatures, Document + + +def load_corpus(corpus_path: str) -> List[Document]: + documents, texts = [], [] + with open(corpus_path) as f: + for line in f: + line = line.strip() + if line: + texts.append(line) + elif len(texts) > 0 and sum(map(len, texts)) > 64: + documents.append(Document(texts)) + texts = [] + else: + texts = [] + return documents + + +def load_feature_records(record_file_path: str) -> List[BertPretrainFeatures]: + with open(record_file_path, "rb") as f: + feature_list = [] + while True: + try: + data = pickle.load(f) + except EOFError: + break + feature_list.append(data) + return feature_list + + +def save_feature_records(record_file_path: str, features: List[BertPretrainFeatures]): + with open(record_file_path, "wb") as f: + for feature in features: + pickle.dump(feature, f) diff --git a/requirements.txt b/requirements.txt index 6b89e84..f7c0236 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ torch tqdm tokenizers +nltk diff --git a/scripts/run_build_train_dataset.py b/scripts/run_build_train_dataset.py index 02cba3f..450a235 100644 --- a/scripts/run_build_train_dataset.py +++ b/scripts/run_build_train_dataset.py @@ -1,16 +1,16 @@ import os -import pickle import random from argparse import ArgumentParser, Namespace from functools import partial from glob import glob -from multiprocessing import Pool, cpu_count +from multiprocessing import Pool +import nltk from tokenizers import BertWordPieceTokenizer, Tokenizer -from tokenizers.trainers import WordPieceTrainer from tqdm import tqdm -from bert.pretrain.feature import load_corpus, make_bert_pretrain_feature, tokenize_document +from bert.pretrain.feature import make_features_from_document, positive_and_negative_documents_generator +from bert.pretrain.utils import load_corpus, save_feature_records from bert.utils import get_logger # fmt: off @@ -22,23 +22,35 @@ parser.add_argument("--masked-lm-prob", default=0.15, type=float, help="masking prob") parser.add_argument("--max-prediction-per-seq", default=20, type=int, help="maximum masked tokens per input") parser.add_argument("--num-duplicates", default=10, type=int, help="number of dumplication for each document") 
-parser.add_argument("--num-workers", default=-1, type=int, help="num worker to multi-process (default: number of CㅈPU cores)") +parser.add_argument("--num-workers", default=10, type=int, help="num worker to multi-process") parser.add_argument("--num-features-per-file", default=100000, type=int, help="number of features to save on single file") parser.add_argument("--vocab-size", default=50004, type=int, help="vocab size of tokenizer") +parser.add_argument("--use-sentence-splitter", action="store_true", help="split line using nltk splitter") parser.add_argument("--pretrained-tokenizer-json-path", type=str, help="(optional) pretrained tokenizer json path, don't give anything to pretrained-tokenizer-vocab-{path, is-lowercase}") parser.add_argument("--pretrained-tokenizer-vocab-path", type=str, help="(optional) pretrained tokenizer vocab path") parser.add_argument("--pretrained-tokenizer-is-uncased", action="store_true", help="(optional, default:False) if pretrained-tokenizer-vocab is not None") +parser.add_argument("--random-seed", default=0, type=int, help="random seed") # fmt: on logger = get_logger("BERT-data") +os.environ["TOKENIZERS_PARALLELISM"] = "true" def main(args: Namespace) -> int: + random.seed(args.random_seed) + os.makedirs(args.output_dir, exist_ok=True) + documents = [] for corpus_path in glob(args.corpus_paths): logger.info(f"[+] Parsing corpus: {corpus_path}") documents.extend(load_corpus(corpus_path)) + if args.use_sentence_splitter: + nltk.download("punkt") + logger.info("[+] Splitting long text line into sentences") + for document in documents: + document.texts = [splited_text for text in document.texts for splited_text in nltk.sent_tokenize(text)] + if args.pretrained_tokenizer_json_path: logger.info(f"[+] Loading WordPieceTokenizer from {args.pretrained_tokenizer_json_path}") tokenizer = Tokenizer.from_file(args.pretrained_tokenizer_json_path) @@ -50,57 +62,69 @@ def main(args: Namespace) -> int: else: logger.info(f"[+] Training WordPieceTokenizer with {args.corpus_paths}") tokenizer = BertWordPieceTokenizer() - trainer = WordPieceTrainer(vocab_size=args.vocab_size, min_frequency=1) - trainer.train(glob(args.corpus_paths)) + tokenizer.train(glob(args.corpus_paths), vocab_size=args.vocab_size, min_frequency=1) trained_tokenizer_path = os.path.join(args.output_dir, "tokenizer.json") logger.info(f"[+] Saving trained WordPieceTokeizer to {trained_tokenizer_path}") tokenizer.save(trained_tokenizer_path, pretty=True) - logger.info(f"[+] Tokenizing {len(documents)} documents start") - num_worker = args.num_worker if args.num_worker else cpu_count() - with Pool(num_worker) as pool: - tokenize_fn = partial(tokenize_document, tokenizer=tokenizer) - list(tqdm(pool.imap_unordered(tokenize_fn, documents), total=len(documents), desc="tokenizing")) - - logger.info("[+] Generating random documents for negative NSP") + if tokenizer.get_vocab_size() != args.vocab_size: + logger.warning(f"[-] arg.vocab_size({args.vocab_size}) != tokenizer.vocab_size({tokenizer.get_vocab_size()})") - def random_document_generator(): - for document_index in range(len((documents))): - random_index = random.randint(0, len(documents)) - while random_index == document_index: - random_index = random.randint(0, len(documents)) - yield documents[random_index] + num_total_texts = sum([len(document.texts) for document in documents]) + logger.info(f"[+] Tokenizing {len(documents)} documents {num_total_texts} texts start") + tokenizing_candidates = [text for document in documents for text in document.texts] + 
tokenized_texts = tokenizer.encode_batch(tokenizing_candidates, add_special_tokens=False) + for document in documents: + document.tokenized_texts = [tokenized_texts.pop(0).ids for _ in range(len(document.texts))] - random_documents = [random_document for random_document in random_document_generator()] + documents = documents * args.num_duplicates + logger.info(f"[+] Making BERT pre-training input features using {len(documents)} documents") - logger.info("[+] Making BERT pre-training input features") - for i in range(0, len(documents), args.num_features_per_file): - output_path = os.path.join(args.output_dir, "bert_pretraining_features.{i:08}.records") - num_features = min(args.num_features_per_file, len(documents) - i) - logger.info(f"[+] Writing {num_features} features {output_path}") - - documents_chunk = documents[i : i + num_features] - random_documents_chunk = random_documents[i : i + num_features] - - with Pool(num_worker) as pool, open(output_path, "wb") as f_out: - make_bert_input_fn = partial( - make_bert_pretrain_feature, + def feature_generator(): + with Pool(args.num_workers) as pool: + feature_maker_fn = partial( + make_features_from_document, max_seq_length=args.max_seq_length, short_seq_prob=args.short_seq_prob, - masked_lm_probs=args.masked_lm_probs, + masked_lm_prob=args.masked_lm_prob, max_prediction_per_seq=args.max_prediction_per_seq, cls_token_id=tokenizer.token_to_id("[CLS]"), sep_token_id=tokenizer.token_to_id("[SEP]"), mask_token_id=tokenizer.token_to_id("[MASK]"), + vocab_size=tokenizer.get_vocab_size(), ) - featuring_iterator = pool.imap_unordered(make_bert_input_fn, zip(documents_chunk, random_documents_chunk)) - for feature in tqdm(featuring_iterator, total=len(documents_chunk), desc="making features"): - pickle.dump(feature, f_out) - + feature_iter = pool.imap_unordered(feature_maker_fn, positive_and_negative_documents_generator(documents)) + for features in tqdm(feature_iter, total=len(documents), desc="feature generating"): + for feature in features: + yield feature + + feature_buffer, record_file_index = [], 0 + logger.info(f"[+] Generating {len(documents)} features") + for feature_id, feature in enumerate(feature_generator()): + feature_buffer.append(feature) + + if len(feature_buffer) == args.num_features_per_file: + output_path = os.path.join(args.output_dir, f"bert_pretrain_feature.{record_file_index:06d}.records") + logger.info(f"[+] Wrting {len(feature_buffer)} features to {output_path}") + save_feature_records(output_path, feature_buffer) + feature_buffer.clear() + record_file_index += 1 + + if feature_id < 5: + logger.info(f"======feature-{feature_id}======") + for key, value in feature.items(): + logger.info(f"{key}: {value}") + logger.info("\n") + + output_path = os.path.join(args.output_dir, f"bert_pretrain_feature.{record_file_index:06d}.records") + logger.info(f"[+] Wrting {len(feature_buffer)} features to {output_path}") + save_feature_records(output_path, feature_buffer) + + logger.info(f"[+] total {feature_id+1} features wrote into {record_file_index+1} record files") logger.info("[+] Done!") return 0 if __name__ == "__main__": - exit(main()) + exit(main(parser.parse_args())) From bc1210b4c95645af40d298d5c1368376efe5f4b7 Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Fri, 9 Apr 2021 23:27:16 +0900 Subject: [PATCH 09/12] style: rename dataset making script --- .../{run_build_train_dataset.py => create_pretraining_dataset.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{run_build_train_dataset.py => 
create_pretraining_dataset.py} (100%) diff --git a/scripts/run_build_train_dataset.py b/scripts/create_pretraining_dataset.py similarity index 100% rename from scripts/run_build_train_dataset.py rename to scripts/create_pretraining_dataset.py From 92b109ce0cfbce8e8c76bcb737b14d28d2b09976 Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Fri, 9 Apr 2021 23:37:55 +0900 Subject: [PATCH 10/12] fix: remove useless print debugging --- bert/pretrain/feature.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bert/pretrain/feature.py b/bert/pretrain/feature.py index 7a8598c..4fdcf68 100644 --- a/bert/pretrain/feature.py +++ b/bert/pretrain/feature.py @@ -25,7 +25,6 @@ def generate_segment_ab( ): positive_texts_tokens = positive_document.tokenized_texts if len(positive_texts_tokens) == 0: - print(positive_document.texts) return positive_text_pointer = 0 From df1ee959fae0c23894ff3aadfc9b8bea62e19b5a Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Sat, 10 Apr 2021 02:00:22 +0900 Subject: [PATCH 11/12] feat: add runnable pretraning script --- bert/config.py | 9 +++- bert/pretrain/dataset.py | 19 +++++++ bert/utils.py | 24 +++++++-- scripts/run_pretrain.py | 106 +++++++++++++++++++++++++++++++++++++++ scripts/run_sample.py | 5 -- 5 files changed, 153 insertions(+), 10 deletions(-) create mode 100644 bert/pretrain/dataset.py create mode 100644 scripts/run_pretrain.py delete mode 100644 scripts/run_sample.py diff --git a/bert/config.py b/bert/config.py index 1ffbd00..af93937 100644 --- a/bert/config.py +++ b/bert/config.py @@ -1,3 +1,4 @@ +import json from typing import NamedTuple @@ -13,6 +14,12 @@ class BertConfig(NamedTuple): num_attention_heads: int num_hidden_layers: int - layer_norm_eps: float hidden_dropout_prob: float attention_probs_dropout_prob: float + layer_norm_eps: float = 1e-12 + + @classmethod + def from_json(cls, config_path: str) -> "BertConfig": + with open(config_path) as f: + config_dict = json.load(f) + return BertConfig(**config_dict) diff --git a/bert/pretrain/dataset.py b/bert/pretrain/dataset.py new file mode 100644 index 0000000..c78e0ee --- /dev/null +++ b/bert/pretrain/dataset.py @@ -0,0 +1,19 @@ +import pickle + +import torch +from torch.utils.data import IterableDataset + + +class BERTPretrainingIterableDataset(IterableDataset): + def __init__(self, dataset_path: str): + super().__init__() + self.dataset_path = dataset_path + + def __iter__(self): + with open(self.dataset_path, "rb") as f: + while True: + try: + features = pickle.load(f) + yield {key: torch.tensor(value) for key, value in features.items()} + except EOFError: + break diff --git a/bert/utils.py b/bert/utils.py index c604342..31f5e24 100644 --- a/bert/utils.py +++ b/bert/utils.py @@ -1,13 +1,29 @@ import logging -import sys + +import tqdm + + +class TqdmLoggingHandler(logging.Handler): + def __init__(self, level=logging.NOTSET): + super().__init__(level) + + def emit(self, record): + try: + msg = self.format(record) + tqdm.tqdm.write(msg) + self.flush() + except (KeyboardInterrupt, SystemExit): + raise + except Exception: + self.handleError(record) def get_logger(name: str = "BERT-PT"): logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) - stream_handler = logging.StreamHandler(sys.stdout) - stream_handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s")) - logger.addHandler(stream_handler) + tqdm_handler = TqdmLoggingHandler() + tqdm_handler.setFormatter(logging.Formatter("[%(asctime)s] %(message)s")) + logger.addHandler(tqdm_handler) return logger diff --git a/scripts/run_pretrain.py 
b/scripts/run_pretrain.py new file mode 100644 index 0000000..d8a04d8 --- /dev/null +++ b/scripts/run_pretrain.py @@ -0,0 +1,106 @@ +import argparse +from glob import glob + +import numpy as np +import torch +from torch.nn import functional as fnn +from torch.optim import AdamW +from torch.utils.data import BufferedShuffleDataset, ChainDataset, DataLoader +from tqdm import tqdm + +from bert.config import BertConfig +from bert.heads import BertPretrainingHeads +from bert.model import BertModel +from bert.pretrain.dataset import BERTPretrainingIterableDataset +from bert.utils import get_logger + +# fmt: off +parser = argparse.ArgumentParser() +parser.add_argument("--bert-config-path", type=str, required=True, help="bert config json file path") +parser.add_argument("--input-files", type=str, required=True, help="input record file paths (glob)") +parser.add_argument("--output-dir", type=str, required=True, help="training artifacts saving directory") +parser.add_argument("--max-seq-length", default=128, type=int, help="max sequence length of model input") +parser.add_argument("--train-batch-size", default=128, type=int, help="train batch size") +parser.add_argument("--eval-batch-size", default=128, type=int, help="eval batch size") +parser.add_argument("--learning-rate", default=5e-5, type=float, help="train learning rate") +parser.add_argument("--num-train-steps", default=100000, type=int, help="total train steps") +parser.add_argument("--warmup-ratio", default=0.1, type=float, help="learning rate warmup ratio of total steps") +parser.add_argument("--checkpoint-save-steps", default=1000, type=int, help="checkpoint save interval (steps)") +parser.add_argument("--num-train-epochs", default=10, type=int, help="train epochs") +parser.add_argument("--num-buffer-size", default=1000, type=int, help="shuffle buffer size") +parser.add_argument("--num-logging-steps", default=10, type=int, help="training information will be printed every num_logging_steps") +# fmt: on + + +def main(args: argparse.Namespace): + logger = get_logger() + + datasets = [BERTPretrainingIterableDataset(dataset_path) for dataset_path in glob(args.input_files)] + buffered_dataset = BufferedShuffleDataset(ChainDataset(datasets), buffer_size=args.num_buffer_size) + dataloader = DataLoader(buffered_dataset, batch_size=args.train_batch_size) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + bert_config = BertConfig.from_json(args.bert_config_path) + bert_model = BertModel(bert_config) + model = BertPretrainingHeads(bert_config, bert_model).to(device) + + optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=0.01) + + current_train_step = 0 + train_steps_per_epoch = args.num_train_steps // args.num_train_epochs + nsp_corrects, mlm_corrects, nsp_total, mlm_total = 0, 0, 0, 0 + mlm_loss_stack, nsp_loss_stack, total_loss_stack = [], [], [] + + for epoch in range(1, args.num_train_epochs + 1): + for step_id, batch_data in tqdm(enumerate(dataloader), total=train_steps_per_epoch, desc=f"train ep:{epoch}"): + batch_data = {key: value.to(device) for key, value in batch_data.items()} + + mlm_output, nsp_output = model.forward( + input_ids=batch_data["input_ids"], + attention_mask=batch_data["attention_mask"], + token_type_ids=batch_data["token_type_ids"], + position_ids=batch_data["position_ids"], + ) + + mlm_loss = fnn.cross_entropy(mlm_output.transpose(1, 2), batch_data["mlm_labels"]) + nsp_loss = fnn.cross_entropy(nsp_output, batch_data["nsp_label"]) + loss = mlm_loss + nsp_loss + + loss.backward() + optimizer.step() + 
optimizer.zero_grad() + + # for logging + mlm_loss_stack.append(mlm_loss.item()) + nsp_loss_stack.append(nsp_loss.item()) + total_loss_stack.append(loss.item()) + mlm_mask = batch_data["mlm_labels"].ge(0) + masked_mlm_labels = torch.masked_select(batch_data["mlm_labels"], mlm_mask) + masked_mlm_outputs = torch.masked_select(mlm_output.argmax(-1), mlm_mask) + mlm_corrects += masked_mlm_outputs.eq(masked_mlm_labels).sum().item() + nsp_corrects += nsp_output.argmax(-1).eq(batch_data["nsp_label"]).sum().item() + mlm_total += torch.numel(masked_mlm_labels) + nsp_total += batch_data["input_ids"].size(0) + + if step_id % args.num_logging_steps == 0: + total_loss = np.mean(total_loss_stack) + mlm_loss, mlm_acc = np.mean(mlm_loss_stack), mlm_corrects / mlm_total + nsp_loss, nsp_acc = np.mean(nsp_loss_stack), nsp_corrects / nsp_total + + logger.info( + f"ep: {epoch:02d} step: {step_id:06d}\t" + f"mlm_loss: {mlm_loss.item():.4f} mlm_acc: {mlm_acc:.4f}\t" + f"nsp_loss: {nsp_loss.item():.4f} nsp_acc: {nsp_acc:.4f}\t" + f"loss: {total_loss.item():.4f}" + ) + + nsp_corrects, mlm_corrects, nsp_total, mlm_total = 0, 0, 0, 0 + mlm_loss_stack, nsp_loss_stack, total_loss_stack = [], [], [] + + current_train_step += 1 + + return 0 + + +if __name__ == "__main__": + exit(main(parser.parse_args())) diff --git a/scripts/run_sample.py b/scripts/run_sample.py deleted file mode 100644 index b827738..0000000 --- a/scripts/run_sample.py +++ /dev/null @@ -1,5 +0,0 @@ -# Runing this script with under command -# python -m scripts.run_sample - -if __name__ == "__main__": - print("Replace this template script") From b0f408906b11a4efd47863028a002c1f3d8a7936 Mon Sep 17 00:00:00 2001 From: Junseong Kim Date: Sat, 10 Apr 2021 02:00:45 +0900 Subject: [PATCH 12/12] feat: add data/ outputs/ to gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 2ca9f17..3b732be 100644 --- a/.gitignore +++ b/.gitignore @@ -108,3 +108,6 @@ venv.bak/ .vscode/launch.json .vscode/settings.json .vscode/tasks.json + +data/ +outputs/
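
Note on the new scripts/run_pretrain.py: the script defines --warmup-ratio, --num-train-steps, and --checkpoint-save-steps, but the training loop in this patch does not consume them yet (the learning rate stays constant and no checkpoints are written). The snippet below is a minimal sketch of one way those flags could be wired in; the linear warmup/decay shape, the helper name build_warmup_scheduler, and the checkpoint file naming are illustrative assumptions, not part of the patch.

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR


def build_warmup_scheduler(optimizer, num_train_steps: int, warmup_ratio: float) -> LambdaLR:
    # Linear warmup from 0 to the base learning rate, then linear decay back to 0
    # over num_train_steps (assumed schedule shape, not specified in the patch).
    num_warmup_steps = int(num_train_steps * warmup_ratio)

    def lr_lambda(step: int) -> float:
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        return max(0.0, (num_train_steps - step) / max(1, num_train_steps - num_warmup_steps))

    return LambdaLR(optimizer, lr_lambda)


if __name__ == "__main__":
    # Smoke test with a dummy parameter. In run_pretrain.py the scheduler would wrap the
    # existing AdamW optimizer, scheduler.step() would follow optimizer.step(), and a
    # checkpoint (e.g. torch.save(model.state_dict(), "<output_dir>/checkpoint-<step>.pth"),
    # path name assumed) could be written every args.checkpoint_save_steps steps.
    dummy = torch.nn.Parameter(torch.zeros(1))
    optimizer = AdamW([dummy], lr=5e-5, weight_decay=0.01)
    scheduler = build_warmup_scheduler(optimizer, num_train_steps=100_000, warmup_ratio=0.1)
    for _ in range(3):
        optimizer.step()
        scheduler.step()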