Allow arbitrary training args to be overridden by derekhiggins · Pull Request #1008 · instructlab/instructlab · GitHub

Allow arbitrary training args to be overridden #1008

Closed
wants to merge 1 commit into from
18 changes: 18 additions & 0 deletions src/instructlab/lab.py
@@ -977,6 +977,17 @@ def convert(self, value, param, ctx) -> "torch.device":
show_default=True,
help="model name to use in training",
)
@click.option(
"--override-training-args",
default="{}",
show_default=True,
hidden=True,
help=(
"Additional arguments for linux training (json string)"
"e.g. '--override-training-args {\"gradient_accumulation_steps\"=8}'"
"e.g. '--override-training-args \"$(< override_train_args.json)\"'"
),
)
@click.pass_context
def train(
ctx,
@@ -993,6 +1004,7 @@ def train(
device: "torch.device",
four_bit_quant: bool,
model_name: str,
override_training_args: str,
):
"""
Takes synthetic data generated locally with `ilab generate` and the previous model and learns a new model using the MLX API.
@@ -1006,6 +1018,11 @@ def train(
if four_bit_quant and device.type != "cuda":
ctx.fail("--4-bit-quant option requires --device=cuda")

try:
override_training_args_dict = json.loads(override_training_args)
except json.decoder.JSONDecodeError as e:
ctx.fail("Parsing override trainign args: " + str(e))
Contributor


"trainign" nit on spelling.

I think the command should fail (CLI exits) if the input is malformed, too, rather than proceeding and making the user Ctrl-C and reload.


# NOTE: If given a data_dir, input-dir is ignored in favor of existing!
if data_dir is None:
data_dir = "./taxonomy_data"
@@ -1058,6 +1075,7 @@ def train(
num_epochs=num_epochs,
device=device,
four_bit_quant=four_bit_quant,
override_training_args=override_training_args_dict,
)

training_results_dir = "./training_results"
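For context, a minimal, self-contained sketch of how the new hidden flag is meant to be consumed, mirroring the json.loads / ctx.fail flow added above; the literal override value is purely illustrative:

# Sketch only: parse a --override-training-args value the same way lab.py does.
# Hypothetical invocation:
#   ilab train --override-training-args '{"gradient_accumulation_steps": 8}'
import json

raw_value = '{"gradient_accumulation_steps": 8}'  # the string click passes through
try:
    override_training_args_dict = json.loads(raw_value)
except json.JSONDecodeError as e:
    # lab.py calls ctx.fail(...) here, which exits the CLI instead of training anyway
    raise SystemExit("Parsing override training args: " + str(e))

print(override_training_args_dict)  # {'gradient_accumulation_steps': 8}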
40 changes: 24 additions & 16 deletions src/instructlab/train/linux_train.py
@@ -150,6 +150,7 @@ def linux_train(
num_epochs: Optional[int] = None,
device: torch.device = torch.device("cpu"),
four_bit_quant: bool = False,
override_training_args: dict = {},
):
"""Lab Train for Linux!"""
print("LINUX_TRAIN.PY: NUM EPOCHS IS: ", num_epochs)
@@ -293,25 +294,31 @@ def model_generate(user, **kwargs):
per_device_train_batch_size = 1
max_seq_length = 300

training_args = {}
training_args["num_train_epochs"] = num_epochs
training_args["per_device_train_batch_size"] = per_device_train_batch_size
training_args["save_strategy"] = "epoch"
training_args["report_to"] = "none"

if device.type == "hpu":
# Intel Gaudi trainer
# https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html
# https://huggingface.co/docs/optimum/habana/quickstart
# https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config
if per_device_train_batch_size == 1:
per_device_train_batch_size = 8
training_args["per_device_train_batch_size"] = 8
training_args["bf16"] = True
training_args["use_habana"] = True
training_args["use_lazy_mode"] = True
training_args["save_on_each_node"] = True

# Update training args with user provided overrides
training_args.update(override_training_args)

training_arguments = GaudiTrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=per_device_train_batch_size,
bf16=True,
save_strategy="epoch",
report_to="none",
use_habana=True,
use_lazy_mode=True,
**training_args,
# create checkpoint directories
save_on_each_node=True,
# gaudi_config_name=gaudi_config_name,
)
gaudi_config = GaudiConfig(
@@ -335,16 +342,17 @@ def model_generate(user, **kwargs):
"generation_config": GaudiGenerationConfig(),
}
else:
training_args["fp16"] = use_fp16
training_args["bf16"] = not use_fp16
training_args["use_cpu"] = model.device.type == "cpu"

# Update training args with user provided overrides
training_args.update(override_training_args)

training_arguments = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=per_device_train_batch_size,
fp16=use_fp16,
bf16=not use_fp16,
**training_args,
# use_ipex=True, # TODO CPU test this possible optimization
use_cpu=model.device.type == "cpu",
save_strategy="epoch",
report_to="none",
# options to reduce GPU memory usage and improve performance
# https://huggingface.co/docs/transformers/perf_train_gpu_one
# https://stackoverflow.com/a/75793317
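The net effect in linux_train is a simple precedence chain: hard-coded defaults, then device-specific settings, then the user's JSON dict applied last via dict.update so it wins on any repeated key, before everything is splatted into the trainer arguments. A rough sketch of the non-Gaudi path (the helper name is hypothetical, and it assumes the override keys are valid TrainingArguments field names):

from transformers import TrainingArguments

def build_training_arguments(output_dir, num_epochs, use_fp16, on_cpu, override_training_args=None):
    # Defaults assembled by linux_train for the non-Gaudi branch
    training_args = {
        "num_train_epochs": num_epochs,
        "per_device_train_batch_size": 1,
        "save_strategy": "epoch",
        "report_to": "none",
        "fp16": use_fp16,
        "bf16": not use_fp16,
        "use_cpu": on_cpu,
    }
    # User-supplied overrides are applied last, e.g. {"gradient_accumulation_steps": 8}
    training_args.update(override_training_args or {})
    return TrainingArguments(output_dir=output_dir, **training_args)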
2 changes: 1 addition & 1 deletion tests/test_lab_train.py
@@ -380,7 +380,7 @@ def test_train_linux(
assert linux_train_mock.call_args[1]["num_epochs"] == 1
assert linux_train_mock.call_args[1]["device"] is not None
assert not linux_train_mock.call_args[1]["four_bit_quant"]
assert len(linux_train_mock.call_args[1]) == 7
assert len(linux_train_mock.call_args[1]) == 8
is_macos_with_m_chip_mock.assert_called_once()
assert not os.path.isfile(LINUX_GGUF_FILE)

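(The expected keyword-argument count in this assertion goes from 7 to 8 because lab.py now also passes override_training_args through to linux_train.)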