Auto-detect bf16 support for CUDA by tiran · Pull Request #993 · instructlab/instructlab · GitHub

Auto-detect bf16 support for CUDA #993


Draft · tiran wants to merge 4 commits into main
1 change: 1 addition & 0 deletions requirements.txt
@@ -21,6 +21,7 @@
numpy>=1.26.4,<2.0.0 ; python_version != '3.10'
openai>=1.13.3,<2.0.0
peft>=0.9.0,<0.10.0
prompt-toolkit>=3.0.38,<4.0.0
psutil>=5.9.8,<6.0.0
pydantic>=2.6.0,<3.0.0
pydantic_yaml>=1.2.0,<2.0.0
PyYAML>=6.0.0,<7.0.0
2 changes: 2 additions & 0 deletions src/instructlab/lab.py
@@ -872,6 +872,8 @@ def convert(self, value, param, ctx) -> "torch.device":
            device = torch.device(value)
        except RuntimeError as e:
            self.fail(str(e), param, ctx)
        else:
            device = value

        if device.type not in self.supported_devices:
            supported = ", ".join(repr(s) for s in sorted(self.supported_devices))
2 changes: 1 addition & 1 deletion src/instructlab/llamacpp/llamacpp_convert_to_gguf.py
@@ -1627,7 +1627,7 @@ def convert_llama_to_gguf(
    big_endian: bool = False,
    pad_vocab: bool = False,
    skip_unknown: bool = False,
):
) -> str:
    """Convert a LLaMA model to a GGML compatible file"""
    # TODO validate vocab_type as was done in click.option declaration:
    # type=click.Choice(
52 changes: 49 additions & 3 deletions src/instructlab/train/linux_train.py
@@ -20,6 +20,7 @@
)
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer
import click
import psutil
import torch

# Local
@@ -94,6 +95,7 @@ def report_cuda_device(args_device: torch.device, min_vram: int = 0) -> None:
"""Report CUDA/ROCm device properties"""
print(f" NVidia CUDA version: {torch.version.cuda or 'n/a'}")
print(f" AMD ROCm HIP version: {torch.version.hip or 'n/a'}")
print(f" Supports bf16: {torch.cuda.is_bf16_supported()}")

    def _gib(size: int) -> str:
        return "{:.1f} GiB".format(size / 1024**3)
@@ -173,6 +175,50 @@ def linux_train(
        hpu.init()
        report_hpu_device(device)

    # each device type registers a module, e.g. torch.cpu or torch.cuda
    device_module = getattr(torch, device.type, None)
    # bfloat16 is not supported on older CUDA versions and devices
    # with CUDA support level < 8.0.
    if hasattr(device_module, "is_bf16_supported"):
        use_bf16 = device_module.is_bf16_supported()
        use_fp16 = not use_bf16
    elif device.type == "cpu":
        # TODO: check if Torch and CPU support AVX2, F16C, AVX512
        use_bf16 = False
        use_fp16 = False
    else:
        # assume bf16 supported unless device says otherwise
        use_bf16 = True
        use_fp16 = False

    torch_dtype = "auto" if device.type == "cuda" else None
Contributor:

I think it would be super useful to document why torch_dtype=None is faster on CPUs

So on CPUs, dtype=float32 will be faster than dtype=bfloat16. Why? A link to a nice explanation of that would be great

I guess torch_dtype=None gives dtype=float32 on CPUs? Why? I can't quickly find any docs that explain that. Does None just mean we use whatever torch.get_default_dtype() gives, which is float32 by default?

What dtype does torch_dtype=auto give on CPUs? Sounds like dtype=bfloat16? Why? It's detecting that this particular model was saved with that dtype?

Can't seem to find the docs on huggingface.co/docs, so see: https://github.com/huggingface/transformers/blob/f3f640dce14bee3b3930a774c3dfac92977eee7f/src/transformers/modeling_utils.py#L2878-L2898

We don't have a dtype in the model's config.json so that doesn't seem to be a factor, but it could be

If the torch_dtype=auto behavior is model-specific, but we know we want float32 except on low-memory systems ... maybe for CPUs, we should just explicitly set torch_dtype to either float32 or bfloat16?
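
A minimal sketch contrasting the two settings being discussed, assuming transformers resolves the dtype the way the modeling_utils.py code linked above suggests; the model name is a placeholder, not the project's actual default model:

import torch
from transformers import AutoConfig, AutoModelForCausalLM

name = "some-org/some-causal-lm"  # placeholder model id

# torch_dtype=None: weights end up in torch.get_default_dtype(),
# which is float32 unless something changed it globally.
print(torch.get_default_dtype())  # torch.float32
model_fp32 = AutoModelForCausalLM.from_pretrained(name, torch_dtype=None)
print(next(model_fp32.parameters()).dtype)  # expected: torch.float32

# torch_dtype="auto": transformers first checks config.torch_dtype and,
# if that is unset (as it reportedly is for our model), falls back to the
# dtype the checkpoint weights were saved in (often bfloat16 for recent models).
print(getattr(AutoConfig.from_pretrained(name), "torch_dtype", None))
model_auto = AutoModelForCausalLM.from_pretrained(name, torch_dtype="auto")
print(next(model_auto.parameters()).dtype)  # depends on the saved checkpoint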

Contributor Author @tiran (May 31, 2024):

It very much depends on the hardware and compiler. In general, x86_64 CPUs have support for standard precision and double precision floats (fp32, fp64). Half precision instructions (fp16) were added in ISA level x86_64-v3, and brain float (bf16) SIMD instructions were added in x86_64-v4. The document https://pytorch.org/docs/stable/amp.html#ops-that-can-autocast-to-float16 explains autocasting.

AFAIK we want to use (see the sketch after this list):

  • bf16 when the CPU or hardware accelerator supports it
  • fall back to fp16 on Nvidia hardware with CUDA compute level < 8.0
  • fall back to fp32 on CPUs
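
A minimal sketch of that fallback order as a standalone helper (my paraphrase, not the exact code in this PR; the CPU bf16 case is left out because, as noted above, it still needs an ISA-level probe):

import torch

def pick_dtype(device: torch.device) -> torch.dtype:
    """Illustrative precision fallback: bf16 > fp16 > fp32."""
    if device.type == "cuda":
        # bf16 needs CUDA compute capability >= 8.0 (Ampere or newer);
        # torch exposes this check directly.
        if torch.cuda.is_bf16_supported():
            return torch.bfloat16
        # older NVIDIA GPUs: fall back to half precision
        return torch.float16
    # plain CPUs: stay on float32 until we can detect usable bf16 SIMD
    return torch.float32

# pick_dtype(torch.device("cpu")) -> torch.float32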

Member:

To quote from another comment I just made ...

My laptop with 32 GB of RAM performs similarly to the much larger server -- it finished in about 13 minutes compared to 12. This is with the change I added to fall back to torch_dtype="auto" with less than 64 GB of RAM. With torch_dtype=None it will run out of memory and get killed.

Meanwhile, on that powerful server (64 vCPUs and 128 GB RAM), using torch_dtype="auto" absolutely kills performance. I know it would have run for many, many hours.

There's definitely more going on here. Maybe it's a difference in instructions available on the different CPUs in these environments?

I don't understand what's going on here well enough to explain this. At least in my two test environments the current code seems to be a nice improvement.

It would definitely be better if we had a clearer explanation. I imagine there's a bit of luck involved right now; instead of checking something else, we need to figure out how to set the ideal configuration.

Member:

Based on @tiran's comment, it's possible that my laptop has this support while my server does not:

  • bf16 when the CPU or hardware accelerator supports it

which could explain why "auto" gets the best performance on my laptop.

If "auto" isn't doing an adequate job of checking if that is supported and is choosing it even if it's not actually supported on my server, maybe that's killing the performance. I'll have to keep digging here ...

Contributor Author @tiran (May 31, 2024):

What is the output of torch.backends.cpu.get_cpu_capability() on your server and your laptop?

I get AVX512 on a server with an Intel Xeon Platinum (which has the AVX-512 instruction set) and AVX2 on an Intel Core i7-8650U (which has AVX2 but not AVX-512).

Member:

On my laptop -- 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz

>>> torch.backends.cpu.get_cpu_capability()
'AVX512'

On the server -- AMD EPYC 7R32

>>> torch.backends.cpu.get_cpu_capability()
'AVX2'

if device.type == "cpu":
total_memory = psutil.virtual_memory().total / (1024**3)
if total_memory < 60:
Contributor:

Suggested change:
-        if total_memory < 60:
+        if total_memory < 62:

A system with 64 GB of RAM will report:

>>> import psutil
>>> mem = psutil.virtual_memory()
>>> mem
svmem(total=67228049408, available=31099351040, percent=53.7, used=35383861248, free=468701184, active=27983499264, inactive=37159084032, buffers=1079336960, cached=30296150016, shared=2109440, slab=1340628992)

Converting to GiB: 67228049408 bytes / 1024**3 ≈ 62.6 GiB.
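
As a quick sanity check, a small snippet (an illustration, not part of the PR) showing what this threshold sees on a given machine, using the suggested 62 GiB cutoff:

import psutil

total_gib = psutil.virtual_memory().total / (1024**3)
print(f"total RAM: {total_gib:.1f} GiB")
# mirrors the check in the diff above, with the suggested 62 GiB cutoff
print("would fall back to torch_dtype='auto':", total_gib < 62)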

            # Using our default model, a system with 32 GB of RAM
            # will get OOM killed using torch_dtype=None, though we
            # seem to get much better performance with this setting
            # where there's enough memory. Using `None` makes it
            # use float32 as opposed to float16 or bf16.
            #
            # Anecdotally, 64 GB seems to be enough, but this calculation
Contributor:

A system with 64 GB of RAM will report ~62.6 GiB, so we base our calculation on 62.

Member:

Since it's such a rough guess, 60 still seems fine? We need to actually do some math at some point ...

Contributor:

I'll share my math in a few :) stay tuned!

Contributor:

[Screenshot from 2024-06-07 14:23:04]

Some more numbers:

  • The training part takes ~30 GB of RAM. There is a very small chance this could work on a very minimal Linux installation; by minimal I mean only system-critical services running and nothing else.
  • The inference part takes ~35 GB of RAM.

Essentially, a system with 48 GB of RAM should be able to run both training and inference, although 48 GB of RAM is not a very common configuration.
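
These figures presumably come from external monitoring (the screenshot above). Purely as an illustration of how such peaks could be sampled, and not how these numbers were actually gathered, a small psutil-based poller might look like this:

import time
import psutil

def peak_rss_gib(pid: int, interval: float = 1.0, duration: float = 600.0) -> float:
    """Poll a process's resident set size and return the observed peak in GiB."""
    proc = psutil.Process(pid)
    peak = 0
    deadline = time.monotonic() + duration
    while time.monotonic() < deadline and proc.is_running():
        peak = max(peak, proc.memory_info().rss)
        time.sleep(interval)
    return peak / 1024**3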

            # may come out to be slightly less than 64 GB, so we just check
            # for 60 GB. It would be better to do a smarter calculation on
            # the actual memory requirement here.
            torch_dtype = "auto"

    # torch compile fails to build, see PyTorch #124707
    # scaled_dot_product_attention(): argument 'is_causal' must be bool, not SymBool
    use_torch_compile = False
    # if device.type == "cuda" and torch.version.cuda is not None:
Contributor:

leftover?

    #     # check for NVIDIA V100, A100, or H100
    #     cap = torch.cuda.get_device_capability(device)
    #     use_torch_compile = cap in {(7, 0), (8, 0), (9, 0)}

    print(
        f"LINUX_TRAIN.PY: {use_bf16=}, {use_fp16=}, {torch_dtype=}, {use_torch_compile=}"
    )

print("LINUX_TRAIN.PY: LOADING DATASETS")
# Get the file name
train_dataset = load_dataset("json", data_files=train_file, split="train")
@@ -194,6 +240,7 @@

    if four_bit_quant:
        print("LINUX_TRAIN.PY: USING 4-bit quantization with BitsAndBytes")
        use_bf16 = False
Contributor:

yeah, I was thinking this should go here. I noticed we were doing this already by setting it to !fp16 below (I think?)

        use_fp16 = True
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
@@ -203,7 +250,6 @@
        )
    else:
        print("LINUX_TRAIN.PY: NOT USING 4-bit quantization")
        use_fp16 = False
        bnb_config = None

    # Loading the model
@@ -214,7 +260,7 @@

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        torch_dtype=torch_dtype,
        quantization_config=bnb_config,
        config=config,
        trust_remote_code=True,
@@ -340,7 +386,7 @@ def model_generate(user, **kwargs):
        num_train_epochs=num_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        fp16=use_fp16,
        bf16=not use_fp16,
        bf16=use_bf16,
        # use_ipex=True, # TODO CPU test this possible optimization
        use_cpu=model.device.type == "cpu",
        save_strategy="epoch",