janeyx99 · janeyx99 · Apr 24, 2024 · Apr 24, 2024 · Apr 24, 2024 · Apr 24, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,7 +25,7 @@ dependencies = [
     "omegaconf",
 
     # Quantization
-    "torchao==0.1",
+    "torchao==0.2",
 ]
 dynamic = ["version"]
 
@@ -41,12 +41,13 @@ tune = "torchtune._cli.tune:main"
 dev = [
     "bitsandbytes>=0.43.0",
     "pre-commit",
-    "pytest",
+    "pytest==7.4.0",
     "pytest-cov",
     "pytest-mock",
     "pytest-integration",
     "tensorboard",
     "wandb",
+    "expecttest==0.1.6",
 ]
 
 [tool.setuptools.dynamic]

diff --git a/recipes/configs/llama2/7B_qlora_fsdp2_dummy.yaml b/recipes/configs/llama2/7B_qlora_fsdp2_dummy.yaml
@@ -0,0 +1,92 @@
+# Config for single device QLoRA with lora_finetune_single_device.py
+# using a Llama2 7B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --hf-token <HF_TOKEN>
+#
+# To launch on a single device, run the following command from root:
+#   tune run lora_finetune_single_device --config llama2/7B_qlora_fsdp2_dummy
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+#   tune run lora_finetune_single_device --config llama2/7B_qlora_fsdp2_dummy checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config works only for training on single device.
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama2.qlora_llama2_7b
+  lora_attn_modules: ['q_proj', 'v_proj', 'k_proj']  # removed output_proj to match AnswerAI for apples<=>apples comparison
+  apply_lora_to_mlp: True
+  apply_lora_to_output: False
+  lora_rank: 64
+  lora_alpha: 16
+
+tokenizer:
+  _component_: torchtune.models.llama2.llama2_tokenizer
+  path: /tmp/Llama-2-7b-hf/tokenizer.model
+
+checkpointer:
+  _component_: torchtune.utils.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-2-7b-hf
+  checkpoint_files: [
+    pytorch_model-00001-of-00002.bin,
+    pytorch_model-00002-of-00002.bin
+  ]
+  adapter_checkpoint: null
+  recipe_checkpoint: null
+  output_dir: /tmp/Llama-2-7b-hf
+  model_type: LLAMA2
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.dummy_dataset
+  max_seq_len: 2048
+  num_samples: 48
+seed: null
+shuffle: True
+batch_size: 8
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.modules.get_cosine_schedule_with_warmup
+  num_warmup_steps: 1
+
+loss:
+  _component_: torch.nn.CrossEntropyLoss
+
+fsdp:
+  cpu_offload: True
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1
+compile: False
+
+# Logging
+output_dir: /tmp/qlora_finetune_output
+metric_logger:
+  _component_: torchtune.utils.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+log_every_n_steps: 1
+log_peak_memory_stats: True
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: True
+
+# # Showcase the usage of the PyTorch profiler
+# # Set enabled to False as it's only needed for debugging training
+# profiler:
+#   _component_: torchtune.utils.profiler
+#   enabled: True
+#   output_dir: ${output_dir}/torchtune_perf_tracing.json
diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -21,7 +21,7 @@ model:
   lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
   apply_lora_to_mlp: True
   apply_lora_to_output: False
-  lora_rank: 8
+  lora_rank: 64
   lora_alpha: 16
 
 tokenizer:
@@ -43,11 +43,11 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
-  _component_: torchtune.datasets.alpaca_cleaned_dataset
-  train_on_input: True
+  _component_: torchtune.datasets.dummy_dataset
+  max_seq_len: 2048
 seed: null
 shuffle: True
-batch_size: 2
+batch_size: 8
 
 # Optimizer and Scheduler
 optimizer:
@@ -56,15 +56,18 @@ optimizer:
   lr: 3e-4
 lr_scheduler:
   _component_: torchtune.modules.get_cosine_schedule_with_warmup
-  num_warmup_steps: 100
+  num_warmup_steps: 1
 
 loss:
   _component_: torch.nn.CrossEntropyLoss
 
+fsdp:
+  cpu_offload: False
+
 # Training
 epochs: 1
-max_steps_per_epoch: null
-gradient_accumulation_steps: 16
+max_steps_per_epoch: 3
+gradient_accumulation_steps: 1
 compile: False
 
 # Logging
@@ -73,7 +76,7 @@ metric_logger:
   _component_: torchtune.utils.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-    log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
@@ -84,5 +87,5 @@ enable_activation_checkpointing: True
 # Set enabled to False as it's only needed for debugging training
 profiler:
   _component_: torchtune.utils.profiler
-  enabled: False
+  enabled: True
   output_dir: ${output_dir}/torchtune_perf_tracing.json
diff --git a/recipes/configs/llama3/70B_qlora_fsdp2_dummy.yaml b/recipes/configs/llama3/70B_qlora_fsdp2_dummy.yaml
@@ -0,0 +1,91 @@
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3.lora_llama3_70b
+  lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
+  apply_lora_to_mlp: False
+  apply_lora_to_output: False
+  lora_rank: 64
+  lora_alpha: 16
+
+tokenizer:
+  _component_: torchtune.models.llama3.llama3_tokenizer
+  path: /tmp/Meta-Llama-3-70b/original/tokenizer.model
+
+checkpointer:
+  _component_: torchtune.utils.FullModelHFCheckpointer
+  checkpoint_dir:  /tmp/Meta-Llama-3-70b
+  checkpoint_files: [
+    model-00001-of-00030.safetensors,
+    model-00002-of-00030.safetensors,
+    model-00003-of-00030.safetensors,
+    model-00004-of-00030.safetensors,
+    model-00005-of-00030.safetensors,
+    model-00006-of-00030.safetensors,
+    model-00007-of-00030.safetensors,
+    model-00008-of-00030.safetensors,
+    model-00009-of-00030.safetensors,
+    model-00010-of-00030.safetensors,
+    model-00011-of-00030.safetensors,
+    model-00012-of-00030.safetensors,
+    model-00013-of-00030.safetensors,
+    model-00014-of-00030.safetensors,
+    model-00015-of-00030.safetensors,
+    model-00016-of-00030.safetensors,
+    model-00017-of-00030.safetensors,
+    model-00018-of-00030.safetensors,
+    model-00019-of-00030.safetensors,
+    model-00020-of-00030.safetensors,
+    model-00021-of-00030.safetensors,
+    model-00022-of-00030.safetensors,
+    model-00023-of-00030.safetensors,
+    model-00024-of-00030.safetensors,
+    model-00025-of-00030.safetensors,
+    model-00026-of-00030.safetensors,
+    model-00027-of-00030.safetensors,
+    model-00028-of-00030.safetensors,
+    model-00029-of-00030.safetensors,
+    model-00030-of-00030.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Meta-Llama-3-70b
+  model_type: LLAMA3
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.dummy_dataset
+  max_seq_len: 2048
+  num_samples: 12
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.modules.get_cosine_schedule_with_warmup
+  num_warmup_steps: 1
+
+loss:
+  _component_: torch.nn.CrossEntropyLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1
+
+# Logging
+output_dir: /tmp/qlora_finetune_70B_output
+metric_logger:
+  _component_: torchtune.utils.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+log_every_n_steps: 1
+log_peak_memory_stats: True
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: True