NVIDIA-NeMo · SahilJain314 · Apr 29, 2025 · Apr 28, 2025 · Apr 28, 2025 · Apr 28, 2025
diff --git a/...ma3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml → ....1-8b-instruct-4n8g-fsdp2tp1-long.v2.yaml b/...ma3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml → ....1-8b-instruct-4n8g-fsdp2tp1-long.v2.yaml
diff --git a/...o-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml → ...lama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml b/...o-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml → ...lama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml
diff --git a/....5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml → ...32b-16n8g-fsdp2tp8sp-actckpt-long.v2.yaml b/....5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml → ...32b-16n8g-fsdp2tp8sp-actckpt-long.v2.yaml
diff --git a/...qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml → ...n2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.yaml b/...qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml → ...n2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.yaml
diff --git a/.../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml → ...po-qwen2.5-7b-instruct-4n8g-fsdp1.v2.yaml b/.../grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml → ...po-qwen2.5-7b-instruct-4n8g-fsdp1.v2.yaml
diff --git a/...-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml → ...en2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.yaml b/...-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml → ...en2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.yaml
diff --git a/...2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml → ...-math-1.5b-instruct-1n8g-fsdp2tp1.v2.yaml b/...2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml → ...-math-1.5b-instruct-1n8g-fsdp2tp1.v2.yaml
diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py
@@ -15,7 +15,8 @@
 
 import ray
 import torch
-from math_verify import parse, verify
+from math_verify.metric import math_metric
+from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig
 
 from nemo_rl.distributed.batched_data_dict import BatchedDataDict
 from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES
@@ -53,9 +54,23 @@ def verify(
         results = []
         for response, ground_truth in zip(pred_responses, ground_truths):
             try:
-                gold = parse(ground_truth)
-                pred = parse(response[-100:])  # avoid looking at the whole string
-                results.append(float(verify(gold, pred)))
+                # Extract both plain math expressions and LaTeX from predictions (gold answers are parsed as LaTeX only)
+                # https://github.com/huggingface/Math-Verify?tab=readme-ov-file#extraction-targets
+                verify_func = math_metric(
+                    gold_extraction_target=(LatexExtractionConfig(),),
+                    pred_extraction_target=(
+                        ExprExtractionConfig(),
+                        LatexExtractionConfig(),
+                    ),
+                )
+
+                ground_truth_parsable = "\\boxed{" + ground_truth + "}"
+                try:
+                    ret_score, _ = verify_func([ground_truth_parsable], [response])
+                except Exception:
+                    ret_score = 0.0
+
+                results.append(float(ret_score))
             except Exception:
                 results.append(0)
         return results

diff --git a/...lama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh → ...a3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh b/...lama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh → ...a3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh
diff --git a/...rpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh → ...-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh b/...rpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh → ...-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh
diff --git a/...n2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh → ...5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh b/...n2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh → ...5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh
diff --git a/...o-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh → ...wen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh b/...o-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh → ...wen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh
diff --git a/...lm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh → ...grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh b/...lm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh → ...grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh
diff --git a/...po-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh → ...qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh b/...po-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh → ...qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh
diff --git a/...en2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh → ....5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh b/...en2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh → ....5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh
diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt
@@ -3,15 +3,15 @@
 ########
 
 # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much
-tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh
-tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh
+tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh
+tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh
 
 # FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct)
-tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh
-tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh
+tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh
+tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh
 
 # Functional 32b run
-tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh
+tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh
 
 #######
 # SFT #

diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt
@@ -3,10 +3,10 @@
 ########
 
 # Long 8b run
-tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh
+tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh
 
 # Long 32b run
-tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh
+tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh
 
 #######
 # SFT #