ray: fix handling large chunks by ranandfigma · Pull Request #53535 · ray-project/ray · GitHub

ray: fix handling large chunks #53535

Closed
3 changes: 2 additions & 1 deletion python/ray/data/_internal/arrow_ops/transform_pyarrow.py
@@ -863,7 +863,8 @@ def _try_combine_chunks_safe(
         chunk_size = chunk.nbytes

         if cur_slice_size_bytes + chunk_size > max_chunk_size:
-            slices.append(array.chunks[cur_slice_start:i])
+            if cur_slice_start != i:
+                slices.append(array.chunks[cur_slice_start:i])

             cur_slice_start = i
             cur_slice_size_bytes = 0
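
For context on the fix: in the slicing loop above, when a single chunk is by itself larger than max_chunk_size, the running slice is still empty (cur_slice_start == i) and the old code appended an empty slice (array.chunks[i:i]); the new guard skips that case. Below is a minimal, standalone sketch of the loop under that reading -- the helper name and the flushing of the trailing slice are assumptions for illustration, not the exact Ray implementation:

    # Minimal sketch (hypothetical helper; simplified from _try_combine_chunks_safe).
    import pyarrow as pa


    def _slice_chunks_under_limit(array: pa.ChunkedArray, max_chunk_size: int):
        slices = []
        cur_slice_start = 0
        cur_slice_size_bytes = 0

        for i, chunk in enumerate(array.chunks):
            chunk_size = chunk.nbytes

            if cur_slice_size_bytes + chunk_size > max_chunk_size:
                # The fix: only append when the slice is non-empty; a chunk that
                # alone exceeds the limit leaves cur_slice_start == i here.
                if cur_slice_start != i:
                    slices.append(array.chunks[cur_slice_start:i])

                cur_slice_start = i
                cur_slice_size_bytes = 0

            cur_slice_size_bytes += chunk_size

        # Flush whatever is left as the last slice.
        slices.append(array.chunks[cur_slice_start:])
        return slices


    big = pa.array(range(64), type=pa.int64())    # 512 bytes, over the 100-byte limit below
    small = pa.array([1, 2, 3], type=pa.int64())  # 24 bytes each
    arr = pa.chunked_array([big, small, small])

    # Prints [1, 2]: the big chunk stays in its own slice and the small chunks are
    # grouped; without the guard, an empty slice would lead the list.
    print([len(s) for s in _slice_chunks_under_limit(arr, max_chunk_size=100)])
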
15 changes: 15 additions & 0 deletions python/ray/data/tests/test_transform_pyarrow.py
@@ -13,6 +13,7 @@
 from ray.air.util.tensor_extensions.arrow import ArrowTensorTypeV2
 from ray.data import DataContext
 from ray.data._internal.arrow_ops.transform_pyarrow import (
+    MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS,
     MIN_PYARROW_VERSION_TYPE_PROMOTION,
     concat,
     hash_partition,
@@ -49,6 +50,20 @@ def test_try_defragment_table():
     assert dt == t


+def test_defragment_large_table():
+
+    big = pa.array(list(range(800_000_000)), type=pa.int32())  # ~2GiB

Copilot AI commented on Jun 3, 2025:

    Constructing a Python list of 800 million ints can OOM or be very slow. Consider using numpy.arange or a PyArrow streaming interface to build the array more efficiently.

    Suggested change:
    -    big = pa.array(list(range(800_000_000)), type=pa.int32())  # ~2GiB
    +    big = pa.array(np.arange(800_000_000), type=pa.int32())  # ~2GiB

Contributor reply:

    That looks like a good suggestion actually

+    chunked = [big]
+    for _ in range(MIN_NUM_CHUNKS_TO_TRIGGER_COMBINE_CHUNKS):
+        chunked.append(pa.array([1, 2, 3], type=pa.int32()))  # a little tail chunk
+    chunked = pa.chunked_array(chunked)
+
+    table = pa.Table.from_arrays([chunked], names=["col"])
+
+    data = try_combine_chunked_columns(table)
+    assert len(data["col"].chunks) == 2
+
+
 def test_hash_partitioning():
     # Test hash-partitioning of the empty table
     empty_table = pa.Table.from_pydict({"idx": []})
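
On the review suggestion above (np.arange instead of list(range(...))): a scaled-down comparison sketch, with n reduced roughly 100x so it runs quickly. Both paths yield the same Arrow array, but the list-based one first materializes every element as a Python int object, while NumPy hands Arrow one contiguous buffer:

    import numpy as np
    import pyarrow as pa

    n = 8_000_000  # the real test uses 800_000_000

    # List-based construction: builds a temporary Python list of n int objects
    # (tens of bytes of interpreter overhead per element) before Arrow sees it.
    from_list = pa.array(list(range(n)), type=pa.int32())

    # NumPy-based construction: one contiguous buffer converted by Arrow without
    # touching Python objects element by element.
    from_numpy = pa.array(np.arange(n), type=pa.int32())

    assert from_list.equals(from_numpy)
    print(from_numpy.nbytes)  # ~4 * n bytes in the resulting int32 array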