Always ensure gpu_threads count >= warp size of 32 by antonysigma · Pull Request #8656 · halide/Halide · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Always ensure gpu_threads count >= warp size of 32 #8656

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions apps/iir_blur/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ add_halide_generator(iir_blur.generator SOURCES iir_blur_generator.cpp)
add_halide_library(iir_blur FROM iir_blur.generator)
add_halide_library(iir_blur_auto_schedule FROM iir_blur.generator
GENERATOR iir_blur
AUTOSCHEDULER Halide::Mullapudi2016)
AUTOSCHEDULER Halide::Mullapudi2016
PARAMS autoscheduler.experimental_gpu_schedule=1)
Comment on lines +21 to +22
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Errors on the Buildbot halide-testbranch-main-llvm20-x86-64-osx-cmake:

/Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-x86-64-osx-cmake/halide-build/apps/iir_blur/iir_blur_filter "rgba.png" "out.png"
2025-06-25 20:24:51.129 iir_blur_filter[12653:14179440] Metal API Validation Enabled
-[MTLDebugDevice newBufferWithLength:options:]:646: failed assertion `Buffer Validation
newBufferWith*:length 0xb4000000 must not exceed 2048 MB.
'
Manually-tuned time: 71.5462ms
Required regular expression not found. Regex=[Success!]

I don't have an OSX machine to reproduce the error. Any insights?

Here's the corresponding IR that might trigger the error:

 produce transpose$1 {
  consume blur {
   halide_copy_to_device((struct halide_buffer_t *)transpose$1.buffer, (struct halide_device_interface_t const *)halide_metal_device_interface())
   gpu_block<Metal> (transpose$1.s0.v2.v2_o.block_id_z, 0, 3) {
    gpu_block<Metal> (transpose$1.s0.v1.v1_o.block_id_y, 0, 80) {
     gpu_block<Metal> (transpose$1.s0.v0.v0_vo.block_id_x, 0, 48) {
      gpu_thread<Metal> (.thread_id_z, 0, 1) {
       gpu_thread<Metal> (.thread_id_y, 0, 32) {
        allocate blur$1.0[float32 * 1536]
        gpu_thread<Metal> (.thread_id_x, 0, 32) {
         produce blur$1 {
          blur$1.0[0] = blur[((((transpose$1.s0.v2.v2_o.block_id_z*80) + transpose$1.s0.v1.v1_o.block_id_y)*32) + .thread_id_y)*1536]
          let t1398 = ((transpose$1.s0.v2.v2_o.block_id_z*2560) + ((transpose$1.s0.v1.v1_o.block_id_y*32) + .thread_id_y))*1536
          for (blur$1.s2.r29$x.rebased, 0, 1535) {
           blur$1.0[blur$1.s2.r29$x.rebased + 1] = (blur$1.0[blur$1.s2.r29$x.rebased]*(1.000000f - (float32)alpha)) + (blur[(blur$1.s2.r29$x.rebased + t1398) + 1]*(float32)alpha)
          }
          for (blur$1.s3.r29$x.rebased, 0, 1535) {
           blur$1.0[1534 - blur$1.s3.r29$x.rebased] = (blur$1.0[1535 - blur$1.s3.r29$x.rebased]*(1.000000f - (float32)alpha)) + (blur$1.0[1534 - blur$1.s3.r29$x.rebased]*(float32)alpha)
          }
         }
         consume blur$1 {
          transpose$1[(((((transpose$1.s0.v1.v1_o.block_id_y*32) + .thread_id_y)*48) + ((transpose$1.s0.v2.v2_o.block_id_z*122880) + transpose$1.s0.v0.v0_vo.block_id_x))*32) + .thread_id_x] = blur$1.0[(transpose$1.s0.v0.v0_vo.block_id_x*32) + .thread_id_x]
         }
        }
        free blur$1.0
       }
      }
     }
    }
   }
   _halide_buffer_set_device_dirty((struct halide_buffer_t *)transpose$1.buffer, (uint1)1)
   halide_device_free(blur.buffer)
   free blur
  }
 }
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The buffer allocation of more than 2GB is outside of this IR. Either way, it is strange that Metal complains about big buffers...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am running out of ideas... I am not even sure if I am correctly reproducing the errors on the Buildbot. And I don't have a Mac machine to test on. Any insights?

How expensive (as in cash) is it to run a job on the Halide Buildbot? I plan to run ad hoc debugging code on this PR to print the stmt on the server as the error log, but I am not sure if it is frowned upon by the server owners...

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, you can always generate the IR and the code. You won't be able to execute it, though, but nothing stops you from locally compiling a Metal pipeline.


# Main executable
add_executable(iir_blur_filter filter.cpp)
Expand All @@ -31,11 +32,18 @@ target_link_libraries(iir_blur_filter PRIVATE
# Test that the app actually works!
set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgba.png)
if (EXISTS ${IMAGE})
configure_file(${IMAGE} rgba.png COPYONLY)
add_test(NAME iir_blur_filter
COMMAND iir_blur_filter rgba.png out.png)
set_tests_properties(iir_blur_filter PROPERTIES
LABELS iir_blur
PASS_REGULAR_EXPRESSION "Success!"
SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
if (Halide_TARGET MATCHES "opencl")
# Error message:
#
# Error: OpenCL error: CL_INVALID_COMMAND_QUEUE clFinish failed
message(WARNING "Skipping Mullapudi2016's GPU auto-schedules for OpenCL target.")
else ()
configure_file(${IMAGE} rgba.png COPYONLY)
add_test(NAME iir_blur_filter
COMMAND iir_blur_filter rgba.png out.png)
set_tests_properties(iir_blur_filter PROPERTIES
LABELS iir_blur
PASS_REGULAR_EXPRESSION "Success!"
SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
endif ()
Comment on lines -34 to +48
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@alexreinking Help wanted to troubleshoot the CMake (re-)configure error:

[2433/4259] Building CXX object test/correctness/CMakeFiles/correctness_fuzz_simplify.dir/fuzz_simplify.cpp.o
FAILED: test/correctness/CMakeFiles/correctness_fuzz_simplify.dir/fuzz_simplify.cpp.o 
/Users/halidenightly/build_bot/venv/lib/python3.9/site-packages/cmake/data/bin/cmake -E env CCACHE_CPP2=yes CCACHE_HASHDIR=yes CCACHE_SLOPPINESS=pch_defines,time_macros,include_file_mtime,include_file_ctime /opt/homebrew/bin/ccache /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/c++ -DHALIDE_ENABLE_RTTI -DHALIDE_KEEP_MACROS -DHALIDE_VERSION_MAJOR=20 -DHALIDE_VERSION_MINOR=0 -DHALIDE_VERSION_PATCH=0 -DHALIDE_WITH_EXCEPTIONS -I/Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-source/test/common -I/Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-source/tools -I/Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-build/include -I/Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-source/src/runtime -Xclang -fno-pch-timestamp -std=c++17 -arch arm64 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -fvisibility=hidden -fvisibility-inlines-hidden -Wall -Wvla-extension -Wcast-qual -Wignored-qualifiers -Wimplicit-fallthrough -Woverloaded-virtual -Wno-old-style-cast -Winconsistent-missing-destructor-override -Winconsistent-missing-override -Wdeprecated-declarations -Wno-double-promotion -Wno-float-conversion -Wno-float-equal -Wno-missing-field-initializers -Wno-shadow -Wno-sign-conversion -Wno-switch-enum -Wno-undef -Wno-unused-function -Wno-unused-macros -Wno-unused-parameter -Wno-c++98-compat-pedantic -Wno-c++98-compat -Wno-cast-align -Wno-comma -Wno-covered-switch-default -Wno-documentation-unknown-command -Wno-documentation -Wno-exit-time-destructors -Wno-global-constructors -Wno-implicit-float-conversion -Wno-implicit-int-conversion -Wno-implicit-int-float-conversion -Wno-missing-prototypes -Wno-nonportable-system-include-path -Wno-reserved-id-macro -Wno-shadow-field-in-constructor -Wno-shadow-field 
-Wno-shorten-64-to-32 -Wno-undefined-func-template -Wno-unused-member-function -Wno-unused-template -Winvalid-pch -Xclang -include-pch -Xclang /Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-build/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx.pch -Xclang -include -Xclang /Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-build/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx -MD -MT test/correctness/CMakeFiles/correctness_fuzz_simplify.dir/fuzz_simplify.cpp.o -MF test/correctness/CMakeFiles/correctness_fuzz_simplify.dir/fuzz_simplify.cpp.o.d -o test/correctness/CMakeFiles/correctness_fuzz_simplify.dir/fuzz_simplify.cpp.o -c /Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-source/test/correctness/fuzz_simplify.cpp
fatal error: file '/Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-build/include/Halide.h' has been modified since the precompiled header '/Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-build/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx.pch' was built: size changed (was 1338765, now 1338767)
note: please rebuild precompiled header '/Users/halidenightly/build_bot/worker/halide-testbranch-main-llvm20-arm-64-osx-cmake/halide-build/test/CMakeFiles/_test_internal.dir/cmake_pch.hxx.pch'
1 error generated.

How do I skip the iir_blur tests for the GPU target host-opencl?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or, is it simply a regression of the Buildbot server?

endif ()
26 changes: 21 additions & 5 deletions src/autoschedulers/mullapudi2016/AutoSchedule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1000,9 +1000,14 @@ class GPUTileHelper {
case 2: {
const auto &x = vars.front();
const auto &y = vars.back();
internal_assert(x.strategy == y.strategy);

f.tile(x.v, y.v, x.outer, y.outer, x.inner, y.inner, x.factor, y.factor);
const auto tail_strategy = std::any_of(
vars.begin(), vars.end(), [](const auto &v) {
return v.strategy == TailStrategy::GuardWithIf;
}) ?
TailStrategy::GuardWithIf :
TailStrategy::Auto;

f.tile(x.v, y.v, x.outer, y.outer, x.inner, y.inner, x.factor, y.factor, tail_strategy);
oss << "tile("
<< x.v.name() << ", "
<< y.v.name() << ", " //
Expand All @@ -1029,7 +1034,15 @@ class GPUTileHelper {
const auto &x = vars[0];
const auto &y = vars[1];
const auto &z = vars[2];
f.tile({x.v, y.v, z.v}, {x.outer, y.outer, z.outer}, {x.inner, y.inner, z.inner}, {x.factor, y.factor, z.factor});

const auto tail_strategy = std::any_of(
vars.begin(), vars.end(), [](const auto &v) {
return v.strategy == TailStrategy::GuardWithIf;
}) ?
TailStrategy::GuardWithIf :
TailStrategy::Auto;

f.tile({x.v, y.v, z.v}, {x.outer, y.outer, z.outer}, {x.inner, y.inner, z.inner}, {x.factor, y.factor, z.factor}, tail_strategy);

oss << "tile({"
<< x.v.name() << ", "
Expand Down Expand Up @@ -1209,7 +1222,10 @@ class GPUTilingDedup {
VarOrRVar outer{var + "_o", v.is_rvar};
VarOrRVar inner{var + "_i", v.is_rvar};

split_info entry{v, outer, inner, factor, TailStrategy::Auto};
split_info entry{v, outer, inner, factor,
can_prove(factor >= min_n_threads) ?
TailStrategy::Auto :
TailStrategy::GuardWithIf};
const auto [_, insertion_happened] = parallelize.try_emplace(var, entry);
if (!insertion_happened) {
return std::nullopt;
Expand Down
Loading
0