Open
Description
OCCA OpenMP Device Missing Default Optimization Flags
Bug Description
The OpenMP device in OCCA fails to apply the default `-O3` optimization flag, causing a 7.5x performance degradation. Discovered in gocca's `kernel_program_parallel_test.go`.
Reproduction
make clean
make
Results
Matrix-vector multiplication benchmark (640K chunks × 56×56 matrix):
Device | Flags | Performance |
---|---|---|
OpenMP | default | 24.06 GFLOPS |
OpenMP | explicit -O3 | 180.48 GFLOPS |
Serial | default | 5.72 GFLOPS |
Serial | explicit -O3 | 5.72 GFLOPS |
Issue: OpenMP default is 7.5x slower than OpenMP with -O3. Serial correctly applies -O3 by default (identical performance).
Root Cause
In `src/occa/internal/modes/openmp/device.cpp:95`, the code uses `+=` on a potentially non-existent `compiler_flags` property:
allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
When `compiler_flags` is not already set, this yields a flag string that contains only the OpenMP flag, so the default `-O3` is missing.
Fix
diff --git a/src/occa/internal/modes/openmp/device.cpp b/src/occa/internal/modes/openmp/device.cpp
index eb354a20..9dc8eeeb 100644
--- a/src/occa/internal/modes/openmp/device.cpp
+++ b/src/occa/internal/modes/openmp/device.cpp
@@ -101,33 +101,39 @@ namespace occa {
compiler = "cl.exe";
#endif
}
int vendor = allKernelProps["vendor"];
// Check if we need to re-compute the vendor
if (compiler.size()) {
vendor = sys::compilerVendor(compiler);
}
+
if (compiler != lastCompiler) {
lastCompiler = compiler;
lastCompilerOpenMPFlag = openmp::compilerFlag(vendor, compiler);
if (lastCompilerOpenMPFlag == openmp::notSupported) {
io::stderr << "Compiler [" << (std::string) allKernelProps["compiler"]
<< "] does not support OpenMP, defaulting to [Serial] mode\n";
}
}
const bool usingOpenMP = (lastCompilerOpenMPFlag != openmp::notSupported);
if (usingOpenMP) {
- allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
+ if (!allKernelProps.has("compiler_flags") ||
+ allKernelProps["compiler_flags"].toString().empty()) {
+ allKernelProps["compiler_flags"] = "-O3";
+ }
+ allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
+ // allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
}
modeKernel_t *k = serial::device::buildKernel(filename,
kernelName,
kernelHash,
allKernelProps);
if (k && usingOpenMP) {
k->modeDevice->removeKernelRef(k);
k->modeDevice = this;
Workaround
// Workaround: request -O3 explicitly through the device's "kernel"
// properties so the kernel build does not depend on a default being applied.
occa::device device({
{"mode", "OpenMP"},
{"kernel", {{"compiler_flags", "-O3"}}}
});
Repro
// openmp_bug_demo.cpp
#include <iostream>
#include <iomanip>
#include <chrono>
#include <vector>
#include <cmath>
#include <occa.hpp>
// Source text of the `matvec` kernel, stored as a raw string literal.
//
// Kernel arguments:
//   N                 total number of chunks
//   np                matrix dimension (the matrix is np x np)
//   chunks_per_block  number of chunks handled per @outer iteration
//   matrix            np*np values, indexed as matrix[i*np + j]
//   input / output    N*np values, chunk c occupying [c*np, c*np + np)
//
// The @outer loop runs over ceil(N / chunks_per_block) blocks and the @inner
// loop over the chunks of one block. For every chunk below N the kernel
// writes output[chunk*np + i] = sum_j matrix[i*np + j] * input[chunk*np + j].
// The `chunk < N` test skips the unused slots of a final, partially filled
// block.
//
// Everything between R"KERNEL( and )KERNEL" is part of the string itself,
// including the comments inside it.
const char* matvec_kernel_source = R"KERNEL(
@kernel void matvec(const int N,
const int np,
const int chunks_per_block,
const double* matrix,
const double* input,
double* output) {
// N is total number of chunks, np is matrix size (56)
// chunks_per_block determines how many chunks each @inner processes
for (int block = 0; block < (N + chunks_per_block - 1) / chunks_per_block; ++block; @outer) {
for (int chunk_in_block = 0; chunk_in_block < chunks_per_block; ++chunk_in_block; @inner) {
int chunk = block * chunks_per_block + chunk_in_block;
if (chunk < N) {
for (int i = 0; i < np; ++i) {
double sum = 0.0;
for (int j = 0; j < np; ++j) {
sum += matrix[i*np + j] * input[chunk*np + j];
}
output[chunk*np + i] = sum;
}
}
}
}
}
)KERNEL";
/// Plain aggregate holding the outcome of one benchmark run.
/// Members are left uninitialized unless the caller sets them.
struct BenchmarkResult {
  /// Measured time; milliseconds going by the name (the code that fills it
  /// in is not part of this excerpt).
  double time_ms;

  /// Achieved throughput; presumably GFLOPS, matching the figures quoted in
  /// the report.
  double gflops;
};
BenchmarkResult benchmarkMatvec(occa::device& device,
int N, int np, int chunks_per_block, int iterations) {
// N = number of chunks, np = 56 (matrix size)
size_t matrix_size = np * np * sizeof(double);
size_t vector_size = N * np * sizeof(double);
occa::memory o_matrix = device.malloc(matrix_size);
occa::memory o_input = device.malloc(vector_size);
occa::memory o_output = device.malloc(vector_size);
// Initialize host data
std::vector<double> h_matrix(np * np);
std::vector<double> h_input(N * np);
// Initialize matrix with some pattern
for (int i = 0; i < np * np; ++i) {
h_matrix[i] = (double)(i % 100) / 100.0;
}
// Initialize input vector
for (int i = 0; i < N * np; ++i) {
h_input[i] = (double)(i % 100) / 100.0;
}
// Copy to device
o_matrix.copyFrom(h_matrix.data());
o_input.copyFrom(h_input.data());
// Build kernel
occa::kernel kernel = device.buildKernelFromString(
matvec_kernel_source,
"matvec"
);
// Warmup
for (int i = 0; i < 5; ++i) {
kernel(N, np, chunks_per_block, o_matrix, o_input, o_output);
}
device.finish();
// Benchmark
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < iterations; ++i) {
kernel(N, np, chunks_per_block, o_matrix, o_input, o_output);
}
# Makefile for OCCA OpenMP Compiler Flag Bug Demonstration
#
# Targets:
#   all    (default) build the demo, then run it
#   run    run the demo, building it first if necessary
#   clean  remove the built binary
#
# NOTE: every recipe line below must begin with a literal TAB character;
# make rejects space-indented or unindented recipes ("missing separator").

# Compiler and flags used to build the demo program (see the $(TARGET) rule).
CXX = g++
CXXFLAGS = -std=c++11 -O3 -fopenmp
LDFLAGS = -locca -fopenmp

# OCCA include and library paths
# Adjust these if OCCA is installed in a non-standard location
OCCA_INCLUDE = -I/usr/local/include
OCCA_LIB = -L/usr/local/lib

TARGET = openmp_bug_demo
SOURCE = openmp_bug_demo.cpp

# Default goal: build, then immediately run the demonstration.
all: $(TARGET) run

# Compile and link the demo in a single step.
$(TARGET): $(SOURCE)
	$(CXX) $(CXXFLAGS) $(OCCA_INCLUDE) $(SOURCE) -o $(TARGET) $(OCCA_LIB) $(LDFLAGS)

# Run the demo; the leading '@' keeps make from echoing each command.
run: $(TARGET)
	@echo "Running OCCA OpenMP bug demonstration..."
	@echo "========================================"
	@./$(TARGET)

clean:
	rm -f $(TARGET)

# These targets do not name files, so always run their recipes.
.PHONY: all run clean