Bug with OpenMP device - repro and fix attached - causes no optimization on OpenMP kernels by default · Issue #791 · libocca/occa · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content
Bug with OpenMP device - repro and fix attached - causes no optimization on OpenMP kernels by default #791
Open
@Notargets

Description

@Notargets

OCCA OpenMP Device Missing Default Optimization Flags

Bug Description

OpenMP device in OCCA fails to apply default -O3 optimization flags, causing 7.5x performance degradation. Discovered in gocca's kernel_program_parallel_test.go.

Reproduction

make clean
make

Results

Matrix-vector multiplication benchmark (640K chunks × 56×56 matrix):

| Device | Flags        | Performance   |
|--------|--------------|---------------|
| OpenMP | default      | 24.06 GFLOPS  |
| OpenMP | explicit -O3 | 180.48 GFLOPS |
| Serial | default      | 5.72 GFLOPS   |
| Serial | explicit -O3 | 5.72 GFLOPS   |

Issue: OpenMP default is 7.5x slower than OpenMP with -O3. Serial correctly applies -O3 by default (identical performance).

Root Cause

In src/occa/internal/modes/openmp/device.cpp:95, the code uses += on potentially non-existent compiler_flags:

allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;

When compiler_flags has not been set, this creates the property as a string containing only the OpenMP flag, so the default -O3 is never applied.

Fix

diff --git a/src/occa/internal/modes/openmp/device.cpp b/src/occa/internal/modes/openmp/device.cpp
index eb354a20..9dc8eeeb 100644
--- a/src/occa/internal/modes/openmp/device.cpp
+++ b/src/occa/internal/modes/openmp/device.cpp
@@ -101,33 +101,39 @@ namespace occa {
         compiler = "cl.exe";
 #endif
       }
 
       int vendor = allKernelProps["vendor"];
       // Check if we need to re-compute the vendor
       if (compiler.size()) {
         vendor = sys::compilerVendor(compiler);
       }
 
+
       if (compiler != lastCompiler) {
         lastCompiler = compiler;
         lastCompilerOpenMPFlag = openmp::compilerFlag(vendor, compiler);
 
         if (lastCompilerOpenMPFlag == openmp::notSupported) {
           io::stderr << "Compiler [" << (std::string) allKernelProps["compiler"]
                      << "] does not support OpenMP, defaulting to [Serial] mode\n";
         }
       }
 
       const bool usingOpenMP = (lastCompilerOpenMPFlag != openmp::notSupported);
       if (usingOpenMP) {
-        allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
+		  if (!allKernelProps.has("compiler_flags") ||
+				  allKernelProps["compiler_flags"].toString().empty()) {
+			  allKernelProps["compiler_flags"] = "-O3";
+		  }
+		  allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
+   //     allKernelProps["compiler_flags"] += " " + lastCompilerOpenMPFlag;
       }
 
       modeKernel_t *k = serial::device::buildKernel(filename,
                                                     kernelName,
                                                     kernelHash,
                                                     allKernelProps);
 
       if (k && usingOpenMP) {
         k->modeDevice->removeKernelRef(k);
         k->modeDevice = this;

Workaround

// Workaround: pass -O3 explicitly through the kernel properties when creating
// the OpenMP device, so kernels do not depend on the missing default.
occa::device device({
  {"mode", "OpenMP"},
  {"kernel", {{"compiler_flags", "-O3"}}}
});

Repro

// openmp_bug_demo.cpp
#include <iostream>
#include <iomanip>
#include <chrono>
#include <vector>
#include <cmath>
#include <occa.hpp>

// Kernel source for the benchmark kernel `matvec`, kept as a raw string and
// compiled at runtime via device.buildKernelFromString().
//
// For every chunk in [0, N) the kernel multiplies the np x np row-major
// `matrix` by that chunk's np-element slice of `input` and stores the
// np-element result in the matching slice of `output`:
//   output[chunk*np + i] = sum_j matrix[i*np + j] * input[chunk*np + j]
//
// The @outer loop walks blocks of `chunks_per_block` chunks and the @inner
// loop walks the chunks inside one block; the `chunk < N` guard skips the
// padding iterations of the last, partially filled block.
const char* matvec_kernel_source = R"KERNEL(
@kernel void matvec(const int N,
                    const int np,
                    const int chunks_per_block,
                    const double* matrix,
                    const double* input,
                    double* output) {
  // N is total number of chunks, np is matrix size (56)
  // chunks_per_block determines how many chunks each @inner processes
  for (int block = 0; block < (N + chunks_per_block - 1) / chunks_per_block; ++block; @outer) {
    for (int chunk_in_block = 0; chunk_in_block < chunks_per_block; ++chunk_in_block; @inner) {
      int chunk = block * chunks_per_block + chunk_in_block;
      if (chunk < N) {
        for (int i = 0; i < np; ++i) {
          double sum = 0.0;
          for (int j = 0; j < np; ++j) {
            sum += matrix[i*np + j] * input[chunk*np + j];
          }
          output[chunk*np + i] = sum;
        }
      }
    }
  }
}
)KERNEL";

// Result type returned by benchmarkMatvec().
struct BenchmarkResult {
  // Presumably the measured elapsed time in milliseconds — TODO confirm
  // against the code that fills it in (not visible here).
  double time_ms;
  // Presumably the achieved throughput in GFLOP/s — TODO confirm against the
  // code that fills it in (not visible here).
  double gflops;
};

BenchmarkResult benchmarkMatvec(occa::device& device,
                               int N, int np, int chunks_per_block, int iterations) {
  // N = number of chunks, np = 56 (matrix size)
  size_t matrix_size = np * np * sizeof(double);
  size_t vector_size = N * np * sizeof(double);

  occa::memory o_matrix = device.malloc(matrix_size);
  occa::memory o_input = device.malloc(vector_size);
  occa::memory o_output = device.malloc(vector_size);

  // Initialize host data
  std::vector<double> h_matrix(np * np);
  std::vector<double> h_input(N * np);

  // Initialize matrix with some pattern
  for (int i = 0; i < np * np; ++i) {
    h_matrix[i] = (double)(i % 100) / 100.0;
  }

  // Initialize input vector
  for (int i = 0; i < N * np; ++i) {
    h_input[i] = (double)(i % 100) / 100.0;
  }

  // Copy to device
  o_matrix.copyFrom(h_matrix.data());
  o_input.copyFrom(h_input.data());

  // Build kernel
  occa::kernel kernel = device.buildKernelFromString(
    matvec_kernel_source,
    "matvec"
  );

  // Warmup
  for (int i = 0; i < 5; ++i) {
    kernel(N, np, chunks_per_block, o_matrix, o_input, o_output);
  }
  device.finish();

  // Benchmark
  auto start = std::chrono::high_resolution_clock::now();

  for (int i = 0; i < iterations; ++i) {
    kernel(N, np, chunks_per_block, o_matrix, o_input, o_output);
  }

# Makefile for OCCA OpenMP Compiler Flag Bug Demonstration
#
# `make` builds the demo and runs it; `make clean` removes the binary.
# NOTE: every recipe line below must start with a TAB character. Space
# indentation makes `make` stop with "missing separator".

CXX = g++
CXXFLAGS = -std=c++11 -O3 -fopenmp
LDFLAGS = -locca -fopenmp

# OCCA include and library paths
# Adjust these if OCCA is installed in a non-standard location
OCCA_INCLUDE = -I/usr/local/include
OCCA_LIB = -L/usr/local/lib

TARGET = openmp_bug_demo
SOURCE = openmp_bug_demo.cpp

# Default goal: build the demo, then run it.
all: $(TARGET) run

# Compile and link the demo. Libraries come after the source file so the
# linker can resolve its references to OCCA.
$(TARGET): $(SOURCE)
	$(CXX) $(CXXFLAGS) $(OCCA_INCLUDE) $(SOURCE) -o $(TARGET) $(OCCA_LIB) $(LDFLAGS)

# Run the demo (rebuilding it first if the source changed).
run: $(TARGET)
	@echo "Running OCCA OpenMP bug demonstration..."
	@echo "========================================"
	@./$(TARGET)

clean:
	rm -f $(TARGET)

.PHONY: all run clean

Metadata

Metadata

Assignees

No one assigned

    Labels

    OpenMP, bug (Use this label when reporting bugs!)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions

      0