#include "util_arch.cuh"        // CUB_PTX_ARCH, CUB_RUNTIME_FUNCTION
#include "util_debug.cuh"       // CubDebug
#include "util_namespace.cuh"
#include "util_macro.cuh"
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
/**
 * Empty kernel for querying PTX manifest of this file (EmptyKernel<void>)
 */
template <typename T>
__global__ void EmptyKernel(void) { }
/**
 * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
 */
template <int ALLOCATIONS>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t AliasTemporaries(
    void    *d_temp_storage,                    ///< [in] Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES   = 256;
    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests, rounding each request up to ALIGN_BYTES
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias each allocation within the single storage blob
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}
#endif  // DOXYGEN_SHOULD_SKIP_THIS
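/*
 * Usage sketch (illustrative only): DispatchTwoBuffers below is a hypothetical
 * caller, not part of CUB.  It shows the two-phase temporary-storage pattern
 * that AliasTemporaries supports: when d_temp_storage is NULL only the required
 * size is reported; otherwise the single user-provided blob is partitioned into
 * aligned sub-allocations.
 *
 *     template <typename T>
 *     cudaError_t DispatchTwoBuffers(
 *         void    *d_temp_storage,
 *         size_t  &temp_storage_bytes,
 *         int     num_items)
 *     {
 *         // Two sub-allocations carved from one storage blob
 *         void*  allocations[2];
 *         size_t allocation_sizes[2] = {
 *             num_items * sizeof(T),        // e.g., an alternate keys buffer
 *             num_items * sizeof(int) };    // e.g., per-item flags
 *
 *         cudaError_t error = AliasTemporaries(
 *             d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
 *         if (error != cudaSuccess) return error;
 *
 *         // First invocation: caller only wanted the required size
 *         if (d_temp_storage == NULL) return cudaSuccess;
 *
 *         T*   d_keys_alt = static_cast<T*>(allocations[0]);
 *         int* d_flags    = static_cast<int*>(allocations[1]);
 *         // ... launch kernels using d_keys_alt and d_flags ...
 *         return cudaSuccess;
 *     }
 */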
/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(
    int &ptx_version)                           ///< [out] PTX version
{
    // Local type whose use forces EmptyKernel<void> to be instantiated in this translation unit
    struct Dummy
    {
        typedef void (*EmptyKernelPtr)();

        CUB_RUNTIME_FUNCTION __forceinline__
        EmptyKernelPtr Empty() { return EmptyKernel<void>; }
    };

#ifndef CUB_RUNTIME_ENABLED
    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;
#elif (CUB_PTX_ARCH > 0)
    // Device-side: report the architecture currently being compiled for
    ptx_version = CUB_PTX_ARCH;
    return cudaSuccess;
#else
    // Host-side: query the PTX version that EmptyKernel<void> was compiled to
    cudaError_t error = cudaSuccess;
    do
    {
        cudaFuncAttributes empty_kernel_attrs;
        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
        ptx_version = empty_kernel_attrs.ptxVersion * 10;
    } while (0);
    return error;
#endif
}
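/*
 * Usage sketch (illustrative; the branching shown is hypothetical host code).
 * PtxVersion reports the PTX target the runtime will use in the same encoding
 * as SmVersion (e.g., 350 for an sm_35 build), so dispatch logic can branch on
 * it to pick architecture-specific tuning policies.
 *
 *     int ptx_version;
 *     cudaError_t error = PtxVersion(ptx_version);
 *     if (error == cudaSuccess)
 *     {
 *         if (ptx_version >= 350)
 *         {
 *             // e.g., select a tuning policy specialized for sm_35 and newer
 *         }
 *         else
 *         {
 *             // e.g., fall back to a policy for older architectures
 *         }
 *     }
 */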
/**
 * \brief Retrieves the SM version of \p device_ordinal (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(
    int &sm_version,                            ///< [out] SM version
    int device_ordinal)                         ///< [in] Device ordinal
{
#ifndef CUB_RUNTIME_ENABLED
    // CUDA API calls not supported from this device
    return cudaErrorInvalidConfiguration;
#else
    cudaError_t error = cudaSuccess;
    do
    {
        // Fill in SM version from the device's compute capability
        int major, minor;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
        sm_version = major * 100 + minor * 10;
    } while (0);
    return error;
#endif
}
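/*
 * Usage sketch (illustrative; the comparison shown is hypothetical host code).
 * SmVersion uses the same encoding as PtxVersion (major * 100 + minor * 10),
 * so the two can be compared directly, e.g. to detect when kernels were built
 * for an older PTX target than the device they will run on.
 *
 *     int device_ordinal, sm_version, ptx_version;
 *     if ((cudaGetDevice(&device_ordinal) == cudaSuccess) &&
 *         (SmVersion(sm_version, device_ordinal) == cudaSuccess) &&
 *         (PtxVersion(ptx_version) == cudaSuccess))
 *     {
 *         if (ptx_version < sm_version)
 *         {
 *             // Kernels will run as backward-compatible (possibly JIT-compiled) PTX
 *         }
 *     }
 */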
#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t SyncStream(cudaStream_t stream)
{
#if (CUB_PTX_ARCH == 0)
    return cudaStreamSynchronize(stream);
#else
    // Device-side launches cannot yet synchronize on a specific stream
    (void) stream;
    return cudaDeviceSynchronize();
#endif
}
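/*
 * Usage sketch (illustrative; ExampleKernel and the debug_synchronous flag are
 * hypothetical stand-ins for a typical dispatch routine).  SyncStream lets the
 * same source line block on the stream whether it executes on the host or
 * inside a device-side (CUDA dynamic parallelism) launch.
 *
 *     ExampleKernel<<<grid_size, block_threads, 0, stream>>>(d_in, d_out, num_items);
 *
 *     // Check for launch failure, then optionally block until the kernel completes
 *     cudaError_t error = CubDebug(cudaPeekAtLastError());
 *     if ((error == cudaSuccess) && debug_synchronous)
 *         error = CubDebug(SyncStream(stream));
 */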
// Computes maximum SM occupancy in thread blocks for \p kernel_ptr with \p block_threads per thread block
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,              ///< [out] maximum number of thread blocks that can reside on a single SM
    int         sm_version,                     ///< [in] The SM architecture to run on
    KernelPtr   kernel_ptr,                     ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads)                  ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED
    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);
#else
    // The occupancy API inspects the current device directly, so \p sm_version is not consulted here
    (void) sm_version;
    return cudaOccupancyMaxActiveBlocksPerMultiprocessor (
        &max_sm_occupancy, kernel_ptr, block_threads, 0);
#endif  // CUB_RUNTIME_ENABLED
}
#endif  // Do not document
/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int         &max_sm_occupancy,              ///< [out] maximum number of thread blocks that can reside on a single SM
    KernelPtr   kernel_ptr,                     ///< [in] Kernel pointer for which to compute SM occupancy
    int         block_threads)                  ///< [in] Number of threads per thread block
{
#ifndef CUB_RUNTIME_ENABLED
    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);
#else
    cudaError_t error = cudaSuccess;
    do
    {
        // Get the current device ordinal and its SM version
        int device_ordinal;
        if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
        int sm_version;
        if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
        // Compute occupancy via the overload above
        if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break;
    } while (0);
    return error;
#endif  // CUB_RUNTIME_ENABLED
}
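/*
 * Usage sketch (illustrative; ExampleKernel is a hypothetical kernel, and the
 * resulting occupancy depends on its register and shared-memory footprint).
 *
 *     // A kernel with nontrivial shared-memory usage
 *     template <typename T>
 *     __global__ void ExampleKernel(T *d_data)
 *     {
 *         __shared__ T buffer[1024];
 *         // ...
 *     }
 *
 *     // Determine SM occupancy for ExampleKernel<int> at 64 threads per block
 *     int max_sm_occupancy;
 *     MaxSmOccupancy(max_sm_occupancy, ExampleKernel<int>, 64);
 *
 *     // Dispatch code typically multiplies max_sm_occupancy by the number of
 *     // SMs (cudaDevAttrMultiProcessorCount) to size an occupancy-saturating grid
 */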