Unable to run samples - cudaMalloc fills all the system memory · Issue #34 · NVIDIA/gvdb-voxels · GitHub

Unable to run samples - cudaMalloc fills all the system memory #34


Closed
AndreFrelicot opened this issue Oct 13, 2018 · 2 comments

@AndreFrelicot

I'm building on Windows 10 (I also tried on Ubuntu Linux; same problem).

When I run a sample, it blocks in the function allocReduceStorage in gvdb_1.1\shared_cudpp\src\cudpp\app\reduce_app.cu:

```cpp
[...]
void allocReduceStorage(CUDPPReducePlan *plan)
...
    case CUDPP_FLOAT:
        cudaMalloc(&plan->m_blockSums, blocks * sizeof(float));   // <-- blocks here
        break;
```

System:
OS: Windows 10, version 10.0.17134 (Build 17134)
Processor: Intel Core i7-8750H
Graphics card: NVIDIA GeForce GTX 1070 with Max-Q Design
RAM: 32 GB
GVDB: 1.1
CUDA: 10


Debugger state at the blocking cudaMalloc call:

```
plan    0x000001f057310b30 {m_threadsPerBlock=256 m_maxBlocks=64 m_blockSums=0xcdcdcdcdcdcdcdcd}    CUDPPReducePlan *
  CUDPPPlan    {m_config={algorithm=CUDPP_REDUCE (3) op=CUDPP_MAX (3) datatype=CUDPP_FLOAT (6) ...} m_numElements=1000000 ...}    CUDPPPlan
    __vfptr    0x00007ffb96c87928 {cudpp_1915x64d.dll!void(* CUDPPReducePlan::`vftable'[2])()}    void * *
    m_config    {algorithm=CUDPP_REDUCE (3) op=CUDPP_MAX (3) datatype=CUDPP_FLOAT (6) ...}    CUDPPConfiguration
    m_numElements    1000000    unsigned __int64
    m_numRows    1    unsigned __int64
    m_rowPitch    0    unsigned __int64
    m_planManager    0x000001f0571fe540 {m_deviceProps={name="GeForce GTX 1070 with Max-Q Design" ...}}    CUDPPManager *
  m_threadsPerBlock    256    unsigned int
  m_maxBlocks    64    unsigned int
  m_blockSums    0xcdcdcdcdcdcdcdcd    void *

(*((CUDPPPlan*)plan)).m_planManager->m_deviceProps    {name="GeForce GTX 1070 with Max-Q Design" ...}    cudaDeviceProp
  name    "GeForce GTX 1070 with Max-Q Design"    char[256]
  uuid    {bytes=...}    CUuuid_st
  luid    ...    char[8]
  luidDeviceNodeMask    1    unsigned int
  totalGlobalMem    8589934592    unsigned __int64
  sharedMemPerBlock    49152    unsigned __int64
  regsPerBlock    65536    int
  warpSize    32    int
  memPitch    2147483647    unsigned __int64
  maxThreadsPerBlock    1024    int
  maxThreadsDim    {1024, 1024, 64}    int[3]
  maxGridSize    {2147483647, 65535, 65535}    int[3]
  clockRate    1265500    int
  totalConstMem    65536    unsigned __int64
  major    6    int
  minor    1    int
  textureAlignment    512    unsigned __int64
  texturePitchAlignment    32    unsigned __int64
  deviceOverlap    1    int
  multiProcessorCount    16    int
  kernelExecTimeoutEnabled    1    int
  integrated    0    int
  canMapHostMemory    1    int
  computeMode    0    int
  maxTexture1D    131072    int
  maxTexture1DMipmap    16384    int
  maxTexture1DLinear    134217728    int
  maxTexture2D    {131072, 65536}    int[2]
  maxTexture2DMipmap    {32768, 32768}    int[2]
  maxTexture2DLinear    {131072, 65000, 2097120}    int[3]
  maxTexture2DGather    {32768, 32768}    int[2]
  maxTexture3D    {16384, 16384, 16384}    int[3]
  maxTexture3DAlt    {8192, 8192, 32768}    int[3]
  maxTextureCubemap    32768    int
  maxTexture1DLayered    {32768, 2048}    int[2]
  maxTexture2DLayered    {32768, 32768, 2048}    int[3]
  maxTextureCubemapLayered    {32768, 2046}    int[2]
  maxSurface1D    32768    int
  maxSurface2D    {131072, 65536}    int[2]
  maxSurface3D    {16384, 16384, 16384}    int[3]
  maxSurface1DLayered    {32768, 2048}    int[2]
  maxSurface2DLayered    {32768, 32768, 2048}    int[3]
  maxSurfaceCubemap    32768    int
  maxSurfaceCubemapLayered    {32768, 2046}    int[2]
  surfaceAlignment    512    unsigned __int64
  concurrentKernels    1    int
  ECCEnabled    0    int
  pciBusID    1    int
  pciDeviceID    0    int
  pciDomainID    0    int
  tccDriver    0    int
  asyncEngineCount    5    int
  unifiedAddressing    1    int
  memoryClockRate    4004000    int
  memoryBusWidth    256    int
  l2CacheSize    2097152    int
  maxThreadsPerMultiProcessor    2048    int
  streamPrioritiesSupported    1    int
  globalL1CacheSupported    1    int
  localL1CacheSupported    1    int
  sharedMemPerMultiprocessor    98304    unsigned __int64
  regsPerMultiprocessor    65536    int
  managedMemory    1    int
  isMultiGpuBoard    0    int
  multiGpuBoardGroupID    0    int
  hostNativeAtomicSupported    0    int
  singleToDoublePrecisionPerfRatio    32    int
  pageableMemoryAccess    0    int
  concurrentManagedAccess    0    int
  computePreemptionSupported    0    int
  canUseHostPointerForRegisteredMem    0    int
  cooperativeLaunch    0    int
  cooperativeMultiDeviceLaunch    0    int
  sharedMemPerBlockOptin    0    unsigned __int64
  pageableMemoryAccessUsesHostPageTables    0    int
  directManagedMemAccessFromHost    0    int
```
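Editor's note (a hedged sketch, not part of the original report): m_blockSums above still holds the 0xcdcdcdcdcdcdcdcd debug fill pattern, so the pointer has not been written yet, consistent with the cudaMalloc call never returning. A minimal standalone repro that checks the CUDA runtime's return codes can surface the failure instead of letting the process consume system memory; the file name and macro below are hypothetical and not part of gvdb or cudpp:

```cpp
// checked_malloc.cu -- hypothetical standalone repro; not part of gvdb or cudpp.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a readable message on any CUDA runtime error so the failing call
// and its error code are visible instead of a silent stall.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::fprintf(stderr, "%s failed: %s\n", #call,                 \
                         cudaGetErrorString(err_));                        \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

int main() {
    const unsigned int blocks = 64;   // mirrors m_maxBlocks in the dump above
    void *blockSums = nullptr;

    // The same allocation allocReduceStorage performs for CUDPP_FLOAT.
    CUDA_CHECK(cudaMalloc(&blockSums, blocks * sizeof(float)));
    std::printf("cudaMalloc of %zu bytes succeeded\n",
                static_cast<size_t>(blocks) * sizeof(float));

    CUDA_CHECK(cudaFree(blockSums));
    return 0;
}
```

If this tiny program also hangs or reports an error, the problem is likely in the CUDA installation or in the architectures the libraries were built for rather than in GVDB itself.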

@oursnoir commented Nov 20, 2018

Hello,

I had the exact same error (it also ate up all my CPU RAM) under Linux (Ubuntu 18.04 with a fresh CUDA 10 install from the NVIDIA repos).
The issue was resolved when I rebuilt the cudpp library in shared_cudpp, enforcing compilation with architecture flags that match my GPU.
NB: I think this issue is related to #26.
Best,
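Editor's note (a hedged sketch, not part of the thread): "architecture flags that match my GPU" refers to nvcc's -gencode/-arch settings. The debugger dump above shows major=6, minor=1 for the GTX 1070, i.e. compute capability 6.1, which would correspond to something like -gencode arch=compute_61,code=sm_61; the exact option name exposed by cudpp's CMake build may differ. The compute capability to target can be queried at runtime, for example:

```cpp
// query_cc.cu -- small hypothetical helper, not part of gvdb or cudpp: prints
// each device's compute capability in the form used by nvcc's
// -gencode arch=compute_XY,code=sm_XY flags.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        std::fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }
    for (int dev = 0; dev < count; ++dev) {
        cudaDeviceProp prop{};
        cudaGetDeviceProperties(&prop, dev);
        // e.g. "GeForce GTX 1070 with Max-Q Design: sm_61"
        std::printf("%s: sm_%d%d\n", prop.name, prop.major, prop.minor);
    }
    return 0;
}
```

If the architectures baked into cudpp do not cover, and cannot be JIT-compiled for, the installed GPU, CUDA calls may fail or misbehave, which seems consistent with the hang reported here.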

@AndreFrelicot
Author
