#define THREADS_PER_BLK 128
__shared__ float support[THREADS_PER_BLK+2]
Launch 8K thread blocks: 1024*1024 / THREADS_PER_BLK = 1024*1024 / 128 = 8192 blocks
#define THREADS_PER_BLK 128
__shared__ float support[THREADS_PER_BLK+2]
Launch 8K thread blocks: 1024*1024 / THREADS_PER_BLK = 1024*1024 / 128 = 8192 blocks