The following is a naive method to achieve the desired functionality:
- Generate a mask of pixel indices, writing a dummy index for every pixel whose value is zero.
- Count the number of non-zero pixels
- Create an output vector with length equal to non-zero count.
- Copy the non-zero pixel indices from the generated mask to the output vector (a process known as stream-compaction)
The following sample code implements the process described above.
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/copy.h>
#include <thrust/count.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <opencv2/opencv.hpp>
/**
 * Reports a failed CUDA runtime call and terminates the program.
 *
 * @param err   Status code returned by the CUDA runtime call.
 * @param file  Source file of the call site (supplied by CHECK_ERR).
 * @param line  Source line of the call site (supplied by CHECK_ERR).
 *
 * Does nothing when err is cudaSuccess. On any other status the error
 * description and call site are written to stderr and the process exits,
 * because continuing after a failed allocation or copy would only hand
 * invalid pointers to later calls.
 */
static void _check_err(cudaError_t err, const char* file, int line)
{
    // Successful calls must stay silent; only real failures are reported.
    if (err == cudaSuccess)
    {
        return;
    }

    const char* err_str = cudaGetErrorString(err);
    fprintf(stderr, "CUDA Error: %s\nFile: %s\nLine: %d\n", err_str, file, line);
    std::exit(EXIT_FAILURE);
}

// Wraps a CUDA runtime call so that a failure is reported together with the
// file and line of the call site.
#define CHECK_ERR(err) _check_err((err), __FILE__, __LINE__)
/**
 * Builds a per-pixel index mask for a single-channel 8-bit image.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel; threads whose
 * (x, y) falls outside the image do nothing, so the grid may be rounded up.
 *
 * @param input    Image data; row y starts at byte offset y * step.
 * @param width    Image width in pixels.
 * @param height   Image height in pixels.
 * @param step     Distance between the starts of consecutive rows, in bytes.
 * @param indices  Output of width * height entries, densely packed row by row.
 *                 Entry (y * width + x) receives (x, y) when the pixel is
 *                 non-zero and the sentinel (-1, -1) when it is zero.
 */
__global__ void kernel_find_indices(const unsigned char* input, int width, int height, int step, int2* indices)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < height)
    {
        // Offsets are computed in size_t so large images cannot overflow int.
        const size_t tidPixel = static_cast<size_t>(y) * step + x;   // padded input row
        const size_t tidIndex = static_cast<size_t>(y) * width + x;  // dense output row

        const unsigned char value = input[tidPixel];

        int2 index_to_write;
        if (value)
        {
            //Write actual index to pixels with non-zero value
            index_to_write.x = x;
            index_to_write.y = y;
        }
        else
        {
            //Write dummy index to pixels with zero value
            index_to_write.x = -1;
            index_to_write.y = -1;
        }

        indices[tidIndex] = index_to_write;
    }
}
// Predicate that tells real pixel indices apart from the (-1, -1) sentinel.
// Returns true only when neither coordinate equals -1. Callable from both
// host and device code, and const so it can be invoked on const instances.
struct isNonZeroIndex
{
    __host__ __device__ bool operator()(const int2 &idx) const
    {
        return (idx.x != -1) && (idx.y != -1);
    }
};
/**
 * Returns the (x, y) coordinates of every non-zero pixel of an 8-bit,
 * single-channel image, computed on the GPU.
 *
 * Steps: upload the image, let kernel_find_indices write one index (or a
 * (-1, -1) sentinel) per pixel, count the real indices, compact them with
 * thrust::copy_if and download the result.
 *
 * @param input  Image of type CV_8UC1. It may be non-continuous (e.g. a ROI);
 *               rows are uploaded individually using input.step.
 * @return       Coordinates of the non-zero pixels; empty for an empty image.
 *               Entries are in row-major order, since the mask is written row
 *               by row and thrust::copy_if preserves relative order.
 * @throws cv::Exception (via CV_Assert) when the image is not CV_8UC1.
 */
std::vector<cv::Point> getIndicesOfNonZeroPixels(cv::Mat input)
{
    std::vector<cv::Point> output;

    // An empty image has nothing to report, and a zero-sized grid would be an
    // invalid launch configuration.
    if (input.empty())
    {
        return output;
    }

    // The kernel reads exactly one byte per pixel.
    CV_Assert(input.type() == CV_8UC1);

    const size_t width = static_cast<size_t>(input.cols);
    const size_t height = static_cast<size_t>(input.rows);
    const size_t pixelCount = width * height;

    // device_vector owns the device memory, so it is released on every exit
    // path, including exceptions thrown by Thrust.
    thrust::device_vector<unsigned char> image_d(pixelCount);
    thrust::device_vector<int2> index_buffer_d(pixelCount);

    //Copy input image to device, dropping any row padding of the host image
    CHECK_ERR(cudaMemcpy2D(thrust::raw_pointer_cast(image_d.data()), width,
                           input.ptr(), input.step,
                           width, height, cudaMemcpyHostToDevice));

    const dim3 block(16, 16);
    dim3 grid;
    grid.x = (input.cols + block.x - 1) / block.x;
    grid.y = (input.rows + block.y - 1) / block.y;

    //Generate an index mask with dummy values for indices with zero pixel value
    //(the device image is densely packed, so its row step equals its width)
    kernel_find_indices<<<grid, block>>>(thrust::raw_pointer_cast(image_d.data()),
                                         input.cols, input.rows, input.cols,
                                         thrust::raw_pointer_cast(index_buffer_d.data()));
    // A kernel launch returns no status itself; query it explicitly.
    CHECK_ERR(cudaGetLastError());

    const size_t nonZeroCount = static_cast<size_t>(
        thrust::count_if(index_buffer_d.begin(), index_buffer_d.end(), isNonZeroIndex()));

    //Keep only those indices whose pixel value is non-zero (stream compaction)
    thrust::device_vector<int2> compacted(nonZeroCount);
    thrust::copy_if(index_buffer_d.begin(), index_buffer_d.end(), compacted.begin(), isNonZeroIndex());

    //Copy non-zero pixel indices to host (destination must already be sized)
    std::vector<int2> output_int2(nonZeroCount);
    thrust::copy(compacted.begin(), compacted.end(), output_int2.begin());

    //Convert vector<int2> to vector<cv::Point>
    output.reserve(nonZeroCount);
    for (size_t i = 0; i < nonZeroCount; i++)
    {
        output.push_back(cv::Point(output_int2[i].x, output_int2[i].y));
    }

    return output;
}
/**
 * Small demonstration: draws a filled 20x20 white square on a black 100x100
 * image, prints the coordinates of all non-zero pixels found by
 * getIndicesOfNonZeroPixels, then shows the image until a key is pressed.
 */
void run_test()
{
    //Generate a sample test image
    cv::Mat test = cv::Mat::zeros(100, 100, CV_8UC1);

    // A negative thickness asks cv::rectangle for a filled shape; the literal
    // avoids the legacy CV_FILLED macro, which newer OpenCV releases no longer
    // provide through the default headers.
    const int filledThickness = -1;
    cv::rectangle(test, cv::Rect(5, 5, 20, 20), cv::Scalar::all(255), filledThickness);

    //Get pixel indices of non-zero pixels
    std::vector<cv::Point> indices = getIndicesOfNonZeroPixels(test);

    //Display those indices
    for (size_t i = 0; i < indices.size(); i++)
    {
        printf("%d, %d\n", indices[i].x, indices[i].y);
    }

    //Show image
    cv::imshow("Sample", test);
    // imshow only schedules the draw; the window is rendered while waitKey
    // processes GUI events. 0 means wait until a key is pressed.
    cv::waitKey(0);
}
// Program entry point: runs the demonstration once and reports success.
int main(int argc, char** argv)
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    run_test();
    return 0;
}
Compilation command (assuming the code is saved as nz.cu):
nvcc -o nz nz.cu -arch=sm_61 -L/usr/local/lib -lopencv_core \
    -lopencv_highgui -lopencv_imgproc
Please keep in mind that this code works only for images of type 8UC1
(8-bit, single channel). You can easily extend it to other data types as required.
