[Solved] list of white pixels indices in image using CUDA

Question

Following is a naive method to achieve the desired functionality:

Generate a mask of pixel indices, with dummy values for pixels whose value is zero.
Count the number of non-zero pixels.
Create an output vector with length equal to the non-zero count.
Copy the non-zero pixel indices from the generated mask to the output vector (a process known as stream compaction).

Following is sample code for the above-mentioned process.

Code

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/count.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <opencv2/opencv.hpp>


// Reports a failed CUDA runtime call and terminates the program.
//   err  : status returned by the CUDA call
//   file : source file of the call site
//   line : source line of the call site
// Returns normally only when err indicates success.
static void _check_err(cudaError_t err, const char* file, int line)
{
    // cudaSuccess is the zero status; every other value is a failure.
    if(err == cudaSuccess)
        return;

    printf("CUDA Error: %s\nFile: %s\nLine: %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}

// Checks the status of a CUDA runtime call, tagging a failure with the call site's file and line.
#define CHECK_ERR(err) _check_err((err), __FILE__, __LINE__)



// Builds a per-pixel index mask for an image with one byte per pixel.
//
// Each thread handles the pixel at (x, y) derived from its 2D block/thread
// coordinates. Threads that fall outside width x height do nothing, so the
// launch grid only has to cover the image.
//
//   input   : image bytes; the pixel at (x, y) is read from byte offset y * step + x
//   width   : number of pixels per row
//   height  : number of rows
//   step    : distance in bytes between the starts of consecutive rows
//   indices : output buffer of width * height entries in row-major order;
//             entry (x, y) receives (x, y) if the pixel is non-zero and
//             (-1, -1) otherwise
//
// input and indices are declared __restrict__ and therefore must not overlap.
__global__ void kernel_find_indices(const unsigned char* __restrict__ input, int width, int height, int step, int2* __restrict__ indices)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x >= width || y >= height)
        return;

    // Compute the offsets in size_t: y * step and y * width can exceed the
    // range of a 32-bit int for very large images.
    const size_t pixelOffset = static_cast<size_t>(y) * step + x;
    const size_t indexOffset = static_cast<size_t>(y) * width + x;

    // Actual index for non-zero pixels, dummy index (-1, -1) for zero pixels.
    indices[indexOffset] = input[pixelOffset] ? make_int2(x, y) : make_int2(-1, -1);
}


//Predicate telling whether an index entry belongs to a non-zero pixel,
//i.e. whether neither of its coordinates is the dummy value -1.
struct isNonZeroIndex
{
  //const-qualified so the predicate can also be invoked through a const
  //instance or reference; it holds no state.
  __host__ __device__ bool operator()(const int2 &idx) const
  {
    return (idx.x != -1) && (idx.y != -1);
  }
};


// Returns the (x, y) coordinates of every non-zero pixel of `input`,
// computed on the GPU.
//
//   input : 2D image with one byte per pixel (e.g. CV_8UC1). It may be a
//           non-continuous view (ROI / sub-matrix) of a larger image.
//
// Returns an empty vector for an empty image. Raises a cv::Exception (through
// CV_Assert) when the image is not 2D or does not have exactly one byte per
// pixel. A failing CUDA runtime call is reported by CHECK_ERR.
std::vector<cv::Point> getIndicesOfNonZeroPixels(cv::Mat input)
{
    std::vector<cv::Point> output;

    // Nothing to search; this also avoids launching a kernel with an empty grid.
    if(input.empty())
        return output;

    // The kernel reads one byte per pixel, so other layouts would give meaningless results.
    CV_Assert(input.dims == 2 && input.elemSize() == 1);

    const int width  = input.cols;
    const int height = input.rows;
    const size_t pixelCount = static_cast<size_t>(width) * height;

    // Device buffers are owned by device_vector so they are released on every
    // exit path, including when an exception propagates out of a Thrust call.
    thrust::device_vector<unsigned char> image_d(pixelCount);
    thrust::device_vector<int2> index_buffer_d(pixelCount);

    // Copy the image row by row into a tightly packed device buffer. Only the
    // `width` valid bytes of each host row are read, so row padding and
    // non-continuous matrices are handled without touching memory past the rows.
    CHECK_ERR(cudaMemcpy2D(thrust::raw_pointer_cast(image_d.data()), width,
                           input.ptr(), input.step,
                           width, height, cudaMemcpyHostToDevice));

    const dim3 block(16,16);
    const dim3 grid((width + block.x - 1) / block.x,
                    (height + block.y - 1) / block.y);

    //Generate an index mask with dummy values for indices with zero pixel value.
    //The device image is tightly packed, so its row step equals its width.
    kernel_find_indices<<<grid, block>>>(thrust::raw_pointer_cast(image_d.data()), width, height, width, thrust::raw_pointer_cast(index_buffer_d.data()));
    CHECK_ERR(cudaGetLastError());       //launch errors (e.g. invalid configuration)
    CHECK_ERR(cudaDeviceSynchronize());  //errors raised while the kernel was running

    const size_t nonZeroCount = static_cast<size_t>(
        thrust::count_if(index_buffer_d.begin(), index_buffer_d.end(), isNonZeroIndex()));

    //Keep only those indices whose pixel value is non-zero (stream compaction)
    thrust::device_vector<int2> compacted(nonZeroCount);
    thrust::copy_if(index_buffer_d.begin(), index_buffer_d.end(), compacted.begin(), isNonZeroIndex());

    //Copy non-zero pixel indices to host
    std::vector<int2> output_int2(nonZeroCount);
    thrust::copy(compacted.begin(), compacted.end(), output_int2.begin());

    //Convert vector<int2> to vector<cv::Point>
    output.reserve(nonZeroCount);
    for(size_t i = 0; i < nonZeroCount; i++)
        output.push_back(cv::Point(output_int2[i].x, output_int2[i].y));

    return output;
}

// Demo: builds a synthetic image, prints the coordinates of its non-zero
// pixels and then shows the image until a key is pressed.
void run_test()
{
    //Sample image: 100x100, single channel, all zeros except a filled square of value 255
    cv::Mat test = cv::Mat::zeros(100,100, CV_8UC1);
    const cv::Rect whiteSquare(5,5,20,20);
    cv::rectangle(test, whiteSquare, cv::Scalar::all(255), CV_FILLED);

    //Collect the coordinates of all non-zero pixels
    const std::vector<cv::Point> indices = getIndicesOfNonZeroPixels(test);

    //Print one "x, y" pair per line
    for(std::vector<cv::Point>::const_iterator it = indices.begin(); it != indices.end(); ++it)
    {
        printf("%d, %d\n", it->x, it->y);
    }

    //Display the image and wait for a key press
    cv::imshow("Sample", test);
    cv::waitKey();
}

// Program entry point: runs the demo. Command-line arguments are not used.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    run_test();
    return 0;
}

Compilation Command

nvcc -o nz nz.cu -arch=sm_61 -L/usr/local/lib -lopencv_core -lopencv_highgui -lopencv_imgproc

Please keep in mind that this code is for images of type 8UC1 (8-bit, single channel) only. You can easily extend it to other data types as required.

Accepted Answer