[Solved] What would be the source code like to add two `N x N x N x N` tensors? [closed]

Question

What would be the source code like to add two N x N x N x N tensors?
If we have four dimensions in our problem, how can we write the add() function as we don’t have more than three dimensions?

As is usual in computer programming, there are many ways to do it.

Here is a demonstration of 4 methods

using a 1D grid (x), striding
using a 2D grid (x,y), with the grid “striding” in z, and w
using a 3D grid (x,y,z) with the grid “striding” in w
using a 3D grid (x,y,z) but “encoding” the w dimension in the x grid variable, therefore not requiring a loop in kernel code

example:

$ cat t2231.cu
#include <iostream>

using it = int;
const it NPOW = 6;         // 2^6 = 64
const it N = (1<<NPOW);    // the 4th power of this must fit in type it!
using dt = int;

//1D
__global__ void k1(dt *a, dt *b, dt *c, it n){

  for (it i = blockIdx.x*blockDim.x+threadIdx.x; i < n*n*n*n; i += gridDim.x*blockDim.x) // 1D grid-stride loop
    c[i] = a[i] + b[i];
}

//2D, "striding" in z, w, must launch threads to cover x,y
__global__ void k2(dt *a, dt *b, dt *c, it n){

  it x = blockIdx.x*blockDim.x+threadIdx.x;
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  if (x < n && y < n)
    for (it w = 0; w < n; w++)
      for (it z = 0; z < n; z++){
        it idx = (((((w*n)+z)*n)+y)*n)+x;
        c[idx] = a[idx]+b[idx];}
}

//3D, "striding" in w, must launch threads to cover x,y,z
__global__ void k3(dt *a, dt *b, dt *c, it n){

  it x = blockIdx.x*blockDim.x+threadIdx.x;
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  it z = blockIdx.z*blockDim.z+threadIdx.z;
  if (x < n && y < n && z < n)
    for (it w = 0; w < n; w++){
      it idx = (((((w*n)+z)*n)+y)*n)+x;
      c[idx] = a[idx]+b[idx];}
}

//4D, extracting w from x, must launch threads to cover w*x,y,z
__global__ void k4(dt *a, dt *b, dt *c, it n){

  it px = blockIdx.x*blockDim.x+threadIdx.x;
  it w = px>>NPOW;
  it x = px-(w<<NPOW);
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  it z = blockIdx.z*blockDim.z+threadIdx.z;
  if (x < n && y < n && z < n && w < n){
    it idx = (((((w*n)+z)*n)+y)*n)+x;
    c[idx] = a[idx]+b[idx];}
}


int main(){
  it sz = N*N*N*N;
  dt *a = new dt[sz];
  dt *b = new dt[sz];
  dt *c = new dt[sz];
  dt *d_a, *d_b, *d_c;
  for (it i = 0; i < sz; i++) {a[i] = i+1; b[i] = i+1;}
  cudaMalloc(&d_a, sizeof(d_a[0])*sz);
  cudaMalloc(&d_b, sizeof(d_a[0])*sz);
  cudaMalloc(&d_c, sizeof(d_a[0])*sz);
  cudaMemcpy(d_a, a, sizeof(d_a[0])*sz, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, sizeof(d_b[0])*sz, cudaMemcpyHostToDevice);
  // choose "arbitrary" dimensions for 1D case
  k1<<<80, 1024>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error1" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y to at least cover N,N for 2D case
  dim3 block2(8,8);
  dim3 grid2((N+block2.x-1)/block2.x, (N+block2.y-1)/block2.y);
  k2<<<grid2, block2>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error2" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y,z to at least cover N,N,N for 3D case
  dim3 block3(8,8,8);
  dim3 grid3((N+block3.x-1)/block3.x, (N+block3.y-1)/block3.y, (N+block3.z-1)/block3.z);
  k3<<<grid3, block3>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error3" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y,z to at least cover N*N,N,N for 4D case
  dim3 grid4((N*N+block3.x-1)/block3.x, (N+block3.y-1)/block3.y, (N+block3.z-1)/block3.z);
  k4<<<grid4, block3>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error4" << std::endl;
}

$ nvcc -o t2231 t2231.cu
$ compute-sanitizer ./t2231
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$

Accepted Answer

What would be the source code like to add two N x N x N x N tensors?
If we have four dimensions in our problem, how can we write the add() function as we don’t have more than three dimensions?

As is usual in computer programming, there are many ways to do it.

Here is a demonstration of 4 methods

using a 1D grid (x), striding
using a 2D grid (x,y), with the grid “striding” in z, and w
using a 3D grid (x,y,z) with the grid “striding” in w
using a 3D grid (x,y,z) but “encoding” the w dimension in the x grid variable, therefore not requiring a loop in kernel code

example:

$ cat t2231.cu
#include <iostream>

using it = int;
const it NPOW = 6;         // 2^6 = 64
const it N = (1<<NPOW);    // the 4th power of this must fit in type it!
using dt = int;

//1D
__global__ void k1(dt *a, dt *b, dt *c, it n){

  for (it i = blockIdx.x*blockDim.x+threadIdx.x; i < n*n*n*n; i += gridDim.x*blockDim.x) // 1D grid-stride loop
    c[i] = a[i] + b[i];
}

//2D, "striding" in z, w, must launch threads to cover x,y
__global__ void k2(dt *a, dt *b, dt *c, it n){

  it x = blockIdx.x*blockDim.x+threadIdx.x;
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  if (x < n && y < n)
    for (it w = 0; w < n; w++)
      for (it z = 0; z < n; z++){
        it idx = (((((w*n)+z)*n)+y)*n)+x;
        c[idx] = a[idx]+b[idx];}
}

//3D, "striding" in w, must launch threads to cover x,y,z
__global__ void k3(dt *a, dt *b, dt *c, it n){

  it x = blockIdx.x*blockDim.x+threadIdx.x;
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  it z = blockIdx.z*blockDim.z+threadIdx.z;
  if (x < n && y < n && z < n)
    for (it w = 0; w < n; w++){
      it idx = (((((w*n)+z)*n)+y)*n)+x;
      c[idx] = a[idx]+b[idx];}
}

//4D, extracting w from x, must launch threads to cover w*x,y,z
__global__ void k4(dt *a, dt *b, dt *c, it n){

  it px = blockIdx.x*blockDim.x+threadIdx.x;
  it w = px>>NPOW;
  it x = px-(w<<NPOW);
  it y = blockIdx.y*blockDim.y+threadIdx.y;
  it z = blockIdx.z*blockDim.z+threadIdx.z;
  if (x < n && y < n && z < n && w < n){
    it idx = (((((w*n)+z)*n)+y)*n)+x;
    c[idx] = a[idx]+b[idx];}
}


int main(){
  it sz = N*N*N*N;
  dt *a = new dt[sz];
  dt *b = new dt[sz];
  dt *c = new dt[sz];
  dt *d_a, *d_b, *d_c;
  for (it i = 0; i < sz; i++) {a[i] = i+1; b[i] = i+1;}
  cudaMalloc(&d_a, sizeof(d_a[0])*sz);
  cudaMalloc(&d_b, sizeof(d_a[0])*sz);
  cudaMalloc(&d_c, sizeof(d_a[0])*sz);
  cudaMemcpy(d_a, a, sizeof(d_a[0])*sz, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b, sizeof(d_b[0])*sz, cudaMemcpyHostToDevice);
  // choose "arbitrary" dimensions for 1D case
  k1<<<80, 1024>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error1" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y to at least cover N,N for 2D case
  dim3 block2(8,8);
  dim3 grid2((N+block2.x-1)/block2.x, (N+block2.y-1)/block2.y);
  k2<<<grid2, block2>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error2" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y,z to at least cover N,N,N for 3D case
  dim3 block3(8,8,8);
  dim3 grid3((N+block3.x-1)/block3.x, (N+block3.y-1)/block3.y, (N+block3.z-1)/block3.z);
  k3<<<grid3, block3>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error3" << std::endl;
  cudaMemset(d_c, 0, sizeof(d_c[0])*sz);
  // choose x,y,z to at least cover N*N,N,N for 4D case
  dim3 grid4((N*N+block3.x-1)/block3.x, (N+block3.y-1)/block3.y, (N+block3.z-1)/block3.z);
  k4<<<grid4, block3>>>(d_a, d_b, d_c, N);
  cudaMemcpy(c, d_c, sizeof(d_c[0])*sz, cudaMemcpyDeviceToHost);
  for (it i = 0; i < sz; i++) if (c[i] != a[i]+b[i]) std::cout << "Error4" << std::endl;
}

$ nvcc -o t2231 t2231.cu
$ compute-sanitizer ./t2231
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
$