Before you go through the exercise solutions, you might want to check out the Chapter 3 notes.

Question 1

In this chapter we implemented a matrix multiplication kernel that has each thread produce one output matrix element. In this question, you will implement different matrix-matrix multiplication kernels and compare them.

a. Write a kernel that has each thread produce one output matrix row. Fill in the execution configuration parameters for the design.

b. Write a kernel that has each thread produce one output matrix column. Fill in the execution configuration parameters for the design.

c. Analyze the pros and cons of each of the two kernel designs.

Answer 1a

#include <iostream>
#include <cmath>
#include <cuda_runtime.h>
 
__global__
void MatrixMulKernel(const float* M, const float* N, float* P, int M_Height, int M_Width, int N_Width)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    
    if (row < M_Height)
    {
        for (int col = 0; col < N_Width; ++col)
        {
            float Pvalue = 0.0f;
            for (int k = 0; k < M_Width; ++k) {
                Pvalue += M[row * M_Width + k] * N[k * N_Width + col];
            }
            P[row * N_Width + col] = Pvalue;
        }
    }
}
 
int main()
{
    // Define non-square matrix dimensions
    const int M_Height = 3; // Matrix M height (rows)
    const int M_Width = 4;  // Matrix M width (cols) = Matrix N height (rows)
    const int N_Width = 5;  // Matrix N width (cols)
    
    // Calculate memory sizes
    const size_t size_M = M_Height * M_Width * sizeof(float);
    const size_t size_N = M_Width * N_Width * sizeof(float);
    const size_t size_P = M_Height * N_Width * sizeof(float);
 
    // Allocate host memory
    float *h_M = new float[M_Height * M_Width];
    float *h_N = new float[M_Width * N_Width];
    float *h_P = new float[M_Height * N_Width];
    float *h_expected = new float[M_Height * N_Width];
 
    // Initialize matrices with predictable pattern
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < M_Width; ++j) {
            h_M[i * M_Width + j] = i + 1;  // Matrix M: row values [1,1,1,1], [2,2,2,2], etc.
        }
    }
    
    for (int i = 0; i < M_Width; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            h_N[i * N_Width + j] = j + 1;  // Matrix N: column values [1,2,3,4,5], [1,2,3,4,5], etc.
        }
    }
 
    // Print input matrices
    std::cout << "Matrix M (" << M_Height << "x" << M_Width << "):" << std::endl;
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < M_Width; ++j) {
            std::cout << h_M[i * M_Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    std::cout << "\nMatrix N (" << M_Width << "x" << N_Width << "):" << std::endl;
    for (int i = 0; i < M_Width; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            std::cout << h_N[i * N_Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    // Compute expected result on CPU
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < M_Width; ++k) {
                sum += h_M[i * M_Width + k] * h_N[k * N_Width + j];
            }
            h_expected[i * N_Width + j] = sum;
        }
    }
 
    std::cout << "\nExpected Result Matrix (" << M_Height << "x" << N_Width << ") (CPU):" << std::endl;
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            std::cout << h_expected[i * N_Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    // Allocate device memory
    float *d_M, *d_N, *d_P;
    cudaMalloc(&d_M, size_M);
    cudaMalloc(&d_N, size_N);
    cudaMalloc(&d_P, size_P);
 
    // Copy matrices from host to device
    cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice);
    cudaMemcpy(d_N, h_N, size_N, cudaMemcpyHostToDevice);
 
    // Define block and grid dimensions
    dim3 blockDim(256);  // 1D block of 256 threads; each thread computes one output row
    dim3 gridDim(
        (M_Height + blockDim.x - 1) / blockDim.x
    );
 
    // Launch kernel
    MatrixMulKernel<<<gridDim, blockDim>>>(d_M, d_N, d_P, M_Height, M_Width, N_Width);
    cudaDeviceSynchronize();
 
    // Check for errors
    cudaError_t cudaErr = cudaGetLastError();
    if (cudaErr != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(cudaErr) << std::endl;
        return -1;
    }
 
    // Copy result from device to host
    cudaMemcpy(h_P, d_P, size_P, cudaMemcpyDeviceToHost);
 
    // Print result
    std::cout << "\nActual Result Matrix (" << M_Height << "x" << N_Width << ") (GPU):" << std::endl;
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            std::cout << h_P[i * N_Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    // Verify results
    bool correct = true;
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            int idx = i * N_Width + j;
            if (fabs(h_P[idx] - h_expected[idx]) > 1e-5) {
                correct = false;
                std::cout << "Mismatch at position [" << i << "][" << j << "]: CPU=" 
                          << h_expected[idx] << ", GPU=" << h_P[idx] << std::endl;
            }
        }
    }
 
    if (correct) {
        std::cout << "\nMatrix multiplication completed successfully!" << std::endl;
    } else {
        std::cout << "\nThere were errors in the matrix multiplication." << std::endl;
    }
 
    // Free device memory
    cudaFree(d_M);
    cudaFree(d_N);
    cudaFree(d_P);
    
    // Free host memory
    delete[] h_M;
    delete[] h_N;
    delete[] h_P;
    delete[] h_expected;
 
    return 0;
}

Answer 1b

#include <iostream>
#include <cmath>
#include <cuda_runtime.h>
 
__global__
void MatrixMulKernel(const float* M, const float* N, float* P, int M_Height, int M_Width, int N_Width)
{
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    
    if (col < N_Width)
    {
        for (int row = 0; row < M_Height; ++row)
        {
            float Pvalue = 0.0f;
            for (int k = 0; k < M_Width; ++k) {
                Pvalue += M[row * M_Width + k] * N[k * N_Width + col];
            }
            P[row * N_Width + col] = Pvalue;
        }
    }
}
 
int main()
{
    // Define non-square matrix dimensions
    const int M_Height = 3; // Matrix M height (rows)
    const int M_Width = 4;  // Matrix M width (cols) = Matrix N height (rows)
    const int N_Width = 5;  // Matrix N width (cols)
    
    // Calculate memory sizes
    const size_t size_M = M_Height * M_Width * sizeof(float);
    const size_t size_N = M_Width * N_Width * sizeof(float);
    const size_t size_P = M_Height * N_Width * sizeof(float);
 
    // Allocate host memory
    float *h_M = new float[M_Height * M_Width];
    float *h_N = new float[M_Width * N_Width];
    float *h_P = new float[M_Height * N_Width];
    float *h_expected = new float[M_Height * N_Width];
 
    // Initialize matrices with predictable pattern
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < M_Width; ++j) {
            h_M[i * M_Width + j] = i + 1;  // Matrix M: row values [1,1,1,1], [2,2,2,2], etc.
        }
    }
    
    for (int i = 0; i < M_Width; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            h_N[i * N_Width + j] = j + 1;  // Matrix N: column values [1,2,3,4,5], [1,2,3,4,5], etc.
        }
    }
 
    // Print input matrices
    std::cout << "Matrix M (" << M_Height << "x" << M_Width << "):" << std::endl;
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < M_Width; ++j) {
            std::cout << h_M[i * M_Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    std::cout << "\nMatrix N (" << M_Width << "x" << N_Width << "):" << std::endl;
    for (int i = 0; i < M_Width; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            std::cout << h_N[i * N_Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    // Compute expected result on CPU
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < M_Width; ++k) {
                sum += h_M[i * M_Width + k] * h_N[k * N_Width + j];
            }
            h_expected[i * N_Width + j] = sum;
        }
    }
 
    std::cout << "\nExpected Result Matrix (" << M_Height << "x" << N_Width << ") (CPU):" << std::endl;
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            std::cout << h_expected[i * N_Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    // Allocate device memory
    float *d_M, *d_N, *d_P;
    cudaMalloc(&d_M, size_M);
    cudaMalloc(&d_N, size_N);
    cudaMalloc(&d_P, size_P);
 
    // Copy matrices from host to device
    cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice);
    cudaMemcpy(d_N, h_N, size_N, cudaMemcpyHostToDevice);
 
    // Define block and grid dimensions
    dim3 blockDim(256);  // 1D block of 256 threads; each thread computes one output column
    dim3 gridDim(
        (N_Width + blockDim.x - 1) / blockDim.x
    );
 
    // Launch kernel
    MatrixMulKernel<<<gridDim, blockDim>>>(d_M, d_N, d_P, M_Height, M_Width, N_Width);
    cudaDeviceSynchronize();
 
    // Check for errors
    cudaError_t cudaErr = cudaGetLastError();
    if (cudaErr != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(cudaErr) << std::endl;
        return -1;
    }
 
    // Copy result from device to host
    cudaMemcpy(h_P, d_P, size_P, cudaMemcpyDeviceToHost);
 
    // Print result
    std::cout << "\nActual Result Matrix (" << M_Height << "x" << N_Width << ") (GPU):" << std::endl;
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            std::cout << h_P[i * N_Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    // Verify results
    bool correct = true;
    for (int i = 0; i < M_Height; ++i) {
        for (int j = 0; j < N_Width; ++j) {
            int idx = i * N_Width + j;
            if (fabs(h_P[idx] - h_expected[idx]) > 1e-5) {
                correct = false;
                std::cout << "Mismatch at position [" << i << "][" << j << "]: CPU=" 
                          << h_expected[idx] << ", GPU=" << h_P[idx] << std::endl;
            }
        }
    }
 
    if (correct) {
        std::cout << "\nMatrix multiplication completed successfully!" << std::endl;
    } else {
        std::cout << "\nThere were errors in the matrix multiplication." << std::endl;
    }
 
    // Free device memory
    cudaFree(d_M);
    cudaFree(d_N);
    cudaFree(d_P);
    
    // Free host memory
    delete[] h_M;
    delete[] h_N;
    delete[] h_P;
    delete[] h_expected;
 
    return 0;
}

Answer 1c

Pros: Both designs use a simple 1D grid and 1D blocks, so the execution configuration is easy to set up. Depending on the matrix shape, we can choose where to spend the parallelism: the row-per-thread kernel scales with the number of rows, while the column-per-thread kernel scales with the number of columns.

Cons: Each thread computes an entire output row or column, so it does far more work and issues far more global memory accesses than a one-element-per-thread kernel. If the matrix has few rows (or columns), much of the GPU sits idle, and neither design reuses data, so both are memory-bound and inefficient.
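
As a concrete illustration of the shape dependence (a minimal host-side sketch with made-up dimensions, not part of the original answer), compare how many blocks each design launches for a tall, narrow output matrix:

#include <iostream>

int main()
{
    // Hypothetical dimensions chosen only to illustrate the trade-off.
    const int M_Height = 100000;      // many output rows
    const int N_Width  = 8;           // few output columns
    const int threadsPerBlock = 256;

    // Row-per-thread kernel (Answer 1a): parallelism scales with the row count.
    int rowBlocks = (M_Height + threadsPerBlock - 1) / threadsPerBlock;

    // Column-per-thread kernel (Answer 1b): parallelism scales with the column count.
    int colBlocks = (N_Width + threadsPerBlock - 1) / threadsPerBlock;

    std::cout << "Row kernel:    " << rowBlocks << " blocks, "
              << rowBlocks * threadsPerBlock << " threads\n";   // 391 blocks, 100096 threads
    std::cout << "Column kernel: " << colBlocks << " blocks, "
              << colBlocks * threadsPerBlock << " threads\n";   // 1 block, 256 threads (only 8 do useful work)
    return 0;
}

For a short, wide matrix the situation is reversed, which is why the preferable design depends on the matrix shape.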

Question 2

A matrix-vector multiplication takes an input matrix B and a vector C and produces one output vector A. Each element of the output vector A is the dot product of one row of the input matrix B and C, that is, A[i] = Σ_j B[i][j] * C[j].

For simplicity we will handle only square matrices whose elements are single-precision floating-point numbers. Write a matrix-vector multiplication kernel and the host stub function that can be called with four parameters: pointer to the output matrix, pointer to the input matrix, pointer to the input vector, and the number of elements in each dimension. Use one thread to calculate an output vector element.

Answer 2

#include <iostream>
#include <cmath>
#include <cuda_runtime.h>
 
__global__
void MatrixMulKernel(const float* M, const float* V, float* P, int Width)
{
    int row = blockIdx.x*blockDim.x + threadIdx.x;
    
    if (row < Width)
    {
        float Pvalue = 0.0f;
        for (int col = 0; col < Width; ++col) {
            Pvalue += M[row * Width + col] * V[col];
        }
        P[row] = Pvalue;
    }
}
 
int main()
{
    const int Width = 4; // Smaller matrix for easier verification
    const size_t size_m = Width * Width * sizeof(float);
    const size_t size_v = Width * sizeof(float);
 
    float *h_M = new float[Width * Width];
    float *h_V = new float[Width];
    float *h_P = new float[Width];
    float *h_expected = new float[Width];
 
    // Initialize matrices with predictable pattern
    for (int i = 0; i < Width; ++i) {
        for (int j = 0; j < Width; ++j) {
            h_M[i * Width + j] = i + 1; // Matrix M: row values [1,1,1,1], [2,2,2,2], etc.
        }
    }
 
    for (int i = 0; i < Width; ++i) {
        h_V[i] = i + 1; // Vector V: [1,2,3,4]
    }
 
    // Print input matrices
    std::cout << "Matrix M:" << std::endl;
    for (int i = 0; i < Width; ++i) {
        for (int j = 0; j < Width; ++j) {
            std::cout << h_M[i * Width + j] << " ";
        }
        std::cout << std::endl;
    }
 
    std::cout << "\nVector V:" << std::endl;
    for (int i = 0; i < Width; ++i) {
        std::cout << h_V[i] << " ";
    }
    std::cout << std::endl;
 
    // Compute expected result on CPU
    for (int i = 0; i < Width; ++i) {
        float sum = 0.0f;
        for (int j = 0; j < Width; ++j) {
            sum += h_M[i * Width + j] * h_V[j];
        }
        h_expected[i] = sum;
    }
 
    std::cout << "\nExpected Result Vector (CPU):" << std::endl;
    for (int i = 0; i < Width; ++i) {
        std::cout << h_expected[i] << " ";
    }
    std::cout << std::endl;
 
    float *d_M, *d_V, *d_P;
    cudaMalloc(&d_M, size_m);
    cudaMalloc(&d_V, size_v);
    cudaMalloc(&d_P, size_v);
 
    cudaMemcpy(d_M, h_M, size_m, cudaMemcpyHostToDevice);
    cudaMemcpy(d_V, h_V, size_v, cudaMemcpyHostToDevice);
 
    // Define block and grid dimensions
    dim3 blockDim(256);  // 1D block of 256 threads; each thread computes one output vector element
    dim3 gridDim(
        (Width + blockDim.x - 1) / blockDim.x
    );
 
    MatrixMulKernel<<<gridDim, blockDim>>>(d_M, d_V, d_P, Width);
    cudaDeviceSynchronize();
 
    cudaMemcpy(h_P, d_P, size_v, cudaMemcpyDeviceToHost);
 
    std::cout << "\nActual Result Vector (GPU):" << std::endl;
    for (int i = 0; i < Width; ++i) {
        std::cout << h_P[i] << " ";
    }
    std::cout << std::endl;
 
    // Verify results
    bool correct = true;
    for (int i = 0; i < Width; ++i) {
        if (fabs(h_P[i] - h_expected[i]) > 1e-5) {
            correct = false;
            std::cout << "Mismatch at position " << i << ": CPU=" 
                      << h_expected[i] << ", GPU=" << h_P[i] << std::endl;
        }
    }
 
    if (correct) {
        std::cout << "\nMatrix-Vector multiplication completed successfully!" << std::endl;
    } else {
        std::cout << "\nThere were errors in the matrix-vector multiplication." << std::endl;
    }
 
    cudaFree(d_M);
    cudaFree(d_V);
    cudaFree(d_P);
    delete[] h_M;
    delete[] h_V;
    delete[] h_P;
    delete[] h_expected;
 
    return 0;
}
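
The question also asks for a host stub that takes exactly four parameters (pointer to the output, pointer to the input matrix, pointer to the input vector, and the number of elements in each dimension); the main() above inlines that logic instead. A possible stub wrapping the kernel from the answer could look like the following sketch (the function name matVecMul and the choice to allocate and copy inside the stub are assumptions, not from the original):

// Hypothetical host stub: P_h, M_h and V_h are host pointers, Width is the
// number of elements in each dimension. Device allocation, the copies and
// the kernel launch all happen inside the stub.
void matVecMul(float* P_h, const float* M_h, const float* V_h, int Width)
{
    const size_t size_m = Width * Width * sizeof(float);
    const size_t size_v = Width * sizeof(float);

    float *d_M, *d_V, *d_P;
    cudaMalloc(&d_M, size_m);
    cudaMalloc(&d_V, size_v);
    cudaMalloc(&d_P, size_v);

    cudaMemcpy(d_M, M_h, size_m, cudaMemcpyHostToDevice);
    cudaMemcpy(d_V, V_h, size_v, cudaMemcpyHostToDevice);

    dim3 blockDim(256);
    dim3 gridDim((Width + blockDim.x - 1) / blockDim.x);
    MatrixMulKernel<<<gridDim, blockDim>>>(d_M, d_V, d_P, Width);

    cudaMemcpy(P_h, d_P, size_v, cudaMemcpyDeviceToHost);

    cudaFree(d_M);
    cudaFree(d_V);
    cudaFree(d_P);
}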

Question 3

Consider the following CUDA kernel and the corresponding host function that calls it:

01    __global__ void foo_kernel(float* a, float* b, unsigned int M,
       unsigned int N) {
02        unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
03        unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
04        if(row < M && col < N) {
05            b[row*N + col] = a[row*N + col]/2.1f + 4.8f;
06        }
07    }
08    void foo(float* a_d, float* b_d) {
09        unsigned int M = 150;
10        unsigned int N = 300;
11        dim3 bd(16, 32);
12        dim3 gd((N - 1)/16 + 1, (M - 1)/32 + 1);
13        foo_kernel <<< gd, bd >>>(a_d, b_d, M, N);
14    }

a. What is the number of threads per block?

b. What is the number of threads in the grid?

c. What is the number of blocks in the grid?

d. What is the number of threads that execute the code on line 05?

Answer 3

Answer a: 16 × 32 = 512 threads per block

Answer b: The grid is (300 - 1)/16 + 1 = 19 blocks wide and (150 - 1)/32 + 1 = 5 blocks tall, so it contains 19 × 5 × 512 = 48,640 threads

Answer c: 19 × 5 = 95 blocks per grid

Answer d: Only the threads with row < 150 and col < 300 pass the conditional on line 04, so 150 × 300 = 45,000 threads execute the code on line 05
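
As a quick sanity check of the arithmetic above, the same quantities can be computed on the host (a standalone sketch, not part of the original answer):

#include <iostream>

int main()
{
    const unsigned int M = 150, N = 300;
    const unsigned int bx = 16, by = 32;        // dim3 bd(16, 32)
    const unsigned int gx = (N - 1) / bx + 1;   // 19 blocks in x
    const unsigned int gy = (M - 1) / by + 1;   // 5 blocks in y

    std::cout << "Threads per block:         " << bx * by << "\n";            // 512
    std::cout << "Blocks in the grid:        " << gx * gy << "\n";            // 95
    std::cout << "Threads in the grid:       " << gx * gy * bx * by << "\n";  // 48640
    std::cout << "Threads executing line 05: " << M * N << "\n";              // 45000
    return 0;
}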

Question 4

Consider a 2D matrix with a width of 400 and a height of 500. The matrix is stored as a one-dimensional array. Specify the array index of the matrix element at row 20 and column 10:

a. If the matrix is stored in row-major order.

b. If the matrix is stored in column-major order.

Answer 4

Answer a: row-major index = row × width + col = 20 × 400 + 10 = 8010

Answer b: column-major index = col × height + row = 10 × 500 + 20 = 5020

Question 5

Consider a 3D tensor with a width of 400, a height of 500, and a depth of 300. The tensor is stored as a one-dimensional array in row-major order. Specify the array index of the tensor element at x = 10, y = 20, and z = 5.

Answer 5

row-major index = z × (width × height) + y × width + x = 5 × (400 × 500) + 20 × 400 + 10 = 1,008,010
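
The same index calculations can be written as small host-side helpers (a sketch with assumed function names, covering Questions 4 and 5):

#include <iostream>

// Row-major 2D: the elements of a row are contiguous in memory.
int rowMajor2D(int row, int col, int width) { return row * width + col; }

// Column-major 2D: the elements of a column are contiguous in memory.
int colMajor2D(int row, int col, int height) { return col * height + row; }

// Row-major 3D: whole z-planes of width*height elements are contiguous.
int rowMajor3D(int x, int y, int z, int width, int height) {
    return z * width * height + y * width + x;
}

int main()
{
    std::cout << rowMajor2D(20, 10, 400) << "\n";         // 8010    (Answer 4a)
    std::cout << colMajor2D(20, 10, 500) << "\n";         // 5020    (Answer 4b)
    std::cout << rowMajor3D(10, 20, 5, 400, 500) << "\n"; // 1008010 (Answer 5)
    return 0;
}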