Before you go through the exercise solutions, you might wanna check out the Chapter 3 notes.
Question 1
In this chapter we implemented a matrix multiplication kernel that has each thread produce one output matrix element. In this question, you will implement different matrix-matrix multiplication kernels and compare them.
a. Write a kernel that has each thread produce one output matrix row. Fill in the execution configuration parameters for the design.
b. Write a kernel that has each thread produce one output matrix column. Fill in the execution configuration parameters for the design.
c. Analyze the pros and cons of each of the two kernel designs.
Answer 1a
#include <iostream>#include <cuda_runtime.h>__global__void MatrixMulKernel(const float* M, const float* N, float* P, int M_Height, int M_Width, int N_Width){ int row = blockIdx.x*blockDim.x + threadIdx.x; if (row < M_Height) { for (int col = 0; col < N_Width; ++col) { float Pvalue = 0.0f; for (int k = 0; k < M_Width; ++k) { Pvalue += M[row * M_Width + k] * N[k * N_Width + col]; } P[row * N_Width + col] = Pvalue; } }}int main(){ // Define non-square matrix dimensions const int M_Height = 3; // Matrix M height (rows) const int M_Width = 4; // Matrix M width (cols) = Matrix N height (rows) const int N_Width = 5; // Matrix N width (cols) // Calculate memory sizes const size_t size_M = M_Height * M_Width * sizeof(float); const size_t size_N = M_Width * N_Width * sizeof(float); const size_t size_P = M_Height * N_Width * sizeof(float); // Allocate host memory float *h_M = new float[M_Height * M_Width]; float *h_N = new float[M_Width * N_Width]; float *h_P = new float[M_Height * N_Width]; float *h_expected = new float[M_Height * N_Width]; // Initialize matrices with predictable pattern for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < M_Width; ++j) { h_M[i * M_Width + j] = i + 1; // Matrix M: row values [1,1,1,1], [2,2,2,2], etc. } } for (int i = 0; i < M_Width; ++i) { for (int j = 0; j < N_Width; ++j) { h_N[i * N_Width + j] = j + 1; // Matrix N: column values [1,2,3,4,5], [1,2,3,4,5], etc. } } // Print input matrices std::cout << "Matrix M (" << M_Height << "x" << M_Width << "):" << std::endl; for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < M_Width; ++j) { std::cout << h_M[i * M_Width + j] << " "; } std::cout << std::endl; } std::cout << "\nMatrix N (" << M_Width << "x" << N_Width << "):" << std::endl; for (int i = 0; i < M_Width; ++i) { for (int j = 0; j < N_Width; ++j) { std::cout << h_N[i * N_Width + j] << " "; } std::cout << std::endl; } // Compute expected result on CPU for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < N_Width; ++j) { float sum = 0.0f; for (int k = 0; k < M_Width; ++k) { sum += h_M[i * M_Width + k] * h_N[k * N_Width + j]; } h_expected[i * N_Width + j] = sum; } } std::cout << "\nExpected Result Matrix (" << M_Height << "x" << N_Width << ") (CPU):" << std::endl; for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < N_Width; ++j) { std::cout << h_expected[i * N_Width + j] << " "; } std::cout << std::endl; } // Allocate device memory float *d_M, *d_N, *d_P; cudaMalloc(&d_M, size_M); cudaMalloc(&d_N, size_N); cudaMalloc(&d_P, size_P); // Copy matrices from host to device cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice); cudaMemcpy(d_N, h_N, size_N, cudaMemcpyHostToDevice); // Define block and grid dimensions dim3 blockDim(16*16); // Using larger block size for better performance dim3 gridDim( (M_Height + blockDim.x - 1) / blockDim.x ); // Launch kernel MatrixMulKernel<<<gridDim, blockDim>>>(d_M, d_N, d_P, M_Height, M_Width, N_Width); cudaDeviceSynchronize(); // Check for errors cudaError_t cudaErr = cudaGetLastError(); if (cudaErr != cudaSuccess) { std::cerr << "CUDA error: " << cudaGetErrorString(cudaErr) << std::endl; return -1; } // Copy result from device to host cudaMemcpy(h_P, d_P, size_P, cudaMemcpyDeviceToHost); // Print result std::cout << "\nActual Result Matrix (" << M_Height << "x" << N_Width << ") (GPU):" << std::endl; for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < N_Width; ++j) { std::cout << h_P[i * N_Width + j] << " "; } std::cout << std::endl; } // Verify results bool correct = true; for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < N_Width; ++j) { int idx = i * N_Width + j; if (fabs(h_P[idx] - h_expected[idx]) > 1e-5) { correct = false; std::cout << "Mismatch at position [" << i << "][" << j << "]: CPU=" << h_expected[idx] << ", GPU=" << h_P[idx] << std::endl; } } } if (correct) { std::cout << "\nMatrix multiplication completed successfully!" << std::endl; } else { std::cout << "\nThere were errors in the matrix multiplication." << std::endl; } // Free device memory cudaFree(d_M); cudaFree(d_N); cudaFree(d_P); // Free host memory delete[] h_M; delete[] h_N; delete[] h_P; delete[] h_expected; return 0;}
Answer 1b
#include <iostream>#include <cuda_runtime.h>__global__void MatrixMulKernel(const float* M, const float* N, float* P, int M_Height, int M_Width, int N_Width){ int col = blockIdx.x*blockDim.x + threadIdx.x; if (col < N_Width) { for (int row = 0; row < M_Height; ++row) { float Pvalue = 0.0f; for (int k = 0; k < M_Width; ++k) { Pvalue += M[row * M_Width + k] * N[k * N_Width + col]; } P[row * N_Width + col] = Pvalue; } }}int main(){ // Define non-square matrix dimensions const int M_Height = 3; // Matrix M height (rows) const int M_Width = 4; // Matrix M width (cols) = Matrix N height (rows) const int N_Width = 5; // Matrix N width (cols) // Calculate memory sizes const size_t size_M = M_Height * M_Width * sizeof(float); const size_t size_N = M_Width * N_Width * sizeof(float); const size_t size_P = M_Height * N_Width * sizeof(float); // Allocate host memory float *h_M = new float[M_Height * M_Width]; float *h_N = new float[M_Width * N_Width]; float *h_P = new float[M_Height * N_Width]; float *h_expected = new float[M_Height * N_Width]; // Initialize matrices with predictable pattern for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < M_Width; ++j) { h_M[i * M_Width + j] = i + 1; // Matrix M: row values [1,1,1,1], [2,2,2,2], etc. } } for (int i = 0; i < M_Width; ++i) { for (int j = 0; j < N_Width; ++j) { h_N[i * N_Width + j] = j + 1; // Matrix N: column values [1,2,3,4,5], [1,2,3,4,5], etc. } } // Print input matrices std::cout << "Matrix M (" << M_Height << "x" << M_Width << "):" << std::endl; for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < M_Width; ++j) { std::cout << h_M[i * M_Width + j] << " "; } std::cout << std::endl; } std::cout << "\nMatrix N (" << M_Width << "x" << N_Width << "):" << std::endl; for (int i = 0; i < M_Width; ++i) { for (int j = 0; j < N_Width; ++j) { std::cout << h_N[i * N_Width + j] << " "; } std::cout << std::endl; } // Compute expected result on CPU for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < N_Width; ++j) { float sum = 0.0f; for (int k = 0; k < M_Width; ++k) { sum += h_M[i * M_Width + k] * h_N[k * N_Width + j]; } h_expected[i * N_Width + j] = sum; } } std::cout << "\nExpected Result Matrix (" << M_Height << "x" << N_Width << ") (CPU):" << std::endl; for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < N_Width; ++j) { std::cout << h_expected[i * N_Width + j] << " "; } std::cout << std::endl; } // Allocate device memory float *d_M, *d_N, *d_P; cudaMalloc(&d_M, size_M); cudaMalloc(&d_N, size_N); cudaMalloc(&d_P, size_P); // Copy matrices from host to device cudaMemcpy(d_M, h_M, size_M, cudaMemcpyHostToDevice); cudaMemcpy(d_N, h_N, size_N, cudaMemcpyHostToDevice); // Define block and grid dimensions dim3 blockDim(16*16); // Using larger block size for better performance dim3 gridDim( (N_Width + blockDim.x - 1) / blockDim.x ); // Launch kernel MatrixMulKernel<<<gridDim, blockDim>>>(d_M, d_N, d_P, M_Height, M_Width, N_Width); cudaDeviceSynchronize(); // Check for errors cudaError_t cudaErr = cudaGetLastError(); if (cudaErr != cudaSuccess) { std::cerr << "CUDA error: " << cudaGetErrorString(cudaErr) << std::endl; return -1; } // Copy result from device to host cudaMemcpy(h_P, d_P, size_P, cudaMemcpyDeviceToHost); // Print result std::cout << "\nActual Result Matrix (" << M_Height << "x" << N_Width << ") (GPU):" << std::endl; for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < N_Width; ++j) { std::cout << h_P[i * N_Width + j] << " "; } std::cout << std::endl; } // Verify results bool correct = true; for (int i = 0; i < M_Height; ++i) { for (int j = 0; j < N_Width; ++j) { int idx = i * N_Width + j; if (fabs(h_P[idx] - h_expected[idx]) > 1e-5) { correct = false; std::cout << "Mismatch at position [" << i << "][" << j << "]: CPU=" << h_expected[idx] << ", GPU=" << h_P[idx] << std::endl; } } } if (correct) { std::cout << "\nMatrix multiplication completed successfully!" << std::endl; } else { std::cout << "\nThere were errors in the matrix multiplication." << std::endl; } // Free device memory cudaFree(d_M); cudaFree(d_N); cudaFree(d_P); // Free host memory delete[] h_M; delete[] h_N; delete[] h_P; delete[] h_expected; return 0;}
Answer 1c
Pros: Block and thread can use a 1D array for both
Depending on matrix shapes, we might want to spend more parallelization on rows/columns
Cons: More work per thread in terms of memory access - inefficient
Question 2
A matrix-vector multiplication takes an input matrix B and a vector C and produces one output vector A. Each element of the output vector A is the dot product of one row of the input matrix B and C, that is, A[i]=ΣjB[i][j]∗C[j].
For simplicity we will handle only square matrices whose elements are single-precision floating-point numbers. Write a matrix-vector multiplication kernel and the host stub function that can be called with four parameters: pointer to the output matrix, pointer to the input matrix, pointer to the input vector, and the number of elements in each dimension. Use one thread to calculate an output vector element.
Answer 2
#include <iostream>#include <cuda_runtime.h>__global__void MatrixMulKernel(const float* M, const float* V, float* P, int Width){ int row = blockIdx.x*blockDim.x + threadIdx.x; if (row < Width) { float Pvalue = 0.0f; for (int col = 0; col < Width; ++col) { Pvalue += M[row * Width + col] * V[col]; } P[row] = Pvalue; }}int main(){ const int Width = 4; // Smaller matrix for easier verification const size_t size_m = Width * Width * sizeof(float); const size_t size_v = Width * sizeof(float); float *h_M = new float[Width * Width]; float *h_V = new float[Width]; float *h_P = new float[Width]; float *h_expected = new float[Width]; // Initialize matrices with predictable pattern for (int i = 0; i < Width; ++i) { for (int j = 0; j < Width; ++j) { h_M[i * Width + j] = i + 1; // Matrix M: row values [1,1,1,1], [2,2,2,2], etc. } } for (int i = 0; i < Width; ++i) { h_V[i] = i + 1; // Vector V: [1,2,3,4] } // Print input matrices std::cout << "Matrix M:" << std::endl; for (int i = 0; i < Width; ++i) { for (int j = 0; j < Width; ++j) { std::cout << h_M[i * Width + j] << " "; } std::cout << std::endl; } std::cout << "\nVector V:" << std::endl; for (int i = 0; i < Width; ++i) { std::cout << h_V[i] << " "; } std::cout << std::endl; // Compute expected result on CPU for (int i = 0; i < Width; ++i) { float sum = 0.0f; for (int j = 0; j < Width; ++j) { sum += h_M[i * Width + j] * h_V[j]; } h_expected[i] = sum; } std::cout << "\nExpected Result Matrix (CPU):" << std::endl; for (int i = 0; i < Width; ++i) { std::cout << h_expected[i] << " "; } std::cout << std::endl; float *d_M, *d_V, *d_P; cudaMalloc(&d_M, size_m); cudaMalloc(&d_V, size_v); cudaMalloc(&d_P, size_v); cudaMemcpy(d_M, h_M, size_m, cudaMemcpyHostToDevice); cudaMemcpy(d_V, h_V, size_v, cudaMemcpyHostToDevice); // Define block and grid dimensions dim3 blockDim(256); // Using larger block size for better performance dim3 gridDim( (Width + blockDim.x - 1) / blockDim.x ); MatrixMulKernel<<<gridDim, blockDim>>>(d_M, d_V, d_P, Width); cudaDeviceSynchronize(); cudaMemcpy(h_P, d_P, size_v, cudaMemcpyDeviceToHost); std::cout << "\nActual Result Matrix (GPU):" << std::endl; for (int i = 0; i < Width; ++i) { std::cout << h_P[i] << " "; } // Verify results bool correct = true; for (int i = 0; i < Width; ++i) { if (fabs(h_P[i] - h_expected[i]) > 1e-5) { correct = false; std::cout << "Mismatch at position " << i << ": CPU=" << h_expected[i] << ", GPU=" << h_P[i] << std::endl; } } if (correct) { std::cout << "\nMatrix-Vector multiplication completed successfully!" << std::endl; } else { std::cout << "\nThere were errors in the matrix-vector multiplication." << std::endl; } cudaFree(d_M); cudaFree(d_V); cudaFree(d_P); delete[] h_M; delete[] h_V; delete[] h_P; delete[] h_expected; return 0;}
Question 3
Consider the following CUDA kernel and the corresponding host function that calls it:
01 __global__ void foo_kernel(float* a, float* b, unsigned int M, unsigned int N) {02 unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;03 unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;04 if(row < M && col < N) {05 b[row*N + col] = a[row*N + col]/2.1f + 4.8f;06 }07 }08 void foo(float* a_d, float* b_d) {09 unsigned int M = 150;10 unsigned int N = 300;11 dim3 bd(16, 32);12 dim3 gd((N - 1)/16 + 1, (M - 1)/32 + 1);13 foo_kernel <<< gd, bd >>>(a_d, b_d, M, N);14 }
a. What is the number of threads per block?
b. What is the number of threads in the grid?
c. What is the number of blocks in the grid?
d. What is the number of threads that execute the code on line 05?
Answer 3
Answer a:16×32=512 threads per block
Answer b:95×512=48640 threads in the grid
Answer c:(N−1)/16+1×(M−1)/32+1 blocks per grid
=(299/16+1)(149/32+1)=(18+1)(4+1)=19×5=95 blocks per grid
Answer d:150∗300=45000 threads execute based on the conditional in Line 04
Question 4
Consider a 2D matrix with a width of 400 and a height of 500. The matrix is stored as a one-dimensional array. Specify the array index of the matrix element at row 20 and column 10:
a. If the matrix is stored in row-major order.
b. If the matrix is stored in column-major order.
Answer 4
Answer a:20∗400+10=8010 row-major index
Answer b:10∗500+20=5020 column-major index
My original incorrect answer
Answer a:19∗400+10=7610 row-major index
Answer b:9∗500+20=4570 column-major index
I had missed that indexing starts from 0, not 1 in C
Question 5
Consider a 3D tensor with a width of 400, a height of 500, and a depth of 300. The tensor is stored as a one-dimensional array in row-major order. Specify the array index of the tensor element at x = 10, y = 20, and z = 5.
Answer 5
10+20×400+5×400×500=1,008,010 row-major index
My original incorrect answer
9×500×300+19×300+5=1355705 row-major index
I had missed that indexing starts from 0, not 1 in C
I had also incorrectly given the column major order