Split View: CUDA GPU 프로그래밍 심화: Warp 최적화, Tensor Core, Triton 커널 작성까지
CUDA GPU 프로그래밍 심화: Warp 최적화, Tensor Core, Triton 커널 작성까지
목차
- CUDA 메모리 계층 구조
- 워프(Warp) 최적화
- 병렬 패턴 구현
- Tensor Core와 WMMA API
- Flash Attention 구현 원리
- CUDA 스트림과 비동기 실행
- 멀티-GPU와 NCCL
- OpenAI Triton 커스텀 커널
- 프로파일링: Nsight Compute & Systems
- 퀴즈
CUDA 메모리 계층 구조
GPU 성능 최적화의 핵심은 메모리 계층을 올바르게 이해하고 활용하는 것입니다. CUDA 메모리는 위치, 접근 속도, 용량에 따라 다음과 같이 구분됩니다.
메모리 유형별 특성
| 메모리 유형 | 위치 | 레이턴시 | 대역폭 | 용량 | 범위 |
|---|---|---|---|---|---|
| Register | 온칩 | ~1 cycle | 매우 높음 | 스레드당 255개 | 스레드 |
| Shared Memory | 온칩 | ~5 cycles | ~19 TB/s | SM당 48-164KB | 블록 |
| L1 Cache | 온칩 | ~28 cycles | ~19 TB/s | SM당 128KB | SM |
| L2 Cache | 오프칩 | ~193 cycles | ~7 TB/s | 수십 MB | 디바이스 |
| Global Memory | HBM | ~600 cycles | ~2 TB/s | 수십 GB | 디바이스 |
| Constant Memory | 캐시됨 | ~5 cycles(캐시히트) | - | 64KB | 디바이스 |
| Texture Memory | 캐시됨 | ~5 cycles(캐시히트) | - | L1에 캐시 | 디바이스 |
Shared Memory 활용 예시
Shared memory는 같은 블록 내 스레드들이 공유하는 온칩 메모리로, global memory 접근을 줄이는 핵심 수단입니다.
// Tiled Matrix Multiplication with Shared Memory.
// Computes C = A @ B for row-major A (MxK), B (KxN), C (MxN).
// Launch with blockDim = (TILE_SIZE, TILE_SIZE); each thread produces one
// element of C while its block cooperatively stages TILE_SIZE x TILE_SIZE
// tiles of A and B in shared memory to reduce global-memory traffic.
#define TILE_SIZE 16
__global__ void tiledMatMul(
    float* A, float* B, float* C,
    int M, int N, int K
) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;
    int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;   // ceil(K / TILE_SIZE)
    for (int t = 0; t < numTiles; t++) {
        // Cooperatively load one tile pair into shared memory,
        // zero-padding elements that fall outside the matrices.
        int aCol = t * TILE_SIZE + threadIdx.x;
        int bRow = t * TILE_SIZE + threadIdx.y;
        tileA[threadIdx.y][threadIdx.x] =
            (row < M && aCol < K) ? A[row * K + aCol] : 0.0f;
        tileB[threadIdx.y][threadIdx.x] =
            (bRow < K && col < N) ? B[bRow * N + col] : 0.0f;
        __syncthreads();   // tiles fully written before any thread reads them
        // Multiply the two tiles; each thread accumulates its dot-product slice.
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }
        __syncthreads();   // all reads done before the next iteration overwrites
    }
    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}
Constant Memory와 Texture Memory
Constant memory는 읽기 전용 데이터를 캐싱하며, 모든 스레드가 동일한 주소에 접근할 때 브로드캐스트 메커니즘으로 단일 사이클에 처리됩니다.
// 256-tap filter weights in constant memory: read-only, cached, and served
// by broadcast in a single cycle when all threads in a warp read the same
// address (which they do here, since `i` is warp-uniform).
__constant__ float convWeights[256];
// 1-D correlation of `input` against the constant-memory filter.
// NOTE(review): reads input[idx + i] for i in [0, 255], so `input` must
// hold at least n + 255 elements — confirm the allocation at call sites.
__global__ void convKernel(float* input, float* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float result = 0.0f;
        for (int i = 0; i < 256; i++) {
            result += input[idx + i] * convWeights[i]; // broadcast access
        }
        output[idx] = result;
    }
}
워프(Warp) 최적화
Warp Divergence 제거
CUDA에서 32개의 스레드는 하나의 워프(warp)를 구성하며, SIMT 방식으로 동일한 명령을 실행합니다. 조건 분기가 워프 내에서 발생하면 직렬 실행되어 성능이 저하됩니다.
// Bad: the branch condition alternates between adjacent threads
// (0,2,4,... vs 1,3,5,...), so every warp contains both paths and
// executes them serially (warp divergence).
__global__ void badKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard the grid tail (n was previously unused)
    if (idx % 2 == 0) {    // diverges inside every warp
        data[idx] = data[idx] * 2.0f;
    } else {
        data[idx] = data[idx] + 1.0f;
    }
}
// Good: the branch condition is uniform across each warp of 32 threads,
// so no warp ever executes both paths — no divergence.
__global__ void goodKernel(float* data, int n) {
    int warpId = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard the grid tail
    if (warpId % 2 == 0) { // warp-uniform branch — no divergence
        data[idx] = data[idx] * 2.0f;
    } else {
        data[idx] = data[idx] + 1.0f;
    }
}
Coalesced Memory Access
연속된 메모리 접근(coalesced access)은 한 번의 트랜잭션으로 처리됩니다. 스트라이드 접근이나 불규칙 접근은 다수의 트랜잭션을 유발합니다.
// NOTE(review): the original "bad/good" labels here were inverted. On a
// GPU, coalescing is decided by the intra-warp pattern at each instruction,
// not by the per-thread stride across loop iterations.
// columnAccess (one thread per column): at every iteration adjacent
// threads (col = tid, tid+1, ...) read adjacent addresses
// matrix[row * N + col] — one contiguous segment per warp: COALESCED.
// (Illustrative snippet: `sum` is never written back.)
__global__ void columnAccess(float* matrix, int M, int N) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int row = 0; row < M; row++) {
        sum += matrix[row * N + col]; // warp reads 32 consecutive floats
    }
}
// rowAccess (one thread per row): adjacent threads read addresses N floats
// apart at every iteration, so a warp's 32 loads hit 32 separate memory
// segments — this is the UNCOALESCED, stride-N pattern.
__global__ void rowAccess(float* matrix, int M, int N) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int col = 0; col < N; col++) {
        sum += matrix[row * N + col]; // stride-N across the warp
    }
}
Shared Memory Bank Conflict 해결
Shared memory는 32개의 bank로 나뉩니다. 같은 워프 내 스레드들이 동일한 bank의 서로 다른 주소에 접근하면 bank conflict가 발생해 직렬화됩니다.
// 나쁜 예: 2-way bank conflict (stride=2)
__shared__ float smem[32][32];
// threadIdx.x=0 → smem[0][0], threadIdx.x=1 → smem[0][2] (stride-2 접근)
// 스레드 i와 i+16이 같은 bank에 매핑되어 2-way conflict 발생
// 좋은 예: 패딩으로 bank conflict 제거
__shared__ float smem_padded[32][33]; // 열에 1 패딩 추가
// threadIdx.x=0 → bank 0, threadIdx.x=1 → bank 1, ...
// Transpose an N x N matrix through a padded shared-memory tile.
// Both the global read and the global write are coalesced; the +1 column
// padding shifts each row of `tile` into a different bank, so the
// transposed read tile[threadIdx.x][threadIdx.y] is conflict-free.
// Launch with blockDim = (TILE_SIZE, TILE_SIZE).
__global__ void matTransposeNoBankConflict(float* in, float* out, int N) {
    __shared__ float tile[TILE_SIZE][TILE_SIZE + 1]; // +1 padding kills conflicts
    int x = blockIdx.x * TILE_SIZE + threadIdx.x;
    int y = blockIdx.y * TILE_SIZE + threadIdx.y;
    if (x < N && y < N)
        tile[threadIdx.y][threadIdx.x] = in[y * N + x];
    __syncthreads();   // tile fully populated before transposed reads
    // Swap block coordinates: this block writes the mirrored tile location.
    x = blockIdx.y * TILE_SIZE + threadIdx.x;
    y = blockIdx.x * TILE_SIZE + threadIdx.y;
    if (x < N && y < N)
        out[y * N + x] = tile[threadIdx.x][threadIdx.y];
}
병렬 패턴 구현
Parallel Reduction
// Warp-level sum via shuffle: each step halves the number of contributing
// lanes; after 5 steps lane 0 holds the warp total. The full mask assumes
// all 32 lanes participate.
__device__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}
// Block-level sum: warp-reduce, stage one partial per warp in shared
// memory, reduce the partials in the first warp, then atomically add the
// block total into *output (caller must zero *output beforehand).
// Supports blockDim.x up to 1024 (32 warps).
__global__ void blockReduceSum(float* input, float* output, int n) {
    __shared__ float shared[32];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    float val = (idx < n) ? input[idx] : 0.0f;
    // Step 1: reduce within each warp.
    val = warpReduceSum(val);
    // Step 2: lane 0 of each warp publishes its partial sum.
    if (tid % 32 == 0) shared[tid / 32] = val;
    __syncthreads();
    // Warp count rounded UP so a trailing partial warp is not dropped
    // (the original blockDim.x / 32 truncated — e.g. to 0 for blocks
    // smaller than 32 threads, silently zeroing the result).
    int numWarps = (blockDim.x + 31) / 32;
    // Step 3: final reduction over the per-warp partials in warp 0.
    if (tid < 32) {
        val = (tid < numWarps) ? shared[tid] : 0.0f;
        val = warpReduceSum(val);
        if (tid == 0) atomicAdd(output, val);
    }
}
Prefix Sum (Exclusive Scan)
// Work-efficient (Blelloch) EXCLUSIVE prefix sum over one block's data.
// Each thread owns two elements: the up-sweep builds a tree of partial
// sums in place, the root is cleared, and the down-sweep distributes
// prefixes back down the tree.
// Assumptions (TODO confirm at call sites): single-block launch,
// n is a power of two, n == 2 * blockDim.x, and n <= 2048 (buffer size).
__global__ void exclusiveScan(int* input, int* output, int n) {
    __shared__ int temp[1024 * 2];
    int tid = threadIdx.x;
    int offset = 1;
    // Load two elements per thread, zero-padding past the end of input.
    temp[2 * tid] = (2 * tid < n) ? input[2 * tid] : 0;
    temp[2 * tid + 1] = (2 * tid + 1 < n) ? input[2 * tid + 1] : 0;
    // Up-sweep (reduce) phase: build partial sums bottom-up.
    for (int d = n >> 1; d > 0; d >>= 1) {
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset <<= 1;
    }
    // Clear the root — the identity element of an exclusive scan.
    if (tid == 0) temp[n - 1] = 0;
    // Down-sweep phase: propagate prefixes back down the tree.
    for (int d = 1; d < n; d <<= 1) {
        offset >>= 1;
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();   // scan complete before writing results out
    if (2 * tid < n) output[2 * tid] = temp[2 * tid];
    if (2 * tid + 1 < n) output[2 * tid + 1] = temp[2 * tid + 1];
}
Tensor Core와 WMMA API
Tensor Core는 Volta 아키텍처부터 도입된 특수 하드웨어 유닛으로, 4x4 행렬 곱셈을 단일 클록 사이클에 처리합니다. A100 GPU 기준 FP16 Tensor Core는 312 TFLOPS를 달성하며, FP32 CUDA core(19.5 TFLOPS) 대비 약 16배 높은 처리량을 제공합니다.
WMMA(Warp Matrix Multiply Accumulate) API
#include <mma.h>
using namespace nvcuda;
// WMMA 타일 크기 (Volta/Turing/Ampere 지원 크기)
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
// Mixed-precision GEMM on Tensor Cores via WMMA (requires SM70+):
// each warp computes one 16x16 tile of C = A @ B with FP16 inputs and
// FP32 accumulation. A is row-major (MxK), C is row-major (MxN).
// NOTE(review): bFrag is declared wmma::col_major but B is addressed as
// B + bRow * N + bCol with leading dimension N (row-major layout) — the
// caller's B layout must match the fragment declaration; verify.
__global__ void tensorCoreMatMul(
    half* A, half* B, float* C,
    int M, int N, int K
) {
    // Per-warp WMMA fragments, distributed across the warp's registers.
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K,
                   half, wmma::row_major> aFrag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K,
                   half, wmma::col_major> bFrag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K,
                   float> cFrag;
    // One warp per output tile: warpM indexes tile rows, warpN tile columns.
    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
    // Zero the accumulator fragment.
    wmma::fill_fragment(cFrag, 0.0f);
    // March along the K dimension in WMMA_K-wide steps.
    for (int i = 0; i < K; i += WMMA_K) {
        int aRow = warpM * WMMA_M;
        int aCol = i;
        int bRow = i;
        int bCol = warpN * WMMA_N;
        if (aRow < M && aCol < K && bRow < K && bCol < N) {
            // Load one tile of A and B (leading dimensions K and N).
            wmma::load_matrix_sync(aFrag, A + aRow * K + aCol, K);
            wmma::load_matrix_sync(bFrag, B + bRow * N + bCol, N);
            // Tensor Core MMA: cFrag += aFrag * bFrag.
            wmma::mma_sync(cFrag, aFrag, bFrag, cFrag);
        }
    }
    // Write the warp's finished 16x16 tile back to global memory.
    int cRow = warpM * WMMA_M;
    int cCol = warpN * WMMA_N;
    if (cRow < M && cCol < N) {
        wmma::store_matrix_sync(C + cRow * N + cCol, cFrag, N,
                                wmma::mem_row_major);
    }
}
PTX 인라인 어셈블리
성능 한계를 극한까지 끌어올리기 위해 PTX(Parallel Thread Execution) 인라인 어셈블리를 사용할 수 있습니다.
// Approximate single-precision square root via the PTX sqrt.approx.f32
// instruction — faster but less accurate than sqrtf().
__device__ float fastSqrt(float x) {
    float result;
    asm("sqrt.approx.f32 %0, %1;" : "=f"(result) : "f"(x));
    return result;
}
// Global load with the .cg cache hint: cache at L2, bypass L1.
// `volatile` keeps the compiler from reordering or eliding the load.
__device__ int ldgInt(const int* ptr) {
    int val;
    asm volatile("ld.global.cg.s32 %0, [%1];"
                 : "=r"(val) : "l"(ptr));
    return val;
}
Flash Attention 구현 원리
표준 Attention은 시퀀스 길이 N에 대해 O(N²) 메모리를 HBM에 저장해야 합니다. Flash Attention은 SRAM(shared memory)을 타일 단위로 활용하여 HBM 접근을 최소화합니다.
핵심 아이디어: 온라인 소프트맥스 (Online Softmax)
표준 Attention:
1. S = Q @ K^T → HBM에 N×N 행렬 저장 (O(N²))
2. P = softmax(S) → HBM에서 읽고 쓰기
3. O = P @ V → HBM에서 읽고 쓰기
Flash Attention:
- Q, K, V를 타일로 분할 → SRAM에 로드
- 온라인 소프트맥스로 행 최댓값/분모를 점진적으로 갱신
- HBM에는 최종 O만 저장 → HBM 읽기/쓰기 O(N) 달성
// Simplified Flash Attention forward pass: one thread per query row.
// K/V are streamed through shared memory tile by tile while a running
// row max (mi) and softmax denominator (li) are maintained, so the full
// N x N score matrix is never materialized in HBM.
// Launch: gridDim.x query tiles of blockDim.x rows; dynamic shared memory
// must hold 3 * blockDim.x * headDim floats.
// NOTE(review): oi[64] caps headDim at 64, and loads are unguarded, so
// seqLen is assumed to be a multiple of blockDim.x — confirm at call sites.
__global__ void flashAttentionKernel(
    const float* Q, const float* K, const float* V,
    float* O, float* l, float* m,
    int seqLen, int headDim, float scale
) {
    int tileSize = blockDim.x;
    int qStart = blockIdx.x * tileSize;
    extern __shared__ float smem[];
    // Partition the dynamic shared buffer into Q, K, V tiles.
    float* sQ = smem;
    float* sK = smem + tileSize * headDim;
    float* sV = smem + 2 * tileSize * headDim;
    // Each thread loads its own query row into SRAM.
    for (int d = 0; d < headDim; d++) {
        sQ[threadIdx.x * headDim + d] =
            Q[(qStart + threadIdx.x) * headDim + d];
    }
    float mi = -INFINITY; // running row max (numerical stability)
    float li = 0.0f;      // running softmax denominator
    float oi[64] = {};    // output accumulator (headDim <= 64)
    // Sweep K/V tiles along the sequence.
    for (int kStart = 0; kStart < seqLen; kStart += tileSize) {
        // Cooperatively stage this K/V tile in SRAM (one row per thread).
        for (int d = 0; d < headDim; d++) {
            sK[threadIdx.x * headDim + d] =
                K[(kStart + threadIdx.x) * headDim + d];
            sV[threadIdx.x * headDim + d] =
                V[(kStart + threadIdx.x) * headDim + d];
        }
        __syncthreads();   // tile fully loaded before use
        // s_ij = q_i . k_j * scale, folded into the output immediately.
        for (int j = 0; j < tileSize; j++) {
            float sij = 0.0f;
            for (int d = 0; d < headDim; d++) {
                sij += sQ[threadIdx.x * headDim + d]
                    * sK[j * headDim + d];
            }
            sij *= scale;
            // Online softmax update: rescale prior state to the new max.
            float miNew = fmaxf(mi, sij);
            float liNew = li * expf(mi - miNew) + expf(sij - miNew);
            for (int d = 0; d < headDim; d++) {
                oi[d] = oi[d] * (li / liNew) * expf(mi - miNew)
                    + (expf(sij - miNew) / liNew) * sV[j * headDim + d];
            }
            mi = miNew;
            li = liNew;
        }
        __syncthreads();   // all reads done before next tile overwrites sK/sV
    }
    // Store the normalized output row plus softmax stats (for backward).
    for (int d = 0; d < headDim; d++) {
        O[(qStart + threadIdx.x) * headDim + d] = oi[d];
    }
    l[qStart + threadIdx.x] = li;
    m[qStart + threadIdx.x] = mi;
}
CUDA 스트림과 비동기 실행
CUDA 스트림을 통해 메모리 복사와 커널 실행을 오버랩하여 GPU 활용도를 높일 수 있습니다.
#include <cuda_runtime.h>
// Process the input in chunkSize-sized pieces, round-robin across 4 CUDA
// streams so H2D copy, kernel, and D2H copy of different chunks overlap.
// NOTE(review): cudaMemcpyAsync only overlaps when h_input/h_output are
// pinned (cudaMallocHost); with pageable memory the copies serialize.
// Assumes totalSize is a multiple of chunkSize; API errors are unchecked.
void pipelinedProcessing(float* h_input, float* h_output,
                         int totalSize, int chunkSize) {
    const int numStreams = 4;
    cudaStream_t streams[numStreams];
    float *d_input[numStreams], *d_output[numStreams];
    // One stream plus one device buffer pair per pipeline slot.
    for (int i = 0; i < numStreams; i++) {
        cudaStreamCreate(&streams[i]);
        cudaMalloc(&d_input[i], chunkSize * sizeof(float));
        cudaMalloc(&d_output[i], chunkSize * sizeof(float));
    }
    // Pipeline: copy-in -> kernel -> copy-out, interleaved across streams.
    // Reusing slot s for a later chunk is safe: work within one stream
    // executes in submission order.
    for (int chunk = 0; chunk < totalSize / chunkSize; chunk++) {
        int s = chunk % numStreams;
        float* hIn = h_input + chunk * chunkSize;
        float* hOut = h_output + chunk * chunkSize;
        // Async host-to-device copy.
        cudaMemcpyAsync(d_input[s], hIn,
                        chunkSize * sizeof(float),
                        cudaMemcpyHostToDevice, streams[s]);
        // Kernel in the same stream — runs after its chunk's copy-in.
        dim3 block(256);
        dim3 grid((chunkSize + 255) / 256);
        processKernel<<<grid, block, 0, streams[s]>>>(
            d_input[s], d_output[s], chunkSize);
        // Async device-to-host copy of the result.
        cudaMemcpyAsync(hOut, d_output[s],
                        chunkSize * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[s]);
    }
    // Drain every stream, then release streams and device buffers.
    for (int i = 0; i < numStreams; i++) {
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
        cudaFree(d_input[i]);
        cudaFree(d_output[i]);
    }
}
스트림 이벤트 동기화
// Cross-stream ordering with events (fragment: stream1/stream2, kernel1/2,
// grid/block, and d_data are defined by surrounding code).
cudaEvent_t startEvent, stopEvent;
cudaEventCreate(&startEvent);
cudaEventCreate(&stopEvent);
kernel1<<<grid, block, 0, stream1>>>(d_data);
// Record AFTER enqueuing kernel1: an event completes only once all work
// previously submitted to its stream (including kernel1) has finished.
// (The original recorded the event before the launch, so the wait below
// did not actually cover kernel1.)
cudaEventRecord(startEvent, stream1);
// stream2 blocks until startEvent — i.e. kernel1 — has completed.
cudaStreamWaitEvent(stream2, startEvent, 0);
kernel2<<<grid, block, 0, stream2>>>(d_data);
cudaEventRecord(stopEvent, stream2);
cudaEventSynchronize(stopEvent);
// GPU time elapsed between the two recorded events, in milliseconds.
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, startEvent, stopEvent);
멀티-GPU와 NCCL
P2P(Peer-to-Peer) 통신
// Direct GPU-0 -> GPU-1 memory copy over NVLink/PCIe P2P.
// (Fragment: `size` is assumed to be defined by surrounding code.)
cudaSetDevice(0);
float* d_src;
cudaMalloc(&d_src, size);   // source buffer on GPU 0
cudaSetDevice(1);
float* d_dst;
cudaMalloc(&d_dst, size);   // destination buffer on GPU 1
// Check whether device 0 can access device 1's memory directly.
int canAccess;
cudaDeviceCanAccessPeer(&canAccess, 0, 1);
if (canAccess) {
    cudaSetDevice(0);
    // One-directional: device 0 may now access device 1's memory.
    cudaDeviceEnablePeerAccess(1, 0);
    // Direct copy without staging through host memory.
    cudaMemcpyPeer(d_dst, 1, d_src, 0, size);
}
NCCL Ring-AllReduce
#include <nccl.h>
// In-place sum-AllReduce of `count` floats across numGPUs GPUs within a
// single process; d_buffers[i] is a device buffer resident on GPU i.
// NOTE(review): comms/devList/streams are C-style VLAs (a compiler
// extension in C++); NCCL/CUDA status codes are unchecked and the streams
// are never destroyed — acceptable for a tutorial, not for production.
void ncclAllReduceExample(int numGPUs, float** d_buffers, size_t count) {
    ncclComm_t comms[numGPUs];
    int devList[numGPUs];
    for (int i = 0; i < numGPUs; i++) devList[i] = i;
    // One communicator per GPU, all owned by this process.
    ncclCommInitAll(comms, numGPUs, devList);
    cudaStream_t streams[numGPUs];
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStreamCreate(&streams[i]);
    }
    // Group the per-GPU calls so NCCL can schedule them without deadlock.
    ncclGroupStart();
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        // Sum across all GPUs; every GPU receives the full result in place.
        ncclAllReduce(d_buffers[i], d_buffers[i], count,
                      ncclFloat, ncclSum, comms[i], streams[i]);
    }
    ncclGroupEnd();
    // Wait for completion, then tear down the communicators.
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        ncclCommDestroy(comms[i]);
    }
}
Ring-AllReduce의 이론적 전송량은 2 * (N-1) / N * dataSize입니다. N개의 GPU가 링을 이루어 Reduce-Scatter와 AllGather 각각 N-1 단계를 수행하며, 각 단계에서 dataSize / N 크기를 전송합니다. 따라서 총 전송량은 2 * (N-1) * (dataSize / N)이 됩니다.
OpenAI Triton 커스텀 커널
Triton은 Python으로 GPU 커널을 작성하면서 CUDA 수준의 성능을 달성할 수 있는 프레임워크입니다.
Triton 행렬 곱셈 커널
import triton
import triton.language as tl
import torch
@triton.jit
def matmul_kernel(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    """Compute one (BLOCK_M, BLOCK_N) tile of C = A @ B in fp32.

    The grid is 2-D: program (pid_m, pid_n) owns the output tile whose
    top-left corner is (pid_m * BLOCK_M, pid_n * BLOCK_N). Out-of-range
    rows/columns are masked on load (read as 0.0) and on store.
    """
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Row/column offsets of this program's output tile.
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    # Pointers to the first A and B tiles (stride-based addressing).
    a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        # Masked loads: K - k bounds the valid remainder of the K dimension.
        a = tl.load(a_ptrs, mask=(offs_m[:, None] < M) & (offs_k[None, :] < K - k), other=0.0)
        b = tl.load(b_ptrs, mask=(offs_k[:, None] < K - k) & (offs_n[None, :] < N), other=0.0)
        acc += tl.dot(a, b)
        # Advance both tile pointers one BLOCK_K step along K.
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    # Masked store of the finished tile.
    c_ptrs = c_ptr + stride_cm * offs_m[:, None] + stride_cn * offs_n[None, :]
    c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, acc, mask=c_mask)
def matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Multiply two 2-D tensors with the Triton kernel; returns fp32 C = a @ b."""
    M, K = a.shape
    Kb, N = b.shape
    assert K == Kb
    c = torch.empty((M, N), device=a.device, dtype=torch.float32)

    def grid(meta):
        # One Triton program per (BLOCK_M, BLOCK_N) output tile.
        return (
            triton.cdiv(M, meta['BLOCK_M']),
            triton.cdiv(N, meta['BLOCK_N']),
        )

    matmul_kernel[grid](
        a, b, c,
        M, N, K,
        a.stride(0), a.stride(1),
        b.stride(0), b.stride(1),
        c.stride(0), c.stride(1),
        BLOCK_M=128, BLOCK_N=256, BLOCK_K=64,
    )
    return c
Flash Attention in Triton
@triton.jit
def flash_attn_fwd_kernel(
    Q_ptr, K_ptr, V_ptr, O_ptr,
    L_ptr, M_ptr,
    # headdim must be tl.constexpr: tl.arange and tl.zeros require
    # compile-time-constant shapes, so a runtime int here fails to compile.
    seqlen, headdim: tl.constexpr, scale,
    BLOCK_Q: tl.constexpr,
    BLOCK_KV: tl.constexpr,
):
    """Flash Attention forward pass for one block of BLOCK_Q query rows.

    Streams K/V tiles along the sequence while maintaining a running row
    max (mi) and softmax denominator (li), so the full N x N score matrix
    is never materialized. Stores the normalized output plus per-row li/mi
    statistics (consumed by the backward pass).
    """
    start_q = tl.program_id(0) * BLOCK_Q
    offs_q = start_q + tl.arange(0, BLOCK_Q)
    offs_d = tl.arange(0, headdim)
    # Load the query block once; it is reused against every K/V tile.
    q = tl.load(Q_ptr + offs_q[:, None] * headdim + offs_d[None, :],
                mask=(offs_q[:, None] < seqlen), other=0.0)
    mi = tl.full((BLOCK_Q,), value=-float('inf'), dtype=tl.float32)
    li = tl.zeros((BLOCK_Q,), dtype=tl.float32)
    acc = tl.zeros((BLOCK_Q, headdim), dtype=tl.float32)
    for start_kv in range(0, seqlen, BLOCK_KV):
        offs_kv = start_kv + tl.arange(0, BLOCK_KV)
        k = tl.load(K_ptr + offs_kv[:, None] * headdim + offs_d[None, :],
                    mask=(offs_kv[:, None] < seqlen), other=0.0)
        v = tl.load(V_ptr + offs_kv[:, None] * headdim + offs_d[None, :],
                    mask=(offs_kv[:, None] < seqlen), other=0.0)
        # Scaled attention scores for this tile: (BLOCK_Q, BLOCK_KV).
        s = tl.dot(q, tl.trans(k)) * scale
        # Online softmax: rescale the prior accumulator to the new row max.
        mi_new = tl.maximum(mi, tl.max(s, axis=1))
        p = tl.exp(s - mi_new[:, None])
        li_new = li * tl.exp(mi - mi_new) + tl.sum(p, axis=1)
        acc = acc * (li * tl.exp(mi - mi_new))[:, None] / li_new[:, None]
        acc += tl.dot(p, v) / li_new[:, None]
        mi = mi_new
        li = li_new
    # acc is already normalized; store the output and softmax statistics.
    tl.store(O_ptr + offs_q[:, None] * headdim + offs_d[None, :],
             acc, mask=(offs_q[:, None] < seqlen))
    tl.store(L_ptr + offs_q, li, mask=(offs_q < seqlen))
    tl.store(M_ptr + offs_q, mi, mask=(offs_q < seqlen))
프로파일링: Nsight Compute & Systems
Nsight Compute 명령어
# 커널 상세 분석
ncu --set full -o profile_report ./my_cuda_app
# 특정 커널만 프로파일링
ncu --kernel-name tiledMatMul --launch-count 3 ./my_cuda_app
# 메모리 대역폭 분석 (Roofline 모델용)
ncu --metrics \
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.sum,\
l1tex__t_bytes_pipe_lsu_mem_global_op_st.sum,\
sm__cycles_elapsed.avg,\
sm__cycles_elapsed.avg.per_second \
./my_cuda_app
# DRAM 대역폭 활용률
ncu --metrics \
gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed \
./my_cuda_app
Nsight Systems 타임라인 분석
# 전체 애플리케이션 타임라인 수집
nsys profile --trace=cuda,nvtx,osrt \
--output=timeline_report \
./my_cuda_app
# 통계 요약 출력
nsys stats timeline_report.nsys-rep
# GPU 활동도 분석
nsys analyze --report gputrace timeline_report.nsys-rep
NVTX 마커로 코드 구간 표시
#include <nvtx3/nvToolsExt.h>
void profiledFunction() {
nvtxRangePush("DataPreprocessing");
preprocessData();
nvtxRangePop();
nvtxRangePush("ForwardPass");
forwardPass();
nvtxRangePop();
}
Roofline 모델 분석
Roofline 모델은 커널의 성능 병목이 연산 바운드인지 메모리 대역폭 바운드인지 판단합니다.
산술 강도(Arithmetic Intensity) = FLOP / Bytes
예시 (A100 80GB):
- 피크 FP16 Tensor Core: 312 TFLOPS
- HBM 대역폭: 2 TB/s
- Ridge Point: 312 / 2 = 156 FLOP/Byte
커널의 산술 강도 < 156 → 메모리 바운드
커널의 산술 강도 > 156 → 연산 바운드
# 커널 산술 강도 측정
ncu --metrics \
sm__ops_path_tensor_src_fp16.sum,\
dram__bytes.sum \
./matmul_app
# 산술 강도 = tensor_ops * 2 / dram_bytes
퀴즈
Q1. CUDA에서 shared memory bank conflict가 발생하는 조건과 해결 방법은?
정답: 같은 워프 내 두 개 이상의 스레드가 shared memory의 동일한 bank에 속하는 서로 다른 주소에 동시에 접근할 때 bank conflict가 발생합니다.
설명: Shared memory는 32개의 bank로 나뉘며, 각 bank는 4바이트 단위로 주소를 할당합니다. 스레드 i는 bank i % 32에 기본 접근합니다. 예를 들어 float smem[32][32]를 전치 과정에서 smem[threadIdx.x][threadIdx.y]로 접근하면 같은 열의 스레드들이 같은 bank에 접근하여 32-way conflict가 발생합니다. 해결책은 배열에 1개의 패딩 열을 추가하는 것입니다: float smem[32][33]. 이렇게 하면 각 행의 시작 주소가 bank 경계를 엇갈려 conflict가 사라집니다.
Q2. Warp divergence가 성능에 미치는 영향과 최소화하는 코드 패턴은?
정답: Warp divergence는 워프 내 스레드들이 서로 다른 코드 경로를 실행할 때 발생하며, 양쪽 경로가 순차적으로 실행되어 최대 2배의 성능 저하를 야기합니다.
설명: SIMT 아키텍처에서 워프의 모든 스레드는 동일한 명령어를 실행합니다. if-else 분기가 발생하면 true 경로 스레드들이 실행되는 동안 false 경로 스레드들은 마스킹(비활성화)되고, 이후 반대로 실행됩니다. 최소화 전략: (1) 분기 조건을 워프 ID 기반으로 작성하여 워프 단위로 균일한 분기 유도, (2) __builtin_expect나 __any_sync/__all_sync 등 워프 표결 함수 활용, (3) 데이터 정렬을 통해 같은 경로를 실행하는 스레드들을 같은 워프에 배치.
Q3. Tensor Core가 FP32 연산보다 행렬 곱셈을 빠르게 처리하는 하드웨어 원리는?
정답: Tensor Core는 4×4 행렬의 FMA(Fused Multiply-Add) 연산을 전용 하드웨어 회로로 단일 클록에 처리하며, 데이터 형식(FP16/BF16/INT8)을 낮춰 처리량을 극대화합니다.
설명: 일반 CUDA Core는 스칼라 FP32 FMA를 1클록에 1회 수행합니다. Tensor Core는 WMMA API 기준 16×16×16 행렬 곱을 워프 단위(32스레드)로 수행하며, 하나의 워프당 단 몇 클록에 처리합니다. A100 기준 FP16 Tensor Core는 GPU 전체에서 312 TFLOPS(피크, dense 기준, 구조적 희소성 활용 시 624 TFLOPS)를 달성합니다. 또한 Tensor Core는 FP16 입력으로 FP32 누산(Mixed Precision)을 지원하여 수치 정밀도를 유지하면서도 처리량을 높입니다. 이 때문에 PyTorch의 torch.cuda.amp는 내부적으로 Tensor Core를 활용합니다.
Q4. Flash Attention이 표준 Attention보다 메모리 효율적인 이유 (HBM vs SRAM 관점)?
정답: 표준 Attention은 N×N 크기의 중간 행렬을 HBM에 저장해야 하지만(O(N²) HBM), Flash Attention은 타일 단위로 SRAM에서만 처리하여 HBM 접근을 O(N)으로 줄입니다.
설명: 표준 Attention의 HBM I/O는 Q, K, V 읽기(O(Nd)) + S 쓰기(O(N²)) + S 읽기 + P 쓰기(O(N²)) + P 읽기 + O 쓰기(O(Nd))로 O(N²d)에 달합니다. Flash Attention은 SRAM 크기에 맞춰 Q, K, V를 타일로 분할하고, 온라인 소프트맥스를 사용해 중간 결과를 SRAM에서 누적합니다. HBM 접근은 Q, K, V 읽기와 O 쓰기만 수행하므로 O(N²d / M) 단계의 SRAM 처리로 O(Nd) HBM I/O를 달성합니다. SRAM의 대역폭은 HBM의 약 10배이므로 실질적인 처리 속도가 크게 향상됩니다.
Q5. NCCL Ring-AllReduce에서 데이터 전송량이 GPU 수에 관계없이 2*(N-1)/N * data_size인 이유는?
정답: Ring-AllReduce는 Reduce-Scatter와 AllGather 두 단계로 구성되며, 각 단계에서 N-1번의 전송이 이루어지고 매 전송 크기가 data_size / N이기 때문입니다.
설명: N개의 GPU가 링을 구성합니다. Reduce-Scatter 단계에서는 각 GPU가 자신의 데이터 청크(data_size / N)를 인접 GPU로 N-1회 전달하여 각 GPU가 전체 데이터의 1/N에 대한 reduce 결과를 보유합니다. AllGather 단계에서는 reduce된 청크를 다시 N-1회 전달하여 모든 GPU가 전체 결과를 갖습니다. 총 전송량 = 2 * (N-1) * (data_size / N) = 2 * (N-1) / N * data_size. N=2일 때 data_size, N=무한대일 때 2 * data_size로 수렴합니다. 이는 단순 브로드캐스트 방식((N-1) * data_size)보다 훨씬 효율적입니다.
Advanced CUDA GPU Programming: Warp Optimization, Tensor Cores, and Triton Kernels
Table of Contents
- CUDA Memory Hierarchy
- Warp Optimization
- Parallel Patterns
- Tensor Cores and the WMMA API
- Flash Attention: How It Works
- CUDA Streams and Async Execution
- Multi-GPU and NCCL
- OpenAI Triton Custom Kernels
- Profiling with Nsight Compute and Systems
- Quiz
CUDA Memory Hierarchy
Understanding the memory hierarchy is the single most important factor in GPU performance optimization. CUDA exposes several memory types that differ in location, latency, bandwidth, and scope.
Memory Type Comparison
| Memory Type | Location | Latency | Bandwidth | Capacity | Scope |
|---|---|---|---|---|---|
| Register | On-chip | ~1 cycle | Highest | 255 per thread | Thread |
| Shared Memory | On-chip | ~5 cycles | ~19 TB/s | 48–164 KB per SM | Block |
| L1 Cache | On-chip | ~28 cycles | ~19 TB/s | 128 KB per SM | SM |
| L2 Cache | Off-chip | ~193 cycles | ~7 TB/s | Tens of MB | Device |
| Global (HBM) | HBM | ~600 cycles | ~2 TB/s | Tens of GB | Device |
| Constant | Cached | ~5 cycles (hit) | — | 64 KB | Device |
| Texture | Cached | ~5 cycles (hit) | — | L1-cached | Device |
Using Shared Memory: Tiled Matrix Multiplication
Shared memory sits on-chip and is shared among threads in the same block. Loading tiles into shared memory dramatically reduces expensive global memory round-trips.
// Tiled matrix multiplication: C = A @ B for row-major A (MxK), B (KxN),
// C (MxN). Launch with blockDim = (TILE_SIZE, TILE_SIZE); each thread
// computes one element of C while the block stages tile pairs in shared
// memory to cut global-memory round-trips.
#define TILE_SIZE 16
__global__ void tiledMatMul(
    float* A, float* B, float* C,
    int M, int N, int K
) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;
    int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;   // ceil(K / TILE_SIZE)
    for (int t = 0; t < numTiles; t++) {
        int aCol = t * TILE_SIZE + threadIdx.x;
        int bRow = t * TILE_SIZE + threadIdx.y;
        // Collaboratively load tiles into shared memory (0-pad out of range).
        tileA[threadIdx.y][threadIdx.x] =
            (row < M && aCol < K) ? A[row * K + aCol] : 0.0f;
        tileB[threadIdx.y][threadIdx.x] =
            (bRow < K && col < N) ? B[bRow * N + col] : 0.0f;
        __syncthreads();   // tiles fully written before any thread reads
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }
        __syncthreads();   // reads done before next iteration overwrites
    }
    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}
Constant Memory
Constant memory is read-only and cached. When all threads in a warp access the same address, the value is broadcast in a single cycle — ideal for filter weights or configuration data.
// 256-tap filter weights in constant memory; reads are broadcast in one
// cycle because `i` is uniform across the warp.
__constant__ float convWeights[256];
// 1-D correlation of `input` against the constant-memory filter.
// NOTE(review): reads input[idx + i] for i in [0, 255], so `input` must
// hold at least n + 255 elements — confirm the allocation at call sites.
__global__ void convKernel(float* input, float* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float result = 0.0f;
        for (int i = 0; i < 256; i++) {
            result += input[idx + i] * convWeights[i]; // broadcast access
        }
        output[idx] = result;
    }
}
Warp Optimization
Eliminating Warp Divergence
A warp is a group of 32 threads that execute in lockstep under SIMT. When a conditional branch splits threads within a warp onto different paths, both paths execute serially — this is called warp divergence.
// Bad: the condition alternates between adjacent threads (0,2,4,... vs
// 1,3,5,...), so every warp contains both paths and executes them serially.
__global__ void badKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard the grid tail (n was previously unused)
    if (idx % 2 == 0) {
        data[idx] = data[idx] * 2.0f;
    } else {
        data[idx] = data[idx] + 1.0f;
    }
}
// Good: branch aligned to warp boundary — the condition is uniform within
// each warp of 32 threads, so no warp executes both paths.
__global__ void goodKernel(float* data, int n) {
    int warpId = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard the grid tail
    if (warpId % 2 == 0) {
        data[idx] = data[idx] * 2.0f;
    } else {
        data[idx] = data[idx] + 1.0f;
    }
}
Coalesced Memory Access
Global memory is accessed in 128-byte transactions. If consecutive threads read consecutive addresses (coalesced), a single transaction serves the entire warp. Strided or random access can require 32 separate transactions.
// NOTE(review): the original "bad/good" labels were inverted. Coalescing
// depends on the pattern ACROSS the warp at each instruction, not on the
// per-thread stride across loop iterations.
// columnAccess (one thread per column): at each iteration adjacent threads
// (col = tid, tid+1, ...) read adjacent addresses matrix[row * N + col] —
// one contiguous segment per warp: COALESCED. (`sum` is never written back;
// illustrative only.)
__global__ void columnAccess(float* matrix, int M, int N) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int row = 0; row < M; row++) {
        sum += matrix[row * N + col]; // warp reads 32 consecutive floats
    }
}
// rowAccess (one thread per row): adjacent threads read addresses N floats
// apart at every iteration, so a warp's 32 loads hit 32 separate memory
// segments — the UNCOALESCED, stride-N pattern.
__global__ void rowAccess(float* matrix, int M, int N) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int col = 0; col < N; col++) {
        sum += matrix[row * N + col]; // stride-N across the warp
    }
}
Shared Memory Bank Conflicts
Shared memory is divided into 32 banks (4 bytes each). When two or more threads in a warp access different addresses within the same bank, they serialize — this is a bank conflict.
// Bad: 32-way bank conflict during transpose (all threads hit bank 0)
__shared__ float smem[32][32];
// smem[0][0], smem[1][0], smem[2][0]... all map to bank 0
// Good: +1 column padding shifts each row to a different bank
// Transpose an N x N matrix through a padded shared-memory tile. Both the
// global read and write are coalesced; the +1 column padding shifts each
// row of `tile` into a different bank so the transposed read
// tile[threadIdx.x][threadIdx.y] is conflict-free.
// Launch with blockDim = (TILE_SIZE, TILE_SIZE).
__global__ void matTransposeNoBankConflict(float* in, float* out, int N) {
    __shared__ float tile[TILE_SIZE][TILE_SIZE + 1]; // +1 padding
    int x = blockIdx.x * TILE_SIZE + threadIdx.x;
    int y = blockIdx.y * TILE_SIZE + threadIdx.y;
    if (x < N && y < N)
        tile[threadIdx.y][threadIdx.x] = in[y * N + x];
    __syncthreads();   // tile fully populated before transposed reads
    // Swap block coordinates: this block writes the mirrored tile location.
    x = blockIdx.y * TILE_SIZE + threadIdx.x;
    y = blockIdx.x * TILE_SIZE + threadIdx.y;
    if (x < N && y < N)
        out[y * N + x] = tile[threadIdx.x][threadIdx.y];
}
Parallel Patterns
Parallel Reduction with Warp Shuffle
// Warp-level sum using __shfl_down_sync (no shared memory needed): each
// step halves the number of contributing lanes; after 5 steps lane 0
// holds the warp total. The full mask assumes all 32 lanes participate.
__device__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}
// Block-level sum: warp-reduce, stage one partial per warp in shared
// memory, reduce the partials in the first warp, then atomically add the
// block total into *output (caller must zero *output beforehand).
// Supports blockDim.x up to 1024 (32 warps).
__global__ void blockReduceSum(float* input, float* output, int n) {
    __shared__ float shared[32];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    float val = (idx < n) ? input[idx] : 0.0f;
    // Step 1: reduce within each warp.
    val = warpReduceSum(val);
    // Step 2: lane 0 of each warp publishes its partial sum.
    if (tid % 32 == 0) shared[tid / 32] = val;
    __syncthreads();
    // Warp count rounded UP so a trailing partial warp is not dropped
    // (the original blockDim.x / 32 truncated — e.g. to 0 for blocks
    // smaller than 32 threads, silently zeroing the result).
    int numWarps = (blockDim.x + 31) / 32;
    // Step 3: final reduction over the per-warp partials in warp 0.
    if (tid < 32) {
        val = (tid < numWarps) ? shared[tid] : 0.0f;
        val = warpReduceSum(val);
        if (tid == 0) atomicAdd(output, val);
    }
}
Exclusive Prefix Sum (Scan)
The Blelloch (work-efficient) scan uses an up-sweep and down-sweep phase, each costing O(N) work.
// Work-efficient (Blelloch) exclusive prefix sum over one block's data.
// Each thread owns two elements: the up-sweep builds a tree of partial
// sums in place, the root is cleared, and the down-sweep distributes
// prefixes back down the tree.
// Assumptions (TODO confirm at call sites): single-block launch,
// n is a power of two, n == 2 * blockDim.x, and n <= 2048 (buffer size).
__global__ void exclusiveScan(int* input, int* output, int n) {
    __shared__ int temp[1024 * 2];
    int tid = threadIdx.x;
    int offset = 1;
    // Load two elements per thread, zero-padding past the end of input.
    temp[2 * tid] = (2 * tid < n) ? input[2 * tid] : 0;
    temp[2 * tid + 1] = (2 * tid + 1 < n) ? input[2 * tid + 1] : 0;
    // Up-sweep (reduce): build partial sums bottom-up.
    for (int d = n >> 1; d > 0; d >>= 1) {
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset <<= 1;
    }
    // Clear the root — the identity element of an exclusive scan.
    if (tid == 0) temp[n - 1] = 0;
    // Down-sweep: propagate prefixes back down the tree.
    for (int d = 1; d < n; d <<= 1) {
        offset >>= 1;
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();   // scan complete before writing results out
    if (2 * tid < n) output[2 * tid] = temp[2 * tid];
    if (2 * tid + 1 < n) output[2 * tid + 1] = temp[2 * tid + 1];
}
Tensor Cores and the WMMA API
Tensor Cores are specialized hardware units introduced in Volta (2017) that perform a 4×4 FMA in a single clock cycle at the hardware level. At the warp level, the WMMA API exposes 16×16×16 matrix multiply-accumulate. On A100, FP16 Tensor Cores deliver 312 TFLOPS — roughly 16x the throughput of FP32 CUDA Cores.
Mixed-Precision WMMA Matrix Multiply
#include <mma.h>
using namespace nvcuda;
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
// Mixed-precision GEMM on Tensor Cores via WMMA (requires SM70+): each
// warp computes one 16x16 tile of C = A @ B with FP16 inputs and FP32
// accumulation. A is row-major (MxK), C is row-major (MxN).
// NOTE(review): bFrag is declared wmma::col_major but B is addressed as
// B + bRow * N + bCol with leading dimension N (row-major layout) — the
// caller's B layout must match the fragment declaration; verify.
__global__ void tensorCoreMatMul(
    half* A, half* B, float* C,
    int M, int N, int K
) {
    // Per-warp WMMA fragments, distributed across the warp's registers.
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K,
                   half, wmma::row_major> aFrag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K,
                   half, wmma::col_major> bFrag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K,
                   float> cFrag;
    // One warp per output tile: warpM indexes tile rows, warpN tile columns.
    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
    // Zero the accumulator fragment.
    wmma::fill_fragment(cFrag, 0.0f);
    // March along the K dimension in WMMA_K-wide steps.
    for (int i = 0; i < K; i += WMMA_K) {
        int aRow = warpM * WMMA_M;
        int aCol = i;
        int bRow = i;
        int bCol = warpN * WMMA_N;
        if (aRow < M && aCol < K && bRow < K && bCol < N) {
            // Load one tile of A and B (leading dimensions K and N).
            wmma::load_matrix_sync(aFrag, A + aRow * K + aCol, K);
            wmma::load_matrix_sync(bFrag, B + bRow * N + bCol, N);
            // Tensor Core MMA: cFrag += aFrag * bFrag.
            wmma::mma_sync(cFrag, aFrag, bFrag, cFrag);
        }
    }
    // Write the warp's finished 16x16 tile back to global memory.
    int cRow = warpM * WMMA_M;
    int cCol = warpN * WMMA_N;
    if (cRow < M && cCol < N) {
        wmma::store_matrix_sync(C + cRow * N + cCol, cFrag, N,
                                wmma::mem_row_major);
    }
}
PTX Inline Assembly
For performance-critical inner loops, PTX inline assembly unlocks hardware instructions not exposed through the CUDA C++ API.
// Approximate single-precision square root via the PTX sqrt.approx.f32
// instruction — faster but less accurate than sqrtf().
__device__ float fastSqrt(float x) {
    float result;
    asm("sqrt.approx.f32 %0, %1;" : "=f"(result) : "f"(x));
    return result;
}
// Global load with the .cg cache hint: prefer L2, bypass L1.
// `volatile` keeps the compiler from reordering or eliding the load.
__device__ int ldgInt(const int* ptr) {
    int val;
    asm volatile("ld.global.cg.s32 %0, [%1];"
                 : "=r"(val) : "l"(ptr));
    return val;
}
Flash Attention: How It Works
Standard Attention requires O(N²) HBM memory to store the intermediate attention matrix S. Flash Attention tiles Q, K, V into SRAM and uses an online softmax trick to accumulate the output without ever materializing the full N×N matrix in HBM.
Key Idea: Online Softmax
Standard Attention HBM I/O:
Read Q, K, V → O(Nd)
Write S = QK^T → O(N^2) <-- bottleneck
Read S, write P → O(N^2)
Read P, write O → O(Nd)
Total: O(N^2 * d)
Flash Attention HBM I/O:
Read Q, K, V → O(Nd)
Write O → O(Nd)
Total: O(Nd) — no N^2 term!
// Simplified Flash Attention forward pass: one thread per query row.
// K/V are streamed through shared memory tile by tile while a running row
// max (mi) and softmax denominator (li) are maintained, so the full N x N
// score matrix is never materialized in HBM.
// Launch: gridDim.x query tiles of blockDim.x rows; dynamic shared memory
// must hold 3 * blockDim.x * headDim floats.
// NOTE(review): oi[64] caps headDim at 64, and loads are unguarded, so
// seqLen is assumed to be a multiple of blockDim.x — confirm at call sites.
__global__ void flashAttentionKernel(
    const float* Q, const float* K, const float* V,
    float* O, float* l, float* m,
    int seqLen, int headDim, float scale
) {
    int tileSize = blockDim.x;
    int qStart = blockIdx.x * tileSize;
    extern __shared__ float smem[];
    // Partition the dynamic shared buffer into Q, K, V tiles.
    float* sQ = smem;
    float* sK = smem + tileSize * headDim;
    float* sV = smem + 2 * tileSize * headDim;
    // Each thread loads its own query row into SRAM.
    for (int d = 0; d < headDim; d++) {
        sQ[threadIdx.x * headDim + d] =
            Q[(qStart + threadIdx.x) * headDim + d];
    }
    float mi = -INFINITY; // running row max for numerical stability
    float li = 0.0f;      // running softmax denominator
    float oi[64] = {};    // output accumulator (headDim <= 64)
    for (int kStart = 0; kStart < seqLen; kStart += tileSize) {
        // Cooperatively stage this K/V tile in SRAM (one row per thread).
        for (int d = 0; d < headDim; d++) {
            sK[threadIdx.x * headDim + d] =
                K[(kStart + threadIdx.x) * headDim + d];
            sV[threadIdx.x * headDim + d] =
                V[(kStart + threadIdx.x) * headDim + d];
        }
        __syncthreads();   // tile fully loaded before use
        // s_ij = q_i . k_j * scale, folded into the output immediately.
        for (int j = 0; j < tileSize; j++) {
            float sij = 0.0f;
            for (int d = 0; d < headDim; d++) {
                sij += sQ[threadIdx.x * headDim + d]
                    * sK[j * headDim + d];
            }
            sij *= scale;
            // Online softmax update: rescale prior state to the new max.
            float miNew = fmaxf(mi, sij);
            float liNew = li * expf(mi - miNew) + expf(sij - miNew);
            for (int d = 0; d < headDim; d++) {
                oi[d] = oi[d] * (li / liNew) * expf(mi - miNew)
                    + (expf(sij - miNew) / liNew) * sV[j * headDim + d];
            }
            mi = miNew;
            li = liNew;
        }
        __syncthreads();   // reads done before next tile overwrites sK/sV
    }
    // Store the normalized output row plus softmax stats (for backward).
    for (int d = 0; d < headDim; d++) {
        O[(qStart + threadIdx.x) * headDim + d] = oi[d];
    }
    l[qStart + threadIdx.x] = li;
    m[qStart + threadIdx.x] = mi;
}
CUDA Streams and Async Execution
CUDA streams enable overlapping of memory transfers and kernel execution, maximizing GPU utilization through pipelining.
// Process the input in chunkSize-sized pieces, round-robin across 4 CUDA
// streams so H2D copy, kernel, and D2H copy of different chunks overlap.
// NOTE(review): cudaMemcpyAsync only overlaps when h_input/h_output are
// pinned (cudaMallocHost); with pageable memory the copies serialize.
// Assumes totalSize is a multiple of chunkSize; API errors are unchecked.
void pipelinedProcessing(float* h_input, float* h_output,
                         int totalSize, int chunkSize) {
    const int numStreams = 4;
    cudaStream_t streams[numStreams];
    float *d_input[numStreams], *d_output[numStreams];
    // One stream plus one device buffer pair per pipeline slot.
    for (int i = 0; i < numStreams; i++) {
        cudaStreamCreate(&streams[i]);
        cudaMalloc(&d_input[i], chunkSize * sizeof(float));
        cudaMalloc(&d_output[i], chunkSize * sizeof(float));
    }
    // Pipeline: HtoD copy, kernel, DtoH copy interleaved across streams.
    // Reusing slot s for a later chunk is safe: work within one stream
    // executes in submission order.
    for (int chunk = 0; chunk < totalSize / chunkSize; chunk++) {
        int s = chunk % numStreams;
        float* hIn = h_input + chunk * chunkSize;
        float* hOut = h_output + chunk * chunkSize;
        // Async host-to-device copy.
        cudaMemcpyAsync(d_input[s], hIn,
                        chunkSize * sizeof(float),
                        cudaMemcpyHostToDevice, streams[s]);
        // Kernel in the same stream — runs after its chunk's copy-in.
        dim3 block(256);
        dim3 grid((chunkSize + 255) / 256);
        processKernel<<<grid, block, 0, streams[s]>>>(
            d_input[s], d_output[s], chunkSize);
        // Async device-to-host copy of the result.
        cudaMemcpyAsync(hOut, d_output[s],
                        chunkSize * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[s]);
    }
    // Drain every stream, then release streams and device buffers.
    for (int i = 0; i < numStreams; i++) {
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
        cudaFree(d_input[i]);
        cudaFree(d_output[i]);
    }
}
Inter-Stream Synchronization with Events
cudaEvent_t startEvent, stopEvent;
cudaEventCreate(&startEvent);
cudaEventCreate(&stopEvent);
// Timing anchor: recorded on stream1 before kernel1 is enqueued.
cudaEventRecord(startEvent, stream1);
kernel1<<<grid, block, 0, stream1>>>(d_data);
// FIX: cudaStreamWaitEvent only waits for work submitted to the stream
// *before* the event was recorded. The original recorded the event before
// launching kernel1, so stream2 never actually waited for kernel1. Record
// a dedicated event AFTER the kernel launch and wait on that instead.
cudaEvent_t kernel1Done;
cudaEventCreate(&kernel1Done);
cudaEventRecord(kernel1Done, stream1);
cudaStreamWaitEvent(stream2, kernel1Done, 0);
kernel2<<<grid, block, 0, stream2>>>(d_data);
cudaEventRecord(stopEvent, stream2);
cudaEventSynchronize(stopEvent);
// Elapsed GPU time between the two recorded events (kernel1 + kernel2).
float ms = 0;
cudaEventElapsedTime(&ms, startEvent, stopEvent);
printf("Elapsed: %.3f ms\n", ms);
Multi-GPU and NCCL
Peer-to-Peer Direct Access
// Enable direct GPU-to-GPU memory copy (NVLink or PCIe P2P)
int canAccess;
cudaDeviceCanAccessPeer(&canAccess, 0, 1);
if (canAccess) {
    // Enable P2P access in BOTH directions: cudaDeviceEnablePeerAccess is
    // one-way (it lets the *current* device access the peer's memory), so
    // the original 0->1-only enable left transfers toward GPU 0 unpeered.
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);  // GPU 0 may access GPU 1's memory
    cudaSetDevice(1);
    cudaDeviceEnablePeerAccess(0, 0);  // GPU 1 may access GPU 0's memory
    float* d_src; // on GPU 0
    float* d_dst; // on GPU 1
    cudaSetDevice(0);
    cudaMalloc(&d_src, size);
    cudaSetDevice(1);
    cudaMalloc(&d_dst, size);
    // Direct copy without staging through host
    cudaMemcpyPeer(d_dst, 1, d_src, 0, size);
}
NCCL Ring-AllReduce
#include <nccl.h>
// All-reduce (element-wise sum) across numGPUs devices: after this call,
// every d_buffers[i] holds the sum of all input buffers (in-place).
//
// numGPUs:    number of devices (ids 0..numGPUs-1 are used).
// d_buffers:  d_buffers[i] is a device allocation of `count` floats on GPU i.
// count:      number of float elements per buffer.
void ncclAllReduceExample(int numGPUs, float** d_buffers, size_t count) {
    ncclComm_t comms[numGPUs];   // one communicator per GPU (VLA: GCC/Clang extension)
    int devList[numGPUs];
    for (int i = 0; i < numGPUs; i++) devList[i] = i;
    ncclCommInitAll(comms, numGPUs, devList);
    cudaStream_t streams[numGPUs];
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStreamCreate(&streams[i]);
    }
    // Group the per-GPU calls so NCCL launches them as a single collective;
    // issuing them one by one from a single thread can deadlock.
    ncclGroupStart();
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        ncclAllReduce(d_buffers[i], d_buffers[i], count,
                      ncclFloat, ncclSum, comms[i], streams[i]);
    }
    ncclGroupEnd();
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);  // FIX: streams were leaked in the original
        ncclCommDestroy(comms[i]);
    }
}
The theoretical bandwidth cost of Ring-AllReduce is 2 * (N-1) / N * dataSize. Each of the N GPUs sends and receives dataSize / N chunks N-1 times across two phases (Reduce-Scatter + AllGather), giving 2 * (N-1) * (dataSize / N) total bytes transferred per GPU.
OpenAI Triton Custom Kernels
Triton lets you write GPU kernels in Python at near-CUDA performance. Triton's compiler handles tiling, vectorization, and shared memory allocation automatically.
Triton Matrix Multiply Kernel
import triton
import triton.language as tl
import torch
@triton.jit
def matmul_kernel(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    # Computes one (BLOCK_M x BLOCK_N) tile of C = A @ B.
    # The launch grid is 2D: axis 0 indexes row tiles, axis 1 column tiles.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Global row/column indices of the C tile owned by this program.
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    # Pointers to the first A (BLOCK_M x BLOCK_K) and B (BLOCK_K x BLOCK_N)
    # tiles, built from the caller-supplied strides (supports non-contiguous).
    a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
    # Accumulate in float32 regardless of input dtype.
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        # Masked loads zero-fill out-of-bounds elements at the M and K edges.
        a = tl.load(a_ptrs,
                    mask=(offs_m[:, None] < M) & (offs_k[None, :] < K - k),
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=(offs_k[:, None] < K - k) & (offs_n[None, :] < N),
                    other=0.0)
        acc += tl.dot(a, b)
        # Advance both tile pointers one BLOCK_K step along the K dimension.
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    # Write the finished tile, masking off rows/columns past M/N.
    c_ptrs = c_ptr + stride_cm * offs_m[:, None] + stride_cn * offs_n[None, :]
    c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, acc, mask=c_mask)
def matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Multiply two matrices with the Triton ``matmul_kernel``.

    a is (M, K) and b is (K, N); the result is a freshly allocated
    float32 tensor of shape (M, N) on the same device as ``a``.
    """
    M, K = a.shape
    K2, N = b.shape
    assert K == K2
    out = torch.empty((M, N), device=a.device, dtype=torch.float32)

    # One program instance per (BLOCK_M x BLOCK_N) output tile.
    def grid(meta):
        return (
            triton.cdiv(M, meta['BLOCK_M']),
            triton.cdiv(N, meta['BLOCK_N']),
        )

    matmul_kernel[grid](
        a, b, out,
        M, N, K,
        a.stride(0), a.stride(1),
        b.stride(0), b.stride(1),
        out.stride(0), out.stride(1),
        BLOCK_M=128, BLOCK_N=256, BLOCK_K=64,
    )
    return out
Flash Attention in Triton
@triton.jit
def flash_attn_fwd_kernel(
    Q_ptr, K_ptr, V_ptr, O_ptr,
    L_ptr, M_ptr,
    seqlen,
    # FIX: tl.arange requires compile-time-constant bounds, so headdim must
    # be a tl.constexpr (and a power of 2). As a plain runtime argument the
    # original kernel fails to compile at tl.arange(0, headdim).
    headdim: tl.constexpr,
    scale,
    BLOCK_Q: tl.constexpr,
    BLOCK_KV: tl.constexpr,
):
    """Flash Attention forward for one block of BLOCK_Q query rows.

    Tiles K/V along the sequence and maintains online-softmax running
    statistics (row max mi, denominator li) so the full seqlen x seqlen
    score matrix is never materialized in global memory. Writes the
    normalized output O plus the per-row stats L and M (for the backward).
    """
    start_q = tl.program_id(0) * BLOCK_Q
    offs_q = start_q + tl.arange(0, BLOCK_Q)
    offs_d = tl.arange(0, headdim)
    # Load this program's Q tile once; rows past seqlen are zero-filled.
    q = tl.load(Q_ptr + offs_q[:, None] * headdim + offs_d[None, :],
                mask=(offs_q[:, None] < seqlen), other=0.0)
    mi = tl.full((BLOCK_Q,), value=-float('inf'), dtype=tl.float32)  # running row max
    li = tl.zeros((BLOCK_Q,), dtype=tl.float32)                      # running softmax denominator
    acc = tl.zeros((BLOCK_Q, headdim), dtype=tl.float32)             # output accumulator
    for start_kv in range(0, seqlen, BLOCK_KV):
        offs_kv = start_kv + tl.arange(0, BLOCK_KV)
        k = tl.load(K_ptr + offs_kv[:, None] * headdim + offs_d[None, :],
                    mask=(offs_kv[:, None] < seqlen), other=0.0)
        v = tl.load(V_ptr + offs_kv[:, None] * headdim + offs_d[None, :],
                    mask=(offs_kv[:, None] < seqlen), other=0.0)
        # Local attention scores for this (Q tile, KV tile) pair.
        s = tl.dot(q, tl.trans(k)) * scale
        # Online-softmax update: rescale previous accumulator by the change
        # in the running max, then fold in the new tile's contribution.
        mi_new = tl.maximum(mi, tl.max(s, axis=1))
        p = tl.exp(s - mi_new[:, None])
        li_new = li * tl.exp(mi - mi_new) + tl.sum(p, axis=1)
        acc = acc * (li * tl.exp(mi - mi_new))[:, None] / li_new[:, None]
        acc += tl.dot(p, v) / li_new[:, None]
        mi = mi_new
        li = li_new
    # acc is already normalized by li, so it can be stored directly.
    tl.store(O_ptr + offs_q[:, None] * headdim + offs_d[None, :],
             acc, mask=(offs_q[:, None] < seqlen))
    tl.store(L_ptr + offs_q, li, mask=(offs_q < seqlen))
    tl.store(M_ptr + offs_q, mi, mask=(offs_q < seqlen))
Profiling with Nsight Compute and Systems
Nsight Compute: Kernel-Level Analysis
# Full profile for all metrics
ncu --set full -o profile_report ./my_cuda_app
# Profile only a specific kernel, 3 launches
ncu --kernel-name tiledMatMul --launch-count 3 ./my_cuda_app
# Collect memory bandwidth metrics for Roofline analysis
ncu --metrics \
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.sum,\
l1tex__t_bytes_pipe_lsu_mem_global_op_st.sum,\
sm__cycles_elapsed.avg,\
sm__cycles_elapsed.avg.per_second \
./my_cuda_app
# DRAM bandwidth utilization as % of peak
ncu --metrics \
gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed \
./my_cuda_app
Nsight Systems: Application-Level Timeline
# Capture full application trace
nsys profile --trace=cuda,nvtx,osrt \
--output=timeline_report \
./my_cuda_app
# Print summary statistics
nsys stats timeline_report.nsys-rep
# Export GPU trace details
nsys analyze --report gputrace timeline_report.nsys-rep
NVTX Markers for Custom Ranges
#include <nvtx3/nvToolsExt.h>
// Wraps the two phases in NVTX ranges so they appear as named spans on the
// Nsight Systems timeline (captured with --trace=nvtx).
void profiledFunction() {
    nvtxRangePush("DataPreprocessing");  // open named range
    preprocessData();
    nvtxRangePop();                      // close "DataPreprocessing"
    nvtxRangePush("ForwardPass");
    forwardPass();
    nvtxRangePop();                      // close "ForwardPass"
}
Roofline Model
The Roofline model determines whether a kernel is compute-bound or memory-bandwidth-bound.
Arithmetic Intensity (AI) = FLOP / Bytes accessed
Example for A100 80GB:
Peak FP16 Tensor Core throughput: 312 TFLOPS
HBM bandwidth: 2 TB/s
Ridge point: 312 / 2 = 156 FLOP/Byte
AI < 156 → memory-bandwidth bound
AI > 156 → compute bound
# Measure arithmetic intensity
ncu --metrics \
sm__ops_path_tensor_src_fp16.sum,\
dram__bytes.sum \
./matmul_app
# AI = (tensor_ops * 2) / dram_bytes
Quiz
Q1. What causes shared memory bank conflicts in CUDA, and how do you fix them?
Answer: A bank conflict occurs when two or more threads in the same warp access different addresses that map to the same shared memory bank, forcing the accesses to serialize.
Explanation: Shared memory has 32 banks, each 4 bytes wide. Thread i naturally accesses bank i % 32. A classic example is matrix transposition: reading tile[threadIdx.x][threadIdx.y] makes all 32 threads of a warp compute addresses that differ by exactly 32 floats, so they all map to the same bank (bank index threadIdx.y) — a 32-way conflict. The fix is to pad the inner dimension by 1: __shared__ float tile[32][33]. This shifts every row's base address by one bank, so threads fan out across distinct banks with zero conflicts.
Q2. How does warp divergence affect performance, and what patterns minimize it?
Answer: Warp divergence serializes both branches of a conditional within a warp, potentially doubling execution time for that warp. With k distinct paths, execution time scales up to k times the single-path cost.
Explanation: Under SIMT, all 32 threads must execute the same instruction. When a branch is taken by only some threads, the hardware masks the others and reruns the warp for each divergent path. Mitigation strategies: (1) align branch conditions to warp boundaries using warp ID instead of thread ID, so every thread in a warp takes the same path; (2) use warp-vote intrinsics like __all_sync and __any_sync to check uniform conditions; (3) sort or rearrange input data so work of the same type is grouped into the same warps.
Q3. What hardware principle makes Tensor Cores faster than FP32 CUDA Cores for matrix multiply?
Answer: Tensor Cores implement a dedicated 4x4 FMA array in silicon that computes an entire 16x16x16 warp-level MMA in a handful of clocks, operating on lower-precision inputs (FP16/BF16/INT8) to maximize throughput.
Explanation: A standard CUDA Core executes one scalar FP32 FMA per clock. A Tensor Core fuses 256 multiply-adds (16x16 dot products of length 16) into a single hardware operation, sharing instruction overhead across the whole warp. On A100, each SM has 4 Tensor Core units delivering 1,024 dense FP16 FMAs — i.e., 2,048 FP16 FLOPs — per clock per SM. Additionally, Tensor Cores accumulate in FP32 while computing in FP16 (mixed precision), preserving numerical stability. This is why torch.cuda.amp and cuBLAS both route GEMM operations through Tensor Cores when available.
Q4. Why is Flash Attention more memory-efficient than standard Attention from an HBM vs. SRAM perspective?
Answer: Standard Attention materializes the full N×N attention matrix in HBM (O(N²) storage), while Flash Attention tiles Q, K, V into on-chip SRAM and uses an online softmax to accumulate the output in SRAM — reducing HBM I/O from O(N²d) to O(Nd).
Explanation: In standard Attention the bottleneck is writing and re-reading the N×N score matrix S and probability matrix P to HBM. Flash Attention divides Q into row tiles and K, V into column tiles that fit in SRAM. For each tile pair it computes local scores and incrementally updates (mi, li, oi) using the online softmax recurrence: mi_new = max(mi, s_ij), li_new = li * exp(mi - mi_new) + exp(s_ij - mi_new). Only the final output O is written back to HBM. SRAM bandwidth is approximately 10x higher than HBM, so even though more arithmetic is performed, wall-clock time is lower because HBM traffic drops from O(N²) to O(N).
Q5. Why does NCCL Ring-AllReduce transfer exactly 2*(N-1)/N * data_size bytes per GPU regardless of N?
Answer: Ring-AllReduce consists of a Reduce-Scatter phase and an AllGather phase, each requiring N-1 steps in which every GPU sends a chunk of size data_size / N. The total is 2 * (N-1) * (data_size / N).
Explanation: Arrange N GPUs in a logical ring. In Reduce-Scatter, each GPU holds one data_size / N chunk that is the partial sum of the corresponding slice from all N GPUs after N-1 rounds of send-and-accumulate. Each round moves exactly data_size / N bytes. In AllGather, each GPU broadcasts its reduced chunk around the ring in another N-1 rounds. Total bytes per GPU = 2 * (N-1) * (data_size / N) = 2 * (N-1) / N * data_size. As N grows this approaches 2 * data_size, which is a constant — far better than the naive broadcast cost of (N-1) * data_size that grows linearly with N.