Split View: CUDA GPU 프로그래밍 심화: Warp 최적화, Tensor Core, Triton 커널 작성까지
CUDA GPU 프로그래밍 심화: Warp 최적화, Tensor Core, Triton 커널 작성까지
목차
- CUDA 메모리 계층 구조
- 워프(Warp) 최적화
- 병렬 패턴 구현
- Tensor Core와 WMMA API
- Flash Attention 구현 원리
- CUDA 스트림과 비동기 실행
- 멀티-GPU와 NCCL
- OpenAI Triton 커스텀 커널
- 프로파일링: Nsight Compute & Systems
- 퀴즈
CUDA 메모리 계층 구조
GPU 성능 최적화의 핵심은 메모리 계층을 올바르게 이해하고 활용하는 것입니다. CUDA 메모리는 위치, 접근 속도, 용량에 따라 다음과 같이 구분됩니다.
메모리 유형별 특성
| 메모리 유형 | 위치 | 레이턴시 | 대역폭 | 용량 | 범위 |
|---|---|---|---|---|---|
| Register | 온칩 | ~1 cycle | 매우 높음 | 스레드당 255개 | 스레드 |
| Shared Memory | 온칩 | ~5 cycles | ~19 TB/s | SM당 48-164KB | 블록 |
| L1 Cache | 온칩 | ~28 cycles | ~19 TB/s | SM당 128KB | SM |
| L2 Cache | 오프칩 | ~193 cycles | ~7 TB/s | 수십 MB | 디바이스 |
| Global Memory | HBM | ~600 cycles | ~2 TB/s | 수십 GB | 디바이스 |
| Constant Memory | 캐시됨 | ~5 cycles(캐시히트) | - | 64KB | 디바이스 |
| Texture Memory | 캐시됨 | ~5 cycles(캐시히트) | - | L1에 캐시 | 디바이스 |
Shared Memory 활용 예시
Shared memory는 같은 블록 내 스레드들이 공유하는 온칩 메모리로, global memory 접근을 줄이는 핵심 수단입니다.
// Tiled Matrix Multiplication with Shared Memory.
// Computes C = A @ B for row-major A (MxK), B (KxN), C (MxN).
// Launch with blockDim = (TILE_SIZE, TILE_SIZE); each thread produces one
// element of C while its block cooperatively stages TILE_SIZE x TILE_SIZE
// tiles of A and B in shared memory to reduce global-memory traffic.
#define TILE_SIZE 16
__global__ void tiledMatMul(
    float* A, float* B, float* C,
    int M, int N, int K
) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;
    int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;   // ceil(K / TILE_SIZE)
    for (int t = 0; t < numTiles; t++) {
        // Cooperatively load one tile pair into shared memory,
        // zero-padding elements that fall outside the matrices.
        int aCol = t * TILE_SIZE + threadIdx.x;
        int bRow = t * TILE_SIZE + threadIdx.y;
        tileA[threadIdx.y][threadIdx.x] =
            (row < M && aCol < K) ? A[row * K + aCol] : 0.0f;
        tileB[threadIdx.y][threadIdx.x] =
            (bRow < K && col < N) ? B[bRow * N + col] : 0.0f;
        __syncthreads();   // tiles fully written before any thread reads them
        // Multiply the two tiles; each thread accumulates its dot-product slice.
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }
        __syncthreads();   // all reads done before the next iteration overwrites
    }
    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}
Constant Memory와 Texture Memory
Constant memory는 읽기 전용 데이터를 캐싱하며, 모든 스레드가 동일한 주소에 접근할 때 브로드캐스트 메커니즘으로 단일 사이클에 처리됩니다.
// 256-tap filter weights in constant memory: read-only, cached, and served
// by broadcast in a single cycle when all threads in a warp read the same
// address (which they do here, since `i` is warp-uniform).
__constant__ float convWeights[256];
// 1-D correlation of `input` against the constant-memory filter.
// NOTE(review): reads input[idx + i] for i in [0, 255], so `input` must
// hold at least n + 255 elements — confirm the allocation at call sites.
__global__ void convKernel(float* input, float* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float result = 0.0f;
        for (int i = 0; i < 256; i++) {
            result += input[idx + i] * convWeights[i]; // broadcast access
        }
        output[idx] = result;
    }
}
워프(Warp) 최적화
Warp Divergence 제거
CUDA에서 32개의 스레드는 하나의 워프(warp)를 구성하며, SIMT 방식으로 동일한 명령을 실행합니다. 조건 분기가 워프 내에서 발생하면 직렬 실행되어 성능이 저하됩니다.
// Bad: the branch condition alternates between adjacent threads
// (0,2,4,... vs 1,3,5,...), so every warp contains both paths and
// executes them serially (warp divergence).
__global__ void badKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard the grid tail (n was previously unused)
    if (idx % 2 == 0) {    // diverges inside every warp
        data[idx] = data[idx] * 2.0f;
    } else {
        data[idx] = data[idx] + 1.0f;
    }
}
// Good: the branch condition is uniform across each warp of 32 threads,
// so no warp ever executes both paths — no divergence.
__global__ void goodKernel(float* data, int n) {
    int warpId = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard the grid tail
    if (warpId % 2 == 0) { // warp-uniform branch — no divergence
        data[idx] = data[idx] * 2.0f;
    } else {
        data[idx] = data[idx] + 1.0f;
    }
}
Coalesced Memory Access
연속된 메모리 접근(coalesced access)은 한 번의 트랜잭션으로 처리됩니다. 스트라이드 접근이나 불규칙 접근은 다수의 트랜잭션을 유발합니다.
// NOTE(review): the original "bad/good" labels here were inverted. On a
// GPU, coalescing is decided by the intra-warp pattern at each instruction,
// not by the per-thread stride across loop iterations.
// columnAccess (one thread per column): at every iteration adjacent
// threads (col = tid, tid+1, ...) read adjacent addresses
// matrix[row * N + col] — one contiguous segment per warp: COALESCED.
// (Illustrative snippet: `sum` is never written back.)
__global__ void columnAccess(float* matrix, int M, int N) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int row = 0; row < M; row++) {
        sum += matrix[row * N + col]; // warp reads 32 consecutive floats
    }
}
// rowAccess (one thread per row): adjacent threads read addresses N floats
// apart at every iteration, so a warp's 32 loads hit 32 separate memory
// segments — this is the UNCOALESCED, stride-N pattern.
__global__ void rowAccess(float* matrix, int M, int N) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int col = 0; col < N; col++) {
        sum += matrix[row * N + col]; // stride-N across the warp
    }
}
Shared Memory Bank Conflict 해결
Shared memory는 32개의 bank로 나뉩니다. 같은 워프 내 스레드들이 동일한 bank의 서로 다른 주소에 접근하면 bank conflict가 발생해 직렬화됩니다.
// 나쁜 예: 2-way bank conflict (stride=2)
__shared__ float smem[32][32];
// threadIdx.x=0 → smem[0][0], threadIdx.x=1 → smem[0][2] (stride-2 접근)
// 스레드 i와 i+16이 같은 bank에 매핑되어 2-way conflict 발생
// 좋은 예: 패딩으로 bank conflict 제거
__shared__ float smem_padded[32][33]; // 열에 1 패딩 추가
// threadIdx.x=0 → bank 0, threadIdx.x=1 → bank 1, ...
// Transpose an N x N matrix through a padded shared-memory tile.
// Both the global read and the global write are coalesced; the +1 column
// padding shifts each row of `tile` into a different bank, so the
// transposed read tile[threadIdx.x][threadIdx.y] is conflict-free.
// Launch with blockDim = (TILE_SIZE, TILE_SIZE).
__global__ void matTransposeNoBankConflict(float* in, float* out, int N) {
    __shared__ float tile[TILE_SIZE][TILE_SIZE + 1]; // +1 padding kills conflicts
    int x = blockIdx.x * TILE_SIZE + threadIdx.x;
    int y = blockIdx.y * TILE_SIZE + threadIdx.y;
    if (x < N && y < N)
        tile[threadIdx.y][threadIdx.x] = in[y * N + x];
    __syncthreads();   // tile fully populated before transposed reads
    // Swap block coordinates: this block writes the mirrored tile location.
    x = blockIdx.y * TILE_SIZE + threadIdx.x;
    y = blockIdx.x * TILE_SIZE + threadIdx.y;
    if (x < N && y < N)
        out[y * N + x] = tile[threadIdx.x][threadIdx.y];
}
병렬 패턴 구현
Parallel Reduction
// Warp-level sum via shuffle: each step halves the number of contributing
// lanes; after 5 steps lane 0 holds the warp total. The full mask assumes
// all 32 lanes participate.
__device__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}
// Block-level sum: warp-reduce, stage one partial per warp in shared
// memory, reduce the partials in the first warp, then atomically add the
// block total into *output (caller must zero *output beforehand).
// Supports blockDim.x up to 1024 (32 warps).
__global__ void blockReduceSum(float* input, float* output, int n) {
    __shared__ float shared[32];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    float val = (idx < n) ? input[idx] : 0.0f;
    // Step 1: reduce within each warp.
    val = warpReduceSum(val);
    // Step 2: lane 0 of each warp publishes its partial sum.
    if (tid % 32 == 0) shared[tid / 32] = val;
    __syncthreads();
    // Warp count rounded UP so a trailing partial warp is not dropped
    // (the original blockDim.x / 32 truncated — e.g. to 0 for blocks
    // smaller than 32 threads, silently zeroing the result).
    int numWarps = (blockDim.x + 31) / 32;
    // Step 3: final reduction over the per-warp partials in warp 0.
    if (tid < 32) {
        val = (tid < numWarps) ? shared[tid] : 0.0f;
        val = warpReduceSum(val);
        if (tid == 0) atomicAdd(output, val);
    }
}
Prefix Sum (Exclusive Scan)
// Work-efficient (Blelloch) EXCLUSIVE prefix sum over one block's data.
// Each thread owns two elements: the up-sweep builds a tree of partial
// sums in place, the root is cleared, and the down-sweep distributes
// prefixes back down the tree.
// Assumptions (TODO confirm at call sites): single-block launch,
// n is a power of two, n == 2 * blockDim.x, and n <= 2048 (buffer size).
__global__ void exclusiveScan(int* input, int* output, int n) {
    __shared__ int temp[1024 * 2];
    int tid = threadIdx.x;
    int offset = 1;
    // Load two elements per thread, zero-padding past the end of input.
    temp[2 * tid] = (2 * tid < n) ? input[2 * tid] : 0;
    temp[2 * tid + 1] = (2 * tid + 1 < n) ? input[2 * tid + 1] : 0;
    // Up-sweep (reduce) phase: build partial sums bottom-up.
    for (int d = n >> 1; d > 0; d >>= 1) {
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset <<= 1;
    }
    // Clear the root — the identity element of an exclusive scan.
    if (tid == 0) temp[n - 1] = 0;
    // Down-sweep phase: propagate prefixes back down the tree.
    for (int d = 1; d < n; d <<= 1) {
        offset >>= 1;
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();   // scan complete before writing results out
    if (2 * tid < n) output[2 * tid] = temp[2 * tid];
    if (2 * tid + 1 < n) output[2 * tid + 1] = temp[2 * tid + 1];
}
Tensor Core와 WMMA API
Tensor Core는 Volta 아키텍처부터 도입된 특수 하드웨어 유닛으로, 4x4 행렬 곱셈을 단일 클록 사이클에 처리합니다. A100 GPU 기준 FP16 Tensor Core는 312 TFLOPS를 달성하며, FP32 CUDA core(19.5 TFLOPS) 대비 약 16배 높은 처리량을 제공합니다.
WMMA(Warp Matrix Multiply Accumulate) API
#include <mma.h>
using namespace nvcuda;
// WMMA 타일 크기 (Volta/Turing/Ampere 지원 크기)
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
// Mixed-precision GEMM on Tensor Cores via WMMA (requires SM70+):
// each warp computes one 16x16 tile of C = A @ B with FP16 inputs and
// FP32 accumulation. A is row-major (MxK), C is row-major (MxN).
// NOTE(review): bFrag is declared wmma::col_major but B is addressed as
// B + bRow * N + bCol with leading dimension N (row-major layout) — the
// caller's B layout must match the fragment declaration; verify.
__global__ void tensorCoreMatMul(
    half* A, half* B, float* C,
    int M, int N, int K
) {
    // Per-warp WMMA fragments, distributed across the warp's registers.
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K,
                   half, wmma::row_major> aFrag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K,
                   half, wmma::col_major> bFrag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K,
                   float> cFrag;
    // One warp per output tile: warpM indexes tile rows, warpN tile columns.
    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
    // Zero the accumulator fragment.
    wmma::fill_fragment(cFrag, 0.0f);
    // March along the K dimension in WMMA_K-wide steps.
    for (int i = 0; i < K; i += WMMA_K) {
        int aRow = warpM * WMMA_M;
        int aCol = i;
        int bRow = i;
        int bCol = warpN * WMMA_N;
        if (aRow < M && aCol < K && bRow < K && bCol < N) {
            // Load one tile of A and B (leading dimensions K and N).
            wmma::load_matrix_sync(aFrag, A + aRow * K + aCol, K);
            wmma::load_matrix_sync(bFrag, B + bRow * N + bCol, N);
            // Tensor Core MMA: cFrag += aFrag * bFrag.
            wmma::mma_sync(cFrag, aFrag, bFrag, cFrag);
        }
    }
    // Write the warp's finished 16x16 tile back to global memory.
    int cRow = warpM * WMMA_M;
    int cCol = warpN * WMMA_N;
    if (cRow < M && cCol < N) {
        wmma::store_matrix_sync(C + cRow * N + cCol, cFrag, N,
                                wmma::mem_row_major);
    }
}
PTX 인라인 어셈블리
성능 한계를 극한까지 끌어올리기 위해 PTX(Parallel Thread Execution) 인라인 어셈블리를 사용할 수 있습니다.
// Approximate single-precision square root via the PTX sqrt.approx.f32
// instruction — faster but less accurate than sqrtf().
__device__ float fastSqrt(float x) {
    float result;
    asm("sqrt.approx.f32 %0, %1;" : "=f"(result) : "f"(x));
    return result;
}
// Global load with the .cg cache hint: cache at L2, bypass L1.
// `volatile` keeps the compiler from reordering or eliding the load.
__device__ int ldgInt(const int* ptr) {
    int val;
    asm volatile("ld.global.cg.s32 %0, [%1];"
                 : "=r"(val) : "l"(ptr));
    return val;
}
Flash Attention 구현 원리
표준 Attention은 시퀀스 길이 N에 대해 O(N²) 메모리를 HBM에 저장해야 합니다. Flash Attention은 SRAM(shared memory)을 타일 단위로 활용하여 HBM 접근을 최소화합니다.
핵심 아이디어: 온라인 소프트맥스 (Online Softmax)
표준 Attention:
1. S = Q @ K^T → HBM에 N×N 행렬 저장 (O(N²))
2. P = softmax(S) → HBM에서 읽고 쓰기
3. O = P @ V → HBM에서 읽고 쓰기
Flash Attention:
- Q, K, V를 타일로 분할 → SRAM에 로드
- 온라인 소프트맥스로 행 최댓값/분모를 점진적으로 갱신
- HBM에는 최종 O만 저장 → HBM 읽기/쓰기 O(N) 달성
// Simplified Flash Attention forward pass: one thread per query row.
// K/V are streamed through shared memory tile by tile while a running
// row max (mi) and softmax denominator (li) are maintained, so the full
// N x N score matrix is never materialized in HBM.
// Launch: gridDim.x query tiles of blockDim.x rows; dynamic shared memory
// must hold 3 * blockDim.x * headDim floats.
// NOTE(review): oi[64] caps headDim at 64, and loads are unguarded, so
// seqLen is assumed to be a multiple of blockDim.x — confirm at call sites.
__global__ void flashAttentionKernel(
    const float* Q, const float* K, const float* V,
    float* O, float* l, float* m,
    int seqLen, int headDim, float scale
) {
    int tileSize = blockDim.x;
    int qStart = blockIdx.x * tileSize;
    extern __shared__ float smem[];
    // Partition the dynamic shared buffer into Q, K, V tiles.
    float* sQ = smem;
    float* sK = smem + tileSize * headDim;
    float* sV = smem + 2 * tileSize * headDim;
    // Each thread loads its own query row into SRAM.
    for (int d = 0; d < headDim; d++) {
        sQ[threadIdx.x * headDim + d] =
            Q[(qStart + threadIdx.x) * headDim + d];
    }
    float mi = -INFINITY; // running row max (numerical stability)
    float li = 0.0f;      // running softmax denominator
    float oi[64] = {};    // output accumulator (headDim <= 64)
    // Sweep K/V tiles along the sequence.
    for (int kStart = 0; kStart < seqLen; kStart += tileSize) {
        // Cooperatively stage this K/V tile in SRAM (one row per thread).
        for (int d = 0; d < headDim; d++) {
            sK[threadIdx.x * headDim + d] =
                K[(kStart + threadIdx.x) * headDim + d];
            sV[threadIdx.x * headDim + d] =
                V[(kStart + threadIdx.x) * headDim + d];
        }
        __syncthreads();   // tile fully loaded before use
        // s_ij = q_i . k_j * scale, folded into the output immediately.
        for (int j = 0; j < tileSize; j++) {
            float sij = 0.0f;
            for (int d = 0; d < headDim; d++) {
                sij += sQ[threadIdx.x * headDim + d]
                    * sK[j * headDim + d];
            }
            sij *= scale;
            // Online softmax update: rescale prior state to the new max.
            float miNew = fmaxf(mi, sij);
            float liNew = li * expf(mi - miNew) + expf(sij - miNew);
            for (int d = 0; d < headDim; d++) {
                oi[d] = oi[d] * (li / liNew) * expf(mi - miNew)
                    + (expf(sij - miNew) / liNew) * sV[j * headDim + d];
            }
            mi = miNew;
            li = liNew;
        }
        __syncthreads();   // all reads done before next tile overwrites sK/sV
    }
    // Store the normalized output row plus softmax stats (for backward).
    for (int d = 0; d < headDim; d++) {
        O[(qStart + threadIdx.x) * headDim + d] = oi[d];
    }
    l[qStart + threadIdx.x] = li;
    m[qStart + threadIdx.x] = mi;
}
CUDA 스트림과 비동기 실행
CUDA 스트림을 통해 메모리 복사와 커널 실행을 오버랩하여 GPU 활용도를 높일 수 있습니다.
#include <cuda_runtime.h>
// Process the input in chunkSize-sized pieces, round-robin across 4 CUDA
// streams so H2D copy, kernel, and D2H copy of different chunks overlap.
// NOTE(review): cudaMemcpyAsync only overlaps when h_input/h_output are
// pinned (cudaMallocHost); with pageable memory the copies serialize.
// Assumes totalSize is a multiple of chunkSize; API errors are unchecked.
void pipelinedProcessing(float* h_input, float* h_output,
                         int totalSize, int chunkSize) {
    const int numStreams = 4;
    cudaStream_t streams[numStreams];
    float *d_input[numStreams], *d_output[numStreams];
    // One stream plus one device buffer pair per pipeline slot.
    for (int i = 0; i < numStreams; i++) {
        cudaStreamCreate(&streams[i]);
        cudaMalloc(&d_input[i], chunkSize * sizeof(float));
        cudaMalloc(&d_output[i], chunkSize * sizeof(float));
    }
    // Pipeline: copy-in -> kernel -> copy-out, interleaved across streams.
    // Reusing slot s for a later chunk is safe: work within one stream
    // executes in submission order.
    for (int chunk = 0; chunk < totalSize / chunkSize; chunk++) {
        int s = chunk % numStreams;
        float* hIn = h_input + chunk * chunkSize;
        float* hOut = h_output + chunk * chunkSize;
        // Async host-to-device copy.
        cudaMemcpyAsync(d_input[s], hIn,
                        chunkSize * sizeof(float),
                        cudaMemcpyHostToDevice, streams[s]);
        // Kernel in the same stream — runs after its chunk's copy-in.
        dim3 block(256);
        dim3 grid((chunkSize + 255) / 256);
        processKernel<<<grid, block, 0, streams[s]>>>(
            d_input[s], d_output[s], chunkSize);
        // Async device-to-host copy of the result.
        cudaMemcpyAsync(hOut, d_output[s],
                        chunkSize * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[s]);
    }
    // Drain every stream, then release streams and device buffers.
    for (int i = 0; i < numStreams; i++) {
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
        cudaFree(d_input[i]);
        cudaFree(d_output[i]);
    }
}
스트림 이벤트 동기화
// Cross-stream ordering with events (fragment: stream1/stream2, kernel1/2,
// grid/block, and d_data are defined by surrounding code).
cudaEvent_t startEvent, stopEvent;
cudaEventCreate(&startEvent);
cudaEventCreate(&stopEvent);
kernel1<<<grid, block, 0, stream1>>>(d_data);
// Record AFTER enqueuing kernel1: an event completes only once all work
// previously submitted to its stream (including kernel1) has finished.
// (The original recorded the event before the launch, so the wait below
// did not actually cover kernel1.)
cudaEventRecord(startEvent, stream1);
// stream2 blocks until startEvent — i.e. kernel1 — has completed.
cudaStreamWaitEvent(stream2, startEvent, 0);
kernel2<<<grid, block, 0, stream2>>>(d_data);
cudaEventRecord(stopEvent, stream2);
cudaEventSynchronize(stopEvent);
// GPU time elapsed between the two recorded events, in milliseconds.
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, startEvent, stopEvent);
멀티-GPU와 NCCL
P2P(Peer-to-Peer) 통신
// Direct GPU-0 -> GPU-1 memory copy over NVLink/PCIe P2P.
// (Fragment: `size` is assumed to be defined by surrounding code.)
cudaSetDevice(0);
float* d_src;
cudaMalloc(&d_src, size);   // source buffer on GPU 0
cudaSetDevice(1);
float* d_dst;
cudaMalloc(&d_dst, size);   // destination buffer on GPU 1
// Check whether device 0 can access device 1's memory directly.
int canAccess;
cudaDeviceCanAccessPeer(&canAccess, 0, 1);
if (canAccess) {
    cudaSetDevice(0);
    // One-directional: device 0 may now access device 1's memory.
    cudaDeviceEnablePeerAccess(1, 0);
    // Direct copy without staging through host memory.
    cudaMemcpyPeer(d_dst, 1, d_src, 0, size);
}
NCCL Ring-AllReduce
#include <nccl.h>
// In-place sum-AllReduce of `count` floats across numGPUs GPUs within a
// single process; d_buffers[i] is a device buffer resident on GPU i.
// NOTE(review): comms/devList/streams are C-style VLAs (a compiler
// extension in C++); NCCL/CUDA status codes are unchecked and the streams
// are never destroyed — acceptable for a tutorial, not for production.
void ncclAllReduceExample(int numGPUs, float** d_buffers, size_t count) {
    ncclComm_t comms[numGPUs];
    int devList[numGPUs];
    for (int i = 0; i < numGPUs; i++) devList[i] = i;
    // One communicator per GPU, all owned by this process.
    ncclCommInitAll(comms, numGPUs, devList);
    cudaStream_t streams[numGPUs];
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStreamCreate(&streams[i]);
    }
    // Group the per-GPU calls so NCCL can schedule them without deadlock.
    ncclGroupStart();
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        // Sum across all GPUs; every GPU receives the full result in place.
        ncclAllReduce(d_buffers[i], d_buffers[i], count,
                      ncclFloat, ncclSum, comms[i], streams[i]);
    }
    ncclGroupEnd();
    // Wait for completion, then tear down the communicators.
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        ncclCommDestroy(comms[i]);
    }
}
Ring-AllReduce의 이론적 전송량은 2 * (N-1) / N * dataSize입니다. N개의 GPU가 링을 이루어 Reduce-Scatter와 AllGather 각각 N-1 단계를 수행하며, 각 단계에서 dataSize / N 크기를 전송합니다. 따라서 총 전송량은 2 * (N-1) * (dataSize / N)이 됩니다.
OpenAI Triton 커스텀 커널
Triton은 Python으로 GPU 커널을 작성하면서 CUDA 수준의 성능을 달성할 수 있는 프레임워크입니다.
Triton 행렬 곱셈 커널
import triton
import triton.language as tl
import torch
@triton.jit
def matmul_kernel(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    """Compute one (BLOCK_M, BLOCK_N) tile of C = A @ B in fp32.

    The grid is 2-D: program (pid_m, pid_n) owns the output tile whose
    top-left corner is (pid_m * BLOCK_M, pid_n * BLOCK_N). Out-of-range
    rows/columns are masked on load (read as 0.0) and on store.
    """
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Row/column offsets of this program's output tile.
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    # Pointers to the first A and B tiles (stride-based addressing).
    a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        # Masked loads: K - k bounds the valid remainder of the K dimension.
        a = tl.load(a_ptrs, mask=(offs_m[:, None] < M) & (offs_k[None, :] < K - k), other=0.0)
        b = tl.load(b_ptrs, mask=(offs_k[:, None] < K - k) & (offs_n[None, :] < N), other=0.0)
        acc += tl.dot(a, b)
        # Advance both tile pointers one BLOCK_K step along K.
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    # Masked store of the finished tile.
    c_ptrs = c_ptr + stride_cm * offs_m[:, None] + stride_cn * offs_n[None, :]
    c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, acc, mask=c_mask)
def matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Multiply two 2-D tensors with the Triton kernel; returns fp32 C = a @ b."""
    M, K = a.shape
    Kb, N = b.shape
    assert K == Kb
    c = torch.empty((M, N), device=a.device, dtype=torch.float32)

    def grid(meta):
        # One Triton program per (BLOCK_M, BLOCK_N) output tile.
        return (
            triton.cdiv(M, meta['BLOCK_M']),
            triton.cdiv(N, meta['BLOCK_N']),
        )

    matmul_kernel[grid](
        a, b, c,
        M, N, K,
        a.stride(0), a.stride(1),
        b.stride(0), b.stride(1),
        c.stride(0), c.stride(1),
        BLOCK_M=128, BLOCK_N=256, BLOCK_K=64,
    )
    return c
Flash Attention in Triton
@triton.jit
def flash_attn_fwd_kernel(
    Q_ptr, K_ptr, V_ptr, O_ptr,
    L_ptr, M_ptr,
    # headdim must be tl.constexpr: tl.arange and tl.zeros require
    # compile-time-constant shapes, so a runtime int here fails to compile.
    seqlen, headdim: tl.constexpr, scale,
    BLOCK_Q: tl.constexpr,
    BLOCK_KV: tl.constexpr,
):
    """Flash Attention forward pass for one block of BLOCK_Q query rows.

    Streams K/V tiles along the sequence while maintaining a running row
    max (mi) and softmax denominator (li), so the full N x N score matrix
    is never materialized. Stores the normalized output plus per-row li/mi
    statistics (consumed by the backward pass).
    """
    start_q = tl.program_id(0) * BLOCK_Q
    offs_q = start_q + tl.arange(0, BLOCK_Q)
    offs_d = tl.arange(0, headdim)
    # Load the query block once; it is reused against every K/V tile.
    q = tl.load(Q_ptr + offs_q[:, None] * headdim + offs_d[None, :],
                mask=(offs_q[:, None] < seqlen), other=0.0)
    mi = tl.full((BLOCK_Q,), value=-float('inf'), dtype=tl.float32)
    li = tl.zeros((BLOCK_Q,), dtype=tl.float32)
    acc = tl.zeros((BLOCK_Q, headdim), dtype=tl.float32)
    for start_kv in range(0, seqlen, BLOCK_KV):
        offs_kv = start_kv + tl.arange(0, BLOCK_KV)
        k = tl.load(K_ptr + offs_kv[:, None] * headdim + offs_d[None, :],
                    mask=(offs_kv[:, None] < seqlen), other=0.0)
        v = tl.load(V_ptr + offs_kv[:, None] * headdim + offs_d[None, :],
                    mask=(offs_kv[:, None] < seqlen), other=0.0)
        # Scaled attention scores for this tile: (BLOCK_Q, BLOCK_KV).
        s = tl.dot(q, tl.trans(k)) * scale
        # Online softmax: rescale the prior accumulator to the new row max.
        mi_new = tl.maximum(mi, tl.max(s, axis=1))
        p = tl.exp(s - mi_new[:, None])
        li_new = li * tl.exp(mi - mi_new) + tl.sum(p, axis=1)
        acc = acc * (li * tl.exp(mi - mi_new))[:, None] / li_new[:, None]
        acc += tl.dot(p, v) / li_new[:, None]
        mi = mi_new
        li = li_new
    # acc is already normalized; store the output and softmax statistics.
    tl.store(O_ptr + offs_q[:, None] * headdim + offs_d[None, :],
             acc, mask=(offs_q[:, None] < seqlen))
    tl.store(L_ptr + offs_q, li, mask=(offs_q < seqlen))
    tl.store(M_ptr + offs_q, mi, mask=(offs_q < seqlen))
프로파일링: Nsight Compute & Systems
Nsight Compute 명령어
# 커널 상세 분석
ncu --set full -o profile_report ./my_cuda_app
# 특정 커널만 프로파일링
ncu --kernel-name tiledMatMul --launch-count 3 ./my_cuda_app
# 메모리 대역폭 분석 (Roofline 모델용)
ncu --metrics \
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.sum,\
l1tex__t_bytes_pipe_lsu_mem_global_op_st.sum,\
sm__cycles_elapsed.avg,\
sm__cycles_elapsed.avg.per_second \
./my_cuda_app
# DRAM 대역폭 활용률
ncu --metrics \
gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed \
./my_cuda_app
Nsight Systems 타임라인 분석
# 전체 애플리케이션 타임라인 수집
nsys profile --trace=cuda,nvtx,osrt \
--output=timeline_report \
./my_cuda_app
# 통계 요약 출력
nsys stats timeline_report.nsys-rep
# GPU 활동도 분석
nsys analyze --report gputrace timeline_report.nsys-rep
NVTX 마커로 코드 구간 표시
#include <nvtx3/nvToolsExt.h>
void profiledFunction() {
nvtxRangePush("DataPreprocessing");
preprocessData();
nvtxRangePop();
nvtxRangePush("ForwardPass");
forwardPass();
nvtxRangePop();
}
Roofline 모델 분석
Roofline 모델은 커널의 성능 병목이 연산 바운드인지 메모리 대역폭 바운드인지 판단합니다.
산술 강도(Arithmetic Intensity) = FLOP / Bytes
예시 (A100 80GB):
- 피크 FP16 Tensor Core: 312 TFLOPS
- HBM 대역폭: 2 TB/s
- Ridge Point: 312 / 2 = 156 FLOP/Byte
커널의 산술 강도 < 156 → 메모리 바운드
커널의 산술 강도 > 156 → 연산 바운드
# 커널 산술 강도 측정
ncu --metrics \
sm__ops_path_tensor_src_fp16.sum,\
dram__bytes.sum \
./matmul_app
# 산술 강도 = tensor_ops * 2 / dram_bytes
퀴즈
Q1. CUDA에서 shared memory bank conflict가 발생하는 조건과 해결 방법은?
정답: 같은 워프 내 두 개 이상의 스레드가 shared memory의 동일한 bank에 속하는 서로 다른 주소에 동시에 접근할 때 bank conflict가 발생합니다.
설명: Shared memory는 32개의 bank로 나뉘며, 각 bank는 4바이트 단위로 주소를 할당합니다. 스레드 i는 bank i % 32에 기본 접근합니다. 예를 들어 float smem[32][32]를 전치 과정에서 smem[threadIdx.x][threadIdx.y]로 접근하면 같은 열의 스레드들이 같은 bank에 접근하여 32-way conflict가 발생합니다. 해결책은 배열에 1개의 패딩 열을 추가하는 것입니다: float smem[32][33]. 이렇게 하면 각 행의 시작 주소가 bank 경계를 엇갈려 conflict가 사라집니다.
Q2. Warp divergence가 성능에 미치는 영향과 최소화하는 코드 패턴은?
정답: Warp divergence는 워프 내 스레드들이 서로 다른 코드 경로를 실행할 때 발생하며, 양쪽 경로가 순차적으로 실행되어 최대 2배의 성능 저하를 야기합니다.
설명: SIMT 아키텍처에서 워프의 모든 스레드는 동일한 명령어를 실행합니다. if-else 분기가 발생하면 true 경로 스레드들이 실행되는 동안 false 경로 스레드들은 마스킹(비활성화)되고, 이후 반대로 실행됩니다. 최소화 전략: (1) 분기 조건을 워프 ID 기반으로 작성하여 워프 단위로 균일한 분기 유도, (2) __builtin_expect나 __any_sync/__all_sync 등 워프 표결 함수 활용, (3) 데이터 정렬을 통해 같은 경로를 실행하는 스레드들을 같은 워프에 배치.
Q3. Tensor Core가 FP32 연산보다 행렬 곱셈을 빠르게 처리하는 하드웨어 원리는?
정답: Tensor Core는 4×4 행렬의 FMA(Fused Multiply-Add) 연산을 전용 하드웨어 회로로 단일 클록에 처리하며, 데이터 형식(FP16/BF16/INT8)을 낮춰 처리량을 극대화합니다.
설명: 일반 CUDA Core는 스칼라 FP32 FMA를 1클록에 1회 수행합니다. Tensor Core는 WMMA API 기준 16×16×16 행렬 곱을 워프 단위(32스레드)로 수행하며, 하나의 워프당 단 몇 클록에 처리합니다. A100 기준 FP16 Tensor Core는 GPU 전체에서 312 TFLOPS(피크, dense 기준, 구조적 희소성 활용 시 624 TFLOPS)를 달성합니다. 또한 Tensor Core는 FP16 입력으로 FP32 누산(Mixed Precision)을 지원하여 수치 정밀도를 유지하면서도 처리량을 높입니다. 이 때문에 PyTorch의 torch.cuda.amp는 내부적으로 Tensor Core를 활용합니다.
Q4. Flash Attention이 표준 Attention보다 메모리 효율적인 이유 (HBM vs SRAM 관점)?
정답: 표준 Attention은 N×N 크기의 중간 행렬을 HBM에 저장해야 하지만(O(N²) HBM), Flash Attention은 타일 단위로 SRAM에서만 처리하여 HBM 접근을 O(N)으로 줄입니다.
설명: 표준 Attention의 HBM I/O는 Q, K, V 읽기(O(Nd)) + S 쓰기(O(N²)) + S 읽기 + P 쓰기(O(N²)) + P 읽기 + O 쓰기(O(Nd))로 O(N²d)에 달합니다. Flash Attention은 SRAM 크기에 맞춰 Q, K, V를 타일로 분할하고, 온라인 소프트맥스를 사용해 중간 결과를 SRAM에서 누적합니다. HBM 접근은 Q, K, V 읽기와 O 쓰기만 수행하므로 O(N²d / M) 단계의 SRAM 처리로 O(Nd) HBM I/O를 달성합니다. SRAM의 대역폭은 HBM의 약 10배이므로 실질적인 처리 속도가 크게 향상됩니다.
Q5. NCCL Ring-AllReduce에서 데이터 전송량이 GPU 수에 관계없이 2*(N-1)/N * data_size인 이유는?
정답: Ring-AllReduce는 Reduce-Scatter와 AllGather 두 단계로 구성되며, 각 단계에서 N-1번의 전송이 이루어지고 매 전송 크기가 data_size / N이기 때문입니다.
설명: N개의 GPU가 링을 구성합니다. Reduce-Scatter 단계에서는 각 GPU가 자신의 데이터 청크(data_size / N)를 인접 GPU로 N-1회 전달하여 각 GPU가 전체 데이터의 1/N에 대한 reduce 결과를 보유합니다. AllGather 단계에서는 reduce된 청크를 다시 N-1회 전달하여 모든 GPU가 전체 결과를 갖습니다. 총 전송량 = 2 * (N-1) * (data_size / N) = 2 * (N-1) / N * data_size. N=2일 때 data_size, N=무한대일 때 2 * data_size로 수렴합니다. 이는 단순 브로드캐스트 방식((N-1) * data_size)보다 훨씬 효율적입니다.
Advanced CUDA GPU Programming: Warp Optimization, Tensor Cores, and Triton Kernels
Table of Contents
- CUDA Memory Hierarchy
- Warp Optimization
- Parallel Patterns
- Tensor Cores and the WMMA API
- Flash Attention: How It Works
- CUDA Streams and Async Execution
- Multi-GPU and NCCL
- OpenAI Triton Custom Kernels
- Profiling with Nsight Compute and Systems
- Quiz
CUDA Memory Hierarchy
Understanding the memory hierarchy is the single most important factor in GPU performance optimization. CUDA exposes several memory types that differ in location, latency, bandwidth, and scope.
Memory Type Comparison
| Memory Type | Location | Latency | Bandwidth | Capacity | Scope |
|---|---|---|---|---|---|
| Register | On-chip | ~1 cycle | Highest | 255 per thread | Thread |
| Shared Memory | On-chip | ~5 cycles | ~19 TB/s | 48–164 KB per SM | Block |
| L1 Cache | On-chip | ~28 cycles | ~19 TB/s | 128 KB per SM | SM |
| L2 Cache | Off-chip | ~193 cycles | ~7 TB/s | Tens of MB | Device |
| Global (HBM) | HBM | ~600 cycles | ~2 TB/s | Tens of GB | Device |
| Constant | Cached | ~5 cycles (hit) | — | 64 KB | Device |
| Texture | Cached | ~5 cycles (hit) | — | L1-cached | Device |
Using Shared Memory: Tiled Matrix Multiplication
Shared memory sits on-chip and is shared among threads in the same block. Loading tiles into shared memory dramatically reduces expensive global memory round-trips.
// Tiled matrix multiplication: C = A @ B for row-major A (MxK), B (KxN),
// C (MxN). Launch with blockDim = (TILE_SIZE, TILE_SIZE); each thread
// computes one element of C while the block stages tile pairs in shared
// memory to cut global-memory round-trips.
#define TILE_SIZE 16
__global__ void tiledMatMul(
    float* A, float* B, float* C,
    int M, int N, int K
) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    int col = blockIdx.x * TILE_SIZE + threadIdx.x;
    float sum = 0.0f;
    int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;   // ceil(K / TILE_SIZE)
    for (int t = 0; t < numTiles; t++) {
        int aCol = t * TILE_SIZE + threadIdx.x;
        int bRow = t * TILE_SIZE + threadIdx.y;
        // Collaboratively load tiles into shared memory (0-pad out of range).
        tileA[threadIdx.y][threadIdx.x] =
            (row < M && aCol < K) ? A[row * K + aCol] : 0.0f;
        tileB[threadIdx.y][threadIdx.x] =
            (bRow < K && col < N) ? B[bRow * N + col] : 0.0f;
        __syncthreads();   // tiles fully written before any thread reads
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }
        __syncthreads();   // reads done before next iteration overwrites
    }
    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}
Constant Memory
Constant memory is read-only and cached. When all threads in a warp access the same address, the value is broadcast in a single cycle — ideal for filter weights or configuration data.
// 256-tap filter weights in constant memory; reads are broadcast in one
// cycle because `i` is uniform across the warp.
__constant__ float convWeights[256];
// 1-D correlation of `input` against the constant-memory filter.
// NOTE(review): reads input[idx + i] for i in [0, 255], so `input` must
// hold at least n + 255 elements — confirm the allocation at call sites.
__global__ void convKernel(float* input, float* output, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float result = 0.0f;
        for (int i = 0; i < 256; i++) {
            result += input[idx + i] * convWeights[i]; // broadcast access
        }
        output[idx] = result;
    }
}
Warp Optimization
Eliminating Warp Divergence
A warp is a group of 32 threads that execute in lockstep under SIMT. When a conditional branch splits threads within a warp onto different paths, both paths execute serially — this is called warp divergence.
// Bad: the condition alternates between adjacent threads (0,2,4,... vs
// 1,3,5,...), so every warp contains both paths and executes them serially.
__global__ void badKernel(float* data, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard the grid tail (n was previously unused)
    if (idx % 2 == 0) {
        data[idx] = data[idx] * 2.0f;
    } else {
        data[idx] = data[idx] + 1.0f;
    }
}
// Good: branch aligned to warp boundary — the condition is uniform within
// each warp of 32 threads, so no warp executes both paths.
__global__ void goodKernel(float* data, int n) {
    int warpId = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= n) return;  // guard the grid tail
    if (warpId % 2 == 0) {
        data[idx] = data[idx] * 2.0f;
    } else {
        data[idx] = data[idx] + 1.0f;
    }
}
Coalesced Memory Access
Global memory is accessed in 128-byte transactions. If consecutive threads read consecutive addresses (coalesced), a single transaction serves the entire warp. Strided or random access can require 32 separate transactions.
// NOTE(review): the original "bad/good" labels were inverted. Coalescing
// depends on the pattern ACROSS the warp at each instruction, not on the
// per-thread stride across loop iterations.
// columnAccess (one thread per column): at each iteration adjacent threads
// (col = tid, tid+1, ...) read adjacent addresses matrix[row * N + col] —
// one contiguous segment per warp: COALESCED. (`sum` is never written back;
// illustrative only.)
__global__ void columnAccess(float* matrix, int M, int N) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int row = 0; row < M; row++) {
        sum += matrix[row * N + col]; // warp reads 32 consecutive floats
    }
}
// rowAccess (one thread per row): adjacent threads read addresses N floats
// apart at every iteration, so a warp's 32 loads hit 32 separate memory
// segments — the UNCOALESCED, stride-N pattern.
__global__ void rowAccess(float* matrix, int M, int N) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int col = 0; col < N; col++) {
        sum += matrix[row * N + col]; // stride-N across the warp
    }
}
Shared Memory Bank Conflicts
Shared memory is divided into 32 banks (4 bytes each). When two or more threads in a warp access different addresses within the same bank, they serialize — this is a bank conflict.
// Bad: 32-way bank conflict during transpose (all threads hit bank 0)
__shared__ float smem[32][32];
// smem[0][0], smem[1][0], smem[2][0]... all map to bank 0
// Good: +1 column padding shifts each row to a different bank
// Transpose an N x N matrix through a padded shared-memory tile. Both the
// global read and write are coalesced; the +1 column padding shifts each
// row of `tile` into a different bank so the transposed read
// tile[threadIdx.x][threadIdx.y] is conflict-free.
// Launch with blockDim = (TILE_SIZE, TILE_SIZE).
__global__ void matTransposeNoBankConflict(float* in, float* out, int N) {
    __shared__ float tile[TILE_SIZE][TILE_SIZE + 1]; // +1 padding
    int x = blockIdx.x * TILE_SIZE + threadIdx.x;
    int y = blockIdx.y * TILE_SIZE + threadIdx.y;
    if (x < N && y < N)
        tile[threadIdx.y][threadIdx.x] = in[y * N + x];
    __syncthreads();   // tile fully populated before transposed reads
    // Swap block coordinates: this block writes the mirrored tile location.
    x = blockIdx.y * TILE_SIZE + threadIdx.x;
    y = blockIdx.x * TILE_SIZE + threadIdx.y;
    if (x < N && y < N)
        out[y * N + x] = tile[threadIdx.x][threadIdx.y];
}
Parallel Patterns
Parallel Reduction with Warp Shuffle
// Warp-level sum using __shfl_down_sync (no shared memory needed): each
// step halves the number of contributing lanes; after 5 steps lane 0
// holds the warp total. The full mask assumes all 32 lanes participate.
__device__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset >>= 1) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}
// Block-level sum: warp-reduce, stage one partial per warp in shared
// memory, reduce the partials in the first warp, then atomically add the
// block total into *output (caller must zero *output beforehand).
// Supports blockDim.x up to 1024 (32 warps).
__global__ void blockReduceSum(float* input, float* output, int n) {
    __shared__ float shared[32];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    float val = (idx < n) ? input[idx] : 0.0f;
    // Step 1: reduce within each warp.
    val = warpReduceSum(val);
    // Step 2: lane 0 of each warp publishes its partial sum.
    if (tid % 32 == 0) shared[tid / 32] = val;
    __syncthreads();
    // Warp count rounded UP so a trailing partial warp is not dropped
    // (the original blockDim.x / 32 truncated — e.g. to 0 for blocks
    // smaller than 32 threads, silently zeroing the result).
    int numWarps = (blockDim.x + 31) / 32;
    // Step 3: final reduction over the per-warp partials in warp 0.
    if (tid < 32) {
        val = (tid < numWarps) ? shared[tid] : 0.0f;
        val = warpReduceSum(val);
        if (tid == 0) atomicAdd(output, val);
    }
}
Exclusive Prefix Sum (Scan)
The Blelloch (work-efficient) scan uses an up-sweep and down-sweep phase, each costing O(N) work.
// Work-efficient (Blelloch) exclusive prefix sum over one block's data.
// Each thread owns two elements: the up-sweep builds a tree of partial
// sums in place, the root is cleared, and the down-sweep distributes
// prefixes back down the tree.
// Assumptions (TODO confirm at call sites): single-block launch,
// n is a power of two, n == 2 * blockDim.x, and n <= 2048 (buffer size).
__global__ void exclusiveScan(int* input, int* output, int n) {
    __shared__ int temp[1024 * 2];
    int tid = threadIdx.x;
    int offset = 1;
    // Load two elements per thread, zero-padding past the end of input.
    temp[2 * tid] = (2 * tid < n) ? input[2 * tid] : 0;
    temp[2 * tid + 1] = (2 * tid + 1 < n) ? input[2 * tid + 1] : 0;
    // Up-sweep (reduce): build partial sums bottom-up.
    for (int d = n >> 1; d > 0; d >>= 1) {
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            temp[bi] += temp[ai];
        }
        offset <<= 1;
    }
    // Clear the root — the identity element of an exclusive scan.
    if (tid == 0) temp[n - 1] = 0;
    // Down-sweep: propagate prefixes back down the tree.
    for (int d = 1; d < n; d <<= 1) {
        offset >>= 1;
        __syncthreads();
        if (tid < d) {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            int t = temp[ai];
            temp[ai] = temp[bi];
            temp[bi] += t;
        }
    }
    __syncthreads();   // scan complete before writing results out
    if (2 * tid < n) output[2 * tid] = temp[2 * tid];
    if (2 * tid + 1 < n) output[2 * tid + 1] = temp[2 * tid + 1];
}
Tensor Cores and the WMMA API
Tensor Cores are specialized hardware units introduced in Volta (2017) that perform a 4×4 FMA in a single clock cycle at the hardware level. At the warp level, the WMMA API exposes 16×16×16 matrix multiply-accumulate. On A100, FP16 Tensor Cores deliver 312 TFLOPS — roughly 16x the throughput of FP32 CUDA Cores.
Mixed-Precision WMMA Matrix Multiply
#include <mma.h>
using namespace nvcuda;
#define WMMA_M 16
#define WMMA_N 16
#define WMMA_K 16
// Mixed-precision GEMM on Tensor Cores via WMMA (requires SM70+): each
// warp computes one 16x16 tile of C = A @ B with FP16 inputs and FP32
// accumulation. A is row-major (MxK), C is row-major (MxN).
// NOTE(review): bFrag is declared wmma::col_major but B is addressed as
// B + bRow * N + bCol with leading dimension N (row-major layout) — the
// caller's B layout must match the fragment declaration; verify.
__global__ void tensorCoreMatMul(
    half* A, half* B, float* C,
    int M, int N, int K
) {
    // Per-warp WMMA fragments, distributed across the warp's registers.
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K,
                   half, wmma::row_major> aFrag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K,
                   half, wmma::col_major> bFrag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K,
                   float> cFrag;
    // One warp per output tile: warpM indexes tile rows, warpN tile columns.
    int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / 32;
    int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
    // Zero the accumulator fragment.
    wmma::fill_fragment(cFrag, 0.0f);
    // March along the K dimension in WMMA_K-wide steps.
    for (int i = 0; i < K; i += WMMA_K) {
        int aRow = warpM * WMMA_M;
        int aCol = i;
        int bRow = i;
        int bCol = warpN * WMMA_N;
        if (aRow < M && aCol < K && bRow < K && bCol < N) {
            // Load one tile of A and B (leading dimensions K and N).
            wmma::load_matrix_sync(aFrag, A + aRow * K + aCol, K);
            wmma::load_matrix_sync(bFrag, B + bRow * N + bCol, N);
            // Tensor Core MMA: cFrag += aFrag * bFrag.
            wmma::mma_sync(cFrag, aFrag, bFrag, cFrag);
        }
    }
    // Write the warp's finished 16x16 tile back to global memory.
    int cRow = warpM * WMMA_M;
    int cCol = warpN * WMMA_N;
    if (cRow < M && cCol < N) {
        wmma::store_matrix_sync(C + cRow * N + cCol, cFrag, N,
                                wmma::mem_row_major);
    }
}
PTX Inline Assembly
For performance-critical inner loops, PTX inline assembly unlocks hardware instructions not exposed through the CUDA C++ API.
// Approximate single-precision square root via the PTX sqrt.approx.f32
// instruction — faster but less accurate than sqrtf().
__device__ float fastSqrt(float x) {
    float result;
    asm("sqrt.approx.f32 %0, %1;" : "=f"(result) : "f"(x));
    return result;
}
// Global load with the .cg cache hint: prefer L2, bypass L1.
// `volatile` keeps the compiler from reordering or eliding the load.
__device__ int ldgInt(const int* ptr) {
    int val;
    asm volatile("ld.global.cg.s32 %0, [%1];"
                 : "=r"(val) : "l"(ptr));
    return val;
}
Flash Attention: How It Works
Standard Attention requires O(N²) HBM memory to store the intermediate attention matrix S. Flash Attention tiles Q, K, V into SRAM and uses an online softmax trick to accumulate the output without ever materializing the full N×N matrix in HBM.
Key Idea: Online Softmax
Standard Attention HBM I/O:
Read Q, K, V → O(Nd)
Write S = QK^T → O(N^2) <-- bottleneck
Read S, write P → O(N^2)
Read P, write O → O(Nd)
Total: O(N^2 * d)
Flash Attention HBM I/O:
Read Q, K, V → O(Nd)
Write O → O(Nd)
Total: O(Nd) — no N^2 term!
// Simplified Flash Attention forward pass: one thread per query row.
// K/V are streamed through shared memory tile by tile while a running row
// max (mi) and softmax denominator (li) are maintained, so the full N x N
// score matrix is never materialized in HBM.
// Launch: gridDim.x query tiles of blockDim.x rows; dynamic shared memory
// must hold 3 * blockDim.x * headDim floats.
// NOTE(review): oi[64] caps headDim at 64, and loads are unguarded, so
// seqLen is assumed to be a multiple of blockDim.x — confirm at call sites.
__global__ void flashAttentionKernel(
    const float* Q, const float* K, const float* V,
    float* O, float* l, float* m,
    int seqLen, int headDim, float scale
) {
    int tileSize = blockDim.x;
    int qStart = blockIdx.x * tileSize;
    extern __shared__ float smem[];
    // Partition the dynamic shared buffer into Q, K, V tiles.
    float* sQ = smem;
    float* sK = smem + tileSize * headDim;
    float* sV = smem + 2 * tileSize * headDim;
    // Each thread loads its own query row into SRAM.
    for (int d = 0; d < headDim; d++) {
        sQ[threadIdx.x * headDim + d] =
            Q[(qStart + threadIdx.x) * headDim + d];
    }
    float mi = -INFINITY; // running row max for numerical stability
    float li = 0.0f;      // running softmax denominator
    float oi[64] = {};    // output accumulator (headDim <= 64)
    for (int kStart = 0; kStart < seqLen; kStart += tileSize) {
        // Cooperatively stage this K/V tile in SRAM (one row per thread).
        for (int d = 0; d < headDim; d++) {
            sK[threadIdx.x * headDim + d] =
                K[(kStart + threadIdx.x) * headDim + d];
            sV[threadIdx.x * headDim + d] =
                V[(kStart + threadIdx.x) * headDim + d];
        }
        __syncthreads();   // tile fully loaded before use
        // s_ij = q_i . k_j * scale, folded into the output immediately.
        for (int j = 0; j < tileSize; j++) {
            float sij = 0.0f;
            for (int d = 0; d < headDim; d++) {
                sij += sQ[threadIdx.x * headDim + d]
                    * sK[j * headDim + d];
            }
            sij *= scale;
            // Online softmax update: rescale prior state to the new max.
            float miNew = fmaxf(mi, sij);
            float liNew = li * expf(mi - miNew) + expf(sij - miNew);
            for (int d = 0; d < headDim; d++) {
                oi[d] = oi[d] * (li / liNew) * expf(mi - miNew)
                    + (expf(sij - miNew) / liNew) * sV[j * headDim + d];
            }
            mi = miNew;
            li = liNew;
        }
        __syncthreads();   // reads done before next tile overwrites sK/sV
    }
    // Store the normalized output row plus softmax stats (for backward).
    for (int d = 0; d < headDim; d++) {
        O[(qStart + threadIdx.x) * headDim + d] = oi[d];
    }
    l[qStart + threadIdx.x] = li;
    m[qStart + threadIdx.x] = mi;
}
CUDA Streams and Async Execution
CUDA streams enable overlapping of memory transfers and kernel execution, maximizing GPU utilization through pipelining.
// Process the input in chunkSize-sized pieces, round-robin across 4 CUDA
// streams so H2D copy, kernel, and D2H copy of different chunks overlap.
// NOTE(review): cudaMemcpyAsync only overlaps when h_input/h_output are
// pinned (cudaMallocHost); with pageable memory the copies serialize.
// Assumes totalSize is a multiple of chunkSize; API errors are unchecked.
void pipelinedProcessing(float* h_input, float* h_output,
                         int totalSize, int chunkSize) {
    const int numStreams = 4;
    cudaStream_t streams[numStreams];
    float *d_input[numStreams], *d_output[numStreams];
    // One stream plus one device buffer pair per pipeline slot.
    for (int i = 0; i < numStreams; i++) {
        cudaStreamCreate(&streams[i]);
        cudaMalloc(&d_input[i], chunkSize * sizeof(float));
        cudaMalloc(&d_output[i], chunkSize * sizeof(float));
    }
    // Pipeline: HtoD copy, kernel, DtoH copy interleaved across streams.
    // Reusing slot s for a later chunk is safe: work within one stream
    // executes in submission order.
    for (int chunk = 0; chunk < totalSize / chunkSize; chunk++) {
        int s = chunk % numStreams;
        float* hIn = h_input + chunk * chunkSize;
        float* hOut = h_output + chunk * chunkSize;
        // Async host-to-device copy.
        cudaMemcpyAsync(d_input[s], hIn,
                        chunkSize * sizeof(float),
                        cudaMemcpyHostToDevice, streams[s]);
        // Kernel in the same stream — runs after its chunk's copy-in.
        dim3 block(256);
        dim3 grid((chunkSize + 255) / 256);
        processKernel<<<grid, block, 0, streams[s]>>>(
            d_input[s], d_output[s], chunkSize);
        // Async device-to-host copy of the result.
        cudaMemcpyAsync(hOut, d_output[s],
                        chunkSize * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[s]);
    }
    // Drain every stream, then release streams and device buffers.
    for (int i = 0; i < numStreams; i++) {
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
        cudaFree(d_input[i]);
        cudaFree(d_output[i]);
    }
}
Inter-Stream Synchronization with Events
cudaEvent_t startEvent, stopEvent;
cudaEventCreate(&startEvent);
cudaEventCreate(&stopEvent);
// Timing anchor: recorded on stream1 before kernel1 is enqueued.
cudaEventRecord(startEvent, stream1);
kernel1<<<grid, block, 0, stream1>>>(d_data);
// FIX: cudaStreamWaitEvent only waits for work submitted to the stream
// *before* the event was recorded. The original recorded the event before
// launching kernel1, so stream2 never actually waited for kernel1. Record
// a dedicated event AFTER the kernel launch and wait on that instead.
cudaEvent_t kernel1Done;
cudaEventCreate(&kernel1Done);
cudaEventRecord(kernel1Done, stream1);
cudaStreamWaitEvent(stream2, kernel1Done, 0);
kernel2<<<grid, block, 0, stream2>>>(d_data);
cudaEventRecord(stopEvent, stream2);
cudaEventSynchronize(stopEvent);
// Elapsed GPU time between the two recorded events (kernel1 + kernel2).
float ms = 0;
cudaEventElapsedTime(&ms, startEvent, stopEvent);
printf("Elapsed: %.3f ms\n", ms);
Multi-GPU and NCCL
Peer-to-Peer Direct Access
// Enable direct GPU-to-GPU memory copy (NVLink or PCIe P2P)
int canAccess;
cudaDeviceCanAccessPeer(&canAccess, 0, 1);
if (canAccess) {
    // Enable P2P access in BOTH directions: cudaDeviceEnablePeerAccess is
    // one-way (it lets the *current* device access the peer's memory), so
    // the original 0->1-only enable left transfers toward GPU 0 unpeered.
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);  // GPU 0 may access GPU 1's memory
    cudaSetDevice(1);
    cudaDeviceEnablePeerAccess(0, 0);  // GPU 1 may access GPU 0's memory
    float* d_src; // on GPU 0
    float* d_dst; // on GPU 1
    cudaSetDevice(0);
    cudaMalloc(&d_src, size);
    cudaSetDevice(1);
    cudaMalloc(&d_dst, size);
    // Direct copy without staging through host
    cudaMemcpyPeer(d_dst, 1, d_src, 0, size);
}
NCCL Ring-AllReduce
#include <nccl.h>
// All-reduce (element-wise sum) across numGPUs devices: after this call,
// every d_buffers[i] holds the sum of all input buffers (in-place).
//
// numGPUs:    number of devices (ids 0..numGPUs-1 are used).
// d_buffers:  d_buffers[i] is a device allocation of `count` floats on GPU i.
// count:      number of float elements per buffer.
void ncclAllReduceExample(int numGPUs, float** d_buffers, size_t count) {
    ncclComm_t comms[numGPUs];   // one communicator per GPU (VLA: GCC/Clang extension)
    int devList[numGPUs];
    for (int i = 0; i < numGPUs; i++) devList[i] = i;
    ncclCommInitAll(comms, numGPUs, devList);
    cudaStream_t streams[numGPUs];
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStreamCreate(&streams[i]);
    }
    // Group the per-GPU calls so NCCL launches them as a single collective;
    // issuing them one by one from a single thread can deadlock.
    ncclGroupStart();
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        ncclAllReduce(d_buffers[i], d_buffers[i], count,
                      ncclFloat, ncclSum, comms[i], streams[i]);
    }
    ncclGroupEnd();
    for (int i = 0; i < numGPUs; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);  // FIX: streams were leaked in the original
        ncclCommDestroy(comms[i]);
    }
}
The theoretical bandwidth cost of Ring-AllReduce is 2 * (N-1) / N * dataSize. Each of the N GPUs sends and receives dataSize / N chunks N-1 times across two phases (Reduce-Scatter + AllGather), giving 2 * (N-1) * (dataSize / N) total bytes transferred per GPU.
OpenAI Triton Custom Kernels
Triton lets you write GPU kernels in Python at near-CUDA performance. Triton's compiler handles tiling, vectorization, and shared memory allocation automatically.
Triton Matrix Multiply Kernel
import triton
import triton.language as tl
import torch
@triton.jit
def matmul_kernel(
    a_ptr, b_ptr, c_ptr,
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    # Computes one (BLOCK_M x BLOCK_N) tile of C = A @ B.
    # The launch grid is 2D: axis 0 indexes row tiles, axis 1 column tiles.
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)
    # Global row/column indices of the C tile owned by this program.
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    offs_k = tl.arange(0, BLOCK_K)
    # Pointers to the first A (BLOCK_M x BLOCK_K) and B (BLOCK_K x BLOCK_N)
    # tiles, built from the caller-supplied strides (supports non-contiguous).
    a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn)
    # Accumulate in float32 regardless of input dtype.
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, K, BLOCK_K):
        # Masked loads zero-fill out-of-bounds elements at the M and K edges.
        a = tl.load(a_ptrs,
                    mask=(offs_m[:, None] < M) & (offs_k[None, :] < K - k),
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=(offs_k[:, None] < K - k) & (offs_n[None, :] < N),
                    other=0.0)
        acc += tl.dot(a, b)
        # Advance both tile pointers one BLOCK_K step along the K dimension.
        a_ptrs += BLOCK_K * stride_ak
        b_ptrs += BLOCK_K * stride_bk
    # Write the finished tile, masking off rows/columns past M/N.
    c_ptrs = c_ptr + stride_cm * offs_m[:, None] + stride_cn * offs_n[None, :]
    c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
    tl.store(c_ptrs, acc, mask=c_mask)
def matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    """Multiply two matrices with the Triton ``matmul_kernel``.

    a is (M, K) and b is (K, N); the result is a freshly allocated
    float32 tensor of shape (M, N) on the same device as ``a``.
    """
    M, K = a.shape
    K2, N = b.shape
    assert K == K2
    out = torch.empty((M, N), device=a.device, dtype=torch.float32)

    # One program instance per (BLOCK_M x BLOCK_N) output tile.
    def grid(meta):
        return (
            triton.cdiv(M, meta['BLOCK_M']),
            triton.cdiv(N, meta['BLOCK_N']),
        )

    matmul_kernel[grid](
        a, b, out,
        M, N, K,
        a.stride(0), a.stride(1),
        b.stride(0), b.stride(1),
        out.stride(0), out.stride(1),
        BLOCK_M=128, BLOCK_N=256, BLOCK_K=64,
    )
    return out
Flash Attention in Triton
@triton.jit
def flash_attn_fwd_kernel(
    Q_ptr, K_ptr, V_ptr, O_ptr,
    L_ptr, M_ptr,
    seqlen,
    # FIX: tl.arange requires compile-time-constant bounds, so headdim must
    # be a tl.constexpr (and a power of 2). As a plain runtime argument the
    # original kernel fails to compile at tl.arange(0, headdim).
    headdim: tl.constexpr,
    scale,
    BLOCK_Q: tl.constexpr,
    BLOCK_KV: tl.constexpr,
):
    """Flash Attention forward for one block of BLOCK_Q query rows.

    Tiles K/V along the sequence and maintains online-softmax running
    statistics (row max mi, denominator li) so the full seqlen x seqlen
    score matrix is never materialized in global memory. Writes the
    normalized output O plus the per-row stats L and M (for the backward).
    """
    start_q = tl.program_id(0) * BLOCK_Q
    offs_q = start_q + tl.arange(0, BLOCK_Q)
    offs_d = tl.arange(0, headdim)
    # Load this program's Q tile once; rows past seqlen are zero-filled.
    q = tl.load(Q_ptr + offs_q[:, None] * headdim + offs_d[None, :],
                mask=(offs_q[:, None] < seqlen), other=0.0)
    mi = tl.full((BLOCK_Q,), value=-float('inf'), dtype=tl.float32)  # running row max
    li = tl.zeros((BLOCK_Q,), dtype=tl.float32)                      # running softmax denominator
    acc = tl.zeros((BLOCK_Q, headdim), dtype=tl.float32)             # output accumulator
    for start_kv in range(0, seqlen, BLOCK_KV):
        offs_kv = start_kv + tl.arange(0, BLOCK_KV)
        k = tl.load(K_ptr + offs_kv[:, None] * headdim + offs_d[None, :],
                    mask=(offs_kv[:, None] < seqlen), other=0.0)
        v = tl.load(V_ptr + offs_kv[:, None] * headdim + offs_d[None, :],
                    mask=(offs_kv[:, None] < seqlen), other=0.0)
        # Local attention scores for this (Q tile, KV tile) pair.
        s = tl.dot(q, tl.trans(k)) * scale
        # Online-softmax update: rescale previous accumulator by the change
        # in the running max, then fold in the new tile's contribution.
        mi_new = tl.maximum(mi, tl.max(s, axis=1))
        p = tl.exp(s - mi_new[:, None])
        li_new = li * tl.exp(mi - mi_new) + tl.sum(p, axis=1)
        acc = acc * (li * tl.exp(mi - mi_new))[:, None] / li_new[:, None]
        acc += tl.dot(p, v) / li_new[:, None]
        mi = mi_new
        li = li_new
    # acc is already normalized by li, so it can be stored directly.
    tl.store(O_ptr + offs_q[:, None] * headdim + offs_d[None, :],
             acc, mask=(offs_q[:, None] < seqlen))
    tl.store(L_ptr + offs_q, li, mask=(offs_q < seqlen))
    tl.store(M_ptr + offs_q, mi, mask=(offs_q < seqlen))
Profiling with Nsight Compute and Systems
Nsight Compute: Kernel-Level Analysis
# Full profile for all metrics
ncu --set full -o profile_report ./my_cuda_app
# Profile only a specific kernel, 3 launches
ncu --kernel-name tiledMatMul --launch-count 3 ./my_cuda_app
# Collect memory bandwidth metrics for Roofline analysis
ncu --metrics \
l1tex__t_bytes_pipe_lsu_mem_global_op_ld.sum,\
l1tex__t_bytes_pipe_lsu_mem_global_op_st.sum,\
sm__cycles_elapsed.avg,\
sm__cycles_elapsed.avg.per_second \
./my_cuda_app
# DRAM bandwidth utilization as % of peak
ncu --metrics \
gpu__dram_throughput.avg.pct_of_peak_sustained_elapsed \
./my_cuda_app
Nsight Systems: Application-Level Timeline
# Capture full application trace
nsys profile --trace=cuda,nvtx,osrt \
--output=timeline_report \
./my_cuda_app
# Print summary statistics
nsys stats timeline_report.nsys-rep
# Export GPU trace details
nsys analyze --report gputrace timeline_report.nsys-rep
NVTX Markers for Custom Ranges
#include <nvtx3/nvToolsExt.h>
// Wraps the two phases in NVTX ranges so they appear as named spans on the
// Nsight Systems timeline (captured with --trace=nvtx).
void profiledFunction() {
    nvtxRangePush("DataPreprocessing");  // open named range
    preprocessData();
    nvtxRangePop();                      // close "DataPreprocessing"
    nvtxRangePush("ForwardPass");
    forwardPass();
    nvtxRangePop();                      // close "ForwardPass"
}
Roofline Model
The Roofline model determines whether a kernel is compute-bound or memory-bandwidth-bound.
Arithmetic Intensity (AI) = FLOP / Bytes accessed
Example for A100 80GB:
Peak FP16 Tensor Core throughput: 312 TFLOPS
HBM bandwidth: 2 TB/s
Ridge point: 312 / 2 = 156 FLOP/Byte
AI < 156 → memory-bandwidth bound
AI > 156 → compute bound
# Measure arithmetic intensity
ncu --metrics \
sm__ops_path_tensor_src_fp16.sum,\
dram__bytes.sum \
./matmul_app
# AI = (tensor_ops * 2) / dram_bytes
Quiz
Q1. What causes shared memory bank conflicts in CUDA, and how do you fix them?
Answer: A bank conflict occurs when two or more threads in the same warp access different addresses that map to the same shared memory bank, forcing the accesses to serialize.
Explanation: Shared memory has 32 banks, each 4 bytes wide. Thread i naturally accesses bank i % 32. A classic example is matrix transposition: reading tile[threadIdx.x][threadIdx.y] makes all 32 threads of a warp compute addresses that differ by exactly 32 floats, so they all map to the same bank (bank index threadIdx.y) — a 32-way conflict. The fix is to pad the inner dimension by 1: __shared__ float tile[32][33]. This shifts every row's base address by one bank, so threads fan out across distinct banks with zero conflicts.
Q2. How does warp divergence affect performance, and what patterns minimize it?
Answer: Warp divergence serializes both branches of a conditional within a warp, potentially doubling execution time for that warp. With k distinct paths, execution time scales up to k times the single-path cost.
Explanation: Under SIMT, all 32 threads must execute the same instruction. When a branch is taken by only some threads, the hardware masks the others and reruns the warp for each divergent path. Mitigation strategies: (1) align branch conditions to warp boundaries using warp ID instead of thread ID, so every thread in a warp takes the same path; (2) use warp-vote intrinsics like __all_sync and __any_sync to check uniform conditions; (3) sort or rearrange input data so work of the same type is grouped into the same warps.
Q3. What hardware principle makes Tensor Cores faster than FP32 CUDA Cores for matrix multiply?
Answer: Tensor Cores implement a dedicated 4x4 FMA array in silicon that computes an entire 16x16x16 warp-level MMA in a handful of clocks, operating on lower-precision inputs (FP16/BF16/INT8) to maximize throughput.
Explanation: A standard CUDA Core executes one scalar FP32 FMA per clock. A Tensor Core fuses 256 multiply-adds (16x16 dot products of length 16) into a single hardware operation, sharing instruction overhead across the whole warp. On A100, each SM has 4 Tensor Core units delivering 1,024 dense FP16 FMAs — i.e., 2,048 FP16 FLOPs — per clock per SM. Additionally, Tensor Cores accumulate in FP32 while computing in FP16 (mixed precision), preserving numerical stability. This is why torch.cuda.amp and cuBLAS both route GEMM operations through Tensor Cores when available.
Q4. Why is Flash Attention more memory-efficient than standard Attention from an HBM vs. SRAM perspective?
Answer: Standard Attention materializes the full N×N attention matrix in HBM (O(N²) storage), while Flash Attention tiles Q, K, V into on-chip SRAM and uses an online softmax to accumulate the output in SRAM — reducing HBM I/O from O(N²d) to O(Nd).
Explanation: In standard Attention the bottleneck is writing and re-reading the N×N score matrix S and probability matrix P to HBM. Flash Attention divides Q into row tiles and K, V into column tiles that fit in SRAM. For each tile pair it computes local scores and incrementally updates (mi, li, oi) using the online softmax recurrence: mi_new = max(mi, s_ij), li_new = li * exp(mi - mi_new) + exp(s_ij - mi_new). Only the final output O is written back to HBM. SRAM bandwidth is approximately 10x higher than HBM, so even though more arithmetic is performed, wall-clock time is lower because HBM traffic drops from O(N²) to O(N).
Q5. Why does NCCL Ring-AllReduce transfer exactly 2*(N-1)/N * data_size bytes per GPU regardless of N?
Answer: Ring-AllReduce consists of a Reduce-Scatter phase and an AllGather phase, each requiring N-1 steps in which every GPU sends a chunk of size data_size / N. The total is 2 * (N-1) * (data_size / N).
Explanation: Arrange N GPUs in a logical ring. In Reduce-Scatter, each GPU holds one data_size / N chunk that is the partial sum of the corresponding slice from all N GPUs after N-1 rounds of send-and-accumulate. Each round moves exactly data_size / N bytes. In AllGather, each GPU broadcasts its reduced chunk around the ring in another N-1 rounds. Total bytes per GPU = 2 * (N-1) * (data_size / N) = 2 * (N-1) / N * data_size. As N grows this approaches 2 * data_size, which is a constant — far better than the naive broadcast cost of (N-1) * data_size that grows linearly with N.