Racing CUDA Memory Copies - wu-kan

I have recently been optimizing a CUDA-related library and found that a tiny change (the simpleZcopy → naiveZcopy change described below) shaved nearly 1 second off a simulation that runs in about 16 seconds. Suspecting that this optimization might be quite general, I set up a standalone experiment to verify the guess.

This post benchmarks the scenario of copying double-precision complex numbers on the GPU, and compares the result against several common high-performance copy APIs.

Experimental Environment

The experiment runs on a single V100 card on one node of a V100 cluster.

$ nvidia-smi
Mon Dec  2 08:38:49 2019
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 410.48                 Driver Version: 410.48                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla V100-PCIE...  On   | 00000000:3B:00.0 Off |                    0 |
| N/A   30C    P0    24W / 250W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

Experiment and Analysis

Some Notes

Memory Management

The card I use is a Tesla V100 with 16130 MiB of device memory. To keep the code simple, I use the <thrust/device_vector.h> library to create four vectors of size $2^{28}$ that hold the real and imaginary parts of the double-precision complex input and output.

const size_t
    n = 1 << 28;
thrust::device_vector<double>
    real_in(n, 1),
    imag_in(n, 1),
    real_out(n),
    imag_out(n);

Compared with device memory allocated directly via cudaMalloc, the overhead of thrust::device_vector<double> is negligible in this case.
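For reference, this is a sketch of what the same four buffers would look like with raw cudaMalloc; the pointer names are illustrative only, and the benchmark itself sticks with thrust::device_vector.

double *d_real_in, *d_imag_in, *d_real_out, *d_imag_out;
cudaMalloc((void **)&d_real_in, sizeof(double) * n);
cudaMalloc((void **)&d_imag_in, sizeof(double) * n);
cudaMalloc((void **)&d_real_out, sizeof(double) * n);
cudaMalloc((void **)&d_imag_out, sizeof(double) * n);
// ... use the buffers ...
cudaFree(d_real_in);
cudaFree(d_imag_in);
cudaFree(d_real_out);
cudaFree(d_imag_out);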

Timing

To keep the code simple, I wrote the following struct to time each part; it prints the elapsed time automatically when the enclosing scope is exited. The measurement goes through cudaEventElapsedTime, so it reflects the time actually spent on the device even though kernel launches are asynchronous with respect to the host.

struct WuKTimer
{
    cudaEvent_t beg, end;
    WuKTimer()
    {
        cudaEventCreate(&beg);
        cudaEventCreate(&end);
        cudaEventRecord(beg); // start recording as soon as the timer is constructed
    }
    ~WuKTimer()
    {
        cudaEventRecord(end); // stop recording when the enclosing scope is exited
        cudaEventSynchronize(beg);
        cudaEventSynchronize(end); // wait for all work between the two events to finish
        float elapsed_time;
        cudaEventElapsedTime(
            &elapsed_time,
            beg,
            end); // elapsed time between the two events, in milliseconds
        printf("%f\n", elapsed_time);
    }
};
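WuKTimer follows the RAII pattern: wrap the code to be timed in its own scope that holds a WuKTimer, and the elapsed milliseconds are printed when that scope ends. A minimal usage sketch (the body is just a placeholder):

{
    WuKTimer wk_timer; // recording starts here
    // ... launch kernels or call copy APIs to be timed ...
}   // wk_timer is destroyed here and prints the elapsed milliseconds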

cudaOccupancyMaxPotentialBlockSize

Since CUDA 6.5 the runtime has provided a very useful function, cudaOccupancyMaxPotentialBlockSize, defined in <cuda_runtime.h>; its interface and meaning are annotated below.

template <class T>
cudaError_t __inline__ __host__ CUDART_DEVICE
cudaOccupancyMaxPotentialBlockSize(
    int *minGridSize,           // Suggested minimum grid size to achieve a full machine launch.
    int *blockSize,             // Suggested block size to achieve maximum occupancy.
    T func,                     // Kernel function.
    size_t dynamicSMemSize = 0, // Size of dynamically allocated shared memory, known at runtime before any kernel launch; statically allocated shared memory is inferred from the properties of func.
    int blockSizeLimit = 0)     // Maximum size for each block; for 1D kernels it can coincide with the number of input elements.
{
    return cudaOccupancyMaxPotentialBlockSizeVariableSMem(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit);
}

This interface returns the blockDim that maximizes SM occupancy together with the corresponding minimum gridDim, so you can write low-overhead launches without worrying about the various hardware resource limits, and without hand-tuning the launch parameters yourself.

The only pity is that the value is obtained at run time rather than at compile time. That is, when the block size needs to be passed as a template parameter so the compiler can optimize (loop unrolling, statically sized shared memory, and so on), you have to dispatch through a switch statement, which is a little clumsy, as sketched below.
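For illustration only, here is a minimal sketch of that dispatch pattern; templatedZcopy and launchTemplatedZcopy are hypothetical names and are not part of the benchmark below.

template <int BLOCK_SIZE>
void __global__ templatedZcopy(
    const size_t n,
    const double *real_in,
    const double *imag_in,
    double *real_out,
    double *imag_out)
{
    // BLOCK_SIZE is a compile-time constant here, so the compiler can fold it
    // into the stride computation and unroll the loop more aggressively.
    for (size_t i = threadIdx.x + (size_t)BLOCK_SIZE * blockIdx.x;
         i < n;
         i += (size_t)BLOCK_SIZE * gridDim.x)
    {
        real_out[i] = real_in[i];
        imag_out[i] = imag_in[i];
    }
}

// Dispatch on the block size suggested at run time by
// cudaOccupancyMaxPotentialBlockSize; the raw pointers are obtained with
// thrust::raw_pointer_cast as elsewhere in this post.
void launchTemplatedZcopy(
    int minGridSize, int blockSize,
    const size_t n,
    const double *real_in, const double *imag_in,
    double *real_out, double *imag_out)
{
    switch (blockSize)
    {
    case 1024:
        templatedZcopy<1024><<<minGridSize, 1024>>>(n, real_in, imag_in, real_out, imag_out);
        break;
    case 512:
        templatedZcopy<512><<<minGridSize, 512>>>(n, real_in, imag_in, real_out, imag_out);
        break;
    case 256:
        templatedZcopy<256><<<minGridSize, 256>>>(n, real_in, imag_in, real_out, imag_out);
        break;
    default: // fall back to a fixed size; the grid-stride loop stays correct
        templatedZcopy<128><<<minGridSize, 128>>>(n, real_in, imag_in, real_out, imag_out);
        break;
    }
}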

simpleZcopy

Start with the most basic version, the baseline before any optimization.

void __global__ simpleZcopy(
	const size_t n,
	const double *real_in,
	const double *imag_in,
	double *real_out,
	double *imag_out)
{
	for (size_t i = threadIdx.x + blockDim.x * blockIdx.x;
		 i < n;
		 i += blockDim.x * gridDim.x)
	{
		real_out[i] = real_in[i];
		imag_out[i] = imag_in[i];
	}
}
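The kernel is launched with the block size suggested by cudaOccupancyMaxPotentialBlockSize and timed with WuKTimer; this is the corresponding snippet from Zcopy.cu at the end of this post. The other variants below are timed the same way, inside their own WuKTimer scopes.

{
    WuKTimer wk_timer;
    int minGridSize, blockSize;
    cudaOccupancyMaxPotentialBlockSize(
        &minGridSize,
        &blockSize,
        simpleZcopy);
    simpleZcopy<<<
        minGridSize,
        blockSize>>>(
        n,
        thrust::raw_pointer_cast(real_in.data()),
        thrust::raw_pointer_cast(imag_in.data()),
        thrust::raw_pointer_cast(real_out.data()),
        thrust::raw_pointer_cast(imag_out.data()));
}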

The run time is 12.118016 ms.

naiveZcopy

Building on simpleZcopy, separate the reads from the writes: both inputs are loaded into registers first, and only then are both outputs stored.

void __global__ naiveZcopy(
	const size_t n,
	const double *real_in,
	const double *imag_in,
	double *real_out,
	double *imag_out)
{
	for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;
		 i < n;
		 i += blockDim.x * gridDim.x)
	{
		const double
			real = real_in[i],
			imag = imag_in[i];
		real_out[i] = real;
		imag_out[i] = imag;
	}
}

The run time is 12.254208 ms, essentially the same as simpleZcopy in this standalone benchmark.

cudaMemcpy

Implement the copy directly with the memory copy API provided by CUDA, copying the real and imaginary parts separately.

cudaMemcpy(
    thrust::raw_pointer_cast(real_out.data()),
    thrust::raw_pointer_cast(real_in.data()),
    sizeof(double) * n,
    cudaMemcpyDeviceToDevice);
cudaMemcpy(
    thrust::raw_pointer_cast(imag_out.data()),
    thrust::raw_pointer_cast(imag_in.data()),
    sizeof(double) * n,
    cudaMemcpyDeviceToDevice);

The run time is 11.898560 ms.

cublasDcopy

Implement the copy with the level-1 vector routine provided by the cuBLAS library.
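The handle wk_cublas_handle is created outside the timed region, exactly as in the full source at the end of this post:

cublasHandle_t wk_cublas_handle;
cublasCreate(&wk_cublas_handle);
// ... timed cublasDcopy calls go here ...
cublasDestroy(wk_cublas_handle);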

cublasDcopy(
    wk_cublas_handle,
    n,
    thrust::raw_pointer_cast(real_in.data()),
    1,
    thrust::raw_pointer_cast(real_out.data()),
    1);
cublasDcopy(
    wk_cublas_handle,
    n,
    thrust::raw_pointer_cast(imag_in.data()),
    1,
    thrust::raw_pointer_cast(imag_out.data()),
    1);

The run time is 12.208064 ms, slower than plain cudaMemcpy.

thrust::copy

thrust::copy(
    real_in.begin(),
    real_in.end(),
    real_out.begin());
thrust::copy(
    imag_in.begin(),
    imag_in.end(),
    imag_out.begin());

The run time is 11.536608 ms.

thrust::device_vector<double>::operator=

Since the memory is managed with thrust::device_vector<double>, we can also simply call its copy assignment operator.

real_out = real_in;
imag_out = imag_in;

The run time is 11.528096 ms, about the same as thrust::copy.

Source Code

Zcopy.pbs

The job submission script.

#PBS -N Zcopy
#PBS -l nodes=1:ppn=32:gpus=1
#PBS -j oe
#PBS -q gpu
source /public/software/profile.d/cuda10.0.sh
cd $PBS_O_WORKDIR
nvcc Zcopy.cu -run -lcublas
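Assuming a standard PBS setup (the submission command itself is not shown in the original script), the job would be submitted with:

qsub Zcopy.pbs

The Zcopy.o18921 file below is the job's standard output.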

Zcopy.o18921

The output; from top to bottom these are the run times of simpleZcopy, naiveZcopy, cudaMemcpy, cublasDcopy, thrust::copy, and thrust::device_vector<double>::operator=.

12.118016
12.254208
11.898560
12.208064
11.536608
11.528096

Zcopy.cu

#include <stdio.h>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <cublas_v2.h>
void __global__ simpleZcopy(
    const size_t n,
    const double *real_in,
    const double *imag_in,
    double *real_out,
    double *imag_out)
{
    for (size_t i = threadIdx.x + blockDim.x * blockIdx.x;
         i < n;
         i += blockDim.x * gridDim.x)
    {
        real_out[i] = real_in[i];
        imag_out[i] = imag_in[i];
    }
}
void __global__ naiveZcopy(
    const size_t n,
    const double *real_in,
    const double *imag_in,
    double *real_out,
    double *imag_out)
{
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += blockDim.x * gridDim.x)
    {
        const double
            real = real_in[i],
            imag = imag_in[i];
        real_out[i] = real;
        imag_out[i] = imag;
    }
}
struct WuKTimer
{
    cudaEvent_t beg, end;
    WuKTimer()
    {
        cudaEventCreate(&beg);
        cudaEventCreate(&end);
        cudaEventRecord(beg);
    }
    ~WuKTimer()
    {
        cudaEventRecord(end);
        cudaEventSynchronize(beg);
        cudaEventSynchronize(end);
        float elapsed_time;
        cudaEventElapsedTime(
            &elapsed_time,
            beg,
            end);
        printf("%f\n", elapsed_time);
    }
};
const size_t
    n = 1 << 28;
thrust::device_vector<double>
    real_in(n, 1),
    imag_in(n, 1),
    real_out(n),
    imag_out(n);
int main()
{
    {
        WuKTimer wk_timer;
        int minGridSize, blockSize;
        cudaOccupancyMaxPotentialBlockSize(
            &minGridSize,
            &blockSize,
            simpleZcopy);
        simpleZcopy<<<
            minGridSize,
            blockSize>>>(
            n,
            thrust::raw_pointer_cast(real_in.data()),
            thrust::raw_pointer_cast(imag_in.data()),
            thrust::raw_pointer_cast(real_out.data()),
            thrust::raw_pointer_cast(imag_out.data()));
    }
    {
        WuKTimer wk_timer;
        int minGridSize, blockSize;
        cudaOccupancyMaxPotentialBlockSize(
            &minGridSize,
            &blockSize,
            naiveZcopy);
        naiveZcopy<<<
            minGridSize,
            blockSize>>>(
            n,
            thrust::raw_pointer_cast(real_in.data()),
            thrust::raw_pointer_cast(imag_in.data()),
            thrust::raw_pointer_cast(real_out.data()),
            thrust::raw_pointer_cast(imag_out.data()));
    }
    {
        WuKTimer wk_timer;
        cudaMemcpy(
            thrust::raw_pointer_cast(real_out.data()),
            thrust::raw_pointer_cast(real_in.data()),
            sizeof(double) * n,
            cudaMemcpyDeviceToDevice);
        cudaMemcpy(
            thrust::raw_pointer_cast(imag_out.data()),
            thrust::raw_pointer_cast(imag_in.data()),
            sizeof(double) * n,
            cudaMemcpyDeviceToDevice);
    }
    cublasHandle_t wk_cublas_handle;
    cublasCreate(&wk_cublas_handle);
    {
        WuKTimer wk_timer;
        cublasDcopy(
            wk_cublas_handle,
            n,
            thrust::raw_pointer_cast(real_in.data()),
            1,
            thrust::raw_pointer_cast(real_out.data()),
            1);
        cublasDcopy(
            wk_cublas_handle,
            n,
            thrust::raw_pointer_cast(imag_in.data()),
            1,
            thrust::raw_pointer_cast(imag_out.data()),
            1);
    }
    cublasDestroy(wk_cublas_handle);
    {
        WuKTimer wk_timer;
        thrust::copy(
            real_in.begin(),
            real_in.end(),
            real_out.begin());
        thrust::copy(
            imag_in.begin(),
            imag_in.end(),
            imag_out.begin());
    }
    {
        WuKTimer wk_timer;
        real_out = real_in;
        imag_out = imag_in;
    }
}