[CUDA Study Diary] 4.4 Kernel Bandwidth (the Matrix Transpose Problem)


4.4.1 Memory Bandwidth
Theoretical bandwidth is the absolute maximum the hardware can deliver. In CUDA it can be derived from the device properties:

int dev = 0;
cudaSetDevice(dev);
cudaDeviceProp deviceprop;
CHECK(cudaGetDeviceProperties(&deviceprop, dev));
printf("device %d: %s \n", dev, deviceprop.name);
// memoryClockRate is in kHz and memoryBusWidth in bits; the factor 2.0 accounts
// for DDR memory transferring data on both clock edges. The result is in GB/s.
float pbnd = 2.0 * deviceprop.memoryClockRate * (deviceprop.memoryBusWidth / 8) / 1.0e6;
printf("Peak Memory Bandwidth (GB/s): %f\n", pbnd);
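
Effective bandwidth is what a kernel actually sustains, and it is what the rest of this section measures. Each transpose kernel reads every element once and writes it once, so the computation (this matches the ibnd expression in the full program below; elapsedTime is in milliseconds) is:

// bytes moved = reads + writes = 2 * nx * ny * sizeof(float);
// dividing by 1e6 with the time in milliseconds yields GB/s
float ibnd = 2 * nx * ny * sizeof(float) / 1e6 / elapsedTime;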

4.4.2 The Matrix Transpose Problem
Matrix transpose is a basic operation in linear algebra, and basic as it is, it is used in many applications.
(Figures: the row-major memory layout of the input matrix and of its transpose.)
Looking at the input and output layouts, you will notice:
· Reads: along the rows of the original matrix, which gives coalesced accesses.
· Writes: along the columns of the transposed matrix, which gives strided accesses.
Strided access is the memory access pattern that degrades GPU performance the most, but in a matrix transpose it is unavoidable on one side or the other.
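
As a concrete sketch of what one warp does in the naive row-based transpose shown below (assuming nx = ny = 2048, the size used in this example, and 32 threads with consecutive threadIdx.x):

// tranposeNaiveRow, one warp, iy fixed:
//   read : in[iy*2048 + (ix .. ix+31)]   -> 32 consecutive floats, coalesced
//   write: out[(ix .. ix+31)*2048 + iy]  -> neighboring threads are 2048 floats
//                                           (8 KB) apart: 32 separate transactions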

The CUDA implementation:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h> // rand, srand, atoi, malloc
#include <time.h>   // time, for seeding rand
#include <math.h>   // fabs, used in checkResult
#include "../common/common.h"

// Host transpose: in is ny x nx row-major, out is nx x ny row-major.
// This variant reads in with a stride and writes out sequentially.
void tranposeHost_test(float* out, float* in, const int nx, const int ny){
    for (int i = 0; i < nx; i++){
        for (int j = 0; j < ny; j++){
            out[i*ny + j] = in[j*nx + i];
        }
    }
}

// The same transpose with the loop order flipped: sequential reads from in,
// strided writes to out. This one serves as the reference for checkResult.
void tranposeHost(float* out, float* in, const int nx, const int ny){
    for (int iy = 0; iy < ny; iy++){
        for (int ix = 0; ix < nx; ix++){
            out[ix*ny + iy] = in[iy*nx + ix];
        }
    }
}

// warm-up kernel (same body as copyRow), used only to absorb start-up overhead
__global__ void warmup( float * out, float* in, const int nx, const int ny){
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny){
        out[iy*nx + ix] = in[iy*nx + ix];
    }
}


// load row store row
__global__ void copyRow( float * out, float* in, const int nx, const int ny){
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny){
        out[iy*nx + ix] = in[iy*nx + ix];
    }
}

// load col store col
__global__ void copyCol( float * out, float* in, const int nx, const int ny){
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny){
        out[ix*ny + iy] = in[ix*ny + iy];
    }
}


// load row, store col
__global__ void tranposeNaiveRow(float* out, float* in, const int nx, const int ny){
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny){
        out[ix*ny + iy] = in[iy*nx + ix];
    }
}

// load col, store row
__global__ void tranposeNaiveCol(float* out, float* in, const int nx, const int ny){
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    if (ix < nx && iy < ny){
        out[iy*nx + ix] = in[ix*ny + iy];
    }
}

void initialData(float *ip, int nx, int ny)
{
    time_t t;
    srand((unsigned int) time(&t));

    for (int i = 0; i < nx; i++) {
        for (int j =0; j < ny; j ++){
            ip[i + j * nx] = (float) (rand() & 0xff) / 10.0f;
        }
    }
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;
    for (int i = 0; i < N; i++)
    {
        // fabs, not abs: abs() would truncate the float difference to an int
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = false;
            printf("Arrays do not match\n");
            printf("host %5.2f gpu %5.2f at index %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("arrays match\n");
}

int main(int argc, char** argv){
    int dev = 0;
    cudaSetDevice(dev);
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop,dev));
    printf("device %d: %s \n", dev, deviceprop.name);
    printf("Peak Memory Bandwidth (GB/s): %f\n",2.0 * deviceprop.memoryClockRate * (deviceprop.memoryBusWidth / 8) / 1.0e6);

    float pbnd = 2.0 * deviceprop.memoryClockRate * (deviceprop.memoryBusWidth / 8) / 1.0e6;

    int nx = 1 << 11;
    int ny = 1 << 11;

    int iKernel = 0;
    int blockx = 16;
    int blocky = 16;

    if(argc > 1) iKernel = atoi(argv[1]);
    if(argc > 2) blockx = atoi(argv[2]);
    if(argc > 3) blocky = atoi(argv[3]);
    if(argc > 4) nx = atoi(argv[4]);
    if(argc > 5) ny = atoi(argv[5]);

    printf("matrix nx %d ny %d with kernel %d\n", nx, ny, iKernel);
    size_t nBytes = nx * ny * sizeof(float);

    dim3 block (blockx, blocky);
    dim3 grid ((nx + block.x-1)/block.x, (ny + block.y -1 )/ block.y);

    float *h_A = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    initialData(h_A, nx, ny);

    float *d_A, *d_C;
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));

    // copy data from host to device
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));

    Timer timer;
    timer.start();
    warmup<<<grid, block>>>(d_C, d_A, nx, ny); // (out, in): write to d_C so the input d_A stays intact
    cudaDeviceSynchronize();
    timer.stop();
    float elapsedTime = timer.elapsedms();
    printf("warmup <<<%4d, %4d>>> elapsed %f ms \n", grid.x, block.x,  elapsedTime);


    // kernel function pointer, selected by the first command-line argument;
    // initialized so that an out-of-range iKernel still launches a valid kernel
    void (*kernel)(float*, float*, int, int) = &copyRow;
    const char *kernelName = "COPYROW";

    switch (iKernel){
        case 0:
            kernel = &copyRow;
            kernelName = "COPYROW";
            break;
        case 1:
            kernel = &copyCol;
            kernelName = "COPYCOL";
            break;
        case 2:
            kernel = &tranposeNaiveRow;
            kernelName = "tranposeNaiveRow";
            break;
        case 3:
            kernel = &tranposeNaiveCol;
            kernelName = "tranposeNaiveCol";
            break;
    }

    timer.start();
    kernel<<<grid, block>>>(d_C, d_A, nx, ny);
    cudaDeviceSynchronize();
    timer.stop();
    elapsedTime = timer.elapsedms();
    float ibnd = 2 * nx * ny * sizeof(float) / 1e6 / elapsedTime; // effective bandwidth, GB/s

    printf("%s elapsed %f ms <<<grid(%d, %d) block (%d, %d)>>> effective bandwidth %f GB/s, ratio %f%% \n", kernelName, elapsedTime, grid.x , grid.y, block.x, block.y, ibnd, ibnd/pbnd * 100);

    if (iKernel > 1){
        // compute the host-side reference before comparing
        tranposeHost(hostRef, h_A, nx, ny);
        CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));
        checkResult(hostRef, gpuRef, nx*ny);
    }
    cudaFree(d_A);
    cudaFree(d_C);

    free(h_A);
    free(hostRef);
    free(gpuRef);

    cudaDeviceReset();
    return 0;
}
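
A typical build and run, assuming the file is saved as transpose.cu and that common.h from the book's sample code supplies CHECK and Timer (adjust -arch to your GPU):

nvcc -O3 -arch=sm_70 transpose.cu -o transpose.exe
transpose.exe 0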

Four kernels are selectable so far. Running each:
transpose.exe 0
COPYROW elapsed 0.049184 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 682.222534 GB/s, ratio 67.674362%
transpose.exe 1
COPYCOL elapsed 0.060224 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 557.160461 GB/s, ratio 55.268593%
transpose.exe 2
tranposeNaiveRow elapsed 0.098144 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 341.889801 GB/s, ratio 33.914410%
transpose.exe 3
tranposeNaiveCol elapsed 0.048128 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 697.191467 GB/s, ratio 69.159233%

The kernels:
copyRow – copies a matrix, loading by rows and storing by rows; 67.674362% of peak bandwidth
copyCol – copies a matrix, loading by columns and storing by columns; 55.268593%
tranposeNaiveRow – transposes a matrix, loading by rows and storing by columns; 33.914410%
tranposeNaiveCol – transposes a matrix, loading by columns and storing by rows; 69.159233%
The results show that NaiveCol outperforms NaiveRow. One likely cause of this gain is that the strided reads are serviced through the cache, while the coalesced writes go to memory at full efficiency.
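
If your toolkit still ships nvprof (recent CUDA versions replace it with Nsight Compute), this can be checked by comparing the load/store efficiency of the two kernels, for example:

nvprof --metrics gld_efficiency,gst_efficiency transpose.exe 2
nvprof --metrics gld_efficiency,gst_efficiency transpose.exe 3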

4.4.3 Unrolled Transpose
The goal of unrolling is to assign more independent work to each thread, maximizing the number of concurrent memory requests in flight.
Add these kernels:

__global__ void tranposeUnroll4Row(float* out, float* in, const int nx, const int ny){
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int ti = iy*nx + ix; //access in rows
    unsigned int to = ix*ny + iy; //access in cols
    if (ix + 3 * blockDim.x < nx && iy < ny){
        out[to] = in[ti];
        out[to + ny * blockDim.x] = in[ti + blockDim.x];
        out[to + ny * 2 * blockDim.x] = in[ti + 2 * blockDim.x];
        out[to + ny * 3 * blockDim.x] = in[ti + 3 * blockDim.x];
    }
}

__global__ void tranposeUnroll4Col(float* out, float* in, const int nx, const int ny){
    unsigned int ix = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int iy = blockDim.y * blockIdx.y + threadIdx.y;
    
    unsigned int ti = ix*ny + iy; //access in cols
    unsigned int to = iy*nx + ix; //access in rows
    
    if (ix + 3 * blockDim.x < nx && iy < ny){
        out[to] = in[ti];
        out[to + blockDim.x] = in[ti + blockDim.x * ny];
        out[to + blockDim.x * 2] = in[ti + blockDim.x * ny * 2];
        out[to + blockDim.x * 3] = in[ti + blockDim.x * ny * 3];
    }
}

Add the following cases to the kernel switch statement. grid.x must be reduced, since each block now covers four tiles along x; note that the guard ix + 3 * blockDim.x < nx silently skips trailing columns unless nx is a multiple of 4 * block.x (true here, since 4 * 16 = 64 divides 2048).

        case 4:
            kernel = &tranposeUnroll4Row;
            kernelName = "tranposeUnroll4Row";
            grid.x = (nx + block.x * 4 - 1) / (block.x * 4);
            break;
        case 5:
            kernel = &tranposeUnroll4Col;
            kernelName = "tranposeUnroll4Col";
            grid.x = (nx + block.x * 4 - 1) / (block.x * 4);
            break;

transpose.exe 4
tranposeUnroll4Row elapsed 0.149536 ms <<<grid(32, 128) block (16, 16)>>> effective bandwidth 224.390335 GB/s, ratio 22.258825%
transpose.exe 5
tranposeUnroll4Col elapsed 0.049824 ms <<<grid(32, 128) block (16, 16)>>> effective bandwidth 673.459229 GB/s, ratio 66.805069%

4.4.4 Diagonal Transpose
When a grid of thread blocks is launched, the blocks are distributed among the SMs. The programming model may present the grid as a one- or two-dimensional layout, but from the hardware's point of view all blocks are one-dimensional, and a two-dimensional block ID flattens to:

int bid = blockIdx.y * gridDim.x + blockIdx.x;

When a kernel is launched, the order in which blocks are assigned to SMs is determined by this block ID. Once every SM is fully occupied, the remaining blocks wait until running blocks complete; as each block finishes, another is assigned to that SM. Because blocks complete at unpredictable speeds and in unpredictable order, the set of active blocks, initially contiguous in bid, becomes less and less contiguous as the kernel progresses. Although you cannot control the scheduling order directly, you can flexibly reinterpret the block coordinates blockIdx.x and blockIdx.y.
Block coordinates in the Cartesian coordinate system:
(figure: Cartesian block coordinates)
The diagonal block coordinate system:
(figure: diagonal block coordinates)
The diagonal coordinate system determines the one-dimensional block ID, but data accesses still use Cartesian coordinates. So when block IDs are interpreted as diagonal coordinates, they must be mapped back to Cartesian coordinates so that the correct data tile is addressed:

block_x = (blockIdx.x + blockIdx.y) % gridDim.x;
block_y = blockIdx.x; // as in the figure: the Cartesian row index equals the diagonal x coordinate

Here blockIdx.x and blockIdx.y are the diagonal coordinates, and block_x and block_y are their corresponding Cartesian coordinates.
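
As a quick sanity check of the mapping, take a hypothetical 4 x 4 grid, with diagonal (blockIdx.x, blockIdx.y) on the left and the resulting Cartesian (block_x, block_y) on the right:

// blockIdx.y == 0 walks the main diagonal:
//   (0,0)->(0,0)  (1,0)->(1,1)  (2,0)->(2,2)  (3,0)->(3,3)
// blockIdx.y == 1 shifts the Cartesian x by one, wrapping mod gridDim.x:
//   (0,1)->(1,0)  (1,1)->(2,1)  (2,1)->(3,2)  (3,1)->(0,3)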
The following kernels use diagonal block coordinates. The Row variant implements the transpose with coalesced reads and strided writes; the Col variant is its mirror image:


__global__ void tranposediagonalRow(float* out, float* in, const int nx, const int ny){
    unsigned int blk_y = blockIdx.x;
    unsigned int blk_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    
    unsigned int ix = blockDim.x * blk_x + threadIdx.x;
    unsigned int iy = blockDim.y * blk_y + threadIdx.y;
    
    
    if (ix < nx && iy < ny){
        out[ix*ny + iy] = in[iy*nx + ix];
    }
}

__global__ void tranposediagonalCol(float* out, float* in, const int nx, const int ny){
    unsigned int blk_y = blockIdx.x;
    unsigned int blk_x = (blockIdx.x + blockIdx.y) % gridDim.x;
    
    unsigned int ix = blockDim.x * blk_x + threadIdx.x;
    unsigned int iy = blockDim.y * blk_y + threadIdx.y;
    if (ix < nx && iy < ny){
        out[iy*nx + ix] = in[ix*ny + iy];
    }
}

Add the following cases to the kernel switch statement to invoke them:

        case 6:
            kernel = &tranposediagonalRow;
            kernelName = "tranposediagonalRow";
            break;
        case 7:
            kernel = &tranposediagonalCol;
            kernelName = "tranposediagonalCol";
            break;

transpose.exe 6
tranposediagonalRow elapsed 0.094048 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 356.779846 GB/s, ratio 35.391457%
transpose.exe 7
tranposediagonalCol elapsed 0.045056 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 744.727295 GB/s, ratio 73.874641%

4.4.5 Using Thin Blocks to Increase Parallelism
The simplest way to increase parallelism is to adjust the block size, and the previous sections have already shown how effective this simple technique can be. Here we experiment further with block sizes for the column-based NaiveCol kernel (case 3 of the kernel switch).
The initial block size is (16, 16).
Block sizes are passed on the command line, e.g. transpose.exe 3 32 32, and so on.
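
Given the argument order (kernel, blockx, blocky), the nine runs below correspond to:

transpose.exe 3 8 32
transpose.exe 3 8 16
transpose.exe 3 8 8
transpose.exe 3 16 32
transpose.exe 3 16 16
transpose.exe 3 16 8
transpose.exe 3 32 32
transpose.exe 3 32 16
transpose.exe 3 32 8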

block.x = 8
tranposeNaiveCol elapsed 0.041216 ms <<<grid(256, 64) block (8, 32)>>> effective bandwidth 814.111755 GB/s, ratio 80.757362%
tranposeNaiveCol elapsed 0.049152 ms <<<grid(256, 128) block (8, 16)>>> effective bandwidth 682.666626 GB/s, ratio 67.718414%
tranposeNaiveCol elapsed 0.064384 ms <<<grid(256, 256) block (8, 8)>>> effective bandwidth 521.161072 GB/s, ratio 51.697563%

block.x = 16
tranposeNaiveCol elapsed 0.056288 ms <<<grid(128, 64) block (16, 32)>>> effective bandwidth 596.120544 GB/s, ratio 59.133308%
tranposeNaiveCol elapsed 0.090080 ms <<<grid(128, 128) block (16, 16)>>> effective bandwidth 372.495911 GB/s, ratio 36.950439%
tranposeNaiveCol elapsed 0.061216 ms <<<grid(128, 256) block (16, 8)>>> effective bandwidth 548.131714 GB/s, ratio 54.372967%

block.x = 32
tranposeNaiveCol elapsed 0.066752 ms <<<grid(64, 64) block (32, 32)>>> effective bandwidth 502.673035 GB/s, ratio 49.863605%
tranposeNaiveCol elapsed 0.047936 ms <<<grid(64, 128) block (32, 16)>>> effective bandwidth 699.984009 GB/s, ratio 69.436249%
tranposeNaiveCol elapsed 0.062560 ms <<<grid(64, 256) block (32, 8)>>> effective bandwidth 536.356018 GB/s, ratio 53.204853%

These runs all used the same settings, and the numbers vary somewhat from run to run, but (8, 32) consistently does well. The theoretical justification: by increasing the number of consecutive elements stored by a thread block, a "thin" block improves the efficiency of store operations.
(figure: a thin block stores more consecutive elements per block)

