CUDA 及其 golang 调用 – 从入门到放弃 – 3. 真·向量内积的尽头

经过上一回的努力，我们终于将 GPU 计算的时间缩减到同 CPU 一个数量级，但是发现内存和显存之间的 memcpy 成了最主要的性能损耗。

一、固定内存

这里有一个惊人的事实，从分页的虚拟内存中的地址复制到显存，需要经过两次复制，中间需要复制到非分页的固定内存地址。同时，CUDA 还提供了 cudaMallocHost 以分配固定内存，我们可以申请这样的内存并将其地址赋予 golang 中的 slice，在 golang 端直接写这个地址，将其拷贝到显存以达到缩减 memcpy 时间的目的。

const size_t NTB = 256;const size_t EXT = 8;#define divCeil(a, b) (((a) + (b) - 1) / (b))struct Ctx {    float *x, *y;    float *xd, *yd, *rd;    size_t n;};extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaMallocHost(&(ctx->x), sz);    cudaMallocHost(&(ctx->y), sz);    cudaMalloc(&(ctx->xd), sz);    cudaMalloc(&(ctx->yd), sz);    cudaMallocManaged(&(ctx->rd), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void getInputs(Ctx *ctx, float **px, float **py) {    *px = ctx->x;    *py = ctx->y;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFreeHost(ctx->x);    cudaFreeHost(ctx->y);    cudaFree(ctx->xd);    cudaFree(ctx->yd);    cudaFree(ctx->rd);    free(ctx);}__global__ void devDot(float *x, float *y, size_t n, float *r) {    __shared__ float rb[NTB];    size_t itb = threadIdx.x;    size_t i = blockIdx.x * blockDim.x * EXT + itb;    float s = 0.0;    for (size_t j = 0; j < EXT && i < n; j++, i += blockDim.x) {        s += x[i] * y[i];    }    rb[itb] = s;    __syncthreads();    for (size_t i = NTB >> 1; i != 0; i >>= 1) {        if (itb < i) rb[itb] += rb[itb + i];        __syncthreads();    }    if (0 == itb) r[blockIdx.x] = rb[0];}extern "C" __declspec(dllexport) void dot(Ctx *ctx, float *r) {    size_t sz = sizeof(float) * ctx->n;    cudaMemcpy(ctx->xd, ctx->x, sz, cudaMemcpyHostToDevice);    cudaMemcpy(ctx->yd, ctx->y, sz, cudaMemcpyHostToDevice);    size_t nb = divCeil(ctx->n, NTB) / EXT;    float *rd = ctx->rd;    devDot<<<nb, NTB>>>(ctx->xd, ctx->yd, ctx->n, rd);    cudaDeviceSynchronize();    float s = 0.0;    for (size_t i = 0; i < nb; i++) s += rd[i];    *r = s;}

package mainimport (    "math/rand"    "reflect"    "syscall"    "time"    "unsafe")const N = 1 << 20type Lib struct {    dll        *syscall.DLL    deinitProc *syscall.Proc    dotProc    *syscall.Proc    handler    uintptr    X, Y       []float32}func LoadLib() (*Lib, error) {    l := &Lib{}    var err error    defer func() {        if nil != err {            l.Release()        }    }()    if l.dll, err = syscall.LoadDLL("cuda.dll"); nil != err {        return nil, err    }    if l.deinitProc, err = l.dll.FindProc("deinit"); nil != err {        return nil, err    }    if l.dotProc, err = l.dll.FindProc("dot"); nil != err {        return nil, err    }    proc, err := l.dll.FindProc("init")    if nil != err {        return nil, err    }    proc.Call(uintptr(unsafe.Pointer(&l.handler)), uintptr(N))    proc, err = l.dll.FindProc("getInputs")    if nil != err {        return nil, err    }    xh := (*reflect.SliceHeader)(unsafe.Pointer(&l.X))    yh := (*reflect.SliceHeader)(unsafe.Pointer(&l.Y))    xh.Len, xh.Cap, yh.Len, yh.Cap = N, N, N, N    proc.Call(l.handler,        uintptr(unsafe.Pointer(&xh.Data)), uintptr(unsafe.Pointer(&yh.Data)))    return l, nil}func (l *Lib) Release() {    if nil != l.deinitProc && 0 != l.handler {        l.deinitProc.Call(l.handler)    }    if nil != l.dll {        l.dll.Release()    }}func (l *Lib) Dot() float32 {    var r float32    l.dotProc.Call(l.handler, uintptr(unsafe.Pointer(&r)))    return r}func main() {    lib, err := LoadLib()    if nil != err {        println(err.Error())        return    }    defer lib.Release()    rand.Seed(time.Now().Unix())    x, y := lib.X, lib.Y    for i := 0; i < N; i++ {        x[i], y[i] = rand.Float32(), rand.Float32()    }    t := time.Now()    var r float32    for i := 0; i < 100; i++ {        r = 0        for i := 0; i < N; i++ {            r += x[i] * y[i]        }    }    println(time.Now().Sub(t).Microseconds())    println(r)    t = time.Now()    for i := 0; i < 100; i++ {        r = lib.Dot()    }    println(time.Now().Sub(t).Microseconds())    println(r)}

CPU 版仍约 120ms，GPU 版进一步下降到约 174ms，其中 cudaMemcpy 下降到约 134ms

二、内存映射

CUDA 还提供了避免拷贝的内存映射，使用 cudaHostAlloc 分配固定内存并映射到显存，使核函数能够访问

c 端有改动的代码如下：

struct Ctx {    float *x, *y, *r;    size_t n;};extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaHostAlloc(&(ctx->x), sz, cudaHostAllocMapped);    cudaHostAlloc(&(ctx->y), sz, cudaHostAllocMapped);    cudaMallocManaged(&(ctx->r), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFreeHost(ctx->x);    cudaFreeHost(ctx->y);    cudaFree(ctx->r);    free(ctx);}extern "C" __declspec(dllexport) void dot(Ctx *ctx, float *r) {    size_t nb = divCeil(ctx->n, NTB) / EXT;    float *rd = ctx->r;    devDot<<<nb, NTB>>>(ctx->x, ctx->y, ctx->n, rd);    cudaDeviceSynchronize();    float s = 0.0;    for (size_t i = 0; i < nb; i++) s += rd[i];    *r = s;}

CPU 版仍约 120ms，GPU 版进一步下降到约 156ms，其中 cudaMemcpy 已不再需要，而 devDot 却上升到约 125ms，因为和访问显存相比，访问内存产生严重的性能损耗

三、统一寻址

CUDA 6.0 开始提供统一寻址机制，使用 cudaMallocManaged 分配统一寻址的内存，透明地在内存和显存间进行映像，两边都能访问

c 端有改动的代码如下：

extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaMallocManaged(&(ctx->x), sz);    cudaMallocManaged(&(ctx->y), sz);    cudaMallocManaged(&(ctx->r), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFree(ctx->x);    cudaFree(ctx->y);    cudaFree(ctx->r);    free(ctx);}

喜讯！喜讯！GPU 终于赢了！CPU 版仍约 120ms，GPU 版进一步下降到约 86ms，其中 devDot 约 9.577ms，与 CPU 版的比值约为 7.98%，比上一篇稍慢一点。cudaMemcpy 也不存在了，而主要的性能损耗变成了之前毫不显眼的 cudaLaunchKernel 等 CUDA 运行时。

标签: golang 内存显存

评论已关闭

13条评论

KristieCuh说道：
2021年11月29日上午5:44
Only The Best Adult Games 2021. As Real As You Want it To be
Audreyjet说道：
2021年12月3日下午3:30
Everything you need to run your business.
Manage projects, create dazzling proposals
and get paid faster.
Black Friday! All plans are FREE, no credit card required.
Angelina说道：
2021年12月5日上午9:02
Good day !
I would like to buy ad space on your site for $1000
You can read our terms on the link below
Angelina说道：
2021年12月6日上午2:40
Good day !
Can I buy ad space on your site for $500
You can read our terms on the link below
Risa Mcilvaine说道：
2022年3月2日下午9:23
I recently started a blog, and I was just wondering how people have promoted their blogs online to get more followers so you aren’t just typing to nothing out in the internet?. Oh, and for my blog, I don’t really want my friends to be my followers, unless they find it on their own. My blog is about something that most of my friend’s can’t really help me with. (Photography is the main subject).
santa clara county gym说道：
2022年5月13日下午9:10
I am at a crossroads in my career. I am in the middle of applying for law school, but I really want to write. Is there a field that can combine the two, or do I choose one over the other? I am not looking to practice law, that much I know about myself. I want to do something that is challenging, fun, creative and meaningful. I fear that I have to give up writing in order to fullfill a working life..
Concetta Schwister说道：
2022年12月10日下午8:10
I was wondering if you ever thought of changing the layout of your website? Its very well written; I love what youve got to say. But maybe you could a little more in the way of content so people could connect with it better. Youve got an awful lot of text for only having one or two pictures. Maybe you could space it out better?
Lincoln Loessberg说道：
2022年12月16日上午1:22
magnificent issues altogether, you just gained a brand new reader. What could you suggest about your submit that you simply made a few days in the past? Any certain?
Sallie Ciallella说道：
2023年3月30日下午7:07
Once I originally 评论我点击了-Notify me when new feedback are added- 复选框，现在each time a remark is added I收到带有 the identical 评论的 4 封电子邮件.有没有什么方法你将能够删除我从那个服务？谢谢！
Allene Mercardo说道：
2023年4月10日下午1:56
噢，这是一篇非常好的帖子.在idea中，我would like to像这样写moreover——花时间和actual努力做出a very good文章……但是我能说什么……我拖延了很多并且绝对不会似乎得到一件事完成.
唐勤说道：
2023年6月2日上午12:19
你can definitely 看到你的 enthusiasm within the 你写的 work.世界希望有甚至更多热情的作家像你，他们不害怕提及他们的信仰.All time go after 你的心.
https://rosetta.digital说道：
2023年6月15日下午1:18
优秀网站.很多 useful info 这里.我将它发送给some buddies ans also 分享美味.还有显然，谢谢在你汗水！
Lindsey Bartlome说道：
2023年6月25日下午6:10
What type of images am I ‘legally’ allowed to include in my blog posts?

智云一二三科技

CUDA 及其 golang 调用 – 从入门到放弃 – 3. 真·向量内积的尽头

一、固定内存

二、内存映射

三、统一寻址

关于作者: 智云科技

13条评论

一、固定内存

二、内存映射

三、统一寻址

给这篇文章的作者打赏

关于作者: 智云科技

相关文章

Linux性能及调优指南之Linux进程管理

详解SpringIOC容器的基本配置

Goland配置GOROOT无法识别已安装的go

热门文章

1分享新浪图床上传接口源码

2PHP简单实现路由Route功能

3Tideways、xhprof 和 xhgui 打造 PHP 非侵入式监控平台

4centos系统如何查看是否安装了mysql

5curl 工具简述

13条评论