您的位置 首页 golang

CUDA 及其 golang 调用 – 从入门到放弃 – 3. 真·向量内积的尽头

经过上一回的努力,我们终于将 GPU 计算的时间缩减到同 CPU 一个数量级,但是发现内存和显存之间的 memcpy 成了最主要的性能损耗。

一、固定内存

这里有一个惊人的事实,从分页的虚拟内存中的地址复制到显存,需要经过两次复制,中间需要复制到非分页的固定内存地址。同时,CUDA 还提供了 cudaMallocHost 以分配固定内存,我们可以申请这样的内存并将其地址赋予 golang 中的 slice,在 golang 端直接写这个地址,将其拷贝到显存以达到缩减 memcpy 时间的目的。

const size_t NTB = 256;const size_t EXT = 8;#define divCeil(a, b) (((a) + (b) - 1) / (b))struct Ctx {    float *x, *y;    float *xd, *yd, *rd;    size_t n;};extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaMallocHost(&(ctx->x), sz);    cudaMallocHost(&(ctx->y), sz);    cudaMalloc(&(ctx->xd), sz);    cudaMalloc(&(ctx->yd), sz);    cudaMallocManaged(&(ctx->rd), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void getInputs(Ctx *ctx, float **px, float **py) {    *px = ctx->x;    *py = ctx->y;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFreeHost(ctx->x);    cudaFreeHost(ctx->y);    cudaFree(ctx->xd);    cudaFree(ctx->yd);    cudaFree(ctx->rd);    free(ctx);}__global__ void devDot(float *x, float *y, size_t n, float *r) {    __shared__ float rb[NTB];    size_t itb = threadIdx.x;    size_t i = blockIdx.x * blockDim.x * EXT + itb;    float s = 0.0;    for (size_t j = 0; j < EXT && i < n; j++, i += blockDim.x) {        s += x[i] * y[i];    }    rb[itb] = s;    __syncthreads();    for (size_t i = NTB >> 1; i != 0; i >>= 1) {        if (itb < i) rb[itb] += rb[itb + i];        __syncthreads();    }    if (0 == itb) r[blockIdx.x] = rb[0];}extern "C" __declspec(dllexport) void dot(Ctx *ctx, float *r) {    size_t sz = sizeof(float) * ctx->n;    cudaMemcpy(ctx->xd, ctx->x, sz, cudaMemcpyHostToDevice);    cudaMemcpy(ctx->yd, ctx->y, sz, cudaMemcpyHostToDevice);    size_t nb = divCeil(ctx->n, NTB) / EXT;    float *rd = ctx->rd;    devDot<<<nb, NTB>>>(ctx->xd, ctx->yd, ctx->n, rd);    cudaDeviceSynchronize();    float s = 0.0;    for (size_t i = 0; i < nb; i++) s += rd[i];    *r = s;}
package mainimport (    "math/rand"    "reflect"    "syscall"    "time"    "unsafe")const N = 1 << 20type Lib struct {    dll        *syscall.DLL    deinitProc *syscall.Proc    dotProc    *syscall.Proc    handler    uintptr    X, Y       []float32}func LoadLib() (*Lib, error) {    l := &Lib{}    var err error    defer func() {        if nil != err {            l.Release()        }    }()    if l.dll, err = syscall.LoadDLL("cuda.dll"); nil != err {        return nil, err    }    if l.deinitProc, err = l.dll.FindProc("deinit"); nil != err {        return nil, err    }    if l.dotProc, err = l.dll.FindProc("dot"); nil != err {        return nil, err    }    proc, err := l.dll.FindProc("init")    if nil != err {        return nil, err    }    proc.Call(uintptr(unsafe.Pointer(&l.handler)), uintptr(N))    proc, err = l.dll.FindProc("getInputs")    if nil != err {        return nil, err    }    xh := (*reflect.SliceHeader)(unsafe.Pointer(&l.X))    yh := (*reflect.SliceHeader)(unsafe.Pointer(&l.Y))    xh.Len, xh.Cap, yh.Len, yh.Cap = N, N, N, N    proc.Call(l.handler,        uintptr(unsafe.Pointer(&xh.Data)), uintptr(unsafe.Pointer(&yh.Data)))    return l, nil}func (l *Lib) Release() {    if nil != l.deinitProc && 0 != l.handler {        l.deinitProc.Call(l.handler)    }    if nil != l.dll {        l.dll.Release()    }}func (l *Lib) Dot() float32 {    var r float32    l.dotProc.Call(l.handler, uintptr(unsafe.Pointer(&r)))    return r}func main() {    lib, err := LoadLib()    if nil != err {        println(err.Error())        return    }    defer lib.Release()    rand.Seed(time.Now().Unix())    x, y := lib.X, lib.Y    for i := 0; i < N; i++ {        x[i], y[i] = rand.Float32(), rand.Float32()    }    t := time.Now()    var r float32    for i := 0; i < 100; i++ {        r = 0        for i := 0; i < N; i++ {            r += x[i] * y[i]        }    }    println(time.Now().Sub(t).Microseconds())    println(r)    t = time.Now()    for i := 0; i < 100; i++ {        r = lib.Dot()    }    println(time.Now().Sub(t).Microseconds())    println(r)}

CPU 版仍约 120ms,GPU 版进一步下降到约 174ms,其中 cudaMemcpy 下降到约 134ms

二、内存映射

CUDA 还提供了避免拷贝的内存映射,使用 cudaHostAlloc 分配固定内存并映射到显存,使核函数能够访问

c 端有改动的代码如下:

struct Ctx {    float *x, *y, *r;    size_t n;};extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaHostAlloc(&(ctx->x), sz, cudaHostAllocMapped);    cudaHostAlloc(&(ctx->y), sz, cudaHostAllocMapped);    cudaMallocManaged(&(ctx->r), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFreeHost(ctx->x);    cudaFreeHost(ctx->y);    cudaFree(ctx->r);    free(ctx);}extern "C" __declspec(dllexport) void dot(Ctx *ctx, float *r) {    size_t nb = divCeil(ctx->n, NTB) / EXT;    float *rd = ctx->r;    devDot<<<nb, NTB>>>(ctx->x, ctx->y, ctx->n, rd);    cudaDeviceSynchronize();    float s = 0.0;    for (size_t i = 0; i < nb; i++) s += rd[i];    *r = s;}

CPU 版仍约 120ms,GPU 版进一步下降到约 156ms,其中 cudaMemcpy 已不再需要,而 devDot 却上升到约 125ms,因为和访问显存相比,访问内存产生严重的性能损耗

三、统一寻址

CUDA 6.0 开始提供统一寻址机制,使用 cudaMallocManaged 分配统一寻址的内存,透明地在内存和显存间进行映像,两边都能访问

c 端有改动的代码如下:

extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaMallocManaged(&(ctx->x), sz);    cudaMallocManaged(&(ctx->y), sz);    cudaMallocManaged(&(ctx->r), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFree(ctx->x);    cudaFree(ctx->y);    cudaFree(ctx->r);    free(ctx);}

喜讯!喜讯!GPU 终于赢了!CPU 版仍约 120ms,GPU 版进一步下降到约 86ms,其中 devDot 约 9.577ms,与 CPU 版的比值约为 7.98%,比上一篇稍慢一点。cudaMemcpy 也不存在了,而主要的性能损耗变成了之前毫不显眼的 cudaLaunchKernel 等 CUDA 运行时。


文章来源:智云一二三科技

文章标题:CUDA 及其 golang 调用 – 从入门到放弃 – 3. 真·向量内积的尽头

文章地址:https://www.zhihuclub.com/6589.shtml

关于作者: 智云科技

热门文章

评论已关闭

13条评论

  1. Everything you need to run your business.
    Manage projects, create dazzling proposals
    and get paid faster.
    Black Friday! All plans are FREE, no credit card required.

  2. Good day !
    I would like to buy ad space on your site for $1000
    You can read our terms on the link below

  3. Good day !
    Can I buy ad space on your site for $500
    You can read our terms on the link below

  4. I recently started a blog, and I was just wondering how people have promoted their blogs online to get more followers so you aren’t just typing to nothing out in the internet?. Oh, and for my blog, I don’t really want my friends to be my followers, unless they find it on their own. My blog is about something that most of my friend’s can’t really help me with. (Photography is the main subject).

  5. I am at a crossroads in my career. I am in the middle of applying for law school, but I really want to write. Is there a field that can combine the two, or do I choose one over the other? I am not looking to practice law, that much I know about myself. I want to do something that is challenging, fun, creative and meaningful. I fear that I have to give up writing in order to fullfill a working life..

  6. I was wondering if you ever thought of changing the layout of your website? Its very well written; I love what youve got to say. But maybe you could a little more in the way of content so people could connect with it better. Youve got an awful lot of text for only having one or two pictures. Maybe you could space it out better?

  7. magnificent issues altogether, you just gained a brand new reader. What could you suggest about your submit that you simply made a few days in the past? Any certain?

  8. Once I originally 评论我点击了-Notify me when new feedback are added- 复选框,现在each time a remark is added I收到带有 the identical 评论的 4 封电子邮件.有没有什么方法 你将能够 删除我从那个服务?谢谢!

  9. 噢,这是一篇非常好的帖子.在idea中,我would like to像这样写moreover——花时间和actual努力做出a very good文章……但是我能说什么……我拖延了很多并且绝对不会似乎得到一件事完成.

  10. 你can definitely 看到你的 enthusiasm within the 你写的 work.世界希望有甚至更多热情的作家像你,他们不害怕提及他们的信仰.All time go after 你的心.

  11. 优秀 网站.很多 useful info 这里.我将它发送给some buddies ans also 分享美味.还有显然,谢谢 在你 汗水!

网站地图