您的位置 首页 golang

CUDA 及其 golang 调用 – 从入门到放弃 – 3. 真·向量内积的尽头

经过上一回的努力,我们终于将 GPU 计算的时间缩减到同 CPU 一个数量级,但是发现内存和显存之间的 memcpy 成了最主要的性能损耗。

一、固定内存

这里有一个惊人的事实,从分页的虚拟内存中的地址复制到显存,需要经过两次复制,中间需要复制到非分页的固定内存地址。同时,CUDA 还提供了 cudaMallocHost 以分配固定内存,我们可以申请这样的内存并将其地址赋予 golang 中的 slice,在 golang 端直接写这个地址,将其拷贝到显存以达到缩减 memcpy 时间的目的。

const size_t NTB = 256;const size_t EXT = 8;#define divCeil(a, b) (((a) + (b) - 1) / (b))struct Ctx {    float *x, *y;    float *xd, *yd, *rd;    size_t n;};extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaMallocHost(&(ctx->x), sz);    cudaMallocHost(&(ctx->y), sz);    cudaMalloc(&(ctx->xd), sz);    cudaMalloc(&(ctx->yd), sz);    cudaMallocManaged(&(ctx->rd), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void getInputs(Ctx *ctx, float **px, float **py) {    *px = ctx->x;    *py = ctx->y;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFreeHost(ctx->x);    cudaFreeHost(ctx->y);    cudaFree(ctx->xd);    cudaFree(ctx->yd);    cudaFree(ctx->rd);    free(ctx);}__global__ void devDot(float *x, float *y, size_t n, float *r) {    __shared__ float rb[NTB];    size_t itb = threadIdx.x;    size_t i = blockIdx.x * blockDim.x * EXT + itb;    float s = 0.0;    for (size_t j = 0; j < EXT && i < n; j++, i += blockDim.x) {        s += x[i] * y[i];    }    rb[itb] = s;    __syncthreads();    for (size_t i = NTB >> 1; i != 0; i >>= 1) {        if (itb < i) rb[itb] += rb[itb + i];        __syncthreads();    }    if (0 == itb) r[blockIdx.x] = rb[0];}extern "C" __declspec(dllexport) void dot(Ctx *ctx, float *r) {    size_t sz = sizeof(float) * ctx->n;    cudaMemcpy(ctx->xd, ctx->x, sz, cudaMemcpyHostToDevice);    cudaMemcpy(ctx->yd, ctx->y, sz, cudaMemcpyHostToDevice);    size_t nb = divCeil(ctx->n, NTB) / EXT;    float *rd = ctx->rd;    devDot<<<nb, NTB>>>(ctx->xd, ctx->yd, ctx->n, rd);    cudaDeviceSynchronize();    float s = 0.0;    for (size_t i = 0; i < nb; i++) s += rd[i];    *r = s;}
package mainimport (    "math/rand"    "reflect"    "syscall"    "time"    "unsafe")const N = 1 << 20type Lib struct {    dll        *syscall.DLL    deinitProc *syscall.Proc    dotProc    *syscall.Proc    handler    uintptr    X, Y       []float32}func LoadLib() (*Lib, error) {    l := &Lib{}    var err error    defer func() {        if nil != err {            l.Release()        }    }()    if l.dll, err = syscall.LoadDLL("cuda.dll"); nil != err {        return nil, err    }    if l.deinitProc, err = l.dll.FindProc("deinit"); nil != err {        return nil, err    }    if l.dotProc, err = l.dll.FindProc("dot"); nil != err {        return nil, err    }    proc, err := l.dll.FindProc("init")    if nil != err {        return nil, err    }    proc.Call(uintptr(unsafe.Pointer(&l.handler)), uintptr(N))    proc, err = l.dll.FindProc("getInputs")    if nil != err {        return nil, err    }    xh := (*reflect.SliceHeader)(unsafe.Pointer(&l.X))    yh := (*reflect.SliceHeader)(unsafe.Pointer(&l.Y))    xh.Len, xh.Cap, yh.Len, yh.Cap = N, N, N, N    proc.Call(l.handler,        uintptr(unsafe.Pointer(&xh.Data)), uintptr(unsafe.Pointer(&yh.Data)))    return l, nil}func (l *Lib) Release() {    if nil != l.deinitProc && 0 != l.handler {        l.deinitProc.Call(l.handler)    }    if nil != l.dll {        l.dll.Release()    }}func (l *Lib) Dot() float32 {    var r float32    l.dotProc.Call(l.handler, uintptr(unsafe.Pointer(&r)))    return r}func main() {    lib, err := LoadLib()    if nil != err {        println(err.Error())        return    }    defer lib.Release()    rand.Seed(time.Now().Unix())    x, y := lib.X, lib.Y    for i := 0; i < N; i++ {        x[i], y[i] = rand.Float32(), rand.Float32()    }    t := time.Now()    var r float32    for i := 0; i < 100; i++ {        r = 0        for i := 0; i < N; i++ {            r += x[i] * y[i]        }    }    println(time.Now().Sub(t).Microseconds())    println(r)    t = time.Now()    for i := 0; i < 100; i++ {        r = lib.Dot()    }    println(time.Now().Sub(t).Microseconds())    println(r)}

CPU 版仍约 120ms,GPU 版进一步下降到约 174ms,其中 cudaMemcpy 下降到约 134ms

二、内存映射

CUDA 还提供了避免拷贝的内存映射,使用 cudaHostAlloc 分配固定内存并映射到显存,使核函数能够访问

c 端有改动的代码如下:

struct Ctx {    float *x, *y, *r;    size_t n;};extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaHostAlloc(&(ctx->x), sz, cudaHostAllocMapped);    cudaHostAlloc(&(ctx->y), sz, cudaHostAllocMapped);    cudaMallocManaged(&(ctx->r), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFreeHost(ctx->x);    cudaFreeHost(ctx->y);    cudaFree(ctx->r);    free(ctx);}extern "C" __declspec(dllexport) void dot(Ctx *ctx, float *r) {    size_t nb = divCeil(ctx->n, NTB) / EXT;    float *rd = ctx->r;    devDot<<<nb, NTB>>>(ctx->x, ctx->y, ctx->n, rd);    cudaDeviceSynchronize();    float s = 0.0;    for (size_t i = 0; i < nb; i++) s += rd[i];    *r = s;}

CPU 版仍约 120ms,GPU 版进一步下降到约 156ms,其中 cudaMemcpy 已不再需要,而 devDot 却上升到约 125ms,因为和访问显存相比,访问内存产生严重的性能损耗

三、统一寻址

CUDA 6.0 开始提供统一寻址机制,使用 cudaMallocManaged 分配统一寻址的内存,透明地在内存和显存间进行映像,两边都能访问

c 端有改动的代码如下:

extern "C" __declspec(dllexport) void init(Ctx **p, size_t n) {    Ctx *ctx = (Ctx *)malloc(sizeof(Ctx));    ctx->n = n;    size_t sz = sizeof(float) * n;    cudaMallocManaged(&(ctx->x), sz);    cudaMallocManaged(&(ctx->y), sz);    cudaMallocManaged(&(ctx->r), sizeof(float) * divCeil(n, NTB) / EXT);    *p = ctx;}extern "C" __declspec(dllexport) void deinit(Ctx *ctx) {    cudaFree(ctx->x);    cudaFree(ctx->y);    cudaFree(ctx->r);    free(ctx);}

喜讯!喜讯!GPU 终于赢了!CPU 版仍约 120ms,GPU 版进一步下降到约 86ms,其中 devDot 约 9.577ms,与 CPU 版的比值约为 7.98%,比上一篇稍慢一点。cudaMemcpy 也不存在了,而主要的性能损耗变成了之前毫不显眼的 cudaLaunchKernel 等 CUDA 运行时。


文章来源:智云一二三科技

文章标题:CUDA 及其 golang 调用 – 从入门到放弃 – 3. 真·向量内积的尽头

文章地址:https://www.zhihuclub.com/6589.shtml

关于作者: 智云科技

热门文章

发表评论

您的电子邮箱地址不会被公开。

6条评论

  1. Everything you need to run your business.
    Manage projects, create dazzling proposals
    and get paid faster.
    Black Friday! All plans are FREE, no credit card required.

  2. Good day !
    I would like to buy ad space on your site for $1000
    You can read our terms on the link below

  3. Good day !
    Can I buy ad space on your site for $500
    You can read our terms on the link below

  4. I recently started a blog, and I was just wondering how people have promoted their blogs online to get more followers so you aren’t just typing to nothing out in the internet?. Oh, and for my blog, I don’t really want my friends to be my followers, unless they find it on their own. My blog is about something that most of my friend’s can’t really help me with. (Photography is the main subject).

  5. I am at a crossroads in my career. I am in the middle of applying for law school, but I really want to write. Is there a field that can combine the two, or do I choose one over the other? I am not looking to practice law, that much I know about myself. I want to do something that is challenging, fun, creative and meaningful. I fear that I have to give up writing in order to fullfill a working life..

网站地图