程序员人生 网站导航

OpenCL之矩阵乘法实现

栏目:框架设计时间:2015-05-27 08:07:17

  • kernel
  • 配置
  • 效果

kernel

在opencl中,1般最优价值的就是kernel,前面写的配置文件基本没有很大的差别,主要是kernel写法上。其中矩阵运算又是最能体现opencl价值的地方。先上写的kernel:

__kernel void matrix_mult( const int Ndim, const int Mdim, const int Pdim, __global const float* A, __global const float* B, __global float* C) { int i = get_global_id(0); int j = get_global_id(1); int k; float tmp; if ((i < Ndim) && (j < Mdim)) { tmp = 0.0; for (k = 0; k < Pdim; k++) tmp += A[i*Pdim + k] * B[k*Mdim + j]; C[i*Mdim + j] = tmp; } }

上面的配置文件看起来简单其实已包括了两方面的并行,首先是里面的乘法,这里是对所有的乘法可以进行并行。如果是M×P,P×N的矩阵,那末最多可以进行:M×N×P次乘法,如果没有超过GPU里面流媒体的处理器个数的话那末就能够同时履行,否者也只能满负荷运行。接着计算完这个以后就是加法的并行操作。用if是避免越界。

配置

在这里要特别说明的就是我们在传数据给从机的时候我们是传的1维数组,再通过传矩阵的维度来还原回2维数组。

配置文件的说明可以参考我之前的博客:请点击!
直接贴代码:

#include <CL/cl.h> #include <stdio.h> #include <stdlib.h> #include <time.h> #include <iostream> #include <fstream> using namespace std; #define NWITEMS 6 #pragma comment (lib,"OpenCL.lib") //把文本文件读入1个 string 中 int convertToString(const char *filename, std::string& s) { size_t size; char* str; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if (f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size + 1]; if (!str) { f.close(); return NULL; } f.read(str, fileSize); f.close(); str[size] = ''; s = str; delete[] str; return 0; } printf("Error: Failed to open file %s ", filename); return 1; } int main() { cl_uint status; cl_platform_id platform; //创建平台对象 status = clGetPlatformIDs(1, &platform, NULL); cl_device_id device; //创建 GPU 装备 clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); //创建context cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); //创建命令队列 cl_command_queue commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); if (commandQueue == NULL) perror("Failed to create commandQueue for device 0."); //建立要传入从机的数据 /******** 创建内核和内存对象 ********/ const int Ndim = 20; const int Mdim = 20; const int Pdim = 20; int szA = Ndim * Pdim; int szB = Pdim * Mdim; int szC = Ndim * Mdim; float *A; float *B; float *C; A = (float *)malloc(szA * sizeof(float)); B = (float *)malloc(szB * sizeof(float)); C = (float *)malloc(szC * sizeof(float)); int i, j; for (i = 0; i < szA; i++) A[i] = (float)((float)i + 1.0); for (i = 0; i < szB; i++) B[i] = (float)((float)i + 1.0); //创建3个 OpenCL 内存对象,并把buf1 的内容通过隐式拷贝的方式 //拷贝到clbuf1, buf2 的内容通过显示拷贝的方式拷贝到clbuf2 cl_mem memObjects[3] = { 0, 0, 0 }; memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)* szA, A, NULL); memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float)* szB, B, NULL); memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float)* szC, C, NULL); if (memObjects[0] == NULL || memObjects[1] == NULL ||memObjects[2] == NULL) perror("Error in clCreateBuffer. "); const char * filename = "Vadd.cl"; std::string sourceStr; status = convertToString(filename, sourceStr); if (status) cout << status << " !!!!!!!!" << endl; const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) }; //创建程序对象 cl_program program = clCreateProgramWithSource( context, 1, &source, sourceSize, NULL); //编译程序对象 status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (status) cout << status << " !!!!!!!!" <<endl; if (status != 0) { printf("clBuild failed:%d ", status); char tbuf[0x10000]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0x10000, tbuf, NULL); printf(" %s ", tbuf); //return ?1; } //创建 Kernel 对象 cl_kernel kernel = clCreateKernel(program, "matrix_mult", NULL); //设置 Kernel 参数 cl_int clnum = NWITEMS; status = clSetKernelArg(kernel, 0, sizeof(int), &Ndim); status = clSetKernelArg(kernel, 1, sizeof(int), &Mdim); status = clSetKernelArg(kernel, 2, sizeof(int), &Pdim); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &memObjects[0]); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &memObjects[1]); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &memObjects[2]); if (status) cout << "参数设置毛病" << endl; //履行 kernel size_t global[2]; cl_event prof_event; cl_ulong ev_start_time = (cl_ulong)0; cl_ulong ev_end_time = (cl_ulong)0; double rum_time; global[0] = (size_t)Ndim; global[1] = (size_t)Mdim; status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global, NULL, 0, NULL, &prof_event); if (status) cout << "履行内核时毛病" << endl; clFinish(commandQueue); //读取时间 status = clGetEventProfilingInfo(prof_event,CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong),&ev_start_time,NULL); status = clGetEventProfilingInfo(prof_event,CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&ev_end_time,NULL); if (status) perror("读取时间的时候产生毛病 "); rum_time = (double)(ev_end_time - ev_start_time); cout << "履行时间为:" << rum_time << endl; //数据拷回 host 内存 status = clEnqueueReadBuffer(commandQueue, memObjects[2],CL_TRUE, 0, sizeof(float)* szC, C,0, NULL, NULL); if (status) perror("读回数据的时候产生毛病 "); //结果显示 printf(" Array A: "); for (i = 0; i < Ndim; i++) { for (j = 0; j < Pdim; j++) printf("%.3f ", A[i*Pdim + j]); printf(" "); } printf(" Array B: "); for (i = 0; i < Pdim; i++) { for (j = 0; j < Mdim; j++) printf("%.3f ", B[i*Mdim + j]); printf(" "); } printf(" Array C: "); for (i = 0; i < Ndim; i++) { for (j = 0; j < Mdim; j++) printf("%.3f ", C[i*Mdim + j]); printf(" "); } cout << endl; if (A) free(A); if (B) free(B); if (C) free(C); //删除 OpenCL 资源对象 clReleaseMemObject(memObjects[2]); clReleaseMemObject(memObjects[1]); clReleaseMemObject(memObjects[0]); clReleaseProgram(program); clReleaseCommandQueue(commandQueue); clReleaseContext(context); system("pause"); return 0; }

效果

我演示1个4×5与5×6的矩阵的乘法:

这里写图片描述

请点击:参考文档

另外可以避免积分下载AMD OpenCL教程:点击进入下载

------分隔线----------------------------
------分隔线----------------------------

最新技术推荐