CUDA學習3 Max pooling （python c++ cuda）

最後更新：2017-02-12 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

標籤：http 最佳化 3.0 lob memcpy uda ros cost bottom

1.Python

在CNN4 參數最佳化中有一個CNN模型，其中的限速步是max pooling。

如下所示，Python中運行一個50*100*24*24的max pooling需要3秒。

import time

import numpy as np


def simple_pool(input, ds=(2, 2)):
    """Naive (pure-Python loop) max pooling over the last two axes.

    The window is moved with a stride equal to its own size, i.e. windows
    do not overlap. A trailing partial window at the bottom/right edge
    still produces one output element.

    Args:
        input: 4-D array indexed as [n, m, row, col].
        ds: (window_rows, window_cols); both must be positive.

    Returns:
        Float array of shape (n, m, ceil(rows / ds[0]), ceil(cols / ds[1])).
    """
    n, m, h, w = input.shape
    d, s = ds
    # Ceiling division. The previous `h / d + h % d` only equals the
    # ceiling when the window size is 2.
    zh = -(-h // d)
    zw = -(-w // s)
    z = np.zeros((n, m, zh, zw))
    for k in range(n):
        for o in range(m):
            for i in range(zh):
                for j in range(zw):
                    # Seed with the window's first element instead of a
                    # fixed -10000 sentinel, so very negative inputs work.
                    maxd = input[k, o, d * i, s * j]
                    for u in range(min(d, h - d * i)):
                        # Columns use the column window size `s`, not `d`.
                        for v in range(min(s, w - s * j)):
                            if input[k, o, d * i + u, s * j + v] > maxd:
                                maxd = input[k, o, d * i + u, s * j + v]
                    z[k, o, i, j] = maxd
    return z


def main():
    """Time simple_pool on a 50*100*24*24 input and print the first outputs."""
    N, M, H, W = [50, 100, 24, 24]
    a = np.arange(N * M * H * W).reshape((N, M, H, W)) * 0.01
    start_time = time.time()
    out_data = simple_pool(a)
    print("Cost:", time.time() - start_time, "s")
    print(out_data[0, 0, 0, :10])


if __name__ == "__main__":
    main()

# Sample output reported by the original author:
# Cost: 3.08899998665 s
# [ 0.25  0.27  0.29  0.31  0.33  0.35  0.37  0.39  0.41  0.43]

2.C++

採用c++，僅需16~30ms。

#include<iostream>#include<windows.h>void MaxPool2d(const float* const bottom_data, const int num, const int channels,    const int height, const int width, const int pooled_height,float* top_data){    const int w = width;    const int h = height;    const int m = channels;    const int n = num;    const int d = pooled_height;    const int zh = h / d + h % d;    const int zw = w / d + w % d;    int i,j,k,o,u,v,index,index2=0;    float s;    for (k = 0; k < n; ++k)        for (o = 0; o < m; ++o)            for (i = 0; i < zh; ++i)                for (j = 0; j < zw; ++j)                {                    index=k*m*h*w+o*h*w+d*i*w+d*j;                    s=-10000.0;                    for (u = 0; u < d&&(u+d*i)<h; ++u)                        for (v = 0; v < d&&(v+d*j)<w; ++v)                            if (*(bottom_data+index+u*w+v)>s)                                s=*(bottom_data+index+u*w+v);                    *(top_data+index2)=s;                    ++index2;                }}int main(){  const int N=50,M=100,H=24,W=24,P=(H+1)/2;  float mul_min=0.01;  float *input,*output;  input=new float [N*M*H*W*sizeof(float)];  output=new float [N*M*P*P*sizeof(float)];  for(int i=0;i<N*M*H*W;i++)    *(input+i)=i*mul_min;  DWORD start_time=GetTickCount();  MaxPool2d(input,N,M,H,W,2,output);  DWORD end_time=GetTickCount();  std::cout<<"Cost: "<<end_time-start_time<<"ms."<<std::endl;  for(int i=0;i<10;i++)    std::cout<<*(output+i)<<std::endl;  delete []input;  delete []output;}/*Cost: 16ms.0.250.270.290.310.330.350.370.390.410.43*/

3.CUDA

在N=50時為16ms，N=500時為141ms（C++中為218ms），僅略有提升。推測是計算本身變快了，但主機與GPU之間的資料傳輸帶來了額外開銷（計時範圍包含將結果從裝置拷回主機的時間）。

#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

// Abort with file/line and the CUDA error string if a runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                         cudaGetErrorString(err_));                           \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

/**
 * Non-overlapping 2-D max pooling with a square window.
 *
 * Launch layout: a 2-D grid where blockIdx.x selects the image and
 * blockIdx.y the channel (gridDim.y is used as the channel count). The
 * threads of a block (threadIdx.x only) share the output elements of that
 * one plane through a block-stride loop, so any 1-D block size, including 1,
 * gives the same result. No shared memory is used.
 *
 * @param bottom_data   Device input, gridDim.x * gridDim.y * height * width floats.
 * @param height        Rows per input plane.
 * @param width         Columns per input plane.
 * @param pooled_height Side length of the pooling window (must be > 0).
 * @param out_height    Output rows per plane; also used as the number of
 *                      output columns, so the output plane is
 *                      out_height x out_height.
 * @param top_data      Device output,
 *                      gridDim.x * gridDim.y * out_height * out_height floats.
 */
__global__ void MaxPool2d(const float* __restrict__ bottom_data, const int height, const int width,
    const int pooled_height, const int out_height, float* __restrict__ top_data)
{
    const int out_count = out_height * out_height;
    const size_t plane = static_cast<size_t>(blockIdx.x) * gridDim.y + blockIdx.y;
    const float* src = bottom_data + plane * height * width;
    float* dst = top_data + plane * out_count;

    for (int idx = threadIdx.x; idx < out_count; idx += blockDim.x)
    {
        const int i = idx / out_height;
        const int j = idx - i * out_height;
        const float* win = src + static_cast<size_t>(i) * pooled_height * width + j * pooled_height;
        // -infinity instead of a fixed -10000 sentinel, so very negative
        // inputs are handled; the bounds guards below keep every read
        // inside the current plane's rows and columns.
        float s = -INFINITY;
        for (int u = 0; u < pooled_height && (u + pooled_height * i) < height; ++u)
            for (int v = 0; v < pooled_height && (v + pooled_height * j) < width; ++v)
            {
                const float val = win[u * width + v];
                if (val > s)
                    s = val;
            }
        dst[idx] = s;
    }
}

int main()
{
    const int N = 500, M = 100, H = 24, W = 24, D = 2;
    // Ceiling division; `H / D + H % D` is only correct for D == 2.
    const int PH = (H + D - 1) / D;
    const int kThreadsPerBlock = 128;

    // Element counts and byte sizes are kept apart: the host buffers were
    // previously allocated with the byte size as the element count.
    const size_t image_count = static_cast<size_t>(N) * M * H * W;
    const size_t out_count = static_cast<size_t>(N) * M * PH * PH;
    const size_t image_size = image_count * sizeof(float);
    const size_t out_size = out_count * sizeof(float);
    const float mul_by = 0.01f;

    std::vector<float> input(image_count);
    std::vector<float> output(out_count);
    for (size_t i = 0; i < image_count; i++)
        input[i] = static_cast<float>(i) * mul_by;

    float* dev_input = nullptr;
    float* dev_output = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&dev_output, out_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_input, image_size));
    CUDA_CHECK(cudaMemcpy(dev_input, input.data(), image_size, cudaMemcpyHostToDevice));

    const dim3 grid(N, M);
    // The timed region covers the kernel and the device-to-host copy; the
    // blocking cudaMemcpy also waits for the kernel to finish.
    const auto start_time = std::chrono::steady_clock::now();
    MaxPool2d<<<grid, kThreadsPerBlock>>>(dev_input, H, W, D, PH, dev_output);
    CUDA_CHECK(cudaGetLastError());  // reports an invalid launch configuration
    CUDA_CHECK(cudaMemcpy(output.data(), dev_output, out_size, cudaMemcpyDeviceToHost));
    const auto end_time = std::chrono::steady_clock::now();

    std::cout << "Cost: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count()
              << "ms." << std::endl;
    for (int i = 0; i < 10; i++)
        std::cout << output[i] << std::endl;

    CUDA_CHECK(cudaFree(dev_input));
    CUDA_CHECK(cudaFree(dev_output));
#ifdef _WIN32
    system("pause");
#endif
    return 0;
}
/*
Sample output reported by the original author (measured with a
one-thread-per-block launch, <<<grid,1>>>):
Cost: 141ms.
0.25
0.27
0.29
0.31
0.33
0.35
0.37
0.39
0.41
0.43
*/

CUDA學習3 Max pooling （python c++ cuda）

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More

CUDA學習3 Max pooling （python c++ cuda）

聯繫我們

A Free Trial That Lets You Build Big!

Sales Support

After-Sales Support