This article grew out of a recent experience that left me very frustrated.
I recently modified the open-source RNN toolkit CURRENNT (http://sourceforge.net/projects/currennt/) in order to implement RNNLM functionality with it.
CURRENNT makes heavy use of object-oriented programming techniques and can run on the GPU; it relies on the Thrust library (https://code.google.com/p/thrust/) for its vector operations.
RNNLM (http://rnnlm.org/) also has an open-source implementation, written in a very algorithm-oriented style; its vector operations are hand-implemented on plain arrays.
The result was far from what I expected: without the GPU, CURRENNT was painfully slow! I kept revising it until, in the end, I had almost completely rewritten RNNLM inside CURRENNT ... and only then did the speeds finally match. It took me a lot of time, and worst of all, it was time I had not planned to spend.
So here I do a simple evaluation of a few commonly used vector implementations, so that the next time I run into this I will at least know what to expect.
The vector implementations involved in the evaluation include:
- C++ array
- C++ STL vector
- C++ Thrust (CPU)
- C++ Thrust (GPU)
- Python
- Python numpy
- Python Theano
Evaluation metrics include:
- creating, populating vectors
- Vector dot product and element-wise vector multiplication
- Matrix multiplication
Test environment:
VS2010
Python 2.7.6
Intel Xeon CPU [email protected] x24
Thrust v1.5
C++ array
Create an all-zero vector: 0.000s (takes almost no time)
int vector_size=100000000;float* vector= (float*) calloc (vector_size,sizeof (float));
Create + fill vector: 0.140s
int vector_size=100000000;float* vector= (float*) calloc (vector_size,sizeof (float)); for (int i=0;i<vector_size;++ i) {vector[i]=0.01;}
Vector dot product: 0.390s
float sum=0;for (int i=0;i<vector_size;++i) {sum+=vector1[i]*vector2[i];}
Element-wise vector multiplication: 0.265s
float sum=0;for (int i=0;i<vector_size;++i) {vector3[i]=vector1[i]*vector2[i];}
Matrix-vector multiplication: 0.344s
int matrix1_colnum=50000;int matrix1_rownum=2000;int matrix1_size=matrix1_colnum*matrix1_rownum;float* vector1=(float*) calloc (matrix1_size,sizeof (float)); for (int i=0;i<matrix1_size;++i) {vector1[i]=0.01;} float* vector2=(float*) calloc (matrix1_colnum,sizeof (float)); for (int i=0;i<matrix1_colnum;++i) {vector2[i]=0.02;} start_t=clock (); float* vector3=(float*) calloc (matrix1_rownum,sizeof (float)); for (int row=0;row<matrix1_rownum;++row) {for (int col=0;col<matrix1_colnum;++col) {vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col];}} end_t=clock ();
Matrix-matrix multiplication: 0.749s
(The time taken is proportional to matrix1_rownum*matrix1_colnum*matrix2_colnum)
int matrix1_rownum=200;int matrix1_colnum=5000;int matrix1_size=matrix1_colnum*matrix1_rownum;float* vector1=(float*) calloc (matrix1_size,sizeof (float)); for (int i=0;i<matrix1_size;++i) {vector1[i]=0.01;} int matrix2_rownum=5000;int matrix2_colnum=200;int matrix2_size=matrix2_rownum*matrix2_colnum;float* vector2=(float*) calloc (matrix2_size,sizeof (float)); for (int i=0;i<matrix2_size;++i) {vector2[i]=0.02;} int matrix3_size=matrix1_rownum*matrix2_colnum;float* vector3=(float*) calloc (matrix3_size,sizeof (float)); start_t=clock (); for (int row1=0;row1<matrix1_rownum;++row1) {for (int col2=0;col2<matrix2_colnum;++col2) {for (int col1=0;col1<matrix1_colnum;++col1) {vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2];}}} end_t=clock ();
C++ STL vector
Create an all-zero vector: 0.140s
int vect_size=100000000;
vector<float> vector (vect_size);
Create + fill vector: 0.140s
int vect_size=100000000;vector<float> vector (vect_size,0.01);
Vector dot product: 0.375s
int vect_size=100000000;vector<float> vector1 (vect_size,0.01);vector<float> vector2 (vect_size,0.02); start_t=clock (); float sum=0;for (int i=0;i<vect_size;++i) {sum+=vector1[i]*vector2[i];} end_t=clock ();
Element-wise vector multiplication: 0.250s
int vect_size=100000000;vector<float> vector1 (vect_size,0.01);vector<float> vector2 (vect_size,0.02); vector<float> vector3 (vect_size); start_t=clock (); for (int i=0;i<vect_size;++i) {vector3[i]=vector1[i]*vector2[i];} end_t=clock ();
Matrix-vector multiplication: 0.390s
int matrix1_colnum=50000;int matrix1_rownum=2000;int matrix1_size=matrix1_colnum*matrix1_rownum;vector<float> vector1 (matrix1_size,0.01);vector<float> vector2 (matrix1_colnum,0.02);vector<float> vector3 (matrix1_rownum); start_t=clock (); for (int row=0;row<matrix1_rownum;++row) {for (int col=0;col<matrix1_colnum;++col) {vector3[row]+=vector1[row*matrix1_colnum+col]*vector2[col];}} end_t=clock ();
Matrix-matrix multiplication: 0.827s
int matrix1_rownum=200;int matrix1_colnum=5000;int matrix1_size=matrix1_colnum*matrix1_rownum;vector<float> vector1 (matrix1_size,0.01); int matrix2_rownum=5000;int matrix2_colnum=200;int matrix2_size=matrix2_rownum*matrix2_colnum;vector<float> vector2 (matrix2_size,0.02); int matrix3_size=matrix1_rownum*matrix2_colnum;vector<float> vector3 (matrix3_size); start_t=clock (); for (int row1=0;row1<matrix1_rownum;++row1) {for (int col2=0;col2<matrix2_colnum;++col2) {for (int col1=0;col1<matrix1_colnum;++col1) {vector3[row1*matrix2_colnum+col2]+=vector1[row1*matrix1_colnum+col1]*vector2[col1*matrix2_colnum+col2];}}} end_t=clock ();
C++ Thrust (CPU)
Create an all-zero vector: 0.140s
int vect_size=100000000;thrust::host_vector<float> Vector1 (vect_size);
Create + fill vector: 0.140s
int vect_size=100000000;thrust::host_vector<float> Vector1 (vect_size,0.01);
Fill vector: 0.078s
thrust::fill (Vector1.begin (), Vector1.end (), 0.01);
Vector dot product: 0.359s
int vect_size=100000000;thrust::host_vector<float> vector1 (vect_size, (float) 0.1);thrust::host_vector<float> vector2 (vect_size, (float) 0.2);thrust::host_vector<float> vector3 (vect_size, (float) 0.2); start_t=clock (); thrust::transform (vector1.begin (), vector1.end (), vector2.begin (), vector3.begin (), thrust::multiplies<float> ()); float sum=thrust::reduce (vector3.begin (), vector3.end (), (float) 0, thrust::multiplies<float> ()); end_t=clock ();
Element-wise vector multiplication: 0.187s
int vect_size=100000000;thrust::host_vector<float> vector1 (vect_size, (float) 0.1);thrust::host_vector<float> vector2 (vect_size, (float) 0.2);thrust::host_vector<float> vector3 (vect_size); start_t=clock (); thrust::transform (vector1.begin (), vector1.end (), vector2.begin (), vector3.begin (), thrust::multiplies<float> ()); end_t=clock ();
Matrix-vector multiplication: 0.110s
struct matrixxvect_func{thrust::host_vector<float>* matrix;thrust::host_vector<float>* vector;int matrix_rownum;int matrix_colnum;__host__ __device__ float operator () (const int& idx) const{float t=0;for (int col=0;col<matrix_colnum;++col) {t+= (*matrix) [idx*matrix_colnum+col]* (*vector) [col];} return t;}}; int matrix1_colnum=50000;int matrix1_size=matrix1_colnum*matrix1_rownum;thrust::host_vector<float> vector1 (matrix1_size, (float) 0.1);thrust::host_vector<float> vector2 (matrix1_colnum, (float) 0.2); thrust::host_vector<float> vector3 (matrix1_rownum); start_t=clock (); matrixxvect_func fn;fn.matrix=&vector1;fn.vector=&vector2;fn.matrix_rownum=matrix1_rownum;fn.matrix_colnum=matrix1_colnum;thrust::transform (thrust::counting_iterator<int> (0), thrust::counting_iterator<int> (0) + matrix1_rownum, vector3.begin (), fn); end_t=clock ();
Matrix-matrix multiplication: 0.655s
struct matrixxmatrix_func{thrust::host_vector<float>* matrix1;thrust::host_vector<float>* matrix2;int matrix1_rownum;int matrix1_colnum;int matrix2_rownum;int matrix2_colnum;__host__ __device__ float operator () (const int& idx) const{int rownum=idx/matrix2_colnum;int colnum=idx%matrix2_colnum;float t=0;for (int col=0;col<matrix1_colnum;++col) {t+= (*matrix1) [rownum*matrix1_colnum+col]* (*matrix2) [col*matrix2_colnum+colnum];} return t;}}; int matrix1_rownum=200;int matrix1_colnum=5000;int matrix1_size=matrix1_colnum*matrix1_rownum;thrust::host_vector<float> vector1 (matrix1_size, (float) 0.1); int matrix2_rownum=5000;int matrix2_colnum=200;int matrix2_size=matrix2_rownum*matrix2_colnum;thrust::host_vector<float> vector2 (matrix2_size, (float) 0.2); int matrix3_size=matrix1_rownum*matrix2_colnum;thrust::host_vector<float> vector3 (matrix3_size); start_t=clock (); matrixxmatrix_func fn;fn.matrix1=&vector1;fn.matrix2=&vector2;fn.matrix1_rownum=matrix1_rownum;fn.matrix1_colnum=matrix1_colnum;fn.matrix2_rownum=matrix2_rownum;fn.matrix2_colnum=matrix2_colnum;thrust::transform (thrust::counting_iterator<int> (0), thrust::counting_iterator<int> (0) + matrix3_size, vector3.begin (), fn); end_t=clock ();
C++ vs. Python vector computing speed evaluation