Parallel Computing: Cannon's Algorithm (MPI Implementation)

Source: Internet
Author: User

The principle is not explained in detail here; the code is given directly, with a brief orientation below.

The commented-out statements left in the source can be re-enabled to print intermediate results and check whether the computation is correct.
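In brief: with nprocs processes arranged as a root x root grid (root = sqrt(nprocs)), process (i, j) holds one block of A and one block of B. The program proceeds in four phases: process 0 scatters the blocks (scatter_matrix), the blocks are aligned (shuffle), each process runs root rounds of multiply-and-shift (cannon), and process 0 collects and writes the result (gather_matrix). After alignment, process (i, j) holds blocks A(i, (i+j) mod root) and B((i+j) mod root, j); each round multiplies the resident blocks into the local block of C, then shifts A one step left and B one step up, so over the root rounds process (i, j) accumulates

    C(i, j) = sum over k of A(i, k) * B(k, j),

which is exactly its block of the product.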

# Include "MPI. H "# include <math. h> # include <stdio. h> # include <stdlib. h> # include <string. h> void scatter_matrix (int * fstream, int N1, int N2, int * q, int root, int tag) {/* size of each matrix block */INT rows = (N1 + root-1)/root; int Cols = (n2 + root-1)/root; int * tmp_matrix = (int *) malloc (rows * Cols * sizeof (INT); int I, j; memset (Q, 0, rows * Cols * sizeof (INT); for (I = 0; I <root; I ++) {for (j = 0; j <root; j ++) {int p = 0, q = 0; int Imin = I * Rows * N2; int jmin = J * C OLS; memset (tmp_matrix, 0, sizeof (tmp_matrix);/* when dividing a matrix, an array needs to be continuously stored due to the discontinuous ground space, to facilitate calling mpi_send */For (P = 0; P <rows; P ++, Imin + = n2) {for (q = 0; q <Cols; q ++) {tmp_matrix [p * Cols + q] = fstream [Imin + jmin + q] ;}} if (I = 0 & J = 0) {/* process 0 does not need to use mpi_send to send data to itself. You can directly use memcpy to copy the result. */memcpy (Q, tmp_matrix, rows * Cols * sizeof (INT);} else {/* sends the part to the process located in row I and column J */mpi_send (tmp_matrix, rows * cols, mpi_int, I * root + J, Tag, mpi_comm_world) ;}}}/** @ row: Matrix Row * @ Col: column of the Matrix * @ SP: sp = root = SQRT (nprocs) * @ return calculate the actual process number based on the row and column numbers */INT get_index (INT row, int Col, int SP) {int TMP = (row + SP) % sp) * SP + (COL + SP) % sp; return TMP;}/* calculate the matrix multiplication and save the result to C */void matrix_multi (int * a, int * B, int * C, int N1, int N2, int N3, int myid) {int I = 0, j = 0, K = 0; int * tmp_c = (int *) malloc (N1 * N3 * sizeof (INT); memset (tmp_c, 0, sizeof (INT) * N1 * N3); for (I = 0; I <N1; I ++) {for (j = 0; j <N3; j ++) {for (k = 0; k <N2; k ++) {tmp_c [I * N3 + J] + = A [I * N2 + k] * B [K * N3 + J];} c [I * N3 + J] + = tmp_c [I * N3 + J] ;}}/* used to locate alignment of matrix subscript */void shuffle (int *, int * buf_a, int buf_a_size, int * B, int * buf_ B, int buf_ B _size, int root, int myid) {int I, j; mpi_status status; int cur_col = 0; int cur_row = 0;/* obtain the row number and column number of the current process by calculating the process Number */cur_row = myid/root; cur_col = myid-cur_row * root;/* for matrix, the matrix of row I needs to be translated to the left for I times */for (I = 0; I <cur_row; I ++) {/* receives data from the right, and send the current matrix to the process on the left */mpi_sendrecv (A, buf_a_size, mpi_int, get _ Index (cur_row, cur_col-1, root), 102, buf_a, buf_a_size, mpi_int, get_index (cur_row, cur_col + 1, root), 102, mpi_comm_world, & status); memcpy (, buf_a, buf_a_size * sizeof (INT);/* buf_a cache matrix for communication */memset (buf_a, 0, buf_a_size * sizeof (INT ));} /* for matrix B, the matrix in column J needs to be translated J times */For (j = 0; j <cur_col; j ++) {/* receives data from the bottom and sends the current matrix to the top process */mpi_sendrecv (B, buf_ B _size, mpi_int, get_index (cur_row-1, cur_col, root), 103, buf_ B, buf_ B _size, mpi_int, get_inde X (cur_row + 1, cur_col, root), 103, mpi_comm_world, & status); memcpy (B, buf_ B, buf_ B _size * sizeof (INT )); /* buf_ B cache matrix for communication */memset (buf_ B, 0, buf_ B _size * sizeof (INT);}/* printf ("I have shuffled! 
\ N "); */} void cannon (int * a, int * buf_a, int buf_a_size, int * B, int * buf_ B, int buf_ B _size, int * C, int buf_c_size, int row_a, int col_a, int col_ B, int root, int myid) {mpi_status status; double elapsed_time, multiply_time = 0, passdata_time = 0; int I, j; memset (C, 0, sizeof (INT) * buf_c_size); int cur_col = 0; int cur_row = 0; /* obtain the row number and column number of the current process by calculating the process Number */cur_row = myid/root; cur_col = myid-cur_row * root; for (I = 0; I <root; I ++) {/* A total of Root Loops are required, Ro Ot = SQRT (nprocs) */elapsed_time = mpi_wtime (); matrix_multi (a, B, c, row_a, col_a, col_ B, myid ); /* calculate matrix multiplication */elapsed_time = mpi_wtime ()-elapsed_time; multiply_time + = elapsed_time;/* elapsed_time = mpi_wtime (); * // * receives data from the right side (row, col + 1), and send the current matrix to the left (row, col-1) process */mpi_sendrecv (A, buf_a_size, mpi_int, get_index (cur_row, cur_col-1, root), 102, buf_a, buf_a_size, mpi_int, get_index (cur_row, cur_col + 1, root), 102, mpi_comm_world, & status ); /* Receives data from the bottom (row + 1, col) and sends the current matrix to the top (Row-1, col) process */mpi_sendrecv (B, buf_ B _size, mpi_int, get_index (cur_row-1, cur_col, root), 103, buf_ B, buf_ B _size, mpi_int, get_index (cur_row + 1, cur_col, root), 103, mpi_comm_world, & status ); /* elapsed_time = mpi_wtime ()-elapsed_time; passdata_time + = elapsed_time; */memcpy (B, buf_ B, buf_ B _size * sizeof (INT )); /* copy the data in buf_ B to B */memcpy (A, buf_a, buf_a_size * sizeof (INT )); /* copy the data in buf_a to */}/* The calculation result is sent to the array C */mpi_send (C, row_a * col_ B, mpi_int, 0,104, mpi_comm_world); printf ("proc: % d, passdata time: % lf multiply time: % lf \ n ", myid, passdata_time, multiply_time);} void gather_matrix (int * fstream, int N1, int N3, int * C, int root, file * FHC) {mpi_status status; int rows = (N1 + root-1)/root; int Cols = (N3 + root-1)/root; int * tmp_matrix = (int *) malloc (rows * Cols * sizeof (INT); int I, j; for (I = 0; I <root; I ++) {for (j = 0; j <root; j ++) {in T p, q; int Imin = I * Rows * N3; int jmin = J * Cols; memset (tmp_matrix, 0, sizeof (tmp_matrix )); /* receive data from various processes */mpi_recv (tmp_matrix, rows * cols, mpi_int, I * root + J, 104, mpi_comm_world, & status ); /* printf ("I am passed proc: % d \ n", I * root + J); * // * Concatenates the received matrix TMP into matrix C, splicing is required in a proper order. 
Otherwise, an error will occur */For (P = 0; P <rows; P ++, Imin + = N3) {for (q = 0; q <Cols; q ++) {fstream [Imin + jmin + q] = tmp_matrix [p * Cols + q];/* printf ("% d ", (int *) fstream) [Imin + jmin + q]); * //}/* PR INTF ("\ n"); */}/* print the result to the file */for (I = 0; I <N1; I ++) {for (j = 0; j <N3; j ++) {fprintf (FHC, "% d", fstream [I * N3 + J]);} fprintf (FHC, "\ n") ;}} int main (INT argc, char ** argv) {int myid, numprocs; int I, j; mpi_status status; int root = 0; int dim [3]; double elapsed_time = 0; int max_rows_a, max_cols_a, numeric, max_cols_ B; int buf_a_size, buf_ B _size, buf_c_size; file * FHC;/* suppose a: N1 * N2, b: N2 * N3; N1, N2, N3 are read from input file */int N1, N2, N3;/* buffer for matrix A, B, C will be shifted, so they each have two buffer */int * a, * B, * C, * buf_a, * buf_ B;/* On proc0, buffers to cache matrix files of A, B and C */int * fstream_a = NULL, * fstream_ B = NULL, * fstream_c = NULL; mpi_init (& argc, & argv);/* initialize */mpi_comm_rank (mpi_comm_world, & myid);/* obtain the current process ID */mpi_comm_size (mpi_comm_world, & numprocs ); /* obtain the total number of processes */root = SQRT (numprocs); If (numprocs! = Root * root) {/* if the total number of processes is not the Worker Number, end the Program */printf ("process number must be a squre! \ N "); exit (-1);}/* On proc0, preprocess the command line, read in file for, B and put their sizes in dim [] */If (myid = 0) {file * file_a, * file_ B, * file_c; int N1, N2, N3; int I, j; file_a = fopen (argv [1], "R");/* Open File, the file name is obtained from the parameter given at runtime */file_ B = fopen (argv [2], "R");/* Open File B, the file name is obtained from the parameter given at runtime */fscanf (file_a, "% d", & N1, & N2 ); /* read the number of rows of matrix A from File A, number of columns */fscanf (file_ B, "% d", & N2, & N3 ); /* read the number of rows in matrix B from file B. The number of columns */dim [0] = N1, dim [1] = n2, dim [2] = N3; FS Tream_a = (int *) malloc (N1 * N2 * sizeof (INT);/* allocates a piece of memory for reading matrix A into memory */fstream_ B = (int *) malloc (N2 * N3 * sizeof (INT);/* allocates a piece of memory for reading matrix B into the memory * // * printf ("Yeah! 
I got n1 = % d, n2 = % d, N3 = % d \ n ", N1, N2, N3); * // * read matrix, save in fstream_a */for (I = 0; I <N1; I ++) for (j = 0; j <N2; j ++) fscanf (file_a, "% d", & (int *) fstream_a) [I * N2 + J]);/* read the matrix B and save it in fstream_ B */for (I = 0; I <N2; I ++) for (j = 0; j <N3; j ++) fscanf (file_ B, "% d", & (int *) fstream_ B) [I * N3 + J]);}/* broadcast the number of rows and columns of the Matrix to all processes through bcast */mpi_bcast (dim, 3, mpi_int, 0, mpi_comm_world ); n1 = dim [0], n2 = dim [1], N3 = dim [2];/* begin new version */max_rows_a = (N1 + root-1)/root; /* submoment Number of rows of array a */max_cols_a = (n2 + root-1)/root;/* Number of columns of child matrix A */max_rows_ B = max_cols_a; /* number of rows of child matrix block B */max_cols_ B = (N3 + root-1)/root;/* Number of columns of child matrix block B */buf_a_size = max_rows_a * max_cols_a; /* size of child matrix block A */buf_ B _size = max_rows_ B * max_cols_ B;/* size of child matrix block B */buf_c_size = max_rows_a * max_cols_ B; /* size of the sub-matrix block C * // * allocate memory space to a, buf_a, buf_ B, B, and C, where buf_a, buf_ B is used for communication caching */A = (int *) malloc (sizeof (INT) * buf_a_size); buf_a = (int *) malloc (sizeof (INT) * buf_a_size ); B = (int *) m Alloc (sizeof (INT) * buf_ B _size); buf_ B = (int *) malloc (sizeof (INT) * buf_ B _size); C = (int *) malloc (sizeof (INT) * buf_c_size); if (a = NULL | buf_a = NULL | B = NULL | buf_ B = NULL | C = NULL) {/* If the memory application fails, exit */printf ("memory allocation failed! \ N "); exit (-1);}/* proc 0 scatter A, B to other procs in a 2D block distribution fashion */If (myid = 0) {/* printf ("max_rows_a: % d \ n", max_rows_a); printf ("max_cols_a: % d \ n", max_cols_a); printf ("max_rows_ B: % d \ n ", max_rows_ B); printf (" max_cols_ B: % d \ n ", max_cols_ B); * // * process 0 divides the matrix A and B into small blocks, distribute to other processes */scatter_matrix (int *) fstream_a, N1, N2, A, root, 100);/* printf ("I am debuging! \ N "); */scatter_matrix (int x) fstream_ B, N2, N3, B, root, 101);/* printf (" I am finding fault! \ N "); */} else {/* Other processes receive matrices A, B */mpi_recv (A, max_rows_a * max_cols_a, mpi_int, 0,100, mpi_comm_world, & status); mpi_recv (B, max_rows_ B * max_cols_ B, mpi_int, 0,101, mpi_comm_world, & status);} mpi_barrier (mpi_comm_world. * // * Printf ("I am proc % d \ n", myid); for (I = 0; I <max_rows_a; I ++) {printf ("% d: ", myid); For (j = 0; j <max_cols_a; j ++) {printf (" % d ", a [I * max_cols_a + J]);} printf ("\ n");} printf ("I am proc % d \ n", myid); for (I = 0; I <max_rows_ B; I ++) {printf ("% d:", myid); For (j = 0; j <max_cols_ B; j ++) {printf ("% d ", B [I * max_cols_ B + J]);} printf ("\ n ");} * // * compute c = a * B by Cannon algorithm * // * The matrix block must be positioned and aligned. 
preprocessing is performed first */shuffle (A, buf_a, buf_a_size, B, buf_ B, buf_ B _ Size, root, myid); elapsed_time = mpi_wtime ();/* contains all cannon content */cannon (A, buf_a, buf_a_size, B, buf_ B, buf_ B _size, C, buf_c_size, latency, max_cols_a, max_cols_ B, root, myid); latency (mpi_comm_world); elapsed_time = mpi_wtime ()-elapsed_time;/* calculate the actual time consumed by the Cannon algorithm */latency ); /* wait for all processes to complete the Cannon algorithm and send the result to the process 0 */INT fsize_c = sizeof (INT) * N1 * N3; If (myid = 0) {/* process 0 creates a file write handle and prepares to write the computing result to the file */If (! (FHC = fopen (argv [3], "W") {printf ("Cant't open file % s \ n", argv [3]); mpi_finalize ();} fstream_c = (int *) malloc (fsize_c);/* process 0 receives the result matrix from each process, concatenates it into a complete result, and writes it to the file, result of persistent data */gather_matrix (fstream_c, N1, N3, C, root, FHC);} mpi_barrier (mpi_comm_world ); /* Make sure proc 0 read all it needs */If (myid = 0) {int I, j; printf ("Cannon algorithm: multiply a % d * % d with a % d * % d, use % lf (s) \ n ", N1, N2, N2, N3, elapsed_time ); /* printf ("I ha Ve finished! \ N "); */fclose (FHC);/* close the file read/write handle * // * release the applied memory space */free (fstream_a); free (fstream_ B ); free (fstream_c);}/* release applied memory space */free (a); free (buf_a); free (B); free (buf_ B); free (C ); mpi_finalize (); Return 0 ;}
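To run the program, each input file must start with the matrix dimensions followed by the entries, matching the fscanf calls in main(). Below is a minimal sketch of a session; the file names A.txt, B.txt, C.txt and the 4-process run are illustrative, and any MPI toolchain providing mpicc and mpirun should work.

A.txt:
2 2
1 2
3 4

B.txt:
2 2
5 6
7 8

Compile and run on 4 processes (a 2 x 2 process grid):

mpicc -o cannon cannon.c -lm
mpirun -np 4 ./cannon A.txt B.txt C.txt

C.txt should then contain the product, 19 22 on the first line and 43 50 on the second. Note that although the block sizes are computed with ceiling division, the packing and stitching loops assume the dimensions divide evenly by root, so choose matrix sizes that are multiples of sqrt(nprocs).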
