C/++/# MKL vs GSL speed

  1. Feb 3, 2017 #1
    I have two codes in C++ that both work, yet I cannot figure out why one is so much faster than the other. To my knowledge, BLAS through MKL (Intel) should be much faster than GSL (GNU), but my code shows quite the opposite. Here are the codes themselves: I simply create two matrices at the master node and then send different rows to different "slave" processors (with OpenMPI), which compute the corresponding rows of the result matrix and return them to the master node.

    GSL example (the fast code):

    Code (Text):

        #include <iostream>
        #include <stdio.h>
        #include <iostream>
        #include <cmath>
        #include <mpi.h>
        #include <gsl/gsl_blas.h>
        using namespace std;
     
        int main(int argc, char** argv){
            int noprocs, nid;
            MPI_Status status;
            MPI_Init(&argc, &argv);
            MPI_Comm_rank(MPI_COMM_WORLD, &nid);
            MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
            int master = 0;
         
            const int nsame = 1000; //must be same if matrices multiplied together = acols = brows
            const int arows = 1000;
            const int bcols = 1000;
            int rowsent;
            double * buff;
            buff = new double [nsame];
            double * b;
            b = new double [nsame*bcols];
       
            double** c = new double*[arows];
            for(int i = 0; i < arows; ++i)
                c[i] = new double[bcols];
       
            double * CC;
            CC = new double [1*bcols]; //one row of the result matrix C, with bcols entries
            for (int i = 0; i < bcols; i++){
                CC[i] = 0.;
            }; //this is simply a 1-d array of zeros which will be updated and passed back by the processors
            // Master part
            if (nid == master ) {
           
                double** a = new double*[arows];
                for(int i = 0; i < arows; ++i){
                    a[i] = new double[nsame];}
           
                for (int i = 0; i < arows; i++){
                    for (int j = 0; j < nsame; j++){
                        if (i == j)
                            a[i][j] = 1.;
                        else
                            a[i][j] = 0.;
                    }
                }
                for (int i = 0; i < (nsame*bcols); i++){
                    b[i] = (10.*i + 3.)/(3.*i - 2.);
                }
                MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes b is stored as a contiguous block of memory
                // send one row to each slave tagged with row number, assume nprocs<nrows
                rowsent=0;
                for (int i=1; i < (noprocs); i++) { //must be equal to noprocs otherwise it will not send to 3
                    MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
                    rowsent++;
                }
           
                for (int i=0; i<arows; i++) {
                    MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
                             MPI_COMM_WORLD, &status);
                    int sender = status.MPI_SOURCE;
                    int anstype = status.MPI_TAG;            //row number+1
                    int IND_I = 0;
                    while (IND_I < bcols){
                        c[anstype - 1][IND_I] = CC[IND_I];
                        IND_I++;
                    }
                    if (rowsent < arows) {
                        MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
                        rowsent++;
                    }
                    else {       // tell sender no more work to do via a 0 TAG
                        MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
                    }
                }
            }
       
            // Slave part
            else {
                MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
                MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
                while(status.MPI_TAG != 0) {
                    int crow = status.MPI_TAG;
                    gsl_matrix_view AAAA = gsl_matrix_view_array(buff, 1, nsame);
                    gsl_matrix_view BBBB = gsl_matrix_view_array(b, nsame, bcols);
                    gsl_matrix_view CCCC = gsl_matrix_view_array(CC, 1, bcols);
               
                    /* Compute C = A B */
                    gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, &AAAA.matrix, &BBBB.matrix,
                                    0.0, &CCCC.matrix);
                    MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
                    MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
                    //            cout << ans << " OUTPUT \n";
                }
            }
         
            MPI_Finalize();
            return 0;
        };
     
    MKL example (the slow code):

    Code (Text):

        #include <iostream>
        #include <stdio.h>
        #include <iostream>
        #include <cmath>
        #include <mpi.h>
        #include </opt/intel/compilers_and_libraries_2017.1.126/mac/mkl/include/mkl.h>
        using namespace std;

        int main(int argc, char** argv){ //THE IDENTITY MATRIX ONLY WORKS IF arows = nsame!
            int noprocs, nid;
            MPI_Status status;
            MPI_Init(&argc, &argv);
            MPI_Comm_rank(MPI_COMM_WORLD, &nid);
            MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
            int master = 0;
           
            const int nsame = 1000;
            const int arows = 1000;
            const int bcols = 1000;
            int rowsent;
            double * buff;
            buff = new double [nsame];
            double * b;
            b = new double [nsame*bcols];
         
            double** c = new double*[arows];
            for(int i = 0; i < arows; ++i)
                c[i] = new double[bcols];
         
            double * CC;
            CC = new double [1*bcols];
            for (int i = 0; i < bcols; i++){
                CC[i] = 0.;
            };
            // Master part
            if (nid == master ) {
             
                double** a = new double*[arows];
                for(int i = 0; i < arows; ++i){
                    a[i] = new double[nsame];}
             
                for (int i = 0; i < arows; i++){
                    for (int j = 0; j < nsame; j++){
                        if (i == j)
                            a[i][j] = 1.;
                        else
                            a[i][j] = 0.;
                    }
                }
                for (int i = 0; i < (nsame*bcols); i++){
                    b[i] = (10.*i + 3.)/(3.*i - 2.); // = 1.*i as test value
                }
                MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes b is stored as a contiguous block of memory; assume nprocs<nrows
                delete[] b;
                rowsent=0;
                for (int i=1; i < (noprocs); i++) { //must be equal to noprocs otherwise it will not send to 3
                    MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
                    delete[] a[rowsent];
                    rowsent++;
                }
             
                for (int i=0; i<arows; i++) {
                    MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
                             MPI_COMM_WORLD, &status);
                    int sender = status.MPI_SOURCE;
                    int anstype = status.MPI_TAG;            //row number+1
                    int IND_I = 0;
                    while (IND_I < bcols){
                        c[anstype - 1][IND_I] = CC[IND_I];
                        IND_I++;
                    }
                    if (rowsent < arows) {
                        MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
                        delete[] a[rowsent];
                        rowsent++;
                    }
                    else {       // tell sender no more work to do via a 0 TAG
                        MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
                    }
                }
            }
         
            // Slave part
            else {
                MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
                MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
                while(status.MPI_TAG != 0) {
                    int crow = status.MPI_TAG;
                 
                    /* Compute C = A B */
                    cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, bcols, nsame, 1.0, buff, nsame, b, bcols,
                                 0.0, CC, bcols);
                   
                    MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
                    MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
                }
            }
           
            MPI_Finalize();
            return 0;
        };
     

    I was thinking it might be because I never delete any of the arrays created with `new`, although I use essentially the same approach to initialize the arrays in both codes. I even tried deleting arrays in the MKL code (as shown), yet this appears to have little effect. To clarify: the MKL example was already much slower than the GSL example when neither code deleted any arrays; I only added the `delete` statements to the MKL example in the hope of speeding it up, and with or without them it is still slower. When I increase the size of the arrays from `nsame = arows = bcols = 1000` to `nsame = arows = bcols = 10000`, the time difference becomes obvious: the GSL code takes approximately 45 seconds while the MKL code takes quite a few minutes. So I am wondering whether this is simply inherent to the way GSL and MKL are designed and incorporated in my code, or whether something more subtle is going on.
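
    For what it's worth, here is the kind of standalone check I have in mind to separate the BLAS cost from the MPI traffic: time a single (1 x nsame) * (nsame x bcols) dgemm on its own, once built against GSL's CBLAS and once against MKL. This is only a sketch using `std::chrono` (not the exact code from my runs), and which header gets included of course depends on which library is linked:

    Code (Text):

        #include <chrono>
        #include <iostream>
        #include <vector>
        //#include <gsl/gsl_cblas.h>   // when building against GSL's CBLAS
        #include <mkl.h>               // when building against MKL

        int main(){
            const int nsame = 1000;
            const int bcols = 1000;
            std::vector<double> row(nsame, 1.0);      // one row of A
            std::vector<double> B(nsame*bcols, 0.5);  // all of B, stored row-major
            std::vector<double> C(bcols, 0.0);        // the resulting row of C

            auto t0 = std::chrono::steady_clock::now();
            // same call shape as in the slave loop: (1 x nsame) * (nsame x bcols)
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        1, bcols, nsame,
                        1.0, row.data(), nsame,
                        B.data(), bcols,
                        0.0, C.data(), bcols);
            auto t1 = std::chrono::steady_clock::now();

            std::cout << "one-row dgemm took "
                      << std::chrono::duration<double>(t1 - t0).count()
                      << " s, C[0] = " << C[0] << "\n";
            return 0;
        }

    One more thing I have not ruled out is MKL's internal threading: GSL's CBLAS is single-threaded, while MKL may spawn its own team of threads inside every MPI rank, so the ranks could end up fighting over the same cores. If that turns out to matter, pinning MKL to one thread per rank (or setting MKL_NUM_THREADS=1 in the environment) should change the picture; the sketch below is what I mean, with the rest of the program unchanged:

    Code (Text):

        #include <mpi.h>
        #include <mkl.h>

        int main(int argc, char** argv){
            MPI_Init(&argc, &argv);

            // one MKL thread per MPI rank, so the ranks do not oversubscribe the cores
            mkl_set_num_threads(1);

            // ... rest of the master/slave code as above ...

            MPI_Finalize();
            return 0;
        }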
     