Is LAPACK slow on my system? And if so why?

In summary, the thread discusses LAPACK eigenvalue timings measured on a 2.7 GHz CPU, which are slower than expectations drawn from published benchmarks. The replies point out that LAPACK and BLAS are only interfaces with many implementations, and that an unoptimized reference build may explain the gap; trying OpenBLAS, Intel MKL, or ATLAS is suggested. It is also mentioned that a CUDA-capable GPU can provide significantly faster speeds for certain tasks.
  • #1
Angelos K
I am a programming newbie, I hope I don't give you too much trouble.

I use LAPACK on my 2.7GHz CPU.

My expectations on the running times are based on Figure 4.1 in this dedicated publication, as well as on Fanfan's answer to this Stack Overflow post:
Both computers mentioned there (and especially the 2.2 GHz Opteron) should be quite a bit slower than mine. Yet the Opteron seems to need less than a second for a typical random 1000×1000 symmetric matrix, while my CPU needs about 2.2 seconds on average, with very small variations.

Have I overlooked some feature my implementation should have in order to match the Opteron?

The C++ code I use to measure the times is this:
Code:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>       /* time(), used for seeding the RNG */
#include <fstream>
#include <vector>
#include <algorithm>    /* std::copy */
#include <sys/time.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_matrix.h>
/* Parameters */
#define N 1000
#define REPS 100
#define LDA N
#define TOWERSIZE N
struct TOWER
{
   gsl_matrix *matrix_random_covariance;
};
int COMPUTE_RANDOM_COVARIANCE(struct TOWER *tower);
int WRITE_RANDOM_COVARIANCE(struct TOWER *tower);
int read_covariance (std::vector<double> & data)
  {
  double tmp;
  std::ifstream fin("random_covariance.data");
  while(fin >> tmp)
  {
  data.push_back(tmp);
  }
  return 0;
}
/* DSYEV prototype */
extern "C"{
void dsyev( char* jobz, char* uplo, int* n, double* a, int* lda,
  double* w, double* work, int* lwork, int* info );
}
/* Auxiliary routines prototypes */
extern "C"{
void print_matrix( char* desc, int m, int n, double* a, int lda );
}
/* Main program */
int main() {
     std::vector<double> time_series;
   double time_series_sum =0.0;
   double time_series_average =0.0;
     struct TOWER *tower;
     tower = (struct TOWER *)calloc(1,sizeof(struct TOWER));
     double* work;
  for (unsigned int repj = 0; repj < REPS; repj++)  /* 'uint' is not standard C++ */
     {
     COMPUTE_RANDOM_COVARIANCE(tower);
     WRITE_RANDOM_COVARIANCE(tower);
  printf( "-----------------------------------------------> Entry main.\n" );
  /* Locals */
     std::vector<double> data;
  int n = N, lda = LDA, info, lwork;
  double wkopt;
  /* Local arrays */
   double w[N];
   static double a[LDA*N];
  printf( "-----------------------------------------------> Point 0.\n" );
   read_covariance(data);
  printf( "-----------------------------------------------> Point 1.\n" );
  std::copy(data.begin(), data.end(), a);
  printf( "-----------------------------------------------> Point 2.\n" );
 
   struct timeval tval_before, tval_after, tval_result;
   gettimeofday(&tval_before, NULL);
 
   /* Executable statements */
  printf( " DSYEV Example Program Results\n" );
  /* Query and allocate the optimal workspace */
  lwork = -1;
  dsyev( "Vectors", "Upper", &n, a, &lda, w, &wkopt, &lwork, &info );
  printf( "-----------------------------------------------> Point 4.\n" );
  lwork = (int)wkopt;
  work = (double*)malloc( lwork*sizeof(double) );
  /* Solve eigenproblem */
  dsyev( "Vectors", "Upper", &n, a, &lda, w, work, &lwork, &info );
  /* Check for convergence */
  if( info > 0 ) {
  printf( "The algorithm failed to compute eigenvalues.\n" );
  exit( 1 );
  }
       gettimeofday(&tval_after, NULL);
       timersub(&tval_after, &tval_before, &tval_result);
       printf("Time one diag / secs: %ld.%06ld\n", (long int)tval_result.tv_sec, (long int)tval_result.tv_usec);
  /* store the full elapsed time (seconds + microseconds), not just whole seconds */
  time_series.push_back(tval_result.tv_sec + 0.000001*tval_result.tv_usec);
     time_series_sum = time_series_sum + tval_result.tv_sec + 0.000001*tval_result.tv_usec;
     /* free the per-repetition allocations so they do not leak across REPS iterations */
     free(work);
     work = NULL;
     gsl_matrix_free(tower->matrix_random_covariance);
     }
   
     time_series_average = time_series_sum/REPS;
     printf("Time Series Average = %e\n", time_series_average);
     printf("Time Series Length = %u\n", REPS);
  /* Print eigenvalues */
  //print_matrix( "Eigenvalues", 1, n, w, 1 ); //nocheat!
  /* Print eigenvectors */
  //print_matrix( "Eigenvectors (stored columnwise)", n, n, a, lda ); //nocheat!
  /* Free workspace */
  free( (void*)work );
   free(tower);
  exit( 0 );
} /* End of DSYEV Example */
/* Auxiliary routine: printing a matrix */
void print_matrix( char* desc, int m, int n, double* a, int lda ) {
  int i, j;
  printf( "\n %.2f\n", desc );
  for( i = 0; i < m; i++ ) {
  for( j = 0; j < n; j++ ) printf( " %.2f", a[i+j*lda] );
  printf( "\n" );
  }
}
/* ---  --- */
int COMPUTE_RANDOM_COVARIANCE(struct TOWER *tower) //relying on spline !
{
   int a,b;
   double aux, correl;
   const gsl_rng_type * T;
   gsl_rng * r;
   T = gsl_rng_default;
   r = gsl_rng_alloc (T);
   struct timeval tv;
 
   srandom(time(0));  // seed random(); srand() would only seed rand()
     unsigned long int s = random();  // gsl_rng_set wants a non-negative
      // "long" integer as seed
   gsl_rng_env_setup();
     gsl_rng_set(r,s); // seed the GSL random number generator
 
   printf("extreme_kappa: building random covariance matrix...\n");
//#pragma omp parallel
//{   
   tower->matrix_random_covariance = gsl_matrix_calloc(TOWERSIZE,TOWERSIZE);
 
   for(a=0;a<TOWERSIZE;a++)
   {      
   
//#pragma omp for   
     for(b=a;b<TOWERSIZE;b++)
     {
       
     
       correl = gsl_ran_flat(r,0.0,1.0);
       gsl_matrix_set(tower->matrix_random_covariance,a,b,correl);
       gsl_matrix_set(tower->matrix_random_covariance,b,a,correl);
     }
   }
//} //PARALLEL LOOP!
   gsl_rng_free(r);
   return(0);
}
/* ---  --- */
/* --- --- */
int WRITE_RANDOM_COVARIANCE(struct TOWER *tower)
{
   int a,b;
   FILE *file;
   char fname[256];
   double aux;
 
   printf("extreme_kappa: writing random covariances...\n");
 
   sprintf(fname,"./random_covariance.data");
   file = fopen(fname,"w+");
   if(file == NULL)
   {
     printf("extreme_kappa: error opening file %s\n",fname);
     exit(2);
   }
 
   for(a=0;a<TOWERSIZE;a++)
   {
     for(b=0;b<TOWERSIZE;b++)
     {
       aux = gsl_matrix_get(tower->matrix_random_covariance,a,b);
       fprintf(file,"%e ",aux);
     }
     fprintf(file,"\n");
   }
 
   fclose(file);
   return(0);
}
/* --- --- */
 
  • #3
Dr. Courtney said:
Is your cache size smaller?
I apologize. I do not know. How can I check?
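(On Linux, the cache sizes are reported by lscpu and are also exposed under /sys/devices/system/cpu/cpu0/cache/. A minimal sketch that prints them, assuming that sysfs layout is present:)
Code:
// Minimal sketch: print the cache levels and sizes exposed by Linux sysfs.
// Assumes /sys/devices/system/cpu/cpu0/cache/index0..indexN exist (Linux only).
#include <fstream>
#include <iostream>
#include <string>

int main() {
    for (int idx = 0; ; ++idx) {
        std::string base = "/sys/devices/system/cpu/cpu0/cache/index" + std::to_string(idx);
        std::ifstream level(base + "/level"), type(base + "/type"), size(base + "/size");
        if (!level || !type || !size) break;   // no more cache levels
        std::string lvl, tp, sz;
        level >> lvl; type >> tp; size >> sz;
        std::cout << "L" << lvl << " " << tp << ": " << sz << "\n";
    }
    return 0;
}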
 
  • #4
OP, there are many different implementations of LAPACK/BLAS; these names just denote *interfaces*. Most likely you are running a version of the netlib "reference BLAS"/LAPACK, which is not really intended to be used in actual programs, but simply provides a reference implementation that gives correct numbers (very slowly). If you are running some Linux distribution and simply installed the "blas"/"lapack" packages, there is a good chance you got one of those.

You might want to try linking your program against OpenBLAS or Intel MKL if available, or at least ATLAS if nothing else is there.
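For illustration, here is a minimal standalone DSYEV timing sketch that can be rebuilt against different implementations and compared. The link lines in the comments are assumptions (typical Linux package names, which vary by distribution), and the trailing-underscore symbol name can differ between Fortran compilers; ldd shows which shared libraries the binary actually picked up.
Code:
/*
 * Minimal DSYEV timing sketch. Build it once per BLAS/LAPACK implementation
 * and compare the timings, e.g. (library names are distribution-dependent):
 *
 *   g++ -O2 dsyev_bench.cpp -llapack -lblas      # netlib reference LAPACK/BLAS
 *   g++ -O2 dsyev_bench.cpp -lopenblas           # OpenBLAS (bundles LAPACK)
 *
 * Check what was actually linked with:  ldd ./a.out | grep -i -E 'blas|lapack'
 */
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <chrono>

/* Fortran symbol; the trailing underscore and hidden string-length arguments
   depend on the compiler convention, but this form works with gfortran builds. */
extern "C" void dsyev_(const char* jobz, const char* uplo, int* n, double* a,
                       int* lda, double* w, double* work, int* lwork, int* info);

int main() {
    int n = 1000, lda = n, info = 0, lwork = -1;
    std::vector<double> a(static_cast<size_t>(n) * n), w(n);

    /* fill a random symmetric matrix */
    srand(12345);
    for (int i = 0; i < n; ++i)
        for (int j = i; j < n; ++j)
            a[i + j * lda] = a[j + i * lda] = rand() / (double)RAND_MAX;

    /* workspace query, then allocate */
    double wkopt = 0.0;
    dsyev_("V", "U", &n, a.data(), &lda, w.data(), &wkopt, &lwork, &info);
    lwork = (int)wkopt;
    std::vector<double> work(lwork);

    auto t0 = std::chrono::steady_clock::now();
    dsyev_("V", "U", &n, a.data(), &lda, w.data(), work.data(), &lwork, &info);
    auto t1 = std::chrono::steady_clock::now();

    if (info != 0) { std::printf("dsyev failed, info = %d\n", info); return 1; }
    std::printf("dsyev (n=%d): %.3f s\n", n,
                std::chrono::duration<double>(t1 - t0).count());
    return 0;
}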
 
  • #5
And if you're new to it all, you should know you can buy a cheap CUDA-capable GPU nowadays for about $30 that will often do what you need 50-100 times faster...
http://www.culatools.com/
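For reference, NVIDIA's cuSOLVER library (shipped free with the CUDA toolkit) exposes a dense symmetric eigensolver comparable to the commercial CULA routines linked above. Below is a minimal sketch, assuming a CUDA-capable GPU and the nvcc compiler; it is not from the thread.
Code:
/*
 * Sketch: the same symmetric eigenproblem solved on the GPU with cuSOLVER.
 * Build (assuming the CUDA toolkit is installed):  nvcc syevd_gpu.cpp -lcusolver
 */
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>
#include <cusolverDn.h>

int main() {
    const int n = 1000, lda = n;
    std::vector<double> A(static_cast<size_t>(n) * n), W(n);

    /* fill a random symmetric matrix on the host */
    srand(12345);
    for (int i = 0; i < n; ++i)
        for (int j = i; j < n; ++j)
            A[i + j * lda] = A[j + i * lda] = rand() / (double)RAND_MAX;

    cusolverDnHandle_t handle;
    cusolverDnCreate(&handle);

    double *d_A = NULL, *d_W = NULL, *d_work = NULL;
    int *d_info = NULL, lwork = 0, info = 0;
    cudaMalloc((void**)&d_A, sizeof(double) * n * lda);
    cudaMalloc((void**)&d_W, sizeof(double) * n);
    cudaMalloc((void**)&d_info, sizeof(int));
    cudaMemcpy(d_A, A.data(), sizeof(double) * n * lda, cudaMemcpyHostToDevice);

    /* query workspace size, allocate, then solve (divide-and-conquer syevd) */
    cusolverDnDsyevd_bufferSize(handle, CUSOLVER_EIG_MODE_VECTOR,
                                CUBLAS_FILL_MODE_UPPER, n, d_A, lda, d_W, &lwork);
    cudaMalloc((void**)&d_work, sizeof(double) * lwork);
    cusolverDnDsyevd(handle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER,
                     n, d_A, lda, d_W, d_work, lwork, d_info);

    cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(W.data(), d_W, sizeof(double) * n, cudaMemcpyDeviceToHost);
    std::printf("cusolverDnDsyevd info = %d, smallest eigenvalue = %g\n", info, W[0]);

    cudaFree(d_A); cudaFree(d_W); cudaFree(d_work); cudaFree(d_info);
    cusolverDnDestroy(handle);
    return 0;
}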
 

1. Is there a difference in LAPACK's speed across different systems?

Yes, there can be a difference in LAPACK's speed depending on the system it is being run on. This can be due to variations in hardware, operating system, and other system configurations.

2. How do I determine if LAPACK is slow on my system?

To determine if LAPACK is slow on your system, you can run benchmarks or performance tests specifically designed for LAPACK. These tests can measure the speed and efficiency of LAPACK on your system and provide insights into its performance.

3. Why is LAPACK sometimes slower than other libraries?

LAPACK is a highly optimized library for linear algebra operations, but it may sometimes be slower than other libraries due to differences in implementation or the specific algorithms used. Additionally, LAPACK may not be the best choice for certain types of problems, so it is important to consider the specific needs of your application when choosing a library.

4. Can LAPACK's performance be improved on my system?

Yes, LAPACK's performance can be improved on your system by optimizing your system's hardware and software configurations, using optimized BLAS libraries, and selecting the appropriate LAPACK routines for your specific problem. For a full symmetric eigendecomposition, for example, the divide-and-conquer routine DSYEVD is usually considerably faster than DSYEV when eigenvectors are wanted.
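As a minimal sketch of that last point, here is the same kind of 1000×1000 symmetric eigenproblem solved with DSYEVD instead of DSYEV. DSYEVD needs an extra integer workspace, and the trailing-underscore symbol name is an assumption that can vary by compiler.
Code:
#include <cstdio>
#include <cstdlib>
#include <vector>

/* DSYEVD prototype: divide-and-conquer symmetric eigensolver.
   The trailing underscore follows the common Fortran name-mangling convention. */
extern "C" void dsyevd_(const char* jobz, const char* uplo, int* n, double* a,
                        int* lda, double* w, double* work, int* lwork,
                        int* iwork, int* liwork, int* info);

int main() {
    int n = 1000, lda = n, info = 0;
    std::vector<double> a(static_cast<size_t>(n) * n), w(n);

    /* random symmetric test matrix */
    srand(1);
    for (int i = 0; i < n; ++i)
        for (int j = i; j < n; ++j)
            a[i + j * lda] = a[j + i * lda] = rand() / (double)RAND_MAX;

    /* workspace query: lwork = liwork = -1 returns the optimal sizes */
    int lwork = -1, liwork = -1, iwkopt = 0;
    double wkopt = 0.0;
    dsyevd_("V", "U", &n, a.data(), &lda, w.data(), &wkopt, &lwork,
            &iwkopt, &liwork, &info);
    lwork = (int)wkopt;
    liwork = iwkopt;
    std::vector<double> work(lwork);
    std::vector<int> iwork(liwork);

    /* compute all eigenvalues and eigenvectors */
    dsyevd_("V", "U", &n, a.data(), &lda, w.data(), work.data(), &lwork,
            iwork.data(), &liwork, &info);
    std::printf("dsyevd info = %d, smallest eigenvalue = %g\n", info, w[0]);
    return 0;
}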

5. Are there any alternatives to LAPACK that may be faster on my system?

Yes, there are alternative libraries to LAPACK that may be faster on your system, such as Intel's MKL or AMD's ACML. It is important to consider your specific needs and the capabilities of these alternative libraries before making a decision.
