38 #if defined( _WIN32 ) || defined( _WIN64 ) 39 #define int64_t __int64 44 #define _CRT_SECURE_NO_DEPRECATE 45 #define _CRT_SECURE_NO_WARNINGS 56 #if defined( _WIN32 ) || defined( _WIN64 ) 60 #include <sys/resource.h> 63 #include "coreblas/lapacke.h" 65 #include "coreblas/coreblas.h" 68 #include "control/auxiliary.h" 70 #if defined(CHAMELEON_USE_MPI) 74 #if defined (CHAMELEON_SCHED_STARPU) 81 return (
void*)( NULL );
87 Test(int64_t n,
int *iparam,
88 double fixed_rank_decay,
93 int64_t M, N, K, NRHS;
95 #if defined(CHAMELEON_SIMULATION) 101 double fmuls, fadds, fp_per_mul, fp_per_add;
102 double sumgf, sumgf2, sumt, sd, flops, gflops;
110 "BLAS_NUM_THREADS",
"" 130 (void)M;(void)N;(void)K;(void)NRHS;
132 if ( (n < 0) || (thrdnbr < 0 ) ) {
133 if (gnuplot && (MORSE_My_Mpi_Rank() == 0) ) {
134 printf(
"set title '%d_NUM_THREADS: ", thrdnbr );
135 for (i = 0; env[i][0]; ++i) {
136 s = getenv( env[i] );
138 if (i) printf(
" " );
140 for (j = 0; j < 5 && env[i][j] && env[i][j] !=
'_'; ++j)
141 printf(
"%c", env[i][j] );
146 printf(
"->%s",
"?" );
149 printf(
"%s\n%s\n%s\n%s\n%s%s%s\n",
150 "set xlabel 'Matrix size'",
151 "set ylabel 'Gflop/s'",
153 gnuplot > 1 ?
"set terminal png giant\nset output 'timeplot.png'" :
"",
154 "plot '-' using 1:5 title '",
_NAME,
"' with linespoints" );
163 t = (
double*)malloc(niter*
sizeof(
double));
164 memset(t, 0, niter*
sizeof(
double));
176 flops = 1e-9 * (fmuls * fp_per_mul + fadds * fp_per_add);
182 int status =
RunTest( iparam, dparam, &(t[0]), rankfile);
183 if (status != MORSE_SUCCESS)
return status;
187 double sumgf_upper = 0.0;
191 for (iter = 0; iter < niter; iter++)
201 int status =
RunTest( iparam, dparam, &(t[iter]), rankfile);
202 if (status != MORSE_SUCCESS)
return status;
209 int status =
RunTest( iparam, dparam, &(t[iter]), rankfile);
210 if (status != MORSE_SUCCESS)
return status;
212 gflops = flops / t[iter];
214 #if defined (CHAMELEON_SCHED_STARPU) 217 double upper_gflops = 0.0;
219 double integer_tmin = 0.0;
222 FILE *out = fopen(
"bounddeps.pl",
"w");
223 starpu_bound_print_lp(out);
225 out = fopen(
"bound.dot",
"w");
226 starpu_bound_print_dot(out);
229 FILE *out = fopen(
"bound.pl",
"w");
230 starpu_bound_print_lp(out);
235 starpu_bound_compute(&tmin, &integer_tmin, 0);
236 upper_gflops = (flops / (tmin / 1000.0));
237 sumgf_upper += upper_gflops;
243 sumgf2 += gflops*gflops;
245 if ( MORSE_My_Mpi_Rank() == 0)
249 gflops = sumgf / niter;
250 sd = sqrt((sumgf2 - (sumgf*sumgf)/niter)/niter);
252 if ( MORSE_My_Mpi_Rank() == 0) {
253 printf(
"%9.3f %9.2f +-%7.2f ", sumt/niter, gflops, sd);
256 printf(
" %9.2f", sumgf_upper/niter);
276 printf(
"%8.5e %8.5e %8.5e %8.5e %8.5e SUCCESS",
283 printf(
" %8.5e %8.5e %8.5e %8.5e",
298 startswith(
const char *s,
const char *prefix) {
299 size_t n = strlen( prefix );
300 if (strncmp( s, prefix, n ))
306 get_range(
char *range,
int *start_p,
int *stop_p,
int *step_p) {
307 char *s, *s1, buf[21];
308 int colon_count, copy_len, nbuf=20, n;
309 int start=1000, stop=10000, step=1000;
312 for (s = strchr( range,
':'); s; s = strchr( s+1,
':'))
315 if (colon_count == 0) {
316 if (sscanf( range,
"%d", &start ) < 1 || start < 1)
319 if (step < 1) step = 1;
320 stop = start + 10 * step;
322 }
else if (colon_count == 1) {
324 s = strchr( range,
':' );
325 if (sscanf( s+1,
"%d", &stop ) < 1 || stop < 1)
330 copy_len = n > nbuf ? nbuf : n;
331 strncpy( buf, range, copy_len );
333 if (sscanf( buf,
"%d", &start ) < 1 || start > stop || start < 1)
337 step = (stop - start) / 10;
340 }
else if (colon_count == 2) {
342 s = strchr( range,
':' );
344 copy_len = n > nbuf ? nbuf : n;
345 strncpy( buf, range, copy_len );
347 if (sscanf( buf,
"%d", &start ) < 1 || start < 1)
351 s1 = strchr( s+1,
':' );
353 copy_len = n > nbuf ? nbuf : n;
354 strncpy( buf, s+1, copy_len );
356 if (sscanf( buf,
"%d", &stop ) < 1 || stop < start)
360 if (sscanf( s1+1,
"%d", &step ) < 1 || step < 1)
374 show_help(
char *prog_name) {
375 printf(
"Usage:\n%s [options]\n\n", prog_name );
376 printf(
"Options are:\n" 377 " --help Show this help\n" 379 " --threads=X Number of CPU workers (default: _SC_NPROCESSORS_ONLN)\n" 380 " --gpus=X Number of GPU workers (default: 0)\n" 382 " --[a]sync Enable/Disable synchronous calls in wrapper function such as POTRI. (default: async)\n" 383 " --[no]bigmat Allocating one big mat or plenty of small (default: bigmat)\n" 384 " --[no]check Check result (default: nocheck)\n" 385 " --[no]progress Display progress indicator (default: noprogress)\n" 386 " --[no]gemm3m Use gemm3m complex method (default: nogemm3m)\n" 387 " --[no]inv Check on inverse (default: noinv)\n" 388 " --[no]warmup Perform a warmup run to pre-load libraries (default: warmup)\n" 389 " --[no]trace Enable/Disable trace generation (default: notrace)\n" 390 " --[no]dag Enable/Disable DAG generation (default: nodag)\n" 391 " Generates a dot_dag_file.dot.\n" 392 " --[no]profile Print profiling informations (default: noprofile)\n" 393 " --nocpu All GPU kernels are exclusively executed on GPUs (default: 0)\n" 398 " --n_range=R Range of N values\n" 399 " with R=Start:Stop:Step (default: 500:5000:500)\n" 400 " --m=X dimension (M) of the matrices (default: N)\n" 401 " --k=X dimension (K) of the matrices (default: 1)\n" 402 " --nrhs=X Number of right-hand size (default: 1)\n" 403 " --nb=N Nb size. (default: 128)\n" 404 " --ib=N IB size. (default: 32)\n" 406 " --niter=N Number of iterations performed for each test (default: 1)\n" 408 " --rhblk=N If N > 0, enable Householder mode for QR and LQ factorization\n" 409 " N is the size of each subdomain (default: 0)\n" 411 " --acc fixed accuracy is used if fixed rank is equal to zero. This value is also used by STARSH-H in generation of matrix. Rndtiled also depends on this variable\n" 412 " --starshmaxrank buffer sizes in generation of matrices hcore_gytlr and the limit used in QR factorizations and SVD of hcore_gemm to verify that ranks do not excess current allocation.\n" 413 " Use one of the flags for selecting problem type:\n" 414 " --ss Spatial statistics with square exp kernel\n" 415 " --geostat Spatial statistics with Matern kernel\n" 416 " --edsin Electro dynamics with Sinus\n" 417 " --rnd Random matrix\n" 435 print_header(
char *prog_name,
int * iparam,
436 double fixed_rank_decay,
double wave_k) {
437 const char *bound_header = iparam[
IPARAM_BOUND] ?
" thGflop/s" :
"";
439 const char *check_header = iparam[
IPARAM_CHECK] ?
" ||DC-TLR|| ||init DC|| ||DC|| ||TLR|| ||DC-TLR||/||DC|| RETURN" :
"";
440 const char *inverse_header = iparam[
IPARAM_INVERSE] ?
" ||I-A*Ainv|| ||A|| ||Ainv|| ||Id - A*Ainv||/((||A|| ||Ainv||).N.eps)" :
"";
441 const char *peak_header = iparam[
IPARAM_PEAK] ?
" (% of peak) peak" :
"";
442 #if defined(CHAMELEON_SIMULATION) 452 #
if defined(CHAMELEON_USE_MPI)
461 "# fixed acc: %.1e\n" 462 "# alwaysfixedrank: %d\n" 471 #
if defined(CHAMELEON_USE_MPI)
492 printf(
"# M N K/NRHS seconds Gflop/s Deviation%s%s%s\n",
493 bound_header, peak_header, iparam[
IPARAM_INVERSE] ? inverse_header : check_header);
498 get_thread_count(
int *thrdnbr) {
499 #if defined WIN32 || defined WIN64 500 sscanf( getenv(
"NUMBER_OF_PROCESSORS" ),
"%d", thrdnbr );
502 *thrdnbr = sysconf(_SC_NPROCESSORS_ONLN);
516 double fixed_rank_decay = 0.0;
518 char* rankfile = calloc(2048,
sizeof(
char));
580 for (i = 1; i < argc && argv[i]; ++i) {
581 if ( startswith( argv[i],
"--help") || startswith( argv[i],
"-help") ||
582 startswith( argv[i],
"--h") || startswith( argv[i],
"-h") ) {
583 show_help( argv[0] );
585 }
else if (startswith( argv[i],
"--rk=" )) {
586 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_RK]) );
587 }
else if (startswith( argv[i],
"--acc=" )) {
588 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_ACC]) );
589 }
else if (startswith( argv[i],
"--alwaysfixedrank" )) {
591 }
else if (startswith( argv[i],
"--matfile=" )) {
593 sscanf( strchr( argv[i],
'=' ) + 1,
"%s",
strmatfile );
594 }
else if (startswith( argv[i],
"--rndusr" )) {
596 }
else if (startswith( argv[i],
"--rnd" )) {
598 }
else if (startswith( argv[i],
"--ss" )) {
600 }
else if (startswith( argv[i],
"--geostat" )) {
602 }
else if (startswith( argv[i],
"--edsin" )) {
604 }
else if (startswith( argv[i],
"--starshwavek=" )) {
605 sscanf( strchr( argv[i],
'=' ) + 1,
"%lf", &(wave_k) );
606 }
else if (startswith( argv[i],
"--starshdecay=" )) {
607 sscanf( strchr( argv[i],
'=' ) + 1,
"%lf", &(fixed_rank_decay) );
608 }
else if (startswith( argv[i],
"--starshmaxrank=" )) {
610 }
else if (startswith( argv[i],
"--maxrank=" )) {
612 }
else if (startswith( argv[i],
"--printmat" )) {
614 }
else if (startswith( argv[i],
"--printindexall" )) {
616 }
else if (startswith( argv[i],
"--rankfile" )) {
617 sscanf( strchr( argv[i],
'=' ) + 1,
"%s", rankfile );
618 }
else if (startswith( argv[i],
"--printindexend" )) {
620 }
else if (startswith( argv[i],
"--threads=" )) {
621 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_THRDNBR]) );
622 }
else if (startswith( argv[i],
"--gpus=" )) {
623 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_NCUDAS]) );
624 }
else if (startswith( argv[i],
"--check" )) {
626 }
else if (startswith( argv[i],
"--nocheck" )) {
628 }
else if (startswith( argv[i],
"--bigmat" )) {
630 }
else if (startswith( argv[i],
"--nobigmat" )) {
632 }
else if (startswith( argv[i],
"--inv" )) {
634 }
else if (startswith( argv[i],
"--noinv" )) {
636 }
else if (startswith( argv[i],
"--warmup" )) {
638 }
else if (startswith( argv[i],
"--nowarmup" )) {
644 }
else if (startswith( argv[i],
"--trace" )) {
646 }
else if (startswith( argv[i],
"--notrace" )) {
648 }
else if (startswith( argv[i],
"--gemm3m" )) {
650 }
else if (startswith( argv[i],
"--nogemm3m" )) {
652 }
else if (startswith( argv[i],
"--progress" )) {
654 }
else if (startswith( argv[i],
"--noprogress" )) {
656 }
else if (startswith( argv[i],
"--dag" )) {
658 }
else if (startswith( argv[i],
"--nodag" )) {
660 }
else if (startswith( argv[i],
"--sync" )) {
662 }
else if (startswith( argv[i],
"--async" )) {
664 }
else if (startswith( argv[i],
"--n_range=" )) {
665 get_range( strchr( argv[i],
'=' ) + 1, &start, &stop, &step );
666 }
else if (startswith( argv[i],
"--m=" )) {
667 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_M]) );
668 }
else if (startswith( argv[i],
"--nb=" )) {
669 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_NB]) );
671 }
else if (startswith( argv[i],
"--mb=" )) {
672 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_MB]) );
673 }
else if (startswith( argv[i],
"--nrhs=" )) {
674 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_K]) );
675 }
else if (startswith( argv[i],
"--k=" )) {
676 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_K]) );
677 }
else if (startswith( argv[i],
"--ib=" )) {
678 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_IB]) );
679 }
else if (startswith( argv[i],
"--niter=" )) {
680 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &iparam[
IPARAM_NITER] );
681 }
else if (startswith( argv[i],
"--mx=" )) {
682 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_MX]) );
683 }
else if (startswith( argv[i],
"--nx=" )) {
684 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_NX]) );
685 }
else if (startswith( argv[i],
"--rhblk=" )) {
686 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_RHBLK]) );
697 }
else if (startswith( argv[i],
"--profile" )) {
699 }
else if (startswith( argv[i],
"--peak" )) {
701 }
else if (startswith( argv[i],
"--noprofile" )) {
703 }
else if (startswith( argv[i],
"--printerrors" )) {
709 }
else if (startswith( argv[i],
"--nocpu" )) {
711 }
else if (startswith( argv[i],
"--bounddepsprio" )) {
715 }
else if (startswith( argv[i],
"--bounddeps" )) {
718 }
else if (startswith( argv[i],
"--bound" )) {
720 }
else if (startswith( argv[i],
"--p=" )) {
721 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_P]) );
722 }
else if (startswith( argv[i],
"--mode=" )) {
723 sscanf( strchr( argv[i],
'=' ) + 1,
"%d", &(iparam[
IPARAM_MODE]) );
725 fprintf( stderr,
"Invalid mode: %s from 0 to 20\n", argv[i] );
729 fprintf( stderr,
"Unknown option: %s\n", argv[i] );
732 #if !defined(CHAMELEON_USE_CUDA) 734 fprintf(stderr,
"ERROR: MORSE_USE_CUDA is not defined. " 735 "The number of CUDA devices must be set to 0 (--gpus=0).\n");
755 RUNTIME_stop_profiling();
757 MORSE_Disable(MORSE_AUTOTUNING);
758 MORSE_Set(MORSE_TILE_SIZE, iparam[
IPARAM_NB] );
759 MORSE_Set(MORSE_INNER_BLOCK_SIZE, iparam[
IPARAM_IB] );
763 MORSE_Set(MORSE_HOUSEHOLDER_MODE, MORSE_FLAT_HOUSEHOLDER);
765 MORSE_Set(MORSE_HOUSEHOLDER_MODE, MORSE_TREE_HOUSEHOLDER);
766 MORSE_Set(MORSE_HOUSEHOLDER_SIZE, iparam[
IPARAM_RHBLK]);
770 MORSE_Enable(MORSE_PROFILING_MODE);
773 MORSE_Enable(MORSE_WARNINGS);
776 MORSE_Enable(MORSE_PROGRESS);
779 MORSE_Enable(MORSE_GEMM3M);
781 #if defined(CHAMELEON_USE_MPI) 782 nbnode = MORSE_Comm_size( );
786 (nbnode % iparam[
IPARAM_P] != 0) ) {
787 fprintf(stderr,
"ERROR: %d doesn't divide the number of node %d\n",
797 if ( MORSE_My_Mpi_Rank() == 0 )
798 print_header( argv[0], iparam, fixed_rank_decay, wave_k);
800 if (step < 1) step = 1;
802 int status = Test( -1, iparam, fixed_rank_decay, wave_k, rankfile );
803 if (status != MORSE_SUCCESS)
return status;
804 for (i = start; i <= stop; i += step)
808 iparam[
IPARAM_N] = chameleon_max(1, i/nx);
809 }
else if ( mx > 0 ) {
810 iparam[
IPARAM_M] = chameleon_max(1, i/mx);
817 int status = Test( iparam[
IPARAM_N], iparam, fixed_rank_decay, wave_k, rankfile );
818 if (status != MORSE_SUCCESS)
return status;
823 starpu_data_display_memory_stats();
int RunTest(int *iparam, double *dparam, morse_time_t *t_, char *rankfile)
#define HICMA_STARSH_PROB_EDSIN
#define HICMA_STARSH_PROB_GEOSTAT
#define HICMA_STARSH_PROB_RND
char strmatfile[LEN_STR_MAT_FILE]
#define HICMA_STARSH_PROB_SS
#define HICMA_STARSH_PROB_FILE
#define HICMA_STARSH_PROB_RNDUSR
void * morse_getaddr_null(const MORSE_desc_t *A, int m, int n)
int main(int argc, char *argv[])