HiCMA
Hierarchical Computations on Manycore Architectures
time_zpotrf_tile.c
Go to the documentation of this file.
1 
17 /*
18  * @copyright (c) 2009-2014 The University of Tennessee and The University
19  * of Tennessee Research Foundation.
20  * All rights reserved.
21  * @copyright (c) 2012-2016 Inria. All rights reserved.
22  * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
23  */
24 
34 #include "morse.h"
35 #include "timing.h"
36 #include "hicma_constants.h"
37 #include "hicma_struct.h"
38 #include "hicma_z.h"
39 #include <stdio.h>
40 #include <time.h>
41 #include <sys/time.h>
42 //#include <mpi.h> //MPI_Wtime()
43 
44 #include "starpu.h"
45 #ifdef MKL
46 #include <mkl.h>
47 //#pragma message("MKL is used")
48 #else
49 #include <cblas.h>
50 #ifdef LAPACKE_UTILS
51 #include <lapacke_utils.h>
52 #endif
53 #include <lapacke.h>
54 //#pragma message("MKL is NOT used")
55 #endif
56 
57 #include "starsh-spatial.h"
58 
59 #include <assert.h>
60 #include "hicma_z.h"
61 #include "auxcompute_z.h"
62 #include "auxdescutil.h"
63 #include "hicma.h"
64 #include <math.h>
65 #include <time.h>
66 
67 #undef CBLAS_SADDR
68 #define CBLAS_SADDR(_val) (_val)
69 
70 // zgytlr uses starsh in MPI mode.
71 STARSH_blrf *mpiF;
72 
73 int print_progress = 1; // Print progress about the execution
74 char datebuf[128];
75 time_t timer;
76 struct tm* tm_info;
77 #define PROGRESS(str) \
78  if(print_progress){ \
79  int myrank = MORSE_My_Mpi_Rank();\
80  time(&timer); \
81  tm_info = localtime(&timer); \
82  strftime(datebuf, 26, "%Y-%m-%d %H:%M:%S",tm_info); \
83  fprintf(stderr, "%d:%s\t%d\t%s\t%s\n", myrank, datebuf, __LINE__, __func__, str);\
84  fflush(stderr);\
85  }
86 //#undef PROGRESS
87 //#define PROGRESS(str)
88 
90 int global_check = 0;
95 int run_potrf = 1;
96 int diag_nrows = 0;
98 int print_index = 0;
101 int print_mat = 0;
102 int use_scratch = 1; // Use scratch memory provided by starpu
103 int calc_rank_stat = 1;
104 
105 void fwrite_array(int m, int n, int ld, double* arr, char* file){
106  FILE* fp = fopen(file, "w");
107  if(fp == NULL){
108  fprintf(stderr, "File %s cannot be opened to write\n", file);
109  exit(1);
110  }
111  int i, j;
112  fprintf(fp, "%d %d\n", m, n);
113  for(i = 0; i < m; i++){
114  for(j = 0; j < n; j++){
115  fprintf(fp, "%d\t", (int)arr[ld*j+i] );
116  }
117  fprintf(fp, "\n" );
118  }
119  fclose(fp);
120 }
121 
122 double timediff(struct timeval begin, struct timeval end){
123  double elapsed = (end.tv_sec - begin.tv_sec) +
124  ((end.tv_usec - begin.tv_usec)/1000000.0);
125  return elapsed;
126 }
127 
128 
129 
130  int
131 RunTest(int *iparam, double *dparam, morse_time_t *t_, char* rankfile)
132 {
133  // print progress info only on ROOT process
134  if(MORSE_My_Mpi_Rank() != 0)
135  print_progress = 0;
136  PROGRESS("RunTest started");
137 
138  // this paramater enables storing only diagonal tiles in a tall and skinny matrix
140  //chameleon/runtime/starpu/control/runtime_descriptor.c
141  //MORSE_user_tag_size(31,26);
142  //MORSE_user_tag_size(31,29);
143  MORSE_user_tag_size(31,27);// When I added tile_to_lapack for descArk, I got not enough number of desc error
144 
145  // get parameters coming from command line
146  PASTE_CODE_IPARAM_LOCALS( iparam );
147 
148  // set global variable so that p.. files can fill dense matrix
150  // calculate total number of mpi processes (it is not used for now)
151  num_mpi_ranks = P*Q;
155  int64_t _nb = iparam[IPARAM_NB];
156  LDA = chameleon_max(M, iparam[IPARAM_LDA]);
157  int hicma_maxrank = iparam[IPARAM_HICMA_MAXRANK];
159 
160  int saveNB = NB;
161  NB = MB;
162  size_t ncols_AD;
163  int saveP = P;
164  int saveQ = Q;
165  if (store_only_diagonal_tiles == 1) {
166  ncols_AD = MB;
167  } else {
168  ncols_AD = M;
169  }
170  int saveN = N;
171  N = ncols_AD;
172  PASTE_CODE_ALLOCATE_MATRIX_TILE( descAD, 1, double, MorseRealDouble, LDA, M, N );
173  N = saveN;
174  P = saveP;
175  Q = saveQ;
176  PROGRESS("descAD is allocated");
177 
178  size_t ncols_Dense;
179  size_t ld_Dense;
180  int saveMB = MB;
181  if(check == 0) {
182  ncols_Dense = MT;
183  MB = NB = 1;
184  ld_Dense = MT;
185  } else {
186  ncols_Dense = M;
187  ld_Dense = M;
188  }
189  /*descDense is full matrix if numerical accuracy will be checked.
190  * Otherwise it is MB-by-MB matrix with 1-by-1 tiles */
191  PASTE_CODE_ALLOCATE_MATRIX_TILE( descDense, 1, double, MorseRealDouble, ld_Dense, ncols_Dense, ncols_Dense );
192  if(check == 0) {
193  MB = saveMB;
194  } else {
195  }
196  PROGRESS("descDense is allocated");
197  NB = saveNB;
198 
199  int MTMB = MT * MB; // roundup number of rows/columns for AUV
200  int nrows_AUV = MTMB;
201  int ld_AUV = MTMB;
202  // allocate descUV
203  saveN = N;
204  N = N * 2;
205  saveNB = NB;
206  NB = NB * 2;
207  //printf("N:%d NB:%d\n", N, NB);
208  PASTE_CODE_ALLOCATE_MATRIX_TILE( descAUV, 1, double, MorseRealDouble, ld_AUV, nrows_AUV, N );
209  N = saveN;
210  NB = saveNB;
211  PROGRESS("descAUV is allocated");
212 
213  /* tile dimension of rank descriptor must be 1 */
214  /* when LD for rk matrices is 1, program exits*/
215  int bigMB = MB;
216  int bigNB = NB;
217  MB = NB = 1;
218  PASTE_CODE_ALLOCATE_MATRIX_TILE( descArk, 1, double, MorseRealDouble, MT, MT, NT);
219  PROGRESS("descA's are allocated");
220  MB = bigMB;
221  NB = bigNB;
222 
223  int diag_dense = 1;
224  int fixedrank = iparam[IPARAM_RK]; //genargs->k
225  double fixedacc = pow(10, -1.0*iparam[IPARAM_ACC]);
226 
227  char sym;
228  if (run_potrf)
229  sym = 'S';
230  else
231  sym = 'N';
232  int probtype = iparam[IPARAM_HICMA_STARSH_PROB];
233  int maxrank = iparam[IPARAM_HICMA_STARSH_MAXRANK];
234  //double ddecay = pow(10, -1.0*iparam[IPARAM_HICMA_STARSH_DECAY]);
235  double ddecay = dparam[IPARAM_HICMA_STARSH_DECAY];
236 
237 
238  int initial_maxrank, final_maxrank;
239  double initial_avgrank, final_avgrank;
240  HICMA_problem_t hicma_problem;
241 
242  hicma_problem.ndim = 2;
243 
244  //BEGIN: rndtiled
246  hicma_problem.noise = 1.0; //value added to diagonal
247  }
248  //END: rndtiled
249 
250  //BEGIN: geostat
251  //double theta[3] = {1, 0.1, 0.5}; //initially
252  double theta[3] = {
253  1.0, //sigma
254  0.01, //beta
255  10.0 //nu Aleks used 10.0 in his paper
256  };
258  hicma_problem.theta = theta;
259  hicma_problem.noise = 0.0;
260  hicma_problem.noise = 1.e-2;
261  hicma_problem.kernel_type = STARSH_SPATIAL_MATERN2_SIMD;
262  }
263  //END: geostat
264 
265  //BEGIN: ss
267  //sigma=1.0 default value line 193 of stars-h/src/applications/spatial.c
268  // Correlation length
269  hicma_problem.beta = 0.1;
270  //If fixed rank is required set beta=1 and a sample case will be like this nb=25 maxrank=10 m=2500 So ranks will decrease.
271 
272  // Smoothing parameter for Matern kernel
273  hicma_problem.nu = 0.5;
274  // Shift added to diagonal elements
275  hicma_problem.noise = 1.e-4; //not enough for matrices larger than 600K
276  hicma_problem.noise = 5.e-4; //works for 640K but does not work for 10M
277  hicma_problem.noise = 1.e-2; //
278  }
279  //END: ss
280 
281  //BEGIN: edsin
283  // Wave number, >= 0
284  hicma_problem.wave_k = dparam[IPARAM_HICMA_STARSH_WAVE_K];
285  hicma_problem.diag = M;
286  //printf("%s %d: %g\n", __FILE__, __LINE__, hicma_problem.wave_k);
287  }
288  //END: edsin
289 
290  PROGRESS("generating coordinates started");
291  struct timeval tvalBefore, tvalAfter; // removed comma
292  gettimeofday (&tvalBefore, NULL);
293  HICMA_zgenerate_problem(probtype, sym, ddecay, M, MB, MT, NT, &hicma_problem);
294  gettimeofday (&tvalAfter, NULL);
295  if(MORSE_My_Mpi_Rank()==0){
296  printf("Tproblem:%g\n",
297  (tvalAfter.tv_sec - tvalBefore.tv_sec)
298  +(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000.0
299  );
300  fflush(stderr);
301  fflush(stdout);
302  }
303  PROGRESS("generating coordinates ended");
304  mpiF = hicma_problem.starsh_format; // This is assignment will be hidden from user in release
305 
306  // DO NOT enforce compression of diagonal tiles
307  int compress_diag = 0;
308  PROGRESS("nompi zgytlr starting");
309  //descDense original problem
310  gettimeofday (&tvalBefore, NULL);
311  HICMA_zgytlr_Tile(MorseLower, descAUV, descAD, descArk, 0, maxrank, fixedacc, compress_diag, descDense);
312  gettimeofday (&tvalAfter, NULL);
313  if(MORSE_My_Mpi_Rank()==0){
314  printf("Tcompress:%g\n",
315  (tvalAfter.tv_sec - tvalBefore.tv_sec)
316  +(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000.0
317  );
318  fflush(stderr);
319  fflush(stdout);
320  }
321  PROGRESS("nompi zgytlr finished");
322  fflush(stderr);
323  fflush(stdout);
324  /*return 0; //TODO*/
325 
326  if(calc_rank_stat == 1) {
327  PASTE_TILE_TO_LAPACK( descArk, Ark_initial, 1, double, MT, NT );
328  if(MORSE_My_Mpi_Rank()==0){
329 
330  sprintf(rankfile, "%s-1", rankfile);
331  fwrite_array(descArk->m, descArk->n, descArk->m, Ark_initial, rankfile);
332 
333  HICMA_stat_t hicma_statrk_initial;
334  zget_stat(MorseLower, Ark_initial, MT, NT, MT, &hicma_statrk_initial);
335  printf("initial_ranks:");
336  zprint_stat(hicma_statrk_initial);
337  fflush(stderr);
338  fflush(stdout);
339  }
340  }
341 
342  if (global_always_fixed_rank == 1) {
343  fprintf(stderr, "%s %d Fixed rank: %d\n", __FILE__, __LINE__, global_fixed_rank);
344  }
345 
346  if(0 && num_mpi_ranks == 1 && initial_maxrank > N){ //FIXME Enable for distributed mem
347  fprintf(stderr,"%s %d %d\t|N:%d is less than actual maxrank:%d\n", __FILE__, __LINE__, MORSE_My_Mpi_Rank(), N, initial_maxrank);
348  exit(1);
349  }
350  int set_diag = 0;
351 
352  /* Save A for check */
353  PROGRESS("pasting original dense descAD into Adense and Adense2 started");
354  // Adense: original dense problem.
355  PASTE_TILE_TO_LAPACK( descDense, Adense, check, double, LDA, M );
356  double one = 1.0, zero = 0.0, minusone = -1.0, diagVal = M;
357  double* swork = NULL;
358  //double* cp_L_Adense = calloc(LDA*M, sizeof(double));
359  if(check){
360  swork = calloc(2*M, sizeof(double));
361  {size_t i, j;
362  double* orgAdense = calloc(LDA*M, sizeof(double));
363  for(j = 0; j < M; j++){
364  for(i = 0; i < M; i++){
365  orgAdense[j*LDA+i] = Adense[j*LDA+i];
366  }
367  }
368  int info = LAPACKE_dpotrf_work(
369  LAPACK_COL_MAJOR,
370  'L',
371  M, orgAdense, LDA);
372  if(0 && info != 0){ //FIXME
373  fprintf(stderr, "%s\t|%d\t|Error in LAPACK potrf. info:%d, This errors means "
374  "that the matrix generated is not positive definite\n", __FILE__, __LINE__, info);
375  }
376  for(j = 0; j < M; j++){
377  for(i = 0; i < j; i++){
378  orgAdense[j*LDA+i] = zero;
379  }
380  }
381  /*for(j = 0; j < M; j++) { */
382  /*for(i = 0; i < M; i++){*/
383  /*cp_L_Adense[j*LDA+i] = orgAdense[j*LDA+i];*/
384  /*}*/
385  /*}*/
386  if(main_print_mat ){printf("L of Adense\n");printmat(orgAdense,M,M,LDA,MB, MB);}
387  double normOrgAdense = 0.0;
388  /*HICMA_znormest(M, M, orgAdense, &normOrgAdense, swork);*/
389  /*printf("norm_L_OrgAdense:%e\n",normOrgAdense);*/
390  free(orgAdense);
391  }
392  }
393  PASTE_TILE_TO_LAPACK( descDense, Adense2, check, double, LDA, M );
394  PROGRESS("pasting original dense descAD into Adense and Adense2 finished");
395  PROGRESS("potrf started");
396  START_TIMING();
397  HICMA_zpotrf_Tile(MorseLower, descAUV, descAD, descArk, fixedrank, maxrank, fixedacc );
398  STOP_TIMING();
399  fflush(stderr);
400  fflush(stdout);
401  PROGRESS("potrf finished");
402  if(check){
403  HICMA_zuncompress(MorseLower, descAUV, descDense, descArk);
404  HICMA_zdiag_vec2mat(descAD, descDense);
405  PASTE_CODE_FREE_MATRIX( descAD ); //@KADIRLBL001
406  descAD = descDense; // descAD was only diagonals.
407  // After this line, descAD is dense matrix containing approximate L
408  // So no need to adapt below code for descAD containg only diagonals.
409  }
410  if(calc_rank_stat == 1) {
411  PASTE_TILE_TO_LAPACK( descArk, Ark_final, 1, double, MT, NT );
412  if(MORSE_My_Mpi_Rank()==0){
413  sprintf(rankfile, "%s-2", rankfile);
414  fwrite_array(descArk->m, descArk->n, descArk->m, Ark_final, rankfile);
415  HICMA_stat_t hicma_statrk_final;
416  zget_stat(MorseLower, Ark_final, MT, NT, MT, &hicma_statrk_final);
417  printf("final_ranks:");
418  zprint_stat(hicma_statrk_final);
419  fflush(stderr);
420  fflush(stdout);
421  }
422  }
423 
424  int check_dense = 0;
425  int check_app = 1;
426  if(check == 0){
427  check_dense = check_app = 0;
428  }
429  if(check_app ) {
430  PROGRESS("checking accuracy");
431  if( MORSE_My_Mpi_Rank()==0){
432 #ifndef COMPLEX
433  if(main_print_mat){printf("Adense2\n");printmat(Adense2,M,M,LDA,MB, MB);}
434  double normA;
435  {size_t i, j;
436  for(j = 0; j < M; j++){
437  for(i = 0; i < j; i++){
438  Adense2[j*LDA+i] = zero;
439  }
440  }
441  }
442  PROGRESS("normaA started");
443  HICMA_znormest(M, M, Adense2, &normA, swork);
444  // Ahicma: result of TLR potrf
445  PASTE_TILE_TO_LAPACK( descAD, Ahicma, check, double, LDA, M );
446  /*if(0){size_t i,j;*/
447  /*for(j = 0; j < M; j++) { */
448  /*for(i = 0; i < M; i++){*/
449  /*Ahicma[j*LDA+i] = cp_L_Adense[j*LDA+i];*/
450  /*}*/
451  /*}*/
452  /*}*/
453  double normAhicma = 0.0;
454  {size_t i, j;
455  for(j = 0; j < M; j++){
456  for(i = 0; i < j; i++){
457  Ahicma[j*LDA+i] = zero;
458  }
459  }
460  double* orgAhicma = calloc(LDA*M, sizeof(double));
461  for(j = 0; j < M; j++){
462  for(i = 0; i < M; i++){
463  orgAhicma[j*LDA+i] = Ahicma[j*LDA+i];
464  }
465  }
466  HICMA_znormest(M, M, orgAhicma, &normAhicma, swork);
467  free(orgAhicma);
468  }
469  if(set_diag){size_t j; for(j = 0; j < M; j++){ Ahicma[j*LDA+j] = diagVal; } }
470  if(main_print_mat){printf("Ahicma\n");printmat(Ahicma,M,M,LDA, MB, MB);}
471  //LAPACKE_dpotrf(LAPACK_COL_MAJOR, 'L', M, Ahicma, LDA);
472  // AhicmaT: transpose of Ahicma
473  PROGRESS("copy descAd into AhicmaT started");
474  PASTE_TILE_TO_LAPACK( descAD, AhicmaT, check, double, LDA, M );
475 
476  {size_t i, j;
477  for(j = 0; j < M; j++){
478  for(i = 0; i < j; i++){
479  Adense[j*LDA+i] = zero;
480  }
481  }
482  }
483 
484  if(main_print_mat){printf("Ahicma-upperzero\n");printmat(Ahicma,M,M,LDA, MB, MB);}
485  PROGRESS("Transpose A started");
486  LAPACKE_dge_trans(LAPACK_COL_MAJOR, M, M, Ahicma, LDA, AhicmaT, LDA);
487  if(main_print_mat){printf("AhicmaT\n");printmat(AhicmaT,M,M,LDA, MB, MB);}
488  PROGRESS("TRMM started");
489  cblas_dtrmm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, M, M, one, Ahicma, LDA, AhicmaT, LDA);
490  if(main_print_mat){printf("Ahicma*AhicmaT\n");printmat(AhicmaT,M,M,LDA, MB, MB);}
491  //double tmpnorm;normest(M, M, AhicmaT, &tmpnorm, swork);printf("tmpnorm:%e\n",tmpnorm);
492  {size_t i, j;
493  for(j = 0; j < M; j++){
494  for(i = 0; i < j; i++){
495  AhicmaT[j*LDA+i] = zero;
496  }
497  }
498  }
499 
500  size_t nelm = M * M;
501  if(main_print_mat)printf("nelm:%zu M:%d N:%d\n", nelm, M, N);
502  PROGRESS("DAXPY started");
503  cblas_daxpy(nelm, minusone, AhicmaT, 1, Adense, 1);
504  if(main_print_mat){printf("Adense-(Ahicma*AhicmaT)\n");printmat(Adense,M,M,LDA, MB, MB);}
505 
506  double normDenseAppDiff;
507  PROGRESS("Norm of difference started");
508  HICMA_znormest(M, M, Adense, &normDenseAppDiff, swork);
509  double accuracyDenseAppDiff = normDenseAppDiff/normA;
510  //printf("normA:%.2e normDenseAppdiff:%.2e Accuracy: %.2e\n", normA, normDenseAppDiff, accuracyDenseAppDiff);
511  dparam[IPARAM_RES] = normDenseAppDiff;
512  dparam[IPARAM_ANORM] = normA;
513  dparam[IPARAM_XNORM] = normA;
514  dparam[IPARAM_BNORM] = normAhicma;
515 #endif
516  } else {
517  PASTE_TILE_TO_LAPACK( descAD, Ahicma, check, double, LDA, M );
518  PASTE_TILE_TO_LAPACK( descAD, AhicmaT, check, double, LDA, M );
519  }
520  PROGRESS("checking accuracy is finished");
521  }
522 
523  PASTE_CODE_FREE_MATRIX( descAUV );
524  PROGRESS("descAUV is freed");
525  if(check == 0) { // If there is no check, then descAD and descDense are different. Refer to @KADIRLBL001
526  PASTE_CODE_FREE_MATRIX( descAD );
527  PROGRESS("descAD is freed");
528  }
529  PASTE_CODE_FREE_MATRIX( descArk );
530  PROGRESS("descArk is freed");
531  PASTE_CODE_FREE_MATRIX( descDense );
532  PROGRESS("descDense is freed");
533  PROGRESS("freed descs");
534  return 0;
535 }
int print_mat
STARSH_blrf * mpiF
int print_progress
double morse_time_t
Definition: timing.h:42
int print_index_end
#define PASTE_CODE_ALLOCATE_MATRIX_TILE(_desc_, _cond_, _type_, _type2_, _lda_, _m_, _n_)
Definition: timing.h:150
int global_always_fixed_rank
#define HICMA_STARSH_PROB_EDSIN
char datebuf[128]
#define PASTE_TILE_TO_LAPACK(_desc_, _name_, _cond_, _type_, _lda_, _n_)
Definition: timing.h:168
#define HICMA_STARSH_PROB_GEOSTAT
int main_print_index
#define PASTE_CODE_FREE_MATRIX(_desc_)
Definition: timing.h:165
int run_potrf
struct tm * tm_info
int diag_nrows
#define HICMA_STARSH_PROB_RND
int check
Definition: testing_zposv.c:76
void fwrite_array(int m, int n, int ld, double *arr, char *file)
int num_mpi_ranks
int RunTest(int *iparam, double *dparam, morse_time_t *t_, char *rankfile)
STARSH_blrf * starsh_format
Definition: hicma_struct.h:22
int store_only_diagonal_tiles
int global_omit_computation
int HICMA_zgytlr_Tile(MORSE_enum uplo, MORSE_desc_t *AUV, MORSE_desc_t *AD, MORSE_desc_t *Ark, unsigned long long int seed, int maxrank, double tol, int compress_diag, MORSE_desc_t *Dense)
Definition: zgytlr.c:93
#define HICMA_STARSH_PROB_SS
time_t timer
int calc_rank_stat
#define START_TIMING()
Definition: timing.h:259
int print_index
int HICMA_zpotrf_Tile(MORSE_enum uplo, MORSE_desc_t *AUV, MORSE_desc_t *AD, MORSE_desc_t *Ark, int rk, int maxrk, double acc)
Definition: zpotrf.c:79
int main_print_mat
int global_check
#define STOP_TIMING()
Definition: timing.h:265
double timediff(struct timeval begin, struct timeval end)
#define PASTE_CODE_IPARAM_LOCALS(iparam)
Definition: timing.h:125
double * theta
Definition: hicma_struct.h:25
int use_scratch
#define PROGRESS(str)
int global_fixed_rank