HiCMA
Hierarchical Computations on Manycore Architectures
codelet_ztrsm.c
Go to the documentation of this file.
1 
16 #include "hicma.h"
17 #include "hicma_common.h"
18 #include "auxdescutil.h"
19 #include "coreblas.h"
20 #include "coreblas/lapacke.h"
21 #include "morse.h"
22 #include "runtime/starpu/chameleon_starpu.h"
23 //#include "runtime/starpu/include/runtime_codelet_z.h"
24 
25 #include <sys/time.h>
26 
27 #include "runtime/starpu/runtime_codelets.h"
/* Brings the trsm_hcore codelet into scope for this file. Presumably this
 * declares the cl_ztrsm_hcore object used by HICMA_TASK_ztrsm -- confirm
 * against the macro definition in runtime/starpu/runtime_codelets.h. */
28 ZCODELETS_HEADER(trsm_hcore)
29 
30 //TODO: update this definition. It was only copy-pasted from runtime/starpu/codelets/codelet_zcallback.c
31 /*CHAMELEON_CL_CB(ztrsm_hcore, starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0, M*M*N)*/
32 
/* Redefine CBLAS_SADDR as the identity: in this file the scalar it wraps
 * (alpha, a double) is handed to cblas_dtrsm by value, not by address. */
33 #undef CBLAS_SADDR
34 #define CBLAS_SADDR(_val) (_val)
35 
37 
38 void HICMA_TASK_ztrsm(const MORSE_option_t *options,
39  MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag,
40  int m,
41  double alpha, const MORSE_desc_t *A, int Am, int An, int lda,
42  const MORSE_desc_t *BUV, int Bm, int Bn, int ldb, const MORSE_desc_t *Brk)
43 {
44  int nBUV = BUV->nb;
45  struct starpu_codelet *codelet = &cl_ztrsm_hcore;
46  /*void (*callback)(void*) = options->profiling ? cl_ztrsm_hcore_callback : NULL;*/
47  void (*callback)(void*) = NULL;
48  int sizeA = lda*m;
49  int sizeB = ldb; //*nb; //@KADIR converted n to nb FIXME Size of B will be determined at runtime!!!
50  int execution_rank = BUV->get_rankof( BUV, Bm, Bn );
51  int rank_changed=0;
52  (void)execution_rank;
53 
54  /* force execution on the rank owning the largest data (tile) */
55  int threshold;
56  char* env = getenv("MORSE_COMM_FACTOR_THRESHOLD");
57  if (env != NULL)
58  threshold = (unsigned)atoi(env);
59  else
60  threshold = 10;
61  if ( sizeA > threshold*sizeB ){
62  execution_rank = A->get_rankof( A, Am, An );
63  rank_changed=1;
64  }
65  MORSE_BEGIN_ACCESS_DECLARATION;
66  MORSE_ACCESS_R(A, Am, An);
67  MORSE_ACCESS_RW(BUV, Bm, Bn);
68 #if !defined(HICMA_ALWAYS_FIX_RANK)
69  MORSE_ACCESS_R(Brk, Bm, Bn);
70 #endif
71  if (rank_changed)
72  MORSE_RANK_CHANGED(execution_rank);
73  MORSE_END_ACCESS_DECLARATION;
74 
75  starpu_insert_task(
76  starpu_mpi_codelet(codelet),
77  STARPU_VALUE, &side, sizeof(MORSE_enum),
78  STARPU_VALUE, &uplo, sizeof(MORSE_enum),
79  STARPU_VALUE, &transA, sizeof(MORSE_enum),
80  STARPU_VALUE, &diag, sizeof(MORSE_enum),
81  STARPU_VALUE, &m, sizeof(int),
82  STARPU_VALUE, &alpha, sizeof(double),
83  STARPU_R, RTBLKADDR(A, double, Am, An),
84  STARPU_VALUE, &lda, sizeof(int),
85  STARPU_RW, RTBLKADDR(BUV, double, Bm, Bn),
86  STARPU_VALUE, &ldb, sizeof(int),
87 #if !defined(HICMA_ALWAYS_FIX_RANK)
88  STARPU_R, RTBLKADDR(Brk, double, Bm, Bn),
89 #endif
90  STARPU_VALUE, &Am, sizeof(int),
91  STARPU_VALUE, &An, sizeof(int),
92  STARPU_VALUE, &Bm, sizeof(int),
93  STARPU_VALUE, &Bn, sizeof(int),
94  STARPU_VALUE, &nBUV, sizeof(int),
95  STARPU_PRIORITY, options->priority,
96  STARPU_CALLBACK, callback,
97 #if defined(CHAMELEON_USE_MPI)
98  STARPU_EXECUTE_ON_NODE, execution_rank,
99 #endif
100 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
101  STARPU_NAME, "hcore_ztrsm",
102 #endif
103  0);
104 }
105 
106 
107 #if !defined(CHAMELEON_SIMULATION)
/*
 * CPU implementation of the TRSM codelet.
 *
 * descr[0] is the A tile and descr[1] the BUV tile; descr[2] is the rank
 * tile Brk and is only read when HICMA_ALWAYS_FIX_RANK is not defined.
 * cl_arg carries the packed scalars, unpacked below in the order
 * side, uplo, transA, diag, m, alpha, lda, ldb, Am, An, Bm, Bn, nBUV.
 *
 * The function solves, in place, a triangular system with cblas_dtrsm on
 * an m-by-rank block that starts at column nBUV/2 of the BUV buffer.
 * It returns immediately (doing nothing) when either
 * HICMA_DISABLE_ALL_COMPUTATIONS or HICMA_DISABLE_HCORE_COMPUTATIONS is
 * defined, and calls exit(1) if the compile-time HICMA_ALWAYS_FIX_RANK
 * setting disagrees with HICMA_get_always_fixed_rank().
 *
 * NOTE(review): this listing skips original line 184 (numbering jumps from
 * 183 to 185) and the closing brace on line 190 has no visible opening
 * statement; a conditional guarding the timing printf appears to be
 * missing from this view -- confirm against the real source file.
 */
108 static void cl_ztrsm_hcore_cpu_func(void *descr[], void *cl_arg)
109 {
110 #ifdef HICMA_DISABLE_ALL_COMPUTATIONS
111  return;
112 #endif
113 #ifdef HICMA_DISABLE_HCORE_COMPUTATIONS
114  return;
115 #endif
 /* Start of the wall-clock measurement reported by the "-TRSM" printf. */
116  struct timeval tvalBefore, tvalAfter; // removed comma
117  gettimeofday (&tvalBefore, NULL);
118  MORSE_enum side;
119  MORSE_enum uplo;
120  MORSE_enum transA;
121  MORSE_enum diag;
122  int m;
123  double alpha;
124  double *A;
125  int lda;
126  double *BUV;
127  int ldb;
128  double *Brk;
129  int Am;
130  int An;
131  int Bm;
132  int Bn;
133  int nBUV;
134 
 /* Buffer indices follow the handle order used at task insertion. */
135  A = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
136  BUV = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
137 #if !defined(HICMA_ALWAYS_FIX_RANK)
138  Brk = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
 /* Runtime asks for fixed rank but the build carries a rank tile: abort. */
139  if(HICMA_get_always_fixed_rank() == 1){
140  fprintf(stderr, "global_always_fixed_rank is one. But HICMA_ALWAYS_FIX_RANK is not defined. Exiting...\n");
141  exit(1);
142  }
143 #else
 /* Build has no rank tile, so the runtime setting must be fixed rank. */
144  if(HICMA_get_always_fixed_rank() != 1){
145  fprintf(stderr, "global_always_fixed_rank must be one. But it is %d. Exiting...\n", HICMA_get_always_fixed_rank());
146  exit(1);
147  }
148 #endif
 /* Rank used as the number of columns of B. In the fixed-rank build Brk is
  * never assigned, but the check above guarantees the else branch is not
  * taken there. Otherwise the rank is the first entry of the Brk tile,
  * converted from double to int. */
149  int _Brk;
150  if(HICMA_get_always_fixed_rank() == 1){
151  _Brk = HICMA_get_fixed_rank();
152  } else {
153  _Brk = Brk[0];
154  }
155 
156  starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &alpha, &lda, &ldb, &Am, &An, &Bm, &Bn, &nBUV);
157 
 /* B starts ldb*(nBUV/2) elements into the BUV buffer, i.e. at column
  * nBUV/2 for leading dimension ldb. Presumably the first half holds the
  * U factor and B is the V factor -- TODO confirm. */
158  int nBU = nBUV/2;
159  size_t nelm_BU = (size_t)ldb * (size_t)nBU;
160  double *B = &(BUV[nelm_BU]);
161 
162  /*CORE_ztrsm(side, uplo,*/
163  /*transA, diag,*/
164  /*m, n,*/
165  /*alpha, A, lda,*/
166  /*B, ldb);*/
 /* Optional tracing of tile indices and of the input matrices. */
167  if(HICMA_get_print_index() == 1){
168  printf("%d+TRSM\t|AD(%d,%d) BV(%d,%d)%d m:%d lda(11):%d ldb(12):%d\n",MORSE_My_Mpi_Rank(),Am,An, Bm, Bn, _Brk, m, lda, ldb);
169  }
170  if(HICMA_get_print_mat() == 1){
171  printf("%d\ttrsm-input A\n", __LINE__);
172  _printmat(A, m, m, lda);
173  printf("%d\ttrsm-input B\n", __LINE__);
174  _printmat(B, m, _Brk, ldb);
175  }
 /* In-place triangular solve on the m-by-_Brk block B (column-major). */
176  cblas_dtrsm(
177  CblasColMajor,
178  (CBLAS_SIDE)side, (CBLAS_UPLO)uplo,
179  (CBLAS_TRANSPOSE)transA, (CBLAS_DIAG)diag,
180  m,
181  _Brk,
182  CBLAS_SADDR(alpha), A, lda,
183  B, ldb);
 /* NOTE(review): original line 184 is not present in this listing; the
  * block below is closed by the brace on line 190 -- confirm its guard. */
 /* Elapsed time since tvalBefore, printed in seconds. */
185  gettimeofday (&tvalAfter, NULL);
186  printf("%d-TRSM\t|AD(%d,%d)%dx%d-%d BV(%d,%d)%dx%d-%d m:%d\t\t\t\tTRSM: %.4f\n",MORSE_My_Mpi_Rank(),Am,An, m, m, lda,Bm, Bn, m, _Brk, ldb, m,
187  (tvalAfter.tv_sec - tvalBefore.tv_sec)
188  +(tvalAfter.tv_usec - tvalBefore.tv_usec)/1000000.0
189  );
190  }
 /* Optional dump of the solved block. */
191  if(HICMA_get_print_mat() == 1){
192  printf("%d\ttrsm-output\n", __LINE__);
193  _printmat(B, m, _Brk, ldb);
194  }
195 }
196 #endif /* !defined(CHAMELEON_SIMULATION) */
197 
198 /*
199  * Codelet definition
200  */
/* Define the ztrsm_hcore codelet with a CPU implementation only (the CUDA
 * variants are commented out). The numeric argument differs between the two
 * builds: 2 with HICMA_ALWAYS_FIX_RANK, 3 otherwise. Presumably it is the
 * number of data buffers (A, BUV and, in the second case, Brk), matching the
 * handles passed at task insertion -- confirm against CODELETS_CPU. */
201 #if defined(HICMA_ALWAYS_FIX_RANK)
202 CODELETS_CPU(ztrsm_hcore, 2, cl_ztrsm_hcore_cpu_func)
203 // CODELETS(ztrsm_hcore, 2, cl_ztrsm_hcore_cpu_func, cl_ztrsm_hcore_cuda_func, STARPU_CUDA_ASYNC)
204 #else
205 CODELETS_CPU(ztrsm_hcore, 3, cl_ztrsm_hcore_cpu_func)
206 // CODELETS(ztrsm_hcore, 3, cl_ztrsm_hcore_cpu_func, cl_ztrsm_hcore_cuda_func, STARPU_CUDA_ASYNC)
207 #endif
#define A(m, n)
Definition: pzgemm.c:56
int trsm_print_index_end
Definition: codelet_ztrsm.c:36
#define BUV(m, n)
Definition: pzgemm.c:61
int side[2]
#define CBLAS_SADDR(_val)
Definition: codelet_ztrsm.c:34
int diag[2]
int HICMA_get_print_mat()
Definition: hicma_init.c:56
void _printmat(double *A, int m, int n, int ld)
int HICMA_get_print_index_end()
Definition: hicma_init.c:53
int uplo[2]
int HICMA_get_fixed_rank()
Definition: hicma_init.c:43
int HICMA_get_print_index()
Definition: hicma_init.c:50
void HICMA_TASK_ztrsm(const MORSE_option_t *options, MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, int m, double alpha, const MORSE_desc_t *A, int Am, int An, int lda, const MORSE_desc_t *BUV, int Bm, int Bn, int ldb, const MORSE_desc_t *Brk)
Definition: codelet_ztrsm.c:38
#define B(m, n)
Definition: pzgemm.c:57
int HICMA_get_always_fixed_rank()
Definition: hicma_init.c:40