//**************************************************************************
// Multi-threaded Matrix Multiply benchmark
//--------------------------------------------------------------------------
// TA     : Christopher Celio
// Student:
//
//
// This benchmark multiplies two 2-D arrays together and writes the result to
// a third array. The input data (and reference data) should be generated
// using the matmul_gendata.pl perl script and dumped to a file named
// dataset.h.


// print out arrays, etc.
//#define DEBUG

//--------------------------------------------------------------------------
// Includes

#include <string.h>
#include <stdlib.h>
#include <stdio.h>

//--------------------------------------------------------------------------
// Input/Reference Data

typedef float data_t;
#include "dataset.h"
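
// dataset.h is generated by matmul_gendata.pl. Based on how it is used below,
// it is expected to provide something along these lines (a sketch only --
// the actual sizes and values come from the generator):
//
//   #define ARRAY_SIZE 1024                               // DIM_SIZE * DIM_SIZE
//   #define DIM_SIZE   32
//   static data_t input1_data[ARRAY_SIZE] = { ... };      // matrix A
//   static data_t input2_data[ARRAY_SIZE] = { ... };      // matrix B
//   static data_t verify_data[ARRAY_SIZE] = { ... };      // reference result A*B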


//--------------------------------------------------------------------------
// Basic Utilities and Multi-thread Support

__thread unsigned long coreid;
unsigned long ncores;

#include "util.h"

#define stringify_1(s) #s
#define stringify(s) stringify_1(s)
#define stats(code) do { \
    unsigned long _c = -rdcycle(), _i = -rdinstret(); \
    code; \
    _c += rdcycle(), _i += rdinstret(); \
    if (coreid == 0) \
      printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
             stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
  } while(0)
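
// Typical use, as in thread_entry() below -- time the kernel together with the
// trailing barrier so the report reflects the slowest core:
//
//   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
//
// Only core 0 prints. rdcycle(), rdinstret() and barrier() are not defined in
// this file; they are presumably provided by util.h.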


//--------------------------------------------------------------------------
// Helper functions

void printArray( char name[], int n, data_t arr[] )
{
   int i;
   if (coreid != 0)
      return;

   printf( " %10s :", name );
   for ( i = 0; i < n; i++ )
      printf( " %3ld ", (long) arr[i] );
   printf( "\n" );
}

void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
{
   if (coreid != 0)
      return;

   size_t i;
   for (i = 0; i < n; i++)
   {
      if (test[i] != correct[i])
      {
         printf("FAILED test[%ld]= %3ld, correct[%ld]= %3ld\n",
                (long)i, (long)test[i], (long)i, (long)correct[i]);
         exit(-1);
      }
   }

   return;
}

//--------------------------------------------------------------------------
// matmul function

// single-thread, naive version
void __attribute__((noinline)) matmul_naive(const int lda, const data_t A[], const data_t B[], data_t C[] )
{
   int i, j, k;

   if (coreid > 0)
      return;

   for ( i = 0; i < lda; i++ )
   {
      for ( j = 0; j < lda; j++ )
      {
         for ( k = 0; k < lda; k++ )
         {
            C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
         }
      }
   }
}



void __attribute__((noinline)) matmul(const int lda, const data_t A[], const data_t B[], data_t C[] )
{

   // ***************************** //
   // **** ADD YOUR CODE HERE ***** //
   // ***************************** //
   //
   // Feel free to make separate functions for the MI and MSI versions.

   int i, j, k;

   // Earlier attempt (noted at 547287, presumably the cycle count): split the
   // output columns across the cores and fully unroll the dot product by hand.
   // B is still walked column-wise, so this is disabled in favor of the
   // version below.
#if 0
   for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
   {
      for ( j = 0; j < lda; j++ )
      {
         int aIndex = j*lda;
         int cIndex = i + aIndex;
         C[cIndex] += A[aIndex] * B[i];
         C[cIndex] += A[aIndex + 1] * B[1*lda + i];
         C[cIndex] += A[aIndex + 2] * B[2*lda + i];
         C[cIndex] += A[aIndex + 3] * B[3*lda + i];
         C[cIndex] += A[aIndex + 4] * B[4*lda + i];
         C[cIndex] += A[aIndex + 5] * B[5*lda + i];
         C[cIndex] += A[aIndex + 6] * B[6*lda + i];
         C[cIndex] += A[aIndex + 7] * B[7*lda + i];
         C[cIndex] += A[aIndex + 8] * B[8*lda + i];
         C[cIndex] += A[aIndex + 9] * B[9*lda + i];
         C[cIndex] += A[aIndex + 10] * B[10*lda + i];
         C[cIndex] += A[aIndex + 11] * B[11*lda + i];
         C[cIndex] += A[aIndex + 12] * B[12*lda + i];
         C[cIndex] += A[aIndex + 13] * B[13*lda + i];
         C[cIndex] += A[aIndex + 14] * B[14*lda + i];
         C[cIndex] += A[aIndex + 15] * B[15*lda + i];
         C[cIndex] += A[aIndex + 16] * B[16*lda + i];
         C[cIndex] += A[aIndex + 17] * B[17*lda + i];
         C[cIndex] += A[aIndex + 18] * B[18*lda + i];
         C[cIndex] += A[aIndex + 19] * B[19*lda + i];
         C[cIndex] += A[aIndex + 20] * B[20*lda + i];
         C[cIndex] += A[aIndex + 21] * B[21*lda + i];
         C[cIndex] += A[aIndex + 22] * B[22*lda + i];
         C[cIndex] += A[aIndex + 23] * B[23*lda + i];
         C[cIndex] += A[aIndex + 24] * B[24*lda + i];
         C[cIndex] += A[aIndex + 25] * B[25*lda + i];
         C[cIndex] += A[aIndex + 26] * B[26*lda + i];
         C[cIndex] += A[aIndex + 27] * B[27*lda + i];
         C[cIndex] += A[aIndex + 28] * B[28*lda + i];
         C[cIndex] += A[aIndex + 29] * B[29*lda + i];
         C[cIndex] += A[aIndex + 30] * B[30*lda + i];
         C[cIndex] += A[aIndex + 31] * B[31*lda + i];
      }
   }
#endif

   // Earlier attempt (492827): same column split, with a rolled inner loop
   // over k. A partially unrolled variant is kept below, still commented out.
#if 0
   for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
   {
      for ( j = 0; j < lda; j++ )
      {
         int aIndex = j*lda;
         int cIndex = i + aIndex;
         for ( k = 0; k < lda; k++ )
         {
            C[cIndex] += A[aIndex + k] * B[k*lda + i];
            /*
            C[cIndex] += A[aIndex + k+1] * B[(k+1)*lda + i];
            C[cIndex] += A[aIndex + k+2] * B[(k+2)*lda + i];
            C[cIndex] += A[aIndex + k+3] * B[(k+3)*lda + i];
            C[cIndex] += A[aIndex + k+4] * B[(k+4)*lda + i];
            C[cIndex] += A[aIndex + k+5] * B[(k+5)*lda + i];
            C[cIndex] += A[aIndex + k+6] * B[(k+6)*lda + i];
            C[cIndex] += A[aIndex + k+7] * B[(k+7)*lda + i];
            C[cIndex] += A[aIndex + k+8] * B[(k+8)*lda + i];
            C[cIndex] += A[aIndex + k+9] * B[(k+9)*lda + i];
            C[cIndex] += A[aIndex + k+10] * B[(k+10)*lda + i];
            C[cIndex] += A[aIndex + k+11] * B[(k+11)*lda + i];
            C[cIndex] += A[aIndex + k+12] * B[(k+12)*lda + i];
            C[cIndex] += A[aIndex + k+13] * B[(k+13)*lda + i];
            C[cIndex] += A[aIndex + k+14] * B[(k+14)*lda + i];
            C[cIndex] += A[aIndex + k+15] * B[(k+15)*lda + i];
            */
         }
      }
   }
#endif

   // Earlier attempt (326378): transpose B into a local scratch array first so
   // the inner dot product has unit stride, split the rows of C across the
   // cores, and unroll by 8. Superseded by the version below.
#if 0
   data_t bTrans[1024];

   for (int counti = 0; counti < 32; counti++) {
      for (int countj = 0; countj < 32; countj++) {
         bTrans[counti + countj*lda] = B[countj + counti*lda];
      }
   }

   int BLOCKSIZE = 8;
   for ( i = 0; i < lda; i += BLOCKSIZE )
   {
      for ( int iTemp = i; iTemp < i + BLOCKSIZE; iTemp++ ) {
         int iFlag = iTemp*lda;
         for ( j = coreid*lda/ncores; j < (coreid+1)*lda/ncores; j++ ) {
            int jFlag = j*lda;
            int cLoc = jFlag + iTemp;
            for ( k = 0; k < lda; k += 8 ) {
               C[cLoc] += A[jFlag+k]   * bTrans[iFlag+k];
               C[cLoc] += A[jFlag+k+1] * bTrans[iFlag+k+1];
               C[cLoc] += A[jFlag+k+2] * bTrans[iFlag+k+2];
               C[cLoc] += A[jFlag+k+3] * bTrans[iFlag+k+3];
               C[cLoc] += A[jFlag+k+4] * bTrans[iFlag+k+4];
               C[cLoc] += A[jFlag+k+5] * bTrans[iFlag+k+5];
               C[cLoc] += A[jFlag+k+6] * bTrans[iFlag+k+6];
               C[cLoc] += A[jFlag+k+7] * bTrans[iFlag+k+7];
            }
         }
      }
   }
#endif

   // Final version: B is first transposed into a local scratch array so the
   // inner dot product walks both A and bTrans with unit stride. Each core
   // produces a contiguous slice of the output columns, and the dot product
   // is unrolled by 16.

   data_t bTrans[1024];   // scratch copy of B transposed (sized for DIM_SIZE == 32)

   for (int counti = 0; counti < lda; counti++) {
      for (int countj = 0; countj < lda; countj++) {
         bTrans[counti + countj*lda] = B[countj + counti*lda];
      }
   }

   int BLOCKSIZE = 8;
   for ( j = 0; j < lda; j++ )
   {
      int jFlag = j*lda;   // start of row j in A and C

      // This core only computes columns [coreid*lda/ncores, (coreid+1)*lda/ncores) of C.
      for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i += BLOCKSIZE ) {
         for ( int iTemp = i; iTemp < i + BLOCKSIZE; iTemp++ ) {
            int iFlag = iTemp*lda;   // start of row iTemp in bTrans (= column iTemp of B)
            int cLoc = jFlag + iTemp;

            // C[j*lda + iTemp] += dot(row j of A, column iTemp of B), unrolled by 16.
            for ( k = 0; k < lda; k += 16 ) {
               C[cLoc] += A[jFlag+k]    * bTrans[iFlag+k];
               C[cLoc] += A[jFlag+k+1]  * bTrans[iFlag+k+1];
               C[cLoc] += A[jFlag+k+2]  * bTrans[iFlag+k+2];
               C[cLoc] += A[jFlag+k+3]  * bTrans[iFlag+k+3];
               C[cLoc] += A[jFlag+k+4]  * bTrans[iFlag+k+4];
               C[cLoc] += A[jFlag+k+5]  * bTrans[iFlag+k+5];
               C[cLoc] += A[jFlag+k+6]  * bTrans[iFlag+k+6];
               C[cLoc] += A[jFlag+k+7]  * bTrans[iFlag+k+7];
               C[cLoc] += A[jFlag+k+8]  * bTrans[iFlag+k+8];
               C[cLoc] += A[jFlag+k+9]  * bTrans[iFlag+k+9];
               C[cLoc] += A[jFlag+k+10] * bTrans[iFlag+k+10];
               C[cLoc] += A[jFlag+k+11] * bTrans[iFlag+k+11];
               C[cLoc] += A[jFlag+k+12] * bTrans[iFlag+k+12];
               C[cLoc] += A[jFlag+k+13] * bTrans[iFlag+k+13];
               C[cLoc] += A[jFlag+k+14] * bTrans[iFlag+k+14];
               C[cLoc] += A[jFlag+k+15] * bTrans[iFlag+k+15];
            }
         }
      }
   }
}

//--------------------------------------------------------------------------
// Main
//
// all threads start executing thread_entry(). Use their "coreid" to
// differentiate between threads (each thread is running on a separate core).
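//
// For example, matmul() above gives each core the contiguous slice of output
// columns [coreid*lda/ncores, (coreid+1)*lda/ncores):
//
//   for ( i = coreid*lda/ncores; i < (coreid+1)*lda/ncores; i++ )
//      ...   // work owned exclusively by this core
//
// barrier() (presumably provided by util.h) keeps the cores in lock-step
// between phases so that timing and verification see completed results.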

void thread_entry(int cid, int nc)
{
   coreid = cid;
   ncores = nc;

   // static allocates data in the binary, which is visible to both threads
   static data_t results_data[ARRAY_SIZE];


//   // Execute the provided, naive matmul
//   barrier();
//   stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
//
//
//   // verify
//   verify(ARRAY_SIZE, results_data, verify_data);
//
//   // clear results from the first trial
//   size_t i;
//   if (coreid == 0)
//      for (i=0; i < ARRAY_SIZE; i++)
//         results_data[i] = 0;
//   barrier();

   // Execute your faster matmul
   barrier();
   stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());

#ifdef DEBUG
   printArray("results:", ARRAY_SIZE, results_data);
   printArray("verify :", ARRAY_SIZE, verify_data);
#endif

   // verify
   verify(ARRAY_SIZE, results_data, verify_data);
   barrier();

   exit(0);
}