mt/bp_matmul/matmul_mi.c

   1 //**************************************************************************
   2 // Multi-threaded Matrix Multiply benchmark
   3 //--------------------------------------------------------------------------
   4 // TA     : Christopher Celio
   5 // Student:
   6 //
   7 //
   8 // This benchmark multiplies two 2-D arrays together and writes the results to
   9 // a third vector. The input data (and reference data) should be generated
  10 // using the matmul_gendata.pl perl script and dumped to a file named
  11 // dataset.h.
  12
  13
  14 // print out arrays, etc.
  15 //#define DEBUG
  16
  17 //--------------------------------------------------------------------------
  18 // Includes
  19
  20 #include <string.h>
  21 #include <stdlib.h>
  22 #include <stdio.h>
  23
  24
  25 //--------------------------------------------------------------------------
  26 // Input/Reference Data
  27
  28 typedef float data_t;
  29 #include "dataset.h"
  30
  31
  32 //--------------------------------------------------------------------------
  33 // Basic Utilities and Multi-thread Support
  34
  35 __thread unsigned long coreid;
  36 unsigned long ncores;
  37
  38 #include "util.h"
  39
  40 #define stringify_1(s) #s
  41 #define stringify(s) stringify_1(s)
  42 #define stats(code) do { \
  43     unsigned long _c = -rdcycle(), _i = -rdinstret(); \
  44     code; \
  45     _c += rdcycle(), _i += rdinstret(); \
  46     if (coreid == 0) \
  47       printf("%s: %ld cycles, %ld.%ld cycles/iter, %ld.%ld CPI\n", \
  48              stringify(code), _c, _c/DIM_SIZE/DIM_SIZE/DIM_SIZE, 10*_c/DIM_SIZE/DIM_SIZE/DIM_SIZE%10, _c/_i, 10*_c/_i%10); \
  49   } while(0)
  50
  51
  52 //--------------------------------------------------------------------------
  53 // Helper functions
  54
  55 void printArray( char name[], int n, data_t arr[] )
  56 {
  57    int i;
  58    if (coreid != 0)
  59       return;
  60
  61    printf( " %10s :", name );
  62    for ( i = 0; i < n; i++ )
  63       printf( " %3ld ", (long) arr[i] );
  64    printf( "\n" );
  65 }
  66
  67 void __attribute__((noinline)) verify(size_t n, const data_t* test, const data_t* correct)
  68 {
  69    if (coreid != 0)
  70       return;
  71
  72    size_t i;
  73    for (i = 0; i < n; i++)
  74    {
  75       if (test[i] != correct[i])
  76       {
  77          printf("FAILED test[%d]= %3ld, correct[%d]= %3ld\n",
  78             i, (long)test[i], i, (long)correct[i]);
  79          exit(-1);
  80       }
  81    }
  82
  83    return;
  84 }
  85
  86 //--------------------------------------------------------------------------
  87 // matmul function
  88
  89 // single-thread, naive version
  90 void __attribute__((noinline)) matmul_naive(const int lda,  const data_t A[], const data_t B[], data_t C[] )
  91 {
  92    int i, j, k;
  93
  94    if (coreid > 0)
  95       return;
  96
  97    for ( i = 0; i < lda; i++ )
  98       for ( j = 0; j < lda; j++ )
  99       {
 100          for ( k = 0; k < lda; k++ )
 101          {
 102             C[i + j*lda] += A[j*lda + k] * B[k*lda + i];
 103          }
 104       }
 105
 106 }
 107
 108 void __attribute__((noinline)) matmul_MI_transpose(const int lda,  const data_t A[], const data_t B[], data_t C[] )
 109 {
 110     int i, j, k;
 111     data_t B_trans[32*32];
 112     data_t acc_temp0, acc_temp1;
 113     data_t *A_j, *B_i;
 114     data_t *A_j_k, *B_i_k;
 115     int z;
 116
 117     //for (i = 0; i < 32; i++) {
 118     //    for (j = 0; j < 32; j++) {
 119     //        B_trans[i*lda+j] = B[i+j*lda];
 120     //    }
 121     //}
 122
 123     if (coreid == 0) {
 124         for (i = 0; i < 32; i++) {
 125             B_i = B_trans+i*32;
 126             for (z = 0; z < 32; z++) {
 127                 *(B_i+z) = B[i+z*32];
 128             }
 129             for (j = 0; j < 16; j+=2) {
 130                 A_j = A+j*lda;
 131                 acc_temp0 = 0;
 132                 for (k = 0; k < 32; k+=8) {
 133                     A_j_k = A_j+k;
 134                     B_i_k = B_i+k;
 135                     acc_temp0 += *(A_j_k)     * *(B_i_k);
 136                     acc_temp0 += *(A_j_k + 1) * *(B_i_k + 1);
 137                     acc_temp0 += *(A_j_k + 2) * *(B_i_k + 2);
 138                     acc_temp0 += *(A_j_k + 3) * *(B_i_k + 3);
 139                     acc_temp0 += *(A_j_k + 4) * *(B_i_k + 4);
 140                     acc_temp0 += *(A_j_k + 5) * *(B_i_k + 5);
 141                     acc_temp0 += *(A_j_k + 6) * *(B_i_k + 6);
 142                     acc_temp0 += *(A_j_k + 7) * *(B_i_k + 7);
 143                 }
 144                 A_j += 32;
 145
 146                 acc_temp1 = 0;
 147                 for (k = 0; k < 32; k+=8) {
 148                     acc_temp1 += *(A_j+k) * *(B_i+k);
 149                     acc_temp1 += *(A_j+k + 1) * *(B_i+k + 1);
 150                     acc_temp1 += *(A_j+k + 2) * *(B_i+k + 2);
 151                     acc_temp1 += *(A_j+k + 3) * *(B_i+k + 3);
 152                     acc_temp1 += *(A_j+k + 4) * *(B_i+k + 4);
 153                     acc_temp1 += *(A_j+k + 5) * *(B_i+k + 5);
 154                     acc_temp1 += *(A_j+k + 6) * *(B_i+k + 6);
 155                     acc_temp1 += *(A_j+k + 7) * *(B_i+k + 7);
 156                 }
 157
 158                 C[i + j*lda] = acc_temp0;
 159                 C[i + (j+1)*lda] = acc_temp1;
 160             }
 161         }
 162     } else if (coreid == 1) {
 163         for (i = 0; i < 32; i++) {
 164             B_i = B_trans+i*32;
 165             for (z = 0; z < 32; z++) {
 166                 *(B_i+z) = B[i+z*32];
 167             }
 168             for (j = 16; j < 32; j+=2) {
 169                 A_j = A+j*lda;
 170                 acc_temp0 = 0;
 171                 for (k = 0; k < 32; k+=8) {
 172                     acc_temp0 += *(A_j+k) * *(B_i+k);
 173                     acc_temp0 += *(A_j+k + 1) * *(B_i+k + 1);
 174                     acc_temp0 += *(A_j+k + 2) * *(B_i+k + 2);
 175                     acc_temp0 += *(A_j+k + 3) * *(B_i+k + 3);
 176                     acc_temp0 += *(A_j+k + 4) * *(B_i+k + 4);
 177                     acc_temp0 += *(A_j+k + 5) * *(B_i+k + 5);
 178                     acc_temp0 += *(A_j+k + 6) * *(B_i+k + 6);
 179                     acc_temp0 += *(A_j+k + 7) * *(B_i+k + 7);
 180                 }
 181                 A_j += 32;
 182
 183                 acc_temp1 = 0;
 184                 for (k = 0; k < 32; k+=8) {
 185                     acc_temp1 += *(A_j+k) * *(B_i+k);
 186                     acc_temp1 += *(A_j+k + 1) * *(B_i+k + 1);
 187                     acc_temp1 += *(A_j+k + 2) * *(B_i+k + 2);
 188                     acc_temp1 += *(A_j+k + 3) * *(B_i+k + 3);
 189                     acc_temp1 += *(A_j+k + 4) * *(B_i+k + 4);
 190                     acc_temp1 += *(A_j+k + 5) * *(B_i+k + 5);
 191                     acc_temp1 += *(A_j+k + 6) * *(B_i+k + 6);
 192                     acc_temp1 += *(A_j+k + 7) * *(B_i+k + 7);
 193                 }
 194                 C[i + j*lda] = acc_temp0;
 195                 C[i + (j+1)*lda] = acc_temp1;
 196             }
 197         }
 198     }
 199 }
 200
 201 void __attribute__((noinline)) matmul_MI(const int lda,  const data_t A[], const data_t B[], data_t C[] )
 202 {
 203     int i, j, k;
 204     data_t acc_temp;
 205     data_t *A_j, *B_i;
 206     int j_start = coreid*16;
 207     int j_end = (coreid*16)+16;
 208     if (coreid == 0) {
 209         for ( i = 0; i < 32; i++ ) {
 210             B_i = B + i;
 211             for ( j = j_start; j < j_end; j++ )
 212             {
 213                 acc_temp = 0;
 214                 A_j = A + j*32;
 215                 for ( k = 0; k < 32; k++ )
 216                 {
 217                     acc_temp += *(A_j + k) * *(B_i + k*32);
 218                 }
 219                 C[i + j*32] = acc_temp;
 220             }
 221         }
 222     } else if (coreid == 1) {
 223         for ( i = 16; i < 32; i++ ) {
 224             B_i = B + i;
 225             for ( j = j_start; j < j_end; j++ )
 226             {
 227                 acc_temp = 0;
 228                 A_j = A + j*32;
 229                 for ( k = 0; k < 32; k+=4 )
 230                 {
 231                     acc_temp += *(A_j + k) * *(B_i + k*32);
 232                     acc_temp += *(A_j + k + 1) * *(B_i + (k+1)*32);
 233                     acc_temp += *(A_j + k + 2) * *(B_i + (k+2)*32);
 234                     acc_temp += *(A_j + k + 3) * *(B_i + (k+3)*32);
 235                 }
 236                 C[i + j*32] = acc_temp;
 237             }
 238         }
 239         for ( i = 0; i < 16; i++ ) {
 240             B_i = B + i;
 241             for ( j = j_start; j < j_end; j++ )
 242             {
 243                 acc_temp = 0;
 244                 A_j = A + j*32;
 245                 for ( k = 0; k < 32; k+=4 )
 246                 {
 247                     acc_temp += *(A_j + k) * *(B_i + k*32);
 248                     acc_temp += *(A_j + k + 1) * *(B_i + (k+1)*32);
 249                     acc_temp += *(A_j + k + 2) * *(B_i + (k+2)*32);
 250                     acc_temp += *(A_j + k + 3) * *(B_i + (k+3)*32);
 251                 }
 252                 C[i + j*32] = acc_temp;
 253             }
 254         }
 255
 256     }
 257 }
 258
 259 void __attribute__((noinline)) matmul_MSI(const int lda,  const data_t A[], const data_t B[], data_t C[] )
 260 {
 261     int i, j, k;
 262     data_t acc_temp;
 263     data_t *A_j, *B_i;
 264     int j_start = coreid*16;
 265     int j_end = (coreid*16)+16;
 266     for ( i = 0; i < 32; i++ ) {
 267         B_i = B + i;
 268         for ( j = j_start; j < j_end; j++ )
 269         {
 270             acc_temp = 0;
 271             A_j = A + j*32;
 272             for ( k = 0; k < 32; k++ )
 273             {
 274                 acc_temp += *(A_j + k) * *(B_i + k*32);
 275             }
 276             C[i + j*32] = acc_temp;
 277         }
 278     }
 279 }
 280
 281 void __attribute__((noinline)) matmul(const int lda,  const data_t A[], const data_t B[], data_t C[] )
 282 {
 283
 284     // ***************************** //
 285     // **** ADD YOUR CODE HERE ***** //
 286     // ***************************** //
 287     //
 288     // feel free to make a separate function for MI and MSI versions.
 289     // ENABLE_SHARING = false is MI
 290     // ENABLE_SHARING = true is MSI
 291     matmul_MI_transpose(lda, A, B, C);
 292     //matmul_MSI(lda, A, B, C);
 293 }
 294
 295 //--------------------------------------------------------------------------
 296 // Main
 297 //
 298 // all threads start executing thread_entry(). Use their "coreid" to
 299 // differentiate between threads (each thread is running on a separate core).
 300
 301 void thread_entry(int cid, int nc)
 302 {
 303    coreid = cid;
 304    ncores = nc;
 305
 306    // static allocates data in the binary, which is visible to both threads
 307    static data_t results_data[ARRAY_SIZE];
 308
 309
 310 //   // Execute the provided, naive matmul
 311 //   barrier();
 312 //   //stats(matmul_naive(DIM_SIZE, input1_data, input2_data, results_data); barrier());
 313 //
 314 //
 315 //   // verify
 316 //   //verify(ARRAY_SIZE, results_data, verify_data);
 317 //
 318 //   // clear results from the first trial
 319 //   size_t i;
 320 //   if (coreid == 0)
 321 //      for (i=0; i < ARRAY_SIZE; i++)
 322 //         results_data[i] = 0;
 323 //   barrier();
 324
 325
 326    // Execute your faster matmul
 327    barrier();
 328    stats(matmul(DIM_SIZE, input1_data, input2_data, results_data); barrier());
 329
 330 #ifdef DEBUG
 331    printArray("results:", ARRAY_SIZE, results_data);
 332    printArray("verify :", ARRAY_SIZE, verify_data);
 333 #endif
 334
 335    // verify
 336    verify(ARRAY_SIZE, results_data, verify_data);
 337    barrier();
 338
 339    exit(0);
 340 }
 341