Clean up canonical mt benchmarks and reorganize extra versions in /mt. All versions...
[riscv-tests.git] / mt / df_matmul.c
diff --git a/mt/df_matmul.c b/mt/df_matmul.c
new file mode 100755 (executable)
index 0000000..56d9c4b
--- /dev/null
@@ -0,0 +1,237 @@
+#include "stdlib.h"
+
+#include "util.h"
+
+#include "dataset.h"
+void __attribute__((noinline)) matmul(const int coreid, const int ncores, const int lda,  const data_t A[], const data_t B[], data_t C[] )
+{
+   // Accumulates the matrix product A*B into C:
+   //    C[j*lda + c] += sum over k in [0, 32) of A[j*lda + k] * B[k*lda + c]
+   // for rows j in [0, 32). The row count and the inner dimension are both
+   // hard-coded to 32 (handled as 16-wide column blocks); lda is only the row stride.
+   // Work split: coreid 0 produces columns 0..15, coreid 1 produces columns 16..31.
+   // When ncores == 1 the second branch also runs, so core 0 produces every column.
+   int j, k; // j: row of A and C; k: index along the inner (dot-product) dimension
+   data_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; // temp0..temp15: running sums for 16
+   data_t temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15; // adjacent columns of the current row
+   if(coreid == 0) { // left half of C: columns 0..15 of every row
+      // Row ranges in order: 16*0..16*(0+1), then 16*1..16*(1+1),
+      // i.e. rows 0..15 first and rows 16..31 second.
+
+      // Q1: rows 0..15, columns 0..15
+      for(j = 0; j < 16; j++) {
+         temp0  = C[j*lda]; // start from the values already in C, so the product is added to them
+         temp1  = C[1  + j*lda];
+         temp2  = C[2  + j*lda];
+         temp3  = C[3  + j*lda];
+         temp4  = C[4  + j*lda];
+         temp5  = C[5  + j*lda];
+         temp6  = C[6  + j*lda];
+         temp7  = C[7  + j*lda];
+         temp8  = C[8  + j*lda];
+         temp9  = C[9  + j*lda];
+         temp10 = C[10 + j*lda];
+         temp11 = C[11 + j*lda];
+         temp12 = C[12 + j*lda];
+         temp13 = C[13 + j*lda];
+         temp14 = C[14 + j*lda];
+         temp15 = C[15 + j*lda];
+         for(k = 0; k < 32; k++) { // each step adds A[j][k] * B[k][0..15] to the 16 sums
+            temp0  += A[j*lda + k] * B[k*lda];
+            temp1  += A[j*lda + k] * B[1+k*lda];
+            temp2  += A[j*lda + k] * B[2+k*lda];
+            temp3  += A[j*lda + k] * B[3+k*lda];
+            temp4  += A[j*lda + k] * B[4+k*lda];
+            temp5  += A[j*lda + k] * B[5+k*lda];
+            temp6  += A[j*lda + k] * B[6+k*lda];
+            temp7  += A[j*lda + k] * B[7+k*lda];
+            temp8  += A[j*lda + k] * B[8+k*lda];
+            temp9  += A[j*lda + k] * B[9+k*lda];
+            temp10 += A[j*lda + k] * B[10+k*lda];
+            temp11 += A[j*lda + k] * B[11+k*lda];
+            temp12 += A[j*lda + k] * B[12+k*lda];
+            temp13 += A[j*lda + k] * B[13+k*lda];
+            temp14 += A[j*lda + k] * B[14+k*lda];
+            temp15 += A[j*lda + k] * B[15+k*lda];
+         }
+         C[j*lda] = temp0; // store the finished sums back into row j of C
+         C[1  + j*lda] = temp1;
+         C[2  + j*lda] = temp2;
+         C[3  + j*lda] = temp3;
+         C[4  + j*lda] = temp4;
+         C[5  + j*lda] = temp5;
+         C[6  + j*lda] = temp6;
+         C[7  + j*lda] = temp7;
+         C[8  + j*lda] = temp8;
+         C[9  + j*lda] = temp9;
+         C[10 + j*lda] = temp10;
+         C[11 + j*lda] = temp11;
+         C[12 + j*lda] = temp12;
+         C[13 + j*lda] = temp13;
+         C[14 + j*lda] = temp14;
+         C[15 + j*lda] = temp15;
+      }
+      for(j = 16; j < 32; j++) { // rows 16..31, columns 0..15 (same body as the loop above; no Q label in the original)
+         temp0  = C[j*lda]; // load the current C values for this row
+         temp1  = C[1  + j*lda];
+         temp2  = C[2  + j*lda];
+         temp3  = C[3  + j*lda];
+         temp4  = C[4  + j*lda];
+         temp5  = C[5  + j*lda];
+         temp6  = C[6  + j*lda];
+         temp7  = C[7  + j*lda];
+         temp8  = C[8  + j*lda];
+         temp9  = C[9  + j*lda];
+         temp10 = C[10 + j*lda];
+         temp11 = C[11 + j*lda];
+         temp12 = C[12 + j*lda];
+         temp13 = C[13 + j*lda];
+         temp14 = C[14 + j*lda];
+         temp15 = C[15 + j*lda];
+         for(k = 0; k < 32; k++) { // adds A[j][k] * B[k][0..15] to the 16 sums
+            temp0  += A[j*lda + k] * B[k*lda];
+            temp1  += A[j*lda + k] * B[1+k*lda];
+            temp2  += A[j*lda + k] * B[2+k*lda];
+            temp3  += A[j*lda + k] * B[3+k*lda];
+            temp4  += A[j*lda + k] * B[4+k*lda];
+            temp5  += A[j*lda + k] * B[5+k*lda];
+            temp6  += A[j*lda + k] * B[6+k*lda];
+            temp7  += A[j*lda + k] * B[7+k*lda];
+            temp8  += A[j*lda + k] * B[8+k*lda];
+            temp9  += A[j*lda + k] * B[9+k*lda];
+            temp10 += A[j*lda + k] * B[10+k*lda];
+            temp11 += A[j*lda + k] * B[11+k*lda];
+            temp12 += A[j*lda + k] * B[12+k*lda];
+            temp13 += A[j*lda + k] * B[13+k*lda];
+            temp14 += A[j*lda + k] * B[14+k*lda];
+            temp15 += A[j*lda + k] * B[15+k*lda];
+         }
+         C[j*lda] = temp0; // write the sums back to row j of C
+         C[1  + j*lda] = temp1;
+         C[2  + j*lda] = temp2;
+         C[3  + j*lda] = temp3;
+         C[4  + j*lda] = temp4;
+         C[5  + j*lda] = temp5;
+         C[6  + j*lda] = temp6;
+         C[7  + j*lda] = temp7;
+         C[8  + j*lda] = temp8;
+         C[9  + j*lda] = temp9;
+         C[10 + j*lda] = temp10;
+         C[11 + j*lda] = temp11;
+         C[12 + j*lda] = temp12;
+         C[13 + j*lda] = temp13;
+         C[14 + j*lda] = temp14;
+         C[15 + j*lda] = temp15;
+      }
+   }
+   // Row ranges for the next branch, in order: 16*(2-1)..16*2, then 16*(1-1)..16*1,
+   // i.e. rows 16..31 first and rows 0..15 second (the reverse of the coreid 0 branch).
+   if(coreid == 1 || ncores == 1)   { // right half of C: columns 16..31 of every row
+         // Q3: rows 16..31, columns 16..31. NOTE(review): starting on the opposite row half from core 0 presumably keeps the cores on different rows of A - confirm.
+      for(j = 16; j < 32; j++) {
+         temp0  = C[16+j*lda]; // load the current C values for columns 16..31 of this row
+         temp1  = C[17+j*lda];
+         temp2  = C[18+j*lda];
+         temp3  = C[19+j*lda];
+         temp4  = C[20+j*lda];
+         temp5  = C[21+j*lda];
+         temp6  = C[22+j*lda];
+         temp7  = C[23+j*lda];
+         temp8  = C[24+j*lda];
+         temp9  = C[25+j*lda];
+         temp10 = C[26+j*lda];
+         temp11 = C[27+j*lda];
+         temp12 = C[28+j*lda];
+         temp13 = C[29+j*lda];
+         temp14 = C[30+j*lda];
+         temp15 = C[31+j*lda];
+         for(k = 0; k < 32; k++) { // adds A[j][k] * B[k][16..31] to the 16 sums
+            temp0  += A[j*lda + k] * B[16+k*lda];
+            temp1  += A[j*lda + k] * B[17+k*lda];
+            temp2  += A[j*lda + k] * B[18+k*lda];
+            temp3  += A[j*lda + k] * B[19+k*lda];
+            temp4  += A[j*lda + k] * B[20+k*lda];
+            temp5  += A[j*lda + k] * B[21+k*lda];
+            temp6  += A[j*lda + k] * B[22+k*lda];
+            temp7  += A[j*lda + k] * B[23+k*lda];
+            temp8  += A[j*lda + k] * B[24+k*lda];
+            temp9  += A[j*lda + k] * B[25+k*lda];
+            temp10 += A[j*lda + k] * B[26+k*lda];
+            temp11 += A[j*lda + k] * B[27+k*lda];
+            temp12 += A[j*lda + k] * B[28+k*lda];
+            temp13 += A[j*lda + k] * B[29+k*lda];
+            temp14 += A[j*lda + k] * B[30+k*lda];
+            temp15 += A[j*lda + k] * B[31+k*lda];
+         }
+         C[16 + j*lda] = temp0; // write the sums back to columns 16..31 of row j
+         C[17 + j*lda] = temp1;
+         C[18 + j*lda] = temp2;
+         C[19 + j*lda] = temp3;
+         C[20 + j*lda] = temp4;
+         C[21 + j*lda] = temp5;
+         C[22 + j*lda] = temp6;
+         C[23 + j*lda] = temp7;
+         C[24 + j*lda] = temp8;
+         C[25 + j*lda] = temp9;
+         C[26 + j*lda] = temp10;
+         C[27 + j*lda] = temp11;
+         C[28 + j*lda] = temp12;
+         C[29 + j*lda] = temp13;
+         C[30 + j*lda] = temp14;
+         C[31 + j*lda] = temp15;
+      }
+      // Q4: rows 0..15, columns 16..31
+      for(j = 0; j < 16; j++) {
+         temp0  = C[16 + j*lda]; // load the current C values for columns 16..31 of this row
+         temp1  = C[17 + j*lda];
+         temp2  = C[18 + j*lda];
+         temp3  = C[19 + j*lda];
+         temp4  = C[20 + j*lda];
+         temp5  = C[21 + j*lda];
+         temp6  = C[22 + j*lda];
+         temp7  = C[23 + j*lda];
+         temp8  = C[24 + j*lda];
+         temp9  = C[25 + j*lda];
+         temp10 = C[26 + j*lda];
+         temp11 = C[27 + j*lda];
+         temp12 = C[28 + j*lda];
+         temp13 = C[29 + j*lda];
+         temp14 = C[30 + j*lda];
+         temp15 = C[31 + j*lda];
+         for(k = 0; k < 32; k++) { // adds A[j][k] * B[k][16..31] to the 16 sums
+            temp0  += A[j*lda + k] * B[16 + k*lda];
+            temp1  += A[j*lda + k] * B[17 + k*lda];
+            temp2  += A[j*lda + k] * B[18 + k*lda];
+            temp3  += A[j*lda + k] * B[19 + k*lda];
+            temp4  += A[j*lda + k] * B[20 + k*lda];
+            temp5  += A[j*lda + k] * B[21 + k*lda];
+            temp6  += A[j*lda + k] * B[22 + k*lda];
+            temp7  += A[j*lda + k] * B[23 + k*lda];
+            temp8  += A[j*lda + k] * B[24 + k*lda];
+            temp9  += A[j*lda + k] * B[25 + k*lda];
+            temp10 += A[j*lda + k] * B[26 + k*lda];
+            temp11 += A[j*lda + k] * B[27 + k*lda];
+            temp12 += A[j*lda + k] * B[28 + k*lda];
+            temp13 += A[j*lda + k] * B[29 + k*lda];
+            temp14 += A[j*lda + k] * B[30 + k*lda];
+            temp15 += A[j*lda + k] * B[31 + k*lda];
+         }
+         C[16 + j*lda] = temp0; // write the sums back to columns 16..31 of row j
+         C[17 + j*lda] = temp1;
+         C[18 + j*lda] = temp2;
+         C[19 + j*lda] = temp3;
+         C[20 + j*lda] = temp4;
+         C[21 + j*lda] = temp5;
+         C[22 + j*lda] = temp6;
+         C[23 + j*lda] = temp7;
+         C[24 + j*lda] = temp8;
+         C[25 + j*lda] = temp9;
+         C[26 + j*lda] = temp10;
+         C[27 + j*lda] = temp11;
+         C[28 + j*lda] = temp12;
+         C[29 + j*lda] = temp13;
+         C[30 + j*lda] = temp14;
+         C[31 + j*lda] = temp15;
+      }
+   }
+}