1 //**************************************************************************
2 // Vector-Thread Vector Matrix Multiply benchmark
3 //--------------------------------------------------------------------------
5 // This benchmark multiplies two 2-D arrays together and writes the results to
6 // a third array. The input data (and reference data) should be generated
7 // using the matmul_gendata.pl perl script and dumped to a file named
8 // dataset.h. The riscv-gcc toolchain does not support system calls, so printf
9 // can only be used on a host system, not on the riscv-v processor simulator.
12 // HOWEVER: printstr() and printhex() are provided, for a primitive form of
13 // printing strings and hexadecimal values to stdout.
16 // Choose which implementation you wish to test... but leave only one on!
17 // (only the first one will be executed).
22 //--------------------------------------------------------------------------
25 // Set HOST_DEBUG to 1 if you are going to compile this for a host
26 // machine (i.e. Athena/Linux) for debug purposes and set HOST_DEBUG
27 // to 0 if you are compiling with the riscv-gcc toolchain.
33 // Set PREALLOCATE to 1 if you want to preallocate the benchmark
34 // function before starting stats. If you have instruction/data
35 // caches and you don't want to count the overhead of misses, then
36 // you will need to use preallocation.
42 // Set SET_STATS to 1 if you want to carve out the piece that actually
43 // does the computation.
49 //--------------------------------------------------------------------------
50 // Host Platform Includes
56 void printstr(const char*);
61 //--------------------------------------------------------------------------
62 // Input/Reference Data
64 //#include "dataset_test.h"
67 //--------------------------------------------------------------------------
// Compare the first n elements of test[] against correct[] using a
// +/-2% relative tolerance around each reference value.
//
//   n       - number of elements to compare
//   test    - values produced by the benchmark
//   correct - reference values
//
// Returns 1 when every element is inside the tolerance band (finishTest
// treats the value 1 as a pass). On the first mismatch it returns the
// failing index + 2, so that a failure at index 0 or 1 can never be
// mistaken for a 'not-finished yet' or pass code.
// NOTE(review): the failure code follows the documented "index + 2"
// contract; confirm no caller depends on a fixed failure value.
int verify( int n, float test[], float correct[] )
{
  int i;
  for ( i = 0; i < n; i++ ) {
    // For a negative reference value 1.02*correct is *smaller* than
    // 0.98*correct, so the band must be ordered explicitly; otherwise an
    // exact match on a negative element is rejected.
    double bound_a = 1.02*correct[i];
    double bound_b = 0.98*correct[i];
    double hi = ( bound_a > bound_b ) ? bound_a : bound_b;
    double lo = ( bound_a > bound_b ) ? bound_b : bound_a;

    // Negated in-range test: a NaN compares false against everything, so
    // this form reports a NaN result as a mismatch instead of letting it
    // slip through.
    if ( !( test[i] >= lo && test[i] <= hi ) ) {
#if HOST_DEBUG
      // printf is only usable on host builds (see the file header).
      printf(" test[%d] : %3.2f\n", i, test[i]);
      printf(" corr[%d] : %3.2f\n", i, correct[i]);
#endif
      // tell us which index fails + 2
      // (so that if i==0,i==1 fails, we don't
      // think it was a 'not-finished yet' or pass)
      return i + 2;
    }
  }
  return 1;
}
// Print a labelled array: the name right-aligned in a 10-character
// field followed by " :", then each of the n values of arr[] formatted
// with " %03.2f ".
//   name - label printed before the values
//   n    - number of elements of arr[] to print
//   arr  - values to print
// Uses printf, which per the file header is only usable on host builds.
// NOTE(review): presumably this is compiled only when HOST_DEBUG is set — confirm.
90 void printArray( char name
[], int n
, float arr
[] )
93 printf( " %10s :", name
);
94 for ( i
= 0; i
< n
; i
++ )
95 printf( " %03.2f ", arr
[i
] );
// Report the outcome of the benchmark run.
//   correct     - verification result code; the value 1 is the one that
//                 selects the "PASSED" message
//   num_cycles  - cycle count, printed in hex via printhex()
//   num_retired - retired-instruction count, printed in hex via printhex()
// Two reporting styles appear below: printf-based messages (which include
// the result code on failure) and printstr()/printhex()-based messages
// (which include the two counters).
// NOTE(review): presumably HOST_DEBUG selects between the two styles and
// `correct` selects between the PASSED/FAILED printstr sequences — confirm.
101 void finishTest( int correct
, long long num_cycles
, long long num_retired
)
103 int toHostValue
= correct
;
105 if ( toHostValue
== 1 )
106 printf( "*** PASSED ***\n" );
108 printf( "*** FAILED *** (tohost = %d)\n", toHostValue
);
111 // we no longer run in -testrun mode, which means we can't use
112 // the tohost register to communicate "test is done" and "test results"
113 // so instead we will communicate through print* functions!
116 printstr( "*** PASSED *** (num_cycles = 0x" );
117 printhex(num_cycles
);
118 printstr( ", num_inst_retired = 0x");
119 printhex(num_retired
);
124 printstr( "*** FAILED *** (num_cycles = 0x");
125 printhex(num_cycles
);
126 printstr( ", num_inst_retired = 0x");
127 printhex(num_retired
);
135 // deprecated - cr10/stats-enable register no longer exists
// Writes `enable` to control register cr10 using the mtpcr instruction.
// The asm statement is compiled only when HOST_DEBUG is 0 and SET_STATS
// is non-zero; in any other configuration no asm is emitted for it.
//   enable - value moved into cr10
136 void setStats( int enable
)
138 #if ( !HOST_DEBUG && SET_STATS )
139 asm( "mtpcr %0, cr10" : : "r" (enable
) );
// Read the cycle counter into `cycles` with the `rdcycle` instruction.
// The read is compiled only when HOST_DEBUG is 0 and SET_STATS is
// non-zero; otherwise `cycles` keeps its placeholder initial value 1337.
// NOTE(review): presumably `cycles` is the value returned — confirm.
143 long long getCycles()
145 long long cycles
= 1337;
146 #if ( !HOST_DEBUG && SET_STATS )
147 __asm__
__volatile__( "rdcycle %0" : "=r" (cycles
) );
// Read the retired-instruction counter into `inst_retired` with the
// `rdinstret` instruction. The read is compiled only when HOST_DEBUG is 0
// and SET_STATS is non-zero; otherwise `inst_retired` keeps its
// placeholder initial value 1338.
// NOTE(review): presumably `inst_retired` is the value returned — confirm.
152 long long getInstRetired()
154 long long inst_retired
= 1338;
155 #if ( !HOST_DEBUG && SET_STATS )
156 __asm__
__volatile__( "rdinstret %0" : "=r" (inst_retired
) );
161 //--------------------------------------------------------------------------
164 // scalar C implementation
// For every (j, i) pair with 0 <= j, i < lda, starts an accumulator from
// the current value of C[i + j*lda] and adds the dot product of row j of
// A with column i of B. All three matrices are flat arrays in which
// element (row, col) lives at index row*lda + col.
//   lda - dimension of the square matrices
//   A   - left operand, read only
//   B   - right operand, read only
//   C   - accumulator input
// Because the accumulator is seeded from C, the caller must initialise C
// before the call.
// NOTE(review): presumably cij is written back to C[i + j*lda] after the
// k loop — confirm.
165 void matmul(const int lda
, const float A
[], const float B
[], float C
[] )
169 for ( j
= 0; j
< lda
; j
++ )
170 for ( i
= 0; i
< lda
; i
++ )
172 float cij
= C
[i
+ j
*lda
];
173 for ( k
= 0; k
< lda
; k
++ )
175 cij
+= A
[j
*lda
+ k
] * B
[k
*lda
+ i
];
182 // assembly implementations can be found in *_asm.S
184 //--------------------------------------------------------------------------
// Benchmark driver. Zeroes results_data at every index i + j*DIM_SIZE for
// i, j in [0, DIM_SIZE), samples the cycle and retired-instruction
// counters before and after the matrix multiply of input1_data by
// input2_data, then checks results_data against verify_data and reports
// the outcome and the counter deltas through finishTest().
// Calls to matmul(), scalar_matmul_asm() and vt_matmul_asm() all appear
// below.
// NOTE(review): presumably only one of them is compiled in, per the
// "leave only one on" instruction in the file header — confirm.
// NOTE(review): the printArray() calls use printf, so presumably they are
// host-only (HOST_DEBUG) — confirm.
187 int main( int argc
, char* argv
[] )
190 long long start_cycles
= 0;
191 long long stop_cycles
= 0;
192 long long num_cycles
;
193 long long start_retired
= 0;
194 long long stop_retired
= 0;
195 long long num_retired
;
// Result buffer; cleared first because the scalar matmul() seeds its
// accumulator from the existing contents of C.
197 float results_data
[ARRAY_SIZE
];
198 for ( i
= 0; i
< DIM_SIZE
; i
++ )
199 for ( j
= 0; j
< DIM_SIZE
; j
++ )
200 results_data
[i
+ j
*DIM_SIZE
] = 0.0f
;
202 // Output the input array
205 printArray( "input1", ARRAY_SIZE
, input1_data
);
206 printArray( "input2", ARRAY_SIZE
, input2_data
);
207 printArray( "verify", ARRAY_SIZE
, verify_data
);
208 printArray( "results", ARRAY_SIZE
, results_data
);
211 // --------------------------------------------------
212 // If needed we preallocate everything in the caches
220 // --------------------------------------------------
// Sample both counters immediately before the kernel under test.
222 start_cycles
= getCycles();
223 start_retired
= getInstRetired();
226 matmul( DIM_SIZE
, input1_data
, input2_data
, results_data
);
230 scalar_matmul_asm( DIM_SIZE
, input1_data
, input2_data
, results_data
);
235 vt_matmul_asm( DIM_SIZE
, input1_data
, input2_data
, results_data
);
// Sample both counters again and keep the deltas for the report.
242 stop_cycles
= getCycles();
243 stop_retired
= getInstRetired();
244 num_cycles
= stop_cycles
- start_cycles
;
245 num_retired
= stop_retired
- start_retired
;
248 // --------------------------------------------------
249 // Print out the results
252 printArray( "results", ARRAY_SIZE
, results_data
);
256 // --------------------------------------------------
// Compare against the reference data and report the verdict together
// with the measured counter deltas.
258 int correct
= verify( ARRAY_SIZE
, results_data
, verify_data
);
259 finishTest(correct
, num_cycles
, num_retired
);