mt-vvadd \
mt-matmul \
-bmarks_host = \
- median \
- qsort \
- towers \
- vvadd \
- multiply \
- spmv \
- vec-vvadd \
- vec-cmplxmult \
- vec-matmul \
-
#--------------------------------------------------------------------
# Build rules
#--------------------------------------------------------------------
-HOST_OPTS = -std=gnu99 -DPREALLOCATE=0 -DHOST_DEBUG=1
-HOST_COMP = gcc $(HOST_OPTS)
-
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
RISCV_GCC ?= $(RISCV_PREFIX)gcc
RISCV_GCC_OPTS ?= -mcmodel=medany -static -std=gnu99 -O2 -ffast-math -fno-common -fno-builtin-printf
bmarks_riscv_dump = $(addsuffix .riscv.dump, $(bmarks))
bmarks_riscv_out = $(addsuffix .riscv.out, $(bmarks))
-bmarks_defs = -DPREALLOCATE=1 -DHOST_DEBUG=0
+bmarks_defs = -DPREALLOCATE=1
bmarks_cycles = 80000
$(bmarks_riscv_dump): %.riscv.dump: %.riscv
junk += $(bmarks_riscv_bin) $(bmarks_riscv_dump) $(bmarks_riscv_hex) $(bmarks_riscv_out)
-#------------------------------------------------------------
-# Build and run benchmarks on host machine
-
-bmarks_host_bin = $(addsuffix .host, $(bmarks_host))
-bmarks_host_out = $(addsuffix .host.out, $(bmarks_host))
-
-$(bmarks_host_out): %.host.out: %.host
- ./$< > $@
-
-host: $(bmarks_host_bin)
-run-host: $(bmarks_host_out)
- echo; perl -ne 'print " [$$1] $$ARGV \t$$2\n" if /\*{3}(.{8})\*{3}(.*)/' \
- $(bmarks_host_out); echo;
-
-junk += $(bmarks_host_bin) $(bmarks_host_out)
-
#------------------------------------------------------------
# Default
#ifndef __UTIL_H
#define __UTIL_H
-//--------------------------------------------------------------------------
-// Macros
-
-// Set HOST_DEBUG to 1 if you are going to compile this for a host
-// machine (ie Athena/Linux) for debug purposes and set HOST_DEBUG
-// to 0 if you are compiling with the smips-gcc toolchain.
-
-#ifndef HOST_DEBUG
-#define HOST_DEBUG 0
-#endif
-
-// Set PREALLOCATE to 1 if you want to preallocate the benchmark
-// function before starting stats. If you have instruction/data
-// caches and you don't want to count the overhead of misses, then
-// you will need to use preallocation.
-
-#ifndef PREALLOCATE
-#define PREALLOCATE 0
-#endif
-
-// Set SET_STATS to 1 if you want to carve out the piece that actually
-// does the computation.
-
-#if HOST_DEBUG
-#include <stdio.h>
-static void setStats(int enable) {}
-#else
extern void setStats(int enable);
-#endif
#include <stdint.h>
#define static_assert(cond) switch(0) { case 0: case !!(long)(cond): ; }
-static void printArray(const char name[], int n, const int arr[])
-{
-#if HOST_DEBUG
- int i;
- printf( " %10s :", name );
- for ( i = 0; i < n; i++ )
- printf( " %3d ", arr[i] );
- printf( "\n" );
-#endif
-}
-
-static void printDoubleArray(const char name[], int n, const double arr[])
-{
-#if HOST_DEBUG
- int i;
- printf( " %10s :", name );
- for ( i = 0; i < n; i++ )
- printf( " %g ", arr[i] );
- printf( "\n" );
-#endif
-}
-
static int verify(int n, const volatile int* test, const int* verify)
{
int i;
//--------------------------------------------------------------------------
//
// This is the classic Dhrystone synthetic integer benchmark.
-// You should not change anything except the HOST_DEBUG and
-// PREALLOCATE macros for your timing run.
+//
#pragma GCC optimize ("no-inline")
#include "dhrystone.h"
-//--------------------------------------------------------------------------
-// Macros
-
-// Set HOST_DEBUG to 1 if you are going to compile this for a host
-// machine (ie Athena/Linux) for debug purposes and set HOST_DEBUG
-// to 0 if you are compiling with the smips-gcc toolchain.
-
-#ifndef HOST_DEBUG
-#define HOST_DEBUG 0
-#endif
-
-// Set PREALLOCATE to 1 if you want to preallocate the benchmark
-// function before starting stats. If you have instruction/data
-// caches and you don't want to count the overhead of misses, then
-// you will need to use preallocation.
-
-#ifndef PREALLOCATE
-#define PREALLOCATE 0
-#endif
-
-// Set SET_STATS to 1 if you want to carve out the piece that actually
-// does the computation.
-
-#ifndef SET_STATS
-#define SET_STATS 0
-#endif
-
-#if HOST_DEBUG
-# define debug_printf printf
-#else
void debug_printf(const char* str, ...);
-#endif
#include "util.h"
REG int Number_Of_Runs;
/* Arguments */
-#if HOST_DEBUG
- if (argc > 2)
- {
- printf("Usage: %s [number of loops]\n", argv[0]);
- exit (1);
- }
- if (argc == 2)
- {
- Number_Of_Runs = atoi (argv[1]);
- }
- else if (Number_Of_Runs <= 0)
-#endif
- {
- Number_Of_Runs = NUMBER_OF_RUNS;
- }
+ Number_Of_Runs = NUMBER_OF_RUNS;
/* Initializations */
// This benchmark performs a 1D three element median filter. The
// input data (and reference data) should be generated using the
// median_gendata.pl perl script and dumped to a file named
-// dataset1.h You should not change anything except the
-// HOST_DEBUG and PREALLOCATE macros for your timing run.
+// dataset1.h.
#include "util.h"
{
int results_data[DATA_SIZE];
- // Output the input array
- printArray( "input", DATA_SIZE, input_data );
- printArray( "verify", DATA_SIZE, verify_data );
-
#if PREALLOCATE
// If needed we preallocate everything in the caches
median( DATA_SIZE, input_data, results_data );
median( DATA_SIZE, input_data, results_data );
setStats(0);
- // Print out the results
- printArray( "results", DATA_SIZE, results_data );
-
// Check the results
return verify( DATA_SIZE, results_data, verify_data );
}
// using the matmul_gendata.pl perl script and dumped to a file named
// dataset.h.
-
-// print out arrays, etc.
-//#define DEBUG
-
//--------------------------------------------------------------------------
// Includes
int res = verify(ARRAY_SIZE, results_data, verify_data);
-#ifdef DEBUG
- printArray("results:", ARRAY_SIZE, results_data);
- printArray("verify :", ARRAY_SIZE, verify_data);
-#endif
-
exit(res);
}
// generated using the vvadd_gendata.pl perl script and dumped
// to a file named dataset.h
-// to print out arrays, etc.
-//#define DEBUG
-
//--------------------------------------------------------------------------
// Includes
stats(vvadd(cid, nc, DATA_SIZE, input1_data, input2_data, results_data); barrier(nc), DATA_SIZE);
if(cid == 0) {
-#ifdef DEBUG
- printDoubleArray("out-of-place results: ", DATA_SIZE, results_data);
- printDoubleArray("out-of-place verify : ", DATA_SIZE, verify_data);
-#endif
int res = verifyDouble(DATA_SIZE, results_data, verify_data);
if(res) exit(res);
}
stats(vvadd(cid, nc, DATA_SIZE, results_data, input2_data, results_data); barrier(nc), DATA_SIZE);
if(cid == 0) {
-#ifdef DEBUG
- printDoubleArray("in-place results: ", DATA_SIZE, results_data);
- printDoubleArray("in-place verify : ", DATA_SIZE, verify_data);
-#endif
int res = verifyDouble(DATA_SIZE, results_data, verify_data);
if(res) exit(res);
}
// This benchmark tests the software multiply implementation. The
// input data (and reference data) should be generated using the
// multiply_gendata.pl perl script and dumped to a file named
-// dataset1.h You should not change anything except the
-// HOST_DEBUG and VERIFY macros for your timing run.
+// dataset1.h
#include "util.h"
int i;
int results_data[DATA_SIZE];
- // Output the input arrays
- printArray( "input1", DATA_SIZE, input_data1 );
- printArray( "input2", DATA_SIZE, input_data2 );
- printArray( "verify", DATA_SIZE, verify_data );
-
#if PREALLOCATE
for (i = 0; i < DATA_SIZE; i++)
{
results_data[i] = multiply( input_data1[i], input_data2[i] );
}
setStats(0);
-
- // Print out the results
- printArray( "results", DATA_SIZE, results_data );
// Check the results
return verify( DATA_SIZE, results_data, verify_data );
// implementation is largely adapted from Numerical Recipes for C. The
// input data (and reference data) should be generated using the
// qsort_gendata.pl perl script and dumped to a file named
-// dataset1.h The smips-gcc toolchain does not support system calls
-// so printf's can only be used on a host system, not on the smips
-// processor simulator itself. You should not change anything except
-// the HOST_DEBUG and PREALLOCATE macros for your timing run.
+// dataset1.h.
#include "util.h"
#include <string.h>
for (;;)
{
-#if HOST_DEBUG
- printArray( "", n, arr );
-#endif
-
// Insertion sort when subarray small enough.
if ( ir-l < INSERTION_THRESHOLD )
{
// Push pointers to larger subarray on stack,
// process smaller subarray immediately.
-#if HOST_DEBUG
- assert(stackp < stack+NSTACK);
-#endif
-
if ( ir-i+1 >= j-l )
{
stackp[0] = ir;
int main( int argc, char* argv[] )
{
- // Output the input array
- printArray( "input", DATA_SIZE, input_data );
- printArray( "verify", DATA_SIZE, verify_data );
-
#if PREALLOCATE
// If needed we preallocate everything in the caches
sort(DATA_SIZE, verify_data);
sort( DATA_SIZE, input_data );
setStats(0);
- // Print out the results
- printArray( "test", DATA_SIZE, input_data );
-
// Check the results
return verify( DATA_SIZE, input_data, verify_data );
}
about how to write assembly in C here:
http://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html
- + If you look at the example benchmarks you will see that I have two
- important macros HOST_DEBUG and VERIFY. Use HOST_DEBUG to compile the
- benchmark on your host workstation (ie use standard gcc on Athena/Linux
- box) and then debug the benchmark. Since you are using standard gcc you
- can use printf's to make sure that your benchmark actually works before
- trying it out on your RISCV processor.
-
+ Debugging C compiled code on the RISCV processor is a real pain. It is
hard to associate the assembly with the C code and there is no
debugger. So if you encounter a bug in your processor when running a C
// implementation is largely adapted from Numerical Recipes for C. The
// input data (and reference data) should be generated using the
// qsort_gendata.pl perl script and dumped to a file named
-// dataset1.h The smips-gcc toolchain does not support system calls
-// so printf's can only be used on a host system, not on the smips
-// processor simulator itself. You should not change anything except
-// the HOST_DEBUG and PREALLOCATE macros for your timing run.
+// dataset1.h
#include "util.h"
#include <string.h>
int main( int argc, char* argv[] )
{
static type scratch[DATA_SIZE];
- // Output the input array
- printArray( "input", DATA_SIZE, input_data );
- printArray( "verify", DATA_SIZE, verify_data );
#if PREALLOCATE
// If needed we preallocate everything in the caches
sort(DATA_SIZE, input_data, scratch);
setStats(0);
- // Print out the results
- printArray( "test", DATA_SIZE, input_data );
-
// Check the results
return verify( DATA_SIZE, input_data, verify_data );
}
// disc on top of a smaller disc.
//
// This implementation starts with NUM_DISC discs and uses a recursive
-// algorithm to sovel the puzzle. The smips-gcc toolchain does not support
-// system calls so printf's can only be used on a host system, not on the
-// smips processor simulator itself. You should not change anything except
-// the HOST_DEBUG and PREALLOCATE macros for your timing run.
+// algorithm to solve the puzzle.
#include "util.h"
}
-#if HOST_DEBUG
-void towers_print( struct Towers* this )
-{
- struct Node* ptr;
- int i, numElements;
-
- printf( " pegA: " );
- for ( i = 0; i < ((this->numDiscs)-list_getSize(&(this->pegA))); i++ )
- printf( ". " );
- for ( ptr = this->pegA.head; ptr != 0; ptr = ptr->next )
- printf( "%d ", ptr->val );
-
- printf( " pegB: " );
- for ( i = 0; i < ((this->numDiscs)-list_getSize(&(this->pegB))); i++ )
- printf( ". " );
- for ( ptr = this->pegB.head; ptr != 0; ptr = ptr->next )
- printf( "%d ", ptr->val );
-
- printf( " pegC: " );
- for ( i = 0; i < ((this->numDiscs)-list_getSize(&(this->pegC))); i++ )
- printf( ". " );
- for ( ptr = this->pegC.head; ptr != 0; ptr = ptr->next )
- printf( "%d ", ptr->val );
-
- printf( "\n" );
-}
-#endif
-
void towers_solve_h( struct Towers* this, int n,
struct List* startPeg,
struct List* tempPeg,
int val;
if ( n == 1 ) {
-#if HOST_DEBUG
- towers_print(this);
-#endif
val = list_pop(startPeg);
list_push(destPeg,val);
this->numMoves++;
int numDiscs = 0;
if ( list_getSize(&this->pegA) != 0 ) {
-#if HOST_DEBUG
- printf( "ERROR: Peg A is not empty!\n" );
-#endif
return 2;
}
if ( list_getSize(&this->pegB) != 0 ) {
-#if HOST_DEBUG
- printf( "ERROR: Peg B is not empty!\n" );
-#endif
return 3;
}
if ( list_getSize(&this->pegC) != this->numDiscs ) {
-#if HOST_DEBUG
- printf( " ERROR: Expected %d discs but found only %d discs!\n", \
- this->numDiscs, list_getSize(&this->pegC) );
-#endif
return 4;
}
for ( ptr = this->pegC.head; ptr != 0; ptr = ptr->next ) {
numDiscs++;
if ( ptr->val != numDiscs ) {
-#if HOST_DEBUG
- printf( " ERROR: Expecting disc %d on peg C but found disc %d instead!\n", \
- numDiscs, ptr->val );
-#endif
return 5;
}
}
if ( this->numMoves != ((1 << this->numDiscs) - 1) ) {
-#if HOST_DEBUG
- printf( " ERROR: Expecting %d num moves but found %d instead!\n", \
- ((1 << this->numDiscs) - 1), this->numMoves );
-#endif
return 6;
}
towers_solve( &towers );
setStats(0);
- // Print out the results
-
-#if HOST_DEBUG
- towers_print( &towers );
-#endif
-
// Check the results
return towers_verify( &towers );
}
// This benchmark adds two vectors and writes the results to a
// third vector. The input data (and reference data) should be
// generated using the vvadd_gendata.pl perl script and dumped
-// to a file named dataset1.h The smips-gcc toolchain does not
-// support system calls so printf's can only be used on a host system,
-// not on the smips processor simulator itself. You should not change
-// anything except the HOST_DEBUG and PREALLOCATE macros for your timing
-// runs.
+// to a file named dataset1.h.
#include "util.h"
{
int results_data[DATA_SIZE];
- // Output the input array
- printArray( "input1", DATA_SIZE, input1_data );
- printArray( "input2", DATA_SIZE, input2_data );
- printArray( "verify", DATA_SIZE, verify_data );
-
#if PREALLOCATE
// If needed we preallocate everything in the caches
vvadd( DATA_SIZE, input1_data, input2_data, results_data );
vvadd( DATA_SIZE, input1_data, input2_data, results_data );
setStats(0);
- // Print out the results
- printArray( "results", DATA_SIZE, results_data );
-
// Check the results
return verify( DATA_SIZE, results_data, verify_data );
}