From 39ee063f47d6467084f4442624cb28430ac0eebc Mon Sep 17 00:00:00 2001 From: Andy Wright Date: Thu, 1 Dec 2016 15:04:34 -0500 Subject: [PATCH] Added comments about the modified Duff's Device in execute.cc (#77) --- riscv/execute.cc | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/riscv/execute.cc b/riscv/execute.cc index 7b42262..36e7896 100644 --- a/riscv/execute.cc +++ b/riscv/execute.cc @@ -40,6 +40,9 @@ inline void processor_t::update_histogram(reg_t pc) #endif } +// This is expected to be inlined by the compiler so each use of execute_insn +// includes a duplicated body of the function to get separate fetch.func +// function calls. static reg_t execute_insn(processor_t* p, reg_t pc, insn_fetch_t fetch) { commit_log_stash_privilege(p->get_state()); @@ -121,9 +124,40 @@ void processor_t::step(size_t n) } else while (instret < n) { + // This code uses a modified Duff's Device to improve the performance + // of executing instructions. While typical Duff's Devices are used + // for software pipelining, the switch statement below primarily + // benefits from separate call points for the fetch.func function call + // found in each execute_insn. This function call is an indirect jump + // that depends on the current instruction. By having an indirect jump + // dedicated for each icache entry, you improve the performance of the + // host's next address predictor. Each case in the switch statement + // allows for the program flow to continue to the next case if it + // corresponds to the next instruction in the program and instret is + // still less than n. + // + // According to Andrew Waterman's recollection, this optimization + // resulted in approximately a 2x performance increase. + // + // If there is support for compressed instructions, the mmu and the + // switch statement get more complicated. 
Each branch target is stored + // in the index corresponding to mmu->icache_index(), but consecutive + // non-branching instructions are stored in consecutive indices even if + // mmu->icache_index() specifies a different index (which is the case + // for 32-bit instructions in the presence of compressed instructions). + + // This figures out where to jump to in the switch statement size_t idx = _mmu->icache_index(pc); + + // This gets the cached decoded instruction from the MMU. If the MMU + // does not have the current pc cached, it will refill the MMU and + // return the correct entry. ic_entry->data.func is the C++ function + // corresponding to the instruction. auto ic_entry = _mmu->access_icache(pc); + // This macro is used in "icache.h", which is included within the + // switch statement below. The indirect jump corresponding to the + // instruction is located within the execute_insn() function call. #define ICACHE_ACCESS(i) { \ insn_fetch_t fetch = ic_entry->data; \ ic_entry++; \ @@ -135,7 +169,10 @@ void processor_t::step(size_t n) state.pc = pc; \ } + // This switch statement implements the modified Duff's device as + // explained above. switch (idx) { + // "icache.h" is generated by the gen_icache script #include "icache.h" } -- 2.30.2