Update README

[riscv-isa-sim.git] / riscv / execute.cc
diff --git a/riscv/execute.cc b/riscv/execute.cc

index ae4c9597a1bd00af5b9ecfd992c470c807659d66..e639e90462cf9063404f5cfde99b712ad201b94f 100644 (file)
--- a/riscv/execute.cc
+++ b/riscv/execute.cc
@@ -2,34 +2,63 @@
  
  #include "processor.h"
  #include "mmu.h"
-#include "sim.h"
  #include <cassert>
  
  
-static void commit_log_stash_privilege(state_t* state)
+static void commit_log_stash_privilege(processor_t* p)
  {
  #ifdef RISCV_ENABLE_COMMITLOG
+  state_t* state = p->get_state();
    state->last_inst_priv = state->prv;
+  state->last_inst_xlen = p->get_xlen();
+  state->last_inst_flen = p->get_flen();
  #endif
  }
  
+static void commit_log_print_value(int width, uint64_t hi, uint64_t lo)
+{
+  switch (width) {
+    case 16:
+      fprintf(stderr, "0x%04" PRIx16, (uint16_t)lo);
+      break;
+    case 32:
+      fprintf(stderr, "0x%08" PRIx32, (uint32_t)lo);
+      break;
+    case 64:
+      fprintf(stderr, "0x%016" PRIx64, lo);
+      break;
+    case 128:
+      fprintf(stderr, "0x%016" PRIx64 "%016" PRIx64, hi, lo);
+      break;
+    default:
+      abort();
+  }
+}
+
  static void commit_log_print_insn(state_t* state, reg_t pc, insn_t insn)
  {
  #ifdef RISCV_ENABLE_COMMITLOG
-  int32_t priv = state->last_inst_priv;
-  uint64_t mask = (insn.length() == 8 ? uint64_t(0) : (uint64_t(1) << (insn.length() * 8))) - 1;
-  if (state->log_reg_write.addr) {
-    fprintf(stderr, "%1d 0x%016" PRIx64 " (0x%08" PRIx64 ") %c%2" PRIu64 " 0x%016" PRIx64 "\n",
-            priv,
-            pc,
-            insn.bits() & mask,
-            state->log_reg_write.addr & 1 ? 'f' : 'x',
-            state->log_reg_write.addr >> 1,
-            state->log_reg_write.data);
+  auto& reg = state->log_reg_write;
+  int priv = state->last_inst_priv;
+  int xlen = state->last_inst_xlen;
+  int flen = state->last_inst_flen;
+
+  fprintf(stderr, "%1d ", priv);
+  commit_log_print_value(xlen, 0, pc);
+  fprintf(stderr, " (");
+  commit_log_print_value(insn.length() * 8, 0, insn.bits());
+
+  if (reg.addr) {
+    bool fp = reg.addr & 1;
+    int rd = reg.addr >> 1;
+    int size = fp ? flen : xlen;
+    fprintf(stderr, ") %c%2d ", fp ? 'f' : 'x', rd);
+    commit_log_print_value(size, reg.data.v[1], reg.data.v[0]);
+    fprintf(stderr, "\n");
    } else {
-    fprintf(stderr, "%1d 0x%016" PRIx64 " (0x%08" PRIx64 ")\n", priv, pc, insn.bits() & mask);
+    fprintf(stderr, ")\n");
    }
-  state->log_reg_write.addr = 0;
+  reg.addr = 0;
  #endif
  }
  
@@ -40,30 +69,35 @@ inline void processor_t::update_histogram(reg_t pc)
  #endif
  }
  
+// This is expected to be inlined by the compiler so each use of execute_insn
+// includes a duplicated body of the function to get separate fetch.func
+// function calls.
  static reg_t execute_insn(processor_t* p, reg_t pc, insn_fetch_t fetch)
  {
-  commit_log_stash_privilege(p->get_state());
+  commit_log_stash_privilege(p);
    reg_t npc = fetch.func(p, fetch.insn, pc);
-  if (!invalid_pc(npc)) {
+  if (npc != PC_SERIALIZE_BEFORE) {
      commit_log_print_insn(p->get_state(), pc, fetch.insn);
      p->update_histogram(pc);
    }
    return npc;
  }
  
+bool processor_t::slow_path()
+{
+  return debug || state.single_step != state.STEP_NONE || state.dcsr.cause;
+}
+
  // fetch/decode/execute loop
  void processor_t::step(size_t n)
  {
-  // TODO: get_interrupt() isn't super fast. Does that matter?
-  if (state.dcsr.cause == DCSR_CAUSE_NONE &&
-      sim->debug_module.get_interrupt(id)) {
-    enter_debug_mode(DCSR_CAUSE_DEBUGINT);
-  }
-
-  if (state.dcsr.cause != DCSR_CAUSE_NONE) {
-    // In Debug Mode, just do 10 steps at a time. Otherwise we're going to be
-    // spinning the rest of the time anyway.
-    n = std::max(n, (size_t) 10);
+  if (state.dcsr.cause == DCSR_CAUSE_NONE) {
+    if (halt_request) {
+      enter_debug_mode(DCSR_CAUSE_DEBUGINT);
+    } // !!!The halt bit in DCSR is deprecated.
+    else if (state.dcsr.halt) {
+      enter_debug_mode(DCSR_CAUSE_HALT);
+    }
    }
  
    while (n > 0) {
@@ -75,7 +109,8 @@ void processor_t::step(size_t n)
       if (unlikely(invalid_pc(pc))) { \
         switch (pc) { \
           case PC_SERIALIZE_BEFORE: state.serialized = true; break; \
-         case PC_SERIALIZE_AFTER: instret++; break; \
+         case PC_SERIALIZE_AFTER: ++instret; break; \
+         case PC_SERIALIZE_WFI: n = ++instret; break; \
           default: abort(); \
         } \
         pc = state.pc; \
@@ -87,53 +122,127 @@ void processor_t::step(size_t n)
  
      try
      {
-      take_interrupt();
+      take_pending_interrupt();
  
-      if (unlikely(debug))
+      if (unlikely(slow_path()))
        {
          while (instret < n)
          {
+          if (unlikely(!state.serialized && state.single_step == state.STEP_STEPPED)) {
+            state.single_step = state.STEP_NONE;
+            if (state.dcsr.cause == DCSR_CAUSE_NONE) {
+              enter_debug_mode(DCSR_CAUSE_STEP);
+              // enter_debug_mode changed state.pc, so we can't just continue.
+              break;
+            }
+          }
+
+          if (unlikely(state.single_step == state.STEP_STEPPING)) {
+            state.single_step = state.STEP_STEPPED;
+          }
+
            insn_fetch_t fetch = mmu->load_insn(pc);
-          if (!state.serialized)
+          if (debug && !state.serialized)
              disasm(fetch.insn);
            pc = execute_insn(this, pc, fetch);
+
            advance_pc();
+
+          if (unlikely(state.pc >= DEBUG_ROM_ENTRY &&
+                       state.pc < DEBUG_END)) {
+            // We're waiting for the debugger to tell us something.
+            return;
+          }
+
          }
        }
        else while (instret < n)
        {
+        // This code uses a modified Duff's Device to improve the performance
+        // of executing instructions. While typical Duff's Devices are used
+        // for software pipelining, the switch statement below primarily
+        // benefits from separate call points for the fetch.func function call
+        // found in each execute_insn. This function call is an indirect jump
+        // that depends on the current instruction. By having an indirect jump
+        // dedicated for each icache entry, you improve the performance of the
+        // host's next address predictor. Each case in the switch statement
+        // allows for the program flow to continue to the next case if it
+        // corresponds to the next instruction in the program and instret is
+        // still less than n.
+        //
+        // According to Andrew Waterman's recollection, this optimization
+        // resulted in approximately a 2x performance increase.
+
+        // This figures out where to jump to in the switch statement
          size_t idx = _mmu->icache_index(pc);
+
+        // This gets the cached decoded instruction from the MMU. If the MMU
+        // does not have the current pc cached, it will refill the MMU and
+        // return the correct entry. ic_entry->data.func is the C++ function
+        // corresponding to the instruction.
          auto ic_entry = _mmu->access_icache(pc);
  
+        // This macro is included in "icache.h" included within the switch
+        // statement below. The indirect jump corresponding to the instruction
+        // is located within the execute_insn() function call.
          #define ICACHE_ACCESS(i) { \
            insn_fetch_t fetch = ic_entry->data; \
-          ic_entry++; \
            pc = execute_insn(this, pc, fetch); \
+          ic_entry = ic_entry->next; \
            if (i == mmu_t::ICACHE_ENTRIES-1) break; \
-          if (unlikely(ic_entry->tag != pc)) goto miss; \
+          if (unlikely(ic_entry->tag != pc)) break; \
            if (unlikely(instret+1 == n)) break; \
            instret++; \
            state.pc = pc; \
          }
  
+        // This switch statement implements the modified Duff's device as
+        // explained above.
          switch (idx) {
+          // "icache.h" is generated by the gen_icache script
            #include "icache.h"
          }
  
          advance_pc();
-        continue;
-
-miss:
-        advance_pc();
-        // refill I$ if it looks like there wasn't a taken branch
-        if (pc > (ic_entry-1)->tag && pc <= (ic_entry-1)->tag + MAX_INSN_LENGTH)
-          _mmu->refill_icache(pc, ic_entry);
        }
      }
      catch(trap_t& t)
      {
        take_trap(t, pc);
        n = instret;
+
+      if (unlikely(state.single_step == state.STEP_STEPPED)) {
+        state.single_step = state.STEP_NONE;
+        enter_debug_mode(DCSR_CAUSE_STEP);
+      }
+    }
+    catch (trigger_matched_t& t)
+    {
+      if (mmu->matched_trigger) {
+        // This exception came from the MMU. That means the instruction hasn't
+        // fully executed yet. We start it again, but this time it won't throw
+        // an exception because matched_trigger is already set. (All memory
+        // instructions are idempotent so restarting is safe.)
+
+        insn_fetch_t fetch = mmu->load_insn(pc);
+        pc = execute_insn(this, pc, fetch);
+        advance_pc();
+
+        delete mmu->matched_trigger;
+        mmu->matched_trigger = NULL;
+      }
+      switch (state.mcontrol[t.index].action) {
+        case ACTION_DEBUG_MODE:
+          enter_debug_mode(DCSR_CAUSE_HWBP);
+          break;
+        case ACTION_DEBUG_EXCEPTION: {
+          mem_trap_t trap(CAUSE_BREAKPOINT, t.address);
+          take_trap(trap, pc);
+          break;
+        }
+        default:
+          abort();
+      }
      }
  
      state.minstret += instret;