cpu: generate SimPoint basic block vector profiles
author Dam Sunwoo <dam.sunwoo@arm.com>
Mon, 22 Apr 2013 17:20:31 +0000 (13:20 -0400)
committer Dam Sunwoo <dam.sunwoo@arm.com>
Mon, 22 Apr 2013 17:20:31 +0000 (13:20 -0400)
This patch is based on http://reviews.m5sim.org/r/1474/ originally written by
Mitch Hayenga. Basic block vectors are generated (simpoint.bb.gz in simout
folder) based on start and end addresses of basic blocks.

Some review comments on the original patch are addressed, and hooks are added
to create and resume from checkpoints based on instruction counts dictated by
external SimPoint analysis tools.

SimPoint creation/resuming options will be implemented as a separate patch.

configs/common/Options.py
configs/example/se.py
src/cpu/BaseCPU.py
src/cpu/base.cc
src/cpu/simple/AtomicSimpleCPU.py
src/cpu/simple/atomic.cc
src/cpu/simple/atomic.hh

index 0c651b501017cf5336d0f306499c16e5bd286a1d..474da94f4c3fd5fa7d93036605e6ebdf0d0bdc82 100644 (file)
@@ -50,6 +50,10 @@ def addCommonOptions(parser):
     parser.add_option("--caches", action="store_true")
     parser.add_option("--l2cache", action="store_true")
     parser.add_option("--fastmem", action="store_true")
+    parser.add_option("--simpoint-profile", action="store_true",
+                      help="Enable basic block profiling for SimPoints")
+    parser.add_option("--simpoint-interval", type="int", default=10000000,
+                      help="SimPoint interval in num of instructions")
     parser.add_option("--clock", action="store", type="string", default='2GHz')
     parser.add_option("--num-dirs", type="int", default=1)
     parser.add_option("--num-l2caches", type="int", default=1)
index 20149cccddbd5ace9265c290b02d5abece8c7277..a5f0204fd85ecf043a5f1c21c3ab12609b524c66 100644 (file)
@@ -166,6 +166,13 @@ if options.fastmem:
     if (options.caches or options.l2cache):
         fatal("You cannot use fastmem in combination with caches!")
 
+if options.simpoint_profile:
+    if not options.fastmem:
+        # Atomic CPU checked with fastmem option already
+        fatal("SimPoint generation should be done with atomic cpu and fastmem")
+    if np > 1:
+        fatal("SimPoint generation not supported with more than one CPUs")
+
 for i in xrange(np):
     if options.smt:
         system.cpu[i].workload = multiprocesses
@@ -177,6 +184,10 @@ for i in xrange(np):
     if options.fastmem:
         system.cpu[i].fastmem = True
 
+    if options.simpoint_profile:
+        system.cpu[i].simpoint_profile = True
+        system.cpu[i].simpoint_interval = options.simpoint_interval
+
     if options.checker:
         system.cpu[i].addCheckerCpu()
 
index 5e1a0a96135b70c21aecc8d46f0f296f9335ae45..f47838e8363d31c7cf9fd2726f7f0de536931da1 100644 (file)
@@ -187,6 +187,8 @@ class BaseCPU(MemObject):
         "terminate when all threads have reached this inst count")
     max_insts_any_thread = Param.Counter(0,
         "terminate when any thread reaches this inst count")
+    simpoint_start_insts = VectorParam.Counter([],
+        "starting instruction counts of simpoints")
     max_loads_all_threads = Param.Counter(0,
         "terminate when all threads have reached this load count")
     max_loads_any_thread = Param.Counter(0,
index de0f8b23b867eeddc66ffdd660b8106dcd08966d..c7c1dadda5ef6c757128d9b66c874310b6f26dc3 100644 (file)
@@ -153,6 +153,18 @@ BaseCPU::BaseCPU(Params *p, bool is_checker)
         }
     }
 
+    // Set up instruction-count-based termination events for SimPoints
+    // Typically, there is more than one action point.
+    // Simulation.py is responsible for taking the necessary actions upon
+    // exiting the simulation loop.
+    if (!p->simpoint_start_insts.empty()) {
+        const char *cause = "simpoint starting point found";
+        for (size_t i = 0; i < p->simpoint_start_insts.size(); ++i) {
+            Event *event = new SimLoopExitEvent(cause, 0);
+            comInstEventQueue[0]->schedule(event, p->simpoint_start_insts[i]);
+        }
+    }
+
     if (p->max_insts_all_threads != 0) {
         const char *cause = "all threads reached the max instruction count";
 
index c747582f624de042349203cb0e1d1554d4dd7883..7a066457d1a9ab9c78c3a70f6cf2cdb1b91e9774 100644 (file)
@@ -61,3 +61,6 @@ class AtomicSimpleCPU(BaseSimpleCPU):
     simulate_data_stalls = Param.Bool(False, "Simulate dcache stall cycles")
     simulate_inst_stalls = Param.Bool(False, "Simulate icache stall cycles")
     fastmem = Param.Bool(False, "Access memory directly")
+    simpoint_profile = Param.Bool(False, "Generate SimPoint BBVs")
+    simpoint_interval = Param.UInt64(100000000, "SimPoint Interval Size (insts)")
+    simpoint_profile_file = Param.String("simpoint.bb.gz", "SimPoint BBV file")
index d7c4190ee17eb0771c8beeb0939afdb1d5974774..1dd9675f9aa09e233d8d836b46a88a0fe33ef014 100644 (file)
@@ -44,6 +44,7 @@
 #include "arch/mmapped_ipr.hh"
 #include "arch/utility.hh"
 #include "base/bigint.hh"
+#include "base/output.hh"
 #include "config/the_isa.hh"
 #include "cpu/simple/atomic.hh"
 #include "cpu/exetrace.hh"
@@ -109,9 +110,20 @@ AtomicSimpleCPU::AtomicSimpleCPU(AtomicSimpleCPUParams *p)
       drain_manager(NULL),
       icachePort(name() + ".icache_port", this),
       dcachePort(name() + ".dcache_port", this),
-      fastmem(p->fastmem)
+      fastmem(p->fastmem),
+      simpoint(p->simpoint_profile),
+      intervalSize(p->simpoint_interval),
+      intervalCount(0),
+      intervalDrift(0),
+      simpointStream(NULL),
+      currentBBV(0, 0),
+      currentBBVInstCount(0)
 {
     _status = Idle;
+
+    if (simpoint) {
+        simpointStream = simout.create(p->simpoint_profile_file, false);
+    }
 }
 
 
@@ -120,6 +132,9 @@ AtomicSimpleCPU::~AtomicSimpleCPU()
     if (tickEvent.scheduled()) {
         deschedule(tickEvent);
     }
+    if (simpointStream) {
+        simout.close(simpointStream);
+    }
 }
 
 unsigned int
@@ -534,6 +549,13 @@ AtomicSimpleCPU::tick()
                         curStaticInst->isFirstMicroop()))
                 instCnt++;
 
+            // profile for SimPoints if enabled and macro inst is finished
+            if (simpoint && curStaticInst && (fault == NoFault) &&
+                    (!curStaticInst->isMicroop() ||
+                     curStaticInst->isLastMicroop())) {
+                profileSimPoint();
+            }
+
             Tick stall_ticks = 0;
             if (simulate_inst_stalls && icache_access)
                 stall_ticks += icache_latency;
@@ -572,6 +594,67 @@ AtomicSimpleCPU::printAddr(Addr a)
     dcachePort.printAddr(a);
 }
 
+void
+AtomicSimpleCPU::profileSimPoint()
+{
+    if (!currentBBVInstCount)  // first inst of a new basic block
+        currentBBV.first = thread->pcState().instAddr();
+
+    ++intervalCount;  // insts in the current interval
+    ++currentBBVInstCount;  // insts in the current basic block
+
+    // A control inst is assumed to end the current basic block.
+    if (curStaticInst->isControl()) {
+        currentBBV.second = thread->pcState().instAddr();
+
+        auto map_itr = bbMap.find(currentBBV);
+        if (map_itr == bbMap.end()){
+            // First occurrence of this block: give it the next unique id
+            // (1-based), record its inst count and insert it into bbMap.
+            BBInfo info;
+            info.id = bbMap.size() + 1;
+            info.insts = currentBBVInstCount;
+            info.count = currentBBVInstCount;
+            bbMap.insert(std::make_pair(currentBBV, info));
+        } else {
+            // Block seen before: it must have the same inst count; add it
+            // to the count accumulated for this block in this interval.
+            BBInfo& info = map_itr->second;
+            assert(info.insts == currentBBVInstCount);
+            info.count += currentBBVInstCount;
+        }
+        currentBBVInstCount = 0;  // next inst starts a new basic block
+
+        // The interval ends at the first basic-block end where the insts
+        // counted so far (intervalCount) plus the excess carried over from
+        // the previous interval (intervalDrift) reach the interval size.
+        if (intervalCount + intervalDrift >= intervalSize) {
+            // gather (id, count) of blocks run in this interval; reset counts
+            std::vector<pair<uint64_t, uint64_t> > counts;
+            for (auto map_itr = bbMap.begin(); map_itr != bbMap.end();
+                    ++map_itr) {
+                BBInfo& info = map_itr->second;
+                if (info.count != 0) {
+                    counts.push_back(std::make_pair(info.id, info.count));
+                    info.count = 0;
+                }
+            }
+            std::sort(counts.begin(), counts.end());  // ascending block id
+
+            // Emit one line per interval: "T" then ":<id>:<count> " per block
+            *simpointStream << "T";
+            for (auto cnt_itr = counts.begin(); cnt_itr != counts.end();
+                    ++cnt_itr) {
+                *simpointStream << ":" << cnt_itr->first
+                                << ":" << cnt_itr->second << " ";
+            }
+            *simpointStream << "\n";
+            // Carry the overshoot past intervalSize into the next interval.
+            intervalDrift = (intervalCount + intervalDrift) - intervalSize;
+            intervalCount = 0;
+        }
+    }
+}
 
 ////////////////////////////////////////////////////////////////////////
 //
index 9bb653bcc1419dba9f0f852aa85767b68c8a249e..5a9275a773dab9672690dd6c0afa2527e9d05f2e 100644 (file)
 #ifndef __CPU_SIMPLE_ATOMIC_HH__
 #define __CPU_SIMPLE_ATOMIC_HH__
 
+#include "base/hashmap.hh"
 #include "cpu/simple/base.hh"
 #include "params/AtomicSimpleCPU.hh"
 
+/**
+ *  Start and end address of basic block for SimPoint profiling.
+ *  This structure is used to look up the hash table of BBVs.
+ *  - first: PC of first inst in basic block
+ *  - second: PC of last inst in basic block
+ */
+typedef std::pair<Addr, Addr> BasicBlockRange;
+
+/**
+ *  Hash specialization for the BasicBlockRange type, so that a
+ *  BasicBlockRange can be used as the key of a hash table (see bbMap in
+ *  AtomicSimpleCPU).
+ *  The hash value is hash<Addr> applied to the sum of the start and end
+ *  addresses of the basic block.
+ */
+__hash_namespace_begin
+template <>
+class hash<BasicBlockRange>
+{
+  public:
+    size_t operator()(const BasicBlockRange &bb) const {
+        return hash<Addr>()(bb.first + bb.second);
+    }
+};
+__hash_namespace_end
+
+
 class AtomicSimpleCPU : public BaseSimpleCPU
 {
   public:
@@ -161,6 +183,50 @@ class AtomicSimpleCPU : public BaseSimpleCPU
     bool dcache_access;
     Tick dcache_latency;
 
+    /**
+     * Profile basic blocks for SimPoints.
+     * Called at every macro inst to increment basic block inst counts and
+     * to profile block if end of block.
+     */
+    void profileSimPoint();
+
+    /** Data structures for SimPoints BBV generation
+     *  @{
+     */
+
+    /** Whether SimPoint BBV profiling is enabled */
+    const bool simpoint;
+    /** SimPoint profiling interval size in instructions */
+    const uint64_t intervalSize;
+
+    /** Inst count in current interval */
+    uint64_t intervalCount;
+    /** Excess inst count from previous interval */
+    uint64_t intervalDrift;
+    /** Pointer to SimPoint BBV output stream */
+    std::ostream *simpointStream;
+
+    /** Basic Block information, stored per basic block in bbMap */
+    struct BBInfo {
+        /** Unique ID, assigned 1, 2, ... in order of first occurrence */
+        uint64_t id;
+        /** Num of static insts in BB, recorded when BB is first seen */
+        uint64_t insts;
+        /** Dynamic inst count executed by BB in the current interval */
+        uint64_t count;
+    };
+
+    /** Hash table containing all previously seen basic blocks */
+    m5::hash_map<BasicBlockRange, BBInfo> bbMap;
+    /** Currently executing basic block */
+    BasicBlockRange currentBBV;
+    /** inst count in current basic block */
+    uint64_t currentBBVInstCount;
+
+    /** @}
+     *  End of data structures for SimPoints BBV generation
+     */
+
   protected:
 
     /** Return a reference to the data port. */