cpu: Factor MaxInst(SrcDest)Regs out of the trace CPU.
[gem5.git] / src / cpu / trace / trace_cpu.hh
1 /*
2 * Copyright (c) 2013 - 2016 ARM Limited
3 * All rights reserved
4 *
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder. You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 */
37
38 #ifndef __CPU_TRACE_TRACE_CPU_HH__
39 #define __CPU_TRACE_TRACE_CPU_HH__
40
41 #include <cstdint>
42 #include <list>
43 #include <queue>
44 #include <set>
45 #include <unordered_map>
46
47 #include "arch/registers.hh"
48 #include "base/statistics.hh"
49 #include "cpu/base.hh"
50 #include "debug/TraceCPUData.hh"
51 #include "debug/TraceCPUInst.hh"
52 #include "params/TraceCPU.hh"
53 #include "proto/inst_dep_record.pb.h"
54 #include "proto/packet.pb.h"
55 #include "proto/protoio.hh"
56 #include "sim/sim_events.hh"
57
58 /**
59 * The trace cpu replays traces generated using the elastic trace probe
60 * attached to the O3 CPU model. The elastic trace is an execution trace with
61 * register data dependencies and ordering dependencies annotated to it. The
62 * trace cpu also replays a fixed timestamp fetch trace that is also generated
63 * by the elastic trace probe. This trace cpu model aims at achieving faster
64 * simulation compared to the detailed cpu model and good correlation when the
65 * same trace is used for playback on different memory sub-systems.
66 *
67 * The TraceCPU inherits from BaseCPU so some virtual methods need to be
68 * defined. It has two port subclasses inherited from RequestPort for
69 * instruction and data ports. It issues the memory requests deducing the
70 * timing from the trace and without performing real execution of micro-ops. As
71 * soon as the last dependency for an instruction is complete, its
72 * computational delay, also provided in the input trace is added. The
73 * dependency-free nodes are maintained in a list, called 'ReadyList', ordered
74 * by ready time. Instructions which depend on load stall until the responses
75 * for read requests are received thus achieving elastic replay. If the
76 * dependency is not found when adding a new node, it is assumed complete.
77 * Thus, if this node is found to be completely dependency-free its issue time
78 * is calculated and it is added to the ready list immediately. This is
79 * encapsulated in the subclass ElasticDataGen.
80 *
81 * If ready nodes are issued in an unconstrained way there can be more nodes
82 * outstanding which results in divergence in timing compared to the O3CPU.
83 * Therefore, the Trace CPU also models hardware resources. A sub-class to
84 * model hardware resources contains the maximum sizes of load buffer, store
85 * buffer and ROB. If resources are not available, the node is not issued. Such
86 * nodes that are pending issue are held in the 'depFreeQueue' structure.
87 *
88 * Modeling the ROB size in the Trace CPU as a resource limitation is arguably
89 * the most important parameter of all resources. The ROB occupancy is
90 * estimated using the newly added field 'robNum'. We need to use ROB number as
91 * sequence number is at times much higher due to squashing and trace replay is
92 * focused on correct path modeling.
93 *
94 * A map called 'inFlightNodes' is added to track nodes that are not only in
95 * the readyList but also load nodes that are executed (and thus removed from
96 * readyList) but are not complete. ReadyList handles what and when to execute
97 * next node while the inFlightNodes is used for resource modelling. The oldest
98 * ROB number is updated when any node occupies the ROB or when an entry in the
99 * ROB is released. The ROB occupancy is equal to the difference in the ROB
100 * number of the newly dependency-free node and the oldest ROB number in
101 * flight.
102 *
103 * If no node depends on a non load/store node then there is no reason to
104 * track it in the dependency graph. We filter out such nodes but count them
105 * and add a weight field to the subsequent node that we do include in the
106 * trace. The weight field is used to model ROB occupancy during replay.
107 *
108 * The depFreeQueue is chosen to be FIFO so that child nodes which are in
109 * program order get pushed into it in that order and thus issued in program
110 * order, like in the O3CPU. This is also why the dependents is made a
111 * sequential container, std::set to std::vector. We only check head of the
112 * depFreeQueue as nodes are issued in order and blocking on head models that
113 * better than looping the entire queue. An alternative choice would be to
114 * inspect top N pending nodes where N is the issue-width. This is left for
115 * future as the timing correlation looks good as it is.
116 *
117 * At the start of an execution event, first we attempt to issue such pending
118 * nodes by checking if appropriate resources have become available. If yes, we
119 * compute the execute tick with respect to the time then. Then we proceed to
120 * complete nodes from the readyList.
121 *
122 * When a read response is received, sometimes a dependency on it that was
123 * supposed to be released when it was issued is still not released. This
124 * occurs because the dependent gets added to the graph after the read was
125 * sent. So the check is made less strict and the dependency is marked complete
126 * on read response instead of insisting that it should have been removed on
127 * read sent.
128 *
129 * There is a check for requests spanning two cache lines as this condition
130 * triggers an assert fail in the L1 cache. If it does then truncate the size
131 * to access only until the end of that line and ignore the remainder.
132 * Strictly-ordered requests are skipped and the dependencies on such requests
133 * are handled by simply marking them complete immediately.
134 *
135 * A CountedExitEvent that contains a static int belonging to the Trace CPU
136 * class as a down counter is used to implement multi Trace CPU simulation
137 * exit.
138 */
139
140 class TraceCPU : public BaseCPU
141 {
142
143 public:
    /** Construct the TraceCPU from its parameter struct. */
    TraceCPU(const TraceCPUParams &params);

    /**
     * Called at simulation start; reads the first messages from both input
     * traces and schedules the first fetch and data events accordingly.
     */
    void init();

    /**
     * This is a pure virtual function in BaseCPU. As we don't know how many
     * insts are in the trace but only know how many micro-ops are, we
     * cannot count this stat.
     *
     * @return 0
     */
    Counter totalInsts() const { return 0; }

    /**
     * Return totalOps as the number of committed micro-ops plus the
     * speculatively issued loads that are modelled in the TraceCPU replay.
     *
     * @return number of micro-ops i.e. nodes in the elastic data generator
     */
    Counter totalOps() const { return traceStats.numOps.value(); }

    /**
     * Set the no. of ops when the elastic data generator completes
     * executing a node.
     *
     * @param rob_num ROB number of the node that completed execution
     */
    void updateNumOps(uint64_t rob_num);

    /** Pure virtual function in BaseCPU. Do nothing. */
    void wakeup(ThreadID tid=0) { return; }

    /**
     * When resuming from checkpoint in FS mode, the TraceCPU takes over
     * from the old cpu. This function overrides the takeOverFrom() function
     * in the BaseCPU. It unbinds the ports of the old CPU and binds the
     * ports of the TraceCPU.
     *
     * @param oldCPU the CPU being replaced by this TraceCPU
     */
    void takeOverFrom(BaseCPU *oldCPU);

    /**
     * When instruction cache port receives a retry, schedule event
     * icacheNextEvent.
     */
    void icacheRetryRecvd();

    /**
     * When data cache port receives a retry, schedule event
     * dcacheNextEvent.
     */
    void dcacheRetryRecvd();

    /**
     * When data cache port receives a response, this calls the dcache
     * generator method handle to complete the load writeback.
     *
     * @param pkt Pointer to packet received
     */
    void dcacheRecvTimingResp(PacketPtr pkt);

    /**
     * Schedule event dcacheNextEvent at the given tick
     *
     * @param when Tick at which to schedule event
     */
    void schedDcacheNextEvent(Tick when);
208
209 protected:
210
    /**
     * IcachePort class that interfaces with L1 Instruction Cache.
     */
    class IcachePort : public RequestPort
    {
      public:
        /** Default constructor. */
        IcachePort(TraceCPU* _cpu) :
            RequestPort(_cpu->name() + ".icache_port", _cpu), owner(_cpu)
        {}

      public:
        /**
         * Receive the timing response and simply delete the packet since
         * instruction fetch requests are issued as per the timing in the
         * trace and responses are ignored.
         *
         * @param pkt Pointer to packet received
         * @return true
         */
        bool recvTimingResp(PacketPtr pkt);

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvTimingSnoopReq(PacketPtr pkt) {}

        /**
         * Handle a retry signalled by the cache if instruction read failed
         * in the first attempt.
         */
        void recvReqRetry();

      private:
        /** Back-pointer to the owning TraceCPU (not owned by this port). */
        TraceCPU* owner;
    };
249
    /**
     * DcachePort class that interfaces with L1 Data Cache.
     */
    class DcachePort : public RequestPort
    {

      public:
        /** Default constructor. */
        DcachePort(TraceCPU* _cpu) :
            RequestPort(_cpu->name() + ".dcache_port", _cpu), owner(_cpu)
        {}

      public:

        /**
         * Receive the timing response and call dcacheRecvTimingResp()
         * method of the dcacheGen to handle completing the load
         *
         * @param pkt Pointer to packet received
         * @return true
         */
        bool recvTimingResp(PacketPtr pkt);

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvTimingSnoopReq(PacketPtr pkt) {}

        /**
         * Required functionally but do nothing.
         *
         * @param pkt Pointer to packet received
         */
        void recvFunctionalSnoop(PacketPtr pkt) {}

        /**
         * Handle a retry signalled by the cache if data access failed in
         * the first attempt.
         */
        void recvReqRetry();

        /**
         * Required functionally.
         *
         * @return true since we have to snoop
         */
        bool isSnooping() const { return true; }

      private:
        /** Back-pointer to the owning TraceCPU (not owned by this port). */
        TraceCPU* owner;
    };
303
    /** Port to connect to L1 instruction cache. */
    IcachePort icachePort;

    /** Port to connect to L1 data cache. */
    DcachePort dcachePort;

    /** Requestor id for instruction read requests. */
    const RequestorID instRequestorID;

    /** Requestor id for data read and write requests. */
    const RequestorID dataRequestorID;

    /** File names for input instruction and data traces. */
    std::string instTraceFile, dataTraceFile;
318
319 /**
320 * Generator to read protobuf trace containing memory requests at fixed
321 * timestamps, perform flow control and issue memory requests. If L1 cache
322 * port sends packet succesfully, determine the tick to send the next
323 * packet else wait for retry from cache.
324 */
325 class FixedRetryGen
326 {
327
328 private:
329
330 /**
331 * This struct stores a line in the trace file.
332 */
333 struct TraceElement
334 {
335
336 /** Specifies if the request is to be a read or a write */
337 MemCmd cmd;
338
339 /** The address for the request */
340 Addr addr;
341
342 /** The size of the access for the request */
343 Addr blocksize;
344
345 /** The time at which the request should be sent */
346 Tick tick;
347
348 /** Potential request flags to use */
349 Request::FlagsType flags;
350
351 /** Instruction PC */
352 Addr pc;
353
354 /**
355 * Check validity of this element.
356 *
357 * @return if this element is valid
358 */
359 bool isValid() const { return cmd != MemCmd::InvalidCmd; }
360
361 /**
362 * Make this element invalid.
363 */
364 void clear() { cmd = MemCmd::InvalidCmd; }
365 };
366
367 /**
368 * The InputStream encapsulates a trace file and the
369 * internal buffers and populates TraceElements based on
370 * the input.
371 */
372 class InputStream
373 {
374 private:
375 // Input file stream for the protobuf trace
376 ProtoInputStream trace;
377
378 public:
379 /**
380 * Create a trace input stream for a given file name.
381 *
382 * @param filename Path to the file to read from
383 */
384 InputStream(const std::string& filename);
385
386 /**
387 * Reset the stream such that it can be played once
388 * again.
389 */
390 void reset();
391
392 /**
393 * Attempt to read a trace element from the stream,
394 * and also notify the caller if the end of the file
395 * was reached.
396 *
397 * @param element Trace element to populate
398 * @return True if an element could be read successfully
399 */
400 bool read(TraceElement* element);
401 };
402
403 public:
404 /* Constructor */
405 FixedRetryGen(TraceCPU& _owner, const std::string& _name,
406 RequestPort& _port, RequestorID requestor_id,
407 const std::string& trace_file) :
408 owner(_owner),
409 port(_port),
410 requestorId(requestor_id),
411 trace(trace_file),
412 genName(owner.name() + ".fixedretry." + _name),
413 retryPkt(nullptr),
414 delta(0),
415 traceComplete(false), fixedStats(&_owner, _name)
416 {
417 }
418
419 /**
420 * Called from TraceCPU init(). Reads the first message from the
421 * input trace file and returns the send tick.
422 *
423 * @return Tick when first packet must be sent
424 */
425 Tick init();
426
427 /**
428 * This tries to send current or retry packet and returns true if
429 * successfull. It calls nextExecute() to read next message.
430 *
431 * @return bool true if packet is sent successfully
432 */
433 bool tryNext();
434
435 /** Returns name of the FixedRetryGen instance. */
436 const std::string& name() const { return genName; }
437
438 /**
439 * Creates a new request assigning the request parameters passed by the
440 * arguments. Calls the port's sendTimingReq() and returns true if
441 * the packet was sent succesfully. It is called by tryNext()
442 *
443 * @param addr address of request
444 * @param size size of request
445 * @param cmd if it is a read or write request
446 * @param flags associated request flags
447 * @param pc instruction PC that generated the request
448 *
449 * @return true if packet was sent successfully
450 */
451 bool send(Addr addr, unsigned size, const MemCmd& cmd,
452 Request::FlagsType flags, Addr pc);
453
454 /** Exit the FixedRetryGen. */
455 void exit();
456
457 /**
458 * Reads a line of the trace file. Returns the tick
459 * when the next request should be generated. If the end
460 * of the file has been reached, it returns false.
461 *
462 * @return bool false id end of file has been reached
463 */
464 bool nextExecute();
465
466 /**
467 * Returns the traceComplete variable which is set when end of the
468 * input trace file is reached.
469 *
470 * @return bool true if traceComplete is set, false otherwise.
471 */
472 bool isTraceComplete() { return traceComplete; }
473
474 int64_t tickDelta() { return delta; }
475
476 private:
477 /** Reference of the TraceCPU. */
478 TraceCPU& owner;
479
480 /** Reference of the port to be used to issue memory requests. */
481 RequestPort& port;
482
483 /** RequestorID used for the requests being sent. */
484 const RequestorID requestorId;
485
486 /** Input stream used for reading the input trace file. */
487 InputStream trace;
488
489 /** String to store the name of the FixedRetryGen. */
490 std::string genName;
491
492 /** PacketPtr used to store the packet to retry. */
493 PacketPtr retryPkt;
494
495 /**
496 * Stores the difference in the send ticks of the current and last
497 * packets. Keeping this signed to check overflow to a negative value
498 * which will be caught by assert(delta > 0)
499 */
500 int64_t delta;
501
502 /**
503 * Set to true when end of trace is reached.
504 */
505 bool traceComplete;
506
507 /** Store an element read from the trace to send as the next packet. */
508 TraceElement currElement;
509 protected:
510 struct FixedRetryGenStatGroup : public Stats::Group
511 {
512 /** name is the extension to the name for these stats */
513 FixedRetryGenStatGroup(Stats::Group *parent,
514 const std::string& _name);
515 /** Stats for instruction accesses replayed. */
516 Stats::Scalar numSendAttempted;
517 Stats::Scalar numSendSucceeded;
518 Stats::Scalar numSendFailed;
519 Stats::Scalar numRetrySucceeded;
520 /** Last simulated tick by the FixedRetryGen */
521 Stats::Scalar instLastTick;
522 } fixedStats;
523
524 };
525
526 /**
527 * The elastic data memory request generator to read protobuf trace
528 * containing execution trace annotated with data and ordering
529 * dependencies. It deduces the time at which to send a load/store request
530 * by tracking the dependencies. It attempts to send a memory request for a
531 * load/store without performing real execution of micro-ops. If L1 cache
532 * port sends packet succesfully, the generator checks which instructions
533 * became dependency free as a result of this and schedules an event
534 * accordingly. If it fails to send the packet, it waits for a retry from
535 * the cache.
536 */
537 class ElasticDataGen
538 {
539 private:
540 /** Node sequence number type. */
541 typedef uint64_t NodeSeqNum;
542
543 /** Node ROB number type. */
544 typedef uint64_t NodeRobNum;
545
546 typedef ProtoMessage::InstDepRecord::RecordType RecordType;
547 typedef ProtoMessage::InstDepRecord Record;
548
        /**
         * The struct GraphNode stores an instruction in the trace file. The
         * format of the trace file favours constructing a dependency graph
         * of the execution and this struct is used to encapsulate the
         * request data as well as pointers to its dependent GraphNodes.
         */
        class GraphNode
        {
          public:
            /** Typedef for the list containing the ROB dependencies */
            typedef std::list<NodeSeqNum> RobDepList;

            /** Typedef for the list containing the register dependencies */
            typedef std::list<NodeSeqNum> RegDepList;

            /** Instruction sequence number */
            NodeSeqNum seqNum;

            /** ROB occupancy number */
            NodeRobNum robNum;

            /**
             * Type of the node corresponding to the instruction modeled by
             * it.
             */
            RecordType type;

            /** The address for the request if any */
            Addr physAddr;

            /** The virtual address for the request if any */
            Addr virtAddr;

            /** Size of request if any */
            uint32_t size;

            /** Request flags if any */
            Request::Flags flags;

            /** Instruction PC */
            Addr pc;

            /** List of order (ROB) dependencies. */
            RobDepList robDep;

            /** Computational delay */
            uint64_t compDelay;

            /** List of register dependencies (incoming) if any. */
            RegDepList regDep;

            /**
             * A vector of nodes dependent (outgoing) on this node. A
             * sequential container is chosen because when dependents become
             * free, they attempt to issue in program order.
             */
            std::vector<GraphNode *> dependents;

            /** Is the node a load */
            bool isLoad() const { return (type == Record::LOAD); }

            /** Is the node a store */
            bool isStore() const { return (type == Record::STORE); }

            /** Is the node a compute (non load/store) node */
            bool isComp() const { return (type == Record::COMP); }

            /** Remove completed instruction from register dependency list */
            bool removeRegDep(NodeSeqNum reg_dep);

            /** Remove completed instruction from order dependency list */
            bool removeRobDep(NodeSeqNum rob_dep);

            /** Check for all dependencies on completed inst */
            bool removeDepOnInst(NodeSeqNum done_seq_num);

            /** Return true if node has a request which is strictly ordered */
            bool
            isStrictlyOrdered() const
            {
                return (flags.isSet(Request::STRICT_ORDER));
            }
            /**
             * Write out element in trace-compatible format using debug flag
             * TraceCPUData.
             */
            void writeElementAsTrace() const;

            /** Return string specifying the type of the node */
            std::string typeToStr() const;
        };
643
        /** Struct to store a ready-to-execute node and its execution tick. */
        struct ReadyNode
        {
            /** The sequence number of the ready node */
            NodeSeqNum seqNum;

            /** The tick at which the ready node must be executed */
            Tick execTick;
        };
653
        /**
         * The HardwareResource class models structures that hold the
         * in-flight nodes. When a node becomes dependency free, first check
         * if resources are available to issue it.
         */
        class HardwareResource
        {
          public:
            /**
             * Constructor that initializes the sizes of the structures.
             *
             * @param max_rob size of the Reorder Buffer
             * @param max_stores size of Store Buffer
             * @param max_loads size of Load Buffer
             */
            HardwareResource(uint16_t max_rob, uint16_t max_stores,
                             uint16_t max_loads);

            /**
             * Occupy appropriate structures for an issued node.
             *
             * @param new_node pointer to the issued node
             */
            void occupy(const GraphNode* new_node);

            /**
             * Release appropriate structures for a completed node.
             *
             * @param done_node pointer to the completed node
             */
            void release(const GraphNode* done_node);

            /** Release store buffer entry for a completed store */
            void releaseStoreBuffer();

            /**
             * Check if structures required to issue a node are free.
             *
             * @param new_node pointer to the node ready to issue
             * @return true if resources are available
             */
            bool isAvailable(const GraphNode* new_node) const;

            /**
             * Check if there are any outstanding requests, i.e. requests
             * for which we are yet to receive a response.
             *
             * @return true if there is at least one read or write request
             *         outstanding
             */
            bool awaitingResponse() const;

            /** Print resource occupancy for debugging. */
            void printOccupancy();

          private:
            /**
             * The size of the ROB used to throttle the max. number of
             * in-flight nodes.
             */
            const uint16_t sizeROB;

            /**
             * The size of store buffer. This is used to throttle the max.
             * number of in-flight stores.
             */
            const uint16_t sizeStoreBuffer;

            /**
             * The size of load buffer. This is used to throttle the max.
             * number of in-flight loads.
             */
            const uint16_t sizeLoadBuffer;

            /**
             * A map from the sequence number to the ROB number of the in-
             * flight nodes. This includes all nodes that are in the
             * readyList plus the loads for which a request has been sent
             * which are not present in the readyList. But such loads are
             * not yet complete and thus occupy resources. We need to query
             * the oldest in-flight node and since a map container keeps all
             * its keys sorted using the less than criterion, the first
             * element is the in-flight node with the least sequence number,
             * i.e. the oldest in-flight node.
             */
            std::map<NodeSeqNum, NodeRobNum> inFlightNodes;

            /** The ROB number of the oldest in-flight node */
            NodeRobNum oldestInFlightRobNum;

            /** Number of ready loads for which request may or may not be
             * sent.
             */
            uint16_t numInFlightLoads;

            /** Number of ready stores for which request may or may not be
             * sent.
             */
            uint16_t numInFlightStores;
        };
753
        /**
         * The InputStream encapsulates a trace file and the
         * internal buffers and populates GraphNodes based on
         * the input.
         */
        class InputStream
        {
          private:
            /** Input file stream for the protobuf trace */
            ProtoInputStream trace;

            /**
             * A multiplier for the compute delays in the trace to modulate
             * the Trace CPU frequency either up or down. The Trace CPU's
             * clock domain frequency must also be set to match the expected
             * result of frequency scaling.
             */
            const double timeMultiplier;

            /** Count of committed ops read from trace plus the filtered ops */
            uint64_t microOpCount;

            /**
             * The window size that is read from the header of the protobuf
             * trace and used to process the dependency trace
             */
            uint32_t windowSize;

          public:
            /**
             * Create a trace input stream for a given file name.
             *
             * @param filename Path to the file to read from
             * @param time_multiplier used to scale the compute delays
             */
            InputStream(const std::string& filename,
                        const double time_multiplier);

            /**
             * Reset the stream such that it can be played once
             * again.
             */
            void reset();

            /**
             * Attempt to read a trace element from the stream,
             * and also notify the caller if the end of the file
             * was reached.
             *
             * @param element Trace element (graph node) to populate
             * @return True if an element could be read successfully
             */
            bool read(GraphNode* element);

            /** Get window size from trace */
            uint32_t getWindowSize() const { return windowSize; }

            /** Get number of micro-ops modelled in the TraceCPU replay */
            uint64_t getMicroOpCount() const { return microOpCount; }
        };
815
816 public:
        /**
         * Construct an elastic data generator.
         *
         * @param _owner TraceCPU that owns this generator
         * @param _name name extension for this generator and its stats
         * @param _port port used to issue the memory requests
         * @param requestor_id RequestorID set on the requests being sent
         * @param trace_file path to the protobuf dependency trace
         * @param params TraceCPU parameters providing freqMultiplier and
         *               the hardware resource sizes
         */
        ElasticDataGen(TraceCPU& _owner, const std::string& _name,
                       RequestPort& _port, RequestorID requestor_id,
                       const std::string& trace_file,
                       const TraceCPUParams &params) :
            owner(_owner),
            port(_port),
            requestorId(requestor_id),
            // Compute delays in the trace are scaled by the inverse of the
            // frequency multiplier to speed the replay up or down.
            trace(trace_file, 1.0 / params.freqMultiplier),
            genName(owner.name() + ".elastic." + _name),
            retryPkt(nullptr),
            traceComplete(false),
            nextRead(false),
            execComplete(false),
            windowSize(trace.getWindowSize()),
            hwResource(params.sizeROB, params.sizeStoreBuffer,
                       params.sizeLoadBuffer), elasticStats(&_owner, _name)
        {
            DPRINTF(TraceCPUData, "Window size in the trace is %d.\n",
                    windowSize);
        }
838
839 /**
840 * Called from TraceCPU init(). Reads the first message from the
841 * input trace file and returns the send tick.
842 *
843 * @return Tick when first packet must be sent
844 */
845 Tick init();
846
847 /**
848 * Adjust traceOffset based on what TraceCPU init() determines on
849 * comparing the offsets in the fetch request and elastic traces.
850 *
851 * @param trace_offset trace offset set by comparing both traces
852 */
853 void adjustInitTraceOffset(Tick& offset);
854
855 /** Returns name of the ElasticDataGen instance. */
856 const std::string& name() const { return genName; }
857
858 /** Exit the ElasticDataGen. */
859 void exit();
860
861 /**
862 * Reads a line of the trace file. Returns the tick when the next
863 * request should be generated. If the end of the file has been
864 * reached, it returns false.
865 *
866 * @return bool false if end of file has been reached else true
867 */
868 bool readNextWindow();
869
        /**
         * Iterate over the dependencies of a new node and add the new node
         * to the list of dependents of the parent node.
         *
         * @tparam T type of the dependency list (ROB or register), a
         *           container of NodeSeqNum
         * @param new_node new node to add to the graph
         * @param dep_list the dependency list that is to be iterated, and
         *                 may get modified
         */
        template<typename T>
        void addDepsOnParent(GraphNode *new_node, T& dep_list);
880
881 /**
882 * This is the main execute function which consumes nodes from the
883 * sorted readyList. First attempt to issue the pending dependency-free
884 * nodes held in the depFreeQueue. Insert the ready-to-issue nodes into
885 * the readyList. Then iterate through the readyList and when a node
886 * has its execute tick equal to curTick(), execute it. If the node is
887 * a load or a store call executeMemReq() and if it is neither, simply
888 * mark it complete.
889 */
890 void execute();
891
892 /**
893 * Creates a new request for a load or store assigning the request
894 * parameters. Calls the port's sendTimingReq() and returns a packet
895 * if the send failed so that it can be saved for a retry.
896 *
897 * @param node_ptr pointer to the load or store node to be executed
898 *
899 * @return packet pointer if the request failed and nullptr if it was
900 * sent successfully
901 */
902 PacketPtr executeMemReq(GraphNode* node_ptr);
903
904 /**
905 * Add a ready node to the readyList. When inserting, ensure the nodes
906 * are sorted in ascending order of their execute ticks.
907 *
908 * @param seq_num seq. num of ready node
909 * @param exec_tick the execute tick of the ready node
910 */
911 void addToSortedReadyList(NodeSeqNum seq_num, Tick exec_tick);
912
913 /** Print readyList for debugging using debug flag TraceCPUData. */
914 void printReadyList();
915
916 /**
917 * When a load writeback is received, that is when the load completes,
918 * release the dependents on it. This is called from the dcache port
919 * recvTimingResp().
920 */
921 void completeMemAccess(PacketPtr pkt);
922
923 /**
924 * Returns the execComplete variable which is set when the last
925 * node is executed.
926 *
927 * @return bool true if execComplete is set, false otherwise.
928 */
929 bool isExecComplete() const { return execComplete; }
930
931 /**
932 * Attempts to issue a node once the node's source dependencies are
933 * complete. If resources are available then add it to the readyList,
934 * otherwise the node is not issued and is stored in depFreeQueue
935 * until resources become available.
936 *
937 * @param node_ptr pointer to node to be issued
938 * @param first true if this is the first attempt to issue this node
939 * @return true if node was added to readyList
940 */
941 bool checkAndIssue(const GraphNode* node_ptr, bool first=true);
942
943 /** Get number of micro-ops modelled in the TraceCPU replay */
944 uint64_t getMicroOpCount() const { return trace.getMicroOpCount(); }
945
      private:
        /** Reference of the TraceCPU. */
        TraceCPU& owner;

        /** Reference of the port to be used to issue memory requests. */
        RequestPort& port;

        /** RequestorID used for the requests being sent. */
        const RequestorID requestorId;

        /** Input stream used for reading the input trace file. */
        InputStream trace;

        /** String to store the name of the ElasticDataGen. */
        std::string genName;

        /** PacketPtr used to store the packet to retry. */
        PacketPtr retryPkt;

        /** Set to true when end of trace is reached. */
        bool traceComplete;

        /** Set to true when the next window of instructions need to be read */
        bool nextRead;

        /** Set true when execution of trace is complete */
        bool execComplete;

        /**
         * Window size within which to check for dependencies. Its value is
         * made equal to the window size used to generate the trace which is
         * recorded in the trace header. The dependency graph must be
         * populated enough such that when a node completes, its potential
         * child node must be found and the dependency removed before the
         * completed node itself is removed. Thus as soon as the graph
         * shrinks to become smaller than this window, we read in the next
         * window.
         */
        const uint32_t windowSize;

        /**
         * Hardware resources required to contain in-flight nodes and to
         * throttle issuing of new nodes when resources are not available.
         */
        HardwareResource hwResource;

        /** Store the depGraph of GraphNodes */
        std::unordered_map<NodeSeqNum, GraphNode*> depGraph;

        /**
         * Queue of dependency-free nodes that are pending issue because
         * resources are not available. This is chosen to be FIFO so that
         * dependent nodes which become free in program order get pushed
         * into the queue in that order. Thus nodes are more likely to
         * issue in program order.
         */
        std::queue<const GraphNode*> depFreeQueue;

        /** List of nodes that are ready to execute */
        std::list<ReadyNode> readyList;
1005
      protected:
        /** Stat group collecting the ElasticDataGen replay statistics. */
        struct ElasticDataGenStatGroup : public Stats::Group
        {
            /** name is the extension to the name for these stats */
            ElasticDataGenStatGroup(Stats::Group *parent,
                                    const std::string& _name);
            /** Stats for data memory accesses replayed. */
            /** Largest dependent count observed on any single node. */
            Stats::Scalar maxDependents;
            /** Largest size the readyList reached during replay. */
            Stats::Scalar maxReadyListSize;
            /** Counters for packet send attempts and their outcomes. */
            Stats::Scalar numSendAttempted;
            Stats::Scalar numSendSucceeded;
            Stats::Scalar numSendFailed;
            Stats::Scalar numRetrySucceeded;
            /** Number of requests split across a boundary. */
            Stats::Scalar numSplitReqs;
            // NOTE(review): 'SO' presumably means strictly-ordered
            // loads/stores — confirm against the stat registration.
            Stats::Scalar numSOLoads;
            Stats::Scalar numSOStores;
            /** Tick when ElasticDataGen completes execution */
            Stats::Scalar dataLastTick;
        } elasticStats;
    };
1026 };
1027
    /** Instance of FixedRetryGen to replay instruction read requests. */
    FixedRetryGen icacheGen;

    /** Instance of ElasticDataGen to replay data read and write requests. */
    ElasticDataGen dcacheGen;

    /**
     * This is the control flow that uses the functionality of the icacheGen to
     * replay the trace. It calls tryNext(). If it returns true then next event
     * is scheduled at curTick() plus delta. If it returns false then delta is
     * ignored and control is brought back via recvRetry().
     */
    void schedIcacheNext();

    /**
     * This is the control flow that uses the functionality of the dcacheGen to
     * replay the trace. It calls execute(). It checks if execution is complete
     * and schedules an event to exit simulation accordingly.
     */
    void schedDcacheNext();

    /** Event for the control flow method schedIcacheNext() */
    EventFunctionWrapper icacheNextEvent;

    /** Event for the control flow method schedDcacheNext() */
    EventFunctionWrapper dcacheNextEvent;

    /**
     * This is called when either generator finishes executing from the
     * trace.
     */
    void checkAndSchedExitEvent();

    /** Set to true when one of the generators finishes replaying its trace. */
    bool oneTraceComplete;

    /**
     * This stores the time offset in the trace, which is taken away from
     * the ready times of requests. This is specially useful because the time
     * offset can be very large if the traces are generated from the middle of
     * a program.
     */
    Tick traceOffset;

    /**
     * Number of Trace CPUs in the system used as a shared variable and passed
     * to the CountedExitEvent event used for counting down exit events. It is
     * incremented in the constructor call so that the total is arrived at
     * automatically.
     */
    static int numTraceCPUs;

    /**
     * A CountedExitEvent which when serviced decrements the counter. A sim
     * exit event is scheduled when the counter equals zero, that is all
     * instances of Trace CPU have had their execCompleteEvent serviced.
     */
    CountedExitEvent *execCompleteEvent;

    /**
     * Exit when any one Trace CPU completes its execution. If this is
     * configured true then the execCompleteEvent is not scheduled.
     */
    const bool enableEarlyExit;

    /**
     * Interval of committed instructions specified by the user at which a
     * progress info message is printed
     */
    const uint64_t progressMsgInterval;

    /**
     * The progress msg threshold is kept updated to the next multiple of the
     * progress msg interval. As soon as the threshold is reached, an info
     * message is printed.
     */
    uint64_t progressMsgThreshold;
    /** Stat group collecting the TraceCPU top-level statistics. */
    struct TraceStats : public Stats::Group
    {
        /** @param trace parent TraceCPU the stats are registered under */
        TraceStats(TraceCPU *trace);
        /** Counts of dcache/icache replay events scheduled. */
        Stats::Scalar numSchedDcacheEvent;
        Stats::Scalar numSchedIcacheEvent;

        /** Stat for number of simulated micro-ops. */
        Stats::Scalar numOps;
        /** Stat for the CPI. This is really cycles per
         * micro-op and not inst. */
        Stats::Formula cpi;
    } traceStats;
1117
  public:

    /** Used to get a reference to the icache port. */
    Port &getInstPort() { return icachePort; }

    /** Used to get a reference to the dcache port. */
    Port &getDataPort() { return dcachePort; }
1125
1126 };
1127 #endif // __CPU_TRACE_TRACE_CPU_HH__